recoll / Code / Diff of /src/query/plaintorich.cpp

Diff of /src/query/plaintorich.cpp [7dcc7c] .. [93d0aa]

Switch to unified view


...
        sterms += "GROUP: ";
        sterms += vecStringToString(*vit);
        sterms += "\n";
    }
    LOGDEB0(("  %s", sterms.c_str()));
        LOGDEB2(("  TEXT:[%s]\n", in.c_str()));
    }

    // Compute the positions for the query terms.  We use the text
    // splitter to break the text into words, and compare the words to
    // the search terms,
...
    }
#endif

    // Input character iterator
    Utf8Iter chariter(in);

    // State variable used to limit the number of consecutive empty lines,
    // and convert all eol to '\n'
    int eol = 0;
    int hadcr = 0;

    // Value for numbered anchors at each term match
    int anchoridx = 1;
    // HTML state
    bool intag = false, inparamvalue = false;
    // My tag state
    int inrcltag = 0;

    unsigned int headend = 0;
    if (m_inputhtml) {
    headend = in.find("</head>");
    if (headend == string::npos)
        headend = in.find("</HEAD>");
    if (headend != string::npos)
        headend += 7;
    }

    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
    // Check from time to time if we need to stop
    if ((pos & 0xfff) == 0) {
        CancelCheck::instance().checkCancel();
    }
...
        if (!intag && ibyteidx > (int)headend) {
            *olit += startAnchor(anchoridx);
            *olit += startMatch();
        }
        anchoridx++;
                inrcltag = 1;
        } else if (ibyteidx == tPosIt->second) {
        // Output end of match region tags
        if (!intag && ibyteidx > (int)headend) {
            *olit += endMatch();
            *olit += endAnchor();
        }
        // Skip all highlight areas that would overlap this one
        int crend = tPosIt->second;
        while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
            tPosIt++;
                inrcltag = 0;







        }
    }
        
        unsigned int car = *chariter;

        if (car == '\n') {
            if (!hadcr)
                eol++;
            hadcr = 0;
            continue;
        } else if (car == '\r') {
            hadcr++;
            eol++;
            continue;
        } else if (eol) {
            // Do line break;
            hadcr = 0;
            if (eol > 2)
                eol = 2;
            while (eol) {
                *olit += "\n";
                eol--;
            }
            // Maybe end this chunk, begin next. Don't do it on html
            // there is just no way to do it right (qtextedit cant grok
            // chunks cut in the middle of <a></a> for example).
            if (!m_inputhtml && !inrcltag && 
                olit->size() > (unsigned int)chunksize) {
                out.push_back(string(startChunk()));
                olit++;
            }
        }

        switch (car) {
        case '<':
            if (m_inputhtml) {
                if (!inparamvalue)
                    intag = true;
                chariter.appendchartostring(*olit);    
            } else {
                *olit += "&lt;";
            }
            break;
        case '>':
            if (m_inputhtml) {
                if (!inparamvalue)
                    intag = false;
            }
            chariter.appendchartostring(*olit);    
            break;
        case '&':
            if (m_inputhtml) {
                chariter.appendchartostring(*olit);
            } else {
                *olit += "&amp;";
            }
            break;
        case '"':
            if (m_inputhtml && intag) {
                inparamvalue = !inparamvalue;
            }


            chariter.appendchartostring(*olit);
            break;
















        default:





            chariter.appendchartostring(*olit);

        }

    } // End chariter loop

#if 0
    {
    FILE *fp = fopen("/tmp/debugplaintorich", "a");
    fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
    for (list<string>::iterator it = out.begin();

	a/src/query/plaintorich.cpp		b/src/query/plaintorich.cpp
	...		...
325	sterms += "GROUP: ";	325	sterms += "GROUP: ";
326	sterms += vecStringToString(*vit);	326	sterms += vecStringToString(*vit);
327	sterms += "\n";	327	sterms += "\n";
328	}	328	}
329	LOGDEB0((" %s", sterms.c_str()));	329	LOGDEB0((" %s", sterms.c_str()));
		330	LOGDEB2((" TEXT:[%s]\n", in.c_str()));
330	}	331	}
331		332
332	// Compute the positions for the query terms. We use the text	333	// Compute the positions for the query terms. We use the text
333	// splitter to break the text into words, and compare the words to	334	// splitter to break the text into words, and compare the words to
334	// the search terms,	335	// the search terms,
	...		...
362	}	363	}
363	#endif	364	#endif
364		365
365	// Input character iterator	366	// Input character iterator
366	Utf8Iter chariter(in);	367	Utf8Iter chariter(in);
		368
367	// State variable used to limit the number of consecutive empty lines	369	// State variable used to limit the number of consecutive empty lines,
		370	// and convert all eol to '\n'
368	int ateol = 0;	371	int eol = 0;
		372	int hadcr = 0;
369		373
370	// Value for numbered anchors at each term match	374	// Value for numbered anchors at each term match
371	int anchoridx = 1;	375	int anchoridx = 1;
372	// html state	376	// HTML state
373	bool intag = false, inparamvalue = false;	377	bool intag = false, inparamvalue = false;
		378	// My tag state
		379	int inrcltag = 0;
		380
374	unsigned int headend = 0;	381	unsigned int headend = 0;
375	if (m_inputhtml) {	382	if (m_inputhtml) {
376	headend = in.find("</head>");	383	headend = in.find("</head>");
377	if (headend == string::npos)	384	if (headend == string::npos)
378	headend = in.find("</HEAD>");	385	headend = in.find("</HEAD>");
379	if (headend != string::npos)	386	if (headend != string::npos)
380	headend += 7;	387	headend += 7;
381	}	388	}
		389
382	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {	390	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
383	// Check from time to time if we need to stop	391	// Check from time to time if we need to stop
384	if ((pos & 0xfff) == 0) {	392	if ((pos & 0xfff) == 0) {
385	CancelCheck::instance().checkCancel();	393	CancelCheck::instance().checkCancel();
386	}	394	}
	...		...
393	if (!intag && ibyteidx > (int)headend) {	401	if (!intag && ibyteidx > (int)headend) {
394	*olit += startAnchor(anchoridx);	402	*olit += startAnchor(anchoridx);
395	*olit += startMatch();	403	*olit += startMatch();
396	}	404	}
397	anchoridx++;	405	anchoridx++;
		406	inrcltag = 1;
398	} else if (ibyteidx == tPosIt->second) {	407	} else if (ibyteidx == tPosIt->second) {
399	// Output end or match region tags	408	// Output end of match region tags
400	if (!intag && ibyteidx > (int)headend) {	409	if (!intag && ibyteidx > (int)headend) {
401	*olit += endMatch();	410	*olit += endMatch();
402	*olit += endAnchor();	411	*olit += endAnchor();
403	}	412	}
404	// Skip all highlight areas that would overlap this one	413	// Skip all highlight areas that would overlap this one
405	int crend = tPosIt->second;	414	int crend = tPosIt->second;
406	while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)	415	while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
407	tPosIt++;	416	tPosIt++;
408		417	inrcltag = 0;
409	// Maybe end this chunk, begin next. Don't do it on html
410	// there is just no way to do it right (qtextedit cant grok
411	// chunks cut in the middle of <a></a> for example).
412	if (!m_inputhtml && olit->size() > (unsigned int)chunksize) {
413	out.push_back("");
414	olit++;
415	}
416	}	418	}
417	}	419	}
		420
		421	unsigned int car = *chariter;
418		422
419	if (m_inputhtml) {	423	if (car == '\n') {
		424	if (!hadcr)
		425	eol++;
		426	hadcr = 0;
		427	continue;
		428	} else if (car == '\r') {
		429	hadcr++;
		430	eol++;
		431	continue;
		432	} else if (eol) {
		433	// Do line break;
		434	hadcr = 0;
		435	if (eol > 2)
		436	eol = 2;
		437	while (eol) {
		438	*olit += "\n";
		439	eol--;
		440	}
		441	// Maybe end this chunk, begin next. Don't do it on html
		442	// there is just no way to do it right (qtextedit cant grok
		443	// chunks cut in the middle of <a></a> for example).
		444	if (!m_inputhtml && !inrcltag &&
		445	olit->size() > (unsigned int)chunksize) {
		446	out.push_back(string(startChunk()));
		447	olit++;
		448	}
		449	}
		450
420	switch (*chariter) {	451	switch (car) {
421	case '<':	452	case '<':
422	if (!inparamvalue)	453	if (m_inputhtml) {
423	intag = true;	454	if (!inparamvalue)
424	break;	455	intag = true;
		456	chariter.appendchartostring(*olit);
		457	} else {
		458	*olit += "&lt;";
		459	}
		460	break;
425	case '>':	461	case '>':
426	if (!inparamvalue)	462	if (m_inputhtml) {
427	intag = false;	463	if (!inparamvalue)
428	break;	464	intag = false;
		465	}
		466	chariter.appendchartostring(*olit);
		467	break;
		468	case '&':
		469	if (m_inputhtml) {
		470	chariter.appendchartostring(*olit);
		471	} else {
		472	*olit += "&amp;";
		473	}
		474	break;
429	case '"':	475	case '"':
430	if (intag) {	476	if (m_inputhtml && intag) {
431	inparamvalue = !inparamvalue;	477	inparamvalue = !inparamvalue;
432	}	478	}
433	break;
434	}
435	chariter.appendchartostring(*olit);	479	chariter.appendchartostring(*olit);
436	} else switch (*chariter) {	480	break;
437	case '\n':
438	if (ateol < 2) {
439	*olit += "<br>\n";
440	ateol++;
441	}
442	break;
443	case '\r':
444	break;
445	case '<':
446	ateol = 0;
447	*olit += "&lt;";
448	break;
449	case '&':
450	ateol = 0;
451	*olit += "&amp;";
452	break;
453	default:	481	default:
454	// We don't change the eol status for whitespace, want
455	// a real line
456	if (!(chariter == ' ' || chariter == '\t')) {
457	ateol = 0;
458	}
459	chariter.appendchartostring(*olit);	482	chariter.appendchartostring(*olit);
460	}
461	}	483	}
		484
		485	} // End chariter loop
		486
462	#if 0	487	#if 0
463	{	488	{
464	FILE *fp = fopen("/tmp/debugplaintorich", "a");	489	FILE *fp = fopen("/tmp/debugplaintorich", "a");
465	fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");	490	fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
466	for (list<string>::iterator it = out.begin();	491	for (list<string>::iterator it = out.begin();