recoll / Code / Diff of /src/qtgui/plaintorich.cpp

Diff of /src/qtgui/plaintorich.cpp [3809fd] .. [42cf41]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.10 2006-02-07 09:44:33 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...

// Text splitter callback used to take note of the positions of query terms
// inside the result text. These are then used to insert highlight tags.
class myTextSplitCB : public TextSplitCB {
 public:
    string firstTerm;
    set<string>    terms;          // in: user query terms
    list<pair<int, int> > tboffs;  // out: begin and end positions of
                                   // query terms in text

    myTextSplitCB(const list<string>& its) {
...
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
    string dumb;
    Rcl::dumb_string(term, dumb);
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
    // pos, bts, bte));
    if (terms.find(dumb) != terms.end()) {
        tboffs.push_back(pair<int, int>(bts, bte));
      if (firstTerm.empty())
      firstTerm = term;
  }
    CancelCheck::instance().checkCancel();
    return true;
    }
};

// Fix result text for display inside the gui text window.
//
// To compute the term character positions in the output text, we used
// to emulate how qt's textedit counts chars (ignoring tags and
// duplicate whitespace etc...). This was tricky business, dependent
// on qtextedit internals, and we don't do it any more, so we no longer
// know the term paragraph/character positions in the editor text.
// Instead, we return the first term encountered, and the caller will
// use the editor's find() function to position on it.
bool plaintorich(const string& in, string& out, const list<string>& terms,
       string *firstTerm)
{
    Chrono chron;
    LOGDEB(("plaintorich: terms: %s\n", 
        stringlistdisp(terms).c_str()));
    out.erase();


    // We first use the text splitter to break the text into words,
    // and compare the words to the search terms, which yields the
    // query terms positions inside the text
    myTextSplitCB cb(terms);
    TextSplit splitter(&cb, true);
    // Note that the splitter returns the term locations as byte
    // offsets, not character offsets
    splitter.text_to_words(in);

    if (firstTerm)
  *firstTerm = cb.firstTerm;
    LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));

    // Rich text output
    out = "<qt><head><title></title></head><body><p>";

    // Iterator for the list of input term positions. We use it to
    // output highlight tags around the query terms in the output
    // text
    list<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();





    // Input character iterator
    Utf8Iter chariter(in);
    // State variable used to limit the number of consecutive empty lines
    int ateol = 0;
    // State variable to update the char pos only for the first of
...
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
    if (pos && (pos % 1000) == 0) {
        CancelCheck::instance().checkCancel();
    }
    // If we still have terms positions, check (byte) position
    if (tPosIt != cb.tboffs.end()) {
        int ibyteidx = chariter.getBpos();
        if (ibyteidx == tPosIt->first) {
        out += "<termtag>";

        } else if (ibyteidx == tPosIt->second) {
        if (tPosIt != cb.tboffs.end())
            tPosIt++;


        out += "</termtag>";
        }
    }
    switch(*chariter) {
    case '\n':
        if (ateol < 2) {
        out += "<br>\n";
        ateol++;

        }
        break;
    case '\r': 
        break;
    case '<':
        ateol = 0;
        out += "&lt;";

        break;
    case '&':
        ateol = 0;
        out += "&amp;";

        break;
    default:
        // Whitespace does not reset the eol state: only a non-blank
        // character counts as the start of a real line
        if (*chariter == ' ' || *chariter == '\t') {


        atblank = 1;
        } else {
        ateol = 0;
        atblank = 0;

        }
        chariter.appendchartostring(out);
    }
    }
#if 0

	a/src/qtgui/plaintorich.cpp		b/src/qtgui/plaintorich.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.9 2006-01-27 13:42:02 dockes Exp $ (C) 2005 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.10 2006-02-07 09:44:33 dockes Exp $ (C) 2005 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
41		41
42	// Text splitter callback used to take note of the position of query terms	42	// Text splitter callback used to take note of the position of query terms
43	// inside the result text. This is then used to post highlight tags.	43	// inside the result text. This is then used to post highlight tags.
44	class myTextSplitCB : public TextSplitCB {	44	class myTextSplitCB : public TextSplitCB {
45	public:	45	public:
		46	string firstTerm;
46	set<string> terms; // in: user query terms	47	set<string> terms; // in: user query terms
47	list<pair<int, int> > tboffs; // out: begin and end positions of	48	list<pair<int, int> > tboffs; // out: begin and end positions of
48	// query terms in text	49	// query terms in text
49		50
50	myTextSplitCB(const list<string>& its) {	51	myTextSplitCB(const list<string>& its) {
	...		...
60	virtual bool takeword(const std::string& term, int, int bts, int bte) {	61	virtual bool takeword(const std::string& term, int, int bts, int bte) {
61	string dumb;	62	string dumb;
62	Rcl::dumb_string(term, dumb);	63	Rcl::dumb_string(term, dumb);
63	//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),	64	//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
64	// pos, bts, bte));	65	// pos, bts, bte));
65	if (terms.find(dumb) != terms.end())	66	if (terms.find(dumb) != terms.end()) {
66	tboffs.push_back(pair<int, int>(bts, bte));	67	tboffs.push_back(pair<int, int>(bts, bte));
		68	if (firstTerm.empty())
		69	firstTerm = term;
		70	}
67	CancelCheck::instance().checkCancel();	71	CancelCheck::instance().checkCancel();
68	return true;	72	return true;
69	}	73	}
70	};	74	};
71		75
72	// Fix result text for display inside the gui text window.	76	// Fix result text for display inside the gui text window.
73	//	77	//
74	// To compute the term character positions in the output text, we have	78	// To compute the term character positions in the output text, we used
75	// to emulate how qt's textedit counts chars (ignoring tags and	79	// to emulate how qt's textedit counts chars (ignoring tags and
76	// duplicate whitespace etc...). This is tricky business and it might	80	// duplicate whitespace etc...). This was tricky business, dependant
77	// be better to insert the text char by char, taking note of where qt	81	// on qtextedit internals, and we don't do it any more, so we finally
78	// thinks it is at each term.	82	// don't know the term par/car positions in the editor text. Instead,
		83	// we return the first term encountered, and the caller will use the
		84	// editor's find() function to position on it
79	bool plaintorich(const string& in, string& out, const list<string>& terms,	85	bool plaintorich(const string& in, string& out, const list<string>& terms,
80	list<pair<int, int> >&termoffsets)	86	string *firstTerm)
81	{	87	{
82	Chrono chron;	88	Chrono chron;
83	LOGDEB(("plaintorich: terms: %s\n",	89	LOGDEB(("plaintorich: terms: %s\n",
84	stringlistdisp(terms).c_str()));	90	stringlistdisp(terms).c_str()));
85	out.erase();	91	out.erase();
86	termoffsets.erase(termoffsets.begin(), termoffsets.end());
87		92
88	// We first use the text splitter to break the text into words,	93	// We first use the text splitter to break the text into words,
89	// and compare the words to the search terms, which yields the	94	// and compare the words to the search terms, which yields the
90	// query terms positions inside the text	95	// query terms positions inside the text
91	myTextSplitCB cb(terms);	96	myTextSplitCB cb(terms);
92	TextSplit splitter(&cb, true);	97	TextSplit splitter(&cb, true);
93	// Note that splitter returns the term locations in byte, not	98	// Note that splitter returns the term locations in byte, not
94	// character offset	99	// character offset
95	splitter.text_to_words(in);	100	splitter.text_to_words(in);
96		101
		102	if (firstTerm)
		103	*firstTerm = cb.firstTerm;
97	LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));	104	LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));
98		105
99	// Rich text output	106	// Rich text output
100	out = "<qt><head><title></title></head><body><p>";	107	out = "<qt><head><title></title></head><body><p>";
101		108
102	// Iterator for the list of input term positions. We use it to	109	// Iterator for the list of input term positions. We use it to
103	// output highlight tags and to compute term positions in the	110	// output highlight tags and to compute term positions in the
104	// output text	111	// output text
105	list<pair<int, int> >::iterator it = cb.tboffs.begin();	112	list<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
106		113
107	// Storage for the current term _character_ position in output.
108	pair<int, int> otermcpos;
109	// Current char position in output, excluding tags
110	int outcpos=0;
111	// Input character iterator	114	// Input character iterator
112	Utf8Iter chariter(in);	115	Utf8Iter chariter(in);
113	// State variable used to limitate the number of consecutive empty lines	116	// State variable used to limitate the number of consecutive empty lines
114	int ateol = 0;	117	int ateol = 0;
115	// State variable to update the char pos only for the first of	118	// State variable to update the char pos only for the first of
	...		...
118	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {	121	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
119	if (pos && (pos % 1000) == 0) {	122	if (pos && (pos % 1000) == 0) {
120	CancelCheck::instance().checkCancel();	123	CancelCheck::instance().checkCancel();
121	}	124	}
122	// If we still have terms positions, check (byte) position	125	// If we still have terms positions, check (byte) position
123	if (it != cb.tboffs.end()) {	126	if (tPosIt != cb.tboffs.end()) {
124	int ibyteidx = chariter.getBpos();	127	int ibyteidx = chariter.getBpos();
125	if (ibyteidx == it->first) {	128	if (ibyteidx == tPosIt->first) {
126	out += "<termtag>";	129	out += "<termtag>";
127	otermcpos.first = outcpos;
128	} else if (ibyteidx == it->second) {	130	} else if (ibyteidx == tPosIt->second) {
129	if (it != cb.tboffs.end())	131	if (tPosIt != cb.tboffs.end())
130	it++;	132	tPosIt++;
131	otermcpos.second = outcpos;
132	termoffsets.push_back(otermcpos);
133	out += "</termtag>";	133	out += "</termtag>";
134	}	134	}
135	}	135	}
136	switch(*chariter) {	136	switch(*chariter) {
137	case '\n':	137	case '\n':
138	if (ateol < 2) {	138	if (ateol < 2) {
139	out += "<br>\n";	139	out += "<br>\n";
140	ateol++;	140	ateol++;
141	outcpos++;
142	}	141	}
143	break;	142	break;
144	case '\r':	143	case '\r':
145	break;	144	break;
146	case '<':	145	case '<':
147	ateol = 0;	146	ateol = 0;
148	out += "<";	147	out += "<";
149	outcpos++;
150	break;	148	break;
151	case '&':	149	case '&':
152	ateol = 0;	150	ateol = 0;
153	out += "&";	151	out += "&";
154	outcpos++;
155	break;	152	break;
156	default:	153	default:
157	// We don't change the eol status for whitespace, want a real line	154	// We don't change the eol status for whitespace, want a real line
158	if (chariter == ' ' \|\| chariter == '\t') {	155	if (chariter == ' ' \|\| chariter == '\t') {
159	if (!atblank)
160	outcpos++;
161	atblank = 1;	156	atblank = 1;
162	} else {	157	} else {
163	ateol = 0;	158	ateol = 0;
164	atblank = 0;	159	atblank = 0;
165	outcpos++;
166	}	160	}
167	chariter.appendchartostring(out);	161	chariter.appendchartostring(out);
168	}	162	}
169	}	163	}
170	#if 0	164	#if 0