recoll / Code / Diff of /src/qtgui/plaintorich.cpp

Diff of /src/qtgui/plaintorich.cpp [34ab3a] .. [ed3de7]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.9 2006-01-27 13:42:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...


#include <string>
#include <utility>
#include <list>
#include <set>
#ifndef NO_NAMESPACES
using std::list;
using std::pair;
using std::set;
#endif /* NO_NAMESPACES */

#include "rcldb.h"
#include "rclconfig.h"
#include "debuglog.h"
#include "textsplit.h"
#include "utf8iter.h"
#include "transcode.h"
#include "smallut.h"
#include "plaintorich.h"
#include "cancelcheck.h"

// Text splitter callback used to take note of the position of query terms 
// inside the result text. This is then used to post highlight tags. 
class myTextSplitCB : public TextSplitCB {
 public:
    set<string>    terms;          // in: user query terms
    list<pair<int, int> > tboffs;  // out: begin and end positions of
                                   // query terms in text

    myTextSplitCB(const list<string>& its) {
  for (list<string>::const_iterator it = its.begin(); it != its.end();
       it++) {
      string s;
      Rcl::dumb_string(*it, s);
      terms.insert(s);
  }
    }

    // Callback called by the text-to-words breaker for each word
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
    string dumb;
    Rcl::dumb_string(term, dumb);
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
    // pos, bts, bte));
  if (terms.find(dumb) != terms.end()) 


        tboffs.push_back(pair<int, int>(bts, bte));
  CancelCheck::instance().checkCancel();



    return true;
    }
};

// Fix result text for display inside the gui text window.
...
// To compute the term character positions in the output text, we have
// to emulate how qt's textedit counts chars (ignoring tags and
// duplicate whitespace etc...). This is tricky business and it might
// be better to insert the text char by char, taking note of where qt
// thinks it is at each term.
bool plaintorich(const string& in, string& out, const list<string>& terms,
         list<pair<int, int> >&termoffsets)
{
    Chrono chron;
    LOGDEB(("plaintorich: terms: %s\n", 
        stringlistdisp(terms).c_str()));
    out.erase();
    termoffsets.erase(termoffsets.begin(), termoffsets.end());

    // We first use the text splitter to break the text into words,
    // and compare the words to the search terms, which yields the
    // query terms positions inside the text
...
    TextSplit splitter(&cb, true);
    // Note that splitter returns the term locations in byte, not
    // character offset
    splitter.text_to_words(in);

    LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));


    // Rich text output
    out = "<qt><head><title></title></head><body><p>";

    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
    list<pair<int, int> >::iterator it = cb.tboffs.begin();
...
    int ateol = 0;
    // State variable to update the char pos only for the first of
    // consecutive blank chars
    int atblank = 0;
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
  if (pos && (pos % 1000) == 0) {
      CancelCheck::instance().checkCancel();
  }
    // If we still have terms positions, check (byte) position
    if (it != cb.tboffs.end()) {
        int ibyteidx = chariter.getBpos();
        if (ibyteidx == it->first) {
        out += "<termtag>";
        otermcpos.first = outcpos;
...
        out += "&amp;";
        outcpos++;
        break;
    default:
        // We don't change the eol status for whitespace, want a real line
        if (*chariter == ' ' || *chariter == '\t') {
        if (!atblank)
            outcpos++;
        atblank = 1;
        } else {
        ateol = 0;
...
    FILE *fp = fopen("/tmp/debugplaintorich", "w");
    fprintf(fp, "%s\n", out.c_str());
    fclose(fp);
    }
#endif
    LOGDEB(("plaintorich: done %d mS\n", chron.millis()));
    return true;
}

	a/src/qtgui/plaintorich.cpp		b/src/qtgui/plaintorich.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.8 2006-01-23 13:32:05 dockes Exp $ (C) 2005 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.9 2006-01-27 13:42:02 dockes Exp $ (C) 2005 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
20		20
21		21
22	#include <string>	22	#include <string>
23	#include <utility>	23	#include <utility>
24	#include <list>	24	#include <list>
		25	#include <set>
25	#ifndef NO_NAMESPACES	26	#ifndef NO_NAMESPACES
26	using std::list;	27	using std::list;
27	using std::pair;	28	using std::pair;
		29	using std::set;
28	#endif /* NO_NAMESPACES */	30	#endif /* NO_NAMESPACES */
29		31
30	#include "rcldb.h"	32	#include "rcldb.h"
31	#include "rclconfig.h"	33	#include "rclconfig.h"
32	#include "debuglog.h"	34	#include "debuglog.h"
33	#include "textsplit.h"	35	#include "textsplit.h"
34	#include "utf8iter.h"	36	#include "utf8iter.h"
35	#include "transcode.h"	37	#include "transcode.h"
36	#include "smallut.h"	38	#include "smallut.h"
		39	#include "plaintorich.h"
		40	#include "cancelcheck.h"
37		41
38	// Text splitter callback used to take note of the position of query terms	42	// Text splitter callback used to take note of the position of query terms
39	// inside the result text. This is then used to post highlight tags.	43	// inside the result text. This is then used to post highlight tags.
40	class myTextSplitCB : public TextSplitCB {	44	class myTextSplitCB : public TextSplitCB {
41	public:	45	public:
42	const list<string> *terms; // in: query terms	46	set<string> terms; // in: user query terms
43	list<pair<int, int> > tboffs; // out: begin and end positions of	47	list<pair<int, int> > tboffs; // out: begin and end positions of
44	// query terms in text	48	// query terms in text
45		49
46	myTextSplitCB(const list<string>& terms)	50	myTextSplitCB(const list<string>& its) {
47	: terms(&terms) {	51	for (list<string>::const_iterator it = its.begin(); it != its.end();
		52	it++) {
		53	string s;
		54	Rcl::dumb_string(*it, s);
		55	terms.insert(s);
		56	}
48	}	57	}
49		58
50	// Callback called by the text-to-words breaker for each word	59	// Callback called by the text-to-words breaker for each word
51	virtual bool takeword(const std::string& term, int, int bts, int bte) {	60	virtual bool takeword(const std::string& term, int, int bts, int bte) {
52	string dumb;	61	string dumb;
53	Rcl::dumb_string(term, dumb);	62	Rcl::dumb_string(term, dumb);
54	//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),	63	//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
55	// pos, bts, bte));	64	// pos, bts, bte));
56	for (list<string>::const_iterator it = terms->begin();	65	if (terms.find(dumb) != terms.end())
57	it != terms->end(); it++) {
58	if (!stringlowercmp(*it, dumb)) {
59	tboffs.push_back(pair<int, int>(bts, bte));	66	tboffs.push_back(pair<int, int>(bts, bte));
60	break;	67	CancelCheck::instance().checkCancel();
61	}
62	}
63
64	return true;	68	return true;
65	}	69	}
66	};	70	};
67		71
68	// Fix result text for display inside the gui text window.	72	// Fix result text for display inside the gui text window.
	...		...
70	// To compute the term character positions in the output text, we have	74	// To compute the term character positions in the output text, we have
71	// to emulate how qt's textedit counts chars (ignoring tags and	75	// to emulate how qt's textedit counts chars (ignoring tags and
72	// duplicate whitespace etc...). This is tricky business and it might	76	// duplicate whitespace etc...). This is tricky business and it might
73	// be better to insert the text char by char, taking note of where qt	77	// be better to insert the text char by char, taking note of where qt
74	// thinks it is at each term.	78	// thinks it is at each term.
75	string plaintorich(const string &in, const list<string>& terms,	79	bool plaintorich(const string& in, string& out, const list<string>& terms,
76	list<pair<int, int> >&termoffsets)	80	list<pair<int, int> >&termoffsets)
77	{	81	{
		82	Chrono chron;
78	LOGDEB(("plaintorich: terms: %s\n",	83	LOGDEB(("plaintorich: terms: %s\n",
79	stringlistdisp(terms).c_str()));	84	stringlistdisp(terms).c_str()));
80		85	out.erase();
81	termoffsets.erase(termoffsets.begin(), termoffsets.end());	86	termoffsets.erase(termoffsets.begin(), termoffsets.end());
82		87
83	// We first use the text splitter to break the text into words,	88	// We first use the text splitter to break the text into words,
84	// and compare the words to the search terms, which yields the	89	// and compare the words to the search terms, which yields the
85	// query terms positions inside the text	90	// query terms positions inside the text
	...		...
87	TextSplit splitter(&cb, true);	92	TextSplit splitter(&cb, true);
88	// Note that splitter returns the term locations in byte, not	93	// Note that splitter returns the term locations in byte, not
89	// character offset	94	// character offset
90	splitter.text_to_words(in);	95	splitter.text_to_words(in);
91		96
92	LOGDEB(("Split done\n"));	97	LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));
93
94		98
95	// Rich text output	99	// Rich text output
96	string out = "<qt><head><title></title></head><body><p>";	100	out = "<qt><head><title></title></head><body><p>";
97		101
98	// Iterator for the list of input term positions. We use it to	102	// Iterator for the list of input term positions. We use it to
99	// output highlight tags and to compute term positions in the	103	// output highlight tags and to compute term positions in the
100	// output text	104	// output text
101	list<pair<int, int> >::iterator it = cb.tboffs.begin();	105	list<pair<int, int> >::iterator it = cb.tboffs.begin();
	...		...
110	int ateol = 0;	114	int ateol = 0;
111	// State variable to update the char pos only for the first of	115	// State variable to update the char pos only for the first of
112	// consecutive blank chars	116	// consecutive blank chars
113	int atblank = 0;	117	int atblank = 0;
114	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {	118	for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
		119	if (pos && (pos % 1000) == 0) {
		120	CancelCheck::instance().checkCancel();
		121	}
115	// If we still have terms, check (byte) position	122	// If we still have terms positions, check (byte) position
116	if (it != cb.tboffs.end()) {	123	if (it != cb.tboffs.end()) {
117	int ibyteidx = chariter.getBpos();	124	int ibyteidx = chariter.getBpos();
118	if (ibyteidx == it->first) {	125	if (ibyteidx == it->first) {
119	out += "<termtag>";	126	out += "<termtag>";
120	otermcpos.first = outcpos;	127	otermcpos.first = outcpos;
	...		...
146	out += "&";	153	out += "&";
147	outcpos++;	154	outcpos++;
148	break;	155	break;
149	default:	156	default:
150	// We don't change the eol status for whitespace, want a real line	157	// We don't change the eol status for whitespace, want a real line
151	if (chariter == ' ' \|\| chariter == ' ') {	158	if (chariter == ' ' \|\| chariter == '\t') {
152	if (!atblank)	159	if (!atblank)
153	outcpos++;	160	outcpos++;
154	atblank = 1;	161	atblank = 1;
155	} else {	162	} else {
156	ateol = 0;	163	ateol = 0;
	...		...
165	FILE *fp = fopen("/tmp/debugplaintorich", "w");	172	FILE *fp = fopen("/tmp/debugplaintorich", "w");
166	fprintf(fp, "%s\n", out.c_str());	173	fprintf(fp, "%s\n", out.c_str());
167	fclose(fp);	174	fclose(fp);
168	}	175	}
169	#endif	176	#endif
		177	LOGDEB(("plaintorich: done %d mS\n", chron.millis()));
170	return out;	178	return true;
171	}	179	}