Switch to unified view

a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.9 2006-01-27 13:42:02 dockes Exp $ (C) 2005 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.10 2006-02-07 09:44:33 dockes Exp $ (C) 2005 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
41
41
42
// Text splitter callback used to take note of the position of query terms 
42
// Text splitter callback used to take note of the position of query terms 
43
// inside the result text. This is then used to post highlight tags. 
43
// inside the result text. This is then used to post highlight tags. 
44
class myTextSplitCB : public TextSplitCB {
44
class myTextSplitCB : public TextSplitCB {
45
 public:
45
 public:
46
    string firstTerm;
46
    set<string>    terms;          // in: user query terms
47
    set<string>    terms;          // in: user query terms
47
    list<pair<int, int> > tboffs;  // out: begin and end positions of
48
    list<pair<int, int> > tboffs;  // out: begin and end positions of
48
                                   // query terms in text
49
                                   // query terms in text
49
50
50
    myTextSplitCB(const list<string>& its) {
51
    myTextSplitCB(const list<string>& its) {
...
...
60
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
61
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
61
    string dumb;
62
    string dumb;
62
    Rcl::dumb_string(term, dumb);
63
    Rcl::dumb_string(term, dumb);
63
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
64
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
64
    // pos, bts, bte));
65
    // pos, bts, bte));
65
    if (terms.find(dumb) != terms.end()) 
66
    if (terms.find(dumb) != terms.end()) {
66
        tboffs.push_back(pair<int, int>(bts, bte));
67
        tboffs.push_back(pair<int, int>(bts, bte));
68
      if (firstTerm.empty())
69
      firstTerm = term;
70
  }
67
    CancelCheck::instance().checkCancel();
71
    CancelCheck::instance().checkCancel();
68
    return true;
72
    return true;
69
    }
73
    }
70
};
74
};
71
75
72
// Fix result text for display inside the gui text window.
76
// Fix result text for display inside the gui text window.
73
//
77
//
74
// To compute the term character positions in the output text, we have
78
// To compute the term character positions in the output text, we used
75
// to emulate how qt's textedit counts chars (ignoring tags and
79
// to emulate how qt's textedit counts chars (ignoring tags and
76
// duplicate whitespace etc...). This is tricky business and it might
80
// duplicate whitespace etc...). This was tricky business, dependent
77
// be better to insert the text char by char, taking note of where qt
81
// on qtextedit internals, and we don't do it any more, so we finally
78
// thinks it is at each term.
82
// don't know the term paragraph/character positions in the editor text.  Instead,
83
// we return the first term encountered, and the caller will use the
84
// editor's find() function to position on it
79
bool plaintorich(const string& in, string& out, const list<string>& terms,
85
bool plaintorich(const string& in, string& out, const list<string>& terms,
80
       list<pair<int, int> >&termoffsets)
86
       string *firstTerm)
81
{
87
{
82
    Chrono chron;
88
    Chrono chron;
83
    LOGDEB(("plaintorich: terms: %s\n", 
89
    LOGDEB(("plaintorich: terms: %s\n", 
84
        stringlistdisp(terms).c_str()));
90
        stringlistdisp(terms).c_str()));
85
    out.erase();
91
    out.erase();
86
    termoffsets.erase(termoffsets.begin(), termoffsets.end());
87
92
88
    // We first use the text splitter to break the text into words,
93
    // We first use the text splitter to break the text into words,
89
    // and compare the words to the search terms, which yields the
94
    // and compare the words to the search terms, which yields the
90
    // query terms positions inside the text
95
    // query terms positions inside the text
91
    myTextSplitCB cb(terms);
96
    myTextSplitCB cb(terms);
92
    TextSplit splitter(&cb, true);
97
    TextSplit splitter(&cb, true);
93
    // Note that splitter returns the term locations in byte, not
98
    // Note that splitter returns the term locations in byte, not
94
    // character offset
99
    // character offset
95
    splitter.text_to_words(in);
100
    splitter.text_to_words(in);
96
101
102
    if (firstTerm)
103
  *firstTerm = cb.firstTerm;
97
    LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));
104
    LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));
98
105
99
    // Rich text output
106
    // Rich text output
100
    out = "<qt><head><title></title></head><body><p>";
107
    out = "<qt><head><title></title></head><body><p>";
101
108
102
    // Iterator for the list of input term positions. We use it to
109
    // Iterator for the list of input term positions. We use it to
103
    // output highlight tags and to compute term positions in the
110
    // output highlight tags and to compute term positions in the
104
    // output text
111
    // output text
105
    list<pair<int, int> >::iterator it = cb.tboffs.begin();
112
    list<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
106
113
107
    // Storage for the current term _character_ position in output.
108
    pair<int, int> otermcpos;
109
    // Current char position in output, excluding tags
110
    int outcpos=0; 
111
    // Input character iterator
114
    // Input character iterator
112
    Utf8Iter chariter(in);
115
    Utf8Iter chariter(in);
113
    // State variable used to limit the number of consecutive empty lines
116
    // State variable used to limit the number of consecutive empty lines
114
    int ateol = 0;
117
    int ateol = 0;
115
    // State variable to update the char pos only for the first of
118
    // State variable to update the char pos only for the first of
...
...
118
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
121
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
119
    if (pos && (pos % 1000) == 0) {
122
    if (pos && (pos % 1000) == 0) {
120
        CancelCheck::instance().checkCancel();
123
        CancelCheck::instance().checkCancel();
121
    }
124
    }
122
    // If we still have terms positions, check (byte) position
125
    // If we still have terms positions, check (byte) position
123
    if (it != cb.tboffs.end()) {
126
    if (tPosIt != cb.tboffs.end()) {
124
        int ibyteidx = chariter.getBpos();
127
        int ibyteidx = chariter.getBpos();
125
        if (ibyteidx == it->first) {
128
        if (ibyteidx == tPosIt->first) {
126
        out += "<termtag>";
129
        out += "<termtag>";
127
      otermcpos.first = outcpos;
128
        } else if (ibyteidx == it->second) {
130
        } else if (ibyteidx == tPosIt->second) {
129
        if (it != cb.tboffs.end())
131
        if (tPosIt != cb.tboffs.end())
130
            it++;
132
            tPosIt++;
131
      otermcpos.second = outcpos;
132
      termoffsets.push_back(otermcpos);
133
        out += "</termtag>";
133
        out += "</termtag>";
134
        }
134
        }
135
    }
135
    }
136
    switch(*chariter) {
136
    switch(*chariter) {
137
    case '\n':
137
    case '\n':
138
        if (ateol < 2) {
138
        if (ateol < 2) {
139
        out += "<br>\n";
139
        out += "<br>\n";
140
        ateol++;
140
        ateol++;
141
      outcpos++;
142
        }
141
        }
143
        break;
142
        break;
144
    case '\r': 
143
    case '\r': 
145
        break;
144
        break;
146
    case '<':
145
    case '<':
147
        ateol = 0;
146
        ateol = 0;
148
        out += "&lt;";
147
        out += "&lt;";
149
      outcpos++;
150
        break;
148
        break;
151
    case '&':
149
    case '&':
152
        ateol = 0;
150
        ateol = 0;
153
        out += "&amp;";
151
        out += "&amp;";
154
      outcpos++;
155
        break;
152
        break;
156
    default:
153
    default:
157
        // We don't change the eol status for whitespace, want a real line
154
        // We don't change the eol status for whitespace, want a real line
158
        if (*chariter == ' ' || *chariter == '\t') {
155
        if (*chariter == ' ' || *chariter == '\t') {
159
      if (!atblank)
160
          outcpos++;
161
        atblank = 1;
156
        atblank = 1;
162
        } else {
157
        } else {
163
        ateol = 0;
158
        ateol = 0;
164
        atblank = 0;
159
        atblank = 0;
165
      outcpos++;
166
        }
160
        }
167
        chariter.appendchartostring(out);
161
        chariter.appendchartostring(out);
168
    }
162
    }
169
    }
163
    }
170
#if 0
164
#if 0