Switch to unified view

a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.30 2007-11-15 18:05:32 dockes Exp $ (C) 2005 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.31 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
41
#include "utf8iter.h"
41
#include "utf8iter.h"
42
#include "smallut.h"
42
#include "smallut.h"
43
#include "plaintorich.h"
43
#include "plaintorich.h"
44
#include "cancelcheck.h"
44
#include "cancelcheck.h"
45
45
46
const string PlainToRich::snull = "";
47
46
// For debug printing
48
// For debug printing
47
static string vecStringToString(const vector<string>& t)
49
static string vecStringToString(const vector<string>& t)
48
{
50
{
49
    string sterms;
51
    string sterms;
50
    for (vector<string>::const_iterator it = t.begin(); it != t.end(); it++) {
52
    for (vector<string>::const_iterator it = t.begin(); it != t.end(); it++) {
...
...
56
// Text splitter callback used to take note of the position of query terms 
58
// Text splitter callback used to take note of the position of query terms 
57
// inside the result text. This is then used to insert highlight tags. 
59
// inside the result text. This is then used to insert highlight tags. 
58
class myTextSplitCB : public TextSplitCB {
60
class myTextSplitCB : public TextSplitCB {
59
 public:
61
 public:
60
62
61
    // Out: first query term found in text
62
    string firstTerm;
63
    int    firstTermOcc;
64
    int m_firstTermPos;
65
    int m_firstTermBPos;
66
67
    // Out: begin and end byte positions of query terms/groups in text
63
    // Out: begin and end byte positions of query terms/groups in text
68
    vector<pair<int, int> > tboffs;  
64
    vector<pair<int, int> > tboffs;  
69
65
70
    myTextSplitCB(const vector<string>& its, 
66
    myTextSplitCB(const vector<string>& its, 
71
          const vector<vector<string> >&groups, 
67
          const vector<vector<string> >&groups, 
72
          const vector<int>& slacks) 
68
          const vector<int>& slacks) 
73
    :  firstTermOcc(1), m_wcount(0), m_groups(groups), m_slacks(slacks)
69
    :  m_wcount(0), m_groups(groups), m_slacks(slacks)
74
    {
70
    {
75
    for (vector<string>::const_iterator it = its.begin(); 
71
    for (vector<string>::const_iterator it = its.begin(); 
76
         it != its.end(); it++) {
72
         it != its.end(); it++) {
77
        m_terms.insert(*it);
73
        m_terms.insert(*it);
78
    }
74
    }
...
...
93
    // pos, bts, bte));
89
    // pos, bts, bte));
94
90
95
    // If this word is a search term, remember its byte-offset span. 
91
    // If this word is a search term, remember its byte-offset span. 
96
    if (m_terms.find(dumb) != m_terms.end()) {
92
    if (m_terms.find(dumb) != m_terms.end()) {
97
        tboffs.push_back(pair<int, int>(bts, bte));
93
        tboffs.push_back(pair<int, int>(bts, bte));
98
      if (firstTerm.empty()) {
99
      firstTerm = term;
100
      m_firstTermPos = pos;
101
      m_firstTermBPos = bts;
102
      }
103
    }
94
    }
104
    
95
    
105
    if (m_gterms.find(dumb) != m_gterms.end()) {
96
    if (m_gterms.find(dumb) != m_gterms.end()) {
106
        // Term group (phrase/near) handling
97
        // Term group (phrase/near) handling
107
        m_plists[dumb].push_back(pos);
98
        m_plists[dumb].push_back(pos);
...
...
146
};
137
};
147
138
148
#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
139
#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
149
    if ((POS) > (STO)) (STO) = (POS);}
140
    if ((POS) > (STO)) (STO) = (POS);}
150
141
151
// Recursively check that each term is inside the window (which is readjusted
142
// Recursively check that each term is inside the window (which is
152
// as the successive terms are found)
143
// readjusted as the successive terms are found). i is the index for
144
// the next position list to use (initially 1)
153
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
145
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
154
            unsigned int i, int min, int max, int *sp, int *ep)
146
                unsigned int i, int min, int max, 
147
                int *sp, int *ep)
155
{
148
{
156
    int tmp = max + 1;
149
    int tmp = max + 1;
157
    // take care to avoid underflow
150
    // take care to avoid underflow
158
    if (window <= tmp) 
151
    if (window <= tmp) 
159
    tmp -= window; 
152
    tmp -= window; 
...
...
208
    // stem-expanded: we don't know which matched)
201
    // stem-expanded: we don't know which matched)
209
    for (vector<string>::const_iterator it = terms.begin(); 
202
    for (vector<string>::const_iterator it = terms.begin(); 
210
     it != terms.end(); it++) {
203
     it != terms.end(); it++) {
211
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
204
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
212
    if (pl == m_plists.end()) {
205
    if (pl == m_plists.end()) {
213
        LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
206
        LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
214
            (*it).c_str()));
207
            (*it).c_str()));
215
        continue;
208
        continue;
216
    }
209
    }
217
    plists.push_back(&(pl->second));
210
    plists.push_back(&(pl->second));
218
    plistToTerm[&(pl->second)] = *it;
211
    plistToTerm[&(pl->second)] = *it;
219
    realgroup.push_back(*it);
212
    realgroup.push_back(*it);
220
    }
213
    }
221
    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
214
    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", 
222
         vecStringToString(realgroup).c_str()));
215
         window, vecStringToString(realgroup).c_str()));
223
    if (plists.size() < 2)
216
    if (plists.size() < 2) {
217
  LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
224
    return false;
218
    return false;
219
    }
225
    // Sort the positions lists so that the shorter is first
220
    // Sort the positions lists so that the shorter is first
226
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
221
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
227
222
223
    { // Debug
224
  map<vector<int>*, string>::iterator it;
225
  it =  plistToTerm.find(plists[0]);
226
  if (it == plistToTerm.end()) {
227
      // SuperWeird
228
      LOGERR(("matchGroup: term for first list not found !?!\n"));
229
      return false;
230
  }
231
  LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
232
      it->second.c_str(), plists[0]->size()));
233
    }
234
228
    // Walk the shortest plist and look for matches
235
    // Walk the shortest plist and look for matches
229
    int sta = int(10E9), sto = 0;
230
    int pos;
231
    // Occurrences are from 1->N
232
    firstTermOcc = 0;
233
    vector<int>::iterator it = plists[0]->begin();
236
    for (vector<int>::iterator it = plists[0]->begin(); 
234
    do {
235
    if (it == plists[0]->end())
237
     it != plists[0]->end(); it++) {
236
      return false;
237
    pos = *it++;
238
    int pos = *it;
238
  firstTermOcc++;
239
  int sta = int(10E9), sto = 0;
240
  LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
239
    } while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
241
  if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
242
      LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", 
243
           sta, sto)); 
244
      // Maybe extend the window by 1st term position, this was not
245
      // done by do_prox..
240
    SETMINMAX(pos, sta, sto);
246
      SETMINMAX(pos, sta, sto);
241
242
    LOGDEB0(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto)); 
243
244
    // Translate the position window into a byte offset window
247
      // Translate the position window into a byte offset window
245
    int bs = 0;
248
      int bs = 0;
246
    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
249
      map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
247
    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
250
      map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
248
    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
251
      if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
249
    LOGDEB1(("myTextSplitCB::matchGroup: pushing %d %d\n",
252
     LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
250
         i1->second.first, i2->second.second));
253
          i1->second.first, i2->second.second));
251
    tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
254
     tboffs.push_back(pair<int, int>(i1->second.first, 
255
                      i2->second.second));
252
    bs = i1->second.first;
256
     bs = i1->second.first;
253
    } else {
257
      } else {
254
    LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", 
258
     LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
255
      sta, sto));
256
    }
259
      }
257
260
  }
258
    if (firstTerm.empty() || m_firstTermPos > sta) {
259
  // firstTerm is used to try and position the preview window over
260
  // the match. As it's difficult to divine byte/word positions
261
  // in qtextedit, we use a string search. Use the
262
  // shortest plist for this, which hopefully gives a better
263
  // chance for the group to be found (it's hopeless to try and
264
  // match the whole group)
265
  map<vector<int>*, string>::iterator it = 
266
      plistToTerm.find(plists.front());
267
  if (it != plistToTerm.end())
268
      firstTerm = it->second;
269
  LOGDEB0(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
270
       firstTerm.c_str(), firstTermOcc));
271
  m_firstTermPos = sta;
272
  m_firstTermBPos = bs;
273
    }
261
    }
274
262
275
    return true;
263
    return true;
276
}
264
}
277
265
...
...
296
284
297
    // Sort by start and end offsets. The merging of overlapping entries
285
    // Sort by start and end offsets. The merging of overlapping entries
298
    // will be handled during output.
286
    // will be handled during output.
299
    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
287
    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
300
    return true;
288
    return true;
301
}
302
303
// Setting searchable beacons in the text to walk the term list.
304
static const char *termAnchorNameBase = "TRM";

// Build the name of the numbered anchor for term match number i: the
// base prefix followed by the decimal value of i (e.g. "TRM3").
//
// The number is formatted on its own into a buffer sized for any int,
// then appended to the prefix. The previous code sized its buffer with
// sizeof(termAnchorNameBase), which is the size of a pointer and not
// the length of the prefix string, and used an unbounded sprintf().
std::string termAnchorName(int i)
{
    // Large enough for the sign, all the digits of any int, and the
    // terminating NUL.
    char num[32];
    snprintf(num, sizeof(num), "%d", i);
    return std::string(termAnchorNameBase) + num;
}
311
312
// Return the opening html anchor tag (<a name="...">) used as a
// searchable beacon for term match number i. The anchor name comes
// from termAnchorName(); only the opening tag is produced here.
static string termBeacon(int i)
{
    string beacon("<a name=\"");
    beacon += termAnchorName(i);
    beacon += "\">";
    return beacon;
}
289
}
316
290
317
291
318
// Fix result text for display inside the gui text window.
292
// Fix result text for display inside the gui text window.
319
//
293
//
...
...
323
// on qtextedit internals, and we don't do it any more, so we finally
297
// on qtextedit internals, and we don't do it any more, so we finally
324
// don't know the term par/car positions in the editor text.  
298
// don't know the term par/car positions in the editor text.  
325
// Instead, we mark the search term positions either with html anchor
299
// Instead, we mark the search term positions either with html anchor
326
// (qt currently has problems with them), or a special string, and the
300
// (qt currently has problems with them), or a special string, and the
327
// caller will use the editor's find() function to position on it
301
// caller will use the editor's find() function to position on it
328
bool plaintorich(const string& in, list<string>& out, 
302
bool PlainToRich::plaintorich(const string& in, list<string>& out, 
329
         const HiliteData& hdata,
303
                const HiliteData& hdata,
330
       bool noHeader, int *lastAnchor, int chunksize)
304
                int chunksize)
331
{
305
{
332
    Chrono chron;
306
    Chrono chron;
333
    const vector<string>& terms(hdata.terms);
307
    const vector<string>& terms(hdata.terms);
334
    const vector<vector<string> >& groups(hdata.groups);
308
    const vector<vector<string> >& groups(hdata.groups);
335
    const vector<int>& slacks(hdata.gslks);
309
    const vector<int>& slacks(hdata.gslks);
...
...
340
    LOGDEB0(("  %s\n", sterms.c_str()));
314
    LOGDEB0(("  %s\n", sterms.c_str()));
341
    sterms = "\n";
315
    sterms = "\n";
342
    LOGDEB0(("plaintorich: groups: \n"));
316
    LOGDEB0(("plaintorich: groups: \n"));
343
    for (vector<vector<string> >::const_iterator vit = groups.begin(); 
317
    for (vector<vector<string> >::const_iterator vit = groups.begin(); 
344
         vit != groups.end(); vit++) {
318
         vit != groups.end(); vit++) {
319
      sterms += "GROUP: ";
345
        sterms += vecStringToString(*vit);
320
        sterms += vecStringToString(*vit);
346
        sterms += "\n";
321
        sterms += "\n";
347
    }
322
    }
348
    LOGDEB0(("  %s", sterms.c_str()));
323
    LOGDEB0(("  %s", sterms.c_str()));
349
    }
324
    }
...
...
360
335
361
    cb.matchGroups();
336
    cb.matchGroups();
362
337
363
    out.clear();
338
    out.clear();
364
    out.push_back("");
339
    out.push_back("");
365
    list<string>::iterator sit = out.begin();
340
    list<string>::iterator olit = out.begin();
366
341
367
    // Rich text output
342
    // Rich text output
368
    if (noHeader)
343
    *olit = header();
369
  *sit = "";
370
    else 
371
  *sit = "<qt><head><title></title></head><body><p>";
372
344
373
    // Iterator for the list of input term positions. We use it to
345
    // Iterator for the list of input term positions. We use it to
374
    // output highlight tags and to compute term positions in the
346
    // output highlight tags and to compute term positions in the
375
    // output text
347
    // output text
376
    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
348
    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
...
...
386
    // Input character iterator
358
    // Input character iterator
387
    Utf8Iter chariter(in);
359
    Utf8Iter chariter(in);
388
    // State variable used to limit the number of consecutive empty lines
360
    // State variable used to limit the number of consecutive empty lines
389
    int ateol = 0;
361
    int ateol = 0;
390
362
391
    // Stuff for numbered anchors at each term match
363
    // Value for numbered anchors at each term match
392
    int anchoridx = 1;
364
    int anchoridx = 1;
393
365
394
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
366
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
367
  // Check from time to time if we need to stop
395
    if ((pos & 0xfff) == 0) {
368
    if ((pos & 0xfff) == 0) {
396
        CancelCheck::instance().checkCancel();
369
        CancelCheck::instance().checkCancel();
397
    }
370
    }
398
371
399
    // If we still have terms positions, check (byte) position. If
372
    // If we still have terms positions, check (byte) position. If
400
    // we are at or after a term match, mark.
373
    // we are at or after a term match, mark.
401
    if (tPosIt != tboffsend) {
374
    if (tPosIt != tboffsend) {
402
        int ibyteidx = chariter.getBpos();
375
        int ibyteidx = chariter.getBpos();
403
        if (ibyteidx == tPosIt->first) {
376
        if (ibyteidx == tPosIt->first) {
404
      if (lastAnchor)
377
      *olit += startAnchor(anchoridx++);
405
          *sit += termBeacon(anchoridx++);
378
      *olit += startMatch();
406
      *sit += "<termtag>";
407
        } else if (ibyteidx == tPosIt->second) {
379
        } else if (ibyteidx == tPosIt->second) {
408
        // Output end tag, then skip all highlight areas that
380
        // Output end tag, then skip all highlight areas that
409
        // would overlap this one
381
        // would overlap this one
410
      *sit += "</termtag>";
382
      *olit += endMatch();
383
      *olit += endAnchor();
411
        int crend = tPosIt->second;
384
        int crend = tPosIt->second;
412
        while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
385
        while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
413
            tPosIt++;
386
            tPosIt++;
414
      // Maybe end chunk
387
      // Maybe end this chunk, begin next
415
        if (sit->size() > (unsigned int)chunksize) {
388
        if (olit->size() > (unsigned int)chunksize) {
416
            out.push_back("");
389
            out.push_back("");
417
            sit++;
390
            olit++;
418
        }
391
        }
419
        }
392
        }
420
    }
393
    }
421
394
422
    switch(*chariter) {
395
    switch(*chariter) {
423
    case '\n':
396
    case '\n':
424
        if (ateol < 2) {
397
        if (ateol < 2) {
425
        *sit += "<br>\n";
398
        *olit += "<br>\n";
426
        ateol++;
399
        ateol++;
427
        }
400
        }
428
        break;
401
        break;
429
    case '\r': 
402
    case '\r': 
430
        break;
403
        break;
431
  case '\007': // used as anchor char, strip other instances
432
      break;
433
    case '<':
404
    case '<':
434
        ateol = 0;
405
        ateol = 0;
435
        *sit += "&lt;";
406
        *olit += "&lt;";
436
        break;
407
        break;
437
    case '&':
408
    case '&':
438
        ateol = 0;
409
        ateol = 0;
439
        *sit += "&amp;";
410
        *olit += "&amp;";
440
        break;
411
        break;
441
    default:
412
    default:
442
        // We don't change the eol status for whitespace, want a real line
413
        // We don't change the eol status for whitespace, want a real line
443
        if (!(*chariter == ' ' || *chariter == '\t')) {
414
        if (!(*chariter == ' ' || *chariter == '\t')) {
444
        ateol = 0;
415
        ateol = 0;
445
        }
416
        }
446
        chariter.appendchartostring(*sit);
417
        chariter.appendchartostring(*olit);
447
    }
418
    }
448
    }
419
    }
449
    if (lastAnchor)
450
  *lastAnchor = anchoridx - 1;
451
#if 0
420
#if 1
452
    {
421
    {
453
    FILE *fp = fopen("/tmp/debugplaintorich", "a");
422
    FILE *fp = fopen("/tmp/debugplaintorich", "a");
454
    fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
423
    fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
455
    for (list<string>::iterator it = out.begin();
424
    for (list<string>::iterator it = out.begin();
456
         it != out.end(); it++) {
425
         it != out.end(); it++) {