Switch to unified view

a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.8 2006-01-23 13:32:05 dockes Exp $ (C) 2005 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.9 2006-01-27 13:42:02 dockes Exp $ (C) 2005 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
20
20
21
21
22
#include <string>
22
#include <string>
23
#include <utility>
23
#include <utility>
24
#include <list>
24
#include <list>
25
#include <set>
25
#ifndef NO_NAMESPACES
26
#ifndef NO_NAMESPACES
26
using std::list;
27
using std::list;
27
using std::pair;
28
using std::pair;
29
using std::set;
28
#endif /* NO_NAMESPACES */
30
#endif /* NO_NAMESPACES */
29
31
30
#include "rcldb.h"
32
#include "rcldb.h"
31
#include "rclconfig.h"
33
#include "rclconfig.h"
32
#include "debuglog.h"
34
#include "debuglog.h"
33
#include "textsplit.h"
35
#include "textsplit.h"
34
#include "utf8iter.h"
36
#include "utf8iter.h"
35
#include "transcode.h"
37
#include "transcode.h"
36
#include "smallut.h"
38
#include "smallut.h"
39
#include "plaintorich.h"
40
#include "cancelcheck.h"
37
41
38
// Text splitter callback used to take note of the position of query terms 
42
// Text splitter callback used to take note of the position of query terms 
39
// inside the result text. This is then used to post highlight tags. 
43
// inside the result text. This is then used to post highlight tags. 
40
class myTextSplitCB : public TextSplitCB {
44
class myTextSplitCB : public TextSplitCB {
41
 public:
45
 public:
42
    const list<string>    *terms;  // in: query terms
46
    set<string>    terms;          // in: user query terms
43
    list<pair<int, int> > tboffs;  // out: begin and end positions of
47
    list<pair<int, int> > tboffs;  // out: begin and end positions of
44
                                   // query terms in text
48
                                   // query terms in text
45
49
46
    myTextSplitCB(const list<string>& terms) 
50
    myTextSplitCB(const list<string>& its) {
47
  : terms(&terms) {
51
  for (list<string>::const_iterator it = its.begin(); it != its.end();
52
       it++) {
53
      string s;
54
      Rcl::dumb_string(*it, s);
55
      terms.insert(s);
56
  }
48
    }
57
    }
49
58
50
    // Callback called by the text-to-words breaker for each word
59
    // Callback called by the text-to-words breaker for each word
51
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
60
    virtual bool takeword(const std::string& term, int, int bts, int bte) {
52
    string dumb;
61
    string dumb;
53
    Rcl::dumb_string(term, dumb);
62
    Rcl::dumb_string(term, dumb);
54
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
63
    //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
55
    // pos, bts, bte));
64
    // pos, bts, bte));
56
  for (list<string>::const_iterator it = terms->begin(); 
65
  if (terms.find(dumb) != terms.end()) 
57
       it != terms->end(); it++) {
58
      if (!stringlowercmp(*it, dumb)) {
59
      tboffs.push_back(pair<int, int>(bts, bte));
66
        tboffs.push_back(pair<int, int>(bts, bte));
60
      break;
67
  CancelCheck::instance().checkCancel();
61
      }
62
  }
63
       
64
    return true;
68
    return true;
65
    }
69
    }
66
};
70
};
67
71
68
// Fix result text for display inside the gui text window.
72
// Fix result text for display inside the gui text window.
...
...
70
// To compute the term character positions in the output text, we have
74
// To compute the term character positions in the output text, we have
71
// to emulate how qt's textedit counts chars (ignoring tags and
75
// to emulate how qt's textedit counts chars (ignoring tags and
72
// duplicate whitespace etc...). This is tricky business and it might
76
// duplicate whitespace etc...). This is tricky business and it might
73
// be better to insert the text char by char, taking note of where qt
77
// be better to insert the text char by char, taking note of where qt
74
// thinks it is at each term.
78
// thinks it is at each term.
75
string plaintorich(const string &in,  const list<string>& terms,
79
bool plaintorich(const string& in, string& out, const list<string>& terms,
76
           list<pair<int, int> >&termoffsets)
80
         list<pair<int, int> >&termoffsets)
77
{
81
{
82
    Chrono chron;
78
    LOGDEB(("plaintorich: terms: %s\n", 
83
    LOGDEB(("plaintorich: terms: %s\n", 
79
        stringlistdisp(terms).c_str()));
84
        stringlistdisp(terms).c_str()));
80
85
    out.erase();
81
    termoffsets.erase(termoffsets.begin(), termoffsets.end());
86
    termoffsets.erase(termoffsets.begin(), termoffsets.end());
82
87
83
    // We first use the text splitter to break the text into words,
88
    // We first use the text splitter to break the text into words,
84
    // and compare the words to the search terms, which yields the
89
    // and compare the words to the search terms, which yields the
85
    // query terms positions inside the text
90
    // query terms positions inside the text
...
...
87
    TextSplit splitter(&cb, true);
92
    TextSplit splitter(&cb, true);
88
    // Note that splitter returns the term locations in byte, not
93
    // Note that splitter returns the term locations in byte, not
89
    // character offset
94
    // character offset
90
    splitter.text_to_words(in);
95
    splitter.text_to_words(in);
91
96
92
    LOGDEB(("Split done\n"));
97
    LOGDEB(("plaintorich: split done %d mS\n", chron.millis()));
93
94
98
95
    // Rich text output
99
    // Rich text output
96
    string out = "<qt><head><title></title></head><body><p>";
100
    out = "<qt><head><title></title></head><body><p>";
97
101
98
    // Iterator for the list of input term positions. We use it to
102
    // Iterator for the list of input term positions. We use it to
99
    // output highlight tags and to compute term positions in the
103
    // output highlight tags and to compute term positions in the
100
    // output text
104
    // output text
101
    list<pair<int, int> >::iterator it = cb.tboffs.begin();
105
    list<pair<int, int> >::iterator it = cb.tboffs.begin();
...
...
110
    int ateol = 0;
114
    int ateol = 0;
111
    // State variable to update the char pos only for the first of
115
    // State variable to update the char pos only for the first of
112
    // consecutive blank chars
116
    // consecutive blank chars
113
    int atblank = 0;
117
    int atblank = 0;
114
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
118
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
119
  if (pos && (pos % 1000) == 0) {
120
      CancelCheck::instance().checkCancel();
121
  }
115
    // If we still have terms, check (byte) position
122
    // If we still have terms positions, check (byte) position
116
    if (it != cb.tboffs.end()) {
123
    if (it != cb.tboffs.end()) {
117
        int ibyteidx = chariter.getBpos();
124
        int ibyteidx = chariter.getBpos();
118
        if (ibyteidx == it->first) {
125
        if (ibyteidx == it->first) {
119
        out += "<termtag>";
126
        out += "<termtag>";
120
        otermcpos.first = outcpos;
127
        otermcpos.first = outcpos;
...
...
146
        out += "&amp;";
153
        out += "&amp;";
147
        outcpos++;
154
        outcpos++;
148
        break;
155
        break;
149
    default:
156
    default:
150
        // We don't change the eol status for whitespace, want a real line
157
        // We don't change the eol status for whitespace, want a real line
151
        if (*chariter == ' ' || *chariter == '    ') {
158
        if (*chariter == ' ' || *chariter == '\t') {
152
        if (!atblank)
159
        if (!atblank)
153
            outcpos++;
160
            outcpos++;
154
        atblank = 1;
161
        atblank = 1;
155
        } else {
162
        } else {
156
        ateol = 0;
163
        ateol = 0;
...
...
165
    FILE *fp = fopen("/tmp/debugplaintorich", "w");
172
    FILE *fp = fopen("/tmp/debugplaintorich", "w");
166
    fprintf(fp, "%s\n", out.c_str());
173
    fprintf(fp, "%s\n", out.c_str());
167
    fclose(fp);
174
    fclose(fp);
168
    }
175
    }
169
#endif
176
#endif
177
    LOGDEB(("plaintorich: done %d mS\n", chron.millis()));
170
    return out;
178
    return true;
171
}
179
}