|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.29 2007-01-25 15:40:55 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
101 |
int btstart, int btend)
|
101 |
int btstart, int btend)
|
102 |
{
|
102 |
{
|
103 |
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
103 |
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
104 |
|
104 |
|
105 |
unsigned int l = w.length();
|
105 |
unsigned int l = w.length();
|
106 |
if (l > 0 && l < (unsigned)maxWordLength) {
|
106 |
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
107 |
// 1 char word: we index single letters and digits, but
|
107 |
// 1 char word: we index single letters and digits, but
|
108 |
// nothing else. We might want to turn this into a test for a single
|
108 |
// nothing else. We might want to turn this into a test for a single
|
109 |
// utf8 character instead.
|
109 |
// utf8 character instead.
|
110 |
if (l == 1) {
|
110 |
if (l == 1) {
|
111 |
int c = (int)w[0];
|
111 |
int c = (int)w[0];
|
112 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
112 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
113 |
//cerr << "ERASING single letter term " << c << endl;
|
113 |
//cerr << "ERASING single letter term " << c << endl;
|
114 |
return true;
|
114 |
return true;
|
115 |
}
|
115 |
}
|
116 |
}
|
116 |
}
|
117 |
if (pos != prevpos || l != prevlen) {
|
117 |
if (pos != m_prevpos || l != m_prevlen) {
|
118 |
bool ret = cb->takeword(w, pos, btstart, btend);
|
118 |
bool ret = m_cb->takeword(w, pos, btstart, btend);
|
|
|
119 |
m_prevpos = pos;
|
119 |
prevlen = w.length();
|
120 |
m_prevlen = w.length();
|
120 |
prevpos = pos;
|
|
|
121 |
return ret;
|
121 |
return ret;
|
122 |
}
|
122 |
}
|
123 |
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
123 |
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
124 |
}
|
124 |
}
|
125 |
return true;
|
125 |
return true;
|
|
... |
|
... |
144 |
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
144 |
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
145 |
|
145 |
|
146 |
// Emit span. When splitting for query, we only emit final spans
|
146 |
// Emit span. When splitting for query, we only emit final spans
|
147 |
bool spanemitted = false;
|
147 |
bool spanemitted = false;
|
148 |
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
148 |
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
149 |
// Maybe trim at end These are chars that we would keep inside
|
149 |
// Maybe trim at end. These are chars that we would keep inside
|
150 |
// a span, but not at the end
|
150 |
// a span, but not at the end
|
151 |
while (span.length() > 0) {
|
151 |
while (m_span.length() > 0) {
|
152 |
switch (span[span.length()-1]) {
|
152 |
switch (m_span[m_span.length()-1]) {
|
153 |
case '.':
|
153 |
case '.':
|
154 |
case ',':
|
154 |
case ',':
|
155 |
case '@':
|
155 |
case '@':
|
156 |
case '\'':
|
156 |
case '\'':
|
157 |
span.resize(span.length()-1);
|
157 |
m_span.resize(m_span.length()-1);
|
158 |
if (--bp < 0)
|
158 |
if (--bp < 0)
|
159 |
bp=0;
|
159 |
bp = 0;
|
160 |
break;
|
160 |
break;
|
161 |
default:
|
161 |
default:
|
162 |
goto breakloop1;
|
162 |
goto breakloop1;
|
163 |
}
|
163 |
}
|
164 |
}
|
164 |
}
|
165 |
breakloop1:
|
165 |
breakloop1:
|
166 |
spanemitted = true;
|
166 |
spanemitted = true;
|
167 |
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
167 |
if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
|
168 |
return false;
|
168 |
return false;
|
169 |
}
|
169 |
}
|
170 |
|
170 |
|
171 |
// Emit word if different from span and not 'no words' mode
|
171 |
// Emit word if different from span and not 'no words' mode
|
172 |
if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&
|
172 |
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
|
173 |
(!spanemitted || wordLen != span.length())) {
|
173 |
(!spanemitted || m_wordLen != m_span.length())) {
|
174 |
string s(span.substr(wordStart, wordLen));
|
174 |
string s(m_span.substr(m_wordStart, m_wordLen));
|
175 |
if (!emitterm(false, s, wordpos, bp-wordLen, bp))
|
175 |
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
176 |
return false;
|
176 |
return false;
|
177 |
}
|
177 |
}
|
178 |
|
178 |
|
179 |
// Adjust state
|
179 |
// Adjust state
|
180 |
wordpos++;
|
180 |
m_wordpos++;
|
181 |
wordLen = 0;
|
181 |
m_wordLen = 0;
|
182 |
if (spanerase) {
|
182 |
if (spanerase) {
|
183 |
span.erase();
|
183 |
m_span.erase();
|
184 |
spanpos = wordpos;
|
184 |
m_spanpos = m_wordpos;
|
185 |
wordStart = 0;
|
185 |
m_wordStart = 0;
|
186 |
} else {
|
186 |
} else {
|
187 |
wordStart = span.length();
|
187 |
m_wordStart = m_span.length();
|
188 |
}
|
188 |
}
|
189 |
|
189 |
|
190 |
return true;
|
190 |
return true;
|
191 |
}
|
191 |
}
|
192 |
|
192 |
|
|
... |
|
... |
213 |
LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,
|
213 |
LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,
|
214 |
in.substr(0,50).c_str()));
|
214 |
in.substr(0,50).c_str()));
|
215 |
|
215 |
|
216 |
setcharclasses();
|
216 |
setcharclasses();
|
217 |
|
217 |
|
218 |
span.erase();
|
218 |
m_span.erase();
|
219 |
number = false;
|
219 |
m_inNumber = false;
|
220 |
wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
|
220 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
221 |
|
221 |
|
222 |
Utf8Iter it(in);
|
222 |
Utf8Iter it(in);
|
223 |
|
223 |
|
224 |
for (; !it.eof(); it++) {
|
224 |
for (; !it.eof(); it++) {
|
225 |
unsigned int c = *it;
|
225 |
unsigned int c = *it;
|
|
... |
|
... |
229 |
return false;
|
229 |
return false;
|
230 |
}
|
230 |
}
|
231 |
int cc = whatcc(c);
|
231 |
int cc = whatcc(c);
|
232 |
switch (cc) {
|
232 |
switch (cc) {
|
233 |
case LETTER:
|
233 |
case LETTER:
|
234 |
wordLen += it.appendchartostring(span);
|
234 |
m_wordLen += it.appendchartostring(m_span);
|
235 |
break;
|
235 |
break;
|
236 |
|
236 |
|
237 |
case DIGIT:
|
237 |
case DIGIT:
|
238 |
if (wordLen == 0)
|
238 |
if (m_wordLen == 0)
|
239 |
number = true;
|
239 |
m_inNumber = true;
|
240 |
wordLen += it.appendchartostring(span);
|
240 |
m_wordLen += it.appendchartostring(m_span);
|
241 |
break;
|
241 |
break;
|
242 |
|
242 |
|
243 |
case SPACE:
|
243 |
case SPACE:
|
244 |
SPACE:
|
244 |
SPACE:
|
245 |
if (wordLen || span.length()) {
|
245 |
if (m_wordLen || m_span.length()) {
|
246 |
if (!doemit(true, it.getBpos()))
|
246 |
if (!doemit(true, it.getBpos()))
|
247 |
return false;
|
247 |
return false;
|
248 |
number = false;
|
248 |
m_inNumber = false;
|
249 |
}
|
249 |
}
|
250 |
break;
|
250 |
break;
|
251 |
case WILD:
|
251 |
case WILD:
|
252 |
if (m_flags & TXTS_KEEPWILD)
|
252 |
if (m_flags & TXTS_KEEPWILD)
|
253 |
goto NORMALCHAR;
|
253 |
goto NORMALCHAR;
|
254 |
else
|
254 |
else
|
255 |
goto SPACE;
|
255 |
goto SPACE;
|
256 |
break;
|
256 |
break;
|
257 |
case '-':
|
257 |
case '-':
|
258 |
case '+':
|
258 |
case '+':
|
259 |
if (wordLen == 0) {
|
259 |
if (m_wordLen == 0) {
|
260 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
260 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
261 |
number = true;
|
261 |
m_inNumber = true;
|
262 |
wordLen += it.appendchartostring(span);
|
262 |
m_wordLen += it.appendchartostring(m_span);
|
263 |
} else {
|
263 |
} else {
|
264 |
wordStart += it.appendchartostring(span);
|
264 |
m_wordStart += it.appendchartostring(m_span);
|
265 |
}
|
265 |
}
|
266 |
} else {
|
266 |
} else {
|
267 |
if (!doemit(false, it.getBpos()))
|
267 |
if (!doemit(false, it.getBpos()))
|
268 |
return false;
|
268 |
return false;
|
269 |
number = false;
|
269 |
m_inNumber = false;
|
270 |
wordStart += it.appendchartostring(span);
|
270 |
m_wordStart += it.appendchartostring(m_span);
|
271 |
}
|
271 |
}
|
272 |
break;
|
272 |
break;
|
273 |
case '.':
|
273 |
case '.':
|
274 |
case ',':
|
274 |
case ',':
|
275 |
if (number) {
|
275 |
if (m_inNumber) {
|
276 |
// 132.jpg ?
|
276 |
// 132.jpg ?
|
277 |
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
277 |
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
278 |
goto SPACE;
|
278 |
goto SPACE;
|
279 |
wordLen += it.appendchartostring(span);
|
279 |
m_wordLen += it.appendchartostring(m_span);
|
280 |
break;
|
280 |
break;
|
281 |
} else {
|
281 |
} else {
|
282 |
// If . inside a word, keep it, else, this is whitespace.
|
282 |
// If . inside a word, keep it, else, this is whitespace.
|
283 |
// We also keep an initial '.' for catching .net, but this adds
|
283 |
// We also keep an initial '.' for catching .net, but this adds
|
284 |
// quite a few spurious terms !
|
284 |
// quite a few spurious terms !
|
285 |
// Another problem is that something like .x-errs
|
285 |
// Another problem is that something like .x-errs
|
286 |
// will be split as .x-errs, x, errs but not x-errs
|
286 |
// will be split as .x-errs, x, errs but not x-errs
|
287 |
// A final comma in a word will be removed by doemit
|
287 |
// A final comma in a word will be removed by doemit
|
288 |
if (cc == '.') {
|
288 |
if (cc == '.') {
|
289 |
if (wordLen) {
|
289 |
if (m_wordLen) {
|
290 |
if (!doemit(false, it.getBpos()))
|
290 |
if (!doemit(false, it.getBpos()))
|
291 |
return false;
|
291 |
return false;
|
292 |
// span length could have been adjusted by trimming
|
292 |
// span length could have been adjusted by trimming
|
293 |
// inside doemit
|
293 |
// inside doemit
|
294 |
if (span.length())
|
294 |
if (m_span.length())
|
295 |
wordStart += it.appendchartostring(span);
|
295 |
m_wordStart += it.appendchartostring(m_span);
|
296 |
break;
|
296 |
break;
|
297 |
} else {
|
297 |
} else {
|
298 |
wordStart += it.appendchartostring(span);
|
298 |
m_wordStart += it.appendchartostring(m_span);
|
299 |
break;
|
299 |
break;
|
300 |
}
|
300 |
}
|
301 |
}
|
301 |
}
|
302 |
}
|
302 |
}
|
303 |
goto SPACE;
|
303 |
goto SPACE;
|
304 |
break;
|
304 |
break;
|
305 |
case '@':
|
305 |
case '@':
|
306 |
if (wordLen) {
|
306 |
if (m_wordLen) {
|
307 |
if (!doemit(false, it.getBpos()))
|
307 |
if (!doemit(false, it.getBpos()))
|
308 |
return false;
|
308 |
return false;
|
309 |
number = false;
|
309 |
m_inNumber = false;
|
310 |
}
|
310 |
}
|
311 |
wordStart += it.appendchartostring(span);
|
311 |
m_wordStart += it.appendchartostring(m_span);
|
312 |
break;
|
312 |
break;
|
313 |
case '\'':
|
313 |
case '\'':
|
314 |
// If in word, potential span: o'brien, else, this is more
|
314 |
// If in word, potential span: o'brien, else, this is more
|
315 |
// whitespace
|
315 |
// whitespace
|
316 |
if (wordLen) {
|
316 |
if (m_wordLen) {
|
317 |
if (!doemit(false, it.getBpos()))
|
317 |
if (!doemit(false, it.getBpos()))
|
318 |
return false;
|
318 |
return false;
|
319 |
number = false;
|
319 |
m_inNumber = false;
|
320 |
wordStart += it.appendchartostring(span);
|
320 |
m_wordStart += it.appendchartostring(m_span);
|
321 |
}
|
321 |
}
|
322 |
break;
|
322 |
break;
|
323 |
case '#':
|
323 |
case '#':
|
324 |
// Keep it only at end of word ... Special case for c# you see...
|
324 |
// Keep it only at end of word ... Special case for c# you see...
|
325 |
if (wordLen > 0) {
|
325 |
if (m_wordLen > 0) {
|
326 |
int w = whatcc(it[it.getCpos()+1]);
|
326 |
int w = whatcc(it[it.getCpos()+1]);
|
327 |
if (w == SPACE || w == '\n' || w == '\r') {
|
327 |
if (w == SPACE || w == '\n' || w == '\r') {
|
328 |
wordLen += it.appendchartostring(span);
|
328 |
m_wordLen += it.appendchartostring(m_span);
|
329 |
break;
|
329 |
break;
|
330 |
}
|
330 |
}
|
331 |
}
|
331 |
}
|
332 |
goto SPACE;
|
332 |
goto SPACE;
|
333 |
break;
|
333 |
break;
|
334 |
case '\n':
|
334 |
case '\n':
|
335 |
case '\r':
|
335 |
case '\r':
|
336 |
if (span.length() && span[span.length() - 1] == '-') {
|
336 |
if (m_span.length() && m_span[m_span.length() - 1] == '-') {
|
337 |
// if '-' is the last char before end of line, just
|
337 |
// if '-' is the last char before end of line, just
|
338 |
// ignore the line change. This is the right thing to
|
338 |
// ignore the line change. This is the right thing to
|
339 |
// do almost always. We'd then need a way to check if
|
339 |
// do almost always. We'd then need a way to check if
|
340 |
// the - was added as part of the word hyphenation, or was
|
340 |
// the - was added as part of the word hyphenation, or was
|
341 |
// there in the first place, but this would need a dictionary.
|
341 |
// there in the first place, but this would need a dictionary.
|
|
... |
|
... |
347 |
}
|
347 |
}
|
348 |
break;
|
348 |
break;
|
349 |
|
349 |
|
350 |
default:
|
350 |
default:
|
351 |
NORMALCHAR:
|
351 |
NORMALCHAR:
|
352 |
wordLen += it.appendchartostring(span);
|
352 |
m_wordLen += it.appendchartostring(m_span);
|
353 |
break;
|
353 |
break;
|
354 |
}
|
354 |
}
|
355 |
}
|
355 |
}
|
356 |
if (wordLen || span.length()) {
|
356 |
if (m_wordLen || m_span.length()) {
|
357 |
if (!doemit(true, it.getBpos()))
|
357 |
if (!doemit(true, it.getBpos()))
|
358 |
return false;
|
358 |
return false;
|
359 |
}
|
359 |
}
|
360 |
return true;
|
360 |
return true;
|
361 |
}
|
361 |
}
|