|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
206 |
* @param bp The current BYTE position in the stream
|
206 |
* @param bp The current BYTE position in the stream
|
207 |
* @param spanemit This is set for intermediate spans: glue char changed.
|
207 |
* @param spanemit This is set for intermediate spans: glue char changed.
|
208 |
*/
|
208 |
*/
|
209 |
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
209 |
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
210 |
{
|
210 |
{
|
211 |
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
|
211 |
LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
212 |
"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart,
|
212 |
"inn %d span [%s]\n",
|
213 |
m_wordLen, spanerase, bp, m_inNumber));
|
213 |
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
|
|
214 |
m_inNumber, m_span.c_str()));
|
214 |
|
215 |
|
215 |
// Emit span. When splitting for query, we only emit final spans
|
216 |
// Emit span? When splitting for query, we only emit final spans
|
|
|
217 |
// (spanerase)
|
216 |
bool spanemitted = false;
|
218 |
bool spanemitted = false;
|
217 |
if (!(m_flags & TXTS_NOSPANS) &&
|
219 |
if (!(m_flags & TXTS_NOSPANS) &&
|
218 |
!((m_wordLen == m_span.length()) &&
|
220 |
!((m_wordLen == m_span.length()) &&
|
219 |
(o_noNumbers) && m_inNumber) &&
|
221 |
(o_noNumbers) && m_inNumber) &&
|
220 |
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
222 |
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
221 |
// Maybe trim at end. These are chars that we would keep inside
|
223 |
// Maybe trim at end. These are chars that we would keep inside
|
222 |
// a span, but not at the end
|
224 |
// a span, but not at the end
|
223 |
while (m_span.length() > 0) {
|
225 |
while (m_span.length() > 0) {
|
224 |
switch (m_span[m_span.length()-1]) {
|
226 |
switch (m_span[m_span.length()-1]) {
|
225 |
case '.':
|
227 |
case '.':
|
|
|
228 |
case '-':
|
226 |
case ',':
|
229 |
case ',':
|
227 |
case '@':
|
230 |
case '@':
|
228 |
case '\'':
|
231 |
case '\'':
|
229 |
m_span.resize(m_span.length()-1);
|
232 |
m_span.resize(m_span.length()-1);
|
230 |
if (--bp < 0)
|
233 |
if (--bp < 0)
|
|
... |
|
... |
248 |
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
251 |
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
249 |
return false;
|
252 |
return false;
|
250 |
}
|
253 |
}
|
251 |
|
254 |
|
252 |
// Adjust state
|
255 |
// Adjust state
|
|
|
256 |
if (m_wordLen) {
|
253 |
m_wordpos++;
|
257 |
m_wordpos++;
|
254 |
m_wordLen = 0;
|
258 |
m_wordLen = 0;
|
|
|
259 |
}
|
255 |
if (spanerase) {
|
260 |
if (spanerase) {
|
256 |
m_span.erase();
|
261 |
discardspan();
|
257 |
m_spanpos = m_wordpos;
|
|
|
258 |
m_wordStart = 0;
|
|
|
259 |
} else {
|
262 |
} else {
|
260 |
m_wordStart = m_span.length();
|
263 |
m_wordStart = m_span.length();
|
261 |
}
|
264 |
}
|
262 |
|
265 |
|
263 |
return true;
|
266 |
return true;
|
|
|
267 |
}
|
|
|
268 |
|
|
|
269 |
/**
 * Drop the span accumulated so far and reset the span/word bookkeeping.
 *
 * Empties m_span, sets m_spanpos to the current m_wordpos, and zeroes
 * both m_wordStart and m_wordLen. Nothing is emitted by this call.
 */
void TextSplit::discardspan()
|
|
|
270 |
{
|
|
|
271 |
m_span.erase();
|
|
|
272 |
m_spanpos = m_wordpos;
|
|
|
273 |
m_wordStart = 0;
|
|
|
274 |
m_wordLen = 0;
|
264 |
}
|
275 |
}
|
265 |
|
276 |
|
266 |
/**
|
277 |
/**
|
267 |
* Splitting a text into terms to be indexed.
|
278 |
* Splitting a text into terms to be indexed.
|
268 |
* We basically emit a word every time we see a separator, but some chars are
|
279 |
* We basically emit a word every time we see a separator, but some chars are
|
|
... |
|
... |
281 |
m_span.erase();
|
292 |
m_span.erase();
|
282 |
m_inNumber = false;
|
293 |
m_inNumber = false;
|
283 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
294 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
284 |
int curspanglue = 0;
|
295 |
int curspanglue = 0;
|
285 |
|
296 |
|
|
|
297 |
// Running count of consecutive non-alphanum chars. Reset to 0 on a digit, a space-class char or a normal word char.
|
|
|
298 |
int nonalnumcnt = 0;
|
|
|
299 |
|
286 |
Utf8Iter it(in);
|
300 |
Utf8Iter it(in);
|
287 |
|
301 |
|
288 |
for (; !it.eof(); it++) {
|
302 |
for (; !it.eof(); it++) {
|
289 |
unsigned int c = *it;
|
303 |
unsigned int c = *it;
|
|
|
304 |
nonalnumcnt++;
|
290 |
|
305 |
|
291 |
if (c == (unsigned int)-1) {
|
306 |
if (c == (unsigned int)-1) {
|
292 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
307 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
293 |
return false;
|
308 |
return false;
|
294 |
}
|
309 |
}
|
|
... |
|
... |
317 |
switch (cc) {
|
332 |
switch (cc) {
|
318 |
case DIGIT:
|
333 |
case DIGIT:
|
319 |
if (m_wordLen == 0)
|
334 |
if (m_wordLen == 0)
|
320 |
m_inNumber = true;
|
335 |
m_inNumber = true;
|
321 |
m_wordLen += it.appendchartostring(m_span);
|
336 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
337 |
nonalnumcnt = 0;
|
322 |
break;
|
338 |
break;
|
323 |
|
339 |
|
324 |
case SPACE:
|
340 |
case SPACE:
|
325 |
SPACE:
|
341 |
SPACE:
|
326 |
curspanglue = 0;
|
342 |
curspanglue = 0;
|
|
|
343 |
nonalnumcnt = 0;
|
327 |
if (m_wordLen || m_span.length()) {
|
344 |
if (m_wordLen || m_span.length()) {
|
328 |
if (!doemit(true, it.getBpos()))
|
345 |
if (!doemit(true, it.getBpos()))
|
329 |
return false;
|
346 |
return false;
|
330 |
m_inNumber = false;
|
347 |
m_inNumber = false;
|
331 |
}
|
348 |
}
|
|
... |
|
... |
336 |
else
|
353 |
else
|
337 |
goto SPACE;
|
354 |
goto SPACE;
|
338 |
break;
|
355 |
break;
|
339 |
case '-':
|
356 |
case '-':
|
340 |
case '+':
|
357 |
case '+':
|
|
|
358 |
curspanglue = cc;
|
341 |
if (m_wordLen == 0 ||
|
359 |
if (m_wordLen == 0) {
|
|
|
360 |
if (cc == '-') {
|
|
|
361 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
|
|
362 |
// -10
|
|
|
363 |
m_inNumber = true;
|
|
|
364 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
365 |
} else {
|
|
|
366 |
goto SPACE;
|
|
|
367 |
}
|
|
|
368 |
} else {
|
|
|
369 |
if (nonalnumcnt > 2) {
|
|
|
370 |
discardspan();
|
|
|
371 |
} else {
|
|
|
372 |
m_wordStart += it.appendchartostring(m_span);
|
|
|
373 |
}
|
|
|
374 |
}
|
342 |
(m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
375 |
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
343 |
m_span[m_span.length() - 1] == 'E'))) {
|
376 |
m_span[m_span.length() - 1] == 'E')) {
|
344 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
377 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
345 |
m_inNumber = true;
|
|
|
346 |
m_wordLen += it.appendchartostring(m_span);
|
378 |
m_wordLen += it.appendchartostring(m_span);
|
347 |
} else {
|
379 |
} else {
|
348 |
m_wordStart += it.appendchartostring(m_span);
|
380 |
goto SPACE;
|
349 |
}
|
381 |
}
|
350 |
curspanglue = cc;
|
|
|
351 |
} else {
|
382 |
} else {
|
352 |
if (!doemit(false, it.getBpos()))
|
383 |
if (!doemit(false, it.getBpos()))
|
353 |
return false;
|
384 |
return false;
|
354 |
curspanglue = cc;
|
|
|
355 |
m_inNumber = false;
|
385 |
m_inNumber = false;
|
356 |
m_wordStart += it.appendchartostring(m_span);
|
386 |
m_wordStart += it.appendchartostring(m_span);
|
357 |
}
|
387 |
}
|
358 |
break;
|
388 |
break;
|
359 |
case '.':
|
389 |
case '.':
|
|
... |
|
... |
365 |
goto SPACE;
|
395 |
goto SPACE;
|
366 |
m_wordLen += it.appendchartostring(m_span);
|
396 |
m_wordLen += it.appendchartostring(m_span);
|
367 |
curspanglue = cc;
|
397 |
curspanglue = cc;
|
368 |
break;
|
398 |
break;
|
369 |
} else {
|
399 |
} else {
|
370 |
// If . inside a word, keep it, else, this is whitespace.
|
400 |
// If . inside a word, it's spanglue, else, it's whitespace.
|
371 |
// We also keep an initial '.' for catching .net, but this adds
|
401 |
// We also keep an initial '.' for catching .net, but this adds
|
372 |
// quite a few spurious terms !
|
402 |
// quite a few spurious terms !
|
373 |
// Another problem is that something like .x-errs
|
403 |
// Another problem is that something like .x-errs
|
374 |
// will be split as .x-errs, x, errs but not x-errs
|
404 |
// will be split as .x-errs, x, errs but not x-errs
|
375 |
// A final comma in a word will be removed by doemit
|
405 |
// A final comma in a word will be removed by doemit
|
376 |
if (cc == '.') {
|
406 |
if (cc == '.' && it[it.getCpos()+1] != '.') {
|
377 |
// Check for number like .1
|
407 |
// Check for number like .1
|
378 |
if (m_span.length() == 0 &&
|
408 |
if (m_span.length() == 0 &&
|
379 |
whatcc(it[it.getCpos()+1]) == DIGIT) {
|
409 |
whatcc(it[it.getCpos()+1]) == DIGIT) {
|
380 |
m_inNumber = true;
|
410 |
m_inNumber = true;
|
381 |
m_wordLen += it.appendchartostring(m_span);
|
411 |
m_wordLen += it.appendchartostring(m_span);
|
|
... |
|
... |
384 |
}
|
414 |
}
|
385 |
|
415 |
|
386 |
if (m_wordLen) {
|
416 |
if (m_wordLen) {
|
387 |
// Disputable special case: set spanemit to
|
417 |
// Disputable special case: set spanemit to
|
388 |
// true when encountering a '.' while spanglue
|
418 |
// true when encountering a '.' while spanglue
|
389 |
// is '_'. Think of a_b.c Done because to
|
419 |
// is '_'. Think of a_b.c Done to
|
390 |
// avoid breaking stuff after changing '_'
|
420 |
// avoid breaking stuff after changing '_'
|
391 |
// from wordchar to spanglue
|
421 |
// from wordchar to spanglue
|
392 |
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
422 |
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
393 |
return false;
|
423 |
return false;
|
394 |
curspanglue = cc;
|
424 |
curspanglue = cc;
|
|
... |
|
... |
507 |
NORMALCHAR:
|
537 |
NORMALCHAR:
|
508 |
if (m_inNumber && c != 'e' && c != 'E') {
|
538 |
if (m_inNumber && c != 'e' && c != 'E') {
|
509 |
m_inNumber = false;
|
539 |
m_inNumber = false;
|
510 |
}
|
540 |
}
|
511 |
m_wordLen += it.appendchartostring(m_span);
|
541 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
542 |
nonalnumcnt = 0;
|
512 |
break;
|
543 |
break;
|
513 |
}
|
544 |
}
|
514 |
}
|
545 |
}
|
515 |
if (m_wordLen || m_span.length()) {
|
546 |
if (m_wordLen || m_span.length()) {
|
516 |
if (!doemit(true, it.getBpos()))
|
547 |
if (!doemit(true, it.getBpos()))
|