|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
333 |
m_spanpos = m_wordpos;
|
333 |
m_spanpos = m_wordpos;
|
334 |
m_wordStart = 0;
|
334 |
m_wordStart = 0;
|
335 |
m_wordLen = 0;
|
335 |
m_wordLen = 0;
|
336 |
}
|
336 |
}
|
337 |
|
337 |
|
|
|
338 |
static inline bool isalphanum(int what, unsigned int flgs)
|
|
|
339 |
{
|
|
|
340 |
return what == A_LLETTER || what == A_ULETTER ||
|
|
|
341 |
what == DIGIT || what == LETTER ||
|
|
|
342 |
((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
|
|
|
343 |
}
|
|
|
344 |
static inline bool isdigit(int what, unsigned int flgs)
|
|
|
345 |
{
|
|
|
346 |
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
|
|
|
347 |
}
|
|
|
348 |
|
338 |
/**
|
349 |
/**
|
339 |
* Splitting a text into terms to be indexed.
|
350 |
* Splitting a text into terms to be indexed.
|
340 |
* We basically emit a word every time we see a separator, but some chars are
|
351 |
* We basically emit a word every time we see a separator, but some chars are
|
341 |
* handled specially so that special cases, e.g. c++ and jfd@recoll.com,
|
352 |
* handled specially so that special cases, e.g. c++ and jfd@recoll.com,
|
342 |
* are handled properly,
|
353 |
* are handled properly,
|
|
... |
|
... |
441 |
case '+':
|
452 |
case '+':
|
442 |
curspanglue = cc;
|
453 |
curspanglue = cc;
|
443 |
if (m_wordLen == 0) {
|
454 |
if (m_wordLen == 0) {
|
444 |
// + or - don't start a term except if this looks like
|
455 |
// + or - don't start a term except if this looks like
|
445 |
// it's going to be a number
|
456 |
// it's going to be a number
|
446 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
457 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
447 |
// -10
|
458 |
// -10
|
448 |
m_inNumber = true;
|
459 |
m_inNumber = true;
|
449 |
m_wordLen += it.appendchartostring(m_span);
|
460 |
m_wordLen += it.appendchartostring(m_span);
|
450 |
} else {
|
461 |
} else {
|
451 |
goto SPACE;
|
462 |
goto SPACE;
|
452 |
}
|
463 |
}
|
453 |
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
464 |
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
454 |
m_span[m_span.length() - 1] == 'E')) {
|
465 |
m_span[m_span.length() - 1] == 'E')) {
|
455 |
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
466 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
456 |
m_wordLen += it.appendchartostring(m_span);
|
467 |
m_wordLen += it.appendchartostring(m_span);
|
457 |
} else {
|
468 |
} else {
|
458 |
goto SPACE;
|
469 |
goto SPACE;
|
459 |
}
|
470 |
}
|
460 |
} else {
|
471 |
} else {
|
|
... |
|
... |
466 |
{
|
477 |
{
|
467 |
// Need a little lookahead here. At worst this gets the end null
|
478 |
// Need a little lookahead here. At worst this gets the end null
|
468 |
int nextc = it[it.getCpos()+1];
|
479 |
int nextc = it[it.getCpos()+1];
|
469 |
int nextwhat = whatcc(nextc);
|
480 |
int nextwhat = whatcc(nextc);
|
470 |
if (m_inNumber) {
|
481 |
if (m_inNumber) {
|
471 |
if (nextwhat != DIGIT)
|
482 |
if (!isdigit(nextwhat, m_flags))
|
472 |
goto SPACE;
|
483 |
goto SPACE;
|
473 |
m_wordLen += it.appendchartostring(m_span);
|
484 |
m_wordLen += it.appendchartostring(m_span);
|
474 |
curspanglue = cc;
|
485 |
curspanglue = cc;
|
475 |
break;
|
486 |
break;
|
476 |
} else {
|
487 |
} else {
|
|
... |
|
... |
480 |
// Another problem is that something like .x-errs
|
491 |
// Another problem is that something like .x-errs
|
481 |
// will be split as .x-errs, x, errs but not x-errs
|
492 |
// will be split as .x-errs, x, errs but not x-errs
|
482 |
// A final comma in a word will be removed by doemit
|
493 |
// A final comma in a word will be removed by doemit
|
483 |
|
494 |
|
484 |
// Only letters and digits make sense after
|
495 |
// Only letters and digits make sense after
|
485 |
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
|
496 |
if (!isalphanum(nextwhat, m_flags))
|
486 |
nextwhat != DIGIT && nextwhat != LETTER)
|
|
|
487 |
goto SPACE;
|
497 |
goto SPACE;
|
488 |
|
498 |
|
489 |
if (cc == '.') {
|
499 |
if (cc == '.') {
|
490 |
// Check for number like .1
|
500 |
// Check for number like .1
|
491 |
if (m_span.length() == 0 && nextwhat == DIGIT) {
|
501 |
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
|
492 |
m_inNumber = true;
|
502 |
m_inNumber = true;
|
493 |
m_wordLen += it.appendchartostring(m_span);
|
503 |
m_wordLen += it.appendchartostring(m_span);
|
494 |
curspanglue = cc;
|
504 |
curspanglue = cc;
|
495 |
break;
|
505 |
break;
|
496 |
}
|
506 |
}
|