|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
203 |
int btstart, int btend)
|
203 |
int btstart, int btend)
|
204 |
{
|
204 |
{
|
205 |
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
205 |
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
206 |
|
206 |
|
207 |
unsigned int l = w.length();
|
207 |
unsigned int l = w.length();
|
|
|
208 |
|
|
|
209 |
#ifdef TEXTSPLIT_STATS
|
|
|
210 |
// Update word length statistics. Do this before we filter out
|
|
|
211 |
// long words because stats are used to detect bad text
|
|
|
212 |
if (!isspan || m_wordLen == m_span.length())
|
|
|
213 |
m_stats.newsamp(m_wordChars);
|
|
|
214 |
#endif
|
|
|
215 |
|
208 |
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
216 |
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
209 |
// 1 byte word: we index single ascii letters and digits, but
|
217 |
// 1 byte word: we index single ascii letters and digits, but
|
210 |
// nothing else. We might want to turn this into a test for a
|
218 |
// nothing else. We might want to turn this into a test for a
|
211 |
// single utf8 character instead ?
|
219 |
// single utf8 character instead ?
|
212 |
if (l == 1) {
|
220 |
if (l == 1) {
|
|
... |
|
... |
314 |
}
|
322 |
}
|
315 |
|
323 |
|
316 |
// Adjust state
|
324 |
// Adjust state
|
317 |
if (m_wordLen) {
|
325 |
if (m_wordLen) {
|
318 |
m_wordpos++;
|
326 |
m_wordpos++;
|
319 |
m_wordLen = 0;
|
327 |
m_wordLen = m_wordChars = 0;
|
320 |
}
|
328 |
}
|
321 |
if (spanerase) {
|
329 |
if (spanerase) {
|
322 |
discardspan();
|
330 |
discardspan();
|
323 |
} else {
|
331 |
} else {
|
324 |
m_wordStart = m_span.length();
|
332 |
m_wordStart = m_span.length();
|
|
... |
|
... |
330 |
// Reset the span-building state: empty m_span, make m_spanpos equal to the
// current m_wordpos, and zero the word start offset and word length.
void TextSplit::discardspan()
|
338 |
void TextSplit::discardspan()
|
331 |
{
|
339 |
{
|
332 |
m_span.erase();
|
340 |
m_span.erase();
|
333 |
m_spanpos = m_wordpos;
|
341 |
m_spanpos = m_wordpos;
|
334 |
m_wordStart = 0;
|
342 |
m_wordStart = 0;
|
335 |
m_wordLen = 0;
|
343 |
// This form of the reset also zeroes m_wordChars, the counter bumped by
// INC_WORDCHARS.
m_wordLen = m_wordChars = 0;
|
336 |
}
|
344 |
}
|
337 |
|
345 |
|
338 |
static inline bool isalphanum(int what, unsigned int flgs)
|
346 |
static inline bool isalphanum(int what, unsigned int flgs)
|
339 |
{
|
347 |
{
|
340 |
return what == A_LLETTER || what == A_ULETTER ||
|
348 |
return what == A_LLETTER || what == A_ULETTER ||
|
|
... |
|
... |
343 |
}
|
351 |
}
|
344 |
// Tell whether a character class counts as a digit.
// @param what character class code (compared against DIGIT and WILD)
// @param flgs splitter flag bits; only TextSplit::TXTS_KEEPWILD is tested
// @return true if what is DIGIT, or if what is WILD while TXTS_KEEPWILD
//         is set in flgs
static inline bool isdigit(int what, unsigned int flgs)
|
352 |
static inline bool isdigit(int what, unsigned int flgs)
|
345 |
{
|
353 |
{
|
346 |
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
|
354 |
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
|
347 |
}
|
355 |
}
|
|
|
356 |
|
|
|
357 |
// INC_WORDCHARS is written as a statement ("INC_WORDCHARS;") right after a
// character is appended to the span. It expands to an increment of
// m_wordChars when TEXTSPLIT_STATS is defined and to nothing otherwise, so
// the call sites need no #ifdef of their own.
#ifdef TEXTSPLIT_STATS
|
|
|
358 |
#define INC_WORDCHARS ++m_wordChars
|
|
|
359 |
#else
|
|
|
360 |
#define INC_WORDCHARS
|
|
|
361 |
#endif
|
348 |
|
362 |
|
349 |
/**
|
363 |
/**
|
350 |
* Splitting a text into terms to be indexed.
|
364 |
* Splitting a text into terms to be indexed.
|
351 |
* We basically emit a word every time we see a separator, but some chars are
|
365 |
* We basically emit a word every time we see a separator, but some chars are
|
352 |
* handled specially so that special cases, e.g. c++ and jfd@recoll.com etc,
|
366 |
* handled specially so that special cases, e.g. c++ and jfd@recoll.com etc,
|
|
... |
|
... |
364 |
if (in.empty())
|
378 |
if (in.empty())
|
365 |
return true;
|
379 |
return true;
|
366 |
|
380 |
|
367 |
m_span.erase();
|
381 |
m_span.erase();
|
368 |
m_inNumber = false;
|
382 |
m_inNumber = false;
|
369 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
383 |
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
|
|
|
384 |
= m_spanpos = 0;
|
370 |
int curspanglue = 0;
|
385 |
int curspanglue = 0;
|
371 |
bool pagepending = false;
|
386 |
bool pagepending = false;
|
372 |
bool softhyphenpending = false;
|
387 |
bool softhyphenpending = false;
|
373 |
|
388 |
|
374 |
// Running count of non-alphanum chars. Reset when we see one;
|
389 |
// Running count of non-alphanum chars. Reset when we see one;
|
|
... |
|
... |
421 |
continue;
|
436 |
continue;
|
422 |
case DIGIT:
|
437 |
case DIGIT:
|
423 |
if (m_wordLen == 0)
|
438 |
if (m_wordLen == 0)
|
424 |
m_inNumber = true;
|
439 |
m_inNumber = true;
|
425 |
m_wordLen += it.appendchartostring(m_span);
|
440 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
441 |
INC_WORDCHARS;
|
426 |
nonalnumcnt = 0;
|
442 |
nonalnumcnt = 0;
|
427 |
break;
|
443 |
break;
|
428 |
|
444 |
|
429 |
case SPACE:
|
445 |
case SPACE:
|
430 |
SPACE:
|
446 |
SPACE:
|
|
... |
|
... |
456 |
// it's going to be a number
|
472 |
// it's going to be a number
|
457 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
473 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
458 |
// -10
|
474 |
// -10
|
459 |
m_inNumber = true;
|
475 |
m_inNumber = true;
|
460 |
m_wordLen += it.appendchartostring(m_span);
|
476 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
477 |
INC_WORDCHARS;
|
461 |
} else {
|
478 |
} else {
|
462 |
goto SPACE;
|
479 |
goto SPACE;
|
463 |
}
|
480 |
}
|
464 |
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
481 |
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
465 |
m_span[m_span.length() - 1] == 'E')) {
|
482 |
m_span[m_span.length() - 1] == 'E')) {
|
466 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
483 |
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
467 |
m_wordLen += it.appendchartostring(m_span);
|
484 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
485 |
INC_WORDCHARS;
|
468 |
} else {
|
486 |
} else {
|
469 |
goto SPACE;
|
487 |
goto SPACE;
|
470 |
}
|
488 |
}
|
471 |
} else {
|
489 |
} else {
|
472 |
goto SPACE;
|
490 |
goto SPACE;
|
|
... |
|
... |
480 |
int nextwhat = whatcc(nextc);
|
498 |
int nextwhat = whatcc(nextc);
|
481 |
if (m_inNumber) {
|
499 |
if (m_inNumber) {
|
482 |
if (!isdigit(nextwhat, m_flags))
|
500 |
if (!isdigit(nextwhat, m_flags))
|
483 |
goto SPACE;
|
501 |
goto SPACE;
|
484 |
m_wordLen += it.appendchartostring(m_span);
|
502 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
503 |
INC_WORDCHARS;
|
485 |
curspanglue = cc;
|
504 |
curspanglue = cc;
|
486 |
break;
|
505 |
break;
|
487 |
} else {
|
506 |
} else {
|
488 |
// If . inside a word, it's spanglue, else, it's whitespace.
|
507 |
// If . inside a word, it's spanglue, else, it's whitespace.
|
489 |
// We also keep an initial '.' for catching .net, but this adds
|
508 |
// We also keep an initial '.' for catching .net, but this adds
|
|
... |
|
... |
499 |
if (cc == '.') {
|
518 |
if (cc == '.') {
|
500 |
// Check for number like .1
|
519 |
// Check for number like .1
|
501 |
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
|
520 |
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
|
502 |
m_inNumber = true;
|
521 |
m_inNumber = true;
|
503 |
m_wordLen += it.appendchartostring(m_span);
|
522 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
523 |
INC_WORDCHARS;
|
504 |
curspanglue = cc;
|
524 |
curspanglue = cc;
|
505 |
break;
|
525 |
break;
|
506 |
}
|
526 |
}
|
507 |
|
527 |
|
508 |
if (m_wordLen) {
|
528 |
if (m_wordLen) {
|
|
... |
|
... |
565 |
// Keep it only at end of word ... Special case for c# you see...
|
585 |
// Keep it only at end of word ... Special case for c# you see...
|
566 |
if (m_wordLen > 0) {
|
586 |
if (m_wordLen > 0) {
|
567 |
int w = whatcc(it[it.getCpos()+1]);
|
587 |
int w = whatcc(it[it.getCpos()+1]);
|
568 |
if (w == SPACE || w == '\n' || w == '\r') {
|
588 |
if (w == SPACE || w == '\n' || w == '\r') {
|
569 |
m_wordLen += it.appendchartostring(m_span);
|
589 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
590 |
INC_WORDCHARS;
|
570 |
break;
|
591 |
break;
|
571 |
}
|
592 |
}
|
572 |
}
|
593 |
}
|
573 |
goto SPACE;
|
594 |
goto SPACE;
|
574 |
break;
|
595 |
break;
|
|
... |
|
... |
637 |
NORMALCHAR:
|
658 |
NORMALCHAR:
|
638 |
if (m_inNumber && c != 'e' && c != 'E') {
|
659 |
if (m_inNumber && c != 'e' && c != 'E') {
|
639 |
m_inNumber = false;
|
660 |
m_inNumber = false;
|
640 |
}
|
661 |
}
|
641 |
m_wordLen += it.appendchartostring(m_span);
|
662 |
m_wordLen += it.appendchartostring(m_span);
|
|
|
663 |
INC_WORDCHARS;
|
642 |
nonalnumcnt = 0;
|
664 |
nonalnumcnt = 0;
|
643 |
break;
|
665 |
break;
|
644 |
}
|
666 |
}
|
645 |
softhyphenpending = false;
|
667 |
softhyphenpending = false;
|
646 |
}
|
668 |
}
|
|
... |
|
... |
736 |
}
|
758 |
}
|
737 |
}
|
759 |
}
|
738 |
|
760 |
|
739 |
m_span.erase();
|
761 |
m_span.erase();
|
740 |
m_inNumber = false;
|
762 |
m_inNumber = false;
|
741 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
|
763 |
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
|
742 |
m_spanpos = m_wordpos;
|
764 |
m_spanpos = m_wordpos;
|
743 |
*cp = c;
|
765 |
*cp = c;
|
744 |
return true;
|
766 |
return true;
|
745 |
}
|
767 |
}
|
746 |
|
768 |
|
|
... |
|
... |
862 |
#include <stdlib.h>
|
884 |
#include <stdlib.h>
|
863 |
#include <unistd.h>
|
885 |
#include <unistd.h>
|
864 |
#include <errno.h>
|
886 |
#include <errno.h>
|
865 |
#include <fcntl.h>
|
887 |
#include <fcntl.h>
|
866 |
#include <string.h>
|
888 |
#include <string.h>
|
|
|
889 |
#include <math.h>
|
867 |
|
890 |
|
868 |
#include <iostream>
|
891 |
#include <iostream>
|
869 |
|
892 |
|
870 |
#include "textsplit.h"
|
893 |
#include "textsplit.h"
|
871 |
#include "readfile.h"
|
894 |
#include "readfile.h"
|
|
... |
|
... |
878 |
|
901 |
|
879 |
class myTermProc : public Rcl::TermProc {
|
902 |
class myTermProc : public Rcl::TermProc {
|
880 |
int first;
|
903 |
int first;
|
881 |
bool nooutput;
|
904 |
bool nooutput;
|
882 |
public:
|
905 |
public:
|
883 |
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
906 |
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
884 |
void setNoOut(bool val) {nooutput = val;}
|
907 |
void setNoOut(bool val) {nooutput = val;}
|
885 |
virtual bool takeword(const string &term, int pos, int bs, int be)
|
908 |
virtual bool takeword(const string &term, int pos, int bs, int be)
|
886 |
{
|
909 |
{
|
887 |
if (nooutput)
|
910 |
if (nooutput)
|
888 |
return true;
|
911 |
return true;
|
|
... |
|
... |
1056 |
|
1079 |
|
1057 |
if (op_flags & OPT_q)
|
1080 |
if (op_flags & OPT_q)
|
1058 |
printproc.setNoOut(true);
|
1081 |
printproc.setNoOut(true);
|
1059 |
|
1082 |
|
1060 |
splitter.text_to_words(data);
|
1083 |
splitter.text_to_words(data);
|
1061 |
|
1084 |
#ifdef TEXTSPLIT_STATS
|
|
|
1085 |
TextSplit::Stats::Values v = splitter.getStats();
|
|
|
1086 |
cout << "Average length: "
|
|
|
1087 |
<< v.avglen
|
|
|
1088 |
<< " Standard deviation: "
|
|
|
1089 |
<< v.sigma
|
|
|
1090 |
<< " Coef of variation "
|
|
|
1091 |
<< v.sigma / v.avglen
|
|
|
1092 |
<< endl;
|
|
|
1093 |
#endif
|
1062 |
}
|
1094 |
}
|
1063 |
}
|
1095 |
}
|
1064 |
#endif // TEST
|
1096 |
#endif // TEST
|