recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [d06e45] .. [2b2cfd]

Switch to unified view


...
                int btstart, int btend)
{
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));

    unsigned int l = w.length();

#ifdef TEXTSPLIT_STATS
    // Update word length statistics. Do this before we filter out
    // long words because stats are used to detect bad text
    if (!isspan || m_wordLen == m_span.length())
  m_stats.newsamp(m_wordChars);
#endif

    if (l > 0 && l < (unsigned)m_maxWordLength) {
    // 1 byte word: we index single ascii letters and digits, but
    // nothing else. We might want to turn this into a test for a
    // single utf8 character instead ?
    if (l == 1) {
...
    }

    // Adjust state
    if (m_wordLen) {
    m_wordpos++;
  m_wordLen = m_wordChars = 0;
    }
    if (spanerase) {
    discardspan();
    } else {
    m_wordStart = m_span.length();
...
/**
 * Throw away the span being accumulated and reset the per-word bookkeeping:
 * the word start, word length and word character counters go back to zero,
 * the span position is set to the current word position, and the span text
 * is emptied.
 */
void TextSplit::discardspan()
{
    // No word is in progress any more.
    m_wordChars = 0;
    m_wordLen = 0;
    m_wordStart = 0;

    // The span position now matches the current word position, and the
    // span holds no text.
    m_spanpos = m_wordpos;
    m_span.erase();
}

static inline bool isalphanum(int what, unsigned int flgs)
{
    return what == A_LLETTER || what == A_ULETTER ||
...
}
/**
 * Tell whether a character class counts as a digit.
 *
 * @param what character class value to test.
 * @param flgs splitter option bits; only TextSplit::TXTS_KEEPWILD is examined.
 * @return true when what is DIGIT, or when what is WILD and the
 *         TXTS_KEEPWILD bit is set in flgs.
 */
static inline bool isdigit(int what, unsigned int flgs)
{
    if (what == DIGIT)
        return true;
    if (what != WILD)
        return false;
    // WILD is only accepted as a digit when TXTS_KEEPWILD is set.
    return (flgs & TextSplit::TXTS_KEEPWILD) != 0;
}

// INC_WORDCHARS bumps the m_wordChars counter when TEXTSPLIT_STATS is
// defined, and expands to nothing otherwise. It is meant to be used as a
// standalone statement: INC_WORDCHARS;
#if defined(TEXTSPLIT_STATS)
#define INC_WORDCHARS (++m_wordChars)
#else
#define INC_WORDCHARS
#endif

/** 
 * Splitting a text into terms to be indexed.
 * We basically emit a word every time we see a separator, but some chars are
 * handled specially so that special cases, ie, c++ and jfd@recoll.com etc, 
...
    if (in.empty())
    return true;

    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos 
  = m_spanpos = 0;
    int curspanglue = 0;
    bool pagepending = false;
    bool softhyphenpending = false;

    // Running count of non-alphanum chars. Reset when we see one;
...
        continue;
    case DIGIT:
        if (m_wordLen == 0)
        m_inNumber = true;
        m_wordLen += it.appendchartostring(m_span);
      INC_WORDCHARS;
        nonalnumcnt = 0;
        break;

    case SPACE:
    SPACE:
...
        // it's going to be a number
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
            // -10
            m_inNumber = true;
            m_wordLen += it.appendchartostring(m_span);
          INC_WORDCHARS;
        } else {
            goto SPACE;
        } 
        } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
                      m_span[m_span.length() - 1] == 'E')) {
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
            m_wordLen += it.appendchartostring(m_span);
          INC_WORDCHARS;
        } else {
            goto SPACE;
        }
        } else {
        goto SPACE;
...
        int nextwhat = whatcc(nextc);
        if (m_inNumber) {
        if (!isdigit(nextwhat, m_flags))
            goto SPACE;
        m_wordLen += it.appendchartostring(m_span);
      INC_WORDCHARS;
        curspanglue = cc;
        break;
        } else {
        // If . inside a word, it's spanglue, else, it's whitespace. 
        // We also keep an initial '.' for catching .net, but this adds
...
        if (cc == '.') {
                    // Check for number like .1
                    if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
                        m_inNumber = true;
                        m_wordLen += it.appendchartostring(m_span);
          INC_WORDCHARS;
                        curspanglue = cc;
                        break;
                    }
                            
            if (m_wordLen) {
...
        // Keep it only at end of word ... Special case for c# you see...
        if (m_wordLen > 0) {
        int w = whatcc(it[it.getCpos()+1]);
        if (w == SPACE || w == '\n' || w == '\r') {
            m_wordLen += it.appendchartostring(m_span);
          INC_WORDCHARS;
            break;
        }
        }
        goto SPACE;
        break;
...
    NORMALCHAR:
            if (m_inNumber && c != 'e' && c != 'E') {
                m_inNumber = false;
            }
        m_wordLen += it.appendchartostring(m_span);
      INC_WORDCHARS;
        nonalnumcnt = 0;
        break;
    }
    softhyphenpending = false;
    }
...
    }
    }

    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
    m_spanpos = m_wordpos;
    *cp = c;
    return true;
}

...
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <math.h>

#include <iostream>

#include "textsplit.h"
#include "readfile.h"
...

class myTermProc : public Rcl::TermProc {
    int first;
    bool nooutput;
public:
    myTermProc() : TermProc(0), first(1), nooutput(false) {}
    void setNoOut(bool val) {nooutput = val;}
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
    if (nooutput)
        return true;
...

        if (op_flags & OPT_q)
            printproc.setNoOut(true);

    splitter.text_to_words(data);
#ifdef TEXTSPLIT_STATS
  TextSplit::Stats::Values v = splitter.getStats();
  cout << "Average length: " 
       <<  v.avglen
       << " Standard deviation: " 
       << v.sigma
       << " Coef of variation "
       << v.sigma / v.avglen
       << endl;
#endif
    }    
}
#endif // TEST

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
203	int btstart, int btend)	203	int btstart, int btend)
204	{	204	{
205	LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));	205	LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
206		206
207	unsigned int l = w.length();	207	unsigned int l = w.length();
		208
		209	#ifdef TEXTSPLIT_STATS
		210	// Update word length statistics. Do this before we filter out
		211	// long words because stats are used to detect bad text
		212	if (!isspan \|\| m_wordLen == m_span.length())
		213	m_stats.newsamp(m_wordChars);
		214	#endif
		215
208	if (l > 0 && l < (unsigned)m_maxWordLength) {	216	if (l > 0 && l < (unsigned)m_maxWordLength) {
209	// 1 byte word: we index single ascii letters and digits, but	217	// 1 byte word: we index single ascii letters and digits, but
210	// nothing else. We might want to turn this into a test for a	218	// nothing else. We might want to turn this into a test for a
211	// single utf8 character instead ?	219	// single utf8 character instead ?
212	if (l == 1) {	220	if (l == 1) {
	...		...
314	}	322	}
315		323
316	// Adjust state	324	// Adjust state
317	if (m_wordLen) {	325	if (m_wordLen) {
318	m_wordpos++;	326	m_wordpos++;
319	m_wordLen = 0;	327	m_wordLen = m_wordChars = 0;
320	}	328	}
321	if (spanerase) {	329	if (spanerase) {
322	discardspan();	330	discardspan();
323	} else {	331	} else {
324	m_wordStart = m_span.length();	332	m_wordStart = m_span.length();
	...		...
330	void TextSplit::discardspan()	338	void TextSplit::discardspan()
331	{	339	{
332	m_span.erase();	340	m_span.erase();
333	m_spanpos = m_wordpos;	341	m_spanpos = m_wordpos;
334	m_wordStart = 0;	342	m_wordStart = 0;
335	m_wordLen = 0;	343	m_wordLen = m_wordChars = 0;
336	}	344	}
337		345
338	static inline bool isalphanum(int what, unsigned int flgs)	346	static inline bool isalphanum(int what, unsigned int flgs)
339	{	347	{
340	return what == A_LLETTER \|\| what == A_ULETTER \|\|	348	return what == A_LLETTER \|\| what == A_ULETTER \|\|
	...		...
343	}	351	}
344	static inline bool isdigit(int what, unsigned int flgs)	352	static inline bool isdigit(int what, unsigned int flgs)
345	{	353	{
346	return what == DIGIT \|\| ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);	354	return what == DIGIT \|\| ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
347	}	355	}
		356
		357	#ifdef TEXTSPLIT_STATS
		358	#define INC_WORDCHARS ++m_wordChars
		359	#else
		360	#define INC_WORDCHARS
		361	#endif
348		362
349	/**	363	/**
350	* Splitting a text into terms to be indexed.	364	* Splitting a text into terms to be indexed.
351	* We basically emit a word every time we see a separator, but some chars are	365	* We basically emit a word every time we see a separator, but some chars are
352	* handled specially so that special cases, ie, c++ and jfd@recoll.com etc,	366	* handled specially so that special cases, ie, c++ and jfd@recoll.com etc,
	...		...
364	if (in.empty())	378	if (in.empty())
365	return true;	379	return true;
366		380
367	m_span.erase();	381	m_span.erase();
368	m_inNumber = false;	382	m_inNumber = false;
369	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;	383	m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
		384	= m_spanpos = 0;
370	int curspanglue = 0;	385	int curspanglue = 0;
371	bool pagepending = false;	386	bool pagepending = false;
372	bool softhyphenpending = false;	387	bool softhyphenpending = false;
373		388
374	// Running count of non-alphanum chars. Reset when we see one;	389	// Running count of non-alphanum chars. Reset when we see one;
	...		...
421	continue;	436	continue;
422	case DIGIT:	437	case DIGIT:
423	if (m_wordLen == 0)	438	if (m_wordLen == 0)
424	m_inNumber = true;	439	m_inNumber = true;
425	m_wordLen += it.appendchartostring(m_span);	440	m_wordLen += it.appendchartostring(m_span);
		441	INC_WORDCHARS;
426	nonalnumcnt = 0;	442	nonalnumcnt = 0;
427	break;	443	break;
428		444
429	case SPACE:	445	case SPACE:
430	SPACE:	446	SPACE:
	...		...
456	// it's going to be to be a number	472	// it's going to be to be a number
457	if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {	473	if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
458	// -10	474	// -10
459	m_inNumber = true;	475	m_inNumber = true;
460	m_wordLen += it.appendchartostring(m_span);	476	m_wordLen += it.appendchartostring(m_span);
		477	INC_WORDCHARS;
461	} else {	478	} else {
462	goto SPACE;	479	goto SPACE;
463	}	480	}
464	} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' \|\|	481	} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' \|\|
465	m_span[m_span.length() - 1] == 'E')) {	482	m_span[m_span.length() - 1] == 'E')) {
466	if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {	483	if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
467	m_wordLen += it.appendchartostring(m_span);	484	m_wordLen += it.appendchartostring(m_span);
		485	INC_WORDCHARS;
468	} else {	486	} else {
469	goto SPACE;	487	goto SPACE;
470	}	488	}
471	} else {	489	} else {
472	goto SPACE;	490	goto SPACE;
	...		...
480	int nextwhat = whatcc(nextc);	498	int nextwhat = whatcc(nextc);
481	if (m_inNumber) {	499	if (m_inNumber) {
482	if (!isdigit(nextwhat, m_flags))	500	if (!isdigit(nextwhat, m_flags))
483	goto SPACE;	501	goto SPACE;
484	m_wordLen += it.appendchartostring(m_span);	502	m_wordLen += it.appendchartostring(m_span);
		503	INC_WORDCHARS;
485	curspanglue = cc;	504	curspanglue = cc;
486	break;	505	break;
487	} else {	506	} else {
488	// If . inside a word, it's spanglue, else, it's whitespace.	507	// If . inside a word, it's spanglue, else, it's whitespace.
489	// We also keep an initial '.' for catching .net, but this adds	508	// We also keep an initial '.' for catching .net, but this adds
	...		...
499	if (cc == '.') {	518	if (cc == '.') {
500	// Check for number like .1	519	// Check for number like .1
501	if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {	520	if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
502	m_inNumber = true;	521	m_inNumber = true;
503	m_wordLen += it.appendchartostring(m_span);	522	m_wordLen += it.appendchartostring(m_span);
		523	INC_WORDCHARS;
504	curspanglue = cc;	524	curspanglue = cc;
505	break;	525	break;
506	}	526	}
507		527
508	if (m_wordLen) {	528	if (m_wordLen) {
	...		...
565	// Keep it only at end of word ... Special case for c# you see...	585	// Keep it only at end of word ... Special case for c# you see...
566	if (m_wordLen > 0) {	586	if (m_wordLen > 0) {
567	int w = whatcc(it[it.getCpos()+1]);	587	int w = whatcc(it[it.getCpos()+1]);
568	if (w == SPACE \|\| w == '\n' \|\| w == '\r') {	588	if (w == SPACE \|\| w == '\n' \|\| w == '\r') {
569	m_wordLen += it.appendchartostring(m_span);	589	m_wordLen += it.appendchartostring(m_span);
		590	INC_WORDCHARS;
570	break;	591	break;
571	}	592	}
572	}	593	}
573	goto SPACE;	594	goto SPACE;
574	break;	595	break;
	...		...
637	NORMALCHAR:	658	NORMALCHAR:
638	if (m_inNumber && c != 'e' && c != 'E') {	659	if (m_inNumber && c != 'e' && c != 'E') {
639	m_inNumber = false;	660	m_inNumber = false;
640	}	661	}
641	m_wordLen += it.appendchartostring(m_span);	662	m_wordLen += it.appendchartostring(m_span);
		663	INC_WORDCHARS;
642	nonalnumcnt = 0;	664	nonalnumcnt = 0;
643	break;	665	break;
644	}	666	}
645	softhyphenpending = false;	667	softhyphenpending = false;
646	}	668	}
	...		...
736	}	758	}
737	}	759	}
738		760
739	m_span.erase();	761	m_span.erase();
740	m_inNumber = false;	762	m_inNumber = false;
741	m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;	763	m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
742	m_spanpos = m_wordpos;	764	m_spanpos = m_wordpos;
743	*cp = c;	765	*cp = c;
744	return true;	766	return true;
745	}	767	}
746		768
	...		...
862	#include <stdlib.h>	884	#include <stdlib.h>
863	#include <unistd.h>	885	#include <unistd.h>
864	#include <errno.h>	886	#include <errno.h>
865	#include <fcntl.h>	887	#include <fcntl.h>
866	#include <string.h>	888	#include <string.h>
		889	#include <math.h>
867		890
868	#include <iostream>	891	#include <iostream>
869		892
870	#include "textsplit.h"	893	#include "textsplit.h"
871	#include "readfile.h"	894	#include "readfile.h"
	...		...
878		901
879	class myTermProc : public Rcl::TermProc {	902	class myTermProc : public Rcl::TermProc {
880	int first;	903	int first;
881	bool nooutput;	904	bool nooutput;
882	public:	905	public:
883	myTermProc() : TermProc(0), first(1), nooutput(false) {}	906	myTermProc() : TermProc(0), first(1), nooutput(false) {}
884	void setNoOut(bool val) {nooutput = val;}	907	void setNoOut(bool val) {nooutput = val;}
885	virtual bool takeword(const string &term, int pos, int bs, int be)	908	virtual bool takeword(const string &term, int pos, int bs, int be)
886	{	909	{
887	if (nooutput)	910	if (nooutput)
888	return true;	911	return true;
	...		...
1056		1079
1057	if (op_flags & OPT_q)	1080	if (op_flags & OPT_q)
1058	printproc.setNoOut(true);	1081	printproc.setNoOut(true);
1059		1082
1060	splitter.text_to_words(data);	1083	splitter.text_to_words(data);
1061		1084	#ifdef TEXTSPLIT_STATS
		1085	TextSplit::Stats::Values v = splitter.getStats();
		1086	cout << "Average length: "
		1087	<< v.avglen
		1088	<< " Standard deviation: "
		1089	<< v.sigma
		1090	<< " Coef of variation "
		1091	<< v.sigma / v.avglen
		1092	<< endl;
		1093	#endif
1062	}	1094	}
1063	}	1095	}
1064	#endif // TEST	1096	#endif // TEST