recoll / Code / Diff of /src/common/textsplit.h

Diff of /src/common/textsplit.h [2b2cfd] .. [ece153]

Switch to unified view


...
#include <string>
#include <vector>

using std::string;
using std::vector;
using std::pair;

class Utf8Iter;

/** 
 * Split text into words. 
...
    /** Switch the o_noNumbers setting on. Nothing here switches it back off. */
    static void noNumbers() { o_noNumbers = true; }

    /** Splitting modes, passed to the TextSplit constructor. */
    enum Flags {
        // Default mode: both spans and their component words are
        // returned (a_b, a, b).
        TXTS_NONE = 0,
        // Return maximum spans only (a@b.com, but not a, b, or com).
        TXTS_ONLYSPANS = 1 << 0,
        // Special mode: return atomic words only (a, b, com). Not used
        // for indexing; used for position computation during abstract
        // generation.
        TXTS_NOSPANS = 1 << 1,
        // Treat wildcard characters as letters. Used together with
        // TXTS_ONLYSPANS when parsing a user query (never alone).
        TXTS_KEEPWILD = 1 << 2
    };

    
    /**
     * Build a splitter.
     *
     * @param flags splitting mode, a Flags value (TXTS_KEEPWILD is meant
     *        to be combined with TXTS_ONLYSPANS). Defaults to TXTS_NONE.
     *
     * The maximum word length starts out at 40 and the previous position
     * at -1.
     */
    TextSplit(Flags flags = Flags(TXTS_NONE))
        : m_flags(flags),
          m_maxWordLength(40),
          m_prevpos(-1)
    {}
...
    // Maximum word length; the constructor sets it to 40.
    // NOTE(review): unit (bytes vs. characters) is not visible here -- confirm.
    int           m_maxWordLength;

    // Current span. Might be jf.dockes@wanadoo.f
    string        m_span; 

    // One pair of unsigned values per word of the current span.
    // NOTE(review): presumably (start, end) or (start, length) byte
    // offsets into m_span -- confirm against words_from_span().
    vector <pair<unsigned int, unsigned int> > m_words_in_span;

    // Current word: no punctuation at all in there. Byte offset
    // relative to the current span and byte length
    int           m_wordStart;
    unsigned int  m_wordLen;

...

    // This processes cjk text:
    // NOTE(review): both arguments are pointers, so the iterator and the
    // code point can presumably be updated for the caller -- confirm
    // against the implementation.
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

    // Term emission helpers.
    // NOTE(review): pos looks like a term position and bs/be/bp like byte
    // offsets (start/end) -- confirm against the implementation.
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
    bool doemit(bool spanerase, int bp);
    void discardspan();
    // NOTE(review): presumably tests whether the current span is an
    // acronym and hands the result back through *acronym -- confirm.
    bool span_is_acronym(std::string *acronym);
    // NOTE(review): presumably produces the individual words of the
    // current span -- confirm.
    bool words_from_span();
};

#endif /* _TEXTSPLIT_H_INCLUDED_ */

	a/src/common/textsplit.h		b/src/common/textsplit.h
	...		...
22	#include <string>	22	#include <string>
23	#include <vector>	23	#include <vector>
24		24
25	using std::string;	25	using std::string;
26	using std::vector;	26	using std::vector;
		27	using std::pair;
27		28
28	class Utf8Iter;	29	class Utf8Iter;
29		30
30	/**	31	/**
31	* Split text into words.	32	* Split text into words.
	...		...
53	static void noNumbers()	54	static void noNumbers()
54	{	55	{
55	o_noNumbers = true;	56	o_noNumbers = true;
56	}	57	}
57		58
58	enum Flags {TXTS_NONE = 0,	59	enum Flags {
59	TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)	60	// Default: will return spans and words (a_b, a, b)
60	TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)	61	TXTS_NONE = 0,
61	TXTS_KEEPWILD = 4 // Handle wildcards as letters	62	// Only return maximum spans (a@b.com, not a, b, or com)
		63	TXTS_ONLYSPANS = 1,
		64	// Special: Only return atomic words (a, b, com). This is not
		65	// used for indexing, but for position computation during
		66	// abstract generation,
		67	TXTS_NOSPANS = 2,
		68	// Handle wildcards as letters. This is used with ONLYSPANS
		69	// for parsing a user query (never alone).
		70	TXTS_KEEPWILD = 4
62	};	71	};
63
64		72
65	TextSplit(Flags flags = Flags(TXTS_NONE))	73	TextSplit(Flags flags = Flags(TXTS_NONE))
66	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)	74	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
67	{	75	{
68	}	76	}
	...		...
175	int m_maxWordLength;	183	int m_maxWordLength;
176		184
177	// Current span. Might be jf.dockes@wanadoo.f	185	// Current span. Might be jf.dockes@wanadoo.f
178	string m_span;	186	string m_span;
179		187
		188	vector <pair<unsigned int, unsigned int> > m_words_in_span;
		189
180	// Current word: no punctuation at all in there. Byte offset	190	// Current word: no punctuation at all in there. Byte offset
181	// relative to the current span and byte length	191	// relative to the current span and byte length
182	int m_wordStart;	192	int m_wordStart;
183	unsigned int m_wordLen;	193	unsigned int m_wordLen;
184		194
	...		...
205		215
206	// This processes cjk text:	216	// This processes cjk text:
207	bool cjk_to_words(Utf8Iter *it, unsigned int *cp);	217	bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
208		218
209	bool emitterm(bool isspan, string &term, int pos, int bs, int be);	219	bool emitterm(bool isspan, string &term, int pos, int bs, int be);
210	bool doemit(bool spanerase, int bp, bool spanemit=false);	220	bool doemit(bool spanerase, int bp);
211	void discardspan();	221	void discardspan();
		222	bool span_is_acronym(std::string *acronym);
		223	bool words_from_span();
212	};	224	};
213		225
214	#endif /* _TEXTSPLIT_H_INCLUDED_ */	226	#endif /* _TEXTSPLIT_H_INCLUDED_ */