recoll / Code / Diff of /src/common/textsplit.h

Diff of /src/common/textsplit.h [9d8ce3] .. [9661a4]

Switch to unified view


...
              ) = 0; 

    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
     * Mostly or exclusively used with pdftoxx output. Other filters mostly 
     * just don't know about pages. */
    virtual void newpage(int /*pos*/) {
        // Intentionally empty: the default implementation ignores the event.
    }

    // Static utility functions:

    /** Count words in string, as the splitter would generate them */
...
     * This has to be kept separate from smallut.cpp's stringsToStrings, which
     * basically works only if whitespace is ascii, and which processes 
     * non-utf-8 input (iso-8859 config files work ok). This hopefully
     * handles all Unicode whitespace, but needs correct utf-8 input
     */
    static bool stringToStrings(const std::string &s,
                                std::vector<std::string> &tokens);

    /** Is char CJK ? (excluding Katakana) */
    static bool isCJK(int c);
    /** NOTE(review): presumably tells whether c is a Katakana character, the
     * case isCJK() excludes -- confirm against the implementation. */
    static bool isKATAKANA(int c);

    /** Statistics about word length (average and dispersion) can
     * detect bad data like undecoded base64 or other mis-identified
     * pieces of data taken as text. In practice, this keeps some junk out 
     * of the index, but does not decrease the index size much, and is

	a/src/common/textsplit.h		b/src/common/textsplit.h
	...		...
90	) = 0;	90	) = 0;
91		91
92	/** Called when we encounter formfeed \f 0x0c. Override to use the event.	92	/** Called when we encounter formfeed \f 0x0c. Override to use the event.
93	* Mostly or exclusively used with pdftoxx output. Other filters mostly	93	* Mostly or exclusively used with pdftoxx output. Other filters mostly
94	* just don't know about pages. */	94	* just don't know about pages. */
95	virtual void newpage(int /*pos*/)	95	virtual void newpage(int /*pos*/) {
96	{
97	}	96	}
98		97
99	// Static utility functions:	98	// Static utility functions:
100		99
101	/** Count words in string, as the splitter would generate them */	100	/** Count words in string, as the splitter would generate them */
	...		...
109	* This has to be kept separate from smallut.cpp's stringsToStrings, which	108	* This has to be kept separate from smallut.cpp's stringsToStrings, which
110	* basically works only if whitespace is ascii, and which processes	109	* basically works only if whitespace is ascii, and which processes
111	* non-utf-8 input (iso-8859 config files work ok). This hopefully	110	* non-utf-8 input (iso-8859 config files work ok). This hopefully
112	* handles all Unicode whitespace, but needs correct utf-8 input	111	* handles all Unicode whitespace, but needs correct utf-8 input
113	*/	112	*/
114	static bool stringToStrings(const std::string &s, std::vector<std::string> &tokens);	113	static bool stringToStrings(const std::string &s,
		114	std::vector<std::string> &tokens);
115		115
116	/** Is char CJK ? */	116	/** Is char CJK ? (excluding Katakana) */
117	static bool isCJK(int c);	117	static bool isCJK(int c);
		118	static bool isKATAKANA(int c);
118		119
119	/** Statistics about word length (average and dispersion) can	120	/** Statistics about word length (average and dispersion) can
120	* detect bad data like undecoded base64 or other mis-identified	121	* detect bad data like undecoded base64 or other mis-identified
121	* pieces of data taken as text. In practise, this keeps some junk out	122	* pieces of data taken as text. In practise, this keeps some junk out
122	* of the index, but does not decrease the index size much, and is	123	* of the index, but does not decrease the index size much, and is