|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
90 |
) = 0;
|
90 |
) = 0;
|
91 |
|
91 |
|
92 |
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
92 |
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
93 |
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
93 |
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
94 |
* just don't know about pages. */
|
94 |
* just don't know about pages. */
|
95 |
virtual void newpage(int /*pos*/)
|
95 |
virtual void newpage(int /*pos*/) {
|
96 |
{
|
|
|
97 |
}
|
96 |
}
|
98 |
|
97 |
|
99 |
// Static utility functions:
|
98 |
// Static utility functions:
|
100 |
|
99 |
|
101 |
/** Count words in string, as the splitter would generate them */
|
100 |
/** Count words in string, as the splitter would generate them */
|
|
... |
|
... |
109 |
* This has to be kept separate from smallut.cpp's stringsToStrings, which
|
108 |
* This has to be kept separate from smallut.cpp's stringsToStrings, which
|
110 |
* basically works only if whitespace is ascii, and which processes
|
109 |
* basically works only if whitespace is ascii, and which processes
|
111 |
* non-utf-8 input (iso-8859 config files work ok). This hopefully
|
110 |
* non-utf-8 input (iso-8859 config files work ok). This hopefully
|
112 |
* handles all Unicode whitespace, but needs correct utf-8 input
|
111 |
* handles all Unicode whitespace, but needs correct utf-8 input
|
113 |
*/
|
112 |
*/
|
114 |
static bool stringToStrings(const std::string &s, std::vector<std::string> &tokens);
|
113 |
static bool stringToStrings(const std::string &s,
|
|
|
114 |
std::vector<std::string> &tokens);
|
115 |
|
115 |
|
116 |
/** Is char CJK ? */
|
116 |
/** Is char CJK ? (excluding Katakana) */
|
117 |
static bool isCJK(int c);
|
117 |
static bool isCJK(int c);
|
|
|
118 |
static bool isKATAKANA(int c);
|
118 |
|
119 |
|
119 |
/** Statistics about word length (average and dispersion) can
|
120 |
/** Statistics about word length (average and dispersion) can
|
120 |
* detect bad data like undecoded base64 or other mis-identified
|
121 |
* detect bad data like undecoded base64 or other mis-identified
|
121 |
* pieces of data taken as text. In practice, this keeps some junk out
|
122 |
* pieces of data taken as text. In practice, this keeps some junk out
|
122 |
* of the index, but does not decrease the index size much, and is
|
123 |
* of the index, but does not decrease the index size much, and is
|