Switch to unified view

a/src/common/textsplit.h b/src/common/textsplit.h
...
...
90
              ) = 0; 
90
              ) = 0; 
91
91
92
    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
92
    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
93
     * Mostly or exclusively used with pdftoxx output. Other filters mostly 
93
     * Mostly or exclusively used with pdftoxx output. Other filters mostly 
94
     * just don't know about pages. */
94
     * just don't know about pages. */
95
    virtual void newpage(int /*pos*/)
95
    virtual void newpage(int /*pos*/) {
96
    {
97
    }
96
    }
98
97
99
    // Static utility functions:
98
    // Static utility functions:
100
99
101
    /** Count words in string, as the splitter would generate them */
100
    /** Count words in string, as the splitter would generate them */
...
...
109
     * This has to be kept separate from smallut.cpp's stringsToStrings, which
108
     * This has to be kept separate from smallut.cpp's stringsToStrings, which
110
     * basically works only if whitespace is ascii, and which processes 
109
     * basically works only if whitespace is ascii, and which processes 
111
     * non-utf-8 input (iso-8859 config files work ok). This hopefully
110
     * non-utf-8 input (iso-8859 config files work ok). This hopefully
112
     * handles all Unicode whitespace, but needs correct utf-8 input
111
     * handles all Unicode whitespace, but needs correct utf-8 input
113
     */
112
     */
114
    static bool stringToStrings(const std::string &s, std::vector<std::string> &tokens);
113
    static bool stringToStrings(const std::string &s,
114
                                std::vector<std::string> &tokens);
115
115
116
    /** Is char CJK ? */
116
    /** Is char CJK ? (excluding Katakana) */
117
    static bool isCJK(int c);
117
    static bool isCJK(int c);
118
    static bool isKATAKANA(int c);
118
119
119
    /** Statistics about word length (average and dispersion) can
120
    /** Statistics about word length (average and dispersion) can
120
     * detect bad data like undecoded base64 or other mis-identified
121
     * detect bad data like undecoded base64 or other mis-identified
121
     * pieces of data taken as text. In practice, this keeps some junk out 
122
     * pieces of data taken as text. In practice, this keeps some junk out 
122
     * of the index, but does not decrease the index size much, and is
123
     * of the index, but does not decrease the index size much, and is