Parent: [4eaf12] (diff)

Child: [b4c7ef] (diff)

Download this file

textsplit.h    139 lines (113 with data), 4.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
#include <string>
#include <vector>
using std::string;
using std::vector;
class Utf8Iter;
/**
* Split text into words.
* See comments at top of .cpp for more explanations.
* This uses a callback function. It could be done with an iterator instead,
* but 'ts much simpler this way...
*/
class TextSplit {
public:
// Should we activate special processing of Chinese characters ? This
// needs a little more cpu, so it can be turned off globally. This is set
// by rclconfig, changing it means reindexing
static bool o_processCJK;
static unsigned int o_CJKNgramLen;
static const unsigned int o_CJKMaxNgramLen = 5;
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
{
o_processCJK = onoff;
o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
ngramlen : o_CJKMaxNgramLen;
}
// Are we indexing numbers ? Set by rclconfig. Change needs reindex
static bool o_noNumbers;
static void noNumbers()
{
o_noNumbers = true;
}
enum Flags {TXTS_NONE = 0,
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
TXTS_KEEPWILD = 4 // Handle wildcards as letters
};
TextSplit(Flags flags = Flags(TXTS_NONE))
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
{
}
virtual ~TextSplit() {}
/** Split text, emit words and positions. */
virtual bool text_to_words(const string &in);
/** Process one output word: to be implemented by the actual user class */
virtual bool takeword(const string& term,
int pos, // term pos
int bts, // byte offset of first char in term
int bte // byte offset of first char after term
) = 0;
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
* Mostly or exclusively used with pdftoxx output. Other filters mostly
* just don't know about pages. */
virtual void newpage(int /*pos*/)
{
}
// Static utility functions:
/** Count words in string, as the splitter would generate them */
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
/** Check if this is visibly not a single block of text */
static bool hasVisibleWhite(const string &in);
/** Split text span into strings, at white space, allowing for substrings
* quoted with " . Escaping with \ works as usual inside the quoted areas.
* This has to be kept separate from smallut.cpp's stringsToStrings, which
* basically works only if whitespace is ascii, and which processes
* non-utf-8 input (iso-8859 config files work ok). This hopefully
* handles all Unicode whitespace, but needs correct utf-8 input
*/
static bool stringToStrings(const string &s, vector<string> &tokens);
/** Is char CJK ? */
static bool isCJK(int c);
private:
Flags m_flags;
int m_maxWordLength;
// Current span. Might be jf.dockes@wanadoo.f
string m_span;
// Current word: no punctuation at all in there. Byte offset
// relative to the current span and byte length
int m_wordStart;
unsigned int m_wordLen;
// Currently inside number
bool m_inNumber;
// Term position of current word and span
int m_wordpos;
int m_spanpos;
// It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this
int m_prevpos;
unsigned int m_prevlen;
// This processes cjk text:
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp, bool spanemit=false);
void discardspan();
};
#endif /* _TEXTSPLIT_H_INCLUDED_ */