|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
|
|
22 |
#include <list>
|
22 |
#ifndef NO_NAMESPACES
|
23 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
24 |
using std::string;
|
|
|
25 |
using std::list;
|
24 |
#endif
|
26 |
#endif
|
25 |
|
27 |
|
26 |
/**
|
28 |
/**
|
27 |
* Function class whose takeword method is called for every detected word while splitting text.
|
29 |
* Function class whose takeword method is called for every detected word while splitting text.
|
28 |
*/
|
30 |
*/
|
|
... |
|
... |
72 |
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
74 |
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
73 |
m_prevpos(-1)
|
75 |
m_prevpos(-1)
|
74 |
{
|
76 |
{
|
75 |
}
|
77 |
}
|
76 |
|
78 |
|
77 |
/**
|
|
|
78 |
* Split text, emit words and positions.
|
79 |
/** Split text, emit words and positions. */
|
79 |
*/
|
|
|
80 |
bool text_to_words(const string &in);
|
80 |
bool text_to_words(const string &in);
|
81 |
|
81 |
|
82 |
// Utility functions: these do not need the user to set up a callback
|
82 |
// Utility functions: these do not need the user to set up a callback
|
83 |
// etc.
|
83 |
// etc.
|
|
|
84 |
|
|
|
85 |
/** Count words in string, as the splitter would generate them */
|
84 |
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
86 |
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
85 |
|
87 |
|
|
|
88 |
/** Check if this is visibly not a single block of text */
|
|
|
89 |
static bool hasVisibleWhite(const string &in);
|
|
|
90 |
|
|
|
91 |
/** Split text span into strings, at white space, allowing for substrings
|
|
|
92 |
* quoted with " . Escaping with \ works as usual inside the quoted areas.
|
|
|
93 |
* This has to be kept separate from smallut.cpp's stringsToStrings, which
|
|
|
94 |
* basically works only if whitespace is ascii, and which processes
|
|
|
95 |
* non-utf-8 input (iso-8859 config files work ok). This hopefully
|
|
|
96 |
* handles all Unicode whitespace, but needs correct utf-8 input
|
|
|
97 |
*/
|
|
|
98 |
static bool stringToStrings(const string &s, list<string> &tokens);
|
|
|
99 |
|
86 |
private:
|
100 |
private:
|
87 |
Flags m_flags;
|
101 |
Flags m_flags;
|
88 |
TextSplitCB *m_cb;
|
102 |
TextSplitCB *m_cb;
|
89 |
int m_maxWordLength;
|
103 |
int m_maxWordLength;
|
90 |
|
104 |
|