recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [b28eaf] .. [297ff2]

Switch to unified view


...
// of Unicode properties, but seems to do the job well enough in most
// common cases
static set<unsigned int> unicign;
static set<unsigned int> visiblewhite;

// Helper whose constructor fills the ASCII character class table
// (charclasses) and the two code point sets (unicign, visiblewhite).
class CharClassInit {
public:
    CharClassInit()
    {
        // Every slot starts out as SPACE; the loops below override the
        // characters which belong to a more specific class.
        for (unsigned int slot = 0; slot < 256; slot++)
            charclasses[slot] = SPACE;

        for (const char *cp = "0123456789"; *cp; cp++)
            charclasses[int(*cp)] = DIGIT;

        for (const char *cp = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; *cp; cp++)
            charclasses[int(*cp)] = A_ULETTER;

        for (const char *cp = "abcdefghijklmnopqrstuvwxyz"; *cp; cp++)
            charclasses[int(*cp)] = A_LLETTER;

        for (const char *cp = "*?[]"; *cp; cp++)
            charclasses[int(*cp)] = WILD;

        // These characters are their own class: the table entry holds the
        // character value itself.
        for (const char *cp = ".@+-,#'_\n\r"; *cp; cp++)
            charclasses[int(*cp)] = *cp;

        // Copy the uniign array into the unicign set, then add the
        // (unsigned int)-1 value as well.
        const unsigned int ignoredCount = sizeof(uniign) / sizeof(int);
        for (unsigned int k = 0; k < ignoredCount; k++)
            unicign.insert(uniign[k]);
        unicign.insert((unsigned int)-1);

        // Copy the avsbwht array into the visiblewhite set.
        const unsigned int whiteCount = sizeof(avsbwht) / sizeof(int);
        for (unsigned int k = 0; k < whiteCount; k++)
            visiblewhite.insert(avsbwht[k]);
    }
};
// File-scope instance: its constructor runs during this translation unit's
// static initialization and fills the character class table and sets, so the
// functions below do not need an explicit setup call. The sets it fills are
// defined earlier in this file, hence constructed before this object.
static const CharClassInit charClassInitInstance;

static inline int whatcc(unsigned int c)
{
    if (c <= 127) {
    return charclasses[c]; 
...
         o_processCJK, o_CJKNgramLen,
         m_flags & TXTS_NOSPANS ? " nospans" : "",
         m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
         m_flags & TXTS_KEEPWILD ? " keepwild" : "",
         in.substr(0,50).c_str()));



    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
    int curspanglue = 0;
...
    return splitter.wcnt;
}

bool TextSplit::hasVisibleWhite(const string &in)
{

    Utf8Iter it(in);
    for (; !it.eof(); it++) {
    unsigned int c = (unsigned char)*it;
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
    if (c == (unsigned int)-1) {
...
    return false;
}

template <class T> bool u8stringToStrings(const string &s, T &tokens)
{

    Utf8Iter it(s);

    string current;
    tokens.clear();
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
58	// of Unicode properties, but seems to do the job well enough in most	58	// of Unicode properties, but seems to do the job well enough in most
59	// common cases	59	// common cases
60	static set<unsigned int> unicign;	60	static set<unsigned int> unicign;
61	static set<unsigned int> visiblewhite;	61	static set<unsigned int> visiblewhite;
62		62
63	// Set up character classes array and the additional unicode sets	63	class CharClassInit {
64	static void setcharclasses()	64	public:
65	{	65	CharClassInit()
66	static int init = 0;	66	{
67	if (init)
68	return;
69	unsigned int i;	67	unsigned int i;
70		68
71	// Set default value for all: SPACE	69	// Set default value for all: SPACE
72	for (i = 0 ; i < 256 ; i ++)	70	for (i = 0 ; i < 256 ; i ++)
73	charclasses[i] = SPACE;	71	charclasses[i] = SPACE;
74		72
75	char digits[] = "0123456789";	73	char digits[] = "0123456789";
76	for (i = 0; i < strlen(digits); i++)	74	for (i = 0; i < strlen(digits); i++)
77	charclasses[int(digits[i])] = DIGIT;	75	charclasses[int(digits[i])] = DIGIT;
78		76
79	char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";	77	char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
80	for (i = 0; i < strlen(upper); i++)	78	for (i = 0; i < strlen(upper); i++)
81	charclasses[int(upper[i])] = A_ULETTER;	79	charclasses[int(upper[i])] = A_ULETTER;
82		80
83	char lower[] = "abcdefghijklmnopqrstuvwxyz";	81	char lower[] = "abcdefghijklmnopqrstuvwxyz";
84	for (i = 0; i < strlen(lower); i++)	82	for (i = 0; i < strlen(lower); i++)
85	charclasses[int(lower[i])] = A_LLETTER;	83	charclasses[int(lower[i])] = A_LLETTER;
86		84
87	char wild[] = "*?[]";	85	char wild[] = "*?[]";
88	for (i = 0; i < strlen(wild); i++)	86	for (i = 0; i < strlen(wild); i++)
89	charclasses[int(wild[i])] = WILD;	87	charclasses[int(wild[i])] = WILD;
90		88
91	char special[] = ".@+-,#'_\n\r";	89	char special[] = ".@+-,#'_\n\r";
92	for (i = 0; i < strlen(special); i++)	90	for (i = 0; i < strlen(special); i++)
93	charclasses[int(special[i])] = special[i];	91	charclasses[int(special[i])] = special[i];
94		92
95	for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {	93	for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
96	unicign.insert(uniign[i]);	94	unicign.insert(uniign[i]);
97	}	95	}
98	unicign.insert((unsigned int)-1);	96	unicign.insert((unsigned int)-1);
99		97
100	for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {	98	for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
101	visiblewhite.insert(avsbwht[i]);	99	visiblewhite.insert(avsbwht[i]);
102	}
103
104	init = 1;
105	}	100	}
		101	}
		102	};
		103	static const CharClassInit charClassInitInstance;
106		104
107	static inline int whatcc(unsigned int c)	105	static inline int whatcc(unsigned int c)
108	{	106	{
109	if (c <= 127) {	107	if (c <= 127) {
110	return charclasses[c];	108	return charclasses[c];
	...		...
277	o_processCJK, o_CJKNgramLen,	275	o_processCJK, o_CJKNgramLen,
278	m_flags & TXTS_NOSPANS ? " nospans" : "",	276	m_flags & TXTS_NOSPANS ? " nospans" : "",
279	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",	277	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
280	m_flags & TXTS_KEEPWILD ? " keepwild" : "",	278	m_flags & TXTS_KEEPWILD ? " keepwild" : "",
281	in.substr(0,50).c_str()));	279	in.substr(0,50).c_str()));
282
283	setcharclasses();
284		280
285	m_span.erase();	281	m_span.erase();
286	m_inNumber = false;	282	m_inNumber = false;
287	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;	283	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
288	int curspanglue = 0;	284	int curspanglue = 0;
	...		...
631	return splitter.wcnt;	627	return splitter.wcnt;
632	}	628	}
633		629
634	bool TextSplit::hasVisibleWhite(const string &in)	630	bool TextSplit::hasVisibleWhite(const string &in)
635	{	631	{
636	setcharclasses();
637	Utf8Iter it(in);	632	Utf8Iter it(in);
638	for (; !it.eof(); it++) {	633	for (; !it.eof(); it++) {
639	unsigned int c = (unsigned char)*it;	634	unsigned int c = (unsigned char)*it;
640	LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));	635	LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
641	if (c == (unsigned int)-1) {	636	if (c == (unsigned int)-1) {
	...		...
648	return false;	643	return false;
649	}	644	}
650		645
651	template <class T> bool u8stringToStrings(const string &s, T &tokens)	646	template <class T> bool u8stringToStrings(const string &s, T &tokens)
652	{	647	{
653	setcharclasses();
654	Utf8Iter it(s);	648	Utf8Iter it(s);
655		649
656	string current;	650	string current;
657	tokens.clear();	651	tokens.clear();
658	enum states {SPACE, TOKEN, INQUOTE, ESCAPE};	652	enum states {SPACE, TOKEN, INQUOTE, ESCAPE};