recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [ab9053] .. [930bdc]

Switch to unified view


// Revision-control identification string for this file (an RCS "$Id$"
// keyword behind a "@(#" prefix). Compiled out when `lint` is defined.
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */

/**
 * Splitting a text into words. The code in this file works with utf-8
 * in a semi-clean way (see uproplist.h).
 *
 * We are also not using capitalization information.
 *
 * There are a few remnants of the initial utf8-ignorant version in this file.
 */

// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
// 
// We have an array with 256 slots where we keep the character types. 
// The array could be fully static, but we use a small function to fill it 
// once.
// The array is actually a remnant of the original version, which did not
// handle utf8. It could be reduced to 128 slots, because real (over 128)
// utf8 chars are now handled with a set holding all the separator values.
// The three main character groups. Their values start at 256, above any
// single-byte character value, because characters needing special handling
// are stored in the table below as their own class (the class value is the
// character itself).
enum CharClass {
    LETTER = 256,
    SPACE,          // 257
    DIGIT           // 258
};

// Class table with one slot per byte value; each slot holds either a
// CharClass value or, for a special character, the character itself.
static int charclasses[256];

// Set of character values handled as separators for real (non-ascii) utf8
// characters. Filled once by setcharclasses() from the uniign table, plus
// the value (unsigned int)-1.
static set<unsigned int> unicign;
static void setcharclasses()
...

    char special[] = ".@+-,#'\n\r";
    for (i = 0; i  < strlen(special); i++)
    charclasses[int(special[i])] = special[i];



    for (i = 0; i < sizeof(uniign); i++) 
    unicign.insert(uniign[i]);
    unicign.insert((unsigned int)-1);

    init = 1;
}

// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
...
{
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));

    unsigned int l = w.length();
    if (l > 0 && l < (unsigned)maxWordLength) {
  // 1 char word: we index single letters and digits, but
  // nothing else. We might want to turn this into a test for a single
  // utf8 character instead.
    if (l == 1) {


        int c = (int)w[0];
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
        //cerr << "ERASING single letter term " << c << endl;
        return true;
        }
...
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
        return false;
    }
    int cc = whatcc(c);
    switch (cc) {
  case LETTER:
      word += it;
      span += it;
      break;

  case DIGIT:
      if (word.length() == 0)
      number = true;
      word += it;
      span += it;
      break;

    case SPACE:
    SPACE:
        if (word.length() || span.length()) {
        if (!doemit(true, it.getBpos()))
            return false;
...
        } else {
        // Handle like a normal separator
        goto SPACE;
        }
        break;





    default:
        word += it;
        span += it;
        break;
    }

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
30	#ifndef NO_NAMESPACES	30	#ifndef NO_NAMESPACES
31	using namespace std;	31	using namespace std;
32	#endif /* NO_NAMESPACES */	32	#endif /* NO_NAMESPACES */
33		33
34	/**	34	/**
35	* Splitting a text into words. The code in this file will work with any	35	* Splitting a text into words. The code in this file works with utf-8
36	* charset where the basic separators (.,- etc.) have their ascii values	36	* in a semi-clean way (see uproplist.h)
37	* (ok for UTF-8, ascii, iso8859* and quite a few others).
38	*
39	* We work in a way which would make it quite difficult to handle non-ascii
40	* separator chars (en-dash, etc.). We would then need to actually parse the
41	* utf-8 stream, and use a different way to classify the characters (instead
42	* of a 256 slot array).
43	*	37	*
44	* We are also not using capitalization information.	38	* We are also not using capitalization information.
45	*	39	*
46	* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.	40	* There are a few remnants of the initial utf8-ignorant version in this file.
47	* Then specialcase all 'real' utf chars, by checking for the few
48	* punctuation ones we're interested in (put them in a map). Then
49	* classify all other non-ascii as letter, and use the current method
50	* for chars < 127.
51	*/	41	*/
52		42
53	// Character classes: we have three main groups, and then some chars	43	// Character classes: we have three main groups, and then some chars
54	// are their own class because they want special handling.	44	// are their own class because they want special handling.
		45	//
55	// We have an array with 256 slots where we keep the character types.	46	// We have an array with 256 slots where we keep the character types.
56	// The array could be fully static, but we use a small function to fill it	47	// The array could be fully static, but we use a small function to fill it
57	// once.	48	// once.
		49	// The array is actually a remnant of the original version which did no utf8
		50	// It could be reduced to 128, because real (over 128) utf8 chars are now
		51	// handled with a set holding all the separator values.
58	enum CharClass {LETTER=256, SPACE=257, DIGIT=258};	52	enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
59	static int charclasses[256];	53	static int charclasses[256];
60		54
61	static set<unsigned int> unicign;	55	static set<unsigned int> unicign;
62	static void setcharclasses()	56	static void setcharclasses()
	...		...
85		79
86	char special[] = ".@+-,#'\n\r";	80	char special[] = ".@+-,#'\n\r";
87	for (i = 0; i < strlen(special); i++)	81	for (i = 0; i < strlen(special); i++)
88	charclasses[int(special[i])] = special[i];	82	charclasses[int(special[i])] = special[i];
89		83
90	init = 1;
91	//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
92	for (i = 0; i < sizeof(uniign); i++)	84	for (i = 0; i < sizeof(uniign); i++)
93	unicign.insert(uniign[i]);	85	unicign.insert(uniign[i]);
94	unicign.insert((unsigned int)-1);	86	unicign.insert((unsigned int)-1);
		87
		88	init = 1;
95	}	89	}
96		90
97	// Do some checking (the kind which is simpler to do here than in the	91	// Do some checking (the kind which is simpler to do here than in the
98	// main loop), then send term to our client.	92	// main loop), then send term to our client.
99	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,	93	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
	...		...
101	{	95	{
102	LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));	96	LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
103		97
104	unsigned int l = w.length();	98	unsigned int l = w.length();
105	if (l > 0 && l < (unsigned)maxWordLength) {	99	if (l > 0 && l < (unsigned)maxWordLength) {
		100	// 1 char word: we index single letters and digits, but
		101	// nothing else. We might want to turn this into a test for a single
		102	// utf8 character instead.
106	if (l == 1) {	103	if (l == 1) {
107	// 1 char word: we index single letters and digits, but
108	// nothing else
109	int c = (int)w[0];	104	int c = (int)w[0];
110	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {	105	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
111	//cerr << "ERASING single letter term " << c << endl;	106	//cerr << "ERASING single letter term " << c << endl;
112	return true;	107	return true;
113	}	108	}
	...		...
225	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));	220	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
226	return false;	221	return false;
227	}	222	}
228	int cc = whatcc(c);	223	int cc = whatcc(c);
229	switch (cc) {	224	switch (cc) {
		225	case LETTER:
		226	word += it;
		227	span += it;
		228	break;
		229
		230	case DIGIT:
		231	if (word.length() == 0)
		232	number = true;
		233	word += it;
		234	span += it;
		235	break;
		236
230	case SPACE:	237	case SPACE:
231	SPACE:	238	SPACE:
232	if (word.length() \|\| span.length()) {	239	if (word.length() \|\| span.length()) {
233	if (!doemit(true, it.getBpos()))	240	if (!doemit(true, it.getBpos()))
234	return false;	241	return false;
	...		...
324	} else {	331	} else {
325	// Handle like a normal separator	332	// Handle like a normal separator
326	goto SPACE;	333	goto SPACE;
327	}	334	}
328	break;	335	break;
329	case DIGIT:	336
330	if (word.length() == 0)
331	number = true;
332	/* FALLTHROUGH */
333	case LETTER:
334	default:	337	default:
335	word += it;	338	word += it;
336	span += it;	339	span += it;
337	break;	340	break;
338	}	341	}