recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [ab9053] .. [930bdc]

Switch to side-by-side view

--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -32,29 +32,23 @@
 #endif /* NO_NAMESPACES */
 
 /**
- * Splitting a text into words. The code in this file will work with any 
- * charset where the basic separators (.,- etc.) have their ascii values 
- * (ok for UTF-8, ascii, iso8859* and quite a few others).
- *
- * We work in a way which would make it quite difficult to handle non-ascii
- * separator chars (en-dash, etc.). We would then need to actually parse the 
- * utf-8 stream, and use a different way to classify the characters (instead 
- * of a 256 slot array).
+ * Splitting a text into words. The code in this file works with utf-8
+ * in a semi-clean way (see uproplist.h)
  *
  * We are also not using capitalization information.
  *
- * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
- * Then specialcase all 'real' utf chars, by checking for the few
- * punctuation ones we're interested in (put them in a map). Then
- * classify all other non-ascii as letter, and use the current method
- * for chars < 127.
+ * There are a few remnants of the initial utf8-ignorant version in this file.
  */
 
 // Character classes: we have three main groups, and then some chars
 // are their own class because they want special handling.
+// 
 // We have an array with 256 slots where we keep the character types. 
 // The array could be fully static, but we use a small function to fill it 
 // once.
+// The array is actually a remnant of the original version which did no utf8
+// It could be reduced to 128, because real (over 128) utf8 chars are now 
+// handled with a set holding all the separator values.
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
 static int charclasses[256];
 
@@ -87,11 +81,11 @@
     for (i = 0; i  < strlen(special); i++)
 	charclasses[int(special[i])] = special[i];
 
-    init = 1;
-    //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
     for (i = 0; i < sizeof(uniign); i++) 
 	unicign.insert(uniign[i]);
     unicign.insert((unsigned int)-1);
+
+    init = 1;
 }
 
 // Do some checking (the kind which is simpler to do here than in the
@@ -103,9 +97,10 @@
 
     unsigned int l = w.length();
     if (l > 0 && l < (unsigned)maxWordLength) {
+	// 1 char word: we index single letters and digits, but
+	// nothing else. We might want to turn this into a test for a single
+	// utf8 character instead.
 	if (l == 1) {
-	    // 1 char word: we index single letters and digits, but
-	    // nothing else
 	    int c = (int)w[0];
 	    if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
 		//cerr << "ERASING single letter term " << c << endl;
@@ -227,6 +222,18 @@
 	}
 	int cc = whatcc(c);
 	switch (cc) {
+	case LETTER:
+	    word += it;
+	    span += it;
+	    break;
+
+	case DIGIT:
+	    if (word.length() == 0)
+		number = true;
+	    word += it;
+	    span += it;
+	    break;
+
 	case SPACE:
 	SPACE:
 	    if (word.length() || span.length()) {
@@ -326,11 +333,7 @@
 		goto SPACE;
 	    }
 	    break;
-	case DIGIT:
-	    if (word.length() == 0)
-		number = true;
-	    /* FALLTHROUGH */
-	case LETTER:
+
 	default:
 	    word += it;
 	    span += it;