recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [40a590] .. [d42db8]

Switch to side-by-side view

--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,13 +1,15 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT
 
 #include <iostream>
 #include <string>
-
+#include <set>
 #include "textsplit.h"
 #include "debuglog.h"
+#include "utf8iter.h"
+#include "uproplist.h"
 
 using namespace std;
 
@@ -37,6 +39,8 @@
 // once.
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
 static int charclasses[256];
+
+static set<unsigned int> unicign;
 static void setcharclasses()
 {
     static int init = 0;
@@ -67,6 +71,8 @@
 
     init = 1;
     //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
+    for (i = 0; i < sizeof(uniign) / sizeof(uniign[0]); i++) // bound is the element count; plain sizeof() is the byte size and overruns the array
+	unicign.insert(uniign[i]); // copy each listed code point into the lookup set used by whatcc()
 }
 
 // Do some cleanup (the kind which is simpler to do here than in the main loop,
@@ -152,6 +158,22 @@
     return true;
 }
 
+static inline int whatcc(unsigned int c) // Return the class value for code point c, as used by the splitter's switch
+{
+    int cc;
+    if (c <= 127) { // 7-bit range: take the value from the charclasses[] table
+	cc = charclasses[c]; 
+    } else {
+	if (c == (unsigned int)-1) // same value the split loop reports as a UTF-8 scan error; classed as a separator here
+	    cc = SPACE;
+	else if (unicign.find(c) != unicign.end()) // code point is present in the unicign set: classed as a separator
+	    cc = SPACE;
+	else // every other code point above 127 is classed as a letter
+	    cc = LETTER;
+    }
+    return cc;
+}
+
 /** 
  * Splitting a text into terms to be indexed.
  * We basically emit a word every time we see a separator, but some chars are
@@ -167,16 +189,21 @@
     bool number = false;
     int wordpos = 0;
     int spanpos = 0;
-    unsigned int i;
-
-    for (i = 0; i < in.length(); i++) {
-	int c = in[i];
-	int cc = charclasses[c]; 
+    int charpos = 0;
+    Utf8Iter it(in);
+
+    for (; !it.eof(); it++, charpos++) {
+	unsigned int c = *it;
+	if (c == (unsigned int)-1) {
+	    LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
+	    return false;
+	}
+	int cc = whatcc(c);
 	switch (cc) {
 	case SPACE:
 	SPACE:
 	    if (word.length()) {
-		if (!doemit(word, wordpos, span, spanpos, true, i))
+		if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
 		    return false;
 		number = false;
 	    }
@@ -186,56 +213,57 @@
 	case '-':
 	case '+':
 	    if (word.length() == 0) {
-		if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
+		if (whatcc(it[charpos+1]) == DIGIT) {
 		    number = true;
-		    word += c;
-		    span += c;
+		    word += it;
+		    span += it;
 		}
 	    } else {
-		if (!doemit(word, wordpos, span, spanpos, false, i))
+		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 		    return false;
 		number = false;
-		span += c;
+		span += it;
 	    }
 	    break;
 	case '@':
 	    if (word.length()) {
-		if (!doemit(word, wordpos, span, spanpos, false, i))
+		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 		    return false;
 		number = false;
 	    } else
-		word += c;
-	    span += c;
+		word += it;
+	    span += it;
 	    break;
 	case '\'':
 	    if (word.length()) {
-		if (!doemit(word, wordpos, span, spanpos, false, i))
+		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 		    return false;
 		number = false;
-		span += c;
+		span += it;
 	    }
 	    break;
 	case '.':
 	    if (number) {
-		word += c;
+		word += it;
 	    } else {
 		//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
 		if (word.length()) {
-		    if (!doemit(word, wordpos, span, spanpos, false, i))
+		    if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 			return false;
 		    number = false;
 		} else 
-		    word += c;
-	    }
-	    span += c;
+		    word += it;
+	    }
+	    span += it;
 	    break;
 	case '#': 
 	    // Keep it only at end of word...
 	    if (word.length() > 0 && 
-		(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
-		 in[i+1] == '\n' || in[i+1] == '\r')) {
-		word += c;
-		span += c;
+		(whatcc(it[charpos+1]) == SPACE ||
+		 whatcc(it[charpos+1]) == '\n' || 
+		 whatcc(it[charpos+1]) == '\r')) {
+		word += it;
+		span += it;
 	    }
 		
 	    break;
@@ -261,13 +289,13 @@
 		else
 		    number = false;
 	    }
-	    word += (char)c;
-	    span += (char)c;
-	    break;
-	}
-    }
-    if (word.length()) {
-	if (!doemit(word, wordpos, span, spanpos, true, i))
+	    word += it;
+	    span += it;
+	    break;
+	}
+    }
+    if (span.length()) {
+	if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
 	    return false;
     }
     return true;
@@ -306,7 +334,8 @@
     "192.168.4.1 "
     "one\n\rtwo\nthree-\nfour "
     "[olala][ululu] "
-    "'o'brien' "						
+    "'o'brien' "
+    "utf-8 ucs-4��"
     "\n"							      
 ;