recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [6169fd] .. [a5c937]

Switch to side-by-side view

--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -19,6 +19,8 @@
  */
 #ifndef TEST_TEXTSPLIT
 
+#include <assert.h>
+
 #include <iostream>
 #include <string>
 #include <set>
@@ -26,11 +28,8 @@
 
 #include "textsplit.h"
 #include "debuglog.h"
-#include "assert.h"
-
 //#define UTF8ITER_CHECK
 #include "utf8iter.h"
-
 #include "uproplist.h"
 
 #ifndef NO_NAMESPACES
@@ -39,11 +38,7 @@
 
 /**
  * Splitting a text into words. The code in this file works with utf-8
- * in a semi-clean way (see uproplist.h)
- *
- * We are also not using capitalization information.
- *
- * There are a few remnants of the initial utf8-ignorant version in this file.
+ * in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
  */
 
 // Character classes: we have three main groups, and then some chars
@@ -52,37 +47,43 @@
 // We have an array with 256 slots where we keep the character types. 
 // The array could be fully static, but we use a small function to fill it 
 // once.
-// The array is actually a remnant of the original version which did no utf8
-// It could be reduced to 128, because real (over 128) utf8 chars are now 
-// handled with a set holding all the separator values.
-enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
+// The array is actually a remnant of the original version which did no utf8.
+// Only the lower 127 slots are  now used, but keep it at 256
+// because it makes some tests in the code simpler.
+enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, 
+                A_ULETTER=260, A_LLETTER=261};
 static int charclasses[256];
 
+// Real UTF-8 characters are handled with sets holding all characters
+// with interesting properties. This is far from full-blown management
+// of Unicode properties, but seems to do the job well enough in most
+// common cases
 static set<unsigned int> unicign;
 static set<unsigned int> visiblewhite;
+
+// Set up character classes array and the additional unicode sets
 static void setcharclasses()
 {
     static int init = 0;
     if (init)
 	return;
     unsigned int i;
+
+    // Set default value for all: SPACE
     for (i = 0 ; i < 256 ; i ++)
-	charclasses[i] = LETTER;
-
-    for (i = 0; i < ' ';i++)
 	charclasses[i] = SPACE;
 
     char digits[] = "0123456789";
     for (i = 0; i  < strlen(digits); i++)
 	charclasses[int(digits[i])] = DIGIT;
 
-    char blankspace[] = "\t\v\f ";
-    for (i = 0; i < strlen(blankspace); i++)
-	charclasses[int(blankspace[i])] = SPACE;
-
-    char seps[] = "!\"$%&()/<=>\\^{|}~:;`";
-    for (i = 0; i  < strlen(seps); i++)
-	charclasses[int(seps[i])] = SPACE;
+    char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    for (i = 0; i  < strlen(upper); i++)
+	charclasses[int(upper[i])] = A_ULETTER;
+
+    char lower[] = "abcdefghijklmnopqrstuvwxyz";
+    for (i = 0; i  < strlen(lower); i++)
+	charclasses[int(lower[i])] = A_LLETTER;
 
     char wild[] = "*?[]";
     for (i = 0; i  < strlen(wild); i++)
@@ -116,6 +117,9 @@
     }
 }
 
+
+// CJK Unicode character detection:
+//
 // 2E80..2EFF; CJK Radicals Supplement
 // 3000..303F; CJK Symbols and Punctuation
 // 3040..309F; Hiragana
@@ -168,12 +172,13 @@
 
     unsigned int l = w.length();
     if (l > 0 && l < (unsigned)m_maxWordLength) {
-	// 1 char word: we index single letters and digits, but
-	// nothing else. We might want to turn this into a test for a single
-	// utf8 character instead.
+	// 1 byte word: we index single ascii letters and digits, but
+	// nothing else. We might want to turn this into a test for a
+	// single utf8 character instead ?
 	if (l == 1) {
 	    int c = (int)w[0];
-	    if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
+	    if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && 
+                charclasses[c] != DIGIT) {
 		//cerr << "ERASING single letter term " << c << endl;
 		return true;
 	    }
@@ -195,7 +200,7 @@
  * handler/emitter. Emit and reset the current word, possibly emit the current
  * span (if different). In query mode, words are not emitted, only final spans
  * 
- * This is purely for factoring common code from different places
+ * This is purely for factoring common code from different places in
  * text_to_words(). 
  * 
  * @return true if ok, false for error. Splitting should stop in this case.
@@ -259,7 +264,7 @@
 /** 
  * Splitting a text into terms to be indexed.
  * We basically emit a word every time we see a separator, but some chars are
- * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
+ * handled specially so that special cases, ie, c++ and jfd@recoll.com etc, 
  * are handled properly,
  */
 bool TextSplit::text_to_words(const string &in)
@@ -310,10 +315,6 @@
 
 	int cc = whatcc(c);
 	switch (cc) {
-	case LETTER:
-	    m_wordLen += it.appendchartostring(m_span);
-	    break;
-
 	case DIGIT:
 	    if (m_wordLen == 0)
 		m_inNumber = true;
@@ -448,6 +449,41 @@
 	    }
 	    break;
 
+            // Camelcase handling. 
+            // If we get uppercase ascii after lowercase ascii, emit word.
+            // This emits "camel" when hitting the 'C' of camelCase
+	case A_ULETTER:
+	    if (m_span.length() && 
+                charclasses[(unsigned int)m_span[m_span.length() - 1]] == 
+                A_LLETTER) {
+                if (m_wordLen) {
+                    if (!doemit(false, it.getBpos()))
+                        return false;
+                }
+            }
+            goto NORMALCHAR;
+
+            // CamelCase handling.
+            // If we get lowercase after uppercase and the current
+            // word length is bigger than one, it means we had a
+            // string of several upper-case letters:  an
+            // acronym (readHTML) or a single letter article (ALittleHelp).
+            // Emit the uppercase word before proceeding
+        case A_LLETTER:
+	    if (m_span.length() && 
+                charclasses[(unsigned int)m_span[m_span.length() - 1]] == 
+                A_ULETTER && m_wordLen > 1) {
+                // Multiple upper-case letters. Single letter word
+                // or acronym which we want to emit now
+                m_wordLen--;
+                if (!doemit(false, it.getBpos()))
+                    return false;
+                m_wordStart--;
+                m_wordLen++;
+            }
+            goto NORMALCHAR;
+
+
 	default:
 	NORMALCHAR:
 	    m_wordLen += it.appendchartostring(m_span);
@@ -678,6 +714,7 @@
 #include "textsplit.h"
 #include "readfile.h"
 #include "debuglog.h"
+#include "transcode.h"
 
 using namespace std;
 
@@ -711,6 +748,7 @@
 	    "Debut-\ncontinue\n" 
 	    "[olala][ululu]  (valeur) (23)\n"
 	    "utf-8 ucs-4�� \\nodef\n"
+            "A b C 2 . +"
 	    "','this\n"
 	    " ,able,test-domain "
 	    " -wl,--export-dynamic "
@@ -727,6 +765,7 @@
     "   -w:  only words\n"
     "   -k:  preserve wildcards (?*)\n"
     "   -c: just count words\n"
+    "   -C [charset] : input charset\n"
     " if filename is 'stdin', will read stdin for data (end with ^D)\n"
     "  \n\n"
     ;
@@ -748,6 +787,7 @@
 
 int main(int argc, char **argv)
 {
+    string charset;
     thisprog = argv[0];
     argc--; argv++;
 
@@ -759,14 +799,16 @@
 	while (**argv)
 	    switch (*(*argv)++) {
 	    case 'c':	op_flags |= OPT_c; break;
-	    case 'C':	op_flags |= OPT_C; break;
+            case 'C':	op_flags |= OPT_C; if (argc < 2)  Usage();
+                charset = *(++argv); argc--; 
+                goto b1;
 	    case 'k':	op_flags |= OPT_k; break;
 	    case 's':	op_flags |= OPT_s; break;
 	    case 'S':	op_flags |= OPT_S; break;
 	    case 'w':	op_flags |= OPT_w; break;
 	    default: Usage();	break;
 	    }
-	argc--; argv++;
+    b1: argc--; argv++;
     }
     DebugLog::getdbl()->setloglevel(DEBDEB1);
     DebugLog::setfilename("stderr");
@@ -784,21 +826,35 @@
     if (op_flags & OPT_k) 
 	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
 
-    string data;
+    string odata, reason;
     if (argc == 1) {
 	const char *filename = *argv++;	argc--;
 	if (!strcmp(filename, "stdin")) {
 	    char buf[1024];
 	    int nread;
 	    while ((nread = read(0, buf, 1024)) > 0) {
-		data.append(buf, nread);
-	    }
-	} else if (!file_to_string(filename, data)) 
+		odata.append(buf, nread);
+	    }
+	} else if (!file_to_string(filename, odata, &reason)) {
+            cerr << "Failed: file_to_string(" << filename << ") failed: " 
+                 << reason << endl;
 	    exit(1);
+        }
     } else {
 	cout << endl << teststring << endl << endl;  
-	data = teststring;
-    }
+	odata = teststring;
+    }
+    string& data = odata;
+    string ndata;
+    if ((op_flags & OPT_C)) {
+        if (!transcode(odata, ndata, charset, "UTF-8")) {
+            cerr << "Failed: transcode error" << endl;
+            exit(1);
+        } else {
+            data = ndata;
+        }
+    }
+
     if (op_flags & OPT_c) {
 	int n = TextSplit::countWords(data, flags);
 	cout << n << " words" << endl;