--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -19,6 +19,8 @@
*/
#ifndef TEST_TEXTSPLIT
+#include <assert.h>
+
#include <iostream>
#include <string>
#include <set>
@@ -26,11 +28,8 @@
#include "textsplit.h"
#include "debuglog.h"
-#include "assert.h"
-
//#define UTF8ITER_CHECK
#include "utf8iter.h"
-
#include "uproplist.h"
#ifndef NO_NAMESPACES
@@ -39,11 +38,7 @@
/**
* Splitting a text into words. The code in this file works with utf-8
- * in a semi-clean way (see uproplist.h)
- *
- * We are also not using capitalization information.
- *
- * There are a few remnants of the initial utf8-ignorant version in this file.
+ * in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
*/
// Character classes: we have three main groups, and then some chars
@@ -52,37 +47,43 @@
// We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it
// once.
-// The array is actually a remnant of the original version which did no utf8
-// It could be reduced to 128, because real (over 128) utf8 chars are now
-// handled with a set holding all the separator values.
-enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
+// The array is actually a remnant of the original version which did no utf8.
+// Only the lower 128 slots (ascii) are now used, but keep it at 256
+// because it makes some tests in the code simpler.
+enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
+ A_ULETTER=260, A_LLETTER=261};
static int charclasses[256];
+// Real UTF-8 characters are handled with sets holding all characters
+// with interesting properties. This is far from full-blown management
+// of Unicode properties, but seems to do the job well enough in most
+// common cases
static set<unsigned int> unicign;
static set<unsigned int> visiblewhite;
+
+// Set up character classes array and the additional unicode sets
static void setcharclasses()
{
static int init = 0;
if (init)
return;
unsigned int i;
+
+ // Set default value for all: SPACE
for (i = 0 ; i < 256 ; i ++)
- charclasses[i] = LETTER;
-
- for (i = 0; i < ' ';i++)
charclasses[i] = SPACE;
char digits[] = "0123456789";
for (i = 0; i < strlen(digits); i++)
charclasses[int(digits[i])] = DIGIT;
- char blankspace[] = "\t\v\f ";
- for (i = 0; i < strlen(blankspace); i++)
- charclasses[int(blankspace[i])] = SPACE;
-
- char seps[] = "!\"$%&()/<=>\\^{|}~:;`";
- for (i = 0; i < strlen(seps); i++)
- charclasses[int(seps[i])] = SPACE;
+ char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ for (i = 0; i < strlen(upper); i++)
+ charclasses[int(upper[i])] = A_ULETTER;
+
+ char lower[] = "abcdefghijklmnopqrstuvwxyz";
+ for (i = 0; i < strlen(lower); i++)
+ charclasses[int(lower[i])] = A_LLETTER;
char wild[] = "*?[]";
for (i = 0; i < strlen(wild); i++)
@@ -116,6 +117,9 @@
}
}
+
+// CJK Unicode character detection:
+//
// 2E80..2EFF; CJK Radicals Supplement
// 3000..303F; CJK Symbols and Punctuation
// 3040..309F; Hiragana
@@ -168,12 +172,13 @@
unsigned int l = w.length();
if (l > 0 && l < (unsigned)m_maxWordLength) {
- // 1 char word: we index single letters and digits, but
- // nothing else. We might want to turn this into a test for a single
- // utf8 character instead.
+ // 1 byte word: we index single ascii letters and digits, but
+ // nothing else. We might want to turn this into a test for a
+ // single utf8 character instead ?
if (l == 1) {
int c = (int)w[0];
- if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
+ if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER &&
+ charclasses[c] != DIGIT) {
//cerr << "ERASING single letter term " << c << endl;
return true;
}
@@ -195,7 +200,7 @@
* handler/emitter. Emit and reset the current word, possibly emit the current
* span (if different). In query mode, words are not emitted, only final spans
*
- * This is purely for factoring common code from different places
+ * This is purely for factoring common code from different places in
* text_to_words().
*
* @return true if ok, false for error. Splitting should stop in this case.
@@ -259,7 +264,7 @@
/**
* Splitting a text into terms to be indexed.
* We basically emit a word every time we see a separator, but some chars are
- * handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
+ * handled specially so that special cases, e.g. c++ and jfd@recoll.com,
* are handled properly,
*/
bool TextSplit::text_to_words(const string &in)
@@ -310,10 +315,6 @@
int cc = whatcc(c);
switch (cc) {
- case LETTER:
- m_wordLen += it.appendchartostring(m_span);
- break;
-
case DIGIT:
if (m_wordLen == 0)
m_inNumber = true;
@@ -448,6 +449,41 @@
}
break;
+ // Camelcase handling: uppercase ascii right after lowercase ascii
+ // emits the current word ("camel" when hitting the 'C' of camelCase).
+ // Index with unsigned char: utf-8 bytes > 127 are negative as signed char
+ case A_ULETTER:
+ if (m_span.length() &&
+ charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
+ A_LLETTER) {
+ if (m_wordLen) {
+ if (!doemit(false, it.getBpos()))
+ return false;
+ }
+ }
+ goto NORMALCHAR;
+
+ // CamelCase handling.
+ // If we get lowercase after uppercase and the current
+ // word length is bigger than one, it means we had a
+ // string of several upper-case letters: an
+ // acronym (readHTML) or a single letter article (ALittleHelp).
+ // Emit the uppercase word before proceeding (unsigned char index as above)
+ case A_LLETTER:
+ if (m_span.length() &&
+ charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
+ A_ULETTER && m_wordLen > 1) {
+ // Multiple upper-case letters. m_wordLen is shrunk by one around
+ // doemit(): presumably the last capital then starts the next word
+ m_wordLen--;
+ if (!doemit(false, it.getBpos()))
+ return false;
+ m_wordStart--;
+ m_wordLen++;
+ }
+ goto NORMALCHAR;
+
+
default:
NORMALCHAR:
m_wordLen += it.appendchartostring(m_span);
@@ -678,6 +714,7 @@
#include "textsplit.h"
#include "readfile.h"
#include "debuglog.h"
+#include "transcode.h"
using namespace std;
@@ -711,6 +748,7 @@
"Debut-\ncontinue\n"
"[olala][ululu] (valeur) (23)\n"
"utf-8 ucs-4�� \\nodef\n"
+ "A b C 2 . +"
"','this\n"
" ,able,test-domain "
" -wl,--export-dynamic "
@@ -727,6 +765,7 @@
" -w: only words\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
+ " -C [charset] : input charset\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
;
@@ -748,6 +787,7 @@
int main(int argc, char **argv)
{
+ string charset;
thisprog = argv[0];
argc--; argv++;
@@ -759,14 +799,16 @@
while (**argv)
switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
- case 'C': op_flags |= OPT_C; break;
+ case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
+ charset = *(++argv); argc--;
+ goto b1;
case 'k': op_flags |= OPT_k; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
- argc--; argv++;
+ b1: argc--; argv++;
}
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
@@ -784,21 +826,35 @@
if (op_flags & OPT_k)
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
- string data;
+ string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
if (!strcmp(filename, "stdin")) {
char buf[1024];
int nread;
while ((nread = read(0, buf, 1024)) > 0) {
- data.append(buf, nread);
- }
- } else if (!file_to_string(filename, data))
+ odata.append(buf, nread);
+ }
+ } else if (!file_to_string(filename, odata, &reason)) {
+ cerr << "Failed: file_to_string(" << filename << ") failed: "
+ << reason << endl;
exit(1);
+ }
} else {
cout << endl << teststring << endl << endl;
- data = teststring;
- }
+ odata = teststring;
+ }
+ string& data = odata;
+ string ndata;
+ if ((op_flags & OPT_C)) {
+ if (!transcode(odata, ndata, charset, "UTF-8")) {
+ cerr << "Failed: transcode error" << endl;
+ exit(1);
+ } else {
+ data = ndata;
+ }
+ }
+
if (op_flags & OPT_c) {
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;