|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.35 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
139 |
|| ((p) >= 0xFE30 && (p) <= 0xFE4F) \
|
139 |
|| ((p) >= 0xFE30 && (p) <= 0xFE4F) \
|
140 |
|| ((p) >= 0xFF00 && (p) <= 0xFFEF) \
|
140 |
|| ((p) >= 0xFF00 && (p) <= 0xFFEF) \
|
141 |
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
141 |
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
142 |
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
142 |
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
143 |
|
143 |
|
144 |
bool TextSplit::t_processCJK = true;
|
144 |
bool TextSplit::o_processCJK = true;
|
|
|
145 |
unsigned int TextSplit::o_CJKNgramLen = 2;
|
145 |
|
146 |
|
146 |
// Do some checking (the kind which is simpler to do here than in the
|
147 |
// Do some checking (the kind which is simpler to do here than in the
|
147 |
// main loop), then send term to our client.
|
148 |
// main loop), then send term to our client.
|
148 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
149 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
149 |
int btstart, int btend)
|
150 |
int btstart, int btend)
|
|
... |
|
... |
244 |
* handled specially so that special cases, e.g. c++ and dockes@okyz.com etc.,
|
245 |
* handled specially so that special cases, e.g. c++ and dockes@okyz.com etc.,
|
245 |
* are handled properly,
|
246 |
* are handled properly,
|
246 |
*/
|
247 |
*/
|
247 |
bool TextSplit::text_to_words(const string &in)
|
248 |
bool TextSplit::text_to_words(const string &in)
|
248 |
{
|
249 |
{
|
249 |
LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n",
|
250 |
LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n",
|
250 |
t_processCJK,
|
251 |
o_processCJK, o_CJKNgramLen,
|
251 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
252 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
252 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
253 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
253 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
254 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
254 |
in.substr(0,50).c_str()));
|
255 |
in.substr(0,50).c_str()));
|
255 |
|
256 |
|
256 |
setcharclasses();
|
257 |
setcharclasses();
|
257 |
|
258 |
|
258 |
m_span.erase();
|
259 |
m_span.erase();
|
259 |
m_inNumber = false;
|
260 |
m_inNumber = false;
|
|
... |
|
... |
267 |
if (c == (unsigned int)-1) {
|
268 |
if (c == (unsigned int)-1) {
|
268 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
269 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
269 |
return false;
|
270 |
return false;
|
270 |
}
|
271 |
}
|
271 |
|
272 |
|
272 |
if (t_processCJK && UNICODE_IS_CJK(c)) {
|
273 |
if (o_processCJK && UNICODE_IS_CJK(c)) {
|
273 |
// CJK character hit.
|
274 |
// CJK character hit.
|
274 |
// Do like at EOF with the current non-cjk data.
|
275 |
// Do like at EOF with the current non-cjk data.
|
275 |
if (m_wordLen || m_span.length()) {
|
276 |
if (m_wordLen || m_span.length()) {
|
276 |
if (!doemit(true, it.getBpos()))
|
277 |
if (!doemit(true, it.getBpos()))
|
277 |
return false;
|
278 |
return false;
|
|
... |
|
... |
419 |
return false;
|
420 |
return false;
|
420 |
}
|
421 |
}
|
421 |
return true;
|
422 |
return true;
|
422 |
}
|
423 |
}
|
423 |
|
424 |
|
424 |
const unsigned int ngramlen = 2;
|
|
|
425 |
#define MAXNGRAMLEN 5
|
|
|
426 |
|
|
|
427 |
// Using an utf8iter pointer just to avoid needing its definition in
|
425 |
// Using an utf8iter pointer just to avoid needing its definition in
|
428 |
// textsplit.h
|
426 |
// textsplit.h
|
429 |
//
|
427 |
//
|
430 |
// We output ngrams, for example for char input a b c and ngramlen == 2,
|
428 |
// We output ngrams, for example for char input a b c and ngramlen == 2,
|
431 |
// we generate: a ab b bc c as words
|
429 |
// we generate: a ab b bc c as words
|
|
... |
|
... |
440 |
LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
438 |
LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
441 |
Utf8Iter &it = *itp;
|
439 |
Utf8Iter &it = *itp;
|
442 |
|
440 |
|
443 |
// We use an offset buffer to remember the starts of the utf-8
|
441 |
// We use an offset buffer to remember the starts of the utf-8
|
444 |
// characters which we still need to use.
|
442 |
// characters which we still need to use.
|
445 |
// Fixed size array. ngramlen over 3 doesn't make sense.
|
443 |
assert(o_CJKNgramLen < o_CJKMaxNgramLen);
|
446 |
assert(ngramlen < MAXNGRAMLEN);
|
444 |
unsigned int boffs[o_CJKMaxNgramLen+1];
|
447 |
unsigned int boffs[MAXNGRAMLEN];
|
|
|
448 |
|
445 |
|
449 |
// Current number of valid offsets;
|
446 |
// Current number of valid offsets;
|
450 |
unsigned int nchars = 0;
|
447 |
unsigned int nchars = 0;
|
451 |
unsigned int c = 0;
|
448 |
unsigned int c = 0;
|
452 |
for (; !it.eof(); it++) {
|
449 |
for (; !it.eof(); it++) {
|
|
... |
|
... |
454 |
if (!UNICODE_IS_CJK(c)) {
|
451 |
if (!UNICODE_IS_CJK(c)) {
|
455 |
// Return to normal handler
|
452 |
// Return to normal handler
|
456 |
break;
|
453 |
break;
|
457 |
}
|
454 |
}
|
458 |
|
455 |
|
459 |
if (nchars == ngramlen) {
|
456 |
if (nchars == o_CJKNgramLen) {
|
460 |
// Offset buffer full, shift it. Might be more efficient
|
457 |
// Offset buffer full, shift it. Might be more efficient
|
461 |
// to have a circular one, but things are complicated
|
458 |
// to have a circular one, but things are complicated
|
462 |
// enough already...
|
459 |
// enough already...
|
463 |
for (unsigned int i = 0; i < nchars-1; i++) {
|
460 |
for (unsigned int i = 0; i < nchars-1; i++) {
|
464 |
boffs[i] = boffs[i+1];
|
461 |
boffs[i] = boffs[i+1];
|
|
... |
|
... |
471 |
boffs[nchars-1] = it.getBpos();
|
468 |
boffs[nchars-1] = it.getBpos();
|
472 |
|
469 |
|
473 |
// Output all new ngrams: they begin at each existing position
|
470 |
// Output all new ngrams: they begin at each existing position
|
474 |
// and end after the new character. onlyspans->only output
|
471 |
// and end after the new character. onlyspans->only output
|
475 |
// maximum words, nospans=> single chars
|
472 |
// maximum words, nospans=> single chars
|
476 |
if (!(m_flags & TXTS_ONLYSPANS) || nchars == ngramlen) {
|
473 |
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
|
477 |
unsigned int btend = it.getBpos() + it.getBlen();
|
474 |
unsigned int btend = it.getBpos() + it.getBlen();
|
478 |
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
475 |
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
479 |
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
476 |
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
480 |
for (unsigned int i = loopbeg; i < loopend; i++) {
|
477 |
for (unsigned int i = loopbeg; i < loopend; i++) {
|
481 |
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
478 |
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
|
... |
|
... |
495 |
m_wordpos++;
|
492 |
m_wordpos++;
|
496 |
}
|
493 |
}
|
497 |
|
494 |
|
498 |
// If onlyspans is set, there may be things to flush in the buffer
|
495 |
// If onlyspans is set, there may be things to flush in the buffer
|
499 |
// first
|
496 |
// first
|
500 |
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen) {
|
497 |
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
501 |
unsigned int btend = it.getBpos(); // Current char is out
|
498 |
unsigned int btend = it.getBpos(); // Current char is out
|
502 |
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
499 |
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
503 |
btend-boffs[0]),
|
500 |
btend-boffs[0]),
|
504 |
m_wordpos - nchars,
|
501 |
m_wordpos - nchars,
|
505 |
boffs[0], btend)) {
|
502 |
boffs[0], btend)) {
|