|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
30 |
#include "log.h"
|
30 |
#include "log.h"
|
31 |
//#define UTF8ITER_CHECK
|
31 |
//#define UTF8ITER_CHECK
|
32 |
#include "utf8iter.h"
|
32 |
#include "utf8iter.h"
|
33 |
#include "uproplist.h"
|
33 |
#include "uproplist.h"
|
34 |
#include "smallut.h"
|
34 |
#include "smallut.h"
|
|
|
35 |
|
|
|
36 |
// Decide if we treat katakana as western scripts, splitting into
|
|
|
37 |
// words instead of n-grams. This is not absurd (katakana is a kind of
|
|
|
38 |
// alphabet, albeit phonetic and syllabic and is mostly used to
|
|
|
39 |
// transcribe western words), but it does not work well because
|
|
|
40 |
// Japanese uses separator-less compound katakana words, and because
|
|
|
41 |
// the plural endings are irregular and would need a specialized
|
|
|
42 |
// stemmer. So for now we process katakana like the rest of CJK, using
|
|
|
43 |
// n-grams.
|
|
|
44 |
#undef KATAKANA_AS_WORDS
|
35 |
|
45 |
|
36 |
using namespace std;
|
46 |
using namespace std;
|
37 |
|
47 |
|
38 |
/**
|
48 |
/**
|
39 |
* Splitting a text into words. The code in this file works with utf-8
|
49 |
* Splitting a text into words. The code in this file works with utf-8
|
|
... |
|
... |
207 |
|
217 |
|
208 |
// We should probably map 'fullwidth ascii variants' and 'halfwidth
|
218 |
// We should probably map 'fullwidth ascii variants' and 'halfwidth
|
209 |
// katakana variants' to something else. Look up "Kuromoji" Lucene
|
219 |
// katakana variants' to something else. Look up "Kuromoji" Lucene
|
210 |
// filter, KuromojiNormalizeFilter.java
|
220 |
// filter, KuromojiNormalizeFilter.java
|
211 |
// 309F is Hiragana.
|
221 |
// 309F is Hiragana.
|
|
|
222 |
#ifdef KATAKANA_AS_WORDS
|
212 |
#define UNICODE_IS_KATAKANA(p) \
|
223 |
#define UNICODE_IS_KATAKANA(p) \
|
213 |
((p) != 0x309F && \
|
224 |
((p) != 0x309F && \
|
214 |
(((p) >= 0x3099 && (p) <= 0x30FF) || \
|
225 |
(((p) >= 0x3099 && (p) <= 0x30FF) || \
|
215 |
((p) >= 0x31F0 && (p) <= 0x31FF)))
|
226 |
((p) >= 0x31F0 && (p) <= 0x31FF)))
|
216 |
|
227 |
#else
|
|
|
228 |
#define UNICODE_IS_KATAKANA(p) false
|
|
|
229 |
#endif
|
|
|
230 |
|
217 |
bool TextSplit::isCJK(int c)
|
231 |
bool TextSplit::isCJK(int c)
|
218 |
{
|
232 |
{
|
219 |
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
|
233 |
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
|
220 |
}
|
234 |
}
|
221 |
bool TextSplit::isKATAKANA(int c)
|
235 |
bool TextSplit::isKATAKANA(int c)
|
|
... |
|
... |
518 |
|
532 |
|
519 |
if (c == (unsigned int)-1) {
|
533 |
if (c == (unsigned int)-1) {
|
520 |
LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
|
534 |
LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
|
521 |
return false;
|
535 |
return false;
|
522 |
}
|
536 |
}
|
|
|
537 |
|
523 |
CharSpanClass csc;
|
538 |
CharSpanClass csc;
|
524 |
if (UNICODE_IS_KATAKANA(c)) {
|
539 |
if (UNICODE_IS_KATAKANA(c)) {
|
525 |
csc = CSC_KATAKANA;
|
540 |
csc = CSC_KATAKANA;
|
526 |
} else if (UNICODE_IS_CJK(c)) {
|
541 |
} else if (UNICODE_IS_CJK(c)) {
|
527 |
csc = CSC_CJK;
|
542 |
csc = CSC_CJK;
|
528 |
} else {
|
543 |
} else {
|
529 |
csc = CSC_OTHER;
|
544 |
csc = CSC_OTHER;
|
530 |
}
|
545 |
}
|
|
|
546 |
|
531 |
if (o_processCJK && csc == CSC_CJK) {
|
547 |
if (o_processCJK && csc == CSC_CJK) {
|
532 |
// CJK excluding Katakana character hit.
|
548 |
// CJK excluding Katakana character hit.
|
533 |
// Do like at EOF with the current non-cjk data.
|
549 |
// Do like at EOF with the current non-cjk data.
|
534 |
if (m_wordLen || m_span.length()) {
|
550 |
if (m_wordLen || m_span.length()) {
|
535 |
if (!doemit(true, it.getBpos()))
|
551 |
if (!doemit(true, it.getBpos()))
|
|
... |
|
... |
546 |
// character after the cjk sequence, just go on.
|
562 |
// character after the cjk sequence, just go on.
|
547 |
if (it.eof())
|
563 |
if (it.eof())
|
548 |
break;
|
564 |
break;
|
549 |
}
|
565 |
}
|
550 |
|
566 |
|
|
|
567 |
#ifdef KATAKANA_AS_WORDS
|
|
|
568 |
// Only needed if we have script transitions inside this
|
|
|
569 |
// routine, else the call to cjk_to_words does the job.
|
551 |
if (csc != prev_csc && (m_wordLen || m_span.length())) {
|
570 |
if (csc != prev_csc && (m_wordLen || m_span.length())) {
|
552 |
LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
|
571 |
LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
|
553 |
m_wordLen << " spl " << m_span.length() << endl);
|
572 |
m_wordLen << " spl " << m_span.length() << endl);
|
554 |
if (!doemit(true, it.getBpos())) {
|
573 |
if (!doemit(true, it.getBpos())) {
|
555 |
return false;
|
574 |
return false;
|
556 |
}
|
575 |
}
|
557 |
}
|
576 |
}
|
|
|
577 |
#endif
|
|
|
578 |
|
558 |
prev_csc = csc;
|
579 |
prev_csc = csc;
|
559 |
|
|
|
560 |
int cc = whatcc(c);
|
580 |
int cc = whatcc(c);
|
561 |
|
581 |
|
562 |
switch (cc) {
|
582 |
switch (cc) {
|
563 |
case SKIP:
|
583 |
case SKIP:
|
564 |
// Special-case soft-hyphen. To work, this depends on the
|
584 |
// Special-case soft-hyphen. To work, this depends on the
|