Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
30
#include "log.h"
30
#include "log.h"
31
//#define UTF8ITER_CHECK
31
//#define UTF8ITER_CHECK
32
#include "utf8iter.h"
32
#include "utf8iter.h"
33
#include "uproplist.h"
33
#include "uproplist.h"
34
#include "smallut.h"
34
#include "smallut.h"
35
36
// Decide if we treat katakana as western scripts, splitting into
37
// words instead of n-grams. This is not absurd (katakana is a kind of
38
// alphabet, albeit phonetic and syllabic and is mostly used to
39
// transcribe western words), but it does not work well because
40
// japanese uses separator-less compound katakana words, and because
41
// the plural endings are irregular and would need a specialized
42
// stemmer. So for now we process katakana like the rest of CJK, using
43
// ngrams
44
#undef KATAKANA_AS_WORDS
35
45
36
using namespace std;
46
using namespace std;
37
47
38
/**
48
/**
39
 * Splitting a text into words. The code in this file works with utf-8
49
 * Splitting a text into words. The code in this file works with utf-8
...
...
207
217
208
// We should probably map 'fullwidth ascii variants' and 'halfwidth
218
// We should probably map 'fullwidth ascii variants' and 'halfwidth
209
// katakana variants' to something else.  Look up "Kuromoji" Lucene
219
// katakana variants' to something else.  Look up "Kuromoji" Lucene
210
// filter, KuromojiNormalizeFilter.java
220
// filter, KuromojiNormalizeFilter.java
211
// 309F is Hiragana.
221
// 309F is Hiragana.
222
#ifdef KATAKANA_AS_WORDS
212
#define UNICODE_IS_KATAKANA(p)                                          \
223
#define UNICODE_IS_KATAKANA(p)                                          \
213
    ((p) != 0x309F &&                                                   \
224
    ((p) != 0x309F &&                                                   \
214
     (((p) >= 0x3099 && (p) <= 0x30FF) ||                               \
225
     (((p) >= 0x3099 && (p) <= 0x30FF) ||                               \
215
      ((p) >= 0x31F0 && (p) <= 0x31FF)))
226
      ((p) >= 0x31F0 && (p) <= 0x31FF)))
216
    
227
#else
228
#define UNICODE_IS_KATAKANA(p) false
229
#endif
230
217
bool TextSplit::isCJK(int c)
231
bool TextSplit::isCJK(int c)
218
{
232
{
219
    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
233
    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
220
}
234
}
221
bool TextSplit::isKATAKANA(int c)
235
bool TextSplit::isKATAKANA(int c)
...
...
518
532
519
    if (c == (unsigned int)-1) {
533
    if (c == (unsigned int)-1) {
520
        LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
534
        LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
521
        return false;
535
        return false;
522
    }
536
    }
537
523
        CharSpanClass csc;
538
        CharSpanClass csc;
524
        if (UNICODE_IS_KATAKANA(c)) {
539
        if (UNICODE_IS_KATAKANA(c)) {
525
            csc = CSC_KATAKANA;
540
            csc = CSC_KATAKANA;
526
        } else if (UNICODE_IS_CJK(c)) {
541
        } else if (UNICODE_IS_CJK(c)) {
527
            csc = CSC_CJK;
542
            csc = CSC_CJK;
528
        } else {
543
        } else {
529
            csc = CSC_OTHER;
544
            csc = CSC_OTHER;
530
        }
545
        }
546
531
    if (o_processCJK && csc == CSC_CJK) {
547
    if (o_processCJK && csc == CSC_CJK) {
532
        // CJK excluding Katakana character hit. 
548
        // CJK excluding Katakana character hit. 
533
        // Do like at EOF with the current non-cjk data.
549
        // Do like at EOF with the current non-cjk data.
534
        if (m_wordLen || m_span.length()) {
550
        if (m_wordLen || m_span.length()) {
535
        if (!doemit(true, it.getBpos()))
551
        if (!doemit(true, it.getBpos()))
...
...
546
        // character after the cjk sequence, just go on.
562
        // character after the cjk sequence, just go on.
547
        if (it.eof())
563
        if (it.eof())
548
        break;
564
        break;
549
    }
565
    }
550
566
567
#ifdef KATAKANA_AS_WORDS
568
        // Only needed if we have script transitions inside this
569
        // routine, else the call to cjk_to_words does the job.
551
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
570
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
552
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
571
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
553
                   m_wordLen << " spl " << m_span.length() << endl);
572
                   m_wordLen << " spl " << m_span.length() << endl);
554
            if (!doemit(true, it.getBpos())) {
573
            if (!doemit(true, it.getBpos())) {
555
                return false;
574
                return false;
556
            }
575
            }
557
        }
576
        }
577
#endif
578
558
        prev_csc = csc;
579
        prev_csc = csc;
559
        
560
    int cc = whatcc(c);
580
    int cc = whatcc(c);
561
581
562
    switch (cc) {
582
    switch (cc) {
563
    case SKIP:
583
    case SKIP:
564
        // Special-case soft-hyphen. To work, this depends on the
584
        // Special-case soft-hyphen. To work, this depends on the