recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [59e5cc] .. [f853f3]

Switch to unified view


...
#include "log.h"
//#define UTF8ITER_CHECK
#include "utf8iter.h"
#include "uproplist.h"
#include "smallut.h"

// Decide if we treat katakana as a western script, splitting it into
// words instead of n-grams. This is not absurd (katakana is a kind of
// alphabet, albeit phonetic and syllabic, and is mostly used to
// transcribe western words), but it does not work well because
// Japanese uses separator-less compound katakana words, and because
// the plural endings are irregular and would need a specialized
// stemmer. So for now we process katakana like the rest of CJK, using
// n-grams.
#undef KATAKANA_AS_WORDS

using namespace std;

/**
 * Splitting a text into words. The code in this file works with utf-8
...

// We should probably map 'fullwidth ascii variants' and 'halfwidth
// katakana variants' to something else.  Look up "Kuromoji" Lucene
// filter, KuromojiNormalizeFilter.java
// 309F is Hiragana.
// UNICODE_IS_KATAKANA(p): katakana classification of code point p.
// When KATAKANA_AS_WORDS is not defined the test is compiled out and
// always yields false. Otherwise it is true for 0x3099..0x30FF, with
// the single exception of 0x309F, and for 0x31F0..0x31FF.
// Note: the range form evaluates its argument several times, so only
// pass side-effect free expressions.
#ifndef KATAKANA_AS_WORDS
#define UNICODE_IS_KATAKANA(p) false
#else
#define UNICODE_IS_KATAKANA(p)                                          \
    (((p) >= 0x3099 && (p) <= 0x30FF && (p) != 0x309F) ||               \
     ((p) >= 0x31F0 && (p) <= 0x31FF))
#endif

// Tell whether code point c is CJK for the purposes of this splitter:
// it must satisfy UNICODE_IS_CJK and must not be classified as katakana
// by UNICODE_IS_KATAKANA.
bool TextSplit::isCJK(int c)
{
    // Katakana is carved out of the CJK set first.
    if (UNICODE_IS_KATAKANA(c)) {
        return false;
    }
    return UNICODE_IS_CJK(c);
}
bool TextSplit::isKATAKANA(int c)
...

    if (c == (unsigned int)-1) {
        LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
        return false;
    }

        CharSpanClass csc;
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
        } else if (UNICODE_IS_CJK(c)) {
            csc = CSC_CJK;
        } else {
            csc = CSC_OTHER;
        }

    if (o_processCJK && csc == CSC_CJK) {
        // CJK excluding Katakana character hit. 
        // Do like at EOF with the current non-cjk data.
        if (m_wordLen || m_span.length()) {
        if (!doemit(true, it.getBpos()))
...
        // character after the cjk sequence, just go on.
        if (it.eof())
        break;
    }

#ifdef KATAKANA_AS_WORDS
        // Only needed if we have script transitions inside this
        // routine, else the call to cjk_to_words does the job.
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
                   m_wordLen << " spl " << m_span.length() << endl);
            if (!doemit(true, it.getBpos())) {
                return false;
            }
        }
#endif

        prev_csc = csc;

    int cc = whatcc(c);

    switch (cc) {
    case SKIP:
        // Special-case soft-hyphen. To work, this depends on the

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
30	#include "log.h"	30	#include "log.h"
31	//#define UTF8ITER_CHECK	31	//#define UTF8ITER_CHECK
32	#include "utf8iter.h"	32	#include "utf8iter.h"
33	#include "uproplist.h"	33	#include "uproplist.h"
34	#include "smallut.h"	34	#include "smallut.h"
		35
		36	// Decide if we treat katakana as western scripts, splitting into
		37	// words instead of n-grams. This is not absurd (katakana is a kind of
		38	// alphabet, albeit phonetic and syllabic and is mostly used to
		39	// transcribe western words), but it does not work well because
		40	// japanese uses separator-less compound katakana words, and because
		41	// the plural terminaisons are irregular and would need a specialized
		42	// stemmer. So we for now process katakana as the rest of cjk, using
		43	// ngrams
		44	#undef KATAKANA_AS_WORDS
35		45
36	using namespace std;	46	using namespace std;
37		47
38	/**	48	/**
39	* Splitting a text into words. The code in this file works with utf-8	49	* Splitting a text into words. The code in this file works with utf-8
	...		...
207		217
208	// We should probably map 'fullwidth ascii variants' and 'halfwidth	218	// We should probably map 'fullwidth ascii variants' and 'halfwidth
209	// katakana variants' to something else. Look up "Kuromoji" Lucene	219	// katakana variants' to something else. Look up "Kuromoji" Lucene
210	// filter, KuromojiNormalizeFilter.java	220	// filter, KuromojiNormalizeFilter.java
211	// 309F is Hiragana.	221	// 309F is Hiragana.
		222	#ifdef KATAKANA_AS_WORDS
212	#define UNICODE_IS_KATAKANA(p) \	223	#define UNICODE_IS_KATAKANA(p) \
213	((p) != 0x309F && \	224	((p) != 0x309F && \
214	(((p) >= 0x3099 && (p) <= 0x30FF) \|\| \	225	(((p) >= 0x3099 && (p) <= 0x30FF) \|\| \
215	((p) >= 0x31F0 && (p) <= 0x31FF)))	226	((p) >= 0x31F0 && (p) <= 0x31FF)))
216		227	#else
		228	#define UNICODE_IS_KATAKANA(p) false
		229	#endif
		230
217	bool TextSplit::isCJK(int c)	231	bool TextSplit::isCJK(int c)
218	{	232	{
219	return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);	233	return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
220	}	234	}
221	bool TextSplit::isKATAKANA(int c)	235	bool TextSplit::isKATAKANA(int c)
	...		...
518		532
519	if (c == (unsigned int)-1) {	533	if (c == (unsigned int)-1) {
520	LOGERR("Textsplit: error occured while scanning UTF-8 string\n");	534	LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
521	return false;	535	return false;
522	}	536	}
		537
523	CharSpanClass csc;	538	CharSpanClass csc;
524	if (UNICODE_IS_KATAKANA(c)) {	539	if (UNICODE_IS_KATAKANA(c)) {
525	csc = CSC_KATAKANA;	540	csc = CSC_KATAKANA;
526	} else if (UNICODE_IS_CJK(c)) {	541	} else if (UNICODE_IS_CJK(c)) {
527	csc = CSC_CJK;	542	csc = CSC_CJK;
528	} else {	543	} else {
529	csc = CSC_OTHER;	544	csc = CSC_OTHER;
530	}	545	}
		546
531	if (o_processCJK && csc == CSC_CJK) {	547	if (o_processCJK && csc == CSC_CJK) {
532	// CJK excluding Katakana character hit.	548	// CJK excluding Katakana character hit.
533	// Do like at EOF with the current non-cjk data.	549	// Do like at EOF with the current non-cjk data.
534	if (m_wordLen \|\| m_span.length()) {	550	if (m_wordLen \|\| m_span.length()) {
535	if (!doemit(true, it.getBpos()))	551	if (!doemit(true, it.getBpos()))
	...		...
546	// character after the cjk sequence, just go on.	562	// character after the cjk sequence, just go on.
547	if (it.eof())	563	if (it.eof())
548	break;	564	break;
549	}	565	}
550		566
		567	#ifdef KATAKANA_AS_WORDS
		568	// Only needed if we have script transitions inside this
		569	// routine, else the call to cjk_to_words does the job.
551	if (csc != prev_csc && (m_wordLen \|\| m_span.length())) {	570	if (csc != prev_csc && (m_wordLen \|\| m_span.length())) {
552	LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<	571	LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
553	m_wordLen << " spl " << m_span.length() << endl);	572	m_wordLen << " spl " << m_span.length() << endl);
554	if (!doemit(true, it.getBpos())) {	573	if (!doemit(true, it.getBpos())) {
555	return false;	574	return false;
556	}	575	}
557	}	576	}
		577	#endif
		578
558	prev_csc = csc;	579	prev_csc = csc;
559
560	int cc = whatcc(c);	580	int cc = whatcc(c);
561		581
562	switch (cc) {	582	switch (cc) {
563	case SKIP:	583	case SKIP:
564	// Special-case soft-hyphen. To work, this depends on the	584	// Special-case soft-hyphen. To work, this depends on the