recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [58d859] .. [7a2270]

Switch to unified view


...

    char wild[] = "*?[]";
    for (i = 0; i  < strlen(wild); i++)
    charclasses[int(wild[i])] = WILD;

    char special[] = ".@+-,#'_\n\r";
    for (i = 0; i  < strlen(special); i++)
    charclasses[int(special[i])] = special[i];

    for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
    unicign.insert(uniign[i]);
...
// F900..FAFF; CJK Compatibility Ideographs
// FE30..FE4F; CJK Compatibility Forms
// FF00..FFEF; Halfwidth and Fullwidth Forms
// 20000..2A6DF; CJK Unified Ideographs Extension B
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
// Note: checking against 127 first is redundant for correctness (every
// range below starts well above the ASCII block); it is only a cheap
// early exit for plain ASCII input.
#define UNICODE_IS_CJK(cp)                          \
    (127 < (cp) &&                                  \
     ((0x2E80 <= (cp) && (cp) <= 0x2EFF) ||         \
      (0x3000 <= (cp) && (cp) <= 0x9FFF) ||         \
      (0xA700 <= (cp) && (cp) <= 0xA71F) ||         \
      (0xAC00 <= (cp) && (cp) <= 0xD7AF) ||         \
      (0xF900 <= (cp) && (cp) <= 0xFAFF) ||         \
      (0xFE30 <= (cp) && (cp) <= 0xFE4F) ||         \
      (0xFF00 <= (cp) && (cp) <= 0xFFEF) ||         \
      (0x20000 <= (cp) && (cp) <= 0x2A6DF) ||       \
      (0x2F800 <= (cp) && (cp) <= 0x2FA1F)))

// Report whether code point c falls inside one of the ranges accepted
// by the UNICODE_IS_CJK macro.
bool TextSplit::isCJK(int c)
{
    if (UNICODE_IS_CJK(c))
        return true;
    return false;
}
...
            return false;
        m_inNumber = false;
        }
        m_wordStart += it.appendchartostring(m_span);
        break;
  case '_':
      if (m_wordLen) {
      if (!doemit(false, it.getBpos()))
          return false;
      m_inNumber = false;
      }
      m_wordStart += it.appendchartostring(m_span);
      break;
    case '\'':
        // If in word, potential span: o'brien, else, this is more 
        // whitespace
        if (m_wordLen) {
        if (!doemit(false, it.getBpos()))

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
86		86
87	char wild[] = "*?[]";	87	char wild[] = "*?[]";
88	for (i = 0; i < strlen(wild); i++)	88	for (i = 0; i < strlen(wild); i++)
89	charclasses[int(wild[i])] = WILD;	89	charclasses[int(wild[i])] = WILD;
90		90
91	char special[] = ".@+-,#'\n\r";	91	char special[] = ".@+-,#'_\n\r";
92	for (i = 0; i < strlen(special); i++)	92	for (i = 0; i < strlen(special); i++)
93	charclasses[int(special[i])] = special[i];	93	charclasses[int(special[i])] = special[i];
94		94
95	for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {	95	for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
96	unicign.insert(uniign[i]);	96	unicign.insert(uniign[i]);
	...		...
136	// F900..FAFF; CJK Compatibility Ideographs	136	// F900..FAFF; CJK Compatibility Ideographs
137	// FE30..FE4F; CJK Compatibility Forms	137	// FE30..FE4F; CJK Compatibility Forms
138	// FF00..FFEF; Halfwidth and Fullwidth Forms	138	// FF00..FFEF; Halfwidth and Fullwidth Forms
139	// 20000..2A6DF; CJK Unified Ideographs Extension B	139	// 20000..2A6DF; CJK Unified Ideographs Extension B
140	// 2F800..2FA1F; CJK Compatibility Ideographs Supplement	140	// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
		141	// Note: the p > 127 test is not necessary, but optimizes away the ascii case
141	#define UNICODE_IS_CJK(p) \	142	#define UNICODE_IS_CJK(p) \
142	(((p) >= 0x2E80 && (p) <= 0x2EFF) \	143	((p) > 127 && \
143	\|\| ((p) >= 0x3000 && (p) <= 0x9FFF) \	144	(((p) >= 0x2E80 && (p) <= 0x2EFF) \|\| \
144	\|\| ((p) >= 0xA700 && (p) <= 0xA71F) \	145	((p) >= 0x3000 && (p) <= 0x9FFF) \|\| \
145	\|\| ((p) >= 0xAC00 && (p) <= 0xD7AF) \	146	((p) >= 0xA700 && (p) <= 0xA71F) \|\| \
146	\|\| ((p) >= 0xF900 && (p) <= 0xFAFF) \	147	((p) >= 0xAC00 && (p) <= 0xD7AF) \|\| \
147	\|\| ((p) >= 0xFE30 && (p) <= 0xFE4F) \	148	((p) >= 0xF900 && (p) <= 0xFAFF) \|\| \
148	\|\| ((p) >= 0xFF00 && (p) <= 0xFFEF) \	149	((p) >= 0xFE30 && (p) <= 0xFE4F) \|\| \
149	\|\| ((p) >= 0x20000 && (p) <= 0x2A6DF) \	150	((p) >= 0xFF00 && (p) <= 0xFFEF) \|\| \
		151	((p) >= 0x20000 && (p) <= 0x2A6DF) \|\| \
150	\|\| ((p) >= 0x2F800 && (p) <= 0x2FA1F))	152	((p) >= 0x2F800 && (p) <= 0x2FA1F)))
151		153
152	bool TextSplit::isCJK(int c)	154	bool TextSplit::isCJK(int c)
153	{	155	{
154	return UNICODE_IS_CJK(c);	156	return UNICODE_IS_CJK(c);
155	}	157	}
	...		...
383	return false;	385	return false;
384	m_inNumber = false;	386	m_inNumber = false;
385	}	387	}
386	m_wordStart += it.appendchartostring(m_span);	388	m_wordStart += it.appendchartostring(m_span);
387	break;	389	break;
		390	case '_':
		391	if (m_wordLen) {
		392	if (!doemit(false, it.getBpos()))
		393	return false;
		394	m_inNumber = false;
		395	}
		396	m_wordStart += it.appendchartostring(m_span);
		397	break;
388	case '\'':	398	case '\'':
389	// If in word, potential span: o'brien, else, this is more	399	// If in word, potential span: o'brien, else, this is more
390	// whitespace	400	// whitespace
391	if (m_wordLen) {	401	if (m_wordLen) {
392	if (!doemit(false, it.getBpos()))	402	if (!doemit(false, it.getBpos()))