|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
90 |
|
90 |
|
91 |
char wild[] = "*?[]";
|
91 |
char wild[] = "*?[]";
|
92 |
for (i = 0; i < strlen(wild); i++)
|
92 |
for (i = 0; i < strlen(wild); i++)
|
93 |
charclasses[int(wild[i])] = WILD;
|
93 |
charclasses[int(wild[i])] = WILD;
|
94 |
|
94 |
|
|
|
95 |
// Characters with special treatment:
|
|
|
96 |
//
|
|
|
97 |
// The first ones are mostly span-constructing "glue"
|
|
|
98 |
// characters, for example those typically allowing us to
|
|
|
99 |
// search for an email address as a whole (bob@isp.org instead
|
|
|
100 |
// of as a phrase "bob isp org"
|
|
|
101 |
//
|
|
|
102 |
// The case of the minus sign is a complicated one. It went
|
|
|
103 |
// from glue to non-glue to glue across Recoll versions.
|
|
|
104 |
// See minus-hyphen-dash.txt in doc/notes
|
95 |
char special[] = ".@+-#'_\n\r\f";
|
105 |
char special[] = ".@+-#'_\n\r\f";
|
96 |
for (i = 0; i < strlen(special); i++)
|
106 |
for (i = 0; i < strlen(special); i++)
|
97 |
charclasses[int(special[i])] = special[i];
|
107 |
charclasses[int(special[i])] = special[i];
|
98 |
|
108 |
|
99 |
for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
|
109 |
for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
|
|
... |
|
... |
119 |
static inline int whatcc(unsigned int c)
|
129 |
static inline int whatcc(unsigned int c)
|
120 |
{
|
130 |
{
|
121 |
if (c <= 127) {
|
131 |
if (c <= 127) {
|
122 |
return charclasses[c];
|
132 |
return charclasses[c];
|
123 |
} else {
|
133 |
} else {
|
|
|
134 |
if (c == 0x2010) {
|
|
|
135 |
// Special treatment for hyphen: handle as ascii minus. See
|
|
|
136 |
// doc/notes/minus-hyphen-dash.txt
|
|
|
137 |
return 0x2010;
|
124 |
if (sskip.find(c) != sskip.end()) {
|
138 |
} else if (sskip.find(c) != sskip.end()) {
|
125 |
return SKIP;
|
139 |
return SKIP;
|
126 |
} else if (spunc.find(c) != spunc.end()) {
|
140 |
} else if (spunc.find(c) != spunc.end()) {
|
127 |
return SPACE;
|
141 |
return SPACE;
|
128 |
} else {
|
142 |
} else {
|
129 |
vector<unsigned int>::iterator it =
|
143 |
vector<unsigned int>::iterator it =
|
|
... |
|
... |
572 |
}
|
586 |
}
|
573 |
}
|
587 |
}
|
574 |
goto SPACE;
|
588 |
goto SPACE;
|
575 |
break;
|
589 |
break;
|
576 |
|
590 |
|
|
|
591 |
case 0x2010:
|
|
|
592 |
// Hyphen is replaced with ascii minus
|
|
|
593 |
if (m_wordLen != 0) {
|
|
|
594 |
// Treat '-' inside span as glue char
|
|
|
595 |
if (!doemit(false, it.getBpos()))
|
|
|
596 |
return false;
|
|
|
597 |
m_inNumber = false;
|
|
|
598 |
m_span += '-';
|
|
|
599 |
m_wordStart++;
|
|
|
600 |
break;
|
|
|
601 |
}
|
|
|
602 |
goto SPACE;
|
|
|
603 |
|
577 |
case '.':
|
604 |
case '.':
|
578 |
{
|
605 |
{
|
579 |
// Need a little lookahead here. At worst this gets the end null
|
606 |
// Need a little lookahead here. At worst this gets the end null
|
580 |
int nextc = it[it.getCpos()+1];
|
607 |
int nextc = it[it.getCpos()+1];
|
581 |
int nextwhat = whatcc(nextc);
|
608 |
int nextwhat = whatcc(nextc);
|
|
... |
|
... |
1034 |
" ~/.xsession-errors",
|
1061 |
" ~/.xsession-errors",
|
1035 |
"this_very_long_span_this_very_long_span_this_very_long_span",
|
1062 |
"this_very_long_span_this_very_long_span_this_very_long_span",
|
1036 |
"soft\xc2\xadhyphen",
|
1063 |
"soft\xc2\xadhyphen",
|
1037 |
"soft\xc2\xad\nhyphen",
|
1064 |
"soft\xc2\xad\nhyphen",
|
1038 |
"soft\xc2\xad\n\rhyphen",
|
1065 |
"soft\xc2\xad\n\rhyphen",
|
1039 |
"hard-\nhyphen",
|
1066 |
"real\xe2\x80\x90hyphen",
|
|
|
1067 |
"real\xe2\x80\x90\nhyphen",
|
|
|
1068 |
"hyphen-\nminus",
|
1040 |
};
|
1069 |
};
|
1041 |
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
1070 |
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
1042 |
|
1071 |
|
1043 |
static string teststring1 = " nouvel-an ";
|
1072 |
static string teststring1 = " nouvel-an ";
|
1044 |
|
1073 |
|