Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
90
90
91
    char wild[] = "*?[]";
91
    char wild[] = "*?[]";
92
    for (i = 0; i  < strlen(wild); i++)
92
    for (i = 0; i  < strlen(wild); i++)
93
        charclasses[int(wild[i])] = WILD;
93
        charclasses[int(wild[i])] = WILD;
94
94
95
        // Characters with special treatment:
96
        //
97
        // The first ones are mostly span-constructing "glue"
98
        // characters, for example those typically allowing us to
99
        // search for an email address as a whole (bob@isp.org instead
100
        // of as a phrase "bob isp org")
101
        //
102
        // The case of the minus sign is a complicated one. It went
103
        // from glue to non-glue to glue along Recoll versions. 
104
        // See minus-hyphen-dash.txt in doc/notes
95
    char special[] = ".@+-#'_\n\r\f";
105
    char special[] = ".@+-#'_\n\r\f";
96
    for (i = 0; i  < strlen(special); i++)
106
    for (i = 0; i  < strlen(special); i++)
97
        charclasses[int(special[i])] = special[i];
107
        charclasses[int(special[i])] = special[i];
98
108
99
    for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
109
    for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
...
...
119
static inline int whatcc(unsigned int c)
129
static inline int whatcc(unsigned int c)
120
{
130
{
121
    if (c <= 127) {
131
    if (c <= 127) {
122
    return charclasses[c]; 
132
    return charclasses[c]; 
123
    } else {
133
    } else {
134
        if (c == 0x2010) {
135
            // Special treatment for hyphen: handle as ascii minus. See
136
            // doc/notes/minus-hyphen-dash.txt
137
            return 0x2010;
124
  if (sskip.find(c) != sskip.end()) {
138
        } else if (sskip.find(c) != sskip.end()) {
125
        return SKIP;
139
        return SKIP;
126
    } else if (spunc.find(c) != spunc.end()) {
140
    } else if (spunc.find(c) != spunc.end()) {
127
        return SPACE;
141
        return SPACE;
128
    } else {
142
    } else {
129
        vector<unsigned int>::iterator it = 
143
        vector<unsigned int>::iterator it = 
...
...
572
                }
586
                }
573
        }
587
        }
574
            goto SPACE;
588
            goto SPACE;
575
        break;
589
        break;
576
590
591
  case 0x2010:
592
            // Hyphen is replaced with ascii minus
593
      if (m_wordLen != 0) {
594
                // Treat '-' inside span as glue char
595
                if (!doemit(false, it.getBpos()))
596
                    return false;
597
                m_inNumber = false;
598
                m_span += '-';
599
                m_wordStart++;
600
                break;
601
            }
602
            goto SPACE;
603
577
    case '.':
604
    case '.':
578
    {
605
    {
579
        // Need a little lookahead here. At worst this gets the end null
606
        // Need a little lookahead here. At worst this gets the end null
580
        int nextc = it[it.getCpos()+1];
607
        int nextc = it[it.getCpos()+1];
581
        int nextwhat = whatcc(nextc);
608
        int nextwhat = whatcc(nextc);
...
...
1034
    " ~/.xsession-errors",
1061
    " ~/.xsession-errors",
1035
    "this_very_long_span_this_very_long_span_this_very_long_span",
1062
    "this_very_long_span_this_very_long_span_this_very_long_span",
1036
    "soft\xc2\xadhyphen",
1063
    "soft\xc2\xadhyphen",
1037
    "soft\xc2\xad\nhyphen",
1064
    "soft\xc2\xad\nhyphen",
1038
    "soft\xc2\xad\n\rhyphen",
1065
    "soft\xc2\xad\n\rhyphen",
1039
    "hard-\nhyphen",
1066
    "real\xe2\x80\x90hyphen",
1067
    "real\xe2\x80\x90\nhyphen",
1068
    "hyphen-\nminus",
1040
};
1069
};
1041
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
1070
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
1042
1071
1043
static string teststring1 = " nouvel-an ";
1072
static string teststring1 = " nouvel-an ";
1044
1073