|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
206 |
* @param bp The current BYTE position in the stream
|
206 |
* @param bp The current BYTE position in the stream
|
207 |
* @param spanemit This is set for intermediate spans: glue char changed.
|
207 |
* @param spanemit This is set for intermediate spans: glue char changed.
|
208 |
*/
|
208 |
*/
|
209 |
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
209 |
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
210 |
{
|
210 |
{
|
211 |
LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
211 |
LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
212 |
"inn %d span [%s]\n",
|
212 |
"inn %d span [%s]\n",
|
213 |
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
213 |
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
214 |
m_inNumber, m_span.c_str()));
|
214 |
m_inNumber, m_span.c_str()));
|
215 |
|
215 |
|
216 |
// Emit span? When splitting for query, we only emit final spans
|
216 |
// Emit span? When splitting for query, we only emit final spans
|
217 |
// (spanerase)
|
217 |
// (spanerase)
|
218 |
bool spanemitted = false;
|
218 |
bool spanemitted = false;
|
219 |
if (!(m_flags & TXTS_NOSPANS) &&
|
219 |
if (!(m_flags & TXTS_NOSPANS) &&
|
220 |
!((m_wordLen == m_span.length()) &&
|
220 |
!((m_wordLen == m_span.length()) &&
|
221 |
(o_noNumbers) && m_inNumber) &&
|
221 |
(o_noNumbers) && m_inNumber) &&
|
222 |
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
222 |
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
|
|
223 |
|
|
|
224 |
// Check for an acronym/abbreviation, e.g. I.B.M.
|
|
|
225 |
if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2
|
|
|
226 |
&& m_span.length() <= 20) {
|
|
|
227 |
bool acron = true;
|
|
|
228 |
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
|
|
|
229 |
if (m_span[i] != '.') {
|
|
|
230 |
acron = false;
|
|
|
231 |
break;
|
|
|
232 |
}
|
|
|
233 |
}
|
|
|
234 |
if (acron) {
|
|
|
235 |
string acronym;
|
|
|
236 |
for (unsigned int i = 0; i < m_span.length(); i += 2) {
|
|
|
237 |
acronym += m_span[i];
|
|
|
238 |
}
|
|
|
239 |
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(),
|
|
|
240 |
bp))
|
|
|
241 |
return false;
|
|
|
242 |
}
|
|
|
243 |
}
|
|
|
244 |
|
223 |
// Maybe trim at end. These are chars that we would keep inside
|
245 |
// Maybe trim at end. These are chars that we would keep inside
|
224 |
// a span, but not at the end
|
246 |
// a span, but not at the end
|
225 |
while (m_span.length() > 0) {
|
247 |
while (m_span.length() > 0) {
|
226 |
switch (m_span[m_span.length()-1]) {
|
248 |
switch (m_span[m_span.length()-1]) {
|
227 |
case '.':
|
249 |
case '.':
|