recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [297ff2] .. [cb0794]

Switch to unified view


...
 * @param bp        The current BYTE position in the stream
 * @param spanemit  This is set for intermediate spans: glue char changed.
 */
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
{
    LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
      "inn %d span [%s]\n",
      spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
      m_inNumber, m_span.c_str()));

    // Emit span? When splitting for query, we only emit final spans
    // (spanerase)
    bool spanemitted = false;
    if (!(m_flags & TXTS_NOSPANS) && 
        !((m_wordLen == m_span.length()) && 
          (o_noNumbers) && m_inNumber) &&
    ((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
    // Maybe trim at end. These are chars that we would keep inside 
    // a span, but not at the end
    while (m_span.length() > 0) {
        switch (m_span[m_span.length()-1]) {
        case '.':
      case '-':
        case ',':
        case '@':
        case '\'':
        m_span.resize(m_span.length()-1);
        if (--bp < 0) 
...
    if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
        return false;
    }

    // Adjust state
    if (m_wordLen) {
  m_wordpos++;
  m_wordLen = 0;
    }
    if (spanerase) {
  discardspan();


    } else {
    m_wordStart = m_span.length();
    }

    return true;
}

/**
 * Drop the span being accumulated and restart word tracking.
 *
 * Zeroes the word start offset and the current word length, makes the
 * span position equal to the current word position, and empties the
 * span buffer.
 */
void TextSplit::discardspan()
{
    m_wordStart = m_wordLen = 0;
    m_spanpos = m_wordpos;
    m_span.clear();
}

/** 
 * Splitting a text into terms to be indexed.
 * We basically emit a word every time we see a separator, but some chars are
...
    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
    int curspanglue = 0;

    // Running count of consecutive non-alphanum chars. Reset to 0 when we
    // see a digit, a normal word char, or whitespace.
    int nonalnumcnt = 0;

    Utf8Iter it(in);

    for (; !it.eof(); it++) {
    unsigned int c = *it;
  nonalnumcnt++;

    if (c == (unsigned int)-1) {
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
        return false;
    }
...
    switch (cc) {
    case DIGIT:
        if (m_wordLen == 0)
        m_inNumber = true;
        m_wordLen += it.appendchartostring(m_span);
      nonalnumcnt = 0;
        break;

    case SPACE:
    SPACE:
        curspanglue = 0;
      nonalnumcnt = 0;
        if (m_wordLen || m_span.length()) {
        if (!doemit(true, it.getBpos()))
            return false;
        m_inNumber = false;
        }
...
        else
        goto SPACE;
        break;
    case '-':
    case '+':
      curspanglue = cc;
        if (m_wordLen == 0) {
      if (cc == '-') {
          if (whatcc(it[it.getCpos()+1]) == DIGIT) {
          // -10
          m_inNumber = true;
          m_wordLen += it.appendchartostring(m_span);
          } else {
          goto SPACE;
          } 
      } else {
          if (nonalnumcnt > 2) {
          discardspan();
          } else {
          m_wordStart += it.appendchartostring(m_span);
          }
      }
      } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
                    m_span[m_span.length() - 1] == 'E')) {
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {

            m_wordLen += it.appendchartostring(m_span);
        } else {
          goto SPACE;
        }

        } else {
        if (!doemit(false, it.getBpos()))
            return false;

        m_inNumber = false;
        m_wordStart += it.appendchartostring(m_span);
        }
        break;
    case '.':
...
            goto SPACE;
        m_wordLen += it.appendchartostring(m_span);
        curspanglue = cc;
        break;
        } else {
        // If . inside a word, it's spanglue, else, it's whitespace. 
        // We also keep an initial '.' for catching .net, but this adds
        // quite a few spurious terms !
                // Another problem is that something like .x-errs 
        // will be split as .x-errs, x, errs but not x-errs
        // A final comma in a word will be removed by doemit
      if (cc == '.' && it[it.getCpos()+1] != '.') {
                    // Check for number like .1
                    if (m_span.length() == 0 &&
                        whatcc(it[it.getCpos()+1]) == DIGIT) {
                        m_inNumber = true;
                        m_wordLen += it.appendchartostring(m_span);
...
                    }
                            
            if (m_wordLen) {
            // Disputable special case: set spanemit to
            // true when encountering a '.' while spanglue
            // is '_'. Think of a_b.c Done to
            // avoid breaking stuff after changing '_'
            // from wordchar to spanglue
            if (!doemit(false, it.getBpos(), curspanglue == '_'))
                return false;
            curspanglue = cc;
...
    NORMALCHAR:
            if (m_inNumber && c != 'e' && c != 'E') {
                m_inNumber = false;
            }
        m_wordLen += it.appendchartostring(m_span);
      nonalnumcnt = 0;
        break;
    }
    }
    if (m_wordLen || m_span.length()) {
    if (!doemit(true, it.getBpos()))

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
206	* @param bp The current BYTE position in the stream	206	* @param bp The current BYTE position in the stream
207	* @param spanemit This is set for intermediate spans: glue char changed.	207	* @param spanemit This is set for intermediate spans: glue char changed.
208	*/	208	*/
209	inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)	209	inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
210	{	210	{
211	LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "	211	LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
212	"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart,	212	"inn %d span [%s]\n",
213	m_wordLen, spanerase, bp, m_inNumber));	213	spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
		214	m_inNumber, m_span.c_str()));
214		215
215	// Emit span. When splitting for query, we only emit final spans	216	// Emit span? When splitting for query, we only emit final spans
		217	// (spanerase)
216	bool spanemitted = false;	218	bool spanemitted = false;
217	if (!(m_flags & TXTS_NOSPANS) &&	219	if (!(m_flags & TXTS_NOSPANS) &&
218	!((m_wordLen == m_span.length()) &&	220	!((m_wordLen == m_span.length()) &&
219	(o_noNumbers) && m_inNumber) &&	221	(o_noNumbers) && m_inNumber) &&
220	((spanemit && !(m_flags & TXTS_ONLYSPANS)) \|\| spanerase) ) {	222	((spanemit && !(m_flags & TXTS_ONLYSPANS)) \|\| spanerase) ) {
221	// Maybe trim at end. These are chars that we would keep inside	223	// Maybe trim at end. These are chars that we would keep inside
222	// a span, but not at the end	224	// a span, but not at the end
223	while (m_span.length() > 0) {	225	while (m_span.length() > 0) {
224	switch (m_span[m_span.length()-1]) {	226	switch (m_span[m_span.length()-1]) {
225	case '.':	227	case '.':
		228	case '-':
226	case ',':	229	case ',':
227	case '@':	230	case '@':
228	case '\'':	231	case '\'':
229	m_span.resize(m_span.length()-1);	232	m_span.resize(m_span.length()-1);
230	if (--bp < 0)	233	if (--bp < 0)
	...		...
248	if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))	251	if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
249	return false;	252	return false;
250	}	253	}
251		254
252	// Adjust state	255	// Adjust state
		256	if (m_wordLen) {
253	m_wordpos++;	257	m_wordpos++;
254	m_wordLen = 0;	258	m_wordLen = 0;
		259	}
255	if (spanerase) {	260	if (spanerase) {
256	m_span.erase();	261	discardspan();
257	m_spanpos = m_wordpos;
258	m_wordStart = 0;
259	} else {	262	} else {
260	m_wordStart = m_span.length();	263	m_wordStart = m_span.length();
261	}	264	}
262		265
263	return true;	266	return true;
		267	}
		268
		269	void TextSplit::discardspan()
		270	{
		271	m_span.erase();
		272	m_spanpos = m_wordpos;
		273	m_wordStart = 0;
		274	m_wordLen = 0;
264	}	275	}
265		276
266	/**	277	/**
267	* Splitting a text into terms to be indexed.	278	* Splitting a text into terms to be indexed.
268	* We basically emit a word every time we see a separator, but some chars are	279	* We basically emit a word every time we see a separator, but some chars are
	...		...
281	m_span.erase();	292	m_span.erase();
282	m_inNumber = false;	293	m_inNumber = false;
283	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;	294	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
284	int curspanglue = 0;	295	int curspanglue = 0;
285		296
		297	// Running count of non-alphanum chars. Reset when we see one;
		298	int nonalnumcnt = 0;
		299
286	Utf8Iter it(in);	300	Utf8Iter it(in);
287		301
288	for (; !it.eof(); it++) {	302	for (; !it.eof(); it++) {
289	unsigned int c = *it;	303	unsigned int c = *it;
		304	nonalnumcnt++;
290		305
291	if (c == (unsigned int)-1) {	306	if (c == (unsigned int)-1) {
292	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));	307	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
293	return false;	308	return false;
294	}	309	}
	...		...
317	switch (cc) {	332	switch (cc) {
318	case DIGIT:	333	case DIGIT:
319	if (m_wordLen == 0)	334	if (m_wordLen == 0)
320	m_inNumber = true;	335	m_inNumber = true;
321	m_wordLen += it.appendchartostring(m_span);	336	m_wordLen += it.appendchartostring(m_span);
		337	nonalnumcnt = 0;
322	break;	338	break;
323		339
324	case SPACE:	340	case SPACE:
325	SPACE:	341	SPACE:
326	curspanglue = 0;	342	curspanglue = 0;
		343	nonalnumcnt = 0;
327	if (m_wordLen \|\| m_span.length()) {	344	if (m_wordLen \|\| m_span.length()) {
328	if (!doemit(true, it.getBpos()))	345	if (!doemit(true, it.getBpos()))
329	return false;	346	return false;
330	m_inNumber = false;	347	m_inNumber = false;
331	}	348	}
	...		...
336	else	353	else
337	goto SPACE;	354	goto SPACE;
338	break;	355	break;
339	case '-':	356	case '-':
340	case '+':	357	case '+':
		358	curspanglue = cc;
341	if (m_wordLen == 0 \|\|	359	if (m_wordLen == 0) {
		360	if (cc == '-') {
		361	if (whatcc(it[it.getCpos()+1]) == DIGIT) {
		362	// -10
		363	m_inNumber = true;
		364	m_wordLen += it.appendchartostring(m_span);
		365	} else {
		366	goto SPACE;
		367	}
		368	} else {
		369	if (nonalnumcnt > 2) {
		370	discardspan();
		371	} else {
		372	m_wordStart += it.appendchartostring(m_span);
		373	}
		374	}
342	(m_inNumber && (m_span[m_span.length() - 1] == 'e' \|\|	375	} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' \|\|
343	m_span[m_span.length() - 1] == 'E'))) {	376	m_span[m_span.length() - 1] == 'E')) {
344	if (whatcc(it[it.getCpos()+1]) == DIGIT) {	377	if (whatcc(it[it.getCpos()+1]) == DIGIT) {
345	m_inNumber = true;
346	m_wordLen += it.appendchartostring(m_span);	378	m_wordLen += it.appendchartostring(m_span);
347	} else {	379	} else {
348	m_wordStart += it.appendchartostring(m_span);	380	goto SPACE;
349	}	381	}
350	curspanglue = cc;
351	} else {	382	} else {
352	if (!doemit(false, it.getBpos()))	383	if (!doemit(false, it.getBpos()))
353	return false;	384	return false;
354	curspanglue = cc;
355	m_inNumber = false;	385	m_inNumber = false;
356	m_wordStart += it.appendchartostring(m_span);	386	m_wordStart += it.appendchartostring(m_span);
357	}	387	}
358	break;	388	break;
359	case '.':	389	case '.':
	...		...
365	goto SPACE;	395	goto SPACE;
366	m_wordLen += it.appendchartostring(m_span);	396	m_wordLen += it.appendchartostring(m_span);
367	curspanglue = cc;	397	curspanglue = cc;
368	break;	398	break;
369	} else {	399	} else {
370	// If . inside a word, keep it, else, this is whitespace.	400	// If . inside a word, it's spanglue, else, it's whitespace.
371	// We also keep an initial '.' for catching .net, but this adds	401	// We also keep an initial '.' for catching .net, but this adds
372	// quite a few spurious terms !	402	// quite a few spurious terms !
373	// Another problem is that something like .x-errs	403	// Another problem is that something like .x-errs
374	// will be split as .x-errs, x, errs but not x-errs	404	// will be split as .x-errs, x, errs but not x-errs
375	// A final comma in a word will be removed by doemit	405	// A final comma in a word will be removed by doemit
376	if (cc == '.') {	406	if (cc == '.' && it[it.getCpos()+1] != '.') {
377	// Check for number like .1	407	// Check for number like .1
378	if (m_span.length() == 0 &&	408	if (m_span.length() == 0 &&
379	whatcc(it[it.getCpos()+1]) == DIGIT) {	409	whatcc(it[it.getCpos()+1]) == DIGIT) {
380	m_inNumber = true;	410	m_inNumber = true;
381	m_wordLen += it.appendchartostring(m_span);	411	m_wordLen += it.appendchartostring(m_span);
	...		...
384	}	414	}
385		415
386	if (m_wordLen) {	416	if (m_wordLen) {
387	// Disputable special case: set spanemit to	417	// Disputable special case: set spanemit to
388	// true when encountering a '.' while spanglue	418	// true when encountering a '.' while spanglue
389	// is '_'. Think of a_b.c Done because to	419	// is '_'. Think of a_b.c Done to
390	// avoid breaking stuff after changing '_'	420	// avoid breaking stuff after changing '_'
391	// from wordchar to spanglue	421	// from wordchar to spanglue
392	if (!doemit(false, it.getBpos(), curspanglue == '_'))	422	if (!doemit(false, it.getBpos(), curspanglue == '_'))
393	return false;	423	return false;
394	curspanglue = cc;	424	curspanglue = cc;
	...		...
507	NORMALCHAR:	537	NORMALCHAR:
508	if (m_inNumber && c != 'e' && c != 'E') {	538	if (m_inNumber && c != 'e' && c != 'E') {
509	m_inNumber = false;	539	m_inNumber = false;
510	}	540	}
511	m_wordLen += it.appendchartostring(m_span);	541	m_wordLen += it.appendchartostring(m_span);
		542	nonalnumcnt = 0;
512	break;	543	break;
513	}	544	}
514	}	545	}
515	if (m_wordLen \|\| m_span.length()) {	546	if (m_wordLen \|\| m_span.length()) {
516	if (!doemit(true, it.getBpos()))	547	if (!doemit(true, it.getBpos()))