recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [48bb4a] .. [b3ab39]

Switch to unified view


// Revision-control identification string for this file (file name, revision,
// date, author and copyright notice). The definition is compiled out
// entirely when the `lint` macro is defined.
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
             int btstart, int btend)
{
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));

    unsigned int l = w.length();
    if (l > 0 && l < (unsigned)maxWordLength) {
    // 1 char word: we index single letters and digits, but
    // nothing else. We might want to turn this into a test for a single
...
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
        //cerr << "ERASING single letter term " << c << endl;
        return true;
        }
    }
  if (pos != prevpos || l != prevlen) {
        bool ret = cb->takeword(w, pos, btstart, btend);
      prevlen = w.length();
        prevpos = pos;
        return ret;
    }
  LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
    }
    return true;
}

/**
...
 * @param spanerase Set if the current span is at its end. Reset it.
 * @param bp        The current BYTE position in the stream
 */
inline bool TextSplit::doemit(bool spanerase, int bp)
{
    LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
      word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));




    // Emit span. When splitting for query, we only emit final spans
    bool spanemitted = false;
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
    // Maybe trim at end. These are chars that we would keep inside
...
    setcharclasses();

    span.erase();
    word.erase(); // Current word: no punctuation at all in there
    number = false;
    prevpos = prevlen = wordpos = spanpos = charpos = 0;


    Utf8Iter it(in);

    for (; !it.eof(); it++, charpos++) {
    unsigned int c = *it;
...
        return false;
    }
    int cc = whatcc(c);
    switch (cc) {
    case LETTER:
      it.appendchartostring(word);
      it.appendchartostring(span);
        break;

    case DIGIT:
        if (word.length() == 0)
        number = true;
      it.appendchartostring(word);
      it.appendchartostring(span);
        break;

    case SPACE:
    SPACE:
        if (word.length() || span.length()) {
...
    case '-':
    case '+':
        if (word.length() == 0) {
        if (whatcc(it[charpos+1]) == DIGIT) {
            number = true;
          it.appendchartostring(word);
          it.appendchartostring(span);
        } else
          it.appendchartostring(span);
        } else {
        if (!doemit(false, it.getBpos()))
            return false;
        number = false;
      it.appendchartostring(span);
        }
        break;
    case '.':
    case ',':
        if (number) {
        // 132.jpg ?
        if (whatcc(it[charpos+1]) != DIGIT)
            goto SPACE;
      it.appendchartostring(word);
      it.appendchartostring(span);
        break;
        } else {
        // If . inside a word, keep it, else, this is whitespace. 
        // We also keep an initial '.' for catching .net, but this adds
        // quite a few spurious terms !
...
            if (!doemit(false, it.getBpos()))
                return false;
            // span length could have been adjusted by trimming
            // inside doemit
            if (span.length())
              it.appendchartostring(span);
            break;
            } else {
          it.appendchartostring(span);
            break;
            }
        }
        }
        goto SPACE;
...
        if (word.length()) {
        if (!doemit(false, it.getBpos()))
            return false;
        number = false;
        }
      it.appendchartostring(span);
        break;
    case '\'':
        // If in word, potential span: o'brien, else, this is more 
        // whitespace
        if (word.length()) {
        if (!doemit(false, it.getBpos()))
            return false;
        number = false;
      it.appendchartostring(span);
        }
        break;
    case '#': 
        // Keep it only at end of word... Special case for c# you see...
        if (word.length() > 0) {
        int w = whatcc(it[charpos+1]);
        if (w == SPACE || w == '\n' || w == '\r') {
          it.appendchartostring(word);
          it.appendchartostring(span);
            break;
        }
        }
        goto SPACE;
        break;
...
        goto SPACE;
        }
        break;

    default:
      it.appendchartostring(word);
      it.appendchartostring(span);
        break;
    }
    }
    if (word.length() || span.length()) {
    if (!doemit(true, it.getBpos()))
...
using namespace std;

// A small class to hold state while splitting text
class mySplitterCB : public TextSplitCB {
    int first;
    bool nooutput;
 public:
    mySplitterCB() : first(1), nooutput(false) {}
    void setNoOut(bool val) {nooutput = val;}
    bool takeword(const std::string &term, int pos, int bs, int be) {
  if (nooutput)
      return true;
    if (first) {
        printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
        first = 0;
    }
    printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
...
// Presumably a built-in sample input for the test driver.
// NOTE(review): the code that consumes it is outside this excerpt -- confirm.
static string teststring1 = " 124, ";

// Name the program was invoked as; main() assigns argv[0] to it.
static string thisprog;

// Command-line synopsis and option summary for the textsplit test driver.
// Built from adjacent string literals, one literal per line of help text.
static string usage(
    " textsplit [opts] [filename]\n"
    "   -S: no output\n"
    "   -s:  only spans\n"
    "   -w:  only words\n"
    " if filename is 'stdin', will read stdin for data (end with ^D)\n"
    "  \n\n");

static void
...
}

// Bitmask of the command-line options seen by main(): each OPT_* bit below
// is OR-ed in when the matching option letter is parsed.
static int        op_flags;
// -s: split with TextSplit::TXTS_ONLYSPANS.
#define OPT_s     0x1 
// -w: split with TextSplit::TXTS_NOSPANS (only consulted when -s is not set).
#define OPT_w     0x2
// -S: suppress the splitter callback's output (main() calls cb.setNoOut(true)).
#define OPT_S   0x4

int main(int argc, char **argv)
{
    thisprog = argv[0];
    argc--; argv++;
...
        /* Case of "adb - core" */
        Usage();
    while (**argv)
        switch (*(*argv)++) {
        case 's':   op_flags |= OPT_s; break;
      case 'S':   op_flags |= OPT_S; break;
        case 'w':   op_flags |= OPT_w; break;
        default: Usage();   break;
        }
    argc--; argv++;
    }
    DebugLog::getdbl()->setloglevel(DEBDEB1);
    DebugLog::setfilename("stderr");

    mySplitterCB cb;
    TextSplit::Flags flags = TextSplit::TXTS_NONE;

    if (op_flags&OPT_S)
  cb.setNoOut(true);

    if (op_flags&OPT_s)
    flags = TextSplit::TXTS_ONLYSPANS;
    else if (op_flags&OPT_w)
    flags = TextSplit::TXTS_NOSPANS;
    TextSplit splitter(&cb,  flags);

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
91	// Do some checking (the kind which is simpler to do here than in the	91	// Do some checking (the kind which is simpler to do here than in the
92	// main loop), then send term to our client.	92	// main loop), then send term to our client.
93	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,	93	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
94	int btstart, int btend)	94	int btstart, int btend)
95	{	95	{
96	LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));	96	LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
97		97
98	unsigned int l = w.length();	98	unsigned int l = w.length();
99	if (l > 0 && l < (unsigned)maxWordLength) {	99	if (l > 0 && l < (unsigned)maxWordLength) {
100	// 1 char word: we index single letters and digits, but	100	// 1 char word: we index single letters and digits, but
101	// nothing else. We might want to turn this into a test for a single	101	// nothing else. We might want to turn this into a test for a single
	...		...
105	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {	105	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
106	//cerr << "ERASING single letter term " << c << endl;	106	//cerr << "ERASING single letter term " << c << endl;
107	return true;	107	return true;
108	}	108	}
109	}	109	}
110	if (pos != prevpos || l != prevterm.length() || w != prevterm) {	110	if (pos != prevpos || l != prevlen) {
111	bool ret = cb->takeword(w, pos, btstart, btend);	111	bool ret = cb->takeword(w, pos, btstart, btend);
112	prevterm = w;	112	prevlen = w.length();
113	prevpos = pos;	113	prevpos = pos;
114	return ret;	114	return ret;
115	}	115	}
		116	LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
116	}	117	}
117	return true;	118	return true;
118	}	119	}
119		120
120	/**	121	/**
	...		...
135	* @param spanerase Set if the current span is at its end. Reset it.	136	* @param spanerase Set if the current span is at its end. Reset it.
136	* @param bp The current BYTE position in the stream	137	* @param bp The current BYTE position in the stream
137	*/	138	*/
138	inline bool TextSplit::doemit(bool spanerase, int bp)	139	inline bool TextSplit::doemit(bool spanerase, int bp)
139	{	140	{
140	#if 0	141	LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
141	cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<	142	word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
142	span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
143	<< endl;
144	#endif
145		143
146	// Emit span. When splitting for query, we only emit final spans	144	// Emit span. When splitting for query, we only emit final spans
147	bool spanemitted = false;	145	bool spanemitted = false;
148	if (spanerase && !(m_flags & TXTS_NOSPANS)) {	146	if (spanerase && !(m_flags & TXTS_NOSPANS)) {
149	// Maybe trim at end These are chars that we would keep inside	147	// Maybe trim at end These are chars that we would keep inside
	...		...
212	setcharclasses();	210	setcharclasses();
213		211
214	span.erase();	212	span.erase();
215	word.erase(); // Current word: no punctuation at all in there	213	word.erase(); // Current word: no punctuation at all in there
216	number = false;	214	number = false;
217	prevpos = wordpos = spanpos = charpos = 0;	215	prevpos = prevlen = wordpos = spanpos = charpos = 0;
218	prevterm.erase();
219		216
220	Utf8Iter it(in);	217	Utf8Iter it(in);
221		218
222	for (; !it.eof(); it++, charpos++) {	219	for (; !it.eof(); it++, charpos++) {
223	unsigned int c = *it;	220	unsigned int c = *it;
	...		...
226	return false;	223	return false;
227	}	224	}
228	int cc = whatcc(c);	225	int cc = whatcc(c);
229	switch (cc) {	226	switch (cc) {
230	case LETTER:	227	case LETTER:
231	word += it;	228	it.appendchartostring(word);
232	span += it;	229	it.appendchartostring(span);
233	break;	230	break;
234		231
235	case DIGIT:	232	case DIGIT:
236	if (word.length() == 0)	233	if (word.length() == 0)
237	number = true;	234	number = true;
238	word += it;	235	it.appendchartostring(word);
239	span += it;	236	it.appendchartostring(span);
240	break;	237	break;
241		238
242	case SPACE:	239	case SPACE:
243	SPACE:	240	SPACE:
244	if (word.length() || span.length()) {	241	if (word.length() || span.length()) {
	...		...
250	case '-':	247	case '-':
251	case '+':	248	case '+':
252	if (word.length() == 0) {	249	if (word.length() == 0) {
253	if (whatcc(it[charpos+1]) == DIGIT) {	250	if (whatcc(it[charpos+1]) == DIGIT) {
254	number = true;	251	number = true;
255	word += it;	252	it.appendchartostring(word);
256	span += it;	253	it.appendchartostring(span);
257	} else	254	} else
258	span += it;	255	it.appendchartostring(span);
259	} else {	256	} else {
260	if (!doemit(false, it.getBpos()))	257	if (!doemit(false, it.getBpos()))
261	return false;	258	return false;
262	number = false;	259	number = false;
263	span += it;	260	it.appendchartostring(span);
264	}	261	}
265	break;	262	break;
266	case '.':	263	case '.':
267	case ',':	264	case ',':
268	if (number) {	265	if (number) {
269	// 132.jpg ?	266	// 132.jpg ?
270	if (whatcc(it[charpos+1]) != DIGIT)	267	if (whatcc(it[charpos+1]) != DIGIT)
271	goto SPACE;	268	goto SPACE;
272	word += it;	269	it.appendchartostring(word);
273	span += it;	270	it.appendchartostring(span);
274	break;	271	break;
275	} else {	272	} else {
276	// If . inside a word, keep it, else, this is whitespace.	273	// If . inside a word, keep it, else, this is whitespace.
277	// We also keep an initial '.' for catching .net, but this adds	274	// We also keep an initial '.' for catching .net, but this adds
278	// quite a few spurious terms !	275	// quite a few spurious terms !
	...		...
284	if (!doemit(false, it.getBpos()))	281	if (!doemit(false, it.getBpos()))
285	return false;	282	return false;
286	// span length could have been adjusted by trimming	283	// span length could have been adjusted by trimming
287	// inside doemit	284	// inside doemit
288	if (span.length())	285	if (span.length())
289	span += it;	286	it.appendchartostring(span);
290	break;	287	break;
291	} else {	288	} else {
292	span += it;	289	it.appendchartostring(span);
293	break;	290	break;
294	}	291	}
295	}	292	}
296	}	293	}
297	goto SPACE;	294	goto SPACE;
	...		...
300	if (word.length()) {	297	if (word.length()) {
301	if (!doemit(false, it.getBpos()))	298	if (!doemit(false, it.getBpos()))
302	return false;	299	return false;
303	number = false;	300	number = false;
304	}	301	}
305	span += it;	302	it.appendchartostring(span);
306	break;	303	break;
307	case '\'':	304	case '\'':
308	// If in word, potential span: o'brien, else, this is more	305	// If in word, potential span: o'brien, else, this is more
309	// whitespace	306	// whitespace
310	if (word.length()) {	307	if (word.length()) {
311	if (!doemit(false, it.getBpos()))	308	if (!doemit(false, it.getBpos()))
312	return false;	309	return false;
313	number = false;	310	number = false;
314	span += it;	311	it.appendchartostring(span);
315	}	312	}
316	break;	313	break;
317	case '#':	314	case '#':
318	// Keep it only at end of word... Special case for c# you see...	315	// Keep it only at end of word... Special case for c# you see...
319	if (word.length() > 0) {	316	if (word.length() > 0) {
320	int w = whatcc(it[charpos+1]);	317	int w = whatcc(it[charpos+1]);
321	if (w == SPACE || w == '\n' || w == '\r') {	318	if (w == SPACE || w == '\n' || w == '\r') {
322	word += it;	319	it.appendchartostring(word);
323	span += it;	320	it.appendchartostring(span);
324	break;	321	break;
325	}	322	}
326	}	323	}
327	goto SPACE;	324	goto SPACE;
328	break;	325	break;
	...		...
341	goto SPACE;	338	goto SPACE;
342	}	339	}
343	break;	340	break;
344		341
345	default:	342	default:
346	word += it;	343	it.appendchartostring(word);
347	span += it;	344	it.appendchartostring(span);
348	break;	345	break;
349	}	346	}
350	}	347	}
351	if (word.length() || span.length()) {	348	if (word.length() || span.length()) {
352	if (!doemit(true, it.getBpos()))	349	if (!doemit(true, it.getBpos()))
	...		...
371	using namespace std;	368	using namespace std;
372		369
373	// A small class to hold state while splitting text	370	// A small class to hold state while splitting text
374	class mySplitterCB : public TextSplitCB {	371	class mySplitterCB : public TextSplitCB {
375	int first;	372	int first;
		373	bool nooutput;
376	public:	374	public:
377	mySplitterCB() : first(1) {}	375	mySplitterCB() : first(1), nooutput(false) {}
378		376	void setNoOut(bool val) {nooutput = val;}
379	bool takeword(const std::string &term, int pos, int bs, int be) {	377	bool takeword(const std::string &term, int pos, int bs, int be) {
		378	if (nooutput)
		379	return true;
380	if (first) {	380	if (first) {
381	printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");	381	printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
382	first = 0;	382	first = 0;
383	}	383	}
384	printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);	384	printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
	...		...
404	static string teststring1 = " 124, ";	404	static string teststring1 = " 124, ";
405		405
406	static string thisprog;	406	static string thisprog;
407		407
408	static string usage =	408	static string usage =
409	" textsplit [opts] [filename]\n"	409	" textsplit [opts] [filename]\n"
		410	" -S: no output\n"
410	" -s: only spans\n"	411	" -s: only spans\n"
411	" -w: only words\n"	412	" -w: only words\n"
412	" if filename is 'stdin', will read stdin for data (end with ^D)\n"	413	" if filename is 'stdin', will read stdin for data (end with ^D)\n"
413	" \n\n"	414	" \n\n"
414	;	415	;
415		416
416	static void	417	static void
	...		...
421	}	422	}
422		423
423	static int op_flags;	424	static int op_flags;
424	#define OPT_s 0x1	425	#define OPT_s 0x1
425	#define OPT_w 0x2	426	#define OPT_w 0x2
		427	#define OPT_S 0x4
426		428
427	int main(int argc, char **argv)	429	int main(int argc, char **argv)
428	{	430	{
429	thisprog = argv[0];	431	thisprog = argv[0];
430	argc--; argv++;	432	argc--; argv++;
	...		...
435	/* Cas du "adb - core" */	437	/* Cas du "adb - core" */
436	Usage();	438	Usage();
437	while (**argv)	439	while (**argv)
438	switch (*(*argv)++) {	440	switch (*(*argv)++) {
439	case 's': op_flags |= OPT_s; break;	441	case 's': op_flags |= OPT_s; break;
		442	case 'S': op_flags |= OPT_S; break;
440	case 'w': op_flags |= OPT_w; break;	443	case 'w': op_flags |= OPT_w; break;
441	default: Usage(); break;	444	default: Usage(); break;
442	}	445	}
443	argc--; argv++;	446	argc--; argv++;
444	}	447	}
445	DebugLog::getdbl()->setloglevel(DEBDEB1);	448	DebugLog::getdbl()->setloglevel(DEBDEB1);
446	DebugLog::setfilename("stderr");	449	DebugLog::setfilename("stderr");
		450
447	mySplitterCB cb;	451	mySplitterCB cb;
448	TextSplit::Flags flags = TextSplit::TXTS_NONE;	452	TextSplit::Flags flags = TextSplit::TXTS_NONE;
		453
		454	if (op_flags&OPT_S)
		455	cb.setNoOut(true);
		456
449	if (op_flags&OPT_s)	457	if (op_flags&OPT_s)
450	flags = TextSplit::TXTS_ONLYSPANS;	458	flags = TextSplit::TXTS_ONLYSPANS;
451	else if (op_flags&OPT_w)	459	else if (op_flags&OPT_w)
452	flags = TextSplit::TXTS_NOSPANS;	460	flags = TextSplit::TXTS_NOSPANS;
453	TextSplit splitter(&cb, flags);	461	TextSplit splitter(&cb, flags);