recoll / Code / Diff of /src/rcldb/stemdb.cpp

Diff of /src/rcldb/stemdb.cpp [dff775] .. [ac14b9]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.4 2006-09-20 06:21:43 dockes Exp $ (C) 2005 J.F.Dockes";
#endif

/**
 * Management of the auxiliary databases listing stems and their expansion 
 * terms
...
        rmdir(dir.c_str());
    }
    }
};



inline static bool
p_notlowerascii(unsigned int c)
{
    if (c < 'a' || (c > 'z' && c < 128))
    return true;
...
    try {
    Xapian::Stem stemmer(lang);
    Xapian::TermIterator it;
    for (it = xdb.allterms_begin(); 
         it != xdb.allterms_end(); it++) {
      // Decide whether to try stemming the term: if it contains any
      // 7-bit char that is not a lowercase letter, don't. Note that
      // because we are dealing with unaccented data, we still
      // process most Western European languages (where
      // most unaccented letters are ASCII).
        string::iterator sit = (*it).begin(), eit = sit + (*it).length();
        if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
        ++nostem;
        // LOGDEB(("stemskipped: [%s], because of 0x%x\n", 
        // (*it).c_str(), *sit));

	a/src/rcldb/stemdb.cpp		b/src/rcldb/stemdb.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.3 2006-09-19 14:30:39 dockes Exp $ (C) 2005 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.4 2006-09-20 06:21:43 dockes Exp $ (C) 2005 J.F.Dockes";
3	#endif	3	#endif
4		4
5	/**	5	/**
6	* Management of the auxiliary databases listing stems and their expansion	6	* Management of the auxiliary databases listing stems and their expansion
7	* terms	7	* terms
	...		...
65	rmdir(dir.c_str());	65	rmdir(dir.c_str());
66	}	66	}
67	}	67	}
68	};	68	};
69		69
70	// Deciding if we try to stem the term. If it has numerals or capitals
71	// we don't
72	inline static bool	70	inline static bool
73	p_notlowerascii(unsigned int c)	71	p_notlowerascii(unsigned int c)
74	{	72	{
75	if (c < 'a' || (c > 'z' && c < 128))	73	if (c < 'a' || (c > 'z' && c < 128))
76	return true;	74	return true;
	...		...
103	try {	101	try {
104	Xapian::Stem stemmer(lang);	102	Xapian::Stem stemmer(lang);
105	Xapian::TermIterator it;	103	Xapian::TermIterator it;
106	for (it = xdb.allterms_begin();	104	for (it = xdb.allterms_begin();
107	it != xdb.allterms_end(); it++) {	105	it != xdb.allterms_end(); it++) {
108	// If it has any non-lowercase 7bit char, cant be stemmable	106	// Deciding if we try to stem the term. If it has any
		107	// non-lowercase 7bit char, dont. Note that
		108	// as we are dealing with unaccented data, we are still
		109	// processing most of western european languages (where
		110	// most unaccented letters are ascii)
109	string::iterator sit = (*it).begin(), eit = sit + (*it).length();	111	string::iterator sit = (*it).begin(), eit = sit + (*it).length();
110	if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {	112	if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
111	++nostem;	113	++nostem;
112	// LOGDEB(("stemskipped: [%s], because of 0x%x\n",	114	// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
113	// (*it).c_str(), *sit));	115	// (*it).c_str(), *sit));