|
a/src/rcldb/stemdb.h |
|
b/src/rcldb/stemdb.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _STEMDB_H_INCLUDED_
|
17 |
#ifndef _STEMDB_H_INCLUDED_
|
18 |
#define _STEMDB_H_INCLUDED_
|
18 |
#define _STEMDB_H_INCLUDED_
|
|
|
19 |
|
19 |
/// Stem database code
|
20 |
/** Stem database code
|
20 |
///
|
21 |
*
|
21 |
/// Stem databases list stems and the set of index terms they expand to. They
|
22 |
* Stem databases list stems and the set of index terms they expand to. They
|
22 |
/// are computed from index data by stemming each term and regrouping those
|
23 |
* are computed from index data by stemming each term and regrouping those
|
23 |
/// that stem to the same value.
|
24 |
* that stem to the same value.
|
|
|
25 |
*
|
24 |
/// Stem databases are stored as separate xapian databases (used as an
|
26 |
* Stem databases are stored as separate Xapian databases, in
|
25 |
/// Isam method), in subdirectories of the index.
|
27 |
* subdirectories of the index (e.g.: stem_french, stem_german2)
|
|
|
28 |
*
|
|
|
29 |
* The stem database is generated at the end of an indexing session by
|
|
|
30 |
* walking the whole index term list, computing the stem for each
|
|
|
31 |
* term, and building a stem->terms map.
|
|
|
32 |
*
|
|
|
33 |
* The map is then stored as a Xapian index where each stem is the
|
|
|
34 |
* unique term indexing a document, and the list of expansions is stored
|
|
|
35 |
* as the document data record. It would probably be possible to store
|
|
|
36 |
* the expansions as the document term list instead (using a prefix to
|
|
|
37 |
* distinguish the stem term).
|
|
|
38 |
*
|
|
|
39 |
* Another possible approach would be to update the stem map as we index.
|
|
|
40 |
* This would probably be less efficient for a full index pass because
|
|
|
41 |
* each term would be seen and stemmed many times, but it might be
|
|
|
42 |
* more efficient for an incremental pass with a limited number of
|
|
|
43 |
* updated documents. For a small update, the stem building part often
|
|
|
44 |
* dominates the indexing time.
|
|
|
45 |
*
|
|
|
46 |
* For future reference, I did try to store the map in a gdbm file and
|
|
|
47 |
* the result is bigger and takes more time to create than the Xapian version.
|
|
|
48 |
*/
|
26 |
|
49 |
|
27 |
#include <vector>
|
50 |
#include <vector>
|
28 |
#include <string>
|
51 |
#include <string>
|
29 |
|
52 |
|
30 |
#include <xapian.h>
|
53 |
#include <xapian.h>
|