recoll / Code / Diff of /src/rcldb/expansiondbs.cpp

Diff of /src/rcldb/expansiondbs.cpp [af8b82] .. [3fbcbc]

Switch to unified view


...
    // Erase and recreate all the expansion groups

    // If langs is empty and we don't need casediac expansion, then no need to
    // walk the big list
    if (langs.empty()) {

    if (o_index_stripchars)

        return true;
    }

    // Stem dbs
    vector<XapWritableComputableSynFamMember> stemdbs;
...
        XapWritableComputableSynFamMember(wdb, synFamStem, langs[i], 
                          stemmers.back().getptr()));
    stemdbs.back().recreate();
    }


    // Unaccented stem dbs
    vector<XapWritableComputableSynFamMember> unacstemdbs;
    // We can reuse the same stemmer pointers, the objects are stateless.
    if (!o_index_stripchars) {
    for (unsigned int i = 0; i < langs.size(); i++) {
...
    SynTermTransUnac transunac(UNACOP_UNACFOLD);
    XapWritableComputableSynFamMember 
    diacasedb(wdb, synFamDiCa, "all", &transunac);
    if (!o_index_stripchars)
    diacasedb.recreate();


    // Walk the list of all terms, and stem/unac each.
    string ermsg;
    try {
    Xapian::TermIterator it = wdb.allterms_begin();
...
        // LOGDEB(("stemskipped: Skipping CJK\n"));
        continue;
        }

        string lower = *it;

        // If the index is raw, compute the case-folded term which
        // is the input to the stem db, and add a synonym from the
        // stripped term to the cased and accented one, for accent
        // and case expansion at query time
        if (!o_index_stripchars) {
        unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
        diacasedb.addSynonym(*it);
        }


        // Don't apply stemming to terms which don't look like
        // natural language words.
            if (!Db::isSpellingCandidate(*it)) {
                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
...
        // lowercase accented term
        for (unsigned int i = 0; i < langs.size(); i++) {
        stemdbs[i].addSynonym(lower);
        }


        // For a raw index, also maybe create a stem expansion for
        // the unaccented term. While this may be incorrect, it is
        // also necessary for searching in a diacritic-insensitive
        // way on a raw index
        if (!o_index_stripchars) {
...
            for (unsigned int i = 0; i < langs.size(); i++) {
            unacstemdbs[i].addSynonym(unac);
            }
        }
        }

        }
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
        return false;

	a/src/rcldb/expansiondbs.cpp		b/src/rcldb/expansiondbs.cpp
	...		...
46	// Erase and recreate all the expansion groups	46	// Erase and recreate all the expansion groups
47		47
48	// If langs is empty and we don't need casediac expansion, then no need to	48	// If langs is empty and we don't need casediac expansion, then no need to
49	// walk the big list	49	// walk the big list
50	if (langs.empty()) {	50	if (langs.empty()) {
51	#ifndef RCL_INDEX_STRIPCHARS
52	if (o_index_stripchars)	51	if (o_index_stripchars)
53	#endif
54	return true;	52	return true;
55	}	53	}
56		54
57	// Stem dbs	55	// Stem dbs
58	vector<XapWritableComputableSynFamMember> stemdbs;	56	vector<XapWritableComputableSynFamMember> stemdbs;
	...		...
66	XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],	64	XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
67	stemmers.back().getptr()));	65	stemmers.back().getptr()));
68	stemdbs.back().recreate();	66	stemdbs.back().recreate();
69	}	67	}
70		68
71	#ifndef RCL_INDEX_STRIPCHARS
72	// Unaccented stem dbs	69	// Unaccented stem dbs
73	vector<XapWritableComputableSynFamMember> unacstemdbs;	70	vector<XapWritableComputableSynFamMember> unacstemdbs;
74	// We can reuse the same stemmer pointers, the objects are stateless.	71	// We can reuse the same stemmer pointers, the objects are stateless.
75	if (!o_index_stripchars) {	72	if (!o_index_stripchars) {
76	for (unsigned int i = 0; i < langs.size(); i++) {	73	for (unsigned int i = 0; i < langs.size(); i++) {
	...		...
83	SynTermTransUnac transunac(UNACOP_UNACFOLD);	80	SynTermTransUnac transunac(UNACOP_UNACFOLD);
84	XapWritableComputableSynFamMember	81	XapWritableComputableSynFamMember
85	diacasedb(wdb, synFamDiCa, "all", &transunac);	82	diacasedb(wdb, synFamDiCa, "all", &transunac);
86	if (!o_index_stripchars)	83	if (!o_index_stripchars)
87	diacasedb.recreate();	84	diacasedb.recreate();
88	#endif
89		85
90	// Walk the list of all terms, and stem/unac each.	86	// Walk the list of all terms, and stem/unac each.
91	string ermsg;	87	string ermsg;
92	try {	88	try {
93	Xapian::TermIterator it = wdb.allterms_begin();	89	Xapian::TermIterator it = wdb.allterms_begin();
	...		...
105	// LOGDEB(("stemskipped: Skipping CJK\n"));	101	// LOGDEB(("stemskipped: Skipping CJK\n"));
106	continue;	102	continue;
107	}	103	}
108		104
109	string lower = *it;	105	string lower = *it;
110	#ifndef RCL_INDEX_STRIPCHARS
111	// If the index is raw, compute the case-folded term which	106	// If the index is raw, compute the case-folded term which
112	// is the input to the stem db, and add a synonym from the	107	// is the input to the stem db, and add a synonym from the
113	// stripped term to the cased and accented one, for accent	108	// stripped term to the cased and accented one, for accent
114	// and case expansion at query time	109	// and case expansion at query time
115	if (!o_index_stripchars) {	110	if (!o_index_stripchars) {
116	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);	111	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
117	diacasedb.addSynonym(*it);	112	diacasedb.addSynonym(*it);
118	}	113	}
119	#endif
120		114
121	// Don't apply stemming to terms which don't look like	115	// Don't apply stemming to terms which don't look like
122	// natural language words.	116	// natural language words.
123	if (!Db::isSpellingCandidate(*it)) {	117	if (!Db::isSpellingCandidate(*it)) {
124	LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));	118	LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
	...		...
129	// lowercase accented term	123	// lowercase accented term
130	for (unsigned int i = 0; i < langs.size(); i++) {	124	for (unsigned int i = 0; i < langs.size(); i++) {
131	stemdbs[i].addSynonym(lower);	125	stemdbs[i].addSynonym(lower);
132	}	126	}
133		127
134	#ifndef RCL_INDEX_STRIPCHARS
135	// For a raw index, also maybe create a stem expansion for	128	// For a raw index, also maybe create a stem expansion for
136	// the unaccented term. While this may be incorrect, it is	129	// the unaccented term. While this may be incorrect, it is
137	// also necessary for searching in a diacritic-insensitive	130	// also necessary for searching in a diacritic-insensitive
138	// way on a raw index	131	// way on a raw index
139	if (!o_index_stripchars) {	132	if (!o_index_stripchars) {
	...		...
143	for (unsigned int i = 0; i < langs.size(); i++) {	136	for (unsigned int i = 0; i < langs.size(); i++) {
144	unacstemdbs[i].addSynonym(unac);	137	unacstemdbs[i].addSynonym(unac);
145	}	138	}
146	}	139	}
147	}	140	}
148	#endif
149	}	141	}
150	} XCATCHERROR(ermsg);	142	} XCATCHERROR(ermsg);
151	if (!ermsg.empty()) {	143	if (!ermsg.empty()) {
152	LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));	144	LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
153	return false;	145	return false;