recoll / Code / Diff of /src/rcldb/expansiondbs.cpp

Diff of /src/rcldb/expansiondbs.cpp [f624d3] .. [994995]

Switch to unified view


...
    string ermsg;
    try {
        for (Xapian::TermIterator it = wdb.allterms_begin(); 
         it != wdb.allterms_end(); it++) {







        // Detect and skip CJK terms.








        Utf8Iter utfit(*it);
        if (TextSplit::isCJK(*utfit)) {
        // LOGDEB(("stemskipped: Skipping CJK\n"));
        continue;
        }
...
        if (!o_index_stripchars) {
        unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
        diacasedb.addSynonym(*it);
        }
#endif

      // Don't apply stemming to terms which don't look like
      // natural language words.
            if (!Db::isSpellingCandidate(*it)) {
                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
                continue;
            }

        // Create stemming synonym for every language. The input is the 
        // lowercase accented term
        for (unsigned int i = 0; i < langs.size(); i++) {
        stemdbs[i].addSynonym(lower);

	a/src/rcldb/expansiondbs.cpp		b/src/rcldb/expansiondbs.cpp
	...		...
82	string ermsg;	82	string ermsg;
83	try {	83	try {
84	for (Xapian::TermIterator it = wdb.allterms_begin();	84	for (Xapian::TermIterator it = wdb.allterms_begin();
85	it != wdb.allterms_end(); it++) {	85	it != wdb.allterms_end(); it++) {
86		86
87	// Skip terms which don't look like natural language words.
88	if (!Db::isSpellingCandidate(*it)) {
89	LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
90	continue;
91	}
92
93	// Detect and skip CJK terms.	87	// Detect and skip CJK terms.
94	// We're still sending all other multibyte utf-8 chars to
95	// the stemmer, which is not too well defined for
96	// xapian<1.0 (very obsolete now), but seems to work
97	// anyway. There shouldn't be too many in any case because
98	// accents are stripped at this point.
99	// The effect of stripping accents on stemming is not good,
100	// (e.g: in french partimes -> partim, parti^mes -> part)
101	// but fixing the issue would be complicated.
102	Utf8Iter utfit(*it);	88	Utf8Iter utfit(*it);
103	if (TextSplit::isCJK(*utfit)) {	89	if (TextSplit::isCJK(*utfit)) {
104	// LOGDEB(("stemskipped: Skipping CJK\n"));	90	// LOGDEB(("stemskipped: Skipping CJK\n"));
105	continue;	91	continue;
106	}	92	}
	...		...
114	if (!o_index_stripchars) {	100	if (!o_index_stripchars) {
115	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);	101	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
116	diacasedb.addSynonym(*it);	102	diacasedb.addSynonym(*it);
117	}	103	}
118	#endif	104	#endif
		105
		106	// Don't apply stemming to terms which don't look like
		107	// natural language words.
		108	if (!Db::isSpellingCandidate(*it)) {
		109	LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
		110	continue;
		111	}
119		112
120	// Create stemming synonym for every language. The input is the	113	// Create stemming synonym for every language. The input is the
121	// lowercase accented term	114	// lowercase accented term
122	for (unsigned int i = 0; i < langs.size(); i++) {	115	for (unsigned int i = 0; i < langs.size(); i++) {
123	stemdbs[i].addSynonym(lower);	116	stemdbs[i].addSynonym(lower);