recoll / Code / Diff of /src/rcldb/expansiondbs.cpp

Diff of /src/rcldb/expansiondbs.cpp [8b40cb] .. [52bc9f]

Switch to unified view


...

#ifndef RCL_INDEX_STRIPCHARS
    // Set up the synonym families that only exist for a raw (unstripped)
    // index. The code is compiled in unless RCL_INDEX_STRIPCHARS is
    // defined, and the databases are only (re)created when the run-time
    // flag o_index_stripchars is false.

    // Unaccented stem dbs
    vector<XapWritableComputableSynFamMember> unacstemdbs;
    // We can reuse the same stemmer pointers, the objects are stateless.
    if (!o_index_stripchars) {
  for (unsigned int i = 0; i < langs.size(); i++) {
        // NOTE(review): every langs[i] is paired with stemmers.back(),
        // i.e. the last stemmer only. If stemmers holds one entry per
        // language (it is filled outside this excerpt), this presumably
        // should be stemmers[i] -- TODO confirm.
        unacstemdbs.push_back(
      XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i], 
                        stemmers.back().getptr()));
        unacstemdbs.back().recreate();
  }
    }

    // Diacritics/case family member, built with an UNACOP_UNACFOLD term
    // transformer. The object is always constructed, but it is only
    // recreated when the index is raw.
    SynTermTransUnac transunac(UNACOP_UNACFOLD);
    XapWritableComputableSynFamMember 
    diacasedb(wdb, synFamDiCa, "all", &transunac);
    if (!o_index_stripchars)
  diacasedb.recreate();
#endif

    // Walk the list of all terms, and stem/unac each.
    string ermsg;
    try {
...
#ifndef RCL_INDEX_STRIPCHARS
        // If the index is raw, compute the case-folded term which
        // is the input to the stem db, and add a synonym from the
        // stripped term to the cased and accented one, for accent
        // and case expansion at query time.
        // Both steps are skipped at run time when o_index_stripchars is
        // set; in that case 'lower' is not assigned by this block.
      if (!o_index_stripchars) {
      // UNACOP_FOLD: fold case only, the result goes to 'lower'.
      unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
      // The original term is handed to diacasedb, which was constructed
      // with an UNACOP_UNACFOLD transformer.
      diacasedb.addSynonym(*it);
      }
#endif

        // Create stemming synonym for every language. The input is the 
        // lowercase accented term
        for (unsigned int i = 0; i < langs.size(); i++) {
...
#ifndef RCL_INDEX_STRIPCHARS
        // For a raw index, also maybe create a stem expansion for
        // the unaccented term. While this may be incorrect, it is
        // also necessary for searching in a diacritic-insensitive
        // way on a raw index
      if (!o_index_stripchars) {
      string unac;
      // UNACOP_UNAC: strip accents from the already case-folded term.
      unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
      // Only add entries when removing accents actually changed the term.
      if (unac != lower) {
            // unacstemdbs is indexed in parallel with langs; it is filled
            // under the same !o_index_stripchars condition as this block.
            for (unsigned int i = 0; i < langs.size(); i++) {
          unacstemdbs[i].addSynonym(unac);
          }
        }
      }
#endif
        }
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));

	a/src/rcldb/expansiondbs.cpp		b/src/rcldb/expansiondbs.cpp
	...		...
61		61
62	#ifndef RCL_INDEX_STRIPCHARS	62	#ifndef RCL_INDEX_STRIPCHARS
63	// Unaccented stem dbs	63	// Unaccented stem dbs
64	vector<XapWritableComputableSynFamMember> unacstemdbs;	64	vector<XapWritableComputableSynFamMember> unacstemdbs;
65	// We can reuse the same stemmer pointers, the objects are stateless.	65	// We can reuse the same stemmer pointers, the objects are stateless.
		66	if (!o_index_stripchars) {
66	for (unsigned int i = 0; i < langs.size(); i++) {	67	for (unsigned int i = 0; i < langs.size(); i++) {
67	unacstemdbs.push_back(	68	unacstemdbs.push_back(
68	XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],	69	XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
69	stemmers.back().getptr()));	70	stemmers.back().getptr()));
70	unacstemdbs.back().recreate();	71	unacstemdbs.back().recreate();
		72	}
71	}	73	}
72
73	SynTermTransUnac transunac(UNACOP_UNACFOLD);	74	SynTermTransUnac transunac(UNACOP_UNACFOLD);
74	XapWritableComputableSynFamMember	75	XapWritableComputableSynFamMember
75	diacasedb(wdb, synFamDiac, "all", &transunac);	76	diacasedb(wdb, synFamDiCa, "all", &transunac);
		77	if (!o_index_stripchars)
76	diacasedb.recreate();	78	diacasedb.recreate();
77	#endif	79	#endif
78		80
79	// Walk the list of all terms, and stem/unac each.	81	// Walk the list of all terms, and stem/unac each.
80	string ermsg;	82	string ermsg;
81	try {	83	try {
	...		...
107	#ifndef RCL_INDEX_STRIPCHARS	109	#ifndef RCL_INDEX_STRIPCHARS
108	// If the index is raw, compute the case-folded term which	110	// If the index is raw, compute the case-folded term which
109	// is the input to the stem db, and add a synonym from the	111	// is the input to the stem db, and add a synonym from the
110	// stripped term to the cased and accented one, for accent	112	// stripped term to the cased and accented one, for accent
111	// and case expansion at query time	113	// and case expansion at query time
		114	if (!o_index_stripchars) {
112	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);	115	unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
113	diacasedb.addSynonym(*it);	116	diacasedb.addSynonym(*it);
		117	}
114	#endif	118	#endif
115		119
116	// Create stemming synonym for every language. The input is the	120	// Create stemming synonym for every language. The input is the
117	// lowercase accented term	121	// lowercase accented term
118	for (unsigned int i = 0; i < langs.size(); i++) {	122	for (unsigned int i = 0; i < langs.size(); i++) {
	...		...
122	#ifndef RCL_INDEX_STRIPCHARS	126	#ifndef RCL_INDEX_STRIPCHARS
123	// For a raw index, also maybe create a stem expansion for	127	// For a raw index, also maybe create a stem expansion for
124	// the unaccented term. While this may be incorrect, it is	128	// the unaccented term. While this may be incorrect, it is
125	// also necessary for searching in a diacritic-unsensitive	129	// also necessary for searching in a diacritic-unsensitive
126	// way on a raw index	130	// way on a raw index
		131	if (!o_index_stripchars) {
127	string unac;	132	string unac;
128	unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);	133	unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
129	if (unac != lower)	134	if (unac != lower) {
130	for (unsigned int i = 0; i < langs.size(); i++) {	135	for (unsigned int i = 0; i < langs.size(); i++) {
131	unacstemdbs[i].addSynonym(unac);	136	unacstemdbs[i].addSynonym(unac);
		137	}
132	}	138	}
		139	}
133	#endif	140	#endif
134	}	141	}
135	} XCATCHERROR(ermsg);	142	} XCATCHERROR(ermsg);
136	if (!ermsg.empty()) {	143	if (!ermsg.empty()) {
137	LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));	144	LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));