Switch to side-by-side view

--- a/src/rcldb/stemdb.cpp
+++ b/src/rcldb/stemdb.cpp
@@ -130,22 +130,18 @@
     // Else, we add an entry to the multimap.
     // At the end, we only save stem-terms associations with several terms, the
     // others are not useful
-    // Note: a map<string, list<string> > would probably be more efficient
-    multimap<string, string> assocs;
+    // Note: a map<string, vector<string> > would probably be more efficient
+    map<string, vector<string> > assocs;
     // Statistics
     int nostem=0; // Dont even try: not-alphanum (incomplete for now)
     int stemconst=0; // Stem == term
-    int stemdiff=0;  // Count of all different stems
     int stemmultiple = 0; // Count of stems with multiple derivatives
     try {
         Xapian::Stem stemmer(lang);
         Xapian::TermIterator it;
         for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
-            // Deciding if we try to stem the term. 
-
-	    // If it has any
-            // non-lowercase 7bit char (that is, numbers, capitals and
-            // punctuation) dont. 
+	    // If the term has any non-lowercase 7bit char (that is,
+            // numbers, capitals and punctuation) dont stem.
             string::iterator sit = (*it).begin(), eit = sit + (*it).length();
             if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
                 ++nostem;
@@ -158,10 +154,11 @@
 	    // We're still sending all other multibyte utf-8 chars to
             // the stemmer, which is not too well defined for
             // xapian<1.0 (very obsolete now), but seems to work
-            // anyway. There shouldnt be too many in any case because
-            // accents are stripped at this point. Effect of stripping
-            // accents on stemming unknown, hopefuly none, there is
-            // nothing we can do about it.
+            // anyway. There shouldn't be too many in any case because
+            // accents are stripped at this point. 
+	    // The effect of stripping accents on stemming is not good, 
+            // (e.g: in french partimes -> partim, parti^mes -> part)
+	    // but fixing the issue would be complicated.
 	    Utf8Iter utfit(*it);
 	    if (TextSplit::isCJK(*utfit)) {
 		// LOGDEB(("stemskipped: Skipping CJK\n"));
@@ -175,7 +172,7 @@
                 ++stemconst;
                 continue;
             }
-            assocs.insert(pair<string,string>(stem, *it));
+            assocs[stem].push_back(*it);
         }
     } catch (const Xapian::Error &e) {
         LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
@@ -213,51 +210,25 @@
         return false;
     }
 
-    // Enter pseud-docs in db by walking the multimap.
-    string stem;
-    vector<string> derivs;
-    for (multimap<string,string>::const_iterator it = assocs.begin();
+    // Enter pseud-docs in db by walking the map.
+    for (map<string, vector<string> >::const_iterator it = assocs.begin();
          it != assocs.end(); it++) {
-        if (stem == it->first) {
-            // Staying with same stem
-            derivs.push_back(it->second);
-            // cerr << " " << it->second << endl;
-        } else {
-            // Changing stems 
-            ++stemdiff;
-            LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));
-
-            // We need an entry even if there is only one derivative
-            // so that it is possible to search by entering the stem
-            // even if it doesnt exist as a term
-            if (!derivs.empty()) {
-
-                if (derivs.size() > 1)
-                    ++stemmultiple;
+	LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
+	// We need an entry even if there is only one derivative
+	// so that it is possible to search by entering the stem
+	// even if it doesnt exist as a term
+	if (it->second.size() > 1)
+	    ++stemmultiple;
                     
-                if (!addAssoc(sdb, stem, derivs)) {
-                    return false;
-                }
-                derivs.clear();
-            }
-            stem = it->first;
-            derivs.push_back(it->second);
-            //	    cerr << "\n" << stem << " " << it->second;
-        }
-    }
-    if (!derivs.empty()) {
-        if (derivs.size() > 1)
-            ++stemmultiple;
-                    
-        if (!addAssoc(sdb, stem, derivs)) {
-            return false;
-        }
+	if (!addAssoc(sdb, it->first, it->second)) {
+	    return false;
+	}
     }
 
     LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", 
              lang.c_str(), cron.secs()));
-    LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", 
-	    assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
+    LOGDEB(("Stem map size: %d mult %d const %d no %d \n", 
+	    assocs.size(), stemmultiple, stemconst, nostem));
     wiper.do_it = false;
     return true;
 }