recoll / Code / Diff of /src/rcldb/stoplist.cpp

Diff of /src/rcldb/stoplist.cpp [c25272] .. [e4eba0]

Switch to side-by-side view

--- a/src/rcldb/stoplist.cpp
+++ b/src/rcldb/stoplist.cpp
@@ -19,7 +19,7 @@
 #include "debuglog.h"
 #include "readfile.h"
 #include "unacpp.h"
-#include "textsplit.h"
+#include "smallut.h"
 #include "stoplist.h"
 
 #ifndef NO_NAMESPACES
@@ -27,40 +27,33 @@
 {
 #endif
 
-class TextSplitSW : public TextSplit {
-public:
-    set<string>& stops;
-    TextSplitSW(Flags flags, set<string>& stps) 
-        : TextSplit(flags), stops(stps) 
-    {}
-    virtual bool takeword(const string& term, int, int, int)
-    {
-        string dterm;
-        unacmaybefold(term, dterm, "UTF-8", true);
-        stops.insert(dterm);
-        return true;
-    }
-};
-
 bool StopList::setFile(const string &filename)
 {
-    m_hasStops = false;
     m_stops.clear();
     string stoptext, reason;
     if (!file_to_string(filename, stoptext, &reason)) {
-	LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n", 
-		filename.c_str(), reason.c_str()));
+	LOGDEB0(("StopList::StopList: file_to_string(%s) failed: %s\n", 
+		 filename.c_str(), reason.c_str()));
 	return false;
     }
-    TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
-    ts.text_to_words(stoptext);
-    m_hasStops = !m_stops.empty();
+    set<string> stops;
+    stringToStrings(stoptext, stops);
+    for (set<string>::iterator it = stops.begin(); 
+	 it != stops.end(); it++) {
+	string dterm;
+	unacmaybefold(*it, dterm, "UTF-8", true);
+	m_stops.insert(dterm);
+    }
+
     return true;
 }
 
+// Most sites will have an empty stop list. We try to optimize the
+// empty set case as much as possible. empty() is probably slightly faster than
+// find() in this case.
 bool StopList::isStop(const string &term) const
 {
-    return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
+    return m_stops.empty() ? false : m_stops.find(term) != m_stops.end();
 }
 
 
@@ -97,7 +90,7 @@
 }
 
 const string tstwords[] = {
-    "the", "is", "xweird"
+    "the", "is", "xweird", "autre", "autre double", "mot1", "mot double",
 };
 const int tstsz = sizeof(tstwords) / sizeof(string);