rclkar: renamed files for compat with install script

Jean-Francois Dockes Jean-Francois Dockes 2011-01-31

changed src/filters/rclkar
copied src/filters/eulangclass.py -> src/filters/rcllatinclass.py
copied src/filters/iso8859stops.zip -> src/filters/rcllatinstops.zip
src/filters/rclkar Diff Switch to side-by-side view
Loading...
src/filters/eulangclass.py to src/filters/rcllatinclass.py
--- a/src/filters/eulangclass.py
+++ b/src/filters/rcllatinclass.py
@@ -12,11 +12,11 @@
         self.langtables = self.readlanguages(langzip)
 
         # Table to translate from punctuation to spaces
-        punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
+        self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
         spaces = ""
-        for c in punct:
+        for c in self.punct:
             spaces += " " 
-        self.spacetable = string.maketrans(punct, spaces)
+        self.spacetable = string.maketrans(self.punct, spaces)
 
     # Read the languages stopwords lists
     def readlanguages(self, langzip):
@@ -33,10 +33,17 @@
         return langs
 
     def classify(self, rawtext):
-
+        # Note: we can't use an re-based method to split the data because it
+        # should be considered binary, not text.
+        # Limit to reasonable size.
+        if len(rawtext) > 10000:
+            i = rawtext.find(" ", 9000)
+            if i == -1:
+                i = 9000
+            rawtext = rawtext[0:i]
         # Remove punctuation
         rawtext = rawtext.translate(self.spacetable)
-        # Split words
+        # Split words. 
         words = rawtext.split()
         # Count frequencies
         dict = {}
@@ -45,8 +52,8 @@
         # Order word list by frequency
         lfreq = sorted(dict.iteritems(), \
                        key=lambda entry: entry[1], reverse=True)
-        # Check the ntest most frequent words against the language lists and
-        # chose the best match
+        # Check the text's ntest most frequent words against the
+        # language lists and choose the best match
         ntest = 10
         maxcount = 0
         maxlang = ""
@@ -56,7 +63,7 @@
             for w,c in lfreq[0:ntest]:
                 if w in lwords:
                     count += 1
-            print "Lang %s code %s count %d" % (lang, code, count)
+            #print "Lang %s code %s count %d" % (lang, code, count)
             if maxcount < count:
                 maxlang = lang
                 maxcount = count
@@ -73,7 +80,7 @@
     f.close()
 
     dir = os.path.dirname(__file__)
-    langszip = os.path.join(dir, 'iso8859stops.zip')
+    langszip = os.path.join(dir, 'rcllatinstops.zip')
 
     classifier = European8859TextClassifier(langszip)