recoll / Code / [533068] /src/filters/rcllatinclass.py

[533068]: src / filters / rcllatinclass.py History

rcllatinclass.py 124 lines (103 with data), 4.5 kB

#!/usr/bin/env python
"""Try to guess a text's language and character set by checking how it matches lists of
common words. This is not a primary method of detection because it's slow and unreliable, but it
may be a help in discrimating, for exemple, before european languages using relatively close
variations of iso-8859.
This is used in association with a zip file containing a number of stopwords list: rcllatinstops.zip

As a note, I am looking for a good iso-8859-7 stop word list for Greek; the only ones I found
were utf-8 and there are errors when transcoding them to iso-8859-7. I guess that there is something
about Greek accents that I don't know and that would enable fixing this (some kind of simplification
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
epsilon with dasia (in Unicode but not in iso-8859-7). Can this be replaced by either epsilon or
epsilon with acute accent?
"""

import sys
import string
import glob
import os
import os.path
from zipfile import ZipFile

class European8859TextClassifier:
    """Guess the language and 8-bit character set of a byte string.

    The guess is made by matching the most frequent words of the input
    against stop word lists loaded from a zip archive, one list per
    (language, charset) pair. All the processing is done on byte strings
    (``str`` on Python 2, ``bytes`` on Python 3): the data is never decoded.
    """

    def __init__(self, langzip=""):
        """Load the stop word lists and build the punctuation table.

        langzip is the path to a zip archive containing text files. Each
        text file is named like lang_code.txt (ie: french_cp1252.txt) and
        contains an encoded stop word list for the language. When langzip
        is empty, the rcllatinstops.zip file located in the same directory
        as this module is used.
        """
        if langzip == "":
            langzip = os.path.join(os.path.dirname(__file__), 'rcllatinstops.zip')

        self.readlanguages(langzip)

        # Table to translate from punctuation to spaces. Byte literals are
        # used so that the table can be applied to binary data on both
        # Python 2 and Python 3.
        self.punct = b'''*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
        spaces = len(self.punct) * b" "
        # Python 3 provides bytes.maketrans; on Python 2 (where bytes is
        # str and has no such method) fall back to string.maketrans.
        maketrans = getattr(bytes, "maketrans", None)
        if maketrans is None:
            maketrans = string.maketrans
        self.spacetable = maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop word lists from the zip file.

        We build a merged dictionary from the lists, stored in
        self.allwords. The keys are the (encoded) words from all the files.
        The values are lists of the (lang, code) origin(s) of each word.

        Archive members whose base name does not look like lang_code
        (directory entries, stray files) are skipped. The last underscore
        separates the language from the charset name.
        """
        archive = ZipFile(langzip)
        self.allwords = {}
        try:
            for fn in archive.namelist():
                stem = os.path.splitext(os.path.basename(fn))[0]
                lang, sep, code = stem.rpartition('_')
                if not (sep and lang and code):
                    # Not a stop word list: ignore instead of failing.
                    continue
                for word in archive.read(fn).split():
                    self.allwords.setdefault(word, []).append((lang, code))
        finally:
            # Always release the archive's file handle.
            archive.close()

    def classify(self, rawtext):
        """Guess the language and charset of rawtext.

        rawtext is a byte string (it is treated as binary, not text).

        Returns a (lang, code, count) tuple, where count is the number of
        words, among the most frequent ones in the input, which belong to
        the stop word list for (lang, code). If nothing matches, the result
        is ('english', 'cp1252', 0): the caller can test the count to
        detect this case.
        """
        # Note: we can't use an re-based method to split the data because it
        # should be considered binary, not text.

        # Limit to reasonable size. The search for a word boundary is
        # restricted to the 9000-10000 range so that the result can never
        # be longer than the limit, even if the next space is far away.
        if len(rawtext) > 10000:
            i = rawtext.find(b" ", 9000, 10000)
            if i == -1:
                i = 9000
            rawtext = rawtext[0:i]

        # Remove punctuation
        rawtext = rawtext.translate(self.spacetable)

        # Count the occurrences of each word, order the words by
        # frequency. We only use the ntest most frequent words.
        ntest = 20
        counts = {}
        for w in rawtext.split():
            counts[w] = counts.get(w, 0) + 1
        ranked = sorted(counts.items(),
                        key=lambda entry: entry[1], reverse=True)
        lfreq = [entry[0] for entry in ranked[0:ntest]]

        # Build a dict (lang,code)->matchcount
        langstats = {}
        for w in lfreq:
            for lc in self.allwords.get(w, ()):
                langstats[lc] = langstats.get(lc, 0) + 1

        # If there is no match at all, default to most common. Maybe we
        # should generate an error instead, but the caller can look at the
        # count anyway.
        if not langstats:
            return ('english', 'cp1252', 0)

        # Keep the (lang,code) with the highest match count. In case of a
        # tie, the first one encountered while iterating the dict wins.
        (maxlang, maxcode), maxcount = max(langstats.items(),
                                           key=lambda entry: entry[1])
        return (maxlang, maxcode, maxcount)


if __name__ == "__main__":
    # Command line usage: classify the file named by the first argument and
    # print "code lang count" on stdout, or "UNKNOWN UNKNOWN 0" when no stop
    # word matched.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <filename>\n" % sys.argv[0])
        sys.exit(1)

    # The data is read as binary: its encoding is what we are trying to guess.
    with open(sys.argv[1], "rb") as f:
        rawtext = f.read()

    classifier = European8859TextClassifier()

    lang, code, count = classifier.classify(rawtext)
    # A single parenthesized argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    if count > 0:
        print("%s %s %d" % (code, lang, count))
    else:
        print("UNKNOWN UNKNOWN 0")