Parent: [dc3aa5] (diff)

Child: [f344e8] (diff)

Download this file

rcllatinclass.py    124 lines (103 with data), 4.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
"""Try to guess a text's language and character set by checking how it matches lists of
common words. This is not a primary method of detection because it's slow and unreliable, but it
may be a help in discriminating, for example, between european languages using relatively close
variations of iso-8859.
This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
As a note, I am looking for a good iso-8859-7 stop words list for greek, the only ones I found
were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
about Greek accents that I don't know and would enable fixing this (some kind of simplification
allowing transliteration from utf-8 to iso-8859-7). An example of difficulty is the small letter
epsilon with dasia (in unicode but not iso). Can this be replaced by either epsilon or epsilon
with acute accent ?
"""
import sys
import string
import glob
import os
import os.path
from zipfile import ZipFile
class European8859TextClassifier:
    """Guess a text's language and 8-bit character set from its stop words.

    The classifier loads encoded stop word lists from a zip archive and
    counts how many of the most frequent words of a text appear in each
    list. All text is handled as binary data (byte strings): the stop
    word lists are kept in their native encodings and compared byte for
    byte with the input.
    """

    def __init__(self, langzip=""):
        """Load the stop word lists and build the punctuation table.

        langzip contains text files. Each text file is named like
        lang_code.txt (ie: french_cp1252.txt) and contains an encoded stop
        word list for the language. When langzip is empty,
        'rcllatinstops.zip' located next to this module is used.
        """
        if langzip == "":
            langzip = os.path.join(os.path.dirname(__file__),
                                   'rcllatinstops.zip')
        self.readlanguages(langzip)

        # Table to translate from punctuation to spaces. Bytes literals are
        # used because the data is binary; on Python 2 they are plain str.
        self.punct = b'''*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
        spaces = len(self.punct) * b" "
        try:
            # Python 3: the table builder lives on the bytes type.
            maketrans = bytes.maketrans
        except AttributeError:
            # Python 2: bytes is str, which has no maketrans().
            maketrans = string.maketrans
        self.spacetable = maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop words lists from the zip file.

        We build a merged dictionary from the lists, stored in
        self.allwords. The keys are the (encoded) words from all the
        files. The values are lists of the (lang, code) origin(s) of each
        word, where lang and code come from the lang_code.txt file name.

        Raises ValueError if a file name does not have the lang_code form.
        """
        self.allwords = {}
        archive = ZipFile(langzip)
        try:
            for fn in archive.namelist():
                # Directory entries end with '/' and hold no word list.
                if fn.endswith('/'):
                    continue
                langcode = os.path.splitext(os.path.basename(fn))[0]
                (lang, code) = langcode.split('_')
                origin = (lang, code)
                for word in archive.read(fn).split():
                    origins = self.allwords.setdefault(word, [])
                    # A word repeated inside one list must not count
                    # twice for that language when classifying.
                    if origin not in origins:
                        origins.append(origin)
        finally:
            # Always release the archive's file handle.
            archive.close()

    def classify(self, rawtext):
        """Return the best (lang, code, matchcount) guess for rawtext.

        rawtext is the binary (byte string) content to analyse. Only its
        beginning is examined. matchcount is the number of the text's most
        frequent words found in the winning stop word list. When nothing
        matches, ('english', 'cp1252', 0) is returned, so the caller should
        check the count to decide whether the guess is meaningful.
        """
        # Note: we can't use an re-based method to split the data because it
        # should be considered binary, not text.

        # Limit to reasonable size: cut at the first space in the
        # 9000..10000 range, or at 9000 if there is none, so that the
        # processed data never exceeds 10000 bytes.
        if len(rawtext) > 10000:
            cut = rawtext.find(b" ", 9000, 10000)
            if cut == -1:
                cut = 9000
            rawtext = rawtext[0:cut]

        # Remove punctuation
        rawtext = rawtext.translate(self.spacetable)

        # Make a list of all text words, order it by frequency, we only
        # use the ntest most frequent words.
        ntest = 20
        counts = {}
        for w in rawtext.split():
            counts[w] = counts.get(w, 0) + 1
        ranked = sorted(counts.items(),
                        key=lambda entry: entry[1], reverse=True)
        lfreq = [entry[0] for entry in ranked[0:ntest]]

        # Build a dict (lang,code)->matchcount
        langstats = {}
        for w in lfreq:
            for lc in self.allwords.get(w, ()):
                langstats[lc] = langstats.get(lc, 0) + 1

        # If nothing matched, default to most common. Maybe we should
        # generate an error instead, but the caller can look at the count
        # anyway.
        if not langstats:
            return ('english', 'cp1252', 0)

        # Pick the (lang,code) with the highest match count; on ties the
        # first one encountered wins.
        lc, maxcount = max(langstats.items(), key=lambda entry: entry[1])
        return (lc[0], lc[1], maxcount)
# Command line use: classify the file named by the first argument and print
# "code lang count", or "UNKNOWN UNKNOWN 0" when no stop word matched.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <file>\n" % sys.argv[0])
        sys.exit(1)
    # Read as binary: the classifier works on raw bytes, not decoded text.
    with open(sys.argv[1], "rb") as f:
        rawtext = f.read()

    classifier = European8859TextClassifier()

    lang, code, count = classifier.classify(rawtext)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    if count > 0:
        print("%s %s %d" % (code, lang, count))
    else:
        print("UNKNOWN UNKNOWN 0")