Parent: [dfe00a] (diff)

Download this file

rcllatinclass.py    132 lines (109 with data), 4.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python2
"""Try to guess a text's language and character set by checking how it matches lists of
common words. This is not a primary method of detection because it is slow and unreliable, but it
may help in discriminating, for example, between European languages using relatively close
variations of iso-8859.
This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
As a note, I am looking for a good iso-8859-7 stop word list for Greek; the only ones I found
were utf-8, and there are errors when transcoding to iso-8859-7. I guess that there is something
about Greek accents that I don't know and that would enable fixing this (some kind of simplification
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
epsilon with dasia (in Unicode but not in iso-8859-7). Can this be replaced by either epsilon or
epsilon with acute accent?
"""
from __future__ import print_function
import sys
PY3 = sys.version > '3'
if not PY3:
import string
import glob
import os
import os.path
from zipfile import ZipFile
class European8859TextClassifier:
    """Guess the language and 8-bit character set of a text from stop words.

    Encoded stop word lists are loaded from a zip archive. A text is
    classified by counting how many of its most frequent words appear in
    each list. All data is handled as bytes: the input is never decoded.
    """

    def __init__(self, langzip=""):
        """Load the stop word lists and build the punctuation table.

        langzip contains text files. Each text file is named like
        lang_code.txt (ie: french_cp1252.txt) and contains an encoded stop
        word list for the language. When langzip is empty, the file
        rcllatinstops.zip located in this module's directory is used.
        """
        if langzip == "":
            langzip = os.path.join(os.path.dirname(__file__),
                                   'rcllatinstops.zip')
        self.readlanguages(langzip)

        # Table to translate from punctuation (and digits) to spaces
        self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
        spaces = len(self.punct) * b' '
        # Python 3 provides bytes.maketrans; Python 2 only has
        # string.maketrans. Detect the feature instead of the version.
        maketrans = getattr(bytes, 'maketrans', None)
        if maketrans is None:
            maketrans = string.maketrans
        self.spacetable = maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop word lists from the zip file.

        Builds self.allwords, a merged dictionary of all the lists. The
        keys are the words (bytes) from all the files. Each value is the
        list of the (lang, code) origin(s) of the word.

        Archive members whose base name is not of the form lang_code.ext
        (directory entries, stray files) are ignored.
        """
        self.allwords = {}
        # The with statement closes the archive even if a read fails.
        with ZipFile(langzip) as archive:
            for fn in archive.namelist():
                langcode = os.path.splitext(os.path.basename(fn))[0]
                parts = langcode.split('_')
                if len(parts) != 2:
                    # Not a word list: skip instead of failing to unpack.
                    continue
                origin = (parts[0], parts[1])
                # set(): a word repeated inside one list must be recorded
                # once, else it would be counted twice by classify().
                for word in set(archive.read(fn).split()):
                    self.allwords.setdefault(word, []).append(origin)

    def classify(self, rawtext, ntest=20):
        """Return the best (lang, code, matchcount) guess for rawtext.

        rawtext is the text to classify, as bytes. ntest is the number of
        most frequent text words which are looked up in the stop word
        lists. matchcount is the number of these words found in the list of
        the returned (lang, code).

        If no word matches, ('english', 'cp1252', 0) is returned: the
        caller can test the count to detect the absence of a real match.
        """
        # Note: we can't use an re-based method to split the data because it
        # should be considered binary, not text.

        # Limit to reasonable size. The search for a space to cut at is
        # bounded, so that the limit holds even if spaces are rare.
        if len(rawtext) > 10000:
            cut = rawtext.find(b' ', 9000, 10000)
            if cut == -1:
                cut = 9000
            rawtext = rawtext[:cut]

        # Remove punctuation, then count the occurrences of each word
        counts = {}
        for word in rawtext.translate(self.spacetable).split():
            counts[word] = counts.get(word, 0) + 1

        # We only use the ntest most frequent words
        ranked = sorted(counts.items(), key=lambda entry: entry[1],
                        reverse=True)
        lfreq = [entry[0] for entry in ranked[:ntest]]

        # Build a dict (lang,code)->matchcount
        langstats = {}
        for word in lfreq:
            for lc in self.allwords.get(word, []):
                langstats[lc] = langstats.get(lc, 0) + 1

        # If there is no match at all, default to most common. Maybe we
        # should generate an error instead, but the caller can look at the
        # count anyway.
        if not langstats:
            return ('english', 'cp1252', 0)

        # max() returns the first of the entries having the highest count
        (maxlang, maxcode), maxcount = max(langstats.items(),
                                           key=lambda entry: entry[1])
        return (maxlang, maxcode, maxcount)
if __name__ == "__main__":
    # Command line use: classify the file named by the first argument, then
    # print "code lang count", or "UNKNOWN UNKNOWN 0" if no word matched.
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits with 1
        sys.exit("Usage: %s <file>" % sys.argv[0])

    # Binary mode: the classifier works on undecoded bytes. The with
    # statement closes the file even if reading fails.
    with open(sys.argv[1], "rb") as f:
        rawtext = f.read()

    classifier = European8859TextClassifier()
    lang, code, count = classifier.classify(rawtext)
    if count > 0:
        print("%s %s %d" % (code, lang, count))
    else:
        print("UNKNOWN UNKNOWN 0")