Child: [f344e8] (diff)

Download this file

rcldoc.py    166 lines (147 with data), 5.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
import rclexecm
import rclexec1
import re
import sys
import os
# Processing the output from antiword: create html header and tail, process
# continuation lines, escape HTML special characters, accumulate the data.
class WordProcessData:
    """Accumulate text lines and wrap them into a minimal HTML document.

    Lines are fed one at a time through takeLine(); wrapData() closes the
    document and returns it. Words split across lines with a trailing
    hyphen are re-joined with the start of the following line.

    The ``em`` object must provide ``htmlescape(text)`` and
    ``setmimetype(mimetype)``.
    """

    def __init__(self, em):
        self.em = em
        self.out = ""
        # Pending start of a hyphen-continued word, prepended to the next line
        self.cont = ""
        self.gotdata = False
        # Line with continued word (ending in -)
        # we strip the - which is not nice for actually hyphenated word.
        # What to do ?
        self.patcont = re.compile(r'[\w][-]$')
        # Pattern for breaking continuation at last word start
        self.patws = re.compile(r'([\s])([\w]+)(-)$')

    def takeLine(self, line):
        """Process one input line and append its HTML form to self.out.

        Empty lines seen before any data are skipped. A line consisting of
        a single form feed becomes a horizontal rule between paragraphs.
        """
        if not self.gotdata:
            if line == "":
                return
            # The space after "Content-Type" separates the two attributes;
            # without it the meta tag is malformed.
            self.out = ('<html><head><title></title>'
                        '<meta http-equiv="Content-Type" '
                        'content="text/html;charset=UTF-8">'
                        '</head><body><p>')
            self.gotdata = True

        if self.cont:
            line = self.cont + line
            self.cont = ""

        if line == "\f":
            self.out += "</p><hr><p>"
            return

        if self.patcont.search(line):
            # Break at last whitespace
            match = self.patws.search(line)
            if match:
                # Keep the word start (without the hyphen) for the next line
                self.cont = match.group(2)
                line = line[:match.start(1)]
            else:
                # No whitespace to break at: carry the whole line over
                self.cont = line
                line = ""

        if line:
            self.out += self.em.htmlescape(line) + "<br>"
        else:
            self.out += "<br>"

    def wrapData(self):
        """Close the HTML document, set the MIME type and return the data.

        Any continuation text still pending is flushed first, so that a
        hyphen-terminated last line is not lost.
        """
        if self.gotdata:
            if self.cont:
                self.out += self.em.htmlescape(self.cont) + "<br>"
                self.cont = ""
            self.out += "</p></body></html>"
        self.em.setmimetype("text/html")
        return self.out
# Null data accumulator. We use this when antiword has failed, and the
# data actually comes from rclrtf, rcltext or wvWare, which all
# output HTML
class WordPassData:
    """Pass-through accumulator: concatenates the lines it is given.

    No escaping or wrapping is applied; the collected text is returned
    unchanged and declared as "text/html" through ``em.setmimetype``.
    """

    def __init__(self, em):
        self.em = em
        self.out = ""

    def takeLine(self, line):
        """Append ``line`` as-is to the accumulated output."""
        self.out = self.out + line

    def wrapData(self):
        """Declare the MIME type and return everything accumulated so far."""
        handler = self.em
        handler.setmimetype("text/html")
        return self.out
# Filter for msword docs. Try antiword, and if this fails, check for
# an rtf or text document (.doc are sometimes like this...). Also try
# wvWare if the doc is actually a word doc
class WordFilter:
    """Choose the command and output post-processor for a .doc file.

    getCmd() is meant to be called repeatedly for the same file: the first
    call proposes antiword, the second one picks a fallback according to
    the file's actual content, and later calls give up. reset() restarts
    the sequence.

    ``em`` must provide ``rclog(msg)`` plus whatever the post-processors
    need; ``td`` is the directory holding the helper filter scripts.
    """

    def __init__(self, em, td):
        self.em = em
        self.ntry = 0
        self.execdir = td

    def reset(self):
        """Restart the sequence of attempts."""
        self.ntry = 0

    def hasControlChars(self, data):
        """Return True if ``data`` holds a control character (code < 32)
        other than newline, tab, form feed or carriage return.

        Accepts both text and byte strings: iterating bytes yields ints on
        Python 3, so each element is normalised to its integer code.
        """
        allowed = (9, 10, 12, 13)  # \t \n \f \r
        for c in data:
            code = c if isinstance(c, int) else ord(c)
            if code < 32 and code not in allowed:
                return True
        return False

    def mimetype(self, fn):
        """Guess the MIME type of file ``fn`` from its first 100 bytes.

        Returns "text/rtf", "application/msword",
        "application/octet-stream" or "text/plain", or "" if the file
        cannot be opened.
        """
        # Byte literals: the file is read in binary mode, and bytes never
        # compare equal to text strings on Python 3.
        rtfprolog = b"{\\rtf1"
        docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        try:
            f = open(fn, "rb")
        except (IOError, OSError, ValueError):
            # Unreadable or invalid path: no type can be determined
            return ""
        with f:
            data = f.read(100)

        if data.startswith(rtfprolog):
            return "text/rtf"
        elif data.startswith(docprolog):
            return "application/msword"
        elif self.hasControlChars(data):
            return "application/octet-stream"
        else:
            return "text/plain"

    def getCmd(self, fn):
        '''Return command to execute, and postprocessor, according to
        our state: first try antiword, then others depending on mime
        identification. Do 2 tries at most.

        Returns a (command list, postprocessor) tuple, or ([], None) when
        there is nothing (more) to try.'''
        if self.ntry == 0:
            self.ntry = 1
            cmd = rclexecm.which("antiword")
            if cmd:
                return ([cmd, "-t", "-i", "1", "-m", "UTF-8"],
                        WordProcessData(self.em))
            else:
                return ([], None)
        elif self.ntry == 1:
            self.ntry = 2
            # antiword failed. Check for an rtf file, or text and
            # process accordingly. If the doc is actually msword, try
            # wvWare.
            mt = self.mimetype(fn)
            self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
            if mt == "text/plain":
                # "python" must be a string here, as in the rtf branch
                return (["python", os.path.join(self.execdir, "rcltext.py")],
                        WordPassData(self.em))
            elif mt == "text/rtf":
                cmd = ["python", os.path.join(self.execdir, "rclrtf.py"),
                       "-s"]
                self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
                return (cmd, WordPassData(self.em))
            elif mt == "application/msword":
                cmd = rclexecm.which("wvWare")
                if cmd:
                    return ([cmd, "--nographics", "--charset=utf-8"],
                            WordPassData(self.em))
                else:
                    return ([], None)
            else:
                return ([], None)
        else:
            return ([], None)
if __name__ == '__main__':
    # Directory this script was started from; passed to the filter so that
    # it can locate the other filter scripts it may have to run.
    filter_dir = os.path.dirname(sys.argv[0])

    # antiword is required. We could fallback to wvWare, but this is not
    # what the old filter did.
    antiword_missing = not rclexecm.which("antiword")
    if antiword_missing:
        print("RECFILTERROR HELPERNOTFOUND antiword")
        sys.exit(1)

    # Wire the protocol handler, the Word filter and the executor together,
    # then hand control to the rclexecm main loop.
    protocol = rclexecm.RclExecM()
    word_filter = WordFilter(protocol, filter_dir)
    executor = rclexec1.Executor(protocol, word_filter)
    rclexecm.main(protocol, executor)