--- a/src/filters/rcldjvu
+++ b/src/filters/rcldjvu.py
@@ -1,6 +1,5 @@
-#!/bin/sh
-# @(#$Id: rcldjvu,v 1.6 2008-10-08 08:27:34 dockes Exp $ (C) 2005 J.F.Dockes
-
+#!/usr/bin/env python
+# Copyright (C) 2016 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@@ -16,165 +15,93 @@
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#================================================================
-# Extract text from a djvu file by executing djvused and djvutxt
-#
-# We use djvused to extract a possible title, djvutxt for the text
-#
-# Of course this only means anything if the djvu document actually has
-# a text layer !
-#
-# djvu utilities (04-2010) have a bug in which they try to interpret
-# and convert file paths as character data, and fail miserably if the
-# locale is not consistent with the actual encoding of the path (which
-# could be arbitrary binary for all they know). We use a temporary
-# symbolic link to get around this.
-#
-#================================================================
+# Recoll DJVU extractor
-progname="rcldjvu"
-filetype=dejavu
+from __future__ import print_function
+import os
+import sys
+import re
+import rclexecm
+import subprocess
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
+class DJVUExtractor:
+ def __init__(self, em):
+     """Create an extractor bound to the protocol handler *em*.
+
+     The helper command paths start as None; openfile() looks them up
+     the first time it is called.
+     """
+     # Protocol handler, used for logging, HTML escaping and MIME type.
+     self.em = em
+     # Paths to the external helper commands, not resolved yet.
+     self.djvutxt = None
+     self.djvused = None
+     # Number of documents already returned for the current file.
+     self.currentindex = 0
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
+ def extractone(self, params):
+     """Build the HTML document for the file selected by openfile().
+
+     Runs djvused (when it is available) to read the author and title
+     metadata, then djvutxt to get the text, and wraps the result in a
+     minimal HTML page with the text inside a <pre> element.
+
+     params is not used here.
+
+     Returns a 4-tuple (ok, data, ipath, eof):
+     (True, html, "", RclExecM.eofnext) on success, or
+     (False, "", "", RclExecM.eofnow) if running djvutxt fails.
+     """
+     self.em.setmimetype('text/html')
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
+     # Extract metadata. These defaults must exist even when djvused is
+     # missing or fails, otherwise the code below would reference
+     # undefined names.
+     author = ""
+     title = ""
+     metadata = b""
+     if self.djvused:
+         try:
+             metadata = subprocess.check_output(
+                 [self.djvused, self.filename,
+                  "-e", "select 1;print-meta"])
+         except Exception as e:
+             # Metadata is best-effort: log and go on with the text.
+             self.em.rclog("djvused failed: %s" % e)
+     metadata = metadata.decode('UTF-8', 'replace')
+     # Each line is split on double quotes: the part before the first
+     # quote is the key, the remaining pieces joined with spaces are
+     # the value.
+     for line in metadata.split('\n'):
+         fields = line.split('"')
+         if len(fields) >= 2:
+             nm = fields[0].strip()
+             if nm == "author":
+                 author = ' '.join(fields[1:])
+             elif nm == "title":
+                 title = ' '.join(fields[1:])
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
+     # Main text. Without it there is nothing to index, so a failure
+     # here ends processing of this file.
+     try:
+         txtdata = subprocess.check_output(
+             [self.djvutxt, "--escape", self.filename])
+     except Exception as e:
+         self.em.rclog("djvutxt failed: %s" % e)
+         return (False, "", "", rclexecm.RclExecM.eofnow)
+     txtdata = txtdata.decode('UTF-8', 'replace')
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
+     # HTML head: title, charset and, when known, the author.
+     data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
+     data += '''<meta http-equiv="Content-Type" '''
+     data += '''content="text/html;charset=UTF-8">'''
+     if author:
+         data += '''<meta name="author" content="''' + \
+                 self.em.htmlescape(author) + '''">'''
+     data += '''</head><body><pre>'''
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
+     # HTML body: the escaped text, kept preformatted.
+     data += self.em.htmlescape(txtdata)
+     data += '''</pre></body></html>'''
+     return (True, data, "", rclexecm.RclExecM.eofnext)
-infile="$1"
+ ###### File type handler api, used by rclexecm ---------->
+ def openfile(self, params):
+     """Record the file to process and locate the helper commands.
+
+     Reads the path from params["filename:"] and resets the document
+     counter. On the first call the djvutxt and djvused commands are
+     looked up with rclexecm.which(); if djvutxt is not found an error
+     line is printed and the process exits with status 1.
+
+     Returns True.
+     """
+     self.filename = params["filename:"]
+     self.currentindex = 0
+     #self.em.rclog("openfile: [%s]" % self.filename)
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
+     # The helpers were already located by an earlier call.
+     if self.djvutxt:
+         return True
+     found = rclexecm.which("djvutxt")
+     self.djvutxt = found
+     if not found:
+         print("RECFILTERROR HELPERNOTFOUND djvutxt")
+         sys.exit(1)
+     # No check on the result: extractone() tests self.djvused before
+     # using it.
+     self.djvused = rclexecm.which("djvused")
-# protect access to our temp files and directories
-umask 77
+     return True
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
+ def getipath(self, params):
+     """Return the document for the current file.
+
+     Delegates to extractone(params) and returns its result unchanged.
+     """
+     # The former second return statement was unreachable and used
+     # undefined names, so it has been removed.
+     return self.extractone(params)
+
+ def getnext(self, params):
+     """Return the next document of the current file.
+
+     The first call after openfile() returns the result of
+     extractone(params); any later call returns
+     (False, "", "", RclExecM.eofnow).
+     """
+     if self.currentindex < 1:
+         result = self.extractone(params)
+         # Counted only after extractone() has returned.
+         self.currentindex += 1
+         return result
+     return (False, "", "", rclexecm.RclExecM.eofnow)
-checkcmds djvutxt djvused awk
-
-# We need a temporary symlink to avoid path encoding issues
-if test z"$RECOLL_TMPDIR" != z; then
- ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
- ttdir=$TMPDIR
-else
- ttdir=/tmp
-fi
-tmplink=$ttdir/rcldjvu_tmp$$.djvu
-rm -f $tmplink
-ln -s "$infile" $tmplink || exit 1
-
-cleanup()
-{
- rm -f $tmplink
-}
-
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Title: we try to extract it from the annotations. djvused outputs string
-# in C/awk \-escaped notation. Awk can only process this in string
-# constants, so we have a first awk pass to create an awk program to parse
-# the string as a constant (...). This is not exactly robust or nice
-title=`djvused "$tmplink" -e 'select 1;output-ant' | \
-grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\
-awk '
-{
- printf("BEGIN" " {s = %s; print s}\n", $0)
-}' | awk -f -`
-
-
-cat <<EOF
-<html>
-<head>
- <title>$title</title>
- <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
-</head>
-<body>
-<pre>
-EOF
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-djvutxt "$tmplink" | sed -e 's/[ ][ ]*$//' | \
-awk 'BEGIN'\
-' {
- cont = ""
-}
-{
- $0 = cont $0
- cont = ""
-
- if ($0 == "\f") {
- print "</p>\n<hr>\n<p>"
- next
- } else if ($0 ~ /[-]$/) {
- # Break at last whitespace
- match($0, "[ \t][^ \t]+$")
- line = substr($0, 0, RSTART)
- cont = substr($0, RSTART, RLENGTH)
- $0 = line
- gsub("-", "", cont)
- }
- gsub(/&/, "\\&", $0)
- gsub(/</, "\\<", $0)
- gsub(/>/, "\\>", $0)
- print $0
-}'
-
-cat <<EOF
-</pre>
-</body>
-</html>
-EOF
+# Main program: create the rclexecm protocol handler, hand it to a
+# DJVUExtractor, then pass both to rclexecm.main() to run them.
+proto = rclexecm.RclExecM()
+extract = DJVUExtractor(proto)
+rclexecm.main(proto, extract)