converted rcldjvu to python

Jean-Francois Dockes Jean-Francois Dockes 2016-04-08

changed src/sampleconf/mimeconf
changed src/Makefile.am
copied src/filters/rcldjvu -> src/filters/rcldjvu.py
src/sampleconf/mimeconf Diff Switch to side-by-side view
Loading...
src/Makefile.am Diff Switch to side-by-side view
Loading...
src/filters/rcldjvu to src/filters/rcldjvu.py
--- a/src/filters/rcldjvu
+++ b/src/filters/rcldjvu.py
@@ -1,6 +1,5 @@
-#!/bin/sh
-# @(#$Id: rcldjvu,v 1.6 2008-10-08 08:27:34 dockes Exp $  (C) 2005 J.F.Dockes
-
+#!/usr/bin/env python
+# Copyright (C) 2016 J.F.Dockes
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -16,165 +15,93 @@
 # Free Software Foundation, Inc.,
 # 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 
-#================================================================
-# Extract text from a djvu file by executing djvused and djvutxt
-#
-# We use djvused to extract a possible title, djvutxt for the text
-#
-# Of course this only means anything if the djvu document actually has
-# a text layer !
-#
-# djvu utilities (04-2010) have a bug in which they try to interpret
-# and convert file paths as character data, and fail miserably if the
-# locale is not consistent with the actual encoding of the path (which
-# could be arbitrary binary for all they know). We use a temporary
-# symbolic link to get around this.
-# 
-#================================================================
+# Recoll DJVU extractor
 
-progname="rcldjvu"
-filetype=dejavu
+from __future__ import print_function
 
+import os
+import sys
+import re
+import rclexecm
+import subprocess
 
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
+class DJVUExtractor:
+    def __init__(self, em):
+        """Keep the protocol handler; helper paths are resolved in openfile."""
+        self.em = em
+        self.currentindex = 0
+        self.djvused = self.djvutxt = None
 
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
+    def extractone(self, params):
+        """Return (ok, html, ipath, eof) for the file set by openfile()."""
+        self.em.setmimetype('text/html')
 
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
+        # Metadata is optional: djvused may be absent or fail, so start empty
+        metadata = b""
+        if self.djvused:
+            try:
+                metadata = subprocess.check_output(
+                    [self.djvused, self.filename, "-e", "select 1;print-meta"])
+            except Exception as e:
+                self.em.rclog("djvused failed: %s" % e)
+        author = title = ""
+        metadata = metadata.decode('UTF-8', 'replace')
+        for line in metadata.split('\n'):
+            line = line.split('"')
+            if len(line) >= 2:
+                nm = line[0].strip()
+                if nm == "author":
+                    author = ' '.join(line[1:])
+                elif nm == "title":
+                    title = ' '.join(line[1:])
 
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
+        # Main text: without it there is nothing to index, so report failure
+        try:
+            txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
+        except Exception as e:
+            self.em.rclog("djvutxt failed: %s" % e)
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        txtdata = txtdata.decode('UTF-8', 'replace')
 
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
+        data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
+        data += '''<meta http-equiv="Content-Type" '''
+        data += '''content="text/html;charset=UTF-8">'''
+        if author:
+            data += '''<meta name="author" content="''' + \
+                    self.em.htmlescape(author) + '''">'''
+        data += '''</head><body><pre>'''
 
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
+        data += self.em.htmlescape(txtdata) + '''</pre></body></html>'''
+        return (True, data, "", rclexecm.RclExecM.eofnext)
 
-infile="$1"
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        """Remember the file to extract and locate the djvu helpers once."""
+        self.filename = params["filename:"]
+        self.currentindex = 0
 
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
+        if not self.djvutxt:
+            self.djvutxt = rclexecm.which("djvutxt")
+            if not self.djvutxt:
+                print("RECFILTERROR HELPERNOTFOUND djvutxt")
+                sys.exit(1);
+            self.djvused = rclexecm.which("djvused")
 
-# protect access to our temp files and directories
-umask 77
+        return True
 
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
+    def getipath(self, params):
+        # The ipath is not consulted: every request yields the whole document
+        return self.extractone(params)
+        
+    def getnext(self, params):
+        """Yield the single document on the first call, then signal EOF."""
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        result = self.extractone(params)
+        self.currentindex += 1
+        return result
 
-checkcmds djvutxt djvused awk
-
-# We need a temporary symlink to avoid path encoding issues
-if test z"$RECOLL_TMPDIR" != z; then
-   ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-   ttdir=$TMPDIR
-else
-   ttdir=/tmp
-fi
-tmplink=$ttdir/rcldjvu_tmp$$.djvu
-rm -f $tmplink
-ln -s "$infile" $tmplink || exit 1
-
-cleanup()
-{
-    rm -f $tmplink
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Title: we try to extract it from the annotations. djvused outputs string
-# in C/awk \-escaped notation. Awk can only process this in string
-# constants, so we have a first awk pass to create an awk program to parse
-# the string as a constant (...). This is not exactly robust or nice
-title=`djvused "$tmplink" -e 'select 1;output-ant' | \
-grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\
-awk '
-{
-      printf("BEGIN" " {s = %s; print s}\n", $0)
-}' | awk -f -`
-
-
-cat <<EOF
-<html>
-<head>
-    <title>$title</title>
-    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
-</head>
-<body>
-<pre>
-EOF
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-djvutxt "$tmplink" | sed -e 's/[ 	][ 	]*$//' | \
-awk 'BEGIN'\
-' {
-  cont = ""
-}
-{
-    $0 = cont $0
-    cont = ""
-
-    if ($0 == "\f") {
-       print "</p>\n<hr>\n<p>"
-       next
-    } else if ($0 ~ /[-]$/) {
-      # Break at last whitespace
-      match($0, "[ \t][^ \t]+$")
-      line = substr($0, 0, RSTART)
-      cont = substr($0, RSTART, RLENGTH)
-      $0 = line
-      gsub("-", "", cont)
-    }
-    gsub(/&/, "\\&amp;", $0)
-    gsub(/</, "\\&lt;", $0)
-    gsub(/>/, "\\&gt;", $0)
-    print $0      
-}'
-
-cat <<EOF
-</pre>
-</body>
-</html>
-EOF
+# Main program: create the protocol handler and the extractor, then run them
+proto = rclexecm.RclExecM()
+extract = DJVUExtractor(proto)
+rclexecm.main(proto, extract)