recoll / Code / Commit [f385ff]

Commit [f385ff] RECOLL_1_22_MAINT RECOLL_1_23_MAINT RECOLL_1_24_MAINT internal-xsl master History

updated filters page for current status

Authored by: Jean-Francois Dockes 2010-05-04

Child(ren): [cd7469]

removed	website/filters/rclimg
removed	website/filters/rclkwd
removed	website/filters/rcllyx
removed	website/filters/rclopxml
removed	website/filters/rclscribus
removed	website/filters/rclsoff
removed	website/filters/rclsvg
removed	website/filters/rcltex
removed	website/filters/rclwpd
changed	website/filters/filters.html
changed	website/filters/mimeconf
changed	website/filters/mimemap
changed	website/filters/mimeview
copied	website/filters/rclabw -> website/filters/rclics

website/filters/rclimg

File was removed.

website/filters/rclkwd

File was removed.

website/filters/rcllyx

File was removed.

website/filters/rclopxml

File was removed.

website/filters/rclscribus

File was removed.

website/filters/rclsoff

File was removed.

website/filters/rclsvg

File was removed.

website/filters/rcltex

File was removed.

website/filters/rclwpd

File was removed.

website/filters/filters.html Diff Switch to side-by-side view

website/filters/mimeconf Diff Switch to side-by-side view

website/filters/mimemap Diff Switch to side-by-side view

website/filters/mimeview Diff Switch to side-by-side view

website/filters/rclabw to website/filters/rclics

--- a/website/filters/rclabw
+++ b/website/filters/rclics
@@ -1,175 +1,180 @@
-#!/bin/sh
-# @(#$Id: rclabw,v 1.2 2007/06/15 11:41:50 dockes Exp $  (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Extract text from an abiword file
-#================================================================
+#!/usr/bin/env python
 
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclabw"
-filetype=abiword
+# Read an ICS file, break it into "documents" which are events, todos,
+# or journal entries, and interface with recoll execm
+#
+# For historical reasons, this can use either the icalendar or the
+# vobject Python modules, or an internal splitter. The default is now
+# to use the internal splitter: the other modules are more trouble
+# than they're worth (for us, at least until we want to get into date
+# computations etc.)
 
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
+import rclexecm
+import sys
 
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds iconv sed
-
-# check the input file existence
-if test ! -f "$infile"
-then
-  printf '%s: %s: no such file\n' "$progname" "$infile"
-  exit 1
-fi
-
-encoding=`sed -e  '/<?xml version=/s/"?>$//' \
-	      -e '/^<?xml version=/s/.*encoding="//p;D;q' \
-	      -e D \
-< $infile`
-if test X$encoding = X ; then encoding=UTF-8;fi
-
-# Note: there can be newlines inside the description field, we don't want
-# them... We have to use 2 different selectors for the single-line and
-# multiple-line cases because of the generic tag end (</m> for all meta
-# tags)
-descsedprog='
-/<m key="dc.description">\([^<]*\)<\/m>/ {
-s//\1/
-p
-q
-}
-/<m key="dc.description">/,/<\/m>/ {
-s!.*<m key="dc.description">!!
-s!</m>.*!!
-H
-}
-${
-g
-s/\n/ /g
-p
-}
-'
-
-description=`sed -n -e "$descsedprog" < "$infile"`
-#echo description: "$description"
-
-# Set program for the single line meta elements. Takes element name as
-# parameter 
-setmetasedprog() {
-metasedprog='/<m key="'$1'">/{
-s/.*<m key="'$1'">\([^<]*\).*/\1/
-'"s/\"/'/g"'
-p
-}'
-}
-
-setmetasedprog dc.subject
-subject=`sed -n -e "$metasedprog" "$infile"`
-#echo subject: "$subject"
-
-setmetasedprog dc.title
-title=`sed -n -e "$metasedprog" "$infile"`
-#echo titre: "$title"
-
-setmetasedprog abiword.keywords
-keywords=`sed -n -e "$metasedprog" "$infile"`
-#echo keywords: "$keywords"
-
-setmetasedprog dc.creator
-creator=`sed -n -e "$metasedprog" "$infile"`
-#echo creator: "$creator"
-
-# Note: next expr supposes that paragraphs are always all by themselves on
-# a single line in the xml (no multiple <p> per line, no embedded newlines
-# in text).
-contentsedprog='
-/<p[ >]/{
-s/<[^>]*>/ /g
-p
-}
-'
-content=`sed -n -e "$contentsedprog" "$infile"`
-#echo content: "$content"
-
-# output the result
-(echo '<html><head><title>' "$title" '</title>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '<meta name="description" content="' "$description $subject" '">'
-echo '<meta name="keywords" content="' "$keywords" '">'
-echo '<meta name="author" content="' "$creator" '">'
-echo '</head><body><pre>'
-echo "$content" 
-echo '</pre></body></html>') \
-| iconv -f $encoding -t UTF-8 -c -s 
+# Decide how we'll process the file.
+# 'modules' lists the supported splitting back-ends, 'usemodule' names the
+# one to use.
+modules = ('internal', 'icalendar', 'vobject')
+usemodule = 'internal'
+# Set to a true value to skip icalendar and go straight to vobject (only
+# consulted when usemodule is not 'internal').
+forcevobject = 0
+if usemodule != 'internal':
+    try:
+        if forcevobject:
+            raise Exception
+        from icalendar import Calendar, Event
+        usemodule = 'icalendar'
+    except Exception:
+        # icalendar not usable (or deliberately skipped): try vobject.
+        # 'except Exception' rather than a bare except, so that
+        # SystemExit and KeyboardInterrupt are not swallowed.
+        try:
+            import vobject
+            usemodule = 'vobject'
+        except Exception:
+            # Neither module can be imported: report it on stdout and give up
+            print("RECFILTERROR HELPERNOTFOUND python:icalendar")
+            print("RECFILTERROR HELPERNOTFOUND python:vobject")
+            sys.exit(1)
 
 
-# exit normally
-exit 0
+class IcalExtractor:
+    """Recoll execm handler exposing the entries of one ICS file.
+
+    Each entry extracted by openfile() is stored as a string in
+    self.contents. The ipath of an entry is its index in that list,
+    formatted as a string.
+    """
+
+    def __init__(self, em):
+        """Remember the execm object and declare text/plain output."""
+        self.file = ""
+        self.contents = []
+        # Defined here so that extractone()/getnext() can be called safely
+        # even if no file was opened yet.
+        self.currentindex = 0
+        self.em = em
+        self.em.setmimetype("text/plain")
+
+    def extractone(self, index):
+        """Return the (ok, data, ipath, eof) tuple for entry number index.
+
+        An out of range index (negative values included) yields
+        (False, "", "", True).
+        """
+        if index < 0 or index >= len(self.contents):
+            return (False, "", "", True)
+        docdata = self.contents[index]
+
+        # Announce the upcoming EOF together with the last entry. The test
+        # uses the requested index (not self.currentindex) so that the
+        # flag is also right when called from getipath(); when called from
+        # getnext() both values are equal.
+        iseof = rclexecm.RclExecM.noteof
+        if index >= len(self.contents) - 1:
+            iseof = rclexecm.RclExecM.eofnext
+        return (True, docdata, str(index), iseof)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        """Open the file named by params["filename:"] and split it.
+
+        The splitting method depends on the module-level 'usemodule'.
+        Returns True on success. Returns False, after logging through the
+        execm object, if the file can't be opened or parsed.
+        """
+        self.file = params["filename:"]
+        # Forget the entries of any previously opened file (the vobject
+        # branch appends to the list) and restart the enumeration.
+        self.contents = []
+        self.currentindex = 0
+
+        try:
+            calstr = open(self.file, 'rb')
+        except Exception:
+            # sys.exc_info() is used instead of binding the exception in
+            # the except clause: this spelling is valid in any Python
+            # version.
+            self.em.rclog("Openfile: open: %s" % str(sys.exc_info()[1]))
+            return False
+
+        try:
+            if usemodule == 'internal':
+                self.contents = ICalSimpleSplitter().splitcalendar(calstr)
+            elif usemodule == 'icalendar':
+                try:
+                    cal = Calendar.from_string(calstr.read())
+                except Exception:
+                    self.em.rclog("Openfile: read or parse error: %s" %
+                                  str(sys.exc_info()[1]))
+                    return False
+                self.contents = [item.as_string() for item in cal.walk()
+                                 if item.name in ('VEVENT', 'VTODO',
+                                                  'VJOURNAL')]
+            else:
+                try:
+                    cal = vobject.readOne(calstr)
+                except Exception:
+                    self.em.rclog("Openfile: cant parse object: %s" %
+                                  str(sys.exc_info()[1]))
+                    return False
+                for lstnm in ('vevent_list', 'vtodo_list', 'vjournal_list'):
+                    for ev in getattr(cal, lstnm, []):
+                        self.contents.append(ev.serialize())
+        finally:
+            # The entries are all held in memory at this point (or we
+            # failed): don't leak the file handle.
+            calstr.close()
+
+        return True
+
+    def getipath(self, params):
+        """Return the entry whose index is given by params["ipath:"].
+
+        A missing or non-numeric ipath yields (False, "", "", True), the
+        same tuple that extractone() returns for an out of range index.
+        """
+        try:
+            index = int(params["ipath:"])
+        except (KeyError, TypeError, ValueError):
+            return (False, "", "", True)
+        return self.extractone(index)
+
+    def getnext(self, params):
+        """Return the entry at the current position and move past it.
+
+        params is not used. Once all entries have been returned, logs the
+        fact and returns (False, "", "", rclexecm.RclExecM.eofnow).
+        """
+        if self.currentindex >= len(self.contents):
+            self.em.rclog("getnext: EOF hit")
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        ret = self.extractone(self.currentindex)
+        self.currentindex += 1
+        return ret
+
+# Trivial splitter: cut objects on BEGIN/END (only for 'interesting' objects)
+# ignore all other syntax
+class ICalSimpleSplitter:
+    """Minimal ICS splitter which only looks at BEGIN/END lines.
+
+    Only the component types listed in 'interesting' are extracted. An
+    interesting component nested inside another one is not extracted
+    separately: it stays as text inside the enclosing block.
+    """
+    interesting = ('VTODO', 'VEVENT', 'VJOURNAL')
+
+    def splitcalendar(self, fin):
+        """Return the list of interesting blocks found in fin.
+
+        fin is an iterable of text lines. Each returned block is a string
+        made of the block's non-empty lines, right-stripped and terminated
+        by a newline, BEGIN and END lines included. A block which is still
+        open when the input ends is returned as it stands.
+        """
+        blocks = []
+        # Lines of the block being accumulated
+        pending = []
+        # Type of the block being accumulated, empty when outside a block
+        openname = ''
+
+        for rawline in fin:
+            text = rawline.rstrip()
+            if not text:
+                continue
+
+            inside = bool(openname)
+            if inside:
+                pending.append(text)
+
+            # Only the first two colon-separated fields matter
+            fields = text.split(":", 2)
+            if len(fields) < 2:
+                continue
+            keyword = fields[0].upper()
+            value = fields[1].upper()
+
+            if not inside:
+                # Outside a block: an 'interesting' BEGIN opens one
+                if keyword == "BEGIN" and \
+                       value in ICalSimpleSplitter.interesting:
+                    openname = value
+                    pending.append(text)
+            elif keyword == "END" and value == openname:
+                # Matching END: the block is complete
+                blocks.append("\n".join(pending) + "\n")
+                pending = []
+                openname = ''
+
+        # Unterminated block at end of input: keep what was read
+        if pending:
+            blocks.append("\n".join(pending) + "\n")
+
+        return blocks
+ 
+
+##### Main program: either talk to the parent or execute test loop
+
+# Build the execm object and the ICS handler which will serve it
+e = rclexecm.RclExecM()
+ical = IcalExtractor(e)
+
+if len(sys.argv) != 1:
+    # Got a file name parameter: testing without an execm parent.
+    # Open the file, then print all its entries in turn.
+    if not ical.openfile({'filename:': sys.argv[1]}):
+        print("Open error")
+        sys.exit(1)
+
+    ecnt = 0
+    while True:
+        ok, data, ipath, eof = ical.getnext("")
+        if not ok:
+            # getnext() has nothing (more) to return: show the eof value
+            # and stop
+            print("Got error, eof %d" % eof)
+            break
+        ecnt += 1
+        print("=========== ENTRY %d =================" % ecnt)
+        print(data)
+        print("")
+else:
+    # No argument: hand the handler over to the rclexecm main loop
+    e.mainloop(ical)
+