changed | src/desktop/hotrecoll.py |
changed | src/filters/rclabw.py |
changed | src/filters/rcldvi |
changed | src/filters/rclsvg.py |
changed | src/filters/rclxml.py |
changed | src/filters/rclxslt.py |
changed | src/sampleconf/mimeconf |
changed | src/windows/mimeconf |
changed | src/Makefile.am |
copied | src/filters/rclfb2 -> src/filters/rclgnm.py |
copied | src/filters/rclgnm -> src/filters/rclfb2.py |
copied | src/filters/rclokulnote -> src/filters/rclokulnote.py |
copied | src/filters/rclsiduxman -> src/filters/rclgenxslt.py |
src/filters/rclfb2 to src/filters/rclgnm.py
--- a/src/filters/rclfb2 +++ b/src/filters/rclgnm.py @@ -1,139 +1,112 @@ -#!/bin/sh -# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -#================================================================ -# Extract text from an fb2 ebook (xml) -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname=rclfb2 -filetype=fb2 +from __future__ import print_function + +import sys +import rclexecm +import rclgenxslt -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --nonet --novalid - "$infile" <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0" - exclude-result-prefixes="fb" + xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" + xmlns:ooo="http://openoffice.org/2004/office" + xmlns:gnm="http://www.gnumeric.org/v10.dtd" + + exclude-result-prefixes="office xlink meta ooo dc" > <xsl:output method="html" encoding="UTF-8"/> -<xsl:template match="/fb:FictionBook"> - <html> - <xsl:apply-templates select="fb:description"/> - <xsl:apply-templates select="fb:body"/> - </html> +<xsl:template match="/"> +<html> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> + <xsl:apply-templates select="//office:document-meta/office:meta"/> + </head> + + <body> + <xsl:apply-templates select="//gnm:Cells"/> + <xsl:apply-templates select="//gnm:Objects"/> + </body> +</html> </xsl:template> -<xsl:template match="fb:description"> - <head> - <xsl:apply-templates select="fb:title-info"/> - </head><xsl:text> -</xsl:text> +<xsl:template match="//dc:date"> + <meta> + <xsl:attribute name="name">date</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> </xsl:template> -<xsl:template match="fb:description/fb:title-info"> - <xsl:apply-templates select="fb:book-title"/> - <xsl:apply-templates select="fb:author"/> -</xsl:template> - -<xsl:template match="fb:description/fb:title-info/fb:book-title"> -<title> <xsl:value-of select="."/> </title> -</xsl:template> - -<xsl:template match="fb:description/fb:title-info/fb:author"> +<xsl:template match="//dc:description"> <meta> - <xsl:attribute name="name">author</xsl:attribute> - <xsl:attribute name="content"> - <xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text> - <xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text> - <xsl:value-of select="fb:last-name"/> - </xsl:attribute> + <xsl:attribute name="name">abstract</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> </meta> </xsl:template> -<xsl:template match="fb:body"> - <body> - <xsl:apply-templates select="fb:section"/> - </body> +<xsl:template match="//meta:keyword"> + <meta> + <xsl:attribute name="name">keywords</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> </xsl:template> -<xsl:template match="fb:body/fb:section"> - <xsl:for-each select="fb:p"> +<xsl:template match="//dc:subject"> + <meta> + <xsl:attribute name="name">keywords</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> +</xsl:template> + +<xsl:template match="//dc:title"> + <title> <xsl:value-of select="."/> </title> +</xsl:template> + +<xsl:template match="//meta:initial-creator"> + <meta> + <xsl:attribute name="name">author</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> +</xsl:template> + +<xsl:template match="office:meta/*"/> + +<xsl:template match="gnm:Cell"> <p><xsl:value-of select="."/></p> - </xsl:for-each> +</xsl:template> + +<xsl:template match="gnm:CellComment"> + <blockquote><xsl:value-of select="@Text"/></blockquote> </xsl:template> </xsl:stylesheet> -EOF +''' + + +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all, gzip=True) + rclexecm.main(proto, extract) +
src/filters/rclgnm to src/filters/rclfb2.py
--- a/src/filters/rclgnm +++ b/src/filters/rclfb2.py @@ -1,191 +1,87 @@ -#!/bin/sh -# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a gnumeric spreadsheet -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclgnumeric" -filetype=gnumeric +from __future__ import print_function +import sys +import rclexecm +import rclxslt +import rclgenxslt -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc gunzip - -# We need a temporary file -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpfile=$ttdir/rclgnm.XXXXXX - -tmpfile=`mktemp "$tmpfile"` -if [ $? -ne 0 ]; then - senderror "$0: Can't create temp file, exiting..." -fi - -cleanup() -{ - rm -f $tmpfile -} - -trap cleanup EXIT HUP QUIT INT TERM - -gunzip < $1 > $tmpfile || senderror "Cant uncompress input" -xsltproc --novalid --nonet - $tmpfile <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" - xmlns:ooo="http://openoffice.org/2004/office" - xmlns:gnm="http://www.gnumeric.org/v10.dtd" - - exclude-result-prefixes="office xlink meta ooo dc" + xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0" + exclude-result-prefixes="fb" > <xsl:output method="html" encoding="UTF-8"/> -<xsl:template match="/"> -<html> - <head> - <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> - <xsl:apply-templates select="//office:document-meta/office:meta"/> - </head> - - <body> - <xsl:apply-templates select="//gnm:Cells"/> - <xsl:apply-templates select="//gnm:Objects"/> - </body> -</html> +<xsl:template match="/fb:FictionBook"> + <html> + <xsl:apply-templates select="fb:description"/> + <xsl:apply-templates select="fb:body"/> + </html> </xsl:template> -<xsl:template match="//dc:date"> - <meta> - <xsl:attribute name="name">date</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> +<xsl:template match="fb:description"> + <head> + <xsl:apply-templates select="fb:title-info"/> + </head><xsl:text> +</xsl:text> </xsl:template> -<xsl:template match="//dc:description"> +<xsl:template match="fb:description/fb:title-info"> + <xsl:apply-templates select="fb:book-title"/> + <xsl:apply-templates select="fb:author"/> +</xsl:template> + +<xsl:template match="fb:description/fb:title-info/fb:book-title"> +<title> <xsl:value-of select="."/> </title> +</xsl:template> + +<xsl:template match="fb:description/fb:title-info/fb:author"> <meta> - <xsl:attribute name="name">abstract</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + <xsl:attribute name="name">author</xsl:attribute> + <xsl:attribute name="content"> + <xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text> + <xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text> + <xsl:value-of select="fb:last-name"/> + </xsl:attribute> </meta> </xsl:template> -<xsl:template match="//meta:keyword"> - <meta> - <xsl:attribute name="name">keywords</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> +<xsl:template match="fb:body"> + <body> + <xsl:apply-templates select="fb:section"/> + </body> </xsl:template> -<xsl:template match="//dc:subject"> - <meta> - <xsl:attribute name="name">keywords</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> -</xsl:template> - -<xsl:template match="//dc:title"> - <title> <xsl:value-of select="."/> </title> -</xsl:template> - -<xsl:template match="//meta:initial-creator"> - <meta> - <xsl:attribute name="name">author</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> -</xsl:template> - -<xsl:template match="office:meta/*"/> - -<xsl:template match="gnm:Cell"> +<xsl:template match="fb:body/fb:section"> + <xsl:for-each select="fb:p"> <p><xsl:value-of select="."/></p> -</xsl:template> - -<xsl:template match="gnm:CellComment"> - <blockquote><xsl:value-of select="@Text"/></blockquote> + </xsl:for-each> </xsl:template> </xsl:stylesheet> -EOF +''' +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all) + rclexecm.main(proto, extract)
src/filters/rclokulnote to src/filters/rclokulnote.py
--- a/src/filters/rclokulnote +++ b/src/filters/rclokulnote.py @@ -1,97 +1,32 @@ -#!/bin/sh -# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a gnumeric spreadsheet -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### +from __future__ import print_function -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclgnumeric" -filetype=gnumeric +import sys +import rclexecm +import rclgenxslt - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --novalid --nonet - "$infile" <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:output method="html" encoding="UTF-8"/> <xsl:strip-space elements="*" /> - <xsl:template match="/"> <html> @@ -126,5 +61,10 @@ <xsl:template match="@*"/> </xsl:stylesheet> -EOF +''' +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all) + rclexecm.main(proto, extract) +
src/filters/rclsiduxman to src/filters/rclgenxslt.py
--- a/src/filters/rclsiduxman +++ b/src/filters/rclgenxslt.py @@ -1,92 +1,65 @@ -#!/bin/sh -# @(#$Id: rclsiduxman,v 1.1 2008-06-09 09:12:05 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Strip the menu part from sidux manual pages to improve search precision -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2018 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### +from __future__ import print_function -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclsiduxman" -filetype="sidux manual htm" +import sys +import rclexecm +import rclxslt +import gzip + +class XSLTExtractor: + def __init__(self, em, stylesheet, gzip=False): + self.em = em + self.currentindex = 0 + self.stylesheet = stylesheet + self.dogz = gzip -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file + def extractone(self, params): + if "filename:" not in params: + self.em.rclog("extractone: no mime or file name") + return (False, "", "", rclexecm.RclExecM.eofnow) + fn = params["filename:"] + try: + if self.dogz: + data = gzip.open(fn, 'rb').read() + else: + data = open(fn, 'rb').read() + docdata = rclxslt.apply_sheet_data(self.stylesheet, data) + except Exception as err: + self.em.rclog("%s: bad data: %s" % (fn, err)) + return (False, "", "", rclexecm.RclExecM.eofnow) -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). + return (True, docdata, "", rclexecm.RclExecM.eofnext) + -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} + ###### File type handler api, used by rclexecm ----------> + def openfile(self, params): + self.currentindex = 0 + return True -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds sed -# Delete everything from <div id="menu"> to <div id="main-page"> -# This prints an additional blank line at top which does not matter -sed -n -e '1,/<div id="menu">/{x;p' -e '}' \ - -e '/<div id="main-page">/,$p' < "$infile" - -# exit normally -exit 0 + def getipath(self, params): + return self.extractone(params) + + def getnext(self, params): + if self.currentindex >= 1: + return (False, "", "", rclexecm.RclExecM.eofnow) + else: + ret= self.extractone(params) + self.currentindex += 1 + return ret