src/common/beaglequeuecache.cpp to src/common/webstore.cpp
--- a/src/common/beaglequeuecache.cpp +++ b/src/common/webstore.cpp @@ -17,10 +17,11 @@ #include "autoconfig.h" +#include "webstore.h" + #include <stdint.h> #include "cstr.h" -#include "beaglequeuecache.h" #include "circache.h" #include "log.h" #include "rclconfig.h" @@ -29,42 +30,43 @@ const string cstr_bgc_mimetype("mimetype"); -BeagleQueueCache::BeagleQueueCache(RclConfig *cnf) +WebStore::WebStore(RclConfig *cnf) { string ccdir = cnf->getWebcacheDir(); int maxmbs = 40; cnf->getConfParam("webcachemaxmbs", &maxmbs); if ((m_cache = new CirCache(ccdir)) == 0) { - LOGERR("BeagleQueueCache: cant create CirCache object\n" ); + LOGERR("WebStore: cant create CirCache object\n" ); return; } if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) { - LOGERR("BeagleQueueCache: cache file creation failed: " << (m_cache->getReason()) << "\n" ); + LOGERR("WebStore: cache file creation failed: " << + m_cache->getReason() << "\n"); delete m_cache; m_cache = 0; return; } } -BeagleQueueCache::~BeagleQueueCache() +WebStore::~WebStore() { delete m_cache; } // Read document from cache. Return the metadata as an Rcl::Doc -// @param htt Beagle Hit Type -bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc, +// @param htt Web Hit Type +bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc, string& data, string *htt) { string dict; if (m_cache == 0) { - LOGERR("BeagleQueueCache::getFromCache: cache is null\n" ); + LOGERR("WebStore::getFromCache: cache is null\n"); return false; } if (!m_cache->get(udi, dict, &data)) { - LOGDEB("BeagleQueueCache::getFromCache: get failed\n" ); + LOGDEB("WebStore::getFromCache: get failed\n"); return false; }
src/common/beaglequeuecache.h to src/common/webstore.h
--- a/src/common/beaglequeuecache.h +++ b/src/common/webstore.h @@ -14,11 +14,10 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#ifndef _beaglequeuecache_h_included_ -#define _beaglequeuecache_h_included_ +#ifndef _webstore_h_included_ +#define _webstore_h_included_ #include <string> -using std::string; class RclConfig; namespace Rcl { @@ -28,23 +27,24 @@ class CirCache; /** - * Manage the CirCache for the Beagle Queue indexer. Separated from the main + * Manage the CirCache for the Web Queue indexer. Separated from the main * indexer code because it's also used for querying (getting the data for a * preview */ -class BeagleQueueCache { +class WebStore { public: - BeagleQueueCache(RclConfig *config); - ~BeagleQueueCache(); + WebStore(RclConfig *config); + ~WebStore(); - bool getFromCache(const string& udi, Rcl::Doc &doc, string& data, - string *hittype = 0); + bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data, + std::string *hittype = 0); // We could write proxies for all the circache ops, but why bother? CirCache *cc() {return m_cache;} private: CirCache *m_cache; }; -extern const string cstr_bgc_mimetype; -#endif /* _beaglequeuecache_h_included_ */ +extern const std::string cstr_bgc_mimetype; + +#endif /* _webstore_h_included_ */
src/filters/rclabw to src/filters/rclabw.py
--- a/src/filters/rclabw +++ b/src/filters/rclabw.py @@ -1,91 +1,28 @@ -#!/bin/sh -# @(#$Id: rclabw,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from an abiword file -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclabw" -filetype=abiword +from __future__ import print_function +import sys +import rclexecm +import rclgenxslt -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --nonet --novalid - "$infile" <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:ab="http://www.abisource.com/awml.dtd" @@ -173,7 +110,9 @@ </xsl:template> </xsl:stylesheet> -EOF +''' -# exit normally -exit 0 +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all) + rclexecm.main(proto, extract)
src/filters/rclfb2 to src/filters/rclgnm.py
--- a/src/filters/rclfb2 +++ b/src/filters/rclgnm.py @@ -1,139 +1,112 @@ -#!/bin/sh -# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -#================================================================ -# Extract text from an fb2 ebook (xml) -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname=rclfb2 -filetype=fb2 +from __future__ import print_function + +import sys +import rclexecm +import rclgenxslt -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --nonet --novalid - "$infile" <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0" - exclude-result-prefixes="fb" + xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" + xmlns:ooo="http://openoffice.org/2004/office" + xmlns:gnm="http://www.gnumeric.org/v10.dtd" + + exclude-result-prefixes="office xlink meta ooo dc" > <xsl:output method="html" encoding="UTF-8"/> -<xsl:template match="/fb:FictionBook"> - <html> - <xsl:apply-templates select="fb:description"/> - <xsl:apply-templates select="fb:body"/> - </html> +<xsl:template match="/"> +<html> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> + <xsl:apply-templates select="//office:document-meta/office:meta"/> + </head> + + <body> + <xsl:apply-templates select="//gnm:Cells"/> + <xsl:apply-templates select="//gnm:Objects"/> + </body> +</html> </xsl:template> -<xsl:template match="fb:description"> - <head> - <xsl:apply-templates select="fb:title-info"/> - </head><xsl:text> -</xsl:text> +<xsl:template match="//dc:date"> + <meta> + <xsl:attribute name="name">date</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> </xsl:template> -<xsl:template match="fb:description/fb:title-info"> - <xsl:apply-templates select="fb:book-title"/> - <xsl:apply-templates select="fb:author"/> -</xsl:template> - -<xsl:template match="fb:description/fb:title-info/fb:book-title"> -<title> <xsl:value-of select="."/> </title> -</xsl:template> - -<xsl:template match="fb:description/fb:title-info/fb:author"> +<xsl:template match="//dc:description"> <meta> - <xsl:attribute name="name">author</xsl:attribute> - <xsl:attribute name="content"> - <xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text> - <xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text> - <xsl:value-of select="fb:last-name"/> - </xsl:attribute> + <xsl:attribute name="name">abstract</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> </meta> </xsl:template> -<xsl:template match="fb:body"> - <body> - <xsl:apply-templates select="fb:section"/> - </body> +<xsl:template match="//meta:keyword"> + <meta> + <xsl:attribute name="name">keywords</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> </xsl:template> -<xsl:template match="fb:body/fb:section"> - <xsl:for-each select="fb:p"> +<xsl:template match="//dc:subject"> + <meta> + <xsl:attribute name="name">keywords</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> +</xsl:template> + +<xsl:template match="//dc:title"> + <title> <xsl:value-of select="."/> </title> +</xsl:template> + +<xsl:template match="//meta:initial-creator"> + <meta> + <xsl:attribute name="name">author</xsl:attribute> + <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + </meta> +</xsl:template> + +<xsl:template match="office:meta/*"/> + +<xsl:template match="gnm:Cell"> <p><xsl:value-of select="."/></p> - </xsl:for-each> +</xsl:template> + +<xsl:template match="gnm:CellComment"> + <blockquote><xsl:value-of select="@Text"/></blockquote> </xsl:template> </xsl:stylesheet> -EOF +''' + + +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all, gzip=True) + rclexecm.main(proto, extract) +
src/filters/rclgnm to src/filters/rclfb2.py
--- a/src/filters/rclgnm +++ b/src/filters/rclfb2.py @@ -1,191 +1,87 @@ -#!/bin/sh -# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a gnumeric spreadsheet -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclgnumeric" -filetype=gnumeric +from __future__ import print_function +import sys +import rclexecm +import rclxslt +import rclgenxslt -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc gunzip - -# We need a temporary file -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpfile=$ttdir/rclgnm.XXXXXX - -tmpfile=`mktemp "$tmpfile"` -if [ $? -ne 0 ]; then - senderror "$0: Can't create temp file, exiting..." -fi - -cleanup() -{ - rm -f $tmpfile -} - -trap cleanup EXIT HUP QUIT INT TERM - -gunzip < $1 > $tmpfile || senderror "Cant uncompress input" -xsltproc --novalid --nonet - $tmpfile <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" - xmlns:ooo="http://openoffice.org/2004/office" - xmlns:gnm="http://www.gnumeric.org/v10.dtd" - - exclude-result-prefixes="office xlink meta ooo dc" + xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0" + exclude-result-prefixes="fb" > <xsl:output method="html" encoding="UTF-8"/> -<xsl:template match="/"> -<html> - <head> - <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> - <xsl:apply-templates select="//office:document-meta/office:meta"/> - </head> - - <body> - <xsl:apply-templates select="//gnm:Cells"/> - <xsl:apply-templates select="//gnm:Objects"/> - </body> -</html> +<xsl:template match="/fb:FictionBook"> + <html> + <xsl:apply-templates select="fb:description"/> + <xsl:apply-templates select="fb:body"/> + </html> </xsl:template> -<xsl:template match="//dc:date"> - <meta> - <xsl:attribute name="name">date</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> +<xsl:template match="fb:description"> + <head> + <xsl:apply-templates select="fb:title-info"/> + </head><xsl:text> +</xsl:text> </xsl:template> -<xsl:template match="//dc:description"> +<xsl:template match="fb:description/fb:title-info"> + <xsl:apply-templates select="fb:book-title"/> + <xsl:apply-templates select="fb:author"/> +</xsl:template> + +<xsl:template match="fb:description/fb:title-info/fb:book-title"> +<title> <xsl:value-of select="."/> </title> +</xsl:template> + +<xsl:template match="fb:description/fb:title-info/fb:author"> <meta> - <xsl:attribute name="name">abstract</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> + <xsl:attribute name="name">author</xsl:attribute> + <xsl:attribute name="content"> + <xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text> + <xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text> + <xsl:value-of select="fb:last-name"/> + </xsl:attribute> </meta> </xsl:template> -<xsl:template match="//meta:keyword"> - <meta> - <xsl:attribute name="name">keywords</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> +<xsl:template match="fb:body"> + <body> + <xsl:apply-templates select="fb:section"/> + </body> </xsl:template> -<xsl:template match="//dc:subject"> - <meta> - <xsl:attribute name="name">keywords</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> -</xsl:template> - -<xsl:template match="//dc:title"> - <title> <xsl:value-of select="."/> </title> -</xsl:template> - -<xsl:template match="//meta:initial-creator"> - <meta> - <xsl:attribute name="name">author</xsl:attribute> - <xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute> - </meta> -</xsl:template> - -<xsl:template match="office:meta/*"/> - -<xsl:template match="gnm:Cell"> +<xsl:template match="fb:body/fb:section"> + <xsl:for-each select="fb:p"> <p><xsl:value-of select="."/></p> -</xsl:template> - -<xsl:template match="gnm:CellComment"> - <blockquote><xsl:value-of select="@Text"/></blockquote> + </xsl:for-each> </xsl:template> </xsl:stylesheet> -EOF +''' +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all) + rclexecm.main(proto, extract)
src/filters/rclokulnote to src/filters/rclokulnote.py
--- a/src/filters/rclokulnote +++ b/src/filters/rclokulnote.py @@ -1,97 +1,32 @@ -#!/bin/sh -# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a gnumeric spreadsheet -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### +from __future__ import print_function -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclgnumeric" -filetype=gnumeric +import sys +import rclexecm +import rclgenxslt - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --novalid --nonet - "$infile" <<EOF -<?xml version="1.0"?> +stylesheet_all = '''<?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:output method="html" encoding="UTF-8"/> <xsl:strip-space elements="*" /> - <xsl:template match="/"> <html> @@ -126,5 +61,10 @@ <xsl:template match="@*"/> </xsl:stylesheet> -EOF +''' +if __name__ == '__main__': + proto = rclexecm.RclExecM() + extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all) + rclexecm.main(proto, extract) +
src/filters/rclps to src/index/webqueue.h
--- a/src/filters/rclps +++ b/src/index/webqueue.h @@ -1,135 +1,79 @@ -#!/bin/sh -# @(#$Id: rclps,v 1.10 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a postscript file by executing pstotext or ps2ascii. -# -# The default is to use pstotext which can deal with accents, but in a -# partially broken way (it always outputs iso8859-1, when it should use utf. -# -# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes work -# better (ie: on some openoffice output files). -# -#================================================================ +/* Copyright (C) 2009 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef _webqueue_h_included_ +#define _webqueue_h_included_ -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclps" -decoder=pstotext -#decoder=ps2ascii -filetype=postscript +#include <list> +/** + * Process the WEB indexing queue. + * + * This was originally written to reuse the Beagle Firefox plug-in (which + * copied visited pages and bookmarks to the queue), long dead and replaced by a + * recoll-specific plugin. + */ -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file +#include "fstreewalk.h" +#include "rcldoc.h" -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 +class DbIxStatusUpdater; +class CirCache; +class RclConfig; +class WebStore; +namespace Rcl { + class Db; } -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} +class WebQueueIndexer : public FsTreeWalkerCB { +public: + WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, + DbIxStatusUpdater *updfunc = 0); + ~WebQueueIndexer(); -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} + /** This is called by the top indexer in recollindex. + * Does the walking and the talking */ + bool index(); -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi + /** Called when we fstreewalk the queue dir */ + FsTreeWalker::Status + processone(const string &, const struct stat *, FsTreeWalker::CbFlag); -infile="$1" + /** Index a list of files. No db cleaning or stemdb updating. + * Used by the real time monitor */ + bool indexFiles(std::list<std::string>& files); + /** Purge a list of files. No way to do this currently and dont want + * to do anything as this is mostly called by the monitor when *I* delete + * files inside the queue dir */ + bool purgeFiles(std::list<std::string>& files) {return true;} -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi + /** Called when indexing data from the cache, and from internfile for + * search result preview */ + bool getFromCache(const string& udi, Rcl::Doc &doc, string& data, + string *hittype = 0); +private: + RclConfig *m_config; + Rcl::Db *m_db; + WebStore *m_cache; + string m_queuedir; + DbIxStatusUpdater *m_updater; + bool m_nocacheindex; -# protect access to our temp files and directories -umask 77 + bool indexFromCache(const string& udi); + void updstatus(const string& udi); +}; -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds $decoder iconv awk - -# output the result -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -$decoder "$infile" | -awk 'BEGIN'\ -' { - printf("<html><head><title></title>\n") - printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n") - printf("</head>\n<body><p>"); - doescape = 1 - cont = "" -} -{ - $0 = cont $0 - cont = "" - - if ($0 == "\f") { - print "</p>\n<hr>\n\f<p>" - next - } else if ($0 ~ /�$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH) - $0 = line - gsub("�", "", cont) - } - - if(doescape > 0) { - gsub(/&/, "\\&", $0) - gsub(/</, "\\<", $0) - gsub(/>/, "\\>", $0) - } - print $0 "<br>" -} -END { - print "</p></body></html>" -}' | iconv -f iso-8859-1 -t UTF-8 -c -s - +#endif /* _webqueue_h_included_ */
src/filters/rclsiduxman to src/filters/rclbasehandler.py
--- a/src/filters/rclsiduxman +++ b/src/filters/rclbasehandler.py @@ -1,92 +1,64 @@ -#!/bin/sh -# @(#$Id: rclsiduxman,v 1.1 2008-06-09 09:12:05 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Strip the menu part from sidux manual pages to improve search precision -#================================================================ +#!/usr/bin/env python3 +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclsiduxman" -filetype="sidux manual htm" +# Base for extractor classes. With some common generic implementations +# for the boilerplate functions. + +from __future__ import print_function + +import os +import sys +import rclexecm + +class RclBaseHandler(object): + def __init__(self, em): + self.em = em -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file + def extractone(self, params): + #self.em.rclog("extractone %s %s" % (params["filename:"], \ + #params["mimetype:"])) + if not "filename:" in params: + self.em.rclog("extractone: no file name") + return (False, "", "", rclexecm.RclExecM.eofnow) + fn = params["filename:"] -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). + try: + html = self.html_text(fn) + except Exception as err: + self.em.rclog("RclBaseDumper: %s : %s" % (fn, err)) + return (False, "", "", rclexecm.RclExecM.eofnow) -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} + self.em.setmimetype('text/html') + return (True, html, "", rclexecm.RclExecM.eofnext) + -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} + ###### File type handler api, used by rclexecm ----------> + def openfile(self, params): + self.currentindex = 0 + return True -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} + def getipath(self, params): + return self.extractone(params) -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds sed -# Delete everything from <div id="menu"> to <div id="main-page"> -# This prints an additional blank line at top which does not matter -sed -n -e '1,/<div id="menu">/{x;p' -e '}' \ - -e '/<div id="main-page">/,$p' < "$infile" - -# exit normally -exit 0 + def getnext(self, params): + if self.currentindex >= 1: + return (False, "", "", rclexecm.RclExecM.eofnow) + else: + ret= self.extractone(params) + self.currentindex += 1 + return ret
src/filters/rclwpd to src/index/webqueuefetcher.cpp
--- a/src/filters/rclwpd +++ b/src/index/webqueuefetcher.cpp @@ -1,87 +1,66 @@ -#!/bin/sh -# @(#$Id: rclwpd,v 1.1 2007-08-26 13:34:59 dockes Exp $ (C) 2004 J.F.Dockes -# Some inspiration from estraier -#================================================================ -# convert wordperfect documents to html, by executing the wpd2html program: -# http://libwpd.sourceforge.net/download.html -#================================================================ +/* Copyright (C) 2012 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include "autoconfig.h" -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclwpd" -filetype=wpd +#include "webqueuefetcher.h" +#include <mutex> +#include "rcldoc.h" +#include "fetcher.h" +#include "log.h" +#include "webstore.h" -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file +using std::string; -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). +// We use a single WebStore object to access the data. We protect it +// against multiple thread access. +static std::mutex o_beagler_mutex; -# Describe error in a way that can be interpreted by our caller -senderror() +bool WQDocFetcher::fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out) { - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 + string udi; + if (!idoc.getmeta(Rcl::Doc::keyudi, &udi) || udi.empty()) { + LOGERR("WQDocFetcher:: no udi in idoc\n" ); + return false; + } + Rcl::Doc dotdoc; + { + std::unique_lock<std::mutex> locker(o_beagler_mutex); + // Retrieve from our webcache (beagle data). The beagler + // object is created at the first call of this routine and + // deleted when the program exits. + static WebStore o_beagler(cnf); + if (!o_beagler.getFromCache(udi, dotdoc, out.data)) { + LOGINFO("WQDocFetcher::fetch: failed for [" << udi << "]\n"); + return false; + } + } + if (dotdoc.mimetype.compare(idoc.mimetype)) { + LOGINFO("WQDocFetcher:: udi [" << udi << "], mimetp mismatch: in: [" << + idoc.mimetype << "], bgl [" << dotdoc.mimetype << "]\n"); + } + out.kind = RawDoc::RDK_DATA; + return true; } - -iscmd() + +bool WQDocFetcher::makesig(RclConfig* cnf, const Rcl::Doc& idoc, string& sig) { - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac + // Web queue sigs are empty + sig.clear(); + return true; } - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds wpd2html - -# output the result. wpd2html output doesn't seem to need any adjustment? - -wpd2html "$infile" 2> /dev/null
src/index/beaglequeue.cpp to src/index/webqueue.cpp
--- a/src/index/beaglequeue.cpp +++ b/src/index/webqueue.cpp @@ -16,6 +16,8 @@ */ #include "autoconfig.h" +#include "webqueue.h" + #include <string.h> #include <errno.h> #include "safesysstat.h" @@ -26,8 +28,7 @@ #include "rclutil.h" #include "log.h" #include "fstreewalk.h" -#include "beaglequeue.h" -#include "beaglequeuecache.h" +#include "webstore.h" #include "circache.h" #include "smallut.h" #include "fileudi.h" @@ -44,12 +45,13 @@ using namespace std; -// Beagle creates a file named .xxx (where xxx is the name for the main file -// in the queue), to hold external metadata (http or created by Beagle). -// This class reads the .xxx, dotfile, and turns it into an Rcl::Doc holder -class BeagleDotFile { +// The browser plugin creates a file named .xxx (where xxx is the name +// for the main file in the queue), to hold external metadata (http or +// created by the plugin). This class reads the .xxx, dotfile, and turns +// it into an Rcl::Doc holder +class WebQueueDotFile { public: - BeagleDotFile(RclConfig *conf, const string& fn) + WebQueueDotFile(RclConfig *conf, const string& fn) : m_conf(conf), m_fn(fn) {} @@ -62,7 +64,7 @@ m_input.getline(cline, LL-1); if (!m_input.good()) { if (m_input.bad()) { - LOGERR("beagleDotFileRead: input.bad()\n" ); + LOGERR("WebQueueDotFileRead: input.bad()\n" ); } return false; } @@ -72,18 +74,18 @@ ll--; } line.assign(cline, ll); - LOGDEB2("BeagleDotFile:readLine: [" << (line) << "]\n" ); + LOGDEB2("WebQueueDotFile:readLine: [" << (line) << "]\n" ); return true; } - // Process a beagle dot file and set interesting stuff in the doc + // Process a Web queue dot file and set interesting stuff in the doc bool toDoc(Rcl::Doc& doc) { string line; m_input.open(m_fn.c_str(), ios::in); if (!m_input.good()) { - LOGERR("BeagleDotFile: open failed for [" << (m_fn) << "]\n" ); + LOGERR("WebQueueDotFile: open failed for [" << (m_fn) << "]\n" ); return false; } @@ -173,24 +175,24 @@ // Initialize. Compute paths and create a temporary directory that will be // used by internfile() -BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db, +WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) : m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc), m_nocacheindex(false) { m_queuedir = m_config->getWebQueueDir(); path_catslash(m_queuedir); - m_cache = new BeagleQueueCache(cnf); -} - -BeagleQueueIndexer::~BeagleQueueIndexer() -{ - LOGDEB("BeagleQueueIndexer::~\n" ); + m_cache = new WebStore(cnf); +} + +WebQueueIndexer::~WebQueueIndexer() +{ + LOGDEB("WebQueueIndexer::~\n" ); deleteZ(m_cache); } // Index document stored in the cache. -bool BeagleQueueIndexer::indexFromCache(const string& udi) +bool WebQueueIndexer::indexFromCache(const string& udi) { if (!m_db) return false; @@ -202,12 +204,12 @@ string hittype; if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype)) { - LOGERR("BeagleQueueIndexer::indexFromCache: cache failed\n" ); + LOGERR("WebQueueIndexer::indexFromCache: cache failed\n" ); return false; } if (hittype.empty()) { - LOGERR("BeagleIndexer::index: cc entry has no hit type\n" ); + LOGERR("WebQueueIndexer::index: cc entry has no hit type\n" ); return false; } @@ -224,11 +226,11 @@ try { fis = interner.internfile(doc); } catch (CancelExcept) { - LOGERR("BeagleQueueIndexer: interrupted\n" ); + LOGERR("WebQueueIndexer: interrupted\n" ); return false; } if (fis != FileInterner::FIDone) { - LOGERR("BeagleQueueIndexer: bad status from internfile\n" ); + LOGERR("WebQueueIndexer: bad status from internfile\n" ); return false; } @@ -242,7 +244,7 @@ } } -void BeagleQueueIndexer::updstatus(const string& udi) +void WebQueueIndexer::updstatus(const string& udi) { if (m_updater) { ++(m_updater->status.docsdone); @@ -253,18 +255,18 @@ } } -bool BeagleQueueIndexer::index() +bool WebQueueIndexer::index() { if (!m_db) return false; - LOGDEB("BeagleQueueIndexer::processqueue: [" << (m_queuedir) << "]\n" ); + LOGDEB("WebQueueIndexer::processqueue: [" << (m_queuedir) << "]\n" ); m_config->setKeyDir(m_queuedir); if (!path_makepath(m_queuedir, 0700)) { - LOGERR("BeagleQueueIndexer:: can't create queuedir [" << (m_queuedir) << "] errno " << (errno) << "\n" ); + LOGERR("WebQueueIndexer:: can't create queuedir [" << (m_queuedir) << "] errno " << (errno) << "\n" ); return false; } if (!m_cache || !m_cache->cc()) { - LOGERR("BeagleQueueIndexer: cache initialization failed\n" ); + LOGERR("WebQueueIndexer: cache initialization failed\n" ); return false; } CirCache *cc = m_cache->cc(); @@ -282,7 +284,7 @@ do { string udi; if (!cc->getCurrentUdi(udi)) { - LOGERR("BeagleQueueIndexer:: cache file damaged\n" ); + LOGERR("WebQueueIndexer:: cache file damaged\n" ); break; } if (udi.empty()) @@ -295,7 +297,7 @@ indexFromCache(udi); updstatus(udi); } catch (CancelExcept) { - LOGERR("BeagleQueueIndexer: interrupted\n" ); + LOGERR("WebQueueIndexer: interrupted\n" ); return false; } } @@ -307,17 +309,17 @@ FsTreeWalker walker(FsTreeWalker::FtwNoRecurse); walker.addSkippedName(".*"); FsTreeWalker::Status status = walker.walk(m_queuedir, *this); - LOGDEB("BeagleQueueIndexer::processqueue: done: status " << (status) << "\n" ); + LOGDEB("WebQueueIndexer::processqueue: done: status " << (status) << "\n" ); return true; } // Index a list of files (sent by the real time monitor) -bool BeagleQueueIndexer::indexFiles(list<string>& files) -{ - LOGDEB("BeagleQueueIndexer::indexFiles\n" ); +bool WebQueueIndexer::indexFiles(list<string>& files) +{ + LOGDEB("WebQueueIndexer::indexFiles\n" ); if (!m_db) { - LOGERR("BeagleQueueIndexer::indexfiles no db??\n" ); + LOGERR("WebQueueIndexer::indexfiles no db??\n" ); return false; } for (list<string>::iterator it = files.begin(); it != files.end();) { @@ -326,7 +328,7 @@ } string father = path_getfather(*it); if (father.compare(m_queuedir)) { - LOGDEB("BeagleQueueIndexer::indexfiles: skipping [" << *it << "] (nq)\n" ); + LOGDEB("WebQueueIndexer::indexfiles: skipping [" << *it << "] (nq)\n" ); it++; continue; } // Pb: we are often called with the dot file, before the @@ -342,11 +344,11 @@ } struct stat st; if (path_fileprops(*it, &st) != 0) { - LOGERR("BeagleQueueIndexer::indexfiles: cant stat [" << *it << "]\n" ); + LOGERR("WebQueueIndexer::indexfiles: cant stat [" << *it << "]\n" ); it++; continue; } if (!S_ISREG(st.st_mode)) { - LOGDEB("BeagleQueueIndexer::indexfiles: skipping [" << *it << "] (nr)\n" ); + LOGDEB("WebQueueIndexer::indexfiles: skipping [" << *it << "] (nr)\n" ); it++; continue; } @@ -360,7 +362,7 @@ } FsTreeWalker::Status -BeagleQueueIndexer::processone(const string &path, +WebQueueIndexer::processone(const string &path, const struct stat *stp, FsTreeWalker::CbFlag flg) { @@ -374,9 +376,9 @@ string dotpath = path_cat(path_getfather(path), string(".") + path_getsimple(path)); - LOGDEB("BeagleQueueIndexer: prc1: [" << (path) << "]\n" ); - - BeagleDotFile dotfile(m_config, dotpath); + LOGDEB("WebQueueIndexer: prc1: [" << (path) << "]\n" ); + + WebQueueDotFile dotfile(m_config, dotpath); Rcl::Doc dotdoc; string udi, udipath; if (!dotfile.toDoc(dotdoc)) @@ -388,7 +390,7 @@ udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url)); make_udi(udipath, cstr_null, udi); - LOGDEB("BeagleQueueIndexer: prc1: udi [" << (udi) << "]\n" ); + LOGDEB("WebQueueIndexer: prc1: udi [" << (udi) << "]\n" ); char ascdate[30]; sprintf(ascdate, "%ld", long(stp->st_mtime)); @@ -410,7 +412,7 @@ } else { Rcl::Doc doc; // Store the dotdoc fields in the future doc. In case someone wants - // to use beagle-generated fields like beagle:inurl + // to use fields generated by the browser plugin like inurl doc.meta = dotdoc.meta; FileInterner interner(path, stp, m_config, @@ -420,11 +422,11 @@ try { fis = interner.internfile(doc); } catch (CancelExcept) { - LOGERR("BeagleQueueIndexer: interrupted\n" ); + LOGERR("WebQueueIndexer: interrupted\n" ); goto out; } if (fis != FileInterner::FIDone && fis != FileInterner::FIAgain) { - LOGERR("BeagleQueueIndexer: bad status from internfile\n" ); + LOGERR("WebQueueIndexer: bad status from internfile\n" ); // TOBEDONE: internfile can return FIAgain here if it is // paging a big text file, we should loop. Means we're // only indexing the first page for text/plain files @@ -457,11 +459,11 @@ string fdata; file_to_string(path, fdata); if (!m_cache || !m_cache->cc()) { - LOGERR("BeagleQueueIndexer: cache initialization failed\n" ); + LOGERR("WebQueueIndexer: cache initialization failed\n" ); goto out; } if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) { - LOGERR("BeagleQueueIndexer::prc1: cache_put failed; " << (m_cache->cc()->getReason()) << "\n" ); + LOGERR("WebQueueIndexer::prc1: cache_put failed; " << (m_cache->cc()->getReason()) << "\n" ); goto out; } }
src/index/beaglequeue.h to src/filters/rclgenxslt.py
--- a/src/index/beaglequeue.h +++ b/src/filters/rclgenxslt.py @@ -1,80 +1,39 @@ -/* Copyright (C) 2009 J.F.Dockes - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#ifndef _beaglequeue_h_included_ -#define _beaglequeue_h_included_ +#!/usr/bin/env python3 +# Copyright (C) 2018 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################### -#include <list> +# Base class for simple (one stylesheet) xslt-based handlers -/** - * Process the Beagle indexing queue. - * - * Beagle MUST NOT be running, else mayhem will ensue. - * - * This is mainly written to reuse the Beagle Firefox plug-in (which - * copies visited pages and bookmarks to the queue). - */ +from __future__ import print_function -#include "fstreewalk.h" -#include "rcldoc.h" +import sys +import rclxslt +import gzip +from rclbasehandler import RclBaseHandler -class DbIxStatusUpdater; -class CirCache; -class RclConfig; -class BeagleQueueCache; -namespace Rcl { - class Db; -} +class XSLTExtractor(RclBaseHandler): + def __init__(self, em, stylesheet, gzip=False): + super(XSLTExtractor, self).__init__(em) + self.stylesheet = stylesheet + self.dogz = gzip -class BeagleQueueIndexer : public FsTreeWalkerCB { -public: - BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db, - DbIxStatusUpdater *updfunc = 0); - ~BeagleQueueIndexer(); - - /** This is called by the top indexer in recollindex. - * Does the walking and the talking */ - bool index(); - - /** Called when we fstreewalk the queue dir */ - FsTreeWalker::Status - processone(const string &, const struct stat *, FsTreeWalker::CbFlag); - - /** Index a list of files. No db cleaning or stemdb updating. - * Used by the real time monitor */ - bool indexFiles(std::list<std::string>& files); - /** Purge a list of files. No way to do this currently and dont want - * to do anything as this is mostly called by the monitor when *I* delete - * files inside the queue dir */ - bool purgeFiles(std::list<std::string>& files) {return true;} - - /** Called when indexing data from the cache, and from internfile for - * search result preview */ - bool getFromCache(const string& udi, Rcl::Doc &doc, string& data, - string *hittype = 0); -private: - RclConfig *m_config; - Rcl::Db *m_db; - BeagleQueueCache *m_cache; - string m_queuedir; - DbIxStatusUpdater *m_updater; - bool m_nocacheindex; - - bool indexFromCache(const string& udi); - void updstatus(const string& udi); -}; - -#endif /* _beaglequeue_h_included_ */ + def html_text(self, fn): + if self.dogz: + data = gzip.open(fn, 'rb').read() + else: + data = open(fn, 'rb').read() + return rclxslt.apply_sheet_data(self.stylesheet, data)
src/index/bglfetcher.cpp to src/python/pychm/setup.py.in
--- a/src/index/bglfetcher.cpp +++ b/src/python/pychm/setup.py.in @@ -1,64 +1,36 @@ -/* Copyright (C) 2012 J.F.Dockes - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include "autoconfig.h" +from setuptools import setup, Extension -#include <mutex> +long_description = ''' +Version of the chm package modified to support Python 3 and bundled with Recoll. +The chm package provides three modules, chm, chmlib and extra, which provide +access to the API implemented by the C library chmlib and some additional +classes and functions. They are used to access MS-ITSS encoded files - +Compressed Html Help files (.chm). +''' -#include "rcldoc.h" -#include "fetcher.h" -#include "bglfetcher.h" -#include "log.h" -#include "beaglequeuecache.h" +# For shadow builds: references to the source tree +import os +top = os.path.join('@srcdir@', '..', '..') +pytop = '@srcdir@' -// We use a single beagle cache object to access beagle data. We protect it -// against multiple thread access. -static std::mutex o_beagler_mutex; - -bool BGLDocFetcher::fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out) -{ - string udi; - if (!idoc.getmeta(Rcl::Doc::keyudi, &udi) || udi.empty()) { - LOGERR("BGLDocFetcher:: no udi in idoc\n" ); - return false; - } - Rcl::Doc dotdoc; - { - std::unique_lock<std::mutex> locker(o_beagler_mutex); - // Retrieve from our webcache (beagle data). The beagler - // object is created at the first call of this routine and - // deleted when the program exits. - static BeagleQueueCache o_beagler(cnf); - if (!o_beagler.getFromCache(udi, dotdoc, out.data)) { - LOGINFO("BGLDocFetcher::fetch: failed for [" << (udi) << "]\n" ); - return false; - } - } - if (dotdoc.mimetype.compare(idoc.mimetype)) { - LOGINFO("BGLDocFetcher:: udi [" << (udi) << "], mimetp mismatch: in: [" << (idoc.mimetype) << "], bgl [" << (dotdoc.mimetype) << "]\n" ); - } - out.kind = RawDoc::RDK_DATA; - return true; -} - -bool BGLDocFetcher::makesig(RclConfig* cnf, const Rcl::Doc& idoc, string& sig) -{ - // Bgl sigs are empty - sig.clear(); - return true; -} - - +setup(name="recollchm", + version="0.8.4.1+git", + description="Python package to handle CHM files", + author="Rubens Ramos", + author_email="rubensr@users.sourceforge.net", + maintainer="Mikhail Gusarov", + maintainer_email="dottedmag@dottedmag.net", + url="https://github.com/dottedmag/pychm", + license="GPL", + long_description=long_description, + package_dir = {'' : os.path.join(top, 'python', 'pychm')}, + py_modules=["recollchm.chm", "recollchm.chmlib"], + ext_modules=[Extension("recollchm._chmlib", + [os.path.join(pytop, "recollchm/swig_chm.c")], + libraries=["chm"], + extra_compile_args=["-DSWIG_COBJECT_TYPES"]), + Extension("recollchm.extra", + [os.path.join(pytop, "recollchm/extra.c")], + extra_compile_args=["-D__PYTHON__"], + libraries=["chm"])] + )
src/index/bglfetcher.h to src/index/webqueuefetcher.h
--- a/src/index/bglfetcher.h +++ b/src/index/webqueuefetcher.h @@ -14,18 +14,19 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#ifndef _BGLFETCHER_H_INCLUDED_ -#define _BGLFETCHER_H_INCLUDED_ +#ifndef _WEBQUEUEFETCHER_H_INCLUDED_ +#define _WEBQUEUEFETCHER_H_INCLUDED_ + #include "fetcher.h" /** - * The Beagle cache fetcher: + * The WEB queue cache fetcher: */ -class BGLDocFetcher : public DocFetcher{ +class WQDocFetcher : public DocFetcher{ virtual bool fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out); virtual bool makesig(RclConfig* cnf, const Rcl::Doc& idoc, std::string& sig); - virtual ~BGLDocFetcher() {} + virtual ~WQDocFetcher() {} }; -#endif /* _BGLFETCHER_H_INCLUDED_ */ +#endif /* _WEBQUEUEFETCHER_H_INCLUDED_ */
src/python/recoll/Makefile.in to src/python/pychm/README-RECOLL.txt
--- a/src/python/recoll/Makefile.in +++ b/src/python/pychm/README-RECOLL.txt @@ -1,12 +1,11 @@ -all: - echo libdir: $(libdir) - test '@srcdir@' = '.' || cp -rp @srcdir@/recoll . - libdir=$(libdir) python setup.py build -install: - sudo python setup.py install -clean: - rm -rf build - rm -f recoll/__init__.pyc - rm -rf recoll/__pycache__ -distclean: clean - rm -f setup.py +May 2018: + +pychm has no python3 version. The pull request I submitted for the port is +sitting there, and so is the Debian bug. + +https://github.com/dottedmag/pychm/pull/5 + +Which is why Recoll bundles pychm, enhanced for Python3, for now. The +source repo is here: + +https://github.com/medoc92/pychm
src/utils/refcntr.h to src/python/pychm/recollchm/__init__.py
--- a/src/utils/refcntr.h +++ b/src/python/pychm/recollchm/__init__.py @@ -1,70 +1,32 @@ -#ifndef _REFCNTR_H_ -#define _REFCNTR_H_ -/* Copyright (C) 2014 J.F.Dockes - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ +# Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net> +# +# pychm is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public +# License along with this program; see the file COPYING. If not, +# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA +# -// See Stroustrup C++ 3rd ed, p. 783 -// This is only used if std::shared_ptr is not available -template <class X> class RefCntr { - X *rep; - int *pcount; -public: - RefCntr() - : rep(0), pcount(0) - {} - explicit RefCntr(X *pp) - : rep(pp), pcount(new int(1)) - {} - RefCntr(const RefCntr &r) - : rep(r.rep), pcount(r.pcount) - { - if (pcount) - (*pcount)++; - } - RefCntr& operator=(const RefCntr& r) - { - if (rep == r.rep) - return *this; - if (pcount && --(*pcount) == 0) { - delete rep; - delete pcount; - } - rep = r.rep; - pcount = r.pcount; - if (pcount) - (*pcount)++; - return *this; - } - void reset() { - if (pcount && --(*pcount) == 0) { - delete rep; - delete pcount; - } - rep = 0; - pcount = 0; - } - ~RefCntr() - { - reset(); - } - X *operator->() {return rep;} - X *get() const {return rep;} - int use_count() const {return pcount ? *pcount : 0;} - operator bool() const {return rep != 0;} -}; +''' + chm - A package to manipulate CHM files -#endif /*_REFCNTR_H_ */ + The chm package provides four modules: chm, chmlib, extra and + _chmlib. _chmlib and chmlib are very low level libraries generated + from SWIG interface files, and are simple wrappers around the API + defined by the C library chmlib. + The extra module adds full-text search support. + the chm module provides some higher level classes to simplify + access to the CHM files information. +''' +__all__ = ["chm", "chmlib", "_chmlib", "extra"] +__version__ = "0.8.4.1+git" +__revision__ = "$Id$"