Parent: [17393b] (diff)

Child: [7b8031] (diff)

Download this file

rclscribus    182 lines (158 with data), 4.9 kB

# @(#$Id: rclscribus,v 1.4 2007-06-08 13:51:09 dockes Exp $  (C) 2004 J.F.Dockes
# There may still be code from Estraier in here:
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
# Convert a scribus file to recoll HTML. This only handles the newer .sla
# files until I can have a look at an older .scd.
# We just hack into the scribus XML, taking advantage that the tag of
# interest is apparently always output on a single line.
# The text seems to be found in attribute CH of tag ITEXT, it is utf-8
# Tried to convert this to xsltproc but it seems that quite a few
# Scribus documents are not actually proper xml

# set variables
# Run this script and the tools it starts in the C locale
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
# Names interpolated into the usage text and the error messages below
progname="rclscribus"
filetype=Scribus

# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file

# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).

# Describe error in a way that can be interpreted by our caller.
# Arguments: an error code followed by free-form details
# Outputs:   "RECFILTERROR <arguments>" on stdout, a copy on stderr
# Does not return: terminates the script with status 1.
senderror()
{
    printf 'RECFILTERROR %s\n' "$*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if an executable, non-directory file was found, 1 otherwise
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test it directly
        if test -x "$cmd" && test ! -d "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Bare name: split PATH on ':' and probe each directory in turn
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            test -x "${d:-.}/$cmd" && test ! -d "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}

# Verify that every helper command named in the arguments is available.
# Arguments: command names
# The first missing one is reported through senderror (HELPERNOTFOUND),
# which exits the script.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}

# show help message
# Exactly one argument is expected; any other argument count, or an
# explicit --help, prints the usage text and exits with status 1.
if test $# -ne 1 || test "$1" = "--help"
then
  echo "Convert a $filetype file to HTML text for Recoll indexing."
  echo "Usage: $progname [infile]"
  exit 1
fi


# check the input file existence (may be '-' for stdin)
# The single command line argument is the document to convert.
infile="$1"
if test "X$infile" != X- && test ! -f "$infile"
then
  senderror INPUTNOSUCHFILE "$infile"
fi

# Keep any file or directory we create private to the owner:
# no permission bits for group or others.
umask 077

# !! Leave the following line unmodified !

# Check the external helpers before doing any work: checkcmds reports
# HELPERNOTFOUND through senderror for a command that iscmd cannot find.
checkcmds grep awk sed

# A small sed program to join lines where they are broken inside an
# attribute value. The idea is that all scribus tags are apparently on one
# line except when there are embedded new lines in an attribute like
# 'comments'. The first version of the sed script joins a line which does
# not end with '>' with the next. It doesn't guard against an embedded '>'.
# The second joins a line not beginning with '<' with the previous. It is
# much slower for some reason.
# Loop (label "a"): while the current line does not end with '>' (trailing
# blanks ignored), append the next input line and turn the embedded
# newline into a space, then test the joined line again.
sedjoinprog=':a
/[^>] *$/N; s/\n/ /; ta'
# Alternative, slower program body (joins lines not starting with '<'):
#$!N;/^ *[^<]/s/\n/ /;ta

# Extract description title author and keywords
#
# docattr NAME
# Print the value of every  NAME="value"  attribute found in $infile, once
# the lines broken inside attribute values have been joined by
# $sedjoinprog. Values are written back to back, without any separator or
# trailing newline. Empty values (NAME="") are not matched.
# Globals:   infile, sedjoinprog (read)
# Arguments: $1 - attribute name, e.g. TITLE
docattr()
{
    sed -e "$sedjoinprog" < "$infile" | \
    awk -v attr="$1" '
    BEGIN {
        # Leading blank so that only a whole attribute name matches
        pat = " " attr "=\"[^\"]+"
        # Characters to skip to reach the value: blank, name, = and quote
        skip = length(attr) + 3
    }
    {
        if (match($0, pat)) {
            printf("%s", substr($0, RSTART + skip, RLENGTH - skip))
        }
    }
    '
}

description=$(docattr COMMENTS)
title=$(docattr TITLE)
author=$(docattr AUTHOR)
keywords=$(docattr KEYWORDS)

#echo description: [$description];echo title: [$title];
#echo author: [$author];echo keywords: [$keywords]

# Emit the opening part of the HTML document: the title and the metadata
# as <meta> elements, then the opening <body><p>.
# Globals: title, author, description, keywords (read)
# Outputs: the HTML header on stdout
# The paragraph opened here is closed by the "</p></body></html>" printed
# once the text has been extracted.
htmlhead()
{
cat <<EOF
<html><head>
<title>$title</title>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<meta name="author" content="$author">
<meta name="description" content="$description">
<meta name="keywords" content="$keywords">
</head>
<body><p>
EOF
}
htmlhead

sed -e ':a' -e '/[^>] *$/N; s/\n/ /; ta' < $infile | \
awk '
/<ITEXT / {
    if (match($0, " CH=\"[^\"]+")) { 
       s=substr($0, RSTART+5, RLENGTH-5)
       printf("%s", s)
       # Note: there is no way to know if this ends a frame, so no "<br>"
    print "</p></body></html>"
' | \
sed -e 's/&#x5;/<br>/g' -e 's/&#x1c;/<br>/g' -e 's/