Switch to unified view

a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html
...
...
18
alink="#0000FF">
18
alink="#0000FF">
19
  <div lang="en" class="book">
19
  <div lang="en" class="book">
20
    <div class="titlepage">
20
    <div class="titlepage">
21
      <div>
21
      <div>
22
        <div>
22
        <div>
23
          <h1 class="title"><a name="idp37528496" id=
23
          <h1 class="title"><a name="idp35245072" id=
24
          "idp37528496"></a>Recoll user manual</h1>
24
          "idp35245072"></a>Recoll user manual</h1>
25
        </div>
25
        </div>
26
26
27
        <div>
27
        <div>
28
          <div class="author">
28
          <div class="author">
29
            <h3 class="author"><span class=
29
            <h3 class="author"><span class=
...
...
107
                <dt><span class="sect2">2.1.2. <a href=
107
                <dt><span class="sect2">2.1.2. <a href=
108
                "#RCL.INDEXING.INTRODUCTION.CONFIG">Configurations,
108
                "#RCL.INDEXING.INTRODUCTION.CONFIG">Configurations,
109
                multiple indexes</a></span></dt>
109
                multiple indexes</a></span></dt>
110
110
111
                <dt><span class="sect2">2.1.3. <a href=
111
                <dt><span class="sect2">2.1.3. <a href=
112
                "#idp43099712">Document types</a></span></dt>
112
                "#idp40818624">Document types</a></span></dt>
113
113
114
                <dt><span class="sect2">2.1.4. <a href=
114
                <dt><span class="sect2">2.1.4. <a href=
115
                "#idp43124208">Indexing failures</a></span></dt>
115
                "#idp40843200">Indexing failures</a></span></dt>
116
116
117
                <dt><span class="sect2">2.1.5. <a href=
117
                <dt><span class="sect2">2.1.5. <a href=
118
                "#idp43131216">Recovery</a></span></dt>
118
                "#idp40850208">Recovery</a></span></dt>
119
              </dl>
119
              </dl>
120
            </dd>
120
            </dd>
121
121
122
            <dt><span class="sect1">2.2. <a href=
122
            <dt><span class="sect1">2.2. <a href=
123
            "#RCL.INDEXING.STORAGE">Index storage</a></span></dt>
123
            "#RCL.INDEXING.STORAGE">Index storage</a></span></dt>
...
...
170
            <dt><span class="sect1">2.6. <a href=
170
            <dt><span class="sect1">2.6. <a href=
171
            "#RCL.INDEXING.EXTTAGS">Importing external
171
            "#RCL.INDEXING.EXTTAGS">Importing external
172
            tags</a></span></dt>
172
            tags</a></span></dt>
173
173
174
            <dt><span class="sect1">2.7. <a href=
174
            <dt><span class="sect1">2.7. <a href=
175
            "#RCL.INDEXING.PERIODIC">Periodic
175
            "#RCL.INDEXING.PDF">The PDF input
176
            indexing</a></span></dt>
176
            handler</a></span></dt>
177
177
178
            <dd>
178
            <dd>
179
              <dl>
179
              <dl>
180
                <dt><span class="sect2">2.7.1. <a href=
180
                <dt><span class="sect2">2.7.1. <a href=
181
                "#RCL.INDEXING.PDF.OCR">OCR with
182
                Tesseract</a></span></dt>
183
184
                <dt><span class="sect2">2.7.2. <a href=
185
                "#RCL.INDEXING.PDF.XMP">XMP fields
186
                extraction</a></span></dt>
187
188
                <dt><span class="sect2">2.7.3. <a href=
189
                "#RCL.INDEXING.PDF.ATTACH">PDF attachment
190
                indexing</a></span></dt>
191
              </dl>
192
            </dd>
193
194
            <dt><span class="sect1">2.8. <a href=
195
            "#RCL.INDEXING.PERIODIC">Periodic
196
            indexing</a></span></dt>
197
198
            <dd>
199
              <dl>
200
                <dt><span class="sect2">2.8.1. <a href=
181
                "#RCL.INDEXING.PERIODIC.EXEC">Running
201
                "#RCL.INDEXING.PERIODIC.EXEC">Running
182
                indexing</a></span></dt>
202
                indexing</a></span></dt>
183
203
184
                <dt><span class="sect2">2.7.2. <a href=
204
                <dt><span class="sect2">2.8.2. <a href=
185
                "#RCL.INDEXING.PERIODIC.AUTOMAT">Using <span class=
205
                "#RCL.INDEXING.PERIODIC.AUTOMAT">Using <span class=
186
                "command"><strong>cron</strong></span> to automate
206
                "command"><strong>cron</strong></span> to automate
187
                indexing</a></span></dt>
207
                indexing</a></span></dt>
188
              </dl>
208
              </dl>
189
            </dd>
209
            </dd>
190
210
191
            <dt><span class="sect1">2.8. <a href=
211
            <dt><span class="sect1">2.9. <a href=
192
            "#RCL.INDEXING.MONITOR">Real time
212
            "#RCL.INDEXING.MONITOR">Real time
193
            indexing</a></span></dt>
213
            indexing</a></span></dt>
194
214
195
            <dd>
215
            <dd>
196
              <dl>
216
              <dl>
197
                <dt><span class="sect2">2.8.1. <a href=
217
                <dt><span class="sect2">2.9.1. <a href=
198
                "#RCL.INDEXING.MONITOR.FASTFILES">Slowing down the
218
                "#RCL.INDEXING.MONITOR.FASTFILES">Slowing down the
199
                reindexing rate for fast changing
219
                reindexing rate for fast changing
200
                files</a></span></dt>
220
                files</a></span></dt>
201
              </dl>
221
              </dl>
202
            </dd>
222
            </dd>
...
...
766
        "command"><strong>recoll</strong></span> GUI are stored in
786
        "command"><strong>recoll</strong></span> GUI are stored in
767
        the standard location defined by <span class=
787
        the standard location defined by <span class=
768
        "application">Qt</span>.</p>
788
        "application">Qt</span>.</p>
769
789
770
        <p>The <a class="link" href="#RCL.INDEXING.PERIODIC.EXEC"
790
        <p>The <a class="link" href="#RCL.INDEXING.PERIODIC.EXEC"
771
        title="2.7.1.&nbsp;Running indexing">indexing process</a>
791
        title="2.8.1.&nbsp;Running indexing">indexing process</a>
772
        is started automatically the first time you execute the
792
        is started automatically the first time you execute the
773
        <span class="command"><strong>recoll</strong></span> GUI.
793
        <span class="command"><strong>recoll</strong></span> GUI.
774
        Indexing can also be performed by executing the
794
        Indexing can also be performed by executing the
775
        <span class="command"><strong>recollindex</strong></span>
795
        <span class="command"><strong>recollindex</strong></span>
776
        command. <span class="application">Recoll</span> indexing
796
        command. <span class="application">Recoll</span> indexing
...
...
877
          <div class="itemizedlist">
897
          <div class="itemizedlist">
878
            <ul class="itemizedlist" style=
898
            <ul class="itemizedlist" style=
879
            "list-style-type: disc;">
899
            "list-style-type: disc;">
880
              <li class="listitem">
900
              <li class="listitem">
881
                <p><b><a class="link" href="#RCL.INDEXING.PERIODIC"
901
                <p><b><a class="link" href="#RCL.INDEXING.PERIODIC"
882
                title="2.7.&nbsp;Periodic indexing">Periodic (or
902
                title="2.8.&nbsp;Periodic indexing">Periodic (or
883
                batch) indexing:</a>&nbsp;</b>indexing takes place
903
                batch) indexing:</a>&nbsp;</b>indexing takes place
884
                at discrete times, by executing the <span class=
904
                at discrete times, by executing the <span class=
885
                "command"><strong>recollindex</strong></span>
905
                "command"><strong>recollindex</strong></span>
886
                command. The typical usage is to have a nightly
906
                command. The typical usage is to have a nightly
887
                indexing run <a class="link" href=
907
                indexing run <a class="link" href=
888
                "#RCL.INDEXING.PERIODIC.AUTOMAT" title=
908
                "#RCL.INDEXING.PERIODIC.AUTOMAT" title=
889
                "2.7.2.&nbsp;Using cron to automate indexing">programmed</a>
909
                "2.8.2.&nbsp;Using cron to automate indexing">programmed</a>
890
                into your <span class=
910
                into your <span class=
891
                "command"><strong>cron</strong></span> file.</p>
911
                "command"><strong>cron</strong></span> file.</p>
892
              </li>
912
              </li>
893
913
894
              <li class="listitem">
914
              <li class="listitem">
895
                <p><b><a class="link" href="#RCL.INDEXING.MONITOR"
915
                <p><b><a class="link" href="#RCL.INDEXING.MONITOR"
896
                title="2.8.&nbsp;Real time indexing">Real time
916
                title="2.9.&nbsp;Real time indexing">Real time
897
                indexing:</a>&nbsp;</b>indexing takes place as soon
917
                indexing:</a>&nbsp;</b>indexing takes place as soon
898
                as a file is created or changed. <span class=
918
                as a file is created or changed. <span class=
899
                "command"><strong>recollindex</strong></span> runs
919
                "command"><strong>recollindex</strong></span> runs
900
                as a daemon and uses a file system alteration
920
                as a daemon and uses a file system alteration
901
                monitor such as <span class=
921
                monitor such as <span class=
...
...
995
1015
996
        <div class="sect2">
1016
        <div class="sect2">
997
          <div class="titlepage">
1017
          <div class="titlepage">
998
            <div>
1018
            <div>
999
              <div>
1019
              <div>
1000
                <h3 class="title"><a name="idp43099712" id=
1020
                <h3 class="title"><a name="idp40818624" id=
1001
                "idp43099712"></a>2.1.3.&nbsp;Document types</h3>
1021
                "idp40818624"></a>2.1.3.&nbsp;Document types</h3>
1002
              </div>
1022
              </div>
1003
            </div>
1023
            </div>
1004
          </div>
1024
          </div>
1005
1025
1006
          <p><span class="application">Recoll</span> knows about
1026
          <p><span class="application">Recoll</span> knows about
...
...
1109
1129
1110
        <div class="sect2">
1130
        <div class="sect2">
1111
          <div class="titlepage">
1131
          <div class="titlepage">
1112
            <div>
1132
            <div>
1113
              <div>
1133
              <div>
1114
                <h3 class="title"><a name="idp43124208" id=
1134
                <h3 class="title"><a name="idp40843200" id=
1115
                "idp43124208"></a>2.1.4.&nbsp;Indexing
1135
                "idp40843200"></a>2.1.4.&nbsp;Indexing
1116
                failures</h3>
1136
                failures</h3>
1117
              </div>
1137
              </div>
1118
            </div>
1138
            </div>
1119
          </div>
1139
          </div>
1120
1140
...
...
1150
1170
1151
        <div class="sect2">
1171
        <div class="sect2">
1152
          <div class="titlepage">
1172
          <div class="titlepage">
1153
            <div>
1173
            <div>
1154
              <div>
1174
              <div>
1155
                <h3 class="title"><a name="idp43131216" id=
1175
                <h3 class="title"><a name="idp40850208" id=
1156
                "idp43131216"></a>2.1.5.&nbsp;Recovery</h3>
1176
                "idp40850208"></a>2.1.5.&nbsp;Recovery</h3>
1157
              </div>
1177
              </div>
1158
            </div>
1178
            </div>
1159
          </div>
1179
          </div>
1160
1180
1161
          <p>In the rare case where the index becomes corrupted
1181
          <p>In the rare case where the index becomes corrupted
...
...
1914
      <div class="sect1">
1934
      <div class="sect1">
1915
        <div class="titlepage">
1935
        <div class="titlepage">
1916
          <div>
1936
          <div>
1917
            <div>
1937
            <div>
1918
              <h2 class="title" style="clear: both"><a name=
1938
              <h2 class="title" style="clear: both"><a name=
1939
              "RCL.INDEXING.PDF" id=
1940
              "RCL.INDEXING.PDF"></a>2.7.&nbsp;The PDF input
1941
              handler</h2>
1942
            </div>
1943
          </div>
1944
        </div>
1945
1946
        <p>The PDF format is very important for scientific and
1947
        technical documentation, and document archival. It has
1948
        extensive facilities for storing metadata along with the
1949
        document, and these facilities are actually used in the
1950
        real world.</p>
1951
1952
        <p>In consequence, the <code class=
1953
        "filename">rclpdf.py</code> PDF input handler has more
1954
        complex capabilities than most others, and it is also more
1955
        configurable. Specifically, <code class=
1956
        "filename">rclpdf.py</code> can automatically use
1957
        <span class="application">tesseract</span> to perform OCR
1958
        if the document text is empty, it can be configured to
1959
        extract specific metadata tags from an XMP packet, and to
1960
        extract PDF attachments.</p>
1961
1962
        <div class="sect2">
1963
          <div class="titlepage">
1964
            <div>
1965
              <div>
1966
                <h3 class="title"><a name="RCL.INDEXING.PDF.OCR"
1967
                id="RCL.INDEXING.PDF.OCR"></a>2.7.1.&nbsp;OCR with
1968
                Tesseract</h3>
1969
              </div>
1970
            </div>
1971
          </div>
1972
1973
          <p>If both <span class="application">tesseract</span> and
1974
          <span class="command"><strong>pdftoppm</strong></span>
1975
          (generally from the <span class=
1976
          "application">poppler-utils</span> package) are
1977
          installed, the PDF handler may attempt OCR on PDF files
1978
          with no text content. This is controlled by the <a class=
1979
          "link" href=
1980
          "#RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">pdfocr</a>
1981
          configuration variable, which is false by default because
1982
          OCR is very slow.</p>
1983
1984
          <p>The choice of language is very important for
1985
          successful OCR. Recoll currently has no way to determine
1986
          this from the document itself. You can set the language
1987
          to use through the contents of a <code class=
1988
          "filename">.ocrpdflang</code> text file in the same
1989
          directory as the PDF document, or through the
1990
          <code class="envar">RECOLL_TESSERACT_LANG</code>
1991
          environment variable, or through the contents of an
1992
          <code class="filename">ocrpdf</code> text file inside the
1993
          configuration directory. If none of the above are used,
1994
          <span class="application">Recoll</span> will try to guess
1995
          the language from the NLS environment.</p>
1996
        </div>
1997
1998
        <div class="sect2">
1999
          <div class="titlepage">
2000
            <div>
2001
              <div>
2002
                <h3 class="title"><a name="RCL.INDEXING.PDF.XMP"
2003
                id="RCL.INDEXING.PDF.XMP"></a>2.7.2.&nbsp;XMP
2004
                fields extraction</h3>
2005
              </div>
2006
            </div>
2007
          </div>
2008
2009
          <p>The <code class="filename">rclpdf.py</code> script in
2010
          <span class="application">Recoll</span> version 1.23.2
2011
          and later can extract XMP metadata fields by executing
2012
          the <span class="command"><strong>pdfinfo</strong></span>
2013
          command (usually found with <span class=
2014
          "application">poppler-utils</span>). This is controlled
2015
          by the <a class="link" href=
2016
          "#RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETA">pdfextrameta</a>
2017
          configuration variable, which specifies which tags to
2018
          extract and, possibly, how to rename them.</p>
2019
2020
          <p>The <a class="link" href=
2021
          "#RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETAFIX">pdfextrametafix</a>
2022
          variable can be used to designate a file with Python code
2023
          to edit the metadata fields (available for <span class=
2024
          "application">Recoll</span> 1.23.3 and later. 1.23.2 has
2025
          equivalent code inside the handler script). Example:</p>
2026
          <pre class="programlisting">
2027
import sys
2028
import re
2029
2030
class MetaFixer(object):
2031
    def __init__(self):
2032
        pass
2033
2034
    def metafix(self, nm, txt):
2035
        if nm == 'bibtex:pages':
2036
            txt = re.sub(r'--', '-', txt)
2037
        elif nm == 'someothername':
2038
            # do something else
2039
            pass
2040
        elif nm == 'stillanother':
2041
            # etc.
2042
            pass
2043
    
2044
        return txt
2045
        
2046
</pre>
2047
        </div>
2048
2049
        <div class="sect2">
2050
          <div class="titlepage">
2051
            <div>
2052
              <div>
2053
                <h3 class="title"><a name="RCL.INDEXING.PDF.ATTACH"
2054
                id="RCL.INDEXING.PDF.ATTACH"></a>2.7.3.&nbsp;PDF
2055
                attachment indexing</h3>
2056
              </div>
2057
            </div>
2058
          </div>
2059
2060
          <p>If <span class="application">pdftk</span> is
2061
          installed, and if the <a class="link" href=
2062
          "#RCL.INSTALL.CONFIG.RECOLLCONF.PDFATTACH">pdfattach</a>
2063
          configuration variable is set, the PDF input handler will
2064
          try to extract PDF attachments for indexing as
2065
          sub-documents of the PDF file. This is disabled by
2066
          default, because it slows down PDF indexing a bit even if
2067
          not one attachment is ever found (PDF attachments are
2068
          uncommon in my experience).</p>
2069
        </div>
2070
      </div>
2071
2072
      <div class="sect1">
2073
        <div class="titlepage">
2074
          <div>
2075
            <div>
2076
              <h2 class="title" style="clear: both"><a name=
1919
              "RCL.INDEXING.PERIODIC" id=
2077
              "RCL.INDEXING.PERIODIC" id=
1920
              "RCL.INDEXING.PERIODIC"></a>2.7.&nbsp;Periodic
2078
              "RCL.INDEXING.PERIODIC"></a>2.8.&nbsp;Periodic
1921
              indexing</h2>
2079
              indexing</h2>
1922
            </div>
2080
            </div>
1923
          </div>
2081
          </div>
1924
        </div>
2082
        </div>
1925
2083
...
...
1927
          <div class="titlepage">
2085
          <div class="titlepage">
1928
            <div>
2086
            <div>
1929
              <div>
2087
              <div>
1930
                <h3 class="title"><a name=
2088
                <h3 class="title"><a name=
1931
                "RCL.INDEXING.PERIODIC.EXEC" id=
2089
                "RCL.INDEXING.PERIODIC.EXEC" id=
1932
                "RCL.INDEXING.PERIODIC.EXEC"></a>2.7.1.&nbsp;Running
2090
                "RCL.INDEXING.PERIODIC.EXEC"></a>2.8.1.&nbsp;Running
1933
                indexing</h3>
2091
                indexing</h3>
1934
              </div>
2092
              </div>
1935
            </div>
2093
            </div>
1936
          </div>
2094
          </div>
1937
2095
...
...
2035
          <div class="titlepage">
2193
          <div class="titlepage">
2036
            <div>
2194
            <div>
2037
              <div>
2195
              <div>
2038
                <h3 class="title"><a name=
2196
                <h3 class="title"><a name=
2039
                "RCL.INDEXING.PERIODIC.AUTOMAT" id=
2197
                "RCL.INDEXING.PERIODIC.AUTOMAT" id=
2040
                "RCL.INDEXING.PERIODIC.AUTOMAT"></a>2.7.2.&nbsp;Using
2198
                "RCL.INDEXING.PERIODIC.AUTOMAT"></a>2.8.2.&nbsp;Using
2041
                <span class="command"><strong>cron</strong></span>
2199
                <span class="command"><strong>cron</strong></span>
2042
                to automate indexing</h3>
2200
                to automate indexing</h3>
2043
              </div>
2201
              </div>
2044
            </div>
2202
            </div>
2045
          </div>
2203
          </div>
...
...
2093
        <div class="titlepage">
2251
        <div class="titlepage">
2094
          <div>
2252
          <div>
2095
            <div>
2253
            <div>
2096
              <h2 class="title" style="clear: both"><a name=
2254
              <h2 class="title" style="clear: both"><a name=
2097
              "RCL.INDEXING.MONITOR" id=
2255
              "RCL.INDEXING.MONITOR" id=
2098
              "RCL.INDEXING.MONITOR"></a>2.8.&nbsp;Real time
2256
              "RCL.INDEXING.MONITOR"></a>2.9.&nbsp;Real time
2099
              indexing</h2>
2257
              indexing</h2>
2100
            </div>
2258
            </div>
2101
          </div>
2259
          </div>
2102
        </div>
2260
        </div>
2103
2261
...
...
2223
          <div class="titlepage">
2381
          <div class="titlepage">
2224
            <div>
2382
            <div>
2225
              <div>
2383
              <div>
2226
                <h3 class="title"><a name=
2384
                <h3 class="title"><a name=
2227
                "RCL.INDEXING.MONITOR.FASTFILES" id=
2385
                "RCL.INDEXING.MONITOR.FASTFILES" id=
2228
                "RCL.INDEXING.MONITOR.FASTFILES"></a>2.8.1.&nbsp;Slowing
2386
                "RCL.INDEXING.MONITOR.FASTFILES"></a>2.9.1.&nbsp;Slowing
2229
                down the reindexing rate for fast changing
2387
                down the reindexing rate for fast changing
2230
                files</h3>
2388
                files</h3>
2231
              </div>
2389
              </div>
2232
            </div>
2390
            </div>
2233
          </div>
2391
          </div>
...
...
9846
                <p>Enable PDF attachment extraction by executing
10004
                <p>Enable PDF attachment extraction by executing
9847
                pdftk (if available). This is normally disabled,
10005
                pdftk (if available). This is normally disabled,
9848
                because it does slow down PDF indexing a bit even
10006
                because it does slow down PDF indexing a bit even
9849
                if not one attachment is ever found.</p>
10007
                if not one attachment is ever found.</p>
9850
              </dd>
10008
              </dd>
10009
10010
              <dt><a name=
10011
              "RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETA" id=
10012
              "RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETA"></a><span class="term"><code class="varname">pdfextrameta</code></span></dt>
10013
10014
              <dd>
10015
                <p>Extract text from selected XMP metadata tags.
10016
                This is a space-separated list of qualified XMP tag
10017
                names. Each element can also include a translation
10018
                to a Recoll field name, separated by a '|'
10019
                character. If the second element is absent, the tag
10020
                name is used as the Recoll field name. You will
10021
                also need to add specifications to the 'fields'
10022
                file to direct processing of the extracted
10023
                data.</p>
10024
              </dd>
10025
10026
              <dt><a name=
10027
              "RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETAFIX" id=
10028
              "RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETAFIX"></a><span class="term"><code class="varname">pdfextrametafix</code></span></dt>
10029
10030
              <dd>
10031
                <p>Define name of XMP field editing script. This
10032
                defines the name of a script to be loaded for
10033
                editing XMP field values. The script should define
10034
                a 'MetaFixer' class with a metafix() method which
10035
                will be called with the qualified tag name and
10036
                value of each selected field, for editing or
10037
                erasing. A new instance is created for each
10038
                document, so that the object can keep state for,
10039
                e.g. eliminating duplicate values.</p>
10040
              </dd>
9851
            </dl>
10041
            </dl>
9852
          </div>
10042
          </div>
9853
10043
9854
          <div class="sect3">
10044
          <div class="sect3">
9855
            <div class="titlepage">
10045
            <div class="titlepage">