a/website/recoll_XMP/index.html b/website/recoll_XMP/index.html
1
<html>
1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
2
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
3
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
2
<head>
4
<head>

5
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
6
<meta name="generator" content="AsciiDoc 8.6.9" />
3
<title>Indexing PDF XMP-metadata</title>
7
<title>Indexing PDF XMP-metadata with Recoll</title>

8
<style type="text/css">
9
/* Shared CSS for AsciiDoc xhtml11 and html5 backends */
10
11
/* Default font. */
12
body {
13
  font-family: Georgia,serif;
14
}
15
16
/* Title font. */
17
h1, h2, h3, h4, h5, h6,
18
div.title, caption.title,
19
thead, p.table.header,
20
#toctitle,
21
#author, #revnumber, #revdate, #revremark,
22
#footer {
23
  font-family: Arial,Helvetica,sans-serif;
24
}
25
26
body {
27
  margin: 1em 5% 1em 5%;
28
}
29
30
a {
31
  color: blue;
32
  text-decoration: underline;
33
}
34
a:visited {
35
  color: fuchsia;
36
}
37
38
em {
39
  font-style: italic;
40
  color: navy;
41
}
42
43
strong {
44
  font-weight: bold;
45
  color: #083194;
46
}
47
48
h1, h2, h3, h4, h5, h6 {
49
  color: #527bbd;
50
  margin-top: 1.2em;
51
  margin-bottom: 0.5em;
52
  line-height: 1.3;
53
}
54
55
h1, h2, h3 {
56
  border-bottom: 2px solid silver;
57
}
58
h2 {
59
  padding-top: 0.5em;
60
}
61
h3 {
62
  float: left;
63
}
64
h3 + * {
65
  clear: left;
66
}
67
h5 {
68
  font-size: 1.0em;
69
}
70
71
div.sectionbody {
72
  margin-left: 0;
73
}
74
75
hr {
76
  border: 1px solid silver;
77
}
78
79
p {
80
  margin-top: 0.5em;
81
  margin-bottom: 0.5em;
82
}
83
84
ul, ol, li > p {
85
  margin-top: 0;
86
}
87
ul > li     { color: #aaa; }
88
ul > li > * { color: black; }
89
90
.monospaced, code, pre {
91
  font-family: "Courier New", Courier, monospace;
92
  font-size: inherit;
93
  color: navy;
94
  padding: 0;
95
  margin: 0;
96
}
97
pre {
98
  white-space: pre-wrap;
99
}
100
101
#author {
102
  color: #527bbd;
103
  font-weight: bold;
104
  font-size: 1.1em;
105
}
106
#email {
107
}
108
#revnumber, #revdate, #revremark {
109
}
110
111
#footer {
112
  font-size: small;
113
  border-top: 2px solid silver;
114
  padding-top: 0.5em;
115
  margin-top: 4.0em;
116
}
117
#footer-text {
118
  float: left;
119
  padding-bottom: 0.5em;
120
}
121
#footer-badges {
122
  float: right;
123
  padding-bottom: 0.5em;
124
}
125
126
#preamble {
127
  margin-top: 1.5em;
128
  margin-bottom: 1.5em;
129
}
130
div.imageblock, div.exampleblock, div.verseblock,
131
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
132
div.admonitionblock {
133
  margin-top: 1.0em;
134
  margin-bottom: 1.5em;
135
}
136
div.admonitionblock {
137
  margin-top: 2.0em;
138
  margin-bottom: 2.0em;
139
  margin-right: 10%;
140
  color: #606060;
141
}
142
143
div.content { /* Block element content. */
144
  padding: 0;
145
}
146
147
/* Block element titles. */
148
div.title, caption.title {
149
  color: #527bbd;
150
  font-weight: bold;
151
  text-align: left;
152
  margin-top: 1.0em;
153
  margin-bottom: 0.5em;
154
}
155
div.title + * {
156
  margin-top: 0;
157
}
158
159
td div.title:first-child {
160
  margin-top: 0.0em;
161
}
162
div.content div.title:first-child {
163
  margin-top: 0.0em;
164
}
165
div.content + div.title {
166
  margin-top: 0.0em;
167
}
168
169
div.sidebarblock > div.content {
170
  background: #ffffee;
171
  border: 1px solid #dddddd;
172
  border-left: 4px solid #f0f0f0;
173
  padding: 0.5em;
174
}
175
176
div.listingblock > div.content {
177
  border: 1px solid #dddddd;
178
  border-left: 5px solid #f0f0f0;
179
  background: #f8f8f8;
180
  padding: 0.5em;
181
}
182
183
div.quoteblock, div.verseblock {
184
  padding-left: 1.0em;
185
  margin-left: 1.0em;
186
  margin-right: 10%;
187
  border-left: 5px solid #f0f0f0;
188
  color: #888;
189
}
190
191
div.quoteblock > div.attribution {
192
  padding-top: 0.5em;
193
  text-align: right;
194
}
195
196
div.verseblock > pre.content {
197
  font-family: inherit;
198
  font-size: inherit;
199
}
200
div.verseblock > div.attribution {
201
  padding-top: 0.75em;
202
  text-align: left;
203
}
204
/* DEPRECATED: Pre version 8.2.7 verse style literal block. */
205
div.verseblock + div.attribution {
206
  text-align: left;
207
}
208
209
div.admonitionblock .icon {
210
  vertical-align: top;
211
  font-size: 1.1em;
212
  font-weight: bold;
213
  text-decoration: underline;
214
  color: #527bbd;
215
  padding-right: 0.5em;
216
}
217
div.admonitionblock td.content {
218
  padding-left: 0.5em;
219
  border-left: 3px solid #dddddd;
220
}
221
222
div.exampleblock > div.content {
223
  border-left: 3px solid #dddddd;
224
  padding-left: 0.5em;
225
}
226
227
div.imageblock div.content { padding-left: 0; }
228
span.image img { border-style: none; vertical-align: text-bottom; }
229
a.image:visited { color: white; }
230
231
dl {
232
  margin-top: 0.8em;
233
  margin-bottom: 0.8em;
234
}
235
dt {
236
  margin-top: 0.5em;
237
  margin-bottom: 0;
238
  font-style: normal;
239
  color: navy;
240
}
241
dd > *:first-child {
242
  margin-top: 0.1em;
243
}
244
245
ul, ol {
246
    list-style-position: outside;
247
}
248
ol.arabic {
249
  list-style-type: decimal;
250
}
251
ol.loweralpha {
252
  list-style-type: lower-alpha;
253
}
254
ol.upperalpha {
255
  list-style-type: upper-alpha;
256
}
257
ol.lowerroman {
258
  list-style-type: lower-roman;
259
}
260
ol.upperroman {
261
  list-style-type: upper-roman;
262
}
263
264
div.compact ul, div.compact ol,
265
div.compact p, div.compact p,
266
div.compact div, div.compact div {
267
  margin-top: 0.1em;
268
  margin-bottom: 0.1em;
269
}
270
271
tfoot {
272
  font-weight: bold;
273
}
274
td > div.verse {
275
  white-space: pre;
276
}
277
278
div.hdlist {
279
  margin-top: 0.8em;
280
  margin-bottom: 0.8em;
281
}
282
div.hdlist tr {
283
  padding-bottom: 15px;
284
}
285
dt.hdlist1.strong, td.hdlist1.strong {
286
  font-weight: bold;
287
}
288
td.hdlist1 {
289
  vertical-align: top;
290
  font-style: normal;
291
  padding-right: 0.8em;
292
  color: navy;
293
}
294
td.hdlist2 {
295
  vertical-align: top;
296
}
297
div.hdlist.compact tr {
298
  margin: 0;
299
  padding-bottom: 0;
300
}
301
302
.comment {
303
  background: yellow;
304
}
305
306
.footnote, .footnoteref {
307
  font-size: 0.8em;
308
}
309
310
span.footnote, span.footnoteref {
311
  vertical-align: super;
312
}
313
314
#footnotes {
315
  margin: 20px 0 20px 0;
316
  padding: 7px 0 0 0;
317
}
318
319
#footnotes div.footnote {
320
  margin: 0 0 5px 0;
321
}
322
323
#footnotes hr {
324
  border: none;
325
  border-top: 1px solid silver;
326
  height: 1px;
327
  text-align: left;
328
  margin-left: 0;
329
  width: 20%;
330
  min-width: 100px;
331
}
332
333
div.colist td {
334
  padding-right: 0.5em;
335
  padding-bottom: 0.3em;
336
  vertical-align: top;
337
}
338
div.colist td img {
339
  margin-top: 0.3em;
340
}
341
342
@media print {
343
  #footer-badges { display: none; }
344
}
345
346
#toc {
347
  margin-bottom: 2.5em;
348
}
349
350
#toctitle {
351
  color: #527bbd;
352
  font-size: 1.1em;
353
  font-weight: bold;
354
  margin-top: 1.0em;
355
  margin-bottom: 0.1em;
356
}
357
358
div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
359
  margin-top: 0;
360
  margin-bottom: 0;
361
}
362
div.toclevel2 {
363
  margin-left: 2em;
364
  font-size: 0.9em;
365
}
366
div.toclevel3 {
367
  margin-left: 4em;
368
  font-size: 0.9em;
369
}
370
div.toclevel4 {
371
  margin-left: 6em;
372
  font-size: 0.9em;
373
}
374
375
span.aqua { color: aqua; }
376
span.black { color: black; }
377
span.blue { color: blue; }
378
span.fuchsia { color: fuchsia; }
379
span.gray { color: gray; }
380
span.green { color: green; }
381
span.lime { color: lime; }
382
span.maroon { color: maroon; }
383
span.navy { color: navy; }
384
span.olive { color: olive; }
385
span.purple { color: purple; }
386
span.red { color: red; }
387
span.silver { color: silver; }
388
span.teal { color: teal; }
389
span.white { color: white; }
390
span.yellow { color: yellow; }
391
392
span.aqua-background { background: aqua; }
393
span.black-background { background: black; }
394
span.blue-background { background: blue; }
395
span.fuchsia-background { background: fuchsia; }
396
span.gray-background { background: gray; }
397
span.green-background { background: green; }
398
span.lime-background { background: lime; }
399
span.maroon-background { background: maroon; }
400
span.navy-background { background: navy; }
401
span.olive-background { background: olive; }
402
span.purple-background { background: purple; }
403
span.red-background { background: red; }
404
span.silver-background { background: silver; }
405
span.teal-background { background: teal; }
406
span.white-background { background: white; }
407
span.yellow-background { background: yellow; }
408
409
span.big { font-size: 2em; }
410
span.small { font-size: 0.6em; }
411
412
span.underline { text-decoration: underline; }
413
span.overline { text-decoration: overline; }
414
span.line-through { text-decoration: line-through; }
415
416
div.unbreakable { page-break-inside: avoid; }
417
418
419
/*
420
 * xhtml11 specific
421
 *
422
 * */
423
424
div.tableblock {
425
  margin-top: 1.0em;
426
  margin-bottom: 1.5em;
427
}
428
div.tableblock > table {
429
  border: 3px solid #527bbd;
430
}
431
thead, p.table.header {
432
  font-weight: bold;
433
  color: #527bbd;
434
}
435
p.table {
436
  margin-top: 0;
437
}
438
/* Because the table frame attribute is overridden by CSS in most browsers. */
439
div.tableblock > table[frame="void"] {
440
  border-style: none;
441
}
442
div.tableblock > table[frame="hsides"] {
443
  border-left-style: none;
444
  border-right-style: none;
445
}
446
div.tableblock > table[frame="vsides"] {
447
  border-top-style: none;
448
  border-bottom-style: none;
449
}
450
451
452
/*
453
 * html5 specific
454
 *
455
 * */
456
457
table.tableblock {
458
  margin-top: 1.0em;
459
  margin-bottom: 1.5em;
460
}
461
thead, p.tableblock.header {
462
  font-weight: bold;
463
  color: #527bbd;
464
}
465
p.tableblock {
466
  margin-top: 0;
467
}
468
table.tableblock {
469
  border-width: 3px;
470
  border-spacing: 0px;
471
  border-style: solid;
472
  border-color: #527bbd;
473
  border-collapse: collapse;
474
}
475
th.tableblock, td.tableblock {
476
  border-width: 1px;
477
  padding: 4px;
478
  border-style: solid;
479
  border-color: #527bbd;
480
}
481
482
table.tableblock.frame-topbot {
483
  border-left-style: hidden;
484
  border-right-style: hidden;
485
}
486
table.tableblock.frame-sides {
487
  border-top-style: hidden;
488
  border-bottom-style: hidden;
489
}
490
table.tableblock.frame-none {
491
  border-style: hidden;
492
}
493
494
th.tableblock.halign-left, td.tableblock.halign-left {
495
  text-align: left;
496
}
497
th.tableblock.halign-center, td.tableblock.halign-center {
498
  text-align: center;
499
}
500
th.tableblock.halign-right, td.tableblock.halign-right {
501
  text-align: right;
502
}
503
504
th.tableblock.valign-top, td.tableblock.valign-top {
505
  vertical-align: top;
506
}
507
th.tableblock.valign-middle, td.tableblock.valign-middle {
508
  vertical-align: middle;
509
}
510
th.tableblock.valign-bottom, td.tableblock.valign-bottom {
511
  vertical-align: bottom;
512
}
513
514
515
/*
516
 * manpage specific
517
 *
518
 * */
519
520
body.manpage h1 {
521
  padding-top: 0.5em;
522
  padding-bottom: 0.5em;
523
  border-top: 2px solid silver;
524
  border-bottom: 2px solid silver;
525
}
526
body.manpage h2 {
527
  border-style: none;
528
}
529
body.manpage div.sectionbody {
530
  margin-left: 3em;
531
}
532
533
@media print {
534
  body.manpage div#toc { display: none; }
535
}
536
537
538
</style>
539
<script type="text/javascript">
540
/*<![CDATA[*/
541
var asciidoc = {  // Namespace.
542
543
/////////////////////////////////////////////////////////////////////
544
// Table Of Contents generator
545
/////////////////////////////////////////////////////////////////////
546
547
/* Author: Mihai Bazon, September 2002
548
 * http://students.infoiasi.ro/~mishoo
549
 *
550
 * Table Of Content generator
551
 * Version: 0.4
552
 *
553
 * Feel free to use this script under the terms of the GNU General Public
554
 * License, as long as you do not remove or alter this notice.
555
 */
556
557
 /* modified by Troy D. Hanson, September 2006. License: GPL */
558
 /* modified by Stuart Rackham, 2006, 2009. License: GPL */
559
560
// toclevels = 1..4.
561
toc: function (toclevels) {
562
563
  function getText(el) {
564
    var text = "";
565
    for (var i = el.firstChild; i != null; i = i.nextSibling) {
566
      if (i.nodeType == 3 /* Node.TEXT_NODE */) // IE doesn't speak constants.
567
        text += i.data;
568
      else if (i.firstChild != null)
569
        text += getText(i);
570
    }
571
    return text;
572
  }
573
574
  function TocEntry(el, text, toclevel) {
575
    this.element = el;
576
    this.text = text;
577
    this.toclevel = toclevel;
578
  }
579
580
  function tocEntries(el, toclevels) {
581
    var result = new Array;
582
    var re = new RegExp('[hH]([1-'+(toclevels+1)+'])');
583
    // Function that scans the DOM tree for header elements (the DOM2
584
    // nodeIterator API would be a better technique but not supported by all
585
    // browsers).
586
    var iterate = function (el) {
587
      for (var i = el.firstChild; i != null; i = i.nextSibling) {
588
        if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
589
          var mo = re.exec(i.tagName);
590
          if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
591
            result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
592
          }
593
          iterate(i);
594
        }
595
      }
596
    }
597
    iterate(el);
598
    return result;
599
  }
600
601
  var toc = document.getElementById("toc");
602
  if (!toc) {
603
    return;
604
  }
605
606
  // Delete existing TOC entries in case we're reloading the TOC.
607
  var tocEntriesToRemove = [];
608
  var i;
609
  for (i = 0; i < toc.childNodes.length; i++) {
610
    var entry = toc.childNodes[i];
611
    if (entry.nodeName.toLowerCase() == 'div'
612
     && entry.getAttribute("class")
613
     && entry.getAttribute("class").match(/^toclevel/))
614
      tocEntriesToRemove.push(entry);
615
  }
616
  for (i = 0; i < tocEntriesToRemove.length; i++) {
617
    toc.removeChild(tocEntriesToRemove[i]);
618
  }
619
620
  // Rebuild TOC entries.
621
  var entries = tocEntries(document.getElementById("content"), toclevels);
622
  for (var i = 0; i < entries.length; ++i) {
623
    var entry = entries[i];
624
    if (entry.element.id == "")
625
      entry.element.id = "_toc_" + i;
626
    var a = document.createElement("a");
627
    a.href = "#" + entry.element.id;
628
    a.appendChild(document.createTextNode(entry.text));
629
    var div = document.createElement("div");
630
    div.appendChild(a);
631
    div.className = "toclevel" + entry.toclevel;
632
    toc.appendChild(div);
633
  }
634
  if (entries.length == 0)
635
    toc.parentNode.removeChild(toc);
636
},
637
638
639
/////////////////////////////////////////////////////////////////////
640
// Footnotes generator
641
/////////////////////////////////////////////////////////////////////
642
643
/* Based on footnote generation code from:
644
 * http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
645
 */
646
647
footnotes: function () {
648
  // Delete existing footnote entries in case we're reloading the footnodes.
649
  var i;
650
  var noteholder = document.getElementById("footnotes");
651
  if (!noteholder) {
652
    return;
653
  }
654
  var entriesToRemove = [];
655
  for (i = 0; i < noteholder.childNodes.length; i++) {
656
    var entry = noteholder.childNodes[i];
657
    if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
658
      entriesToRemove.push(entry);
659
  }
660
  for (i = 0; i < entriesToRemove.length; i++) {
661
    noteholder.removeChild(entriesToRemove[i]);
662
  }
663
664
  // Rebuild footnote entries.
665
  var cont = document.getElementById("content");
666
  var spans = cont.getElementsByTagName("span");
667
  var refs = {};
668
  var n = 0;
669
  for (i=0; i<spans.length; i++) {
670
    if (spans[i].className == "footnote") {
671
      n++;
672
      var note = spans[i].getAttribute("data-note");
673
      if (!note) {
674
        // Use [\s\S] in place of . so multi-line matches work.
675
        // Because JavaScript has no s (dotall) regex flag.
676
        note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
677
        spans[i].innerHTML =
678
          "[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
679
          "' title='View footnote' class='footnote'>" + n + "</a>]";
680
        spans[i].setAttribute("data-note", note);
681
      }
682
      noteholder.innerHTML +=
683
        "<div class='footnote' id='_footnote_" + n + "'>" +
684
        "<a href='#_footnoteref_" + n + "' title='Return to text'>" +
685
        n + "</a>. " + note + "</div>";
686
      var id =spans[i].getAttribute("id");
687
      if (id != null) refs["#"+id] = n;
688
    }
689
  }
690
  if (n == 0)
691
    noteholder.parentNode.removeChild(noteholder);
692
  else {
693
    // Process footnoterefs.
694
    for (i=0; i<spans.length; i++) {
695
      if (spans[i].className == "footnoteref") {
696
        var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
697
        href = href.match(/#.*/)[0];  // Because IE return full URL.
698
        n = refs[href];
699
        spans[i].innerHTML =
700
          "[<a href='#_footnote_" + n +
701
          "' title='View footnote' class='footnote'>" + n + "</a>]";
702
      }
703
    }
704
  }
705
},
706
707
install: function(toclevels) {
708
  var timerId;
709
710
  function reinstall() {
711
    asciidoc.footnotes();
712
    if (toclevels) {
713
      asciidoc.toc(toclevels);
714
    }
715
  }
716
717
  function reinstallAndRemoveTimer() {
718
    clearInterval(timerId);
719
    reinstall();
720
  }
721
722
  timerId = setInterval(reinstall, 500);
723
  if (document.addEventListener)
724
    document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
725
  else
726
    window.onload = reinstallAndRemoveTimer;
727
}
728
729
}
730
asciidoc.install();
731
/*]]>*/
732
</script>
4
</head>
733
</head>

5
734
<body class="article">
6
<body>
735
<div id="header">
7
736
<h1>Indexing PDF XMP-metadata with Recoll</h1>
8
<h2>Introduction</h2>
737
</div>
9
<p>Organizing and searching a large collection of PDFs as part of a research project can be a demanding task.
738
<div id="content">
10
<a href="http://en.wikipedia.org/wiki/Extensible_Metadata_Platform">XMP metadata</a> stored in a PDF, such as journal title, publication year, and user-added keywords, are often useful when searching for a publication.
739
<div id="preamble">
11
Here, we describe the use of a custom Recoll filter to retrieve this metadata, an indexing configuration to store it, and result paragraph format to display it. See also a related wiki entry, <a href="https://bitbucket.org/medoc/recoll/wiki/HandleCustomField.wiki">Generating a custom field and using it to sort results</a>, for sorting results on PDF page count.
740
<div class="sectionbody">
12
741
<div class="paragraph"><p>The original document describing XMP metadata usage with Recoll was
13
<h2>Saving metadata to PDFs</h2>
742
written by Jeffrey Dick and is <a href="original-text.html">still available
14
<p>Bibliographic metadata can be saved in the PDF file itself. In the <a href="http://jabref.sourceforge.net">JabRef</a> bibliography manager, this is done with the "Write XMP-metadata to PDFs" menu item. Note the presence of the keywords in the screenshot below; this field is a good place to tag the PDF with any words of your choosing to describe genre, topic, etc.
743
here</a>. However, it described using the old shell-based PDF Recoll input
15
<p><img src="jabref_metadata.png">
744
handler, which differs a lot from doing something equivalent with the
16
745
current Python-based one (for which XMP capability is available from
17
<h2>Custom indexing (fields file)</h2>
746
recoll 1.23.2, but the new handler can be used with previous Recoll
18
<p>Let's create two fields named "year" and "journal". The prefixes starting with "XY" are extension prefixes that are added to the terms in the Xapian database (Recoll internally does not use prefixes starting with XY). Additionally, the year and journal are stored so they can be displayed in the results list. Some other types of metadata, such as title, author and keywords, are already indexed by Recoll (the default rclpdf finds them using the <b>pdftotext</b> command) so there is no need to add those to the [prefixes] section.
747
versions).</p></div>
19
<p>Add this text to the fields file in your Recoll configuration directory (<tt>~/.recoll/fields</tt>).
748
<div class="paragraph"><p>This page was adapted from the text by Jeffrey Dick, using input from
20
<pre>
749
Johannes Menzel, (especially the result list paragraph format),
21
[prefixes]
750
adapting things for the new handler. The discussion which led to the
751
updated handler is a
752
<a href="https://bitbucket.org/medoc/recoll/issues/300/extracting-xmp-metadata-and-tmsu-tags">Bitbucket
753
Recoll issue</a>.</p></div>
754
</div>
755
</div>
756
<div class="sect1">
757
<h2 id="_introduction">Introduction</h2>
758
<div class="sectionbody">
759
<div class="paragraph"><p>Organizing and searching a large collection of PDFs as part of a
760
research project can be a demanding task.
761
<a href="http://en.wikipedia.org/wiki/Extensible_Metadata_Platform">XMP
762
metadata</a> stored in a PDF, such as journal title, publication year,
763
and user-added keywords, are often useful when searching for a
764
publication.</p></div>
765
<div class="paragraph"><p>Here, we describe customizing Recoll to retrieve and store this metadata,
766
and defining a result paragraph format to display it. See also a related
767
wiki entry,
768
<a href="https://bitbucket.org/medoc/recoll/wiki/HandleCustomField.wiki">Generating
769
a custom field and using it to sort results</a>, for sorting results on PDF
770
page count.</p></div>
771
</div>
772
</div>
773
<div class="sect1">
774
<h2 id="_saving_metadata_to_pdfs">Saving metadata to PDFs</h2>
775
<div class="sectionbody">
776
<div class="paragraph"><p>Bibliographic metadata can be saved in the PDF file itself. In
777
the <a href="http://jabref.sourceforge.net">JabRef</a> bibliography
778
manager, this is done with the "Write XMP-metadata to PDFs" menu
779
item. Note the presence of the keywords in the screenshot below; this
780
field is a good place to tag the PDF with any words of your choosing
781
to describe genre, topic, etc.</p></div>
782
<div class="imageblock">
783
<div class="content">
784
<img src="jabref_metadata.png" alt="Editing metadata with jabref" />
785
</div>
786
</div>
787
</div>
788
</div>
789
<div class="sect1">
790
<h2 id="_custom_indexing_fields_file">Custom indexing (fields file)</h2>
791
<div class="sectionbody">
792
<div class="paragraph"><p>Let&#8217;s create two fields named "year" and "journal". The prefixes
793
starting with "XY" are extension prefixes that are added to the terms
794
in the Xapian database (Recoll internally does not use prefixes
795
starting with XY). Additionally, the year and journal are stored so
796
they can be displayed in the results list. Some other types of
797
metadata, such as title, author and keywords, are already indexed by
798
Recoll (the default rclpdf finds them using the <strong>pdftotext</strong>
799
command) so there is no need to add those to the [prefixes] section.</p></div>
800
<div class="paragraph"><p>Add this text to the fields file in your Recoll configuration
801
directory (<em>~/.recoll/fields</em>).</p></div>
802
<div class="listingblock">
803
<div class="content">
804
<pre><code>[prefixes]
22
year = XYEAR
805
year = XYEAR

23
journal = XYJOUR
806
journal = XYJOUR

24
807
25
[stored]
808
[stored]

26
year =
809
bibtex:year =
27
journal =
810
bibtex:journal =</code></pre>
811
</div></div>
812
</div>
813
</div>
814
<div class="sect1">
815
<h2 id="_telling_the_handler_what_fields_to_extract">Telling the handler what fields to extract</h2>
816
<div class="sectionbody">
817
<div class="paragraph"><p>As of Recoll 1.23.2, the PDF handler has the capability to use
818
<strong>pdfinfo</strong> for extracting XMP metadata. The switch for executing <strong>pdfinfo</strong>
819
is the <em>pdfextrameta</em> configuration parameter, and the value of the
820
parameter is a list of XMP tags to extract, with optional conversion
821
to Recoll field names (the XMP qualified tag name is kept by
822
default). Example:</p></div>
823
<div class="listingblock">
824
<div class="content">
825
<pre><code>pdfextrameta =  bibtex:year bibtex:journal bibtex:booktitle|title</code></pre>
826
</div></div>
827
<div class="paragraph"><p>Here, <em>bibtex:year</em> and <em>bibtex:journal</em> are used directly, and
828
<em>bibtex:booktitle</em> is translated to <em>title</em> (the example is not
829
supposed to make sense).</p></div>
830
</div>
831
</div>
832
<div class="sect1">
833
<h2 id="_editing_the_field_values">Editing the field values</h2>
834
<div class="sectionbody">
835
<div class="paragraph"><p>Shortly after the 1.23.2 release, the new rclpdf.py was modified to
836
enable calling external Python code for editing the values of the XMP
837
metadata fields. The name of the external script is defined by the
838
<em>pdfextrametafix</em> configuration variable, and it should define a
839
<em>MetaFixer</em> class, with a <em>metafix()</em> method.</p></div>
840
<div class="paragraph"><p>In practice, add the following to recoll.conf:</p></div>
841
<div class="listingblock">
842
<div class="content">
843
<pre><code>pdfextrametafix = /path/to/my/script.py</code></pre>
844
</div></div>
845
<div class="paragraph"><p>The Python script could look like the following:</p></div>
846
<div class="listingblock">
847
<div class="content">
848
<pre><code>import sys
849
import re
850
851
# This can be used for local XMP field editing.
852
#
853
# A new instance is created for each PDF document (so the object could
854
# keep state to avoid, e.g. duplicate values)
855
#
856
# The metafix method receives an (original) field name, and the text
857
# value, and should return the possibly modified text.
858
class MetaFixer(object):
859
    def __init__(self):
860
        pass
861
862
    def metafix(self, nm, txt):
863
        if nm == 'bibtex:pages':
864
            txt = re.sub(r'--', '-', txt)
865
        elif nm == 'someothername':
866
            # do something else
867
            pass
868
        elif nm == 'stillanother':
869
            # etc.
870
            pass
871
872
        return txt</code></pre>
873
</div></div>
874
</div>
875
</div>
876
<div class="sect1">
877
<h2 id="_indexing">Indexing</h2>
878
<div class="sectionbody">
879
<div class="paragraph"><p>Then index away!</p></div>
880
<div class="paragraph"><p>Note that you can also run the rclpdf.py script manually,
881
e.g. <code>rclpdf.py -d /path/to/some.pdf</code>, to inspect the
882
output. If things are working correctly, the &lt;head&gt; consists of the
883
HTML meta elements, and the &lt;body&gt; contains the text of the PDF.</p></div>
884
</div>
885
</div>
886
<div class="sect1">
887
<h2 id="_result_paragraph_format">Result paragraph format</h2>
888
<div class="sectionbody">
889
<div class="paragraph"><p>Here, the result is formatted to show the title, which is a link
890
to open the document, in blue with underlining turned off. The next
891
two lines contain the authors, then the journal title in green
892
italicized text followed by year (in parentheses). The keywords are
893
listed in red after the abstract/text snippet.</p></div>
894
<div class="paragraph"><p>Edit this using the Recoll GUI: Preferences &gt; GUI configuration &gt;
895
    Result List &gt; Edit result paragraph format string.</p></div>
896
<div class="listingblock">
897
<div class="content">
898
<pre><code>&lt;table class="respar" style="padding-bottom: 10px;" cellspacing="5" cellpadding="5"&gt;
899
900
&lt;thead style="vertical-align: top;"&gt;
901
&lt;tr&gt;
902
&lt;td colspan="3" style="border-bottom: 1pt dotted #004070; font-size: smaller;"&gt;&lt;a href="E%N"&gt;%u&lt;/a&gt; | %S | Relevanz: %R&lt;/td&gt;
903
&lt;/tr&gt;
904
&lt;/thead&gt;
905
906
&lt;tbody style="vertical-align: top;"&gt;
907
&lt;tr&gt;
908
&lt;td&gt;&lt;a href="P%N"&gt;&lt;img src="%I" alt="" width="64" height="auto" /&gt;&lt;/a&gt;&lt;/td&gt;
909
&lt;td style="width: 250px;"&gt;&lt;span style="color: #004070;"&gt;
910
  &lt;div style="font-style: italic;"&gt;%(author)&lt;/div&gt;
911
  &lt;div style="font-weight: bold;"&gt;&lt;a href="E%N"&gt;&amp;raquo;%T&amp;laquo;&lt;/a&gt;&lt;/div&gt;
912
  &lt;div style="text-transform: uppercase; margin-top: 5pt"&gt;%(reftype)&lt;/div&gt;&lt;/td&gt;
913
&lt;td&gt;
914
  &lt;div style="font-size: smaller;"&gt;
915
    %(refauthor)%(refchapter) %(reftitle)%(refeditor)%(refbooktitle)%(refjournal)%(refvolume)%(refnumber)%(refaddress)%(reflocation)%(refpublisher)%(refyear)%(refpages).&lt;/div&gt;
916
  &lt;div style="text-align: justify; font-family: serif; margin-top: 5pt; margin-bottom: 5pt"&gt;&amp;raquo;&lt;a href="A%N"&gt;%A&lt;/a&gt;&amp;laquo;&lt;/div&gt;
917
  &lt;div&gt;%(refkeywords)&lt;/div&gt;
918
  &lt;div style="font-size: smaller;"&gt;&lt;a href="%(refurl)"&gt;%(refurl)&lt;/a&gt;&lt;/div&gt;
919
  &lt;div style="font-size: smaller"&gt; %(refkey) %(refisbn) %(refissn) %(refdoi)&lt;/div&gt;&lt;/td&gt;
920
&lt;/tr&gt;
921
&lt;/tbody&gt;
922
923
&lt;/table&gt;</code></pre>
924
</div></div>
925
<div class="paragraph"><p>The screenshot below also has the <em>Highlight color for query terms</em>
926
set to <code>black; font-weight:bold;</code> for bold, black text (instead
927
of the blue default). There
928
are <a href="https://bitbucket.org/medoc/recoll/wiki/ResultsThumbnails">various
929
methods for creating the thumbnails</a>; the ones here were made by
930
opening the directory containing the PDFs in the Dolphin file manager
931
(part of KDE) and selecting the Preview option.</p></div>
932
</div>
933
</div>
934
<div class="sect1">
935
<h2 id="_a_search_example">A search example</h2>
936
<div class="sectionbody">
937
<div class="paragraph"><p>The simple query is <code>cerevisiae keyword:protein</code>. This
938
returns only PDFs that have the text "cerevisiae" and have been
939
tagged with the "protein" keyword. The LaTeX-style formatting from
940
the BibTeX database is displayed as HTML (note the italicized words
941
in article title, and umlaut in author&#8217;s name). Other queries could
942
be made based on the PDF metadata, e.g. <em>journal:plos</em>
943
or <em>year:2013</em>.</p></div>
944
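<div class="paragraph"><p><span class="image"><img src="recoll_query.png" alt="Recoll result list for the query cerevisiae keyword:protein" /></span></p></div>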
945
</div>
946
</div>
947
<div class="sect1">
948
<h2 id="_more_possibilities">More possibilities</h2>
949
<div class="sectionbody">
950
<div class="ulist"><ul>
951
<li>
952
<p>
953
The sort buttons (up- and down-arrows) in Recoll sort the
954
  results by the modified date on the file at the time of indexing. If
955
  you want this sorting to reflect the publication year, then the
956
  timestamp should be set accordingly. If names of the PDFs contain
957
  the year (e.g. BZS2007.pdf, CKE+2011.pdf), the following one-liner
958
  would set the modified date to January 1st of the year:
28
</pre>
959
</p>

29
30
<h2>Custom filter (rclpdf file)</h2>
31
<p>This is where the heavy lifting happens. The filter should create HTML meta elements for each of the named index fields. Below is a diff between the default rclpdf and a customized one. The PDF metadata is gathered using the <b>pdfinfo</b> command. Then, <b>grep</b> and <b>sed</b> are used to extract the publication year and journal name from metadata fields beginning with "bibtex:" (part of the XMP metadata written by JabRef, in XML format). That information is fed to <b>awk</b>, which puts together the output. The crucial part in the customized awk script is the inclusion of the HTML meta elements with the names "year" and "journal".
32
<p>There is some additional processing carried out by the l2html function. This replaces some LaTeX-style accents (stored in the PDF metadata if the BibTeX file contains them) with HTML entities. Only a few examples are shown here; other LaTeX accents could be processed in a similar manner. If desired, the sed commands could be modified to give UTF-8 characters instead of the HTML entities.
33
<p>The l2html function also converts the LaTeX \emph{...} (emphasized text) to HTML markup for italics, &lt;i&gt;...&lt;/i&gt;. With this, and the markup="html" attribute in the HTML meta elements (given in the awk script for the title and at the end of the filter for the author), italicized text and accented characters represented by HTML entities will be shown in the results.
34
<p>One other thing to note: the filter changes the "Subject" HTML meta tag (created by pdftotext) to "Abstract"; this is so that the actual abstract - of the journal article, stored in the BibTeX database and written as metadata to the PDF, and reported by pdftotext as the "Subject" - is indexed independently of the title. Otherwise, terms in the "Subject" and "title" meta tags by default get indexed together by Recoll, so a title: query would actually match words appearing in the abstract.
35
<p>Grab the default rclpdf for Recoll 1.18.1 (most likely <tt>/usr/share/recoll/filters/rclpdf</tt>) then apply this patch and save the result in <tt>~/.recoll/filters/rclpdf</tt> .
36
<pre>
37
104a105,126
38
> 
39
> l2html() 
40
> {
41
>   # redirect the stdin so the function can be used in a pipe
42
>   cat |
43
>   # use sed to replace some accented (LaTeX format) characters
44
>   sed -e 's/\\"a/\&amp;auml;/g' |      # a umlaut
45
>   sed -e "s/\\\'a/\&amp;aacute;/g" |   # a acute
46
>   sed -e "s/\\\\\`a/\&amp;agrave;/g" | # a grave
47
>   sed -e 's/\\u{a}/\&amp;#x103;/g' |   # a breve
48
>   # linebreak so multiple \emph{.*} can be replaced
49
>   sed -e 's/\\emph{/\n&amp;/g' |          
50
>   # \emph{.*} to &lt;i>.*&lt;/i> 
51
>   sed -e 's/\\emph{\(.*\)}/\&amp;lt;i\&amp;gt;\1\&amp;lt;\/i\&amp;gt;/g'
52
> }
53
> 
54
> # get PDF metadata
55
> PDFINFO=`pdfinfo -meta "$infile" 2>/dev/null`
56
> # need grep -a (--text) because sometimes it treats input as binary
57
> YEAR=`echo "$PDFINFO" | grep -a bibtex:year | sed -e 's/&lt;\/.*>//g' | sed -e 's/&lt;.*>//g'`
58
> JOURNAL=`echo "$PDFINFO" | grep -a bibtex:journal | sed -e 's/&lt;\/.*>//g' | sed -e 's/&lt;.*>//g'`
59
> 
60
107c129
61
&lt; awk 'BEGIN'\
62
63
> awk -v year="$YEAR" -v journal="$JOURNAL" 'BEGIN'\
64
111a134,136
65
>   yearmeta = "&lt;meta name=\"year\" content=\""
66
>   journalmeta = "&lt;meta name=\"journal\" content=\""
67
>   endmeta = "\">\n"
68
115a141,146
69
>   if(doescape == 0 &amp;&amp; $0 ~ /&lt;\/head>/) {
70
>     match($0, /&lt;\/head>/)
71
>     part1 = substr($0, 0, RSTART-1)
72
>     part2 = substr($0, RSTART, length($0))
73
>     $0 =  part1 yearmeta year endmeta journalmeta journal endmeta part2
74
>   }
75
133c164
76
&lt;     mid = "&lt;title>" mid "&lt;/title>"
77
78
>     mid = "&lt;meta name=\"title\" markup=\"html\" content=\"" mid "\">"
79
167c198
80
&lt; ' 
81
82
> ' | 
83
168a200,205
84
> # replace latex with html markup
85
> l2html |
86
> # replace "Subject" with "Abstract"
87
> sed -e s/\&lt;meta\ name=\"Subject\"/\&lt;meta\ name=\"Abstract\"/g |
88
> # add markup="html" to author meta element
89
> sed -e s/\&lt;meta\ name=\"Author\"/\&lt;meta\ name=\"Author\"\ markup=\"html\"/g
90
</pre>
91
92
<h2>Use the source (mimeconf file)</h2>
93
<p>Recoll needs to know about your custom rclpdf. Make sure the rclpdf is executable, and add this to <tt>~/.recoll/mimeconf</tt> (replace &lt;username> with your username).
94
<pre>
95
[index]
96
application/pdf = exec /home/&lt;username>/.recoll/filters/rclpdf
97
</pre>
98
<p>Then index away!
99
<p>Note that you can also run the rclpdf script manually, e.g. <tt>rclpdf /path/to/some.pdf</tt>, to inspect the output. If things are working correctly, the &lt;head> consists of the HTML meta elements, and the &lt;body> contains the text of the PDF.
100
101
<h2>Result paragraph format</h2>
102
<p>Here, the result is formatted to show the title, which is a link to open the document, in blue with underlining turned off. The next two lines contain the authors, then the journal title in green italicized text followed by year (in parentheses). The keywords are listed in red after the abstract/text snippet.
103
<p>Edit this using the Recoll GUI: Preferences > GUI configuration > Result List > Edit result paragraph format string.
104
<pre>
105
&lt;a href="P%N">&lt;img src="%I" align="left">&lt;/a>
106
&amp;nbsp;&lt;span style=font-size:1.15em>&lt;a style=text-decoration:none href="E%N">%(title)&lt;/a>&lt;/span>&lt;br>
107
&amp;nbsp;%(author)&lt;br>
108
&amp;nbsp;&lt;font color="#009000">&lt;i>%(journal)&lt;/i>&lt;/font>&amp;nbsp;(%(year))
109
&amp;nbsp;&lt;table bgcolor="#e0e0e0"> &lt;tr>&lt;td>&lt;div>%A&lt;/div>&lt;/td>&lt;/tr>
110
&lt;/table>&lt;font color="#900000">%K&lt;/font>
111
&lt;br>&lt;br>
112
</pre>
113
The screenshot below also has the "Highlight color for query terms" set to <tt>black; font-weight:bold;</tt> for bold, black text (instead of the blue default). There are <a href="https://bitbucket.org/medoc/recoll/wiki/ResultsThumbnails">various methods for creating the thumbnails</a>; the ones here were made by opening the directory containing the PDFs in the Dolphin file manager (part of KDE) and selecting the Preview option.
114
115
<h2>A search example</h2>
116
<p>The simple query is <tt>cerevisiae keyword:protein</tt>. This returns only PDFs that have the text "cerevisiae" and have been tagged with the "protein" keyword. The LaTeX-style formatting from the BibTeX database is displayed as HTML (note the italicized words in article title, and umlaut in author's name). Other queries could be made based on the PDF metadata, e.g. <tt>journal:plos</tt> or <tt>year:2013</tt> .
117
<p><img src="recoll_query.png">
118
119
<h2>More possibilities</h2>
120
<ul>
121
  <li>The sort buttons (up- and down-arrows) in Recoll sort the results by the modified date on the file at the time of indexing. If you want this sorting to reflect the publication year, then the timestamp should be set accordingly. If names of the PDFs contain the year (e.g. BZS2007.pdf, CKE+2011.pdf), the following one-liner would set the modified date to January 1st of the year: <tt>for i in `ls *.pdf`; do touch -d `echo $i | sed 's/[^0-9]*//g'`-01-01 $i; done</tt> . Note that the publication year could then be shown in the result list using the stored date of the file (using "%D" in the result paragraph format, and date format "%Y") instead of having to add the year to the index as shown above.
122
  <li>The filter can be modified to fill in the "journal" field for BibTeX entries that aren't journal articles (e.g. bibtex:booktitle for "InCollection" entries).
123
</ul>
960
</li>

124
961
</ul></div>
962
<div class="listingblock">
963
<div class="content">
964
<pre><code>for i in `ls *.pdf`; do touch -d `echo $i | sed 's/[^0-9]*//g'`-01-01 $i; done</code></pre>
965
</div></div>
966
<div class="paragraph"><p>Note that the publication year could then be shown in
967
the result list using the stored date of the file (using "%D" in the
968
result paragraph format, and date format "%Y") instead of having to
969
add the year to the index as shown above.</p></div>
970
<div class="ulist"><ul>
971
<li>
972
<p>
973
The filter can be modified to fill in the "journal" field for
974
  BibTeX entries that aren&#8217;t journal articles (e.g. bibtex:booktitle
975
  for "InCollection" entries).
976
</p>
977
</li>
978
</ul></div>
979
</div>
980
</div>
981
</div>
982
<div id="footnotes"><hr /></div>
983
<div id="footer">
984
<div id="footer-text">
985
Last updated
986
 2017-05-17 07:27:42 CEST
987
</div>
988
</div>
125
</body>
989
</body>

990
</html>