recoll / Code / Diff of /src/filters/rclpurple

Diff of /src/filters/rclpurple [871eeb] .. [783755]

Switch to side-by-side view

--- a/src/filters/rclpurple
+++ b/src/filters/rclpurple
@@ -82,41 +82,79 @@
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
 
-checkcmds awk iconv 
+checkcmds awk
 
 awk '
 # First line: parse from, to , output html header
 NR == 1 {
-    if (NF != 13) {
+    if (NF != 14 && NF != 13 && NF != 9) {
        printf("Bad format: (NF %d) %s\n", NF, $0)
        exit 1
     }
     to = $3
-    from = $12
-    proto = $13
-    date =  $5 " " $6 " " $7 " " $8 " " $9 " " $10
-    #printf("from [%s] to [%s] proto [%s] date [%s]\n", from, to, proto, date)
+    if (NF == 14 || NF == 13) {
+        mon_i["Jan"] = "01"
+        mon_i["Feb"] = "02"    
+        mon_i["Mar"] = "03"
+        mon_i["Apr"] = "04"
+        mon_i["May"] = "05"
+        mon_i["Jun"] = "06"
+        mon_i["Jul"] = "07"
+        mon_i["Aug"] = "08"
+        mon_i["Sep"] = "09"
+        mon_i["Oct"] = "10"
+        mon_i["Nov"] = "11"
+        mon_i["Dec"] = "12"
+        date =  $8 "-" mon_i[$7] "-" $6 "T" $9
+        if (NF == 14) {
+            from = $13
+        }
+        if (NF == 13) {
+            from = $12
+        }
+    }
+
+    if (NF == 9) {
+        from = $8
+        date =  $5
+    }
+
+    #printf("from [%s] to [%s] date [%s]\n", from, to, date)
+
     print "<html><head>"
     print "<title> " $0 "</title>"
-    print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=iso8859-1\">"
+
     # Yes there is no such thing as a "date" meta tag. This probably should
-    # be http-equiv=last-modified or such
+    # be http-equiv=last-modified or such, but recollindex understands "date"
     printf("<meta name=\"date\" content=\"%s\">\n", date)
+
+    printf("<meta name=\"author\" content=\"%s\">\n", from)
+    printf("<meta name=\"recipient\" content=\"%s\">\n", to)
     print "</head><body>"
+    print "<pre>"
 
-    # Remember who the main persons are.
+    if (ENVIRON["RECOLL_FILTER_FORPREVIEW"] == "yes") {
+        printf("%s\n", $0)
+    }
+
+    # Remember who the main persons are, so that we output each of
+    # them only once while indexing the conversation body: repeated
+    # indexing would give these terms excessive weight.
     authors[from] = "yes"
     authors[to] = "yes"
     next
 }
-# Message first line. We strip from/to and time when indexing
+
 /^\([0-2][0-9]:[0-5][0-9]:[0-5][0-9]\)/ {
+    # First line of a conversation element. When indexing, we strip the
+    # time and the from/to names (except for their first occurrence): the
+    # time is not interesting, and repeated names would get excessive weight.
     if (ENVIRON["RECOLL_FILTER_FORPREVIEW"] == "yes") {
-       # Preview: output everything
-        print $0 " " "<br>"
+        # Preview: output everything
+        print $0
     } else {
-	# Index: output only text, except each new author once
-	#printf("INDEX: NF %d [%s] [%s] [%s] ", NF, $1, $2, $3);
+	# Index: output only text, except each new author once. Unfortunately,
+        # it is too late to add them to the "author" field.
 	from = $2
 	sub(":$", "", from);
 	if (authors[from] == "") {
@@ -126,16 +164,16 @@
         for (idx = 3; idx <= NR; idx++) {
             printf("%s ", $idx)
 	}
-	printf("<br>\n")
+	printf("\n")
     }
     next
 }
-# Continuation line: print it
+# Conversation element continuation line: print it
 {
-    printf("%s <br>\n", $0) 
+    printf("%s\n", $0) 
 }
 END {
-    printf("</body></html>\n")
+    printf("</pre></body></html>\n")
 }
 ' < "$infile"