Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
...
...
22
 * -----END-LICENCE-----
22
 * -----END-LICENCE-----
23
 */
23
 */
24
24
25
// This file has code from omindex + an adaptor function for recoll at the end
25
// This file has code from omindex + an adaptor function for recoll at the end
26
26
27
#include "htmlparse.h"
28
#include "mimehandler.h"
27
#include "mimehandler.h"
29
#include "debuglog.h"
28
#include "debuglog.h"
30
#include "csguess.h"
29
#include "csguess.h"
31
#include "readfile.h"
30
#include "readfile.h"
32
#include "transcode.h"
31
#include "transcode.h"
33
#include "mimeparse.h"
32
#include "mimeparse.h"
34
33
#include "myhtmlparse.h"
35
// HtmlParser subclass that collects the text and metadata of an HTML
// document through the process_text/opening_tag/closing_tag callbacks.
// NOTE(review): the bare numbers and the #include line inside this
// definition look like residue from a side-by-side diff rendering; they
// are left untouched here -- confirm against the real file.
class MyHtmlParser : public HtmlParser {
34
#include "indextext.h"
36
 public:
37
    // True between an opening and a closing "script" tag.
    bool in_script_tag;
38
    // True between an opening and a closing "style" tag.
    bool in_style_tag;
39
    // title: text accumulated in dump when the closing "title" tag is seen.
    // sample: content of the first <meta name="description">.
    // keywords: space-joined contents of all <meta name="keywords">.
    // dump: running accumulation of the document's visible text.
    string title, sample, keywords, dump;
40
    string ocharset; // This is the charset our user thinks the doc was
41
    string charset; // This is the charset it was supposedly converted to
42
    string doccharset; // Set this to value of charset parameter in header
43
    // Cleared when a robots meta tag contains "none" or "noindex".
    bool indexing_allowed;
44
    void process_text(const string &text);
45
    void opening_tag(const string &tag, const map<string,string> &p);
46
    void closing_tag(const string &tag);
47
    // Start outside of any script/style section, with indexing allowed.
    MyHtmlParser() :
48
  in_script_tag(false),
49
  in_style_tag(false),
50
  indexing_allowed(true) { }
51
};
52
53
void
54
MyHtmlParser::process_text(const string &text)
55
{
56
    // some tags are meaningful mid-word so this is simplistic at best...
57
58
    if (!in_script_tag && !in_style_tag) {
59
  string::size_type firstchar = text.find_first_not_of(" \t\n\r");
60
  if (firstchar != string::npos) {
61
      dump += text.substr(firstchar);
62
      dump += " ";
63
  }
64
    }
65
}
66
67
// Let's hope that the charset is ASCII-compatible: only 'A'-'Z' are folded.
68
// Lowercase a string in place, touching only the ASCII letters 'A'..'Z'.
// Every other byte (digits, punctuation, bytes >= 0x80) is left unchanged.
static inline void
lowercase_term(string &term)
{
    for (string::size_type pos = 0; pos < term.size(); ++pos) {
        const char c = term[pos];
        if ('A' <= c && c <= 'Z')
            term[pos] = c + ('a' - 'A');
    }
}
78
35
79
#include <iostream>
36
#include <iostream>
80
using namespace std;
37
using namespace std;
81
38
82
83
void
84
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
85
{
86
#if 0
87
    cout << "TAG: " << tag << ": " << endl;
88
    map<string, string>::const_iterator x;
89
    for (x = p.begin(); x != p.end(); x++) {
90
  cout << "  " << x->first << " -> '" << x->second << "'" << endl;
91
    }
92
#endif
93
    
94
    if (tag == "meta") {
95
  map<string, string>::const_iterator i, j;
96
  if ((i = p.find("content")) != p.end()) {
97
      if ((j = p.find("name")) != p.end()) {
98
      string name = j->second;
99
      lowercase_term(name);
100
      if (name == "description") {
101
          if (sample.empty()) {
102
          sample = i->second;
103
          decode_entities(sample);
104
          }
105
      } else if (name == "keywords") {
106
          if (!keywords.empty()) keywords += ' ';
107
          string tmp = i->second;
108
          decode_entities(tmp);
109
          keywords += tmp;
110
      } else if (name == "robots") {
111
          string val = i->second;
112
          decode_entities(val);
113
          lowercase_term(val);
114
          if (val.find("none") != string::npos ||
115
          val.find("noindex") != string::npos) {
116
          indexing_allowed = false;
117
          throw true;
118
          }
119
      }
120
      } else if ((j = p.find("http-equiv")) != p.end()) {
121
      string hequiv = j->second;
122
      lowercase_term(hequiv);
123
      if (hequiv == "content-type") {
124
          string value = i->second;
125
          MimeHeaderValue p = parseMimeHeaderValue(value);
126
          map<string, string>::const_iterator k;
127
          if ((k = p.params.find("charset")) != p.params.end()) {
128
          doccharset = k->second;
129
          if (doccharset != ocharset) {
130
              LOGDEB1(("Doc specified charset '%s' "
131
                   "differs from announced '%s'\n",
132
                   doccharset.c_str(), ocharset.c_str()));
133
              throw true;
134
          }
135
          }
136
      }
137
      }
138
  }
139
    } else if (tag == "p" || tag == "br" || tag == "li") {
140
  dump += "\n";
141
    } else if (tag == "script") {
142
  in_script_tag = true;
143
    } else if (tag == "style") {
144
  in_style_tag = true;
145
    } else if (tag == "body") {
146
  dump = "";
147
    }
148
}
149
150
void
151
MyHtmlParser::closing_tag(const string &tag)
152
{
153
    if (tag == "title") {
154
  title = dump;
155
  dump = "";
156
    } else if (tag == "script") {
157
  in_script_tag = false;
158
    } else if (tag == "style") {
159
  in_style_tag = false;
160
    } else if (tag == "body") {
161
  throw true;
162
    }
163
}
164
39
165
bool textHtmlToDoc(RclConfig *conf, const string &fn, 
40
bool textHtmlToDoc(RclConfig *conf, const string &fn, 
166
             const string &mtype, Rcl::Doc &docout)
41
             const string &mtype, Rcl::Doc &docout)
167
{
42
{
168
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
43
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));