recoll / Code / Diff of /src/internfile/mh_html.cpp

Diff of /src/internfile/mh_html.cpp [370032] .. [6d35f5]

Switch to unified view


...
 * -----END-LICENCE-----
 */

// This file has code from omindex + an adaptor function for recoll at the end


#include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "myhtmlparse.h"
#include "indextext.h"











































#include <iostream>
using namespace std;




















































































bool textHtmlToDoc(RclConfig *conf, const string &fn, 
             const string &mtype, Rcl::Doc &docout)
{
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));

	a/src/internfile/mh_html.cpp		b/src/internfile/mh_html.cpp
	...		...
22	* -----END-LICENCE-----	22	* -----END-LICENCE-----
23	*/	23	*/
24		24
25	// This file has code from omindex + an adaptor function for recoll at the end	25	// This file has code from omindex + an adaptor function for recoll at the end
26		26
27	#include "htmlparse.h"
28	#include "mimehandler.h"	27	#include "mimehandler.h"
29	#include "debuglog.h"	28	#include "debuglog.h"
30	#include "csguess.h"	29	#include "csguess.h"
31	#include "readfile.h"	30	#include "readfile.h"
32	#include "transcode.h"	31	#include "transcode.h"
33	#include "mimeparse.h"	32	#include "mimeparse.h"
34		33	#include "myhtmlparse.h"
35	class MyHtmlParser : public HtmlParser {	34	#include "indextext.h"
36	public:
37	bool in_script_tag;
38	bool in_style_tag;
39	string title, sample, keywords, dump;
40	string ocharset; // This is the charset our user thinks the doc was
41	string charset; // This is the charset it was supposedly converted to
42	string doccharset; // Set this to value of charset parameter in header
43	bool indexing_allowed;
44	void process_text(const string &text);
45	void opening_tag(const string &tag, const map<string,string> &p);
46	void closing_tag(const string &tag);
47	MyHtmlParser() :
48	in_script_tag(false),
49	in_style_tag(false),
50	indexing_allowed(true) { }
51	};
52
53	void
54	MyHtmlParser::process_text(const string &text)
55	{
56	// some tags are meaningful mid-word so this is simplistic at best...
57
58	if (!in_script_tag && !in_style_tag) {
59	string::size_type firstchar = text.find_first_not_of(" \t\n\r");
60	if (firstchar != string::npos) {
61	dump += text.substr(firstchar);
62	dump += " ";
63	}
64	}
65	}
66
67	// lets hope that the charset includes ascii values...
68	static inline void
69	lowercase_term(string &term)
70	{
71	string::iterator i = term.begin();
72	while (i != term.end()) {
73	if (*i >= 'A' && *i <= 'Z')
74	*i = *i + 'a' - 'A';
75	i++;
76	}
77	}
78		35
79	#include <iostream>	36	#include <iostream>
80	using namespace std;	37	using namespace std;
81		38
82
83	void
84	MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
85	{
86	#if 0
87	cout << "TAG: " << tag << ": " << endl;
88	map<string, string>::const_iterator x;
89	for (x = p.begin(); x != p.end(); x++) {
90	cout << " " << x->first << " -> '" << x->second << "'" << endl;
91	}
92	#endif
93
94	if (tag == "meta") {
95	map<string, string>::const_iterator i, j;
96	if ((i = p.find("content")) != p.end()) {
97	if ((j = p.find("name")) != p.end()) {
98	string name = j->second;
99	lowercase_term(name);
100	if (name == "description") {
101	if (sample.empty()) {
102	sample = i->second;
103	decode_entities(sample);
104	}
105	} else if (name == "keywords") {
106	if (!keywords.empty()) keywords += ' ';
107	string tmp = i->second;
108	decode_entities(tmp);
109	keywords += tmp;
110	} else if (name == "robots") {
111	string val = i->second;
112	decode_entities(val);
113	lowercase_term(val);
114	if (val.find("none") != string::npos \|\|
115	val.find("noindex") != string::npos) {
116	indexing_allowed = false;
117	throw true;
118	}
119	}
120	} else if ((j = p.find("http-equiv")) != p.end()) {
121	string hequiv = j->second;
122	lowercase_term(hequiv);
123	if (hequiv == "content-type") {
124	string value = i->second;
125	MimeHeaderValue p = parseMimeHeaderValue(value);
126	map<string, string>::const_iterator k;
127	if ((k = p.params.find("charset")) != p.params.end()) {
128	doccharset = k->second;
129	if (doccharset != ocharset) {
130	LOGDEB1(("Doc specified charset '%s' "
131	"differs from announced '%s'\n",
132	doccharset.c_str(), ocharset.c_str()));
133	throw true;
134	}
135	}
136	}
137	}
138	}
139	} else if (tag == "p" || tag == "br" || tag == "li") {
140	dump += "\n";
141	} else if (tag == "script") {
142	in_script_tag = true;
143	} else if (tag == "style") {
144	in_style_tag = true;
145	} else if (tag == "body") {
146	dump = "";
147	}
148	}
149
150	void
151	MyHtmlParser::closing_tag(const string &tag)
152	{
153	if (tag == "title") {
154	title = dump;
155	dump = "";
156	} else if (tag == "script") {
157	in_script_tag = false;
158	} else if (tag == "style") {
159	in_style_tag = false;
160	} else if (tag == "body") {
161	throw true;
162	}
163	}
164		39
165	bool textHtmlToDoc(RclConfig *conf, const string &fn,	40	bool textHtmlToDoc(RclConfig *conf, const string &fn,
166	const string &mtype, Rcl::Doc &docout)	41	const string &mtype, Rcl::Doc &docout)
167	{	42	{
168	LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));	43	LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));