|
a/src/internfile/mh_html.h |
|
b/src/internfile/mh_html.h |
|
... |
|
... |
24 |
/**
|
24 |
/**
|
25 |
* Convert html to utf-8 text and extract whatever metadata we can find.
|
25 |
* Convert html to utf-8 text and extract whatever metadata we can find.
|
26 |
*/
|
26 |
*/
|
27 |
class MimeHandlerHtml : public RecollFilter {
|
27 |
class MimeHandlerHtml : public RecollFilter {
|
28 |
public:
|
28 |
public:
|
29 |
MimeHandlerHtml(RclConfig *cnf, const string& id)
|
29 |
MimeHandlerHtml(RclConfig *cnf, const std::string& id)
|
30 |
: RecollFilter(cnf, id)
|
30 |
: RecollFilter(cnf, id) {
|
31 |
{
|
|
|
32 |
}
|
31 |
}
|
33 |
virtual ~MimeHandlerHtml()
|
32 |
virtual ~MimeHandlerHtml() {}
|
34 |
{
|
33 |
|
35 |
}
|
|
|
36 |
virtual bool set_document_file(const string& mt, const string &file_path);
|
|
|
37 |
virtual bool set_document_string(const string& mt, const string &data);
|
|
|
38 |
virtual bool is_data_input_ok(DataInput input) const {
|
34 |
virtual bool is_data_input_ok(DataInput input) const {
|
39 |
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
|
35 |
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
|
40 |
return true;
|
36 |
return true;
|
41 |
return false;
|
37 |
return false;
|
42 |
}
|
38 |
}
|
43 |
virtual bool next_document();
|
39 |
virtual bool next_document();
|
44 |
const string& get_html()
|
40 |
const std::string& get_html()
|
45 |
{
|
41 |
{
|
46 |
return m_html;
|
42 |
return m_html;
|
47 |
}
|
43 |
}
|
48 |
virtual void clear() {
|
44 |
virtual void clear() {
|
49 |
m_filename.erase();
|
45 |
m_filename.erase();
|
50 |
m_html.erase();
|
46 |
m_html.erase();
|
51 |
RecollFilter::clear();
|
47 |
RecollFilter::clear();
|
52 |
}
|
48 |
}
|
|
|
49 |
protected:
|
|
|
50 |
virtual bool set_document_file_impl(const std::string& mt,
|
|
|
51 |
const std::string &file_path);
|
|
|
52 |
virtual bool set_document_string_impl(const std::string& mt,
|
|
|
53 |
const std::string &data);
|
|
|
54 |
|
53 |
private:
|
55 |
private:
|
54 |
string m_filename;
|
56 |
std::string m_filename;
|
55 |
string m_html;
|
57 |
std::string m_html;
|
56 |
};
|
58 |
};
|
57 |
|
59 |
|
58 |
#endif /* _HTML_H_INCLUDED_ */
|
60 |
#endif /* _HTML_H_INCLUDED_ */
|