Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
1
/* htmlparse.cc: simple HTML parser for omega indexer
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: mh_html.cpp,v 1.26 2008-10-03 06:17:46 dockes Exp $ (C) 2005 J.F.Dockes";
3
#endif
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
8
 *   (at your option) any later version.
2
 *
9
 *
3
 * ----START-LICENCE----
10
 *   This program is distributed in the hope that it will be useful,
4
 * Copyright 1999,2000,2001 BrightStation PLC
11
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
5
 * Copyright 2001 Ananova Ltd
12
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
6
 * Copyright 2002 Olly Betts
13
 *   GNU General Public License for more details.
7
 *
14
 *
8
 * This program is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU General Public License as
10
 * published by the Free Software Foundation; either version 2 of the
11
 * License, or (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
15
 *   You should have received a copy of the GNU General Public License
19
 * along with this program; if not, write to the Free Software
16
 *   along with this program; if not, write to the
17
 *   Free Software Foundation, Inc.,
20
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21
 * USA
22
 * -----END-LICENCE-----
23
 */
19
 */
24
20
25
// This file has code from omindex + an adaptor function for recoll at the end
26
21
27
#include "mimehandler.h"
22
#include "mimehandler.h"
28
#include "debuglog.h"
23
#include "debuglog.h"
29
#include "csguess.h"
24
#include "csguess.h"
30
#include "readfile.h"
25
#include "readfile.h"
...
...
34
#include "indextext.h"
29
#include "indextext.h"
35
#include "mh_html.h"
30
#include "mh_html.h"
36
#include "smallut.h"
31
#include "smallut.h"
37
32
38
#include <iostream>
33
#include <iostream>
34
39
#ifndef NO_NAMESPACES
35
#ifndef NO_NAMESPACES
40
using namespace std;
36
using namespace std;
41
#endif /* NO_NAMESPACES */
37
#endif /* NO_NAMESPACES */
42
38
43
39
44
bool MimeHandlerHtml::set_document_file(const string &fn)
40
bool MimeHandlerHtml::set_document_file(const string &fn)
45
{
41
{
46
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
42
    LOGDEB0(("textHtmlToDoc: %s\n", fn.c_str()));
47
    string otext;
43
    string otext;
48
    if (!file_to_string(fn, otext)) {
44
    if (!file_to_string(fn, otext)) {
49
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
45
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
50
    return false;
46
    return false;
51
    }
47
    }
...
...
117
        break;
113
        break;
118
    } catch (bool diag) {
114
    } catch (bool diag) {
119
        result = p;
115
        result = p;
120
        if (diag == true) {
116
        if (diag == true) {
121
        // Parser throws true at end of text. ok
117
        // Parser throws true at end of text. ok
118
119
      // Preview mode: keep a copy of the transcoded HTML in m_html and
      // give it a charset declaration matching the transcoded data.
      if (m_forPreview) {
          // Save the html text
          m_html = transcoded;
          // In many cases, we need to change the charset decl,
          // because the file was transcoded. It seems that just
          // inserting one is enough (only the 1st one seems to
          // be used by browsers/qtextedit).
          //
          // The position must be held in string::size_type: with a
          // narrower type such as unsigned int, string::npos is truncated
          // on platforms where size_type is wider, so the npos comparisons
          // below never match and the tag would be inserted at a bogus
          // offset when no <head> element is present.
          string::size_type idx = m_html.find("<head>");
          if (idx == string::npos)
              idx = m_html.find("<HEAD>");
          if (idx != string::npos) {
              // 6 == strlen("<head>"): insert right after the opening tag.
              m_html.replace(idx + 6, 0,
                             "<meta http-equiv=\"content-type\" "
                             "content=\"text/html; charset=utf-8\">");
          }
      }
134
122
        break;
135
        break;
123
        }
136
        }
124
137
125
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
138
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
126
            charset.c_str(), result.get_charset().c_str()));
139
            charset.c_str(), result.get_charset().c_str()));