recoll / Code / Diff of /src/internfile/htmlparse.cpp

Diff of /src/internfile/htmlparse.cpp [3872f8] .. [57b6f2]

Switch to unified view


/* This file was copied/updated from xapian-omega-1.0.1 and modified */

/* htmlparse.cc: simple HTML parser for omega indexer
 *

 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001 Ananova Ltd
 * Copyright 2002,2006 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
...
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA

 */

#ifndef lint
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.7 2007-06-19 10:28:40 dockes Exp $ ";
#endif



#include <algorithm>


using std::find;
using std::find_if;

#include "htmlparse.h"
#include <stdio.h>
#include <ctype.h>



// Definition of the static HtmlParser::named_ents table, keyed by entity
// name with an unsigned int value per entry.  The only code in this file
// that fills or reads it (in the constructor and in decode_entities())
// sits inside #if 0 blocks.
map<string, unsigned int> HtmlParser::named_ents;







// True when c is anything other than a decimal digit.
// The character is routed through unsigned char first: handing a negative
// char value to the <ctype.h> classifiers is undefined behaviour.
inline static bool
p_notdigit(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return isdigit(uc) == 0;
}

// True when c is not a hexadecimal digit (0-9, a-f, A-F).
// decode_entities() hands this to find_if() to locate the end of a
// "&#x..." numeric reference.  The unsigned char conversion keeps
// negative char values away from isxdigit().
inline static bool
p_notxdigit(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return isxdigit(uc) == 0;
}

// True when c is neither a letter nor a digit.
// decode_entities() hands this to find_if() to locate the end of a named
// entity.  The unsigned char conversion keeps negative char values away
// from isalnum().
inline static bool
p_notalnum(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return isalnum(uc) == 0;
}

// True when c is not a whitespace character as classified by isspace().
// The unsigned char conversion keeps negative char values away from
// isspace().
inline static bool
p_notwhitespace(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return isspace(uc) == 0;
}

// True when c cannot be part of a tag name.  Letters, digits, '.', '-'
// and ':' are accepted as tag-name characters; everything else ends the
// name.  parse_html() hands this to find_if() to find where a tag name
// stops.
inline static bool
p_nottag(char c)
{
    switch (c) {
    case '.':
    case '-':
    case ':': // ':' for XML namespaces.
	return false;
    default:
	return isalnum(static_cast<unsigned char>(c)) == 0;
    }
}

// True when c is whitespace (per isspace()) or the '>' character.
inline static bool
p_whitespacegt(char c)
{
    if (c == '>')
	return true;
    return isspace(static_cast<unsigned char>(c)) != 0;
}

// True when c is whitespace (per isspace()), '=' or '>'.
inline static bool
p_whitespaceeqgt(char c)
{
    if (c == '=' || c == '>')
	return true;
    return isspace(static_cast<unsigned char>(c)) != 0;
}

// Construct a parser.  In recoll the constructor does nothing: the code
// that would load the named-entity table is compiled out below.
HtmlParser::HtmlParser()
{
    // RECOLL: the entities are not initialized here, those from
    // myhtmlparse are used instead.
#if 0
    static const struct ent { const char *n; unsigned int v; } ents[] = {
#include "namedentities.h"
	{ NULL, 0 }
    };
    // Fill the shared table once; the list ends at the NULL-name sentinel.
    if (named_ents.empty()) {
	for (const struct ent *e = ents; e->n; ++e)
	    named_ents[string(e->n)] = e->v;
    }
#endif
}

// Replace HTML character references ("&name;", "&#NNN;", "&#xHH;") found
// in s, in place.
//
// The whole body is inside #if 0, so as built here the function returns
// without touching s.  The disabled code is kept to stay close to the
// upstream file.
void
HtmlParser::decode_entities(string &s)
{
    // Not used for recoll. Kept here to minimize the amount of diffs
#if 0




    // We need a const_iterator version of s.end() - otherwise the
    // find() and find_if() templates don't work...
    string::const_iterator amp = s.begin(), s_end = s.end();
    // Each pass handles the next '&' at or after amp.
    while ((amp = find(amp, s_end, '&')) != s_end) {
    // val stays 0 when the reference is not recognised.
    unsigned int val = 0;
    string::const_iterator end, p = amp + 1;

    if (p != s_end && *p == '#') {
        // Numeric character reference.
        p++;
        if (p != s_end && (*p == 'x' || *p == 'X')) {
        // hex
        p++;
        end = find_if(p, s_end, p_notxdigit);
        sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
        } else {
...
        val = atoi(s.substr(p - s.begin(), end - p).c_str());
        }
    } else {
        // Named entity: the name is the run of alphanumerics after '&',
        // looked up in named_ents.
        end = find_if(p, s_end, p_notalnum);
        string code = s.substr(p - s.begin(), end - p);
        map<string, unsigned int>::const_iterator i;
        i = named_ents.find(code);
        if (i != named_ents.end()) val = i->second;

    }

    // The terminating ';' is optional; include it in the replaced span
    // when present.
    if (end < s_end && *end == ';') end++;


    if (val) {









        // Remember the offset of '&': the iterators do not survive the
        // replace() calls below.
        string::size_type amp_pos = amp - s.begin();
      if (val < 0x80) {
      // Values below 0x80 are written as a single char.
      s.replace(amp_pos, end - amp, 1u, char(val));
      } else {
      // Convert unicode value val to UTF-8.
      char seq[4];
      unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
      s.replace(amp_pos, end - amp, seq, len);
      }
        s_end = s.end();
        // We've modified the string, so the iterators are no longer
        // valid...
        // Resume one position past where the replacement starts.
        amp = s.begin() + amp_pos + 1;
    } else {
        // Unrecognised (or zero-valued) reference: leave the text as is
        // and continue scanning after it.
        amp = end;
    }
    }
#endif
}

// Scan the HTML document in `body` and drive the parser callbacks.
//
// Text found before each tag is handed to decode_entities() and then to
// process_text().  Closing tags are reported through closing_tag().
// Opening tags are reported through opening_tag() together with a map of
// their parameters (names lower-cased, first occurrence wins).  Tag names
// are lower-cased before being reported.  An XML declaration at the very
// start of `body` sets `charset`.  While in_script is set (after an
// opening "script" tag, until its closing tag), "<" followed by a letter
// is not treated as the start of a tag.
void
HtmlParser::parse_html(const string &body)
{
    in_script = false;

    // Parameters of the opening tag being parsed; cleared after each
    // opening_tag() call.
    map<string,string> Param;
    string::const_iterator start = body.begin();

    while (true) {
  // Skip through until we find an HTML tag, a comment, or the end of
  // document.  Ignore isolated occurrences of `<' which don't start
  // a tag or comment.
    string::const_iterator p = start;




    while (true) {
        p = find(p, body.end(), '<');
        if (p == body.end()) break;
        // NOTE(review): when '<' is the last character of body, p + 1 is
        // body.end() and is dereferenced here - confirm this is intended.
        unsigned char ch = *(p + 1);

        // Tag, closing tag, or comment (or SGML declaration).
        if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
      if (ch == '?') {
      // PHP code or XML declaration.
      // XML declaration is only valid at the start of the first line.
      // FIXME: need to deal with BOMs...
      // The size check also keeps the p[2]..p[5] accesses below inside
      // body, since p is body.begin() past this point.
      if (p != body.begin() || body.size() < 20) break;

      // XML declaration looks something like this:
      // <?xml version="1.0" encoding="UTF-8"?>
      if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
      if (strchr(" \t\r\n", p[5]) == NULL) break;

      string::const_iterator decl_end = find(p + 6, body.end(), '?');
      if (decl_end == body.end()) break;

      // Default charset for XML is UTF-8.
      charset = "UTF-8";

      // decl holds the text between "<?xml " and the first '?'.
      string decl(p + 6, decl_end);
      size_t enc = decl.find("encoding");
      if (enc == string::npos) break;

      // Skip whitespace after the 8 characters of "encoding".
      enc = decl.find_first_not_of(" \t\r\n", enc + 8);
      if (enc == string::npos || enc == decl.size()) break;

      if (decl[enc] != '=') break;
      
      enc = decl.find_first_not_of(" \t\r\n", enc + 1);
      if (enc == string::npos || enc == decl.size()) break;

      if (decl[enc] != '"' && decl[enc] != '\'') break;

      // Remember which quote opened the value; enc now indexes the first
      // character of the value.
      char quote = decl[enc++];
      size_t enc_end = decl.find(quote, enc);

      // NOTE(review): this tests enc, which cannot be npos here, rather
      // than enc_end.  When the closing quote is missing, substr() takes
      // the rest of decl.  Presumably enc_end was meant - confirm.
      if (enc != string::npos)
          charset = decl.substr(enc, enc_end - enc);

      break;
      }
        // An isolated '<': keep looking after it.
        p++; 
    }

  // Process text up to start of tag.
    if (p > start || p == body.end()) {
        string text = body.substr(start - body.begin(), p - start);
        decode_entities(text);
        process_text(text);
    }
...
    if (*start == '!') {
        if (++start == body.end()) break;
        if (++start == body.end()) break;
        // comment or SGML declaration
        if (*(start - 1) == '-' && *start == '-') {
      ++start;
      // close is the first '>' after the comment opener.
      string::const_iterator close = find(start, body.end(), '>');
        // An unterminated comment swallows rest of document
        // (like Netscape, but unlike MSIE IIRC)
        if (close == body.end()) break;

      p = close;
        // look for -->
        while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
            p = find(p + 1, body.end(), '>');

      if (p != body.end()) {
          // Check for htdig's "ignore this bit" comments.
          // 15 is the length of "htdig_noindex" (13) plus the "--" that
          // precedes '>'.
          if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
          string::size_type i;
          i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
          if (i == string::npos) break;
          // 21 is the length of "<!--/htdig_noindex-->": resume right
          // after the end marker.
          start = body.begin() + i + 21;
          continue;
          }
            // If we found --> skip to there.
          start = p;
      } else {
            // Otherwise skip to the first > we found (as Netscape does).
          start = close;
      }
        } else {
        // just an SGML declaration, perhaps giving the DTD - ignore it
        start = find(start - 1, body.end(), '>');
        if (start == body.end()) break;
        }
...
          
        // The tag name runs from p up to the first character rejected by
        // p_nottag.
        p = start;
        start = find_if(start, body.end(), p_nottag);
        string tag = body.substr(p - body.begin(), start - p);
        // convert tagname to lowercase
        for (string::iterator i = tag.begin(); i != tag.end(); ++i)
      *i = tolower(static_cast<unsigned char>(*i));
           
        if (closing) {
        closing_tag(tag);
      // A closing "script" tag ends script mode.
      if (in_script && tag == "script") in_script = false;
           
        /* ignore any bogus parameters on closing tags */
        p = find(start, body.end(), '>');
        if (p == body.end()) break;
        start = p + 1;
...
            }
               
            if (name.size()) {
                // convert parameter name to lowercase
                string::iterator i;
                for (i = name.begin(); i != name.end(); ++i)
              *i = tolower(static_cast<unsigned char>(*i));
                // in case of multiple entries, use the first
                // (as Netscape does)
                if (Param.find(name) == Param.end())
                Param[name] = value;
            }
            }
        }
        opening_tag(tag, Param);
        Param.clear();

      // In <script> tags we ignore opening tags to avoid problems
      // with "a<b".
      if (tag == "script") in_script = true;

        // Step over the '>' that closes the opening tag, if present.
        if (start != body.end() && *start == '>') ++start;
        }
    }
    }
}

	a/src/internfile/htmlparse.cpp		b/src/internfile/htmlparse.cpp
1	/* This file was copied from omega-0.8.5 and modified */	1	/* This file was copied/updated from xapian-omega-1.0.1 and modified */
2		2
3	/* htmlparse.cc: simple HTML parser for omega indexer	3	/* htmlparse.cc: simple HTML parser for omega indexer
4	*	4	*
5	* ----START-LICENCE----
6	* Copyright 1999,2000,2001 BrightStation PLC	5	* Copyright 1999,2000,2001 BrightStation PLC
7	* Copyright 2001 Ananova Ltd	6	* Copyright 2001 Ananova Ltd
8	* Copyright 2002 Olly Betts	7	* Copyright 2002,2006 Olly Betts
9	*	8	*
10	* This program is free software; you can redistribute it and/or	9	* This program is free software; you can redistribute it and/or
11	* modify it under the terms of the GNU General Public License as	10	* modify it under the terms of the GNU General Public License as
12	* published by the Free Software Foundation; either version 2 of the	11	* published by the Free Software Foundation; either version 2 of the
13	* License, or (at your option) any later version.	12	* License, or (at your option) any later version.
	...		...
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.	17	* GNU General Public License for more details.
19	*	18	*
20	* You should have received a copy of the GNU General Public License	19	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software	20	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307	21	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23	* USA	22	* USA
24	* -----END-LICENCE-----
25	*/	23	*/
26		24
27	#ifndef lint	25	#ifndef lint
28	static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.6 2006-01-30 11:15:27 dockes Exp $ ";	26	static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.7 2007-06-19 10:28:40 dockes Exp $ ";
29	#endif	27	#endif
30		28
31	//#include <config.h>
32
33	#include <algorithm>	29	#include <algorithm>
34	#ifndef NO_NAMESPACES
35	using namespace std;
36	//using std::find;	30	using std::find;
37	//using std::find_if;	31	using std::find_if;
38	#endif /* NO_NAMESPACES */
39	#include "htmlparse.h"	32	#include "htmlparse.h"
40	#include <stdio.h>	33	#include <stdio.h>
41	#include <ctype.h>	34	#include <ctype.h>
42		35
43	#include "transcode.h"
44
45	map<string, string> HtmlParser::named_ents;	36	map<string, unsigned int> HtmlParser::named_ents;
46
47	inline static bool
48	p_alpha(char c)
49	{
50	return isalpha(c);
51	}
52		37
53	inline static bool	38	inline static bool
54	p_notdigit(char c)	39	p_notdigit(char c)
55	{	40	{
56	return !isdigit(c);	41	return !isdigit(static_cast<unsigned char>(c));
57	}	42	}
58		43
59	inline static bool	44	inline static bool
60	p_notxdigit(char c)	45	p_notxdigit(char c)
61	{	46	{
62	return !isxdigit(c);	47	return !isxdigit(static_cast<unsigned char>(c));
63	}	48	}
64		49
65	inline static bool	50	inline static bool
66	p_notalnum(char c)	51	p_notalnum(char c)
67	{	52	{
68	return !isalnum(c);	53	return !isalnum(static_cast<unsigned char>(c));
69	}	54	}
70		55
71	inline static bool	56	inline static bool
72	p_notwhitespace(char c)	57	p_notwhitespace(char c)
73	{	58	{
74	return !isspace(c);	59	return !isspace(static_cast<unsigned char>(c));
75	}	60	}
76		61
77	inline static bool	62	inline static bool
78	p_nottag(char c)	63	p_nottag(char c)
79	{	64	{
80	return !isalnum(c) && c != '.' && c != '-';	65	return !isalnum(static_cast<unsigned char>(c)) &&
		66	c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
81	}	67	}
82		68
83	inline static bool	69	inline static bool
84	p_whitespacegt(char c)	70	p_whitespacegt(char c)
85	{	71	{
86	return isspace(c) \|\| c == '>';	72	return isspace(static_cast<unsigned char>(c)) \|\| c == '>';
87	}	73	}
88		74
89	inline static bool	75	inline static bool
90	p_whitespaceeqgt(char c)	76	p_whitespaceeqgt(char c)
91	{	77	{
92	return isspace(c) \|\| c == '=' \|\| c == '>';	78	return isspace(static_cast<unsigned char>(c)) \|\| c == '=' \|\| c == '>';
93	}
94
95	/*
96	* The following array was taken from Estraier. Estraier was
97	* written by Mikio Hirabayashi.
98	* Copyright (C) 2003-2004 Mikio Hirabayashi
99	* The version where this comes from
100	* is covered by the GNU licence, as this file.*/
101	static const char *epairs[] = {
102	/* basic symbols */
103	"amp", "&", "lt", "<", "gt", ">", "quot", "\"", "apos", "'",
104	/* ISO-8859-1 */
105	"nbsp", "\xc2\xa0", "iexcl", "\xc2\xa1", "cent", "\xc2\xa2",
106	"pound", "\xc2\xa3", "curren", "\xc2\xa4", "yen", "\xc2\xa5",
107	"brvbar", "\xc2\xa6", "sect", "\xc2\xa7", "uml", "\xc2\xa8",
108	"copy", "\xc2\xa9", "ordf", "\xc2\xaa", "laquo", "\xc2\xab",
109	"not", "\xc2\xac", "shy", "\xc2\xad", "reg", "\xc2\xae",
110	"macr", "\xc2\xaf", "deg", "\xc2\xb0", "plusmn", "\xc2\xb1",
111	"sup2", "\xc2\xb2", "sup3", "\xc2\xb3", "acute", "\xc2\xb4",
112	"micro", "\xc2\xb5", "para", "\xc2\xb6", "middot", "\xc2\xb7",
113	"cedil", "\xc2\xb8", "sup1", "\xc2\xb9", "ordm", "\xc2\xba",
114	"raquo", "\xc2\xbb", "frac14", "\xc2\xbc", "frac12", "\xc2\xbd",
115	"frac34", "\xc2\xbe", "iquest", "\xc2\xbf", "Agrave", "\xc3\x80",
116	"Aacute", "\xc3\x81", "Acirc", "\xc3\x82", "Atilde", "\xc3\x83",
117	"Auml", "\xc3\x84", "Aring", "\xc3\x85", "AElig", "\xc3\x86",
118	"Ccedil", "\xc3\x87", "Egrave", "\xc3\x88", "Eacute", "\xc3\x89",
119	"Ecirc", "\xc3\x8a", "Euml", "\xc3\x8b", "Igrave", "\xc3\x8c",
120	"Iacute", "\xc3\x8d", "Icirc", "\xc3\x8e", "Iuml", "\xc3\x8f",
121	"ETH", "\xc3\x90", "Ntilde", "\xc3\x91", "Ograve", "\xc3\x92",
122	"Oacute", "\xc3\x93", "Ocirc", "\xc3\x94", "Otilde", "\xc3\x95",
123	"Ouml", "\xc3\x96", "times", "\xc3\x97", "Oslash", "\xc3\x98",
124	"Ugrave", "\xc3\x99", "Uacute", "\xc3\x9a", "Ucirc", "\xc3\x9b",
125	"Uuml", "\xc3\x9c", "Yacute", "\xc3\x9d", "THORN", "\xc3\x9e",
126	"szlig", "\xc3\x9f", "agrave", "\xc3\xa0", "aacute", "\xc3\xa1",
127	"acirc", "\xc3\xa2", "atilde", "\xc3\xa3", "auml", "\xc3\xa4",
128	"aring", "\xc3\xa5", "aelig", "\xc3\xa6", "ccedil", "\xc3\xa7",
129	"egrave", "\xc3\xa8", "eacute", "\xc3\xa9", "ecirc", "\xc3\xaa",
130	"euml", "\xc3\xab", "igrave", "\xc3\xac", "iacute", "\xc3\xad",
131	"icirc", "\xc3\xae", "iuml", "\xc3\xaf", "eth", "\xc3\xb0",
132	"ntilde", "\xc3\xb1", "ograve", "\xc3\xb2", "oacute", "\xc3\xb3",
133	"ocirc", "\xc3\xb4", "otilde", "\xc3\xb5", "ouml", "\xc3\xb6",
134	"divide", "\xc3\xb7", "oslash", "\xc3\xb8", "ugrave", "\xc3\xb9",
135	"uacute", "\xc3\xba", "ucirc", "\xc3\xbb", "uuml", "\xc3\xbc",
136	"yacute", "\xc3\xbd", "thorn", "\xc3\xbe", "yuml", "\xc3\xbf",
137	/* ISO-10646 */
138	"fnof", "\xc6\x92", "Alpha", "\xce\x91", "Beta", "\xce\x92",
139	"Gamma", "\xce\x93", "Delta", "\xce\x94", "Epsilon", "\xce\x95",
140	"Zeta", "\xce\x96", "Eta", "\xce\x97", "Theta", "\xce\x98",
141	"Iota", "\xce\x99", "Kappa", "\xce\x9a", "Lambda", "\xce\x9b",
142	"Mu", "\xce\x9c", "Nu", "\xce\x9d", "Xi", "\xce\x9e",
143	"Omicron", "\xce\x9f", "Pi", "\xce\xa0", "Rho", "\xce\xa1",
144	"Sigma", "\xce\xa3", "Tau", "\xce\xa4", "Upsilon", "\xce\xa5",
145	"Phi", "\xce\xa6", "Chi", "\xce\xa7", "Psi", "\xce\xa8",
146	"Omega", "\xce\xa9", "alpha", "\xce\xb1", "beta", "\xce\xb2",
147	"gamma", "\xce\xb3", "delta", "\xce\xb4", "epsilon", "\xce\xb5",
148	"zeta", "\xce\xb6", "eta", "\xce\xb7", "theta", "\xce\xb8",
149	"iota", "\xce\xb9", "kappa", "\xce\xba", "lambda", "\xce\xbb",
150	"mu", "\xce\xbc", "nu", "\xce\xbd", "xi", "\xce\xbe",
151	"omicron", "\xce\xbf", "pi", "\xcf\x80", "rho", "\xcf\x81",
152	"sigmaf", "\xcf\x82", "sigma", "\xcf\x83", "tau", "\xcf\x84",
153	"upsilon", "\xcf\x85", "phi", "\xcf\x86", "chi", "\xcf\x87",
154	"psi", "\xcf\x88", "omega", "\xcf\x89", "thetasym", "\xcf\x91",
155	"upsih", "\xcf\x92", "piv", "\xcf\x96", "bull", "\xe2\x80\xa2",
156	"hellip", "\xe2\x80\xa6", "prime", "\xe2\x80\xb2", "Prime", "\xe2\x80\xb3",
157	"oline", "\xe2\x80\xbe", "frasl", "\xe2\x81\x84", "weierp", "\xe2\x84\x98",
158	"image", "\xe2\x84\x91", "real", "\xe2\x84\x9c", "trade", "\xe2\x84\xa2",
159	"alefsym", "\xe2\x84\xb5", "larr", "\xe2\x86\x90", "uarr", "\xe2\x86\x91",
160	"rarr", "\xe2\x86\x92", "darr", "\xe2\x86\x93", "harr", "\xe2\x86\x94",
161	"crarr", "\xe2\x86\xb5", "lArr", "\xe2\x87\x90", "uArr", "\xe2\x87\x91",
162	"rArr", "\xe2\x87\x92", "dArr", "\xe2\x87\x93", "hArr", "\xe2\x87\x94",
163	"forall", "\xe2\x88\x80", "part", "\xe2\x88\x82", "exist", "\xe2\x88\x83",
164	"empty", "\xe2\x88\x85", "nabla", "\xe2\x88\x87", "isin", "\xe2\x88\x88",
165	"notin", "\xe2\x88\x89", "ni", "\xe2\x88\x8b", "prod", "\xe2\x88\x8f",
166	"sum", "\xe2\x88\x91", "minus", "\xe2\x88\x92", "lowast", "\xe2\x88\x97",
167	"radic", "\xe2\x88\x9a", "prop", "\xe2\x88\x9d", "infin", "\xe2\x88\x9e",
168	"ang", "\xe2\x88\xa0", "and", "\xe2\x88\xa7", "or", "\xe2\x88\xa8",
169	"cap", "\xe2\x88\xa9", "cup", "\xe2\x88\xaa", "int", "\xe2\x88\xab",
170	"there4", "\xe2\x88\xb4", "sim", "\xe2\x88\xbc", "cong", "\xe2\x89\x85",
171	"asymp", "\xe2\x89\x88", "ne", "\xe2\x89\xa0", "equiv", "\xe2\x89\xa1",
172	"le", "\xe2\x89\xa4", "ge", "\xe2\x89\xa5", "sub", "\xe2\x8a\x82",
173	"sup", "\xe2\x8a\x83", "nsub", "\xe2\x8a\x84", "sube", "\xe2\x8a\x86",
174	"supe", "\xe2\x8a\x87", "oplus", "\xe2\x8a\x95", "otimes", "\xe2\x8a\x97",
175	"perp", "\xe2\x8a\xa5", "sdot", "\xe2\x8b\x85", "lceil", "\xe2\x8c\x88",
176	"rceil", "\xe2\x8c\x89", "lfloor", "\xe2\x8c\x8a", "rfloor", "\xe2\x8c\x8b",
177	"lang", "\xe2\x8c\xa9", "rang", "\xe2\x8c\xaa", "loz", "\xe2\x97\x8a",
178	"spades", "\xe2\x99\xa0", "clubs", "\xe2\x99\xa3", "hearts", "\xe2\x99\xa5",
179	"diams", "\xe2\x99\xa6", "OElig", "\xc5\x92", "oelig", "\xc5\x93",
180	"Scaron", "\xc5\xa0", "scaron", "\xc5\xa1", "Yuml", "\xc5\xb8",
181	"circ", "\xcb\x86", "tilde", "\xcb\x9c", "ensp", "\xe2\x80\x82",
182	"emsp", "\xe2\x80\x83", "thinsp", "\xe2\x80\x89", "zwnj", "\xe2\x80\x8c",
183	"zwj", "\xe2\x80\x8d", "lrm", "\xe2\x80\x8e", "rlm", "\xe2\x80\x8f",
184	"ndash", "\xe2\x80\x93", "mdash", "\xe2\x80\x94", "lsquo", "\xe2\x80\x98",
185	"rsquo", "\xe2\x80\x99", "sbquo", "\xe2\x80\x9a", "ldquo", "\xe2\x80\x9c",
186	"rdquo", "\xe2\x80\x9d", "bdquo", "\xe2\x80\x9e", "dagger", "\xe2\x80\xa0",
187	"Dagger", "\xe2\x80\xa1", "permil", "\xe2\x80\xb0", "lsaquo", "\xe2\x80\xb9",
188	"rsaquo", "\xe2\x80\xba", "euro", "\xe2\x82\xac",
189	NULL, NULL
190	};	79	}
191		80
192	HtmlParser::HtmlParser()	81	HtmlParser::HtmlParser()
193	{	82	{
		83	// RECOLL: no need to initialize these entities, we use those from
		84	// myhtmlparse
		85	#if 0
		86	static const struct ent { const char *n; unsigned int v; } ents[] = {
		87	#include "namedentities.h"
		88	{ NULL, 0 }
		89	};
194	if (named_ents.empty()) {	90	if (named_ents.empty()) {
195	for (int i = 0;;) {	91	const struct ent *i = ents;
196	const char *ent;	92	while (i->n) {
197	const char *val;
198	ent = epairs[i++];
199	if (ent == 0)
200	break;
201	val = epairs[i++];
202	if (val == 0)
203	break;
204	named_ents[string(ent)] = val;	93	named_ents[string(i->n)] = i->v;
		94	++i;
205	}	95	}
206	}	96	}
		97	#endif
207	}	98	}
208		99
209	void	100	void
210	HtmlParser::decode_entities(string &s)	101	HtmlParser::decode_entities(string &s)
211	{	102	{
212	// This has no meaning whatsoever if the character encoding is unknown,	103	// Not used for recoll. Kept here to minimize the amount of diffs
213	// so don't do it. If charset known, caller has converted text to utf-8,	104	#if 0
214	// and this is also how we translate entities
215	// if (charset != "utf-8")
216	// return;
217
218	// We need a const_iterator version of s.end() - otherwise the	105	// We need a const_iterator version of s.end() - otherwise the
219	// find() and find_if() templates don't work...	106	// find() and find_if() templates don't work...
220	string::const_iterator amp = s.begin(), s_end = s.end();	107	string::const_iterator amp = s.begin(), s_end = s.end();
221	while ((amp = find(amp, s_end, '&')) != s_end) {	108	while ((amp = find(amp, s_end, '&')) != s_end) {
222	unsigned int val = 0;	109	unsigned int val = 0;
223	string::const_iterator end, p = amp + 1;	110	string::const_iterator end, p = amp + 1;
224	string subs;
225	if (p != s_end && *p == '#') {	111	if (p != s_end && *p == '#') {
226	p++;	112	p++;
227	if (p != s_end && tolower(*p) == 'x') {	113	if (p != s_end && (p == 'x' \|\| p == 'X')) {
228	// hex	114	// hex
229	p++;	115	p++;
230	end = find_if(p, s_end, p_notxdigit);	116	end = find_if(p, s_end, p_notxdigit);
231	sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);	117	sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
232	} else {	118	} else {
	...		...
235	val = atoi(s.substr(p - s.begin(), end - p).c_str());	121	val = atoi(s.substr(p - s.begin(), end - p).c_str());
236	}	122	}
237	} else {	123	} else {
238	end = find_if(p, s_end, p_notalnum);	124	end = find_if(p, s_end, p_notalnum);
239	string code = s.substr(p - s.begin(), end - p);	125	string code = s.substr(p - s.begin(), end - p);
240	map<string, string>::const_iterator i;	126	map<string, unsigned int>::const_iterator i;
241	i = named_ents.find(code);	127	i = named_ents.find(code);
242	if (i != named_ents.end())	128	if (i != named_ents.end()) val = i->second;
243	subs = i->second;
244	}	129	}
245
246	if (end < s_end && *end == ';')	130	if (end < s_end && *end == ';') end++;
247	end++;
248
249	if (val) {	131	if (val) {
250	// The code is the code position for a unicode char. We need
251	// to translate it to an utf-8 string.
252	string utf16be;
253	utf16be += char(val / 256);
254	utf16be += char(val % 256);
255	transcode(utf16be, subs, "UTF-16BE", "UTF-8");
256	}
257
258	if (subs.length() > 0) {
259	string::size_type amp_pos = amp - s.begin();	132	string::size_type amp_pos = amp - s.begin();
		133	if (val < 0x80) {
		134	s.replace(amp_pos, end - amp, 1u, char(val));
		135	} else {
		136	// Convert unicode value val to UTF-8.
		137	char seq[4];
		138	unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
260	s.replace(amp_pos, end - amp, subs);	139	s.replace(amp_pos, end - amp, seq, len);
		140	}
261	s_end = s.end();	141	s_end = s.end();
262	// We've modified the string, so the iterators are no longer	142	// We've modified the string, so the iterators are no longer
263	// valid...	143	// valid...
264	amp = s.begin() + amp_pos + subs.length();	144	amp = s.begin() + amp_pos + 1;
265	} else {	145	} else {
266	amp = end;	146	amp = end;
267	}	147	}
268	}	148	}
		149	#endif
269	}	150	}
270		151
271	void	152	void
272	HtmlParser::parse_html(const string &body)	153	HtmlParser::parse_html(const string &body)
273	{	154	{
		155	in_script = false;
		156
274	map<string,string> Param;	157	map<string,string> Param;
275	string::const_iterator start = body.begin();	158	string::const_iterator start = body.begin();
276		159
277	while (1) {	160	while (true) {
		161	// Skip through until we find an HTML tag, a comment, or the end of
		162	// document. Ignore isolated occurences of `<' which don't start
		163	// a tag or comment.
278	string::const_iterator p = start;	164	string::const_iterator p = start;
279
280	// Eat text until we find an HTML tag, a comment, or the end
281	// of document. Ignore isolated occurences of `<' which don't
282	// start a tag or comment
283	while (1) {	165	while (true) {
284	p = find(p, body.end(), '<');	166	p = find(p, body.end(), '<');
285	if (p == body.end()) break;	167	if (p == body.end()) break;
286	char ch = *(p + 1);	168	unsigned char ch = *(p + 1);
		169
287	// tag, closing tag, comment (or SGML declaration), or PHP	170	// Tag, closing tag, or comment (or SGML declaration).
288	if (isalpha(ch) \|\| ch == '/' \|\| ch == '!' \|\| ch == '?') break;	171	if ((!in_script && isalpha(ch)) \|\| ch == '/' \|\| ch == '!') break;
		172	if (ch == '?') {
		173	// PHP code or XML declaration.
		174	// XML declaration is only valid at the start of the first line.
		175	// FIXME: need to deal with BOMs...
		176	if (p != body.begin() \|\| body.size() < 20) break;
		177
		178	// XML declaration looks something like this:
		179	// <?xml version="1.0" encoding="UTF-8"?>
		180	if (p[2] != 'x' \|\| p[3] != 'm' \|\| p[4] != 'l') break;
		181	if (strchr(" \t\r\n", p[5]) == NULL) break;
		182
		183	string::const_iterator decl_end = find(p + 6, body.end(), '?');
		184	if (decl_end == body.end()) break;
		185
		186	// Default charset for XML is UTF-8.
		187	charset = "UTF-8";
		188
		189	string decl(p + 6, decl_end);
		190	size_t enc = decl.find("encoding");
		191	if (enc == string::npos) break;
		192
		193	enc = decl.find_first_not_of(" \t\r\n", enc + 8);
		194	if (enc == string::npos \|\| enc == decl.size()) break;
		195
		196	if (decl[enc] != '=') break;
		197
		198	enc = decl.find_first_not_of(" \t\r\n", enc + 1);
		199	if (enc == string::npos \|\| enc == decl.size()) break;
		200
		201	if (decl[enc] != '"' && decl[enc] != '\'') break;
		202
		203	char quote = decl[enc++];
		204	size_t enc_end = decl.find(quote, enc);
		205
		206	if (enc != string::npos)
		207	charset = decl.substr(enc, enc_end - enc);
		208
		209	break;
		210	}
289	p++;	211	p++;
290	}	212	}
291		213
292	// Process text	214	// Process text up to start of tag.
293	if (p > start \|\| p == body.end()) {	215	if (p > start \|\| p == body.end()) {
294	string text = body.substr(start - body.begin(), p - start);	216	string text = body.substr(start - body.begin(), p - start);
295	decode_entities(text);	217	decode_entities(text);
296	process_text(text);	218	process_text(text);
297	}	219	}
	...		...
308	if (*start == '!') {	230	if (*start == '!') {
309	if (++start == body.end()) break;	231	if (++start == body.end()) break;
310	if (++start == body.end()) break;	232	if (++start == body.end()) break;
311	// comment or SGML declaration	233	// comment or SGML declaration
312	if ((start - 1) == '-' && start == '-') {	234	if ((start - 1) == '-' && start == '-') {
313	start = find(start + 1, body.end(), '>');	235	++start;
		236	string::const_iterator close = find(start, body.end(), '>');
314	// unterminated comment swallows rest of document	237	// An unterminated comment swallows rest of document
315	// (like NS, but unlike MSIE iirc)	238	// (like Netscape, but unlike MSIE IIRC)
316	if (start == body.end()) break;	239	if (close == body.end()) break;
317		240
318	p = start;	241	p = close;
319	// look for -->	242	// look for -->
320	while (p != body.end() && ((p - 1) != '-' \|\| (p - 2) != '-'))	243	while (p != body.end() && ((p - 1) != '-' \|\| (p - 2) != '-'))
321	p = find(p + 1, body.end(), '>');	244	p = find(p + 1, body.end(), '>');
322		245
		246	if (p != body.end()) {
		247	// Check for htdig's "ignore this bit" comments.
		248	if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
		249	string::size_type i;
		250	i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
		251	if (i == string::npos) break;
		252	start = body.begin() + i + 21;
		253	continue;
		254	}
323	// If we found --> skip to there, otherwise	255	// If we found --> skip to there.
		256	start = p;
		257	} else {
324	// skip to the first > we found (as Netscape does)	258	// Otherwise skip to the first > we found (as Netscape does).
325	if (p != body.end()) start = p;	259	start = close;
		260	}
326	} else {	261	} else {
327	// just an SGML declaration, perhaps giving the DTD - ignore it	262	// just an SGML declaration, perhaps giving the DTD - ignore it
328	start = find(start - 1, body.end(), '>');	263	start = find(start - 1, body.end(), '>');
329	if (start == body.end()) break;	264	if (start == body.end()) break;
330	}	265	}
	...		...
352		287
353	p = start;	288	p = start;
354	start = find_if(start, body.end(), p_nottag);	289	start = find_if(start, body.end(), p_nottag);
355	string tag = body.substr(p - body.begin(), start - p);	290	string tag = body.substr(p - body.begin(), start - p);
356	// convert tagname to lowercase	291	// convert tagname to lowercase
357	for (string::iterator i = tag.begin(); i != tag.end(); i++)	292	for (string::iterator i = tag.begin(); i != tag.end(); ++i)
358	i = tolower(i);	293	i = tolower(static_cast<unsigned char>(i));
359		294
360	if (closing) {	295	if (closing) {
361	closing_tag(tag);	296	closing_tag(tag);
		297	if (in_script && tag == "script") in_script = false;
362		298
363	/* ignore any bogus parameters on closing tags */	299	/* ignore any bogus parameters on closing tags */
364	p = find(start, body.end(), '>');	300	p = find(start, body.end(), '>');
365	if (p == body.end()) break;	301	if (p == body.end()) break;
366	start = p + 1;	302	start = p + 1;
	...		...
400	}	336	}
401		337
402	if (name.size()) {	338	if (name.size()) {
403	// convert parameter name to lowercase	339	// convert parameter name to lowercase
404	string::iterator i;	340	string::iterator i;
405	for (i = name.begin(); i != name.end(); i++)	341	for (i = name.begin(); i != name.end(); ++i)
406	i = tolower(i);	342	i = tolower(static_cast<unsigned char>(i));
407	// in case of multiple entries, use the first	343	// in case of multiple entries, use the first
408	// (as Netscape does)	344	// (as Netscape does)
409	if (Param.find(name) == Param.end())	345	if (Param.find(name) == Param.end())
410	Param[name] = value;	346	Param[name] = value;
411	}	347	}
412	}	348	}
413	}	349	}
414	opening_tag(tag, Param);	350	opening_tag(tag, Param);
415	Param.clear();	351	Param.clear();
416		352
		353	// In <script> tags we ignore opening tags to avoid problems
		354	// with "a<b".
		355	if (tag == "script") in_script = true;
		356
417	if (start != body.end() && *start == '>') ++start;	357	if (start != body.end() && *start == '>') ++start;
418	}	358	}
419	}	359	}
420	}	360	}
421	}	361	}