--- a
+++ b/src/python/pychm/chm/extra.c
@@ -0,0 +1,803 @@
+/*
+ * extra.c - full-text search support for pychm
+ *
+ * Copyright (C) 2004 Rubens Ramos <rubensr@users.sourceforge.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Author: Rubens Ramos <rubensr@users.sourceforge.net>
+ *
+ * Heavily based on work done by:
+ * Pabs <pabs@zip.to> - chmdeco
+ * Razvan Cojocaru <razvanco@gmx.net> - xCHM
+ *
+ */
+
+#include "chm_lib.h"
+#ifdef __PYTHON__
+#include "Python.h"
+#else
+#include <stdio.h>
+#define PyObject void
+#endif
+#ifdef __PYTHON__ /* PyObject_HEAD is only defined when Python.h was included above */
+typedef struct { /* local view of the wrapper object; this file reads only `ptr` */
+  PyObject_HEAD
+  void *ptr; /* cast to struct chmFile * by the module functions below */
+  void *ty;
+  int own;
+  PyObject *next;
+#ifdef SWIGPYTHON_BUILTIN
+  PyObject *dict;
+#endif
+} SwigPyObject; /* NOTE(review): presumably has to mirror SWIG's own layout - confirm */
+#endif /* __PYTHON__ */
+#include <stdlib.h>
+
+#if defined(_WIN32) || defined(__WIN32__) /* Windows: choose the MODEXPORT/MODIMPORT decoration per compiler */
+# if defined(_MSC_VER)
+# if defined(STATIC_LINKED)
+# define MODEXPORT(a) a
+# define MODIMPORT(a) extern a
+# else
+# define MODEXPORT(a) __declspec(dllexport) a
+# define MODIMPORT(a) extern a
+# endif
+#define uint64_t unsigned long long /* MSVC branch: the fixed-width names are supplied as macros */
+#define uint32_t unsigned int
+#define uint16_t unsigned short
+#define uint8_t unsigned char
+#define size_t int /* NOTE(review): redefines size_t as a signed int for the rest of this file - confirm this is intended */
+#define strcasecmp _stricmp /* the code below calls strcasecmp/strncasecmp; map them to _stricmp/_strnicmp here */
+#define strncasecmp _strnicmp
+# else
+# if defined(__BORLANDC__)
+# define MODEXPORT(a) a _export
+# define MODIMPORT(a) a _export
+# else
+# define MODEXPORT(a) a
+# define MODIMPORT(a) a
+# endif
+# endif
+#else /* everything else: no decoration; inttypes.h and strings.h are included instead of the macros above */
+# define MODEXPORT(a) a
+# define MODIMPORT(a) a
+#include <inttypes.h>
+#include <strings.h>
+#endif
+
+#define false 0
+#define true 1
+
+#define FTS_HEADER_LEN 0x32
+#define TOPICS_ENTRY_LEN 16
+#define COMMON_BUF_LEN 1025
+
+#define FREE(x) free (x); x = NULL
+
+static uint16_t
+get_uint16 (uint8_t* b) {
+  /* Little-endian 16-bit read: b[0] is the low byte, b[1] the high byte. */
+  return (uint16_t)((b[1] << 8) | b[0]);
+}
+
+static uint32_t
+get_uint32 (uint8_t* b) {
+  /* Little-endian 32-bit read of b[0..3]. Each byte is widened to uint32_t
+     first: shifting an int-promoted byte >= 0x80 left by 24 is undefined. */
+  return (uint32_t)b[0] | (uint32_t)b[1]<<8 |
+         (uint32_t)b[2]<<16 | (uint32_t)b[3]<<24;
+}
+
+static uint64_t
+be_encint (unsigned char *buffer, size_t *length)
+{
+  /* Decodes a variable-length integer (7 payload bits per byte, low group first,
+     bit 7 set = another byte follows); *length receives the bytes consumed. */
+  uint64_t result = 0;
+  int shift = 0;
+  *length = 0;
+  do {
+    /* Widen before shifting (an int shift broke past four bytes); drop groups that cannot fit in 64 bits. */
+    result |= (shift < 64) ? (uint64_t)((*buffer) & 0x7f) << shift : 0;
+    shift += 7;
+    *length = *length + 1;
+  } while (*(buffer++) & 0x80);
+  return result;
+}
+
+/*
+ Counts the run of set bits that starts at bit *bit of *byte, scanning from
+ bit 7 down to bit 0 and then on into the following byte, and steps past the
+ clear bit that ends the run. Returns the run length; *length receives the
+ number of byte boundaries crossed. The end of the buffer is not checked.
+*/
+static int
+ffus (unsigned char* byte, int* bit, size_t *length) {
+  int run = 0;
+  *length = 0;
+  for (;;) {
+    int is_set = (*byte & (1 << *bit)) != 0;
+
+    /* Advance the cursor by one bit, wrapping from bit 0 to bit 7. */
+    if (*bit == 0) {
+      ++byte;
+      ++(*length);
+      *bit = 7;
+    } else {
+      --(*bit);
+    }
+
+    if (!is_set)
+      break;
+    ++run;
+  }
+
+  return run;
+}
+
+
+static uint64_t
+sr_int(unsigned char* byte, int* bit,
+ unsigned char s, unsigned char r, size_t *length)
+{
+ uint64_t ret;
+ unsigned char mask;
+ int n, n_bits, num_bits, base, count;
+ size_t fflen;
+ /* Decodes one integer from the bit stream at (byte, *bit): a run of set bits, counted by ffus, selects how many value bits follow. */
+ *length = 0;
+ /* Only s == 2 is handled; any other s, a NULL bit pointer or *bit > 7 yields all-ones with *length left at 0. */
+ if(!bit || *bit > 7 || s != 2)
+ return ~(uint64_t)0;
+ ret = 0;
+ /* count = length of the leading run of set bits; ffus also moves *bit past the clear bit that ends it. */
+ count = ffus(byte, bit, &fflen);
+ *length += fflen;
+ byte += *length;
+ /* n value bits follow: r of them, plus count-1 more when the run was not empty. */
+ n_bits = n = r + (count ? count-1 : 0) ;
+ /* Shift those n bits into ret, most significant first, taking at most the remainder of the current byte per pass. */
+ while (n > 0) {
+ num_bits = n > *bit ? *bit : n-1;
+ base = n > *bit ? 0 : *bit - (n-1);
+ /* mask = the low num_bits+1 bits; the default arm is taken only when num_bits is outside 0..7. */
+ switch (num_bits){
+ case 0:
+ mask = 1;
+ break;
+ case 1:
+ mask = 3;
+ break;
+ case 2:
+ mask = 7;
+ break;
+ case 3:
+ mask = 0xf;
+ break;
+ case 4:
+ mask = 0x1f;
+ break;
+ case 5:
+ mask = 0x3f;
+ break;
+ case 6:
+ mask = 0x7f;
+ break;
+ case 7:
+ mask = 0xff;
+ break;
+ default:
+ mask = 0xff;
+ break;
+ }
+ /* Move the mask up to the cursor position and append the selected bits to ret. */
+ mask <<= base;
+ ret = (ret << (num_bits+1)) |
+ (uint64_t)((*byte & mask) >> base);
+ /* Either the current byte is used up (continue at bit 7 of the next byte) or all n bits have been read. */
+ if( n > *bit ){
+ ++byte;
+ ++(*length);
+ n -= *bit+1;
+ *bit = 7;
+ } else {
+ *bit -= n;
+ n = 0;
+ }
+ }
+ /* A non-empty run adds an implicit 1 just above the n_bits value bits. */
+ if(count)
+ ret |= (uint64_t)1 << n_bits; /* NOTE(review): undefined for n_bits >= 64 - confirm inputs keep it smaller */
+
+ return ret;
+}
+
+
+static uint32_t
+get_leaf_node_offset(struct chmFile *chmfile,
+                     const char *text,
+                     uint32_t initial_offset,
+                     uint32_t buff_size,
+                     uint16_t tree_depth,
+                     struct chmUnitInfo *ui)
+{
+  /* Descends tree_depth - 1 levels of index nodes (each buff_size bytes, read
+     from `ui`), starting at initial_offset and following, in every node, the
+     first entry whose word is not less than `text` (case-insensitive).
+     Returns the offset reached, or 0 on allocation failure, read failure,
+     a malformed node, or when a node gives no new offset to follow. */
+  unsigned char *buffer = malloc (buff_size);
+  char *word = NULL;
+  uint32_t test_offset = 0;
+  uint32_t result = 0;
+
+  if (NULL == buffer)
+    return 0;
+
+  while (--tree_depth) {
+    /* Entries start right after the 16-bit free-space field. The cursor is
+       reset for every node; it used to carry over from the previous level. */
+    uint32_t i = sizeof(uint16_t);
+    uint32_t used;
+
+    if (initial_offset == test_offset)
+      goto done;
+    test_offset = initial_offset;
+    if (chm_retrieve_object (chmfile, ui, buffer,
+                             initial_offset, buff_size) == 0)
+      goto done;
+    if (get_uint16 (buffer) > buff_size)
+      goto done;
+    used = buff_size - get_uint16 (buffer);
+
+    while (i < used) {
+      /* Entry layout: length byte, shared-prefix byte, length-1 suffix
+         bytes, 32-bit child offset, then 16 more bits that are skipped. */
+      unsigned char word_len = buffer[i];
+      unsigned char pos;
+      size_t prev_len = word ? strlen (word) : 0;
+      char *grown;
+      /* Reject entries that would read outside the node or splice the
+         suffix beyond the end of the previous word. */
+      if (word_len == 0 ||
+          (uint64_t)i + word_len + 1 + sizeof(uint32_t) > buff_size)
+        goto done;
+      pos = buffer[i + 1];
+      if (pos > prev_len)
+        goto done;
+      grown = realloc (word, (size_t)pos + word_len);
+      if (NULL == grown)
+        goto done;
+      word = grown;
+      memcpy (word + pos, buffer + i + 2, word_len - 1);
+      word[pos + word_len - 1] = 0;
+      if (strcasecmp (text, word) <= 0) {
+        initial_offset = get_uint32 (buffer + i + word_len + 1);
+        break;
+      }
+      i += word_len + sizeof (unsigned char) + sizeof(uint32_t) + sizeof(uint16_t);
+    }
+  }
+  if (initial_offset != test_offset)
+    result = initial_offset;
+done:
+  free (word);
+  free (buffer);
+  return result;
+}
+
+static int
+pychm_process_wlc (struct chmFile *chmfile,
+                   uint64_t wlc_count, uint64_t wlc_size,
+                   uint32_t wlc_offset, unsigned char ds,
+                   unsigned char dr, unsigned char cs,
+                   unsigned char cr, unsigned char ls,
+                   unsigned char lr, struct chmUnitInfo *uimain,
+                   struct chmUnitInfo* uitbl,
+                   struct chmUnitInfo *uistrings,
+                   struct chmUnitInfo* topics,
+                   struct chmUnitInfo *urlstr,
+                   PyObject *dict)
+{
+  /* Reads the wlc_size-byte block at wlc_offset of `uimain` and walks its
+     wlc_count entries. Each entry starts on a byte boundary and holds a
+     document-index delta (ds/dr), a location count (cs/cr) and that many
+     location codes (ls/lr), which are decoded only to skip over them.
+     The running index selects a record in `topics`; its 32-bit fields at +4
+     and +8 lead to the title in `uistrings` and, via `uitbl` and `urlstr`,
+     to the URL. The Python build stores dict[title] = url, the standalone
+     build prints the pair. Returns true if every entry was processed, false
+     on allocation, read, bounds or store failure (a Python error stays set). */
+  uint32_t stroff, urloff;
+  uint64_t i, j, count;
+  size_t length;
+  int wlc_bit = 7;
+  size_t off = 0;
+  uint64_t index = 0;
+  int ok = false;
+  unsigned char entry[TOPICS_ENTRY_LEN];
+  unsigned char combuf[COMMON_BUF_LEN];
+  unsigned char *buffer = malloc (wlc_size);
+  char *url = NULL;
+  char *topic = NULL;
+
+  if (NULL == buffer)
+    return false;
+  if (chm_retrieve_object(chmfile, uimain, buffer,
+                          wlc_offset, wlc_size) == 0)
+    goto done;
+  /* No read below fills more than COMMON_BUF_LEN - 1 bytes, so this terminator stays put. */
+  combuf[COMMON_BUF_LEN - 1] = 0;
+  for (i = 0; i < wlc_count; ++i) {
+    if(wlc_bit != 7) {
+      ++off;
+      wlc_bit = 7;
+    }
+    /* Stop rather than start decoding past the end of the block. */
+    if (off >= wlc_size)
+      goto done;
+    index += sr_int(buffer + off, &wlc_bit, ds, dr, &length);
+    off += length;
+
+    if(chm_retrieve_object(chmfile, topics, entry,
+                           index * TOPICS_ENTRY_LEN, TOPICS_ENTRY_LEN) == 0)
+      goto done;
+
+    stroff = get_uint32 (entry + 4);
+    free (topic);
+    if (chm_retrieve_object (chmfile, uistrings, combuf,
+                             stroff, COMMON_BUF_LEN - 1) == 0)
+      topic = strdup ("Untitled in index");
+    else
+      topic = strdup ((char *)combuf);
+
+    urloff = get_uint32 (entry + 8);
+    if(chm_retrieve_object (chmfile, uitbl, combuf,
+                            urloff, 12) == 0)
+      goto done;
+
+    urloff = get_uint32 (combuf + 8);
+    if (chm_retrieve_object (chmfile, urlstr, combuf,
+                             urloff + 8, COMMON_BUF_LEN - 1) == 0)
+      goto done;
+
+    free (url);
+    url = strdup ((char *)combuf);
+
+    if (url && topic) {
+#ifdef __PYTHON__
+#if PY_MAJOR_VERSION >= 3
+      PyObject *key = PyBytes_FromStringAndSize(topic, strlen(topic));
+      PyObject *val = PyBytes_FromStringAndSize(url, strlen(url));
+#else
+      PyObject *key = PyString_FromString (topic);
+      PyObject *val = PyString_FromString (url);
+#endif
+      int stored = (key && val) ? PyDict_SetItem(dict, key, val) : -1;
+      /* PyDict_SetItem does not take over our references: drop them. */
+      Py_XDECREF(key);
+      Py_XDECREF(val);
+      if (stored != 0)
+        goto done;
+#else
+      printf ("%s ==> %s\n", url, topic);
+#endif
+    }
+
+    if (off >= wlc_size)
+      goto done;
+    count = sr_int (buffer + off, &wlc_bit, cs, cr, &length);
+    off += length;
+
+    for (j = 0; j < count; ++j) {
+      if (off >= wlc_size)
+        goto done;
+      sr_int (buffer + off, &wlc_bit, ls, lr, &length);
+      off += length;
+    }
+  }
+  ok = true;
+done:
+  free (topic);
+  free (url);
+  free (buffer);
+  return ok;
+}
+
+static int
+chm_search (struct chmFile *chmfile,
+ const char *text, int whole_words,
+ int titles_only, PyObject *dict)
+{ /* Looks `text` up in the word index of /$FIftiMain and hands each matching word's location block to pychm_process_wlc together with `dict`. */
+ unsigned char header[FTS_HEADER_LEN];
+ unsigned char doc_index_s;
+ unsigned char doc_index_r;
+ unsigned char code_count_s;
+ unsigned char code_count_r;
+ unsigned char loc_codes_s;
+ unsigned char loc_codes_r;
+ unsigned char word_len, pos;
+ unsigned char *buffer;
+ char *word = NULL;
+ uint32_t node_offset;
+ uint32_t node_len;
+ uint16_t tree_depth;
+ uint32_t i;
+ uint16_t free_space;
+ uint64_t wlc_count, wlc_size;
+ uint32_t wlc_offset;
+ char *wrd_buf;
+ unsigned char title;
+ size_t encsz;
+ struct chmUnitInfo ui, uitopics, uiurltbl, uistrings, uiurlstr;
+ int partial = false;
+ /* Returns: whole_words set -> pychm_process_wlc's result for the first word equal to `text` (ignoring case); otherwise true if any word starts with `text`; false when nothing matched or a step failed. */
+ if (NULL == text)
+ return -1; /* NOTE(review): -1 is non-zero, while every other failure path returns false - confirm callers expect this */
+ /* All five internal objects must resolve, otherwise the search cannot proceed. */
+ if (chm_resolve_object (chmfile, "/$FIftiMain", &ui) !=
+ CHM_RESOLVE_SUCCESS ||
+ chm_resolve_object (chmfile, "/#TOPICS", &uitopics) !=
+ CHM_RESOLVE_SUCCESS ||
+ chm_resolve_object (chmfile, "/#STRINGS", &uistrings) !=
+ CHM_RESOLVE_SUCCESS ||
+ chm_resolve_object (chmfile, "/#URLTBL", &uiurltbl) !=
+ CHM_RESOLVE_SUCCESS ||
+ chm_resolve_object (chmfile, "/#URLSTR", &uiurlstr) !=
+ CHM_RESOLVE_SUCCESS)
+ return false;
+
+ if(chm_retrieve_object(chmfile, &ui, header, 0, FTS_HEADER_LEN) == 0)
+ return false;
+ /* The s/r parameter pairs later passed to sr_int for document indexes, location counts and location codes. */
+ doc_index_s = header[0x1E];
+ doc_index_r = header[0x1F];
+ code_count_s = header[0x20];
+ code_count_r = header[0x21];
+ loc_codes_s = header[0x22];
+ loc_codes_r = header[0x23];
+ /* sr_int only handles s == 2, so any other value is rejected up front. */
+ if(doc_index_s != 2 || code_count_s != 2 || loc_codes_s != 2) {
+ return false;
+ }
+ /* Header fields handed to get_leaf_node_offset: starting node offset, node size in bytes, tree depth. */
+ node_offset = get_uint32 (header + 0x14);
+ node_len = get_uint32 (header + 0x2e);
+ tree_depth = get_uint16 (header + 0x18);
+
+ i = sizeof(uint16_t);
+ /* NOTE(review): the result of this malloc is not checked before buffer is passed to chm_retrieve_object below. */
+ buffer = malloc (node_len);
+
+ node_offset = get_leaf_node_offset (chmfile, text, node_offset, node_len,
+ tree_depth, &ui);
+
+ if (!node_offset) {
+ FREE(buffer);
+ return false;
+ }
+ /* Scan leaf nodes, starting with the one get_leaf_node_offset selected. */
+ do {
+
+ if (chm_retrieve_object (chmfile, &ui, buffer,
+ node_offset, node_len) == 0) {
+ FREE(word);
+ FREE(buffer);
+ return false;
+ }
+ /* Leaf layout as read here: bytes 0-3 give the next node_offset (see below), bytes 6-7 the free space, entries start at byte 8. */
+ free_space = get_uint16 (buffer + 6);
+
+ i = sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint16_t);
+
+ encsz = 0;
+
+ while (i < node_len - free_space) {
+ word_len = *(buffer + i);
+ pos = *(buffer + i + 1);
+ /* Rebuild the entry's word: word_len-1 suffix bytes placed at index `pos` of the previous word. NOTE(review): word_len == 0 and failed allocations are not handled. */
+ wrd_buf = malloc (word_len);
+ memcpy (wrd_buf, buffer + i + 2, word_len - 1);
+ wrd_buf[word_len - 1] = 0;
+
+ if (pos == 0) {
+ FREE(word);
+ word = (char *) strdup (wrd_buf);
+ } else {
+ word = realloc (word, word_len + pos + 1);
+ strcpy (word + pos, wrd_buf);
+ }
+
+ FREE(wrd_buf);
+ /* Rest of the entry: title byte, encoded WLC count, 32-bit WLC offset, 16 skipped bits, encoded WLC size. */
+ i += 2 + word_len;
+ title = *(buffer + i - 1);
+
+ wlc_count = be_encint (buffer + i, &encsz);
+ i += encsz;
+
+ wlc_offset = get_uint32 (buffer + i);
+
+ i += sizeof(uint32_t) + sizeof(uint16_t);
+ wlc_size = be_encint (buffer + i, &encsz);
+ i += encsz;
+
+ node_offset = get_uint32 (buffer);
+ /* With titles_only set, entries whose title byte is zero are skipped. */
+ if (!title && titles_only)
+ continue;
+ /* Whole-word mode: the first exact (case-insensitive) match ends the search. */
+ if (whole_words && !strcasecmp(text, word)) {
+ partial = pychm_process_wlc (chmfile, wlc_count, wlc_size,
+ wlc_offset, doc_index_s,
+ doc_index_r,code_count_s,
+ code_count_r, loc_codes_s,
+ loc_codes_r, &ui, &uiurltbl,
+ &uistrings, &uitopics,
+ &uiurlstr, dict);
+ FREE(word);
+ FREE(buffer);
+ return partial;
+ }
+ /* Prefix mode: process every word that starts with `text`; the result of pychm_process_wlc is ignored here. */
+ if (!whole_words) {
+ if (!strncasecmp (word, text, strlen(text))) {
+ partial = true;
+ pychm_process_wlc (chmfile, wlc_count, wlc_size,
+ wlc_offset, doc_index_s,
+ doc_index_r,code_count_s,
+ code_count_r, loc_codes_s,
+ loc_codes_r, &ui, &uiurltbl,
+ &uistrings, &uitopics,
+ &uiurlstr, dict);
+
+ } else if (strncasecmp (text, word, strlen(text)) < -1) /* NOTE(review): `< -1` relies on the magnitude of strncasecmp's result - confirm intent */
+ break;
+ }
+
+ }
+ } while (!whole_words &&
+ !strncmp (word, text, strlen(text)) && /* NOTE(review): case-sensitive, unlike the tests inside the loop; word is still NULL if no entry was read */
+ node_offset);
+
+ FREE(word);
+ FREE(buffer);
+
+ return partial;
+}
+
+/* An internal CHM object and the byte offset inside it at which a 32-bit LCID is read. */
+typedef struct {
+  const char *file;
+  int offset;
+} Langrec;
+/* Probed in order by chm_get_lcid. Like every other object path in this file, each needs its leading '/'. */
+static Langrec lang_files[] = {
+  {"/$FIftiMain", 0x7E},
+  {"/$WWKeywordLinks/BTree", 0x34},
+  {"/$WWAssociativeLinks/BTree", 0x34}
+};
+#define LANG_FILES_SIZE (sizeof(lang_files)/sizeof(Langrec))
+
+static int
+chm_get_lcid (struct chmFile *chmfile) {
+  /* Returns the archive's LCID, taken from the first lang_files entry that
+     resolves and yields a complete 32-bit value, or -1 if none does. */
+  struct chmUnitInfo ui;
+  unsigned char raw[sizeof(uint32_t)];
+  size_t i;
+  for (i = 0; i < LANG_FILES_SIZE; i++) {
+    if (chm_resolve_object (chmfile, lang_files[i].file, &ui) !=
+        CHM_RESOLVE_SUCCESS)
+      continue;
+    /* Decode the bytes as little-endian rather than reading them into a host
+       integer, and skip short reads that would leave part of it undefined. */
+    if (chm_retrieve_object (chmfile, &ui, raw, lang_files[i].offset,
+                             sizeof raw) == (int) sizeof raw)
+      return get_uint32 (raw);
+  }
+  return -1;
+}
+
+#ifdef __PYTHON__
+
+static PyObject *
+is_searchable (PyObject *self, PyObject *args) {
+  /* Python: is_searchable(chmfile) -> 1 if all five internal objects used by
+     the full-text search resolve in the archive, 0 otherwise. The argument
+     is treated as a SwigPyObject whose ptr is a struct chmFile *. */
+  static const char *const needed[] = {
+    "/$FIftiMain",
+    "/#TOPICS",
+    "/#STRINGS",
+    "/#URLTBL",
+    "/#URLSTR"
+  };
+  struct chmUnitInfo info;
+  struct chmFile *archive;
+  PyObject *wrapper;
+  size_t k;
+
+  if (!PyArg_ParseTuple (args, "O:is_searchable", &wrapper)) {
+    PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
+    return NULL;
+  }
+  archive = (struct chmFile *) ((SwigPyObject*)(wrapper))->ptr;
+  for (k = 0; k < sizeof(needed) / sizeof(needed[0]); k++) {
+    if (chm_resolve_object (archive, needed[k], &info) != CHM_RESOLVE_SUCCESS)
+      return Py_BuildValue ("i", 0);
+  }
+  return Py_BuildValue ("i", 1);
+}
+
+static PyObject *
+search (PyObject *self, PyObject *args) {
+  /* Python: search(chmfile, text, whole_words, titles_only) -> (flag, dict).
+     `text` is a bytes object on Python 3 and a str on Python 2. `dict` is the
+     new dictionary passed to chm_search to collect results, and `flag` is
+     chm_search's return value. */
+  char *text;
+  int whole_words = 0;
+  int titles_only = 0;
+  int partial;
+  struct chmFile *file;
+  PyObject *obj0;
+  PyObject *dict;
+#if PY_MAJOR_VERSION >= 3
+  PyObject *obj1;
+  if (!PyArg_ParseTuple (args, "OSii:search", &obj0, &obj1,
+                         &whole_words, &titles_only)) {
+#else
+  if (!PyArg_ParseTuple (args, "Osii:search", &obj0, &text,
+                         &whole_words, &titles_only)) {
+#endif
+    PyErr_SetString(PyExc_TypeError,
+                    "Expected chmfile (not CHMFile!), string, int, int");
+    return NULL;
+  }
+#if PY_MAJOR_VERSION >= 3
+  text = PyBytes_AsString(obj1);
+#endif
+  dict = PyDict_New();
+  if (!dict) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+
+  file = (struct chmFile *) ((SwigPyObject*)(obj0))->ptr;
+  partial = chm_search (file, text, whole_words, titles_only, dict);
+  /* "N" hands our only reference to the tuple; "O" added a second one that
+     was never released, so every result dict leaked. */
+  return Py_BuildValue ("(iN)", partial, dict);
+}
+
+static PyObject *
+get_lcid (PyObject *self, PyObject *args) {
+  /* Python: get_lcid(chmfile) -> the archive's LCID as an int, or None when
+     chm_get_lcid reports -1. The argument is treated as a SwigPyObject whose
+     ptr is a struct chmFile *. */
+  PyObject *wrapper;
+  struct chmFile *archive;
+  int lcid;
+
+  if (!PyArg_ParseTuple (args, "O:get_lcid", &wrapper)) {
+    PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
+    return NULL;
+  }
+  archive = (struct chmFile *) ((SwigPyObject*)(wrapper))->ptr;
+  lcid = chm_get_lcid (archive);
+  if (lcid == -1) {
+    /* Hand back a new reference to None. */
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+  return Py_BuildValue ("i", lcid);
+}
+
+/* Functions exported by the "extra" module; the table ends with the all-NULL sentinel entry. */
+static PyMethodDef IndexMethods[] = {
+  { "get_lcid", get_lcid, METH_VARARGS,
+    "Returns LCID (Locale ID) for archive." },
+  { "search", search, METH_VARARGS,
+    "Perform Full-Text search." },
+  { "is_searchable", is_searchable, METH_VARARGS,
+    "Return 1 if it is possible to search the archive, 0 otherwise." },
+  { NULL, NULL, 0, NULL }
+};
+
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef moduledef = { /* module definition passed to PyModule_Create by PyInit_extra */
+ PyModuleDef_HEAD_INIT,
+ "extra", /* module name */
+ NULL, /* no module docstring */
+ -1, /* m_size; see the PyModuleDef documentation for the meaning of -1 */
+ IndexMethods, /* method table */
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+
+#define INITERROR return NULL /* used by the init function, which returns PyObject * on Python 3 */
+
+#else /* python < 3 */
+
+#define INITERROR return /* used by the init function, which returns void on Python 2 */
+
+#endif /* python 3/2 */
+
+
+/* Module entry point; PyMODINIT_FUNC supplies the return type plus the linkage/export decoration the loader needs. */
+#if PY_MAJOR_VERSION >= 3
+PyMODINIT_FUNC PyInit_extra(void)
+#else
+PyMODINIT_FUNC initextra (void)
+#endif
+{
+  PyObject *module;
+#if PY_MAJOR_VERSION >= 3
+  module = PyModule_Create(&moduledef);
+#else
+  module = Py_InitModule ("extra", IndexMethods);
+#endif
+  if (module == NULL)
+    INITERROR;
+#if PY_MAJOR_VERSION >= 3
+  return module;
+#endif
+}
+
+#else
+
+int
+main (int argc, char **argv) {
+  /* Standalone test driver: opens the CHM file named on the command line,
+     prints its LCID and then runs one search per input record until the
+     input ends or a record does not parse. */
+  struct chmFile *file;
+  char text[255];
+  int whole_words, titles_only;
+  int partial;
+  int lcid;
+
+  if (argc != 2) {
+    printf ("\n%s <filename>\n", argv[0]);
+    return 0;
+  }
+  file = chm_open (argv[1]);
+  if (!file)
+    return -1;
+
+  lcid = chm_get_lcid (file);
+  printf ("\nLCID= %d (%08X)\n", lcid, (unsigned) lcid);
+  for (;;) {
+    printf ("\n<whole_words> <titles_only> <string>\n");
+    printf ("> ");
+    /* All three fields must convert: scanf returns EOF (non-zero) at end of
+       input, and %254s keeps the word inside text[255]. */
+    if (scanf ("%d %d %254s", &whole_words, &titles_only, text) != 3)
+      break;
+    partial = chm_search (file, text, whole_words, titles_only, NULL);
+    printf ("Partial = %d\n", partial);
+  }
+
+  chm_close (file);
+  return 0;
+}
+
+#endif