recoll / Code / Diff of /src/python/pychm/chm/chm.py

Diff of /src/python/pychm/chm/chm.py [000000] .. [29e63a]

Switch to side-by-side view

--- a
+++ b/src/python/pychm/chm/chm.py
@@ -0,0 +1,502 @@
+# Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
+#
+# Based on code by:
+# Copyright (C) 2003  Razvan Cojocaru <razvanco@gmx.net>
+#
+# pychm is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; see the file COPYING.  If not,
+# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA
+
+'''
+   chm - A high-level front end for the chmlib python module.
+
+   The chm module provides high level access to the functionality
+   included in chmlib. It encapsulates functions in the CHMFile class, and
+   provides some additional features, such as the ability to obtain
+   the contents tree of a CHM archive.
+
+'''
+
+from . import chmlib
+from . import extra
+import array
+import os.path
+import sys
+
+charset_table = {
+    0: 'iso8859_1',  # ANSI_CHARSET
+    238: 'iso8859_2',  # EASTEUROPE_CHARSET
+    178: 'iso8859_6',  # ARABIC_CHARSET
+    161: 'iso8859_7',  # GREEK_CHARSET
+    177: 'iso8859_8',  # HEBREW_CHARSET
+    162: 'iso8859_9',  # TURKISH_CHARSET
+    222: 'iso8859_11',  # THAI_CHARSET - hmm not in python 2.2...
+    186: 'iso8859_13',  # BALTIC_CHARSET
+    204: 'cp1251',  # RUSSIAN_CHARSET
+    255: 'cp437',  # OEM_CHARSET
+    128: 'cp932',  # SHIFTJIS_CHARSET
+    134: 'cp936',  # GB2312_CHARSET
+    129: 'cp949',  # HANGUL_CHARSET
+    136: 'cp950',  # CHINESEBIG5_CHARSET
+    1: None,  # DEFAULT_CHARSET
+    2: None,  # SYMBOL_CHARSET
+    130: None,  # JOHAB_CHARSET
+    163: None,  # VIETNAMESE_CHARSET
+    77: None,  # MAC_CHARSET
+}
+
+locale_table = {
+    0x0436: ('iso8859_1', "Afrikaans", "Western Europe & US"),
+    0x041c: ('iso8859_2', "Albanian", "Central Europe"),
+    0x0401: ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
+    0x0801: ('iso8859_6', "Arabic_Iraq", "Arabic"),
+    0x0c01: ('iso8859_6', "Arabic_Egypt", "Arabic"),
+    0x1001: ('iso8859_6', "Arabic_Libya", "Arabic"),
+    0x1401: ('iso8859_6', "Arabic_Algeria", "Arabic"),
+    0x1801: ('iso8859_6', "Arabic_Morocco", "Arabic"),
+    0x1c01: ('iso8859_6', "Arabic_Tunisia", "Arabic"),
+    0x2001: ('iso8859_6', "Arabic_Oman", "Arabic"),
+    0x2401: ('iso8859_6', "Arabic_Yemen", "Arabic"),
+    0x2801: ('iso8859_6', "Arabic_Syria", "Arabic"),
+    0x2c01: ('iso8859_6', "Arabic_Jordan", "Arabic"),
+    0x3001: ('iso8859_6', "Arabic_Lebanon", "Arabic"),
+    0x3401: ('iso8859_6', "Arabic_Kuwait", "Arabic"),
+    0x3801: ('iso8859_6', "Arabic_UAE", "Arabic"),
+    0x3c01: ('iso8859_6', "Arabic_Bahrain", "Arabic"),
+    0x4001: ('iso8859_6', "Arabic_Qatar", "Arabic"),
+    0x042b: (None, "Armenian", "Armenian"),
+    0x042c: ('iso8859_9', "Azeri_Latin", "Turkish"),
+    0x082c: ('cp1251', "Azeri_Cyrillic", "Cyrillic"),
+    0x042d: ('iso8859_1', "Basque", "Western Europe & US"),
+    0x0423: ('cp1251', "Belarusian", "Cyrillic"),
+    0x0402: ('cp1251', "Bulgarian", "Cyrillic"),
+    0x0403: ('iso8859_1', "Catalan", "Western Europe & US"),
+    0x0404: ('cp950', "Chinese_Taiwan", "Traditional Chinese"),
+    0x0804: ('cp936', "Chinese_PRC", "Simplified Chinese"),
+    0x0c04: ('cp950', "Chinese_Hong_Kong", "Traditional Chinese"),
+    0x1004: ('cp936', "Chinese_Singapore", "Simplified Chinese"),
+    0x1404: ('cp950', "Chinese_Macau", "Traditional Chinese"),
+    0x041a: ('iso8859_2', "Croatian", "Central Europe"),
+    0x0405: ('iso8859_2', "Czech", "Central Europe"),
+    0x0406: ('iso8859_1', "Danish", "Western Europe & US"),
+    0x0413: ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
+    0x0813: ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
+    0x0409: ('iso8859_1', "English_United_States", "Western Europe & US"),
+    0x0809: ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
+    0x0c09: ('iso8859_1', "English_Australian", "Western Europe & US"),
+    0x1009: ('iso8859_1', "English_Canadian", "Western Europe & US"),
+    0x1409: ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
+    0x1809: ('iso8859_1', "English_Irish", "Western Europe & US"),
+    0x1c09: ('iso8859_1', "English_South_Africa", "Western Europe & US"),
+    0x2009: ('iso8859_1', "English_Jamaica", "Western Europe & US"),
+    0x2409: ('iso8859_1', "English_Caribbean", "Western Europe & US"),
+    0x2809: ('iso8859_1', "English_Belize", "Western Europe & US"),
+    0x2c09: ('iso8859_1', "English_Trinidad", "Western Europe & US"),
+    0x3009: ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
+    0x3409: ('iso8859_1', "English_Philippines", "Western Europe & US"),
+    0x0425: ('iso8859_13', "Estonian", "Baltic",),
+    0x0438: ('iso8859_1', "Faeroese", "Western Europe & US"),
+    0x0429: ('iso8859_6', "Farsi", "Arabic"),
+    0x040b: ('iso8859_1', "Finnish", "Western Europe & US"),
+    0x040c: ('iso8859_1', "French_Standard", "Western Europe & US"),
+    0x080c: ('iso8859_1', "French_Belgian", "Western Europe & US"),
+    0x0c0c: ('iso8859_1', "French_Canadian", "Western Europe & US"),
+    0x100c: ('iso8859_1', "French_Swiss", "Western Europe & US"),
+    0x140c: ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
+    0x180c: ('iso8859_1', "French_Monaco", "Western Europe & US"),
+    0x0437: (None, "Georgian", "Georgian"),
+    0x0407: ('iso8859_1', "German_Standard", "Western Europe & US"),
+    0x0807: ('iso8859_1', "German_Swiss", "Western Europe & US"),
+    0x0c07: ('iso8859_1', "German_Austrian", "Western Europe & US"),
+    0x1007: ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
+    0x1407: ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
+    0x0408: ('iso8859_7', "Greek", "Greek"),
+    0x040d: ('iso8859_8', "Hebrew", "Hebrew"),
+    0x0439: (None, "Hindi", "Indic"),
+    0x040e: ('iso8859_2', "Hungarian", "Central Europe"),
+    0x040f: ('iso8859_1', "Icelandic", "Western Europe & US"),
+    0x0421: ('iso8859_1', "Indonesian", "Western Europe & US"),
+    0x0410: ('iso8859_1', "Italian_Standard", "Western Europe & US"),
+    0x0810: ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
+    0x0411: ('cp932', "Japanese", "Japanese"),
+    0x043f: ('cp1251', "Kazakh", "Cyrillic"),
+    0x0457: (None, "Konkani", "Indic"),
+    0x0412: ('cp949', "Korean", "Korean"),
+    0x0426: ('iso8859_13', "Latvian", "Baltic",),
+    0x0427: ('iso8859_13', "Lithuanian", "Baltic",),
+    0x042f: ('cp1251', "Macedonian", "Cyrillic"),
+    0x043e: ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
+    0x083e: ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
+    0x044e: (None, "Marathi", "Indic"),
+    0x0414: ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
+    0x0814: ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
+    0x0415: ('iso8859_2', "Polish", "Central Europe"),
+    0x0416: ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
+    0x0816: ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
+    0x0418: ('iso8859_2', "Romanian", "Central Europe"),
+    0x0419: ('cp1251', "Russian", "Cyrillic"),
+    0x044f: (None, "Sanskrit", "Indic"),
+    0x081a: ('iso8859_2', "Serbian_Latin", "Central Europe"),
+    0x0c1a: ('cp1251', "Serbian_Cyrillic", "Cyrillic"),
+    0x041b: ('iso8859_2', "Slovak", "Central Europe"),
+    0x0424: ('iso8859_2', "Slovenian", "Central Europe"),
+    0x040a: ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
+    0x080a: ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
+    0x0c0a: ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
+    0x100a: ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
+    0x140a: ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
+    0x180a: ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
+    0x1c0a: ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
+    0x200a: ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
+    0x240a: ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
+    0x280a: ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
+    0x2c0a: ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
+    0x300a: ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
+    0x340a: ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
+    0x380a: ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
+    0x3c0a: ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
+    0x400a: ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
+    0x440a: ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
+    0x480a: ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
+    0x4c0a: ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
+    0x500a: ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
+    0x0441: ('iso8859_1', "Swahili", "Western Europe & US"),
+    0x041d: ('iso8859_1', "Swedish", "Western Europe & US"),
+    0x081d: ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
+    0x0449: (None, "Tamil", "Indic"),
+    0x0444: ('cp1251', "Tatar", "Cyrillic"),
+    0x041e: ('iso8859_11', "Thai", "Thai"),
+    0x041f: ('iso8859_9', "Turkish", "Turkish"),
+    0x0422: ('cp1251', "Ukrainian", "Cyrillic"),
+    0x0420: ('iso8859_6', "Urdu", "Arabic"),
+    0x0443: ('iso8859_9', "Uzbek_Latin", "Turkish"),
+    0x0843: ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
+    0x042a: (None, "Vietnamese", "Vietnamese")
+}
+
+
+class CHMFile:
+    "A class to manage access to CHM files."
+    filename = ""
+    file = None
+    title = ""
+    home = "/"
+    index = None
+    topics = None
+    encoding = None
+    lcid = None
+    binaryindex = None
+
+    def __init__(self):
+        self.searchable = 0
+
+    def LoadCHM(self, archiveName):
+        '''Loads a CHM archive.
+        This function will also call GetArchiveInfo to obtain information
+        such as the index file name and the topics file. It returns 1 on
+        success, and 0 if it fails.
+        '''
+        if self.filename is not None:
+            self.CloseCHM()
+
+        self.file = chmlib.chm_open(archiveName)
+        if self.file is None:
+            return 0
+
+        self.filename = archiveName
+        self.GetArchiveInfo()
+
+        return 1
+
+    def CloseCHM(self):
+        '''Closes the CHM archive.
+        This function will close the CHM file, if it is open. All variables
+        are also reset.
+        '''
+        if self.filename is not None:
+            chmlib.chm_close(self.file)
+            self.file = None
+            self.filename = ''
+            self.title = ""
+            self.home = "/"
+            self.index = None
+            self.topics = None
+            self.encoding = None
+
+    def GetArchiveInfo(self):
+        '''Obtains information on CHM archive.
+        This function checks the /#SYSTEM file inside the CHM archive to
+        obtain the index, home page, topics, encoding and title. It is called
+        from LoadCHM.
+        '''
+
+        self.searchable = extra.is_searchable(self.file)
+        self.lcid = None
+
+        result, ui = chmlib.chm_resolve_object(self.file, b'/#SYSTEM')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
+            return 0
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 4, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetArchiveInfo: file size = 0\n')
+            return 0
+
+        buff = array.array('B', text)
+
+        index = 0
+        while (index < size):
+            cursor = buff[index] + (buff[index+1] * 256)
+
+            if (cursor == 0):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.topics = b'/' + text[index:index+cursor-1]
+            elif (cursor == 1):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.index = b'/' + text[index:index+cursor-1]
+            elif (cursor == 2):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.home = b'/' + text[index:index+cursor-1]
+            elif (cursor == 3):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.title = text[index:index+cursor-1]
+            elif (cursor == 4):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.lcid = buff[index] + (buff[index+1] * 256)
+            elif (cursor == 6):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                tmp = text[index:index+cursor-1]
+                if not self.topics:
+                    tmp1 = b'/' + tmp + b'.hhc'
+                    tmp2 = b'/' + tmp + b'.hhk'
+                    res1, ui1 = chmlib.chm_resolve_object(self.file, tmp1)
+                    res2, ui2 = chmlib.chm_resolve_object(self.file, tmp2)
+                    if not self.topics and res1 == chmlib.CHM_RESOLVE_SUCCESS:
+                        self.topics = b'/' + tmp + b'.hhc'
+                    if not self.index and res2 == chmlib.CHM_RESOLVE_SUCCESS:
+                        self.index = b'/' + tmp + b'.hhk'
+            elif (cursor == 16):
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+                self.encoding = text[index:index+cursor-1]
+            else:
+                index += 2
+                cursor = buff[index] + (buff[index+1] * 256)
+                index += 2
+            index += cursor
+
+        self.GetWindowsInfo()
+
+        if not self.lcid:
+            self.lcid = extra.get_lcid(self.file)
+
+        return 1
+
+    def GetTopicsTree(self):
+        '''Reads and returns the topics tree.
+        This auxiliary function reads and returns the topics tree file
+        contents for the CHM archive.
+        '''
+        if self.topics is None:
+            return None
+
+        if self.topics:
+            res, ui = chmlib.chm_resolve_object(self.file, self.topics)
+            if (res != chmlib.CHM_RESOLVE_SUCCESS):
+                return None
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetTopicsTree: file size = 0\n')
+            return None
+        return text
+
+    def GetIndex(self):
+        '''Reads and returns the index tree.
+        This auxiliary function reads and returns the index tree file
+        contents for the CHM archive.
+        '''
+        if self.index is None:
+            return None
+
+        if self.index:
+            res, ui = chmlib.chm_resolve_object(self.file, self.index)
+            if (res != chmlib.CHM_RESOLVE_SUCCESS):
+                return None
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetIndex: file size = 0\n')
+            return None
+        return text
+
+    def ResolveObject(self, document):
+        '''Tries to locate a document in the archive.
+        This function tries to locate the document inside the archive. It
+        returns a tuple where the first element is zero if the function
+        was successful, and the second is the UnitInfo for that document.
+        The UnitInfo is used to retrieve the document contents
+        '''
+        if self.file:
+            # path = os.path.abspath(document) # wtf?? the index contents
+                                               # are independant of the os !
+            path = document
+            return chmlib.chm_resolve_object(self.file, path)
+        else:
+            return (1, None)
+
+    def RetrieveObject(self, ui, start=-1, length=-1):
+        '''Retrieves the contents of a document.
+        This function takes a UnitInfo and two optional arguments, the first
+        being the start address and the second is the length. These define
+        the amount of data to be read from the archive.
+        '''
+        if self.file and ui:
+            if length == -1:
+                len = ui.length
+            else:
+                len = length
+            if start == -1:
+                st = 0
+            else:
+                st = long(start)
+            return chmlib.chm_retrieve_object(self.file, ui, st, len)
+        else:
+            return (0, '')
+
+    def Search(self, text, wholewords=0, titleonly=0):
+        '''Performs full-text search on the archive.
+        The first parameter is the word to look for, the second
+        indicates if the search should be for whole words only, and
+        the third parameter indicates if the search should be
+        restricted to page titles.
+        This method will return a tuple, the first item
+        indicating if the search results were partial, and the second
+        item being a dictionary containing the results.'''
+        if text and text != '' and self.file:
+            return extra.search(self.file, text, wholewords, titleonly)
+        else:
+            return None
+
+    def IsSearchable(self):
+        '''Indicates if the full-text search is available for this
+        archive - this flag is updated when GetArchiveInfo is called'''
+        return self.searchable
+
+    def GetEncoding(self):
+        '''Returns a string that can be used with the codecs python package
+        to encode or decode the files in the chm archive. If an error is
+        found, or if it is not possible to find the encoding, None is
+        returned.'''
+        if self.encoding:
+            vals = self.encoding.split(b',')
+            if len(vals) > 2:
+                try:
+                    return charset_table[int(vals[2])]
+                except KeyError:
+                    pass
+        return None
+
+    def GetLCID(self):
+        '''Returns the archive Locale ID'''
+        if self.lcid in locale_table:
+            return locale_table[self.lcid]
+        else:
+            return None
+
+    def GetDWORD(self, buff, idx=0):
+        '''Internal method.
+        Reads a double word (4 bytes) from a buffer.
+        '''
+        result = buff[idx] + (buff[idx+1] << 8) + (buff[idx+2] << 16) + \
+            (buff[idx+3] << 24)
+
+        if result == 0xFFFFFFFF:
+            result = 0
+
+        return result
+
+    def GetString(self, text, idx):
+        '''Internal method.
+        Retrieves a string from the #STRINGS buffer.
+        '''
+        next = text.find(b'\x00', idx)
+        chunk = text[idx:next]
+        return chunk
+
+    def GetWindowsInfo(self):
+        '''Gets information from the #WINDOWS file.
+        Checks the #WINDOWS file to see if it has any info that was
+        not found in #SYSTEM (topics, index or default page.
+        '''
+        result, ui = chmlib.chm_resolve_object(self.file, b'/#WINDOWS')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            return -1
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, 8)
+        if (size < 8):
+            return -2
+
+        buff = array.array('B', text)
+        num_entries = self.GetDWORD(buff, 0)
+        entry_size = self.GetDWORD(buff, 4)
+
+        if num_entries < 1:
+            return -3
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 8, entry_size)
+        if (size < entry_size):
+            return -4
+
+        buff = array.array('B', text)
+        toc_index = self.GetDWORD(buff, 0x60)
+        idx_index = self.GetDWORD(buff, 0x64)
+        dft_index = self.GetDWORD(buff, 0x68)
+
+        result, ui = chmlib.chm_resolve_object(self.file, b'/#STRINGS')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            return -5
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, ui.length)
+        if (size == 0):
+            return -6
+
+        if (not self.topics):
+            self.topics = self.GetString(text, toc_index)
+            if not self.topics.startswith(b"/"):
+                self.topics = b"/" + self.topics
+
+        if (not self.index):
+            self.index = self.GetString(text, idx_index)
+            if not self.index.startswith(b"/"):
+                self.index = b"/" + self.index
+
+        if (dft_index != 0):
+            self.home = self.GetString(text, dft_index)
+            if not self.home.startswith(b"/"):
+                self.home = b"/" + self.home