Switch to unified view

a b/src/python/recoll/pyrclextract.cpp
1
/* Copyright (C) 2007 J.F.Dockes
2
 *   This program is free software; you can redistribute it and/or modify
3
 *   it under the terms of the GNU General Public License as published by
4
 *   the Free Software Foundation; either version 2 of the License, or
5
 *   (at your option) any later version.
6
 *
7
 *   This program is distributed in the hope that it will be useful,
8
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
 *   GNU General Public License for more details.
11
 *
12
 *   You should have received a copy of the GNU General Public License
13
 *   along with this program; if not, write to the
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
17
18
19
#include <Python.h>
20
#include <structmember.h>
21
#include <bytearrayobject.h>
22
23
#include <strings.h>
24
25
#include <string>
26
using namespace std;
27
28
#include "debuglog.h"
29
#include "rcldoc.h"
30
#include "internfile.h"
31
#include "rclconfig.h"
32
33
#include "pyrecoll.h"
34
35
// Doc type object of the recoll module. Stays null until initrclextract()
// stores the result of importing the "recoll.doctype" capsule.
static PyObject *recoll_DocType = 0;
36
37
//////////////////////////////////////////////////////////////////////
38
/// Extractor object code
39
typedef struct {
40
    PyObject_HEAD
41
    /* Type-specific fields go here. */
42
    FileInterner *xtr;
43
    TempDir *tmpdir;
44
    RclConfig *rclconfig;
45
} rclx_ExtractorObject;
46
47
static void 
48
Extractor_dealloc(rclx_ExtractorObject *self)
49
{
50
    LOGDEB(("Extractor_dealloc\n"));
51
    delete self->xtr;
52
    delete self->tmpdir;
53
    self->ob_type->tp_free((PyObject*)self);
54
}
55
56
/*
 * tp_new slot: allocate an Extractor instance with every C++ pointer
 * cleared. The real setup happens in Extractor_init().
 * Returns the new object, or 0 if the allocation failed.
 */
static PyObject *
Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB(("Extractor_new\n"));

    PyObject *obj = type->tp_alloc(type, 0);
    if (!obj) {
        return 0;
    }

    rclx_ExtractorObject *extractor = (rclx_ExtractorObject *)obj;
    extractor->rclconfig = 0;
    extractor->tmpdir = 0;
    extractor->xtr = 0;
    return obj;
}
69
70
static int
71
Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
72
{
73
    LOGDEB(("Extractor_init\n"));
74
    static const char* kwlist[] = {"doc", NULL};
75
    PyObject *pdobj;
76
77
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!", (char**)kwlist, 
78
                   recoll_DocType, &pdobj))
79
  return -1;
80
    recoll_DocObject *dobj = (recoll_DocObject *)pdobj;
81
    self->tmpdir = new TempDir;
82
    if (dobj->doc == 0) {
83
        PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
84
  return -1;
85
    }
86
    self->rclconfig = dobj->rclconfig;
87
    self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
88
               FileInterner::FIF_forPreview);
89
    return 0;
90
}
91
92
/*
 * Python method: extract(ipath).
 *
 * Runs the interner on the subdocument designated by ipath and returns a
 * new Doc object holding the result. If the interner has an HTML version
 * of the document, it replaces the text and the MIME type is set to
 * text/html.
 *
 * Returns a new reference, or 0 with a Python exception set:
 *  - argument parsing errors are raised by PyArg_ParseTupleAndKeywords,
 *  - AttributeError if the object is not initialised or extraction fails.
 */
static PyObject *
Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB(("Extractor_extract\n"));
    static const char* kwlist[] = {"ipath", NULL};
    char *sipath = 0;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract",
                                     (char**)kwlist,
                                     "utf-8", &sipath)) {
        return 0;
    }

    // The "es" format returns a buffer which we must release: copy it and
    // free it right away so that no later error path can leak it.
    string ipath(sipath);
    PyMem_Free(sipath);

    // rclconfig is dereferenced further down, so check it along with xtr.
    if (self->xtr == 0 || self->rclconfig == 0) {
        PyErr_SetString(PyExc_AttributeError, "extract: null object");
        return 0;
    }
    if (recoll_DocType == 0) {
        PyErr_SetString(PyExc_AttributeError,
                        "extract: recoll Doc type not available");
        return 0;
    }

    /* Call the doc class object to create a new doc. */
    recoll_DocObject *result =
        (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
    if (!result) {
        LOGERR(("Extractor_extract: couldn't create doc object for result\n"));
        return 0;
    }
    if (result->doc == 0) {
        Py_DECREF((PyObject *)result);
        PyErr_SetString(PyExc_AttributeError, "extract: null doc in result");
        return 0;
    }

    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
    if (status != FileInterner::FIDone) {
        // We still own the only reference to the result: drop it.
        Py_DECREF((PyObject *)result);
        PyErr_SetString(PyExc_AttributeError, "internfile failure");
        return 0;
    }

    string html = self->xtr->get_html();
    if (!html.empty()) {
        result->doc->text = html;
        result->doc->mimetype = "text/html";
    }

    // Mirror some of the doc fields into the meta array, and store a
    // printable version of the url there too.
    Rcl::Doc *doc = result->doc;
    printableUrl(self->rclconfig->getDefCharset(), doc->url,
                 doc->meta[Rcl::Doc::keyurl]);
    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
    return (PyObject *)result;
}
141
142
PyDoc_STRVAR(doc_extract,
143
"extract(ipath)\n"
144
"Extract document defined by ipath and return a doc object.\n"
145
);
146
147
static PyMethodDef Extractor_methods[] = {
148
    {"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
149
     doc_extract},
150
    {NULL}  /* Sentinel */
151
};
152
153
/* Docstring of the Extractor type. The constructor takes a Doc object
   (see Extractor_init); the previous text described a Query object. */
PyDoc_STRVAR(doc_ExtractorObject,
"Extractor(doc)\n"
"\n"
"An Extractor object extracts document data for a recoll Doc object.\n"
"Call extract(ipath) to get a new Doc holding the extracted document.\n"
);
159
/*
 * Type object for rclextract.Extractor. Only the deallocator, flags,
 * docstring, method table, init and new slots are set; every other slot
 * is left empty.
 */
static PyTypeObject rclx_ExtractorType = {
    PyVarObject_HEAD_INIT(NULL, 0)          /* object header, ob_size */
    "rclextract.Extractor",                 /* tp_name */
    sizeof(rclx_ExtractorObject),           /* tp_basicsize */
    0,                                      /* tp_itemsize */
    (destructor)Extractor_dealloc,          /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    0,                                      /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    doc_ExtractorObject,                    /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    0,                                      /* tp_iter */
    0,                                      /* tp_iternext */
    Extractor_methods,                      /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    0,                                      /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    (initproc)Extractor_init,               /* tp_init */
    0,                                      /* tp_alloc */
    Extractor_new,                          /* tp_new */
};
200
201
///////////////////////////////////// Module-level stuff
202
static PyMethodDef rclxMethods[] = {
203
    {NULL, NULL, 0, NULL}        /* Sentinel */
204
};
205
PyDoc_STRVAR(rclx_doc_string,
206
       "This is an interface to the Recoll text extraction features.");
207
208
#ifndef PyMODINIT_FUNC    /* declarations for DLL import/export */
209
#define PyMODINIT_FUNC void
210
#endif
211
PyMODINIT_FUNC
212
initrclextract(void)
213
{
214
    PyObject* m = Py_InitModule("rclextract", rclxMethods);
215
    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);
216
217
    if (PyType_Ready(&rclx_ExtractorType) < 0)
218
        return;
219
    Py_INCREF(&rclx_ExtractorType);
220
    PyModule_AddObject(m, "Extractor", (PyObject *)&rclx_ExtractorType);
221
222
    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
223
}