|
a/src/python/recoll/pyrclextract.cpp |
|
b/src/python/recoll/pyrclextract.cpp |
|
... |
|
... |
27 |
|
27 |
|
28 |
#include "debuglog.h"
|
28 |
#include "debuglog.h"
|
29 |
#include "rcldoc.h"
|
29 |
#include "rcldoc.h"
|
30 |
#include "internfile.h"
|
30 |
#include "internfile.h"
|
31 |
#include "rclconfig.h"
|
31 |
#include "rclconfig.h"
|
|
|
32 |
#include "rclinit.h"
|
32 |
|
33 |
|
33 |
#include "pyrecoll.h"
|
34 |
#include "pyrecoll.h"
|
34 |
|
35 |
|
|
|
36 |
// Imported from pyrecoll
|
35 |
static PyObject *recoll_DocType;
|
37 |
static PyObject *recoll_DocType;
|
|
|
38 |
static RclConfig *rclconfig;
|
36 |
|
39 |
|
37 |
//////////////////////////////////////////////////////////////////////
|
40 |
//////////////////////////////////////////////////////////////////////
|
38 |
/// Extractor object code
|
41 |
/// Extractor object code
|
39 |
typedef struct {
|
42 |
typedef struct {
|
40 |
PyObject_HEAD
|
43 |
PyObject_HEAD
|
|
... |
|
... |
87 |
self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
|
90 |
self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
|
88 |
FileInterner::FIF_forPreview);
|
91 |
FileInterner::FIF_forPreview);
|
89 |
return 0;
|
92 |
return 0;
|
90 |
}
|
93 |
}
|
91 |
|
94 |
|
|
|
95 |
// Doc string attached to the "textextract" entry of the Extractor method
// table.
PyDoc_STRVAR(doc_Extractor_textextract,
             "textextract(ipath)\n"
             "Extract document defined by ipath and return a doc object. The doc.text\n"
             "field has the document text as either text/plain or text/html\n"
             "according to doc.mimetype.\n");
|
|
|
101 |
|
92 |
static PyObject *
|
102 |
static PyObject *
|
93 |
Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
|
103 |
Extractor_textextract(rclx_ExtractorObject* self, PyObject *args,
|
|
|
104 |
PyObject *kwargs)
|
94 |
{
|
105 |
{
|
95 |
LOGDEB(("Extractor_extract\n"));
|
106 |
LOGDEB(("Extractor_textextract\n"));
|
96 |
static const char* kwlist[] = {"ipath", NULL};
|
107 |
static const char* kwlist[] = {"ipath", NULL};
|
97 |
char *sipath = 0;
|
108 |
char *sipath = 0;
|
98 |
|
109 |
|
99 |
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract",
|
110 |
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_textextract",
|
100 |
(char**)kwlist,
|
111 |
(char**)kwlist,
|
101 |
"utf-8", &sipath))
|
112 |
"utf-8", &sipath))
|
102 |
return 0;
|
113 |
return 0;
|
103 |
|
114 |
|
104 |
string ipath(sipath);
|
115 |
string ipath(sipath);
|
|
... |
|
... |
110 |
}
|
121 |
}
|
111 |
/* Call the doc class object to create a new doc. */
|
122 |
/* Call the doc class object to create a new doc. */
|
112 |
recoll_DocObject *result =
|
123 |
recoll_DocObject *result =
|
113 |
(recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
|
124 |
(recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
|
114 |
if (!result) {
|
125 |
if (!result) {
|
115 |
LOGERR(("Query_fetchone: couldn't create doc object for result\n"));
|
126 |
PyErr_SetString(PyExc_AttributeError, "extract: doc create failed");
|
116 |
return 0;
|
127 |
return 0;
|
117 |
}
|
128 |
}
|
118 |
FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
|
129 |
FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
|
119 |
if (status != FileInterner::FIDone) {
|
130 |
if (status != FileInterner::FIDone) {
|
120 |
PyErr_SetString(PyExc_AttributeError, "internfile failure");
|
131 |
PyErr_SetString(PyExc_AttributeError, "internfile failure");
|
|
... |
|
... |
125 |
if (!html.empty()) {
|
136 |
if (!html.empty()) {
|
126 |
result->doc->text = html;
|
137 |
result->doc->text = html;
|
127 |
result->doc->mimetype = "text/html";
|
138 |
result->doc->mimetype = "text/html";
|
128 |
}
|
139 |
}
|
129 |
|
140 |
|
130 |
// fetching attributes easier. Is this actually needed ? Useful for
|
141 |
// Is this actually needed ? Useful for url which is also formatted .
|
131 |
// url which is also formatted .
|
|
|
132 |
Rcl::Doc *doc = result->doc;
|
142 |
Rcl::Doc *doc = result->doc;
|
133 |
printableUrl(self->rclconfig->getDefCharset(), doc->url,
|
143 |
printableUrl(self->rclconfig->getDefCharset(), doc->url,
|
134 |
doc->meta[Rcl::Doc::keyurl]);
|
144 |
doc->meta[Rcl::Doc::keyurl]);
|
135 |
doc->meta[Rcl::Doc::keytp] = doc->mimetype;
|
145 |
doc->meta[Rcl::Doc::keytp] = doc->mimetype;
|
136 |
doc->meta[Rcl::Doc::keyipt] = doc->ipath;
|
146 |
doc->meta[Rcl::Doc::keyipt] = doc->ipath;
|
137 |
doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
|
147 |
doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
|
138 |
doc->meta[Rcl::Doc::keyds] = doc->dbytes;
|
148 |
doc->meta[Rcl::Doc::keyds] = doc->dbytes;
|
139 |
return (PyObject *)result;
|
149 |
return (PyObject *)result;
|
140 |
}
|
150 |
}
|
141 |
|
151 |
|
142 |
PyDoc_STRVAR(doc_extract,
|
152 |
PyDoc_STRVAR(doc_Extractor_idoctofile,
|
143 |
"extract(ipath)\n"
|
153 |
"idoctofile(ipath)\n"
|
144 |
"Extract document defined by ipath and return a doc object.\n"
|
154 |
"Extract document defined by ipath into a file, in its native format.\n"
|
145 |
);
|
155 |
);
|
|
|
156 |
static PyObject *
|
|
|
157 |
Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args,
|
|
|
158 |
PyObject *kwargs)
|
|
|
159 |
{
|
|
|
160 |
LOGDEB(("Extractor_idoctofile\n"));
|
|
|
161 |
static const char* kwlist[] = {"ipath", "mimetype", "ofilename", NULL};
|
|
|
162 |
char *sipath = 0;
|
|
|
163 |
char *smt = 0;
|
|
|
164 |
char *soutfile = 0; // no freeing
|
|
|
165 |
if (!PyArg_ParseTupleAndKeywords(args,kwargs, "eses|s:Extractor_idoctofile",
|
|
|
166 |
(char**)kwlist,
|
|
|
167 |
"utf-8", &sipath,
|
|
|
168 |
"utf-8", &smt,
|
|
|
169 |
&soutfile))
|
|
|
170 |
return 0;
|
|
|
171 |
|
|
|
172 |
string ipath(sipath);
|
|
|
173 |
PyMem_Free(sipath);
|
|
|
174 |
string mimetype(smt);
|
|
|
175 |
PyMem_Free(smt);
|
|
|
176 |
string outfile;
|
|
|
177 |
if (soutfile && *soutfile)
|
|
|
178 |
outfile.assign(soutfile);
|
|
|
179 |
|
|
|
180 |
if (self->xtr == 0) {
|
|
|
181 |
PyErr_SetString(PyExc_AttributeError, "extract: null object");
|
|
|
182 |
return 0;
|
|
|
183 |
}
|
|
|
184 |
TempFile temp;
|
|
|
185 |
bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
|
|
|
186 |
if (!status) {
|
|
|
187 |
PyErr_SetString(PyExc_AttributeError, "interntofile failure");
|
|
|
188 |
return 0;
|
|
|
189 |
}
|
|
|
190 |
if (outfile.empty())
|
|
|
191 |
temp->setnoremove(1);
|
|
|
192 |
PyObject *result = outfile.empty() ? PyString_FromString(temp->filename()) :
|
|
|
193 |
PyString_FromString(outfile.c_str());
|
|
|
194 |
return (PyObject *)result;
|
|
|
195 |
}
|
146 |
|
196 |
|
147 |
// Method table for the Extractor type: one entry per Python-visible method,
// closed by an all-zero sentinel.
static PyMethodDef Extractor_methods[] = {
    {
        "textextract",
        (PyCFunction)Extractor_textextract,
        METH_VARARGS | METH_KEYWORDS,
        doc_Extractor_textextract
    },
    {
        "idoctofile",
        (PyCFunction)Extractor_idoctofile,
        METH_VARARGS | METH_KEYWORDS,
        doc_Extractor_idoctofile
    },
    {NULL, NULL, 0, NULL}  /* Sentinel */
};
|
152 |
|
204 |
|
153 |
// Doc string for the Extractor type.
PyDoc_STRVAR(doc_ExtractorObject,
             "Extractor()\n"
             "\n"
             "An Extractor object can extract data from a native simple or compound\n"
             "object.\n");
|
159 |
static PyTypeObject rclx_ExtractorType = {
|
211 |
static PyTypeObject rclx_ExtractorType = {
|
160 |
PyObject_HEAD_INIT(NULL)
|
212 |
PyObject_HEAD_INIT(NULL)
|
161 |
0, /*ob_size*/
|
213 |
0, /*ob_size*/
|
162 |
"rclextract.Extractor", /*tp_name*/
|
214 |
"rclextract.Extractor", /*tp_name*/
|
|
... |
|
... |
209 |
#define PyMODINIT_FUNC void
|
261 |
#define PyMODINIT_FUNC void
|
210 |
#endif
|
262 |
#endif
|
211 |
/*
 * Python 2 initialization entry point for the "rclextract" module.
 *
 * Steps, in order:
 *   1. Run recollinit() and keep the resulting configuration in the
 *      file-scope 'rclconfig'.
 *   2. Create the module and set its __doc__.
 *   3. Ready the Extractor type and publish it as rclextract.Extractor.
 *   4. Import the Doc type object exported by the recoll module through the
 *      "recoll.doctypeptr" capsule.
 *
 * The function returns nothing. On any failure it returns early and leaves
 * a Python exception set, either raised here or by the failing C-API call.
 */
PyMODINIT_FUNC
initrclextract(void)
{
    // We run recollinit. It's responsible for initializing some static data
    // which is distinct from pyrecoll's as we're separately dlopened
    string reason;
    rclconfig = recollinit(0, 0, reason, 0);
    if (rclconfig == 0) {
        PyErr_SetString(PyExc_EnvironmentError, reason.c_str());
        return;
    }
    if (!rclconfig->ok()) {
        PyErr_SetString(PyExc_EnvironmentError,
                        "Recoll init error: bad environment ?");
        return;
    }

    PyObject* m = Py_InitModule("rclextract", rclxMethods);
    if (m == 0) {
        // Module creation failed: 'm' must not be passed to the
        // PyModule_Add*() calls below.
        return;
    }
    if (PyModule_AddStringConstant(m, "__doc__", rclx_doc_string) < 0) {
        return;
    }

    if (PyType_Ready(&rclx_ExtractorType) < 0) {
        return;
    }
    // PyModule_AddObject() steals a reference, but only when it succeeds:
    // take one for it, and give it back if the call fails.
    Py_INCREF(&rclx_ExtractorType);
    if (PyModule_AddObject(m, "Extractor",
                           (PyObject *)&rclx_ExtractorType) < 0) {
        Py_DECREF(&rclx_ExtractorType);
        return;
    }

    // On failure PyCapsule_Import() returns NULL with an exception set;
    // this is the last step, so the error is reported to the importer
    // simply by returning.
    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctypeptr", 0);
}
|