Switch to unified view

a/src/python/recoll/pyrclextract.cpp b/src/python/recoll/pyrclextract.cpp
...
...
27
27
28
#include "debuglog.h"
28
#include "debuglog.h"
29
#include "rcldoc.h"
29
#include "rcldoc.h"
30
#include "internfile.h"
30
#include "internfile.h"
31
#include "rclconfig.h"
31
#include "rclconfig.h"
32
#include "rclinit.h"
32
33
33
#include "pyrecoll.h"
34
#include "pyrecoll.h"
34
35
36
// Imported from pyrecoll
35
static PyObject *recoll_DocType;
37
static PyObject *recoll_DocType;
38
static RclConfig *rclconfig;
36
39
37
//////////////////////////////////////////////////////////////////////
40
//////////////////////////////////////////////////////////////////////
38
/// Extractor object code
41
/// Extractor object code
39
typedef struct {
42
typedef struct {
40
    PyObject_HEAD
43
    PyObject_HEAD
...
...
87
    self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
90
    self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
88
                 FileInterner::FIF_forPreview);
91
                 FileInterner::FIF_forPreview);
89
    return 0;
92
    return 0;
90
}
93
}
91
94
95
PyDoc_STRVAR(doc_Extractor_textextract,
96
"textextract(ipath)\n"
97
"Extract document defined by ipath and return a doc object. The doc.text\n"
98
"field has the document text as either text/plain or text/html\n"
99
"according to doc.mimetype.\n"
100
);
101
92
static PyObject *
102
static PyObject *
93
Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
103
Extractor_textextract(rclx_ExtractorObject* self, PyObject *args, 
104
            PyObject *kwargs)
94
{
105
{
95
    LOGDEB(("Extractor_extract\n"));
106
    LOGDEB(("Extractor_textextract\n"));
96
    static const char* kwlist[] = {"ipath", NULL};
107
    static const char* kwlist[] = {"ipath", NULL};
97
    char *sipath = 0;
108
    char *sipath = 0;
98
109
99
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract", 
110
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_textextract",
100
                     (char**)kwlist, 
111
                     (char**)kwlist, 
101
                     "utf-8", &sipath))
112
                     "utf-8", &sipath))
102
    return 0;
113
    return 0;
103
114
104
    string ipath(sipath);
115
    string ipath(sipath);
...
...
110
    }
121
    }
111
    /* Call the doc class object to create a new doc. */
122
    /* Call the doc class object to create a new doc. */
112
    recoll_DocObject *result = 
123
    recoll_DocObject *result = 
113
       (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
124
       (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
114
    if (!result) {
125
    if (!result) {
115
  LOGERR(("Query_fetchone: couldn't create doc object for result\n"));
126
        PyErr_SetString(PyExc_AttributeError, "extract: doc create failed");
116
    return 0;
127
    return 0;
117
    }
128
    }
118
    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
129
    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
119
    if (status != FileInterner::FIDone) {
130
    if (status != FileInterner::FIDone) {
120
        PyErr_SetString(PyExc_AttributeError, "internfile failure");
131
        PyErr_SetString(PyExc_AttributeError, "internfile failure");
...
...
125
    if (!html.empty()) {
136
    if (!html.empty()) {
126
    result->doc->text = html;
137
    result->doc->text = html;
127
    result->doc->mimetype = "text/html";
138
    result->doc->mimetype = "text/html";
128
    }
139
    }
129
140
130
    // fetching attributes easier. Is this actually needed ? Useful for
141
    // Is this actually needed ? Useful for url which is also formatted .
131
    // url which is also formatted .
132
    Rcl::Doc *doc = result->doc;
142
    Rcl::Doc *doc = result->doc;
133
    printableUrl(self->rclconfig->getDefCharset(), doc->url, 
143
    printableUrl(self->rclconfig->getDefCharset(), doc->url, 
134
         doc->meta[Rcl::Doc::keyurl]);
144
         doc->meta[Rcl::Doc::keyurl]);
135
    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
145
    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
136
    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
146
    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
137
    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
147
    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
138
    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
148
    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
139
    return (PyObject *)result;
149
    return (PyObject *)result;
140
}
150
}
141
151
142
PyDoc_STRVAR(doc_extract,
152
PyDoc_STRVAR(doc_Extractor_idoctofile,
143
"extract(ipath)\n"
153
"idoctofile(ipath)\n"
144
"Extract document defined by ipath and return a doc object.\n"
154
"Extract document defined by ipath into a file, in its native format.\n"
145
);
155
);
156
static PyObject *
157
Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args, 
158
           PyObject *kwargs)
159
{
160
    LOGDEB(("Extractor_idoctofile\n"));
161
    static const char* kwlist[] = {"ipath", "mimetype", "ofilename", NULL};
162
    char *sipath = 0;
163
    char *smt = 0;
164
    char *soutfile = 0; // no freeing
165
    if (!PyArg_ParseTupleAndKeywords(args,kwargs, "eses|s:Extractor_idoctofile",
166
                   (char**)kwlist, 
167
                   "utf-8", &sipath,
168
                   "utf-8", &smt,
169
                   &soutfile))
170
  return 0;
171
172
    string ipath(sipath);
173
    PyMem_Free(sipath);
174
    string mimetype(smt);
175
    PyMem_Free(smt);
176
    string outfile;
177
    if (soutfile && *soutfile)
178
  outfile.assign(soutfile); 
179
    
180
    if (self->xtr == 0) {
181
        PyErr_SetString(PyExc_AttributeError, "extract: null object");
182
  return 0;
183
    }
184
    TempFile temp;
185
    bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
186
    if (!status) {
187
        PyErr_SetString(PyExc_AttributeError, "interntofile failure");
188
  return 0;
189
    }
190
    if (outfile.empty())
191
  temp->setnoremove(1);
192
    PyObject *result = outfile.empty() ? PyString_FromString(temp->filename()) :
193
  PyString_FromString(outfile.c_str());
194
    return (PyObject *)result;
195
}
146
196
147
static PyMethodDef Extractor_methods[] = {
197
static PyMethodDef Extractor_methods[] = {
148
    {"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
198
    {"textextract", (PyCFunction)Extractor_textextract, 
149
     doc_extract},
199
     METH_VARARGS|METH_KEYWORDS, doc_Extractor_textextract},
200
    {"idoctofile", (PyCFunction)Extractor_idoctofile, 
201
     METH_VARARGS|METH_KEYWORDS, doc_Extractor_idoctofile},
150
    {NULL}  /* Sentinel */
202
    {NULL}  /* Sentinel */
151
};
203
};
152
204
153
PyDoc_STRVAR(doc_ExtractorObject,
205
PyDoc_STRVAR(doc_ExtractorObject,
154
"Extractor()\n"
206
"Extractor()\n"
155
"\n"
207
"\n"
156
"A Extractor object describes a query. It has a number of global\n"
208
"An Extractor object can extract data from a native simple or compound\n"
157
"parameters and a chain of search clauses.\n"
209
"object.\n"
158
);
210
);
159
static PyTypeObject rclx_ExtractorType = {
211
static PyTypeObject rclx_ExtractorType = {
160
    PyObject_HEAD_INIT(NULL)
212
    PyObject_HEAD_INIT(NULL)
161
    0,                         /*ob_size*/
213
    0,                         /*ob_size*/
162
    "rclextract.Extractor",             /*tp_name*/
214
    "rclextract.Extractor",             /*tp_name*/
...
...
209
#define PyMODINIT_FUNC void
261
#define PyMODINIT_FUNC void
210
#endif
262
#endif
211
PyMODINIT_FUNC
263
PyMODINIT_FUNC
212
initrclextract(void)
264
initrclextract(void)
213
{
265
{
266
    // We run recollinit. It's responsible for initializing some static data
267
    // which is distinct from pyrecoll's as we're separately dlopened
268
    string reason;
269
    rclconfig = recollinit(0, 0, reason, 0);
270
    if (rclconfig == 0) {
271
  PyErr_SetString(PyExc_EnvironmentError, reason.c_str());
272
  return;
273
    }
274
    if (!rclconfig->ok()) {
275
  PyErr_SetString(PyExc_EnvironmentError, 
276
          "Recoll init error: bad environment ?");
277
  return;
278
    }
279
214
    PyObject* m = Py_InitModule("rclextract", rclxMethods);
280
    PyObject* m = Py_InitModule("rclextract", rclxMethods);
215
    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);
281
    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);
216
282
217
    if (PyType_Ready(&rclx_ExtractorType) < 0)
283
    if (PyType_Ready(&rclx_ExtractorType) < 0)
218
        return;
284
        return;
219
    Py_INCREF(&rclx_ExtractorType);
285
    Py_INCREF(&rclx_ExtractorType);
220
    PyModule_AddObject(m, "Extractor", (PyObject *)&rclx_ExtractorType);
286
    PyModule_AddObject(m, "Extractor", (PyObject *)&rclx_ExtractorType);
221
287
222
    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
288
    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctypeptr", 0);
223
}
289
}