recoll / Code / Diff of /src/python/recoll/pyrclextract.cpp

Diff of /src/python/recoll/pyrclextract.cpp [4ceb46] .. [aa9518]

Switch to side-by-side view

--- a/src/python/recoll/pyrclextract.cpp
+++ b/src/python/recoll/pyrclextract.cpp
@@ -29,10 +29,13 @@
 #include "rcldoc.h"
 #include "internfile.h"
 #include "rclconfig.h"
+#include "rclinit.h"
 
 #include "pyrecoll.h"
 
+// Imported from pyrecoll (fetched with PyCapsule_Import in initrclextract)
 static PyObject *recoll_DocType;
+static RclConfig *rclconfig; // Result of recollinit(), assigned in initrclextract
 
 //////////////////////////////////////////////////////////////////////
 /// Extractor object code
@@ -89,14 +92,22 @@
     return 0;
 }
 
+PyDoc_STRVAR(doc_Extractor_textextract, // Docstring wired to "textextract" in Extractor_methods
+"textextract(ipath)\n"
+"Extract document defined by ipath and return a doc object. The doc.text\n"
+"field has the document text as either text/plain or text/html\n"
+"according to doc.mimetype.\n"
+);
+
 static PyObject *
-Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
-{
-    LOGDEB(("Extractor_extract\n"));
+Extractor_textextract(rclx_ExtractorObject* self, PyObject *args, 
+		      PyObject *kwargs)
+{
+    LOGDEB(("Extractor_textextract\n"));
     static const char* kwlist[] = {"ipath", NULL};
     char *sipath = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract", 
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_textextract",
 				     (char**)kwlist, 
 				     "utf-8", &sipath))
 	return 0;
@@ -112,7 +123,7 @@
     recoll_DocObject *result = 
        (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
     if (!result) {
-	LOGERR(("Query_fetchone: couldn't create doc object for result\n"));
+        PyErr_SetString(PyExc_AttributeError, "extract: doc create failed");
 	return 0;
     }
     FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
@@ -127,8 +138,7 @@
 	result->doc->mimetype = "text/html";
     }
 
-    // fetching attributes easier. Is this actually needed ? Useful for
-    // url which is also formatted .
+    // Is this actually needed? Useful for the url, which also gets a printable form.
     Rcl::Doc *doc = result->doc;
     printableUrl(self->rclconfig->getDefCharset(), doc->url, 
 		 doc->meta[Rcl::Doc::keyurl]);
@@ -139,22 +149,64 @@
     return (PyObject *)result;
 }
 
-PyDoc_STRVAR(doc_extract,
-"extract(ipath)\n"
-"Extract document defined by ipath and return a doc object.\n"
+PyDoc_STRVAR(doc_Extractor_idoctofile, // Must list every keyword accepted by Extractor_idoctofile
+"idoctofile(ipath, mimetype, ofilename='') -> output file name\n"
+"Extract document defined by ipath into a file, in its native format.\n"
 );
+static PyObject * // Python method: write the document at ipath to a file, return the file name
+Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args, 
+		     PyObject *kwargs)
+{
+    LOGDEB(("Extractor_idoctofile\n"));
+    static const char* kwlist[] = {"ipath", "mimetype", "ofilename", NULL};
+    char *sipath = 0; // filled by an "es" conversion; released with PyMem_Free below
+    char *smt = 0; // filled by an "es" conversion; released with PyMem_Free below
+    char *soutfile = 0; // parsed with "s": not freed here; stays 0 when ofilename is omitted
+    if (!PyArg_ParseTupleAndKeywords(args,kwargs, "eses|s:Extractor_idoctofile",
+				     (char**)kwlist, 
+				     "utf-8", &sipath,
+				     "utf-8", &smt,
+				     &soutfile))
+	return 0; // argument parsing failed
+
+    string ipath(sipath); // copy into a std::string so the parser buffer can be released
+    PyMem_Free(sipath);
+    string mimetype(smt);
+    PyMem_Free(smt);
+    string outfile; // stays empty when no (or an empty) ofilename was passed
+    if (soutfile && *soutfile)
+	outfile.assign(soutfile); 
+    
+    if (self->xtr == 0) { // no underlying extractor object to work with
+        PyErr_SetString(PyExc_AttributeError, "extract: null object");
+	return 0;
+    }
+    TempFile temp; // handed to interntofile; only dereferenced below when outfile is empty
+    bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
+    if (!status) {
+        PyErr_SetString(PyExc_AttributeError, "interntofile failure");
+	return 0;
+    }
+    if (outfile.empty())
+	temp->setnoremove(1); // presumably keeps the temp file on disk for the caller -- TODO confirm
+    PyObject *result = outfile.empty() ? PyString_FromString(temp->filename()) : // temp file name
+	PyString_FromString(outfile.c_str()); // caller-supplied output name
+    return (PyObject *)result;
+}
 
 static PyMethodDef Extractor_methods[] = {
-    {"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
-     doc_extract},
+    {"textextract", (PyCFunction)Extractor_textextract, 
+     METH_VARARGS|METH_KEYWORDS, doc_Extractor_textextract}, // returns a doc object carrying the text
+    {"idoctofile", (PyCFunction)Extractor_idoctofile, 
+     METH_VARARGS|METH_KEYWORDS, doc_Extractor_idoctofile}, // writes the native-format document to a file
     {NULL}  /* Sentinel */
 };
 
 PyDoc_STRVAR(doc_ExtractorObject,
 "Extractor()\n"
 "\n"
-"A Extractor object describes a query. It has a number of global\n"
-"parameters and a chain of search clauses.\n"
+"An Extractor object can extract data from a native simple or compound\n"
+"object.\n" // NOTE(review): presumably used as the rclx_ExtractorType docstring -- initializer not visible here
 );
 static PyTypeObject rclx_ExtractorType = {
     PyObject_HEAD_INIT(NULL)
@@ -211,6 +263,20 @@
 PyMODINIT_FUNC
 initrclextract(void)
 {
+    // We run recollinit. It's responsible for initializing some static data
+    // which is distinct from pyrecoll's as we're separately dlopened
+    string reason;
+    rclconfig = recollinit(0, 0, reason, 0);
+    if (rclconfig == 0) {
+	PyErr_SetString(PyExc_EnvironmentError, reason.c_str());
+	return;
+    }
+    if (!rclconfig->ok()) {
+	PyErr_SetString(PyExc_EnvironmentError, 
+			"Recoll init error: bad environment ?");
+	return;
+    }
+
     PyObject* m = Py_InitModule("rclextract", rclxMethods);
     PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);
 
@@ -219,5 +285,5 @@
     Py_INCREF(&rclx_ExtractorType);
     PyModule_AddObject(m, "Extractor", (PyObject *)&rclx_ExtractorType);
 
-    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
-}
+    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctypeptr", 0);
+}