Switch to side-by-side view

--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@@ -15,7 +15,6 @@
  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  */
 
-
 #include <Python.h>
 #include <structmember.h>
 #include <bytearrayobject.h>
@@ -37,6 +36,10 @@
 #include "wasatorcl.h"
 #include "debuglog.h"
 #include "pathut.h"
+#include "plaintorich.h"
+#include "hldata.h"
+
+#include "pyrecoll.h"
 
 static set<Rcl::Db *> the_dbs;
 static set<Rcl::Query *> the_queries;
@@ -45,7 +48,7 @@
 static RclConfig *rclconfig;
 
 // This has to exist somewhere in the python api ??
-PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
+static PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
 {
     PyObject *result = tp->tp_new(tp, args, kwargs);
     if (result && tp->tp_init(result, args, kwargs) < 0)
@@ -252,11 +255,6 @@
 
 ///////////////////////////////////////////////////////////////////////
 ///// DOC Doc code
-typedef struct {
-    PyObject_HEAD
-    /* Type-specific fields go here. */
-    Rcl::Doc *doc;
-} recoll_DocObject;
 
 static void 
 Doc_dealloc(recoll_DocObject *self)
@@ -292,11 +290,12 @@
     self->doc = new Rcl::Doc;
     if (self->doc == 0)
 	return -1;
+    self->rclconfig = rclconfig;
     the_docs.insert(self->doc);
     return 0;
 }
 
-PyDoc_STRVAR(doc_getbinurl,
+PyDoc_STRVAR(doc_Doc_getbinurl,
 "getbinurl(none) -> binary url\n"
 "\n"
 "Returns an URL with a path part which is a as bit for bit copy of the \n"
@@ -316,7 +315,7 @@
 					 self->doc->url.size());
 }
 
-PyDoc_STRVAR(doc_setbinurl,
+PyDoc_STRVAR(doc_Doc_setbinurl,
 "setbinurl(url) -> binary url\n"
 "\n"
 "Set the URL from binary path like file://may/contain/unencodable/bytes\n"
@@ -340,12 +339,94 @@
     Py_RETURN_NONE;
 }
 
+PyDoc_STRVAR(doc_Doc_keys,
+"keys() -> list of doc object keys (attribute names)\n"
+);
+static PyObject *
+Doc_keys(recoll_DocObject *self)
+{
+    LOGDEB(("Doc_keys\n"));
+    if (self->doc == 0 || 
+	the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc");
+	return 0;
+    }
+
+    PyObject *pkeys = PyList_New(0);
+    for (map<string,string>::const_iterator it = self->doc->meta.begin();
+	 it != self->doc->meta.end(); it++) {
+	PyList_Append(pkeys,  PyUnicode_Decode(it->first.c_str(), 
+					       it->first.size(), 
+					       "UTF-8", "replace"));
+    }
+    return pkeys;
+}
+
+PyDoc_STRVAR(doc_Doc_items,
+"items() -> dictionary of doc object keys/values\n"
+);
+static PyObject *
+Doc_items(recoll_DocObject *self)
+{
+    LOGDEB(("Doc_getbinurl\n"));
+    if (self->doc == 0 || 
+	the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc");
+	return 0;
+    }
+
+    PyObject *pdict = PyDict_New();
+    for (map<string,string>::const_iterator it = self->doc->meta.begin();
+	 it != self->doc->meta.end(); it++) {
+	PyDict_SetItem(pdict, 
+		       PyUnicode_Decode(it->first.c_str(), 
+					it->first.size(), 
+					"UTF-8", "replace"),
+		       PyUnicode_Decode(it->second.c_str(), 
+					it->second.size(), 
+					"UTF-8", "replace"));
+    }
+    return pdict;
+}
+
+PyDoc_STRVAR(doc_Doc_get,
+"get(key) -> value\n"
+"Retrieve the named doc attribute\n"
+);
+
+static PyObject *
+Doc_get(recoll_DocObject *self, PyObject *args)
+{
+    LOGDEB(("Doc_get\n"));
+    char *sutf8 = 0; // needs freeing
+    if (!PyArg_ParseTuple(args, "es:Doc_get",
+			  "utf-8", &sutf8)) {
+	return 0;
+    }
+    string key(sutf8);
+    PyMem_Free(sutf8);
+
+    if (self->doc == 0 || 
+	the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc??");
+	return 0;
+    }
+    string value;
+    if (self->doc->getmeta(key, 0)) {
+	value = self->doc->meta[key];
+	return PyUnicode_Decode(value.c_str(), 
+				value.size(), 
+				"UTF-8", "replace");
+    }
+    Py_RETURN_NONE;
+}
 
 static PyMethodDef Doc_methods[] = {
-    {"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS,
-     doc_getbinurl},
-    {"setbinurl", (PyCFunction)Doc_setbinurl, METH_O,
-     doc_setbinurl},
+    {"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS, doc_Doc_getbinurl},
+    {"setbinurl", (PyCFunction)Doc_setbinurl, METH_O, doc_Doc_setbinurl},
+    {"keys", (PyCFunction)Doc_keys, METH_NOARGS, doc_Doc_keys},
+    {"items", (PyCFunction)Doc_items, METH_NOARGS, doc_Doc_items},
+    {"get", (PyCFunction)Doc_get, METH_VARARGS, doc_Doc_get},
     {NULL}  /* Sentinel */
 };
 
@@ -380,8 +461,6 @@
     case 'f':
 	if (!key.compare(Rcl::Doc::keyfs)) {
 	    value = self->doc->fbytes; found = true;
-	} else if (!key.compare(Rcl::Doc::keyfs)) {
-	    value = self->doc->fbytes; found = true;
 	} else if (!key.compare(Rcl::Doc::keyfmt)) {
 	    value = self->doc->fmtime; found = true;
 	}
@@ -417,6 +496,11 @@
 	} else 	if (!key.compare(Rcl::Doc::keysz)) {
 	    value = self->doc->dbytes.empty() ? self->doc->fbytes : 
 		self->doc->dbytes; found = true;
+	}
+	break;
+    case 't':
+	if (!key.compare("text")) {
+	    value = self->doc->text; found = true;
 	}
 	break;
     }
@@ -432,7 +516,14 @@
 
 	if (self->doc->getmeta(key, 0)) {
 	    value = self->doc->meta[key];
-	}
+	    found = true;
+	}
+    }
+
+    if (!found) {
+	LOGDEB(("Doc_getattr: name [%s] key [%s] Not found\n",
+		name, key.c_str()));
+	Py_RETURN_NONE;
     }
 
     LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n",
@@ -683,10 +774,17 @@
 {
     LOGDEB(("Query_sortby\n"));
     static const char *kwlist[] = {"field", "ascending", NULL};
+    PyObject *ascobj = 0;
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", (char**)kwlist,
 				     &self->sortfield,
-				     &self->ascending))
-	return 0;
+				     &ascobj))
+	return 0;
+
+    if (ascobj != 0 && !PyObject_IsTrue(ascobj))
+	self->ascending = false;
+    else 
+	self->ascending = true;
+
     Py_RETURN_NONE;
 }
 
@@ -707,13 +805,15 @@
     char *sutf8 = 0; // needs freeing
     char *sstemlang = 0;
     int dostem = 1;
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|ies:Query_execute", 
+    PyObject *dostemobj = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute", 
 				     (char**)kwlist, "utf-8", &sutf8,
-				     &dostem, 
+				     &dostemobj, 
 				     "utf-8", &sstemlang)) {
 	return 0;
     }
-
+    if (dostemobj != 0 && !PyObject_IsTrue(dostemobj))
+	dostem = 0;
 
     string utf8(sutf8);
     PyMem_Free(sutf8);
@@ -828,15 +928,274 @@
     return (PyObject *)result;
 }
 
+
+PyDoc_STRVAR(doc_Query_highlight,
+"highlight(text, ishtml = 0/1, eolbr = 0/1, methods = object))\n"
+"Will insert <span \"class=rclmatch\"></span> tags around the match areas\n"
+"in the input text and return the modified text\n"
+"ishtml can be set to indicate that the input text is html and html special\n"
+" characters should not be escaped\n"
+"methods if set should be an object with methods startMatch(i) and endMatch()\n"
+"  which will be called for each match and should return a begin and end tag\n"
+);
+
+class PyPlainToRich: public PlainToRich {
+public:
+    PyPlainToRich(PyObject *methods)
+    : m_methods(methods)
+    {
+    }
+    virtual ~PyPlainToRich()
+    {
+    }
+    virtual string startMatch(unsigned int idx)
+    {
+	PyObject *res =  0;
+	if (m_methods)
+	    res = PyObject_CallMethod(m_methods, (char *)"startMatch", 
+				      (char *)"(i)", idx);
+	if (res == 0)
+	    return "<span class=\"rclmatch\">";
+	PyObject *res1 = res;
+	if (PyUnicode_Check(res))
+	    res1 = PyUnicode_AsUTF8String(res);
+	return PyString_AsString(res1);
+    } 
+
+    virtual string endMatch() 
+    {
+	PyObject *res =  0;
+	if (m_methods)
+	    res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
+	if (res == 0)
+	    return "</span res is null>";
+	PyObject *res1 = res;
+	if (PyUnicode_Check(res))
+	    res1 = PyUnicode_AsUTF8String(res);
+	return PyString_AsString(res1);
+    }
+
+    PyObject *m_methods;
+};
+
+static PyObject *
+Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
+{
+    LOGDEB1(("Query_highlight\n"));
+    static const char *kwlist[] = {"text", "ishtml", "methods", NULL};
+    char *sutf8 = 0; // needs freeing
+    int ishtml = 0;
+    PyObject *ishtmlobj = 0;
+    PyObject *methods = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OO:Query_highlight",
+				     (char**)kwlist, 
+				     "utf-8", &sutf8,
+				     &ishtml,
+				     &methods)) {
+	return 0;
+    }
+    string utf8(sutf8);
+    LOGDEB(("Query_highlight: [%s] ishtml %d\n", sutf8, ishtml));
+    PyMem_Free(sutf8);
+    if (ishtmlobj != 0 && PyObject_IsTrue(ishtmlobj))
+	ishtml = 1;
+
+    if (self->query == 0 || 
+	the_queries.find(self->query) == the_queries.end()) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+	return 0;
+    }
+
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+	PyErr_SetString(PyExc_ValueError, "Query not initialized");
+	return 0;
+    }
+    HighlightData hldata;
+    sd->getTerms(hldata);
+    PyPlainToRich hler(methods);
+    hler.set_inputhtml(ishtml);
+    list<string> out;
+    hler.plaintorich(utf8, out, hldata, 5000000);
+    if (out.empty()) {
+	PyErr_SetString(PyExc_ValueError, "Plaintorich failed");
+	return 0;
+    }
+    PyObject* unicode = PyUnicode_FromStringAndSize(out.begin()->c_str(),
+						    out.begin()->size());
+    return Py_BuildValue("u#", PyUnicode_AsUnicode(unicode), 
+			 PyUnicode_GetSize(unicode));
+}
+
+PyDoc_STRVAR(doc_Query_makedocabstract,
+"makedocabstract(doc, methods = object))\n"
+"Will create a snippets abstract for doc by selecting text around the match\n"
+" terms\n"
+"If methods is set, will also perform highlighting. See the highlight method\n"
+);
+static PyObject *
+Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
+{
+    LOGDEB(("Db_makeDocAbstract\n"));
+    static const char *kwlist[] = {"doc", "methods", NULL};
+    recoll_DocObject *pydoc = 0;
+    PyObject *hlmethods = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_makeDocAbstract",
+				     (char **)kwlist,
+				     &recoll_DocType, &pydoc,
+				     &hlmethods)) {
+	return 0;
+    }
+
+    if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
+	LOGERR(("Query_makeDocAbstract: doc not found %p\n", pydoc->doc));
+        PyErr_SetString(PyExc_AttributeError, "doc");
+        return 0;
+    }
+    if (the_queries.find(self->query) == the_queries.end()) {
+	LOGERR(("Query_makeDocAbstract: query not found %p\n", self->query));
+        PyErr_SetString(PyExc_AttributeError, "query");
+        return 0;
+    }
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+	PyErr_SetString(PyExc_ValueError, "Query not initialized");
+	return 0;
+    }
+    string abstract;
+    if (hlmethods == 0) {
+	if (!self->query->makeDocAbstract(*(pydoc->doc), abstract)) {
+	    PyErr_SetString(PyExc_EnvironmentError, 
+			    "rcl makeDocAbstract failed");
+	    return 0;
+	}
+    } else {
+	HighlightData hldata;
+	sd->getTerms(hldata);
+	PyPlainToRich hler(hlmethods);
+	hler.set_inputhtml(0);
+	vector<string> vabs;
+	self->query->makeDocAbstract(*pydoc->doc, vabs);
+	for (unsigned int i = 0; i < vabs.size(); i++) {
+	    if (vabs[i].empty())
+		continue;
+	    list<string> lr;
+	    // There may be data like page numbers before the snippet text.
+	    // will be in brackets.
+	    string::size_type bckt = vabs[i].find("]");
+	    if (bckt == string::npos) {
+		hler.plaintorich(vabs[i], lr, hldata);
+	    } else {
+		hler.plaintorich(vabs[i].substr(bckt), lr, hldata);
+		lr.front() = vabs[i].substr(0, bckt) + lr.front();
+	    }
+	    abstract += lr.front();
+	    abstract += "...";
+	}
+    }
+
+    // Return a python unicode object
+    return PyUnicode_Decode(abstract.c_str(), abstract.size(), 
+				     "UTF-8", "replace");
+}
+
+
+PyDoc_STRVAR(doc_Query_getxquery,
+"getxquery(None) -> Unicode string\n"
+"\n"
+"Retrieves the Xapian query description as a Unicode string.\n"
+"Meaningful only after executexx\n"
+);
+static PyObject *
+Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *)
+{
+    LOGDEB(("Query_getxquery\n"));
+
+    if (self->query == 0 || 
+	the_queries.find(self->query) == the_queries.end()) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+	return 0;
+    }
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+	PyErr_SetString(PyExc_ValueError, "Query not initialized");
+	return 0;
+    }
+    string desc = sd->getDescription();
+    return PyUnicode_Decode(desc.c_str(), desc.size(), "UTF-8", "replace");
+}
+
+PyDoc_STRVAR(doc_Query_getgroups,
+"getgroups(None) -> a list of pairs\n"
+"\n"
+"Retrieves the expanded query terms. Meaningful only after executexx\n"
+"In each pair, the first entry is a list of user terms, the second a list of\n"
+"query terms as derived from the user terms and used in the Xapian Query.\n"
+"The size of each list is one for simple terms, or more for group and phrase\n"
+"clauses\n"
+);
+static PyObject *
+Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *)
+{
+    LOGDEB(("Query_getxquery\n"));
+
+    if (self->query == 0 || 
+	the_queries.find(self->query) == the_queries.end()) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+	return 0;
+    }
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+	PyErr_SetString(PyExc_ValueError, "Query not initialized");
+	return 0;
+    }
+    HighlightData hld;
+    sd->getTerms(hld);
+    PyObject *mainlist = PyList_New(0);
+    PyObject *ulist;
+    PyObject *xlist;
+    // We walk the groups vector. For each we retrieve the user group,
+    // make a python list of each, then group those in a pair, and
+    // append this to the main list.
+    for (unsigned int i = 0; i < hld.groups.size(); i++) {
+	unsigned int ugidx = hld.grpsugidx[i];
+	ulist = PyList_New(hld.ugroups[ugidx].size());
+	for (unsigned int j = 0; j < hld.ugroups[ugidx].size(); j++) {
+	    PyList_SetItem(ulist, j, 
+			   PyUnicode_Decode(hld.ugroups[ugidx][j].c_str(), 
+					    hld.ugroups[ugidx][j].size(), 
+					    "UTF-8", "replace"));
+	}
+
+	xlist = PyList_New(hld.groups[i].size());
+	for (unsigned int j = 0; j < hld.groups[i].size(); j++) {
+	    PyList_SetItem(xlist, j, 
+			   PyUnicode_Decode(hld.groups[i][j].c_str(), 
+					    hld.groups[i][j].size(), 
+					    "UTF-8", "replace"));
+	}
+	PyList_Append(mainlist,  Py_BuildValue("(OO)", ulist, xlist));
+    }
+    return mainlist;
+}
+
 static PyMethodDef Query_methods[] = {
     {"execute", (PyCFunction)Query_execute, METH_VARARGS|METH_KEYWORDS, 
      doc_Query_execute},
     {"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS, 
      doc_Query_executesd},
-    {"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,
+    {"fetchone", (PyCFunction)Query_fetchone, METH_NOARGS,
      doc_Query_fetchone},
     {"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS,
      doc_Query_sortby},
+    {"highlight", (PyCFunction)Query_highlight, METH_VARARGS|METH_KEYWORDS,
+     doc_Query_highlight},
+    {"getxquery", (PyCFunction)Query_getxquery, METH_NOARGS,
+     doc_Query_getxquery},
+    {"getgroups", (PyCFunction)Query_getgroups, METH_NOARGS,
+     doc_Query_getgroups},
+    {"makedocabstract", (PyCFunction)Query_makedocabstract, 
+     METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
     {NULL}  /* Sentinel */
 };
 
@@ -1037,12 +1396,13 @@
         PyErr_SetString(PyExc_AttributeError, "db id not found");
         return 0;
     }
+    LOGDEB(("Db_setAbstractParams: mxchrs %d, ctxwrds %d\n", maxchars, ctxwords));
     self->db->setAbstractParams(-1, maxchars, ctxwords);
     Py_RETURN_NONE;
 }
 
 static PyObject *
-Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *)
+Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
 {
     LOGDEB(("Db_makeDocAbstract\n"));
     recoll_DocObject *pydoc = 0;
@@ -1341,4 +1701,8 @@
     PyModule_AddStringConstant(m, "__doc__",
                                pyrecoll_doc_string);
 
-}
+    
+    PyObject* doctypecapsule = 
+	PyCapsule_New(&recoll_DocType, "recoll.doctype", 0);
+    PyModule_AddObject(m, "doctype", doctypecapsule);
+}