wasaparse.ypp
506 lines (463 with data), 12.3 kB
%{
#define YYDEBUG 1
#include "autoconfig.h"
#include <stdio.h>
#include <iostream>
#include <string>
#include "searchdata.h"
#include "wasaparserdriver.h"
#include "wasaparse.hpp"
using namespace std;
// Parser tracing. Uncomment the LOG_PARSER definition below to make LOGP()
// write its argument (a stream insertion chain) to cerr; when LOG_PARSER is
// not defined, LOGP() expands to nothing.
//#define LOG_PARSER
#ifdef LOG_PARSER
#define LOGP(X) {cerr << X;}
#else
#define LOGP(X)
#endif
// Lexer entry point called by the generated parser. The driver pointer is
// the extra argument declared with %lex-param.
int yylex(yy::parser::semantic_type *, yy::parser::location_type *,
WasaParserDriver *);
// NOTE(review): no definition of yyerror() appears in the part of the file
// visible here -- confirm whether this declaration is still needed.
void yyerror(char const *);
// Applies the qualifier characters that follow a quoted string to the
// clause built from that string.
static void qualify(Rcl::SearchDataClauseDist *, const string &);
// Attach the query sq to the query sd as a sub-query clause.
//
// Nothing happens unless both pointers are non-null. When the clause is
// added, sq is wrapped in a std::shared_ptr, so the caller must not delete
// it afterwards. The driver argument d is not used here.
static void addSubQuery(WasaParserDriver *d,
                        Rcl::SearchData *sd, Rcl::SearchData *sq)
{
    if (!sd || !sq)
        return;

    Rcl::SearchDataClauseSub *subclause =
        new Rcl::SearchDataClauseSub(std::shared_ptr<Rcl::SearchData>(sq));
    sd->addClause(subclause);
}
%}
/* Generate a C++ parser class from the lalr1.cc skeleton, with a header
   file (%defines), location tracking and verbose syntax error messages. */
%skeleton "lalr1.cc"
%defines
%locations
%error-verbose
/* The driver object is handed both to the parser (available as "d" inside
   the actions) and to yylex(). */
%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}
/* Semantic values: all are raw pointers to heap objects. */
%union {
/* Text of WORD, QUOTED and QUALIFIERS tokens, and of field names */
std::string *str;
/* Range clause built by the "range" rule */
Rcl::SearchDataClauseRange *rg;
/* Single clause: term, quoted phrase, or field expression */
Rcl::SearchDataClauseSimple *cl;
/* Query or sub-query */
Rcl::SearchData *sd;
}
/* Free the string of any <str> symbol the parser discards.
   NOTE(review): no %destructor is declared for <rg>, <cl> or <sd>, so such
   values are presumably not freed when discarded on a syntax error --
   confirm whether that is intended. */
%destructor {delete $$;} <str>
%type <cl> qualquote
%type <cl> fieldexpr
%type <rg> range
%type <cl> term
%type <sd> query
%type <str> complexfieldname
/* Non operator tokens need precedence because of the possibility of
concatenation which needs to have lower prec than OR.
Precedence increases with each declaration line below, so OR, declared
last, binds tighter than AND and than implicit concatenation (UCONCAT). */
%left <str> WORD
%left <str> QUOTED
%left <str> QUALIFIERS
%left AND UCONCAT '(' '-'
%left OR
/* Field relation operators and the ".." range operator; see yylex() for
   the characters which produce them. */
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER RANGE
%%
/* Start symbol (first rule of the grammar): a complete query. The resulting
   search data object is stored in the driver as m_result. */
topquery: query
{
// It's possible that we end up with no query (e.g. because just a
// date filter was set, no terms). Allocate an empty query so that we
// have something to set the global criteria on (this will yield a
// Xapian search like <alldocuments> FILTER xxx).
if ($1 == 0)
d->m_result = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
else
d->m_result = $1;
}
/* A query is a combination of sub-queries (implicit AND by juxtaposition,
   explicit AND, OR, parentheses) or a single field expression. The value
   is a null pointer when nothing usable was produced. */
query:
/* Juxtaposition: two queries side by side are ANDed. The rule takes the
   precedence of the UCONCAT pseudo-token. */
query query %prec UCONCAT
{
LOGP("q: query query\n");
Rcl::SearchData *sd = 0;
// Only build a combined query if at least one side is non-null;
// addSubQuery() ignores a null operand.
if ($1 || $2) {
sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
addSubQuery(d, sd, $1);
addSubQuery(d, sd, $2);
}
$$ = sd;
}
/* Explicit AND: same construction as juxtaposition. */
| query AND query
{
LOGP("q: query AND query\n");
Rcl::SearchData *sd = 0;
if ($1 || $3) {
sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
addSubQuery(d, sd, $1);
addSubQuery(d, sd, $3);
}
$$ = sd;
}
/* OR: both sides become sub-queries of an SCLT_OR query. */
| query OR query
{
LOGP("query: query OR query\n");
Rcl::SearchData *top = 0;
if ($1 || $3) {
top = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
addSubQuery(d, top, $1);
addSubQuery(d, top, $3);
}
$$ = top;
}
/* Parentheses only group: the inner query is passed through unchanged. */
| '(' query ')'
{
LOGP("q: ( query )\n");
$$ = $2;
}
|
/* A single field expression is wrapped in an AND query of its own. */
fieldexpr %prec UCONCAT
{
LOGP("q: fieldexpr\n");
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
// If the driver reports failure from addClause(), discard the
// wrapper and yield a null query.
if (d->addClause(sd, $1)) {
$$ = sd;
} else {
delete sd;
$$ = 0;
}
}
;
/* A field expression is a bare term, a "field OP term" relation, a
   "field:range" expression, or any of these negated with a leading '-'.
   For the relations, the clause built for the right-hand side is updated
   with the field name and the relation, then passed up; the field name
   string is freed once it has been copied into the clause. */
fieldexpr: term
{
LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
$$ = $1;
}
/* field = term */
| complexfieldname EQUALS term
{
LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_EQUALS);
$$ = $3;
delete $1;
}
/* field : term */
| complexfieldname CONTAINS term
{
LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
$$ = $3;
delete $1;
}
/* field : range. The range clause pointer is assigned to a simple clause
   pointer here (presumably SearchDataClauseRange derives from
   SearchDataClauseSimple -- confirm in searchdata.h). */
| complexfieldname CONTAINS range
{
LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
$$ = $3;
delete $1;
}
/* field < term */
| complexfieldname SMALLER term
{
LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_LT);
$$ = $3;
delete $1;
}
/* field <= term */
| complexfieldname SMALLEREQ term
{
LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_LTE);
$$ = $3;
delete $1;
}
/* field > term */
| complexfieldname GREATER term
{
LOGP("fe: " << *$1 << " > " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_GT);
$$ = $3;
delete $1;
}
/* field >= term */
| complexfieldname GREATEREQ term
{
LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_GTE);
$$ = $3;
delete $1;
}
/* Negation: mark the clause as an exclusion. */
| '-' fieldexpr
{
LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
$2->setexclude(true);
$$ = $2;
}
;
/* Deal with field names like dc:title: a field name is a WORD, optionally
   extended with further ':'-separated WORDs. The value is a heap-allocated
   string holding the full name. */
complexfieldname:
WORD
{
LOGP("cfn: WORD" << endl);
// The token string is passed up as is.
$$ = $1;
}
|
complexfieldname CONTAINS WORD
{
LOGP("cfn: complexfieldname ':' WORD" << endl);
// Build "prefix:word" in a new string, then free both components.
$$ = new string(*$1 + string(":") + *$3);
delete $1;
delete $3;
}
/* A range is "low..high", "..high" or "low..". A missing bound is passed
   to the range clause as an empty string. The WORD strings are freed once
   the clause has been built from them. */
range:
/* low..high */
WORD RANGE WORD
{
LOGP("Range: " << *$1 << string(" .. ") << *$3 << endl);
$$ = new Rcl::SearchDataClauseRange(*$1, *$3);
delete $1;
delete $3;
}
|
/* ..high : no lower bound */
RANGE WORD
{
LOGP("Range: " << "" << string(" .. ") << *$2 << endl);
$$ = new Rcl::SearchDataClauseRange("", *$2);
delete $2;
}
|
/* low.. : no upper bound */
WORD RANGE
{
LOGP("Range: " << *$1 << string(" .. ") << "" << endl);
$$ = new Rcl::SearchDataClauseRange(*$1, "");
delete $1;
}
;
/* A term is either a single word or a quoted string with optional
   qualifiers. */
term:
WORD
{
LOGP("term[" << *$1 << "]" << endl);
// Build a simple SCLT_AND clause from the word, then free the token string.
$$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
delete $1;
}
| qualquote
{
// The clause built by the qualquote rule is passed up unchanged.
$$ = $1;
}
/* A quoted string, optionally followed by qualifier characters. Both forms
   build an SCLT_PHRASE distance clause; the third constructor argument is 0
   (presumably the initial slack -- confirm in searchdata.h). */
qualquote:
QUOTED
{
LOGP("QUOTED[" << *$1 << "]" << endl);
$$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
delete $1;
}
| QUOTED QUALIFIERS
{
LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
Rcl::SearchDataClauseDist *cl =
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
// Let qualify() adjust the clause according to the qualifier characters.
qualify(cl, *$2);
$$ = cl;
// Both token strings have been used and can be freed.
delete $1;
delete $2;
}
%%
#include <ctype.h>
// Parse an optional integer located right after position cur in q.
//
// @param q    qualifier string
// @param cur  index of the qualifier character being processed; the number,
//             if any, starts at cur + 1
// @param pval receives the parsed value; left untouched when no number
//             follows
// @return the index of the last character consumed: cur when no number
//         was found, else the index of the number's last character, so
//         that a caller which then increments its index continues right
//         after the number.
static unsigned int qualGetInt(const std::string& q, unsigned int cur, int *pval)
{
    // Written as "cur + 1 >= size" instead of "cur < size - 1": the latter
    // wraps around for an empty string and would let us read out of bounds.
    if (static_cast<std::string::size_type>(cur) + 1 >= q.size())
        return cur;

    // c_str() is NUL-terminated, so strtol() stops at the end of q at the
    // latest.
    const char *start = q.c_str() + cur + 1;
    char *endptr = nullptr;
    const long val = strtol(start, &endptr, 10);
    if (endptr == start) {
        // No conversion performed: nothing consumed, *pval unchanged.
        return cur;
    }
    *pval = static_cast<int>(val);
    return cur + static_cast<unsigned int>(endptr - start);
}
// Apply the qualifier characters which followed a quoted string to the
// clause built from that string.
//
// @param cl    clause to adjust
// @param quals qualifier characters, processed from left to right:
//   b        set the weight to 10.0
//   C        case sensitive
//   D        diacritics sensitive
//   e        case sensitive + diacritics sensitive + no stemming
//   l        no stemming
//   o[N]     set the slack to N, or to 10 if no number follows
//   p        change the clause type to SCLT_NEAR; if the slack is still 0,
//            set it to 10
//   s        no synonyms
//   c d L S  accepted, no effect
//   number   (starting with a digit or '.') weight factor; a value of
//            exactly 1.0 is ignored
//   Any other character is silently ignored.
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
    for (unsigned int i = 0; i < quals.length(); i++) {
        switch (quals[i]) {
        case 'b':
            cl->setWeight(10.0);
            break;
        case 'C':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            break;
        case 'D':
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            break;
        case 'e':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'l':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'o': {
            // qualGetInt() leaves slack at its default when no number
            // follows, and returns the index of the last character used.
            int slack = 10;
            i = qualGetInt(quals, i, &slack);
            cl->setslack(slack);
            break;
        }
        case 'p':
            cl->setTp(Rcl::SCLT_NEAR);
            if (cl->getslack() == 0) {
                cl->setslack(10);
            }
            break;
        case 's':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
            break;
        case 'c':
        case 'd':
        case 'L':
        case 'S':
            // Recognized, but nothing to do.
            break;
        case '.': case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9': {
            int n = 0;
            float factor = 1.0;
            // sscanf() returns the number of conversions stored (%n is not
            // counted), or EOF. Compare against 1: EOF is non-zero and must
            // not be taken for a success.
            if (sscanf(quals.c_str() + i, "%f %n", &factor, &n) == 1) {
                if (factor != 1.0) {
                    cl->setWeight(factor);
                }
            }
            // Skip what was consumed; the loop increment accounts for the
            // last character.
            if (n > 0)
                i += n - 1;
            break;
        }
        default:
            break;
        }
    }
}
// specialstartchars are special only at the beginning of a token
// (e.g. doctor-who is a term, not 2 terms separated by '-'). yylex()
// returns such a character as a token of its own when it is the first
// non-blank character read.
static const string specialstartchars("-");
// specialinchars are special everywhere except inside a quoted string:
// any of them ends the word which yylex() is accumulating.
static const string specialinchars(":=<>()");
// Read the rest of a double-quoted string. Called with the first dquote
// already read.
//
// A backslash makes the next character literal (this is how a dquote or a
// backslash is embedded). The string ends at the closing dquote or at the
// end of input. Alphanumeric characters and '.' immediately following the
// closing dquote are stored in the driver as qualifiers; yylex() returns
// them as a QUALIFIERS token on its next call.
//
// @param d      parser driver, supplying the input characters
// @param yylval receives a newly allocated string with the unquoted text
// @return always the QUOTED token
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
{
    string* value = new string();
    d->qualifiers().clear();

    bool done = false;
    int c;
    while (!done && (c = d->GETCHAR()) != 0) {
        switch (c) {
        case '\\': {
            /* Escape: get next char */
            const int escaped = d->GETCHAR();
            if (escaped == 0) {
                // Backslash was the last input character: keep it as is
                // rather than storing a NUL byte in the phrase.
                value->push_back('\\');
                done = true;
            } else {
                value->push_back(static_cast<char>(escaped));
            }
            break;
        }
        case '"':
            /* End of string. Look for qualifiers */
            // The ctype functions are only defined for unsigned char values
            // and EOF, hence the cast (GETCHAR() may yield negative values
            // for bytes above 0x7f if the input is held as plain char --
            // TODO confirm).
            while ((c = d->GETCHAR()) != 0 &&
                   (isalnum(static_cast<unsigned char>(c)) || c == '.')) {
                d->qualifiers().push_back(static_cast<char>(c));
            }
            // Give back the character which ended the qualifiers.
            d->UNGETCHAR(c);
            done = true;
            break;
        default:
            value->push_back(static_cast<char>(c));
            break;
        }
    }

    yylval->str = value;
    return yy::parser::token::QUOTED;
}
// Lexical analyzer called by the generated parser.
//
// Tokens produced:
//  - QUALIFIERS: pending qualifier characters left in the driver by
//    parseString() on the previous call
//  - '-' at the start of a token, '(' and ')': returned as themselves
//  - EQUALS (=), CONTAINS (:), SMALLER (<), SMALLEREQ (<=), GREATER (>),
//    GREATEREQ (>=), RANGE (..)
//  - QUOTED: a double-quoted string, see parseString()
//  - AND (the words AND or &&), OR (the words OR or ||)
//  - WORD: anything else, up to white space, a special character or ".."
//  - 0 at end of input
//
// @param yylval receives a newly allocated string for the WORD, QUOTED and
//               QUALIFIERS tokens
// @param d      parser driver, supplying the input characters
int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *,
          WasaParserDriver *d)
{
    // Qualifiers stored while reading a quoted string come out first.
    if (!d->qualifiers().empty()) {
        yylval->str = new string();
        yylval->str->swap(d->qualifiers());
        return yy::parser::token::QUALIFIERS;
    }

    int c;

    /* Skip white space. */
    // The ctype functions are only defined for unsigned char values and
    // EOF, hence the cast (GETCHAR() may yield negative values for bytes
    // above 0x7f if the input is held as plain char -- TODO confirm).
    while ((c = d->GETCHAR()) != 0 && isspace(static_cast<unsigned char>(c)))
        continue;
    if (c == 0)
        return 0;

    if (specialstartchars.find_first_of(static_cast<char>(c)) != string::npos) {
        return c;
    }

    // field-term relations, and ranges
    switch (c) {
    case '=':
        return yy::parser::token::EQUALS;
    case ':':
        return yy::parser::token::CONTAINS;
    case '<': {
        const int c1 = d->GETCHAR();
        if (c1 == '=')
            return yy::parser::token::SMALLEREQ;
        d->UNGETCHAR(c1);
        return yy::parser::token::SMALLER;
    }
    case '.': {
        const int c1 = d->GETCHAR();
        if (c1 == '.')
            return yy::parser::token::RANGE;
        // A single dot is an ordinary word character: handled below.
        d->UNGETCHAR(c1);
        break;
    }
    case '>': {
        const int c1 = d->GETCHAR();
        if (c1 == '=')
            return yy::parser::token::GREATEREQ;
        d->UNGETCHAR(c1);
        return yy::parser::token::GREATER;
    }
    case '(':
    case ')':
        return c;
    }

    if (c == '"')
        return parseString(d, yylval);

    // Other chars start a term or field name or reserved word. Push the
    // first character back so that the loop below sees the whole word.
    d->UNGETCHAR(c);
    string* word = new string();
    while ((c = d->GETCHAR()) != 0) {
        if (isspace(static_cast<unsigned char>(c))) {
            // Word broken by white space, which is consumed.
            break;
        }
        if (specialinchars.find_first_of(static_cast<char>(c)) != string::npos) {
            // Word broken by a special character, left for the next call.
            d->UNGETCHAR(c);
            break;
        }
        if (c == '.') {
            const int c1 = d->GETCHAR();
            if (c1 == '.') {
                // ".." is the range operator: give both dots back (in
                // reverse order) and end the word.
                d->UNGETCHAR(c1);
                d->UNGETCHAR(c);
                break;
            }
            d->UNGETCHAR(c1);
        }
        word->push_back(static_cast<char>(c));
    }

    // Reserved words carry no value: the string is not needed.
    if (*word == "AND" || *word == "&&") {
        delete word;
        return yy::parser::token::AND;
    }
    if (*word == "OR" || *word == "||") {
        delete word;
        return yy::parser::token::OR;
    }

    yylval->str = word;
    return yy::parser::token::WORD;
}