Parent: [595e41] (diff)

Download this file

wasaparse.ypp    506 lines (463 with data), 12.3 kB

%{
#define YYDEBUG 1
#include "autoconfig.h"

#include <stdio.h>

#include <iostream>
#include <string>

#include "searchdata.h"
#include "wasaparserdriver.h"
#include "wasaparse.hpp"

using namespace std;

//#define LOG_PARSER
#ifdef LOG_PARSER
#define LOGP(X) {cerr << X;}
#else
#define LOGP(X)
#endif

int yylex(yy::parser::semantic_type *, yy::parser::location_type *, 
          WasaParserDriver *);
void yyerror(char const *);
static void qualify(Rcl::SearchDataClauseDist *, const string &);

static void addSubQuery(WasaParserDriver *d,
                        Rcl::SearchData *sd, Rcl::SearchData *sq)
{
    if (sd && sq)
        sd->addClause(
            new Rcl::SearchDataClauseSub(std::shared_ptr<Rcl::SearchData>(sq)));
}

%}

%skeleton "lalr1.cc"
%defines
%locations
%error-verbose

%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}

%union {
    std::string *str;
    Rcl::SearchDataClauseRange *rg;
    Rcl::SearchDataClauseSimple *cl;
    Rcl::SearchData *sd;
}
%destructor {delete $$;} <str>

%type <cl> qualquote
%type <cl> fieldexpr
%type <rg> range
%type <cl> term
%type <sd> query
%type <str> complexfieldname

/* Non operator tokens need precedence because of the possibility of
   concatenation which needs to have lower prec than OR */
%left <str> WORD
%left <str> QUOTED
%left <str> QUALIFIERS
%left AND UCONCAT '(' '-'
%left OR

%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER RANGE

%%

topquery: query
{
    // It's possible that we end up with no query (e.g.: because just a
    // date filter was set, no terms). Allocate an empty query so that we
    // have something to set the global criteria on (this will yield a
    // Xapian search like <alldocuments> FILTER xxx
    if ($1 == 0)
        d->m_result = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    else
        d->m_result = $1;
}

query: 
query query %prec UCONCAT
{
    LOGP("q: query query\n");
    Rcl::SearchData *sd = 0;
    if ($1 || $2) {
        sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, sd, $1);
        addSubQuery(d, sd, $2);
    }
    $$ = sd;
}
| query AND query
{
    LOGP("q: query AND query\n");
    Rcl::SearchData *sd = 0;
    if ($1 || $3) {
        sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, sd, $1);
        addSubQuery(d, sd, $3);
    }
    $$ = sd;
}
| query OR query
{
    LOGP("query: query OR query\n");
    Rcl::SearchData *top = 0;
    if ($1 || $3) {
       top = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
       addSubQuery(d, top, $1);
       addSubQuery(d, top, $3);
    }
    $$ = top;
}
| '(' query ')' 
{
    LOGP("q: ( query )\n");
    $$ = $2;
}
|
fieldexpr %prec UCONCAT
{
    LOGP("q: fieldexpr\n");
    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    if (d->addClause(sd, $1)) {
        $$ = sd;
    } else {
        delete sd;
        $$ = 0;
    }
}
;

fieldexpr: term 
{
    LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
    $$ = $1;
}
| complexfieldname EQUALS term 
{
    LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS term 
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS range
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLER term 
{
    LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LT);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLEREQ term 
{
    LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LTE);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATER term 
{
    LOGP("fe: "  << *$1 << " > " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GT);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATEREQ term 
{
    LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GTE);
    $$ = $3;
    delete $1;
}
| '-' fieldexpr 
{
    LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
    $2->setexclude(true);
    $$ = $2;
}
;

/* Deal with field names like dc:title */
complexfieldname: 
WORD
{
    LOGP("cfn: WORD" << endl);
    $$ = $1;
}
|
complexfieldname CONTAINS WORD
{
    LOGP("cfn: complexfieldname ':' WORD" << endl);
    $$ = new string(*$1 + string(":") + *$3);
    delete $1;
    delete $3;
}

range:
WORD RANGE WORD
{
    LOGP("Range: " << *$1 << string(" .. ") << *$3 << endl);
    $$ = new Rcl::SearchDataClauseRange(*$1, *$3);
    delete $1;
    delete $3;
}
|
RANGE WORD
{
    LOGP("Range: " << "" << string(" .. ") << *$2 << endl);
    $$ = new Rcl::SearchDataClauseRange("", *$2);
    delete $2;
}
|
WORD RANGE
{
    LOGP("Range: " << *$1 << string(" .. ") << "" << endl);
    $$ = new Rcl::SearchDataClauseRange(*$1, "");
    delete $1;
}
;

term: 
WORD
{
    LOGP("term[" << *$1 << "]" << endl);
    $$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
    delete $1;
}
| qualquote 
{
    $$ = $1;
}

qualquote: 
QUOTED
{
    LOGP("QUOTED[" << *$1 << "]" << endl);
    $$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    delete $1;
}
| QUOTED QUALIFIERS 
{
    LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
    Rcl::SearchDataClauseDist *cl = 
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    qualify(cl, *$2);
    $$ = cl;
    delete $1;
    delete $2;
}


%%

#include <ctype.h>

// Look for int at index, skip and return new index found? value.
static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
{
    unsigned int ncur = cur;
    if (cur < q.size() - 1) {
        char *endptr;
        int val = strtol(&q[cur + 1], &endptr, 10);
        if (endptr != &q[cur + 1]) {
            ncur += endptr - &q[cur + 1];
            *pval = val;
        }
    }
    return ncur;
}

static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
    // cerr << "qualify(" << cl << ", " << quals << ")" << endl;
    for (unsigned int i = 0; i < quals.length(); i++) {
        //fprintf(stderr, "qual char %c\n", quals[i]);
        switch (quals[i]) {
        case 'b': 
            cl->setWeight(10.0);
            break;
        case 'c': break;
        case 'C': 
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            break;
        case 'd': break;
        case 'D':  
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            break;
        case 'e': 
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'l': 
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'L': break;
        case 'o':  
        {
            int slack = 10;
            i = qualGetInt(quals, i, &slack);
            cl->setslack(slack);
            //cerr << "set slack " << cl->getslack() << " done" << endl;
        }
        break;
        case 'p': 
            cl->setTp(Rcl::SCLT_NEAR);
            if (cl->getslack() == 0) {
                cl->setslack(10);
                //cerr << "set slack " << cl->getslack() << " done" << endl;
            }
            break;
        case 's': 
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
            break;
	case 'S':
            break;
        case '.':case '0':case '1':case '2':case '3':case '4':
        case '5':case '6':case '7':case '8':case '9':
        {
            int n = 0;
            float factor = 1.0;
            if (sscanf(&(quals[i]), "%f %n", &factor, &n)) {
                if (factor != 1.0) {
                    cl->setWeight(factor);
                }
            }
            if (n > 0)
                i += n - 1;
        }
        default:
            break;
        }
    }
}


// specialstartchars are special only at the beginning of a token
// (e.g. doctor-who is a term, not 2 terms separated by '-')
static const string specialstartchars("-");
// specialinchars are special everywhere except inside a quoted string
static const string specialinchars(":=<>()");

// Called with the first dquote already read
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
{
    string* value = new string();
    d->qualifiers().clear();
    int c;
    while ((c = d->GETCHAR())) {
        switch (c) {
        case '\\':
            /* Escape: get next char */
            c = d->GETCHAR();
            if (c == 0) {
                value->push_back(c);
                goto out;
            }
            value->push_back(c);
            break;
        case '"':
            /* End of string. Look for qualifiers */
            while ((c = d->GETCHAR()) && (isalnum(c) || c == '.'))
                d->qualifiers().push_back(c);
            d->UNGETCHAR(c);
            goto out;
        default:
            value->push_back(c);
        }
    }
out:
    //cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
    yylval->str = value;
    return yy::parser::token::QUOTED;
}


int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *, 
		  WasaParserDriver *d)
{
    if (!d->qualifiers().empty()) {
        yylval->str = new string();
        yylval->str->swap(d->qualifiers());
        return yy::parser::token::QUALIFIERS;
    }

    int c;

    /* Skip white space.  */
    while ((c = d->GETCHAR()) && isspace(c))
        continue;

    if (c == 0)
        return 0;

    if (specialstartchars.find_first_of(c) != string::npos) {
        //cerr << "yylex: return " << c << endl;
        return c;
    }

    // field-term relations, and ranges
    switch (c) {
    case '=': return yy::parser::token::EQUALS;
    case ':': return yy::parser::token::CONTAINS;
    case '<': {
        int c1 = d->GETCHAR();
        if (c1 == '=') {
            return yy::parser::token::SMALLEREQ;
        } else {
            d->UNGETCHAR(c1);
            return yy::parser::token::SMALLER;
        }
    }
    case '.': {
        int c1 = d->GETCHAR();
        if (c1 == '.') {
            return yy::parser::token::RANGE;
        } else {
            d->UNGETCHAR(c1);
            break;
        }
    }
    case '>': {
        int c1 = d->GETCHAR();
        if (c1 == '=') {
            return yy::parser::token::GREATEREQ;
        } else {
            d->UNGETCHAR(c1);
            return yy::parser::token::GREATER;
        }
    }
    case '(': case ')':
        return c;
    }
        
    if (c == '"')
        return parseString(d, yylval);

    d->UNGETCHAR(c);

    // Other chars start a term or field name or reserved word
    string* word = new string();
    while ((c = d->GETCHAR())) {
        if (isspace(c)) {
            //cerr << "Word broken by whitespace" << endl;
            break;
        } else if (specialinchars.find_first_of(c) != string::npos) {
            //cerr << "Word broken by special char" << endl;
            d->UNGETCHAR(c);
            break;
        } else if (c == '.') {
            int c1 = d->GETCHAR();
            if (c1 == '.') {
                d->UNGETCHAR(c1);
                d->UNGETCHAR(c);
                break;
            } else {
                d->UNGETCHAR(c1);
                word->push_back(c);
            }
        } else if (c == 0) {
            //cerr << "Word broken by EOF" << endl;
            break;
        } else {
            word->push_back(c);
        }
    }
    
    if (!word->compare("AND") || !word->compare("&&")) {
        delete word;
        return yy::parser::token::AND;
    } else if (!word->compare("OR") || !word->compare("||")) {
        delete word;
        return yy::parser::token::OR;
    }

//    cerr << "Got word [" << word << "]" << endl;
    yylval->str = word;
    return yy::parser::token::WORD;
}