Switch to unified view

a b/src/query/wasaparse.ypp
1
%{
2
#define YYDEBUG 1
3
4
#include <stdio.h>
5
6
#include <iostream>
7
#include <string>
8
9
#include "searchdata.h"
10
#include "wasaparserdriver.h"
11
#include "wasaparse.h"
12
13
using namespace std;
14
15
// Parser tracing. Uncomment the next line to send the trace to cerr.
// #define LOG_PARSER
// LOGP(X) streams X to cerr when LOG_PARSER is defined and does nothing
// otherwise. Both forms are wrapped in do { } while (0) so that
// "LOGP(...);" is always exactly one statement, including when it is the
// body of an unbraced if/else.
#ifdef LOG_PARSER
#define LOGP(X) do { cerr << X; } while (0)
#else
#define LOGP(X) do { } while (0)
#endif
21
22
int yylex(yy::parser::semantic_type *, yy::parser::location_type *, 
23
          WasaParserDriver *);
24
void yyerror(char const *);
25
static void qualify(Rcl::SearchDataClauseDist *, const string &);
26
27
static void addSubQuery(WasaParserDriver *d,
28
                        Rcl::SearchData *sd, Rcl::SearchData *sq)
29
{
30
    sd->addClause(new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sq)));
31
}
32
33
%}
34
35
/* Generate a C++ parser class, a header file, location tracking and
   detailed syntax error messages */
%skeleton "lalr1.cc"
%defines
%locations
%error-verbose

/* The driver object is handed to both the parser and the lexer */
%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}

/* Semantic values are raw pointers */
%union {
    std::string *str;
    Rcl::SearchDataClauseSimple *cl;
    Rcl::SearchData *sd;
}
/* Only string values get a destructor for symbols the parser discards */
%destructor {delete $$;} <str>

%type <cl> qualquote
%type <cl> fieldexpr
%type <cl> term
%type <sd> query
%type <str> complexfieldname

/* Non operator tokens need precedence because of the possibility of
   concatenation, which needs to have lower precedence than OR.
   Precedence increases from the first %left line to the last. */
%left <str> WORD
%left <str> QUOTED
%left <str> QUALIFIERS
%left AND UCONCAT '(' '-'
%left OR

/* Field/value relation operators */
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER
65
66
%%
67
68
/* Start symbol: store the finished query tree in the driver */
topquery:
  query
{
    d->m_result = $1;
    LOGP("END PARSING\n");
}
;
73
74
/* A query is built from sub-queries. Every alternative yields a freshly
   allocated Rcl::SearchData, except the parenthesized form which passes
   its inner query through unchanged. */
query:
  query query %prec UCONCAT
{
    // Two adjacent queries with no operator between them: AND them
    LOGP("q: query query\n");
    Rcl::SearchData *conj = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    addSubQuery(d, conj, $1);
    addSubQuery(d, conj, $2);
    $$ = conj;
}
| query AND query
{
    LOGP("q: query AND query\n");
    Rcl::SearchData *conj = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    addSubQuery(d, conj, $1);
    addSubQuery(d, conj, $3);
    $$ = conj;
}
| query OR query
{
    // The OR node is itself wrapped as the single sub-query of an AND node
    LOGP("q: query OR query\n");
    Rcl::SearchData *wrapper = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    Rcl::SearchData *disj = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
    addSubQuery(d, disj, $1);
    addSubQuery(d, disj, $3);
    addSubQuery(d, wrapper, disj);
    $$ = wrapper;
}
| '(' query ')'
{
    LOGP("q: ( query )\n");
    $$ = $2;
}
| fieldexpr %prec UCONCAT
{
    // A lone field expression becomes a one-clause AND query
    LOGP("q: fieldexpr\n");
    Rcl::SearchData *single = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    d->addClause(single, $1);
    $$ = single;
}
;
115
116
/* A field expression is a term, optionally restricted to a field through
   a relation operator, and optionally negated with a leading '-'.
   In the relation forms the field name string is copied into the clause
   and then deleted; the clause from the term is passed up as the value. */
fieldexpr:
  term
{
    LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
    $$ = $1;
}
| complexfieldname EQUALS term
{
    LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS term
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLER term
{
    // LOGP already supplies the cerr stream: pass only the message, like
    // the sibling rules do.
    LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LT);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLEREQ term
{
    LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LTE);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATER term
{
    LOGP("fe: "  << *$1 << " > " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GT);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATEREQ term
{
    LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GTE);
    $$ = $3;
    delete $1;
}
| '-' fieldexpr
{
    // Leading minus: mark the clause as an exclusion
    LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
    $2->setexclude(true);
    $$ = $2;
}
;
176
177
/* Field names may themselves contain colons (e.g. dc:title): glue the
   colon-separated words back together into a single string. */
complexfieldname:
  WORD
{
    LOGP("cfn: WORD" << endl);
    $$ = $1;
}
| complexfieldname CONTAINS WORD
{
    LOGP("cfn: complexfieldname ':' WORD" << endl);
    // Build "<prefix>:<word>" in a new string, then free both inputs
    string *joined = new string(*$1);
    joined->append(":");
    joined->append(*$3);
    delete $1;
    delete $3;
    $$ = joined;
}
;
192
193
/* A term is either a bare word, turned into a simple AND clause, or a
   quoted phrase handled by qualquote. */
term:
  WORD
{
    LOGP("term[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseSimple *simple =
        new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
    // The clause was built from the word's text: free the token string
    delete $1;
    $$ = simple;
}
| qualquote
{
    $$ = $1;
}
;
204
205
/* A quoted string becomes a phrase clause built with a third constructor
   argument of 0. When a qualifier string follows, it is applied to the
   clause through qualify(). The token strings are freed once used. */
qualquote:
  QUOTED
{
    LOGP("QUOTED[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    delete $1;
    $$ = phrase;
}
| QUOTED QUALIFIERS
{
    LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    qualify(phrase, *$2);
    delete $1;
    delete $2;
    $$ = phrase;
}
;
222
223
224
%%
225
226
#include <ctype.h>
#include <stdlib.h>
227
228
// Parse the optional decimal integer that follows the qualifier character
// at index cur in q.
//
// @param q    qualifier string
// @param cur  index of the qualifier character; the number, if any, starts
//             at cur + 1
// @param pval receives the parsed value; left untouched when no number
//             follows
// @return index of the last character consumed: cur itself when there is
//         no number, else the index of the number's last digit. The caller
//         can resume scanning at the returned index + 1.
static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
{
    // Nothing can follow cur if it is the last character or out of range.
    // Deliberately not written as "cur < q.size() - 1": for an empty
    // string that bound wraps around to a huge unsigned value.
    if (cur >= q.size() || q.size() - cur < 2)
        return cur;

    const char *digits = q.c_str() + cur + 1;
    char *endptr = 0;
    long val = strtol(digits, &endptr, 10);
    if (endptr == digits) {
        // No conversion happened: no number here
        return cur;
    }

    *pval = static_cast<int>(val);
    return cur + static_cast<unsigned int>(endptr - digits);
}
242
243
// Apply the qualifiers that followed a quoted phrase to its clause.
//
// quals is scanned left to right. Each letter selects an action on cl;
// 'o' may be followed by an integer slack value, and a run starting with a
// digit or '.' is read as a floating point weight. Characters with no
// assigned meaning are ignored.
//
// @param cl    clause to modify
// @param quals qualifier string
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
    for (unsigned int i = 0; i < quals.length(); i++) {
        switch (quals[i]) {
        case 'b':
            cl->setWeight(10.0);
            break;
        case 'c': break;
        case 'C':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            break;
        case 'd': break;
        case 'D':
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            break;
        case 'e':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'l':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'L': break;
        case 'o':
        {
            // Slack defaults to 10 when no number follows the 'o'.
            // qualGetInt returns the index of the last digit consumed, so
            // the loop increment then steps past the number.
            int slack = 10;
            i = qualGetInt(quals, i, &slack);
            cl->setslack(slack);
        }
        break;
        case 'p':
            cl->setTp(Rcl::SCLT_NEAR);
            // Only set a slack of 10 if the current slack is still 0
            if (cl->getslack() == 0) {
                cl->setslack(10);
            }
            break;
        case '.':case '0':case '1':case '2':case '3':case '4':
        case '5':case '6':case '7':case '8':case '9':
        {
            // Weight factor. strtod stops at the first character that
            // cannot continue a valid number, so a letter right after the
            // digits (for instance the 'e' modifier in "2e") is left in
            // place for the next loop iteration instead of being taken as
            // an incomplete exponent.
            const char *start = quals.c_str() + i;
            char *end = 0;
            float factor = static_cast<float>(strtod(start, &end));
            if (end != start) {
                if (factor != 1.0) {
                    cl->setWeight(factor);
                }
                // Leave i on the last character of the number
                i += static_cast<unsigned int>(end - start) - 1;
            }
        }
        break;
        default:
            break;
        }
    }
}
302
303
304
// Characters which are special only when they start a token
// (e.g. doctor-who is a single term, not 2 terms separated by '-')
static const string specialstartchars("-");
// Characters which are special anywhere except inside a quoted string:
// they end the word being read
static const string specialinchars(":=<>()");
309
310
// Called with the first dquote already read
311
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
312
{
313
    string* value = new string();
314
    d->qualifiers().clear();
315
    int c;
316
    while ((c = d->GETCHAR())) {
317
        switch (c) {
318
        case '\\':
319
            /* Escape: get next char */
320
            c = d->GETCHAR();
321
            if (c == 0) {
322
                value->push_back(c);
323
                goto out;
324
            }
325
            value->push_back(c);
326
            break;
327
        case '"':
328
            /* End of string. Look for qualifiers */
329
            while ((c = d->GETCHAR()) && (isalnum(c) || c == '.'))
330
                d->qualifiers().push_back(c);
331
            d->UNGETCHAR(c);
332
            goto out;
333
        default:
334
            value->push_back(c);
335
        }
336
    }
337
out:
338
    //cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
339
    yylval->str = value;
340
    return yy::parser::token::QUOTED;
341
}
342
343
344
int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *, 
345
        WasaParserDriver *d)
346
{
347
    if (!d->qualifiers().empty()) {
348
        yylval->str = new string();
349
        yylval->str->swap(d->qualifiers());
350
        return yy::parser::token::QUALIFIERS;
351
    }
352
353
    int c;
354
355
    /* Skip white space.  */
356
    while ((c = d->GETCHAR()) && isspace(c))
357
        continue;
358
359
    if (c == 0)
360
        return 0;
361
362
    if (specialstartchars.find_first_of(c) != string::npos) {
363
        //cerr << "yylex: return " << c << endl;
364
        return c;
365
    }
366
367
    // field-term relations
368
    switch (c) {
369
    case '=': return yy::parser::token::EQUALS;
370
    case ':': return yy::parser::token::CONTAINS;
371
    case '<': {
372
        int c1 = d->GETCHAR();
373
        if (c1 == '=') {
374
            return yy::parser::token::SMALLEREQ;
375
        } else {
376
            d->UNGETCHAR(c1);
377
            return yy::parser::token::SMALLER;
378
        }
379
    }
380
    case '>': {
381
        int c1 = d->GETCHAR();
382
        if (c1 == '=') {
383
            return yy::parser::token::GREATEREQ;
384
        } else {
385
            d->UNGETCHAR(c1);
386
            return yy::parser::token::GREATER;
387
        }
388
    }
389
    case '(': case ')':
390
        return c;
391
    }
392
        
393
    if (c == '"')
394
        return parseString(d, yylval);
395
396
    d->UNGETCHAR(c);
397
398
    // Other chars start a term or field name or reserved word
399
    string* word = new string();
400
    while ((c = d->GETCHAR())) {
401
        if (isspace(c)) {
402
            //cerr << "Word broken by whitespace" << endl;
403
            break;
404
        } else if (specialinchars.find_first_of(c) != string::npos) {
405
            //cerr << "Word broken by special char" << endl;
406
            d->UNGETCHAR(c);
407
            break;
408
        } else if (c == 0) {
409
            //cerr << "Word broken by EOF" << endl;
410
            break;
411
        } else {
412
            word->push_back(c);
413
        }
414
    }
415
    
416
    if (!word->compare("AND") || !word->compare("&&")) {
417
        delete word;
418
        return yy::parser::token::AND;
419
    } else if (!word->compare("OR") || !word->compare("||")) {
420
        delete word;
421
        return yy::parser::token::OR;
422
    }
423
424
//    cerr << "Got word [" << word << "]" << endl;
425
    yylval->str = word;
426
    return yy::parser::token::WORD;
427
}