|
a |
|
b/src/query/wasaparse.ypp |
|
|
1 |
%{
|
|
|
2 |
#define YYDEBUG 1
|
|
|
3 |
|
|
|
4 |
#include <stdio.h>
|
|
|
5 |
|
|
|
6 |
#include <iostream>
|
|
|
7 |
#include <string>
|
|
|
8 |
|
|
|
9 |
#include "searchdata.h"
|
|
|
10 |
#include "wasaparserdriver.h"
|
|
|
11 |
#include "wasaparse.h"
|
|
|
12 |
|
|
|
13 |
using namespace std;
|
|
|
14 |
|
|
|
15 |
// #define LOG_PARSER
|
|
|
16 |
#ifdef LOG_PARSER
|
|
|
17 |
#define LOGP(X) {cerr << X;}
|
|
|
18 |
#else
|
|
|
19 |
#define LOGP(X)
|
|
|
20 |
#endif
|
|
|
21 |
|
|
|
22 |
int yylex(yy::parser::semantic_type *, yy::parser::location_type *,
|
|
|
23 |
WasaParserDriver *);
|
|
|
24 |
void yyerror(char const *);
|
|
|
25 |
static void qualify(Rcl::SearchDataClauseDist *, const string &);
|
|
|
26 |
|
|
|
27 |
/*
 * Embed the query sq into the query sd as one sub-clause.
 *
 * d  : parser driver; not referenced by the body.
 * sd : query that receives the new clause through addClause().
 * sq : query to embed. It is first handed to a RefCntr handle and that
 *      handle is what the new SearchDataClauseSub is built from
 *      (presumably transferring ownership of sq -- confirm against RefCntr).
 */
static void addSubQuery(WasaParserDriver *d,
                        Rcl::SearchData *sd, Rcl::SearchData *sq)
{
    RefCntr<Rcl::SearchData> handle(sq);
    Rcl::SearchDataClauseSub *sub = new Rcl::SearchDataClauseSub(handle);
    sd->addClause(sub);
}
|
|
|
32 |
|
|
|
33 |
%}
|
|
|
34 |
|
|
|
35 |
%skeleton "lalr1.cc"
|
|
|
36 |
%defines
|
|
|
37 |
%locations
|
|
|
38 |
%error-verbose
|
|
|
39 |
|
|
|
40 |
%parse-param {WasaParserDriver* d}
|
|
|
41 |
%lex-param {WasaParserDriver* d}
|
|
|
42 |
|
|
|
43 |
%union {
|
|
|
44 |
std::string *str;
|
|
|
45 |
Rcl::SearchDataClauseSimple *cl;
|
|
|
46 |
Rcl::SearchData *sd;
|
|
|
47 |
}
|
|
|
48 |
%destructor {delete $$;} <str>
|
|
|
49 |
|
|
|
50 |
%type <cl> qualquote
|
|
|
51 |
%type <cl> fieldexpr
|
|
|
52 |
%type <cl> term
|
|
|
53 |
%type <sd> query
|
|
|
54 |
%type <str> complexfieldname
|
|
|
55 |
|
|
|
56 |
/* Non operator tokens need precedence because of the possibility of
|
|
|
57 |
concatenation which needs to have lower prec than OR */
|
|
|
58 |
%left <str> WORD
|
|
|
59 |
%left <str> QUOTED
|
|
|
60 |
%left <str> QUALIFIERS
|
|
|
61 |
%left AND UCONCAT '(' '-'
|
|
|
62 |
%left OR
|
|
|
63 |
|
|
|
64 |
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER
|
|
|
65 |
|
|
|
66 |
%%
|
|
|
67 |
|
|
|
68 |
/* Start symbol: once the whole input has been reduced to a single query,
   store the resulting tree in the driver. */
topquery:
query
{
    LOGP("END PARSING\n");
    d->m_result = $1;
}
;
|
|
|
73 |
|
|
|
74 |
/* A query is either a single field expression or a combination of queries:
   juxtaposition (treated like AND), explicit AND, explicit OR, or a
   parenthesised query. Every alternative yields a freshly allocated
   Rcl::SearchData, except the parenthesised form which passes its operand
   through. */
query:
query query %prec UCONCAT
{
    LOGP("q: query query\n");
    /* Juxtaposition: both operands become sub-clauses of a new AND query */
    Rcl::SearchData *both = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    addSubQuery(d, both, $1);
    addSubQuery(d, both, $2);
    $$ = both;
}
| query AND query
{
    LOGP("q: query AND query\n");
    Rcl::SearchData *both = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    addSubQuery(d, both, $1);
    addSubQuery(d, both, $3);
    $$ = both;
}
| query OR query
{
    LOGP("q: query OR query\n");
    /* The OR node is not returned directly: it is wrapped as the only
       sub-clause of an enclosing AND query. */
    Rcl::SearchData *wrapper = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    Rcl::SearchData *either = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
    addSubQuery(d, either, $1);
    addSubQuery(d, either, $3);
    addSubQuery(d, wrapper, either);
    $$ = wrapper;
}
| '(' query ')'
{
    LOGP("q: ( query )\n");
    $$ = $2;
}
| fieldexpr %prec UCONCAT
{
    LOGP("q: fieldexpr\n");
    /* A lone field expression becomes a one-clause AND query */
    Rcl::SearchData *single = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    d->addClause(single, $1);
    $$ = single;
}
;
|
|
|
115 |
|
|
|
116 |
/* A field expression is a bare term, a term qualified by a field name and a
   relation (= : < <= > >=), or a negated field expression.
   In the qualified forms the clause built for the term ($3) is updated in
   place with the field name and the relation, and the field name string
   ($1) is deleted here once it has been copied into the clause. */
fieldexpr:
term
{
    LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
    $$ = $1;
}
| complexfieldname EQUALS term
{
    LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS term
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLER term
{
    /* LOGP() already supplies the "cerr <<" prefix, so the argument must
       start with the first item to print, like the other alternatives. */
    LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LT);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLEREQ term
{
    LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LTE);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATER term
{
    LOGP("fe: " << *$1 << " > " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GT);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATEREQ term
{
    LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GTE);
    $$ = $3;
    delete $1;
}
| '-' fieldexpr
{
    LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
    /* Leading '-': mark the clause as an exclusion */
    $2->setexclude(true);
    $$ = $2;
}
;
|
|
|
176 |
|
|
|
177 |
/* Field names may themselves contain ':' (e.g. dc:title). They are
   assembled left to right into one heap-allocated string; the component
   strings are deleted as soon as they have been copied. */
complexfieldname:
WORD
{
    LOGP("cfn: WORD" << endl);
    $$ = $1;
}
| complexfieldname CONTAINS WORD
{
    LOGP("cfn: complexfieldname ':' WORD" << endl);
    string *joined = new string(*$1);
    joined->append(":");
    joined->append(*$3);
    delete $1;
    delete $3;
    $$ = joined;
}
;
|
|
|
192 |
|
|
|
193 |
/* A term is a single word, turned into an SCLT_AND simple clause, or a
   quoted phrase (optionally qualified) which is passed through unchanged.
   The WORD string is deleted once the clause has been built from it. */
term:
WORD
{
    LOGP("term[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseSimple *wordClause =
        new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
    delete $1;
    $$ = wordClause;
}
| qualquote
{
    $$ = $1;
}
;
|
|
|
204 |
|
|
|
205 |
/* A quoted string becomes an SCLT_PHRASE distance clause built with a third
   constructor argument of 0. When qualifier letters follow the closing
   quote they are applied to the clause by qualify(). The token strings are
   deleted at the end of each action. */
qualquote:
QUOTED
{
    LOGP("QUOTED[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    $$ = phrase;
    delete $1;
}
| QUOTED QUALIFIERS
{
    LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    qualify(phrase, *$2);
    $$ = phrase;
    delete $1;
    delete $2;
}
;
|
|
|
222 |
|
|
|
223 |
|
|
|
224 |
%%
|
|
|
225 |
|
|
|
226 |
#include <ctype.h>
|
|
|
227 |
|
|
|
228 |
// Parse an optional decimal integer located right after index cur in q
// (cur is the position of the qualifier letter that the number follows).
//
// If at least one character follows cur and strtol() converts something
// starting at q[cur + 1], the value is stored in *pval and the index of the
// last character consumed is returned. Otherwise *pval is left untouched
// and cur is returned unchanged.
static unsigned int qualGetInt(const std::string& q, unsigned int cur, int *pval)
{
    // Nothing can follow cur in an empty string or when cur is the last
    // index. The emptiness test comes first because q.size() - 1 is
    // unsigned and would wrap around for an empty string.
    if (q.empty() || cur >= q.size() - 1)
        return cur;

    // c_str() is NUL-terminated, so strtol() always stops inside the buffer.
    const char *start = q.c_str() + cur + 1;
    char *endptr = 0;
    const long val = strtol(start, &endptr, 10);
    if (endptr == start)
        return cur;             // no digits: keep the caller's default

    *pval = static_cast<int>(val);
    return cur + static_cast<unsigned int>(endptr - start);
}
|
|
|
242 |
|
|
|
243 |
// Apply the qualifier characters found after a quoted phrase to its clause.
//
// The string is scanned left to right, one qualifier per step:
//   b        weight 10.0
//   C        case sensitive
//   D        diacritics sensitive
//   e        case sensitive + diacritics sensitive + no stemming
//   l        no stemming
//   o[N]     slack N (10 when no number follows)
//   p        switch the clause to SCLT_NEAR; slack becomes 10 if it was 0
//   number   (starts with a digit or '.') weight factor, applied unless it
//            equals 1.0
// Every other character, including c, d and L, changes nothing.
static void qualify(Rcl::SearchDataClauseDist *cl, const std::string& quals)
{
    for (unsigned int pos = 0; pos < quals.length(); pos++) {
        const char q = quals[pos];

        if (q == 'b') {
            cl->setWeight(10.0);
        } else if (q == 'C') {
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
        } else if (q == 'D') {
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
        } else if (q == 'e') {
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
        } else if (q == 'l') {
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
        } else if (q == 'o') {
            // Optional number right after the letter; pos moves onto its
            // last digit so that the loop increment steps past it.
            int slack = 10;
            pos = qualGetInt(quals, pos, &slack);
            cl->setslack(slack);
        } else if (q == 'p') {
            cl->setTp(Rcl::SCLT_NEAR);
            if (cl->getslack() == 0) {
                cl->setslack(10);
            }
        } else if (q == '.' || (q >= '0' && q <= '9')) {
            // Weight factor. %n reports how many characters were read so
            // that the rest of the number is not rescanned as qualifiers.
            int consumed = 0;
            float factor = 1.0;
            if (sscanf(quals.c_str() + pos, "%f %n", &factor, &consumed) &&
                factor != 1.0) {
                cl->setWeight(factor);
            }
            if (consumed > 0)
                pos += consumed - 1;
        }
    }
}
|
|
|
302 |
|
|
|
303 |
|
|
|
304 |
// Characters that are special only as the first character of a token:
// further inside a word they are ordinary (doctor-who is one term, not two
// terms separated by '-').
static const std::string specialstartchars = "-";
// Characters that are special wherever they appear, except inside a quoted
// string.
static const std::string specialinchars = ":=<>()";
|
|
|
309 |
|
|
|
310 |
// Called with the first dquote already read
|
|
|
311 |
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
|
|
|
312 |
{
|
|
|
313 |
string* value = new string();
|
|
|
314 |
d->qualifiers().clear();
|
|
|
315 |
int c;
|
|
|
316 |
while ((c = d->GETCHAR())) {
|
|
|
317 |
switch (c) {
|
|
|
318 |
case '\\':
|
|
|
319 |
/* Escape: get next char */
|
|
|
320 |
c = d->GETCHAR();
|
|
|
321 |
if (c == 0) {
|
|
|
322 |
value->push_back(c);
|
|
|
323 |
goto out;
|
|
|
324 |
}
|
|
|
325 |
value->push_back(c);
|
|
|
326 |
break;
|
|
|
327 |
case '"':
|
|
|
328 |
/* End of string. Look for qualifiers */
|
|
|
329 |
while ((c = d->GETCHAR()) && (isalnum(c) || c == '.'))
|
|
|
330 |
d->qualifiers().push_back(c);
|
|
|
331 |
d->UNGETCHAR(c);
|
|
|
332 |
goto out;
|
|
|
333 |
default:
|
|
|
334 |
value->push_back(c);
|
|
|
335 |
}
|
|
|
336 |
}
|
|
|
337 |
out:
|
|
|
338 |
//cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
|
|
|
339 |
yylval->str = value;
|
|
|
340 |
return yy::parser::token::QUOTED;
|
|
|
341 |
}
|
|
|
342 |
|
|
|
343 |
|
|
|
344 |
// Lexer called by the generated parser.
//
// Returns, in order of precedence:
//  - QUALIFIERS, when parseString() left qualifier characters pending in the
//    driver (they are moved into a new string stored in yylval->str);
//  - 0 at end of input;
//  - the character itself for '-', '(' and ')';
//  - EQUALS, CONTAINS, SMALLER, SMALLEREQ, GREATER, GREATEREQ for
//    = : < <= > >= ;
//  - QUOTED (through parseString()) when a double quote is met;
//  - AND / OR for the words AND, &&, OR, || ;
//  - WORD otherwise, with the text in a string allocated with new and stored
//    in yylval->str.
// The location argument is not used.
int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *,
          WasaParserDriver *d)
{
    // Qualifiers collected after the previous quoted string go out first.
    if (!d->qualifiers().empty()) {
        yylval->str = new std::string();
        yylval->str->swap(d->qualifiers());
        return yy::parser::token::QUALIFIERS;
    }

    int c;

    /* Skip white space. The casts to unsigned char keep the <ctype.h> calls
       defined if GETCHAR() yields a negative value for a byte >= 0x80. */
    while ((c = d->GETCHAR()) != 0 && isspace(static_cast<unsigned char>(c)))
        continue;

    if (c == 0)
        return 0;

    if (specialstartchars.find(static_cast<char>(c)) != std::string::npos)
        return c;

    // field-term relations and parentheses
    switch (c) {
    case '=': return yy::parser::token::EQUALS;
    case ':': return yy::parser::token::CONTAINS;
    case '<': {
        // One character of lookahead to tell <= from <
        const int next = d->GETCHAR();
        if (next == '=')
            return yy::parser::token::SMALLEREQ;
        d->UNGETCHAR(next);
        return yy::parser::token::SMALLER;
    }
    case '>': {
        const int next = d->GETCHAR();
        if (next == '=')
            return yy::parser::token::GREATEREQ;
        d->UNGETCHAR(next);
        return yy::parser::token::GREATER;
    }
    case '(': case ')':
        return c;
    }

    if (c == '"')
        return parseString(d, yylval);

    d->UNGETCHAR(c);

    // Other chars start a term or field name or reserved word. The text is
    // gathered in a local string so that nothing is allocated for the
    // reserved words.
    std::string word;
    while ((c = d->GETCHAR()) != 0) {
        if (isspace(static_cast<unsigned char>(c))) {
            // The separating white space is consumed, not pushed back.
            break;
        }
        if (specialinchars.find(static_cast<char>(c)) != std::string::npos) {
            // The special char belongs to the next token.
            d->UNGETCHAR(c);
            break;
        }
        word.push_back(c);
    }

    if (word == "AND" || word == "&&")
        return yy::parser::token::AND;
    if (word == "OR" || word == "||")
        return yy::parser::token::OR;

    // Ownership of the string passes to the parser through yylval.
    yylval->str = new std::string();
    yylval->str->swap(word);
    return yy::parser::token::WORD;
}
|