|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
57 |
// handled with a set holding all the separator values.
|
57 |
// handled with a set holding all the separator values.
|
58 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
|
58 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
|
59 |
static int charclasses[256];
|
59 |
static int charclasses[256];
|
60 |
|
60 |
|
61 |
static set<unsigned int> unicign;
|
61 |
static set<unsigned int> unicign;
|
|
|
62 |
static set<unsigned int> visiblewhite;
|
62 |
static void setcharclasses()
|
63 |
static void setcharclasses()
|
63 |
{
|
64 |
{
|
64 |
static int init = 0;
|
65 |
static int init = 0;
|
65 |
if (init)
|
66 |
if (init)
|
66 |
return;
|
67 |
return;
|
|
... |
|
... |
89 |
|
90 |
|
90 |
char special[] = ".@+-,#'\n\r";
|
91 |
char special[] = ".@+-,#'\n\r";
|
91 |
for (i = 0; i < strlen(special); i++)
|
92 |
for (i = 0; i < strlen(special); i++)
|
92 |
charclasses[int(special[i])] = special[i];
|
93 |
charclasses[int(special[i])] = special[i];
|
93 |
|
94 |
|
94 |
for (i = 0; i < sizeof(uniign); i++)
|
95 |
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
95 |
unicign.insert(uniign[i]);
|
96 |
unicign.insert(uniign[i]);
|
|
|
97 |
}
|
96 |
unicign.insert((unsigned int)-1);
|
98 |
unicign.insert((unsigned int)-1);
|
|
|
99 |
|
|
|
100 |
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
|
|
101 |
visiblewhite.insert(avsbwht[i]);
|
|
|
102 |
}
|
97 |
|
103 |
|
98 |
init = 1;
|
104 |
init = 1;
|
99 |
}
|
105 |
}
|
100 |
|
106 |
|
101 |
static inline int whatcc(unsigned int c)
|
107 |
static inline int whatcc(unsigned int c)
|
|
... |
|
... |
531 |
TextSplit splitter(&cb, flgs);
|
537 |
TextSplit splitter(&cb, flgs);
|
532 |
splitter.text_to_words(s);
|
538 |
splitter.text_to_words(s);
|
533 |
return cb.wcnt;
|
539 |
return cb.wcnt;
|
534 |
}
|
540 |
}
|
535 |
|
541 |
|
|
|
542 |
bool TextSplit::hasVisibleWhite(const string &in)
|
|
|
543 |
{
|
|
|
544 |
setcharclasses();
|
|
|
545 |
Utf8Iter it(in);
|
|
|
546 |
for (; !it.eof(); it++) {
|
|
|
547 |
unsigned int c = *it;
|
|
|
548 |
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
|
|
549 |
if (c == (unsigned int)-1) {
|
|
|
550 |
LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
|
|
|
551 |
return false;
|
|
|
552 |
}
|
|
|
553 |
if (visiblewhite.find(c) != visiblewhite.end())
|
|
|
554 |
return true;
|
|
|
555 |
}
|
|
|
556 |
return false;
|
|
|
557 |
}
|
|
|
558 |
|
|
|
559 |
template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
|
|
560 |
{
|
|
|
561 |
setcharclasses();
|
|
|
562 |
Utf8Iter it(s);
|
|
|
563 |
|
|
|
564 |
string current;
|
|
|
565 |
tokens.clear();
|
|
|
566 |
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
|
|
567 |
states state = SPACE;
|
|
|
568 |
for (; !it.eof(); it++) {
|
|
|
569 |
unsigned int c = *it;
|
|
|
570 |
if (visiblewhite.find(c) != visiblewhite.end())
|
|
|
571 |
c = ' ';
|
|
|
572 |
LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
|
|
|
573 |
if (c == (unsigned int)-1) {
|
|
|
574 |
LOGERR(("TextSplit::stringToStrings: error while "
|
|
|
575 |
"scanning UTF-8 string\n"));
|
|
|
576 |
return false;
|
|
|
577 |
}
|
|
|
578 |
|
|
|
579 |
switch (c) {
|
|
|
580 |
case '"':
|
|
|
581 |
switch(state) {
|
|
|
582 |
case SPACE: state = INQUOTE; continue;
|
|
|
583 |
case TOKEN: goto push_char;
|
|
|
584 |
case ESCAPE: state = INQUOTE; goto push_char;
|
|
|
585 |
case INQUOTE: tokens.push_back(current);current.clear();
|
|
|
586 |
state = SPACE; continue;
|
|
|
587 |
}
|
|
|
588 |
break;
|
|
|
589 |
case '\\':
|
|
|
590 |
switch(state) {
|
|
|
591 |
case SPACE:
|
|
|
592 |
case TOKEN: state=TOKEN; goto push_char;
|
|
|
593 |
case INQUOTE: state = ESCAPE; continue;
|
|
|
594 |
case ESCAPE: state = INQUOTE; goto push_char;
|
|
|
595 |
}
|
|
|
596 |
break;
|
|
|
597 |
|
|
|
598 |
case ' ':
|
|
|
599 |
case '\t':
|
|
|
600 |
case '\n':
|
|
|
601 |
case '\r':
|
|
|
602 |
switch(state) {
|
|
|
603 |
case SPACE: continue;
|
|
|
604 |
case TOKEN: tokens.push_back(current); current.clear();
|
|
|
605 |
state = SPACE; continue;
|
|
|
606 |
case INQUOTE:
|
|
|
607 |
case ESCAPE: goto push_char;
|
|
|
608 |
}
|
|
|
609 |
break;
|
|
|
610 |
|
|
|
611 |
default:
|
|
|
612 |
switch(state) {
|
|
|
613 |
case ESCAPE: state = INQUOTE; break;
|
|
|
614 |
case SPACE: state = TOKEN; break;
|
|
|
615 |
case TOKEN:
|
|
|
616 |
case INQUOTE: break;
|
|
|
617 |
}
|
|
|
618 |
push_char:
|
|
|
619 |
it.appendchartostring(current);
|
|
|
620 |
}
|
|
|
621 |
}
|
|
|
622 |
|
|
|
623 |
// End of string. Process residue, and possible error (unfinished quote)
|
|
|
624 |
switch(state) {
|
|
|
625 |
case SPACE: break;
|
|
|
626 |
case TOKEN: tokens.push_back(current); break;
|
|
|
627 |
case INQUOTE:
|
|
|
628 |
case ESCAPE: return false;
|
|
|
629 |
}
|
|
|
630 |
return true;
|
|
|
631 |
}
|
|
|
632 |
|
|
|
633 |
bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
|
|
634 |
{
|
|
|
635 |
return u8stringToStrings<list<string> >(s, tokens);
|
|
|
636 |
}
|
|
|
637 |
|
536 |
#else // TEST driver ->
|
638 |
#else // TEST driver ->
|
537 |
|
639 |
|
538 |
#include <unistd.h>
|
640 |
#include <unistd.h>
|
539 |
#include <errno.h>
|
641 |
#include <errno.h>
|
540 |
#include <fcntl.h>
|
642 |
#include <fcntl.h>
|