Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
57
// handled with a set holding all the separator values.
57
// handled with a set holding all the separator values.
58
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
58
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
59
static int charclasses[256];
59
static int charclasses[256];
60
60
61
static set<unsigned int> unicign;
61
static set<unsigned int> unicign;
62
static set<unsigned int> visiblewhite;
62
static void setcharclasses()
63
static void setcharclasses()
63
{
64
{
64
    static int init = 0;
65
    static int init = 0;
65
    if (init)
66
    if (init)
66
    return;
67
    return;
...
...
89
90
90
    char special[] = ".@+-,#'\n\r";
91
    char special[] = ".@+-,#'\n\r";
91
    for (i = 0; i  < strlen(special); i++)
92
    for (i = 0; i  < strlen(special); i++)
92
    charclasses[int(special[i])] = special[i];
93
    charclasses[int(special[i])] = special[i];
93
94
94
    for (i = 0; i < sizeof(uniign); i++) 
95
    for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
95
    unicign.insert(uniign[i]);
96
    unicign.insert(uniign[i]);
97
    }
96
    unicign.insert((unsigned int)-1);
98
    unicign.insert((unsigned int)-1);
99
100
    for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
101
  visiblewhite.insert(avsbwht[i]);
102
    }
97
103
98
    init = 1;
104
    init = 1;
99
}
105
}
100
106
101
static inline int whatcc(unsigned int c)
107
static inline int whatcc(unsigned int c)
...
...
531
    TextSplit splitter(&cb, flgs);
537
    TextSplit splitter(&cb, flgs);
532
    splitter.text_to_words(s);
538
    splitter.text_to_words(s);
533
    return cb.wcnt;
539
    return cb.wcnt;
534
}
540
}
535
541
542
bool TextSplit::hasVisibleWhite(const string &in)
543
{
544
    setcharclasses();
545
    Utf8Iter it(in);
546
    for (; !it.eof(); it++) {
547
  unsigned int c = *it;
548
  LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
549
  if (c == (unsigned int)-1) {
550
      LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
551
      return false;
552
  }
553
  if (visiblewhite.find(c) != visiblewhite.end())
554
      return true;
555
    }
556
    return false;
557
}
558
559
template <class T> bool u8stringToStrings(const string &s, T &tokens)
560
{
561
    setcharclasses();
562
    Utf8Iter it(s);
563
564
    string current;
565
    tokens.clear();
566
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
567
    states state = SPACE;
568
    for (; !it.eof(); it++) {
569
  unsigned int c = *it;
570
  if (visiblewhite.find(c) != visiblewhite.end()) 
571
      c = ' ';
572
  LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
573
  if (c == (unsigned int)-1) {
574
      LOGERR(("TextSplit::stringToStrings: error while "
575
          "scanning UTF-8 string\n"));
576
      return false;
577
  }
578
579
  switch (c) {
580
      case '"': 
581
      switch(state) {
582
      case SPACE: state = INQUOTE; continue;
583
      case TOKEN: goto push_char;
584
      case ESCAPE: state = INQUOTE; goto push_char;
585
      case INQUOTE: tokens.push_back(current);current.clear();
586
      state = SPACE; continue;
587
      }
588
      break;
589
      case '\\': 
590
      switch(state) {
591
      case SPACE: 
592
      case TOKEN: state=TOKEN; goto push_char;
593
      case INQUOTE: state = ESCAPE; continue;
594
      case ESCAPE: state = INQUOTE; goto push_char;
595
      }
596
      break;
597
598
      case ' ': 
599
      case '\t': 
600
      case '\n': 
601
      case '\r': 
602
      switch(state) {
603
        case SPACE: continue;
604
        case TOKEN: tokens.push_back(current); current.clear();
605
      state = SPACE; continue; 
606
      case INQUOTE: 
607
      case ESCAPE: goto push_char;
608
      }
609
      break;
610
611
      default:
612
      switch(state) {
613
        case ESCAPE: state = INQUOTE; break;
614
        case SPACE:  state = TOKEN;  break;
615
        case TOKEN: 
616
        case INQUOTE: break;
617
      }
618
  push_char:
619
      it.appendchartostring(current);
620
  }
621
    }
622
623
    // End of string. Process residue, and possible error (unfinished quote)
624
    switch(state) {
625
    case SPACE: break;
626
    case TOKEN: tokens.push_back(current); break;
627
    case INQUOTE: 
628
    case ESCAPE: return false;
629
    }
630
    return true;
631
}
632
633
bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
634
{
635
    return u8stringToStrings<list<string> >(s, tokens);
636
}
637
536
#else  // TEST driver ->
638
#else  // TEST driver ->
537
639
538
#include <unistd.h>
640
#include <unistd.h>
539
#include <errno.h>
641
#include <errno.h>
540
#include <fcntl.h>
642
#include <fcntl.h>