|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
21 |
#include <stdlib.h>
|
21 |
#include <stdlib.h>
|
22 |
|
22 |
|
23 |
#include <iostream>
|
23 |
#include <iostream>
|
24 |
#include <string>
|
24 |
#include <string>
|
25 |
#include <algorithm>
|
25 |
#include <algorithm>
|
26 |
#include <tr1/unordered_set>
|
|
|
27 |
#include <cstring>
|
26 |
#include <cstring>
|
|
|
27 |
#include "unordered_defs.h"
|
|
|
28 |
using namespace std;
|
28 |
|
29 |
|
29 |
#include "textsplit.h"
|
30 |
#include "textsplit.h"
|
30 |
#include "debuglog.h"
|
31 |
#include "debuglog.h"
|
31 |
//#define UTF8ITER_CHECK
|
32 |
//#define UTF8ITER_CHECK
|
32 |
#include "utf8iter.h"
|
33 |
#include "utf8iter.h"
|
33 |
#include "uproplist.h"
|
34 |
#include "uproplist.h"
|
34 |
|
|
|
35 |
using std::tr1::unordered_set;
|
|
|
36 |
using namespace std;
|
|
|
37 |
|
35 |
|
38 |
|
36 |
|
39 |
/**
|
37 |
/**
|
40 |
* Splitting a text into words. The code in this file works with utf-8
|
38 |
* Splitting a text into words. The code in this file works with utf-8
|
41 |
* in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
|
39 |
* in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
|
|
... |
|
... |
58 |
// Real UTF-8 characters are handled with sets holding all characters
|
56 |
// Real UTF-8 characters are handled with sets holding all characters
|
59 |
// with interesting properties. This is far from full-blown management
|
57 |
// with interesting properties. This is far from full-blown management
|
60 |
// of Unicode properties, but seems to do the job well enough in most
|
58 |
// of Unicode properties, but seems to do the job well enough in most
|
61 |
// common cases
|
59 |
// common cases
|
62 |
static vector<unsigned int> vpuncblocks;
|
60 |
static vector<unsigned int> vpuncblocks;
|
63 |
static unordered_set<unsigned int> spunc;
|
61 |
static STD_UNORDERED_SET<unsigned int> spunc;
|
64 |
static unordered_set<unsigned int> visiblewhite;
|
62 |
static STD_UNORDERED_SET<unsigned int> visiblewhite;
|
65 |
static unordered_set<unsigned int> sskip;
|
63 |
static STD_UNORDERED_SET<unsigned int> sskip;
|
66 |
|
64 |
|
67 |
class CharClassInit {
|
65 |
class CharClassInit {
|
68 |
public:
|
66 |
public:
|
69 |
CharClassInit()
|
67 |
CharClassInit()
|
70 |
{
|
68 |
{
|