Parent: [d3a267] (diff)

Download this file

uproplist.h    203 lines (195 with data), 7.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _PROPLIST_H_INCLUDED_
#define _PROPLIST_H_INCLUDED_
/**
* A subset of Unicode chars that we consider word breaks when we
* split text in words.
*
* This is used as a quick fix to the ascii-based code, and is not correct.
* the correct way would be to do what http://www.unicode.org/reports/tr29/
* says.
*/
// Punctuation chararacters blocks array. Each block is defined by a
// starting and ending code point (both included). MUST BE SORTED.
static const unsigned unipuncblocks[] = {
// Start of latin-1 supplement block, up to capital A grave
0x0080, 0x00BF,
// General punctuation
0x2000, 0x206F,
// Superscripts and subscripts
0x2070, 0x209F,
// Currency symbols
0x20A0, 0x20CF,
// Letterlike symbols
0x2100, 0x214f,
// Number forms
0x2150, 0x218F,
// Arrows
0x2190, 0x21FF,
// Mathematical Operators
0x2200, 0x22FF,
// Miscellaneous Technical
0x2300, 0x23FF,
// Control Pictures
0x2400, 0x243F,
// Optical Character Recognition
0x2440, 0x245F,
// Enclosed Alphanumerics
0x2460, 0x24FF,
// Box Drawing
0x2500, 0x257F,
// Block Elements
0x2580, 0x259F,
// Geometric Shapes
0x25A0, 0x25FF,
// Miscellaneous Symbols
0x2600, 0x26FF,
// Dingbats
0x2700, 0x27BF,
// Miscellaneous Mathematical Symbols-A
0x27C0, 0x27EF,
// Supplemental Arrows-A
0x27F0, 0x27FF,
// Supplemental Arrows-B
0x2900, 0x297F,
// Miscellaneous Mathematical Symbols-B
0x2980, 0x29FF,
// Supplemental Mathematical Operators
0x2A00, 0x2AFF,
// Miscellaneous Symbols and Arrows
0x2B00, 0x2BFF,
};
// Other punctuation characters list. Not all punctuation is in a
// separate block some is found in the middle of alphanumeric codes.
static const unsigned int unipunc[] = {
0x00D7, /* MULTIPLICATION SIGN */
0x00F7, /* DIVISION SIGN */
0x037E, /* GREEK QUESTION MARK */
0x0387, /* GREEK ANO TELEIA */
0x055C, /* ARMENIAN EXCLAMATION MARK */
0x055E, /* ARMENIAN QUESTION MARK */
0x0589, /* ARMENIAN FULL STOP */
0x058A, /* ARMENIAN HYPHEN */
0x05C3, /* HEBREW PUNCTUATION SOF PASUQ */
0x060C, /* ARABIC COMMA */
0x061B, /* ARABIC SEMICOLON */
0x061F, /* ARABIC QUESTION MARK */
0x06D4, /* ARABIC FULL STOP */
0x0964, /* DEVANAGARI DANDA */
0x0965, /* DEVANAGARI DOUBLE DANDA */
0x166E, /* CANADIAN SYLLABICS FULL STOP */
0x1680, /* OGHAM SPACE MARK */
0x16EB, /* RUNIC SINGLE PUNCTUATION */
0x16EC, /* RUNIC MULTIPLE PUNCTUATION */
0x16ED, /* RUNIC CROSS PUNCTUATION */
0x1803, /* MONGOLIAN FULL STOP */
0x1806, /* MONGOLIAN TODO SOFT HYPHEN */
0x1809, /* MONGOLIAN MANCHU FULL STOP */
0x180E, /* MONGOLIAN VOWEL SEPARATOR */
0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */
0x3000, /* IDEOGRAPHIC SPACE*/
0x3002, /* IDEOGRAPHIC FULL STOP*/
0x300C, /* LEFT CORNER BRACKET*/
0x300D, /* RIGHT CORNER BRACKET*/
0x300E, /* LEFT WHITE CORNER BRACKET*/
0x300F, /* RIGHT WHITE CORNER BRACKET*/
0x301C, /* WAVE DASH*/
0x301D, /* REVERSED DOUBLE PRIME QUOTATION MARK*/
0x301E, /* LOW DOUBLE PRIME QUOTATION MARK*/
0x3030, /* WAVY DASH*/
0x30FB, /* KATAKANA MIDDLE DOT*/
0xC2B6, /* PILCROW SIGN;So;0;ON;;;;;N;PARAGRAPH SIGN;;;; */
0xC3B7, /* DIVISION SIGN;Sm;0;ON;;;;;N;;;;; */
0xFE31, /* PRESENTATION FORM FOR VERTICAL EM DASH*/
0xFE32, /* PRESENTATION FORM FOR VERTICAL EN DASH*/
0xFE41, /* PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
0xFE42, /* PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
0xFE43, /* PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
0xFE44, /* PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
0xFE50, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE51, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE52, /* STOP*/
0xFE52, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE54, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE55, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE56, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE57, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE58, /* SMALL EM DASH */
0xFE63, /* SMALL HYPHEN-MINUS */
0xFF01, /* FULLWIDTH EXCLAMATION MARK */
0xFF02, /* FULLWIDTH QUOTATION MARK */
0xFF03, /* FULLWIDTH NUMBER SIGN */
0xFF04, /* FULLWIDTH DOLLAR SIGN */
0xFF05, /* FULLWIDTH PERCENT SIGN */
0xFF06, /* FULLWIDTH AMPERSAND */
0xFF07, /* FULLWIDTH APOSTROPHE */
0xFF08, /* FULLWIDTH LEFT PARENTHESIS */
0xFF09, /* FULLWIDTH RIGHT PARENTHESIS */
0xFF0A, /* FULLWIDTH ASTERISK */
0xFF0B, /* FULLWIDTH PLUS SIGN */
0xFF0C, /* FULLWIDTH COMMA */
0xFF0D, /* FULLWIDTH HYPHEN-MINUS */
0xFF0E, /* FULLWIDTH FULL STOP */
0xFF0F, /* FULLWIDTH SOLIDUS */
0xFF1A, /* [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
0xFF1B, /* [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
0xFF1F, /* FULLWIDTH QUESTION MARK*/
0xFF61, /* HALFWIDTH IDEOGRAPHIC FULL STOP*/
0xFF62, /* HALFWIDTH LEFT CORNER BRACKET*/
0xFF63, /* HALFWIDTH RIGHT CORNER BRACKET*/
0xFF64, /* HALFWIDTH IDEOGRAPHIC COMMA*/
0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/
};
// Characters that should just be discarded. Some of these are in the
// above blocks, but this array is tested first, so it's not worth
// breaking the blocks
static const unsigned int uniskip[] = {
0x00AD, /* SOFT HYPHEN */
0x034F, /* COMBINING GRAPHEME JOINER */
0x2027, /* HYPHENATION POINT */
0x200C, /* ZERO WIDTH NON-JOINER */
0x200D, /* ZERO WIDTH JOINER */
0x2060, /* WORD JOINER . Actually this should not be ignored but used to
* prevent a word break... */
};
/* Things that would visibly break a block of text, rendering obvious the need
* of quotation if a phrase search is wanted */
static const unsigned int avsbwht[] = {
0x0009, /* CHARACTER TABULATION */
0x000A, /* LINE FEED */
0x000D, /* CARRIAGE RETURN */
0x0020, /* SPACE;Zs;0;WS */
0x00A0, /* NO-BREAK SPACE;Zs;0;CS */
0x1680, /* OGHAM SPACE MARK;Zs;0;WS */
0x180E, /* MONGOLIAN VOWEL SEPARATOR;Zs;0;WS */
0x2000, /* EN QUAD;Zs;0;WS */
0x2001, /* EM QUAD;Zs;0;WS */
0x2002, /* EN SPACE;Zs;0;WS */
0x2003, /* EM SPACE;Zs;0;WS */
0x2004, /* THREE-PER-EM SPACE;Zs;0;WS */
0x2005, /* FOUR-PER-EM SPACE;Zs;0;WS */
0x2006, /* SIX-PER-EM SPACE;Zs;0;WS */
0x2007, /* FIGURE SPACE;Zs;0;WS */
0x2008, /* PUNCTUATION SPACE;Zs;0;WS */
0x2009, /* THIN SPACE;Zs;0;WS */
0x200A, /* HAIR SPACE;Zs;0;WS */
0x202F, /* NARROW NO-BREAK SPACE;Zs;0;CS */
0x205F, /* MEDIUM MATHEMATICAL SPACE;Zs;0;WS */
0x3000, /* IDEOGRAPHIC SPACE;Zs;0;WS */
};
#endif // _PROPLIST_H_INCLUDED_