--- a
+++ b/src/common/uproplist.h
@@ -0,0 +1,168 @@
+#ifndef _PROPLIST_H_INCLUDED_
+#define _PROPLIST_H_INCLUDED_
+/* @(#$Id: uproplist.h,v 1.1 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
+/*
+ * A subset of Unicode chars that we consider whitespace when we split text
+ * into words.
+ *
+ * This is used as a quick fix to the ASCII-based code, and is not correct.
+ * The correct way would be to do what http://www.unicode.org/reports/tr29/
+ * says. We should then convert first to UCS-4, and then strictly use
+ * character properties, which might actually be simpler than the current
+ * solution...
+ *
+ * From:
+# PropList-4.0.1.txt
+# Date: 2004-03-02, 02:42:40 GMT [MD]
+#
+# Unicode Character Database
+# Copyright (c) 1991-2004 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see UCD.html
+*/
+
+static const unsigned int uniign[] = { /* separator code points, grouped by Unicode property; unsorted, with repeats */
+ 0x0085, /* ; White_Space # Cc <control-0085>*/
+ 0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
+ 0x00A1, /* misc signs, bullet etc... */
+ 0x00A2,
+ 0x00A3,
+ 0x00A4,
+ 0x00A5,
+ 0x00A6,
+ 0x00A9, /* copyright sign */
+ 0x00AA,
+ 0x00AE, /* registered sign */
+ 0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
+ 0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
+ 0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
+ 0x2028, /* ; White_Space # Zl LINE SEPARATOR*/
+ 0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/
+ 0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/
+ 0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/
+ 0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/
+ 0x002D, /* ; Dash # Pd HYPHEN-MINUS*/
+ 0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/
+ 0x1806, /* ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN*/
+ 0x2010, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2011, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
+ 0x2053, /* ; Dash # Po SWUNG DASH*/
+ 0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/
+ 0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/
+ 0x2212, /* ; Dash # Sm MINUS SIGN*/
+ 0x301C, /* ; Dash # Pd WAVE DASH*/
+ 0x3030, /* ; Dash # Pd WAVY DASH*/
+ 0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/
+ 0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/
+ 0xFE58, /* ; Dash # Pd SMALL EM DASH*/
+ 0xFE63, /* ; Dash # Pd SMALL HYPHEN-MINUS*/
+ 0xFF0D, /* ; Dash # Pd FULLWIDTH HYPHEN-MINUS*/
+ 0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/
+ 0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/
+ 0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/
+ 0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
+ 0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
+ 0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/
+ 0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/
+ 0xFF0D, /* ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS*/
+ 0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/
+ 0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
+ 0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/
+ 0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/
+ 0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/
+ 0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/
+ 0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
+ 0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/
+ 0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/
+ 0x201E, /* ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/
+ 0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
+ 0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/
+ 0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
+ 0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/
+ 0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/
+ 0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/
+ 0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/
+ 0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/
+ 0x301E, /* ; Quotation_Mark # Pe DOUBLE PRIME QUOTATION MARK*/
+ 0x301F, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/
+ 0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
+ 0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
+ 0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
+ 0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
+ 0xFF02, /* ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK*/
+ 0xFF07, /* ; Quotation_Mark # Po FULLWIDTH APOSTROPHE*/
+ 0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/
+ 0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/
+ 0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/
+ 0x002C, /* ; Terminal_Punctuation # Po COMMA*/
+ 0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/
+ 0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
+ 0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
+ 0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/
+ 0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/
+ 0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/
+ 0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/
+ 0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/
+ 0x060C, /* ; Terminal_Punctuation # Po ARABIC COMMA*/
+ 0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/
+ 0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/
+ 0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/
+ 0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
+ 0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
+ 0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
+ 0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
+ 0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
+ 0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
+ 0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
+ 0xFF01, /* ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK*/
+ 0xFF0C, /* ; Terminal_Punctuation # Po FULLWIDTH COMMA*/
+ 0xFF0E, /* ; Terminal_Punctuation # Po FULLWIDTH FULL STOP*/
+ 0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
+ 0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
+ 0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/
+ 0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
+ 0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/
+ 0x0021, /* ; STerm # Po EXCLAMATION MARK*/
+ 0x002E, /* ; STerm # Po FULL STOP*/
+ 0x003F, /* ; STerm # Po QUESTION MARK*/
+ 0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/
+ 0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/
+ 0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/
+ 0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/
+ 0x06D4, /* ; STerm # Po ARABIC FULL STOP*/
+ 0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/
+ 0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/
+ 0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/
+ 0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
+ 0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
+ 0x2047, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0x2048, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0x2049, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
+ 0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/
+ 0xFE52, /* ; STerm # Po SMALL FULL STOP*/
+ 0xFE56, /* ; STerm # Po SMALL QUESTION MARK*/
+ 0xFE57, /* ; STerm # Po SMALL EXCLAMATION MARK*/
+ 0xFF01, /* ; STerm # Po FULLWIDTH EXCLAMATION MARK*/
+ 0xFF0E, /* ; STerm # Po FULLWIDTH FULL STOP*/
+ 0xFF1F, /* ; STerm # Po FULLWIDTH QUESTION MARK*/
+ 0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
+};
+
+#endif /* _PROPLIST_H_INCLUDED_ */