|
a/src/common/unacpp.cpp |
|
b/src/common/unacpp.cpp |
|
... |
|
... |
61 |
if (cout)
|
61 |
if (cout)
|
62 |
free(cout);
|
62 |
free(cout);
|
63 |
return true;
|
63 |
return true;
|
64 |
}
|
64 |
}
|
65 |
|
65 |
|
|
|
66 |
// Functions to determine upper-case or accented status could be implemented
|
|
|
67 |
// hugely more efficiently inside the unac c code, but they're only used for
|
|
|
68 |
// testing user-entered terms, so we don't really care.
|
66 |
bool unaciscapital(const string& in)
|
69 |
bool unaciscapital(const string& in)
|
67 |
{
|
70 |
{
|
|
|
71 |
LOGDEB2(("unaciscapital: [%s]\n", in.c_str()));
|
68 |
if (in.empty())
|
72 |
if (in.empty())
|
69 |
return false;
|
73 |
return false;
|
70 |
Utf8Iter it(in);
|
74 |
Utf8Iter it(in);
|
71 |
string shorter;
|
75 |
string shorter;
|
72 |
it.appendchartostring(shorter);
|
76 |
it.appendchartostring(shorter);
|
73 |
|
77 |
|
74 |
string noacterm, noaclowterm;
|
78 |
string lower;
|
75 |
if (!unacmaybefold(shorter, noacterm, "UTF-8", UNACOP_UNAC)) {
|
79 |
if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
|
76 |
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
|
80 |
LOGINFO(("unaciscapital: unac/fold failed for [%s]\n", in.c_str()));
|
77 |
return false;
|
81 |
return false;
|
78 |
}
|
82 |
}
|
79 |
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", UNACOP_UNACFOLD)) {
|
|
|
80 |
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
|
|
|
81 |
return false;
|
|
|
82 |
}
|
|
|
83 |
Utf8Iter it1(noacterm);
|
83 |
Utf8Iter it1(lower);
|
84 |
Utf8Iter it2(noaclowterm);
|
|
|
85 |
if (*it1 != *it2)
|
84 |
if (*it != *it1)
|
|
|
85 |
return true;
|
|
|
86 |
else
|
|
|
87 |
return false;
|
|
|
88 |
}
|
|
|
89 |
bool unachasuppercase(const string& in)
|
|
|
90 |
{
|
|
|
91 |
LOGDEB2(("unachasuppercase: [%s]\n", in.c_str()));
|
|
|
92 |
if (in.empty())
|
|
|
93 |
return false;
|
|
|
94 |
|
|
|
95 |
string lower;
|
|
|
96 |
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
|
|
|
97 |
LOGINFO(("unachasuppercase: unac/fold failed for [%s]\n", in.c_str()));
|
|
|
98 |
return false;
|
|
|
99 |
}
|
|
|
100 |
if (lower != in)
|
|
|
101 |
return true;
|
|
|
102 |
else
|
|
|
103 |
return false;
|
|
|
104 |
}
|
|
|
105 |
bool unachasaccents(const string& in)
|
|
|
106 |
{
|
|
|
107 |
LOGDEB2(("unachasaccents: [%s]\n", in.c_str()));
|
|
|
108 |
if (in.empty())
|
|
|
109 |
return false;
|
|
|
110 |
|
|
|
111 |
string noac;
|
|
|
112 |
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
|
|
|
113 |
LOGINFO(("unachasaccents: unac/unac failed for [%s]\n", in.c_str()));
|
|
|
114 |
return false;
|
|
|
115 |
}
|
|
|
116 |
if (noac != in)
|
86 |
return true;
|
117 |
return true;
|
87 |
else
|
118 |
else
|
88 |
return false;
|
119 |
return false;
|
89 |
}
|
120 |
}
|
90 |
|
121 |
|
|
... |
|
... |
105 |
|
136 |
|
106 |
static char *thisprog;
|
137 |
static char *thisprog;
|
107 |
|
138 |
|
108 |
static char usage [] = "\n"
|
139 |
static char usage [] = "\n"
|
109 |
"[-c|-C] <encoding> <infile> <outfile>\n"
|
140 |
"[-c|-C] <encoding> <infile> <outfile>\n"
|
110 |
" Default : unaccent\n"
|
141 |
" Default : unaccent\n"
|
111 |
" -c : unaccent and casefold\n"
|
142 |
" -c : unaccent and casefold\n"
|
112 |
" -C : casefold only\n"
|
143 |
" -C : casefold only\n"
|
|
|
144 |
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
|
|
|
145 |
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
|
|
|
146 |
" locale\n"
|
113 |
"\n";
|
147 |
"\n";
|
114 |
|
|
|
115 |
;
|
148 |
;
|
|
|
149 |
|
116 |
static void
|
150 |
static void
|
117 |
Usage(void)
|
151 |
Usage(void)
|
118 |
{
|
152 |
{
|
119 |
fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
|
153 |
fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
|
120 |
exit(1);
|
154 |
exit(1);
|
121 |
}
|
155 |
}
|
122 |
|
156 |
|
123 |
static int op_flags;
|
157 |
static int op_flags;
|
124 |
#define OPT_c 0x2
|
158 |
#define OPT_c 0x2
|
125 |
#define OPT_C 0x4
|
159 |
#define OPT_C 0x4
|
|
|
160 |
#define OPT_t 0x8
|
126 |
|
161 |
|
127 |
int main(int argc, char **argv)
|
162 |
int main(int argc, char **argv)
|
128 |
{
|
163 |
{
|
129 |
UnacOp op = UNACOP_UNAC;
|
164 |
UnacOp op = UNACOP_UNAC;
|
130 |
|
165 |
|
|
... |
|
... |
138 |
Usage();
|
173 |
Usage();
|
139 |
while (**argv)
|
174 |
while (**argv)
|
140 |
switch (*(*argv)++) {
|
175 |
switch (*(*argv)++) {
|
141 |
case 'c': op_flags |= OPT_c; break;
|
176 |
case 'c': op_flags |= OPT_c; break;
|
142 |
case 'C': op_flags |= OPT_C; break;
|
177 |
case 'C': op_flags |= OPT_C; break;
|
|
|
178 |
case 't': op_flags |= OPT_t; break;
|
143 |
default: Usage(); break;
|
179 |
default: Usage(); break;
|
144 |
}
|
180 |
}
|
145 |
argc--; argv++;
|
181 |
argc--; argv++;
|
146 |
}
|
182 |
}
|
147 |
|
183 |
|
148 |
if (op_flags & OPT_c) {
|
184 |
if (op_flags & OPT_t) {
|
|
|
185 |
if (argc != 1)
|
|
|
186 |
Usage();
|
|
|
187 |
string in = *argv++;argc--;
|
|
|
188 |
bool capital, upper, accent;
|
|
|
189 |
capital = unaciscapital(in);
|
|
|
190 |
upper = unachasuppercase(in);
|
|
|
191 |
accent = unachasaccents(in);
|
|
|
192 |
cout << "[" << in << "] : " <<
|
|
|
193 |
"capitalized: " << (capital ? "Yes. " : "No. ") <<
|
|
|
194 |
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
|
|
|
195 |
"has accents: " << (accent ? "Yes. " : "No. ") <<
|
|
|
196 |
endl;
|
|
|
197 |
return 0;
|
|
|
198 |
} else {
|
|
|
199 |
if (argc != 3)
|
|
|
200 |
Usage();
|
|
|
201 |
if (op_flags & OPT_c) {
|
149 |
op = UNACOP_UNACFOLD;
|
202 |
op = UNACOP_UNACFOLD;
|
150 |
} else if (op_flags & OPT_C) {
|
203 |
} else if (op_flags & OPT_C) {
|
151 |
op = UNACOP_FOLD;
|
204 |
op = UNACOP_FOLD;
|
152 |
}
|
205 |
}
|
153 |
|
206 |
|
154 |
if (argc != 3) {
|
|
|
155 |
Usage();
|
|
|
156 |
}
|
|
|
157 |
|
|
|
158 |
const char *encoding = *argv++; argc--;
|
207 |
const char *encoding = *argv++; argc--;
|
159 |
string ifn = *argv++; argc--;
|
208 |
string ifn = *argv++; argc--;
|
160 |
if (!ifn.compare("stdin"))
|
209 |
if (!ifn.compare("stdin"))
|
161 |
ifn.clear();
|
210 |
ifn.clear();
|
162 |
const char *ofn = *argv++; argc--;
|
211 |
const char *ofn = *argv++; argc--;
|
163 |
|
212 |
|
164 |
string reason;
|
213 |
string reason;
|
165 |
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
214 |
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
166 |
|
215 |
|
167 |
string odata;
|
216 |
string odata;
|
168 |
if (!file_to_string(ifn, odata)) {
|
217 |
if (!file_to_string(ifn, odata)) {
|
169 |
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
218 |
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
170 |
return 1;
|
219 |
return 1;
|
171 |
}
|
220 |
}
|
172 |
string ndata;
|
221 |
string ndata;
|
173 |
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
222 |
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
174 |
cerr << "unac: " << ndata << endl;
|
223 |
cerr << "unac: " << ndata << endl;
|
175 |
return 1;
|
224 |
return 1;
|
176 |
}
|
225 |
}
|
177 |
|
226 |
|
178 |
int fd;
|
227 |
int fd;
|
179 |
if (strcmp(ofn, "stdout")) {
|
228 |
if (strcmp(ofn, "stdout")) {
|
180 |
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
229 |
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
181 |
} else {
|
230 |
} else {
|
182 |
fd = 1;
|
231 |
fd = 1;
|
183 |
}
|
232 |
}
|
184 |
if (fd < 0) {
|
233 |
if (fd < 0) {
|
185 |
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
234 |
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
186 |
<< endl;
|
235 |
<< endl;
|
187 |
return 1;
|
236 |
return 1;
|
188 |
}
|
237 |
}
|
189 |
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
238 |
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
190 |
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
239 |
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
|
|
240 |
return 1;
|
|
|
241 |
}
|
|
|
242 |
close(fd);
|
191 |
return 1;
|
243 |
return 0;
|
192 |
}
|
244 |
}
|
193 |
close(fd);
|
|
|
194 |
return 0;
|
|
|
195 |
}
|
245 |
}
|
196 |
|
246 |
|
197 |
#endif
|
247 |
#endif
|