Switch to unified view

a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp
1
/* Copyright (C) 2005 J.F.Dockes
1
/* Copyright (C) 2017-2019 J.F.Dockes
2
 *   This program is free software; you can redistribute it and/or modify
2
 *   This program is free software; you can redistribute it and/or modify
3
 *   it under the terms of the GNU General Public License as published by
3
 *   it under the terms of the GNU Lesser General Public License as published by
4
 *   the Free Software Foundation; either version 2 of the License, or
4
 *   the Free Software Foundation; either version 2.1 of the License, or
5
 *   (at your option) any later version.
5
 *   (at your option) any later version.
6
 *
6
 *
7
 *   This program is distributed in the hope that it will be useful,
7
 *   This program is distributed in the hope that it will be useful,
8
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
8
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
9
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
 *   GNU General Public License for more details.
10
 *   GNU Lesser General Public License for more details.
11
 *
11
 *
12
 *   You should have received a copy of the GNU General Public License
12
 *   You should have received a copy of the GNU Lesser General Public License
13
 *   along with this program; if not, write to the
13
 *   along with this program; if not, write to the
14
 *   Free Software Foundation, Inc.,
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
#include <stdio.h>
18
#include <stdlib.h>
19
17
18
#include "utf8iter.h"
20
#include <string>
19
#include <string>
21
#include <iostream>
22
#include <vector>
23
20
21
using std::string;
24
22
25
#include "log.h"
23
// Shorten s in place when it holds more than maxlen bytes. The cut position
// is the largest Utf8Iter::getBpos() value seen during the walk that is
// strictly below maxlen (0 if there is none), so the kept prefix is at most
// maxlen bytes long.
// NOTE(review): getBpos() is presumably the byte offset of a character
// boundary, which would keep the result valid UTF-8 -- Utf8Iter is declared
// in utf8iter.h, confirm there.
// maxlen is converted to string::size_type: a negative value becomes a very
// large one, and s is then left untouched.
void utf8truncate(std::string& s, int maxlen)
26
#include "transcode.h"
24
{
25
    // Already short enough: nothing to do.
    if (s.size() <= string::size_type(maxlen)) {
26
        return;
27
    }
28
    Utf8Iter iter(s);
29
    string::size_type pos = 0;
30
    // Walk the string until operator++ returns string::npos, remembering the
    // last position that is still below maxlen.
    while (iter++ != string::npos)
31
        if (iter.getBpos() < string::size_type(maxlen)) {
32
            pos = iter.getBpos();
33
        }
27
34
28
#ifndef NO_NAMESPACES
35
    // Drop everything from the remembered position to the end of the string.
    s.erase(pos);
29
using namespace std;
30
#endif /* NO_NAMESPACES */
31
32
#define UTF8ITER_CHECK
33
#include "utf8iter.h"
34
#include "readfile.h"
35
#include "textsplit.h"
36
37
// Build a Utf8Iter over an empty string, print its eof() state, hand *it to
// TextSplit::isCJK(), then exit the process with status 0.
void tryempty()
38
{
39
    Utf8Iter it("");
40
    cout << "EOF ? " << it.eof() << endl;
41
    TextSplit::isCJK(*it);
42
    exit(0);
43
}
36
}
44
45
// Name the program was invoked with; main() stores argv[0] here and Usage()
// prints it in front of the help text.
const char *thisprog;

// Help text printed by Usage().
static char usage[] =
    "utf8iter [opts] infile outfile\n"
    " converts infile to 32 bits unicode (processor order), for testing\n"
    "-v : print stuff as we go\n";
51
52
void Usage() {
53
    fprintf(stderr, "%s:%s\n", thisprog, usage);
54
    exit(1);
55
}
56
// Bit set of the options found on the command line (OPT_* values).
static int op_flags;
// Set by -v: main() then prints every decoded value as it goes.
#define OPT_v 0x2
58
59
int main(int argc, char **argv)
60
{
61
    thisprog = argv[0];
62
    argc--; argv++;
63
64
    while (argc > 0 && **argv == '-') {
65
  (*argv)++;
66
  if (!(**argv))
67
      Usage();
68
  while (**argv)
69
      switch (*(*argv)++) {
70
      case 'v':   op_flags |= OPT_v; break;
71
72
      default: Usage();   break;
73
      }
74
  argc--;argv++;
75
    }
76
77
    if (argc != 2) {
78
  Usage();
79
    }
80
    const char *infile = *argv++;argc--;
81
    const char *outfile = *argv++;argc--;
82
    string in;
83
    if (!file_to_string(infile, in)) {
84
  cerr << "Cant read file\n" << endl;
85
  exit(1);
86
    }
87
    
88
    vector<unsigned int>ucsout1;
89
    string out, out1;
90
    Utf8Iter it(in);
91
    FILE *fp = fopen(outfile, "w");
92
    if (fp == 0) {
93
  fprintf(stderr, "cant create %s\n", outfile);
94
  exit(1);
95
    }
96
97
    int nchars = 0;
98
    for (;!it.eof(); it++) {
99
  unsigned int value = *it;
100
  if (value == (unsigned int)-1) {
101
      cerr << "Conversion error occurred\n" << endl;
102
      exit(1);
103
  }
104
  if (op_flags & OPT_v) {
105
     printf("Value: 0x%x", value);
106
     if (value < 0x7f)
107
         printf(" (%c) ", value);
108
     printf("\n");
109
  }
110
  // UTF-32LE or BE array
111
  ucsout1.push_back(value);
112
  // UTF-32LE or BE file
113
  fwrite(&value, 4, 1, fp);
114
115
  // Reconstructed utf8 strings (2 methods)
116
  if (!it.appendchartostring(out))
117
      break;
118
  // conversion to string
119
  out1 += it;
120
  
121
  // fprintf(stderr, "%s", string(it).c_str());
122
  nchars++;
123
    }
124
    fclose(fp);
125
126
    fprintf(stderr, "nchars %d\n", nchars);
127
    if (in.compare(out)) {
128
  fprintf(stderr, "error: out != in\n");
129
  exit(1);
130
    }
131
    if (in != out1) {
132
  fprintf(stderr, "error: out1 != in\n");
133
  exit(1);
134
    }
135
136
    // Rewind and do it a second time
137
    vector<unsigned int>ucsout2;
138
    it.rewind();
139
    for (int i = 0; ; i++) {
140
  unsigned int value;
141
  if ((value = it[i]) == (unsigned int)-1) {
142
      fprintf(stderr, "%d chars\n", i);
143
      break;
144
  }
145
  it++;
146
  ucsout2.push_back(value);
147
    }
148
149
    if (ucsout1 != ucsout2) {
150
  fprintf(stderr, "error: ucsout1 != ucsout2\n");
151
  exit(1);
152
    }
153
154
    ucsout2.clear();
155
    int ercnt;
156
    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
157
    string ucs, ucs1;
158
    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
159
   it != ucsout1.end(); it++) {
160
  unsigned int i = *it;
161
  ucs.append((const char *)&i, 4);
162
    }
163
    if (!transcode(ucs, ucs1, 
164
         encoding, encoding, &ercnt) || ercnt) {
165
  fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
166
  exit(1);
167
    }
168
    if (ucs.compare(ucs1)) {
169
  fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
170
  exit(1);
171
    }
172
173
    if (!transcode(ucs, ucs1, 
174
         encoding, "UTF-8", &ercnt) || ercnt) {
175
  fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
176
      ercnt);
177
  exit(1);
178
    }
179
    if (ucs1.compare(in)) {
180
  fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
181
  exit(1);
182
    }
183
    exit(0);
184
}
185