Open Chinese Convert 1.3.2+gad37fd0a6.dirty
A project for conversion between Traditional and Simplified Chinese
Loading...
Searching...
No Matches
UTF8Util.hpp
1/*
2 * Open Chinese Convert
3 *
4 * Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#pragma once
20
21#ifdef _MSC_VER
22#ifndef NOMINMAX
23#define NOMINMAX
24#endif
25#include <windows.h>
26#endif // _MSC_VER
27
28#include <cstdint>
29#include <cstring>
30
31#include "Common.hpp"
32#include "Exception.hpp"
33
34namespace opencc {
39class OPENCC_EXPORT UTF8Util {
40public:
/**
 * Detects a UTF-8 byte order mark on the given stream and skips it.
 * Only declared in this header; the definition lives in the matching .cpp.
 *
 * @param fp Stream to inspect.
 */
44 static void SkipUtf8Bom(FILE* fp);
45
/**
 * Classifies the lead byte at str and reports how many bytes the UTF-8
 * sequence it introduces occupies.
 *
 * Only the first byte is examined; continuation bytes are not validated.
 * The legacy 5- and 6-byte lead forms are recognised as well.
 *
 * @param str Pointer to the first byte of a character.
 * @return Sequence length in bytes (1-6), or 0 when the byte cannot start a
 *         sequence (a continuation byte, 0xFE or 0xFF).
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  if (lead < 0x80) {
    return 1; // 0xxxxxxx
  }
  if (lead < 0xC0) {
    return 0; // 10xxxxxx: continuation byte, not a lead byte
  }
  if (lead < 0xE0) {
    return 2; // 110xxxxx
  }
  if (lead < 0xF0) {
    return 3; // 1110xxxx
  }
  if (lead < 0xF8) {
    return 4; // 11110xxx
  }
  if (lead < 0xFC) {
    return 5; // 111110xx
  }
  if (lead < 0xFE) {
    return 6; // 1111110x
  }
  return 0; // 0xFE / 0xFF
}
67
71 static size_t NextCharLength(const char* str) {
72 size_t length = NextCharLengthNoException(str);
73 if (length == 0) {
74 throw InvalidUTF8(str);
75 }
76 return length;
77 }
78
82 static size_t PrevCharLength(const char* str) {
83 const char* candidate = str - 1;
84 size_t distance = 1;
85 while (distance < 6) {
86 const unsigned char ch = static_cast<unsigned char>(*candidate);
87 if ((ch & 0xC0) != 0x80) {
88 break;
89 }
90 candidate--;
91 distance++;
92 }
93
94 const size_t length = NextCharLengthNoException(candidate);
95 if (length == distance) {
96 return length;
97 }
98 throw InvalidUTF8(str);
99 }
100
104 static const char* NextChar(const char* str) {
105 return str + NextCharLength(str);
106 }
107
111 static const char* PrevChar(const char* str) {
112 return str - PrevCharLength(str);
113 }
114
/**
 * Returns how many operands the ideographic description operator identified
 * by codePoint takes.
 *
 * Operators occupy U+2FF0..U+2FFF: U+2FF2 and U+2FF3 take three operands,
 * U+2FFE and U+2FFF take one, and every other code point in the range takes
 * two.
 *
 * @param codePoint Unicode code point to classify.
 * @return 1, 2 or 3 for an operator; 0 for any other code point.
 */
static size_t IdeographicDescriptionOperatorArity(uint32_t codePoint) {
  if (codePoint < 0x2FF0 || codePoint > 0x2FFF) {
    return 0;
  }
  if (codePoint == 0x2FF2 || codePoint == 0x2FF3) {
    return 3;
  }
  if (codePoint >= 0x2FFE) {
    return 1;
  }
  return 2;
}
140
141 static size_t NextIdeographicDescriptionSequenceLength(const char* str,
142 size_t len) {
143 const size_t kMaxIDSDepth = 16;
144 const size_t kMaxIDSCodePoints = 64;
145 if (len == 0) {
146 return 0;
147 }
148 const size_t charLen = NextCharLengthNoException(str);
149 if (charLen == 0 || charLen > len) {
150 return 0;
151 }
152 const uint32_t codePoint = CodePointNoException(str, charLen);
153 if (IdeographicDescriptionOperatorArity(codePoint) == 0) {
154 return 0;
155 }
156
157 size_t consumed = 0;
158 size_t codePoints = 0;
159 if (ConsumeIdeographicDescriptionSequence(
160 str, len, kMaxIDSDepth, kMaxIDSCodePoints, &consumed,
161 &codePoints) == IDSParseStatus::Complete) {
162 return consumed;
163 }
164 return 0;
165 }
166
167 static bool IsIncompleteIdeographicDescriptionSequencePrefix(const char* str,
168 size_t len) {
169 const size_t kMaxIDSDepth = 16;
170 const size_t kMaxIDSCodePoints = 64;
171 if (len == 0) {
172 return false;
173 }
174 const size_t charLen = NextCharLengthNoException(str);
175 if (charLen == 0 || charLen > len) {
176 return false;
177 }
178 const uint32_t codePoint = CodePointNoException(str, charLen);
179 if (IdeographicDescriptionOperatorArity(codePoint) == 0) {
180 return false;
181 }
182
183 size_t consumed = 0;
184 size_t codePoints = 0;
185 return ConsumeIdeographicDescriptionSequence(
186 str, len, kMaxIDSDepth, kMaxIDSCodePoints, &consumed,
187 &codePoints) == IDSParseStatus::Incomplete;
188 }
189
/**
 * Tells whether codePoint lies in one of the two variation selector ranges
 * U+FE00..U+FE0F or U+E0100..U+E01EF.
 *
 * @param codePoint Unicode code point to classify.
 * @return true when the code point falls in either range.
 */
static bool IsVariationSelector(uint32_t codePoint) {
  if (codePoint < 0xFE00) {
    return false;
  }
  if (codePoint <= 0xFE0F) {
    return true;
  }
  return codePoint >= 0xE0100 && codePoint <= 0xE01EF;
}
194
195 static bool ContainsVariationSelector(const char* str, size_t len) {
196 const char* pStr = str;
197 const char* strEnd = str + len;
198 while (pStr < strEnd) {
199 const size_t remainingLength = strEnd - pStr;
200 const size_t charLen = NextCharLengthNoException(pStr);
201 if (charLen == 0) {
202 ++pStr;
203 continue;
204 }
205 if (charLen > remainingLength) {
206 return false;
207 }
208 if (IsVariationSelector(CodePointNoException(pStr, charLen))) {
209 return true;
210 }
211 pStr += charLen;
212 }
213 return false;
214 }
215
222 static size_t Length(const char* str) {
223 size_t length = 0;
224 while (*str != '\0') {
225 const size_t charLen = NextCharLengthNoException(str);
226 if (charLen == 0) {
227 throw InvalidUTF8(str);
228 }
229 // Verify all continuation bytes are present before the null terminator.
230 // Use a while loop (not a for-with-return) to avoid complex control flow
231 // that triggers MSVC LTCG code-generator bugs.
232 size_t i = 1;
233 while (i < charLen && str[i] != '\0') {
234 ++i;
235 }
236 if (i < charLen) {
237 throw InvalidUTF8(str); // Truncated sequence: throw, don't silently skip
238 }
239 str += charLen;
240 ++length;
241 }
242 return length;
243 }
244
251 static const char* FindNextInline(const char* str, const char ch) {
252 while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
253 str = NextChar(str);
254 }
255 return str;
256 }
257
/**
 * Tells whether ch terminates a line or the whole buffer.
 *
 * @param ch Byte to classify.
 * @return true for '\0', '\n' and '\r'; false for every other byte.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
  case '\0':
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
264
/**
 * Builds a new string from the first length bytes at str.
 *
 * The bytes are copied verbatim, so embedded null bytes are preserved.
 *
 * @param str    Start of the bytes to copy.
 * @param length Number of bytes to copy.
 * @return A string holding exactly those bytes.
 */
static std::string FromSubstr(const char* str, size_t length) {
  return std::string(str, length);
}
274
/**
 * Tells whether a null-terminated string holds at least byteLength bytes
 * before its terminator.
 *
 * At most byteLength bytes are inspected.
 *
 * @param str        Null-terminated input.
 * @param byteLength Minimum number of bytes required.
 * @return false when a null byte occurs within the first byteLength bytes,
 *         true otherwise (always true for byteLength == 0).
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t index = 0; index < byteLength; ++index) {
    if (str[index] == '\0') {
      return false;
    }
  }
  return true;
}
289
294 static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
295 std::string wordTrunc;
296 if (NotShorterThan(str, maxByteLength)) {
297 size_t len = 0;
298 const char* pStr = str;
299 for (;;) {
300 const size_t charLength = NextCharLength(pStr);
301 if (len + charLength > maxByteLength) {
302 break;
303 }
304 pStr += charLength;
305 len += charLength;
306 }
307 wordTrunc = FromSubstr(str, len);
308 } else {
309 wordTrunc = str;
310 }
311 return wordTrunc;
312 }
313
/**
 * Replaces every occurrence of a pattern inside str, in place.
 *
 * The search resumes right after each inserted replacement, so text
 * produced by a replacement is never matched again.
 *
 * @param str  String to modify.
 * @param from Null-terminated pattern to search for. An empty pattern
 *             leaves str untouched.
 * @param to   Null-terminated replacement text; may be empty.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    // An empty pattern matches at every position, so the loop below would
    // never terminate (and would grow str without bound when to is
    // non-empty).
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;
  }
}
326
/**
 * Concatenates the given strings, inserting separator between neighbours.
 *
 * @param strings   Pieces to join, in order.
 * @param separator Text placed between consecutive pieces (never before the
 *                  first or after the last).
 * @return The joined string; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::string joined;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    if (it != strings.begin()) {
      joined += separator;
    }
    joined += *it;
  }
  return joined;
}
343
/**
 * Concatenates the given strings with nothing in between.
 *
 * @param strings Pieces to join, in order.
 * @return The concatenation; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::string joined;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    joined.append(*it);
  }
  return joined;
}
354
355 static void GetByteMap(const char* str, const size_t utf8Length,
356 std::vector<size_t>* byteMap) {
357 if (byteMap->size() < utf8Length) {
358 byteMap->resize(utf8Length);
359 }
360 const char* pstr = str;
361 for (size_t i = 0; i < utf8Length; i++) {
362 (*byteMap)[i] = pstr - str;
363 pstr = NextChar(pstr);
364 }
365 }
366
/**
 * Converts a UTF-8 string to the platform's native string type: a UTF-16
 * std::wstring under MSVC (via U8ToU16), and an unchanged std::string copy
 * everywhere else.
 */
#if defined(_MSC_VER)
static std::wstring GetPlatformString(const std::string& str) {
  std::wstring wide = U8ToU16(str);
  return wide;
}
#else
static std::string GetPlatformString(const std::string& str) {
  std::string same(str);
  return same;
}
#endif // _MSC_VER
374
#ifdef _MSC_VER
/**
 * Converts a UTF-16 wide string to UTF-8 using WideCharToMultiByte.
 *
 * The API is called twice: first to obtain the required byte count, then to
 * fill a buffer of that size.
 *
 * @param wstr UTF-16 input.
 * @return The UTF-8 encoding, or an empty string when the sizing call
 *         reports no output.
 */
static std::string U16ToU8(const std::wstring& wstr) {
  const int wideLen = static_cast<int>(wstr.length());
  const int byteCount = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLen,
                                            nullptr, 0, nullptr, nullptr);
  if (byteCount <= 0) {
    return std::string();
  }
  std::string utf8(static_cast<size_t>(byteCount), '\0');
  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLen, &utf8[0], byteCount,
                      nullptr, nullptr);
  return utf8;
}

/**
 * Converts a UTF-8 string to UTF-16 using MultiByteToWideChar.
 *
 * The API is called twice: first to obtain the required wide-character
 * count, then to fill a buffer of that size.
 *
 * @param str UTF-8 input.
 * @return The UTF-16 encoding, or an empty string when the sizing call
 *         reports no output.
 */
static std::wstring U8ToU16(const std::string& str) {
  const int byteLen = static_cast<int>(str.length());
  const int wideCount =
      MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteLen, nullptr, 0);
  if (wideCount <= 0) {
    return std::wstring();
  }
  std::wstring utf16(static_cast<size_t>(wideCount), L'\0');
  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteLen, &utf16[0], wideCount);
  return utf16;
}
#endif // _MSC_VER
400
401private:
// Outcome of ConsumeIdeographicDescriptionSequence for one parse attempt.
402 enum class IDSParseStatus {
// A whole sequence (a single non-operator character, or an operator with all
// of its operands) was parsed.
403 Complete,
// The byte range ended before the sequence was finished.
404 Incomplete,
// A byte could not start a UTF-8 character, or the depth / code point limit
// was reached.
405 Invalid,
406 };
407
/**
 * Decodes the UTF-8 character at str into its code point without any
 * validation.
 *
 * The payload bits of the lead byte are combined with the low six bits of
 * each following byte. Continuation bytes are not checked for the 10xxxxxx
 * form.
 *
 * @param str     Pointer to the first byte of the character.
 * @param charLen Byte length of the character; callers in this class pass
 *                the 1..6 value reported by NextCharLengthNoException.
 * @return The decoded value.
 */
static uint32_t CodePointNoException(const char* str, size_t charLen) {
  const uint32_t lead = static_cast<unsigned char>(str[0]);
  if (charLen == 1) {
    return lead;
  }

  // A lead byte of an N-byte sequence carries (7 - N) payload bits.
  const uint32_t payloadMask = (1U << (7 - charLen)) - 1;
  uint32_t value = lead & payloadMask;
  size_t index = 1;
  while (index < charLen) {
    const uint32_t continuation =
        static_cast<unsigned char>(str[index]) & 0x3FU;
    value = (value << 6) | continuation;
    ++index;
  }
  return value;
}
421
422 static IDSParseStatus ConsumeIdeographicDescriptionSequence(
423 const char* str, size_t len, size_t depthLeft, size_t maxCodePoints,
424 size_t* consumed, size_t* codePoints) {
425 if (len == 0) {
426 return IDSParseStatus::Incomplete;
427 }
428 if (depthLeft == 0 || *codePoints >= maxCodePoints) {
429 return IDSParseStatus::Invalid;
430 }
431 const size_t charLen = NextCharLengthNoException(str);
432 if (charLen == 0) {
433 return IDSParseStatus::Invalid;
434 }
435 if (charLen > len) {
436 return IDSParseStatus::Incomplete;
437 }
438 ++(*codePoints);
439
440 const uint32_t codePoint = CodePointNoException(str, charLen);
441 const size_t arity = IdeographicDescriptionOperatorArity(codePoint);
442 if (arity == 0) {
443 *consumed = charLen;
444 return IDSParseStatus::Complete;
445 }
446
447 size_t offset = charLen;
448 for (size_t i = 0; i < arity; i++) {
449 if (offset >= len) {
450 return IDSParseStatus::Incomplete;
451 }
452 size_t operandLength = 0;
453 const IDSParseStatus operandStatus = ConsumeIdeographicDescriptionSequence(
454 str + offset, len - offset, depthLeft - 1, maxCodePoints,
455 &operandLength, codePoints);
456 if (operandStatus != IDSParseStatus::Complete) {
457 return operandStatus;
458 }
459 offset += operandLength;
460 }
461 *consumed = offset;
462 return IDSParseStatus::Complete;
463 }
464};
465} // namespace opencc
Definition Exception.hpp:77
UTF8 std::string utilities.
Definition UTF8Util.hpp:39
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition UTF8Util.hpp:261
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF-8 character.
Definition UTF8Util.hpp:82
static std::string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new string.
Definition UTF8Util.hpp:268
static void ReplaceAll(std::string &str, const char *from, const char *to)
Replaces all patterns in a std::string in place.
Definition UTF8Util.hpp:317
static void SkipUtf8Bom(FILE *fp)
Detect UTF8 BOM and skip it.
Definition UTF8Util.cpp:23
static size_t NextCharLengthNoException(const char *str)
Returns the length in bytes of the next UTF-8 character, or 0 if the lead byte is invalid.
Definition UTF8Util.hpp:50
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given string is at least as long as the given length in bytes.
Definition UTF8Util.hpp:279
static std::string Join(const std::vector< std::string > &strings)
Joins a std::string vector into a std::string.
Definition UTF8Util.hpp:347
static std::string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a std::string with a maximal length in byte.
Definition UTF8Util.hpp:294
static size_t Length(const char *str)
Returns the UTF8 length of a null-terminated string.
Definition UTF8Util.hpp:222
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition UTF8Util.hpp:251
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF-8 character; throws InvalidUTF8 if the lead byte is invalid.
Definition UTF8Util.hpp:71
static std::string Join(const std::vector< std::string > &strings, const std::string &separator)
Joins a std::string vector into a std::string with a separator.
Definition UTF8Util.hpp:330
static const char * PrevChar(const char *str)
Returns a char* pointer to the start of the previous UTF-8 character.
Definition UTF8Util.hpp:111
static const char * NextChar(const char *str)
Returns a char* pointer just past the next UTF-8 character.
Definition UTF8Util.hpp:104