csutil/csuctransform.h
Go to the documentation of this file.00001 /* 00002 Copyright (C) 2003 by Frank Richter 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public 00015 License along with this library; if not, write to the Free 00016 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00017 */ 00018 00019 #ifndef __CS_CSUCTRANSFORM_H__ 00020 #define __CS_CSUCTRANSFORM_H__ 00021 00022 #include "csunicode.h" 00023 00035 #define CS_UC_MAX_UTF8_ENCODED 4 /* 6 to encode 32 bit */ 00036 00040 #define CS_UC_MAX_UTF16_ENCODED 2 00041 00045 #define CS_UC_MAX_UTF32_ENCODED 1 00046 #if (CS_WCHAR_T_SIZE == 1) 00047 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF8_ENCODED 00048 #elif (CS_WCHAR_T_SIZE == 2) 00049 00053 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF16_ENCODED 00054 #else 00055 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF32_ENCODED 00056 #endif 00057 00061 #define CS_UC_MAX_MAPPED 3 00062 00066 enum 00067 { 00073 csUcMapSimple = (1 << 0) 00074 }; 00075 00079 class CS_CRYSTALSPACE_EXPORT csUnicodeTransform 00080 { 00081 public: 00082 #define FAIL(ret) \ 00083 { \ 00084 if (isValid) *isValid = false; \ 00085 ch = CS_UC_CHAR_REPLACER; \ 00086 return ret; \ 00087 } 00088 00089 #define SUCCEED \ 00090 if (isValid) *isValid = true; \ 00091 return chUsed; 00092 00093 #define GET_NEXT(next) \ 00094 if ((size_t)chUsed == strlen) \ 00095 { \ 00096 FAIL(chUsed); \ 00097 } \ 00098 next = *str++; \ 00099 if (next == 0) \ 00100 { \ 00101 FAIL(chUsed); \ 00102 } \ 00103 chUsed++; 00104 00123 inline static int UTF8Decode (const utf8_char* str, size_t strlen, 00124 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00125 { 00126 if (str == 0) 00127 { 00128 FAIL(0); 00129 } 00130 int chUsed = 0; 00131 00132 utf8_char curCh; 00133 GET_NEXT(curCh); 00134 if ((curCh & 0x80) == 0) 00135 { 00136 // easy case 00137 ch = curCh; 00138 SUCCEED; 00139 } 00140 else 00141 { 00142 // Count with how many bytes this char is encoded. 00143 int n = 0; 00144 while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; } 00145 00146 if ((n < 2) || (n > 6)) 00147 { 00148 // Invalid code: first char of a "sequence" must have 00149 // at least two and at most six MSBs set 00150 FAIL(1); 00151 } 00152 00153 ch = (curCh & ((1 << (8 - n)) - 1)); 00154 00155 for (int i = 1; i < n; i++) 00156 { 00157 GET_NEXT(curCh); 00158 if ((curCh & 0xc0) != 0x80) 00159 { 00160 FAIL(chUsed); 00161 } 00162 else 00163 { 00164 ch <<= 6; 00165 ch |= (curCh & 0x3f); 00166 } 00167 } 00168 00169 // Check if in Unicode range. 00170 if (ch > CS_UC_LAST_CHAR) 00171 { 00172 FAIL(chUsed); 00173 } 00174 00175 // Check for "overlong" codes. 00176 if ((ch < 0x80) && (n > 0)) 00177 { 00178 FAIL(chUsed); 00179 } 00180 else if ((ch < 0x800) && (n > 2)) 00181 { 00182 FAIL(chUsed); 00183 } 00184 else if ((ch < 0x10000) && (n > 3)) 00185 { 00186 FAIL(chUsed); 00187 } 00188 else if ((ch < 0x200000) && (n > 4)) 00189 { 00190 FAIL(chUsed); 00191 } 00192 /* 00193 else if ((ch < 0x4000000) && (n > 5)) 00194 { 00195 FAIL(chUsed); 00196 } 00197 else if ((ch < 0x80000000) && (n > 6)) 00198 { 00199 FAIL(chUsed); 00200 } 00201 */ 00202 00203 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00204 || CS_UC_IS_SURROGATE(ch))) 00205 FAIL(chUsed); 00206 SUCCEED; 00207 } 00208 } 00209 00214 inline static int UTF16Decode (const utf16_char* str, size_t strlen, 00215 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00216 { 00217 if (str == 0) 00218 { 00219 FAIL(0); 00220 } 00221 int chUsed = 0; 00222 00223 utf16_char curCh; 00224 GET_NEXT(curCh); 00225 // Decode surrogate 00226 if (CS_UC_IS_SURROGATE (curCh)) 00227 { 00228 // Invalid code 00229 if (!CS_UC_IS_HIGH_SURROGATE (curCh)) 00230 { 00231 FAIL(chUsed); 00232 } 00233 ch = 0x10000 + ((curCh & 0x03ff) << 10); 00234 GET_NEXT(curCh); 00235 // Invalid code 00236 if (!CS_UC_IS_LOW_SURROGATE (curCh)) 00237 { 00238 // Fail with 1 so the char is handled upon the next Decode. 00239 FAIL(1); 00240 } 00241 ch |= (curCh & 0x3ff); 00242 } 00243 else 00244 { 00245 ch = curCh; 00246 } 00247 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00248 || CS_UC_IS_SURROGATE(ch))) 00249 FAIL(chUsed); 00250 SUCCEED; 00251 } 00252 00257 inline static int UTF32Decode (const utf32_char* str, size_t strlen, 00258 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00259 { 00260 if (str == 0) 00261 { 00262 FAIL(0); 00263 } 00264 int chUsed = 0; 00265 00266 GET_NEXT(ch); 00267 if ((!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00268 || CS_UC_IS_SURROGATE(ch))) || (ch > CS_UC_LAST_CHAR)) 00269 FAIL(chUsed); 00270 SUCCEED; 00271 } 00272 00277 inline static int Decode (const utf8_char* str, size_t strlen, 00278 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00279 { 00280 return UTF8Decode (str, strlen, ch, isValid, returnNonChar); 00281 } 00286 inline static int Decode (const utf16_char* str, size_t strlen, 00287 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00288 { 00289 return UTF16Decode (str, strlen, ch, isValid, returnNonChar); 00290 } 00295 inline static int Decode (const utf32_char* str, size_t strlen, 00296 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00297 { 00298 return UTF32Decode (str, strlen, ch, isValid, returnNonChar); 00299 } 00300 00302 #undef FAIL 00303 #undef SUCCEED 00304 #undef GET_NEXT 00305 00308 #define _OUTPUT_CHAR(buf, chr) \ 00309 if (bufRemaining > 0) \ 00310 { \ 00311 if(buf) *buf++ = chr; \ 00312 bufRemaining--; \ 00313 } \ 00314 encodedLen++; 00315 00316 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr) 00317 00333 inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 00334 size_t bufsize, bool allowNonchars = false) 00335 { 00336 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00337 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00338 return 0; 00339 size_t bufRemaining = bufsize; 00340 int encodedLen = 0; 00341 00342 if (ch < 0x80) 00343 { 00344 OUTPUT_CHAR ((utf8_char)ch); 00345 } 00346 else if (ch < 0x800) 00347 { 00348 OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6))); 00349 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00350 } 00351 else if (ch < 0x10000) 00352 { 00353 OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12))); 00354 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00355 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00356 } 00357 else if (ch < 0x200000) 00358 { 00359 OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18))); 00360 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00361 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00362 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00363 } 00364 /* 00365 else if (ch < 0x4000000) 00366 { 00367 OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24))); 00368 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00369 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00370 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00371 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00372 } 00373 else if (ch < 0x80000000) 00374 { 00375 OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30))); 00376 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f))); 00377 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00378 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00379 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00380 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00381 } 00382 */ 00383 return encodedLen; 00384 } 00385 00401 inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 00402 size_t bufsize, bool allowNonchars = false) 00403 { 00404 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00405 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00406 return 0; 00407 size_t bufRemaining = bufsize; 00408 int encodedLen = 0; 00409 00410 if (ch < 0x10000) 00411 { 00412 OUTPUT_CHAR((utf16_char)ch); 00413 } 00414 else if (ch < 0x100000) 00415 { 00416 utf32_char ch_shifted = ch - 0x10000; 00417 OUTPUT_CHAR((utf16_char)((ch_shifted >> 10) 00418 | CS_UC_CHAR_HIGH_SURROGATE_FIRST)); 00419 OUTPUT_CHAR((utf16_char)((ch_shifted & 0x3ff) 00420 | CS_UC_CHAR_LOW_SURROGATE_FIRST)); 00421 } 00422 else 00423 return 0; 00424 00425 return encodedLen; 00426 } 00427 00443 inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 00444 size_t bufsize, bool allowNonchars = false) 00445 { 00446 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00447 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00448 return 0; 00449 size_t bufRemaining = bufsize; 00450 int encodedLen = 0; 00451 00452 OUTPUT_CHAR(ch); 00453 00454 return encodedLen; 00455 } 00456 00461 inline static int Encode (const utf32_char ch, utf8_char* buf, 00462 size_t bufsize, bool allowNonchars = false) 00463 { 00464 return EncodeUTF8 (ch, buf, bufsize, allowNonchars); 00465 } 00470 inline static int Encode (const utf32_char ch, utf16_char* buf, 00471 size_t bufsize, bool allowNonchars = false) 00472 { 00473 return EncodeUTF16 (ch, buf, bufsize, allowNonchars); 00474 } 00479 inline static int Encode (const utf32_char ch, utf32_char* buf, 00480 size_t bufsize, bool allowNonchars = false) 00481 { 00482 return EncodeUTF32 (ch, buf, bufsize, allowNonchars); 00483 } 00485 #undef OUTPUT_CHAR 00486 00489 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr) 00490 00491 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder) \ 00492 inline static size_t funcName (toType* dest, size_t destSize, \ 00493 const fromType* source, size_t srcSize = (size_t)-1) \ 00494 { \ 00495 if ((srcSize == 0) || (source == 0)) \ 00496 return 0; \ 00497 \ 00498 size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0; \ 00499 size_t encodedLen = 0; \ 00500 \ 00501 size_t srcChars = srcSize; \ 00502 \ 00503 if (srcSize == (size_t)-1) \ 00504 { \ 00505 srcChars = 0; \ 00506 const fromType* sptr = source; \ 00507 while (*sptr++ != 0) srcChars++; \ 00508 } \ 00509 \ 00510 while (srcChars > 0) \ 00511 { \ 00512 utf32_char ch; \ 00513 int scnt = decoder (source, srcChars, ch, 0); \ 00514 if (scnt == 0) break; \ 00515 int dcnt = encoder (ch, dest, bufRemaining); \ 00516 if (dcnt == 0) \ 00517 { \ 00518 dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining); \ 00519 } \ 00520 \ 00521 if ((size_t)dcnt >= bufRemaining) \ 00522 { \ 00523 if (dest && (destSize > 0)) dest += bufRemaining; \ 00524 bufRemaining = 0; \ 00525 } \ 00526 else \ 00527 { \ 00528 bufRemaining -= dcnt; \ 00529 if (dest && (destSize > 0)) dest += dcnt; \ 00530 } \ 00531 encodedLen += dcnt; \ 00532 if ((size_t)scnt >= srcChars) break; \ 00533 srcChars -= scnt; \ 00534 source += scnt; \ 00535 } \ 00536 \ 00537 if (dest) *dest = 0; \ 00538 \ 00539 return encodedLen + 1; \ 00540 } 00541 00557 UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16); 00562 UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32); 00563 00568 UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8); 00573 UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32); 00574 00579 UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8); 00584 UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16); 00587 #undef UCTF_CONVERTER 00588 #undef OUTPUT_CHAR 00589 #undef _OUTPUT_CHAR 00590 00591 #if (CS_WCHAR_T_SIZE == 1) 00592 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00593 const utf8_char* source, size_t srcSize) 00594 { 00595 size_t srcChars = srcSize; 00596 if (srcSize == (size_t)-1) 00597 { 00598 srcChars = 0; 00599 const utf8_char* sptr = source; 00600 while (*sptr++ != 0) srcChars++; 00601 } 00602 if ((dest != 0) && (destSize != 0)) 00603 { 00604 size_t len = MIN (destSize - 1, srcChars); 00605 memcpy (dest, source, size * sizeof (wchar_t)); 00606 *(dest + len) = 0; 00607 } 00608 return srcChars + 1; 00609 }; 00610 00611 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00612 const utf16_char* source, size_t srcSize) 00613 { 00614 return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize); 00615 }; 00616 00617 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00618 const utf32_char* source, size_t srcSize) 00619 { 00620 return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize); 00621 }; 00622 00623 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00624 const wchar_t* source, size_t srcSize) 00625 { 00626 size_t srcChars = srcSize; 00627 if (srcSize == (size_t)-1) 00628 { 00629 srcChars = 0; 00630 const wchar_t* sptr = source; 00631 while (*sptr++ != 0) srcChars++; 00632 } 00633 if ((dest != 0) && (destSize != 0)) 00634 { 00635 size_t len = MIN (destSize - 1, srcChars); 00636 memcpy (dest, source, len * sizeof (wchar_t)); 00637 *(dest + len) = 0; 00638 } 00639 return srcChars + 1; 00640 }; 00641 00642 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00643 const wchar_t* source, size_t srcSize) 00644 { 00645 return UTF8to16 (dest, destSize, source, srcSize); 00646 }; 00647 00648 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00649 const wchar_t* source, size_t srcSize) 00650 { 00651 return UTF8to32 (dest, destSize, source, srcSize); 00652 }; 00653 00654 inline static int Decode (const wchar_t* str, size_t strlen, 00655 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00656 { 00657 return UTF8Decode ((utf8_char*)str, strlen, ch, isValid, returnNonChar); 00658 } 00659 inline static int Encode (const utf32_char ch, wchar_t* buf, 00660 size_t bufsize, bool allowNonchars = false) 00661 { 00662 return EncodeUTF8 (ch, (utf8_char*)buf, bufsize, allowNonchars); 00663 } 00664 #elif (CS_WCHAR_T_SIZE == 2) 00665 // Methods below for doxygen documentation are here as the size '2' is 00666 // default. 00667 00674 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00675 const utf8_char* source, size_t srcSize) 00676 { 00677 return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize); 00678 }; 00679 00684 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00685 const utf16_char* source, size_t srcSize) 00686 { 00687 size_t srcChars = srcSize; 00688 if (srcSize == (size_t)-1) 00689 { 00690 srcChars = 0; 00691 const utf16_char* sptr = source; 00692 while (*sptr++ != 0) srcChars++; 00693 } 00694 if ((dest != 0) && (destSize != 0)) 00695 { 00696 size_t len = MIN (destSize - 1, srcChars); 00697 memcpy (dest, source, len * sizeof (wchar_t)); 00698 *(dest + len) = 0; 00699 } 00700 return srcChars + 1; 00701 }; 00702 00707 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00708 const utf32_char* source, size_t srcSize) 00709 { 00710 return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize); 00711 }; 00712 00717 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00718 const wchar_t* source, size_t srcSize) 00719 { 00720 return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize); 00721 }; 00722 00727 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00728 const wchar_t* source, size_t srcSize) 00729 { 00730 size_t srcChars = srcSize; 00731 if (srcSize == (size_t)-1) 00732 { 00733 srcChars = 0; 00734 const wchar_t* sptr = source; 00735 while (*sptr++ != 0) srcChars++; 00736 } 00737 if ((dest != 0) && (destSize != 0)) 00738 { 00739 size_t len = MIN (destSize - 1, srcChars); 00740 memcpy (dest, source, len * sizeof (wchar_t)); 00741 *(dest + len) = 0; 00742 } 00743 return srcChars + 1; 00744 }; 00745 00750 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00751 const wchar_t* source, size_t srcSize) 00752 { 00753 return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize); 00754 }; 00755 00756 /* Decode()/Encode() overloads for wchar_t. 00757 * - On VC7+, wchar_t may be an unsigned short or the special type __wchar_t. 00758 * - On VC6 wchar_t is always an unsigned short. __wchar_t does not exist. 00759 * Now there may be conflicts with the utf16_char overloads if wchar_t is 00760 * an unsigned short. On the other hand, we would like to support VC7+'s 00761 * built-in wchar_t as well. 00762 * So: on VC7+, provide overloads for __wchar_t, on VC6, don't compile this 00763 * code at all, on other compilers, provide overloads for wchar_t instead 00764 * (by re#definining __wchar_t). 00765 */ 00766 #if !defined(CS_COMPILER_MSVC) || (_MSC_VER > 1300) 00767 #if !defined(CS_COMPILER_MSVC) 00768 #define __wchar_t wchar_t 00769 #endif 00770 00774 inline static int Decode (const __wchar_t* str, size_t strlen, 00775 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00776 { 00777 return UTF16Decode ((utf16_char*)str, strlen, ch, isValid, returnNonChar); 00778 } 00783 inline static int Encode (const utf32_char ch, __wchar_t* buf, 00784 size_t bufsize, bool allowNonchars = false) 00785 { 00786 return EncodeUTF16 (ch, (utf16_char*)buf, bufsize, allowNonchars); 00787 } 00788 #ifdef __wchar_t 00789 #undef __wchar_t 00790 #endif 00791 #endif 00792 00793 #elif (CS_WCHAR_T_SIZE == 4) 00794 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00795 const utf8_char* source, size_t srcSize) 00796 { 00797 return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize); 00798 }; 00799 00800 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00801 const utf16_char* source, size_t srcSize) 00802 { 00803 return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize); 00804 }; 00805 00806 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00807 const utf32_char* source, size_t srcSize) 00808 { 00809 size_t srcChars = srcSize; 00810 if (srcSize == (size_t)-1) 00811 { 00812 srcChars = 0; 00813 const utf32_char* sptr = source; 00814 while (*sptr++ != 0) srcChars++; 00815 } 00816 if ((dest != 0) && (destSize != 0)) 00817 { 00818 size_t len = MIN (destSize - 1, srcChars); 00819 memcpy (dest, source, len * sizeof (wchar_t)); 00820 *(dest + len) = 0; 00821 } 00822 return srcChars + 1; 00823 }; 00824 00825 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00826 const wchar_t* source, size_t srcSize) 00827 { 00828 return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize); 00829 }; 00830 00831 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00832 const wchar_t* source, size_t srcSize) 00833 { 00834 return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize); 00835 }; 00836 00837 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00838 const wchar_t* source, size_t srcSize) 00839 { 00840 size_t srcChars = srcSize; 00841 if (srcSize == (size_t)-1) 00842 { 00843 srcChars = 0; 00844 const wchar_t* sptr = source; 00845 while (*sptr++ != 0) srcChars++; 00846 } 00847 if ((dest != 0) && (destSize != 0)) 00848 { 00849 size_t len = MIN (destSize - 1, srcChars); 00850 memcpy (dest, source, len * sizeof (wchar_t)); 00851 *(dest + len) = 0; 00852 } 00853 return srcChars + 1; 00854 }; 00855 00856 inline static int Decode (const wchar_t* str, size_t strlen, 00857 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00858 { 00859 return UTF32Decode ((utf32_char*)str, strlen, ch, isValid, returnNonChar); 00860 } 00861 inline static int Encode (const utf32_char ch, wchar_t* buf, 00862 size_t bufsize, bool allowNonchars = false) 00863 { 00864 return EncodeUTF32 (ch, (utf32_char*)buf, bufsize, allowNonchars); 00865 } 00866 #else 00867 #error Odd-sized, unsupported wchar_t! 00868 #endif 00869 00882 inline static int UTF8Skip (const utf8_char* str, size_t maxSkip) 00883 { 00884 if (maxSkip < 1) return 0; 00885 00886 if ((*str & 0x80) == 0) 00887 { 00888 return 1; 00889 } 00890 else 00891 { 00892 int n = 0; 00893 while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; } 00894 00895 if ((n < 2) || (n > 6)) 00896 { 00897 return 1; 00898 } 00899 00900 int skip = 1; 00901 00902 for (; skip < n; skip++) 00903 { 00904 if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip)) 00905 { 00906 break; 00907 } 00908 } 00909 return skip; 00910 } 00911 } 00912 00923 inline static int UTF8Rewind (const utf8_char* str, size_t maxRew) 00924 { 00925 if (maxRew < 1) return 0; 00926 00927 const utf8_char* pos = str - 1; 00928 00929 if ((*pos & 0x80) == 0) 00930 { 00931 return 1; 00932 } 00933 00934 // Skip backward to the first byte of the sequence. 00935 int skip = 1; 00936 while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew)) 00937 { 00938 skip++; 00939 pos--; 00940 } 00941 00942 return skip; 00943 } 00944 00950 inline static int UTF16Skip (const utf16_char* str, size_t maxSkip) 00951 { 00952 if (CS_UC_IS_HIGH_SURROGATE (*str)) 00953 return (int)(MIN(maxSkip, (size_t)2)); 00954 else 00955 return (int)(MIN(maxSkip, (size_t)1)); 00956 } 00957 00963 inline static int UTF16Rewind (const utf16_char* str, size_t maxRew) 00964 { 00965 if (maxRew < 1) return 0; 00966 00967 const utf16_char* pos = str - 1; 00968 if (!CS_UC_IS_SURROGATE(*pos)) 00969 return 1; 00970 else 00971 { 00972 if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1)))) 00973 return 2; 00974 else 00975 return 1; 00976 } 00977 } 00978 00984 inline static int UTF32Skip (const utf32_char* str, size_t maxSkip) 00985 { 00986 (void)str; // silence gcc 00987 return (int)(MIN(maxSkip, (size_t)1)); 00988 } 00989 00995 inline static int UTF32Rewind (const utf32_char* str, size_t maxRew) 00996 { 00997 (void)str; // silence gcc 00998 if (maxRew < 1) return 0; 00999 return 1; 01000 } 01015 static size_t MapToUpper (const utf32_char ch, utf32_char* dest, 01016 size_t destSize, uint flags = 0); 01023 inline static utf32_char MapToUpper (const utf32_char ch) 01024 { 01025 utf32_char ret; 01026 MapToUpper (ch, &ret, 1, csUcMapSimple); 01027 return ret; 01028 } 01033 static size_t MapToLower (const utf32_char ch, utf32_char* dest, 01034 size_t destSize, uint flags = 0); 01035 inline static utf32_char MapToLower (const utf32_char ch) 01036 { 01037 utf32_char ret; 01038 MapToLower (ch, &ret, 1, csUcMapSimple); 01039 return ret; 01040 } 01046 static size_t MapToFold (const utf32_char ch, utf32_char* dest, 01047 size_t destSize, uint flags = 0); 01048 inline static utf32_char MapToFold (const utf32_char ch) 01049 { 01050 utf32_char ret; 01051 MapToFold (ch, &ret, 1, csUcMapSimple); 01052 return ret; 01053 } 01055 }; 01056 01059 #endif 01060
Generated for Crystal Space 2.0 by doxygen 1.6.1