nux-1.16.0
|
00001 /* 00002 * Copyright 2010 Inalogic® Inc. 00003 * 00004 * This program is free software: you can redistribute it and/or modify it 00005 * under the terms of the GNU Lesser General Public License, as 00006 * published by the Free Software Foundation; either version 2.1 or 3.0 00007 * of the License. 00008 * 00009 * This program is distributed in the hope that it will be useful, but 00010 * WITHOUT ANY WARRANTY; without even the implied warranties of 00011 * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR 00012 * PURPOSE. See the applicable version of the GNU Lesser General Public 00013 * License for more details. 00014 * 00015 * You should have received a copy of both the GNU Lesser General Public 00016 * License along with this program. If not, see <http://www.gnu.org/licenses/> 00017 * 00018 * Authored by: Jay Taoko <jaytaoko@inalogic.com> 00019 * 00020 */ 00021 00022 00023 /* 00024 * Copyright 2001-2004 Unicode, Inc. 00025 * 00026 * Disclaimer 00027 * 00028 * This source code is provided as is by Unicode, Inc. No claims are 00029 * made as to fitness for any particular purpose. No warranties of any 00030 * kind are expressed or implied. The recipient agrees to determine 00031 * applicability of information provided. If this file has been 00032 * purchased on magnetic or optical media from Unicode, Inc., the 00033 * sole remedy for any claim will be exchange of defective media 00034 * within 90 days of receipt. 00035 * 00036 * Limitations on Rights to Redistribute This Code 00037 * 00038 * Unicode, Inc. hereby grants the right to freely use the information 00039 * supplied in this file in the creation of products supporting the 00040 * Unicode Standard, and to make copies of this file in any form 00041 * for internal or external distribution as long as this notice 00042 * remains attached. 00043 */ 00044 00045 /* --------------------------------------------------------------------- 00046 00047 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 00048 Author: Mark E. Davis, 1994. 00049 Rev History: Rick McGowan, fixes & updates May 2001. 00050 Sept 2001: fixed const & error conditions per 00051 mods suggested by S. Parent & A. Lillich. 00052 June 2002: Tim Dodd added detection and handling of incomplete 00053 source sequences, enhanced error detection, added casts 00054 to eliminate compiler warnings. 00055 July 2003: slight mods to back out aggressive FFFE detection. 00056 Jan 2004: updated switches in from-UTF8 conversions. 00057 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 00058 May 2006: updated isLegalUTF8Sequence. 00059 00060 See the header file "ConvertUTF.h" for complete documentation. 00061 00062 ------------------------------------------------------------------------ */ 00063 00064 #include "NuxCore.h" 00065 //#include "NUni.h" 00066 00067 namespace nux 00068 { 00069 00070 static const int halfShift = 10; /* used for shifting by 10 bits */ 00071 00072 static const t_UTF32 halfBase = 0x0010000UL; 00073 static const t_UTF32 halfMask = 0x3FFUL; 00074 00075 #define UNI_SUR_HIGH_START (t_UTF32)0xD800 00076 #define UNI_SUR_HIGH_END (t_UTF32)0xDBFF 00077 #define UNI_SUR_LOW_START (t_UTF32)0xDC00 00078 #define UNI_SUR_LOW_END (t_UTF32)0xDFFF 00079 00080 00081 ConversionResult ConvertUTF32toUTF16 (const t_UTF32 **sourceStart, const t_UTF32 *sourceEnd, t_UTF16 **targetStart, t_UTF16 *targetEnd, ConversionFlags flags) 00082 { 00083 ConversionResult result = conversionOK; 00084 const t_UTF32 *source = *sourceStart; 00085 t_UTF16 *target = *targetStart; 00086 00087 while (source < sourceEnd) 00088 { 00089 t_UTF32 ch; 00090 00091 if (target >= targetEnd) 00092 { 00093 result = targetExhausted; 00094 break; 00095 } 00096 00097 ch = *source++; 00098 00099 if (ch <= UNI_MAX_BMP) /* Target is a character <= 0xFFFF */ 00100 { 00101 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 00102 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) 00103 { 00104 if (flags == strictConversion) 00105 { 00106 --source; /* return to the illegal value itself */ 00107 result = sourceIllegal; 00108 break; 00109 } 00110 else 00111 { 00112 *target++ = UNI_REPLACEMENT_CHAR; 00113 } 00114 } 00115 else 00116 { 00117 *target++ = (t_UTF16) ch; /* normal case */ 00118 } 00119 } 00120 else if (ch > UNI_MAX_LEGAL_UTF32) 00121 { 00122 if (flags == strictConversion) 00123 { 00124 result = sourceIllegal; 00125 } 00126 else 00127 { 00128 *target++ = UNI_REPLACEMENT_CHAR; 00129 } 00130 } 00131 else 00132 { 00133 /* target is a character in range 0xFFFF - 0x10FFFF. */ 00134 if (target + 1 >= targetEnd) 00135 { 00136 --source; /* Back up source pointer! */ 00137 result = targetExhausted; 00138 break; 00139 } 00140 00141 ch -= halfBase; 00142 *target++ = (t_UTF16) ( (ch >> halfShift) + UNI_SUR_HIGH_START); 00143 *target++ = (t_UTF16) ( (ch & halfMask) + UNI_SUR_LOW_START); 00144 } 00145 } 00146 00147 *sourceStart = source; 00148 *targetStart = target; 00149 return result; 00150 } 00151 00152 /* --------------------------------------------------------------------- */ 00153 00154 ConversionResult ConvertUTF16toUTF32 (const t_UTF16 **sourceStart, const t_UTF16 *sourceEnd, t_UTF32 **targetStart, t_UTF32 *targetEnd, ConversionFlags flags) 00155 { 00156 ConversionResult result = conversionOK; 00157 const t_UTF16 *source = *sourceStart; 00158 t_UTF32 *target = *targetStart; 00159 t_UTF32 ch, ch2; 00160 00161 while (source < sourceEnd) 00162 { 00163 const t_UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */ 00164 ch = *source++; 00165 00166 /* If we have a surrogate pair, convert to UTF32 first. */ 00167 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 00168 { 00169 /* If the 16 bits following the high surrogate are in the source buffer... */ 00170 if (source < sourceEnd) 00171 { 00172 ch2 = *source; 00173 00174 /* If it's a low surrogate, convert to UTF32. */ 00175 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 00176 { 00177 ch = ( (ch - UNI_SUR_HIGH_START) << halfShift) 00178 + (ch2 - UNI_SUR_LOW_START) + halfBase; 00179 ++source; 00180 } 00181 else if (flags == strictConversion) /* it's an unpaired high surrogate */ 00182 { 00183 --source; /* return to the illegal value itself */ 00184 result = sourceIllegal; 00185 break; 00186 } 00187 } 00188 else /* We don't have the 16 bits following the high surrogate. */ 00189 { 00190 --source; /* return to the high surrogate */ 00191 result = sourceExhausted; 00192 break; 00193 } 00194 } 00195 else if (flags == strictConversion) 00196 { 00197 /* UTF-16 surrogate values are illegal in UTF-32 */ 00198 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 00199 { 00200 --source; /* return to the illegal value itself */ 00201 result = sourceIllegal; 00202 break; 00203 } 00204 } 00205 00206 if (target >= targetEnd) 00207 { 00208 source = oldSource; /* Back up source pointer! */ 00209 result = targetExhausted; 00210 break; 00211 } 00212 00213 *target++ = ch; 00214 } 00215 00216 *sourceStart = source; 00217 *targetStart = target; 00218 #ifdef CVTUTF_DEBUG 00219 00220 if (result == sourceIllegal) 00221 { 00222 fprintf (stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 00223 fflush (stderr); 00224 } 00225 00226 #endif 00227 return result; 00228 } 00229 00230 /* --------------------------------------------------------------------- */ 00231 00232 /* 00233 * Index into the table below with the first byte of a UTF-8 sequence to 00234 * get the number of trailing bytes that are supposed to follow it. 00235 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 00236 * left as-is for anyone who may want to do such conversion, which was 00237 * allowed in earlier algorithms. 00238 */ 00239 static const char trailingBytesForUTF8[256] = 00240 { 00241 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00242 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00243 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00244 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00246 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00247 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00248 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 00249 }; 00250 00251 /* 00252 * Magic values subtracted from a buffer value during UTF8 conversion. 00253 * This table contains as many values as there might be trailing bytes 00254 * in a UTF-8 sequence. 00255 */ 00256 static const t_UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 00257 0x03C82080UL, 0xFA082080UL, 0x82082080UL 00258 }; 00259 00260 /* 00261 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 00262 * into the first byte, depending on how many bytes follow. There are 00263 * as many entries in this table as there are UTF-8 sequence types. 00264 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 00265 * for *legal* UTF-8 will be 4 or fewer bytes total. 00266 */ 00267 static const t_UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 00268 00269 /* --------------------------------------------------------------------- */ 00270 00271 /* The interface converts a whole buffer to avoid function-call overhead. 00272 * Constants have been gathered. Loops & conditionals have been removed as 00273 * much as possible for efficiency, in favor of drop-through switches. 00274 * (See "Note A" at the bottom of the file for equivalent code.) 00275 * If your compiler supports it, the "isLegalUTF8" call can be turned 00276 * into an inline function. 00277 */ 00278 00279 /* --------------------------------------------------------------------- */ 00280 00281 ConversionResult ConvertUTF16toUTF8 (const t_UTF16 **sourceStart, const t_UTF16 *sourceEnd, t_UTF8 **targetStart, t_UTF8 *targetEnd, ConversionFlags flags) 00282 { 00283 ConversionResult result = conversionOK; 00284 const t_UTF16 *source = *sourceStart; 00285 t_UTF8 *target = *targetStart; 00286 00287 while (source < sourceEnd) 00288 { 00289 t_UTF32 ch; 00290 unsigned short bytesToWrite = 0; 00291 const t_UTF32 byteMask = 0xBF; 00292 const t_UTF32 byteMark = 0x80; 00293 const t_UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */ 00294 ch = *source++; 00295 00296 /* If we have a surrogate pair, convert to UTF32 first. */ 00297 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 00298 { 00299 /* If the 16 bits following the high surrogate are in the source buffer... */ 00300 if (source < sourceEnd) 00301 { 00302 t_UTF32 ch2 = *source; 00303 00304 /* If it's a low surrogate, convert to UTF32. */ 00305 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 00306 { 00307 ch = ( (ch - UNI_SUR_HIGH_START) << halfShift) 00308 + (ch2 - UNI_SUR_LOW_START) + halfBase; 00309 ++source; 00310 } 00311 else if (flags == strictConversion) /* it's an unpaired high surrogate */ 00312 { 00313 --source; /* return to the illegal value itself */ 00314 result = sourceIllegal; 00315 break; 00316 } 00317 } 00318 else /* We don't have the 16 bits following the high surrogate. */ 00319 { 00320 --source; /* return to the high surrogate */ 00321 result = sourceExhausted; 00322 break; 00323 } 00324 } 00325 else if (flags == strictConversion) 00326 { 00327 /* UTF-16 surrogate values are illegal in UTF-32 */ 00328 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 00329 { 00330 --source; /* return to the illegal value itself */ 00331 result = sourceIllegal; 00332 break; 00333 } 00334 } 00335 00336 /* Figure out how many bytes the result will require */ 00337 if (ch < (t_UTF32) 0x80) 00338 { 00339 bytesToWrite = 1; 00340 } 00341 else if (ch < (t_UTF32) 0x800) 00342 { 00343 bytesToWrite = 2; 00344 } 00345 else if (ch < (t_UTF32) 0x10000) 00346 { 00347 bytesToWrite = 3; 00348 } 00349 else if (ch < (t_UTF32) 0x110000) 00350 { 00351 bytesToWrite = 4; 00352 } 00353 else 00354 { 00355 bytesToWrite = 3; 00356 ch = UNI_REPLACEMENT_CHAR; 00357 } 00358 00359 target += bytesToWrite; 00360 00361 if (target > targetEnd) 00362 { 00363 source = oldSource; /* Back up source pointer! */ 00364 target -= bytesToWrite; 00365 result = targetExhausted; 00366 break; 00367 } 00368 00369 switch (bytesToWrite) /* note: everything falls through. */ 00370 { 00371 case 4: 00372 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00373 ch >>= 6; 00374 case 3: 00375 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00376 ch >>= 6; 00377 case 2: 00378 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00379 ch >>= 6; 00380 case 1: 00381 *--target = (t_UTF8) (ch | firstByteMark[bytesToWrite]); 00382 } 00383 00384 target += bytesToWrite; 00385 } 00386 00387 *sourceStart = source; 00388 *targetStart = target; 00389 return result; 00390 } 00391 00392 /* --------------------------------------------------------------------- */ 00393 00394 /* 00395 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 00396 * This must be called with the length pre-determined by the first byte. 00397 * If not calling this from ConvertUTF8to*, then the length can be set by: 00398 * length = trailingBytesForUTF8[*source]+1; 00399 * and the sequence is illegal right away if there aren't that many bytes 00400 * available. 00401 * If presented with a length > 4, this returns false. The Unicode 00402 * definition of UTF-8 goes up to 4-byte sequences. 00403 */ 00404 00405 static bool isLegalUTF8 (const t_UTF8 *source, int length) 00406 { 00407 t_UTF8 a; 00408 const t_UTF8 *srcptr = source + length; 00409 00410 switch (length) 00411 { 00412 default: 00413 return false; 00414 /* Everything else falls through when "true"... */ 00415 case 4: 00416 00417 if ( (a = (*--srcptr) ) < 0x80 || a > 0xBF) return false; 00418 00419 case 3: 00420 00421 if ( (a = (*--srcptr) ) < 0x80 || a > 0xBF) return false; 00422 00423 case 2: 00424 00425 if ( (a = (*--srcptr) ) > 0xBF) return false; 00426 00427 switch (*source) 00428 { 00429 /* no fall-through in this inner switch */ 00430 case 0xE0: 00431 00432 if (a < 0xA0) return false; 00433 00434 break; 00435 case 0xED: 00436 00437 if ( (a < 0x80) || (a > 0x9F) ) return false; 00438 00439 break; 00440 case 0xF0: 00441 00442 if (a < 0x90) return false; 00443 00444 break; 00445 case 0xF4: 00446 00447 if (a > 0x8F) return false; 00448 00449 break; 00450 default: 00451 00452 if (a < 0x80) return false; 00453 } 00454 00455 case 1: 00456 00457 if (*source >= 0x80 && *source < 0xC2) return false; 00458 } 00459 00460 if (*source > 0xF4) return false; 00461 00462 return true; 00463 } 00464 00465 /* --------------------------------------------------------------------- */ 00466 00467 /* 00468 * Exported function to return whether a UTF-8 sequence is legal or not. 00469 * This is not used here; it's just exported. 00470 */ 00471 00472 bool isLegalUTF8Sequence (const t_UTF8 *source, const t_UTF8 *sourceEnd) 00473 { 00474 int length; 00475 00476 if (source == sourceEnd) 00477 { 00478 return true; 00479 } 00480 00481 while (true) 00482 { 00483 length = trailingBytesForUTF8[*source] + 1; 00484 00485 if (source + length > sourceEnd) 00486 { 00487 return false; 00488 } 00489 00490 if (!isLegalUTF8 (source, length) ) 00491 { 00492 return false; 00493 } 00494 00495 source += length; 00496 00497 if (source >= sourceEnd) 00498 { 00499 return true; 00500 } 00501 } 00502 } 00503 00508 bool 00509 tr_utf8_validate ( const char *str, int max_len, const char **end ) 00510 { 00511 const t_UTF8 *source = (const t_UTF8 *) str; 00512 const t_UTF8 *sourceEnd; 00513 00514 if ( max_len == 0 ) 00515 return true; 00516 00517 if ( str == NULL ) 00518 return false; 00519 00520 sourceEnd = source + ( (max_len < 0) ? strlen (str) : (size_t) max_len); 00521 00522 if ( source == sourceEnd ) 00523 { 00524 if ( end != NULL ) 00525 *end = (const char *) source; 00526 00527 return true; 00528 } 00529 00530 for ( ;; ) 00531 { 00532 const int length = trailingBytesForUTF8[*source] + 1; 00533 00534 if (source + length > sourceEnd) 00535 { 00536 if ( end != NULL ) 00537 *end = (const char *) source; 00538 00539 return false; 00540 } 00541 00542 if (!isLegalUTF8 (source, length) ) 00543 { 00544 if ( end != NULL ) 00545 *end = (const char *) source; 00546 00547 return false; 00548 } 00549 00550 source += length; 00551 00552 if (source >= sourceEnd) 00553 { 00554 if ( end != NULL ) 00555 *end = (const char *) source; 00556 00557 return true; 00558 } 00559 } 00560 00561 00562 } 00563 00564 00565 /* --------------------------------------------------------------------- */ 00566 00567 ConversionResult ConvertUTF8toUTF16 (const t_UTF8 **sourceStart, const t_UTF8 *sourceEnd, t_UTF16 **targetStart, t_UTF16 *targetEnd, ConversionFlags flags) 00568 { 00569 ConversionResult result = conversionOK; 00570 const t_UTF8 *source = *sourceStart; 00571 t_UTF16 *target = *targetStart; 00572 00573 while (source < sourceEnd) 00574 { 00575 t_UTF32 ch = 0; 00576 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 00577 00578 if (source + extraBytesToRead >= sourceEnd) 00579 { 00580 result = sourceExhausted; 00581 break; 00582 } 00583 00584 /* Do this check whether lenient or strict */ 00585 if (! isLegalUTF8 (source, extraBytesToRead + 1) ) 00586 { 00587 result = sourceIllegal; 00588 break; 00589 } 00590 00591 /* 00592 * The cases all fall through. See "Note A" below. 00593 */ 00594 switch (extraBytesToRead) 00595 { 00596 case 5: 00597 ch += *source++; 00598 ch <<= 6; /* remember, illegal UTF-8 */ 00599 case 4: 00600 ch += *source++; 00601 ch <<= 6; /* remember, illegal UTF-8 */ 00602 case 3: 00603 ch += *source++; 00604 ch <<= 6; 00605 case 2: 00606 ch += *source++; 00607 ch <<= 6; 00608 case 1: 00609 ch += *source++; 00610 ch <<= 6; 00611 case 0: 00612 ch += *source++; 00613 } 00614 00615 ch -= offsetsFromUTF8[extraBytesToRead]; 00616 00617 if (target >= targetEnd) 00618 { 00619 source -= (extraBytesToRead + 1); /* Back up source pointer! */ 00620 result = targetExhausted; 00621 break; 00622 } 00623 00624 if (ch <= UNI_MAX_BMP) /* Target is a character <= 0xFFFF */ 00625 { 00626 /* UTF-16 surrogate values are illegal in UTF-32 */ 00627 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) 00628 { 00629 if (flags == strictConversion) 00630 { 00631 source -= (extraBytesToRead + 1); /* return to the illegal value itself */ 00632 result = sourceIllegal; 00633 break; 00634 } 00635 else 00636 { 00637 *target++ = UNI_REPLACEMENT_CHAR; 00638 } 00639 } 00640 else 00641 { 00642 *target++ = (t_UTF16) ch; /* normal case */ 00643 } 00644 } 00645 else if (ch > UNI_MAX_UTF16) 00646 { 00647 if (flags == strictConversion) 00648 { 00649 result = sourceIllegal; 00650 source -= (extraBytesToRead + 1); /* return to the start */ 00651 break; /* Bail out; shouldn't continue */ 00652 } 00653 else 00654 { 00655 *target++ = UNI_REPLACEMENT_CHAR; 00656 } 00657 } 00658 else 00659 { 00660 /* target is a character in range 0xFFFF - 0x10FFFF. */ 00661 if (target + 1 >= targetEnd) 00662 { 00663 source -= (extraBytesToRead + 1); /* Back up source pointer! */ 00664 result = targetExhausted; 00665 break; 00666 } 00667 00668 ch -= halfBase; 00669 *target++ = (t_UTF16) ( (ch >> halfShift) + UNI_SUR_HIGH_START); 00670 *target++ = (t_UTF16) ( (ch & halfMask) + UNI_SUR_LOW_START); 00671 } 00672 } 00673 00674 *sourceStart = source; 00675 *targetStart = target; 00676 return result; 00677 } 00678 00679 /* --------------------------------------------------------------------- */ 00680 00681 ConversionResult ConvertUTF32toUTF8 ( 00682 const t_UTF32 **sourceStart, const t_UTF32 *sourceEnd, 00683 t_UTF8 **targetStart, t_UTF8 *targetEnd, ConversionFlags flags) 00684 { 00685 ConversionResult result = conversionOK; 00686 const t_UTF32 *source = *sourceStart; 00687 t_UTF8 *target = *targetStart; 00688 00689 while (source < sourceEnd) 00690 { 00691 t_UTF32 ch; 00692 unsigned short bytesToWrite = 0; 00693 const t_UTF32 byteMask = 0xBF; 00694 const t_UTF32 byteMark = 0x80; 00695 ch = *source++; 00696 00697 if (flags == strictConversion ) 00698 { 00699 /* UTF-16 surrogate values are illegal in UTF-32 */ 00700 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) 00701 { 00702 --source; /* return to the illegal value itself */ 00703 result = sourceIllegal; 00704 break; 00705 } 00706 } 00707 00708 /* 00709 * Figure out how many bytes the result will require. Turn any 00710 * illegally large UTF32 things (> Plane 17) into replacement chars. 00711 */ 00712 if (ch < (t_UTF32) 0x80) 00713 { 00714 bytesToWrite = 1; 00715 } 00716 else if (ch < (t_UTF32) 0x800) 00717 { 00718 bytesToWrite = 2; 00719 } 00720 else if (ch < (t_UTF32) 0x10000) 00721 { 00722 bytesToWrite = 3; 00723 } 00724 else if (ch <= UNI_MAX_LEGAL_UTF32) 00725 { 00726 bytesToWrite = 4; 00727 } 00728 else 00729 { 00730 bytesToWrite = 3; 00731 ch = UNI_REPLACEMENT_CHAR; 00732 result = sourceIllegal; 00733 } 00734 00735 target += bytesToWrite; 00736 00737 if (target > targetEnd) 00738 { 00739 --source; /* Back up source pointer! */ 00740 target -= bytesToWrite; 00741 result = targetExhausted; 00742 break; 00743 } 00744 00745 switch (bytesToWrite) /* note: everything falls through. */ 00746 { 00747 case 4: 00748 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00749 ch >>= 6; 00750 case 3: 00751 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00752 ch >>= 6; 00753 case 2: 00754 *--target = (t_UTF8) ( (ch | byteMark) & byteMask); 00755 ch >>= 6; 00756 case 1: 00757 *--target = (t_UTF8) (ch | firstByteMark[bytesToWrite]); 00758 } 00759 00760 target += bytesToWrite; 00761 } 00762 00763 *sourceStart = source; 00764 *targetStart = target; 00765 return result; 00766 } 00767 00768 /* --------------------------------------------------------------------- */ 00769 00770 ConversionResult ConvertUTF8toUTF32 ( 00771 const t_UTF8 **sourceStart, const t_UTF8 *sourceEnd, 00772 t_UTF32 **targetStart, t_UTF32 *targetEnd, ConversionFlags flags) 00773 { 00774 ConversionResult result = conversionOK; 00775 const t_UTF8 *source = *sourceStart; 00776 t_UTF32 *target = *targetStart; 00777 00778 while (source < sourceEnd) 00779 { 00780 t_UTF32 ch = 0; 00781 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 00782 00783 if (source + extraBytesToRead >= sourceEnd) 00784 { 00785 result = sourceExhausted; 00786 break; 00787 } 00788 00789 /* Do this check whether lenient or strict */ 00790 if (! isLegalUTF8 (source, extraBytesToRead + 1) ) 00791 { 00792 result = sourceIllegal; 00793 break; 00794 } 00795 00796 /* 00797 * The cases all fall through. See "Note A" below. 00798 */ 00799 switch (extraBytesToRead) 00800 { 00801 case 5: 00802 ch += *source++; 00803 ch <<= 6; 00804 case 4: 00805 ch += *source++; 00806 ch <<= 6; 00807 case 3: 00808 ch += *source++; 00809 ch <<= 6; 00810 case 2: 00811 ch += *source++; 00812 ch <<= 6; 00813 case 1: 00814 ch += *source++; 00815 ch <<= 6; 00816 case 0: 00817 ch += *source++; 00818 } 00819 00820 ch -= offsetsFromUTF8[extraBytesToRead]; 00821 00822 if (target >= targetEnd) 00823 { 00824 source -= (extraBytesToRead + 1); /* Back up the source pointer! */ 00825 result = targetExhausted; 00826 break; 00827 } 00828 00829 if (ch <= UNI_MAX_LEGAL_UTF32) 00830 { 00831 /* 00832 * UTF-16 surrogate values are illegal in UTF-32, and anything 00833 * over Plane 17 (> 0x10FFFF) is illegal. 00834 */ 00835 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) 00836 { 00837 if (flags == strictConversion) 00838 { 00839 source -= (extraBytesToRead + 1); /* return to the illegal value itself */ 00840 result = sourceIllegal; 00841 break; 00842 } 00843 else 00844 { 00845 *target++ = UNI_REPLACEMENT_CHAR; 00846 } 00847 } 00848 else 00849 { 00850 *target++ = ch; 00851 } 00852 } 00853 else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 00854 { 00855 result = sourceIllegal; 00856 *target++ = UNI_REPLACEMENT_CHAR; 00857 } 00858 } 00859 00860 *sourceStart = source; 00861 *targetStart = target; 00862 return result; 00863 } 00864 00865 /* --------------------------------------------------------------------- 00866 00867 Note A. 00868 The fall-through switches in UTF-8 reading code save a 00869 temp variable, some decrements & conditionals. The switches 00870 are equivalent to the following loop: 00871 { 00872 int tmpBytesToRead = extraBytesToRead+1; 00873 do { 00874 ch += *source++; 00875 --tmpBytesToRead; 00876 if (tmpBytesToRead) ch <<= 6; 00877 } while (tmpBytesToRead > 0); 00878 } 00879 In UTF-8 writing code, the switches on "bytesToWrite" are 00880 similarly unrolled loops. 00881 00882 --------------------------------------------------------------------- */ 00883 00884 }