// nux-1.16.0 — NUTF.cpp (UTF-8 / UTF-16 conversion helpers)
00001 /* 00002 * Copyright 2010 Inalogicยฎ Inc. 00003 * 00004 * This program is free software: you can redistribute it and/or modify it 00005 * under the terms of the GNU Lesser General Public License, as 00006 * published by the Free Software Foundation; either version 2.1 or 3.0 00007 * of the License. 00008 * 00009 * This program is distributed in the hope that it will be useful, but 00010 * WITHOUT ANY WARRANTY; without even the implied warranties of 00011 * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR 00012 * PURPOSE. See the applicable version of the GNU Lesser General Public 00013 * License for more details. 00014 * 00015 * You should have received a copy of both the GNU Lesser General Public 00016 * License along with this program. If not, see <http://www.gnu.org/licenses/> 00017 * 00018 * Authored by: Jay Taoko <jaytaoko@inalogic.com> 00019 * 00020 */ 00021 00022 00023 #include "NuxCore.h" 00024 #include "NUTF.h" 00025 00026 namespace nux 00027 { 00028 00029 NUTF8::NUTF8 (const UNICHAR *Source) 00030 { 00031 Convert (Source); 00032 } 00033 00034 NUTF8::NUTF8 (const std::wstring &Source) 00035 { 00036 Convert (NUX_REINTERPRET_CAST (UNICHAR *, NUX_CONST_CAST (wchar_t *, Source.c_str() ) ) ); 00037 } 00038 00039 void NUTF8::Convert (const UNICHAR *Source) 00040 { 00041 int NumBytes = 0; 00042 // *6 each UTF16 char can translate to up to 6 bytes in UTF8 00043 // +1 for NULL char 00044 size_t Size = wcslen ( (wchar_t *) Source) * 6 + 1; 00045 utf8 = new char[Size]; 00046 memset (utf8, 0, Size); 00047 00048 unsigned char TwoBytes[2]; 00049 TwoBytes[0] = '\0'; 00050 TwoBytes[1] = '\0'; 00051 00052 utf8[0] = '\0'; 00053 00054 // U-00000000 U-0000007F: 0xxxxxxx 00055 // U-00000080 U-000007FF: 110xxxxx 10xxxxxx 00056 // U-00000800 U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 00057 // U-00010000 U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00058 // U-00200000 U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00059 // U-04000000 U-7FFFFFFF: 
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00060 // The original specification of UTF-8 allowed for sequences of up to six bytes covering numbers up to 31 bits 00061 // (the original limit of the universal character set). However, UTF-8 was restricted by RFC 3629 to use only 00062 // the area covered by the formal Unicode definition, U+0000 to U+10FFFF, in November 2003. So UTF-8 code point is at most 4 bytes. 00063 00064 for (size_t n = 0; Source[n] != 0; n++) 00065 { 00066 if (Source[n] <= 0x7F) 00067 { 00068 TwoBytes[0] = (char) Source[n]; 00069 STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]); 00070 } 00071 else 00072 { 00073 // 11 valid bits 2 bytes 00074 if (Source[n] <= 0x7FF) 00075 { 00076 // Extract the 5 highest bits 00077 TwoBytes[0] = (char) (0xC0 + (Source[n] >> 6) ); 00078 NumBytes = 2; 00079 } 00080 // 16 valid bits 3 bytes 00081 else if (Source[n] <= 0xFFFF) 00082 { 00083 // Extract the highest 4 bits 00084 TwoBytes[0] = (char) (0xE0 + (Source[n] >> 12) ); 00085 NumBytes = 3; 00086 } 00087 // Unichar is only 16 bits. Do no continue because (Source[n] >> 18) does not make sense. 
00088 // 21 valid bits 4 bytes 00089 else if (Source[n] <= 0x1FFFFF) 00090 { 00091 // Extract the highest 3 bits 00092 TwoBytes[0] = (char) (0xF0 + (Source[n] >> 18) ); 00093 NumBytes = 4; 00094 } 00095 // Split a 26 bit character into 5 bytes 00096 else if (Source[n] <= 0x3FFFFFF) 00097 { 00098 // Extract the highest 2 bits 00099 TwoBytes[0] = (char) (0xF8 + (Source[n] >> 24) ); 00100 NumBytes = 5; 00101 } 00102 // Split a 31 bit character into 6 bytes 00103 else if (Source[n] <= 0x7FFFFFFF) 00104 { 00105 // Extract the highest bit 00106 TwoBytes[0] = (char) (0xFC + (Source[n] >> 30) ); 00107 NumBytes = 6; 00108 } 00109 00110 STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]); 00111 00112 // Extract the remaining bits - 6 bits at a time 00113 for (int i = 1, shift = (NumBytes - 2) * 6; shift >= 0; i++, shift -= 6) 00114 { 00115 TwoBytes[0] = (char) (0x80 + ( (Source[n] >> shift) & 0x3F) ); 00116 STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]); 00117 } 00118 } 00119 } 00120 } 00121 00122 // void NUTF8::Convert(const t_UTF32* Source) 00123 // { 00124 // int NumBytes = 0; 00125 // 00126 // int Size = 0; 00127 // while(Source[Size] != 0) 00128 // { 00129 // ++Size; 00130 // } 00131 // // *6: each UTF16 char can translate to up to 6 bytes in UTF8 00132 // // +1: for NULL char 00133 // Size = Size * 6 + 1; 00134 // utf8 = new char[Size*sizeof(t_UTF32)]; 00135 // memset(utf8, 0, Size*sizeof(t_UTF32)); 00136 // 00137 // unsigned char TwoBytes[2]; 00138 // TwoBytes[0] = '\0'; TwoBytes[1] = '\0'; 00139 // 00140 // utf8[0] = '\0'; 00141 // 00142 // // U-00000000 U-0000007F: 0xxxxxxx 00143 // // U-00000080 U-000007FF: 110xxxxx 10xxxxxx 00144 // // U-00000800 U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 00145 // // U-00010000 U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00146 // // U-00200000 U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00147 // // U-04000000 U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00148 // // The original specification of 
UTF-8 allowed for sequences of up to six bytes covering numbers up to 31 bits 00149 // // (the original limit of the universal character set). However, UTF-8 was restricted by RFC 3629 to use only 00150 // // the area covered by the formal Unicode definition, U+0000 to U+10FFFF, in November 2003. So UTF-8 code point is at most 4 bytes. 00151 // 00152 // for(size_t n = 0; Source[n] != 0; n++) 00153 // { 00154 // if (Source[n] <= 0x7F) 00155 // { 00156 // TwoBytes[0] = (char)Source[n]; 00157 // STRCAT_S(utf8, Size, (const char*)&TwoBytes[0]); 00158 // } 00159 // else 00160 // { 00161 // // 11 valid bits 2 bytes 00162 // if (Source[n] <= 0x7FF) 00163 // { 00164 // // Extract the 5 highest bits 00165 // TwoBytes[0] = (char)(0xC0 + (Source[n] >> 6)); 00166 // NumBytes = 2; 00167 // } 00168 // // 16 valid bits 3 bytes 00169 // else if (Source[n] <= 0xFFFF) 00170 // { 00171 // // Extract the highest 4 bits 00172 // TwoBytes[0] = (char)(0xE0 + (Source[n] >> 12)); 00173 // NumBytes = 3; 00174 // } 00175 // // 21 valid bits 4 bytes 00176 // else if (Source[n] <= 0x1FFFFF) 00177 // { 00178 // // Extract the highest 3 bits 00179 // TwoBytes[0] = (char)(0xF0 + (Source[n] >> 18)); 00180 // NumBytes = 4; 00181 // } 00182 // // Split a 26 bit character into 5 bytes 00183 // else if (Source[n] <= 0x3FFFFFF) 00184 // { 00185 // // Extract the highest 2 bits 00186 // TwoBytes[0] = (char)(0xF8 + (Source[n] >> 24)); 00187 // NumBytes = 5; 00188 // } 00189 // // Split a 31 bit character into 6 bytes 00190 // else if (Source[n] <= 0x7FFFFFFF) 00191 // { 00192 // // Extract the highest bit 00193 // TwoBytes[0] = (char)(0xFC + (Source[n] >> 30)); 00194 // NumBytes = 6; 00195 // } 00196 // 00197 // STRCAT_S(utf8, Size, (const char*)&TwoBytes[0]); 00198 // 00199 // // Extract the remaining bits - 6 bits at a time 00200 // for(int i = 1, shift = (NumBytes-2)*6; shift >= 0; i++, shift -= 6) 00201 // { 00202 // TwoBytes[0] = (char)(0x80 + ((Source[n] >> shift) & 0x3F)); 00203 // STRCAT_S(utf8, 
Size, (const char*)&TwoBytes[0]); 00204 // } 00205 // } 00206 // } 00207 // } 00208 00209 NUTF8::~NUTF8() 00210 { 00211 delete [] utf8; 00212 } 00213 00214 NUTF8::operator const char* () 00215 { 00216 return utf8; 00217 } 00218 00220 // Convert each unicode character in the source to UTF-8 00221 00222 NUTF16::NUTF16 (const char *Source) 00223 { 00224 Convert (Source); 00225 } 00226 00227 NUTF16::NUTF16 (const std::string &Source) 00228 { 00229 Convert (Source.c_str() ); 00230 } 00231 00232 void NUTF16::Convert (const char *Source) 00233 { 00234 // U-00000000 U-0000007F: 0xxxxxxx 00235 // U-00000080 U-000007FF: 110xxxxx 10xxxxxx 00236 // U-00000800 U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 00237 // U-00010000 U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00238 // U-00200000 U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00239 // U-04000000 U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00240 00241 unsigned char MSB; 00242 int temp = 0; 00243 int numbytes = 0; // Number of bytes used to represent the unicode char 00244 int pos = 0; 00245 00246 size_t len = strlen (Source) + 1; // +1 for NULL char 00247 unicode = new UNICHAR[len*6]; 00248 00249 // Loop through the characters in the string and decode them 00250 for (size_t n = 0; n < len; ++n) 00251 { 00252 // Find the hexadecimal number following the equals sign 00253 MSB = Source[n]; 00254 00255 if (MSB <= 0x7F) 00256 { 00257 unicode[pos++] = (UNICHAR) MSB; 00258 } 00259 else 00260 { 00261 // 2 bytes 00262 if (MSB >= 0xC0 && MSB <= 0xDF) 00263 { 00264 temp = (MSB - 0xC0) << 6; 00265 numbytes = 2; 00266 } 00267 // 3 bytes 00268 else if (MSB >= 0xE0 && MSB <= 0xEF) 00269 { 00270 temp = (MSB - 0xE0) << 12; 00271 numbytes = 3; 00272 } 00273 // 4 bytes 00274 else if (MSB >= 0xF0 && MSB <= 0xF7) 00275 { 00276 temp = (MSB - 0xF0) << 18; 00277 numbytes = 4; 00278 } 00279 // 5 bytes 00280 else if (MSB >= 0xF8 && MSB <= 0xFB) 00281 { 00282 temp = (MSB - 0xF8) << 24; 00283 numbytes = 5; 00284 } 00285 
// 6 bytes 00286 else if (MSB >= 0xFC && MSB <= 0xFD) 00287 { 00288 temp = (MSB - 0xFC) << 30; 00289 numbytes = 6; 00290 } 00291 00292 // Loop through the remaining hexadecimal numbers representing the next unicode character 00293 for (int i = 0, shift = (numbytes - 2) * 6; shift >= 0; i++, shift -= 6) 00294 { 00295 int nVal = ( ( (unsigned char) Source[n+1+i]) - 0x80 ) << shift; 00296 temp += nVal; 00297 } 00298 00299 // Add the unicode character to the final string 00300 unicode[pos++] = (UNICHAR) temp; 00301 00302 // Move the character index in the source to the next unicode character 00303 n += (numbytes - 1); 00304 } 00305 } 00306 } 00307 00308 NUTF16::~NUTF16() 00309 { 00310 delete [] unicode; 00311 } 00312 00313 NUTF16::operator const UNICHAR* () 00314 { 00315 return unicode; 00316 } 00317 00318 }