nux-1.16.0
|
00001 /* 00002 * Copyright 2010 Inalogic® Inc. 00003 * 00004 * This program is free software: you can redistribute it and/or modify it 00005 * under the terms of the GNU Lesser General Public License, as 00006 * published by the Free Software Foundation; either version 2.1 or 3.0 00007 * of the License. 00008 * 00009 * This program is distributed in the hope that it will be useful, but 00010 * WITHOUT ANY WARRANTY; without even the implied warranties of 00011 * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR 00012 * PURPOSE. See the applicable version of the GNU Lesser General Public 00013 * License for more details. 00014 * 00015 * You should have received a copy of both the GNU Lesser General Public 00016 * License along with this program. If not, see <http://www.gnu.org/licenses/> 00017 * 00018 * Authored by: Jay Taoko <jaytaoko@inalogic.com> 00019 * 00020 */ 00021 00022 00023 #ifndef NUTF_H 00024 #define NUTF_H 00025 00026 // http://en.wikipedia.org/wiki/UTF-16 00027 00028 // In computing, UTF-16 (16-bit Unicode Transformation Format) is a variable-length character encoding 00029 // for Unicode, capable of encoding the entire Unicode repertoire. The encoding form maps code points 00030 // (characters) into a sequence of 16-bit words, called code units. For characters in the Basic 00031 // Multilingual Plane (BMP) the resulting encoding is a single 16-bit word. For characters in the other 00032 // planes, the encoding will result in a pair of 16-bit words, together called a surrogate pair. All possible 00033 // code points from U+0000 through U+10FFFF, except for the surrogate code points U+D800–U+DFFF 00034 // (which are not characters), are uniquely mapped by UTF-16 regardless of the code point's current or 00035 // future character assignment or use. 00036 // 00037 // As many uses in computing require units of bytes (octets) there are three related encoding schemes 00038 // which map to octet sequences instead of words: namely UTF-16, UTF-16BE, and UTF-16LE. They 00039 // differ only in the byte order chosen to represent each 16-bit unit and whether they make use of a 00040 // Byte Order Mark. All of the schemes will result in either a 2 or 4-byte sequence for any given character. 00041 // 00042 // UTF-16 is officially defined in Annex Q of the international standard ISO/IEC 10646-1. It is also 00043 // described in The Unicode Standard version 3.0 and higher, as well as in the IETF's RFC 2781. 00044 // 00045 // UCS-2 (2-byte Universal Character Set) is an obsolete character encoding which is a predecessor 00046 // to UTF-16. The UCS-2 encoding form is nearly identical to that of UTF-16, except that it does not 00047 // support surrogate pairs and therefore can only encode characters in the BMP range U+0000 through 00048 // U+FFFF. As a consequence it is a fixed-length encoding that always encodes characters into a 00049 // single 16-bit value. As with UTF-16, there are three related encoding schemes (UCS-2, UCS-2BE, UCS-2LE) 00050 // that map characters to a specific byte sequence. 00051 // 00052 // Because of the technical similarities and upwards compatibility from UCS-2 to UTF-16, the two 00053 // encodings are often erroneously conflated and used as if interchangeable, so that strings encoded 00054 // in UTF-16 are sometimes misidentified as being encoded in UCS-2. 00055 00056 namespace nux 00057 { 00058 00060 class NUTF8 00061 { 00062 // UTF-8 encoded characters may theoretically be up to six bytes long, however 16-bit BMP characters are only up to three bytes long. 00063 public: 00064 explicit NUTF8 (const UNICHAR *Source); 00065 explicit NUTF8 (const std::wstring &Source); 00066 ~NUTF8(); 00067 00068 operator const char* (); 00069 00070 private: 00071 void Convert (const UNICHAR *); 00072 //void Convert(const t_UTF32*); 00073 char *utf8; 00074 00075 }; 00076 00078 class NUTF16 00079 { 00080 public: 00081 explicit NUTF16 (const char *Source); 00082 explicit NUTF16 (const std::string &Source); 00083 ~NUTF16(); 00084 00085 operator const UNICHAR* (); 00086 00087 private: 00088 void Convert (const char *); 00089 UNICHAR *unicode; 00090 00091 }; 00092 00093 } 00094 00095 #endif // NUTF_H