kowala's home: 一些 Unicode 及 Ansi 碼的討論

開這個話題目的，當然也是為了多了解一些編碼知識，為了解決如何在VC++中使用中文字碼問題。

先來看看Unicode及Ansi在記憶體中的排列方式(http://www.regexlab.com/zh/encoding.htm)

在 ASCII 阶段，单字节字符串使用一个字节存放一个字符（SBCS）。比如，"Bob123" 在内存中为：
42    6F    62    31    32    33    00
－    －    －    －    －    －    －
B    o    b    1    2    3    \0

在使用 ANSI 编码支持多种语言阶段，每个字符使用一个字节或多个字节来表示（MBCS），因此，这种方式存放的字符也被称作多字节字符。比如，"中文123" 在中文 Windows 95 内存中为7个字节，每个汉字占2个字节，每个英文和数字字符占1个字节：
D6 D0    CE C4    31    32    33    00
－－    －－    －    －    －    －
中    文    1    2    3    \0

在 UNICODE 被采用之后，计算机存放字符串时，改为存放每个字符在 UNICODE 字符集中的序号。目前计算机一般使用 2 个字节（16 位）来存放一个序号（DBCS），因此，这种方式存放的字符也被称作宽字节字符。比如，字符串 "中文123" 在 Windows 2000 下，内存中实际存放的是 5 个序号：
2D 4E    87 65    31 00    32 00    33 00    00 00          ← 在 x86 CPU 中，低字节在前
－－    －－    －－    －－    －－    －－
中    文    1    2    3    \0

一共占 10 个字节。

UTF-8的編碼形式(http://www.lihuasoft.net/article/show.php?id=2795)

UTF-8标准就是Unicode（ISO10646）标准的一种变形方式，
UTF的全称是：Unicode/UCS Transformation Format，其实有两种UTF，一种是UTF-8，一种是UTF-16，
不过UTF-16使用较少，其对应关系如下：
在Unicode中编码为 0000 - 007F 的 UTF-8 中编码形式为: 0xxxxxxx
在Unicode中编码为 0080 - 07FF 的 UTF-8 中编码形式为: 110xxxxx 10xxxxxx
在Unicode中编码为 0000 - 007F 的 UTF-8 中编码形式为: 1110xxxx 10xxxxxx 10xxxxxx

C++ 中相关实现方法 (http://www.regexlab.com/zh/encoding.htm)

声明一段字符串常量：
// ANSI string; the content is 7 bytes long (when encoded as GBK / code page 936,
// as in the memory dump shown earlier in this article)
char     sz[20] = "中文123";

// UNICODE string; the content is 5 wchar_t long (10 bytes where wchar_t is 16 bits,
// as on Windows)
wchar_t wsz[20] = L"\x4E2D\x6587\x0031\x0032\x0033";

UNICODE 字符串的 I/O 操作，字符与字节的转换操作：
// Select the current ANSI encoding at run time, Visual C++ locale-name format
setlocale(LC_ALL, ".936");

// Locale-name format used with GCC
setlocale(LC_ALL, "zh_CN.GBK");

// Visual C++ uses lowercase %s; the output is written to the file in the
// encoding selected by setlocale
// GCC uses uppercase %S
fwprintf(fp, L"%s\n", wsz);

// Convert the UNICODE string to bytes using the encoding selected by setlocale
wcstombs(sz, wsz, 20);
// Convert the byte string to a UNICODE string using the encoding selected by setlocale
mbstowcs(wsz, sz, 20);

在 Visual C++ 中，UNICODE 字符串常量有更简单的表示方法。如果源程序的编码与当前默认 ANSI 编码不符，则需要使用 #pragma setlocale，告诉编译器源程序使用的编码：
// If the encoding of the source file differs from the current default ANSI encoding,
// this line is required; at compile time it tells the compiler which encoding
// the source file uses
#pragma setlocale(".936")

// UNICODE string constant; the content is 10 bytes long
wchar_t wsz[20] = L"中文123";

以上需要注意 #pragma setlocale 与 setlocale(LC_ALL, "") 的作用是不同的，#pragma setlocale 在编译时起作用，setlocale() 在运行时起作用。

這裡有一些轉換函式 (http://stdsoft.blogbus.com/logs/56468290.html)

// Converts a string in the system ANSI code page (CP_ACP) to a wide (UTF-16)
// string.
//
// strin:   NUL-terminated ANSI text; conversion stops at the first NUL.
// returns: the converted wide string, or an empty string if the conversion
//          fails.
static wstring ANSI2Unicode(const string & strin){
    wstring strout;

    // First pass: query the required size in wchar_t units, including the
    // terminating NUL. MultiByteToWideChar reports failure by returning 0.
    const int dwNum = MultiByteToWideChar(CP_ACP, 0, strin.c_str(), -1, 0, 0);
    if (dwNum <= 0) {
        return strout;
    }

    // Let a wstring own the scratch buffer so it is released on every path,
    // including when an exception is thrown.
    wstring buffer(static_cast<wstring::size_type>(dwNum), L'\0');

    // Second pass: perform the conversion; a return value of 0 means failure.
    if (MultiByteToWideChar(CP_ACP, 0, strin.c_str(), -1, &buffer[0], dwNum) > 0) {
        // Copy up to (not including) the terminating NUL written by the API.
        strout = buffer.c_str();
    }
    return strout;
}

// Converts a wide (UTF-16) string to a UTF-8 encoded byte string.
//
// strin:   NUL-terminated wide text; conversion stops at the first NUL.
// returns: the UTF-8 bytes, or an empty string if the conversion fails.
static string Unicode2UTF8(const wstring & strin){
    string strout;

    // First pass: query the required size in bytes, including the terminating
    // NUL. WideCharToMultiByte reports failure by returning 0.
    const int dwNum = WideCharToMultiByte(CP_UTF8, 0, strin.c_str(), -1, 0, 0, 0, 0);
    if (dwNum <= 0) {
        return strout;
    }

    // Let a string own the scratch buffer so it is released on every path,
    // including when an exception is thrown.
    string buffer(static_cast<string::size_type>(dwNum), '\0');

    // Second pass: perform the conversion. For CP_UTF8 the default-character
    // arguments must both be null, hence the trailing 0, 0.
    if (WideCharToMultiByte(CP_UTF8, 0, strin.c_str(), -1, &buffer[0], dwNum, 0, 0) > 0) {
        // Copy up to (not including) the terminating NUL written by the API.
        strout = buffer.c_str();
    }
    return strout;
}

// Converts ANSI (system code page) text to UTF-8 by going through an
// intermediate wide-character string.
static string ANSI2UTF8(const string & strin){
    const wstring wide = ANSI2Unicode(strin);
    return Unicode2UTF8(wide);
}

// Converts a wide (UTF-16) string to the system ANSI code page (CP_ACP).
// Characters that cannot be represented in that code page are replaced
// with '?'.
//
// strin:   NUL-terminated wide text; conversion stops at the first NUL.
// returns: the ANSI bytes, or an empty string if the conversion fails.
static string Unicode2ANSI(const wstring & strin){
    string strout;

    // First pass: query the required size in bytes, including the terminating
    // NUL. WideCharToMultiByte reports failure by returning 0.
    const int dwNum = WideCharToMultiByte(CP_ACP, 0, strin.c_str(), -1, 0, 0, 0, 0);
    if (dwNum <= 0) {
        return strout;
    }

    // Let a string own the scratch buffer so it is released on every path,
    // including when an exception is thrown.
    string buffer(static_cast<string::size_type>(dwNum), '\0');

    // Receives TRUE from the API when at least one character had to be
    // replaced by the default character; the value is not inspected here.
    BOOL use_def_char = FALSE;

    // Second pass: perform the conversion; a return value of 0 means failure.
    if (WideCharToMultiByte(CP_ACP, 0, strin.c_str(), -1, &buffer[0], dwNum, "?", &use_def_char) > 0) {
        // Copy up to (not including) the terminating NUL written by the API.
        strout = buffer.c_str();
    }
    return strout;
}

// Converts a UTF-8 encoded byte string to a wide (UTF-16) string.
//
// strin:   NUL-terminated UTF-8 text; conversion stops at the first NUL.
// returns: the converted wide string, or an empty string if the conversion
//          fails.
static wstring UTF82Unicode(const string & strin) {
    wstring strout;

    // First pass: query the required size in wchar_t units, including the
    // terminating NUL. MultiByteToWideChar reports failure by returning 0.
    const int dwNum = MultiByteToWideChar(CP_UTF8, 0, strin.c_str(), -1, 0, 0);
    if (dwNum <= 0) {
        return strout;
    }

    // Let a wstring own the scratch buffer so it is released on every path,
    // including when an exception is thrown.
    wstring buffer(static_cast<wstring::size_type>(dwNum), L'\0');

    // Second pass: perform the conversion; a return value of 0 means failure.
    if (MultiByteToWideChar(CP_UTF8, 0, strin.c_str(), -1, &buffer[0], dwNum) > 0) {
        // Copy up to (not including) the terminating NUL written by the API.
        strout = buffer.c_str();
    }
    return strout;
}

// Converts UTF-8 text to ANSI (system code page) by going through an
// intermediate wide-character string.
static string UTF82ANSI(const string & strin) {
    const wstring wide = UTF82Unicode(strin);
    return Unicode2ANSI(wide);
}

這位作者的網頁也不錯
http://shukaiyang.myweb.hinet.net/courses/cpp/unicode2ascii.zhtw.htm
http://shukaiyang.myweb.hinet.net/courses/cpp/unicodestr.zhtw.htm

kowala's home

2011-06-14

一些 Unicode 及 Ansi 碼的討論

沒有留言:

張貼留言