给你一个文件或一段二进制数据,在不知道其编码是 GB2312 还是 UTF-8 的情况下,怎样才能正确读取出文本呢?以下代码是用 AS3 写的,其他编程语言只需稍作改动即可使用。代码如下,可直接使用!(感谢 C++ BLOG 提供的判断 UTF-8 的方法:http://hi.baidu.com/xingyan126/item/4abec1c1c2143755bcef6956)
- /**
- * 从未知编码的二进制流中读取文本
- * @param ba
- * @param len 读取长度,默认为-1,则读取至文件尾
- * @return
- */
/**
 * Reads text from a byte stream whose encoding is either UTF-8 or GB2312.
 *
 * Detection order: a leading UTF-8 byte-order mark (EF BB BF) wins; otherwise
 * the bytes are scored by isUTF8(); if neither indicates UTF-8 the data is
 * decoded as GB2312.
 *
 * @param ba  Source bytes; reading starts at ba.position and advances it.
 * @param len Number of bytes to consume. Any negative value (default -1) or
 *            a value larger than ba.bytesAvailable means "read to the end".
 * @return The decoded string (the BOM, if present, is not included).
 */
public static function readString(ba:ByteArray, len:int = -1):String
{
    // Normalise the requested length to what is actually available.
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    // Probe for the UTF-8 BOM only when the requested window can hold one;
    // checking against len (not bytesAvailable) keeps len - 3 non-negative.
    if (len >= 3)
    {
        var b0:uint = ba.readUnsignedByte();
        var b1:uint = ba.readUnsignedByte();
        var b2:uint = ba.readUnsignedByte();
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF)
        {
            // The three BOM bytes are already consumed, so only len - 3 remain.
            return ba.readMultiByte(len - 3, 'utf-8');
        }
        // No BOM: rewind so the probed bytes are decoded as content.
        ba.position -= 3;
    }

    // No BOM: fall back to the statistical check, defaulting to GB2312.
    var encode:String = isUTF8(ba, len) ? 'utf-8' : 'gb2312';

    // The position was rewound above, so the full len bytes must be read here
    // (reading len - 3 would silently drop the tail of the text).
    return ba.readMultiByte(len, encode);
}
-
- /**
- * 判断文本是否是UTF8编码
- * @param ba
- * @param len 读取长度,默认为-1,则读取至文件尾
- * @return
- */
/**
 * Heuristically decides whether a run of bytes is UTF-8 encoded text.
 *
 * Every non-ASCII byte is classified as either part of a well-formed UTF-8
 * multi-byte sequence (2, 3 or 4 bytes) or not. The result is true when at
 * least 99% of the non-ASCII bytes are well-formed, or at least 96% are and
 * more than 30 such bytes were seen. Pure-ASCII (or empty) input returns
 * false because it carries no evidence either way.
 *
 * @param ba  Source bytes; inspected from ba.position. The position is left
 *            unchanged.
 * @param len Number of bytes to inspect. Any negative value (default -1) or
 *            a value larger than ba.bytesAvailable means "to the end".
 * @return true if the bytes look like UTF-8, false otherwise.
 */
public static function isUTF8(ba:ByteArray, len:int = -1):Boolean
{
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    var start:uint = ba.position;
    var end:uint = start + len;
    var goodbytes:int = 0;
    var asciibytes:int = 0;
    var lead:uint;
    var need:int;
    var k:int;
    var wellFormed:Boolean;

    // Indexed access reads unsigned bytes without moving ba.position.
    var i:uint = start;
    while (i < end)
    {
        lead = ba[i];

        if (lead < 0x80)
        {
            // Plain ASCII: counted separately so it does not dilute the score.
            asciibytes++;
            i++;
            continue;
        }

        // Number of continuation bytes implied by the lead byte.
        if (lead >= 0xC0 && lead <= 0xDF) need = 1;       // 2-byte sequence
        else if (lead >= 0xE0 && lead <= 0xEF) need = 2;  // 3-byte sequence
        else if (lead >= 0xF0 && lead <= 0xF4) need = 3;  // 4-byte sequence
        else need = 0;                                    // stray continuation / invalid lead

        wellFormed = need > 0 && i + need < end;
        for (k = 1; wellFormed && k <= need; k++)
        {
            // Continuation bytes have the form 10xxxxxx.
            if ((ba[i + k] & 0xC0) != 0x80) wellFormed = false;
        }

        if (wellFormed)
        {
            goodbytes += need + 1;
            i += need + 1;
        }
        else
        {
            // Malformed: skip only the lead byte and re-examine what follows.
            i++;
        }
    }

    if (asciibytes == len)
    {
        // Nothing but ASCII (or no data at all); also avoids dividing by zero.
        return false;
    }

    // Integer percentage of non-ASCII bytes that belong to valid sequences.
    var score:int = int(100 * goodbytes / (len - asciibytes));

    // Tolerate a few malformed sequences, but only with enough evidence.
    return score > 98 || (score > 95 && goodbytes > 30);
}
复制代码 作者:YoYo,原文地址:http://yoyo.play175.com/p/207.html
|