当前位置:

首页 > Python基础教程 >

C#教程之如何检测或判断一个文件或字节流（无(3)

// NOTE(review): the statements up to the first method-closing brace are the tail of a
// method whose beginning lies outside this excerpt; they are reproduced unchanged.
                if (le16 > 0 && be16 > 0)
                {
                    return Encoding.None;
                }
            }

            if (le16 > 0)
            {
                if (le16 == le32 && buffer.Length % 4 == 0)
                {
                    return Encoding.Utf32NoBom;
                }
                return Encoding.UnicodeNoBom;
            }
            else if (be16 > 0)
            {
                return Encoding.BigEndianUnicodeNoBom;
            }
            else if (buffer.Length % 4 == 0 && zeroCount >= buffer.Length / 4)
            {
                return Encoding.Utf32NoBom;
            }
            return Encoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of leading bytes of <paramref name="buffer"/> to scan.</param>
        /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is 0; otherwise <c>false</c>.</returns>
        private static bool ContainsZero(byte[] buffer, int size)
        {
            for (int i = 0; i < size; i++)
            {
                if (buffer[i] == 0)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Checks if a buffer contains text that looks like UTF-16. This is done based
        /// on the use of nulls, which in ASCII/script-like text can be useful to identify.
        /// The guess is made from the proportion of zero bytes at even and odd offsets.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of leading bytes of <paramref name="buffer"/> to scan.</param>
        /// <returns>
        /// Encoding.UnicodeNoBom when zeros appear at odd offsets and fewer than 10% of the even
        /// offsets are zero, Encoding.BigEndianUnicodeNoBom for the mirrored case, otherwise Encoding.None.
        /// </returns>
        private Encoding CheckByZeroNumPercent(byte[] buffer, int size)
        {
            // Zero bytes found at odd offsets (1, 3, 5, ...)
            int oddZeroCount = 0;
            // Zero bytes found at even offsets (0, 2, 4, ...)
            int evenZeroCount = 0;

            // Single pass; the parity of the offset decides which counter a zero byte goes to.
            for (int i = 0; i < size; i++)
            {
                if (buffer[i] != 0)
                {
                    continue;
                }

                if ((i & 1) == 0)
                {
                    evenZeroCount++;
                }
                else
                {
                    oddZeroCount++;
                }
            }

            // Each parity covers about half of the buffer, hence the factor 2.
            // For size == 0 both values are NaN, every comparison below is false and None is returned.
            double evenZeroPercent = evenZeroCount * 2.0 / size;
            double oddZeroPercent = oddZeroCount * 2.0 / size;

            // Lots of odd nulls, low number of even nulls (the author notes this condition was modified)
            if (evenZeroPercent < 0.1 && oddZeroPercent > 0)
            {
                return Encoding.UnicodeNoBom;
            }

            // Lots of even nulls, low number of odd nulls (the author notes this condition was also modified)
            if (oddZeroPercent < 0.1 && evenZeroPercent > 0)
            {
                return Encoding.BigEndianUnicodeNoBom;
            }

            // Don't know
            return Encoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains valid UTF-8.
        /// Detection is based on the byte ranges that UTF-8 sequences may use.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of leading bytes of <paramref name="buffer"/> to scan.</param>
        /// <returns>
        /// Encoding.None (invalid UTF-8 or a zero byte was found), Encoding.Utf8Nobom (valid UTF-8 with
        /// at least one multi-byte sequence) or Encoding.Ascii (all data in the 0-127 range).
        /// </returns>
        private Encoding CheckUtf8(byte[] buffer, int size)
        {
            // UTF8 Valid sequences
            // 0xxxxxxx  ASCII
            // 110xxxxx 10xxxxxx  2-byte
            // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
            //
            // Width in UTF8
            // Decimal      Width
            // 0-127        1 byte
            // 194-223      2 bytes
            // 224-239      3 bytes
            // 240-244      4 bytes
            //
            // Subsequent chars are in the range 128-191
            bool onlySawAsciiRange = true;
            int pos = 0;

            while (pos < size)
            {
                byte ch = buffer[pos++];

                if (ch == 0)
                {
                    return Encoding.None;
                }

                int moreChars;
                if (ch <= 127)
                {
                    // 1 byte
                    moreChars = 0;
                }
                else if (ch >= 194 && ch <= 223)
                {
                    // 2 Byte
                    moreChars = 1;
                }
                else if (ch >= 224 && ch <= 239)
                {
                    // 3 Byte
                    moreChars = 2;
                }
                else if (ch >= 240 && ch <= 244)
                {
                    // 4 Byte
                    moreChars = 3;
                }
                else
                {
                    return Encoding.None; // Not utf8
                }

                if (moreChars > 0)
                {
                    // A lead byte is already outside 0-127. Record that here rather than in the
                    // continuation loop, so a buffer that ends right after a lead byte is not
                    // reported as pure ASCII.
                    onlySawAsciiRange = false;
                }

                // Check secondary chars are in range if we are expecting any.
                // A sequence cut off by the end of the buffer is tolerated.
                while (moreChars > 0 && pos < size)
                {
                    ch = buffer[pos++];
                    if (ch < 128 || ch > 191)
                    {
                        return Encoding.None; // Not utf8
                    }
                    --moreChars;
                }
            }

            // If we get to here then only valid UTF-8 sequences have been processed.
            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide).
            return onlySawAsciiRange ? Encoding.Ascii : Encoding.Utf8Nobom;
        }

        /// <summary>
        /// Determines whether the buffer looks like a Chinese double-byte encoding (GB2312, GBK, Big5)
        /// and stores the answer in IsChinese. Every consecutive byte pair must fall inside one of the
        /// accepted lead/trail ranges; a trailing unpaired byte is ignored.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of leading bytes of <paramref name="buffer"/> to scan.</param>
        private void CheckChinese(byte[] buffer, int size)
        {
            IsChinese = false;
            if (size < 2)
            {
                return;
            }

            // Reduce size by 1 so we don't need to worry about bounds checking for pairs of bytes
            size--;
            int pos = 0;

            while (pos < size)
            {
                // GB2312: lead 0xB0-0xF7 (176-247), trail 0xA0-0xFE (160-254)
                // GBK:    lead 0x81-0xFE (129-254), trail 0x40-0xFE (64-254)
                // Big5:   lead 0x81-0xFE (129-254), trail 0x40-0x7E (64-126) or 0xA1-0xFE (161-254)
                byte ch1 = buffer[pos++];
                byte ch2 = buffer[pos++];

                bool isCN = (ch1 >= 176 && ch1 <= 247 && ch2 >= 160 && ch2 <= 254)
                    || (ch1 >= 129 && ch1 <= 254 && ch2 >= 64 && ch2 <= 254)
                    // Upper bound added: 0xFF is not a valid lead byte in any of the three encodings.
                    || (ch1 >= 129 && ch1 <= 254 && ((ch2 >= 64 && ch2 <= 126) || (ch2 >= 161 && ch2 <= 254)));

                if (!isCN)
                {
                    return;
                }
            }

            IsChinese = true;
        }
    }
}

后续更新地址：https://github.com/cyq1162/cyqdata/blob/master/Tool/IOHelper.cs

总结：

1、考虑到UTF7已经过时了，所以直接无视了。

2、对于纯中文情况，UTF16下是BE还是LE，暂时没有想到好的检测方法，所以默认返回了常用的LE，即Unicode。

3、其它一切都安好，全国公开的C#版本，应该就此一份了。

栏目列表

首页 > Python基础教程 >

C#教程之如何检测或判断一个文件或字节流（无(3)

总结：