0
点赞
收藏
分享

微信扫一扫

C++判断字符串编码格式(ANSI\UTF16_LE\UTF16_BE\UTF8\UTF8_BOM)

蓝哆啦呀 2022-09-16 阅读 62

/// Text encodings distinguished by the detection helpers below.
/// Kept as a plain (unscoped) enum because callers use both the bare
/// enumerator names (`ANSI`) and the qualified form (`Encode::ANSI`).
enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

/// @brief Classifies a BOM-less byte buffer as UTF-8 or "ANSI".
///
/// The buffer is reported as UTF-8 only if it is entirely well-formed UTF-8
/// as defined by RFC 3629 AND contains at least one multi-byte sequence.
/// Everything else (pure 7-bit ASCII, empty input, a null pointer, truncated
/// or malformed sequences) is reported as ANSI.
///
/// Rejected as malformed:
///   - stray continuation bytes (0x80..0xBF) in lead position;
///   - lead bytes 0xC0/0xC1 and other overlong encodings;
///   - UTF-16 surrogates (U+D800..U+DFFF);
///   - code points above U+10FFFF, including the obsolete 5/6-byte forms
///     and the bytes 0xFE/0xFF, which never occur in UTF-8.
///
/// @param data  Pointer to the bytes to inspect (may be null).
/// @param size  Number of bytes available at @p data.
/// @return Encode::UTF8 or Encode::ANSI; never any other enumerator.
static inline
Encode IsUtf8Data(const uint8_t* data, size_t size)
{
    if (data == nullptr)
    {
        return Encode::ANSI;
    }

    bool sawMultiByte = false;
    size_t pos = 0;

    while (pos < size)
    {
        const uint8_t lead = data[pos];

        // 7-bit ASCII is valid in both ANSI and UTF-8; it decides nothing.
        if (lead < 0x80)
        {
            ++pos;
            continue;
        }

        // Number of continuation bytes that must follow the lead byte, and
        // the permitted range of the FIRST continuation byte. That range is
        // narrower than 0x80..0xBF for a few lead bytes, which is how
        // overlong forms, surrogates and values > U+10FFFF are excluded.
        size_t trailing = 0;
        uint8_t firstLow = 0x80;
        uint8_t firstHigh = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
            {
                firstLow = 0xA0;   // below this would be an overlong 3-byte form
            }
            else if (lead == 0xED)
            {
                firstHigh = 0x9F;  // above this would encode a surrogate
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
            {
                firstLow = 0x90;   // below this would be an overlong 4-byte form
            }
            else if (lead == 0xF4)
            {
                firstHigh = 0x8F;  // above this would exceed U+10FFFF
            }
        }
        else
        {
            // 0x80..0xC1 and 0xF5..0xFF can never start a UTF-8 sequence.
            return Encode::ANSI;
        }

        // pos < size holds here, so the subtraction cannot wrap.
        if (size - pos - 1 < trailing)
        {
            return Encode::ANSI;   // sequence is cut off by the end of the buffer
        }

        const uint8_t first = data[pos + 1];
        if (first < firstLow || first > firstHigh)
        {
            return Encode::ANSI;
        }

        for (size_t k = 2; k <= trailing; ++k)
        {
            if ((data[pos + k] & 0xC0) != 0x80)
            {
                return Encode::ANSI;
            }
        }

        sawMultiByte = true;
        pos += trailing + 1;
    }

    return sawMultiByte ? Encode::UTF8 : Encode::ANSI;
}

/// @brief Detects the encoding of a byte buffer from its byte-order mark,
///        falling back to content inspection when no BOM is present.
///
/// Checks, in order:
///   1. FF FE     -> Encode::UTF16_LE
///   2. FE FF     -> Encode::UTF16_BE
///   3. EF BB BF  -> Encode::UTF8_BOM
///   4. otherwise -> whatever IsUtf8Data() reports (UTF8 or ANSI)
///
/// A buffer that consists of nothing but the BOM is still recognised, so the
/// length checks are `>=` the BOM length rather than strictly greater.
///
/// NOTE: the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM and is
/// therefore reported as UTF16_LE; Encode has no UTF-32 enumerator.
///
/// @param data  Pointer to the bytes to inspect (may be null).
/// @param size  Number of bytes available at @p data.
/// @return The detected encoding; Encode::ANSI for a null pointer.
static inline
Encode DetectEncode(const uint8_t* data, size_t size)
{
    if (data == nullptr)
    {
        return Encode::ANSI;
    }

    if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE)
    {
        return Encode::UTF16_LE;
    }

    if (size >= 2 && data[0] == 0xFE && data[1] == 0xFF)
    {
        return Encode::UTF16_BE;
    }

    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        return Encode::UTF8_BOM;
    }

    // No BOM: decide between UTF-8 and ANSI by looking at the content.
    return IsUtf8Data(data, size);
}

调用例子:

// Usage example: read a file as raw bytes, detect its encoding, then strip
// any BOM and convert the content with the project's StringConvertUtils.
// NOTE(review): `s` is presumably a std::string-like byte buffer and WToA
// presumably converts wide text to the ANSI code page - confirm against
// FILE_READER / StringConvertUtils.
auto s = FILE_READER(sv.begin()->c_str(), std::ios::binary);

switch (DetectEncode(reinterpret_cast<const uint8_t*>(s.data()), s.size()))
{
case ANSI:
    // Already in the target encoding; nothing to do.
    break;

case UTF16_LE:
    // Drop the 2-byte BOM, then reinterpret the rest as wide characters.
    // NOTE(review): assumes a 2-byte wchar_t and a little-endian host - confirm.
    s.erase(s.begin(), s.begin() + 2);
    s = StringConvertUtils::Instance()->WToA(std::wstring(reinterpret_cast<const wchar_t*>(s.data()), s.length() / sizeof(wchar_t)));
    break;

case UTF16_BE:
    // Drop the 2-byte BOM.
    s.erase(s.begin(), s.begin() + 2);
    // Big-endian code units must be byte-swapped before they can be read as
    // wchar_t on the little-endian host that the UTF16_LE branch assumes;
    // without the swap every character decodes to the wrong code unit.
    // A trailing odd byte, if any, is left alone and ignored by the
    // length division below.
    for (size_t i = 0; i + 1 < s.size(); i += 2)
    {
        const auto tmp = s[i];
        s[i] = s[i + 1];
        s[i + 1] = tmp;
    }
    s = StringConvertUtils::Instance()->WToA(std::wstring(reinterpret_cast<const wchar_t*>(s.data()), s.length() / sizeof(wchar_t)));
    break;

case UTF8_BOM:
    // Drop the 3-byte BOM, then convert UTF-8 -> wide -> ANSI.
    s.erase(s.begin(), s.begin() + 3);
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;

case UTF8:
    // No BOM to strip; convert UTF-8 -> wide -> ANSI.
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;

default:
    break;
}

举报

相关推荐

0 条评论