主题:哪位可以帮我把这个转成VB的?(有C++和Delphi源码)
见:http://blog.csdn.net/jh_zzz/archive/2007/05/31/1632776.aspx
Delphi 的版本:
// IsTextUTF8
//
// Heuristic check of whether a byte buffer looks like UTF-8 encoded text.
//
// Parameters:
//   lpstrInputStream - pointer to the first byte of the buffer to inspect.
//   iLen             - number of bytes to inspect.
//
// Returns True only when every multi-byte sequence in the buffer is
// structurally well formed (a lead byte followed by the right number of
// 10xxxxxx continuation bytes) AND at least one byte has its high bit set.
// Pure 7-bit ASCII deliberately returns False so that the caller falls back
// to code-page conversion. A nil pointer or a non-positive length also
// returns False.
//
// Only the byte structure is checked; overlong forms and surrogate code
// points are not rejected.
function IsTextUTF8(lpstrInputStream : PChar; iLen : Integer) : Boolean;
var
  i : Integer;
  cOctets : DWORD;      // continuation octets still expected for the current character
  curByte : UCHAR;      // byte under inspection (shifted left while counting lead bits)
  bAllAscii : Boolean;  // stays True while no byte with the high bit set was seen
begin
  Result := False;
  if (lpstrInputStream = nil) or (iLen <= 0) then
    exit;

  cOctets := 0;
  bAllAscii := True;
  for i := 0 to iLen - 1 do
  begin
    // Index through PAnsiChar so each step is exactly one byte, even on
    // compilers where PChar is a wide-character pointer.
    curByte := Ord(PAnsiChar(lpstrInputStream)[i]);
    if (curByte and $80) <> 0 then
      bAllAscii := False;

    if cOctets = 0 then
    begin
      // 7-bit ASCII after 7-bit ASCII is fine; only a high-bit byte starts
      // a multi-byte sequence.
      if curByte >= $80 then
      begin
        // The number of leading 1 bits is the total number of octets in
        // this encoded character. Mask to 8 bits after each shift: the
        // original "chr * 2" overflowed the byte and raised a range-check
        // error when range checking was enabled.
        repeat
          curByte := (curByte shl 1) and $FF;
          Inc(cOctets);
        until (curByte and $80) = 0;
        Dec(cOctets); // the count included the lead byte itself

        if cOctets = 0 then
          exit; // 10xxxxxx cannot start a character; must be 11xxxxxx

        if cOctets > 5 then
          exit; // $FE and $FF are never valid UTF-8 lead bytes
      end;
    end
    else
    begin
      // Non-leading bytes must look like 10xxxxxx.
      if (curByte and $C0) <> $80 then
        exit;
      Dec(cOctets); // processed another octet of the current character
    end;
  end;

  // End of text: a sequence that is still open means the data was truncated.
  if cOctets > 0 then
    exit;

  // All-ASCII is reported as "not UTF-8" so the caller uses code pages.
  if bAllAscii then
    exit;

  Result := True;
end;
下面的是 C++ 原作:
/* IsTextUTF8
 *
 * Heuristic check of whether a byte buffer looks like UTF-8 encoded text.
 *
 * UTF-8 is the encoding of Unicode described in Internet Society RFC 2279
 * ( See http://www.cis.ohio-state.edu/htbin/rfc/rfc2279.html )
 *
 * Basically:
 * 0000 0000-0000 007F - 0xxxxxxx (ASCII converts to 1 octet!)
 * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx ( 2 octet format)
 * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
 * (this keeps going for 32 bit Unicode, up to a 6 octet format)
 *
 * Parameters:
 *   lpstrInputStream - pointer to the first byte of the buffer to inspect.
 *   iLen             - number of bytes to inspect.
 *
 * Return value: TRUE, if the text is in UTF-8 format.
 * FALSE, if the text is not in UTF-8 format.
 * We will also return FALSE if it is only 7-bit ASCII, so the right code page
 * will be used. A NULL pointer or a non-positive length also returns FALSE.
 *
 * Actually for 7-bit ASCII, it doesn't matter which code page we use, but
 * notepad will remember that it is utf-8 and "save" or "save as" will store
 * the file with a UTF-8 BOM. Not cool.
 *
 * Only the byte structure is checked; overlong forms and surrogate code
 * points are not rejected.
 */
INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    DWORD cOctets = 0;       // continuation octets still expected for the current character
    UCHAR chr;               // byte under inspection (shifted left while counting lead bits)
    BOOL  bAllAscii = TRUE;  // stays TRUE while no byte with the high bit set was seen

    if( lpstrInputStream == NULL || iLen <= 0 ) {
        return FALSE;
    }

    for( i = 0; i < iLen; i++ ) {
        chr = (UCHAR)lpstrInputStream[i];

        if( (chr & 0x80) != 0 ) bAllAscii = FALSE;

        if( cOctets == 0 ) {
            //
            // 7-bit ASCII after 7-bit ASCII is just fine. Handle start of encoding case.
            //
            if( chr >= 0x80 ) {
                //
                // The count of leading 1 bits is the total number of octets
                // in this encoded character. chr is 8 bits wide, so the
                // shifted-out bits are discarded and the loop terminates.
                //
                do {
                    chr <<= 1;
                    cOctets++;
                }
                while( (chr & 0x80) != 0 );

                cOctets--;                          // count includes this character
                if( cOctets == 0 ) return FALSE;    // must start with 11xxxxxx
                if( cOctets > 5 ) return FALSE;     // 0xFE and 0xFF are never valid UTF-8 lead bytes
            }
        }
        else {
            // non-leading bytes must start as 10xxxxxx
            if( (chr & 0xC0) != 0x80 ) {
                return FALSE;
            }
            cOctets--;                              // processed another octet in encoding
        }
    }

    //
    // End of text. Check for consistency.
    //
    if( cOctets > 0 ) {     // anything left over at the end is an error
        return FALSE;
    }

    if( bAllAscii ) {       // Not utf-8 if all ASCII. Forces caller to use code pages for conversion
        return FALSE;
    }

    return TRUE;
}
Delphi 的版本:
// IsTextUTF8
//
// Heuristic check of whether a byte buffer looks like UTF-8 encoded text.
//
// Parameters:
//   lpstrInputStream - pointer to the first byte of the buffer to inspect.
//   iLen             - number of bytes to inspect.
//
// Returns True only when every multi-byte sequence in the buffer is
// structurally well formed (a lead byte followed by the right number of
// 10xxxxxx continuation bytes) AND at least one byte has its high bit set.
// Pure 7-bit ASCII deliberately returns False so that the caller falls back
// to code-page conversion. A nil pointer or a non-positive length also
// returns False.
//
// Only the byte structure is checked; overlong forms and surrogate code
// points are not rejected.
function IsTextUTF8(lpstrInputStream : PChar; iLen : Integer) : Boolean;
var
  i : Integer;
  cOctets : DWORD;      // continuation octets still expected for the current character
  curByte : UCHAR;      // byte under inspection (shifted left while counting lead bits)
  bAllAscii : Boolean;  // stays True while no byte with the high bit set was seen
begin
  Result := False;
  if (lpstrInputStream = nil) or (iLen <= 0) then
    exit;

  cOctets := 0;
  bAllAscii := True;
  for i := 0 to iLen - 1 do
  begin
    // Index through PAnsiChar so each step is exactly one byte, even on
    // compilers where PChar is a wide-character pointer.
    curByte := Ord(PAnsiChar(lpstrInputStream)[i]);
    if (curByte and $80) <> 0 then
      bAllAscii := False;

    if cOctets = 0 then
    begin
      // 7-bit ASCII after 7-bit ASCII is fine; only a high-bit byte starts
      // a multi-byte sequence.
      if curByte >= $80 then
      begin
        // The number of leading 1 bits is the total number of octets in
        // this encoded character. Mask to 8 bits after each shift: the
        // original "chr * 2" overflowed the byte and raised a range-check
        // error when range checking was enabled.
        repeat
          curByte := (curByte shl 1) and $FF;
          Inc(cOctets);
        until (curByte and $80) = 0;
        Dec(cOctets); // the count included the lead byte itself

        if cOctets = 0 then
          exit; // 10xxxxxx cannot start a character; must be 11xxxxxx

        if cOctets > 5 then
          exit; // $FE and $FF are never valid UTF-8 lead bytes
      end;
    end
    else
    begin
      // Non-leading bytes must look like 10xxxxxx.
      if (curByte and $C0) <> $80 then
        exit;
      Dec(cOctets); // processed another octet of the current character
    end;
  end;

  // End of text: a sequence that is still open means the data was truncated.
  if cOctets > 0 then
    exit;

  // All-ASCII is reported as "not UTF-8" so the caller uses code pages.
  if bAllAscii then
    exit;

  Result := True;
end;
下面的是 C++ 原作:
/* IsTextUTF8
 *
 * Heuristic check of whether a byte buffer looks like UTF-8 encoded text.
 *
 * UTF-8 is the encoding of Unicode described in Internet Society RFC 2279
 * ( See http://www.cis.ohio-state.edu/htbin/rfc/rfc2279.html )
 *
 * Basically:
 * 0000 0000-0000 007F - 0xxxxxxx (ASCII converts to 1 octet!)
 * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx ( 2 octet format)
 * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
 * (this keeps going for 32 bit Unicode, up to a 6 octet format)
 *
 * Parameters:
 *   lpstrInputStream - pointer to the first byte of the buffer to inspect.
 *   iLen             - number of bytes to inspect.
 *
 * Return value: TRUE, if the text is in UTF-8 format.
 * FALSE, if the text is not in UTF-8 format.
 * We will also return FALSE if it is only 7-bit ASCII, so the right code page
 * will be used. A NULL pointer or a non-positive length also returns FALSE.
 *
 * Actually for 7-bit ASCII, it doesn't matter which code page we use, but
 * notepad will remember that it is utf-8 and "save" or "save as" will store
 * the file with a UTF-8 BOM. Not cool.
 *
 * Only the byte structure is checked; overlong forms and surrogate code
 * points are not rejected.
 */
INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    DWORD cOctets = 0;       // continuation octets still expected for the current character
    UCHAR chr;               // byte under inspection (shifted left while counting lead bits)
    BOOL  bAllAscii = TRUE;  // stays TRUE while no byte with the high bit set was seen

    if( lpstrInputStream == NULL || iLen <= 0 ) {
        return FALSE;
    }

    for( i = 0; i < iLen; i++ ) {
        chr = (UCHAR)lpstrInputStream[i];

        if( (chr & 0x80) != 0 ) bAllAscii = FALSE;

        if( cOctets == 0 ) {
            //
            // 7-bit ASCII after 7-bit ASCII is just fine. Handle start of encoding case.
            //
            if( chr >= 0x80 ) {
                //
                // The count of leading 1 bits is the total number of octets
                // in this encoded character. chr is 8 bits wide, so the
                // shifted-out bits are discarded and the loop terminates.
                //
                do {
                    chr <<= 1;
                    cOctets++;
                }
                while( (chr & 0x80) != 0 );

                cOctets--;                          // count includes this character
                if( cOctets == 0 ) return FALSE;    // must start with 11xxxxxx
                if( cOctets > 5 ) return FALSE;     // 0xFE and 0xFF are never valid UTF-8 lead bytes
            }
        }
        else {
            // non-leading bytes must start as 10xxxxxx
            if( (chr & 0xC0) != 0x80 ) {
                return FALSE;
            }
            cOctets--;                              // processed another octet in encoding
        }
    }

    //
    // End of text. Check for consistency.
    //
    if( cOctets > 0 ) {     // anything left over at the end is an error
        return FALSE;
    }

    if( bAllAscii ) {       // Not utf-8 if all ASCII. Forces caller to use code pages for conversion
        return FALSE;
    }

    return TRUE;
}