While working on a recent ExifTool remoting task, the question arose whether a given Unicode file name could be safely represented in the system ANSI code page. Only if the file name was fully convertible could it be passed to the application directly.
In case the file name cannot be converted to the current code page, an application which does not utilize the CreateFileW() API will not be able to open the file with this name. In case the file system supports old style DOS 8.3 filenames, the application should resort to using those instead.
// Checks whether a UTF-16 string survives a round trip through the system
// ANSI code page (CP_ACP) without being altered.
//
// sFile   Null-terminated UTF-16 string to test; may be NULL.
//
// Returns TRUE only if converting sFile to CP_ACP and back again yields
// exactly the same sequence of UTF-16 code units. Returns FALSE if sFile is
// NULL, if a conversion or allocation fails, or if the round trip changes
// the text.
BOOL IsConvertibleText( PCWSTR sFile )
{
    if ( sFile == NULL )
        return FALSE;

    // With a source length of -1 the reported size already includes the
    // terminating null, so no extra byte has to be added.
    int cbAnsi = WideCharToMultiByte( CP_ACP, 0, sFile, -1, NULL, 0, NULL, NULL );
    if ( cbAnsi == 0 )
        return FALSE;

    PSTR a = (PSTR)HeapAlloc( GetProcessHeap(), 0, cbAnsi );
    if ( a == NULL )
        return FALSE;

    BOOL bRet = FALSE;
    if ( WideCharToMultiByte( CP_ACP, 0, sFile, -1, a, cbAnsi, NULL, NULL ) )
    {
        int cchWide = MultiByteToWideChar( CP_ACP, 0, a, -1, NULL, 0 );
        if ( cchWide != 0 )
        {
            PWSTR w = (PWSTR)HeapAlloc( GetProcessHeap(), 0, cchWide * sizeof(WCHAR) );
            if ( w != NULL )
            {
                // The output size of MultiByteToWideChar is a character
                // count, not a byte count.
                if ( MultiByteToWideChar( CP_ACP, 0, a, -1, w, cchWide ) )
                {
                    // Compare code unit by code unit. A linguistic comparison
                    // could report equality for strings that differ in their
                    // actual code units, which is not good enough for a
                    // file name.
                    const WCHAR* pOrig = sFile;
                    const WCHAR* pTrip = w;
                    while ( *pOrig != 0 && *pOrig == *pTrip )
                    {
                        ++pOrig;
                        ++pTrip;
                    }
                    if ( *pOrig == *pTrip )
                        bRet = TRUE;
                }
                HeapFree( GetProcessHeap(), 0, w );
            }
        }
    }

    HeapFree( GetProcessHeap(), 0, a );
    return bRet;
}
For those using C#:
/// <summary>
/// Checks whether a string survives a round trip through
/// <see cref="Encoding.Default"/> without being altered.
/// </summary>
/// <param name="sFile">The text (e.g. a file name) to test; may be null.</param>
/// <returns>
/// true if encoding <paramref name="sFile"/> with Encoding.Default and decoding
/// the bytes again yields exactly the same characters; false if the text was
/// changed by the round trip or if <paramref name="sFile"/> is null.
/// </returns>
bool IsConvertibleText( string sFile )
{
    // Mirror the native version, which reports FALSE for a NULL input.
    if ( sFile == null )
        return false;

    // NOTE: Encoding.Default is the system ANSI code page on .NET Framework;
    // on .NET Core / .NET 5+ it is UTF-8, which makes this check far less strict.
    Encoding ansi = Encoding.Default;
    string roundTrip = ansi.GetString( ansi.GetBytes( sFile ) );

    // Ordinal comparison: a culture-sensitive comparison may treat strings
    // that differ in their actual characters as equal.
    return string.Equals( sFile, roundTrip, StringComparison.Ordinal );
}
See also: Post in CPAN::Forum
Win32API::File Unicode support bug
Categories: C#, C++, Software Development Tags: ANSI, C++, Code Page, Conversion, ExifTool, MultiByteToWideChar, Unicode, WideCharToMultiByte, Win32
Sometimes we come across text that has been encoded in a particular locale or Unicode encoding. The ATL CString classes do not provide a conversion for this in most cases; that’s where these two extension classes come in handy:
CStringWExt – Convert 8-bit Character Sets to UTF-16
// CStringW extension that fills the string by converting null-terminated
// 8-bit text in a given code page to UTF-16.
//
// Every conversion method returns TRUE on success and FALSE on failure
// (NULL input or a failed MultiByteToWideChar call).
class CStringWExt : public CStringW
{
public:
    // Convenience wrappers: each one converts s from a specific code page.
    BOOL Latin12Wide  ( PCSTR s ) { return CP2Wide( 28591              , s ); } // Latin1 encoding or ISO/IEC 8859-1, similar to Windows-1252
    BOOL OEM2Wide     ( PCSTR s ) { return CP2Wide( CP_OEMCP           , s ); } // Use for console related text
    BOOL ASCII2Wide   ( PCSTR s ) { return CP2Wide( 20127              , s ); }
    BOOL UTF72Wide    ( PCSTR s ) { return CP2Wide( CP_UTF7            , s ); }
    BOOL UTF82Wide    ( PCSTR s ) { return CP2Wide( CP_UTF8            , s ); }
    BOOL ANSI2Wide    ( PCSTR s ) { return CP2Wide( CP_ACP             , s ); }
    BOOL UserCP2Wide  ( PCSTR s ) { return CP2Wide( GetUserCodePage()  , s ); }
    BOOL SystemCP2Wide( PCSTR s ) { return CP2Wide( GetSystemCodePage(), s ); } // System code page is the locale set for non Unicode programs

    // Default ANSI code page of the user default / system default locale.
    UINT GetUserCodePage()   { return GetCodePage( LOCALE_USER_DEFAULT ); }
    UINT GetSystemCodePage() { return GetCodePage( LOCALE_SYSTEM_DEFAULT ); }

    // Returns the default ANSI code page of the given locale, or 0 if the
    // query fails.
    UINT GetCodePage( LCID locale )
    {
        UINT langCP = 0;
        // With LOCALE_RETURN_NUMBER the value is written as a number into the
        // buffer, and the buffer size must be given in TCHARs, not in bytes.
        const int cchData = sizeof(langCP) / sizeof(TCHAR);
        if ( GetLocaleInfo( locale, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, reinterpret_cast<LPTSTR>( &langCP ), cchData ) )
            return langCP;
        return 0;
    }

    // Converts the null-terminated string s from code page cp to UTF-16 and
    // stores the result in this string.
    // Returns FALSE if s is NULL or the conversion fails. When the conversion
    // itself fails, the string is left empty.
    BOOL CP2Wide( UINT cp, PCSTR s )
    {
        if ( s == NULL )
            return FALSE;

        // Required size in characters, including the terminating null.
        const int cchNeeded = MultiByteToWideChar( cp, 0, s, -1, NULL, 0 );
        if ( cchNeeded == 0 )
            return FALSE;

        PWSTR pBuffer = GetBuffer( cchNeeded );
        if ( !MultiByteToWideChar( cp, 0, s, -1, pBuffer, cchNeeded ) )
        {
            // Always pair GetBuffer with ReleaseBuffer; the buffer content is
            // not usable after a failed conversion, so reset to empty.
            ReleaseBuffer( 0 );
            return FALSE;
        }

        // The converted length excludes the terminating null.
        ReleaseBuffer( cchNeeded - 1 );
        return TRUE;
    }
};
CStringAExt – Convert UTF-16 to 8-bit Character Set
This conversion is potentially lossy when the target is the OEM, ASCII or ANSI code page, depending on the text that has to be converted. To check whether any loss has occurred, use an instance of CStringWExt above.
// CStringA extension that fills the string by converting null-terminated
// UTF-16 text to an 8-bit encoding in a given code page.
//
// Every conversion method returns TRUE on success and FALSE on failure
// (NULL input or a failed WideCharToMultiByte call). The conversion is done
// without a used-default-character check, so a TRUE result does not
// guarantee that every character could be represented in the target code page.
class CStringAExt : public CStringA
{
public:
    // Convenience wrappers: each one converts s to a specific code page.
    BOOL Wide2Latin1  ( PCWSTR s ) { return Wide2CP( 28591              , s ); } // Latin1 encoding or ISO/IEC 8859-1, similar to Windows-1252
    BOOL Wide2OEM     ( PCWSTR s ) { return Wide2CP( CP_OEMCP           , s ); } // Use for console related text
    BOOL Wide2ASCII   ( PCWSTR s ) { return Wide2CP( 20127              , s ); }
    BOOL Wide2UTF7    ( PCWSTR s ) { return Wide2CP( CP_UTF7            , s ); }
    BOOL Wide2UTF8    ( PCWSTR s ) { return Wide2CP( CP_UTF8            , s ); }
    BOOL Wide2ANSI    ( PCWSTR s ) { return Wide2CP( CP_ACP             , s ); }
    BOOL Wide2UserCP  ( PCWSTR s ) { return Wide2CP( GetUserCodePage()  , s ); }
    BOOL Wide2SystemCP( PCWSTR s ) { return Wide2CP( GetSystemCodePage(), s ); } // System code page is the locale set for non Unicode programs

    // Default ANSI code page of the user default / system default locale.
    UINT GetUserCodePage()   { return GetCodePage( LOCALE_USER_DEFAULT ); }
    UINT GetSystemCodePage() { return GetCodePage( LOCALE_SYSTEM_DEFAULT ); }

    // Returns the default ANSI code page of the given locale, or 0 if the
    // query fails.
    UINT GetCodePage( LCID locale )
    {
        UINT langCP = 0;
        // With LOCALE_RETURN_NUMBER the value is written as a number into the
        // buffer, and the buffer size must be given in TCHARs, not in bytes.
        const int cchData = sizeof(langCP) / sizeof(TCHAR);
        if ( GetLocaleInfo( locale, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, reinterpret_cast<LPTSTR>( &langCP ), cchData ) )
            return langCP;
        return 0;
    }

    // Converts the null-terminated UTF-16 string s to code page cp and stores
    // the result in this string.
    // Returns FALSE if s is NULL or the conversion fails. When the conversion
    // itself fails, the string is left empty.
    BOOL Wide2CP( UINT cp, PCWSTR s )
    {
        if ( s == NULL )
            return FALSE;

        // Required size in bytes, including the terminating null.
        const int cbNeeded = WideCharToMultiByte( cp, 0, s, -1, NULL, 0, NULL, NULL );
        if ( cbNeeded == 0 )
            return FALSE;

        PSTR pBuffer = GetBuffer( cbNeeded );
        if ( !WideCharToMultiByte( cp, 0, s, -1, pBuffer, cbNeeded, NULL, NULL ) )
        {
            // Always pair GetBuffer with ReleaseBuffer; the buffer content is
            // not usable after a failed conversion, so reset to empty.
            ReleaseBuffer( 0 );
            return FALSE;
        }

        // The converted length excludes the terminating null.
        ReleaseBuffer( cbNeeded - 1 );
        return TRUE;
    }
};
Categories: C++, Software Development Tags: ANSI, ASCII, ATL, C++, Code Page, CString, Internationalization, ISO/IEC 8859-1, Latin1, MBCS, MFC, OEM, Unicode, UTF-7, UTF-8, Win32, Windows-1252