changeset: 32950:adfe7d39a049 branch: legacy-trunk user: Hye-Shik Chang date: Wed Aug 04 06:33:51 2004 +0000 files: Include/pyport.h Lib/test/test_locale.py Misc/NEWS description: Add a workaround for a problem that UTF-8 strings can be corrupted or broken by basic ctype functions in 4.4BSD descendants. This will be fixed in their future development branches but they'll keep the POSIX-incompatibility for their backward-compatiblities in near future. diff -r 9ebd405683cd -r adfe7d39a049 Include/pyport.h --- a/Include/pyport.h Wed Aug 04 02:36:18 2004 +0000 +++ b/Include/pyport.h Wed Aug 04 06:33:51 2004 +0000 @@ -411,6 +411,39 @@ extern double hypot(double, double); #endif + +/******************************************************************* +On 4.4BSD-descendants, ctype functions serves the whole range of +wchar_t character set rather than single byte code points only. +This characteristic can break some operations of string object +including str.upper() and str.split() on UTF-8 locales. This +workaround was provided by Tim Robbins of FreeBSD project. He said +the incompatibility will be fixed in FreeBSD 6. +********************************************************************/ + +#ifdef __FreeBSD__ +#include <osreldate.h> +#if __FreeBSD_version > 500039 +#include <ctype.h> +#include <wctype.h> +#undef isalnum +#define isalnum(c) iswalnum(btowc(c)) +#undef isalpha +#define isalpha(c) iswalpha(btowc(c)) +#undef islower +#define islower(c) iswlower(btowc(c)) +#undef isspace +#define isspace(c) iswspace(btowc(c)) +#undef isupper +#define isupper(c) iswupper(btowc(c)) +#undef tolower +#define tolower(c) towlower(btowc(c)) +#undef toupper +#define toupper(c) towupper(btowc(c)) +#endif +#endif + + /* Declarations for symbol visibility. 
PyAPI_FUNC(type): Declares a public Python API function and return type diff -r 9ebd405683cd -r adfe7d39a049 Lib/test/test_locale.py --- a/Lib/test/test_locale.py Wed Aug 04 02:36:18 2004 +0000 +++ b/Lib/test/test_locale.py Wed Aug 04 06:33:51 2004 +0000 @@ -47,3 +47,38 @@ locale.getpreferredencoding() finally: locale.setlocale(locale.LC_NUMERIC, oldlocale) + + +# Test BSD Rune locale's bug for isctype functions. +def teststrop(s, method, output): + if verbose: + print "%s.%s() =? %s ..." % (repr(s), method, repr(output)), + result = getattr(s, method)() + if result != output: + if verbose: + print "no" + print "%s.%s() == %s != %s" % (repr(s), method, repr(result), + repr(output)) + elif verbose: + print "yes" + +try: + oldlocale = locale.setlocale(locale.LC_CTYPE) + locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8') +except locale.Error: + pass +else: + try: + teststrop('\x20', 'isspace', True) + teststrop('\xa0', 'isspace', False) + teststrop('\xa1', 'isspace', False) + teststrop('\xc0', 'isalpha', False) + teststrop('\xc0', 'isalnum', False) + teststrop('\xc0', 'isupper', False) + teststrop('\xc0', 'islower', False) + teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc']) + teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0') + teststrop('\xcc\x85', 'lower', '\xcc\x85') + teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0') + finally: + locale.setlocale(locale.LC_CTYPE, oldlocale) diff -r 9ebd405683cd -r adfe7d39a049 Misc/NEWS --- a/Misc/NEWS Wed Aug 04 02:36:18 2004 +0000 +++ b/Misc/NEWS Wed Aug 04 06:33:51 2004 +0000 @@ -64,6 +64,9 @@ - Implemented bind_textdomain_codeset() in locale module. +- Added a workaround for proper string operations in BSDs. str.split + and str.is* methods can now work correctly with UTF-8 locales. + Extension modules -----------------