changeset: 98379:128a3f03ddeb user: Victor Stinner date: Tue Sep 29 12:32:13 2015 +0200 files: Doc/whatsnew/3.6.rst Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c description: Optimize ascii/latin1+surrogateescape encoders Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape`` error handler: the encoders are now up to 3 times as fast. Initial patch written by Serhiy Storchaka. diff -r e025bdffd71c -r 128a3f03ddeb Doc/whatsnew/3.6.rst --- a/Doc/whatsnew/3.6.rst Tue Sep 29 01:56:54 2015 -0400 +++ b/Doc/whatsnew/3.6.rst Tue Sep 29 12:32:13 2015 +0200 @@ -117,6 +117,9 @@ * The ASCII decoder is now up to 60 times as fast for error handlers: ``surrogateescape``, ``ignore`` and ``replace``. +* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error + handler ``surrogateescape``. + Build and C API Changes ======================= diff -r e025bdffd71c -r 128a3f03ddeb Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Tue Sep 29 01:56:54 2015 -0400 +++ b/Lib/test/test_codecs.py Tue Sep 29 12:32:13 2015 +0200 @@ -3060,7 +3060,31 @@ class ASCIITest(unittest.TestCase): + def test_encode(self): + self.assertEqual('abc123'.encode('ascii'), b'abc123') + + def test_encode_error(self): + for data, error_handler, expected in ( + ('[\x80\xff\u20ac]', 'ignore', b'[]'), + ('[\x80\xff\u20ac]', 'replace', b'[???]'), + ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'), + ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'), + ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.encode('ascii', error_handler), + expected) + + def test_encode_surrogateescape_error(self): + with self.assertRaises(UnicodeEncodeError): + # the first character can be encoded, but not the second + '\udc80\xff'.encode('ascii', 'surrogateescape') + def test_decode(self): + self.assertEqual(b'abc'.decode('ascii'), 'abc') + + def 
test_decode_error(self): for data, error_handler, expected in ( (b'[\x80\xff]', 'ignore', '[]'), (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), @@ -3073,5 +3097,41 @@ expected) +class Latin1Test(unittest.TestCase): + def test_encode(self): + for data, expected in ( + ('abc', b'abc'), + ('\x80\xe9\xff', b'\x80\xe9\xff'), + ): + with self.subTest(data=data, expected=expected): + self.assertEqual(data.encode('latin1'), expected) + + def test_encode_errors(self): + for data, error_handler, expected in ( + ('[\u20ac\udc80]', 'ignore', b'[]'), + ('[\u20ac\udc80]', 'replace', b'[??]'), + ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'), + ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'), + ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.encode('latin1', error_handler), + expected) + + def test_encode_surrogateescape_error(self): + with self.assertRaises(UnicodeEncodeError): + # the first character can be encoded, but not the second + '\udc80\u20ac'.encode('latin1', 'surrogateescape') + + def test_decode(self): + for data, expected in ( + (b'abc', 'abc'), + (b'[\x80\xff]', '[\x80\xff]'), + ): + with self.subTest(data=data, expected=expected): + self.assertEqual(data.decode('latin1'), expected) + + if __name__ == "__main__": unittest.main() diff -r e025bdffd71c -r 128a3f03ddeb Misc/NEWS --- a/Misc/NEWS Tue Sep 29 01:56:54 2015 -0400 +++ b/Misc/NEWS Tue Sep 29 12:32:13 2015 +0200 @@ -10,6 +10,10 @@ Core and Builtins ----------------- +- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape`` + error handler: the encoders are now up to 3 times as fast. Initial patch + written by Serhiy Storchaka. + - Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the getrandom() function instead of the getentropy() function. 
The getentropy() function is blocking to generate very good quality entropy, os.urandom() diff -r e025bdffd71c -r 128a3f03ddeb Objects/unicodeobject.c --- a/Objects/unicodeobject.c Tue Sep 29 01:56:54 2015 -0400 +++ b/Objects/unicodeobject.c Tue Sep 29 12:32:13 2015 +0200 @@ -6532,6 +6532,22 @@ pos = collend; break; + case _Py_ERROR_SURROGATEESCAPE: + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + if (ch < 0xdc80 || 0xdcff < ch) { + /* Not a UTF-8b surrogate */ + break; + } + *str++ = (char)(ch - 0xdc00); + ++pos; + } + if (i >= collend) + break; + collstart = pos; + assert(collstart != collend); + /* fallback to general error handling */ + default: repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, encoding, reason, unicode, &exc,