changeset: 98609:9cf89366bbcb parent: 98607:e9c1404d6bd9 user: Victor Stinner date: Fri Oct 09 03:17:30 2015 +0200 files: Lib/test/test_codecs.py Objects/unicodeobject.c description: Issue #25318: Avoid sprintf() in backslashreplace() Rewrite backslashreplace() to be closer to PyCodec_BackslashReplaceErrors(). Add also unit tests for non-BMP characters. diff -r e9c1404d6bd9 -r 9cf89366bbcb Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Fri Oct 09 02:52:16 2015 +0200 +++ b/Lib/test/test_codecs.py Fri Oct 09 03:17:30 2015 +0200 @@ -3155,7 +3155,8 @@ ('[\x80\xff\u20ac]', 'ignore', b'[]'), ('[\x80\xff\u20ac]', 'replace', b'[???]'), ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'), - ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'), + ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', + b'[\\x80\\xff\\u20ac\\U000abcde]'), ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), ): with self.subTest(data=data, error_handler=error_handler, @@ -3197,7 +3198,8 @@ for data, error_handler, expected in ( ('[\u20ac\udc80]', 'ignore', b'[]'), ('[\u20ac\udc80]', 'replace', b'[??]'), - ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'), + ('[\u20ac\U000abcde]', 'backslashreplace', + b'[\\u20ac\\U000abcde]'), ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'), ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), ): diff -r e9c1404d6bd9 -r 9cf89366bbcb Objects/unicodeobject.c --- a/Objects/unicodeobject.c Fri Oct 09 02:52:16 2015 +0200 +++ b/Objects/unicodeobject.c Fri Oct 09 03:17:30 2015 +0200 @@ -610,14 +610,25 @@ /* generate replacement */ for (i = collstart; i < collend; ++i) { ch = PyUnicode_READ(kind, data, i); - if (ch < 0x100) - str += sprintf(str, "\\x%02x", ch); - else if (ch < 0x10000) - str += sprintf(str, "\\u%04x", ch); - else { - assert(ch <= MAX_UNICODE); - str += sprintf(str, "\\U%08x", ch); - } + *str++ = '\\'; + if (ch >= 0x00010000) { + *str++ = 'U'; + *str++ = Py_hexdigits[(ch>>28)&0xf]; + *str++ = Py_hexdigits[(ch>>24)&0xf]; + 
*str++ = Py_hexdigits[(ch>>20)&0xf]; + *str++ = Py_hexdigits[(ch>>16)&0xf]; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else if (ch >= 0x100) { + *str++ = 'u'; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else + *str++ = 'x'; + *str++ = Py_hexdigits[(ch>>4)&0xf]; + *str++ = Py_hexdigits[ch&0xf]; } return str; }