@@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
3434int _Py_open_cloexec_works = -1 ;
3535#endif
3636
37+ // The value must be the same in unicodeobject.c.
38+ #define MAX_UNICODE 0x10ffff
39+
40+ // mbstowcs() and mbrtowc() errors
41+ static const size_t DECODE_ERROR = ((size_t )-1 );
42+ static const size_t INCOMPLETE_CHARACTER = (size_t )-2 ;
43+
3744
3845static int
3946get_surrogateescape (_Py_error_handler errors , int * surrogateescape )
@@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
8289#endif
8390}
8491
92+
93+ static size_t
94+ is_valid_wide_char (wchar_t ch )
95+ {
96+ if (Py_UNICODE_IS_SURROGATE (ch )) {
97+ // Reject lone surrogate characters
98+ return 0 ;
99+ }
100+ if (ch > MAX_UNICODE ) {
101+ // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
102+ // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
103+ // it creates characters outside the [U+0000; U+10ffff] range:
104+ // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
105+ return 0 ;
106+ }
107+ return 1 ;
108+ }
109+
110+
111+ static size_t
112+ _Py_mbstowcs (wchar_t * dest , const char * src , size_t n )
113+ {
114+ size_t count = mbstowcs (dest , src , n );
115+ if (dest != NULL && count != DECODE_ERROR ) {
116+ for (size_t i = 0 ; i < count ; i ++ ) {
117+ wchar_t ch = dest [i ];
118+ if (!is_valid_wide_char (ch )) {
119+ return DECODE_ERROR ;
120+ }
121+ }
122+ }
123+ return count ;
124+ }
125+
126+
#ifdef HAVE_MBRTOWC
/* Wrapper around mbrtowc() that also validates the decoded character.

   `pwc` must not be NULL. The arguments are forwarded unchanged to
   mbrtowc(). A result of 0, DECODE_ERROR or INCOMPLETE_CHARACTER is returned
   as is. For any other result, the character stored in `*pwc` is checked
   with is_valid_wide_char(): DECODE_ERROR is returned if it is rejected,
   otherwise the mbrtowc() result is returned. */
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);

    const size_t nbytes = mbrtowc(pwc, str, len, pmbs);

    // These results are passed through without inspecting *pwc.
    if (nbytes == 0
        || nbytes == DECODE_ERROR
        || nbytes == INCOMPLETE_CHARACTER)
    {
        return nbytes;
    }

    return is_valid_wide_char(*pwc) ? nbytes : DECODE_ERROR;
}
#endif
141+
142+
85143#if !defined(_Py_FORCE_UTF8_FS_ENCODING ) && !defined(MS_WINDOWS )
86144
87145#define USE_FORCE_ASCII
@@ -148,8 +206,8 @@ check_force_ascii(void)
148206 size_t res ;
149207
150208 ch = (unsigned char )0xA7 ;
151- res = mbstowcs (& wch , (char * )& ch , 1 );
152- if (res != ( size_t ) -1 && wch == L'\xA7' ) {
209+ res = _Py_mbstowcs (& wch , (char * )& ch , 1 );
210+ if (res != DECODE_ERROR && wch == L'\xA7' ) {
153211 /* On HP-UX withe C locale or the POSIX locale,
154212 nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
155213 Latin1 encoding in practice. Force ASCII in this case.
@@ -196,8 +254,8 @@ check_force_ascii(void)
196254
197255 unsigned uch = (unsigned char )i ;
198256 ch [0 ] = (char )uch ;
199- res = mbstowcs (wch , ch , 1 );
200- if (res != ( size_t ) -1 ) {
257+ res = _Py_mbstowcs (wch , ch , 1 );
258+ if (res != DECODE_ERROR ) {
201259 /* decoding a non-ASCII character from the locale encoding succeed:
202260 the locale encoding is not ASCII, force ASCII */
203261 return 1 ;
@@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
387445 */
388446 argsize = strlen (arg );
389447#else
390- argsize = mbstowcs (NULL , arg , 0 );
448+ argsize = _Py_mbstowcs (NULL , arg , 0 );
391449#endif
392- if (argsize != ( size_t ) -1 ) {
450+ if (argsize != DECODE_ERROR ) {
393451 if (argsize > PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
394452 return -1 ;
395453 }
@@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
398456 return -1 ;
399457 }
400458
401- count = mbstowcs (res , arg , argsize + 1 );
402- if (count != (size_t )-1 ) {
403- wchar_t * tmp ;
404- /* Only use the result if it contains no
405- surrogate characters. */
406- for (tmp = res ; * tmp != 0 &&
407- !Py_UNICODE_IS_SURROGATE (* tmp ); tmp ++ )
408- ;
409- if (* tmp == 0 ) {
410- if (wlen != NULL ) {
411- * wlen = count ;
412- }
413- * wstr = res ;
414- return 0 ;
459+ count = _Py_mbstowcs (res , arg , argsize + 1 );
460+ if (count != DECODE_ERROR ) {
461+ * wstr = res ;
462+ if (wlen != NULL ) {
463+ * wlen = count ;
415464 }
465+ return 0 ;
416466 }
417467 PyMem_RawFree (res );
418468 }
@@ -436,46 +486,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
436486 out = res ;
437487 memset (& mbs , 0 , sizeof mbs );
438488 while (argsize ) {
439- size_t converted = mbrtowc (out , (char * )in , argsize , & mbs );
489+ size_t converted = _Py_mbrtowc (out , (char * )in , argsize , & mbs );
440490 if (converted == 0 ) {
441491 /* Reached end of string; null char stored. */
442492 break ;
443493 }
444494
445- if (converted == ( size_t ) -2 ) {
495+ if (converted == INCOMPLETE_CHARACTER ) {
446496 /* Incomplete character. This should never happen,
447497 since we provide everything that we have -
448498 unless there is a bug in the C library, or I
449499 misunderstood how mbrtowc works. */
450500 goto decode_error ;
451501 }
452502
453- if (converted == ( size_t ) -1 ) {
503+ if (converted == DECODE_ERROR ) {
454504 if (!surrogateescape ) {
455505 goto decode_error ;
456506 }
457507
458- /* Conversion error. Escape as UTF-8b, and start over
459- in the initial shift state. */
508+ /* Decoding error. Escape as UTF-8b, and start over in the initial
509+ shift state. */
460510 * out ++ = 0xdc00 + * in ++ ;
461511 argsize -- ;
462512 memset (& mbs , 0 , sizeof mbs );
463513 continue ;
464514 }
465515
466- if (Py_UNICODE_IS_SURROGATE (* out )) {
467- if (!surrogateescape ) {
468- goto decode_error ;
469- }
516+ // _Py_mbrtowc() rejects lone surrogate characters
517+ assert (!Py_UNICODE_IS_SURROGATE (* out ));
470518
471- /* Surrogate character. Escape the original
472- byte sequence with surrogateescape. */
473- argsize -= converted ;
474- while (converted -- ) {
475- * out ++ = 0xdc00 + * in ++ ;
476- }
477- continue ;
478- }
479519 /* successfully converted some bytes */
480520 in += converted ;
481521 argsize -= converted ;
@@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
652692 else {
653693 converted = wcstombs (NULL , buf , 0 );
654694 }
655- if (converted == ( size_t ) -1 ) {
695+ if (converted == DECODE_ERROR ) {
656696 goto encode_error ;
657697 }
658698 if (bytes != NULL ) {
@@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
14401480 char cmode [10 ];
14411481 size_t r ;
14421482 r = wcstombs (cmode , mode , 10 );
1443- if (r == ( size_t ) -1 || r >= 10 ) {
1483+ if (r == DECODE_ERROR || r >= 10 ) {
14441484 errno = EINVAL ;
14451485 return NULL ;
14461486 }
0 commit comments