@@ -316,6 +316,21 @@ def _splitnetloc(url, start=0):
316316 delim = min (delim , wdelim ) # use earliest delim position
317317 return url [start :delim ], url [delim :] # return (domain, rest)
318318
319+ def _checknetloc (netloc ):
320+ if not netloc or not any (ord (c ) > 127 for c in netloc ):
321+ return
322+ # looking for characters like \u2100 that expand to 'a/c'
323+ # IDNA uses NFKC equivalence, so normalize for this check
324+ import unicodedata
325+ netloc2 = unicodedata .normalize ('NFKC' , netloc )
326+ if netloc == netloc2 :
327+ return
328+ _ , _ , netloc = netloc .rpartition ('@' ) # anything to the left of '@' is okay
329+ for c in '/?#@:' :
330+ if c in netloc2 :
331+ raise ValueError ("netloc '" + netloc2 + "' contains invalid " +
332+ "characters under NFKC normalization" )
333+
319334def urlsplit (url , scheme = '' , allow_fragments = True ):
320335 """Parse a URL into 5 components:
321336 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -345,6 +360,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
345360 url , fragment = url .split ('#' , 1 )
346361 if '?' in url :
347362 url , query = url .split ('?' , 1 )
363+ _checknetloc (netloc )
348364 v = SplitResult (scheme , netloc , url , query , fragment )
349365 _parse_cache [key ] = v
350366 return _coerce_result (v )
@@ -368,6 +384,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
368384 url , fragment = url .split ('#' , 1 )
369385 if '?' in url :
370386 url , query = url .split ('?' , 1 )
387+ _checknetloc (netloc )
371388 v = SplitResult (scheme , netloc , url , query , fragment )
372389 _parse_cache [key ] = v
373390 return _coerce_result (v )
0 commit comments