@@ -327,6 +327,21 @@ def _splitnetloc(url, start=0):
327327 delim = min (delim , wdelim ) # use earliest delim position
328328 return url [start :delim ], url [delim :] # return (domain, rest)
329329
330+ def _checknetloc (netloc ):
331+ if not netloc or not any (ord (c ) > 127 for c in netloc ):
332+ return
333+ # looking for characters like \u2100 that expand to 'a/c'
334+ # IDNA uses NFKC equivalence, so normalize for this check
335+ import unicodedata
336+ netloc2 = unicodedata .normalize ('NFKC' , netloc )
337+ if netloc == netloc2 :
338+ return
339+ _ , _ , netloc = netloc .rpartition ('@' ) # anything to the left of '@' is okay
340+ for c in '/?#@:' :
341+ if c in netloc2 :
342+ raise ValueError ("netloc '" + netloc2 + "' contains invalid " +
343+ "characters under NFKC normalization" )
344+
330345def urlsplit (url , scheme = '' , allow_fragments = True ):
331346 """Parse a URL into 5 components:
332347 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -356,6 +371,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
356371 url , fragment = url .split ('#' , 1 )
357372 if '?' in url :
358373 url , query = url .split ('?' , 1 )
374+ _checknetloc (netloc )
359375 v = SplitResult (scheme , netloc , url , query , fragment )
360376 _parse_cache [key ] = v
361377 return _coerce_result (v )
@@ -379,6 +395,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
379395 url , fragment = url .split ('#' , 1 )
380396 if '?' in url :
381397 url , query = url .split ('?' , 1 )
398+ _checknetloc (netloc )
382399 v = SplitResult (scheme , netloc , url , query , fragment )
383400 _parse_cache [key ] = v
384401 return _coerce_result (v )
0 commit comments