@@ -165,6 +165,21 @@ def _splitnetloc(url, start=0):
165165 delim = min (delim , wdelim ) # use earliest delim position
166166 return url [start :delim ], url [delim :] # return (domain, rest)
167167
168+ def _checknetloc (netloc ):
169+ if not netloc or not isinstance (netloc , unicode ):
170+ return
171+ # looking for characters like \u2100 that expand to 'a/c'
172+ # IDNA uses NFKC equivalence, so normalize for this check
173+ import unicodedata
174+ netloc2 = unicodedata .normalize ('NFKC' , netloc )
175+ if netloc == netloc2 :
176+ return
177+ _ , _ , netloc = netloc .rpartition ('@' ) # anything to the left of '@' is okay
178+ for c in '/?#@:' :
179+ if c in netloc2 :
180+ raise ValueError ("netloc '" + netloc2 + "' contains invalid " +
181+ "characters under NFKC normalization" )
182+
168183def urlsplit (url , scheme = '' , allow_fragments = True ):
169184 """Parse a URL into 5 components:
170185 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -193,6 +208,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
193208 url , fragment = url .split ('#' , 1 )
194209 if '?' in url :
195210 url , query = url .split ('?' , 1 )
211+ _checknetloc (netloc )
196212 v = SplitResult (scheme , netloc , url , query , fragment )
197213 _parse_cache [key ] = v
198214 return v
@@ -216,6 +232,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
216232 url , fragment = url .split ('#' , 1 )
217233 if '?' in url :
218234 url , query = url .split ('?' , 1 )
235+ _checknetloc (netloc )
219236 v = SplitResult (scheme , netloc , url , query , fragment )
220237 _parse_cache [key ] = v
221238 return v
0 commit comments