33
44# Standard/builtin Python modules
55import string
6+ from string import joinfields , splitfields , find , rfind
67
78# A classification of schemes ('' means apply by default)
89uses_relative = ['ftp' , 'http' , 'gopher' , 'nntp' , 'wais' , 'file' ,
# Characters valid in scheme names.  They are spelled out literally
# instead of being derived from string.letters, because that constant
# can vary with the current locale while URL scheme names are limited
# to ASCII letters, digits, '+', '-' and '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                '0123456789' +
                '+-.')
2021
# Cache of previously parsed URLs.  urlparse() keys it on the tuple
# (url, scheme, allow_framents) and stores the resulting 6-tuple.
_parse_cache = {}


def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level _parse_cache to a fresh empty dictionary,
    so results stored by earlier urlparse() calls are forgotten.
    """
    global _parse_cache
    _parse_cache = {}
27+
28+
2129# Parse a URL into 6 components:
2230# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
2331# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
2432# Note that we don't break the components up in smaller bits
2533# (e.g. netloc is a single string) and we don't expand % escapes.
2634def urlparse (url , scheme = '' , allow_framents = 1 ):
27- netloc = ''
28- path = ''
29- params = ''
30- query = ''
31- fragment = ''
35+ key = url , scheme , allow_framents
36+ if _parse_cache .has_key (key ):
37+ return _parse_cache [key ]
38+ netloc = path = params = query = fragment = ''
3239 i = string .find (url , ':' )
3340 if i > 0 :
3441 for c in url [:i ]:
@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):
5461 i = string .find (url , ';' )
5562 if i >= 0 :
5663 url , params = url [:i ], url [i + 1 :]
57- return scheme , netloc , url , params , query , fragment
64+ tuple = scheme , netloc , url , params , query , fragment
65+ _parse_cache [key ] = tuple
66+ return tuple
5867
5968# Put a parsed URL back together again. This may result in a slightly
6069# different, but equivalent URL, if the URL that was parsed originally
@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):
8089 if not base :
8190 return url
8291 bscheme , bnetloc , bpath , bparams , bquery , bfragment = \
83- urlparse (base , '' , allow_framents )
92+ urlparse (base , '' , allow_framents )
8493 scheme , netloc , path , params , query , fragment = \
8594 urlparse (url , bscheme , allow_framents )
8695 # XXX Unofficial hack: default netloc to bnetloc even if
@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):
9099 scheme in uses_netloc and bscheme in uses_netloc :
91100 netloc = bnetloc
92101 # Strip the port number
93- i = string . find (netloc , '@' )
102+ i = find (netloc , '@' )
94103 if i < 0 : i = 0
95- i = string . find (netloc , ':' , i )
104+ i = find (netloc , ':' , i )
96105 if i >= 0 :
97106 netloc = netloc [:i ]
98107 if scheme != bscheme or scheme not in uses_relative :
@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):
107116 return urlunparse ((scheme , netloc , path ,
108117 params , query , fragment ))
109118 if not path :
110- path = bpath
111- if not query :
112- query = bquery
113- return urlunparse ((scheme , netloc , path ,
114- params , query , fragment ))
115- i = string .rfind (bpath , '/' )
119+ return urlunparse ((scheme , netloc , bpath ,
120+ params , query or bquery , fragment ))
121+ i = rfind (bpath , '/' )
116122 if i >= 0 :
117123 path = bpath [:i ] + '/' + path
118- segments = string . splitfields (path , '/' )
124+ segments = splitfields (path , '/' )
119125 if segments [- 1 ] == '.' :
120126 segments [- 1 ] = ''
121127 while '.' in segments :
@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):
132138 break
133139 if len (segments ) >= 2 and segments [- 1 ] == '..' :
134140 segments [- 2 :] = ['' ]
135- path = string .joinfields (segments , '/' )
136- return urlunparse ((scheme , netloc , path ,
141+ return urlunparse ((scheme , netloc , joinfields (segments , '/' ),
137142 params , query , fragment ))
138143
def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a tuple (defragmented_url, fragment).  If the URL contains
    no '#' at all it is returned unchanged, with the empty string as
    the second element.  Otherwise the URL is parsed with urlparse()
    and reassembled with urlunparse() using an empty fragment.
    """
    # Without a '#' there is nothing to strip.  Returning early avoids
    # the parse/unparse round trip, which may yield a slightly
    # different (though equivalent) URL than the one passed in.
    if '#' not in url:
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
154+
155+
139156test_input = """
140157 http://a/b/c/d
141158
0 commit comments