normalize_feed_url() encodes characters
author Stefan Kögl <stefan@skoegl.net>
Tue, 11 Jun 2013 16:35:50 +0000 (11 18:35 +0200)
committer Stefan Kögl <stefan@skoegl.net>
Tue, 11 Jun 2013 16:35:50 +0000 (11 18:35 +0200)
mygpo/utils.py

index c482404..d0605d6 100644 (file)
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #
 # This file is part of my.gpodder.org.
 #
@@ -933,12 +934,19 @@ def normalize_feed_url(url):
     >>> normalize_feed_url('http://w%20x:y%20z@example.org/')
     'http://example.org/'
     >>> normalize_feed_url('http://example.com/x@y:z@test.com/')
-    'http://example.com/x@y:z@test.com/'
+    'http://example.com/x%40y%3Az%40test.com/'
+    >>> normalize_feed_url('http://en.wikipedia.org/wiki/Ä')
+    'http://en.wikipedia.org/wiki/%C3%84'
+    >>> normalize_feed_url('http://en.wikipedia.org/w/index.php?title=Ä&action=edit')
+    'http://en.wikipedia.org/w/index.php?title=%C3%84&action=edit'
     """
     url = url.strip()
     if not url or len(url) < 8:
         return None
 
+    if isinstance(url, unicode):
+        url = url.encode('utf-8', 'ignore')
+
     # This is a list of prefixes that you can use to minimize the amount of
     # keystrokes that you have to use.
     # Feel free to suggest other useful prefixes, and I'll add them here.
@@ -966,6 +974,10 @@ def normalize_feed_url(url):
     # Schemes and domain names are case insensitive
     scheme, netloc = scheme.lower(), netloc.lower()
 
+    # encode non-encoded characters
+    path = urllib.quote(path, '/%')
+    query = urllib.quote_plus(query, ':&=')
+
     # Remove authentication to protect users' privacy
     netloc = netloc.rsplit('@', 1)[-1]