replace sanitizing rules with gPodder's normalize_feed_url()
authorStefan Kögl <stefan@skoegl.net>
Mon, 27 May 2013 19:02:35 +0000 (27 21:02 +0200)
committerStefan Kögl <stefan@skoegl.net>
Mon, 27 May 2013 19:02:35 +0000 (27 21:02 +0200)
22 files changed:
couchdb/general/_design/sanitizing_rules/views/by_slug/map.js [deleted file]
couchdb/general/_design/sanitizing_rules/views/by_target/map.js [deleted file]
doc/dev/couchdb-docs.rst
doc/dev/couchdb-views.rst
doc/dev/files-overview.rst
mygpo/api/advanced/__init__.py
mygpo/api/advanced/directory.py
mygpo/api/advanced/episode.py
mygpo/api/legacy.py
mygpo/api/management/commands/sanitizing-maintenance.py [deleted file]
mygpo/api/sanitizing.py [deleted file]
mygpo/api/simple.py
mygpo/core/models.py
mygpo/db/couchdb/common.py
mygpo/db/couchdb/management/commands/compact-couchdb.py
mygpo/db/couchdb/management/commands/touch-couchdb-views.py
mygpo/directory/search.py
mygpo/maintenance/management/commands/delete-sanitizing-rules.py [deleted file]
mygpo/maintenance/management/commands/sync-sanitizing-rules.py [deleted file]
mygpo/utils.py
mygpo/web/views/podcast.py
sanitizing-rules.ini [deleted file]

diff --git a/couchdb/general/_design/sanitizing_rules/views/by_slug/map.js b/couchdb/general/_design/sanitizing_rules/views/by_slug/map.js
deleted file mode 100644 (file)
index e17c674..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-function(doc)
-{
-    if(doc.doc_type == "SanitizingRule")
-    {
-        emit(doc.slug, null);
-    }
-}
diff --git a/couchdb/general/_design/sanitizing_rules/views/by_target/map.js b/couchdb/general/_design/sanitizing_rules/views/by_target/map.js
deleted file mode 100644 (file)
index 8d3c453..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-function(doc)
-{
-    if(doc.doc_type == "SanitizingRule")
-    {
-        for(n in doc.applies_to)
-        {
-            emit([doc.applies_to[n], doc.priority], null);
-        }
-    }
-}
index 3f37677..9fa51c6 100644 (file)
@@ -272,23 +272,6 @@ Example Document ::
     }
 
 
-SanitizingRule
---------------
-
-Contains a URL Sanitizing Rule.
-
-Example Document ::
-
-    {
-        slug:         "feedburner-feeds2",
-        applies_to:   ["podcast", "episode"],
-        search:       "http://feeds2\.feedburner\.com",
-        replace_by:   "http://feeds.feedburner.com",
-        descriptions: "Replace {feeds2 => feeds}.feedburner.com",
-        priority:     100,
-    }
-
-
 Category
 --------
 
index 4996a8b..144e527 100644 (file)
@@ -183,17 +183,6 @@ Doc-Types: Podcast, PodcastGroup, PodcastSubscriberData
 * `podcasts/subscriber_data <https://github.com/gpodder/mygpo/tree/master/couchdb/general/_design/podcasts/views/subscriber_data>`_
 
 
-Sanitizing Rules
-^^^^^^^^^^^^^^^^
-
-Doc-Types: SanitizingRule
-
-**Views**
-
-* `sanitizing_rules/by_slug <https://github.com/gpodder/mygpo/tree/master/couchdb/general/_design/sanitizing_rules/views/by_slug>`_
-* `sanitizing_rules/by_target <https://github.com/gpodder/mygpo/tree/master/couchdb/general/_design/sanitizing_rules/views/by_target>`_
-
-
 Slugs
 ^^^^^
 
index 7478d7e..fc05233 100644 (file)
@@ -39,8 +39,6 @@ The ``mygpo`` project consists of the following files ::
       management/commands/merge-episode-states.py              # merges duplicates of episodes states
       management/commands/move-subscriber-data.py              # moves subscriber data from podcasts into separate objects
       management/commands/import-episode-actions.py            # imports episode actions from files
-      management/commands/sync-sanitizing-rules.py             # synchronizes sanitizing rules to the database
-      management/commands/delete-sanitizing-rules.py           # deletes sanitizing rules
 
     data/                                                      # stuff related to podcast and episode data
       youtube.py                                               # utils for accessing YouTube data
@@ -98,7 +96,6 @@ The ``mygpo`` project consists of the following files ::
       basic_auth.py
       httpresponse.py
       simple.py
-      sanitizing.py
       tasks.py
       advanced/auth.py
       advanced/sync.py
@@ -108,7 +105,6 @@ The ``mygpo`` project consists of the following files ::
       advanced/episode.py
       advanced/directory.py
       management/commands/__init__.py
-      management/commands/sanitizing-maintenance.py
       management/__init__.py
 
     directory/
index c1d70a9..c62d764 100644 (file)
@@ -38,14 +38,12 @@ from django.conf import settings as dsettings
 
 from mygpo.api.constants import EPISODE_ACTION_TYPES, DEVICE_TYPES
 from mygpo.api.httpresponse import JsonResponse
-from mygpo.api.sanitizing import sanitize_url, sanitize_urls
 from mygpo.api.advanced.directory import episode_data, podcast_data
 from mygpo.api.backend import get_device, BulkSubscribe
 from mygpo.utils import parse_time, format_time, parse_bool, get_timestamp, \
-    parse_request_body
+    parse_request_body, normalize_feed_url
 from mygpo.decorators import allowed_methods, repeat_on_conflict
 from mygpo.core import models
-from mygpo.core.models import SanitizingRule, Podcast
 from mygpo.core.tasks import auto_flattr_episode
 from mygpo.users.models import PodcastUserState, EpisodeAction, \
      EpisodeUserState, DeviceDoesNotExist, DeviceUIDException, \
@@ -139,8 +137,8 @@ def update_subscriptions(user, device, add, remove):
         if a in remove:
             raise ValueError('can not add and remove %s at the same time' % a)
 
-    add_s = list(sanitize_urls(add, 'podcast'))
-    rem_s = list(sanitize_urls(remove, 'podcast'))
+    add_s = map(normalize_feed_url, add)
+    rem_s = map(normalize_feed_url, remove)
 
     assert len(add) == len(add_s) and len(remove) == len(rem_s)
 
@@ -361,12 +359,12 @@ def update_episodes(user, actions, now, ua_string):
     for action in actions:
 
         podcast_url = action['podcast']
-        podcast_url = sanitize_append(podcast_url, 'podcast', update_urls)
+        podcast_url = sanitize_append(podcast_url, update_urls)
         if podcast_url == '':
             continue
 
         episode_url = action['episode']
-        episode_url = sanitize_append(episode_url, 'episode', update_urls)
+        episode_url = sanitize_append(episode_url, update_urls)
         if episode_url == '':
             continue
 
@@ -641,8 +639,8 @@ def favorites(request, username):
     return JsonResponse(ret)
 
 
-def sanitize_append(url, obj_type, sanitized_list):
-    urls = sanitize_url(url, obj_type)
+def sanitize_append(url, sanitized_list):
+    urls = normalize_feed_url(url)
     if url != urls:
-        sanitized_list.append( (url, urls) )
+        sanitized_list.append( (url, urls or '') )
     return urls
index 35032ec..c2fd2c0 100644 (file)
@@ -23,11 +23,10 @@ from django.views.decorators.cache import cache_page
 
 from mygpo.core import models
 from mygpo.core.models import Podcast, PodcastGroup
-from mygpo.utils import parse_range
+from mygpo.utils import parse_range, normalize_feed_url
 from mygpo.directory.tags import Topics
 from mygpo.web.utils import get_episode_link_target, get_podcast_link_target
 from mygpo.api.httpresponse import JsonResponse
-from mygpo.api.sanitizing import sanitize_url
 from mygpo.db.couchdb.episode import episode_for_podcast_url
 from mygpo.db.couchdb.podcast import podcast_by_id, podcast_for_url
 from mygpo.db.couchdb.directory import category_for_tag
@@ -58,7 +57,7 @@ def tag_podcasts(request, tag, count):
 
 @cache_page(60 * 60)
 def podcast_info(request):
-    url = sanitize_url(request.GET.get('url', ''))
+    url = normalize_feed_url(request.GET.get('url', ''))
 
     # 404 before we query for url, because query would complain
     # about missing param
@@ -76,8 +75,8 @@ def podcast_info(request):
 
 @cache_page(60 * 60)
 def episode_info(request):
-    podcast_url = sanitize_url(request.GET.get('podcast', ''))
-    episode_url = sanitize_url(request.GET.get('url', ''), 'episode')
+    podcast_url = normalize_feed_url(request.GET.get('podcast', ''))
+    episode_url = normalize_feed_url(request.GET.get('url', ''))
 
     # 404 before we query for url, because query would complain
     # about missing parameters
index 1d446c4..27fd9d8 100644 (file)
@@ -27,10 +27,9 @@ from django.views.decorators.cache import never_cache
 from mygpo.core import models
 from mygpo.api.httpresponse import JsonResponse
 from mygpo.api.exceptions import ParameterMissing
-from mygpo.api.sanitizing import sanitize_url
 from mygpo.api.backend import get_device
 from mygpo.users.models import Chapter
-from mygpo.utils import parse_time, parse_request_body
+from mygpo.utils import parse_time, parse_request_body, normalize_feed_url
 from mygpo.decorators import allowed_methods
 from mygpo.api.basic_auth import require_valid_user, check_username
 from mygpo.db.couchdb.episode import episode_for_podcast_url
@@ -61,16 +60,16 @@ def chapters(request, username):
         update_urls = []
 
         # podcast sanitizing
-        s_podcast_url = sanitize_url(podcast_url)
+        s_podcast_url = normalize_feed_url(podcast_url)
         if s_podcast_url != podcast_url:
             req['podcast'] = s_podcast_url
-            update_urls.append((podcast_url, s_podcast_url))
+            update_urls.append((podcast_url, s_podcast_url or ''))
 
         # episode sanitizing
-        s_episode_url = sanitize_url(episode_url, 'episode')
+        s_episode_url = normalize_feed_url(episode_url, 'episode')
         if s_episode_url != episode_url:
             req['episode'] = s_episode_url
-            update_urls.append((episode_url, s_episode_url))
+            update_urls.append((episode_url, s_episode_url or ''))
 
         if (s_podcast_url != '') and (s_episode_url != ''):
             try:
@@ -99,8 +98,8 @@ def chapters(request, username):
         except ValueError:
             return HttpResponseBadRequest('since-value is not a valid timestamp')
 
-        podcast_url = sanitize_url(podcast_url)
-        episode_url = sanitize_url(episode_url, 'episode')
+        podcast_url = normalize_feed_url(podcast_url)
+        episode_url = normalize_feed_url(episode_url)
         episode = episode_for_podcast_url(podcast_url, episode_url)
 
         if episode is None:
@@ -137,8 +136,8 @@ def chapters(request, username):
 
 
 def update_chapters(req, user):
-    podcast_url = sanitize_url(req['podcast'])
-    episode_url = sanitize_url(req['episode'], 'episode')
+    podcast_url = normalize_feed_url(req['podcast'])
+    episode_url = normalize_feed_url(req['episode'])
 
     episode = episode_for_podcast_url(podcast_url, episode_url,
             create=True)
index dd80771..b70e968 100644 (file)
@@ -22,12 +22,12 @@ from django.utils.datastructures import MultiValueDictKeyError
 from django.views.decorators.csrf import csrf_exempt
 from django.views.decorators.cache import never_cache
 
-from mygpo.api.sanitizing import sanitize_urls
 from mygpo.users.models import User
 from mygpo.api.opml import Importer, Exporter
 from mygpo.core.models import Podcast, SubscriptionException
 from mygpo.api.backend import get_device
 from mygpo.db.couchdb.podcast import podcast_for_url
+from mygpo.utils import normalize_feed_url
 
 import logging
 logger = logging.getLogger(__name__)
@@ -60,8 +60,8 @@ def upload(request):
     i = Importer(opml)
 
     podcast_urls = [p['url'] for p in i.items]
-    podcast_urls = sanitize_urls(podcast_urls)
-    podcast_urls = filter(lambda x: x, podcast_urls)
+    podcast_urls = map(normalize_feed_url, podcast_urls)
+    podcast_urls = filter(None, podcast_urls)
 
     new = [u for u in podcast_urls if u not in existing_urls]
     rem = [u for e in existing_urls if u not in podcast_urls]
diff --git a/mygpo/api/management/commands/sanitizing-maintenance.py b/mygpo/api/management/commands/sanitizing-maintenance.py
deleted file mode 100644 (file)
index 9cfca32..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-from optparse import make_option
-
-from django.core.management.base import BaseCommand
-
-from mygpo.api.sanitizing import maintenance
-
-
-class Command(BaseCommand):
-
-    option_list = BaseCommand.option_list + (
-        make_option('--dry-run', action='store_true', dest='dry_run', default=False, help="Don't rewrite anything, just print the stats afterwards."),
-        )
-
-
-    def handle(self, *args, **options):
-
-        maintenance(options.get('dry_run'))
diff --git a/mygpo/api/sanitizing.py b/mygpo/api/sanitizing.py
deleted file mode 100644 (file)
index 1bc9fe0..0000000
+++ /dev/null
@@ -1,185 +0,0 @@
-import collections
-import urlparse
-import re
-
-from django.core.cache import cache
-
-from mygpo.core import models
-from mygpo.utils import iterate_together, progress
-from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
-        all_podcasts
-from mygpo.db.couchdb.common import sanitizingrules_by_obj_type
-
-import logging
-logger = logging.getLogger(__name__)
-
-
-def sanitize_urls(urls, obj_type='podcast'):
-    """ Apply sanitizing rules to the given URLs and return the results """
-
-    return [sanitize_url(url, obj_type) for url in urls]
-
-
-def sanitize_url(url, obj_type='podcast'):
-    """ Apply sanitizing rules to the given URL and return the results """
-
-    rules = sanitizingrules_by_obj_type(obj_type)
-    url = basic_sanitizing(url)
-    url = apply_sanitizing_rules(url, rules)
-    return url
-
-
-
-def basic_sanitizing(url):
-    """
-    does basic sanitizing through urlparse and additionally converts the netloc to lowercase
-    """
-    r = urlparse.urlsplit(url)
-    netloc = r.netloc.lower()
-    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
-    return r2.geturl()
-
-
-def apply_sanitizing_rules(url, rules):
-    """
-    applies all url sanitizing rules to the given url
-    setting podcast=True uses only those rules which have use_podcast set to True.
-    When passing podcast=False this check is ommitted. The same is valid
-    for episode.
-    """
-
-    for rule in rules:
-
-        orig = url
-
-        # check for precompiled regex first
-        if hasattr(rule, 'search_re'):
-            url = rule.search_re.sub(rule.replace, url)
-        else:
-            url = re.sub(rule.search, rule.replace, url)
-
-        if orig != url:
-            c = getattr(rule, 'hits', 0)
-            rule.hits = c+1
-
-    return url
-
-
-def maintenance(dry_run=False):
-    """
-    This currently checks how many podcasts could be removed by
-    applying both basic sanitizing rules and those from the database.
-
-    This will later be used to replace podcasts!
-    """
-
-    podcast_rules = sanitizingrules_by_obj_type('podcast')
-    episode_rules = sanitizingrules_by_obj_type('episode')
-
-    num_podcasts = podcast_count()
-
-    print 'Stats'
-    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
-    if dry_run:
-        print ' * dry run - nothing will be written to the database'
-    print
-
-    print 'precompiling regular expressions'
-
-    podcast_rules = list(precompile_rules(podcast_rules))
-    episode_rules = list(precompile_rules(episode_rules))
-
-    p_stats = collections.defaultdict(int)
-    e_stats = collections.defaultdict(int)
-
-    podcasts = all_podcasts()
-
-    for n, p in enumerate(podcasts):
-        su = sanitize_url(p.url, rules=podcast_rules)
-
-        # nothing to do
-        if su == p.url:
-            p_stats['unchanged'] += 1
-            continue
-
-        # invalid podcast, remove
-        if su == '':
-            if not dry_run:
-                p.delete()
-            p_stats['deleted'] += 1
-
-        su_podcast = podcast_for_url(url=su)
-
-        if not su_podcast:
-            # "target" podcast does not exist, we simply change the url
-            if not dry_run:
-                logger.info('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
-                p.url = su
-                p.save()
-
-            p_stats['updated'] += 1
-            continue
-
-        # nothing to do
-        if p == su_podcast:
-            p_stats['unchanged'] += 1
-            continue
-
-        # last option - merge podcasts
-        if not dry_run:
-            rewrite_podcasts(p, su_podcast)
-            p.delete()
-
-        p_stats['merged'] += 1
-
-        progress(n+1, num_podcasts, str(p.id))
-
-    print 'finished %s podcasts' % (n+1)
-    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
-    print 'Hits'
-    for _, r in podcast_rules:
-        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
-
-
-def rewrite_podcasts(p_old, p_new):
-
-    logger.info('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url))
-
-    rewrite_newpodcast(p_old, p_new)
-
-def rewrite_newpodcast(p_old, p_new):
-    p_n = podcast_for_oldid(p_new.id)
-    p_o = podcast_for_oldid(p_old.id)
-
-    if None in (p_n, p_o):
-        return
-
-
-    # merge subscriber data
-    subscribers = []
-    compare = lambda a, b: cmp(a.timestamp, b.timestamp)
-    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):
-
-        # we assume that the new podcast has much more subscribers
-        # taking only count of the old podcast would look like a drop
-        if None in (n, o):
-            continue
-
-        subscribers.append(
-                models.SubscriberData(
-                    timestamp = o.timestamp,
-                    subscriber_count = n.subscriber_count + \
-                                       n.subscriber_count if n else 0\
-                )
-            )
-
-    p_n.subscribers = subscribers
-
-    p_n.save()
-    p_o.delete()
-
-
-def precompile_rules(rules):
-    for rule in rules:
-        rule.search_re = re.compile(rule.search, re.UNICODE)
-        yield rule
index 8f15a28..3712f29 100644 (file)
@@ -37,13 +37,12 @@ from mygpo.core.models import Podcast
 from mygpo.users.models import Suggestions
 from mygpo.api.opml import Exporter, Importer
 from mygpo.api.httpresponse import JsonResponse
-from mygpo.api.sanitizing import sanitize_urls
 from mygpo.directory.toplist import PodcastToplist
 from mygpo.directory.models import ExamplePodcasts
 from mygpo.api.advanced.directory import podcast_data
 from mygpo.directory.search import search_podcasts
 from mygpo.decorators import allowed_methods
-from mygpo.utils import parse_range
+from mygpo.utils import parse_range, normalize_feed_url
 from mygpo.core.json import json, JSONDecodeError
 from mygpo.db.couchdb import BulkException
 from mygpo.db.couchdb.podcast import podcasts_by_id
@@ -201,7 +200,7 @@ def parse_subscription(raw_post_data, format):
         return []
 
 
-    urls = sanitize_urls(urls)
+    urls = map(normalize_feed_url, urls)
     urls = filter(None, urls)
     urls = set(urls)
     return urls
index 20dd189..206a045 100644 (file)
@@ -548,17 +548,3 @@ class PodcastGroup(Document, SlugMixin, OldIdMixin):
             return '%s %s (%s)' % (self.__class__.__name__, self._id[:10], self.oldid)
         else:
             return '%s %s' % (self.__class__.__name__, self._id[:10])
-
-
-
-class SanitizingRule(Document):
-    slug        = StringProperty()
-    applies_to  = StringListProperty()
-    search      = StringProperty()
-    replace     = StringProperty()
-    priority    = IntegerProperty()
-    description = StringProperty()
-
-
-    def __repr__(self):
-        return 'SanitizingRule %s' % self._id
index 76a0f6f..bb7d1f2 100644 (file)
@@ -1,53 +1,8 @@
-from mygpo.core.models import SanitizingRule
-from mygpo.cache import cache_result
 from mygpo.db.couchdb import get_main_database
 from mygpo.db import QueryParameterMissing
 from mygpo.db.couchdb.utils import multi_request_view
 
 
-class SanitizingRuleStub(object):
-    pass
-
-
-@cache_result(timeout=60*60)
-def sanitizingrules_by_obj_type(obj_type):
-
-    if not obj_type:
-        raise QueryParameterMissing('obj_type')
-
-    r = SanitizingRule.view('sanitizing_rules/by_target',
-            include_docs = True,
-            startkey     = [obj_type, None],
-            endkey       = [obj_type, {}],
-        )
-
-    return map(_wrap_rule, r)
-
-def _wrap_rule(rule):
-    obj = SanitizingRuleStub()
-    obj.slug = rule.slug
-    obj.applies_to = list(rule.applies_to)
-    obj.search = rule.search
-    obj.replace = rule.replace
-    obj.priority = rule.priority
-    obj.description = rule.description
-    return obj
-
-
-@cache_result(timeout=60*60)
-def sanitizingrule_for_slug(slug):
-
-    if not slug:
-        raise QueryParameterMissing('slug')
-
-    r = SanitizingRule.view('sanitizing_rules/by_slug',
-            include_docs=True,
-            key=slug,
-        )
-
-    return r.one() if r else None
-
-
 def missing_slug_count(doc_type, start, end):
 
     if not doc_type:
index d6ff14b..870c168 100644 (file)
@@ -11,7 +11,6 @@ from django.core.management.base import BaseCommand
 from django.conf import settings
 
 from mygpo.decorators import repeat_on_conflict
-from mygpo.core.models import SanitizingRule
 from mygpo.utils import progress
 
 
index 1d6494f..500769a 100644 (file)
@@ -10,7 +10,6 @@ from django.core.management.base import BaseCommand
 from django.conf import settings
 
 from mygpo.decorators import repeat_on_conflict
-from mygpo.core.models import SanitizingRule
 from mygpo.utils import progress
 
 
index 7163787..ff74f5f 100644 (file)
@@ -1,6 +1,5 @@
-from mygpo.utils import is_url
+from mygpo.utils import is_url, normalize_feed_url
 from mygpo.data.feeddownloader import PodcastUpdater, NoPodcastCreated
-from mygpo.api.sanitizing import sanitize_url
 from mygpo.cache import cache_result
 from mygpo.db.couchdb.podcast import podcast_for_url, search
 
@@ -9,7 +8,7 @@ from mygpo.db.couchdb.podcast import podcast_for_url, search
 def search_podcasts(q, limit=20, skip=0):
 
     if is_url(q):
-        url = sanitize_url(q)
+        url = normalize_feed_url(q)
 
         podcast = podcast_for_url(url, create=False)
 
diff --git a/mygpo/maintenance/management/commands/delete-sanitizing-rules.py b/mygpo/maintenance/management/commands/delete-sanitizing-rules.py
deleted file mode 100755 (executable)
index 6544b14..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys
-import ConfigParser
-
-from django.core.management.base import BaseCommand
-
-from mygpo.decorators import repeat_on_conflict
-from mygpo.utils import progress
-from mygpo.db.couchdb.common import sanitizingrule_for_slug
-
-
-
-class Command(BaseCommand):
-    """
-    """
-
-    def handle(self, *args, **options):
-
-        if not args:
-            print >> sys.stderr, "Usage: ./manage.py delete-sanitizing-rules <slug> [<slug2> ...]"
-            return
-
-
-        for n, slug in enumerate(args):
-            rule = sanitizingrule_for_slug(slug)
-
-            if rule:
-                self.delete_rule(rule=rule)
-
-            progress(n+1, len(args))
-
-
-    @repeat_on_conflict(['rule'])
-    def delete_rule(self, rule):
-        rule.delete()
diff --git a/mygpo/maintenance/management/commands/sync-sanitizing-rules.py b/mygpo/maintenance/management/commands/sync-sanitizing-rules.py
deleted file mode 100755 (executable)
index ce0c64c..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-import sys
-import ConfigParser
-
-from django.core.management.base import BaseCommand
-
-from mygpo.decorators import repeat_on_conflict
-from mygpo.core.models import SanitizingRule
-from mygpo.utils import progress
-from mygpo.db.couchdb.common import sanitizingrule_for_slug
-
-
-
-class Command(BaseCommand):
-    """
-    """
-
-    def handle(self, *args, **options):
-
-        if not args:
-            print >> sys.stderr, "Usage: ./manage.py sync-sanitizing-rules <filename> [<filename2> ...]"
-            return
-
-
-        for filename in args:
-            config = ConfigParser.ConfigParser()
-            config.read(filename)
-            sections = config.sections()
-
-            for n, slug in enumerate(sections):
-                rule = sanitizingrule_for_slug(slug) or SanitizingRule()
-
-                self.update_rule(rule=rule, config=config, slug=slug)
-
-                progress(n+1, len(sections), filename)
-
-
-    @repeat_on_conflict(['rule'])
-    def update_rule(self, rule, config, slug):
-        rule.slug = slug
-        rule.applies_to = []
-        if config.getboolean(slug, 'podcast'):
-            rule.applies_to.append('podcast')
-
-        if config.getboolean(slug, 'episode'):
-            rule.applies_to.append('episode')
-
-        rule.search = config.get(slug, 'search')
-        rule.replace = config.get(slug, 'replace')
-        rule.priority = config.getint(slug, 'priority')
-        rule.description = config.get(slug, 'description')
-        rule.save()
index ec0cb05..c482404 100644 (file)
@@ -878,3 +878,107 @@ def parse_request_body(request):
         raw_body = zlib.decompress(raw_body)
 
     return json.loads(raw_body)
+
+
+def normalize_feed_url(url):
+    """
+    Converts any URL to http://, https://, ftp:// or file:// so that it
+    can be used with "wget". If the URL cannot be converted (invalid,
+    shorter than 8 characters, or unknown scheme), "None" is returned.
+
+    This will also normalize feed://, itpc:// and itms:// to http://.
+
+    >>> normalize_feed_url('itpc://example.org/podcast.rss')
+    'http://example.org/podcast.rss'
+
+    If no URL scheme is defined (e.g. "curry.com"), we will
+    simply assume the user intends to add a http:// feed.
+
+    >>> normalize_feed_url('curry.com')
+    'http://curry.com/'
+
+    There are even some more shortcuts for advanced users
+    and lazy typists (see the source for details).
+
+    >>> normalize_feed_url('fb:43FPodcast')
+    'http://feeds.feedburner.com/43FPodcast'
+
+    It will also take care of converting the domain name to
+    all-lowercase (because domains are not case sensitive):
+
+    >>> normalize_feed_url('http://Example.COM/')
+    'http://example.com/'
+
+    Some other minimalistic changes are also taken care of,
+    e.g. a ? with an empty query is removed:
+
+    >>> normalize_feed_url('http://example.org/test?')
+    'http://example.org/test'
+
+    Leading and trailing whitespace is removed
+
+    >>> normalize_feed_url(' http://example.com/podcast.rss ')
+    'http://example.com/podcast.rss'
+
+    HTTP Authentication is removed to protect users' privacy
+
+    >>> normalize_feed_url('http://a@b:c@host.com/')
+    'http://host.com/'
+    >>> normalize_feed_url('ftp://a:b:c@host.com/')
+    'ftp://host.com/'
+    >>> normalize_feed_url('http://i%2Fo:P%40ss%3A@host.com/')
+    'http://host.com/'
+    >>> normalize_feed_url('ftp://%C3%B6sterreich@host.com/')
+    'ftp://host.com/'
+    >>> normalize_feed_url('http://w%20x:y%20z@example.org/')
+    'http://example.org/'
+    >>> normalize_feed_url('http://example.com/x@y:z@test.com/')
+    'http://example.com/x@y:z@test.com/'
+    """
+    url = url.strip()
+    if not url or len(url) < 8:
+        return None
+
+    # This is a list of prefixes that you can use to minimize the amount of
+    # keystrokes that you have to use.
+    # Feel free to suggest other useful prefixes, and I'll add them here.
+    PREFIXES = {
+            'fb:': 'http://feeds.feedburner.com/%s',
+            'yt:': 'http://www.youtube.com/rss/user/%s/videos.rss',
+            'sc:': 'http://soundcloud.com/%s',
+            'fm4od:': 'http://onapp1.orf.at/webcam/fm4/fod/%s.xspf',
+            # YouTube playlists. To get a list of playlists per-user, use:
+            # https://gdata.youtube.com/feeds/api/users/<username>/playlists
+            'ytpl:': 'http://gdata.youtube.com/feeds/api/playlists/%s',
+    }
+
+    for prefix, expansion in PREFIXES.iteritems():
+        if url.startswith(prefix):
+            url = expansion % (url[len(prefix):],)
+            break
+
+    # Assume HTTP for URLs without scheme
+    if not '://' in url:
+        url = 'http://' + url
+
+    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
+
+    # Schemes and domain names are case insensitive
+    scheme, netloc = scheme.lower(), netloc.lower()
+
+    # Remove authentication to protect users' privacy
+    netloc = netloc.rsplit('@', 1)[-1]
+
+    # Normalize empty paths to "/"
+    if path == '':
+        path = '/'
+
+    # feed://, itpc:// and itms:// are really http://
+    if scheme in ('feed', 'itpc', 'itms'):
+        scheme = 'http'
+
+    if scheme not in ('http', 'https', 'ftp', 'file'):
+        return None
+
+    # urlunsplit might return "a slightly different, but equivalent URL"
+    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
index 0369adc..037f98c 100644 (file)
@@ -13,7 +13,7 @@ from django.views.decorators.cache import never_cache, cache_control
 from mygpo.core.models import PodcastGroup, SubscriptionException
 from mygpo.core.proxy import proxy_object
 from mygpo.core.tasks import flattr_thing
-from mygpo.api.sanitizing import sanitize_url
+from mygpo.utils import normalize_feed_url
 from mygpo.users.settings import PUBLIC_SUB_PODCAST, FLATTR_TOKEN
 from mygpo.users.models import HistoryEntry, DeviceDoesNotExist, SubscriptionAction
 from mygpo.web.forms import SyncForm
@@ -319,9 +319,9 @@ def subscribe_url(request):
     if not url:
         raise Http404('http://my.gpodder.org/subscribe?url=http://www.example.com/podcast.xml')
 
-    url = sanitize_url(url)
+    url = normalize_feed_url(url)
 
-    if url == '':
+    if not url:
         raise Http404('Please specify a valid url')
 
     podcast = podcast_for_url(url, create=True)
diff --git a/sanitizing-rules.ini b/sanitizing-rules.ini
deleted file mode 100644 (file)
index 97b70fd..0000000
+++ /dev/null
@@ -1,324 +0,0 @@
-# This file contains rules to rewrite Podcast and Episode URLs
-# All rules should be given in the following format
-#
-#[some-unique-slug]
-#podcast=1       1 if the rule applies to podcast URLs, otherwise 0
-#episode=1       1 if the rule applies to episode URLs, otherwise 0
-#search=regex    search-regex that should be replaced
-#replace=regex   string with which the search string should be replaced; may contain references
-#priority=x      rules are applied in order of increasing priority
-#description     text describing the rule, possibly mentioning a bug
-#
-
-[feedburner-feeds2]
-podcast=1
-episode=1
-search=feeds2\.feedburner\.com
-replace=feeds.feedburner.com
-priority=1
-description=Rewriting for feedburner should happen as "feeds2.feedburner.com" -> "feeds.feedburner.com"
-
-[feedburner-format]
-podcast=1
-episode=1
-search=(?P<unchanged>feedburner\.com.+)\?format=xml
-replace=\g<unchanged>
-priority=2
-description=Feedburner URLs should have their "?format=xml" query string removed
-
-[remove-leading-whitespace]
-podcast=1
-episode=1
-search=^\s+
-replace=
-priority=0
-description=Remove leading whitespace
-
-[remove-trailing-whitespace]
-podcast=1
-episode=1
-search=\s+$
-replace=
-priority=0
-description=Remove trailing whitespace
-
-[unknown-protocol]
-podcast=1
-episode=1
-search=^[^(https?):].+
-replace=
-priority=100
-description=Empty any string that doesn't start with either http or https
-
-[feedburner-trailing-slash]
-podcast=1
-episode=0
-search=(?P<unchanged>feedburner\.com.+)\/$
-replace=\g<unchanged>
-priority=2
-description=Feedburner URLs sometimes have a trailing slash, which can be removed safely
-
-[non-ascii]
-podcast=1
-episode=1
-search=^.*[^\x20-\x7E].*$
-replace=
-priority=50
-description=Remove URLs with non-ASCII characters
-
-[twit-podcasts]
-podcast=1
-episode=0
-search=^http://leoville\.tv/podcasts/(?P<podcast>\w+)\.xml$
-replace=http://leo.am/podcasts/\g<podcast>
-priority=10
-description=Rewrite URLs of TWiT Podcasts because most users use a URL that is going to break soon (bug 885)
-
-[hardcore-history-old-url]
-podcast=1
-episode=0
-search=^http://www\.dancarlin\.com/dchh\.xml$
-replace=http://feeds.feedburner.com/dancarlin/history
-priority=10
-description=Rewrite podcast URL of Dan Carlin's Hardcore History because the old URL doesn't work anymore (bug 855)
-
-[spaces]
-podcast=1
-episode=1
-search=^.*\s.*$
-replace=
-priority=10
-description=All URLs that contain spaces are considered invalid
-
-[libsyn-podcasts]
-podcast=0
-episode=1
-search=http://media.libsyn.com/media/(?P<res>.*)$
-replace=http://traffic.libsyn.com/\g<res>
-priority=10
-description=Update to the new URL for libsyn Podcasts (Learn Japanese with Beb and Alex)
-
-[abc-podcasts]
-podcast=1
-episode=0
-search=^http://site\.abc\.go\.com/abc/xml/podcastRSS\?(.*&)?feedPublishKey=(?P<key>\d+)(&.*)?$
-replace=http://a.abc.com/abc/xml/podcastRSS?feedPublishKey=\g<key>
-priority=100
-description=Merge URLs for ABC Podcasts (bug 977)
-
-[remove-http-auth]
-podcast=1
-episode=1
-search=^(?P<protocol>[a-zA-Z])://[-_\w]+(:[^@]+)?@(?P<rest>.+)$
-replace=\g<protocol>://\g<rest>
-priority=20
-description=Remove HTTP-Authentication from URLs
-
-[rpod-ru-parameters]
-podcast=0
-episode=1
-search=^(?P<unchanged>http://rpod\.ru/personal/.+\.mp[34])\?[0-9a-z]+$
-replace=\g<unchanged>
-priority=100
-description=
-
-[collegehumor]
-podcast=0
-episode=1
-search=^http://\d+\.media\.collegehumor\.com/(?P<unchanged>.+)$
-replace=http://1.media.collegehumor.com/\g<unchanged>
-priority=100
-description=
-
-[shot-of-jaq-merge]
-podcast=1
-episode=0
-search=(?i)^http://feeds.feedburner.com/ShotOfJaq$
-replace=http://shotofjaq.org/feed/
-priority=100
-description=Merges Shot of Jaq feeds to the URL given on their Website
-
-[shot-of-jaq-trailing-slash]
-podcast=1
-episode=0
-search=http://shotofjaq.org/feed$
-replace=http://shotofjaq.org/feed/
-priority=100
-description=Add trailing slash to Shot of Jaq feed URL
-
-[shot-of-jaq-feedburner]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/ShotOfJaqOGG$
-replace=http://feeds.feedburner.com/ShotOfJaqOgg
-priority=100
-description=Unify all Feedburner URLs for the Shot of Jaq Ogg feed (seems it doesn't have a shotofjaq.org URL)
-
-[ted-talks-video]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/TEDTalks_video$
-replace=http://feeds.feedburner.com/tedtalks_video
-priority=100
-description=Unify all TEDTalks podcasts
-
-[ted-talks-audio]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/TEDTalks_audio$
-replace=http://feeds.feedburner.com/tedtalks_audio
-priority=100
-description=Unify all TEDTalks podcasts
-
-[ted-talks-hd]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/TedtalksHD$
-replace=http://feeds.feedburner.com/tedtalkshd
-priority=100
-description=Unify all TEDTalks podcasts
-
-[mintcast-feedburner]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/mintcastpodcast$
-replace=http://www.mintcast.org/feed/podcast/
-priority=100
-description=Unify mintCast feeds (bug 1035)
-
-[mintcast]
-podcast=1
-episode=0
-search=(?i)^http://www.mintcast.org/feed/$
-replace=http://www.mintcast.org/feed/podcast/
-priority=100
-description=Unify mintCast feeds (bug 1035)
-
-[crankygeeks-feedburner]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/ziffdavis/crankygeekspodcast$
-replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast
-priority=100
-description=Unify Cranky Geeks Podcasts (MP3) (bug 1032)
-
-[crankygeeks-xml]
-podcast=1
-episode=0
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.audio\.xml/?$
-replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast
-priority=100
-description=Unify Cranky Geeks Podcasts (MP3) (bug 1032)
-
-[crankygeeks-slash]
-podcast=1
-episode=0
-search=^http://feeds\.ziffdavis\.com/ziffdavis/crankygeekspodcast/$
-replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast
-priority=100
-description=Unify Cranky Geeks Podcasts (MP3) (bug 1032)
-
-[crankygeeks-video]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/ziffdavis/cgh264video$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgh264video
-priority=100
-description=Unify Cranky Geeks Podcasts (H.264) (bug 1032)
-
-[crankygeeks-xml-video]
-podcast=1
-episode=0
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.h\.264\.xml$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgh264video
-priority=100
-description=Unify Cranky Geeks Podcasts (H.264) (bug 1032)
-
-[crankygeeks-xml-h264]
-podcast=1
-episode=0
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.ipod\.xml$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (iPod Video) (bug 1032)
-
-[crankygeeks-xml-mp4]
-podcast=1
-episode=0
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.mp4\.xml$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (iPod Video) (bug 1032)
-
-[crankygeeks-ipod]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/ziffdavis/cgipodvideo$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (iPod Video) (bug 1032)
-
-[crankygeeks-mpeg4]
-podcast=1
-episode=0
-search=^http://feeds\.ziffdavis\.com/ziffdavis/cgmpeg4video/$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (iPod Video) (bug 1032)
-
-[crankygeeks-ipod-cgipod-video]
-podcast=1
-episode=0
-search=^http://feeds\.ziffdavis\.com/ziffdavis/cgipodvideo\?format=xml$
-replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (iPod Video) (bug 1032)
-
-[crankygeeks-wmv]
-podcast=1
-episode=0
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.wmv\.xml$
-replace=http://feeds.feedburner.com/ziffdavis/cgwmvvideo
-priority=100
-description=Unify Cranky Geeks Podcasts (WMV) (bug 1032)
-
-[no-agenda]
-podcast=1
-episode=0
-search=^http://noagenda\.podshow\.com/feed$
-replace=http://www.mevio.com/feeds/noagenda.xml
-priority=100
-description=Unify No Agenda feeds
-
-[escape-pods]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/EscapePod$
-replace=http://escapepod.org/feed/
-priority=100
-description=Unify Escape Pod Feeds
-
-[hacker-public-radio]
-podcast=1
-episode=0
-search=http://hackerpublicradio\.org/(?P<res>.*)$
-replace=http://www.hackerpublicradio.org/\g<res>
-priority=100
-description=Unify Hacker Public Radio (bug 1090)
-
-[hacker-medley]
-podcast=1
-episode=0
-search=(?i)^http://feeds\.feedburner\.com/HackerMedley$
-replace=http://hackermedley.org/feed/podcast/
-priority=100
-description=Unify Hacker Medley Podcast
-
-[phones-show]
-podcast=1
-episode=0
-search=.*http://3lib\.ukonline\.co\.uk/sshow/sshowchat\.rss.*
-replace=http://stevelitchfield.com/sshow/sshowchat.rss
-priority=100
-description=Rewrite old URL of The Phones Show (by request of Steve Litchfield on 2011-04-01)
-