From a55a1af62ab3e149cc662c0102cb4328b9991602 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Stefan=20K=C3=B6gl?= Date: Mon, 27 May 2013 21:02:35 +0200 Subject: [PATCH] replace sanitizing rules with gPodder's normalize_feed_url() --- .../_design/sanitizing_rules/views/by_slug/map.js | 7 - .../sanitizing_rules/views/by_target/map.js | 10 - doc/dev/couchdb-docs.rst | 17 -- doc/dev/couchdb-views.rst | 11 - doc/dev/files-overview.rst | 4 - mygpo/api/advanced/__init__.py | 18 +- mygpo/api/advanced/directory.py | 9 +- mygpo/api/advanced/episode.py | 19 +- mygpo/api/legacy.py | 6 +- .../management/commands/sanitizing-maintenance.py | 17 -- mygpo/api/sanitizing.py | 185 ------------ mygpo/api/simple.py | 5 +- mygpo/core/models.py | 14 - mygpo/db/couchdb/common.py | 45 --- .../couchdb/management/commands/compact-couchdb.py | 1 - .../management/commands/touch-couchdb-views.py | 1 - mygpo/directory/search.py | 5 +- .../management/commands/delete-sanitizing-rules.py | 34 --- .../management/commands/sync-sanitizing-rules.py | 51 ---- mygpo/utils.py | 104 +++++++ mygpo/web/views/podcast.py | 6 +- sanitizing-rules.ini | 324 --------------------- 22 files changed, 135 insertions(+), 758 deletions(-) delete mode 100644 couchdb/general/_design/sanitizing_rules/views/by_slug/map.js delete mode 100644 couchdb/general/_design/sanitizing_rules/views/by_target/map.js delete mode 100644 mygpo/api/management/commands/sanitizing-maintenance.py delete mode 100644 mygpo/api/sanitizing.py delete mode 100755 mygpo/maintenance/management/commands/delete-sanitizing-rules.py delete mode 100755 mygpo/maintenance/management/commands/sync-sanitizing-rules.py delete mode 100644 sanitizing-rules.ini diff --git a/couchdb/general/_design/sanitizing_rules/views/by_slug/map.js b/couchdb/general/_design/sanitizing_rules/views/by_slug/map.js deleted file mode 100644 index e17c674e..00000000 --- a/couchdb/general/_design/sanitizing_rules/views/by_slug/map.js +++ /dev/null @@ -1,7 +0,0 @@ -function(doc) -{ - 
if(doc.doc_type == "SanitizingRule") - { - emit(doc.slug, null); - } -} diff --git a/couchdb/general/_design/sanitizing_rules/views/by_target/map.js b/couchdb/general/_design/sanitizing_rules/views/by_target/map.js deleted file mode 100644 index 8d3c453d..00000000 --- a/couchdb/general/_design/sanitizing_rules/views/by_target/map.js +++ /dev/null @@ -1,10 +0,0 @@ -function(doc) -{ - if(doc.doc_type == "SanitizingRule") - { - for(n in doc.applies_to) - { - emit([doc.applies_to[n], doc.priority], null); - } - } -} diff --git a/doc/dev/couchdb-docs.rst b/doc/dev/couchdb-docs.rst index 3f376777..9fa51c6c 100644 --- a/doc/dev/couchdb-docs.rst +++ b/doc/dev/couchdb-docs.rst @@ -272,23 +272,6 @@ Example Document :: } -SanitizingRule --------------- - -Contains a URL Sanitizing Rule. - -Example Document :: - - { - slug: "feedburner-feeds2", - applies_to: ["podcast", "episode"], - search: "http://feeds2\.feedburner\.com", - replace_by: "http://feeds.feedburner.com", - descriptions: "Replace {feeds2 => feeds}.feedburner.com", - priority: 100, - } - - Category -------- diff --git a/doc/dev/couchdb-views.rst b/doc/dev/couchdb-views.rst index 4996a8b0..144e5273 100644 --- a/doc/dev/couchdb-views.rst +++ b/doc/dev/couchdb-views.rst @@ -183,17 +183,6 @@ Doc-Types: Podcast, PodcastGroup, PodcastSubscriberData * `podcasts/subscriber_data `_ -Sanitizing Rules -^^^^^^^^^^^^^^^^ - -Doc-Types: SanitizingRule - -**Views** - -* `sanitizing_rules/by_slug `_ -* `sanitizing_rules/by_target `_ - - Slugs ^^^^^ diff --git a/doc/dev/files-overview.rst b/doc/dev/files-overview.rst index 7478d7ea..fc05233b 100644 --- a/doc/dev/files-overview.rst +++ b/doc/dev/files-overview.rst @@ -39,8 +39,6 @@ The ``mygpo`` project consists of the following files :: management/commands/merge-episode-states.py # merges duplicates of episodes states management/commands/move-subscriber-data.py # moves subscriber data from podcasts into separate objects management/commands/import-episode-actions.py # imports 
episode actions from files - management/commands/sync-sanitizing-rules.py # synchronizes sanitizing rules to the database - management/commands/delete-sanitizing-rules.py # deletes sanitizing rules data/ # stuff related to podcast and episode data youtube.py # utils for accessing YouTube data @@ -98,7 +96,6 @@ The ``mygpo`` project consists of the following files :: basic_auth.py httpresponse.py simple.py - sanitizing.py tasks.py advanced/auth.py advanced/sync.py @@ -108,7 +105,6 @@ The ``mygpo`` project consists of the following files :: advanced/episode.py advanced/directory.py management/commands/__init__.py - management/commands/sanitizing-maintenance.py management/__init__.py directory/ diff --git a/mygpo/api/advanced/__init__.py b/mygpo/api/advanced/__init__.py index c1d70a90..c62d7640 100644 --- a/mygpo/api/advanced/__init__.py +++ b/mygpo/api/advanced/__init__.py @@ -38,14 +38,12 @@ from django.conf import settings as dsettings from mygpo.api.constants import EPISODE_ACTION_TYPES, DEVICE_TYPES from mygpo.api.httpresponse import JsonResponse -from mygpo.api.sanitizing import sanitize_url, sanitize_urls from mygpo.api.advanced.directory import episode_data, podcast_data from mygpo.api.backend import get_device, BulkSubscribe from mygpo.utils import parse_time, format_time, parse_bool, get_timestamp, \ - parse_request_body + parse_request_body, normalize_feed_url from mygpo.decorators import allowed_methods, repeat_on_conflict from mygpo.core import models -from mygpo.core.models import SanitizingRule, Podcast from mygpo.core.tasks import auto_flattr_episode from mygpo.users.models import PodcastUserState, EpisodeAction, \ EpisodeUserState, DeviceDoesNotExist, DeviceUIDException, \ @@ -139,8 +137,8 @@ def update_subscriptions(user, device, add, remove): if a in remove: raise ValueError('can not add and remove %s at the same time' % a) - add_s = list(sanitize_urls(add, 'podcast')) - rem_s = list(sanitize_urls(remove, 'podcast')) + add_s = 
map(normalize_feed_url, add) + rem_s = map(normalize_feed_url, remove) assert len(add) == len(add_s) and len(remove) == len(rem_s) @@ -361,12 +359,12 @@ def update_episodes(user, actions, now, ua_string): for action in actions: podcast_url = action['podcast'] - podcast_url = sanitize_append(podcast_url, 'podcast', update_urls) + podcast_url = sanitize_append(podcast_url, update_urls) if podcast_url == '': continue episode_url = action['episode'] - episode_url = sanitize_append(episode_url, 'episode', update_urls) + episode_url = sanitize_append(episode_url, update_urls) if episode_url == '': continue @@ -641,8 +639,8 @@ def favorites(request, username): return JsonResponse(ret) -def sanitize_append(url, obj_type, sanitized_list): - urls = sanitize_url(url, obj_type) +def sanitize_append(url, sanitized_list): + urls = normalize_feed_url(url) if url != urls: - sanitized_list.append( (url, urls) ) + sanitized_list.append( (url, urls or '') ) return urls diff --git a/mygpo/api/advanced/directory.py b/mygpo/api/advanced/directory.py index 35032ec0..c2fd2c01 100644 --- a/mygpo/api/advanced/directory.py +++ b/mygpo/api/advanced/directory.py @@ -23,11 +23,10 @@ from django.views.decorators.cache import cache_page from mygpo.core import models from mygpo.core.models import Podcast, PodcastGroup -from mygpo.utils import parse_range +from mygpo.utils import parse_range, normalize_feed_url from mygpo.directory.tags import Topics from mygpo.web.utils import get_episode_link_target, get_podcast_link_target from mygpo.api.httpresponse import JsonResponse -from mygpo.api.sanitizing import sanitize_url from mygpo.db.couchdb.episode import episode_for_podcast_url from mygpo.db.couchdb.podcast import podcast_by_id, podcast_for_url from mygpo.db.couchdb.directory import category_for_tag @@ -58,7 +57,7 @@ def tag_podcasts(request, tag, count): @cache_page(60 * 60) def podcast_info(request): - url = sanitize_url(request.GET.get('url', '')) + url = 
normalize_feed_url(request.GET.get('url', '')) # 404 before we query for url, because query would complain # about missing param @@ -76,8 +75,8 @@ def podcast_info(request): @cache_page(60 * 60) def episode_info(request): - podcast_url = sanitize_url(request.GET.get('podcast', '')) - episode_url = sanitize_url(request.GET.get('url', ''), 'episode') + podcast_url = normalize_feed_url(request.GET.get('podcast', '')) + episode_url = normalize_feed_url(request.GET.get('url', '')) # 404 before we query for url, because query would complain # about missing parameters diff --git a/mygpo/api/advanced/episode.py b/mygpo/api/advanced/episode.py index 1d446c4e..27fd9d80 100644 --- a/mygpo/api/advanced/episode.py +++ b/mygpo/api/advanced/episode.py @@ -27,10 +27,9 @@ from django.views.decorators.cache import never_cache from mygpo.core import models from mygpo.api.httpresponse import JsonResponse from mygpo.api.exceptions import ParameterMissing -from mygpo.api.sanitizing import sanitize_url from mygpo.api.backend import get_device from mygpo.users.models import Chapter -from mygpo.utils import parse_time, parse_request_body +from mygpo.utils import parse_time, parse_request_body, normalize_feed_url from mygpo.decorators import allowed_methods from mygpo.api.basic_auth import require_valid_user, check_username from mygpo.db.couchdb.episode import episode_for_podcast_url @@ -61,16 +60,16 @@ def chapters(request, username): update_urls = [] # podcast sanitizing - s_podcast_url = sanitize_url(podcast_url) + s_podcast_url = normalize_feed_url(podcast_url) if s_podcast_url != podcast_url: req['podcast'] = s_podcast_url - update_urls.append((podcast_url, s_podcast_url)) + update_urls.append((podcast_url, s_podcast_url or '')) # episode sanitizing - s_episode_url = sanitize_url(episode_url, 'episode') + s_episode_url = normalize_feed_url(episode_url, 'episode') if s_episode_url != episode_url: req['episode'] = s_episode_url - update_urls.append((episode_url, s_episode_url)) + 
update_urls.append((episode_url, s_episode_url or '')) if (s_podcast_url != '') and (s_episode_url != ''): try: @@ -99,8 +98,8 @@ def chapters(request, username): except ValueError: return HttpResponseBadRequest('since-value is not a valid timestamp') - podcast_url = sanitize_url(podcast_url) - episode_url = sanitize_url(episode_url, 'episode') + podcast_url = normalize_feed_url(podcast_url) + episode_url = normalize_feed_url(episode_url) episode = episode_for_podcast_url(podcast_url, episode_url) if episode is None: @@ -137,8 +136,8 @@ def chapters(request, username): def update_chapters(req, user): - podcast_url = sanitize_url(req['podcast']) - episode_url = sanitize_url(req['episode'], 'episode') + podcast_url = normalize_feed_url(req['podcast']) + episode_url = normalize_feed_url(req['episode']) episode = episode_for_podcast_url(podcast_url, episode_url, create=True) diff --git a/mygpo/api/legacy.py b/mygpo/api/legacy.py index dd807715..b70e968f 100644 --- a/mygpo/api/legacy.py +++ b/mygpo/api/legacy.py @@ -22,12 +22,12 @@ from django.utils.datastructures import MultiValueDictKeyError from django.views.decorators.csrf import csrf_exempt from django.views.decorators.cache import never_cache -from mygpo.api.sanitizing import sanitize_urls from mygpo.users.models import User from mygpo.api.opml import Importer, Exporter from mygpo.core.models import Podcast, SubscriptionException from mygpo.api.backend import get_device from mygpo.db.couchdb.podcast import podcast_for_url +from mygpo.utils import normalize_feed_url import logging logger = logging.getLogger(__name__) @@ -60,8 +60,8 @@ def upload(request): i = Importer(opml) podcast_urls = [p['url'] for p in i.items] - podcast_urls = sanitize_urls(podcast_urls) - podcast_urls = filter(lambda x: x, podcast_urls) + podcast_urls = map(normalize_feed_url, podcast_urls) + podcast_urls = filter(None, podcast_urls) new = [u for u in podcast_urls if u not in existing_urls] rem = [u for e in existing_urls if u not in 
podcast_urls] diff --git a/mygpo/api/management/commands/sanitizing-maintenance.py b/mygpo/api/management/commands/sanitizing-maintenance.py deleted file mode 100644 index 9cfca321..00000000 --- a/mygpo/api/management/commands/sanitizing-maintenance.py +++ /dev/null @@ -1,17 +0,0 @@ -from optparse import make_option - -from django.core.management.base import BaseCommand - -from mygpo.api.sanitizing import maintenance - - -class Command(BaseCommand): - - option_list = BaseCommand.option_list + ( - make_option('--dry-run', action='store_true', dest='dry_run', default=False, help="Don't rewrite anything, just print the stats afterwards."), - ) - - - def handle(self, *args, **options): - - maintenance(options.get('dry_run')) diff --git a/mygpo/api/sanitizing.py b/mygpo/api/sanitizing.py deleted file mode 100644 index 1bc9fe04..00000000 --- a/mygpo/api/sanitizing.py +++ /dev/null @@ -1,185 +0,0 @@ -import collections -import urlparse -import re - -from django.core.cache import cache - -from mygpo.core import models -from mygpo.utils import iterate_together, progress -from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \ - all_podcasts -from mygpo.db.couchdb.common import sanitizingrules_by_obj_type - -import logging -logger = logging.getLogger(__name__) - - -def sanitize_urls(urls, obj_type='podcast'): - """ Apply sanitizing rules to the given URLs and return the results """ - - return [sanitize_url(url, obj_type) for url in urls] - - -def sanitize_url(url, obj_type='podcast'): - """ Apply sanitizing rules to the given URL and return the results """ - - rules = sanitizingrules_by_obj_type(obj_type) - url = basic_sanitizing(url) - url = apply_sanitizing_rules(url, rules) - return url - - - -def basic_sanitizing(url): - """ - does basic sanitizing through urlparse and additionally converts the netloc to lowercase - """ - r = urlparse.urlsplit(url) - netloc = r.netloc.lower() - r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '') - return 
r2.geturl() - - -def apply_sanitizing_rules(url, rules): - """ - applies all url sanitizing rules to the given url - setting podcast=True uses only those rules which have use_podcast set to True. - When passing podcast=False this check is ommitted. The same is valid - for episode. - """ - - for rule in rules: - - orig = url - - # check for precompiled regex first - if hasattr(rule, 'search_re'): - url = rule.search_re.sub(rule.replace, url) - else: - url = re.sub(rule.search, rule.replace, url) - - if orig != url: - c = getattr(rule, 'hits', 0) - rule.hits = c+1 - - return url - - -def maintenance(dry_run=False): - """ - This currently checks how many podcasts could be removed by - applying both basic sanitizing rules and those from the database. - - This will later be used to replace podcasts! - """ - - podcast_rules = sanitizingrules_by_obj_type('podcast') - episode_rules = sanitizingrules_by_obj_type('episode') - - num_podcasts = podcast_count() - - print 'Stats' - print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules)) - if dry_run: - print ' * dry run - nothing will be written to the database' - print - - print 'precompiling regular expressions' - - podcast_rules = list(precompile_rules(podcast_rules)) - episode_rules = list(precompile_rules(episode_rules)) - - p_stats = collections.defaultdict(int) - e_stats = collections.defaultdict(int) - - podcasts = all_podcasts() - - for n, p in enumerate(podcasts): - su = sanitize_url(p.url, rules=podcast_rules) - - # nothing to do - if su == p.url: - p_stats['unchanged'] += 1 - continue - - # invalid podcast, remove - if su == '': - if not dry_run: - p.delete() - p_stats['deleted'] += 1 - - su_podcast = podcast_for_url(url=su) - - if not su_podcast: - # "target" podcast does not exist, we simply change the url - if not dry_run: - logger.info('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su)) - p.url = su - p.save() - - p_stats['updated'] += 1 - continue - - # nothing to do - if p == su_podcast: - 
p_stats['unchanged'] += 1 - continue - - # last option - merge podcasts - if not dry_run: - rewrite_podcasts(p, su_podcast) - p.delete() - - p_stats['merged'] += 1 - - progress(n+1, num_podcasts, str(p.id)) - - print 'finished %s podcasts' % (n+1) - print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats - print 'Hits' - for _, r in podcast_rules: - print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0) - - -def rewrite_podcasts(p_old, p_new): - - logger.info('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url)) - - rewrite_newpodcast(p_old, p_new) - -def rewrite_newpodcast(p_old, p_new): - p_n = podcast_for_oldid(p_new.id) - p_o = podcast_for_oldid(p_old.id) - - if None in (p_n, p_o): - return - - - # merge subscriber data - subscribers = [] - compare = lambda a, b: cmp(a.timestamp, b.timestamp) - for n, o in iterate_together([p_n.subscribers, p_o.subscribers]): - - # we assume that the new podcast has much more subscribers - # taking only count of the old podcast would look like a drop - if None in (n, o): - continue - - subscribers.append( - models.SubscriberData( - timestamp = o.timestamp, - subscriber_count = n.subscriber_count + \ - n.subscriber_count if n else 0\ - ) - ) - - p_n.subscribers = subscribers - - p_n.save() - p_o.delete() - - -def precompile_rules(rules): - for rule in rules: - rule.search_re = re.compile(rule.search, re.UNICODE) - yield rule diff --git a/mygpo/api/simple.py b/mygpo/api/simple.py index 8f15a285..3712f294 100644 --- a/mygpo/api/simple.py +++ b/mygpo/api/simple.py @@ -37,13 +37,12 @@ from mygpo.core.models import Podcast from mygpo.users.models import Suggestions from mygpo.api.opml import Exporter, Importer from mygpo.api.httpresponse import JsonResponse -from mygpo.api.sanitizing import sanitize_urls from mygpo.directory.toplist import PodcastToplist from mygpo.directory.models import 
ExamplePodcasts from mygpo.api.advanced.directory import podcast_data from mygpo.directory.search import search_podcasts from mygpo.decorators import allowed_methods -from mygpo.utils import parse_range +from mygpo.utils import parse_range, normalize_feed_url from mygpo.core.json import json, JSONDecodeError from mygpo.db.couchdb import BulkException from mygpo.db.couchdb.podcast import podcasts_by_id @@ -201,7 +200,7 @@ def parse_subscription(raw_post_data, format): return [] - urls = sanitize_urls(urls) + urls = map(normalize_feed_url, urls) urls = filter(None, urls) urls = set(urls) return urls diff --git a/mygpo/core/models.py b/mygpo/core/models.py index 20dd189c..206a0454 100644 --- a/mygpo/core/models.py +++ b/mygpo/core/models.py @@ -548,17 +548,3 @@ class PodcastGroup(Document, SlugMixin, OldIdMixin): return '%s %s (%s)' % (self.__class__.__name__, self._id[:10], self.oldid) else: return '%s %s' % (self.__class__.__name__, self._id[:10]) - - - -class SanitizingRule(Document): - slug = StringProperty() - applies_to = StringListProperty() - search = StringProperty() - replace = StringProperty() - priority = IntegerProperty() - description = StringProperty() - - - def __repr__(self): - return 'SanitizingRule %s' % self._id diff --git a/mygpo/db/couchdb/common.py b/mygpo/db/couchdb/common.py index 76a0f6f7..bb7d1f23 100644 --- a/mygpo/db/couchdb/common.py +++ b/mygpo/db/couchdb/common.py @@ -1,53 +1,8 @@ -from mygpo.core.models import SanitizingRule -from mygpo.cache import cache_result from mygpo.db.couchdb import get_main_database from mygpo.db import QueryParameterMissing from mygpo.db.couchdb.utils import multi_request_view -class SanitizingRuleStub(object): - pass - - -@cache_result(timeout=60*60) -def sanitizingrules_by_obj_type(obj_type): - - if not obj_type: - raise QueryParameterMissing('obj_type') - - r = SanitizingRule.view('sanitizing_rules/by_target', - include_docs = True, - startkey = [obj_type, None], - endkey = [obj_type, {}], - ) - - return 
map(_wrap_rule, r) - -def _wrap_rule(rule): - obj = SanitizingRuleStub() - obj.slug = rule.slug - obj.applies_to = list(rule.applies_to) - obj.search = rule.search - obj.replace = rule.replace - obj.priority = rule.priority - obj.description = rule.description - return obj - - -@cache_result(timeout=60*60) -def sanitizingrule_for_slug(slug): - - if not slug: - raise QueryParameterMissing('slug') - - r = SanitizingRule.view('sanitizing_rules/by_slug', - include_docs=True, - key=slug, - ) - - return r.one() if r else None - - def missing_slug_count(doc_type, start, end): if not doc_type: diff --git a/mygpo/db/couchdb/management/commands/compact-couchdb.py b/mygpo/db/couchdb/management/commands/compact-couchdb.py index d6ff14b0..870c1689 100644 --- a/mygpo/db/couchdb/management/commands/compact-couchdb.py +++ b/mygpo/db/couchdb/management/commands/compact-couchdb.py @@ -11,7 +11,6 @@ from django.core.management.base import BaseCommand from django.conf import settings from mygpo.decorators import repeat_on_conflict -from mygpo.core.models import SanitizingRule from mygpo.utils import progress diff --git a/mygpo/db/couchdb/management/commands/touch-couchdb-views.py b/mygpo/db/couchdb/management/commands/touch-couchdb-views.py index 1d6494f2..500769aa 100644 --- a/mygpo/db/couchdb/management/commands/touch-couchdb-views.py +++ b/mygpo/db/couchdb/management/commands/touch-couchdb-views.py @@ -10,7 +10,6 @@ from django.core.management.base import BaseCommand from django.conf import settings from mygpo.decorators import repeat_on_conflict -from mygpo.core.models import SanitizingRule from mygpo.utils import progress diff --git a/mygpo/directory/search.py b/mygpo/directory/search.py index 71637876..ff74f5f1 100644 --- a/mygpo/directory/search.py +++ b/mygpo/directory/search.py @@ -1,6 +1,5 @@ -from mygpo.utils import is_url +from mygpo.utils import is_url, normalize_feed_url from mygpo.data.feeddownloader import PodcastUpdater, NoPodcastCreated -from mygpo.api.sanitizing 
import sanitize_url from mygpo.cache import cache_result from mygpo.db.couchdb.podcast import podcast_for_url, search @@ -9,7 +8,7 @@ from mygpo.db.couchdb.podcast import podcast_for_url, search def search_podcasts(q, limit=20, skip=0): if is_url(q): - url = sanitize_url(q) + url = normalize_feed_url(q) podcast = podcast_for_url(url, create=False) diff --git a/mygpo/maintenance/management/commands/delete-sanitizing-rules.py b/mygpo/maintenance/management/commands/delete-sanitizing-rules.py deleted file mode 100755 index 6544b145..00000000 --- a/mygpo/maintenance/management/commands/delete-sanitizing-rules.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -import ConfigParser - -from django.core.management.base import BaseCommand - -from mygpo.decorators import repeat_on_conflict -from mygpo.utils import progress -from mygpo.db.couchdb.common import sanitizingrule_for_slug - - - -class Command(BaseCommand): - """ - """ - - def handle(self, *args, **options): - - if not args: - print >> sys.stderr, "Usage: ./manage.py delete-sanitizing-rules [ ...]" - return - - - for n, slug in enumerate(args): - rule = sanitizingrule_for_slug(slug) - - if rule: - self.delete_rule(rule=rule) - - progress(n+1, len(args)) - - - @repeat_on_conflict(['rule']) - def delete_rule(self, rule): - rule.delete() diff --git a/mygpo/maintenance/management/commands/sync-sanitizing-rules.py b/mygpo/maintenance/management/commands/sync-sanitizing-rules.py deleted file mode 100755 index ce0c64c1..00000000 --- a/mygpo/maintenance/management/commands/sync-sanitizing-rules.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -import ConfigParser - -from django.core.management.base import BaseCommand - -from mygpo.decorators import repeat_on_conflict -from mygpo.core.models import SanitizingRule -from mygpo.utils import progress -from mygpo.db.couchdb.common import sanitizingrule_for_slug - - - -class Command(BaseCommand): - """ - """ - - def handle(self, *args, **options): - - if not args: - print >> sys.stderr, 
"Usage: ./manage.py sync-sanitizing-rules [ ...]" - return - - - for filename in args: - config = ConfigParser.ConfigParser() - config.read(filename) - sections = config.sections() - - for n, slug in enumerate(sections): - rule = sanitizingrule_for_slug(slug) or SanitizingRule() - - self.update_rule(rule=rule, config=config, slug=slug) - - progress(n+1, len(sections), filename) - - - @repeat_on_conflict(['rule']) - def update_rule(self, rule, config, slug): - rule.slug = slug - rule.applies_to = [] - if config.getboolean(slug, 'podcast'): - rule.applies_to.append('podcast') - - if config.getboolean(slug, 'episode'): - rule.applies_to.append('episode') - - rule.search = config.get(slug, 'search') - rule.replace = config.get(slug, 'replace') - rule.priority = config.getint(slug, 'priority') - rule.description = config.get(slug, 'description') - rule.save() diff --git a/mygpo/utils.py b/mygpo/utils.py index ec0cb050..c4824049 100644 --- a/mygpo/utils.py +++ b/mygpo/utils.py @@ -878,3 +878,107 @@ def parse_request_body(request): raw_body = zlib.decompress(raw_body) return json.loads(raw_body) + + +def normalize_feed_url(url): + """ + Converts any URL to http:// or ftp:// so that it can be + used with "wget". If the URL cannot be converted (invalid + or unknown scheme), "None" is returned. + + This will also normalize feed:// and itpc:// to http://. + + >>> normalize_feed_url('itpc://example.org/podcast.rss') + 'http://example.org/podcast.rss' + + If no URL scheme is defined (e.g. "curry.com"), we will + simply assume the user intends to add a http:// feed. + + >>> normalize_feed_url('curry.com') + 'http://curry.com/' + + There are even some more shortcuts for advanced users + and lazy typists (see the source for details). 
+ + >>> normalize_feed_url('fb:43FPodcast') + 'http://feeds.feedburner.com/43FPodcast' + + It will also take care of converting the domain name to + all-lowercase (because domains are not case sensitive): + + >>> normalize_feed_url('http://Example.COM/') + 'http://example.com/' + + Some other minimalistic changes are also taken care of, + e.g. a ? with an empty query is removed: + + >>> normalize_feed_url('http://example.org/test?') + 'http://example.org/test' + + Leading and trailing whitespace is removed + + >>> normalize_feed_url(' http://example.com/podcast.rss ') + 'http://example.com/podcast.rss' + + HTTP Authentication is removed to protect users' privacy + + >>> normalize_feed_url('http://a@b:c@host.com/') + 'http://host.com/' + >>> normalize_feed_url('ftp://a:b:c@host.com/') + 'ftp://host.com/' + >>> normalize_feed_url('http://i%2Fo:P%40ss%3A@host.com/') + 'http://host.com/' + >>> normalize_feed_url('ftp://%C3%B6sterreich@host.com/') + 'ftp://host.com/' + >>> normalize_feed_url('http://w%20x:y%20z@example.org/') + 'http://example.org/' + >>> normalize_feed_url('http://example.com/x@y:z@test.com/') + 'http://example.com/x@y:z@test.com/' + """ + url = url.strip() + if not url or len(url) < 8: + return None + + # This is a list of prefixes that you can use to minimize the amount of + # keystrokes that you have to use. + # Feel free to suggest other useful prefixes, and I'll add them here. + PREFIXES = { + 'fb:': 'http://feeds.feedburner.com/%s', + 'yt:': 'http://www.youtube.com/rss/user/%s/videos.rss', + 'sc:': 'http://soundcloud.com/%s', + 'fm4od:': 'http://onapp1.orf.at/webcam/fm4/fod/%s.xspf', + # YouTube playlists. 
To get a list of playlists per-user, use: + # https://gdata.youtube.com/feeds/api/users/<username>/playlists + 'ytpl:': 'http://gdata.youtube.com/feeds/api/playlists/%s', + } + + for prefix, expansion in PREFIXES.iteritems(): + if url.startswith(prefix): + url = expansion % (url[len(prefix):],) + break + + # Assume HTTP for URLs without scheme + if not '://' in url: + url = 'http://' + url + + scheme, netloc, path, query, fragment = urlparse.urlsplit(url) + + # Schemes and domain names are case insensitive + scheme, netloc = scheme.lower(), netloc.lower() + + # Remove authentication to protect users' privacy + netloc = netloc.rsplit('@', 1)[-1] + + # Normalize empty paths to "/" + if path == '': + path = '/' + + # feed://, itpc:// and itms:// are really http:// + if scheme in ('feed', 'itpc', 'itms'): + scheme = 'http' + + if scheme not in ('http', 'https', 'ftp', 'file'): + return None + + # urlunsplit might return "a slightly different, but equivalent URL" + return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) diff --git a/mygpo/web/views/podcast.py b/mygpo/web/views/podcast.py index 0369adc4..037f98c1 100644 --- a/mygpo/web/views/podcast.py +++ b/mygpo/web/views/podcast.py @@ -13,7 +13,7 @@ from django.views.decorators.cache import never_cache, cache_control from mygpo.core.models import PodcastGroup, SubscriptionException from mygpo.core.proxy import proxy_object from mygpo.core.tasks import flattr_thing -from mygpo.api.sanitizing import sanitize_url +from mygpo.utils import normalize_feed_url from mygpo.users.settings import PUBLIC_SUB_PODCAST, FLATTR_TOKEN from mygpo.users.models import HistoryEntry, DeviceDoesNotExist, SubscriptionAction from mygpo.web.forms import SyncForm @@ -319,9 +319,9 @@ def subscribe_url(request): if not url: raise Http404('http://my.gpodder.org/subscribe?url=http://www.example.com/podcast.xml') - url = sanitize_url(url) + url = normalize_feed_url(url) - if url == '': + if not url: raise Http404('Please specify a valid url') 
podcast = podcast_for_url(url, create=True) diff --git a/sanitizing-rules.ini b/sanitizing-rules.ini deleted file mode 100644 index 97b70fd4..00000000 --- a/sanitizing-rules.ini +++ /dev/null @@ -1,324 +0,0 @@ -# This files contains rules to rewrite Podcast and Episode URLs -# All rules should be given in the following format -# -#[some-unique-slug] -#podcast=1 1 if the rule applies to podcast URLs, otherwise 0 -#episode=1 1 if the rule applies to episode URLs, otherwise 0 -#search=regex search-regex that should be replaced -#replace=regex string with which the search string should be replace. may contain references -#priority=x rules are applied in order of increasing priority -#description text describing the rule, possibly mentioning a bug -# - -[feedburner-feeds2] -podcast=1 -episode=1 -search=feeds2\.feedburner\.com -replace=feeds.feedburner.com -priority=1 -description=Rewriting for feedburner should happen as "feeds2.feedburner.com" -> "feeds.feedburner.com" - -[feedburner-format] -podcast=1 -episode=1 -search=(?Pfeedburner\.com.+)\?format=xml -replace=\g -priority=2 -description=Feedburner URLs should have their "?format=xml" query string removed - -[remove-leading-whitespace] -podcast=1 -episode=1 -search=^\s+ -replace= -priority=0 -description=Remove leading whitespaces - -[remove-trailing-whitespace] -podcast=1 -episode=1 -search=\s+$ -replace= -priority=0 -description=Remove trailing whitespaces - -[unknown-protocol] -podcast=1 -episode=1 -search=^[^(https?):].+ -replace= -priority=100 -description=Empty any string that doesn't start with either http or https - -[feedburner-trailing-slash] -podcast=1 -episode=0 -search=(?Pfeedburner\.com.+)\/$ -replace=\g -priority=2 -description=Feedburner URLs sometimes have a trailing slash, which can be removed safely - -[non-ascii] -podcast=1 -episode=1 -search=^.*[^\x20-\x7E].*$ -replace= -priority=50 -description=Remove URLs with non-ascii characters - -[twit-podcasts] -podcast=1 -episode=0 
-search=^http://leoville\.tv/podcasts/(?P\w+)\.xml$ -replace=http://leo.am/podcasts/\g -priority=10 -description=Rewrite URLs of TWiT Podcasts because most users use a URL that is going to break soon (bug 885) - -[hardcore-history-old-url] -podcast=1 -episode=0 -search=^http://www\.dancarlin\.com/dchh\.xml$ -replace=http://feeds.feedburner.com/dancarlin/history -priority=10 -description=Rewrite podcast URL of Dan Carlin's Hardcore History because the old URL doesn't work anymore (bug 855) - -[spaces] -podcast=1 -episode=1 -search=^.*\s.*$ -replace= -priority=10 -description=All URLs that contain spaces are considered invalid - -[libsyn-podcasts] -podcast=0 -episode=1 -search=http://media.libsyn.com/media/(?P.*)$ -replace=http://traffic.libsyn.com/\g -priority=10 -description=Update new URL for libsy Podcasts (Learn Japanese with Beb and Alex) - -[abc-podcasts] -podcast=1 -episode=0 -search=^http://site\.abc\.go\.com/abc/xml/podcastRSS\?(.*&)?feedPublishKey=(?P\d+)(&.*)?$ -replace=http://a.abc.com/abc/xml/podcastRSS?feedPublishKey=\g -priority=100 -description=Merge URLs for ABC Podcasts (bug 977) - -[remove-http-auth] -podcast=1 -episode=1 -search=^(?P[a-zA-Z])://[-_\w]+(:[^@]+)?@(?P.+)$ -replace=\g://\g -priority=20 -description=Remove HTTP-Authentication from URLs - -[rpod-ru-parameters] -podcast=0 -episode=1 -search=^(?Phttp://rpod\.ru/personal/.+\.mp[34])\?[0-9a-z]+$ -replace=\g -priority=100 -description= - -[collegehumor] -podcast=0 -episode=1 -search=^http://\d+\.media\.collegehumor\.com/(?P.+)$ -replace=http://1.media.collegehumor.com/\g -priority=100 -description= - -[shot-of-jaq-merge] -podcast=1 -episode=0 -search=(?i)^http://feeds.feedburner.com/ShotOfJaq$ -replace=http://shotofjaq.org/feed/ -priority=100 -description=Merges Shot of Jaq feeds to the URL given on their Website - -[shot-of-jaq-trailing-slash] -podcast=1 -episode=0 -search=http://shotofjaq.org/feed$ -replace=http://shotofjaq.org/feed/ -priority=100 -description=Add trailing slash to Shot 
of Jaq feed URL - -[shot-of-jaq-feedburner] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/ShotOfJaqOGG$ -replace=http://feeds.feedburner.com/ShotOfJaqOgg -priority=100 -description=Unify all Feedburner URLs for the Shot of Jaq Ogg feed (seems it doesn't have an shotofjaq.org url) - -[ted-talks-video] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/TEDTalks_video$ -replace=http://feeds.feedburner.com/tedtalks_video -priority=100 -description=Unify all TEDTalks podcasts - -[ted-talks-audio] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/TEDTalks_audio$ -replace=http://feeds.feedburner.com/tedtalks_audio -priority=100 -description=Unify all TEDTalks podcasts - -[ted-talks-hd] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/TedtalksHD$ -replace=http://feeds.feedburner.com/tedtalkshd -priority=100 -description=Unify all TEDTalks podcasts - -[mintcast-feedburner] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/mintcastpodcast$ -replace=http://www.mintcast.org/feed/podcast/ -priority=100 -description=Unify mintCast feeds (bug 1035 - -[mintcast] -podcast=1 -episode=0 -search=(?i)^http://www.mintcast.org/feed/$ -replace=http://www.mintcast.org/feed/podcast/ -priority=100 -description=Unify mintCast feeds (bug 1035 - -[crankygeeks-feedburner] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/ziffdavis/crankygeekspodcast$ -replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast -priority=100 -description=Unify Cranky Geed Podcasts (MP3) (bug 1032 - -[crankygeeks-xml] -podcast=1 -episode=0 -search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.audio\.xml/?$ -replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast -priority=100 -description=Unify Cranky Geed Podcasts (MP3) (bug 1032 - -[crankygeeks-slash] -podcast=1 -episode=0 -search=^http://feeds\.ziffdavis\.com/ziffdavis/crankygeekspodcast/$ 
-replace=http://feeds.ziffdavis.com/ziffdavis/crankygeekspodcast -priority=100 -description=Unify Cranky Geed Podcasts (MP3) (bug 1032 - -[crankygeeks-video] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/ziffdavis/cgh264video$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgh264video -priority=100 -description=Unify Cranky Geed Podcasts (H.264) (bug 1032 - -[crankygeeks-xml-video] -podcast=1 -episode=0 -search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.h\.264\.xml$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgh264video -priority=100 -description=Unify Cranky Geed Podcasts (H.264) (bug 1032 - -[crankygeeks-xml-h264] -podcast=1 -episode=0 -search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.ipod\.xml$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo -priority=100 -description=Unify Cranky Geed Podcasts (iPod Video) (bug 1032 - -[crankygeeks-xml-mp4] -podcast=1 -episode=0 -search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.mp4\.xml$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo -priority=100 -description=Unify Cranky Geed Podcasts (iPod Video) (bug 1032 - -[crankygeeks-ipod] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/ziffdavis/cgipodvideo$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo -priority=100 -description=Unify Cranky Geed Podcasts (iPod Video) (bug 1032 - -[crankygeeks-mpeg4] -podcast=1 -episode=0 -search=^http://feeds\.ziffdavis\.com/ziffdavis/cgmpeg4video/$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo -priority=100 -description=Unify Cranky Geed Podcasts (iPod Video) (bug 1032 - -[crankygeeks-ipod-cgipod-video] -podcast=1 -episode=0 -search=^http://feeds\.ziffdavis\.com/ziffdavis/cgipodvideo\?format=xml$ -replace=http://feeds.ziffdavis.com/ziffdavis/cgipodvideo -priority=100 -description=Unify Cranky Geed Podcasts (iPod Video) (bug 1032 - -[crankygeeks-wmv] -podcast=1 -episode=0 
-search=^http://rssnewsapps\.ziffdavis\.com/audioblogs/crankygeeks/cg\.wmv\.xml$ -replace=http://feeds.feedburner.com/ziffdavis/cgwmvvideo -priority=100 -description=Unify Cranky Geed Podcasts (WMV) (bug 1032 - -[no-agenda] -podcast=1 -episode=0 -search=^http://noagenda\.podshow\.com/feed$ -replace=http://www.mevio.com/feeds/noagenda.xml -priority=100 -description=Unify No Agend Feeds - -[escape-pods] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/EscapePod$ -replace=http://escapepod.org/feed/ -priority=100 -description=Unify Escape Pod Feeds - -[hacker-public-radio] -podcast=1 -episode=0 -search=http://hackerpublicradio\.org/(?P.*)$ -replace=http://www.hackerpublicradio.org/\g -priority=100 -description=Unify Hacker Public Radio (bug 1090) - -[hacker-medley] -podcast=1 -episode=0 -search=(?i)^http://feeds\.feedburner\.com/HackerMedley$ -replace=http://hackermedley.org/feed/podcast/ -priority=100 -description=Unify Hacker Medley Podcast - -[phones-show] -podcast=1 -episode=0 -search=.*http://3lib\.ukonline\.co\.uk/sshow/sshowchat\.rss.* -replace=http://stevelitchfield.com/sshow/sshowchat.rss -priority=100 -description=Rewrite old URL of The Phones Show (by request of Steve Litchfield on 2011-04-01) - -- 2.11.4.GIT