# mygpo/api/sanitizing.py -- URL sanitizing rules for the feed-downloader
# (from mygpo.git, blob d514f3a7bd0bc7f60bf41919280971f53e059e8e)
import collections
import re
import urlparse

from django.core.cache import cache

from mygpo.core import models
from mygpo.log import log
from mygpo.utils import iterate_together, progress
from mygpo.db.couchdb.podcast import (podcast_count, podcast_for_oldid,
    podcast_for_url, all_podcasts)
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type
def sanitize_urls(urls, obj_type='podcast'):
    """ Sanitize every URL in *urls*.

    Returns a list with the sanitized version of each URL,
    in the same order as the input.
    """
    return [sanitize_url(u, obj_type) for u in urls]
def sanitize_url(url, obj_type='podcast'):
    """ Sanitize a single URL.

    First applies the basic normalization, then all sanitizing rules
    configured in the database for the given *obj_type*.
    """
    rules = sanitizingrules_by_obj_type(obj_type)
    return apply_sanitizing_rules(basic_sanitizing(url), rules)
31 def basic_sanitizing(url):
32 """
33 does basic sanitizing through urlparse and additionally converts the netloc to lowercase
34 """
35 r = urlparse.urlsplit(url)
36 netloc = r.netloc.lower()
37 r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
38 return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """
    Applies all given sanitizing rules to the given URL.

    Each rule provides a regular expression -- either precompiled as
    ``search_re`` or as a pattern string ``search`` -- and a
    replacement string ``replace``.  Whenever a rule changes the URL,
    its ``hits`` counter is incremented (created on the first hit).
    Returns the resulting URL.
    """
    for rule in rules:
        orig = url

        # prefer the precompiled regex when it is available
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        # count how often each rule actually modified a URL
        if orig != url:
            rule.hits = getattr(rule, 'hits', 0) + 1

    return url
66 def maintenance(dry_run=False):
67 """
68 This currently checks how many podcasts could be removed by
69 applying both basic sanitizing rules and those from the database.
71 This will later be used to replace podcasts!
72 """
74 podcast_rules = sanitizingrules_by_obj_type('podcast')
75 episode_rules = sanitizingrules_by_obj_type('episode')
77 num_podcasts = podcast_count()
79 print 'Stats'
80 print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
81 if dry_run:
82 print ' * dry run - nothing will be written to the database'
83 print
85 print 'precompiling regular expressions'
87 podcast_rules = list(precompile_rules(podcast_rules))
88 episode_rules = list(precompile_rules(episode_rules))
90 p_stats = collections.defaultdict(int)
91 e_stats = collections.defaultdict(int)
93 podcasts = all_podcasts()
95 for n, p in enumerate(podcasts):
96 su = sanitize_url(p.url, rules=podcast_rules)
98 # nothing to do
99 if su == p.url:
100 p_stats['unchanged'] += 1
101 continue
103 # invalid podcast, remove
104 if su == '':
105 if not dry_run:
106 p.delete()
107 p_stats['deleted'] += 1
109 su_podcast = podcast_for_url(url=su)
111 if not su_podcast:
112 # "target" podcast does not exist, we simply change the url
113 if not dry_run:
114 log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
115 p.url = su
116 p.save()
118 p_stats['updated'] += 1
119 continue
121 # nothing to do
122 if p == su_podcast:
123 p_stats['unchanged'] += 1
124 continue
126 # last option - merge podcasts
127 if not dry_run:
128 rewrite_podcasts(p, su_podcast)
129 p.delete()
131 p_stats['merged'] += 1
133 progress(n+1, num_podcasts, str(p.id))
135 print 'finished %s podcasts' % (n+1)
136 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
137 print 'Hits'
138 for _, r in podcast_rules:
139 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merge the data of podcast p_old into the correct podcast p_new """

    log('merging podcast %s "%s" to correct podcast %s "%s"' %
        (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merge the subscriber data of p_old into p_new and delete p_old.

    Does nothing when either podcast cannot be resolved via its old id.
    """
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # bugfix: the original summed n.subscriber_count with itself;
        # the merged count is the sum of both podcasts' subscribers
        subscribers.append(
            models.SubscriberData(
                timestamp=o.timestamp,
                subscriber_count=n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
def precompile_rules(rules):
    """ Yield every rule with its search pattern precompiled.

    The compiled pattern is stored on the rule as ``search_re``
    (compiled with re.UNICODE) so apply_sanitizing_rules() can use
    it instead of recompiling the pattern for every URL.
    """
    for r in rules:
        r.search_re = re.compile(r.search, re.UNICODE)
        yield r