add missing None-check
[mygpo.git] / mygpo / api / sanitizing.py
blobae96cb44e71106274c5e03e3245a3068d8c5327a
1 import collections
3 from django.core.cache import cache
5 from mygpo.core import models
6 from mygpo.api.models import Podcast, Episode
7 from mygpo.log import log
8 from mygpo.utils import iterate_together, progress
9 import urlparse
10 import re
def sanitize_urls(urls, obj_type='podcast', rules=None):
    """ Lazily sanitize each URL in urls with the rules for obj_type """
    active_rules = get_sanitizing_rules(obj_type, rules)
    return (sanitize_url(u, rules=active_rules) for u in urls)
def sanitize_url(url, obj_type='podcast', rules=None):
    """ Sanitize a single URL according to the rules for obj_type """
    active_rules = get_sanitizing_rules(obj_type, rules=rules)
    # normalize first, then run the (possibly precompiled) rewrite rules
    return apply_sanitizing_rules(basic_sanitizing(url), active_rules)
def get_sanitizing_rules(obj_type, rules=None):
    """ Returns the sanitizing-rules from the cache or the database

    If rules are passed in explicitly they are returned unchanged and
    the cache is left alone.  The previous version re-wrote the cache
    entry on *every* call, so caller-supplied rules (e.g. the
    precompiled ones used by maintenance()) ended up cached, and even
    plain cache hits were redundantly written back. """

    if rules:
        return rules

    cache_name = '%s-sanitizing-rules' % obj_type

    sanitizing_rules = cache.get(cache_name)
    if sanitizing_rules is None:
        sanitizing_rules = list(models.SanitizingRule.for_obj_type(obj_type))
        # cache for one hour; only rules freshly loaded from the db are cached
        cache.set(cache_name, sanitizing_rules, 60 * 60)

    return sanitizing_rules
def basic_sanitizing(url):
    """
    Basic sanitizing through urlparse: splits and re-joins the URL,
    lower-casing the network-location (host) component on the way.
    """
    parts = urlparse.urlsplit(url)
    rebuilt = urlparse.SplitResult(parts.scheme, parts.netloc.lower(),
            parts.path, parts.query, parts.fragment)
    return rebuilt.geturl()
def apply_sanitizing_rules(url, rules):
    """
    Applies all sanitizing rules to the given URL and returns the result.

    A rule with a precompiled pattern in its ``search_re`` attribute is
    applied via that pattern; otherwise its ``search`` string is used
    with re.sub.  Every rule that actually changes the URL gets its
    ``hits`` counter incremented.
    """
    for rule in rules:

        pattern = getattr(rule, 'search_re', None)
        if pattern is not None:
            new_url = pattern.sub(rule.replace, url)
        else:
            new_url = re.sub(rule.search, rule.replace, url)

        # track how often this rule actually matched
        if new_url != url:
            rule.hits = getattr(rule, 'hits', 0) + 1

        url = new_url

    return url
80 def maintenance(dry_run=False):
81 """
82 This currently checks how many podcasts could be removed by
83 applying both basic sanitizing rules and those from the database.
85 This will later be used to replace podcasts!
86 """
88 podcast_rules = get_sanitizing_rules('podcast')
89 episode_rules = get_sanitizing_rules('episode')
91 num_podcasts = Podcast.objects.count()
92 num_episodes = Episode.objects.count()
94 print 'Stats'
95 print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
96 print ' * %d episodes - %d rules' % (num_episodes, len(episode_rules))
97 if dry_run:
98 print ' * dry run - nothing will be written to the database'
99 print
101 print 'precompiling regular expressions'
103 podcast_rules = list(precompile_rules(podcast_rules))
104 episode_rules = list(precompile_rules(episode_rules))
106 p_stats = collections.defaultdict(int)
107 e_stats = collections.defaultdict(int)
109 podcasts = Podcast.objects.only('id', 'url').order_by('id').iterator()
111 for n, p in enumerate(podcasts):
112 try:
113 su = sanitize_url(p.url, rules=podcast_rules)
114 except Exception, e:
115 log('failed to sanitize url for podcast %s: %s' % (p.id, e))
116 print 'failed to sanitize url for podcast %s: %s' % (p.id, e)
117 p_stats['error'] += 1
118 continue
120 # nothing to do
121 if su == p.url:
122 p_stats['unchanged'] += 1
123 continue
125 # invalid podcast, remove
126 if su == '':
127 try:
128 if not dry_run:
129 p.delete()
130 p_stats['deleted'] += 1
132 except Exception, e:
133 log('failed to delete podcast %s: %s' % (p.id, e))
134 print 'failed to delete podcast %s: %s' % (p.id, e)
135 p_stats['error'] += 1
137 continue
139 try:
140 su_podcast = Podcast.objects.get(url=su)
142 except Podcast.DoesNotExist, e:
143 # "target" podcast does not exist, we simply change the url
144 if not dry_run:
145 log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
146 p.url = su
147 p.save()
149 p_stats['updated'] += 1
150 continue
152 # nothing to do
153 if p == su_podcast:
154 p_stats['unchanged'] += 1
155 continue
157 # last option - merge podcasts
158 try:
159 if not dry_run:
160 rewrite_podcasts(p, su_podcast)
161 p.delete()
163 p_stats['merged'] += 1
165 except Exception, e:
166 log('error rewriting podcast %s: %s' % (p.id, e))
167 print 'error rewriting podcast %s: %s' % (p.id, e)
168 p_stats['error'] += 1
169 continue
171 progress(n+1, num_podcasts, str(p.id))
173 print 'finished %s podcasts' % (n+1)
174 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
175 print 'Hits'
176 for _, r in podcast_rules:
177 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
179 episodes = Episode.objects.only('id', 'url').order_by('id').iterator()
180 for e in episodes:
181 try:
182 su = sanitize_url(e.url, rules=episode_rules)
183 except Exception, ex:
184 log('failed to sanitize url for episode %s: %s' % (e.id, ex))
185 print 'failed to sanitize url for episode %s: %s' % (e.id, ex)
186 e_stats['error'] += 1
187 continue
189 # nothing to do
190 if su == e.url:
191 e_stats['unchanged'] += 1
192 continue
194 # invalid episode, remove
195 if su == '':
196 try:
197 if not dry_run:
198 e.delete()
200 e_stats['deleted'] += 1
201 except Exception, ex:
202 log('failed to delete episode %s: %s' % (e.id, ex))
203 print 'failed to delete episode %s: %s' % (e.id, ex)
204 e_stats['error'] += 1
206 continue
208 try:
209 su_episode = Episode.objects.get(url=su, podcast=e.podcast)
211 except Episode.DoesNotExist, ex:
212 # "target" episode does not exist, we simply change the url
213 if not dry_run:
214 log('updating episode %s - "%s" => "%s"' % (e.id, e.url, su))
215 e.url = su
216 e.save()
218 e_stats['updated'] += 1
219 continue
221 # nothing to do
222 if e == su_episode:
223 e_stats['unchanged'] += 1
224 continue
227 # last option - merge episodes
228 try:
229 if not dry_run:
230 e.delete()
232 e_stats['merged'] += 1
234 except Exception, ex:
235 log('error rewriting episode %s: %s' % (e.id, ex))
236 print 'error rewriting episode %s: %s' % (e.id, ex)
237 e_stats['error'] += 1
238 continue
240 progress(n+1, num_episodes, str(e.id))
242 print 'finished %s episodes' % num_episodes
243 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % e_stats
244 print
245 print 'finished %s podcasts' % num_podcasts
246 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
247 print
248 print 'Hits'
249 for _, r in episode_rules:
250 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merges podcast p_old into p_new, logging the operation """

    msg = 'merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url)
    log(msg)

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merges the CouchDB podcast of p_old into that of p_new

    Combines the subscriber data of both podcasts, saves the new
    podcast and deletes the old one.  Does nothing when either podcast
    has no CouchDB counterpart.
    """
    p_n = models.Podcast.for_oldid(p_new.id)
    p_o = models.Podcast.for_oldid(p_old.id)

    # fix: was the undefined name "Nont", which raised a NameError
    # instead of performing the intended None-check
    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together(p_n.subscribers, p_o.subscribers):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # NOTE(review): this doubles the new podcast's count and never
        # uses o.subscriber_count; also "if n" is always true after the
        # None-check above -- looks suspicious, confirm intended math
        subscribers.append(
            models.SubscriberData(
                timestamp = o.timestamp,
                subscriber_count = n.subscriber_count + \
                    n.subscriber_count if n else 0
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
def precompile_rules(rules):
    """ Yields each rule with its pattern compiled onto rule.search_re """
    for r in rules:
        # UNICODE flag matches the behavior expected by apply_sanitizing_rules
        r.search_re = re.compile(r.search, re.UNICODE)
        yield r