dd39cbfb0b3c76065f554436b4346dd54b9ebb69
[mygpo.git] / mygpo / api / sanitizing.py
blobdd39cbfb0b3c76065f554436b4346dd54b9ebb69
1 from mygpo.api.models import URLSanitizingRule, Podcast, ToplistEntry, SuggestionEntry, SubscriptionAction, SubscriptionMeta, Subscription, Episode, EpisodeAction, EpisodeToplistEntry
2 from mygpo.api.models.episodes import Chapter
3 from mygpo.api.models.users import EpisodeFavorite
4 from mygpo.data.models import BackendSubscription, Listener, HistoricPodcastData, PodcastTag
5 from mygpo.log import log
6 import urlparse
7 import re
8 import sys
def sanitize_url(url, podcast=True, episode=False, rules=None):
    """
    returns the sanitized form of the given url

    The url first goes through basic_sanitizing() and is then handed to
    apply_sanitizing_rules() together with the podcast and episode flags.

    rules is an iterable of sanitizing rules. When it is None, the rules
    are fetched from the database (ordered by priority) on every call.
    A queryset used as the default argument would be created only once,
    when the module is imported, and would keep serving its cached
    results, so later changes to the rules would never be picked up.
    """
    if rules is None:
        rules = URLSanitizingRule.objects.all().order_by('priority')

    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules, podcast, episode)
    return url
def basic_sanitizing(url):
    """
    splits the url with urlparse, converts the netloc part to lowercase
    and returns the re-assembled url
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    return urlparse.urlunsplit((scheme, netloc.lower(), path, query, fragment))
def apply_sanitizing_rules(url, rules, podcast=True, episode=False):
    """
    applies all url sanitizing rules to the given url and returns the result

    setting podcast=True uses only those rules which have use_podcast set to True.
    When passing podcast=False this check is omitted. The same is valid
    for episode.

    Every rule needs the attributes search (a regular expression) and
    replace. If a rule carries a compiled pattern in search_precompiled,
    that pattern is used, otherwise search is passed to re.sub().
    Each rule that changed the url gets its hits attribute incremented.
    """
    if podcast:
        rules = [r for r in rules if r.use_podcast]
    if episode:
        rules = [r for r in rules if r.use_episode]

    for r in rules:
        orig = url

        # the attribute may be missing altogether on rule objects that
        # never had a compiled pattern attached, so don't access it directly
        precompiled = getattr(r, 'search_precompiled', None)
        if precompiled:
            url = precompiled.sub(r.replace, url)
        else:
            url = re.sub(r.search, r.replace, url)

        if orig != url:
            r.hits = getattr(r, 'hits', 0) + 1

    return url
def maintenance(dry_run=False):
    """
    Sanitizes the URLs of all podcasts and episodes in the database by
    applying both basic sanitizing and the rules from the database.

    For every object whose sanitized URL differs from the stored one:
     * an empty sanitized URL means the object is invalid, it is deleted
     * if no other object has the sanitized URL, the URL is updated
     * otherwise the object is merged into the one that already has the
       sanitized URL and is deleted afterwards

    With dry_run=True nothing is written to the database; the counters
    then show what would have been done.

    Progress, a summary per object type and the number of hits of each
    rule are printed to stdout.
    """
    print('Stats')
    print(' * %s podcasts' % Podcast.objects.count())
    print(' * %s episodes' % Episode.objects.count())
    print(' * %s rules' % URLSanitizingRule.objects.count())
    if dry_run:
        print(' * dry run - nothing will be written to the database')
    print('')

    print('precompiling regular expressions')
    rules = precompile_rules()

    p_count, p_stats = _sanitize_podcasts(rules, dry_run)
    _print_summary('podcasts', p_count, p_stats)
    _print_hits(rules)

    e_count, e_stats = _sanitize_episodes(rules, dry_run)
    _print_summary('episodes', e_count, e_stats)
    print('')
    # the podcast numbers are repeated at the end; they need their own
    # counter because the episode run has its own count
    _print_summary('podcasts', p_count, p_stats)
    print('')
    # hits are cumulative, this now includes those of the episode run
    _print_hits(rules)


def _sanitize_podcasts(rules, dry_run):
    """
    sanitizes the URLs of all podcasts; returns (processed podcasts, counters)
    """
    stats = {'unchanged': 0, 'merged': 0, 'updated': 0, 'deleted': 0, 'error': 0}
    count = 0

    podcasts = Podcast.objects.only('id', 'url').iterator()
    total = Podcast.objects.count()

    for p in podcasts:
        count += 1
        if (count % 1000) == 0:
            print('% 3.2f%% (podcast id %s)' % (((count + 0.0)/total*100), p.id))

        try:
            su = sanitize_url(p.url, rules=rules)
        except Exception as e:
            log('failed to sanitize url for podcast %s: %s' % (p.id, e))
            print('failed to sanitize url for podcast %s: %s' % (p.id, e))
            stats['error'] += 1
            continue

        # nothing to do
        if su == p.url:
            stats['unchanged'] += 1
            continue

        # invalid podcast, remove
        if su == '':
            try:
                if not dry_run:
                    delete_podcast(p)
                stats['deleted'] += 1

            except Exception as e:
                log('failed to delete podcast %s: %s' % (p.id, e))
                print('failed to delete podcast %s: %s' % (p.id, e))
                stats['error'] += 1

            continue

        try:
            su_podcast = Podcast.objects.get(url=su)

        except Podcast.DoesNotExist:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
                p.url = su
                p.save()

            stats['updated'] += 1
            continue

        # nothing to do
        if p == su_podcast:
            stats['unchanged'] += 1
            continue

        # last option - merge podcasts
        try:
            if not dry_run:
                rewrite_podcasts(p, su_podcast)
                # print the number of subscriptions that still point to
                # the old podcast after rewriting, if there are any
                remaining = Subscription.objects.filter(podcast=p).count()
                if remaining > 0:
                    print(remaining)
                p.delete()

            stats['merged'] += 1

        except Exception as e:
            log('error rewriting podcast %s: %s' % (p.id, e))
            print('error rewriting podcast %s: %s' % (p.id, e))
            stats['error'] += 1
            continue

    return count, stats


def _sanitize_episodes(rules, dry_run):
    """
    sanitizes the URLs of all episodes; returns (processed episodes, counters)
    """
    stats = {'unchanged': 0, 'merged': 0, 'updated': 0, 'deleted': 0, 'error': 0}
    count = 0

    total = Episode.objects.count()
    episodes = Episode.objects.only('id', 'url').iterator()

    for e in episodes:
        count += 1
        if (count % 10000) == 0:
            print('% 3.2f%% (episode id %s)' % (((count + 0.0)/total*100), e.id))

        try:
            su = sanitize_url(e.url, rules=rules, podcast=False, episode=True)
        except Exception as ex:
            log('failed to sanitize url for episode %s: %s' % (e.id, ex))
            print('failed to sanitize url for episode %s: %s' % (e.id, ex))
            # this is an episode error, it must not end up in the
            # podcast counters
            stats['error'] += 1
            continue

        # nothing to do
        if su == e.url:
            stats['unchanged'] += 1
            continue

        # invalid episode, remove
        if su == '':
            try:
                if not dry_run:
                    delete_episode(e)

                stats['deleted'] += 1
            except Exception as ex:
                log('failed to delete episode %s: %s' % (e.id, ex))
                print('failed to delete episode %s: %s' % (e.id, ex))
                stats['error'] += 1

            continue

        try:
            su_episode = Episode.objects.get(url=su, podcast=e.podcast)

        except Episode.DoesNotExist:
            # "target" episode does not exist, we simply change the url
            if not dry_run:
                log('updating episode %s - "%s" => "%s"' % (e.id, e.url, su))
                e.url = su
                e.save()

            stats['updated'] += 1
            continue

        # nothing to do
        if e == su_episode:
            stats['unchanged'] += 1
            continue

        # last option - merge episodes
        try:
            if not dry_run:
                rewrite_episode_actions(e, su_episode)
                rewrite_listeners(e, su_episode)
                rewrite_chapters(e, su_episode)
                rewrite_favorites(e, su_episode)
                e.delete()

            stats['merged'] += 1

        except Exception as ex:
            log('error rewriting episode %s: %s' % (e.id, ex))
            print('error rewriting episode %s: %s' % (e.id, ex))
            stats['error'] += 1
            continue

    return count, stats


def _print_summary(kind, count, stats):
    """
    prints the number of processed objects of the given kind and its counters
    """
    print('finished %s %s' % (count, kind))
    for key in ('unchanged', 'merged', 'updated', 'deleted', 'error'):
        print(' * %s %s' % (stats[key], key))


def _print_hits(rules):
    """
    prints for each rule how many URLs it has changed so far
    """
    print('Hits')
    for r in rules:
        print(' * %s => %s: %s' % (r.search, r.replace, getattr(r, 'hits', 0)))
def delete_podcast(p):
    """
    deletes the subscription actions and backend subscriptions that
    reference the podcast p, and then the podcast itself
    """
    for model in (SubscriptionAction, BackendSubscription):
        model.objects.filter(podcast=p).delete()

    p.delete()
def delete_episode(e):
    """
    deletes the episode actions and listeners that reference the
    episode e, and then the episode itself
    """
    for model in (EpisodeAction, Listener):
        model.objects.filter(episode=e).delete()

    e.delete()
def rewrite_podcasts(p_old, p_new):
    """
    merges the podcast p_old into the podcast p_new

    Toplist entries and suggestions of p_old and the historic data of both
    podcasts are deleted. Episodes, subscription meta-info, subscription
    actions, backend subscriptions and tags of p_old are moved to p_new.
    Subscription meta-info that already exists for p_new wins, the one of
    p_old is deleted. Actions, subscriptions and tags that can't be
    updated are deleted.

    p_old itself is not deleted here.
    """
    log('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url))

    # we simply delete incorrect toplist and suggestions entries,
    # because we can't re-calculate them
    for model in (ToplistEntry, SuggestionEntry, HistoricPodcastData):
        model.objects.filter(podcast=p_old).delete()
    HistoricPodcastData.objects.filter(podcast=p_new).delete()

    rewrite_episodes(p_old, p_new)

    for meta in SubscriptionMeta.objects.filter(podcast=p_old):
        try:
            existing = SubscriptionMeta.objects.get(user=meta.user, podcast=p_new)
            log('subscription meta %s (user %s, podcast %s) already exists, deleting %s (user %s, podcast %s)' % (existing.id, meta.user.id, p_new.id, meta.id, meta.user.id, p_old.id))
            # meta-info already exist for the correct podcast, delete the other one
            meta.delete()

        except SubscriptionMeta.DoesNotExist:
            # meta-info for new podcast does not yet exist, update the old one
            log('updating subscription meta %s (user %s, podcast %s => %s)' % (meta.id, meta.user, p_old.id, p_new.id))
            meta.podcast = p_new
            meta.save()

    for action in SubscriptionAction.objects.filter(podcast=p_old):
        try:
            log('updating subscription action %s (device %s, action %s, timestamp %s, podcast %s => %s)' % (action.id, action.device.id, action.action, action.timestamp, action.podcast.id, p_new.id))
            action.podcast = p_new
            action.save()
        except Exception as exc:
            # the action can't be moved, drop it
            log('error updating subscription action %s: %s, deleting' % (action.id, exc))
            action.delete()

    for subscription in BackendSubscription.objects.filter(podcast=p_old):
        try:
            log('updating subscription %s (device %s, user %s, since %s, podcast %s => %s)' % (subscription.id, subscription.device.id, subscription.user.id, subscription.subscribed_since, p_old.id, p_new.id))
            subscription.podcast = p_new
            subscription.save()
        except Exception as exc:
            # the subscription can't be moved, drop it
            log('error updating subscription %s: %s, deleting' % (subscription.id, exc))
            subscription.delete()

    for podcast_tag in PodcastTag.objects.filter(podcast=p_old):
        try:
            log('updating tag %s (tag %s, source %s, podcast %s => %s)' % (podcast_tag.id, podcast_tag.tag, podcast_tag.source, p_old.id, p_new.id))
            podcast_tag.podcast = p_new
            podcast_tag.save()
        except Exception as exc:
            # the tag can't be moved, drop it
            log('error updating tag %s: %s, deleting.' % (podcast_tag.id, exc))
            podcast_tag.delete()
def rewrite_episodes(p_old, p_new):
    """
    moves all episodes of the podcast p_old to the podcast p_new

    If p_new already has an episode with the same URL, the episode
    actions, listeners, chapters and favorites of the old episode are
    rewritten to that episode and the old episode is deleted. Otherwise
    the old episode is simply assigned to p_new.
    """
    for e in Episode.objects.filter(podcast=p_old):
        try:
            # this has to be a plain get(): get_or_create() never raises
            # DoesNotExist, so an empty episode would be created for p_new
            # and the existing one thrown away instead of being moved
            e_new = Episode.objects.get(podcast=p_new, url=e.url)

        except Episode.DoesNotExist:
            log('updating episode %s (url "%s", podcast %s => %s)' % (e.id, e.url, p_old.id, p_new.id))
            e.podcast = p_new
            e.save()
            continue

        log('episode %s (url %s, podcast %s) already exists; updating episode actions for episode %s (url %s, podcast %s)' % (e_new.id, e.url, p_new.id, e.id, e.url, p_old.id))
        rewrite_episode_actions(e, e_new)
        log('episode actions for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_listeners(e, e_new)
        log('listeners for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_chapters(e, e_new)
        log('chapters for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_favorites(e, e_new)
        log('favorites for episode %s (url "%s", podcast %s) updated, deleting.' % (e.id, e.url, p_old.id))
        e.delete()
def rewrite_episode_actions(e_old, e_new):
    """
    assigns all episode actions of the episode e_old to the episode e_new

    Actions that can't be updated are logged and deleted.
    """
    for ea in EpisodeAction.objects.filter(episode=e_old):
        try:
            log('updating episode action %s (user %s, timestamp %s, episode %s => %s)' % (ea.id, ea.user.id, ea.timestamp, e_old.id, e_new.id))
            # the attribute has to be spelled exactly like this; a
            # misspelled name would leave the action on e_old
            ea.episode = e_new
            ea.save()

        except Exception as e:
            log('error updating episode action %s: %s, deleting' % (ea.id, e))
            ea.delete()
def rewrite_listeners(e_old, e_new):
    """
    assigns all listeners of the episode e_old to the episode e_new
    and to the podcast of e_new

    Listeners that can't be updated are logged and deleted.
    """
    for listener in Listener.objects.filter(episode=e_old):
        try:
            log('updating listener %s (user %s, device %s, podcast %s, episode %s => %s)' % (listener.id, listener.user.id, listener.device.id, listener.podcast.id, e_old.id, e_new.id))
            listener.episode = e_new
            listener.podcast = e_new.podcast
            listener.save()

        except Exception as exc:
            # the listener can't be moved, drop it
            log('error updating listener %s: %s, deleting' % (listener.id, exc))
            listener.delete()
def rewrite_chapters(e_old, e_new):
    """
    assigns all chapters of the episode e_old to the episode e_new

    Chapters that can't be updated are logged and deleted.
    """
    for c in Chapter.objects.filter(episode=e_old):
        try:
            # the number of placeholders has to match the number of values;
            # a mismatch raises a TypeError, which would send every chapter
            # into the except branch and delete it
            log('updating chapter %s (device %s, episode %s => %s)' % (c.id, c.device.id, e_old.id, e_new.id))
            c.episode = e_new
            c.save()

        except Exception as e:
            log('error updating chapter %s: %s, deleting' % (c.id, e))
            c.delete()
def rewrite_favorites(e_old, e_new):
    """
    assigns all favorites of the episode e_old to the episode e_new

    Favorites that can't be updated are logged and deleted.
    """
    for favorite in EpisodeFavorite.objects.filter(episode=e_old):
        try:
            log('updating favorite %s (user %s, episode %s => %s)' % (favorite.id, favorite.user.id, e_old.id, e_new.id))
            favorite.episode = e_new
            favorite.save()

        except Exception as exc:
            # the favorite can't be moved, drop it
            log('error updating favorite %s: %s, deleting' % (favorite.id, exc))
            favorite.delete()
def precompile_rules(rules=None):
    """
    compiles the search expression of each rule (with re.UNICODE) and
    stores the compiled pattern in the rule's search_precompiled attribute

    rules is an iterable of sanitizing rules. When it is None, the rules
    are fetched from the database, ordered by priority, at call time.

    Returns the rules as a list, in the order in which they were given.
    """
    if rules is None:
        rules = URLSanitizingRule.objects.all().order_by('priority')

    rules_p = []
    for rule in rules:
        # search_precompiled is the attribute apply_sanitizing_rules() reads
        rule.search_precompiled = re.compile(rule.search, re.UNICODE)
        rules_p.append(rule)

    return rules_p