remove unnecessary imports
[mygpo.git] / mygpo / api / sanitizing.py
blob4cfdac78f1dc40cfd2125d63df6d7c3a0b9467c4
1 from mygpo.api.models import URLSanitizingRule, Podcast, ToplistEntry, SuggestionEntry, SubscriptionAction, SubscriptionMeta, Subscription, Episode, EpisodeAction
2 from mygpo.api.models.episodes import Chapter
3 from mygpo.api.models.users import EpisodeFavorite
4 from mygpo.data.models import BackendSubscription, Listener, HistoricPodcastData, PodcastTag
5 from mygpo.log import log
6 import urlparse
7 import re
def sanitize_url(url, podcast=True, episode=False, rules=None):
    """
    returns the sanitized version of the given url

    The url first goes through basic_sanitizing() and is then handed to
    apply_sanitizing_rules() together with the podcast / episode flags.

    rules is the sequence of sanitizing rules to apply. When it is None
    (the default) the rules are queried from the database, ordered by
    priority, for this call. The query is built per call on purpose: a
    QuerySet used as default argument would be created once at import
    time and shared (together with its cached results) by all calls.
    """
    if rules is None:
        rules = URLSanitizingRule.objects.all().order_by('priority')

    url = basic_sanitizing(url)
    return apply_sanitizing_rules(url, rules, podcast, episode)
def basic_sanitizing(url):
    """
    normalizes the url by splitting it with urlparse and joining the parts
    again, with the netloc part converted to lowercase
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    lowered = (scheme, netloc.lower(), path, query, fragment)
    return urlparse.urlunsplit(lowered)
def apply_sanitizing_rules(url, rules, podcast=True, episode=False):
    """
    applies all url sanitizing rules to the given url

    setting podcast=True uses only those rules which have use_podcast set
    to True. When passing podcast=False this check is omitted. The same is
    valid for episode.

    The rules are applied in the given order. If a rule carries a
    precompiled pattern in its search_precompiled attribute, that pattern
    is used; otherwise the rule's search string is passed to re.sub().

    Each rule that actually changes the url gets its hits attribute
    incremented (starting at 0 if it has none). The counter is only set on
    the rule object, nothing is saved here.

    Returns the url after all selected rules have been applied.
    """
    if podcast:
        rules = [r for r in rules if r.use_podcast]
    if episode:
        rules = [r for r in rules if r.use_episode]

    for r in rules:
        orig = url

        # rule objects are not required to have the attribute at all
        precompiled = getattr(r, 'search_precompiled', None)
        if precompiled:
            url = precompiled.sub(r.replace, url)
        else:
            url = re.sub(r.search, r.replace, url)

        if orig != url:
            r.hits = getattr(r, 'hits', 0) + 1

    return url
def _print_summary(label, count, unchanged, merged, updated, deleted, errors):
    """
    prints the counters of one maintenance pass; label names the kind of
    objects that were processed (e.g. 'podcasts')
    """
    print('finished %s %s' % (count, label))
    print(' * %s unchanged' % unchanged)
    print(' * %s merged' % merged)
    print(' * %s updated' % updated)
    print(' * %s deleted' % deleted)
    print(' * %s error' % errors)


def _print_hits(rules):
    """
    prints, for every rule, the number of urls it has changed so far
    """
    print('Hits')
    for r in rules:
        print(' * %s => %s: %s' % (r.search, r.replace, getattr(r, 'hits', 0)))


def maintenance(dry_run=False):
    """
    Sanitizes the urls of all podcasts and then of all episodes.

    Every url is passed through sanitize_url() with the precompiled rules
    from the database. Depending on the result the object is

     * left alone, if the url did not change
     * deleted, if the sanitized url is empty
     * updated, if no other object with the sanitized url exists
       (for episodes: no other episode of the same podcast)
     * merged into the object that already has the sanitized url

    Failures are logged, printed and counted; processing then continues
    with the next object. Progress, the counters and the hits per rule
    are printed to stdout.

    With dry_run=True none of the delete / update / merge steps is carried
    out; the counters then show what would have been done.
    """
    print('Stats')
    print(' * %s podcasts' % Podcast.objects.count())
    print(' * %s episodes' % Episode.objects.count())
    print(' * %s rules' % URLSanitizingRule.objects.count())
    if dry_run:
        print(' * dry run - nothing will be written to the database')
    print('')

    print('precompiling regular expressions')
    rules = precompile_rules()

    p_unchanged = p_merged = p_updated = p_deleted = p_error = 0
    e_unchanged = e_merged = e_updated = e_deleted = e_error = 0

    # kept separately from the episode counter so that the final summary
    # reports the right number of podcasts
    p_count = 0
    total = Podcast.objects.count()
    for p in Podcast.objects.only('id', 'url').iterator():
        p_count += 1
        if (p_count % 1000) == 0:
            print('% 3.2f%% (podcast id %s)' % (((p_count + 0.0)/total*100), p.id))

        try:
            su = sanitize_url(p.url, rules=rules)
        except Exception as ex:
            log('failed to sanitize url for podcast %s: %s' % (p.id, ex))
            print('failed to sanitize url for podcast %s: %s' % (p.id, ex))
            p_error += 1
            continue

        # nothing to do
        if su == p.url:
            p_unchanged += 1
            continue

        # invalid podcast, remove
        if su == '':
            try:
                if not dry_run:
                    delete_podcast(p)
                p_deleted += 1
            except Exception as ex:
                log('failed to delete podcast %s: %s' % (p.id, ex))
                print('failed to delete podcast %s: %s' % (p.id, ex))
                p_error += 1
            continue

        try:
            su_podcast = Podcast.objects.get(url=su)
        except Podcast.DoesNotExist:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
                p.url = su
                p.save()
            p_updated += 1
            continue

        # nothing to do
        if p == su_podcast:
            p_unchanged += 1
            continue

        # last option - merge podcasts
        try:
            if not dry_run:
                rewrite_podcasts(p, su_podcast)
                # report subscriptions that still point to the old podcast
                leftover = Subscription.objects.filter(podcast=p).count()
                if leftover > 0:
                    print(leftover)
                p.delete()
            p_merged += 1
        except Exception as ex:
            log('error rewriting podcast %s: %s' % (p.id, ex))
            print('error rewriting podcast %s: %s' % (p.id, ex))
            p_error += 1
            continue

    _print_summary('podcasts', p_count, p_unchanged, p_merged, p_updated, p_deleted, p_error)
    _print_hits(rules)

    e_count = 0
    total = Episode.objects.count()
    for e in Episode.objects.only('id', 'url').iterator():
        e_count += 1
        if (e_count % 10000) == 0:
            print('% 3.2f%% (episode id %s)' % (((e_count + 0.0)/total*100), e.id))

        try:
            su = sanitize_url(e.url, rules=rules, podcast=False, episode=True)
        except Exception as ex:
            log('failed to sanitize url for episode %s: %s' % (e.id, ex))
            print('failed to sanitize url for episode %s: %s' % (e.id, ex))
            # this is an episode failure, so it belongs to the episode counter
            e_error += 1
            continue

        # nothing to do
        if su == e.url:
            e_unchanged += 1
            continue

        # invalid episode, remove
        if su == '':
            try:
                if not dry_run:
                    delete_episode(e)
                e_deleted += 1
            except Exception as ex:
                log('failed to delete episode %s: %s' % (e.id, ex))
                print('failed to delete episode %s: %s' % (e.id, ex))
                e_error += 1
            continue

        try:
            su_episode = Episode.objects.get(url=su, podcast=e.podcast)
        except Episode.DoesNotExist:
            # "target" episode does not exist, we simply change the url
            if not dry_run:
                log('updating episode %s - "%s" => "%s"' % (e.id, e.url, su))
                e.url = su
                e.save()
            e_updated += 1
            continue

        # nothing to do
        if e == su_episode:
            e_unchanged += 1
            continue

        # last option - merge episodes
        try:
            if not dry_run:
                rewrite_episode_actions(e, su_episode)
                rewrite_listeners(e, su_episode)
                rewrite_chapters(e, su_episode)
                rewrite_favorites(e, su_episode)
                e.delete()
            e_merged += 1
        except Exception as ex:
            log('error rewriting episode %s: %s' % (e.id, ex))
            print('error rewriting episode %s: %s' % (e.id, ex))
            e_error += 1
            continue

    _print_summary('episodes', e_count, e_unchanged, e_merged, e_updated, e_deleted, e_error)
    print('')
    _print_summary('podcasts', p_count, p_unchanged, p_merged, p_updated, p_deleted, p_error)
    print('')
    _print_hits(rules)
def delete_podcast(p):
    """
    deletes the subscription actions and backend subscriptions of the
    podcast p, and then the podcast itself
    """
    for model in (SubscriptionAction, BackendSubscription):
        model.objects.filter(podcast=p).delete()

    p.delete()
def delete_episode(e):
    """
    deletes the episode actions and listeners of the episode e, and then
    the episode itself
    """
    for model in (EpisodeAction, Listener):
        model.objects.filter(episode=e).delete()

    e.delete()
def rewrite_podcasts(p_old, p_new):
    """
    moves everything that refers to the podcast p_old over to p_new

    Toplist, suggestion and historic entries of p_old (and the historic
    entries of p_new) are deleted. Episodes are handled by
    rewrite_episodes(). Subscription meta-info is moved unless the user
    already has meta-info for p_new, in which case the old one is deleted.
    Subscription actions, backend subscriptions and tags are moved; an
    object that can not be moved is deleted instead.

    p_old itself is not deleted here.
    """
    log('merging podcast %s "%s" to correct podcast %s "%s"' % (
        p_old.id, p_old.url, p_new.id, p_new.url))

    # we simply delete incorrect toplist and suggestions entries,
    # because we can't re-calculate them
    for model in (ToplistEntry, SuggestionEntry, HistoricPodcastData):
        model.objects.filter(podcast=p_old).delete()
    HistoricPodcastData.objects.filter(podcast=p_new).delete()

    rewrite_episodes(p_old, p_new)

    for meta in SubscriptionMeta.objects.filter(podcast=p_old):
        try:
            existing = SubscriptionMeta.objects.get(user=meta.user, podcast=p_new)
            log('subscription meta %s (user %s, podcast %s) already exists, deleting %s (user %s, podcast %s)' % (
                existing.id, meta.user.id, p_new.id, meta.id, meta.user.id, p_old.id))
            # the correct podcast already has meta-info, drop the other one
            meta.delete()

        except SubscriptionMeta.DoesNotExist:
            # no meta-info for the new podcast yet, re-use the old one
            log('updating subscription meta %s (user %s, podcast %s => %s)' % (
                meta.id, meta.user, p_old.id, p_new.id))
            meta.podcast = p_new
            meta.save()

    for action in SubscriptionAction.objects.filter(podcast=p_old):
        try:
            log('updating subscription action %s (device %s, action %s, timestamp %s, podcast %s => %s)' % (
                action.id, action.device.id, action.action, action.timestamp, action.podcast.id, p_new.id))
            action.podcast = p_new
            action.save()
        except Exception as err:
            log('error updating subscription action %s: %s, deleting' % (action.id, err))
            action.delete()

    for subscription in BackendSubscription.objects.filter(podcast=p_old):
        try:
            log('updating subscription %s (device %s, user %s, since %s, podcast %s => %s)' % (
                subscription.id, subscription.device.id, subscription.user.id,
                subscription.subscribed_since, p_old.id, p_new.id))
            subscription.podcast = p_new
            subscription.save()
        except Exception as err:
            log('error updating subscription %s: %s, deleting' % (subscription.id, err))
            subscription.delete()

    for podcast_tag in PodcastTag.objects.filter(podcast=p_old):
        try:
            log('updating tag %s (tag %s, source %s, podcast %s => %s)' % (
                podcast_tag.id, podcast_tag.tag, podcast_tag.source, p_old.id, p_new.id))
            podcast_tag.podcast = p_new
            podcast_tag.save()
        except Exception as err:
            log('error updating tag %s: %s, deleting.' % (podcast_tag.id, err))
            podcast_tag.delete()
def rewrite_episodes(p_old, p_new):
    """
    moves the episodes of the podcast p_old over to p_new

    If p_new already has an episode with the same url, the episode
    actions, listeners, chapters and favorites of the old episode are
    rewritten to that episode and the old episode is deleted. Otherwise
    the old episode itself is assigned to p_new.
    """
    for e in Episode.objects.filter(podcast=p_old):
        try:
            # a plain get() is needed here: get_or_create() never raises
            # DoesNotExist, so episodes would never be moved but always be
            # replaced by a newly created, empty one
            e_new = Episode.objects.get(podcast=p_new, url=e.url)

        except Episode.DoesNotExist:
            log('updating episode %s (url "%s", podcast %s => %s)' % (e.id, e.url, p_old.id, p_new.id))
            e.podcast = p_new
            e.save()
            continue

        log('episode %s (url %s, podcast %s) already exists; updating episode actions for episode %s (url %s, podcast %s)' % (e_new.id, e.url, p_new.id, e.id, e.url, p_old.id))
        rewrite_episode_actions(e, e_new)
        log('episode actions for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_listeners(e, e_new)
        log('listeners for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_chapters(e, e_new)
        log('chapters for episode %s (url "%s", podcast %s) updated.' % (e.id, e.url, p_old.id))
        rewrite_favorites(e, e_new)
        log('favorites for episode %s (url "%s", podcast %s) updated, deleting.' % (e.id, e.url, p_old.id))
        e.delete()
def rewrite_episode_actions(e_old, e_new):
    """
    assigns all episode actions of the episode e_old to e_new

    An action that can not be updated is logged and deleted instead.
    """
    for ea in EpisodeAction.objects.filter(episode=e_old):
        try:
            log('updating episode action %s (user %s, timestamp %s, episode %s => %s)' % (ea.id, ea.user.id, ea.timestamp, e_old.id, e_new.id))
            # the attribute filtered on above is "episode"; a misspelled
            # name here would save the action unchanged
            ea.episode = e_new
            ea.save()

        except Exception as e:
            log('error updating episode action %s: %s, deleting' % (ea.id, e))
            ea.delete()
def rewrite_listeners(e_old, e_new):
    """
    assigns all listeners of the episode e_old to e_new and to the podcast
    of e_new

    A listener that can not be updated is logged and deleted instead.
    """
    for listener in Listener.objects.filter(episode=e_old):
        try:
            details = (listener.id, listener.user.id, listener.device.id,
                       listener.podcast.id, e_old.id, e_new.id)
            log('updating listener %s (user %s, device %s, podcast %s, episode %s => %s)' % details)
            listener.episode = e_new
            listener.podcast = e_new.podcast
            listener.save()

        except Exception as err:
            log('error updating listener %s: %s, deleting' % (listener.id, err))
            listener.delete()
def rewrite_chapters(e_old, e_new):
    """
    assigns all chapters of the episode e_old to e_new

    A chapter that can not be updated is logged and deleted instead.
    """
    for c in Chapter.objects.filter(episode=e_old):
        try:
            # the number of placeholders has to match the number of values:
            # a mismatch raises a TypeError, which ends up in the handler
            # below and deletes the chapter instead of moving it
            log('updating chapter %s (device %s, episode %s => %s)' % (c.id, c.device.id, e_old.id, e_new.id))
            c.episode = e_new
            c.save()

        except Exception as e:
            log('error updating chapter %s: %s, deleting' % (c.id, e))
            c.delete()
def rewrite_favorites(e_old, e_new):
    """
    assigns all favorites of the episode e_old to e_new

    A favorite that can not be updated is logged and deleted instead.
    """
    for favorite in EpisodeFavorite.objects.filter(episode=e_old):
        try:
            details = (favorite.id, favorite.user.id, e_old.id, e_new.id)
            log('updating favorite %s (user %s, episode %s => %s)' % details)
            favorite.episode = e_new
            favorite.save()

        except Exception as err:
            log('error updating favorite %s: %s, deleting' % (favorite.id, err))
            favorite.delete()
def precompile_rules(rules=None):
    """
    compiles the search pattern of every rule (with re.UNICODE) and stores
    the result in the rule's search_precompiled attribute, which is the
    attribute apply_sanitizing_rules() looks for

    rules is the sequence of rules to precompile. When it is None (the
    default) the rules are queried from the database, ordered by priority,
    for this call.

    Returns the rules as a list, in their original order.
    """
    if rules is None:
        rules = URLSanitizingRule.objects.all().order_by('priority')

    rules_p = []
    for rule in rules:
        rule.search_precompiled = re.compile(rule.search, re.UNICODE)
        rules_p.append(rule)

    return rules_p