add missing None-check
[mygpo.git] / mygpo / api / sanitizing.py
blobae96cb44e71106274c5e03e3245a3068d8c5327a
1 import collections
3 from django.core.cache import cache
5 from mygpo.core import models
6 from mygpo.api.models import Podcast, Episode
7 from mygpo.log import log
8 from mygpo.utils import iterate_together, progress
9 import urlparse
10 import re
def sanitize_urls(urls, obj_type='podcast', rules=None):
    """ Lazily sanitize each URL in urls with the rules for obj_type """
    active_rules = get_sanitizing_rules(obj_type, rules)
    return (sanitize_url(u, rules=active_rules) for u in urls)
def sanitize_url(url, obj_type='podcast', rules=None):
    """ Sanitize a single URL according to the rules for obj_type """
    active_rules = get_sanitizing_rules(obj_type, rules=rules)
    # normalize first, then run the (possibly precompiled) rewrite rules
    return apply_sanitizing_rules(basic_sanitizing(url), active_rules)
def get_sanitizing_rules(obj_type, rules=None):
    """ Returns the sanitizing-rules from the cache or the database

    If rules are passed in explicitly they are returned unchanged and
    the cache is left alone.  The previous version re-wrote the cache
    entry on *every* call, so caller-supplied rules (e.g. the
    precompiled ones used by maintenance()) ended up cached, and even
    plain cache hits were redundantly written back. """

    if rules:
        return rules

    cache_name = '%s-sanitizing-rules' % obj_type

    sanitizing_rules = cache.get(cache_name)
    if sanitizing_rules is None:
        sanitizing_rules = list(models.SanitizingRule.for_obj_type(obj_type))
        # cache for one hour; only rules freshly loaded from the db are cached
        cache.set(cache_name, sanitizing_rules, 60 * 60)

    return sanitizing_rules
def basic_sanitizing(url):
    """
    Basic sanitizing through urlparse: splits and re-joins the URL,
    lower-casing the network-location (host) component on the way.
    """
    parts = urlparse.urlsplit(url)
    rebuilt = urlparse.SplitResult(parts.scheme, parts.netloc.lower(),
            parts.path, parts.query, parts.fragment)
    return rebuilt.geturl()
def apply_sanitizing_rules(url, rules):
    """
    Applies all sanitizing rules to the given URL and returns the result.

    A rule with a precompiled pattern in its ``search_re`` attribute is
    applied via that pattern; otherwise its ``search`` string is used
    with re.sub.  Every rule that actually changes the URL gets its
    ``hits`` counter incremented.
    """
    for rule in rules:

        pattern = getattr(rule, 'search_re', None)
        if pattern is not None:
            new_url = pattern.sub(rule.replace, url)
        else:
            new_url = re.sub(rule.search, rule.replace, url)

        # track how often this rule actually matched
        if new_url != url:
            rule.hits = getattr(rule, 'hits', 0) + 1

        url = new_url

    return url
80 def maintenance(dry_run=False):
81 """
82 This currently checks how many podcasts could be removed by
83 applying both basic sanitizing rules and those from the database.
85 This will later be used to replace podcasts!
86 """
88 podcast_rules = get_sanitizing_rules('podcast')
89 episode_rules = get_sanitizing_rules('episode')
91 num_podcasts = Podcast.objects.count()
92 num_episodes = Episode.objects.count()
94 print 'Stats'
95 print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
96 print ' * %d episodes - %d rules' % (num_episodes, len(episode_rules))
97 if dry_run:
98 print ' * dry run - nothing will be written to the database'
99 print
101 print 'precompiling regular expressions'
103 podcast_rules = list(precompile_rules(podcast_rules))
104 episode_rules = list(precompile_rules(episode_rules))
106 p_stats = collections.defaultdict(int)
107 e_stats = collections.defaultdict(int)
109 podcasts = Podcast.objects.only('id', 'url').order_by('id').iterator()
111 for n, p in enumerate(podcasts):
112 try:
113 su = sanitize_url(p.url, rules=podcast_rules)
114 except Exception, e:
115 log('failed to sanitize url for podcast %s: %s' % (p.id, e))
116 print 'failed to sanitize url for podcast %s: %s' % (p.id, e)
117 p_stats['error'] += 1
118 continue
120 # nothing to do
121 if su == p.url:
122 p_stats['unchanged'] += 1
123 continue
125 # invalid podcast, remove
126 if su == '':
127 try:
128 if not dry_run:
129 p.delete()
130 p_stats['deleted'] += 1
132 except Exception, e:
133 log('failed to delete podcast %s: %s' % (p.id, e))
134 print 'failed to delete podcast %s: %s' % (p.id, e)
135 p_stats['error'] += 1
137 continue
139 try:
140 su_podcast = Podcast.objects.get(url=su)
142 except Podcast.DoesNotExist, e:
143 # "target" podcast does not exist, we simply change the url
144 if not dry_run:
145 log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
146 p.url = su
147 p.save()
149 p_stats['updated'] += 1
150 continue
152 # nothing to do
153 if p == su_podcast:
154 p_stats['unchanged'] += 1
155 continue
157 # last option - merge podcasts
158 try:
159 if not dry_run:
160 rewrite_podcasts(p, su_podcast)
161 p.delete()
163 p_stats['merged'] += 1
165 except Exception, e:
166 log('error rewriting podcast %s: %s' % (p.id, e))
167 print 'error rewriting podcast %s: %s' % (p.id, e)
168 p_stats['error'] += 1
169 continue
171 progress(n+1, num_podcasts, str(p.id))
173 print 'finished %s podcasts' % (n+1)
174 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
175 print 'Hits'
176 for _, r in podcast_rules:
177 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
179 episodes = Episode.objects.only('id', 'url').order_by('id').iterator()
180 for e in episodes:
181 try:
182 su = sanitize_url(e.url, rules=episode_rules)
183 except Exception, ex:
184 log('failed to sanitize url for episode %s: %s' % (e.id, ex))
185 print 'failed to sanitize url for episode %s: %s' % (e.id, ex)
186 e_stats['error'] += 1
187 continue
189 # nothing to do
190 if su == e.url:
191 e_stats['unchanged'] += 1
192 continue
194 # invalid episode, remove
195 if su == '':
196 try:
197 if not dry_run:
198 e.delete()
200 e_stats['deleted'] += 1
201 except Exception, ex:
202 log('failed to delete episode %s: %s' % (e.id, ex))
203 print 'failed to delete episode %s: %s' % (e.id, ex)
204 e_stats['error'] += 1
206 continue
208 try:
209 su_episode = Episode.objects.get(url=su, podcast=e.podcast)
211 except Episode.DoesNotExist, ex:
212 # "target" episode does not exist, we simply change the url
213 if not dry_run:
214 log('updating episode %s - "%s" => "%s"' % (e.id, e.url, su))
215 e.url = su
216 e.save()
218 e_stats['updated'] += 1
219 continue
221 # nothing to do
222 if e == su_episode:
223 e_stats['unchanged'] += 1
224 continue
227 # last option - merge episodes
228 try:
229 if not dry_run:
230 e.delete()
232 e_stats['merged'] += 1
234 except Exception, ex:
235 log('error rewriting episode %s: %s' % (e.id, ex))
236 print 'error rewriting episode %s: %s' % (e.id, ex)
237 e_stats['error'] += 1
238 continue
240 progress(n+1, num_episodes, str(e.id))
242 print 'finished %s episodes' % num_episodes
243 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % e_stats
244 print
245 print 'finished %s podcasts' % num_podcasts
246 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
247 print
248 print 'Hits'
249 for _, r in episode_rules:
250 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merges podcast p_old into p_new, logging the operation """

    msg = 'merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url)
    log(msg)

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merges the CouchDB podcast of p_old into that of p_new

    Combines the subscriber data of both podcasts, saves the new
    podcast and deletes the old one.  Does nothing when either podcast
    has no CouchDB counterpart.
    """
    p_n = models.Podcast.for_oldid(p_new.id)
    p_o = models.Podcast.for_oldid(p_old.id)

    # fix: was the undefined name "Nont", which raised a NameError
    # instead of performing the intended None-check
    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together(p_n.subscribers, p_o.subscribers):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # NOTE(review): this doubles the new podcast's count and never
        # uses o.subscriber_count; also "if n" is always true after the
        # None-check above -- looks suspicious, confirm intended math
        subscribers.append(
            models.SubscriberData(
                timestamp = o.timestamp,
                subscriber_count = n.subscriber_count + \
                    n.subscriber_count if n else 0
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
def precompile_rules(rules):
    """ Yields each rule with its pattern compiled onto rule.search_re """
    for r in rules:
        # UNICODE flag matches the behavior expected by apply_sanitizing_rules
        r.search_re = re.compile(r.search, re.UNICODE)
        yield r