# mygpo/api/sanitizing.py -- URL sanitizing rules for the feed-downloader
# (from mygpo.git, blob d514f3a7bd0bc7f60bf41919280971f53e059e8e)
import collections
import re
import urlparse

from django.core.cache import cache

from mygpo.core import models
from mygpo.log import log
from mygpo.utils import iterate_together, progress
from mygpo.db.couchdb.podcast import (podcast_count, podcast_for_oldid,
    podcast_for_url, all_podcasts)
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type
def sanitize_urls(urls, obj_type='podcast'):
    """ Sanitize every URL in *urls*.

    Returns a list with the sanitized version of each URL,
    in the same order as the input.
    """
    return [sanitize_url(u, obj_type) for u in urls]
def sanitize_url(url, obj_type='podcast'):
    """ Sanitize a single URL.

    First applies the basic normalization, then all sanitizing rules
    configured in the database for the given *obj_type*.
    """
    rules = sanitizingrules_by_obj_type(obj_type)
    return apply_sanitizing_rules(basic_sanitizing(url), rules)
31 def basic_sanitizing(url):
32 """
33 does basic sanitizing through urlparse and additionally converts the netloc to lowercase
34 """
35 r = urlparse.urlsplit(url)
36 netloc = r.netloc.lower()
37 r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
38 return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """
    Applies all given sanitizing rules to the given URL.

    Each rule provides a regular expression -- either precompiled as
    ``search_re`` or as a pattern string ``search`` -- and a
    replacement string ``replace``.  Whenever a rule changes the URL,
    its ``hits`` counter is incremented (created on the first hit).
    Returns the resulting URL.
    """
    for rule in rules:
        orig = url

        # prefer the precompiled regex when it is available
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        # count how often each rule actually modified a URL
        if orig != url:
            rule.hits = getattr(rule, 'hits', 0) + 1

    return url
66 def maintenance(dry_run=False):
67 """
68 This currently checks how many podcasts could be removed by
69 applying both basic sanitizing rules and those from the database.
71 This will later be used to replace podcasts!
72 """
74 podcast_rules = sanitizingrules_by_obj_type('podcast')
75 episode_rules = sanitizingrules_by_obj_type('episode')
77 num_podcasts = podcast_count()
79 print 'Stats'
80 print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
81 if dry_run:
82 print ' * dry run - nothing will be written to the database'
83 print
85 print 'precompiling regular expressions'
87 podcast_rules = list(precompile_rules(podcast_rules))
88 episode_rules = list(precompile_rules(episode_rules))
90 p_stats = collections.defaultdict(int)
91 e_stats = collections.defaultdict(int)
93 podcasts = all_podcasts()
95 for n, p in enumerate(podcasts):
96 su = sanitize_url(p.url, rules=podcast_rules)
98 # nothing to do
99 if su == p.url:
100 p_stats['unchanged'] += 1
101 continue
103 # invalid podcast, remove
104 if su == '':
105 if not dry_run:
106 p.delete()
107 p_stats['deleted'] += 1
109 su_podcast = podcast_for_url(url=su)
111 if not su_podcast:
112 # "target" podcast does not exist, we simply change the url
113 if not dry_run:
114 log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
115 p.url = su
116 p.save()
118 p_stats['updated'] += 1
119 continue
121 # nothing to do
122 if p == su_podcast:
123 p_stats['unchanged'] += 1
124 continue
126 # last option - merge podcasts
127 if not dry_run:
128 rewrite_podcasts(p, su_podcast)
129 p.delete()
131 p_stats['merged'] += 1
133 progress(n+1, num_podcasts, str(p.id))
135 print 'finished %s podcasts' % (n+1)
136 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
137 print 'Hits'
138 for _, r in podcast_rules:
139 print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merge the data of podcast p_old into the correct podcast p_new """

    log('merging podcast %s "%s" to correct podcast %s "%s"' %
        (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merge the subscriber data of p_old into p_new and delete p_old.

    Does nothing when either podcast cannot be resolved via its old id.
    """
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # bugfix: the original summed n.subscriber_count with itself;
        # the merged count is the sum of both podcasts' subscribers
        subscribers.append(
            models.SubscriberData(
                timestamp=o.timestamp,
                subscriber_count=n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
def precompile_rules(rules):
    """ Yield every rule with its search pattern precompiled.

    The compiled pattern is stored on the rule as ``search_re``
    (compiled with re.UNICODE) so apply_sanitizing_rules() can use
    it instead of recompiling the pattern for every URL.
    """
    for r in rules:
        r.search_re = re.compile(r.search, re.UNICODE)
        yield r