move sanitizing rule queries into db module
mygpo.git: mygpo/api/sanitizing.py
blob 2bdcfef85a0270569a725242d72f1942e66f0d98
import collections
import urlparse
import re

from django.core.cache import cache

from mygpo.core import models
from mygpo.log import log
from mygpo.utils import iterate_together, progress
from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
    all_podcasts, podcast_for_url
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type


def sanitize_urls(urls, obj_type='podcast'):
    """ Apply sanitizing rules to the given URLs and return the results """

    rules = sanitizingrules_by_obj_type(obj_type)

    return (sanitize_url(url, rules=rules) for url in urls)
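
# Illustrative note (not from the original source): sanitize_urls() returns a
# lazy generator, so the sanitizing rules are looked up once and each URL is
# only processed when the caller iterates, e.g.
#
#     cleaned = list(sanitize_urls(['http://A.example/feed', 'http://B.example/rss']))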


def sanitize_url(url, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URL and return the results """

    # rules can be passed in directly (e.g. precompiled) to avoid repeated
    # database lookups; otherwise they are fetched for the given object type
    rules = rules or sanitizingrules_by_obj_type(obj_type)

    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    return url
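
# Example (illustrative): a call such as
#
#     sanitize_url('http://Example.COM/podcast.xml#latest')
#
# first normalizes the URL via basic_sanitizing() (lowercased host, fragment
# dropped) and then applies the 'podcast' rules configured in the database.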


def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the netloc to lowercase
    """
    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    return r2.geturl()
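
# Example (illustrative values):
#
#     basic_sanitizing('http://Example.COM/Feed?x=1#top')
#     # => 'http://example.com/Feed?x=1'
#
# scheme, path and query are kept as-is, the hostname is lowercased and the
# fragment is dropped because the SplitResult is rebuilt with an empty fragment.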


def apply_sanitizing_rules(url, rules):
    """
    applies all given sanitizing rules to the given url and counts, per rule,
    how often it actually changed the url
    """

    for rule in rules:

        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c+1

    return url
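
# Illustrative sketch (FakeRule is an assumed stand-in for the rule objects
# returned by sanitizingrules_by_obj_type(), which carry at least `slug`,
# `search` and `replace` attributes):
#
#     class FakeRule(object):
#         def __init__(self, slug, search, replace):
#             self.slug, self.search, self.replace = slug, search, replace
#
#     rule = FakeRule('strip-format-param', r'\?format=xml$', '')
#     apply_sanitizing_rules('http://feeds.example.org/cast?format=xml', [rule])
#     # => 'http://feeds.example.org/cast', and rule.hits == 1 afterwards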


def maintenance(dry_run=False):
    """
    This currently checks how many podcasts could be removed by
    applying both basic sanitizing rules and those from the database.

    This will later be used to replace podcasts!
    """

    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')

    num_podcasts = podcast_count()

    print 'Stats'
    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
    if dry_run:
        print ' * dry run - nothing will be written to the database'
    print

    print 'precompiling regular expressions'

    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))

    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)

    podcasts = all_podcasts()

    for n, p in enumerate(podcasts):
        su = sanitize_url(p.url, rules=podcast_rules)

        # nothing to do
        if su == p.url:
            p_stats['unchanged'] += 1
            continue

        # invalid podcast, remove
        if su == '':
            if not dry_run:
                p.delete()
            p_stats['deleted'] += 1
            continue

        su_podcast = podcast_for_url(url=su)

        if not su_podcast:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
                p.url = su
                p.save()

            p_stats['updated'] += 1
            continue

        # nothing to do
        if p == su_podcast:
            p_stats['unchanged'] += 1
            continue

        # last option - merge podcasts
        if not dry_run:
            rewrite_podcasts(p, su_podcast)
            p.delete()

        p_stats['merged'] += 1

        progress(n+1, num_podcasts, str(p.id))

    print 'finished %s podcasts' % (n+1)
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
    print 'Hits'
    for r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0))
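
# Usage sketch (assumed invocation from a Django shell; not part of the
# original file):
#
#     from mygpo.api.sanitizing import maintenance
#     maintenance(dry_run=True)   # only report what would be changed
#     maintenance()               # actually update / merge / delete podcasts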


def rewrite_podcasts(p_old, p_new):

    log('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)


def rewrite_newpodcast(p_old, p_new):
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has many more subscribers;
        # taking only the count of the old podcast would look like a drop
        if None in (n, o):
            continue

        subscribers.append(
            models.SubscriberData(
                timestamp = o.timestamp,
                subscriber_count = n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
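
# Example (illustrative numbers): if, for the same timestamp, the new podcast
# has a SubscriberData entry with subscriber_count=100 and the old podcast has
# one with subscriber_count=5, the merged list contains a single entry with
# subscriber_count=105, so the merge does not appear as a drop in subscribers.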


def precompile_rules(rules):
    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule
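
# Usage sketch (assumes rule objects with a `search` attribute, as returned by
# sanitizingrules_by_obj_type()):
#
#     rules = list(precompile_rules(sanitizingrules_by_obj_type('podcast')))
#     sanitize_url('http://example.com/feed', rules=rules)
#
# This mirrors what maintenance() does: each rule's regex is compiled once and
# reused for every podcast URL, so apply_sanitizing_rules() takes the
# `search_re` branch instead of re-parsing the pattern with re.sub().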