# mygpo/api/sanitizing.py
import collections
import urlparse
import re

from django.core.cache import cache

from mygpo.core import models
from mygpo.utils import iterate_together, progress
# podcast_for_url is used by maintenance() below
from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
    podcast_for_url, all_podcasts
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type

import logging
logger = logging.getLogger(__name__)

def sanitize_urls(urls, obj_type='podcast'):
    """ Apply sanitizing rules to the given URLs and return the results """

    return [sanitize_url(url, obj_type) for url in urls]

def sanitize_url(url, obj_type='podcast'):
    """ Apply sanitizing rules to the given URL and return the result """

    rules = sanitizingrules_by_obj_type(obj_type)
    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    return url
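
# Illustrative use; the output assumes that no database rule rewrites this
# URL any further, so only basic_sanitizing() takes effect:
#
#   >>> sanitize_url('HTTP://Example.COM/feed.xml#latest')
#   'http://example.com/feed.xml'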

def basic_sanitizing(url):
    """
    Does basic sanitizing through urlparse: converts the netloc to
    lowercase and drops the fragment
    """
    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    return r2.geturl()
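
# For example (no database rules involved here):
#
#   >>> basic_sanitizing('HTTP://Example.COM/Feed?x=1#top')
#   'http://example.com/Feed?x=1'
#
# urlsplit() already lowercases the scheme; path and query are kept as-is.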

def apply_sanitizing_rules(url, rules):
    """
    Applies all given sanitizing rules to the given URL and returns the
    (possibly rewritten) result.

    As a side effect, each rule that changed the URL gets its hit counter
    incremented.
    """

    for rule in rules:

        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c + 1

    return url
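
# A minimal sketch of the rule interface consumed above (hypothetical --
# real rules come from sanitizingrules_by_obj_type()):
#
#   class FakeRule(object):
#       slug = 'strip-whitespace'
#       search = r'\s'      # pattern to remove
#       replace = ''
#
#   rule = FakeRule()
#   apply_sanitizing_rules('http://example.com/my feed.xml', [rule])
#   # -> 'http://example.com/myfeed.xml'; rule.hits is now 1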

def maintenance(dry_run=False):
    """
    Checks how many podcasts can be updated, merged or removed by applying
    both basic sanitizing and the rules from the database.

    Unless dry_run is set, the changes are written back to the database.
    """

    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')

    num_podcasts = podcast_count()

    print 'Stats'
    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
    if dry_run:
        print ' * dry run - nothing will be written to the database'
    print

    print 'precompiling regular expressions'

    # episode rules are prepared here, but episodes are not processed yet
    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))

    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)

    podcasts = all_podcasts()

    for n, p in enumerate(podcasts):
        # use the precompiled rules directly; sanitize_url() would fetch
        # (and recompile) them for every single URL
        su = apply_sanitizing_rules(basic_sanitizing(p.url), podcast_rules)

        # nothing to do
        if su == p.url:
            p_stats['unchanged'] += 1
            continue

        # invalid podcast, remove
        if su == '':
            if not dry_run:
                p.delete()
            p_stats['deleted'] += 1
            continue

        su_podcast = podcast_for_url(url=su)

        if not su_podcast:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                logger.info('updating podcast %s - "%s" => "%s"' %
                            (p.id, p.url, su))
                p.url = su
                p.save()

            p_stats['updated'] += 1
            continue

        # nothing to do
        if p == su_podcast:
            p_stats['unchanged'] += 1
            continue

        # last option - merge podcasts
        if not dry_run:
            rewrite_podcasts(p, su_podcast)
            p.delete()

        p_stats['merged'] += 1

        progress(n+1, num_podcasts, str(p.id))

    print 'finished %d podcasts' % (n+1)
    # p_stats is a defaultdict, so missing keys (e.g. 'error') print as 0
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, ' \
          '%(deleted)d deleted, %(error)d error' % p_stats
    print 'Hits'
    for r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0))
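
# Intended to be run from a Django management shell, e.g. (illustrative):
#
#   >>> from mygpo.api.sanitizing import maintenance
#   >>> maintenance(dry_run=True)    # report only, nothing is written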

def rewrite_podcasts(p_old, p_new):

    logger.info('merging podcast %s "%s" to correct podcast %s "%s"' %
                (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)

def rewrite_newpodcast(p_old, p_new):
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has many more subscribers;
        # taking only the count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # sum both counts so the merge does not show up as a drop
        subscribers.append(
            models.SubscriberData(
                timestamp=o.timestamp,
                subscriber_count=n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()

def precompile_rules(rules):
    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule
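
# apply_sanitizing_rules() prefers the precompiled search_re, so batch jobs
# like maintenance() compile each pattern once instead of once per URL.
# With the hypothetical FakeRule sketched above:
#
#   rules = list(precompile_rules([FakeRule()]))
#   rules[0].search_re.pattern    # the compiled pattern, r'\s'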