# mygpo/api/sanitizing.py
import collections
import urlparse
import re

from django.core.cache import cache

from mygpo.core import models
from mygpo.utils import iterate_together, progress
# podcast_for_url is used by maintenance() below
from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
    podcast_for_url, all_podcasts
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type

import logging
logger = logging.getLogger(__name__)

def sanitize_urls(urls, obj_type='podcast'):
    """ Apply sanitizing rules to the given URLs and return the results """

    return [sanitize_url(url, obj_type) for url in urls]

def sanitize_url(url, obj_type='podcast'):
    """ Apply sanitizing rules to the given URL and return the result """

    rules = sanitizingrules_by_obj_type(obj_type)
    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    return url
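
# Illustrative use; the output assumes that no database rule rewrites this
# URL any further, so only basic_sanitizing() takes effect:
#
#   >>> sanitize_url('HTTP://Example.COM/feed.xml#latest')
#   'http://example.com/feed.xml'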

def basic_sanitizing(url):
    """
    Does basic sanitizing through urlparse: converts the netloc to
    lowercase and drops the fragment
    """
    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    return r2.geturl()
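
# For example (no database rules involved here):
#
#   >>> basic_sanitizing('HTTP://Example.COM/Feed?x=1#top')
#   'http://example.com/Feed?x=1'
#
# urlsplit() already lowercases the scheme; path and query are kept as-is.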

def apply_sanitizing_rules(url, rules):
    """
    Applies all given sanitizing rules to the given URL and returns the
    (possibly rewritten) result.

    As a side effect, each rule that changed the URL gets its hit counter
    incremented.
    """

    for rule in rules:

        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c + 1

    return url
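
# A minimal sketch of the rule interface consumed above (hypothetical --
# real rules come from sanitizingrules_by_obj_type()):
#
#   class FakeRule(object):
#       slug = 'strip-whitespace'
#       search = r'\s'      # pattern to remove
#       replace = ''
#
#   rule = FakeRule()
#   apply_sanitizing_rules('http://example.com/my feed.xml', [rule])
#   # -> 'http://example.com/myfeed.xml'; rule.hits is now 1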

def maintenance(dry_run=False):
    """
    Checks how many podcasts can be updated, merged or removed by applying
    both basic sanitizing and the rules from the database.

    Unless dry_run is set, the changes are written back to the database.
    """

    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')

    num_podcasts = podcast_count()

    print 'Stats'
    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
    if dry_run:
        print ' * dry run - nothing will be written to the database'
    print

    print 'precompiling regular expressions'

    # episode rules are prepared here, but episodes are not processed yet
    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))

    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)

    podcasts = all_podcasts()

    for n, p in enumerate(podcasts):
        # use the precompiled rules directly; sanitize_url() would fetch
        # (and recompile) them for every single URL
        su = apply_sanitizing_rules(basic_sanitizing(p.url), podcast_rules)

        # nothing to do
        if su == p.url:
            p_stats['unchanged'] += 1
            continue

        # invalid podcast, remove
        if su == '':
            if not dry_run:
                p.delete()
            p_stats['deleted'] += 1
            continue

        su_podcast = podcast_for_url(url=su)

        if not su_podcast:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                logger.info('updating podcast %s - "%s" => "%s"' %
                            (p.id, p.url, su))
                p.url = su
                p.save()

            p_stats['updated'] += 1
            continue

        # nothing to do
        if p == su_podcast:
            p_stats['unchanged'] += 1
            continue

        # last option - merge podcasts
        if not dry_run:
            rewrite_podcasts(p, su_podcast)
            p.delete()

        p_stats['merged'] += 1

        progress(n+1, num_podcasts, str(p.id))

    print 'finished %d podcasts' % (n+1)
    # p_stats is a defaultdict, so missing keys (e.g. 'error') print as 0
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, ' \
          '%(deleted)d deleted, %(error)d error' % p_stats
    print 'Hits'
    for r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0))
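
# Intended to be run from a Django management shell, e.g. (illustrative):
#
#   >>> from mygpo.api.sanitizing import maintenance
#   >>> maintenance(dry_run=True)    # report only, nothing is written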

def rewrite_podcasts(p_old, p_new):

    logger.info('merging podcast %s "%s" to correct podcast %s "%s"' %
                (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)

def rewrite_newpodcast(p_old, p_new):
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has many more subscribers;
        # taking only the count of the old podcast would look like a drop
        if None in (n, o):
            continue

        # sum both counts so the merge does not show up as a drop
        subscribers.append(
            models.SubscriberData(
                timestamp=o.timestamp,
                subscriber_count=n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()

def precompile_rules(rules):
    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule
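
# apply_sanitizing_rules() prefers the precompiled search_re, so batch jobs
# like maintenance() compile each pattern once instead of once per URL.
# With the hypothetical FakeRule sketched above:
#
#   rules = list(precompile_rules([FakeRule()]))
#   rules[0].search_re.pattern    # the compiled pattern, r'\s'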