move sanitizing rule queries into db module
mygpo.git: mygpo/api/sanitizing.py
blob 2bdcfef85a0270569a725242d72f1942e66f0d98
import collections
import urlparse
import re

from django.core.cache import cache

from mygpo.core import models
from mygpo.log import log
from mygpo.utils import iterate_together, progress
from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
    all_podcasts, podcast_for_url
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type


def sanitize_urls(urls, obj_type='podcast'):
    """ Apply sanitizing rules to the given URLs and return the results """

    rules = sanitizingrules_by_obj_type(obj_type)

    return (sanitize_url(url, rules=rules) for url in urls)
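
# Illustrative note (not from the original source): sanitize_urls() returns a
# lazy generator, so the sanitizing rules are looked up once and each URL is
# only processed when the caller iterates, e.g.
#
#     cleaned = list(sanitize_urls(['http://A.example/feed', 'http://B.example/rss']))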


def sanitize_url(url, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URL and return the results """

    # rules can be passed in directly (e.g. precompiled) to avoid repeated
    # database lookups; otherwise they are fetched for the given object type
    rules = rules or sanitizingrules_by_obj_type(obj_type)

    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    return url
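
# Example (illustrative): a call such as
#
#     sanitize_url('http://Example.COM/podcast.xml#latest')
#
# first normalizes the URL via basic_sanitizing() (lowercased host, fragment
# dropped) and then applies the 'podcast' rules configured in the database.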


def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the netloc to lowercase
    """
    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    return r2.geturl()
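
# Example (illustrative values):
#
#     basic_sanitizing('http://Example.COM/Feed?x=1#top')
#     # => 'http://example.com/Feed?x=1'
#
# scheme, path and query are kept as-is, the hostname is lowercased and the
# fragment is dropped because the SplitResult is rebuilt with an empty fragment.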


def apply_sanitizing_rules(url, rules):
    """
    applies all given sanitizing rules to the given url and counts, per rule,
    how often it actually changed the url
    """

    for rule in rules:

        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c+1

    return url
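
# Illustrative sketch (FakeRule is an assumed stand-in for the rule objects
# returned by sanitizingrules_by_obj_type(), which carry at least `slug`,
# `search` and `replace` attributes):
#
#     class FakeRule(object):
#         def __init__(self, slug, search, replace):
#             self.slug, self.search, self.replace = slug, search, replace
#
#     rule = FakeRule('strip-format-param', r'\?format=xml$', '')
#     apply_sanitizing_rules('http://feeds.example.org/cast?format=xml', [rule])
#     # => 'http://feeds.example.org/cast', and rule.hits == 1 afterwards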


def maintenance(dry_run=False):
    """
    This currently checks how many podcasts could be removed by
    applying both basic sanitizing rules and those from the database.

    This will later be used to replace podcasts!
    """

    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')

    num_podcasts = podcast_count()

    print 'Stats'
    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
    if dry_run:
        print ' * dry run - nothing will be written to the database'
    print

    print 'precompiling regular expressions'

    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))

    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)

    podcasts = all_podcasts()

    for n, p in enumerate(podcasts):
        su = sanitize_url(p.url, rules=podcast_rules)

        # nothing to do
        if su == p.url:
            p_stats['unchanged'] += 1
            continue

        # invalid podcast, remove
        if su == '':
            if not dry_run:
                p.delete()
            p_stats['deleted'] += 1
            continue

        su_podcast = podcast_for_url(url=su)

        if not su_podcast:
            # "target" podcast does not exist, we simply change the url
            if not dry_run:
                log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
                p.url = su
                p.save()

            p_stats['updated'] += 1
            continue

        # nothing to do
        if p == su_podcast:
            p_stats['unchanged'] += 1
            continue

        # last option - merge podcasts
        if not dry_run:
            rewrite_podcasts(p, su_podcast)
            p.delete()

        p_stats['merged'] += 1

        progress(n+1, num_podcasts, str(p.id))

    print 'finished %s podcasts' % (n+1)
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
    print 'Hits'
    for r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0))
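
# Usage sketch (assumed invocation from a Django shell; not part of the
# original file):
#
#     from mygpo.api.sanitizing import maintenance
#     maintenance(dry_run=True)   # only report what would be changed
#     maintenance()               # actually update / merge / delete podcasts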


def rewrite_podcasts(p_old, p_new):

    log('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)


def rewrite_newpodcast(p_old, p_new):
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has many more subscribers;
        # taking only the count of the old podcast would look like a drop
        if None in (n, o):
            continue

        subscribers.append(
            models.SubscriberData(
                timestamp = o.timestamp,
                subscriber_count = n.subscriber_count + o.subscriber_count,
            )
        )

    p_n.subscribers = subscribers

    p_n.save()
    p_o.delete()
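
# Example (illustrative numbers): if, for the same timestamp, the new podcast
# has a SubscriberData entry with subscriber_count=100 and the old podcast has
# one with subscriber_count=5, the merged list contains a single entry with
# subscriber_count=105, so the merge does not appear as a drop in subscribers.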


def precompile_rules(rules):
    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule
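
# Usage sketch (assumes rule objects with a `search` attribute, as returned by
# sanitizingrules_by_obj_type()):
#
#     rules = list(precompile_rules(sanitizingrules_by_obj_type('podcast')))
#     sanitize_url('http://example.com/feed', rules=rules)
#
# This mirrors what maintenance() does: each rule's regex is compiled once and
# reused for every podcast URL, so apply_sanitizing_rules() takes the
# `search_re` branch instead of re-parsing the pattern with re.sub().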