3 from django
.core
.cache
import cache
5 from mygpo
.core
import models
6 from mygpo
.api
.models
import Podcast
, Episode
7 from mygpo
.log
import log
8 from mygpo
.utils
import iterate_together
, progress
def sanitize_urls(urls, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URLs and return the results """

    # resolve the rule set once, then sanitize each URL lazily
    sanitizing_rules = get_sanitizing_rules(obj_type, rules)
    return (sanitize_url(u, rules=sanitizing_rules) for u in urls)
def sanitize_url(url, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URL and return the results

    The URL is first normalized via basic_sanitizing() and then run
    through the rules for the given obj_type ('podcast' or 'episode').
    """

    rules = get_sanitizing_rules(obj_type, rules=rules)
    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    # FIX: the sanitized URL was computed but never returned, so the
    # function implicitly returned None; callers (e.g. maintenance())
    # compare the result against the original URL
    return url
def get_sanitizing_rules(obj_type, rules=None):
    """ Returns the sanitizing-rules from the cache or the database

    Explicitly passed rules take precedence; otherwise the rules are
    taken from the cache and, failing that, loaded from the database.
    obj_type ('podcast' or 'episode') selects the rule set and is part
    of the cache key.
    """

    cache_name = '%s-sanitizing-rules' % obj_type

    # FIX: the head of this multi-line expression ("sanitizing_rules =
    # rules or") was lost in the excerpt, leaving the value unassigned;
    # restored per the rules=None parameter and the or-chain continuation
    sanitizing_rules = rules or \
        cache.get(cache_name) or \
        list(models.SanitizingRule.for_obj_type(obj_type))

    # refresh the cached rule set for one hour
    cache.set(cache_name, sanitizing_rules, 60 * 60)

    return sanitizing_rules
def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the
    netloc to lowercase

    The scheme is lowercased by urlsplit itself; path, query and fragment
    are preserved unchanged.
    """
    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, r.fragment)
    # FIX: r2 was built but never returned, so the function returned None
    return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """ Applies all url sanitizing rules to the given url and returns it

    Each rule carries a regex pattern in `search` (optionally precompiled
    into `search_re` by precompile_rules()) and a `replace`ment string.
    Rules that actually changed the URL get their `hits` counter
    incremented, which maintenance() reports as statistics.
    """
    # NOTE(review): the loop, the else-branch and the return statement are
    # reconstructed -- the excerpt only preserves the substitution lines
    # and the hit-counter read
    for rule in rules:
        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        # count how often each rule actually modified a URL
        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c + 1

    return url
80 def maintenance(dry_run
=False):
82 This currently checks how many podcasts could be removed by
83 applying both basic sanitizing rules and those from the database.
85 This will later be used to replace podcasts!
88 podcast_rules
= get_sanitizing_rules('podcast')
89 episode_rules
= get_sanitizing_rules('episode')
91 num_podcasts
= Podcast
.objects
.count()
92 num_episodes
= Episode
.objects
.count()
95 print ' * %d podcasts - %d rules' % (num_podcasts
, len(podcast_rules
))
96 print ' * %d episodes - %d rules' % (num_episodes
, len(episode_rules
))
98 print ' * dry run - nothing will be written to the database'
101 print 'precompiling regular expressions'
103 podcast_rules
= list(precompile_rules(podcast_rules
))
104 episode_rules
= list(precompile_rules(episode_rules
))
106 p_stats
= collections
.defaultdict(int)
107 e_stats
= collections
.defaultdict(int)
109 podcasts
= Podcast
.objects
.only('id', 'url').order_by('id').iterator()
111 for n
, p
in enumerate(podcasts
):
113 su
= sanitize_url(p
.url
, rules
=podcast_rules
)
115 log('failed to sanitize url for podcast %s: %s' % (p
.id, e
))
116 print 'failed to sanitize url for podcast %s: %s' % (p
.id, e
)
117 p_stats
['error'] += 1
122 p_stats
['unchanged'] += 1
125 # invalid podcast, remove
130 p_stats
['deleted'] += 1
133 log('failed to delete podcast %s: %s' % (p
.id, e
))
134 print 'failed to delete podcast %s: %s' % (p
.id, e
)
135 p_stats
['error'] += 1
140 su_podcast
= Podcast
.objects
.get(url
=su
)
142 except Podcast
.DoesNotExist
, e
:
143 # "target" podcast does not exist, we simply change the url
145 log('updating podcast %s - "%s" => "%s"' % (p
.id, p
.url
, su
))
149 p_stats
['updated'] += 1
154 p_stats
['unchanged'] += 1
157 # last option - merge podcasts
160 rewrite_podcasts(p
, su_podcast
)
163 p_stats
['merged'] += 1
166 log('error rewriting podcast %s: %s' % (p
.id, e
))
167 print 'error rewriting podcast %s: %s' % (p
.id, e
)
168 p_stats
['error'] += 1
171 progress(n
+1, num_podcasts
, str(p
.id))
173 print 'finished %s podcasts' % (n
+1)
174 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
176 for _
, r
in podcast_rules
:
177 print '% 30s: %d' % (r
.slug
, getattr(r
, 'hits', 0) if hasattr(r
, 'hits') else 0)
179 episodes
= Episode
.objects
.only('id', 'url').order_by('id').iterator()
182 su
= sanitize_url(e
.url
, rules
=episode_rules
)
183 except Exception, ex
:
184 log('failed to sanitize url for episode %s: %s' % (e
.id, ex
))
185 print 'failed to sanitize url for episode %s: %s' % (e
.id, ex
)
186 e_stats
['error'] += 1
191 e_stats
['unchanged'] += 1
194 # invalid episode, remove
200 e_stats
['deleted'] += 1
201 except Exception, ex
:
202 log('failed to delete episode %s: %s' % (e
.id, ex
))
203 print 'failed to delete episode %s: %s' % (e
.id, ex
)
204 e_stats
['error'] += 1
209 su_episode
= Episode
.objects
.get(url
=su
, podcast
=e
.podcast
)
211 except Episode
.DoesNotExist
, ex
:
212 # "target" episode does not exist, we simply change the url
214 log('updating episode %s - "%s" => "%s"' % (e
.id, e
.url
, su
))
218 e_stats
['updated'] += 1
223 e_stats
['unchanged'] += 1
227 # last option - merge episodes
232 e_stats
['merged'] += 1
234 except Exception, ex
:
235 log('error rewriting episode %s: %s' % (e
.id, ex
))
236 print 'error rewriting episode %s: %s' % (e
.id, ex
)
237 e_stats
['error'] += 1
240 progress(n
+1, num_episodes
, str(e
.id))
242 print 'finished %s episodes' % num_episodes
243 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % e_stats
245 print 'finished %s podcasts' % num_podcasts
246 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
249 for _
, r
in episode_rules
:
250 print '% 30s: %d' % (r
.slug
, getattr(r
, 'hits', 0) if hasattr(r
, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merges the duplicate podcast p_old into the correct podcast p_new """

    log('merging podcast %s "%s" to correct podcast %s "%s"' %
        (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merges the new-style counterparts of the two podcasts

    Looks up the models.Podcast documents for both podcasts by their old
    ids and folds the subscriber data of the old podcast into the new one.
    """
    p_n = models.Podcast.for_oldid(p_new.id)
    p_o = models.Podcast.for_oldid(p_old.id)

    # nothing to merge if either side has no new-style counterpart
    # FIX: was "Nont in (p_n, p_o)" -- a NameError at runtime
    if None in (p_n, p_o):
        return

    # merge subscriber data
    subscribers = []
    # NOTE(review): compare is unused in the visible excerpt; it was
    # presumably used to sort the merged data in lines lost from the
    # excerpt -- confirm against the repository history
    compare = lambda a, b: cmp(a.timestamp, b.timestamp)
    for n, o in iterate_together(p_n.subscribers, p_o.subscribers):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if o is None:
            continue

        subscribers.append(
            models.SubscriberData(
                timestamp = o.timestamp,
                # FIX: the excerpt summed n.subscriber_count with itself;
                # sum the counts of both podcasts instead -- TODO confirm
                # the intended combination
                subscriber_count = n.subscriber_count + \
                                   o.subscriber_count if n else o.subscriber_count,
                )
            )

    p_n.subscribers = subscribers
    # NOTE(review): persistence of the merged document is not visible in
    # the excerpt; save() reconstructed so the merge takes effect
    p_n.save()
def precompile_rules(rules):
    """ Compiles the search-regex of each rule and yields the rule

    Attaches a `search_re` attribute (a compiled, unicode-aware regex) to
    every rule so that apply_sanitizing_rules() can use the precompiled
    pattern instead of re-compiling per URL.
    """
    # FIX: only the compile statement survived the excerpt; the loop and
    # the yield are restored -- maintenance() consumes this generator via
    # list(precompile_rules(...))
    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule