5 from django
.core
.cache
import cache
7 from mygpo
.core
import models
8 from mygpo
.log
import log
9 from mygpo
.utils
import iterate_together
, progress
10 from mygpo
.db
.couchdb
.podcast
import podcast_count
, podcast_for_oldid
, \
12 from mygpo
.db
.couchdb
.common
import sanitizingrules_by_obj_type
def sanitize_urls(urls, obj_type='podcast'):
    """ Apply the sanitizing rules for `obj_type` to every URL in `urls`.

    The rule set is looked up once and shared by all URLs; the sanitized
    URLs are produced lazily as a generator.
    """
    ruleset = sanitizingrules_by_obj_type(obj_type)
    return (sanitize_url(u, rules=ruleset) for u in urls)
def sanitize_url(url, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URL and return the result.

    url      -- the URL to clean up
    obj_type -- which rule set to load when `rules` is not given
    rules    -- optional pre-fetched (possibly precompiled) rules; passing
                them avoids one rules lookup per URL. sanitize_urls() and
                maintenance() call this with rules=..., which the previous
                signature (url, obj_type only) did not accept.

    Returns the sanitized URL.
    """
    if rules is None:
        rules = sanitizingrules_by_obj_type(obj_type)
    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    # previously the sanitized value was computed and silently discarded
    return url
def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the
    netloc to lowercase and drops the fragment

    Returns the normalized URL as a string.
    """
    r = urlparse.urlsplit(url)
    # hostnames are case-insensitive; normalize for de-duplication
    netloc = r.netloc.lower()
    # rebuild with the lowercased host and an empty fragment
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    # previously r2 was built but never returned (function returned None)
    return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """
    applies all url sanitizing rules to the given url
    setting podcast=True uses only those rules which have use_podcast set to True.
    When passing podcast=False this check is ommitted. The same is valid
    """
    # NOTE(review): the docstring above refers to a podcast= parameter that
    # the signature no longer has — it appears stale; confirm and update.
    # NOTE(review): the loop header binding `rule` (iterating `rules`) is
    # not visible in this chunk; the statements below operate per-rule.

    # check for precompiled regex first
    if hasattr(rule, 'search_re'):
        url = rule.search_re.sub(rule.replace, url)

    # NOTE(review): presumably the fallback branch for rules without a
    # precompiled pattern (the `else:` line is not visible in this chunk);
    # re.sub compiles rule.search on every call here.
    url = re.sub(rule.search, rule.replace, url)

    # read the rule's current hit counter, defaulting to 0 — presumably
    # incremented when the rule matched (increment not visible here)
    c = getattr(rule, 'hits', 0)
def maintenance(dry_run=False):
    """
    This currently checks how many podcasts could be removed by
    applying both basic sanitizing rules and those from the database.

    This will later be used to replace podcasts!

    dry_run -- when True, nothing is written to the database
    """
    # fetch the sanitizing rule sets for both object types
    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')

    num_podcasts = podcast_count()

    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))

    # NOTE(review): presumably guarded by `if dry_run:` — the conditional
    # is not visible in this chunk
    print ' * dry run - nothing will be written to the database'

    print 'precompiling regular expressions'

    # materialize the generators so the rules can be reused for every podcast
    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))

    # outcome counters, keyed 'unchanged' / 'deleted' / 'updated' / 'merged' / ...
    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)

    podcasts = all_podcasts()

    for n, p in enumerate(podcasts):
        # sanitize with the precompiled podcast rules
        su = sanitize_url(p.url, rules=podcast_rules)

        # NOTE(review): the conditionals selecting among the outcome
        # branches below (unchanged / deleted / updated / merged) are not
        # visible in this chunk; the branch bodies appear in sequence.
        p_stats['unchanged'] += 1

        # invalid podcast, remove
        p_stats['deleted'] += 1

        # look up whether a podcast with the sanitized url already exists
        su_podcast = podcast_for_url(url=su)

        # "target" podcast does not exist, we simply change the url
        log('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
        p_stats['updated'] += 1

        p_stats['unchanged'] += 1

        # last option - merge podcasts
        rewrite_podcasts(p, su_podcast)

        p_stats['merged'] += 1

        progress(n+1, num_podcasts, str(p.id))

    print 'finished %s podcasts' % (n+1)
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats

    # per-rule hit counts; rules appear to be (key, rule) pairs — TODO confirm
    for _, r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merge the old podcast into the "correct" podcast p_new. """
    msg = 'merging podcast %s "%s" to correct podcast %s "%s"' % (
        p_old.id, p_old.url, p_new.id, p_new.url)
    log(msg)

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    # Merge data from the old podcast document into the new one.
    # Look up both documents by their ids.
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    # bail out when either document could not be found
    if None in (p_n, p_o):
        # NOTE(review): the guard's body is not visible in this chunk —
        # presumably an early return.

    # merge subscriber data
    # cmp-style comparator ordering SubscriberData chronologically;
    # NOTE(review): `compare` is not used in the visible code — its use
    # (or removal) is presumably in elided lines.
    compare = lambda a, b: cmp(a.timestamp, b.timestamp)
    # iterate both subscriber histories in lockstep
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):
        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        # NOTE(review): the expression below adds n.subscriber_count to
        # itself; presumably one term should come from `o`. Its remaining
        # continuation lines (and the closing paren) are not visible here.
        models.SubscriberData(
            timestamp = o.timestamp,
            subscriber_count = n.subscriber_count + \
                n.subscriber_count if n else 0\

    # NOTE(review): `subscribers` is not defined in the visible code —
    # presumably accumulated in elided lines of the loop above.
    p_n.subscribers = subscribers
def precompile_rules(rules):
    # Precompile each rule's search pattern so repeated application (see
    # apply_sanitizing_rules) can use rule.search_re instead of re-compiling
    # per URL.
    # NOTE(review): the loop header binding `rule` — and presumably a
    # `yield` — are not visible in this chunk; callers wrap the result in
    # list(), so this is presumably a generator.
    rule.search_re = re.compile(rule.search, re.UNICODE)