import collections
import re
import urlparse

from django.core.cache import cache

from mygpo.core import models
from mygpo.log import log
from mygpo.utils import iterate_together, progress
from mygpo.db.couchdb.podcast import podcast_count, podcast_for_oldid, \
    all_podcasts, podcast_for_url
from mygpo.db.couchdb.common import sanitizingrules_by_obj_type
def sanitize_urls(urls, obj_type='podcast'):
    """ Sanitize each of the given URLs

    Applies sanitize_url() to every URL and returns the list of results.
    """

    sanitized = []
    for candidate in urls:
        sanitized.append(sanitize_url(candidate, obj_type))
    return sanitized
def sanitize_url(url, obj_type='podcast', rules=None):
    """ Apply sanitizing rules to the given URL and return the result

    A pre-loaded (possibly precompiled, see precompile_rules()) rule list
    can be passed in via *rules* -- maintenance() does this to avoid one
    database lookup per podcast.  Otherwise the rules for *obj_type* are
    loaded via sanitizingrules_by_obj_type().
    """

    if rules is None:
        rules = sanitizingrules_by_obj_type(obj_type)

    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    return url
def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the
    netloc to lowercase
    """

    r = urlparse.urlsplit(url)
    netloc = r.netloc.lower()
    # rebuild the URL with the lower-cased netloc; the fragment (the last
    # SplitResult component) is deliberately dropped
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """
    applies all url sanitizing rules to the given url and returns the
    resulting url

    Each rule provides a regex pattern ``search`` (or a precompiled
    pattern ``search_re``) and a replacement string ``replace``.  A rule
    that actually changed the url gets its ``hits`` counter incremented;
    maintenance() reports these counts.
    """

    for rule in rules:

        orig = url

        # check for precompiled regex first
        if hasattr(rule, 'search_re'):
            url = rule.search_re.sub(rule.replace, url)
        else:
            url = re.sub(rule.search, rule.replace, url)

        # count only rules that actually changed the url
        if orig != url:
            c = getattr(rule, 'hits', 0)
            rule.hits = c + 1

    return url
66 def maintenance(dry_run
=False):
68 This currently checks how many podcasts could be removed by
69 applying both basic sanitizing rules and those from the database.
71 This will later be used to replace podcasts!
74 podcast_rules
= sanitizingrules_by_obj_type('podcast')
75 episode_rules
= sanitizingrules_by_obj_type('episode')
77 num_podcasts
= podcast_count()
80 print ' * %d podcasts - %d rules' % (num_podcasts
, len(podcast_rules
))
82 print ' * dry run - nothing will be written to the database'
85 print 'precompiling regular expressions'
87 podcast_rules
= list(precompile_rules(podcast_rules
))
88 episode_rules
= list(precompile_rules(episode_rules
))
90 p_stats
= collections
.defaultdict(int)
91 e_stats
= collections
.defaultdict(int)
93 podcasts
= all_podcasts()
95 for n
, p
in enumerate(podcasts
):
96 su
= sanitize_url(p
.url
, rules
=podcast_rules
)
100 p_stats
['unchanged'] += 1
103 # invalid podcast, remove
107 p_stats
['deleted'] += 1
109 su_podcast
= podcast_for_url(url
=su
)
112 # "target" podcast does not exist, we simply change the url
114 log('updating podcast %s - "%s" => "%s"' % (p
.id, p
.url
, su
))
118 p_stats
['updated'] += 1
123 p_stats
['unchanged'] += 1
126 # last option - merge podcasts
128 rewrite_podcasts(p
, su_podcast
)
131 p_stats
['merged'] += 1
133 progress(n
+1, num_podcasts
, str(p
.id))
135 print 'finished %s podcasts' % (n
+1)
136 print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
138 for _
, r
in podcast_rules
:
139 print '% 30s: %d' % (r
.slug
, getattr(r
, 'hits', 0) if hasattr(r
, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """ Merge the obsolete podcast p_old into the correct podcast p_new """

    message = 'merging podcast %s "%s" to correct podcast %s "%s"'
    log(message % (p_old.id, p_old.url, p_new.id, p_new.url))

    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    """ Merge the subscriber data of p_old into p_new

    Both podcasts are looked up via their old ids; if either is missing
    there is nothing to merge.  The two subscriber time series are
    combined entry by entry so that the merged podcast does not show an
    artificial drop in subscriber numbers.
    """

    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)

    # one of the podcasts is not available - nothing to merge
    if None in (p_n, p_o):
        return

    # merge subscriber data; iterate_together aligns the two series
    subscribers = []
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):

        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        if n is None:
            continue

        subscribers.append(
            models.SubscriberData(
                timestamp = n.timestamp,
                # add the old podcast's count when it has data for this
                # timestamp; the original expression summed
                # n.subscriber_count twice (ternary precedence bug) and
                # never added the old podcast's count
                subscriber_count = n.subscriber_count + \
                    (o.subscriber_count if o else 0),
                )
            )

    p_n.subscribers = subscribers

    # NOTE(review): save() assumed to be the persistence call for these
    # documents - confirm; without it the merge is lost
    p_n.save()
def precompile_rules(rules):
    """ Precompile the search pattern of each rule

    Attaches the compiled regex as ``search_re`` to every rule (checked
    by apply_sanitizing_rules()) and yields the rule again, so this can
    be used as a lazy pass-through.
    """

    for rule in rules:
        rule.search_re = re.compile(rule.search, re.UNICODE)
        yield rule