5 from django
.core
.cache
import cache
7 from mygpo
.core
import models
8 from mygpo
.utils
import iterate_together
, progress
9 from mygpo
.db
.couchdb
.podcast
import podcast_count
, podcast_for_oldid
, \
11 from mygpo
.db
.couchdb
.common
import sanitizingrules_by_obj_type
# Module-level logger named after this module (standard logging pattern).
# NOTE(review): the `import logging` line is not visible in this extract --
# confirm it is present in the file's import block.
logger = logging.getLogger(__name__)
def sanitize_urls(urls, obj_type='podcast'):
    """Sanitize every URL in *urls* and return the list of results.

    Each entry is passed through sanitize_url() with the same *obj_type*,
    so the output list is parallel to the input.
    """
    sanitized = []
    for original in urls:
        sanitized.append(sanitize_url(original, obj_type))
    return sanitized
def sanitize_url(url, obj_type='podcast'):
    """ Apply sanitizing rules to the given URL and return the result.

    The URL is first normalised through basic_sanitizing() and then run
    through the sanitizing rules configured for *obj_type* (looked up via
    sanitizingrules_by_obj_type).
    """
    rules = sanitizingrules_by_obj_type(obj_type)
    url = basic_sanitizing(url)
    url = apply_sanitizing_rules(url, rules)
    # BUG FIX: the sanitized URL was computed but never returned, so every
    # caller (e.g. sanitize_urls) received None instead of the cleaned URL.
    return url
def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the
    netloc to lowercase; the fragment part of the URL is dropped.
    """
    r = urlparse.urlsplit(url)
    # Hostnames are case-insensitive, so normalise the netloc only --
    # path and query keep their original case.
    netloc = r.netloc.lower()
    # Rebuild with an empty fragment ('' as the last field).
    r2 = urlparse.SplitResult(r.scheme, netloc, r.path, r.query, '')
    # BUG FIX: the rebuilt URL was never returned (the function returned
    # None), breaking sanitize_url which assigns this result.
    return r2.geturl()
def apply_sanitizing_rules(url, rules):
    """
    applies all url sanitizing rules to the given url
    setting podcast=True uses only those rules which have use_podcast set to True.
    When passing podcast=False this check is ommitted. The same is valid
    """
    # NOTE(review): several original source lines are missing from this
    # extract (the docstring tail and, apparently, a loop header binding
    # `rule` from `rules`) -- the statements below clearly execute once per
    # rule; confirm against the complete file.
    # check for precompiled regex first
    if hasattr(rule, 'search_re'):
        # precompile_rules() attaches a compiled pattern as `search_re`;
        # using it avoids recompiling the pattern on every call
        url = rule.search_re.sub(rule.replace, url)
    # NOTE(review): an `else:` branch appears to be missing here -- this
    # re.sub is presumably the fallback for rules without `search_re`
    url = re.sub(rule.search, rule.replace, url)
    # per-rule hit counter used by maintenance() statistics; defaults to 0
    c = getattr(rule, 'hits', 0)
    # NOTE(review): the remainder of the function (hit accounting and the
    # return of the sanitized url) is not visible in this extract
def maintenance(dry_run=False):
    """
    This currently checks how many podcasts could be removed by
    applying both basic sanitizing rules and those from the database.

    This will later be used to replace podcasts!
    """
    # NOTE(review): this extract is missing many interior lines (the
    # conditionals/branches around the statement groups below); indentation
    # here is reconstructed -- verify against the complete file.
    # Load the sanitizing rule sets for both object types.
    podcast_rules = sanitizingrules_by_obj_type('podcast')
    episode_rules = sanitizingrules_by_obj_type('episode')
    num_podcasts = podcast_count()
    print ' * %d podcasts - %d rules' % (num_podcasts, len(podcast_rules))
    # NOTE(review): presumably guarded by `if dry_run:` (line not visible)
    print ' * dry run - nothing will be written to the database'
    print 'precompiling regular expressions'
    # Precompile rule regexes once so the per-podcast loop hits the fast
    # `search_re` path in apply_sanitizing_rules().
    podcast_rules = list(precompile_rules(podcast_rules))
    episode_rules = list(precompile_rules(episode_rules))
    # Outcome counters keyed by result category ('unchanged', 'deleted', ...).
    p_stats = collections.defaultdict(int)
    e_stats = collections.defaultdict(int)
    podcasts = all_podcasts()
    for n, p in enumerate(podcasts):
        # NOTE(review): sanitize_url's visible signature is
        # (url, obj_type='podcast') and has no `rules` kwarg -- this call
        # may target an older/other signature; confirm.
        su = sanitize_url(p.url, rules=podcast_rules)
        # NOTE(review): the branches deciding between the outcomes below
        # (url unchanged / invalid / target exists or not) are missing from
        # this extract.
        p_stats['unchanged'] += 1
        # invalid podcast, remove
        p_stats['deleted'] += 1
        su_podcast = podcast_for_url(url=su)
        # "target" podcast does not exist, we simply change the url
        logger.info('updating podcast %s - "%s" => "%s"' % (p.id, p.url, su))
        p_stats['updated'] += 1
        p_stats['unchanged'] += 1
        # last option - merge podcasts
        rewrite_podcasts(p, su_podcast)
        p_stats['merged'] += 1
        progress(n+1, num_podcasts, str(p.id))
    print 'finished %s podcasts' % (n+1)
    print '%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error' % p_stats
    # Per-rule hit statistics collected by apply_sanitizing_rules().
    for _, r in podcast_rules:
        print '% 30s: %d' % (r.slug, getattr(r, 'hits', 0) if hasattr(r, 'hits') else 0)
def rewrite_podcasts(p_old, p_new):
    """Merge the duplicate podcast *p_old* into *p_new*.

    Logs the merge and delegates the actual rewriting to
    rewrite_newpodcast().
    """
    merge_msg = 'merging podcast %s "%s" to correct podcast %s "%s"' % (
        p_old.id, p_old.url, p_new.id, p_new.url)
    logger.info(merge_msg)
    rewrite_newpodcast(p_old, p_new)
def rewrite_newpodcast(p_old, p_new):
    # NOTE(review): several interior lines are missing from this extract
    # (the early-exit body, the construction of the `subscribers` list, and
    # the final save) -- indentation is reconstructed; verify against the
    # complete file.
    # Resolve both podcasts in the target store by their old ids.
    p_n = podcast_for_oldid(p_new.id)
    p_o = podcast_for_oldid(p_old.id)
    if None in (p_n, p_o):
        # NOTE(review): the branch body (presumably an early return) is
        # not visible in this extract.
    # merge subscriber data
    # NOTE(review): py2-only `cmp` -- this lambda orders SubscriberData by
    # timestamp; it appears unused in the visible lines.
    compare = lambda a, b: cmp(a.timestamp, b.timestamp)
    for n, o in iterate_together([p_n.subscribers, p_o.subscribers]):
        # we assume that the new podcast has much more subscribers
        # taking only count of the old podcast would look like a drop
        models.SubscriberData(
            timestamp = o.timestamp,
            # NOTE(review): `n.subscriber_count + n.subscriber_count`
            # doubles the new podcast's count instead of adding the old
            # one's -- looks like a bug or extraction garble; confirm
            # whether the second term should be `o.subscriber_count`.
            subscriber_count = n.subscriber_count + \
                n.subscriber_count if n else 0\
    # NOTE(review): `subscribers` is assigned here but its construction is
    # not visible in this extract.
    p_n.subscribers = subscribers
def precompile_rules(rules):
    # NOTE(review): the loop header over `rules` (and presumably a
    # `yield rule`) is missing from this extract -- maintenance() consumes
    # this function with list(...), so it is most likely a generator.
    # Compile each rule's `search` pattern once so apply_sanitizing_rules
    # can take the fast precompiled `search_re` path.
    rule.search_re = re.compile(rule.search, re.UNICODE)