From: Stefan Kögl
Date: Thu, 18 Jul 2013 16:29:21 +0000 (+0200)
Subject: add admin task for unifying the slugs of episodes
X-Git-Url: https://repo.or.cz/w/mygpo.git/commitdiff_plain/67dab09a93e4d1857ecd680b3143da34f7d26f4b
add admin task for unifying the slugs of episodes
---
diff --git a/mygpo/admin/tasks.py b/mygpo/admin/tasks.py
index 0e998768..e84f4c9e 100644
--- a/mygpo/admin/tasks.py
+++ b/mygpo/admin/tasks.py
@@ -1,8 +1,10 @@
from collections import Counter
from mygpo.cel import celery
+from mygpo.core.slugs import get_duplicate_slugs, EpisodeSlug
from mygpo.maintenance.merge import PodcastMerger
from mygpo.db.couchdb.podcast import podcasts_by_id
+from mygpo.db.couchdb.episode import episodes_for_podcast_uncached
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)
@@ -26,3 +28,49 @@ def merge_podcasts(podcast_ids, num_groups):
logger.info('merging result: %s', actions)
return actions, podcast
+
+
+@celery.task
+def unify_slugs(podcast):
+    """ Removes duplicate slugs of a podcast's episodes
+
+    For every slug reported by get_duplicate_slugs() the slug is taken away
+    from all reported episodes: if it is the episode's main slug, it is
+    replaced by one of the episode's remaining merged slugs or, if there is
+    none, by a newly generated slug; if it is a merged slug, it is dropped.
+    Every reported episode is saved afterwards.
+
+    Returns a tuple (actions, podcast) where actions is a Counter of the
+    operations that have been carried out. """
+
+    logger.warning('unifying slugs for podcast %s', podcast)
+    episodes = episodes_for_podcast_uncached(podcast)
+    logger.info('found %d episodes', len(episodes))
+
+    common_title = podcast.get_common_episode_title()
+    actions = Counter()
+
+    # get episodes with duplicate slugs
+    for slug, dups in get_duplicate_slugs(episodes):
+        actions['dup-slugs'] += 1
+        # and remove their slugs
+        logger.info('Found %d duplicates for slug %s', len(dups), slug)
+        for dup in dups:
+            actions['dup-episodes'] += 1
+
+            # if the problematic slug is a merged one, remove it; this is
+            # done before a replacement is chosen, so that the duplicate
+            # slug itself can never be picked as the new main slug
+            if slug in dup.merged_slugs:
+                actions['removed-merged'] += 1
+                logger.info('Removing merged slug %s', slug)
+                dup.merged_slugs = [s for s in dup.merged_slugs if s != slug]
+
+            # check if we're removing the "main" slug
+            if dup.slug == slug:
+
+                # if possible, replace it with a "merged" slug
+                if dup.merged_slugs:
+                    dup.slug = dup.merged_slugs.pop()
+                    actions['replaced-with-merged'] += 1
+                    logger.info('Replacing slug with merged slug %s', dup.slug)
+
+                # try to find a new slug
+                else:
+                    dup.slug = EpisodeSlug(dup, common_title,
+                            override_existing=True).get_slug()
+                    actions['replaced-with-new'] += 1
+                    logger.info('Replacing slug with new slug %s', dup.slug)
+
+            dup.save()
+
+    return actions, podcast
diff --git a/mygpo/admin/templates/admin/overview.html b/mygpo/admin/templates/admin/overview.html
index 0ee08c66..25742578 100644
--- a/mygpo/admin/templates/admin/overview.html
+++ b/mygpo/admin/templates/admin/overview.html
@@ -16,6 +16,7 @@
- {% trans "Host Information" %}
- {% trans "Merge Podcasts and Episodes" %}
+ - {% trans "Unify Duplicate Slugs" %}
- {% trans "Client Stats" %} ({% trans "JSON" %})
- {% trans "User-Agent Stats" %}
- {% trans "File Type Stats" %}
diff --git a/mygpo/admin/templates/admin/merge-status.html b/mygpo/admin/templates/admin/task-status.html
similarity index 81%
rename from mygpo/admin/templates/admin/merge-status.html
rename to mygpo/admin/templates/admin/task-status.html
index 74f025d7..2d2c6ecc 100644
--- a/mygpo/admin/templates/admin/merge-status.html
+++ b/mygpo/admin/templates/admin/task-status.html
@@ -12,18 +12,18 @@
{% block title %}
{% if ready %}
- {% trans "Merge Finished" %}
+ {% trans "Operation Finished" %}
{% else %}
- {% trans "Merge Ongoing" %}
+ {% trans "Operation Ongoing" %}
{% endif %}
{% endblock %}
{% block header %}
{% if ready %}
- {% trans "Merge Finished" %}
+ {% trans "Operation Finished" %}
{% else %}
- {% trans "Merge Ongoing" %}
+ {% trans "Operation Ongoing" %}
{% endif %}
{% endblock %}
@@ -36,6 +36,8 @@
{% for action, count in actions %}
- {{ action }}: {{ count }}
+ {% empty %}
+ - {% trans "none" %}
{% endfor %}
diff --git a/mygpo/admin/templates/admin/unify-slugs-select.html b/mygpo/admin/templates/admin/unify-slugs-select.html
new file mode 100644
index 00000000..675c3879
--- /dev/null
+++ b/mygpo/admin/templates/admin/unify-slugs-select.html
@@ -0,0 +1,30 @@
+{% extends "base.html" %}
+{% load i18n %}
+{% load podcasts %}
+
+{% load menu %}
+{% block mainmenu %}{{ "/admin/"|main_menu }}{% endblock %}
+{% block sectionmenu %}{{ "/admin/"|section_menu:"Admin" }}{% endblock %}
+
+{% block title %}{% trans "Admin Area" %}{% endblock %}
+
+{% block header %}
+ {% trans "Unify Duplicate Slugs" %}
+{% endblock %}
+
+{% block content %}
+
+
+{% endblock %}
+
diff --git a/mygpo/admin/urls.py b/mygpo/admin/urls.py
index 13e7da9d..11a232ef 100644
--- a/mygpo/admin/urls.py
+++ b/mygpo/admin/urls.py
@@ -3,7 +3,8 @@ from django.conf.urls import *
from mygpo.admin.views import Overview, MergeSelect, MergeVerify, \
MergeProcess, MergeStatus, ClientStatsView, ClientStatsJsonView, \
UserAgentStatsView, StatsView, StatsJsonView, HostInfo, \
- FiletypeStatsView, ActivateUserView
+ FiletypeStatsView, ActivateUserView, UnifyDuplicateSlugsSelect, \
+ UnifyDuplicateSlugs, UnifySlugsStatus
urlpatterns = patterns('mygpo.admin.views',
url(r'^$', Overview.as_view(), name='admin-overview'),
@@ -30,4 +31,16 @@ urlpatterns = patterns('mygpo.admin.views',
url(r'^activate-user/$',
ActivateUserView.as_view(),
name='admin-activate-user'),
+
+ url(r'^unify-slugs/select$',
+ UnifyDuplicateSlugsSelect.as_view(),
+ name='admin-unify-slugs-select'),
+
+ url(r'^unify-slugs/$',
+ UnifyDuplicateSlugs.as_view(),
+ name='admin-unify-slugs'),
+
+ url(r'^unify-slugs/status/(?P<task_id>[^/]+)$',
+ UnifySlugsStatus.as_view(),
+ name='admin-unify-slugs-status'),
)
diff --git a/mygpo/admin/views.py b/mygpo/admin/views.py
index e4af3ab0..083b582c 100644
--- a/mygpo/admin/views.py
+++ b/mygpo/admin/views.py
@@ -19,7 +19,7 @@ from mygpo.admin.group import PodcastGrouper
from mygpo.maintenance.merge import PodcastMerger, IncorrectMergeException
from mygpo.users.models import User
from mygpo.admin.clients import UserAgentStats, ClientStats
-from mygpo.admin.tasks import merge_podcasts
+from mygpo.admin.tasks import merge_podcasts, unify_slugs
from mygpo.utils import get_git_head
from mygpo.api.httpresponse import JsonResponse
from mygpo.cel import celery
@@ -180,7 +180,7 @@ class MergeProcess(MergeBase):
class MergeStatus(AdminView):
""" Displays the status of the merge operation """
- template_name = 'admin/merge-status.html'
+ template_name = 'admin/task-status.html'
def get(self, request, task_id):
result = merge_podcasts.AsyncResult(task_id)
@@ -201,7 +201,7 @@ class MergeStatus(AdminView):
messages.error(request, str(ime))
return HttpResponseRedirect(reverse('admin-merge'))
- return render(request, 'admin/merge-status.html', {
+ return self.render_to_response({
'ready': True,
'actions': actions.items(),
'podcast': podcast,
@@ -336,3 +336,52 @@ class ActivateUserView(AdminView):
_('User {username} ({email}) activated'.format(
username=user.username, email=user.email)))
return HttpResponseRedirect(reverse('admin-activate-user'))
+
+
+
+class UnifyDuplicateSlugsSelect(AdminView):
+ """ Select a podcast for which to unify slugs
+
+ This class only names the template to use and defines no request
+ handlers of its own; presumably AdminView (defined outside of this
+ diff) takes care of rendering the template. """
+ template_name = 'admin/unify-slugs-select.html'
+
+
+class UnifyDuplicateSlugs(AdminView):
+    """ Start the slug-unification task
+
+    Expects the URL of the podcast in the POST field "feed". If no podcast
+    is found for it, an error message is queued and the client is sent back
+    to the selection page; otherwise the unify_slugs task is started and the
+    client is redirected to the status page of that task. """
+
+    def post(self, request):
+        podcast_url = request.POST.get('feed')
+
+        # a missing or empty "feed" field is handled like an unknown podcast
+        # instead of being passed on to the lookup
+        podcast = podcast_for_url(podcast_url) if podcast_url else None
+
+        if not podcast:
+            # interpolate only after the translation lookup; otherwise the
+            # message already contains the URL and can never match an entry
+            # of the translation catalogue
+            messages.error(request,
+                    _('Podcast with URL "%s" does not exist') % (podcast_url,))
+            return HttpResponseRedirect(reverse('admin-unify-slugs-select'))
+
+        res = unify_slugs.delay(podcast)
+        return HttpResponseRedirect(reverse('admin-unify-slugs-status',
+                    args=[res.task_id]))
+
+
+class UnifySlugsStatus(AdminView):
+    """ Displays the status of the unify-slugs operation
+
+    As long as the task has not finished, the template is rendered with
+    ready=False. Afterwards the cache is cleared and the action counts and
+    the podcast returned by the task are passed to the template. """
+
+    template_name = 'admin/task-status.html'
+
+    def get(self, request, task_id):
+        # look the result up through unify_slugs, which is the task that
+        # UnifyDuplicateSlugs starts (not through merge_podcasts)
+        result = unify_slugs.AsyncResult(task_id)
+
+        if not result.ready():
+            return self.render_to_response({
+                'ready': False,
+            })
+
+        # clear cache to make the result of the unification visible
+        # TODO: what to do with multiple frontends?
+        cache.clear()
+
+        actions, podcast = result.get()
+
+        return self.render_to_response({
+            'ready': True,
+            'actions': actions.items(),
+            'podcast': podcast,
+        })
diff --git a/mygpo/core/slugs.py b/mygpo/core/slugs.py
index 915f8574..0d3d9052 100644
--- a/mygpo/core/slugs.py
+++ b/mygpo/core/slugs.py
@@ -1,3 +1,5 @@
+from collections import defaultdict
+
from itertools import count
from couchdbkit.ext.django.schema import *
@@ -5,6 +7,7 @@ from couchdbkit.ext.django.schema import *
from django.utils.text import slugify
from mygpo.decorators import repeat_on_conflict
+from mygpo.utils import partition
def assign_slug(obj, generator):
@@ -38,8 +41,8 @@ class SlugGenerator(object):
""" Generates a unique slug for an object """
- def __init__(self, obj):
- if obj.slug:
+ def __init__(self, obj, override_existing=False):
+ if obj.slug and not override_existing:
raise ValueError('%(obj)s already has slug %(slug)s' % \
dict(obj=obj, slug=obj.slug))
@@ -109,9 +112,9 @@ class PodcastSlug(PodcastGroupSlug):
class EpisodeSlug(SlugGenerator):
""" Generates slugs for Episodes """
- def __init__(self, episode, common_title):
+ def __init__(self, episode, common_title, override_existing=False):
self.common_title = common_title
- super(EpisodeSlug, self).__init__(episode)
+ super(EpisodeSlug, self).__init__(episode, override_existing)
self.podcast_id = episode.podcast
@@ -247,3 +250,41 @@ class SlugMixin(DocumentSchema):
# remove from merged slugs
self.merged_slugs = list(set(self.merged_slugs) - set([slug]))
+
+
+def get_duplicate_slugs(episodes):
+    """ Finds duplicate slugs and yields (slug, duplicates) pairs for each slug
+
+    Such a pair is only yielded for a slug that is used by more than one
+    episode, either as main slug or as merged slug. The "duplicates" list
+    does not contain the selected "winner" of a set of duplicates; it only
+    contains the episodes that should give up the slug.
+
+    The winner is the first episode that uses the slug as its main slug. If
+    no episode does, the last episode that has it as merged slug wins. """
+
+    # we build a dict of {slug: [episode1, episode2, ...], ...}
+    # for each slug all episodes are given that use this slug
+    slugs = defaultdict(list)
+
+    for episode in episodes:
+        seen = set()
+        for slug in [episode.slug] + list(episode.merged_slugs):
+            # skip empty slugs; also skip slugs that the episode lists more
+            # than once (e.g. as main and as merged slug), otherwise the
+            # episode would be reported as a duplicate of itself
+            if not slug or slug in seen:
+                continue
+            seen.add(slug)
+            slugs[slug].append(episode)
+
+    for slug, users in slugs.items():
+
+        # filter out unique slugs
+        if len(users) < 2:
+            continue
+
+        # partition() returns the non-matching items first
+        merged, main = partition(users, lambda e: e.slug == slug)
+        main, merged = list(main), list(merged)
+
+        # we want to determine exactly one winner, the rest are the losers
+        if main:
+            losers = main[1:] + merged
+        else:
+            # nobody uses the slug as main slug; the last episode wins
+            merged.pop()
+            losers = merged
+
+        # for every loser, the slug should be removed
+        yield slug, losers
diff --git a/mygpo/core/tests.py b/mygpo/core/tests.py
index 1caa36e4..76f855e5 100644
--- a/mygpo/core/tests.py
+++ b/mygpo/core/tests.py
@@ -21,7 +21,8 @@ import doctest
from django.test import TestCase
import mygpo.utils
-from mygpo.core.models import Podcast, PodcastGroup
+from mygpo.core.slugs import get_duplicate_slugs
+from mygpo.core.models import Podcast, PodcastGroup, Episode
class PodcastGroupTests(unittest.TestCase):
@@ -62,8 +63,32 @@ class PodcastGroupTests(unittest.TestCase):
+class UnifySlugTests(unittest.TestCase):
+    """ Tests for get_duplicate_slugs """
+
+    def test_unify(self):
+        """ One main slug that is also the merged slug of two other episodes
+
+        The episode using the slug as main slug wins; both episodes that
+        only have it as merged slug are reported as duplicates. """
+
+        a = Episode(_id='a', slug='1')
+        b = Episode(_id='b', merged_slugs=['1'])
+        c = Episode(_id='c', merged_slugs=['1'])
+
+        dups_list = list(get_duplicate_slugs([a, b, c]))
+
+        # only one duplicate slug is reported
+        self.assertEqual(len(dups_list), 1)
+
+        slug, dups = dups_list[0]
+
+        self.assertEqual(slug, '1')
+        # comparing the lists covers both their length and their order
+        self.assertEqual(dups, [b, c])
+
+
+
+
def suite():
suite = unittest.TestSuite()
suite.addTest(doctest.DocTestSuite(mygpo.utils))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(PodcastGroupTests))
+ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(UnifySlugTests))
return suite
diff --git a/mygpo/utils.py b/mygpo/utils.py
index d0605d65..47ee4ad3 100644
--- a/mygpo/utils.py
+++ b/mygpo/utils.py
@@ -24,6 +24,7 @@ import operator
import sys
import re
import collections
+import itertools
from datetime import datetime, timedelta, date
import time
import hashlib
@@ -994,3 +995,9 @@ def normalize_feed_url(url):
# urlunsplit might return "a slighty different, but equivalent URL"
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+
+
+def partition(items, predicate=bool):
+    """ Splits items into those that fail and those that pass the predicate
+
+    Returns a pair of generators (non_matching, matching). The predicate is
+    evaluated once per item; both generators are lazy and share the
+    underlying iteration through itertools.tee. """
+    tagged = ((predicate(item), item) for item in items)
+    failing_src, passing_src = itertools.tee(tagged)
+    non_matching = (item for matched, item in failing_src if not matched)
+    matching = (item for matched, item in passing_src if matched)
+    return non_matching, matching