add admin task for unifying the slugs of episodes
authorStefan Kögl <stefan@skoegl.net>
Thu, 18 Jul 2013 16:29:21 +0000 (18 18:29 +0200)
committerStefan Kögl <stefan@skoegl.net>
Thu, 18 Jul 2013 16:29:21 +0000 (18 18:29 +0200)
mygpo/admin/tasks.py
mygpo/admin/templates/admin/overview.html
mygpo/admin/templates/admin/task-status.html [moved from mygpo/admin/templates/admin/merge-status.html with 81% similarity]
mygpo/admin/templates/admin/unify-slugs-select.html [new file with mode: 0644]
mygpo/admin/urls.py
mygpo/admin/views.py
mygpo/core/slugs.py
mygpo/core/tests.py
mygpo/utils.py

index 0e99876..e84f4c9 100644 (file)
@@ -1,8 +1,10 @@
 from collections import Counter
 
 from mygpo.cel import celery
+from mygpo.core.slugs import get_duplicate_slugs, EpisodeSlug
 from mygpo.maintenance.merge import PodcastMerger
 from mygpo.db.couchdb.podcast import podcasts_by_id
+from mygpo.db.couchdb.episode import episodes_for_podcast_uncached
 
 from celery.utils.log import get_task_logger
 logger = get_task_logger(__name__)
@@ -26,3 +28,49 @@ def merge_podcasts(podcast_ids, num_groups):
     logger.info('merging result: %s', actions)
 
     return actions, podcast
+
+
+@celery.task
+def unify_slugs(podcast):
+    """ Removes duplicate slugs of a podcast's episodes
+
+    Returns (actions, podcast); actions counts the changes that were made """
+    logger.warning('unifying slugs for podcast %s', podcast)
+    episodes = episodes_for_podcast_uncached(podcast)
+    logger.info('found %d episodes', len(episodes))
+
+    common_title = podcast.get_common_episode_title()
+    actions = Counter()
+
+    # get episodes with duplicate slugs
+    for slug, dups in get_duplicate_slugs(episodes):
+        actions['dup-slugs'] += 1
+        # and remove their slugs
+        logger.info('Found %d duplicates for slug %s', len(dups), slug)
+        for dup in dups:
+            actions['dup-episodes'] += 1
+
+            # check if we're removing the "main" slug
+            if dup.slug == slug:
+                # if possible, replace it with a "merged" slug
+                if dup.merged_slugs:
+                    dup.slug = dup.merged_slugs.pop()
+                    actions['replaced-with-merged'] += 1
+                    logger.info('Replacing slug with merged slug %s', dup.slug)
+
+                # try to find a new slug
+                else:
+                    dup.slug = EpisodeSlug(dup, common_title,
+                        override_existing=True).get_slug()
+                    actions['replaced-with-new'] += 1
+                    logger.info('Replacing slug with new slug %s', dup.slug)
+
+            # if the problematic slug is a merged one, remove it
+            if slug in dup.merged_slugs:
+                actions['removed-merged'] += 1
+                logger.info('Removing merged slug %s', slug)
+                dup.merged_slugs.remove(slug)
+
+            dup.save()
+
+    return actions, podcast
index 0ee08c6..2574257 100644 (file)
@@ -16,6 +16,7 @@
  <ul class="icons">
   <li><i class="icon-stethoscope"></i> <a href="{% url "admin-hostinfo" %}">{% trans "Host Information" %}</a></li>
   <li><i class="icon-resize-small"></i> <a href="{% url "admin-merge" %}?podcasts=2">{% trans "Merge Podcasts and Episodes" %}</a></li>
+  <li><i class="icon-resize-small"></i> <a href="{% url "admin-unify-slugs-select" %}">{% trans "Unify Duplicate Slugs" %}</a></li>
   <li><i class="icon-bar-chart"></i> <a href="{% url "clients" %}">{% trans "Client Stats" %}</a> (<a href="{% url "clients-json" %}">{% trans "JSON" %}</a>)</li>
   <li><i class="icon-bar-chart"></i> <a href="{% url "useragents" %}">{% trans "User-Agent Stats" %}</a></li>
   <li><i class="icon-bar-chart"></i> <a href="{% url "admin-filetypes" %}">{% trans "File Type Stats" %}</a></li>
similarity index 81%
rename from mygpo/admin/templates/admin/merge-status.html
rename to mygpo/admin/templates/admin/task-status.html
index 74f025d..2d2c6ec 100644 (file)
 
 {% block title %}
  {% if ready %}
-  {% trans "Merge Finished" %}
+  {% trans "Operation Finished" %}
  {% else %}
-  {% trans "Merge Ongoing" %}
+  {% trans "Operation Ongoing" %}
  {% endif %}
 {% endblock %}
 
 {% block header %}
  <h1>
   {% if ready %}
-   {% trans "Merge Finished" %}
+   {% trans "Operation Finished" %}
   {% else %}
-   {% trans "Merge Ongoing" %}
+   {% trans "Operation Ongoing" %}
   {% endif %}
  </h1>
 {% endblock %}
@@ -36,6 +36,8 @@
    <ul>
     {% for action, count in actions %}
      <li>{{ action }}: {{ count }}</li>
+    {% empty %}
+     <li><em>{% trans "none" %}</em></li>
     {% endfor %}
    </ul>
   </p>
diff --git a/mygpo/admin/templates/admin/unify-slugs-select.html b/mygpo/admin/templates/admin/unify-slugs-select.html
new file mode 100644 (file)
index 0000000..675c387
--- /dev/null
@@ -0,0 +1,30 @@
+{% extends "base.html" %}
+{% load i18n %}
+{% load podcasts %}
+
+{% load menu %}
+{% block mainmenu %}{{ "/admin/"|main_menu }}{% endblock %}
+{% block sectionmenu %}{{ "/admin/"|section_menu:"Admin" }}{% endblock %}
+{# form for entering the feed URL of the podcast whose slugs are unified #}
+{% block title %}{% trans "Admin Area" %}{% endblock %}
+
+{% block header %}
+ <h1>{% trans "Unify Duplicate Slugs" %}</h1>
+{% endblock %}
+
+{% block content %}
+ <form method="post" action="{% url "admin-unify-slugs" %}">
+  {% csrf_token %}
+  <table>
+   <tr>
+    <th>Feed URL</th>
+    <td><input type="text" name="feed" value="" /></td>
+   </tr>
+   <tr>
+    <td><input type="submit" value="OK" /></td>
+   </tr>
+  </table>
+ </form>
+
+{% endblock %}
+
index 13e7da9..11a232e 100644 (file)
@@ -3,7 +3,8 @@ from django.conf.urls import *
 from mygpo.admin.views import Overview, MergeSelect, MergeVerify, \
          MergeProcess, MergeStatus, ClientStatsView, ClientStatsJsonView, \
          UserAgentStatsView, StatsView, StatsJsonView, HostInfo, \
-         FiletypeStatsView, ActivateUserView
+         FiletypeStatsView, ActivateUserView, UnifyDuplicateSlugsSelect, \
+         UnifyDuplicateSlugs, UnifySlugsStatus
 
 urlpatterns = patterns('mygpo.admin.views',
  url(r'^$',              Overview.as_view(),     name='admin-overview'),
@@ -30,4 +31,16 @@ urlpatterns = patterns('mygpo.admin.views',
  url(r'^activate-user/$',
      ActivateUserView.as_view(),
      name='admin-activate-user'),
+
+ url(r'^unify-slugs/select$',
+     UnifyDuplicateSlugsSelect.as_view(),
+     name='admin-unify-slugs-select'),
+
+ url(r'^unify-slugs/$',
+     UnifyDuplicateSlugs.as_view(),
+     name='admin-unify-slugs'),
+
+ url(r'^unify-slugs/status/(?P<task_id>[^/]+)$',
+     UnifySlugsStatus.as_view(),
+     name='admin-unify-slugs-status'),
 )
index e4af3ab..083b582 100644 (file)
@@ -19,7 +19,7 @@ from mygpo.admin.group import PodcastGrouper
 from mygpo.maintenance.merge import PodcastMerger, IncorrectMergeException
 from mygpo.users.models import User
 from mygpo.admin.clients import UserAgentStats, ClientStats
-from mygpo.admin.tasks import merge_podcasts
+from mygpo.admin.tasks import merge_podcasts, unify_slugs
 from mygpo.utils import get_git_head
 from mygpo.api.httpresponse import JsonResponse
 from mygpo.cel import celery
@@ -180,7 +180,7 @@ class MergeProcess(MergeBase):
 class MergeStatus(AdminView):
     """ Displays the status of the merge operation """
 
-    template_name = 'admin/merge-status.html'
+    template_name = 'admin/task-status.html'
 
     def get(self, request, task_id):
         result = merge_podcasts.AsyncResult(task_id)
@@ -201,7 +201,7 @@ class MergeStatus(AdminView):
             messages.error(request, str(ime))
             return HttpResponseRedirect(reverse('admin-merge'))
 
-        return render(request, 'admin/merge-status.html', {
+        return self.render_to_response({
                 'ready': True,
                 'actions': actions.items(),
                 'podcast': podcast,
@@ -336,3 +336,52 @@ class ActivateUserView(AdminView):
                          _('User {username} ({email}) activated'.format(
                             username=user.username, email=user.email)))
         return HttpResponseRedirect(reverse('admin-activate-user'))
+
+
+
+class UnifyDuplicateSlugsSelect(AdminView):
+    """ Shows the form for selecting a podcast for which to unify slugs """
+    template_name = 'admin/unify-slugs-select.html'
+
+
+class UnifyDuplicateSlugs(AdminView):
+    """ Starts the slug-unification task for a podcast given by feed URL """
+
+    def post(self, request):
+        podcast_url = request.POST.get('feed')
+        podcast = podcast_for_url(podcast_url)
+        if not podcast:
+            # interpolate after the lookup, otherwise no translation matches
+            messages.error(request,
+                _('Podcast with URL "%s" does not exist') % (podcast_url,))
+            return HttpResponseRedirect(reverse('admin-unify-slugs-select'))
+
+        res = unify_slugs.delay(podcast)
+        return HttpResponseRedirect(reverse('admin-unify-slugs-status',
+                    args=[res.task_id]))
+
+
+class UnifySlugsStatus(AdminView):
+    """ Displays the status (and finally the result) of the unify-slugs task """
+
+    template_name = 'admin/task-status.html'
+
+    def get(self, request, task_id):
+        result = unify_slugs.AsyncResult(task_id)
+
+        if not result.ready():
+            return self.render_to_response({
+                'ready': False,
+            })
+
+        # clear cache to make the unified slugs visible
+        # TODO: what to do with multiple frontends?
+        cache.clear()
+
+        actions, podcast = result.get()
+
+        return self.render_to_response({
+            'ready': True,
+            'actions': actions.items(),
+            'podcast': podcast,
+        })
index 915f857..0d3d905 100644 (file)
@@ -1,3 +1,5 @@
+from collections import defaultdict
+
 from itertools import count
 
 from couchdbkit.ext.django.schema import *
@@ -5,6 +7,7 @@ from couchdbkit.ext.django.schema import *
 from django.utils.text import slugify
 
 from mygpo.decorators import repeat_on_conflict
+from mygpo.utils import partition
 
 
 def assign_slug(obj, generator):
@@ -38,8 +41,8 @@ class SlugGenerator(object):
     """ Generates a unique slug for an object """
 
 
-    def __init__(self, obj):
-        if obj.slug:
+    def __init__(self, obj, override_existing=False):
+        if obj.slug and not override_existing:
             raise ValueError('%(obj)s already has slug %(slug)s' % \
                 dict(obj=obj, slug=obj.slug))
 
@@ -109,9 +112,9 @@ class PodcastSlug(PodcastGroupSlug):
 class EpisodeSlug(SlugGenerator):
     """ Generates slugs for Episodes """
 
-    def __init__(self, episode, common_title):
+    def __init__(self, episode, common_title, override_existing=False):
         self.common_title = common_title
-        super(EpisodeSlug, self).__init__(episode)
+        super(EpisodeSlug, self).__init__(episode, override_existing)
         self.podcast_id = episode.podcast
 
 
@@ -247,3 +250,41 @@ class SlugMixin(DocumentSchema):
 
         # remove from merged slugs
         self.merged_slugs = list(set(self.merged_slugs) - set([slug]))
+
+
+def get_duplicate_slugs(episodes):
+    """ Finds duplicate slugs and yields (slug, duplicates) pairs for each slug
+
+    Such a pair is only yielded for slugs that are used by several episodes.
+    The "duplicates" list does not contain the selected "winner" of a set of
+    duplicates. """
+
+    # we build a dict of {slug: [episode1, episode2, ...], ...}
+    # for each slug all episodes are given that use this slug
+    slugs = defaultdict(list)
+
+    for episode in episodes:
+        # set(): an episode that lists a slug twice is no duplicate of itself
+        all_slugs = set(filter(None, [episode.slug] + episode.merged_slugs))
+        for slug in all_slugs:
+            slugs[slug].append(episode)
+
+    # filter out unique slugs
+    dups = {s: eps for (s, eps) in slugs.items() if len(eps) > 1}
+
+    for slug, users in dups.items():
+        # partition() returns the non-matching items first
+        merged, main = partition(users, lambda e: e.slug == slug)
+
+        main, merged = list(main), list(merged)
+
+        # exactly one episode wins and keeps the slug, the rest is in "merged"
+        if main:
+            # prefer a main user; further main users lose the slug, too
+            merged = main[1:] + merged
+        else:
+            # nobody uses it as main slug: the last merged user wins
+            merged.pop()
+
+        # for every loser, remove the slug
+        yield slug, merged
index 1caa36e..76f855e 100644 (file)
@@ -21,7 +21,8 @@ import doctest
 from django.test import TestCase
 
 import mygpo.utils
-from mygpo.core.models import Podcast, PodcastGroup
+from mygpo.core.slugs import get_duplicate_slugs
+from mygpo.core.models import Podcast, PodcastGroup, Episode
 
 
 class PodcastGroupTests(unittest.TestCase):
@@ -62,8 +63,32 @@ class PodcastGroupTests(unittest.TestCase):
 
 
 
+class UnifySlugTests(unittest.TestCase):
+    """ Tests the detection of duplicate episode slugs """
+
+    def test_unify(self):
+        """ The episode using a slug as main slug wins over merged users """
+        # a uses '1' as its main slug, b and c only carry it as merged slug
+        a = Episode(_id='a', slug='1')
+        b = Episode(_id='b', merged_slugs=['1'])
+        c = Episode(_id='c', merged_slugs=['1'])
+
+        dups_list = list(get_duplicate_slugs([a, b, c]))
+
+        # only one duplicate slug is reported
+        self.assertEqual(len(dups_list), 1)
+
+        slug, dups = dups_list[0]
+
+        self.assertEqual(slug, '1')
+
+        # the winner a is not reported; the losers keep their input order
+        self.assertEqual(dups, [b, c])
+
+
 def suite():
     suite = unittest.TestSuite()
     suite.addTest(doctest.DocTestSuite(mygpo.utils))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(PodcastGroupTests))
+    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(UnifySlugTests))
     return suite
index d0605d6..47ee4ad 100644 (file)
@@ -24,6 +24,7 @@ import operator
 import sys
 import re
 import collections
+import itertools
 from datetime import datetime, timedelta, date
 import time
 import hashlib
@@ -994,3 +995,9 @@ def normalize_feed_url(url):
 
     # urlunsplit might return "a slighty different, but equivalent URL"
     return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+
+
+def partition(items, predicate=bool):
+    """ Lazily splits items into (false, true) iterators by predicate """
+    no, yes = itertools.tee((predicate(item), item) for item in items)
+    return (x for ok, x in no if not ok), (x for ok, x in yes if ok)