xpcom/analysis/deki.py

   1 """ deki.py - Access the wiki pages on a MindTouch Deki server via the API.
   2
   3 Here's what this code can do:
   4
   5   wiki = deki.Deki("http://developer.mozilla.org/@api/deki/", username, password)
   6   page = wiki.get_page("Sheep")
   7   print page.title
   8   print page.doc.toxml()
   9
  10   page.title = "Bananas"
  11   page.save()
  12
  13 There are also some additional methods:
  14   wiki.create_page(path, content, title=, overwrite=)
  15   wiki.move_page(old, new)
  16   wiki.get_subpages(page)
  17
  18 This module does not try to mimic the MindTouch "Plug" API.  It's meant to be
  19 higher-level than that.
  20 """
  21
  22 import sys
  23 import urllib2, cookielib, httplib
  24 import xml.dom.minidom as dom
  25 from urllib import quote as _urllib_quote
  26 from urllib import urlencode as _urlencode
  27 import urlparse
  28 from datetime import datetime
  29 import re
  30
  31 __all__ = ['Deki']
  32
  33
  34 # === Utils
  35
  36 def _check(fact):
  37     if not fact:
  38         raise AssertionError('check failed')
  39
  40 def _urlquote(s, *args):
  41     return _urllib_quote(s.encode('utf-8'), *args)
  42
  43 def _make_url(*dirs, **params):
  44     """ dirs must already be url-encoded, params must not """
  45     url = '/'.join(dirs)
  46     if params:
  47         url += '?' + _urlencode(params)
  48     return url
  49
  50 class PutRequest(urllib2.Request):
  51     def get_method(self):
  52         return "PUT"
  53
  54 # === Dream framework client code
  55
  56 # This handler causes python to "always be logged in" when it's talking to the
  57 # server.  If you're just accessing public pages, it generates more requests
  58 # than are strictly needed, but this is the behavior you want for a bot.
  59 #
  60 # The users/authenticate request is sent twice: once without any basic auth and
  61 # once with.  Dumb.  Feel free to fix.
  62 #
  63 class _LoginHandler(urllib2.HTTPCookieProcessor):
  64     def __init__(self, server):
  65         policy = cookielib.DefaultCookiePolicy(rfc2965=True)
  66         cookiejar = cookielib.CookieJar(policy)
  67         urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
  68         self.server = server
  69
  70     def http_request(self, req):
  71         #print "DEBUG- Requesting " + req.get_full_url()
  72         s = self.server
  73         req = urllib2.HTTPCookieProcessor.http_request(self, req)
  74         if ('Cookie' not in req.unredirected_hdrs
  75               and req.get_full_url() != s.base + 'users/authenticate'):
  76             s.login()
  77             # Retry - should have a new cookie.
  78             req = urllib2.HTTPCookieProcessor.http_request(self, req)
  79             _check('Cookie' in req.unredirected_hdrs)
  80         return req
  81
  82 class DreamClient:
  83     def __init__(self, base, user, password):
  84         """
  85         base - The base URI of the Deki API, with trailing slash.
  86                Typically, 'http://wiki.example.org/@api/deki/'.
  87         user, password - Your Deki login information.
  88         """
  89         self.base = base
  90         pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
  91         pm.add_password(None, self.base, user, password)
  92         ah = urllib2.HTTPBasicAuthHandler(pm)
  93         lh = _LoginHandler(self)
  94         self._opener = urllib2.build_opener(ah, lh)
  95
  96     def login(self):
  97         response = self._opener.open(self.base + 'users/authenticate')
  98         response.close()
  99
 100     def open(self, url):
 101         return self._opener.open(self.base + url)
 102
 103     def _handleResponse(self, req):
 104         """Helper method shared between post() and put()"""
 105         resp = self._opener.open(req)
 106         try:
 107             ct = resp.headers.get('Content-Type', '(none)')
 108             if '/xml' in ct or '+xml' in ct:
 109                 return dom.parse(resp)
 110             else:
 111                 #print "DEBUG- Content-Type:", ct
 112                 crud = resp.read()
 113                 #print 'DEBUG- crud:\n---\n%s\n---' % re.sub(r'(?m)^', '    ', crud)
 114                 return None
 115         finally:
 116             resp.close()
 117
 118
 119     def post(self, url, data, type):
 120         #print "DEBUG- posting to:", self.base + url
 121         req = urllib2.Request(self.base + url, data, {'Content-Type': type})
 122         return self._handleResponse(req)
 123
 124     def put(self, url, data, type):
 125         #print "DEBUG- putting to:", self.base + url
 126         req = PutRequest(self.base + url, data, {'Content-Type': type})
 127         return self._handleResponse(req)
 128
 129     def get_xml(self, url):
 130         resp = self.open(url)
 131         try:
 132             return dom.parse(resp)
 133         finally:
 134             resp.close()
 135
 136
 137 # === DOM
 138
 139 def _text_of(node):
 140     if node.nodeType == node.ELEMENT_NODE:
 141         return u''.join(_text_of(n) for n in node.childNodes)
 142     elif node.nodeType == node.TEXT_NODE:
 143         return node.nodeValue
 144     else:
 145         return u''
 146
 147 def _the_element_by_name(doc, tagName):
 148     elts = doc.getElementsByTagName(tagName)
 149     if len(elts) != 1:
 150         raise ValueError("Expected exactly one <%s> tag, got %d." % (tagName, len(elts)))
 151     return elts[0]
 152
 153 def _first_element(node):
 154     n = node.firstChild
 155     while n is not None:
 156         if n.nodeType == n.ELEMENT_NODE:
 157             return n
 158         n = node.nextSibling
 159     return None
 160
 161 def _find_elements(node, path):
 162     if u'/' in path:
 163         [first, rest] = path.split(u'/', 1)
 164         for child in _find_elements(node, first):
 165             for desc in _find_elements(child, rest):
 166                 yield desc
 167     else:
 168         for n in node.childNodes:
 169             if n.nodeType == node.ELEMENT_NODE and n.nodeName == path:
 170                 yield n
 171
 172
 173 # === Deki
 174
 175 def _format_page_id(id):
 176     if isinstance(id, int):
 177         return str(id)
 178     elif id is Deki.HOME:
 179         return 'home'
 180     elif isinstance(id, basestring):
 181         # Double-encoded, per the Deki API reference.
 182         return '=' + _urlquote(_urlquote(id, ''))
 183
 184 class Deki(DreamClient):
 185     HOME = object()
 186
 187     def get_page(self, page_id):
 188         """ Get the content of a page from the wiki.
 189
 190         The page_id argument must be one of:
 191           an int - The page id (an arbitrary number assigned by Deki)
 192           a str - The page name (not the title, the full path that shows up in the URL)
 193           Deki.HOME - Refers to the main page of the wiki.
 194
 195         Returns a Page object.
 196         """
 197         p = Page(self)
 198         p._load(page_id)
 199         return p
 200
 201     def create_page(self, path, content, title=None, overwrite=False):
 202         """ Create a new wiki page.
 203
 204         Parameters:
 205           path - str - The page id.
 206           content - str - The XML content to put in the new page.
 207             The document element must be a <body>.
 208           title - str - The page title.  Keyword argument only.
 209             Defaults to the last path-segment of path.
 210           overwrite - bool - Whether to overwrite an existing page. If false,
 211             and the page already exists, the method will throw an error.
 212         """
 213         if title is None:
 214             title = path.split('/')[-1]
 215         doc = dom.parseString(content)
 216         _check(doc.documentElement.tagName == 'body')
 217         p = Page(self)
 218         p._create(path, title, doc, overwrite)
 219
 220     def attach_file(self, page, name, data, mimetype, description=None):
 221         """Create or update a file attachment.
 222
 223         Parameters:
 224           page - str - the page ID this file is related to
 225           name - str - the name of the file
 226           data - str - the file data
 227           mimetype - str - the MIME type of the file
 228           description - str - a description of the file
 229         """
 230
 231         p = {}
 232         if description is not None:
 233             p['description'] = description
 234
 235         url = _make_url('pages', _format_page_id(page),
 236                         'files', _format_page_id(name), **p)
 237
 238         r = self.put(url, data, mimetype)
 239         _check(r.documentElement.nodeName == u'file')
 240
 241     def get_subpages(self, page_id):
 242         """ Return the ids of all subpages of the given page. """
 243         doc = self.get_xml(_make_url("pages", _format_page_id(page_id),
 244                                      "files,subpages"))
 245         for elt in _find_elements(doc, u'page/subpages/page.subpage/path'):
 246             yield _text_of(elt)
 247
 248     def move_page(self, page_id, new_title, redirects=True):
 249         """ Move an existing page to a new location.
 250
 251         A page cannot be moved to a destination that already exists, is a
 252         descendant, or has a protected title (ex.  Special:xxx, User:,
 253         Template:).
 254
 255         When a page is moved, subpages under the specified page are also moved.
 256         For each moved page, the system automatically creates an alias page
 257         that redirects from the old to the new destination.
 258         """
 259         self.post(_make_url("pages", _format_page_id(page_id), "move",
 260                             to=new_title,
 261                             redirects=redirects and "1" or "0"),
 262                   "", "text/plain")
 263
 264 class Page:
 265     """ A Deki wiki page.
 266
 267     To obtain a page, call wiki.get_page(id).
 268     Attributes:
 269         title : unicode - The page title.
 270         doc : Document - The content of the page as a DOM Document.
 271           The root element of this document is a <body>.
 272         path : unicode - The path.  Use this to detect redirects, as otherwise
 273           page.save() will overwrite the redirect with a copy of the content!
 274         deki : Deki - The Deki object from which the page was loaded.
 275         page_id : str/id/Deki.HOME - The page id used to load the page.
 276         load_time : datetime - The time the page was loaded,
 277           according to the clock on the client machine.
 278     Methods:
 279         save() - Save the modified document back to the server.
 280           Only the page.title and the contents of page.doc are saved.
 281     """
 282
 283     def __init__(self, deki):
 284         self.deki = deki
 285
 286     def _create(self, path, title, doc, overwrite):
 287         self.title = title
 288         self.doc = doc
 289         self.page_id = path
 290         if overwrite:
 291             self.load_time = datetime(2500, 1, 1)
 292         else:
 293             self.load_time = datetime(1900, 1, 1)
 294         self.path = path
 295         self.save()
 296
 297     def _load(self, page_id):
 298         """ page_id - See comment near the definition of `HOME`. """
 299         load_time = datetime.utcnow()
 300
 301         # Getting the title is a whole separate query!
 302         url = 'pages/%s/info' % _format_page_id(page_id)
 303         doc = self.deki.get_xml(url)
 304         title = _text_of(_the_element_by_name(doc, 'title'))
 305         path = _text_of(_the_element_by_name(doc, 'path'))
 306
 307         # If you prefer to sling regexes, you can request format=raw instead.
 308         # The result is an XML document with one big fat text node in the body.
 309         url = _make_url('pages', _format_page_id(page_id), 'contents',
 310                         format='xhtml', mode='edit')
 311         doc = self.deki.get_xml(url)
 312
 313         content = doc.documentElement
 314         _check(content.tagName == u'content')
 315         body = _first_element(content)
 316         _check(body is not None)
 317         _check(body.tagName == u'body')
 318
 319         doc.removeChild(content)
 320         doc.appendChild(body)
 321
 322         self.page_id = page_id
 323         self.load_time = load_time
 324         self.title = title
 325         self.path = path
 326         self.doc = doc
 327
 328     def save(self):
 329         p = {'edittime': _urlquote(self.load_time.strftime('%Y%m%d%H%M%S')),
 330              'abort': 'modified'}
 331
 332         if self.title is not None:
 333             p['title'] = _urlquote(self.title)
 334
 335         url = _make_url('pages', _format_page_id(self.page_id), 'contents', **p)
 336
 337         body = self.doc.documentElement
 338         bodyInnerXML = ''.join(n.toxml('utf-8') for n in body.childNodes)
 339
 340         reply = self.deki.post(url, bodyInnerXML, 'text/plain; charset=utf-8')
 341         _check(reply.documentElement.nodeName == u'edit')
 342         _check(reply.documentElement.getAttribute(u'status') == u'success')