mutagen/ogg.py

   1 # Copyright 2006 Joe Wreschnig <piman@sacredchao.net>
   2 #
   3 # This program is free software; you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License version 2 as
   5 # published by the Free Software Foundation.
   6 #
   7 # $Id: ogg.py 3975 2007-01-13 21:51:17Z piman $
   8
   9 """Read and write Ogg bitstreams and pages.
  10
  11 This module reads and writes a subset of the Ogg bitstream format
  12 version 0. It does *not* read or write Ogg Vorbis files! For that,
  13 you should use mutagen.oggvorbis.
  14
  15 This implementation is based on the RFC 3533 standard found at
  16 http://www.xiph.org/ogg/doc/rfc3533.txt.
  17 """
  18
  19 import struct
  20 import sys
  21 import zlib
  22
  23 from cStringIO import StringIO
  24
  25 from mutagen import FileType
  26 from mutagen._util import cdata, insert_bytes, delete_bytes
  27
  28 class error(IOError):
  29     """Ogg stream parsing errors."""
  30     pass
  31
  32 class OggPage(object):
  33     """A single Ogg page (not necessarily a single encoded packet).
  34
  35     A page is a header of 26 bytes, followed by the length of the
  36     data, followed by the data.
  37
   38     The constructor is given a file-like object pointing to the start
  39     of an Ogg page. After the constructor is finished it is pointing
  40     to the start of the next page.
  41
  42     Attributes:
  43     version -- stream structure version (currently always 0)
  44     position -- absolute stream position (default -1)
  45     serial -- logical stream serial number (default 0)
  46     sequence -- page sequence number within logical stream (default 0)
  47     offset -- offset this page was read from (default None)
  48     complete -- if the last packet on this page is complete (default True)
  49     packets -- list of raw packet data (default [])
  50
  51     Note that if 'complete' is false, the next page's 'continued'
  52     property must be true (so set both when constructing pages).
  53
  54     If a file-like object is supplied to the constructor, the above
  55     attributes will be filled in based on it.
  56     """
  57
  58     version = 0
  59     __type_flags = 0
  60     position = 0L
  61     serial = 0
  62     sequence = 0
  63     offset = None
  64     complete = True
  65
  66     def __init__(self, fileobj=None):
  67         self.packets = []
  68
  69         if fileobj is None:
  70             return
  71
  72         self.offset = fileobj.tell()
  73
  74         header = fileobj.read(27)
  75         if len(header) == 0:
  76             raise EOFError
  77
  78         try:
  79             (oggs, self.version, self.__type_flags, self.position,
  80              self.serial, self.sequence, crc, segments) = struct.unpack(
  81                 "<4sBBqIIiB", header)
  82         except struct.error:
  83             raise error("unable to read full header; got %r" % header)
  84
  85         if oggs != "OggS":
  86             raise error("read %r, expected %r, at 0x%x" % (
  87                 oggs, "OggS", fileobj.tell() - 27))
  88
  89         if self.version != 0:
  90             raise error("version %r unsupported" % self.version)
  91
  92         total = 0
  93         lacings = []
  94         lacing_bytes = fileobj.read(segments)
  95         if len(lacing_bytes) != segments:
  96             raise error("unable to read %r lacing bytes" % segments)
  97         for c in map(ord, lacing_bytes):
  98             total += c
  99             if c < 255:
 100                 lacings.append(total)
 101                 total = 0
 102         if total:
 103             lacings.append(total)
 104             self.complete = False
 105
 106         self.packets = map(fileobj.read, lacings)
 107         if map(len, self.packets) != lacings:
 108             raise error("unable to read full data")
 109
 110     def __eq__(self, other):
 111         """Two Ogg pages are the same if they write the same data."""
 112         try:
 113             return (self.write() == other.write())
 114         except AttributeError:
 115             return False
 116
 117     def __repr__(self):
 118         attrs = ['version', 'position', 'serial', 'sequence', 'offset',
 119                  'complete', 'continued', 'first', 'last']
 120         values = ["%s=%r" % (attr, getattr(self, attr)) for attr in attrs]
 121         return "<%s %s, %d bytes in %d packets>" % (
 122             type(self).__name__, " ".join(values), sum(map(len, self.packets)),
 123             len(self.packets))
 124
 125     def write(self):
 126         """Return a string encoding of the page header and data.
 127
 128         A ValueError is raised if the data is too big to fit in a
 129         single page.
 130         """
 131
 132         data = [
 133             struct.pack("<4sBBqIIi", "OggS", self.version, self.__type_flags,
 134                         self.position, self.serial, self.sequence, 0)
 135             ]
 136
 137         lacing_data = []
 138         for datum in self.packets:
 139             quot, rem = divmod(len(datum), 255)
 140             lacing_data.append("\xff" * quot + chr(rem))
 141         lacing_data = "".join(lacing_data)
 142         if not self.complete and lacing_data.endswith("\x00"):
 143             lacing_data = lacing_data[:-1]
 144         data.append(chr(len(lacing_data)))
 145         data.append(lacing_data)
 146         data.extend(self.packets)
 147         data = "".join(data)
 148
 149         # Python's CRC is swapped relative to Ogg's needs.
 150         crc = ~zlib.crc32(data.translate(cdata.bitswap), -1)
 151         # Although we're using to_int_be, this actually makes the CRC
 152         # a proper le integer, since Python's CRC is byteswapped.
 153         crc = cdata.to_int_be(crc).translate(cdata.bitswap)
 154         data = data[:22] + crc + data[26:]
 155         return data
 156
 157     def __size(self):
 158         size = 27 # Initial header size
 159         for datum in self.packets:
 160             quot, rem = divmod(len(datum), 255)
 161             size += quot + 1
 162         if not self.complete and rem == 0:
 163             # Packet contains a multiple of 255 bytes and is not
 164             # terminated, so we don't have a \x00 at the end.
 165             size -= 1
 166         size += sum(map(len, self.packets))
 167         return size
 168
 169     size = property(__size, doc="Total frame size.")
 170
 171     def __set_flag(self, bit, val):
 172         mask = 1 << bit
 173         if val: self.__type_flags |= mask
 174         else: self.__type_flags &= ~mask
 175
 176     continued = property(
 177         lambda self: cdata.test_bit(self.__type_flags, 0),
 178         lambda self, v: self.__set_flag(0, v),
 179         doc="The first packet is continued from the previous page.")
 180
 181     first = property(
 182         lambda self: cdata.test_bit(self.__type_flags, 1),
 183         lambda self, v: self.__set_flag(1, v),
 184         doc="This is the first page of a logical bitstream.")
 185
 186     last = property(
 187         lambda self: cdata.test_bit(self.__type_flags, 2),
 188         lambda self, v: self.__set_flag(2, v),
 189         doc="This is the last page of a logical bitstream.")
 190
 191     def renumber(klass, fileobj, serial, start):
 192         """Renumber pages belonging to a specified logical stream.
 193
 194         fileobj must be opened with mode r+b or w+b.
 195
 196         Starting at page number 'start', renumber all pages belonging
 197         to logical stream 'serial'. Other pages will be ignored.
 198
 199         fileobj must point to the start of a valid Ogg page; any
 200         occuring after it and part of the specified logical stream
 201         will be numbered. No adjustment will be made to the data in
 202         the pages nor the granule position; only the page number, and
 203         so also the CRC.
 204
 205         If an error occurs (e.g. non-Ogg data is found), fileobj will
 206         be left pointing to the place in the stream the error occured,
 207         but the invalid data will be left intact (since this function
 208         does not change the total file size).
 209         """
 210
 211         number = start
 212         while True:
 213             try: page = OggPage(fileobj)
 214             except EOFError:
 215                 break
 216             else:
 217                 if page.serial != serial:
 218                     # Wrong stream, skip this page.
 219                     continue
 220                 # Changing the number can't change the page size,
 221                 # so seeking back based on the current size is safe.
 222                 fileobj.seek(-page.size, 1)
 223             page.sequence = number
 224             fileobj.write(page.write())
 225             fileobj.seek(page.offset + page.size, 0)
 226             number += 1
 227     renumber = classmethod(renumber)
 228
 229     def to_packets(klass, pages, strict=False):
 230         """Construct a list of packet data from a list of Ogg pages.
 231
 232         If strict is true, the first page must start a new packet,
 233         and the last page must end the last packet.
 234         """
 235
 236         serial = pages[0].serial
 237         sequence = pages[0].sequence
 238         packets = []
 239
 240         if strict:
 241             if pages[0].continued:
 242                 raise ValueError("first packet is continued")
 243             if not pages[-1].complete:
 244                 raise ValueError("last packet does not complete")
 245         elif pages and pages[0].continued:
 246             packets.append("")
 247
 248         for page in pages:
 249             if serial != page.serial:
 250                 raise ValueError("invalid serial number in %r" % page)
 251             elif sequence != page.sequence:
 252                 raise ValueError("bad sequence number in %r" % page)
 253             else: sequence += 1
 254
 255             if page.continued: packets[-1] += page.packets[0]
 256             else: packets.append(page.packets[0])
 257             packets.extend(page.packets[1:])
 258
 259         return packets
 260     to_packets = classmethod(to_packets)
 261
 262     def from_packets(klass, packets, sequence=0,
 263                      default_size=4096, wiggle_room=2048):
 264         """Construct a list of Ogg pages from a list of packet data.
 265
 266         The algorithm will generate pages of approximately
 267         default_size in size (rounded down to the nearest multiple of
 268         255). However, it will also allow pages to increase to
 269         approximately default_size + wiggle_room if allowing the
 270         wiggle room would finish a packet (only one packet will be
 271         finished in this way per page; if the next packet would fit
 272         into the wiggle room, it still starts on a new page).
 273
 274         This method reduces packet fragmentation when packet sizes are
 275         slightly larger than the default page size, while still
 276         ensuring most pages are of the average size.
 277
 278         Pages are numbered started at 'sequence'; other information is
 279         uninitialized.
 280         """
 281
 282         chunk_size = (default_size // 255) * 255
 283
 284         pages = []
 285
 286         page = OggPage()
 287         page.sequence = sequence
 288
 289         for packet in packets:
 290             page.packets.append("")
 291             while packet:
 292                 data, packet = packet[:chunk_size], packet[chunk_size:]
 293                 if page.size < default_size and len(page.packets) < 255:
 294                     page.packets[-1] += data
 295                 else:
 296                     # If we've put any packet data into this page yet,
 297                     # we need to mark it incomplete. However, we can
 298                     # also have just started this packet on an already
 299                     # full page, in which case, just start the new
 300                     # page with this packet.
 301                     if page.packets[-1]:
 302                         page.complete = False
 303                         if len(page.packets) == 1:
 304                             page.position = -1L
 305                     else:
 306                         page.packets.pop(-1)
 307                     pages.append(page)
 308                     page = OggPage()
 309                     page.continued = not pages[-1].complete
 310                     page.sequence = pages[-1].sequence + 1
 311                     page.packets.append(data)
 312
 313                 if len(packet) < wiggle_room:
 314                     page.packets[-1] += packet
 315                     packet = ""
 316
 317         if page.packets:
 318             pages.append(page)
 319
 320         return pages
 321     from_packets = classmethod(from_packets)
 322
 323     def replace(klass, fileobj, old_pages, new_pages):
 324         """Replace old_pages with new_pages within fileobj.
 325
 326         old_pages must have come from reading fileobj originally.
 327         new_pages are assumed to have the 'same' data as old_pages,
 328         and so the serial and sequence numbers will be copied, as will
 329         the flags for the first and last pages.
 330
 331         fileobj will be resized and pages renumbered as necessary. As
 332         such, it must be opened r+b or w+b.
 333         """
 334
 335         # Number the new pages starting from the first old page.
 336         first = old_pages[0].sequence
 337         for page, seq in zip(new_pages, range(first, first + len(new_pages))):
 338             page.sequence = seq
 339             page.serial = old_pages[0].serial
 340
 341         new_pages[0].first = old_pages[0].first
 342         new_pages[0].last = old_pages[0].last
 343         new_pages[0].continued = old_pages[0].continued
 344
 345         new_pages[-1].first = old_pages[-1].first
 346         new_pages[-1].last = old_pages[-1].last
 347         new_pages[-1].complete = old_pages[-1].complete
 348         if not new_pages[-1].complete and len(new_pages[-1].packets) == 1:
 349             new_pages[-1].position = -1L
 350
 351         new_data = "".join(map(klass.write, new_pages))
 352
 353         # Make room in the file for the new data.
 354         delta = len(new_data)
 355         fileobj.seek(old_pages[0].offset, 0)
 356         insert_bytes(fileobj, delta, old_pages[0].offset)
 357         fileobj.seek(old_pages[0].offset, 0)
 358         fileobj.write(new_data)
 359         new_data_end = old_pages[0].offset + delta
 360
 361         # Go through the old pages and delete them. Since we shifted
 362         # the data down the file, we need to adjust their offsets. We
 363         # also need to go backwards, so we don't adjust the deltas of
 364         # the other pages.
 365         old_pages.reverse()
 366         for old_page in old_pages:
 367             adj_offset = old_page.offset + delta
 368             delete_bytes(fileobj, old_page.size, adj_offset)
 369
 370         # Finally, if there's any discrepency in length, we need to
 371         # renumber the pages for the logical stream.
 372         if len(old_pages) != len(new_pages):
 373             fileobj.seek(new_data_end, 0)
 374             serial = new_pages[-1].serial
 375             sequence = new_pages[-1].sequence + 1
 376             klass.renumber(fileobj, serial, sequence)
 377     replace = classmethod(replace)
 378
 379     def find_last(klass, fileobj, serial):
 380         """Find the last page of the stream 'serial'.
 381
 382         If the file is not multiplexed this function is fast. If it is,
 383         it must read the whole the stream.
 384
 385         This finds the last page in the actual file object, or the last
 386         page in the stream (with eos set), whichever comes first.
 387         """
 388
 389         # For non-muxed streams, look at the last page.
 390         try: fileobj.seek(-256*256, 2)
 391         except IOError:
 392             # The file is less than 64k in length.
 393             fileobj.seek(0)
 394         data = fileobj.read()
 395         try: index = data.rindex("OggS")
 396         except ValueError:
 397             raise error("unable to find final Ogg header")
 398         stringobj = StringIO(data[index:])
 399         best_page = None
 400         try:
 401             page = OggPage(stringobj)
 402         except error:
 403             pass
 404         else:
 405             if page.serial == serial:
 406                 if page.last: return page
 407                 else: best_page = page
 408             else: best_page = None
 409
 410         # The stream is muxed, so use the slow way.
 411         fileobj.seek(0)
 412         try:
 413             page = OggPage(fileobj)
 414             while not page.last:
 415                 page = OggPage(fileobj)
 416                 while page.serial != serial:
 417                     page = OggPage(fileobj)
 418                 best_page = page
 419             return page
 420         except error:
 421             return best_page
 422         except EOFError:
 423             return best_page
 424     find_last = classmethod(find_last)
 425
 426 class OggFileType(FileType):
  427     """A generic Ogg file."""
 428
 429     _Info = None
 430     _Tags = None
 431     _Error = None
 432     _mimes = ["application/ogg", "application/x-ogg"]
 433
 434     def load(self, filename):
 435         """Load file information from a filename."""
 436
 437         self.filename = filename
 438         fileobj = file(filename, "rb")
 439         try:
 440             try:
 441                 self.info = self._Info(fileobj)
 442                 self.tags = self._Tags(fileobj, self.info)
 443
 444                 if self.info.length:
 445                     # The streaminfo gave us real length information,
 446                     # don't waste time scanning the Ogg.
 447                     return
 448
 449                 last_page = OggPage.find_last(fileobj, self.info.serial)
 450                 samples = last_page.position
 451                 try:
 452                     denom = self.info.sample_rate
 453                 except AttributeError:
 454                     denom = self.info.fps
 455                 self.info.length = samples / float(denom)
 456
 457             except error, e:
 458                 raise self._Error, e, sys.exc_info()[2]
 459             except EOFError:
 460                 raise self._Error, "no appropriate stream found"
 461         finally:
 462             fileobj.close()
 463
 464     def delete(self, filename=None):
 465         """Remove tags from a file.
 466
 467         If no filename is given, the one most recently loaded is used.
 468         """
 469         if filename is None:
 470             filename = self.filename
 471
 472         self.tags.clear()
 473         fileobj = file(filename, "rb+")
 474         try:
 475             try: self.tags._inject(fileobj)
 476             except error, e:
 477                 raise self._Error, e, sys.exc_info()[2]
 478             except EOFError:
 479                 raise self._Error, "no appropriate stream found"
 480         finally:
 481             fileobj.close()
 482
 483     def save(self, filename=None):
 484         """Save a tag to a file.
 485
 486         If no filename is given, the one most recently loaded is used.
 487         """
 488         if filename is None:
 489             filename = self.filename
 490         fileobj = file(filename, "rb+")
 491         try:
 492             try: self.tags._inject(fileobj)
 493             except error, e:
 494                 raise self._Error, e, sys.exc_info()[2]
 495             except EOFError:
 496                 raise self._Error, "no appropriate stream found"
 497         finally:
 498             fileobj.close()