TOOLS/matroska.py: support 8-byte floats in parsing mode
[mplayer/kovensky.git] / TOOLS / matroska.py
blob77268c8183c4c9b2159de4cfdc5a6c9383914a59
1 #!/usr/bin/python
2 """
3 Generate C definitions for parsing Matroska files.
4 Can also be used to directly parse Matroska files and display their contents.
5 """
8 # This file is part of MPlayer.
10 # MPlayer is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 2 of the License, or
13 # (at your option) any later version.
15 # MPlayer is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
20 # You should have received a copy of the GNU General Public License along
21 # with MPlayer; if not, write to the Free Software Foundation, Inc.,
22 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# Element tables.  Each string entry has the form 'Name, hex id, value type'.
# A trailing '*' on the name marks an element that may occur more than once.
# A tuple entry lists the child elements of the string entry just before it.
elements_ebml = (
    'EBML, 1a45dfa3, sub', (
        'EBMLVersion, 4286, uint',
        'EBMLReadVersion, 42f7, uint',
        'EBMLMaxIDLength, 42f2, uint',
        'EBMLMaxSizeLength, 42f3, uint',
        'DocType, 4282, str',
        'DocTypeVersion, 4287, uint',
        'DocTypeReadVersion, 4285, uint',
    ),

    'CRC32, bf, binary',
    'Void, ec, binary',
)
# Matroska element table, in the same format as the EBML table: each string
# is 'Name, hex id, value type' ('*' after the name = may occur repeatedly),
# and a tuple holds the children of the string entry that precedes it.
elements_matroska = (
    'Segment, 18538067, sub', (

        'SeekHead*, 114d9b74, sub', (
            'Seek*, 4dbb, sub', (
                'SeekID, 53ab, ebml_id',
                'SeekPosition, 53ac, uint',
            ),
        ),

        'Info*, 1549a966, sub', (
            'SegmentUID, 73a4, binary',
            'PrevUID, 3cb923, binary',
            'NextUID, 3eb923, binary',
            'TimecodeScale, 2ad7b1, uint',
            'DateUTC, 4461, sint',
            'Title, 7ba9, str',
            'MuxingApp, 4d80, str',
            'WritingApp, 5741, str',
            'Duration, 4489, float',
        ),

        'Cluster*, 1f43b675, sub', (
            'Timecode, e7, uint',
            'BlockGroup*, a0, sub', (
                'Block, a1, binary',
                'BlockDuration, 9b, uint',
                'ReferenceBlock*, fb, sint',
            ),
            'SimpleBlock*, a3, binary',
        ),

        'Tracks*, 1654ae6b, sub', (
            'TrackEntry*, ae, sub', (
                'TrackNumber, d7, uint',
                'TrackUID, 73c5, uint',
                'TrackType, 83, uint',
                'FlagEnabled, b9, uint',
                'FlagDefault, 88, uint',
                'FlagForced, 55aa, uint',
                'FlagLacing, 9c, uint',
                'MinCache, 6de7, uint',
                'DefaultDuration, 23e383, uint',
                'TrackTimecodeScale, 23314f, float',
                'MaxBlockAdditionID, 55ee, uint',
                'Name, 536e, str',
                'Language, 22b59c, str',
                'CodecID, 86, str',
                'CodecPrivate, 63a2, binary',
                'CodecDecodeAll, aa, uint',
                'Video, e0, sub', (
                    'FlagInterlaced, 9a, uint',
                    'PixelWidth, b0, uint',
                    'PixelHeight, ba, uint',
                    'DisplayWidth, 54b0, uint',
                    'DisplayHeight, 54ba, uint',
                    'FrameRate, 2383e3, float',
                ),
                'Audio, e1, sub', (
                    'SamplingFrequency, b5, float',
                    'Channels, 9f, uint',
                    'BitDepth, 6264, uint',
                ),
                'ContentEncodings, 6d80, sub', (
                    'ContentEncoding*, 6240, sub', (
                        'ContentEncodingOrder, 5031, uint',
                        'ContentEncodingScope, 5032, uint',
                        'ContentEncodingType, 5033, uint',
                        'ContentCompression, 5034, sub', (
                            'ContentCompAlgo, 4254, uint',
                            'ContentCompSettings, 4255, binary',
                        ),
                    ),
                ),
            ),
        ),

        'Cues, 1c53bb6b, sub', (
            'CuePoint*, bb, sub', (
                'CueTime, b3, uint',
                'CueTrackPositions*, b7, sub', (
                    'CueTrack, f7, uint',
                    'CueClusterPosition, f1, uint',
                ),
            ),
        ),

        'Attachments, 1941a469, sub', (
            'AttachedFile*, 61a7, sub', (
                'FileName, 466e, str',
                'FileMimeType, 4660, str',
                'FileData, 465c, binary',
                'FileUID, 46ae, uint',
            ),
        ),

        'Chapters, 1043a770, sub', (
            'EditionEntry*, 45b9, sub', (
                'EditionUID, 45bc, uint',
                'EditionFlagHidden, 45bd, uint',
                'EditionFlagDefault, 45db, uint',
                'EditionFlagOrdered, 45dd, uint',
                'ChapterAtom*, b6, sub', (
                    'ChapterUID, 73c4, uint',
                    'ChapterTimeStart, 91, uint',
                    'ChapterTimeEnd, 92, uint',
                    'ChapterFlagHidden, 98, uint',
                    'ChapterFlagEnabled, 4598, uint',
                    'ChapterSegmentUID, 6e67, binary',
                    'ChapterSegmentEditionUID, 6ebc, uint',
                    'ChapterDisplay*, 80, sub', (
                        'ChapString, 85, str',
                        'ChapLanguage*, 437c, str',
                    ),
                ),
            ),
        ),
        'Tags*, 1254c367, sub', (
            'Tag*, 7373, sub', (
                'Targets, 63c0, sub', (
                    'TargetTypeValue, 68ca, uint',
                ),
            ),
        ),
    ),
)
169 import sys
170 from math import ldexp
def byte2num(s):
    """Return the big-endian unsigned integer encoded by byte string *s*.

    An empty string yields 0, so zero-length integer elements decode to
    zero instead of raising ValueError.
    """
    # Imported here so the function does not depend on the file's import
    # block; hexlify() works on Python 2 str and Python 3 bytes alike,
    # unlike the Python-2-only s.encode('hex').
    from binascii import hexlify
    if not s:
        return 0
    return int(hexlify(s), 16)
def camelcase_to_words(name):
    """Convert a CamelCase name to lower-case words joined by underscores.

    A new word starts at every upper-case letter (other than the first
    character) that either follows a lower-case letter or is followed by
    one, so runs of capitals stay together: 'EBMLMaxIDLength' becomes
    'ebml_max_id_length'.
    """
    boundaries = [0]
    for pos in range(1, len(name)):
        if not name[pos].isupper():
            continue
        follows_lower = name[pos - 1].islower()
        # The slice is empty (hence not lower-case) at the end of the name.
        precedes_lower = name[pos + 1:pos + 2].islower()
        if follows_lower or precedes_lower:
            boundaries.append(pos)
    boundaries.append(len(name))
    words = [name[a:b] for a, b in zip(boundaries, boundaries[1:])]
    return '_'.join(words).lower()
class MatroskaElement(object):
    """Description of one element type from the element tables.

    Attributes set by the constructor:
      name         element name as given
      definename   '<namespace>_ID_<NAME>', printed as a #define name
      fieldname    lower-case, underscore-separated form of the name
      structname   'ebml_' + fieldname
      elid         element ID as given (a hex string in the tables)
      valtype      value type keyword as given
      ebmltype     name of the matching EBML_TYPE_* constant
      valname      C type that holds the element's value
      subelements  sequence of (MatroskaElement, multiple) pairs
      subids       set of the elid values of the direct subelements
    """

    # C type used to store a value of each non-'sub' value type.
    _C_TYPES = {
        'uint': 'uint64_t',
        'str': 'struct bstr',
        'binary': 'struct bstr',
        'ebml_id': 'uint32_t',
        'float': 'double',
        'sint': 'int64_t',
    }

    def __init__(self, name, elid, valtype, namespace):
        """Derive all generated names for the element.

        Raises SyntaxError if *valtype* is neither 'sub' nor one of the
        known scalar types.
        """
        self.name = name
        self.definename = '%s_ID_%s' % (namespace, name.upper())
        self.fieldname = camelcase_to_words(name)
        self.structname = 'ebml_' + self.fieldname
        self.elid = elid
        self.valtype = valtype
        if valtype == 'sub':
            self.ebmltype = 'EBML_TYPE_SUBELEMENTS'
            self.valname = 'struct %s' % self.structname
        else:
            self.ebmltype = 'EBML_TYPE_' + valtype.upper()
            try:
                self.valname = self._C_TYPES[valtype]
            except KeyError:
                raise SyntaxError('Unrecognized value type ' + valtype)
        self.subelements = ()
        # Always present, so an element that never receives children can
        # still be queried for its (empty) set of child IDs.
        self.subids = set()

    def add_subelements(self, subelements):
        """Attach the (element, multiple) pairs that are direct children."""
        self.subelements = subelements
        self.subids = set(x[0].elid for x in subelements)
# Registry of all known elements: elementd maps the hex ID string to its
# MatroskaElement, elementlist holds the elements in definition order.
elementd = {}
elementlist = []


def parse_elems(l, namespace):
    """Register the elements described by table *l*.

    Each string entry ('Name[*], hex id, type') creates a MatroskaElement
    in *namespace* and records it in elementd and elementlist.  Each
    non-string entry is parsed recursively and attached as the subelements
    of the element created just before it.

    Returns the list of (element, multiple) pairs for the entries of *l*
    itself, where *multiple* is true if the name ended with '*'.

    Raises SyntaxError if a subelement list appears before any element.
    """
    subelements = []
    new = None
    for el in l:
        if isinstance(el, str):
            name, hexid, eltype = [x.strip() for x in el.split(',')]
            multiple = name.endswith('*')
            name = name.strip('*')
            new = MatroskaElement(name, hexid, eltype, namespace)
            elementd[hexid] = new
            elementlist.append(new)
            subelements.append((new, multiple))
        else:
            if new is None:
                # Without this check the table error would surface as an
                # unrelated-looking unbound-variable exception.
                raise SyntaxError('Subelement list without a parent element')
            new.add_subelements(parse_elems(el, namespace))
    return subelements


# Populate the registry from both tables at import time.
parse_elems(elements_ebml, 'EBML')
parse_elems(elements_matroska, 'MATROSKA')
def generate_C_header():
    """Print the generated C header to standard output.

    The output consists of one #define per registered element ID, a struct
    definition for every element that has subelements (one value field and
    one n_<field> counter per subelement), an extern declaration of each
    such element's descriptor, and MAX_EBML_SUBELEMENTS.
    """
    # Blank lines are written with print('') rather than a bare `print`:
    # both produce an empty line under the Python 2 print statement, but a
    # bare `print` is a silent no-op when print is a function.
    print('// Generated by TOOLS/matroska.py, do not edit manually')
    print('')

    for el in elementlist:
        print('#define %-40s 0x%s' % (el.definename, el.elid))

    print('')

    # Reverse definition order, so that nested structs are printed before
    # the structs of their parents.
    for el in reversed(elementlist):
        if not el.subelements:
            continue
        print('')
        print('struct %s {' % el.structname)
        # Column width that aligns the field names after the longest type.
        width = max(len(subel.valname)
                    for subel, multiple in el.subelements) + 1
        for subel, multiple in el.subelements:
            # Repeatable subelements are declared as pointers.
            print(' %-*s %s%s;' % (width, subel.valname,
                                   (' ', '*')[multiple], subel.fieldname))
        print('')
        for subel, multiple in el.subelements:
            print(' int n_%s;' % (subel.fieldname))
        print('};')

    for el in elementlist:
        if not el.subelements:
            continue
        print('extern const struct ebml_elem_desc %s_desc;' % el.structname)

    print('')
    print('#define MAX_EBML_SUBELEMENTS %d' % max(len(el.subelements)
                                                  for el in elementlist))
def generate_C_definitions():
    """Print the generated C element definitions to standard output.

    Elements are emitted in reverse definition order.  An element with
    subelements produces an E_S(...) entry followed by one F(...) line per
    subelement, wrapped in '#define N' / '#undef N'; every other element
    produces a single E(...) line.
    """
    # Blank lines are written with print('') rather than a bare `print`:
    # both produce an empty line under the Python 2 print statement, but a
    # bare `print` is a silent no-op when print is a function.
    print('// Generated by TOOLS/matroska.py, do not edit manually')
    print('')
    for el in reversed(elementlist):
        print('')
        if el.subelements:
            print('#define N %s' % el.fieldname)
            print('E_S("%s", %d)' % (el.name, len(el.subelements)))
            for subel, multiple in el.subelements:
                print('F(%s, %s, %d)' % (subel.definename, subel.fieldname,
                                         multiple))
            print('}};')
            print('#undef N')
        else:
            print('E("%s", %s, %s)' % (el.name, el.fieldname, el.ebmltype))
def read(s, length):
    """Read exactly *length* bytes from stream *s* and return them.

    Raises IOError if the stream supplies a different number of bytes,
    which is what happens at end of file.
    """
    data = s.read(length)
    if len(data) == length:
        return data
    raise IOError
def read_id(s):
    """Read one element ID from stream *s* and return its raw bytes.

    The number of zero bits above the highest set bit of the first byte is
    the number of bytes that follow it.  The returned string is the first
    byte, unmodified, plus those following bytes.

    Raises SyntaxError if the first byte is zero.
    """
    first = read(s, 1)
    lead = ord(first)
    if lead == 0:
        raise SyntaxError
    extra = 0
    bit = 0x80
    while not (lead & bit):
        extra += 1
        bit >>= 1
    return first + read(s, extra)
def read_vint(s):
    """Read one variable-length integer from stream *s*.

    The number of zero bits above the highest set bit of the first byte is
    the number of bytes that follow it.  That marker bit and everything
    above it are dropped; the remaining bits of the first byte and all
    following bytes form a big-endian number.

    Returns a pair (number of bytes consumed, decoded value).
    Raises SyntaxError if the first byte is zero.
    """
    lead = ord(read(s, 1))
    if lead == 0:
        raise SyntaxError
    extra = 0
    mask = 128
    while not lead & mask:
        extra += 1
        mask >>= 1
    # Accumulate numerically rather than rebuilding a byte string with
    # chr(): chr() yields text on Python 3 and cannot be joined to the
    # bytes returned by read().  bytearray() iterates as integers on both
    # Python 2 and 3.
    value = lead & (mask - 1)
    for octet in bytearray(read(s, extra)):
        value = (value << 8) | octet
    return extra + 1, value
def read_str(s, length):
    """Read a *length*-byte string value from stream *s*, returned as read."""
    raw = read(s, length)
    return raw
def read_uint(s, length):
    """Read *length* bytes from stream *s* as a big-endian unsigned integer."""
    return byte2num(read(s, length))
def read_sint(s, length):
    """Read *length* bytes from stream *s* as a big-endian signed integer.

    The value is interpreted as two's complement over length * 8 bits.
    A zero-length value decodes to 0.
    """
    if length == 0:
        # Nothing to read; also avoids the negative shift count below.
        return 0
    i = read_uint(s, length)
    sign_bit = 1 << (length * 8 - 1)
    if i & sign_bit:
        i -= 2 * sign_bit
    return i
def read_float(s, length):
    """Read a big-endian IEEE 754 float of *length* bytes from stream *s*.

    Supports 4-byte (single) and 8-byte (double) values, plus zero-length
    values, which decode to 0.0.  The bytes are always consumed before the
    length is validated.

    Raises SyntaxError for any other length.
    """
    # Imported here so the function does not depend on the file's import
    # block.
    import struct
    t = read(s, length)
    if length == 0:
        return 0.0
    # struct decodes zero, denormals, infinities and NaN correctly; adding
    # an implicit leading mantissa bit by hand is only valid for normal
    # numbers (all-zero bits would decode to 2**-127 instead of 0.0).
    if length == 4:
        return struct.unpack('>f', t)[0]
    if length == 8:
        return struct.unpack('>d', t)[0]
    raise SyntaxError
def parse_one(s, depth, parent, maxlen):
    """Parse one element from stream *s* and print a description of it.

    s       binary stream positioned at the start of an element
    depth   nesting level, printed at the start of each output line
    parent  MatroskaElement containing this element, or None at top level
    maxlen  accepted for the callers' benefit; not used by this function

    Elements of type 'sub' are parsed recursively.  Returns the total
    number of bytes the element occupies (ID + size field + payload).

    Raises NotImplementedError if the element ID is not a known child of
    *parent* (the IDs 'ec' and 'bf' are accepted anywhere) or if the
    element has an unsupported value type, and SyntaxError if the
    subelements of a 'sub' element overrun its declared length.
    """
    # Imported here so the function does not depend on the file's import
    # block.
    from binascii import hexlify

    def hexstr(data):
        # hexlify() returns bytes on Python 3; convert to the native str
        # type so the result compares equal to the table's hex ID keys.
        return str(hexlify(data).decode('ascii'))

    elid = hexstr(read_id(s))
    elem = elementd.get(elid)
    if (parent is not None and elid not in parent.subids
            and elid not in ('ec', 'bf')):
        # Each message is formatted into one string first: print(a, b)
        # prints a tuple repr under the Python 2 print statement.
        print('Unexpected: %s' % elid)
        raise NotImplementedError
    size, length = read_vint(s)
    # Two hex digits per ID byte; floor division keeps this an integer.
    this_length = len(elid) // 2 + size + length

    if elem is None:
        print('%s Unknown element: %s size: %s' % (depth, elid, length))
        read(s, length)
        return this_length

    valtype = elem.valtype
    if valtype == 'skip':
        # Skipped elements are consumed without any output.
        read(s, length)
        return this_length

    header = '%s %s %s size: %s value:' % (depth, elid, elem.name, length)
    if valtype == 'sub':
        print(header + ' subelements:')
        while length > 0:
            length -= parse_one(s, depth + 1, elem, length)
        if length < 0:
            raise SyntaxError
    elif valtype == 'str':
        print('%s string %r' % (header, read_str(s, length)))
    elif valtype in ('binary', 'ebml_id'):
        t = read_str(s, length)
        dec = ''
        if valtype == 'ebml_id':
            idelem = elementd.get(hexstr(t))
            if idelem is None:
                dec = '(UNKNOWN)'
            else:
                dec = '(%s)' % idelem.name
        if len(t) < 20:
            shown = hexstr(t)
        else:
            shown = '<skipped %d bytes>' % len(t)
        print('%s binary %s %s' % (header, shown, dec))
    elif valtype == 'uint':
        print('%s uint %s' % (header, read_uint(s, length)))
    elif valtype == 'sint':
        print('%s sint %s' % (header, read_sint(s, length)))
    elif valtype == 'float':
        print('%s float %s' % (header, read_float(s, length)))
    else:
        raise NotImplementedError
    return this_length
def parse_toplevel(s):
    """Parse and print one top-level element, with its contents, from *s*."""
    parse_one(s, depth=0, parent=None, maxlen=1 << 63)
# Command line: '--generate-header' or '--generate-definitions' print C
# source to standard output; any other argument is taken as the path of a
# file to parse and dump.
if sys.argv[1] == '--generate-header':
    generate_C_header()
elif sys.argv[1] == '--generate-definitions':
    generate_C_definitions()
else:
    # Binary mode: the file holds raw bytes, which text mode may translate
    # or try to decode.  The with-block closes the file even when parsing
    # stops with an exception.
    with open(sys.argv[1], 'rb') as s:
        # The loop has no exit condition of its own; it ends when parsing
        # raises, which read() does with IOError once the file runs out.
        while 1:
            parse_toplevel(s)