subreader: fix unsafe sscanf calls with "%["
[mplayer.git] / TOOLS / matroska.py
blobffa388e47bf237d5a61bfe0b1abb270112080141
1 #!/usr/bin/env python3
2 """
3 Generate C definitions for parsing Matroska files.
4 Can also be used to directly parse Matroska files and display their contents.
5 """
8 # This file is part of MPlayer.
10 # MPlayer is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 2 of the License, or
13 # (at your option) any later version.
15 # MPlayer is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
20 # You should have received a copy of the GNU General Public License along
21 # with MPlayer; if not, write to the Free Software Foundation, Inc.,
22 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# Element definitions for the EBML header.
#
# Each string has the form 'Name, hex id, value type'.  A trailing '*' on the
# name flags an element that may occur more than once in its parent.  A tuple
# placed directly after an entry lists the children of that entry (see
# parse_elems()).
elements_ebml = (
    'EBML, 1a45dfa3, sub', (
        'EBMLVersion, 4286, uint',
        'EBMLReadVersion, 42f7, uint',
        'EBMLMaxIDLength, 42f2, uint',
        'EBMLMaxSizeLength, 42f3, uint',
        'DocType, 4282, str',
        'DocTypeVersion, 4287, uint',
        'DocTypeReadVersion, 4285, uint',
    ),

    # parse_one() accepts these two ids under any parent element.
    'CRC32, bf, binary',
    'Void, ec, binary',
)
# Element definitions for the Matroska document type.
#
# Same notation as elements_ebml: 'Name, hex id, value type', a trailing '*'
# for elements that may repeat, and a tuple after an entry for its children.
elements_matroska = (
    'Segment, 18538067, sub', (

        'SeekHead*, 114d9b74, sub', (
            'Seek*, 4dbb, sub', (
                'SeekID, 53ab, ebml_id',
                'SeekPosition, 53ac, uint',
            ),
        ),

        'Info*, 1549a966, sub', (
            'SegmentUID, 73a4, binary',
            'PrevUID, 3cb923, binary',
            'NextUID, 3eb923, binary',
            'TimecodeScale, 2ad7b1, uint',
            'DateUTC, 4461, sint',
            'Title, 7ba9, str',
            'MuxingApp, 4d80, str',
            'WritingApp, 5741, str',
            'Duration, 4489, float',
        ),

        'Cluster*, 1f43b675, sub', (
            'Timecode, e7, uint',
            'BlockGroup*, a0, sub', (
                'Block, a1, binary',
                'BlockDuration, 9b, uint',
                'ReferenceBlock*, fb, sint',
            ),
            'SimpleBlock*, a3, binary',
        ),

        'Tracks*, 1654ae6b, sub', (
            'TrackEntry*, ae, sub', (
                'TrackNumber, d7, uint',
                'TrackUID, 73c5, uint',
                'TrackType, 83, uint',
                'FlagEnabled, b9, uint',
                'FlagDefault, 88, uint',
                'FlagForced, 55aa, uint',
                'FlagLacing, 9c, uint',
                'MinCache, 6de7, uint',
                'MaxCache, 6df8, uint',
                'DefaultDuration, 23e383, uint',
                'TrackTimecodeScale, 23314f, float',
                'MaxBlockAdditionID, 55ee, uint',
                'Name, 536e, str',
                'Language, 22b59c, str',
                'CodecID, 86, str',
                'CodecPrivate, 63a2, binary',
                'CodecName, 258688, str',
                'CodecDecodeAll, aa, uint',
                'Video, e0, sub', (
                    'FlagInterlaced, 9a, uint',
                    'PixelWidth, b0, uint',
                    'PixelHeight, ba, uint',
                    'DisplayWidth, 54b0, uint',
                    'DisplayHeight, 54ba, uint',
                    'DisplayUnit, 54b2, uint',
                    'FrameRate, 2383e3, float',
                ),
                'Audio, e1, sub', (
                    'SamplingFrequency, b5, float',
                    'OutputSamplingFrequency, 78b5, float',
                    'Channels, 9f, uint',
                    'BitDepth, 6264, uint',
                ),
                'ContentEncodings, 6d80, sub', (
                    'ContentEncoding*, 6240, sub', (
                        'ContentEncodingOrder, 5031, uint',
                        'ContentEncodingScope, 5032, uint',
                        'ContentEncodingType, 5033, uint',
                        'ContentCompression, 5034, sub', (
                            'ContentCompAlgo, 4254, uint',
                            'ContentCompSettings, 4255, binary',
                        ),
                    ),
                ),
            ),
        ),

        'Cues, 1c53bb6b, sub', (
            'CuePoint*, bb, sub', (
                'CueTime, b3, uint',
                'CueTrackPositions*, b7, sub', (
                    'CueTrack, f7, uint',
                    'CueClusterPosition, f1, uint',
                ),
            ),
        ),

        'Attachments, 1941a469, sub', (
            'AttachedFile*, 61a7, sub', (
                'FileDescription, 467e, str',
                'FileName, 466e, str',
                'FileMimeType, 4660, str',
                'FileData, 465c, binary',
                'FileUID, 46ae, uint',
            ),
        ),

        'Chapters, 1043a770, sub', (
            'EditionEntry*, 45b9, sub', (
                'EditionUID, 45bc, uint',
                'EditionFlagHidden, 45bd, uint',
                'EditionFlagDefault, 45db, uint',
                'EditionFlagOrdered, 45dd, uint',
                'ChapterAtom*, b6, sub', (
                    'ChapterUID, 73c4, uint',
                    'ChapterTimeStart, 91, uint',
                    'ChapterTimeEnd, 92, uint',
                    'ChapterFlagHidden, 98, uint',
                    'ChapterFlagEnabled, 4598, uint',
                    'ChapterSegmentUID, 6e67, binary',
                    'ChapterSegmentEditionUID, 6ebc, uint',
                    'ChapterDisplay*, 80, sub', (
                        'ChapString, 85, str',
                        'ChapLanguage*, 437c, str',
                        'ChapCountry*, 437e, str',
                    ),
                ),
            ),
        ),
        'Tags*, 1254c367, sub', (
            'Tag*, 7373, sub', (
                'Targets, 63c0, sub', (
                    'TargetTypeValue, 68ca, uint',
                    'TargetTrackUID, 63c5, uint',
                    'TargetEditionUID, 63c9, uint',
                    'TargetChapterUID, 63c4, uint',
                    'TargetAttachmentUID, 63c6, uint',
                ),
                'SimpleTag*, 67c8, sub', (
                    'TagName, 45a3, str',
                    'TagLanguage, 447a, str',
                    'TagString, 4487, str',
                ),
            ),
        ),
    ),
)
184 import sys
185 from math import ldexp
186 from binascii import hexlify
def byte2num(s):
    """Return the bytes-like object *s* read as a big-endian unsigned integer.

    An empty input yields 0.  The previous hexlify()-based conversion raised
    ValueError for it, which made zero-length integer elements unparseable.
    """
    return int.from_bytes(s, 'big')
class EOF(Exception):
    """Raised by read() when the stream yields fewer bytes than requested."""
def camelcase_to_words(name):
    """Convert a CamelCase identifier to lower-case words joined by '_'.

    A new word starts at every upper-case letter (other than the first
    character) that either follows a lower-case letter or is followed by one.
    Runs of capitals therefore stay together: 'EBMLVersion' becomes
    'ebml_version' and 'TrackUID' becomes 'track_uid'.
    """
    boundaries = [0]
    for pos in range(1, len(name)):
        if not name[pos].isupper():
            continue
        after_lower = name[pos - 1].islower()
        # Slicing (rather than indexing) yields '' past the end of the string.
        before_lower = name[pos + 1:pos + 2].islower()
        if after_lower or before_lower:
            boundaries.append(pos)
    boundaries.append(len(name))
    words = [name[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]
    return '_'.join(words).lower()
class MatroskaElement(object):
    """One element definition, plus the names derived from it for C output.

    Attributes set by the constructor:
      name        element name as given
      definename  '<namespace>_ID_<NAME>', used for the #define of the id
      fieldname   name converted by camelcase_to_words()
      structname  'ebml_' + fieldname
      elid        element id as given (a hex string in the definition tables)
      valtype     value type as given ('sub', 'uint', 'str', ...)
      ebmltype    name of the matching EBML_TYPE_* constant
      valname     C type used for a field holding this element
      subelements sequence of (MatroskaElement, multiple) child pairs
      subids      set of the children's elid values
    """

    # C type for every value type other than 'sub'.
    _C_TYPES = {
        'uint': 'uint64_t',
        'str': 'struct bstr',
        'binary': 'struct bstr',
        'ebml_id': 'uint32_t',
        'float': 'double',
        'sint': 'int64_t',
    }

    def __init__(self, name, elid, valtype, namespace):
        """Derive all generated names for the element.

        Raises SyntaxError if valtype is neither 'sub' nor a known value type.
        """
        self.name = name
        self.definename = '{}_ID_{}'.format(namespace, name.upper())
        self.fieldname = camelcase_to_words(name)
        self.structname = 'ebml_' + self.fieldname
        self.elid = elid
        self.valtype = valtype
        if valtype == 'sub':
            self.ebmltype = 'EBML_TYPE_SUBELEMENTS'
            self.valname = 'struct ' + self.structname
        else:
            self.ebmltype = 'EBML_TYPE_' + valtype.upper()
            try:
                self.valname = self._C_TYPES[valtype]
            except KeyError:
                raise SyntaxError('Unrecognized value type ' + valtype) from None
        self.subelements = ()
        # Defined up front so that an element which never receives children
        # still supports the 'elid in parent.subids' test.
        self.subids = set()

    def add_subelements(self, subelements):
        """Attach child definitions given as (MatroskaElement, multiple) pairs."""
        self.subelements = subelements
        self.subids = {child.elid for child, _multiple in subelements}
# Registries filled by parse_elems():
#   elementd    maps each element's hex id string to its MatroskaElement
#   elementlist holds the same elements in the order they were defined
elementd = dict()
elementlist = list()
def parse_elems(l, namespace):
    """Turn one level of a definition table into MatroskaElement objects.

    l is a sequence of 'Name, hex id, value type' strings; a nested sequence
    directly after a string holds the children of that element and is parsed
    recursively.  Every element created is also registered in the module
    level elementd (by hex id) and elementlist.

    Returns a list of (MatroskaElement, multiple) pairs for this level, where
    multiple is True when the name carried a '*'.

    Raises SyntaxError if a child sequence appears before any element string.
    """
    subelements = []
    current = None  # element that a following child sequence belongs to
    for el in l:
        if isinstance(el, str):
            name, hexid, eltype = [x.strip() for x in el.split(',')]
            multiple = name.endswith('*')
            name = name.strip('*')
            current = MatroskaElement(name, hexid, eltype, namespace)
            elementd[hexid] = current
            elementlist.append(current)
            subelements.append((current, multiple))
        elif current is None:
            # Previously this surfaced as an UnboundLocalError.
            raise SyntaxError('Subelement list without a preceding element')
        else:
            current.add_subelements(parse_elems(el, namespace))
    return subelements
# Populate elementd/elementlist: EBML header elements first, Matroska second.
for _table, _namespace in ((elements_ebml, 'EBML'),
                           (elements_matroska, 'MATROSKA')):
    parse_elems(_table, _namespace)
def generate_C_header():
    """Print the generated C header to standard output.

    Output order: one id #define per element in elementlist, one struct per
    element that has subelements (walking elementlist backwards), one extern
    descriptor declaration per such element (walking forwards), and finally
    MAX_EBML_SUBELEMENTS, the largest subelement count of any element.
    """
    print('// Generated by TOOLS/matroska.py, do not edit manually')
    print()

    for elem in elementlist:
        print('#define {0.definename:40} 0x{0.elid}'.format(elem))

    print()

    masters = [elem for elem in elementlist if elem.subelements]

    for elem in reversed(masters):
        print()
        print('struct {0.structname} {{'.format(elem))
        children = [child for child, _repeated in elem.subelements]
        # Pad the type column to the longest C type name plus one space.
        width = 1 + max(len(child.valname) for child in children)
        for child, repeated in elem.subelements:
            # Repeatable elements become pointer fields.
            pointer = '*' if repeated else ' '
            print(' {e.valname:{l}} {star}{e.fieldname};'.format(
                e=child, l=width, star=pointer))
        print()
        for child in children:
            print(' int n_{0.fieldname};'.format(child))
        print('};')

    for elem in masters:
        print('extern const struct ebml_elem_desc {0.structname}_desc;'.format(
            elem))

    print()
    largest = max(len(elem.subelements) for elem in elementlist)
    print('#define MAX_EBML_SUBELEMENTS', largest)
def generate_C_definitions():
    """Print the generated C element descriptors to standard output.

    elementlist is walked backwards.  An element without subelements produces
    a single E(...) line; an element with subelements produces an E_S(...)
    line followed by one F(...) line per child, with the macro N defined as
    the element's field name around that group.
    """
    print('// Generated by TOOLS/matroska.py, do not edit manually')
    print()
    for elem in reversed(elementlist):
        print()
        if not elem.subelements:
            print('E("{0.name}", {0.fieldname}, {0.ebmltype})'.format(elem))
            continue
        print('#define N', elem.fieldname)
        print('E_S("{}", {})'.format(elem.name, len(elem.subelements)))
        for child, repeated in elem.subelements:
            print('F({0.definename}, {0.fieldname}, {1})'.format(
                child, int(repeated)))
        print('}};')
        print('#undef N')
def read(s, length):
    """Read exactly *length* items from stream *s* and return them.

    Raises EOF if s.read() hands back a different amount.
    """
    data = s.read(length)
    if len(data) == length:
        return data
    raise EOF
def read_id(s):
    """Read one element id from *s* and return its raw bytes.

    The count of leading zero bits in the first byte gives the number of
    bytes that follow it.  Raises SyntaxError if the first byte is zero and
    EOF (from read()) if the stream ends early.
    """
    first = read(s, 1)
    lead = ord(first)
    if lead == 0:
        raise SyntaxError
    # For 1..255 bit_length() is 1..8, so this is the leading-zero-bit count.
    extra = 8 - lead.bit_length()
    return first + read(s, extra)
def read_vint(s):
    """Read one variable-length integer from *s*.

    Returns (number of bytes consumed, decoded value).  The count of leading
    zero bits in the first byte gives the number of bytes that follow; the
    first set bit is a length marker and is not part of the value.  Raises
    SyntaxError if the first byte is zero and EOF (from read()) if the stream
    ends early.
    """
    lead = ord(read(s, 1))
    if lead == 0:
        raise SyntaxError
    extra = 8 - lead.bit_length()
    marker = 0x80 >> extra
    # Keep only the bits below the length marker.
    payload = bytes((lead & (marker - 1),)) + read(s, extra)
    return extra + 1, byte2num(payload)
def read_str(s, length):
    """Read *length* bytes from *s* and return them undecoded."""
    raw = read(s, length)
    return raw
def read_uint(s, length):
    """Read *length* bytes from *s* as a big-endian unsigned integer."""
    return byte2num(read(s, length))
def read_sint(s, length):
    """Read *length* bytes from *s* as a big-endian two's-complement integer.

    A zero-length element is valid EBML and stands for 0.  It is handled up
    front because the sign mask below would otherwise need a negative shift,
    which raises ValueError.
    """
    if length == 0:
        return 0
    value = read_uint(s, length)
    sign_bit = 1 << (length * 8 - 1)
    if value & sign_bit:
        value -= 2 * sign_bit
    return value
def read_float(s, length):
    """Read a big-endian IEEE 754 float of *length* bytes from *s*.

    Lengths 4 (single) and 8 (double) are decoded with struct, so zero,
    denormals, infinities and NaN come out correctly.  The earlier manual
    decoding always added the implicit leading 1 bit, which turned 0.0 into a
    tiny non-zero number and mishandled the other special values.  Length 0
    is valid EBML and stands for 0.0.

    Raises SyntaxError for any other length (checked before reading) and EOF
    (from read()) if the stream ends early.
    """
    import struct  # local import: no other part of the file needs it

    if length == 0:
        return 0.0
    try:
        fmt = {4: '>f', 8: '>d'}[length]
    except KeyError:
        raise SyntaxError('Invalid float length {}'.format(length)) from None
    return struct.unpack(fmt, read(s, length))[0]
def parse_one(s, depth, parent, maxlen):
    """Parse one element from *s*, print a description, return its size.

    s       binary stream positioned at the start of an element id
    depth   nesting level; printed as the first item of each output line
    parent  MatroskaElement containing this element, or None at top level
    maxlen  accepted for the caller's benefit but not used here

    Returns the total number of bytes the element occupies (id, size field
    and payload) as an int.

    Raises NotImplementedError for an element that is not a known child of
    parent (ids 'ec' and 'bf' are accepted anywhere) or for an unhandled
    value type, SyntaxError for malformed data, and EOF (from read()) when
    the stream ends early.
    """
    elid = hexlify(read_id(s)).decode('ascii')
    elem = elementd.get(elid)
    if parent is not None and elid not in parent.subids and elid not in ('ec', 'bf'):
        print('Unexpected:', elid)
        raise NotImplementedError
    size, length = read_vint(s)
    # elid holds two hex digits per id byte.  Floor division keeps the byte
    # count an int; true division made this (and every length derived from
    # it in the callers) a float.
    this_length = len(elid) // 2 + size + length
    if elem is None:
        print(depth, 'Unknown element:', elid, 'size:', length)
        read(s, length)
        return this_length

    if elem.valtype != 'skip':
        print(depth, elid, elem.name, 'size:', length, 'value:', end=' ')
    if elem.valtype == 'sub':
        print('subelements:')
        while length > 0:
            length -= parse_one(s, depth + 1, elem, length)
        # A child that claims more bytes than its parent had left.
        if length < 0:
            raise SyntaxError
    elif elem.valtype == 'str':
        print('string', repr(read_str(s, length).decode('utf8', 'replace')))
    elif elem.valtype in ('binary', 'ebml_id'):
        t = read_str(s, length)
        dec = ''
        if elem.valtype == 'ebml_id':
            # The payload is itself an element id; show its name if known.
            idelem = elementd.get(hexlify(t).decode('ascii'))
            if idelem is None:
                dec = '(UNKNOWN)'
            else:
                dec = '({0.name})'.format(idelem)
        if len(t) < 20:
            t = hexlify(t).decode('ascii')
        else:
            t = '<skipped {} bytes>'.format(len(t))
        print('binary', t, dec)
    elif elem.valtype == 'uint':
        print('uint', read_uint(s, length))
    elif elem.valtype == 'sint':
        print('sint', read_sint(s, length))
    elif elem.valtype == 'float':
        print('float', read_float(s, length))
    elif elem.valtype == 'skip':
        read(s, length)
    else:
        raise NotImplementedError
    return this_length
def parse_toplevel(s):
    """Parse and print one top-level element (depth 0, no parent) from *s*."""
    parse_one(s, depth=0, parent=None, maxlen=1 << 63)
# Command-line driver.  The single argument selects the mode: generate the C
# header, generate the C definitions, or (any other value) treat the argument
# as the path of a file to parse and dump.
if len(sys.argv) < 2:
    # Without this check a missing argument ended in a bare IndexError.
    sys.exit('Usage: matroska.py --generate-header | --generate-definitions'
             ' | FILE')
elif sys.argv[1] == '--generate-header':
    generate_C_header()
elif sys.argv[1] == '--generate-definitions':
    generate_C_definitions()
else:
    # The context manager closes the file even when parsing raises.
    with open(sys.argv[1], "rb") as s:
        while True:
            start = s.tell()
            try:
                parse_toplevel(s)
            except EOF:
                # EOF with nothing consumed is the normal end of input;
                # EOF part-way through an element means a truncated file.
                if s.tell() != start:
                    raise Exception("Unexpected end of file")
                break