AAC: Detect corrupted stream
[cmus.git] / id3.c
blob2b22b7407fba72d2c95858bbc0a1c61f61eb7590
1 /*
2 * Copyright 2005 Timo Hirvonen
3 */
5 #include "id3.h"
6 #include "comment.h"
7 #include "xmalloc.h"
8 #include "utf8_encode.h"
9 #include "uchar.h"
10 #include "options.h"
11 #include "debug.h"
13 #include <unistd.h>
14 #include <inttypes.h>
15 #include <errno.h>
16 #include <stdio.h>
/*
 * position (byte offset of the marker; negative offsets are from end of file):
 *
 *    0 "ID3"
 *  -10 "3DI"
 * -128 "TAG"
 * -138 "3DI"
 *
 * if v2 is at the beginning _and_ at the end then there must be a seek tag
 * at the beginning
 */
29 struct ID3 {
30 char v1[128];
31 char *v2[NUM_ID3_KEYS];
33 unsigned int has_v1 : 1;
34 unsigned int has_v2 : 1;
/* Parsed fields of the 10-byte ID3v2 tag header (or footer). */
struct v2_header {
	unsigned char ver_major;	/* byte 3 of the header */
	unsigned char ver_minor;	/* byte 4 of the header */
	unsigned char flags;		/* byte 5, see V2_HEADER_* */
	uint32_t size;			/* decoded from the 4 sync-safe bytes 6..9 */
};
/* Parsed ID3v2 extended header; only its size is used by this file. */
struct v2_extended_header {
	uint32_t size;
};
/* Parsed ID3v2 frame header (version independent representation). */
struct v2_frame_header {
	/* frame id; 3 characters plus a NUL for v2.2, 4 characters otherwise */
	char id[4];
	/* size of the frame body, not counting the frame header */
	uint32_t size;
	/* frame flags, see V2_FRAME_*; always 0 for v2.2 frames */
	uint16_t flags;
};
/* bits of struct v2_header.flags */
#define V2_HEADER_UNSYNC	0x80
#define V2_HEADER_EXTENDED	0x40
#define V2_HEADER_EXPERIMENTAL	0x20
#define V2_HEADER_FOOTER	0x10

/* bits of struct v2_frame_header.flags */
#define V2_FRAME_COMPRESSED	0x08
#define V2_FRAME_ENCRYPTHED	0x04
#define V2_FRAME_UNSYNC		0x02
#define V2_FRAME_LEN_INDICATOR	0x01
#define NR_GENRES 148
/* genres {{{ */
/*
 * Genre names indexed by genre number (the ID3v1 genre byte and the numeric
 * form accepted by parse_genre()).  The number before each row is the index
 * of its first entry.
 */
static const char *genres[NR_GENRES] = {
	/*   0 */ "Blues", "Classic Rock", "Country", "Dance", "Disco",
	/*   5 */ "Funk", "Grunge", "Hip-Hop", "Jazz", "Metal",
	/*  10 */ "New Age", "Oldies", "Other", "Pop", "R&B",
	/*  15 */ "Rap", "Reggae", "Rock", "Techno", "Industrial",
	/*  20 */ "Alternative", "Ska", "Death Metal", "Pranks", "Soundtrack",
	/*  25 */ "Euro-Techno", "Ambient", "Trip-Hop", "Vocal", "Jazz+Funk",
	/*  30 */ "Fusion", "Trance", "Classical", "Instrumental", "Acid",
	/*  35 */ "House", "Game", "Sound Clip", "Gospel", "Noise",
	/*  40 */ "Alt", "Bass", "Soul", "Punk", "Space",
	/*  45 */ "Meditative", "Instrumental Pop", "Instrumental Rock", "Ethnic", "Gothic",
	/*  50 */ "Darkwave", "Techno-Industrial", "Electronic", "Pop-Folk", "Eurodance",
	/*  55 */ "Dream", "Southern Rock", "Comedy", "Cult", "Gangsta Rap",
	/*  60 */ "Top 40", "Christian Rap", "Pop/Funk", "Jungle", "Native American",
	/*  65 */ "Cabaret", "New Wave", "Psychedelic", "Rave", "Showtunes",
	/*  70 */ "Trailer", "Lo-Fi", "Tribal", "Acid Punk", "Acid Jazz",
	/*  75 */ "Polka", "Retro", "Musical", "Rock & Roll", "Hard Rock",
	/*  80 */ "Folk", "Folk/Rock", "National Folk", "Swing", "Fast-Fusion",
	/*  85 */ "Bebob", "Latin", "Revival", "Celtic", "Bluegrass",
	/*  90 */ "Avantgarde", "Gothic Rock", "Progressive Rock", "Psychedelic Rock", "Symphonic Rock",
	/*  95 */ "Slow Rock", "Big Band", "Chorus", "Easy Listening", "Acoustic",
	/* 100 */ "Humour", "Speech", "Chanson", "Opera", "Chamber Music",
	/* 105 */ "Sonata", "Symphony", "Booty Bass", "Primus", "Porn Groove",
	/* 110 */ "Satire", "Slow Jam", "Club", "Tango", "Samba",
	/* 115 */ "Folklore", "Ballad", "Power Ballad", "Rhythmic Soul", "Freestyle",
	/* 120 */ "Duet", "Punk Rock", "Drum Solo", "A Cappella", "Euro-House",
	/* 125 */ "Dance Hall", "Goa", "Drum & Bass", "Club-House", "Hardcore",
	/* 130 */ "Terror", "Indie", "BritPop", "Negerpunk", "Polsk Punk",
	/* 135 */ "Beat", "Christian Gangsta Rap", "Heavy Metal", "Black Metal", "Crossover",
	/* 140 */ "Contemporary Christian", "Christian Rock", "Merengue", "Salsa", "Thrash Metal",
	/* 145 */ "Anime", "JPop", "Synthpop"
};
/* }}} */
/*
 * Debug output of the tag parser.  Forwards to d_print(); change the
 * condition to 0 to compile the messages out.
 */
#if 1
#define id3_debug(...) d_print(__VA_ARGS__)
#else
#define id3_debug(...) do { } while (0)
#endif
224 static int utf16_is_special(const uchar uch)
226 if (UTF16_IS_HSURROGATE(uch) || UTF16_IS_LSURROGATE(uch) || UTF16_IS_BOM(uch))
227 return -1;
228 return 0;
231 static char *utf16_to_utf8(const unsigned char *buf, int buf_size)
233 char *out;
234 int i, idx;
236 out = xnew(char, (buf_size / 2) * 4 + 1);
237 i = idx = 0;
238 while (buf_size - i >= 2) {
239 uchar u;
241 u = buf[i] + (buf[i + 1] << 8);
242 if (u_is_unicode(u)) {
243 if (utf16_is_special(u) == 0)
244 u_set_char(out, &idx, u);
245 } else {
246 free(out);
247 return NULL;
249 if (u == 0)
250 return out;
251 i += 2;
253 u_set_char(out, &idx, 0);
254 return out;
257 static char *utf16be_to_utf8(const unsigned char *buf, int buf_size)
259 char *out;
260 int i, idx;
262 out = xnew(char, (buf_size / 2) * 4 + 1);
263 i = 0;
264 idx = 0;
265 while (buf_size - i >= 2) {
266 uchar u;
268 u = buf[i + 1] + (buf[i] << 8);
269 if (u_is_unicode(u)) {
270 if (utf16_is_special(u) == 0)
271 u_set_char(out, &idx, u);
272 } else {
273 free(out);
274 return NULL;
276 if (u == 0)
277 return out;
278 i += 2;
280 u_set_char(out, &idx, 0);
281 return out;
/* Returns 1 if buf starts with the ID3v1 marker "TAG", 0 otherwise. */
static int is_v1(const char *buf)
{
	if (buf[0] != 'T')
		return 0;
	if (buf[1] != 'A')
		return 0;
	return buf[2] == 'G';
}
/*
 * Decodes a 4-byte "sync-safe" integer: 7 significant bits per byte, most
 * significant byte first.
 *
 * Returns 1 and stores the value in *up on success.  Returns 0 without
 * touching *up if any byte has its top bit set.
 */
static int u32_unsync(const unsigned char *buf, uint32_t *up)
{
	uint32_t value = 0;
	int n = 0;

	while (n < 4) {
		const uint32_t septet = buf[n++];

		if (septet > 0x7f)
			return 0;
		value = (value << 7) | septet;
	}
	*up = value;
	return 1;
}
/* Stores the big-endian 32-bit integer found at buf[0..3] in *up. */
static void get_u32(const unsigned char *buf, uint32_t *up)
{
	*up = ((uint32_t)buf[0] << 24) |
	      ((uint32_t)buf[1] << 16) |
	      ((uint32_t)buf[2] << 8) |
	       (uint32_t)buf[3];
}
/* Stores the big-endian 24-bit integer found at buf[0..2] in *up. */
static void get_u24(const unsigned char *buf, uint32_t *up)
{
	*up = ((uint32_t)buf[0] << 16) |
	      ((uint32_t)buf[1] << 8) |
	       (uint32_t)buf[2];
}
331 static int v2_header_footer_parse(struct v2_header *header, const char *buf)
333 const unsigned char *b = (const unsigned char *)buf;
335 header->ver_major = b[3];
336 header->ver_minor = b[4];
337 header->flags = b[5];
338 if (header->ver_major == 0xff || header->ver_minor == 0xff)
339 return 0;
340 return u32_unsync(b + 6, &header->size);
/*
 * Parses a 10-byte ID3v2 header, which must start with "ID3".
 * Returns 1 on success, 0 if the marker or the rest of the header is invalid.
 */
static int v2_header_parse(struct v2_header *header, const char *buf)
{
	const int has_marker = buf[0] == 'I' && buf[1] == 'D' && buf[2] == '3';

	return has_marker ? v2_header_footer_parse(header, buf) : 0;
}
/*
 * Parses a 10-byte ID3v2 footer, which must start with "3DI".
 * Returns 1 on success, 0 if the marker or the rest of the footer is invalid.
 */
static int v2_footer_parse(struct v2_header *header, const char *buf)
{
	const int has_marker = buf[0] == '3' && buf[1] == 'D' && buf[2] == 'I';

	return has_marker ? v2_header_footer_parse(header, buf) : 0;
}
357 static int v2_extended_header_parse(struct v2_extended_header *header, const char *buf)
359 return u32_unsync((const unsigned char *)buf, &header->size);
/* Returns 1 if ch may appear in a frame id (upper case ASCII letter or digit). */
static int is_frame_id_char(char ch)
{
	if (ch >= '0' && ch <= '9')
		return 1;
	return ch >= 'A' && ch <= 'Z';
}
367 /* XXXYYY
369 * X = [A-Z0-9]
370 * Y = byte
372 * XXX is frame
373 * YYY is frame size excluding this 6 byte header
375 static int v2_2_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
377 int i;
379 for (i = 0; i < 3; i++) {
380 if (!is_frame_id_char(buf[i]))
381 return 0;
382 header->id[i] = buf[i];
384 header->id[3] = 0;
385 get_u24((const unsigned char *)(buf + 3), &header->size);
386 header->flags = 0;
387 if (header->size == 0)
388 return 0;
389 id3_debug("%c%c%c %d\n", header->id[0], header->id[1], header->id[2], header->size);
390 return 1;
393 /* XXXXYYYYZZ
395 * X = [A-Z0-9]
396 * Y = byte
397 * Z = byte
399 * XXXX is frame
400 * YYYY is frame size excluding this 10 byte header
401 * ZZ is flags
403 static int v2_3_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
405 int i;
407 for (i = 0; i < 4; i++) {
408 if (!is_frame_id_char(buf[i]))
409 return 0;
410 header->id[i] = buf[i];
412 get_u32((const unsigned char *)(buf + 4), &header->size);
413 header->flags = (buf[8] << 8) | buf[9];
414 if (header->size == 0)
415 return 0;
416 id3_debug("%c%c%c%c %d\n", header->id[0], header->id[1], header->id[2],
417 header->id[3], header->size);
418 return 1;
421 /* same as 2.3 but header size is sync safe */
422 static int v2_4_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
424 int i;
426 for (i = 0; i < 4; i++) {
427 if (!is_frame_id_char(buf[i]))
428 return 0;
429 header->id[i] = buf[i];
431 if (!u32_unsync((const unsigned char *)(buf + 4), &header->size))
432 return 0;
433 header->flags = (buf[8] << 8) | buf[9];
434 if (header->size == 0)
435 return 0;
436 id3_debug("%c%c%c%c %d\n", header->id[0], header->id[1], header->id[2],
437 header->id[3], header->size);
438 return 1;
/*
 * Reads exactly size bytes from fd into buf, retrying after EINTR/EAGAIN
 * and after short reads.
 *
 * Returns
 *    0  all size bytes were read
 *   -1  read() failed, errno is set
 *   -2  end of file was reached before size bytes could be read
 */
static int read_all(int fd, char *buf, size_t size)
{
	size_t pos = 0;

	while (pos < size) {
		ssize_t rc = read(fd, buf + pos, size - pos);

		if (rc == -1) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			return -1;
		}
		if (rc == 0) {
			/*
			 * EOF: without this check a truncated file would make
			 * the loop spin forever, because pos never advances.
			 */
			return -2;
		}
		pos += (size_t)rc;
	}
	return 0;
}
458 static char *parse_genre(const char *str)
460 int parenthesis = 0;
461 long int idx;
462 char *end;
464 if (strncasecmp(str, "(RX", 3) == 0)
465 return xstrdup("Remix");
467 if (strncasecmp(str, "(CR", 3) == 0)
468 return xstrdup("Cover");
470 if (*str == '(') {
471 parenthesis = 1;
472 str++;
475 idx = strtol(str, &end, 10);
476 if (str != end) {
477 /* Number parsed but there may be some crap after the number.
478 * I don't care, ID3v2 by definition contains crap.
480 if (idx >= 0 && idx < NR_GENRES)
481 return xstrdup(genres[idx]);
484 if (parenthesis) {
485 const char *ptr = strchr(str, ')');
487 if (ptr && ptr[1]) {
488 /* genre name after random crap in parenthesis,
489 * return the genre name */
490 return xstrdup(ptr + 1);
492 str--;
495 /* random crap, just return it and wait for a bug report */
496 return xstrdup(str);
499 /* http://www.id3.org/id3v2.4.0-structure.txt */
500 static struct {
501 const char name[8];
502 enum id3_key key;
503 } frame_tab[] = {
504 /* 2.4.0 */
505 { "TDRC", ID3_DATE }, // recording date
506 { "TDRL", ID3_DATE }, // release date
507 { "TDOR", ID3_DATE }, // original release date
509 /* >= 2.3.0 */
510 { "TPE1", ID3_ARTIST },
511 { "TALB", ID3_ALBUM },
512 { "TIT2", ID3_TITLE },
513 { "TYER", ID3_DATE },
514 { "TCON", ID3_GENRE },
515 { "TPOS", ID3_DISC },
516 { "TRCK", ID3_TRACK },
517 { "TPE2", ID3_ALBUMARTIST },
519 /* obsolete frames (2.2.0) */
520 { "TP1", ID3_ARTIST },
521 { "TAL", ID3_ALBUM },
522 { "TT2", ID3_TITLE },
523 { "TYE", ID3_DATE },
524 { "TCO", ID3_GENRE },
525 { "TPA", ID3_DISC },
526 { "TRK", ID3_TRACK },
528 { "", -1 }
531 static int frame_tab_index(const char *id)
533 int i;
535 for (i = 0; frame_tab[i].key != -1; i++) {
536 if (!strncmp(id, frame_tab[i].name, 4))
537 return i;
539 return -1;
/*
 * Reduces a date string to its year, in place.
 *
 * The first run of exactly four consecutive digits is taken to be the year
 * and moved to the start of buf.  If there is no such run, buf becomes the
 * empty string.
 */
static void fix_date(char *buf)
{
	int digits = 0;
	int pos;

	for (pos = 0; ; pos++) {
		const int ch = buf[pos];

		if (ch >= '0' && ch <= '9') {
			digits++;
			continue;
		}
		/* a run of digits ended just before pos */
		if (digits == 4) {
			memmove(buf, buf + pos - 4, 4);
			buf[4] = 0;
			return;
		}
		if (ch == 0)
			break;
		digits = 0;
	}
	*buf = 0;
}
564 static char *decode_str(const char *buf, int len, int encoding)
566 char *in, *out = NULL;
567 int rc = 0;
569 switch (encoding) {
570 case 0x00: /* ISO-8859-1 */
571 in = xstrndup(buf, len);
572 rc = utf8_encode(in, id3_default_charset, &out);
573 free(in);
574 break;
575 case 0x03: /* UTF-8 */
576 in = xstrndup(buf, len);
577 if (u_is_valid(in)) {
578 out = in;
579 } else {
580 rc = utf8_encode(in, id3_default_charset, &out);
581 free(in);
583 break;
584 case 0x01: /* UTF-16 */
585 out = utf16_to_utf8((const unsigned char *)buf, len);
586 break;
587 case 0x02: /* UTF-16BE */
588 out = utf16be_to_utf8((const unsigned char *)buf, len);
589 break;
591 return out;
594 static void v2_add_frame(ID3 *id3, struct v2_frame_header *fh, const char *buf)
596 int idx, encoding = *buf++, len = fh->size - 1;
597 enum id3_key key = NUM_ID3_KEYS;
598 char *out;
600 if (encoding > 3)
601 return;
603 idx = frame_tab_index(fh->id);
604 if (idx >= 0) {
605 key = frame_tab[idx].key;
606 out = decode_str(buf, len, encoding);
607 if (!out)
608 return;
610 if (key == ID3_TRACK || key == ID3_DISC)
611 fix_track_or_disc(out);
612 if (key == ID3_GENRE) {
613 char *tmp;
615 id3_debug("genre before: '%s'\n", out);
616 tmp = parse_genre(out);
617 free(out);
618 out = tmp;
620 if (key == ID3_DATE) {
621 id3_debug("date before: '%s'\n", out);
622 fix_date(out);
623 if (!*out) {
624 id3_debug("date parsing failed\n");
625 free(out);
626 return;
630 id3_debug("%s '%s'\n", frame_tab[idx].name, out);
631 } else if (!strncmp(fh->id, "TXXX", 4)) {
632 int size;
634 id3_debug("TXXX\n");
636 /* TXXX<len><encoding><key><val> */
637 out = decode_str(buf, len, encoding);
638 if (!out)
639 return;
641 id3_debug("TXXX, key = '%s'\n", out);
642 if (!strcasecmp(out, "replaygain_track_gain"))
643 key = ID3_RG_TRACK_GAIN;
644 if (!strcasecmp(out, "replaygain_track_peak"))
645 key = ID3_RG_TRACK_PEAK;
646 if (!strcasecmp(out, "replaygain_album_gain"))
647 key = ID3_RG_ALBUM_GAIN;
648 if (!strcasecmp(out, "replaygain_album_peak"))
649 key = ID3_RG_ALBUM_PEAK;
651 size = strlen(out) + 1;
652 free(out);
654 if (key == NUM_ID3_KEYS)
655 return;
657 buf += size;
658 len -= size;
659 if (len <= 0)
660 return;
662 out = decode_str(buf, len, encoding);
663 if (!out)
664 return;
666 id3_debug("TXXX, val = '%s'\n", out);
667 } else {
668 return;
671 free(id3->v2[key]);
672 id3->v2[key] = out;
673 id3->has_v2 = 1;
/*
 * Undoes ID3v2 unsynchronisation in place: every 0xff 0x00 pair in buf is
 * replaced by a single 0xff.
 *
 * buf   data to decode, modified in place
 * lenp  in: number of bytes in buf, out: number of bytes after decoding
 */
static void unsync(unsigned char *buf, int *lenp)
{
	const int len = *lenp;
	int rd = 0;	/* read position */
	int wr = 0;	/* write position, never ahead of rd */

	while (rd < len - 1) {
		if (buf[rd] != 0xff || buf[rd + 1] != 0x00) {
			buf[wr++] = buf[rd++];
			continue;
		}

		/* 0xff 0x00 -> 0xff */
		buf[wr++] = 0xff;
		rd += 2;

		if (rd < len - 2 && buf[rd] == 0x00) {
			/* 0xff 0x00 0x00 -> 0xff 0x00 */
			buf[wr++] = 0x00;
			rd++;
		}
	}
	/* the last byte cannot start a pair, copy it as it is */
	if (rd < len)
		buf[wr++] = buf[rd++];

	d_print("unsyncronization removed %d bytes\n", rd - wr);
	*lenp = wr;
}
704 static int v2_read(ID3 *id3, int fd, const struct v2_header *header)
706 char *buf;
707 int rc, buf_size;
708 int frame_start, i;
709 int frame_header_size;
711 buf_size = header->size;
712 buf = xnew(char, buf_size);
713 rc = read_all(fd, buf, buf_size);
714 if (rc) {
715 free(buf);
716 return rc;
719 frame_start = 0;
720 if (header->flags & V2_HEADER_EXTENDED) {
721 struct v2_extended_header ext;
723 v2_extended_header_parse(&ext, buf);
724 if (ext.size > buf_size) {
725 id3_debug("extended header corrupted\n");
726 free(buf);
727 return -2;
729 frame_start = ext.size;
730 /* should check if update flag is set */
733 if (header->flags & V2_HEADER_UNSYNC) {
734 int len = buf_size - frame_start;
736 unsync((unsigned char *)(buf + frame_start), &len);
737 buf_size = len + frame_start;
740 frame_header_size = 10;
741 if (header->ver_major == 2)
742 frame_header_size = 6;
744 i = frame_start;
745 while (i < buf_size - frame_header_size) {
746 struct v2_frame_header fh;
747 int len;
749 if (header->ver_major == 2) {
750 if (!v2_2_0_frame_header_parse(&fh, buf + i))
751 break;
752 } else if (header->ver_major == 3) {
753 if (!v2_3_0_frame_header_parse(&fh, buf + i))
754 break;
755 } else {
756 /* assume v2.4 */
757 if (!v2_4_0_frame_header_parse(&fh, buf + i))
758 break;
761 i += frame_header_size;
762 if (fh.size > buf_size - i) {
763 id3_debug("frame too big\n");
764 break;
767 len = fh.size;
768 if (fh.flags & V2_FRAME_UNSYNC) {
769 int tmp = len;
771 unsync((unsigned char *)(buf + i), &tmp);
772 fh.size = tmp;
774 v2_add_frame(id3, &fh, buf + i);
775 i += len;
778 free(buf);
779 return 0;
782 int id3_tag_size(const char *buf, int buf_size)
784 struct v2_header header;
786 if (buf_size < 10)
787 return 0;
788 if (v2_header_parse(&header, buf)) {
789 if (header.flags & V2_HEADER_FOOTER) {
790 /* header + data + footer */
791 id3_debug("v2.%d.%d with footer\n", header.ver_major, header.ver_minor);
792 return 10 + header.size + 10;
794 /* header */
795 id3_debug("v2.%d.%d\n", header.ver_major, header.ver_minor);
796 return 10 + header.size;
798 if (buf_size >= 3 && is_v1(buf)) {
799 id3_debug("v1\n");
800 return 128;
802 return 0;
805 ID3 *id3_new(void)
807 return xnew0(ID3, 1);
810 void id3_free(ID3 *id3)
812 int i;
814 for (i = 0; i < NUM_ID3_KEYS; i++)
815 free(id3->v2[i]);
816 free(id3);
819 int id3_read_tags(ID3 *id3, int fd, unsigned int flags)
821 off_t off;
822 int rc;
824 if (flags & ID3_V2) {
825 struct v2_header header;
826 char buf[138];
828 rc = read_all(fd, buf, 10);
829 if (rc)
830 goto rc_error;
831 if (v2_header_parse(&header, buf)) {
832 rc = v2_read(id3, fd, &header);
833 if (rc)
834 goto rc_error;
835 /* get v1 if needed */
836 } else {
837 /* get v2 from end and optionally v1 */
839 off = lseek(fd, -138, SEEK_END);
840 if (off == -1)
841 goto error;
842 rc = read_all(fd, buf, 138);
843 if (rc)
844 goto rc_error;
846 if (is_v1(buf + 10)) {
847 if (flags & ID3_V1) {
848 memcpy(id3->v1, buf + 10, 128);
849 id3->has_v1 = 1;
851 if (v2_footer_parse(&header, buf)) {
852 /* footer at end of file - 128 */
853 off = lseek(fd, -(header.size + 138), SEEK_END);
854 if (off == -1)
855 goto error;
856 rc = v2_read(id3, fd, &header);
857 if (rc)
858 goto rc_error;
860 } else if (v2_footer_parse(&header, buf + 128)) {
861 /* footer at end of file */
862 off = lseek(fd, -(header.size + 10), SEEK_END);
863 if (off == -1)
864 goto error;
865 rc = v2_read(id3, fd, &header);
866 if (rc)
867 goto rc_error;
869 return 0;
872 if (flags & ID3_V1) {
873 off = lseek(fd, -128, SEEK_END);
874 if (off == -1)
875 goto error;
876 rc = read_all(fd, id3->v1, 128);
877 if (rc)
878 goto rc_error;
879 id3->has_v1 = is_v1(id3->v1);
881 return 0;
882 error:
883 rc = -1;
884 rc_error:
885 return rc;
888 static char *v1_get_str(const char *buf, int len)
890 char in[32];
891 char *out;
892 int i;
894 for (i = len - 1; i >= 0; i--) {
895 if (buf[i] != 0 && buf[i] != ' ')
896 break;
898 if (i == -1)
899 return NULL;
900 memcpy(in, buf, i + 1);
901 in[i + 1] = 0;
902 if (u_is_valid(in))
903 return xstrdup(in);
904 if (utf8_encode(in, id3_default_charset, &out))
905 return NULL;
906 return out;
909 char *id3_get_comment(ID3 *id3, enum id3_key key)
911 if (id3->has_v2) {
912 if (id3->v2[key])
913 return xstrdup(id3->v2[key]);
915 if (id3->has_v1) {
916 switch (key) {
917 case ID3_ARTIST:
918 return v1_get_str(id3->v1 + 33, 30);
919 case ID3_ALBUM:
920 return v1_get_str(id3->v1 + 63, 30);
921 case ID3_TITLE:
922 return v1_get_str(id3->v1 + 3, 30);
923 case ID3_DATE:
924 return v1_get_str(id3->v1 + 93, 4);
925 case ID3_GENRE:
927 unsigned char idx = id3->v1[127];
929 if (idx >= NR_GENRES)
930 return NULL;
931 return xstrdup(genres[idx]);
933 case ID3_TRACK:
935 char *t;
937 if (id3->v1[125] != 0)
938 return NULL;
939 t = xnew(char, 4);
940 snprintf(t, 4, "%d", ((unsigned char *)id3->v1)[126]);
941 return t;
943 default:
944 return NULL;
947 return NULL;