On error set ip->eof in ip_read(). Remove ip_set_eof()
[cmus.git] / id3.c
blobedb2eee726616198119bca2301b468391107112d
1 /*
2 * Copyright 2005 Timo Hirvonen
3 */
5 #include "id3.h"
6 #include "comment.h"
7 #include "xmalloc.h"
8 #include "utf8_encode.h"
9 #include "uchar.h"
10 #include "options.h"
11 #include "debug.h"
13 #include <unistd.h>
14 #include <inttypes.h>
15 #include <errno.h>
16 #include <stdio.h>
/*
 * position:
 *
 *    0 "ID3"
 *  -10 "3DI"
 * -128 "TAG"
 * -138 "3DI"
 *
 * if v2 is at the beginning _and_ at the end then there must be a seek tag
 * at the beginning
 */
29 struct ID3 {
30 char v1[128];
31 char *v2[NUM_ID3_KEYS];
33 unsigned int has_v1 : 1;
34 unsigned int has_v2 : 1;
/* Decoded fields of an ID3v2 tag header; the footer is parsed into the same struct. */
struct v2_header {
	unsigned char ver_major;
	unsigned char ver_minor;
	unsigned char flags;	/* V2_HEADER_* bits */
	uint32_t size;		/* decoded from the sync-safe size field */
};
/* Extended header; only the size decoded from its first four bytes is kept. */
struct v2_extended_header {
	uint32_t size;
};
/* Decoded frame header, filled by the v2_*_frame_header_parse() functions. */
struct v2_frame_header {
	char id[4];	/* frame id; 3 characters plus a 0 byte for v2.2 frames */
	uint32_t size;	/* frame payload size, excluding the frame header */
	uint16_t flags;	/* V2_FRAME_* bits; always 0 for v2.2 frames */
};
/* bits of v2_header.flags */
#define V2_HEADER_UNSYNC	(1 << 7)
#define V2_HEADER_EXTENDED	(1 << 6)
#define V2_HEADER_EXPERIMENTAL	(1 << 5)
#define V2_HEADER_FOOTER	(1 << 4)

/* bits of v2_frame_header.flags */
#define V2_FRAME_COMPRESSED	(1 << 3) /* great idea!!1 */
#define V2_FRAME_ENCRYPTHED	(1 << 2) /* wow, this is very neat! */
#define V2_FRAME_UNSYNC		(1 << 1)
#define V2_FRAME_LEN_INDICATOR	(1 << 0)
#define NR_GENRES 148
/* genres {{{ */
/*
 * Genre names indexed by the numeric genre id.  Used for the v1 genre byte
 * and for numeric genres found in v2 genre frames.
 */
static const char *genres[NR_GENRES] = {
	/*   0 */ "Blues", "Classic Rock", "Country", "Dance",
	/*   4 */ "Disco", "Funk", "Grunge", "Hip-Hop",
	/*   8 */ "Jazz", "Metal", "New Age", "Oldies",
	/*  12 */ "Other", "Pop", "R&B", "Rap",
	/*  16 */ "Reggae", "Rock", "Techno", "Industrial",
	/*  20 */ "Alternative", "Ska", "Death Metal", "Pranks",
	/*  24 */ "Soundtrack", "Euro-Techno", "Ambient", "Trip-Hop",
	/*  28 */ "Vocal", "Jazz+Funk", "Fusion", "Trance",
	/*  32 */ "Classical", "Instrumental", "Acid", "House",
	/*  36 */ "Game", "Sound Clip", "Gospel", "Noise",
	/*  40 */ "Alt", "Bass", "Soul", "Punk",
	/*  44 */ "Space", "Meditative", "Instrumental Pop", "Instrumental Rock",
	/*  48 */ "Ethnic", "Gothic", "Darkwave", "Techno-Industrial",
	/*  52 */ "Electronic", "Pop-Folk", "Eurodance", "Dream",
	/*  56 */ "Southern Rock", "Comedy", "Cult", "Gangsta Rap",
	/*  60 */ "Top 40", "Christian Rap", "Pop/Funk", "Jungle",
	/*  64 */ "Native American", "Cabaret", "New Wave", "Psychedelic",
	/*  68 */ "Rave", "Showtunes", "Trailer", "Lo-Fi",
	/*  72 */ "Tribal", "Acid Punk", "Acid Jazz", "Polka",
	/*  76 */ "Retro", "Musical", "Rock & Roll", "Hard Rock",
	/*  80 */ "Folk", "Folk/Rock", "National Folk", "Swing",
	/*  84 */ "Fast-Fusion", "Bebob", "Latin", "Revival",
	/*  88 */ "Celtic", "Bluegrass", "Avantgarde", "Gothic Rock",
	/*  92 */ "Progressive Rock", "Psychedelic Rock", "Symphonic Rock", "Slow Rock",
	/*  96 */ "Big Band", "Chorus", "Easy Listening", "Acoustic",
	/* 100 */ "Humour", "Speech", "Chanson", "Opera",
	/* 104 */ "Chamber Music", "Sonata", "Symphony", "Booty Bass",
	/* 108 */ "Primus", "Porn Groove", "Satire", "Slow Jam",
	/* 112 */ "Club", "Tango", "Samba", "Folklore",
	/* 116 */ "Ballad", "Power Ballad", "Rhythmic Soul", "Freestyle",
	/* 120 */ "Duet", "Punk Rock", "Drum Solo", "A Cappella",
	/* 124 */ "Euro-House", "Dance Hall", "Goa", "Drum & Bass",
	/* 128 */ "Club-House", "Hardcore", "Terror", "Indie",
	/* 132 */ "BritPop", "Negerpunk", "Polsk Punk", "Beat",
	/* 136 */ "Christian Gangsta Rap", "Heavy Metal", "Black Metal", "Crossover",
	/* 140 */ "Contemporary Christian", "Christian Rock", "Merengue", "Salsa",
	/* 144 */ "Thrash Metal", "Anime", "JPop", "Synthpop"
};
/* }}} */
/* Debug output of this file goes through d_print(); use "#if 0" to compile it out. */
#if 1
#define id3_debug(...) d_print(__VA_ARGS__)
#else
#define id3_debug(...) do { } while (0)
#endif
224 static int utf16_is_special(const uchar uch)
226 if (UTF16_IS_HSURROGATE(uch) || UTF16_IS_LSURROGATE(uch) || UTF16_IS_BOM(uch))
227 return -1;
228 return 0;
231 static char *utf16_to_utf8(const unsigned char *buf, int buf_size)
233 char *out;
234 int i, idx;
236 out = xnew(char, (buf_size / 2) * 4 + 1);
237 i = idx = 0;
238 while (buf_size - i >= 2) {
239 uchar u;
241 u = buf[i] + (buf[i + 1] << 8);
242 if (u_is_unicode(u)) {
243 if (utf16_is_special(u) == 0)
244 u_set_char(out, &idx, u);
245 } else {
246 free(out);
247 return NULL;
249 if (u == 0)
250 return out;
251 i += 2;
253 u_set_char(out, &idx, 0);
254 return out;
257 static char *utf16be_to_utf8(const unsigned char *buf, int buf_size)
259 char *out;
260 int i, idx;
262 out = xnew(char, (buf_size / 2) * 4 + 1);
263 i = 0;
264 idx = 0;
265 while (buf_size - i >= 2) {
266 uchar u;
268 u = buf[i + 1] + (buf[i] << 8);
269 if (u_is_unicode(u)) {
270 if (utf16_is_special(u) == 0)
271 u_set_char(out, &idx, u);
272 } else {
273 free(out);
274 return NULL;
276 if (u == 0)
277 return out;
278 i += 2;
280 u_set_char(out, &idx, 0);
281 return out;
/* Returns 1 if buf starts with the ID3v1 magic "TAG", 0 otherwise. */
static int is_v1(const char *buf)
{
	if (buf[0] != 'T')
		return 0;
	if (buf[1] != 'A')
		return 0;
	return buf[2] == 'G';
}
/*
 * Decode a 4-byte sync-safe integer (7 significant bits per byte, most
 * significant byte first) into *up.
 *
 * Returns 1 on success.  Returns 0 and leaves *up untouched if any byte has
 * its high bit set.
 */
static int u32_unsync(const unsigned char *buf, uint32_t *up)
{
	uint32_t value = 0;
	int n;

	for (n = 0; n < 4; n++) {
		if (buf[n] & 0x80)
			return 0;
		value = (value << 7) | buf[n];
	}
	*up = value;
	return 1;
}
/* Store the big-endian 32-bit integer found in buf[0..3] into *up. */
static void get_u32(const unsigned char *buf, uint32_t *up)
{
	*up = ((uint32_t)buf[0] << 24) |
	      ((uint32_t)buf[1] << 16) |
	      ((uint32_t)buf[2] << 8) |
	      (uint32_t)buf[3];
}
/* Store the big-endian 24-bit integer found in buf[0..2] into *up. */
static void get_u24(const unsigned char *buf, uint32_t *up)
{
	*up = ((uint32_t)buf[0] << 16) |
	      ((uint32_t)buf[1] << 8) |
	      (uint32_t)buf[2];
}
331 static int v2_header_footer_parse(struct v2_header *header, const char *buf)
333 const unsigned char *b = (const unsigned char *)buf;
335 header->ver_major = b[3];
336 header->ver_minor = b[4];
337 header->flags = b[5];
338 if (header->ver_major == 0xff || header->ver_minor == 0xff)
339 return 0;
340 return u32_unsync(b + 6, &header->size);
/* Parse a v2 tag header: requires the "ID3" magic.  Returns 1 on success, 0 otherwise. */
static int v2_header_parse(struct v2_header *header, const char *buf)
{
	if (buf[0] == 'I' && buf[1] == 'D' && buf[2] == '3')
		return v2_header_footer_parse(header, buf);
	return 0;
}
/* Parse a v2 tag footer: requires the "3DI" magic.  Returns 1 on success, 0 otherwise. */
static int v2_footer_parse(struct v2_header *header, const char *buf)
{
	if (buf[0] == '3' && buf[1] == 'D' && buf[2] == 'I')
		return v2_header_footer_parse(header, buf);
	return 0;
}
357 static int v2_extended_header_parse(struct v2_extended_header *header, const char *buf)
359 return u32_unsync((const unsigned char *)buf, &header->size);
/* Returns 1 if ch may appear in a frame id ([A-Z0-9]), 0 otherwise. */
static int is_frame_id_char(char ch)
{
	if (ch >= 'A' && ch <= 'Z')
		return 1;
	return ch >= '0' && ch <= '9';
}
367 /* XXXYYY
369 * X = [A-Z0-9]
370 * Y = byte
372 * XXX is frame
373 * YYY is frame size excluding this 6 byte header
375 static int v2_2_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
377 int i;
379 for (i = 0; i < 3; i++) {
380 if (!is_frame_id_char(buf[i]))
381 return 0;
382 header->id[i] = buf[i];
384 header->id[3] = 0;
385 get_u24((const unsigned char *)(buf + 3), &header->size);
386 header->flags = 0;
387 if (header->size == 0)
388 return 0;
389 id3_debug("%c%c%c %d\n", header->id[0], header->id[1], header->id[2], header->size);
390 return 1;
393 /* XXXXYYYYZZ
395 * X = [A-Z0-9]
396 * Y = byte
397 * Z = byte
399 * XXXX is frame
400 * YYYY is frame size excluding this 10 byte header
401 * ZZ is flags
403 static int v2_3_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
405 int i;
407 for (i = 0; i < 4; i++) {
408 if (!is_frame_id_char(buf[i]))
409 return 0;
410 header->id[i] = buf[i];
412 get_u32((const unsigned char *)(buf + 4), &header->size);
413 header->flags = (buf[8] << 8) | buf[9];
414 if (header->size == 0)
415 return 0;
416 id3_debug("%c%c%c%c %d\n", header->id[0], header->id[1], header->id[2],
417 header->id[3], header->size);
418 return 1;
421 /* same as 2.3 but header size is sync safe */
422 static int v2_4_0_frame_header_parse(struct v2_frame_header *header, const char *buf)
424 int i;
426 for (i = 0; i < 4; i++) {
427 if (!is_frame_id_char(buf[i]))
428 return 0;
429 header->id[i] = buf[i];
431 if (!u32_unsync((const unsigned char *)(buf + 4), &header->size))
432 return 0;
433 header->flags = (buf[8] << 8) | buf[9];
434 if (header->size == 0)
435 return 0;
436 id3_debug("%c%c%c%c %d\n", header->id[0], header->id[1], header->id[2],
437 header->id[3], header->size);
438 return 1;
/*
 * Read exactly size bytes from fd into buf, retrying on EINTR and EAGAIN.
 *
 * Returns 0 when all bytes were read.  Returns -1 if read() fails (errno is
 * set by read()) or if end of file is reached first (errno is not set in
 * that case).
 */
static int read_all(int fd, char *buf, size_t size)
{
	size_t pos = 0;

	while (pos < size) {
		ssize_t rc = read(fd, buf + pos, size - pos);

		if (rc == -1) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			return -1;
		}
		if (rc == 0) {
			/*
			 * End of file before size bytes arrived.  Without this
			 * check the loop would never make progress on a
			 * truncated file.
			 */
			return -1;
		}
		pos += rc;
	}
	return 0;
}
458 static char *parse_genre(const char *str)
460 int parenthesis = 0;
461 long int idx;
462 char *end;
464 if (strncasecmp(str, "(RX", 3) == 0)
465 return xstrdup("Remix");
467 if (strncasecmp(str, "(CR", 3) == 0)
468 return xstrdup("Cover");
470 if (*str == '(') {
471 parenthesis = 1;
472 str++;
475 idx = strtol(str, &end, 10);
476 if (str != end) {
477 /* Number parsed but there may be some crap after the number.
478 * I don't care, ID3v2 by definition contains crap.
480 if (idx >= 0 && idx < NR_GENRES)
481 return xstrdup(genres[idx]);
484 if (parenthesis) {
485 const char *ptr = strchr(str, ')');
487 if (ptr && ptr[1]) {
488 /* genre name after random crap in parenthesis,
489 * return the genre name */
490 return xstrdup(ptr + 1);
492 str--;
495 /* random crap, just return it and wait for a bug report */
496 return xstrdup(str);
499 /* http://www.id3.org/id3v2.4.0-structure.txt */
500 static struct {
501 const char name[8];
502 enum id3_key key;
503 } frame_tab[] = {
504 /* 2.4.0 */
505 { "TDRC", ID3_DATE },
507 /* >= 2.3.0 */
508 { "TPE1", ID3_ARTIST },
509 { "TALB", ID3_ALBUM },
510 { "TIT2", ID3_TITLE },
511 { "TYER", ID3_DATE },
512 { "TCON", ID3_GENRE },
513 { "TPOS", ID3_DISC },
514 { "TRCK", ID3_TRACK },
515 { "TPE2", ID3_ALBUMARTIST },
517 /* obsolete frames (2.2.0) */
518 { "TP1", ID3_ARTIST },
519 { "TAL", ID3_ALBUM },
520 { "TT2", ID3_TITLE },
521 { "TYE", ID3_DATE },
522 { "TCO", ID3_GENRE },
523 { "TPA", ID3_DISC },
524 { "TRK", ID3_TRACK },
526 { "", -1 }
529 static void v2_add_frame(ID3 *id3, struct v2_frame_header *fh, const char *buf)
531 int i, encoding = *buf++, len = fh->size - 1;
533 if (encoding > 3)
534 return;
536 for (i = 0; frame_tab[i].key != -1; i++) {
537 enum id3_key key = frame_tab[i].key;
538 char *in, *out;
539 int rc;
541 if (strncmp(fh->id, frame_tab[i].name, 4))
542 continue;
544 switch (encoding) {
545 case 0x00: /* ISO-8859-1 */
546 in = xstrndup(buf, len);
547 rc = utf8_encode(in, id3_default_charset, &out);
548 free(in);
549 if (rc)
550 return;
551 break;
552 case 0x03: /* UTF-8 */
553 in = xstrndup(buf, len);
554 if (u_is_valid(in)) {
555 out = in;
556 } else {
557 rc = utf8_encode(in, id3_default_charset, &out);
558 free(in);
559 if (rc)
560 return;
562 break;
563 case 0x01: /* UTF-16 */
564 out = utf16_to_utf8((const unsigned char *)buf, len);
565 if (out == NULL)
566 return;
567 break;
568 case 0x02: /* UTF-16BE */
569 out = utf16be_to_utf8((const unsigned char *)buf, len);
570 if (out == NULL)
571 return;
572 break;
574 if (key == ID3_TRACK || key == ID3_DISC)
575 fix_track_or_disc(out);
576 if (key == ID3_GENRE) {
577 char *tmp;
579 id3_debug("genre before: '%s'\n", out);
580 tmp = parse_genre(out);
581 free(out);
582 out = tmp;
584 free(id3->v2[key]);
585 id3->v2[key] = out;
586 id3->has_v2 = 1;
587 id3_debug("%s '%s'\n", frame_tab[i].name, out);
588 break;
/*
 * Undo unsynchronisation in place: every 0xff 0x00 pair in buf becomes a
 * single 0xff.  *lenp is the input length on entry and the resulting
 * (possibly shorter) length on return.
 */
static void unsync(unsigned char *buf, int *lenp)
{
	const int len = *lenp;
	int src = 0, dst = 0;

	while (src < len - 1) {
		if (buf[src] != 0xff || buf[src + 1] != 0x00) {
			buf[dst++] = buf[src++];
			continue;
		}

		/* 0xff 0x00 -> 0xff */
		buf[dst++] = 0xff;
		src += 2;

		if (src < len - 2 && buf[src] == 0x00) {
			/* 0xff 0x00 0x00 -> 0xff 0x00 */
			buf[dst++] = 0x00;
			src++;
		}
	}
	/* the last byte cannot start a pair, copy it as is */
	if (src < len)
		buf[dst++] = buf[src++];

	d_print("unsyncronization removed %d bytes\n", src - dst);
	*lenp = dst;
}
620 static int v2_read(ID3 *id3, int fd, const struct v2_header *header)
622 char *buf;
623 int rc, buf_size;
624 int frame_start, i;
625 int frame_header_size;
627 buf_size = header->size;
628 buf = xnew(char, buf_size);
629 rc = read_all(fd, buf, buf_size);
630 if (rc) {
631 free(buf);
632 return rc;
635 frame_start = 0;
636 if (header->flags & V2_HEADER_EXTENDED) {
637 struct v2_extended_header ext;
639 v2_extended_header_parse(&ext, buf);
640 if (ext.size > buf_size) {
641 id3_debug("extended header corrupted\n");
642 free(buf);
643 return -2;
645 frame_start = ext.size;
646 /* should check if update flag is set */
649 if (header->flags & V2_HEADER_UNSYNC) {
650 int len = buf_size - frame_start;
652 unsync((unsigned char *)(buf + frame_start), &len);
653 buf_size = len + frame_start;
656 frame_header_size = 10;
657 if (header->ver_major == 2)
658 frame_header_size = 6;
660 i = frame_start;
661 while (i < buf_size - frame_header_size) {
662 struct v2_frame_header fh;
663 int len;
665 if (header->ver_major == 2) {
666 if (!v2_2_0_frame_header_parse(&fh, buf + i))
667 break;
668 } else if (header->ver_major == 3) {
669 if (!v2_3_0_frame_header_parse(&fh, buf + i))
670 break;
671 } else {
672 /* assume v2.4 */
673 if (!v2_4_0_frame_header_parse(&fh, buf + i))
674 break;
677 i += frame_header_size;
678 if (fh.size > buf_size - i) {
679 id3_debug("frame too big\n");
680 break;
683 len = fh.size;
684 if (fh.flags & V2_FRAME_UNSYNC) {
685 int tmp = len;
687 unsync((unsigned char *)(buf + i), &tmp);
688 fh.size = tmp;
690 v2_add_frame(id3, &fh, buf + i);
691 i += len;
694 free(buf);
695 return 0;
698 int id3_tag_size(const char *buf, int buf_size)
700 struct v2_header header;
702 if (buf_size < 10)
703 return 0;
704 if (v2_header_parse(&header, buf)) {
705 if (header.flags & V2_HEADER_FOOTER) {
706 /* header + data + footer */
707 id3_debug("v2.%d.%d with footer\n", header.ver_major, header.ver_minor);
708 return 10 + header.size + 10;
710 /* header */
711 id3_debug("v2.%d.%d\n", header.ver_major, header.ver_minor);
712 return 10 + header.size;
714 if (buf_size >= 3 && is_v1(buf)) {
715 id3_debug("v1\n");
716 return 128;
718 return 0;
721 ID3 *id3_new(void)
723 return xnew0(ID3, 1);
726 void id3_free(ID3 *id3)
728 int i;
730 for (i = 0; i < NUM_ID3_KEYS; i++)
731 free(id3->v2[i]);
732 free(id3);
735 int id3_read_tags(ID3 *id3, int fd, unsigned int flags)
737 off_t off;
738 int rc;
740 if (flags & ID3_V2) {
741 struct v2_header header;
742 char buf[138];
744 rc = read_all(fd, buf, 10);
745 if (rc)
746 goto rc_error;
747 if (v2_header_parse(&header, buf)) {
748 rc = v2_read(id3, fd, &header);
749 if (rc)
750 goto rc_error;
751 /* get v1 if needed */
752 } else {
753 /* get v2 from end and optionally v1 */
755 off = lseek(fd, -138, SEEK_END);
756 if (off == -1)
757 goto error;
758 rc = read_all(fd, buf, 138);
759 if (rc)
760 goto rc_error;
762 if (is_v1(buf + 10)) {
763 if (flags & ID3_V1) {
764 memcpy(id3->v1, buf + 10, 128);
765 id3->has_v1 = 1;
767 if (v2_footer_parse(&header, buf)) {
768 /* footer at end of file - 128 */
769 off = lseek(fd, -(header.size + 138), SEEK_END);
770 if (off == -1)
771 goto error;
772 rc = v2_read(id3, fd, &header);
773 if (rc)
774 goto rc_error;
776 } else if (v2_footer_parse(&header, buf + 128)) {
777 /* footer at end of file */
778 off = lseek(fd, -(header.size + 10), SEEK_END);
779 if (off == -1)
780 goto error;
781 rc = v2_read(id3, fd, &header);
782 if (rc)
783 goto rc_error;
785 return 0;
788 if (flags & ID3_V1) {
789 off = lseek(fd, -128, SEEK_END);
790 if (off == -1)
791 goto error;
792 rc = read_all(fd, id3->v1, 128);
793 if (rc)
794 goto rc_error;
795 id3->has_v1 = is_v1(id3->v1);
797 return 0;
798 error:
799 rc = -1;
800 rc_error:
801 return rc;
804 static char *v1_get_str(const char *buf, int len)
806 char in[32];
807 char *out;
808 int i;
810 for (i = len - 1; i >= 0; i--) {
811 if (buf[i] != 0 && buf[i] != ' ')
812 break;
814 if (i == -1)
815 return NULL;
816 memcpy(in, buf, i + 1);
817 in[i + 1] = 0;
818 if (u_is_valid(in))
819 return xstrdup(in);
820 if (utf8_encode(in, id3_default_charset, &out))
821 return NULL;
822 return out;
825 char *id3_get_comment(ID3 *id3, enum id3_key key)
827 if (id3->has_v2) {
828 if (id3->v2[key])
829 return xstrdup(id3->v2[key]);
831 if (id3->has_v1) {
832 switch (key) {
833 case ID3_ARTIST:
834 return v1_get_str(id3->v1 + 33, 30);
835 case ID3_ALBUM:
836 return v1_get_str(id3->v1 + 63, 30);
837 case ID3_TITLE:
838 return v1_get_str(id3->v1 + 3, 30);
839 case ID3_DATE:
840 return v1_get_str(id3->v1 + 93, 4);
841 case ID3_GENRE:
843 unsigned char idx = id3->v1[127];
845 if (idx >= NR_GENRES)
846 return NULL;
847 return xstrdup(genres[idx]);
849 case ID3_TRACK:
851 char *t;
853 if (id3->v1[125] != 0)
854 return NULL;
855 t = xnew(char, 4);
856 snprintf(t, 4, "%d", ((unsigned char *)id3->v1)[126]);
857 return t;
859 case ID3_DISC:
860 case ID3_ALBUMARTIST:
861 case NUM_ID3_KEYS:
862 return NULL;
865 return NULL;