Added README.md and LICENSE files
[audiosum.git] / src / audiosum.c
blob2531406800b5f4954b3d27f4b1159add1d40d678
1 /******************************************************************************
3 audiosum.c -- Prints the hash of an audio file excluding tag sections.
4 (C) Copyright 2001-2008 Octavio Alvarez Piza, alvarezp@alvarezp.ods.org
6 Released under GNU/GPL license. All other rights reserved.
8 ## For program operation, see the showhelp() function. ##
10 Version history before Git:
12 + Code has been restructured to avoid nested if().
13 + Code now uses MD5 instead of CRC32.
14 + Moved to Linux, so development is now focused on Linux. Sorry, Win32 folks.
15 + Added "--help" switch.
16 + 0.1.1: + Now knows how to ignore Lyrics3 sections. Since I
17 don't have MP3 files with Lyrics3v1, this feature
18 is not tested, and may not work as expected.
19 + 0.1.0: + Initial Release. Plain program, knows how to ignore
20 ID3v1.x and ID3v2.x tags. Never released.
22 ******************************************************************************/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <mhash.h>

#include "config.h"
32 /* #define DEBUG */
34 /* This code will most probably go away. */
#ifdef DEBUG
/*
 * Debug-only trace helper.  Writes "<msg>:<code as 4 hex digits>:<filename>"
 * to stderr.  Only compiled when DEBUG is defined.
 *
 * msg is const-qualified because callers pass string literals; code is
 * converted to unsigned because that is the type %X expects.
 */
void printd(const char *msg, const char *filename, int code)
{
    fprintf(stderr, "%s:%04X:%s\n", msg, (unsigned int) code, filename);
}
#else
/* Without DEBUG, every printd() call expands to a no-op expression. */
#define printd(a,b,c) ((void)0)
#endif
44 hashid default_algorithm = MHASH_MD5;
46 struct algorithms {
47 const char *cmdline_name;
48 hashid libmhash_id;
49 } algorithms[] = {
50 // { "crc32", MHASH_CRC32 },
51 // { "crc32b", MHASH_CRC32B },
52 // { "adler32", MHASH_ADLER32 },
53 // { "md2", MHASH_MD2 },
54 // { "md4", MHASH_MD4 },
55 { "md5", MHASH_MD5 },
56 { "sha1", MHASH_SHA1 },
57 { "sha224", MHASH_SHA224 },
58 { "sha256", MHASH_SHA256 },
59 { "sha384", MHASH_SHA384 },
60 { "sha512", MHASH_SHA512 },
61 { "gost", MHASH_GOST },
62 { "ripemd128", MHASH_RIPEMD128 },
63 { "ripemd160", MHASH_RIPEMD160 },
64 { "ripemd256", MHASH_RIPEMD256 },
65 { "ripemd320", MHASH_RIPEMD320 },
66 { "tiger128", MHASH_TIGER128 },
67 { "tiger160", MHASH_TIGER160 },
68 { "tiger192", MHASH_TIGER192 },
69 { "haval224", MHASH_HAVAL224 },
70 { "haval256", MHASH_HAVAL256 },
71 { "haval192", MHASH_HAVAL192 },
72 { "haval160", MHASH_HAVAL160 },
73 { "haval128", MHASH_HAVAL128 },
74 { "whirlpool", MHASH_WHIRLPOOL },
75 { "snefru128", MHASH_SNEFRU128 },
76 { "snefru256", MHASH_SNEFRU256 },
79 const algorithms_n = sizeof(algorithms) / sizeof(algorithms[0]);
/*
 * Looks for an ID3v1 tag at the end of the region [*OffsetStart, *OffsetEnd)
 * of f.
 *
 * The tag is recognised as a 128-byte trailer whose first three bytes are
 * "TAG".  When it is present, *OffsetEnd is moved back by 128 so that the
 * tag is excluded from the region.  *OffsetStart is only read.
 *
 * Returns  1 if a tag was found and *OffsetEnd was adjusted,
 *          0 if there is no tag (including a region too short to hold one),
 *         -1 on a seek or read failure.
 */
int
ProcessFileID3v1(FILE * f, unsigned long *OffsetStart,
                 unsigned long *OffsetEnd)
{
    enum { ID3V1_TAG_SIZE = 128, ID3V1_SIGNATURE_SIZE = 3 };
    unsigned char signature[ID3V1_SIGNATURE_SIZE];

    /* Guard the unsigned subtraction below: a region shorter than a tag
     * cannot contain one, and that is "not found", not an error. */
    if (*OffsetEnd < *OffsetStart
        || *OffsetEnd - *OffsetStart < (unsigned long) ID3V1_TAG_SIZE) {
        return 0;
    }

    if (fseek(f, (long) (*OffsetEnd - ID3V1_TAG_SIZE), SEEK_SET) != 0) {
        return -1;
    }

    if (fread(signature, sizeof signature, 1, f) != 1) {
        return -1;
    }

    if (memcmp(signature, "TAG", sizeof signature) != 0) {
        /* TAG not found. We are done. */
        return 0;
    }

    *OffsetEnd -= ID3V1_TAG_SIZE;
    return 1;
}
/*
 * Looks for an ID3v2 tag at the start of the region
 * [*OffsetStart, *OffsetEnd) of f.
 *
 * The 10-byte header is: "ID3", two version bytes, one flags byte and four
 * size bytes holding 7 bits each (most significant byte first).  The tag
 * occupies that size plus the 10-byte header, plus another 10 bytes when
 * flag bit 0x10 is set.  When a tag is present, *OffsetStart is advanced
 * past it.
 *
 * Returns  1 if a tag was found and *OffsetStart was adjusted,
 *          0 if there is no tag (including a file too short for a header),
 *         -1 on an I/O failure or when the declared tag size does not fit
 *            inside the region.
 */
int
ProcessFileID3v2(FILE * f, unsigned long *OffsetStart,
                 unsigned long *OffsetEnd)
{
    enum {
        ID3V2_HEADER_SIZE = 10,
        ID3V2_EXTRA_SIZE = 10,      /* added when ID3V2_FLAG_EXTRA is set */
        ID3V2_FLAG_EXTRA = 0x10,
        ID3V2_VERSION_AT = 3,
        ID3V2_FLAGS_AT = 5,
        ID3V2_SIZE_AT = 6
    };
    unsigned char header[ID3V2_HEADER_SIZE];
    unsigned long tagsize = 0;
    int i;

    if (fseek(f, (long) *OffsetStart, SEEK_SET) != 0) {
        return -1;
    }

    if (fread(header, sizeof header, 1, f) != 1) {
        /* Hitting end-of-file just means there is no room for a header. */
        return ferror(f) ? -1 : 0;
    }

    if (memcmp(header, "ID3", 3) != 0
        || header[ID3V2_VERSION_AT] == 0xFF) {
        return 0;
    }

    /* Each size byte carries 7 bits; a set high bit means this is not a
     * valid header.  Integer shifts replace the former pow() arithmetic. */
    for (i = 0; i < 4; i++) {
        unsigned char b = header[ID3V2_SIZE_AT + i];

        if (b >= 0x80) {
            return 0;
        }
        tagsize = (tagsize << 7) | b;
    }

    tagsize += ID3V2_HEADER_SIZE;
    if (header[ID3V2_FLAGS_AT] & ID3V2_FLAG_EXTRA) {
        tagsize += ID3V2_EXTRA_SIZE;
    }

    /* A tag claiming more bytes than the region holds would push
     * *OffsetStart past *OffsetEnd; report it instead. */
    if (*OffsetEnd < *OffsetStart || tagsize > *OffsetEnd - *OffsetStart) {
        return -1;
    }

    *OffsetStart += tagsize;
    return 1;
}
/*
 * Looks for a Lyrics3 v1 block at the end of the region
 * [*OffsetStart, *OffsetEnd) of f.
 *
 * The block ends with "LYRICSEND" and starts with "LYRICSBEGIN".  Since the
 * maximum lyrics length is 5100, the start marker is searched for in the
 * last 5109 bytes (5100 + strlen("LYRICSEND")) of the region, or in the
 * whole region when it is shorter than that.  When found, *OffsetEnd is
 * moved back to the first byte of "LYRICSBEGIN".
 *
 * Returns  1 if a complete block was found and *OffsetEnd was adjusted,
 *          0 if the region does not end with "LYRICSEND",
 *         -1 on an I/O failure, or when "LYRICSEND" is present but no
 *            "LYRICSBEGIN" precedes it inside the search window.
 */
int
ProcessFileLyrics3v1(FILE * f, unsigned long *OffsetStart,
                     unsigned long *OffsetEnd)
{
    enum { L3V1_END_LEN = 9, L3V1_BEGIN_LEN = 11, L3V1_WINDOW = 5109 };
    static const char begin_mark[] = "LYRICSBEGIN";
    char window[L3V1_WINDOW];
    unsigned long avail, span, window_start;
    size_t i;

    if (*OffsetEnd < *OffsetStart) {
        return 0;
    }
    avail = *OffsetEnd - *OffsetStart;
    if (avail < (unsigned long) L3V1_END_LEN) {
        /* Too short to end with the marker: nothing to strip. */
        return 0;
    }

    if (fseek(f, (long) (*OffsetEnd - L3V1_END_LEN), SEEK_SET) != 0) {
        return -1;
    }

    if (fread(window, L3V1_END_LEN, 1, f) != 1) {
        return -1;
    }

    if (memcmp(window, "LYRICSEND", L3V1_END_LEN) != 0) {
        /* We found no ending LYRICSEND tag. We are done. */
        return 0;
    }

    /* Clamp the window so short files are searched from their start
     * instead of failing on a seek before the beginning. */
    span = avail < (unsigned long) L3V1_WINDOW
        ? avail : (unsigned long) L3V1_WINDOW;
    window_start = *OffsetEnd - span;

    if (fseek(f, (long) window_start, SEEK_SET) != 0) {
        return -1;
    }

    if (fread(window, 1, (size_t) span, f) != (size_t) span) {
        return -1;
    }

    /* The window is raw file data: it is not NUL-terminated and may contain
     * NUL bytes, so compare byte ranges rather than using strstr().  Only
     * positions that leave room for the trailing LYRICSEND are candidates. */
    for (i = 0; i + L3V1_BEGIN_LEN + L3V1_END_LEN <= (size_t) span; i++) {
        if (memcmp(window + i, begin_mark, L3V1_BEGIN_LEN) == 0) {
            /* We found a valid LYRICSBEGIN signature, so we have found a
             * complete record. */
            *OffsetEnd = window_start + (unsigned long) i;
            return 1;
        }
    }

    /* We found LYRICSEND but not LYRICSBEGIN. Treat as unexpected. */
    return -1;
}
/*
 * Reads 11 bytes at absolute offset `at` and compares them with
 * "LYRICSBEGIN".  Returns 1 on a match, 0 on a mismatch, -1 on I/O failure.
 */
static int Lyrics3v2BeginAt(FILE * f, unsigned long at)
{
    char mark[11];

    if (fseek(f, (long) at, SEEK_SET) != 0) {
        return -1;
    }
    if (fread(mark, sizeof mark, 1, f) != 1) {
        return -1;
    }
    return memcmp(mark, "LYRICSBEGIN", sizeof mark) == 0;
}

/*
 * Looks for a Lyrics3 v2 block at the end of the region
 * [*OffsetStart, *OffsetEnd) of f.
 *
 * The block ends with a 6-digit decimal size followed by "LYRICS200".
 * From the Lyrics3v2 spec: the size value includes the "LYRICSBEGIN"
 * string, but does not include the 6 character size descriptor and the
 * trailing "LYRICS200" string.  Blocks whose size field also counts those
 * 15 trailing bytes are accepted as well.  When a block is found,
 * *OffsetEnd is moved back to the first byte of "LYRICSBEGIN".
 *
 * Returns  1 if a block with a conforming size field was found,
 *          2 if a block was found whose size field included the trailer,
 *          0 if the region does not end with "LYRICS200",
 *         -1 on an I/O failure or when the size field is malformed or does
 *            not lead to a "LYRICSBEGIN" marker.
 */
int
ProcessFileLyrics3v2(FILE * f, unsigned long *OffsetStart,
                     unsigned long *OffsetEnd)
{
    enum {
        L3V2_END_LEN = 9,       /* strlen("LYRICS200") */
        L3V2_SIZE_LEN = 6,
        L3V2_BEGIN_LEN = 11,    /* strlen("LYRICSBEGIN") */
        L3V2_TRAILER_LEN = 15   /* size field + end marker */
    };
    char field[L3V2_END_LEN];
    unsigned long avail, size = 0;
    int i, r;

    if (*OffsetEnd < *OffsetStart) {
        return 0;
    }
    avail = *OffsetEnd - *OffsetStart;
    if (avail < (unsigned long) L3V2_END_LEN) {
        /* Too short to end with the marker: nothing to strip. */
        return 0;
    }

    if (fseek(f, (long) (*OffsetEnd - L3V2_END_LEN), SEEK_SET) != 0) {
        return -1;
    }
    if (fread(field, L3V2_END_LEN, 1, f) != 1) {
        return -1;
    }
    if (memcmp(field, "LYRICS200", L3V2_END_LEN) != 0) {
        /* We found no ending "LYRICS200" tag. We are done. */
        return 0;
    }

    if (avail < (unsigned long) L3V2_TRAILER_LEN) {
        /* End marker present but no room for the size field. */
        return -1;
    }
    if (fseek(f, (long) (*OffsetEnd - L3V2_TRAILER_LEN), SEEK_SET) != 0) {
        return -1;
    }
    /* fread() rather than fgets(): the field is fixed-width binary-safe
     * data, and fgets() would stop early at a newline byte. */
    if (fread(field, L3V2_SIZE_LEN, 1, f) != 1) {
        return -1;
    }
    for (i = 0; i < L3V2_SIZE_LEN; i++) {
        if (field[i] < '0' || field[i] > '9') {
            return -1;
        }
        size = size * 10 + (unsigned long) (field[i] - '0');
    }

    /* Every offset below is checked against the region first so the
     * unsigned subtractions cannot wrap around. */
    if (size >= (unsigned long) L3V2_BEGIN_LEN
        && size + L3V2_TRAILER_LEN <= avail) {
        r = Lyrics3v2BeginAt(f, *OffsetEnd - size - L3V2_TRAILER_LEN);
        if (r < 0) {
            return -1;
        }
        if (r == 1) {
            /* LYRICSBEGIN found where it should be. We are done. */
            *OffsetEnd -= size + L3V2_TRAILER_LEN;
            return 1;
        }
    }

    /* LYRICSBEGIN not found where supposed to. Try as if the size mark
     * included the ending signature. */
    if (size >= (unsigned long) L3V2_BEGIN_LEN && size <= avail) {
        r = Lyrics3v2BeginAt(f, *OffsetEnd - size);
        if (r < 0) {
            return -1;
        }
        if (r == 1) {
            /* Yes, the size mark was invalid: it included the last ending
             * signature. */
            *OffsetEnd -= size;
            return 2;
        }
    }

    /* So the size mark is invalid. We didn't find the LYRICSBEGIN
     * signature. Treat as an unexpected error. */
    return -1;
}
/*
 * Returns the size of stream in bytes, measured by seeking to its end, and
 * restores the stream position that was current on entry.
 *
 * Returns 0 if the position cannot be queried or changed; previously a
 * failing ftell() result of -1 was converted into a huge unsigned size.
 */
unsigned long filesize(FILE * stream)
{
    long saved, end;

    saved = ftell(stream);
    if (saved < 0) {
        return 0;
    }

    if (fseek(stream, 0L, SEEK_END) != 0) {
        return 0;
    }
    end = ftell(stream);

    /* Put the position back even when measuring failed. */
    if (fseek(stream, saved, SEEK_SET) != 0 || end < 0) {
        return 0;
    }

    return (unsigned long) end;
}
/*
 * Prints the usage text to stdout.
 *
 * The text is built from adjacent string literals so that source
 * indentation never leaks into the output, and it uses plain "\n" line
 * endings like the rest of the program's output.
 */
void showhelp(void)
{
    static const char help[] =
        "\n"
        "usage: audiosum [options]\n"
        "\n"
        "Options:\n"
        "  -a algo    Choose a different algorithm from MD5 for hashing.\n"
        "  -l         Print the list of supported hashes.\n"
        "  -b n (%)   Brief: Only compute n percent of the size of each file.\n"
        "             If n == 0 or omitted, only print the file size.\n"
        "  -h         Shows this help.\n"
        "\n"
        "Program operation:\n"
        "  + It reads a sequence of file names from stdin (they should be MP3 files)\n"
        "    and sends to stdout the following information about them:\n"
        "      : File size, in hex format (8 chars).\n"
        "      : Hash of the file without ID3 or Lyrics tags, in hex format.\n"
        "      : What signatures were found.\n"
        "      : Complete file name.\n"
        "\n"
        "    It tries to ignore non-audio parts. Currently ignored sections are:\n"
        "      : ID3v1.x\n"
        "      : ID3v2.x\n"
        "      : Lyrics3 v1 (not tested)\n"
        "      : Lyrics3 v2.00\n"
        "\n"
        "Usage:\n"
        "  : Audiosum is designed to be wrapped by sort and uniq, like this:\n"
        "    (you can copy and paste):\n"
        "\n"
        "    find $HOME /mnt/music -iname \"*.mp3\" | \\\n"
        "      audiosum -b | sort | uniq -D -w 8 | cut -d ' ' -f 6- | \\\n"
        "      audiosum -b 2 | sort | uniq -D -w 41 | cut -d ' ' -f 7- | \\\n"
        "      audiosum | sort | uniq --all-repeated=separate -w 41 > result.txt\n"
        "\n";

    /* fputs() prints the text verbatim; it contains a literal '%'. */
    fputs(help, stdout);
}
320 void showhashes()
322 hashid i = 0;
324 printf("Supported hashes:\n\n");
325 printf(" %-10s %4s\n", "Hash", "Bits");
326 for (i = 0; i < algorithms_n; i++) {
327 printf(" : %c%-10s %4d\n",
328 algorithms[i].libmhash_id == default_algorithm? '*' : ' ',
329 algorithms[i].cmdline_name,
330 mhash_get_block_size(algorithms[i].libmhash_id)*8);
333 printf("\n * Default algorithm if -a is ommited\n\n");
336 int main(int arg_n, char *arg[])
338 char filename[1024];
340 int i;
341 MHASH td;
342 unsigned char buffer[8192];
343 unsigned char *hash;
345 hashid hash_algorithm = default_algorithm;
346 int brief=100;
347 int help=0;
348 char c;
349 int errflg=0;
350 unsigned long crc;
351 FILE *f;
352 int r;
354 while ((c = getopt(arg_n, arg, ":lb:a:h")) != -1) {
355 switch(c) {
356 case 'a':
357 for (i = 0; i < algorithms_n; i++) {
358 if (strcmp(optarg, algorithms[i].cmdline_name) == 0) {
359 hash_algorithm = algorithms[i].libmhash_id;
360 break;
363 if (i == algorithms_n) {
364 fprintf(stderr, "audiosum: unknown algorithm name: %s\n",
365 optarg);
366 return 1;
368 break;
369 case 'b':
370 brief=atoi(optarg);
371 break;
372 case 'h':
373 help++;
374 break;
375 case 'l':
376 showhashes();
377 return 0;
378 break;
379 case ':':
380 switch (optopt) {
381 case 'b':
382 brief = 0;
383 break;
384 case 'a':
385 fprintf(stderr, "audiosum: warning: unspecified hash algorithm."
386 " Using the default.\n");
387 break;
389 break;
390 case '?':
391 fprintf(stderr, "audiosum: unrecognized option: -%c\n", optopt);
392 exit(EXIT_FAILURE);
393 break;
396 if (errflg || help) {
397 showhelp();
398 if (help)
399 exit(EXIT_SUCCESS);
400 else
401 exit(EXIT_FAILURE);
404 while (fgets(filename, 1024, stdin)) {
405 if (filename[strlen(filename) - 1] == '\n') {
406 filename[strlen(filename) - 1] = '\0';
409 f = fopen(filename, "rb");
410 if (!f) {
411 fprintf(stderr, "ERROR:Opening file: %s\n", filename);
412 continue;
415 unsigned long OffsetStart = 0;
416 unsigned long OffsetEnd = filesize(f);
418 char *hadi3v1;
419 if (!(r = ProcessFileID3v1(f, &OffsetStart, &OffsetEnd)) < 0) {
420 printd("ERROR:Unexp, Analyzing ID3v1", filename, r);
421 fclose(f);
422 continue;
424 if (r == 0)
425 hadi3v1 = "----";
426 if (r == 1)
427 hadi3v1 = "I3v1";
429 char *hadi3v2;
430 if (!(r = ProcessFileID3v2(f, &OffsetStart, &OffsetEnd)) < 0) {
431 printd("ERROR:Unexp, Analyzing ID3v2", filename, r);
432 fclose(f);
433 continue;
435 if (r == 0)
436 hadi3v2 = "----";
437 if (r == 1)
438 hadi3v2 = "I3v2";
440 char *hadl3v1;
441 if (!(r = ProcessFileLyrics3v1(f, &OffsetStart, &OffsetEnd)) < 0) {
442 printd("ERROR:Unexp, Analyzing Lyrics3v1", filename, r);
443 fclose(f);
444 continue;
446 if (r == 0)
447 hadl3v1 = "----";
448 if (r == 1)
449 hadl3v1 = "L3v1";
451 char *hadl3v2;
452 if (!(r = ProcessFileLyrics3v2(f, &OffsetStart, &OffsetEnd)) < 0) {
453 printd("ERROR:Unexp, Analyzing Lyrics3v2", filename, r);
454 fclose(f);
455 continue;
457 if (r == 0)
458 hadl3v2 = "----";
459 if (r == 1)
460 hadl3v2 = "L3v2";
461 if (r == 2)
462 hadl3v2 = "l3v2";
464 fseek(f, OffsetStart, SEEK_SET);
466 printf("%08lx ", OffsetEnd - OffsetStart);
468 if (brief > 0) {
469 td = mhash_init(hash_algorithm);
471 if (td == MHASH_FAILED)
472 exit(1);
474 int howmany = ((OffsetEnd - OffsetStart + 1)*brief)/100;
475 while (howmany > 0
476 && (r =
477 fread(&buffer, 1, howmany > 8192 ? 8192 : howmany,
478 f)) > 0) {
479 mhash(td, &buffer, r);
480 howmany -= 8192;
483 hash = mhash_end(td);
485 for (i = 0; i < mhash_get_block_size(hash_algorithm); i++) {
486 printf("%.2x", hash[i]);
488 free(hash);
489 printf(" ");
492 printf("[%s] [%s] [%s] [%s] %s\n", hadi3v1, hadi3v2, hadl3v1, hadl3v2, filename);
494 fclose (f);
498 return 0;