From dd9c51f83acab7a1fc04add156ebe238f9e7f949 Mon Sep 17 00:00:00 2001 From: ketmar Date: Sun, 28 Jul 2013 09:09:00 +0000 Subject: [PATCH] tagscan: fixed short name generator FossilOrigin-Name: 7553e59ced8df92ace886d458eb98612361248b67fb31607aa5d68507a7018e0 --- src/tagscan.c | 253 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 138 insertions(+), 115 deletions(-) diff --git a/src/tagscan.c b/src/tagscan.c index 1dccc97..b286c57 100644 --- a/src/tagscan.c +++ b/src/tagscan.c @@ -47,11 +47,78 @@ //////////////////////////////////////////////////////////////////////////////// +static const unsigned char utf8Length[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef + 4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8 //0xf0-0xff +}; + + +static inline int is_possible_utf8 (const void *buf) { + for (const unsigned char *data = (const unsigned char *)buf; *data; ++data) if (*data >= 128) return 1; + return 0; +} + + +static inline int is_valid_utf8 (const void *buf) { + const unsigned char *data = (const unsigned char *)buf; + while (*data) { + unsigned char len = utf8Length[*data]; + uint32_t uc; + switch (len) { + case 0: // ascii + data += 1; + continue; + case 8: case 9: // invalid + return 0; + } + // utf-8 + uc = (*data++)&(0x7c>>len); + while (--len) { + if (!data[0]) return 0; + if (utf8Length[*data] != 9) return 0; + uc = (uc<<6)|((*data++)&0x3f); + } + if (uc > 0x10ffff) return 0; + if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16/utf32 surrogates + (uc >= 0xfdd0 && uc <= 0xfdef) || // just for fun + (uc >= 0xfffe && uc <= 0xffff)) return 0; // bad unicode + } + return 1; +} + + +static inline int notutf (const char *str) { + int havehi = 0; + for (const unsigned char *s = (const unsigned char *)str; *s; ++s) if (*s >= 128) { havehi = 1; break; } + if (havehi) { + //printf("[%s]: %d\n", str, is_valid_utf8(str)); + return (is_valid_utf8(str) == 0); + } + return 0; +} + + +//////////////////////////////////////////////////////////////////////////////// static void trimstr (char *s) { if (s != NULL && s[0]) { + //char *olds = strdup(s), *xs = s; char *ns, *p; // change all '/' to underlines - for (; *s; ++s) if (*s == '/' || *s == 127) *s = '_'; + for (ns = s; *ns; ++ns) if (*ns == '/' || *ns == 127) *ns = '_'; // remove duplicate underlines for (ns = p = s; *ns; ++ns) { if (*ns == '_') { @@ -77,6 +144,8 @@ static void trimstr (char *s) { // trim leading spaces and underlines for (ns = s; *ns && (leIsSpace(*ns) || *ns == '_'); ++ns) ; if (ns != s) memmove(s, ns, strlen(ns)+1); + // + //fprintf(stderr, "trans: [%s] -> [%s]\n", olds, xs); free(olds); } } @@ -85,37 +154,51 @@ static void trimstr (char *s) { static int opt_koi8 = 1; -static char *str_transliterate (const char *str) { +#define xalloca(ptr,msize) do { \ + size_t msz = (msize); \ + (ptr) = alloca(msz); \ + memset((ptr), 0, msz); \ +} while (0) + + +static char *str_transliterate (const char *str, int inkoi) { iconv_t cd; size_t il, ol, ool; char *outs, *ibuf, *obuf, *res; int asis = 1; if (str == NULL) return strdup(""); - if (str == NULL || !str[0]) { outs = strdup(str); goto done; } + //if (!str[0]) { outs = strdup(str); goto done; } for (const unsigned char *u = (const unsigned char *)str; *u; ++u) if (*u >= 128) { asis = 0; break; } if (asis) { outs = strdup(str); goto done; } - cd = iconv_open("koi8-u//translit//ignore", "utf-8"); - if (cd == (iconv_t)-1) return NULL; - outs = calloc(1, strlen(str)*6+4); - if (outs == NULL) { + // + if (!inkoi) { + fprintf(stderr, "str_transliterate: \"%s\" (%d)\n", str, is_valid_utf8(str)); + cd = iconv_open("koi8-u//translit//ignore", "utf-8"); + if (cd == (iconv_t)-1) { + fprintf(stderr, "FATAL: can't create conversion object!\n"); + abort(); + return NULL; + } + xalloca(outs, strlen(str)*6+4); + ibuf = (char *)str; + obuf = outs; + il = strlen(str); + ool = ol = il*4; + il = iconv(cd, &ibuf, &il, &obuf, &ol); iconv_close(cd); - return NULL; - } - ibuf = (char *)str; - obuf = outs; - il = strlen(str); - ool = ol = il*4; - il = iconv(cd, &ibuf, &il, &obuf, &ol); - iconv_close(cd); - if (il == (size_t)-1) { - free(outs); - return NULL; + if (il == (size_t)-1) { + fprintf(stderr, "CONVERSION FUCKED: [%s]\n", str); + abort(); + return NULL; + } + xalloca(res, ool-ol+1); + if (ool-ol > 0) memcpy(res, outs, ool-ol); + //free(outs); + outs = translitstr(res); + //free(res); + } else { + outs = translitstr(str); } - res = calloc(ool-ol+1, 1); - if (ool-ol > 0) memcpy(res, outs, ool-ol); - free(outs); - outs = translitstr(res); - free(res); done: for (char *s = outs; *s; ++s) { *s = le2lower(*s); @@ -296,13 +379,15 @@ static char *str_convert (const char *str, const char *def, const char *fname, i ostr = str = def; } if (toutf) { - char *t = w2u(str, 0); + char *t = w2u(str, 0), *t1; if (t == NULL) { dlogf("FUCK! %s: [%s]\n", fname, str); abort(); } + t1 = str_tokoi(t); + free(t); //printf("u: [%s] -> [%s]\n", str, t); - ostr = str = t; + ostr = str = t1; } s = ss = alloca(strlen(str)+1); while (*str && leIsSpace(*str)) ++str; @@ -320,7 +405,8 @@ static char *str_convert (const char *str, const char *def, const char *fname, i } *ss = 0; while (*s && leIsSpace(s[strlen(s)-1])) s[strlen(s)-1] = 0; - if (strcmp(s, ostr) != 0) dlogf("%s: [%s] -> [%s]\n", fname, ostr, s); + //if (strcmp(s, ostr) != 0) dlogf("%s: [%s] -> [%s]\n", fname, ostr, s); + //fprintf(stderr, "strconvert: [%s] -> [%s] (%d) (%d)\n", ostr, s, is_valid_utf8(ostr), is_valid_utf8(s)); if (toutf) free((void *)ostr); memset(res, 0, sizeof(res)); if (!s[0]) { @@ -333,72 +419,6 @@ static char *str_convert (const char *str, const char *def, const char *fname, i //////////////////////////////////////////////////////////////////////////////// -static const unsigned char utf8Length[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef - 4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8 //0xf0-0xff -}; - - -static inline int is_possible_utf8 (const void *buf) { - for (const unsigned char *data = (const unsigned char *)buf; *data; ++data) if (*data >= 128) return 1; - return 0; -} - - -static inline int is_valid_utf8 (const void *buf) { - const unsigned char *data = (const unsigned char *)buf; - while (*data) { - unsigned char len = utf8Length[*data]; - uint32_t uc; - switch (len) { - case 0: // ascii - data += 1; - continue; - case 8: case 9: // invalid - return 0; - } - // utf-8 - uc = (*data++)&(0x7c>>len); - while (--len) { - if (!data[0]) return 0; - if (utf8Length[*data] != 9) return 0; - uc = (uc<<6)|((*data++)&0x3f); - } - if (uc > 0x10ffff) return 0; - if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16/utf32 surrogates - (uc >= 0xfdd0 && uc <= 0xfdef) || // just for fun - (uc >= 0xfffe && uc <= 0xffff)) return 0; // bad unicode - } - return 1; -} - - -static inline int notutf (const char *str) { - int havehi = 0; - for (const unsigned char *s = (const unsigned char *)str; *s; ++s) if (*s >= 128) { havehi = 1; break; } - if (havehi) { - //printf("[%s]: %d\n", str, is_valid_utf8(str)); - return (is_valid_utf8(str) == 0); - } - return 0; -} - - -//////////////////////////////////////////////////////////////////////////////// typedef struct { uint16_t len; char *str; @@ -531,8 +551,9 @@ static tagvalue_t *tag_add (const char *str, string_t *oval) { for (tvx = taglist; tvx != NULL; tvx = tvx->hh.next) if (strcmp(tvx->nval, nn) == 0) break; if (tvx != NULL) str = tvx->nval; // found one, change free(nn); + //fprintf(stderr, "tag_trans: [%s]\n", str); // and transliterate anyway - str = str_transliterate(str); + str = str_transliterate(str, 1); } value = cstring_add(str); HASH_FIND_PTR(taglist, &value, tv); @@ -621,41 +642,39 @@ static void fileinfo_postprocess (void) { string_t *snv; const char *onamep = strrchr(fi->realname->str, '/')+1; // this CAN'T fail! const char *oext = strrchr(onamep, '.'); - char *trs; + char *trs, *tmp = NULL; // transliterate utf-8 or koi if (is_possible_utf8(onamep)) { - char *ext; if (is_valid_utf8(onamep)) { // convert to koi-8 - char *koi = str_tokoi(onamep); - if (koi == NULL) { fprintf(stderr, "FATAL: fileinfo_postprocess() failed to convert encoding!\n"); abort(); } - trs = translitstr(koi); - free(koi); - } else { - trs = translitstr(onamep); - } - // trim extension - if ((ext = strrchr(trs, '.')) != NULL) { - if (strlen(ext) > strlen(oext)) { - // make room for new extension (rare case) - trs = realloc(trs, strlen(trs)+1+strlen(ext)+32); - if (trs == NULL) { fprintf(stderr, "FATAL: out of memory in fileinfo_postprocess()!\n"); abort(); } - } - *ext = 0; + tmp = str_tokoi(onamep); + if (tmp == NULL) { fprintf(stderr, "FATAL: fileinfo_postprocess() failed to convert encoding!\n"); abort(); } } - } else { - // no russian letters here - trs = strdup(onamep); } + if (tmp == NULL) tmp = strdup(onamep); + // normalize + if (oext != NULL) strrchr(tmp, '.')[0] = 0; + trs = str_transliterate(tmp, 1); + trimstr(trs); + if (oext != NULL) { + //fprintf(stderr, "tmp=%p; trs=%p; oext=%p\n", tmp, trs, oext); + tmp = realloc(tmp, strlen(trs)+strlen(oext)+4); + sprintf(tmp, "%s%s", trs, oext); + for (char *s = tmp+strlen(trs); *s; ++s) *s = le2lower(*s); + free(trs); + trs = tmp; + } + /* // fuck all nonalnums, convert to lowercase for (unsigned char *s = (unsigned char *)trs; *s; ++s) { if (*s == 127 || (*s < 128 || !leIsAlNum(*s))) *s = '_'; *s = le2lower(*s); } + */ // trim and normalize name - trimstr(trs); + //trimstr(trs); // add extension back - if (oext != NULL) strcat(trs, oext); + //if (oext != NULL) strcat(trs, oext); // add track number if any if (fi->tracknum > 0) { int tno = 0; @@ -705,6 +724,7 @@ static void fileinfo_postprocess (void) { fi->shortname = snv; fi->snamestr = snv->str; HASH_ADD_KEYPTR(hh_sname, snhash, fi->snamestr, strlen(fi->snamestr), fi); + fprintf(stderr, "[%s] -> [%s]\n", onamep, fi->shortname->str); } // now delete hash, we don't need it anymore HASH_CLEAR(hh_sname, snhash); @@ -1072,6 +1092,9 @@ int main (int argc, char *argv[]) { //taglib_id3v2_set_default_text_encoding(TagLib_ID3v2_UTF8); // used only for writing //oldfc = fcount; for (int f = 2; f < argc; ++f) process_dir(argv[f]); + fileinfo_postprocess(); + tags_postprocess(); + printf("%d files, %d strings, %d tags.\n", HASH_COUNT(filelist), HASH_COUNT(strings), HASH_COUNT(taglist)); //write_tagfile(argv[1]); //printf("%d new files processed\n", fcount-oldfc); } -- 2.11.4.GIT