From dd9c51f83acab7a1fc04add156ebe238f9e7f949 Mon Sep 17 00:00:00 2001
From: ketmar <ketmar@ketmar.no-ip.org>
Date: Sun, 28 Jul 2013 09:09:00 +0000
Subject: [PATCH] tagscan: fixed short name generator

FossilOrigin-Name: 7553e59ced8df92ace886d458eb98612361248b67fb31607aa5d68507a7018e0
---
 src/tagscan.c | 253 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 138 insertions(+), 115 deletions(-)

diff --git a/src/tagscan.c b/src/tagscan.c
index 1dccc97..b286c57 100644
--- a/src/tagscan.c
+++ b/src/tagscan.c
@@ -47,11 +47,78 @@
 
 
 ////////////////////////////////////////////////////////////////////////////////
+static const unsigned char utf8Length[256] = { // byte class: 0 = ascii; 2..4 = lead byte of a sequence that long; 8, 9 = rejected as a lead byte
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f
+  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f  9: continuation byte; is_valid_utf8() accepts it only inside a sequence
+  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f
+  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf
+  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf  c0-c1: overlong encoding (start of a 2-byte sequence, but code point <= 127); note: still classed as 2 here
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef
+  4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8  //0xf0-0xff  f5-ff: 8 = never accepted as a lead byte
+};
+
+// returns 1 if the NUL-terminated `buf` has at least one byte >= 128, else 0
+static inline int is_possible_utf8 (const void *buf) {
+  for (const unsigned char *p = (const unsigned char *)buf; *p != 0; ++p) if ((*p&0x80) != 0) return 1; // high bit set
+  return 0; // reached the terminator: pure 7-bit text (or empty string)
+}
+
+// returns 1 if the NUL-terminated `buf` is well-formed utf-8, 0 otherwise
+static inline int is_valid_utf8 (const void *buf) {
+  // smallest code point that really needs a sequence of the given length
+  static const uint32_t minuc[5] = {0, 0, 0x80, 0x800, 0x10000};
+  const unsigned char *data = (const unsigned char *)buf;
+  while (*data) {
+    const unsigned char len = utf8Length[*data];
+    uint32_t uc;
+    if (len == 0) { ++data; continue; } // plain ascii
+    if (len >= 8) return 0; // stray continuation byte or impossible lead byte
+    // lead byte: keep its payload bits (mask is 0x1f, 0x0f or 0x07 for len 2, 3, 4)
+    uc = (*data++)&(0x7c>>len);
+    for (unsigned char left = len-1; left > 0; --left) {
+      // every trailing byte must be of class 9; the terminating NUL is class 0,
+      // so a truncated sequence is rejected by the same test
+      if (utf8Length[*data] != 9) return 0;
+      uc = (uc<<6)|((*data++)&0x3f);
+    }
+    // overlong form: the code point would have fit in a shorter sequence
+    if (uc < minuc[len]) return 0;
+    if (uc > 0x10ffff) return 0; // beyond the unicode range
+    if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16 surrogates
+        (uc >= 0xfdd0 && uc <= 0xfdef) || // noncharacters
+        (uc >= 0xfffe && uc <= 0xffff)) return 0; // noncharacters
+  }
+  return 1;
+}
+
+// returns 1 if `str` has bytes >= 128 that do not form valid utf-8, else 0
+static inline int notutf (const char *str) {
+  const unsigned char *s = (const unsigned char *)str;
+  // skip the 7-bit prefix
+  while (*s != 0 && *s < 128) ++s;
+  // reached the terminator: pure 7-bit text is never reported
+  if (*s == 0) return 0;
+  // there are high-bit bytes, so the whole string must pass validation
+  return !is_valid_utf8(str);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
 static void trimstr (char *s) {
   if (s != NULL && s[0]) {
+    //char *olds = strdup(s), *xs = s;
     char *ns, *p;
     // change all '/' to underlines
-    for (; *s; ++s) if (*s == '/' || *s == 127) *s = '_';
+    for (ns = s; *ns; ++ns) if (*ns == '/' || *ns == 127) *ns = '_';
     // remove duplicate underlines
     for (ns = p = s; *ns; ++ns) {
       if (*ns == '_') {
@@ -77,6 +144,8 @@ static void trimstr (char *s) {
     // trim leading spaces and underlines
     for (ns = s; *ns && (leIsSpace(*ns) || *ns == '_'); ++ns) ;
     if (ns != s) memmove(s, ns, strlen(ns)+1);
+    //
+    //fprintf(stderr, "trans: [%s] -> [%s]\n", olds, xs); free(olds);
   }
 }
 
@@ -85,37 +154,51 @@ static void trimstr (char *s) {
 static int opt_koi8 = 1;
 
 
-static char *str_transliterate (const char *str) {
+// stack-allocate `msize` zero-filled bytes via alloca() and store the pointer in `ptr`; never free() the result
+#define xalloca(ptr,msize)  do { \
+  const size_t xalloca_msz_ = (msize); /* evaluate the size once; odd name so a caller's `msz` is not captured */ \
+  (ptr) = alloca(xalloca_msz_); \
+  memset((ptr), 0, xalloca_msz_); \
+} while (0)
+
+static char *str_transliterate (const char *str, int inkoi) {
   iconv_t cd;
   size_t il, ol, ool;
   char *outs, *ibuf, *obuf, *res;
   int asis = 1;
   if (str == NULL) return strdup("");
-  if (str == NULL || !str[0]) { outs = strdup(str); goto done; }
+  //if (!str[0]) { outs = strdup(str); goto done; }
   for (const unsigned char *u = (const unsigned char *)str; *u; ++u) if (*u >= 128) { asis = 0; break; }
   if (asis) { outs = strdup(str); goto done; }
-  cd = iconv_open("koi8-u//translit//ignore", "utf-8");
-  if (cd == (iconv_t)-1) return NULL;
-  outs = calloc(1, strlen(str)*6+4);
-  if (outs == NULL) {
+  //
+  if (!inkoi) {
+    fprintf(stderr, "str_transliterate: \"%s\" (%d)\n", str, is_valid_utf8(str));
+    cd = iconv_open("koi8-u//translit//ignore", "utf-8");
+    if (cd == (iconv_t)-1) {
+      fprintf(stderr, "FATAL: can't create conversion object!\n");
+      abort();
+      return NULL;
+    }
+    xalloca(outs, strlen(str)*6+4);
+    ibuf = (char *)str;
+    obuf = outs;
+    il = strlen(str);
+    ool = ol = il*4;
+    il = iconv(cd, &ibuf, &il, &obuf, &ol);
     iconv_close(cd);
-    return NULL;
-  }
-  ibuf = (char *)str;
-  obuf = outs;
-  il = strlen(str);
-  ool = ol = il*4;
-  il = iconv(cd, &ibuf, &il, &obuf, &ol);
-  iconv_close(cd);
-  if (il == (size_t)-1) {
-    free(outs);
-    return NULL;
+    if (il == (size_t)-1) {
+      fprintf(stderr, "CONVERSION FUCKED: [%s]\n", str);
+      abort();
+      return NULL;
+    }
+    xalloca(res, ool-ol+1);
+    if (ool-ol > 0) memcpy(res, outs, ool-ol);
+    //free(outs);
+    outs = translitstr(res);
+    //free(res);
+  } else {
+    outs = translitstr(str);
   }
-  res = calloc(ool-ol+1, 1);
-  if (ool-ol > 0) memcpy(res, outs, ool-ol);
-  free(outs);
-  outs = translitstr(res);
-  free(res);
 done:
   for (char *s = outs; *s; ++s) {
     *s = le2lower(*s);
@@ -296,13 +379,15 @@ static char *str_convert (const char *str, const char *def, const char *fname, i
     ostr = str = def;
   }
   if (toutf) {
-    char *t = w2u(str, 0);
+    char *t = w2u(str, 0), *t1;
     if (t == NULL) {
       dlogf("FUCK! %s: [%s]\n", fname, str);
       abort();
     }
+    t1 = str_tokoi(t);
+    free(t);
     //printf("u: [%s] -> [%s]\n", str, t);
-    ostr = str = t;
+    ostr = str = t1;
   }
   s = ss = alloca(strlen(str)+1);
   while (*str && leIsSpace(*str)) ++str;
@@ -320,7 +405,8 @@ static char *str_convert (const char *str, const char *def, const char *fname, i
   }
   *ss = 0;
   while (*s && leIsSpace(s[strlen(s)-1])) s[strlen(s)-1] = 0;
-  if (strcmp(s, ostr) != 0) dlogf("%s: [%s] -> [%s]\n", fname, ostr, s);
+  //if (strcmp(s, ostr) != 0) dlogf("%s: [%s] -> [%s]\n", fname, ostr, s);
+  //fprintf(stderr, "strconvert: [%s] -> [%s] (%d) (%d)\n", ostr, s, is_valid_utf8(ostr), is_valid_utf8(s));
   if (toutf) free((void *)ostr);
   memset(res, 0, sizeof(res));
   if (!s[0]) {
@@ -333,72 +419,6 @@ static char *str_convert (const char *str, const char *def, const char *fname, i
 
 
 ////////////////////////////////////////////////////////////////////////////////
-static const unsigned char utf8Length[256] = {
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f
-  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f
-  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f
-  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf
-  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf  c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf
-  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef
-  4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8  //0xf0-0xff
-};
-
-
-static inline int is_possible_utf8 (const void *buf) {
-  for (const unsigned char *data = (const unsigned char *)buf; *data; ++data) if (*data >= 128) return 1;
-  return 0;
-}
-
-
-static inline int is_valid_utf8 (const void *buf) {
-  const unsigned char *data = (const unsigned char *)buf;
-  while (*data) {
-    unsigned char len = utf8Length[*data];
-    uint32_t uc;
-    switch (len) {
-      case 0: // ascii
-        data += 1;
-        continue;
-      case 8: case 9: // invalid
-        return 0;
-    }
-    // utf-8
-    uc = (*data++)&(0x7c>>len);
-    while (--len) {
-      if (!data[0]) return 0;
-      if (utf8Length[*data] != 9) return 0;
-      uc = (uc<<6)|((*data++)&0x3f);
-    }
-    if (uc > 0x10ffff) return 0;
-    if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16/utf32 surrogates
-        (uc >= 0xfdd0 && uc <= 0xfdef) || // just for fun
-        (uc >= 0xfffe && uc <= 0xffff)) return 0; // bad unicode
-  }
-  return 1;
-}
-
-
-static inline int notutf (const char *str) {
-  int havehi = 0;
-  for (const unsigned char *s = (const unsigned char *)str; *s; ++s) if (*s >= 128) { havehi = 1; break; }
-  if (havehi) {
-    //printf("[%s]: %d\n", str, is_valid_utf8(str));
-    return (is_valid_utf8(str) == 0);
-  }
-  return 0;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
 typedef struct {
   uint16_t len;
   char *str;
@@ -531,8 +551,9 @@ static tagvalue_t *tag_add (const char *str, string_t *oval) {
     for (tvx = taglist; tvx != NULL; tvx = tvx->hh.next) if (strcmp(tvx->nval, nn) == 0) break;
     if (tvx != NULL) str = tvx->nval; // found one, change
     free(nn);
+    //fprintf(stderr, "tag_trans: [%s]\n", str);
     // and transliterate anyway
-    str = str_transliterate(str);
+    str = str_transliterate(str, 1);
   }
   value = cstring_add(str);
   HASH_FIND_PTR(taglist, &value, tv);
@@ -621,41 +642,39 @@ static void fileinfo_postprocess (void) {
     string_t *snv;
     const char *onamep = strrchr(fi->realname->str, '/')+1; // this CAN'T fail!
     const char *oext = strrchr(onamep, '.');
-    char *trs;
+    char *trs, *tmp = NULL;
     // transliterate utf-8 or koi
     if (is_possible_utf8(onamep)) {
-      char *ext;
       if (is_valid_utf8(onamep)) {
         // convert to koi-8
-        char *koi = str_tokoi(onamep);
-        if (koi == NULL) { fprintf(stderr, "FATAL: fileinfo_postprocess() failed to convert encoding!\n"); abort(); }
-        trs = translitstr(koi);
-        free(koi);
-      } else {
-        trs = translitstr(onamep);
-      }
-      // trim extension
-      if ((ext = strrchr(trs, '.')) != NULL) {
-        if (strlen(ext) > strlen(oext)) {
-          // make room for new extension (rare case)
-          trs = realloc(trs, strlen(trs)+1+strlen(ext)+32);
-          if (trs == NULL) { fprintf(stderr, "FATAL: out of memory in fileinfo_postprocess()!\n"); abort(); }
-        }
-        *ext = 0;
+        tmp = str_tokoi(onamep);
+        if (tmp == NULL) { fprintf(stderr, "FATAL: fileinfo_postprocess() failed to convert encoding!\n"); abort(); }
       }
-    } else {
-      // no russian letters here
-      trs = strdup(onamep);
     }
+    if (tmp == NULL) tmp = strdup(onamep);
+    // normalize
+    if (oext != NULL) strrchr(tmp, '.')[0] = 0;
+    trs = str_transliterate(tmp, 1);
+    trimstr(trs);
+    if (oext != NULL) {
+      //fprintf(stderr, "tmp=%p; trs=%p; oext=%p\n", tmp, trs, oext);
+      tmp = realloc(tmp, strlen(trs)+strlen(oext)+4);
+      sprintf(tmp, "%s%s", trs, oext);
+      for (char *s = tmp+strlen(trs); *s; ++s) *s = le2lower(*s);
+      free(trs);
+      trs = tmp;
+    }
+    /*
     // fuck all nonalnums, convert to lowercase
     for (unsigned char *s = (unsigned char *)trs; *s; ++s) {
       if (*s == 127 || (*s < 128 || !leIsAlNum(*s))) *s = '_';
       *s = le2lower(*s);
     }
+    */
     // trim and normalize name
-    trimstr(trs);
+    //trimstr(trs);
     // add extension back
-    if (oext != NULL) strcat(trs, oext);
+    //if (oext != NULL) strcat(trs, oext);
     // add track number if any
     if (fi->tracknum > 0) {
       int tno = 0;
@@ -705,6 +724,7 @@ static void fileinfo_postprocess (void) {
     fi->shortname = snv;
     fi->snamestr = snv->str;
     HASH_ADD_KEYPTR(hh_sname, snhash, fi->snamestr, strlen(fi->snamestr), fi);
+    fprintf(stderr, "[%s] -> [%s]\n", onamep, fi->shortname->str);
   }
   // now delete hash, we don't need it anymore
   HASH_CLEAR(hh_sname, snhash);
@@ -1072,6 +1092,9 @@ int main (int argc, char *argv[]) {
     //taglib_id3v2_set_default_text_encoding(TagLib_ID3v2_UTF8); // used only for writing
     //oldfc = fcount;
     for (int f = 2; f < argc; ++f) process_dir(argv[f]);
+    fileinfo_postprocess();
+    tags_postprocess();
+    printf("%d files, %d strings, %d tags.\n", HASH_COUNT(filelist), HASH_COUNT(strings), HASH_COUNT(taglist));
     //write_tagfile(argv[1]);
     //printf("%d new files processed\n", fcount-oldfc);
   }
-- 
2.11.4.GIT