modules/demux/dvb-text.h

   1 /*****************************************************************************
   2  * dvb-text.h:
   3  *****************************************************************************
   4  * Copyright (C) 2007-2011 VLC authors and VideoLAN
   5  *
   6  * This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU Lesser General Public License as published by
   8  * the Free Software Foundation; either version 2.1 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14  * GNU Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public License
  17  * along with this program; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  19  *****************************************************************************/
  20
  21 /**
  22  * Converts a DVB SI text item to UTF-8.
  23  * Refer to EN 800 486 annex A.
  24  * @return a heap-allocation nul-terminated UTF-8 string or NULL on error.
  25  */
  26 static char *vlc_from_EIT (const void *buf, size_t length)
  27 {
  28     if (unlikely(length == 0))
  29         return NULL;
  30
  31     char encbuf[12];
  32     const char *encoding = encbuf;
  33
  34     const char *in = buf;
  35     size_t offset = 1;
  36     unsigned char c = *in;
  37
  38     if (c >= 0x20)
  39     {
  40         offset = 0;
  41         encoding = "ISO_6937";
  42     }
  43     else if ((1 << c) & 0x0EFE) /* 1-7, 9-11 -> ISO 8859-(c+4) */
  44     {
  45         snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c);
  46     }
  47     else switch (c)
  48     {
  49         case 0x10: /* two more bytes */
  50             offset = 3;
  51             if (length < 3 || in[1] != 0x00)
  52                 return NULL;
  53
  54             c = in[2];
  55             if ((1 << c) & 0xEFFE) /* 1-11, 13-15 -> ISO 8859-(c) */
  56                snprintf (encbuf, sizeof (encbuf), "ISO_8859-%hhu", c);
  57            else
  58                return NULL;
  59            break;
  60         case 0x11: /* the BMP */
  61         case 0x14: /* Big5 subset of the BMP */
  62             encoding = "UCS-2BE";
  63             break;
  64         case 0x12:
  65             /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
  66              * character set, not a character encoding... So we assume EUC-KR.
  67              * It is an encoding of KS X 1001. In practice, I guess nobody uses
  68              * this in any real DVB system. */
  69             encoding = "EUC-KR";
  70             break;
  71         case 0x13: /* GB-2312-1980 */
  72             encoding = "GB2312";
  73             break;
  74         case 0x15:
  75             encoding = "UTF-8";
  76             break;
  77 #if 0
  78         case 0x1F: /* operator-specific(?) */
  79             offset = 2;
  80 #endif
  81         default:
  82             return NULL;
  83     }
  84
  85     in += offset;
  86     length -= offset;
  87
  88     char *out = FromCharset (encoding, in, length);
  89     if (out == NULL)
  90     {   /* Fallback... */
  91         out = strndup (in, length);
  92         if (unlikely(out == NULL))
  93             return NULL;
  94         EnsureUTF8 (out);
  95     }
  96
  97     length = strlen(out);
  98     /* Convert control codes */
  99     for (char *p = strchr (out, '\xC2'); p; p = strchr (p + 1, '\xC2'))
 100     {
 101         /* We have valid UTF-8, to 0xC2 is followed by a continuation byte. */
 102         /* 0x80-0x85,0x88-0x89 are reserved.
 103          * 0x86-0x87 are identical to Unicode and Latin-1.
 104          * 0x8A is CR/LF.
 105          * 0x8B-0x9F are unspecified. */
 106         if (p[1] == '\x8A')
 107             memcpy (p, "\r\n", 2);
 108
 109         /* Strip character emphasis */
 110         if (p[1] == '\x86' || p[1] == '\x87') {
 111             const size_t n = p - out;
 112             memmove (p, p+2, length - n);
 113             length -= 2;
 114             out[length] = '\0';
 115             if (length == n)
 116                 break;
 117         }
 118     }
 119
 120     /* Private use area */
 121     for (char *p = strchr (out, '\xEE'); p; p = strchr (p + 1, '\xEE'))
 122     {
 123         /* Within UTF-8, 0xEE is followed by a two continuation bytes. */
 124         if (p[1] != '\x82')
 125             continue;
 126         if (p[2] == '\x8A')
 127             memcpy (p, "\r\r\n", 3); /* we need three bytes, so to CRs ;) */
 128
 129         /* Strip character emphasis */
 130         if (p[2] == '\x86' || p[2] == '\x87') {
 131             const size_t n = p - out;
 132             memmove (p, p+3, length - n);
 133             length -= 3;
 134             out[length] = '\0';
 135             if (length == n)
 136                 break;
 137         }
 138     }
 139
 140     return out;
 141 }