1 /*****************************************************************************
3 *****************************************************************************
4 * Copyright (C) 2007-2011 VLC authors and VideoLAN
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU Lesser General Public License as published by
8 * the Free Software Foundation; either version 2.1 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this program; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
19 *****************************************************************************/
22 * Converts a DVB SI text item to UTF-8.
23 * Refer to ETSI EN 300 468 annex A.
24 * @return a heap-allocated nul-terminated UTF-8 string or NULL on error.
26 static char *vlc_from_EIT (const void *buf
, size_t length
)
28 if (unlikely(length
== 0))
32 const char *encoding
= encbuf
;
36 unsigned char c
= *in
;
41 encoding
= "ISO_6937";
43 else if ((1 << c
) & 0x0EFE) /* 1-7, 9-11 -> ISO 8859-(c+4) */
45 snprintf (encbuf
, sizeof (encbuf
), "ISO_8859-%u", 4u + c
);
49 case 0x10: /* two more bytes */
51 if (length
< 3 || in
[1] != 0x00)
55 if ((1 << c
) & 0xEFFE) /* 1-11, 13-15 -> ISO 8859-(c) */
56 snprintf (encbuf
, sizeof (encbuf
), "ISO_8859-%hhu", c
);
60 case 0x11: /* the BMP */
61 case 0x14: /* Big5 subset of the BMP */
65 /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
66 * character set, not a character encoding... So we assume EUC-KR.
67 * It is an encoding of KS X 1001. In practice, I guess nobody uses
68 * this in any real DVB system. */
71 case 0x13: /* GB-2312-1980 */
78 case 0x1F: /* operator-specific(?) */
88 char *out
= FromCharset (encoding
, in
, length
);
91 out
= strndup (in
, length
);
92 if (unlikely(out
== NULL
))
98 /* Convert control codes */
99 for (char *p
= strchr (out
, '\xC2'); p
; p
= strchr (p
+ 1, '\xC2'))
101 /* We have valid UTF-8, so 0xC2 is followed by a continuation byte. */
102 /* 0x80-0x85,0x88-0x89 are reserved.
103 * 0x86-0x87 are identical to Unicode and Latin-1.
105 * 0x8B-0x9F are unspecified. */
107 memcpy (p
, "\r\n", 2);
109 /* Strip character emphasis */
110 if (p
[1] == '\x86' || p
[1] == '\x87') {
111 const size_t n
= p
- out
;
112 memmove (p
, p
+2, length
- n
);
120 /* Private use area */
121 for (char *p
= strchr (out
, '\xEE'); p
; p
= strchr (p
+ 1, '\xEE'))
123 /* Within UTF-8, 0xEE is followed by two continuation bytes. */
127 memcpy (p
, "\r\r\n", 3); /* we need three bytes, so two CRs ;) */
129 /* Strip character emphasis */
130 if (p
[2] == '\x86' || p
[2] == '\x87') {
131 const size_t n
= p
- out
;
132 memmove (p
, p
+3, length
- n
);