some updates
[iv.d.git] / utfutil.d
blobad75b92f0a06904100994a3d9e12c4671746a97d
1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 * 2. Altered source versions must be plainly marked as such, and must not be
16 * misrepresented as being the original software.
17 * 3. This notice may not be removed or altered from any source distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // UTF-8 utilities (there will be more soon ;-)
24 module iv.utfutil /*is aliced*/;
25 import iv.alice;
28 // ////////////////////////////////////////////////////////////////////////// //
/// rough check: any byte that is not a continuation byte (10xxxxxx) is accepted as a possible sequence start
bool isValidUtf8Start (ubyte b) pure nothrow @safe @nogc {
  pragma(inline, true);
  // top bits 00/01 are ASCII, 11 is a multibyte lead; only 10 is refused
  return ((b&0xC0) != 0x80);
}
31 // ////////////////////////////////////////////////////////////////////////// //
/// does this char start a multibyte UTF-8 sequence (both top bits set)?
bool isUtf8Start() (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  return (ch >= 0xC0);
}
/// is this char a UTF-8 continuation byte (bit pattern 10xxxxxx)?
bool isUtf8Cont() (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  return (ch >= 0x80 && ch < 0xC0);
}
36 // ////////////////////////////////////////////////////////////////////////// //
/// Table-driven (DFA) UTF-8 decoder; its whole state is two 32-bit fields, i.e. 8 bytes.
/// Code points from invalid ranges are never accepted: that is a property of the state machine itself.
align(1) struct Utf8DecoderFast {
align(1):
public:
  enum dchar replacement = '\uFFFD'; /// what `decodeSafe` produces for malformed input

  /// is `c` a valid code point (not a surrogate, not above U+10FFFF)?
  static bool isValidDC (dchar c) pure nothrow @safe @nogc {
    pragma(inline, true);
    return (c <= 0x10FFFF && (c < 0xD800 || c > 0xDFFF));
  }

private:
  enum State {
    Accept = 0,
    Reject = 12,
  }

  // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  static immutable ubyte[0x16c] utf8dfa = [
    // first 256 entries: byte value -> character class
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 00-0f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 10-1f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 20-2f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 30-3f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 40-4f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 50-5f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 60-6f
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 70-7f
    0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, // 80-8f
    0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09, // 90-9f
    0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // a0-af
    0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // b0-bf
    0x08,0x08,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // c0-cf
    0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // d0-df
    0x0a,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x04,0x03,0x03, // e0-ef
    0x0b,0x06,0x06,0x06,0x05,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, // f0-ff
    // remaining entries: (current state + character class) -> next state
    0x00,0x0c,0x18,0x24,0x3c,0x60,0x54,0x0c,0x0c,0x0c,0x30,0x48,0x0c,0x0c,0x0c,0x0c, // 100-10f
    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x00, // 110-11f
    0x0c,0x00,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x18,0x0c,0x0c, // 120-12f
    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c, // 130-13f
    0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x24, // 140-14f
    0x0c,0x24,0x0c,0x0c,0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x24,0x0c,0x24,0x0c,0x0c, // 150-15f
    0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c];

private:
  uint state = 0; // current automaton state (State.Accept, State.Reject, or an in-sequence state)

nothrow @safe @nogc:
public:
  dchar codepoint = 0; /// decoded codepoint (meaningful only while the decoder is in "complete" state)

  /// has a whole character been decoded? if so, take `codepoint`
  @property bool complete () const pure {
    pragma(inline, true);
    return (state == State.Accept);
  }

  /// did the decoder run into an invalid UTF-8 byte sequence?
  @property bool invalid () const pure {
    pragma(inline, true);
    return (state == State.Reject);
  }

  /// did the decoder reach the end of a sequence, be it valid or invalid?
  @property bool completeOrInvalid () const pure {
    pragma(inline, true);
    return (complete || invalid);
  }

  /// return the decoder to its initial state
  void reset () pure {
    pragma(inline, true);
    codepoint = 0;
    state = State.Accept;
  }

  /// process one byte, return `true` if codepoint is ready
  bool decode (ubyte b) @trusted {
    // a rejected sequence is forgotten as soon as the next byte arrives
    if (invalid) reset();
    return decodeValid(b);
  }

  /// same as `decode`, but the caller guarantees that the decoder never gets an invalid utf-8 sequence
  bool decodeValid (ubyte b) @trusted {
    immutable uint cls = utf8dfa.ptr[b];
    if (state == State.Accept) {
      // lead byte: the character class tells how many high bits are payload-free
      codepoint = cast(dchar)((0xff>>cls)&b);
    } else {
      // continuation byte: append its six payload bits
      codepoint = cast(dchar)((codepoint<<6)|(b&0x3fu));
    }
    state = utf8dfa.ptr[256+state+cls];
    return (state == State.Accept);
  }

  /// same as `decode`, but never stays in `invalid` state: yields `replacement` for invalid chars
  bool decodeSafe (ubyte b) @trusted {
    if (decodeValid(b)) return true;
    if (state != State.Reject) return false; // still in the middle of a sequence
    state = State.Accept;
    codepoint = replacement;
    return true;
  }
}
115 // ////////////////////////////////////////////////////////////////////////// //
/// Compact flavour of the DFA decoder: one `uint` (4 bytes) holds the automaton state in its top byte
/// and the partially assembled code point below it; slightly slower than `Utf8DecoderFast`.
/// Code points from invalid ranges are never accepted: that is a property of the state machine.
align(1) struct Utf8Decoder {
align(1):
public:
  enum dchar replacement = '\uFFFD'; /// replacement char for invalid unicode

  /// is `c` a valid code point (not a surrogate, not above U+10FFFF)?
  static bool isValidDC (dchar c) pure nothrow @safe @nogc {
    pragma(inline, true);
    return (c <= 0x10FFFF && (c < 0xD800 || c > 0xDFFF));
  }

private:
  enum State : uint {
    Accept = 0x0000_0000u,
    Reject = 0x0c00_0000u,
    Mask = 0xff00_0000u
  }

  uint codep = State.Accept; // top byte: DFA state; the rest: code point bits collected so far

nothrow @safe @nogc:
public:
  /// is current character complete?
  @property bool complete () const pure {
    pragma(inline, true);
    return ((codep&State.Mask) == State.Accept);
  }

  /// is the decoder in the rejecting state?
  @property bool invalid () const pure {
    pragma(inline, true);
    return ((codep&State.Mask) == State.Reject);
  }

  /// is the decoder either complete or rejecting?
  @property bool completeOrInvalid () const pure {
    pragma(inline, true);
    immutable uint st = codep&State.Mask;
    return (st == State.Accept || st == State.Reject);
  }

  /// current code point; valid only if the decoder is in "complete" state (`replacement` otherwise)
  @property dchar currCodePoint () const pure {
    pragma(inline, true);
    if (codep > dchar.max) return replacement;
    return cast(dchar)codep;
  }

  /// return the decoder to its initial state
  void reset () pure { codep = State.Accept; }

  /** never reaches `invalid` state, returns `replacement` for invalid chars.
   * returns invalid dchar while it is "in progress" (i.e. result > dchar.max). */
  dchar decode (ubyte b) @trusted {
    immutable ubyte cls = Utf8DecoderFast.utf8dfa.ptr[b];
    immutable ubyte prev = cast(ubyte)(codep>>24);
    uint acc;
    if (prev == 0) {
      // State.Accept: `b` starts a new character
      acc = (0xff>>cls)&b;
    } else {
      // inside a sequence: drop the state byte, append six payload bits
      acc = (b&0x3f)|((codep&~State.Mask)<<6);
    }
    immutable ubyte next = Utf8DecoderFast.utf8dfa.ptr[256+prev+cls];
    if (next == 12) {
      // the DFA rejected the sequence: report the replacement char, which also leaves the state at Accept
      codep = replacement;
    } else {
      codep = acc|(cast(uint)next<<24);
    }
    return cast(dchar)codep;
  }
}
154 // ////////////////////////////////////////////////////////////////////////// //
/// Encodes `c` as UTF-8 into `s`.
/// Returns the number of bytes written (1..4), or -1 if `s` has no room for the whole sequence;
/// never touches more than 4 bytes of `s`. An invalid `c` is encoded as `replacement`.
int utf8Encode(dchar replacement='\uFFFD') (char[] s, dchar c) pure nothrow @trusted @nogc {
  static assert(Utf8Decoder.isValidDC(replacement), "invalid replacement char");
  // out-of-range and surrogate values are swapped for the replacement char
  immutable dchar cp = (Utf8Decoder.isValidDC(c) ? c : replacement);
  uint len; // total sequence length
  uint lead; // marker bits of the first byte
  if (cp <= 0x7F) { len = 1; lead = 0x00; }
  else if (cp <= 0x7FF) { len = 2; lead = 0xC0; }
  else if (cp <= 0xFFFF) { len = 3; lead = 0xE0; }
  else if (cp <= 0x10FFFF) { len = 4; lead = 0xF0; }
  else {
    // the thing that should not be
    assert(0, "wtf?!");
  }
  if (s.length < len) return -1;
  // fill continuation bytes from the tail, six bits at a time
  uint rest = cp;
  for (uint idx = len-1; idx > 0; --idx) {
    s.ptr[idx] = cast(char)(0x80|(rest&0x3F));
    rest >>= 6;
  }
  // whatever is left goes into the lead byte
  s.ptr[0] = cast(char)(lead|rest);
  return cast(int)len;
}
192 // ////////////////////////////////////////////////////////////////////////// //
/// Returns the number of UTF-8 sequences (characters) in `s`.
/// Only lead bytes are inspected (stray continuation bytes and 0xFE/0xFF count as one character each),
/// so this doesn't do all possible checks: don't pass invalid UTF-8.
usize utf8Length (const(char)[] s) pure nothrow @trusted @nogc {
  // sequence length by lead byte; 0xFF marks bytes that cannot start a sequence
  static immutable ubyte[256] UTF8stride = [
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
  ];
  usize pos = 0, res = 0;
  while (pos < s.length) {
    ubyte l = UTF8stride.ptr[s.ptr[pos]];
    if (l == 0xFF) l = 1; // not a lead byte: step over it as a single unit
    // one character per sequence; the old `res += l` summed the strides and thus returned a byte count
    ++res;
    pos += l;
  }
  return res;
}
/// Length in bytes of the UTF-8 sequence introduced by the start byte `ch`.
/// Returns -1 if `ch` cannot start a sequence; the obsolete 5- and 6-byte forms
/// are recognized only when `allowOverlong` is set.
byte utf8CodeLen(bool allowOverlong=false) (char ch) pure nothrow @trusted @nogc {
  //pragma(inline, true);
  // the bit patterns below are mutually exclusive, so the order of the tests is irrelevant
  if ((ch&0x80) == 0x00) return 1; // 0xxxxxxx
  if ((ch&0xE0) == 0xC0) return 2; // 110xxxxx
  if ((ch&0xF0) == 0xE0) return 3; // 1110xxxx
  if ((ch&0xF8) == 0xF0) return 4; // 11110xxx
  static if (allowOverlong) {
    if ((ch&0xFC) == 0xF8) return 5; // 111110xx
    if ((ch&0xFE) == 0xFC) return 6; // 1111110x
  }
  return -1; // invalid
}
/// Structural UTF-8 check of `buf`: every start byte must be accepted by `utf8CodeLen`
/// and be followed by the announced number of continuation bytes (10xxxxxx).
/// Decoded values are not examined.
bool utf8Valid (const(void)[] buf) pure nothrow @trusted @nogc {
  auto bytes = cast(const(ubyte)[])buf;
  size_t pos = 0;
  while (pos < bytes.length) {
    // number of continuation bytes announced by the start byte
    immutable int tail = utf8CodeLen(cast(char)bytes.ptr[pos++])-1;
    if (tail < 0) return false; // not a start byte
    if (cast(size_t)tail > bytes.length-pos) return false; // sequence is cut short
    foreach (immutable k; 0..tail) {
      if ((bytes.ptr[pos++]&0xC0) != 0x80) return false;
    }
  }
  return true;
}
/// Same structural check as `utf8Valid`, plus a "looks like text" rule:
/// the only control characters (bytes below 32) allowed are TAB (9), LF (10) and CR (13).
bool utf8ValidText (const(void)[] buf) pure nothrow @trusted @nogc {
  const(ubyte)* bp = cast(const(ubyte)*)buf.ptr;
  auto left = buf.length;
  while (left--) {
    immutable ubyte lead = *bp++;
    // `len` is the number of continuation bytes that must follow `lead`
    auto len = utf8CodeLen(cast(char)lead)-1;
    if (len < 0 || len > left) return false;
    // control chars are single-byte sequences, i.e. `len == 0`;
    // the former `len == 1` test could never match a byte below 32, so nothing was ever rejected here
    if (len == 0 && lead < 32) {
      if (lead != 9 && lead != 10 && lead != 13) return false;
    }
    left -= len;
    while (len-- > 0) if (((*bp++)&0b1100_0000) != 0b1000_0000) return false;
  }
  return true;
}
272 // ////////////////////////////////////////////////////////////////////////// //
// unicode -> koi8-u for U+0401..U+0457; index is `codepoint-0x401`
// 0x3F ('?') fills the slots with no mapping
private static immutable ubyte[0x458-0x401] uni2koiTable = [
  0xB3,0x3F,0x3F,0xB4,0x3F,0xB6,0xB7,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0xE1, // U+0401..U+0410
  0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,0xF2, // U+0411..U+0420
  0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,0xC1, // U+0421..U+0430
  0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,0xD2, // U+0431..U+0440
  0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1,0x3F, // U+0441..U+0450
  0xA3,0x3F,0x3F,0xA4,0x3F,0xA6,0xA7, // U+0451..U+0457
];
// koi8-u -> unicode for the upper half of the code page; index is `byte-128`
private static immutable dchar[128] koi2uniTable = [
  0x2500,0x2502,0x250C,0x2510,0x2514,0x2518,0x251C,0x2524,0x252C,0x2534,0x253C,0x2580,0x2584,0x2588,0x258C,0x2590, // 80-8f
  0x2591,0x2592,0x2593,0x2320,0x25A0,0x2219,0x221A,0x2248,0x2264,0x2265,0x00A0,0x2321,0x00B0,0x00B2,0x00B7,0x00F7, // 90-9f
  0x2550,0x2551,0x2552,0x0451,0x0454,0x2554,0x0456,0x0457,0x2557,0x2558,0x2559,0x255A,0x255B,0x0491,0x255D,0x255E, // a0-af
  0x255F,0x2560,0x2561,0x0401,0x0404,0x2563,0x0406,0x0407,0x2566,0x2567,0x2568,0x2569,0x256A,0x0490,0x256C,0x00A9, // b0-bf
  0x044E,0x0430,0x0431,0x0446,0x0434,0x0435,0x0444,0x0433,0x0445,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E, // c0-cf
  0x043F,0x044F,0x0440,0x0441,0x0442,0x0443,0x0436,0x0432,0x044C,0x044B,0x0437,0x0448,0x044D,0x0449,0x0447,0x044A, // d0-df
  0x042E,0x0410,0x0411,0x0426,0x0414,0x0415,0x0424,0x0413,0x0425,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E, // e0-ef
  0x041F,0x042F,0x0420,0x0421,0x0422,0x0423,0x0416,0x0412,0x042C,0x042B,0x0417,0x0428,0x042D,0x0429,0x0427,0x042A, // f0-ff
];
// convert unicode to koi8-u
// ASCII is passed through, U+0401..U+0457 goes through `uni2koiTable` (which yields '?' for its gaps),
// the symbols listed below are mapped one by one, and everything else becomes 0
public char uni2koi() (dchar ch) pure nothrow @trusted @nogc {
  if (ch < 128) return cast(char)ch;
  if (ch >= 0x401 && ch <= 0x457) return cast(char)uni2koiTable.ptr[ch-0x401];
  switch (ch) {
    // latin-1 symbols
    case 0x00A0: return 0x9A; // NO-BREAK SPACE
    case 0x00A9: return 0xBF; // COPYRIGHT SIGN
    case 0x00B0: return 0x9C; // DEGREE SIGN
    case 0x00B2: return 0x9D; // SUPERSCRIPT TWO
    case 0x00B7: return 0x9E; // MIDDLE DOT
    case 0x00F7: return 0x9F; // DIVISION SIGN
    // cyrillic letters outside of the table range
    case 0x0490: return 0xBD; // ukrainian G with upturn (upcase)
    case 0x0491: return 0xAD; // ukrainian G with upturn (locase)
    // math and technical symbols
    case 0x2219: return 0x95; // BULLET OPERATOR
    case 0x221A: return 0x96; // SQUARE ROOT
    case 0x2248: return 0x97; // ALMOST EQUAL TO
    case 0x2264: return 0x98; // LESS-THAN OR EQUAL TO
    case 0x2265: return 0x99; // GREATER-THAN OR EQUAL TO
    case 0x2320: return 0x93; // TOP HALF INTEGRAL
    case 0x2321: return 0x9B; // BOTTOM HALF INTEGRAL
    // light box drawings
    case 0x2500: return 0x80; // BOX DRAWINGS LIGHT HORIZONTAL
    case 0x2502: return 0x81; // BOX DRAWINGS LIGHT VERTICAL
    case 0x250C: return 0x82; // BOX DRAWINGS LIGHT DOWN AND RIGHT
    case 0x2510: return 0x83; // BOX DRAWINGS LIGHT DOWN AND LEFT
    case 0x2514: return 0x84; // BOX DRAWINGS LIGHT UP AND RIGHT
    case 0x2518: return 0x85; // BOX DRAWINGS LIGHT UP AND LEFT
    case 0x251C: return 0x86; // BOX DRAWINGS LIGHT VERTICAL AND RIGHT
    case 0x2524: return 0x87; // BOX DRAWINGS LIGHT VERTICAL AND LEFT
    case 0x252C: return 0x88; // BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
    case 0x2534: return 0x89; // BOX DRAWINGS LIGHT UP AND HORIZONTAL
    case 0x253C: return 0x8A; // BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
    // double box drawings
    case 0x2550: return 0xA0; // BOX DRAWINGS DOUBLE HORIZONTAL
    case 0x2551: return 0xA1; // BOX DRAWINGS DOUBLE VERTICAL
    case 0x2552: return 0xA2; // BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE
    case 0x2554: return 0xA5; // BOX DRAWINGS DOUBLE DOWN AND RIGHT
    // U+2557..U+255B -> 0xA8..0xAC
    case 0x2557: .. case 0x255B: return cast(char)(0xA8+(ch-0x2557));
    // U+255D..U+2561 -> 0xAE..0xB2
    case 0x255D: .. case 0x2561: return cast(char)(0xAE+(ch-0x255D));
    // U+2562..U+256C -> 0xB4..0xBE
    // (U+2562, U+2564, U+2565 and U+256B land on 0xB4, 0xB6, 0xB7 and 0xBD;
    // 0xBD is the very code U+0490 gets above)
    case 0x2562: .. case 0x256C: return cast(char)(0xB4+(ch-0x2562));
    // blocks and shades
    case 0x2580: return 0x8B; // UPPER HALF BLOCK
    case 0x2584: return 0x8C; // LOWER HALF BLOCK
    case 0x2588: return 0x8D; // FULL BLOCK
    case 0x258C: return 0x8E; // LEFT HALF BLOCK
    // U+2590..U+2593 (RIGHT HALF BLOCK, LIGHT/MEDIUM/DARK SHADE) -> 0x8F..0x92
    case 0x2590: .. case 0x2593: return cast(char)(0x8F+(ch-0x2590));
    case 0x25A0: return 0x94; // BLACK SQUARE
    default: return 0; // no mapping
  }
}
// convert koi8-u to unicode
// ASCII maps to itself, the upper half goes through `koi2uniTable`
public dchar koi2uni() (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  if (ch < 128) return ch;
  return koi2uniTable.ptr[cast(ubyte)ch-128];
}
376 // ////////////////////////////////////////////////////////////////////////// //
/// convert koi8 to unicode: ASCII maps to itself, bytes 0x80..0xFF go through a lookup table
wchar koi2uni() (char ch) pure nothrow @trusted @nogc {
  // unicode values for bytes 0x80..0xFF; index is `byte-0x80`
  static immutable wchar[128] upper = [
    0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590, // 80-8f
    0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,0x2264,0x2265,0x00a0,0x2321,0x00b0,0x00b2,0x00b7,0x00f7, // 90-9f
    0x2550,0x2551,0x2552,0x0451,0x0454,0x2554,0x0456,0x0457,0x2557,0x2558,0x2559,0x255a,0x255b,0x0491,0x255d,0x255e, // a0-af
    0x255f,0x2560,0x2561,0x0401,0x0404,0x2563,0x0406,0x0407,0x2566,0x2567,0x2568,0x2569,0x256a,0x0490,0x256c,0x00a9, // b0-bf
    0x044e,0x0430,0x0431,0x0446,0x0434,0x0435,0x0444,0x0433,0x0445,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,0x043e, // c0-cf
    0x043f,0x044f,0x0440,0x0441,0x0442,0x0443,0x0436,0x0432,0x044c,0x044b,0x0437,0x0448,0x044d,0x0449,0x0447,0x044a, // d0-df
    0x042e,0x0410,0x0411,0x0426,0x0414,0x0415,0x0424,0x0413,0x0425,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,0x041e, // e0-ef
    0x041f,0x042f,0x0420,0x0421,0x0422,0x0423,0x0416,0x0412,0x042c,0x042b,0x0417,0x0428,0x042d,0x0429,0x0427,0x042a, // f0-ff
  ];
  if (ch < 0x80) return ch;
  return upper.ptr[ch-0x80];
}
/// convert unicode to koi8; code points without a koi8 equivalent become `repchar`
char uni2koi(char repchar='?') (dchar dch) pure nothrow @trusted @nogc {
  // koi8 codes for U+0410..U+044F; index is `dch-0x0410`
  static immutable char[64] cyrillic = [
    0xe1,0xe2,0xf7,0xe7,0xe4,0xe5,0xf6,0xfa,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,
    0xf2,0xf3,0xf4,0xf5,0xe6,0xe8,0xe3,0xfe,0xfb,0xfd,0xff,0xf9,0xf8,0xfc,0xe0,0xf1,
    0xc1,0xc2,0xd7,0xc7,0xc4,0xc5,0xd6,0xda,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,
    0xd2,0xd3,0xd4,0xd5,0xc6,0xc8,0xc3,0xde,0xdb,0xdd,0xdf,0xd9,0xd8,0xdc,0xc0,0xd1,
  ];
  if (dch < 128) return cast(char)dch; // ASCII is shared
  switch (dch) {
    // latin-1 symbols
    case 0x00a0: return cast(char)0x9a;
    case 0x00a9: return cast(char)0xbf;
    case 0x00b0: return cast(char)0x9c;
    case 0x00b2: return cast(char)0x9d;
    case 0x00b7: return cast(char)0x9e;
    case 0x00f7: return cast(char)0x9f;
    // cyrillic
    case 0x0401: return cast(char)0xb3;
    case 0x0404: return cast(char)0xb4;
    case 0x0406: return cast(char)0xb6;
    case 0x0407: return cast(char)0xb7;
    case 0x0410: .. case 0x044f: return cyrillic.ptr[dch-0x0410];
    case 0x0451: return cast(char)0xa3;
    case 0x0454: return cast(char)0xa4;
    case 0x0456: return cast(char)0xa6;
    case 0x0457: return cast(char)0xa7;
    case 0x0490: return cast(char)0xbd;
    case 0x0491: return cast(char)0xad;
    // math and technical symbols
    case 0x2219: return cast(char)0x95;
    case 0x221a: return cast(char)0x96;
    case 0x2248: return cast(char)0x97;
    case 0x2264: return cast(char)0x98;
    case 0x2265: return cast(char)0x99;
    case 0x2320: return cast(char)0x93;
    case 0x2321: return cast(char)0x9b;
    // light box drawings
    case 0x2500: return cast(char)0x80;
    case 0x2502: return cast(char)0x81;
    case 0x250c: return cast(char)0x82;
    case 0x2510: return cast(char)0x83;
    case 0x2514: return cast(char)0x84;
    case 0x2518: return cast(char)0x85;
    case 0x251c: return cast(char)0x86;
    case 0x2524: return cast(char)0x87;
    case 0x252c: return cast(char)0x88;
    case 0x2534: return cast(char)0x89;
    case 0x253c: return cast(char)0x8a;
    // double box drawings; consecutive runs keep their order in koi8
    case 0x2550: return cast(char)0xa0;
    case 0x2551: return cast(char)0xa1;
    case 0x2552: return cast(char)0xa2;
    case 0x2554: return cast(char)0xa5;
    case 0x2557: .. case 0x255b: return cast(char)(0xa8+(dch-0x2557)); // 0xa8..0xac
    case 0x255d: .. case 0x2561: return cast(char)(0xae+(dch-0x255d)); // 0xae..0xb2
    case 0x2563: return cast(char)0xb5;
    case 0x2566: .. case 0x256a: return cast(char)(0xb8+(dch-0x2566)); // 0xb8..0xbc
    case 0x256c: return cast(char)0xbe;
    // blocks and shades
    case 0x2580: return cast(char)0x8b;
    case 0x2584: return cast(char)0x8c;
    case 0x2588: return cast(char)0x8d;
    case 0x258c: return cast(char)0x8e;
    case 0x2590: .. case 0x2593: return cast(char)(0x8f+(dch-0x2590)); // 0x8f..0x92
    case 0x25a0: return cast(char)0x94;
    default: return repchar;
  }
}
477 // ////////////////////////////////////////////////////////////////////////// //
/// convert 1251 to unicode: ASCII maps to itself, bytes 0x80..0xFF go through a lookup table
/// (byte 0x98 comes out as U+FFFD)
wchar cp12512uni() (char ch) pure nothrow @trusted @nogc {
  // unicode values for bytes 0x80..0xFF; index is `byte-0x80`
  static immutable wchar[128] upper = [
    0x0402,0x0403,0x201a,0x0453,0x201e,0x2026,0x2020,0x2021,0x20ac,0x2030,0x0409,0x2039,0x040a,0x040c,0x040b,0x040f, // 80-8f
    0x0452,0x2018,0x2019,0x201c,0x201d,0x2022,0x2013,0x2014,0xfffd,0x2122,0x0459,0x203a,0x045a,0x045c,0x045b,0x045f, // 90-9f
    0x00a0,0x040e,0x045e,0x0408,0x00a4,0x0490,0x00a6,0x00a7,0x0401,0x00a9,0x0404,0x00ab,0x00ac,0x00ad,0x00ae,0x0407, // a0-af
    0x00b0,0x00b1,0x0406,0x0456,0x0491,0x00b5,0x00b6,0x00b7,0x0451,0x2116,0x0454,0x00bb,0x0458,0x0405,0x0455,0x0457, // b0-bf
    0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,0x041e,0x041f, // c0-cf
    0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427,0x0428,0x0429,0x042a,0x042b,0x042c,0x042d,0x042e,0x042f, // d0-df
    0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,0x043e,0x043f, // e0-ef
    0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447,0x0448,0x0449,0x044a,0x044b,0x044c,0x044d,0x044e,0x044f, // f0-ff
  ];
  if (ch < 0x80) return ch;
  return upper.ptr[ch-0x80];
}
/// convert unicode to 1251; code points without a 1251 equivalent become `repchar`
char uni2cp1251(char repchar='?') (dchar dch) pure nothrow @trusted @nogc {
  // 1251 codes for U+0401..U+040C; index is `dch-0x0401`
  static immutable char[12] upperExtra = [0xa8,0x80,0x81,0xaa,0xbd,0xb2,0xaf,0xa3,0x8a,0x8c,0x8e,0x8d,];
  // 1251 codes for U+0451..U+045C; index is `dch-0x0451`
  static immutable char[12] lowerExtra = [0xb8,0x90,0x83,0xba,0xbe,0xb3,0xbf,0xbc,0x9a,0x9c,0x9e,0x9d,];
  if (dch < 128) return cast(char)dch; // ASCII is shared
  switch (dch) {
    // latin-1 symbols that keep their code in 1251
    case 0x00a0:
    case 0x00a4:
    case 0x00a6:
    case 0x00a7:
    case 0x00a9:
    case 0x00ab: .. case 0x00ae:
    case 0x00b0:
    case 0x00b1:
    case 0x00b5:
    case 0x00b6:
    case 0x00b7:
    case 0x00bb:
      return cast(char)dch;
    // cyrillic
    case 0x0401: .. case 0x040c: return upperExtra.ptr[dch-0x0401];
    case 0x040e: return cast(char)0xa1;
    case 0x040f: return cast(char)0x8f;
    case 0x0410: .. case 0x044f: return cast(char)(0xc0+(dch-0x0410)); // 0xc0..0xff, same order as unicode
    case 0x0451: .. case 0x045c: return lowerExtra.ptr[dch-0x0451];
    case 0x045e: return cast(char)0xa2;
    case 0x045f: return cast(char)0x9f;
    case 0x0490: return cast(char)0xa5;
    case 0x0491: return cast(char)0xb4;
    // punctuation and symbols
    case 0x2013: return cast(char)0x96;
    case 0x2014: return cast(char)0x97;
    case 0x2018: return cast(char)0x91;
    case 0x2019: return cast(char)0x92;
    case 0x201a: return cast(char)0x82;
    case 0x201c: return cast(char)0x93;
    case 0x201d: return cast(char)0x94;
    case 0x201e: return cast(char)0x84;
    case 0x2020: return cast(char)0x86;
    case 0x2021: return cast(char)0x87;
    case 0x2022: return cast(char)0x95;
    case 0x2026: return cast(char)0x85;
    case 0x2030: return cast(char)0x89;
    case 0x2039: return cast(char)0x8b;
    case 0x203a: return cast(char)0x9b;
    case 0x20ac: return cast(char)0x88;
    case 0x2116: return cast(char)0xb9;
    case 0x2122: return cast(char)0x99;
    //case 0xfffd: return cast(char)0x98;
    default: return repchar;
  }
}
564 // ////////////////////////////////////////////////////////////////////////// //
/// convert 866 to unicode: ASCII maps to itself, bytes 0x80..0xFF go through a lookup table
wchar cp8662uni() (char ch) pure nothrow @trusted @nogc {
  // unicode values for bytes 0x80..0xFF; index is `byte-0x80`
  static immutable wchar[128] upper = [
    0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,0x041e,0x041f, // 80-8f
    0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427,0x0428,0x0429,0x042a,0x042b,0x042c,0x042d,0x042e,0x042f, // 90-9f
    0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,0x043e,0x043f, // a0-af
    0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255d,0x255c,0x255b,0x2510, // b0-bf
    0x2514,0x2534,0x252c,0x251c,0x2500,0x253c,0x255e,0x255f,0x255a,0x2554,0x2569,0x2566,0x2560,0x2550,0x256c,0x2567, // c0-cf
    0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256b,0x256a,0x2518,0x250c,0x2588,0x2584,0x258c,0x2590,0x2580, // d0-df
    0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447,0x0448,0x0449,0x044a,0x044b,0x044c,0x044d,0x044e,0x044f, // e0-ef
    0x0401,0x0451,0x0404,0x0454,0x0407,0x0457,0x040e,0x045e,0x00b0,0x2219,0x00b7,0x221a,0x2116,0x00a4,0x25a0,0x00a0, // f0-ff
  ];
  if (ch < 0x80) return ch;
  return upper.ptr[ch-0x80];
}
/// convert unicode to 866; code points without an 866 equivalent become `repchar`
char uni2cp866(char repchar='?') (dchar dch) pure nothrow @trusted @nogc {
  // 866 codes for the double-line box drawings U+2550..U+256C; index is `dch-0x2550`
  static immutable char[29] boxes = [
    0xcd,0xba,0xd5,0xd6,0xc9,0xb8,0xb7,0xbb,0xd4,0xd3,0xc8,0xbe,0xbd,0xbc,0xc6,0xc7,
    0xcc,0xb5,0xb6,0xb9,0xd1,0xd2,0xcb,0xcf,0xd0,0xca,0xd8,0xd7,0xce,
  ];
  if (dch < 128) return cast(char)dch; // ASCII is shared
  switch (dch) {
    // latin-1 symbols
    case 0x00a0: return cast(char)0xff;
    case 0x00a4: return cast(char)0xfd;
    case 0x00b0: return cast(char)0xf8;
    case 0x00b7: return cast(char)0xfa;
    // cyrillic
    case 0x0401: return cast(char)0xf0;
    case 0x0404: return cast(char)0xf2;
    case 0x0407: return cast(char)0xf4;
    case 0x040e: return cast(char)0xf6;
    case 0x0410: .. case 0x043f: return cast(char)(0x80+(dch-0x0410)); // 0x80..0xaf
    case 0x0440: .. case 0x044f: return cast(char)(0xe0+(dch-0x0440)); // 0xe0..0xef
    case 0x0451: return cast(char)0xf1;
    case 0x0454: return cast(char)0xf3;
    case 0x0457: return cast(char)0xf5;
    case 0x045e: return cast(char)0xf7;
    // symbols
    case 0x2116: return cast(char)0xfc;
    case 0x2219: return cast(char)0xf9;
    case 0x221a: return cast(char)0xfb;
    // light box drawings
    case 0x2500: return cast(char)0xc4;
    case 0x2502: return cast(char)0xb3;
    case 0x250c: return cast(char)0xda;
    case 0x2510: return cast(char)0xbf;
    case 0x2514: return cast(char)0xc0;
    case 0x2518: return cast(char)0xd9;
    case 0x251c: return cast(char)0xc3;
    case 0x2524: return cast(char)0xb4;
    case 0x252c: return cast(char)0xc2;
    case 0x2534: return cast(char)0xc1;
    case 0x253c: return cast(char)0xc5;
    // double box drawings
    case 0x2550: .. case 0x256c: return boxes.ptr[dch-0x2550];
    // blocks and shades
    case 0x2580: return cast(char)0xdf;
    case 0x2584: return cast(char)0xdc;
    case 0x2588: return cast(char)0xdb;
    case 0x258c: return cast(char)0xdd;
    case 0x2590: return cast(char)0xde;
    case 0x2591: .. case 0x2593: return cast(char)(0xb0+(dch-0x2591)); // 0xb0..0xb2
    case 0x25a0: return cast(char)0xfe;
    default: return repchar;
  }
}
646 // ////////////////////////////////////////////////////////////////////////// //
/// `strlen()` for utf-8 string.
///
/// Feeds every byte of `s` to an `Utf8DecoderFast` and counts how many times
/// the decoder reports a finished code point.
///
/// Returns: number of decoded code points in `s`.
public usize utflen (const(char)[] s) nothrow @trusted @nogc {
  Utf8DecoderFast dc;
  // the counter has the same type as the result, so it cannot overflow
  // (or get sign-extended on return) for very long strings
  usize res = 0;
  foreach (char ch; s) if (dc.decode(cast(ubyte)ch)) ++res;
  return res;
}
/// remove last character from utf-8 string
///
/// Returns the slice of `s` that ends right before the first byte of the last
/// character the decoder managed to finish. Any unfinished bytes after that
/// character are dropped too. Empty result for empty input, or when no
/// character was decoded at all.
///
/// NOTE(review): relies on `Utf8DecoderFast.decode()` returning `true` on the
/// final byte of a character, which is how `utfleft()`/`utfskip()` use it.
public T utfchop(T : const(char)[]) (T s) nothrow @trusted @nogc {
  Utf8DecoderFast dc;
  usize curStart = 0; // where the character being decoded right now begins
  usize lastStart = 0; // where the most recently finished character begins
  foreach (immutable idx, char ch; s) {
    if (dc.decode(cast(ubyte)ch)) {
      // `idx` is the LAST byte of the finished character, so cutting at `idx`
      // would leave the lead bytes of a multibyte character behind; cut at its start instead
      lastStart = curStart;
      curStart = idx+1;
    }
  }
  return s[0..lastStart];
}
/// skip first `len` characters in utf-8 string
///
/// Returns `s` unchanged when `len` is less than one, and `null` when `len` is
/// not smaller than the byte length of `s` or the string ends before `len`
/// characters were decoded.
public T utfskip(T : const(char)[]) (T s, ptrdiff_t len) nothrow @trusted @nogc {
  if (len < 1) return s;
  if (len >= s.length) return null;
  Utf8DecoderFast decoder;
  ptrdiff_t remaining = len;
  for (usize pos = 0; pos < s.length; ++pos) {
    if (!decoder.decode(cast(ubyte)s.ptr[pos])) continue;
    // one more character is done; the rest starts right after its last byte
    if (--remaining == 0) return s[pos+1..$];
  }
  return null;
}
/// take first `len` characters in utf-8 string
///
/// Returns `null` when `len` is less than one, and the whole `s` when `len` is
/// not smaller than the byte length of `s` or the string holds fewer than
/// `len` decoded characters.
public T utfleft(T : const(char)[]) (T s, ptrdiff_t len) nothrow @trusted @nogc {
  if (len < 1) return null;
  if (len >= s.length) return s;
  Utf8DecoderFast decoder;
  ptrdiff_t remaining = len;
  for (usize pos = 0; pos < s.length; ++pos) {
    if (!decoder.decode(cast(ubyte)s.ptr[pos])) continue;
    // one more character is done; the result ends with its last byte
    if (--remaining == 0) return s[0..pos+1];
  }
  return s;
}
/// take last `len` characters in utf-8 string (slow!)
///
/// The string is scanned twice: once by `utflen()` to count characters, and
/// once more to find where the tail starts.
/// Returns `null` when `len` is less than one, and the whole `s` when it holds
/// no more than `len` characters.
public T utfright(T : const(char)[]) (T s, ptrdiff_t len) nothrow @trusted @nogc {
  if (len < 1) return null;
  if (len >= s.length) return s;
  immutable total = s.utflen;
  if (len >= total) return s;
  // number of leading characters to drop; at least one here
  auto toDrop = total-len;
  Utf8DecoderFast decoder;
  for (usize pos = 0; pos < s.length; ++pos) {
    if (!decoder.decode(cast(ubyte)s.ptr[pos])) continue;
    if (--toDrop == 0) return s[pos+1..$];
  }
  return null;
}
/// take `len` characters from position `pos` in utf-8 string (slow!)
///
/// `pos` is counted in characters. Returns `null` when `len` is less than one,
/// when `pos` is not smaller than the byte length of `s`, or when the string
/// ends before `pos` characters were decoded. If fewer than `len` characters
/// follow `pos`, everything up to the end of `s` is returned.
public T utfmid(T : const(char)[]) (T s, ptrdiff_t pos, ptrdiff_t len) nothrow @trusted @nogc {
  if (len < 1 || pos >= s.length) return null;
  Utf8DecoderFast decoder;
  int startOfs = (pos == 0 ? 0 : -1); // byte offset of the first taken character, -1: not found yet
  int endOfs = -1; // byte offset right after the last taken character, -1: not found yet
  foreach (immutable idx, char ch; s) {
    if (!decoder.decode(cast(ubyte)ch)) continue;
    if (startOfs < 0) {
      // still walking towards the start position
      if (pos > 0) --pos; else ++pos;
      if (pos == 0) startOfs = cast(int)idx+1;
      continue;
    }
    // counting the characters to take
    if (--len == 0) { endOfs = cast(int)idx+1; break; }
  }
  if (startOfs < 0) return null;
  return (endOfs < 0 ? s[startOfs..$] : s[startOfs..endOfs]);
}
/// remove `len` characters from position `pos` in utf-8 string (slow!)
/// NOT REALLY TESTED!
///
/// `pos` is counted in characters. Returns `s` itself when there is nothing to
/// remove, a slice of `s` when the removed part reaches the end of the string,
/// and a newly built string otherwise.
public T utfdel(T : const(char)[]) (T s, ptrdiff_t pos, ptrdiff_t len) {
  static if (is(T == typeof(null))) {
    return null;
  } else {
    if (len < 1 || pos >= s.length) return s;
    Utf8DecoderFast decoder;
    int cutFrom = (pos == 0 ? 0 : -1); // byte offset of the first removed character, -1: not found yet
    int cutTo = -1; // byte offset right after the last removed character, -1: not found yet
    foreach (immutable idx, char ch; s) {
      if (!decoder.decode(cast(ubyte)ch)) continue;
      if (cutFrom < 0) {
        // still walking towards the start position
        if (pos > 0) --pos; else ++pos;
        if (pos == 0) cutFrom = cast(int)idx+1;
        continue;
      }
      // counting the characters to remove
      if (--len == 0) { cutTo = cast(int)idx+1; break; }
    }
    if (cutFrom < 0) return s; // start position is past the end
    if (cutTo < 0) return s[0..cutFrom]; // removed part runs up to the end
    static if (is(T : char[])) {
      return s[0..cutFrom]~s[cutTo..$];
    } else {
      char[] res = s[0..cutFrom].dup;
      res ~= s[cutTo..$];
      return cast(T)res; // it is safe to cast here
    }
  }
}
768 // ////////////////////////////////////////////////////////////////////////// //
/// Shorten utf-8 string so it occupies no more than `maxsize` bytes.
///
/// Strings that already fit are returned as-is. Otherwise characters are
/// chopped from the end with `utfchop()`; when `maxsize` is greater than 3,
/// room is left for a trailing "..." which is then appended (this allocates).
/// Returns `null` when `maxsize` is less than one.
public T utfChopToSize(T:const(char)[]) (T s, int maxsize=255) nothrow @trusted {
  static if (is(T == typeof(null))) {
    return s;
  } else {
    if (maxsize < 1) return null;
    if (s.length <= maxsize) return s;
    immutable bool withDots = (maxsize > 3);
    // with the ellipsis we need three more free bytes
    immutable int limit = (withDots ? maxsize-3 : maxsize);
    // this is slow, but i don't care
    while (s.length > limit) s = s.utfchop;
    if (withDots) {
      // add "..."
      static if (is(T == const(char)[])) {
        return cast(T)(s.dup~"...");
      } else {
        return cast(T)(s~"...");
      }
    }
    return s;
  }
}
791 // ////////////////////////////////////////////////////////////////////////// //
792 // various one-byte encoding things, 'cause why not?
795 // ////////////////////////////////////////////////////////////////////////// //
/// Recoding table: index is a CP866 byte, value is the KOI8 byte.
/// Bytes below 0x80 map to themselves; bytes without a mapping give '?' (0x3f).
/// The table is built at compile time.
public immutable char[256] koi8from866Table = () {
  // KOI8 codes for CP866 bytes 0x80..0xff
  immutable ubyte[128] upperHalf = [
    0xe1,0xe2,0xf7,0xe7,0xe4,0xe5,0xf6,0xfa, 0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0, // 0x80
    0xf2,0xf3,0xf4,0xf5,0xe6,0xe8,0xe3,0xfe, 0xfb,0xfd,0xff,0xf9,0xf8,0xfc,0xe0,0xf1, // 0x90
    0xc1,0xc2,0xd7,0xc7,0xc4,0xc5,0xd6,0xda, 0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0, // 0xa0
    0x90,0x91,0x92,0x81,0x87,0xb2,0x3f,0x3f, 0x3f,0xb5,0xa1,0xa8,0xae,0x3f,0xac,0x83, // 0xb0
    0x84,0x89,0x88,0x86,0x80,0x8a,0xaf,0xb0, 0xab,0xa5,0xbb,0xb8,0xb1,0xa0,0xbe,0xb9, // 0xc0
    0xba,0x3f,0x3f,0xaa,0xa9,0xa2,0x3f,0x3f, 0xbc,0x85,0x82,0x8d,0x8c,0x8e,0x8f,0x8b, // 0xd0
    0xd2,0xd3,0xd4,0xd5,0xc6,0xc8,0xc3,0xde, 0xdb,0xdd,0xdf,0xd9,0xd8,0xdc,0xc0,0xd1, // 0xe0
    0xb3,0xa3,0xb4,0xa4,0xb7,0xa7,0x3f,0x3f, 0x9c,0x95,0x9e,0x96,0x3f,0x3f,0x94,0x9a, // 0xf0
  ];
  char[256] tbl;
  foreach (immutable idx; 0..128) {
    tbl[idx] = cast(char)idx; // 7-bit part is the same in both encodings
    tbl[idx+128] = cast(char)upperHalf[idx];
  }
  return tbl;
}();
/// Recoding table: index is a CP1251 byte, value is the KOI8 byte.
/// Bytes below 0x80 map to themselves; bytes without a mapping give '?' (0x3f).
/// The table is built at compile time.
public immutable char[256] koi8from1251Table = () {
  // KOI8 codes for CP1251 bytes 0x80..0xff
  immutable ubyte[128] upperHalf = [
    0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f, 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f, // 0x80
    0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f, 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f, // 0x90
    0x9a,0x3f,0x3f,0x3f,0x3f,0xbd,0x3f,0x3f, 0xb3,0xbf,0xb4,0x3f,0x3f,0x3f,0x3f,0xb7, // 0xa0
    0x9c,0x3f,0xb6,0xa6,0xad,0x3f,0x3f,0x9e, 0xa3,0x3f,0xa4,0x3f,0x3f,0x3f,0x3f,0xa7, // 0xb0
    0xe1,0xe2,0xf7,0xe7,0xe4,0xe5,0xf6,0xfa, 0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0, // 0xc0
    0xf2,0xf3,0xf4,0xf5,0xe6,0xe8,0xe3,0xfe, 0xfb,0xfd,0xff,0xf9,0xf8,0xfc,0xe0,0xf1, // 0xd0
    0xc1,0xc2,0xd7,0xc7,0xc4,0xc5,0xd6,0xda, 0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0, // 0xe0
    0xd2,0xd3,0xd4,0xd5,0xc6,0xc8,0xc3,0xde, 0xdb,0xdd,0xdf,0xd9,0xd8,0xdc,0xc0,0xd1, // 0xf0
  ];
  char[256] tbl;
  foreach (immutable idx; 0..128) {
    tbl[idx] = cast(char)idx; // 7-bit part is the same in both encodings
    tbl[idx+128] = cast(char)upperHalf[idx];
  }
  return tbl;
}();
// char toupper/tolower, koi8
/// Lowercasing table for KOI8: index is the source byte, value is its lowercase form.
/// Everything maps to itself except:
///   * 'A'..'Z' go to 'a'..'z';
///   * 0xe0..0xff go to 0xc0..0xdf;
///   * 0xb3, 0xb4, 0xb6, 0xb7 and 0xbd go to 0xa3, 0xa4, 0xa6, 0xa7 and 0xad.
/// The table is built at compile time.
public immutable char[256] koi8tolowerTable = () {
  immutable ubyte[5] extraUpper = [0xb3, 0xb4, 0xb6, 0xb7, 0xbd];
  char[256] tbl;
  foreach (immutable idx; 0..256) tbl[idx] = cast(char)idx;
  foreach (immutable idx; 0x41..0x5b) tbl[idx] = cast(char)(idx+32);
  foreach (immutable idx; 0xe0..0x100) tbl[idx] = cast(char)(idx-0x20);
  foreach (immutable ubyte code; extraUpper) tbl[code] = cast(char)(code-0x10);
  return tbl;
}();
/// Uppercasing table for KOI8: index is the source byte, value is its uppercase form.
/// Everything maps to itself except:
///   * 'a'..'z' go to 'A'..'Z';
///   * 0xc0..0xdf go to 0xe0..0xff;
///   * 0xa3, 0xa4, 0xa6, 0xa7 and 0xad go to 0xb3, 0xb4, 0xb6, 0xb7 and 0xbd.
/// The table is built at compile time.
public immutable char[256] koi8toupperTable = () {
  immutable ubyte[5] extraLower = [0xa3, 0xa4, 0xa6, 0xa7, 0xad];
  char[256] tbl;
  foreach (immutable idx; 0..256) tbl[idx] = cast(char)idx;
  foreach (immutable idx; 0x61..0x7b) tbl[idx] = cast(char)(idx-32);
  foreach (immutable idx; 0xc0..0xe0) tbl[idx] = cast(char)(idx+0x20);
  foreach (immutable ubyte code; extraLower) tbl[code] = cast(char)(code+0x10);
  return tbl;
}();
/// Bit set of KOI8 letters: bit `code%8` of byte `code/8` is set when `code` is a letter.
/// Set bits: 'A'..'Z', 'a'..'z', 0xc0..0xff, and the extra letters
/// 0xa3, 0xa4, 0xa6, 0xa7, 0xad, 0xb3, 0xb4, 0xb6, 0xb7, 0xbd.
/// The table is built at compile time.
public immutable ubyte[32] koi8alphaTable = () {
  ubyte[32] bits; // all zeroes initially
  foreach (immutable code; 0..256) {
    immutable bool isLetter =
      (code >= 'A' && code <= 'Z') ||
      (code >= 'a' && code <= 'z') ||
      code >= 0xc0 ||
      code == 0xa3 || code == 0xa4 || code == 0xa6 || code == 0xa7 || code == 0xad ||
      code == 0xb3 || code == 0xb4 || code == 0xb6 || code == 0xb7 || code == 0xbd;
    if (isLetter) bits[code/8] |= cast(ubyte)(1<<(code%8));
  }
  return bits;
}();
/// Lowercase a KOI8 byte by looking it up in `koi8tolowerTable`.
public char koi8lower (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  return koi8tolowerTable.ptr[cast(ubyte)ch];
}
/// Uppercase a KOI8 byte by looking it up in `koi8toupperTable`.
public char koi8upper (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  return koi8toupperTable.ptr[cast(ubyte)ch];
}
/// Is this KOI8 byte marked as a letter in the `koi8alphaTable` bit set?
public bool koi8isAlpha (char ch) pure nothrow @trusted @nogc {
  pragma(inline, true);
  // byte `ch/8` of the table holds the flag in bit `ch%8`
  return ((koi8alphaTable.ptr[ch>>3]&(1<<(ch&7))) != 0);
}
894 // ////////////////////////////////////////////////////////////////////////// //
/// Simple lowercasing of a code point; supports ASCII and cyrillic.
///
/// 'A'..'Z' and U+0410..U+042F are shifted up by 32, U+0400..U+040F are
/// shifted up by 0x50; anything else is returned unchanged.
public dchar simpleUniLower (immutable dchar ch) pure nothrow @safe @nogc {
  pragma(inline, true);
  return
    (ch >= 0x0400 && ch <= 0x040F) ? ch+0x0050 :
    ((ch >= 0x0410 && ch <= 0x042F) || (ch >= 'A' && ch <= 'Z')) ? ch+32 :
    ch;
}
/// Simple uppercasing of a code point; supports ASCII and cyrillic.
///
/// 'a'..'z' and U+0430..U+044F are shifted down by 32, U+0450..U+045F are
/// shifted down by 0x50; anything else is returned unchanged.
public dchar simpleUniUpper (immutable dchar ch) pure nothrow @safe @nogc {
  pragma(inline, true);
  return
    (ch >= 0x0450 && ch <= 0x045F) ? ch-0x0050 :
    ((ch >= 0x0430 && ch <= 0x044F) || (ch >= 'a' && ch <= 'z')) ? ch-32 :
    ch;
}
/// Is this code point inside U+0400..U+042F (the range `simpleUniLower()` converts)?
public bool isCyrillicUniUp (immutable dchar ch) pure nothrow @safe @nogc {
  pragma(inline, true);
  return !(ch < 0x0400 || ch > 0x042F);
}
/// Is this code point inside U+0430..U+045F (the range `simpleUniUpper()` converts)?
public bool isCyrillicUniLo (immutable dchar ch) pure nothrow @safe @nogc {
  pragma(inline, true);
  return !(ch < 0x0430 || ch > 0x045F);
}
/// Is this one of the accepted cyrillic code points?
/// Accepts U+0410..U+044F plus a handful of extra letters
/// (U+0401, U+0404, U+0406, U+0407, U+040E and U+0451, U+0454, U+0456, U+0457, U+045E);
/// rejects some cyrillic chars i'm not interested in.
public bool isValidCyrillicUni (immutable dchar ch) pure nothrow @safe @nogc {
  pragma(inline, true);
  return
    // some special cyrillic chars, uppercase
    ch == 0x0401 || ch == 0x0404 || ch == 0x0406 || ch == 0x0407 || ch == 0x040E ||
    // some special cyrillic chars, lowercase
    ch == 0x0451 || ch == 0x0454 || ch == 0x0456 || ch == 0x0457 || ch == 0x045E ||
    // the main alphabet
    (ch >= 0x0410 && ch <= 0x044F);
}
936 // ////////////////////////////////////////////////////////////////////////// //
/// Lowercase ASCII and cyrillic letters in utf-8 string.
///
/// Bytes that do not form a complete sequence, and complete sequences that
/// `isCyrillicUniUp()` rejects, are copied to the result as-is.
/// Returns `s` itself when there is nothing to convert (no high-bit bytes and
/// no uppercase ASCII letters); otherwise a new string is allocated.
public T simpleUTFToLower(T:const(char)[]) (T s) pure nothrow @trusted {
  static if (is(T == typeof(null))) {
    return null;
  } else {
    // prescan: stop at the first high-bit byte, remember if any uppercase ASCII was seen before it
    bool sawUpperAscii = false, sawHighBit = false;
    foreach (immutable char ch; s) {
      if (ch >= 128) { sawHighBit = true; break; }
      if (ch >= 'A' && ch <= 'Z') sawUpperAscii = true;
    }

    if (!sawHighBit) {
      // nothing to do
      if (!sawUpperAscii) return s;
      // only ascii
      char[] res = s.dup;
      foreach (ref char ch; res) if (ch >= 'A' && ch <= 'Z') ch += 32;
      return cast(T)res; // it is safe to cast here
    }

    // has some utf: recode
    char[] res;
    res.reserve(s.length);
    usize idx = 0;
    while (idx < s.length) {
      immutable char ch = s.ptr[idx];
      if (ch < 128) {
        res ~= cast(char)(ch >= 'A' && ch <= 'Z' ? ch+32 : ch);
        ++idx;
        continue;
      }
      // try to decode UTF-8, starting from the current byte
      Utf8DecoderFast dc;
      usize upos = idx;
      do { dc.decode(cast(ubyte)s.ptr[upos++]); } while (upos < s.length && !dc.completeOrInvalid);
      if (dc.complete && isCyrillicUniUp(dc.codepoint)) {
        // locase cyrillic, and skip the whole sequence
        immutable dchar lodch = simpleUniLower(dc.codepoint);
        char[4] buf = void;
        immutable usize elen = cast(usize)utf8Encode(buf[], lodch);
        res ~= buf[0..elen];
        idx = upos;
      } else {
        // copy only this byte; the other utf bytes will be invalid on their own, and copied as-is
        res ~= ch;
        ++idx;
      }
    }
    return cast(T)res; // it is safe to cast here
  }
}
989 // ////////////////////////////////////////////////////////////////////////// //
/// Uppercase ASCII and cyrillic letters in utf-8 string.
///
/// Bytes that do not form a complete sequence, and complete sequences that
/// `isCyrillicUniLo()` rejects, are copied to the result as-is.
/// Returns `s` itself when there is nothing to convert (no high-bit bytes and
/// no lowercase ASCII letters); otherwise a new string is allocated.
public T simpleUTFToUpper(T:const(char)[]) (T s) pure nothrow @trusted {
  static if (is(T == typeof(null))) {
    return null;
  } else {
    // prescan: stop at the first high-bit byte, remember if any lowercase ASCII was seen before it
    bool sawLowerAscii = false, sawHighBit = false;
    foreach (immutable char ch; s) {
      if (ch >= 128) { sawHighBit = true; break; }
      if (ch >= 'a' && ch <= 'z') sawLowerAscii = true;
    }

    if (!sawHighBit) {
      // nothing to do
      if (!sawLowerAscii) return s;
      // only ascii
      char[] res = s.dup;
      foreach (ref char ch; res) if (ch >= 'a' && ch <= 'z') ch -= 32;
      return cast(T)res; // it is safe to cast here
    }

    // has some utf: recode
    char[] res;
    res.reserve(s.length);
    usize idx = 0;
    while (idx < s.length) {
      immutable char ch = s.ptr[idx];
      if (ch < 128) {
        res ~= cast(char)(ch >= 'a' && ch <= 'z' ? ch-32 : ch);
        ++idx;
        continue;
      }
      // try to decode UTF-8, starting from the current byte
      Utf8DecoderFast dc;
      usize upos = idx;
      do { dc.decode(cast(ubyte)s.ptr[upos++]); } while (upos < s.length && !dc.completeOrInvalid);
      if (dc.complete && isCyrillicUniLo(dc.codepoint)) {
        // upcase cyrillic, and skip the whole sequence
        immutable dchar updch = simpleUniUpper(dc.codepoint);
        char[4] buf = void;
        immutable usize elen = cast(usize)utf8Encode(buf[], updch);
        res ~= buf[0..elen];
        idx = upos;
      } else {
        // copy only this byte; the other utf bytes will be invalid on their own, and copied as-is
        res ~= ch;
        ++idx;
      }
    }
    return cast(T)res; // it is safe to cast here
  }
}
// case conversion checks: pure ASCII input first, then input with a cyrillic letter
unittest {
  //import std.stdio; writeln(simpleUTFToLower("Alice И Miriel"));
  // lowercasing
  assert("Alice And Miriel".simpleUTFToLower == "alice and miriel");
  assert("Alice И Miriel".simpleUTFToLower == "alice и miriel");

  // uppercasing
  assert("Alice And Miriel".simpleUTFToUpper == "ALICE AND MIRIEL");
  assert("Alice и Miriel".simpleUTFToUpper == "ALICE И MIRIEL");
}
1052 // ////////////////////////////////////////////////////////////////////////// //
1053 // simple translit (including for cyrillic)
// One transliteration rule: a source code point and its ASCII replacement.
private align(1) struct TransInfo {
align(1):
  // source code point
  uint cFrom;
  // replacement: up to three ASCII characters packed into one value,
  // first character in the lowest byte (e.g. 0x484353 is "SCH")
  uint cTo;
}
1060 private static immutable TransInfo[389] translit = [
1061 TransInfo(0x00A0, 0x000020), /*   to */
1062 TransInfo(0x00B5, 0x000075), /* µ to u */
1063 TransInfo(0x00C0, 0x000041), /* À to A */
1064 TransInfo(0x00C1, 0x000041), /* Á to A */
1065 TransInfo(0x00C2, 0x000041), /* Â to A */
1066 TransInfo(0x00C3, 0x000041), /* Ã to A */
1067 TransInfo(0x00C4, 0x006541), /* Ä to Ae */
1068 TransInfo(0x00C5, 0x006141), /* Å to Aa */
1069 TransInfo(0x00C6, 0x004541), /* Æ to AE */
1070 TransInfo(0x00C7, 0x000043), /* Ç to C */
1071 TransInfo(0x00C8, 0x000045), /* È to E */
1072 TransInfo(0x00C9, 0x000045), /* É to E */
1073 TransInfo(0x00CA, 0x000045), /* Ê to E */
1074 TransInfo(0x00CB, 0x000045), /* Ë to E */
1075 TransInfo(0x00CC, 0x000049), /* Ì to I */
1076 TransInfo(0x00CD, 0x000049), /* Í to I */
1077 TransInfo(0x00CE, 0x000049), /* Î to I */
1078 TransInfo(0x00CF, 0x000049), /* Ï to I */
1079 TransInfo(0x00D0, 0x000044), /* Ð to D */
1080 TransInfo(0x00D1, 0x00004E), /* Ñ to N */
1081 TransInfo(0x00D2, 0x00004F), /* Ò to O */
1082 TransInfo(0x00D3, 0x00004F), /* Ó to O */
1083 TransInfo(0x00D4, 0x00004F), /* Ô to O */
1084 TransInfo(0x00D5, 0x00004F), /* Õ to O */
1085 TransInfo(0x00D6, 0x00654F), /* Ö to Oe */
1086 TransInfo(0x00D7, 0x000078), /* × to x */
1087 TransInfo(0x00D8, 0x00004F), /* Ø to O */
1088 TransInfo(0x00D9, 0x000055), /* Ù to U */
1089 TransInfo(0x00DA, 0x000055), /* Ú to U */
1090 TransInfo(0x00DB, 0x000055), /* Û to U */
1091 TransInfo(0x00DC, 0x006555), /* Ü to Ue */
1092 TransInfo(0x00DD, 0x000059), /* Ý to Y */
1093 TransInfo(0x00DE, 0x006854), /* Þ to Th */
1094 TransInfo(0x00DF, 0x007373), /* ß to ss */
1095 TransInfo(0x00E0, 0x000061), /* à to a */
1096 TransInfo(0x00E1, 0x000061), /* á to a */
1097 TransInfo(0x00E2, 0x000061), /* â to a */
1098 TransInfo(0x00E3, 0x000061), /* ã to a */
1099 TransInfo(0x00E4, 0x006561), /* ä to ae */
1100 TransInfo(0x00E5, 0x006161), /* å to aa */
1101 TransInfo(0x00E6, 0x006561), /* æ to ae */
1102 TransInfo(0x00E7, 0x000063), /* ç to c */
1103 TransInfo(0x00E8, 0x000065), /* è to e */
1104 TransInfo(0x00E9, 0x000065), /* é to e */
1105 TransInfo(0x00EA, 0x000065), /* ê to e */
1106 TransInfo(0x00EB, 0x000065), /* ë to e */
1107 TransInfo(0x00EC, 0x000069), /* ì to i */
1108 TransInfo(0x00ED, 0x000069), /* í to i */
1109 TransInfo(0x00EE, 0x000069), /* î to i */
1110 TransInfo(0x00EF, 0x000069), /* ï to i */
1111 TransInfo(0x00F0, 0x000064), /* ð to d */
1112 TransInfo(0x00F1, 0x00006E), /* ñ to n */
1113 TransInfo(0x00F2, 0x00006F), /* ò to o */
1114 TransInfo(0x00F3, 0x00006F), /* ó to o */
1115 TransInfo(0x00F4, 0x00006F), /* ô to o */
1116 TransInfo(0x00F5, 0x00006F), /* õ to o */
1117 TransInfo(0x00F6, 0x00656F), /* ö to oe */
1118 TransInfo(0x00F7, 0x00003A), /* ÷ to : */
1119 TransInfo(0x00F8, 0x00006F), /* ø to o */
1120 TransInfo(0x00F9, 0x000075), /* ù to u */
1121 TransInfo(0x00FA, 0x000075), /* ú to u */
1122 TransInfo(0x00FB, 0x000075), /* û to u */
1123 TransInfo(0x00FC, 0x006575), /* ü to ue */
1124 TransInfo(0x00FD, 0x000079), /* ý to y */
1125 TransInfo(0x00FE, 0x006874), /* þ to th */
1126 TransInfo(0x00FF, 0x000079), /* ÿ to y */
1127 TransInfo(0x0100, 0x000041), /* Ā to A */
1128 TransInfo(0x0101, 0x000061), /* ā to a */
1129 TransInfo(0x0102, 0x000041), /* Ă to A */
1130 TransInfo(0x0103, 0x000061), /* ă to a */
1131 TransInfo(0x0104, 0x000041), /* Ą to A */
1132 TransInfo(0x0105, 0x000061), /* ą to a */
1133 TransInfo(0x0106, 0x000043), /* Ć to C */
1134 TransInfo(0x0107, 0x000063), /* ć to c */
1135 TransInfo(0x0108, 0x006843), /* Ĉ to Ch */
1136 TransInfo(0x0109, 0x006863), /* ĉ to ch */
1137 TransInfo(0x010A, 0x000043), /* Ċ to C */
1138 TransInfo(0x010B, 0x000063), /* ċ to c */
1139 TransInfo(0x010C, 0x000043), /* Č to C */
1140 TransInfo(0x010D, 0x000063), /* č to c */
1141 TransInfo(0x010E, 0x000044), /* Ď to D */
1142 TransInfo(0x010F, 0x000064), /* ď to d */
1143 TransInfo(0x0110, 0x000044), /* Đ to D */
1144 TransInfo(0x0111, 0x000064), /* đ to d */
1145 TransInfo(0x0112, 0x000045), /* Ē to E */
1146 TransInfo(0x0113, 0x000065), /* ē to e */
1147 TransInfo(0x0114, 0x000045), /* Ĕ to E */
1148 TransInfo(0x0115, 0x000065), /* ĕ to e */
1149 TransInfo(0x0116, 0x000045), /* Ė to E */
1150 TransInfo(0x0117, 0x000065), /* ė to e */
1151 TransInfo(0x0118, 0x000045), /* Ę to E */
1152 TransInfo(0x0119, 0x000065), /* ę to e */
1153 TransInfo(0x011A, 0x000045), /* Ě to E */
1154 TransInfo(0x011B, 0x000065), /* ě to e */
1155 TransInfo(0x011C, 0x006847), /* Ĝ to Gh */
1156 TransInfo(0x011D, 0x006867), /* ĝ to gh */
1157 TransInfo(0x011E, 0x000047), /* Ğ to G */
1158 TransInfo(0x011F, 0x000067), /* ğ to g */
1159 TransInfo(0x0120, 0x000047), /* Ġ to G */
1160 TransInfo(0x0121, 0x000067), /* ġ to g */
1161 TransInfo(0x0122, 0x000047), /* Ģ to G */
1162 TransInfo(0x0123, 0x000067), /* ģ to g */
1163 TransInfo(0x0124, 0x006848), /* Ĥ to Hh */
1164 TransInfo(0x0125, 0x006868), /* ĥ to hh */
1165 TransInfo(0x0126, 0x000048), /* Ħ to H */
1166 TransInfo(0x0127, 0x000068), /* ħ to h */
1167 TransInfo(0x0128, 0x000049), /* Ĩ to I */
1168 TransInfo(0x0129, 0x000069), /* ĩ to i */
1169 TransInfo(0x012A, 0x000049), /* Ī to I */
1170 TransInfo(0x012B, 0x000069), /* ī to i */
1171 TransInfo(0x012C, 0x000049), /* Ĭ to I */
1172 TransInfo(0x012D, 0x000069), /* ĭ to i */
1173 TransInfo(0x012E, 0x000049), /* Į to I */
1174 TransInfo(0x012F, 0x000069), /* į to i */
1175 TransInfo(0x0130, 0x000049), /* İ to I */
1176 TransInfo(0x0131, 0x000069), /* ı to i */
1177 TransInfo(0x0132, 0x004A49), /* IJ to IJ */
1178 TransInfo(0x0133, 0x006A69), /* ij to ij */
1179 TransInfo(0x0134, 0x00684A), /* Ĵ to Jh */
1180 TransInfo(0x0135, 0x00686A), /* ĵ to jh */
1181 TransInfo(0x0136, 0x00004B), /* Ķ to K */
1182 TransInfo(0x0137, 0x00006B), /* ķ to k */
1183 TransInfo(0x0138, 0x00006B), /* ĸ to k */
1184 TransInfo(0x0139, 0x00004C), /* Ĺ to L */
1185 TransInfo(0x013A, 0x00006C), /* ĺ to l */
1186 TransInfo(0x013B, 0x00004C), /* Ļ to L */
1187 TransInfo(0x013C, 0x00006C), /* ļ to l */
1188 TransInfo(0x013D, 0x00004C), /* Ľ to L */
1189 TransInfo(0x013E, 0x00006C), /* ľ to l */
1190 TransInfo(0x013F, 0x002E4C), /* Ŀ to L. */
1191 TransInfo(0x0140, 0x002E6C), /* ŀ to l. */
1192 TransInfo(0x0141, 0x00004C), /* Ł to L */
1193 TransInfo(0x0142, 0x00006C), /* ł to l */
1194 TransInfo(0x0143, 0x00004E), /* Ń to N */
1195 TransInfo(0x0144, 0x00006E), /* ń to n */
1196 TransInfo(0x0145, 0x00004E), /* Ņ to N */
1197 TransInfo(0x0146, 0x00006E), /* ņ to n */
1198 TransInfo(0x0147, 0x00004E), /* Ň to N */
1199 TransInfo(0x0148, 0x00006E), /* ň to n */
1200 TransInfo(0x0149, 0x006E27), /* ʼn to 'n */
1201 TransInfo(0x014A, 0x00474E), /* Ŋ to NG */
1202 TransInfo(0x014B, 0x00676E), /* ŋ to ng */
1203 TransInfo(0x014C, 0x00004F), /* Ō to O */
1204 TransInfo(0x014D, 0x00006F), /* ō to o */
1205 TransInfo(0x014E, 0x00004F), /* Ŏ to O */
1206 TransInfo(0x014F, 0x00006F), /* ŏ to o */
1207 TransInfo(0x0150, 0x00004F), /* Ő to O */
1208 TransInfo(0x0151, 0x00006F), /* ő to o */
1209 TransInfo(0x0152, 0x00454F), /* Πto OE */
1210 TransInfo(0x0153, 0x00656F), /* œ to oe */
1211 TransInfo(0x0154, 0x000052), /* Ŕ to R */
1212 TransInfo(0x0155, 0x000072), /* ŕ to r */
1213 TransInfo(0x0156, 0x000052), /* Ŗ to R */
1214 TransInfo(0x0157, 0x000072), /* ŗ to r */
1215 TransInfo(0x0158, 0x000052), /* Ř to R */
1216 TransInfo(0x0159, 0x000072), /* ř to r */
1217 TransInfo(0x015A, 0x000053), /* Ś to S */
1218 TransInfo(0x015B, 0x000073), /* ś to s */
1219 TransInfo(0x015C, 0x006853), /* Ŝ to Sh */
1220 TransInfo(0x015D, 0x006873), /* ŝ to sh */
1221 TransInfo(0x015E, 0x000053), /* Ş to S */
1222 TransInfo(0x015F, 0x000073), /* ş to s */
1223 TransInfo(0x0160, 0x000053), /* Š to S */
1224 TransInfo(0x0161, 0x000073), /* š to s */
1225 TransInfo(0x0162, 0x000054), /* Ţ to T */
1226 TransInfo(0x0163, 0x000074), /* ţ to t */
1227 TransInfo(0x0164, 0x000054), /* Ť to T */
1228 TransInfo(0x0165, 0x000074), /* ť to t */
1229 TransInfo(0x0166, 0x000054), /* Ŧ to T */
1230 TransInfo(0x0167, 0x000074), /* ŧ to t */
1231 TransInfo(0x0168, 0x000055), /* Ũ to U */
1232 TransInfo(0x0169, 0x000075), /* ũ to u */
1233 TransInfo(0x016A, 0x000055), /* Ū to U */
1234 TransInfo(0x016B, 0x000075), /* ū to u */
1235 TransInfo(0x016C, 0x000055), /* Ŭ to U */
1236 TransInfo(0x016D, 0x000075), /* ŭ to u */
1237 TransInfo(0x016E, 0x000055), /* Ů to U */
1238 TransInfo(0x016F, 0x000075), /* ů to u */
1239 TransInfo(0x0170, 0x000055), /* Ű to U */
1240 TransInfo(0x0171, 0x000075), /* ű to u */
1241 TransInfo(0x0172, 0x000055), /* Ų to U */
1242 TransInfo(0x0173, 0x000075), /* ų to u */
1243 TransInfo(0x0174, 0x000057), /* Ŵ to W */
1244 TransInfo(0x0175, 0x000077), /* ŵ to w */
1245 TransInfo(0x0176, 0x000059), /* Ŷ to Y */
1246 TransInfo(0x0177, 0x000079), /* ŷ to y */
1247 TransInfo(0x0178, 0x000059), /* Ÿ to Y */
1248 TransInfo(0x0179, 0x00005A), /* Ź to Z */
1249 TransInfo(0x017A, 0x00007A), /* ź to z */
1250 TransInfo(0x017B, 0x00005A), /* Ż to Z */
1251 TransInfo(0x017C, 0x00007A), /* ż to z */
1252 TransInfo(0x017D, 0x00005A), /* Ž to Z */
1253 TransInfo(0x017E, 0x00007A), /* ž to z */
1254 TransInfo(0x017F, 0x000073), /* ſ to s */
1255 TransInfo(0x0192, 0x000066), /* ƒ to f */
1256 TransInfo(0x0218, 0x000053), /* Ș to S */
1257 TransInfo(0x0219, 0x000073), /* ș to s */
1258 TransInfo(0x021A, 0x000054), /* Ț to T */
1259 TransInfo(0x021B, 0x000074), /* ț to t */
1260 TransInfo(0x0386, 0x000041), /* Ά to A */
1261 TransInfo(0x0388, 0x000045), /* Έ to E */
1262 TransInfo(0x0389, 0x000049), /* Ή to I */
1263 TransInfo(0x038A, 0x000049), /* Ί to I */
1264 TransInfo(0x038C, 0x00004f), /* Ό to O */
1265 TransInfo(0x038E, 0x000059), /* Ύ to Y */
1266 TransInfo(0x038F, 0x00004f), /* Ώ to O */
1267 TransInfo(0x0390, 0x000069), /* ΐ to i */
1268 TransInfo(0x0391, 0x000041), /* Α to A */
1269 TransInfo(0x0392, 0x000042), /* Β to B */
1270 TransInfo(0x0393, 0x000047), /* Γ to G */
1271 TransInfo(0x0394, 0x000044), /* Δ to D */
1272 TransInfo(0x0395, 0x000045), /* Ε to E */
1273 TransInfo(0x0396, 0x00005a), /* Ζ to Z */
1274 TransInfo(0x0397, 0x000049), /* Η to I */
1275 TransInfo(0x0398, 0x006854), /* Θ to Th */
1276 TransInfo(0x0399, 0x000049), /* Ι to I */
1277 TransInfo(0x039A, 0x00004b), /* Κ to K */
1278 TransInfo(0x039B, 0x00004c), /* Λ to L */
1279 TransInfo(0x039C, 0x00004d), /* Μ to M */
1280 TransInfo(0x039D, 0x00004e), /* Ν to N */
1281 TransInfo(0x039E, 0x000058), /* Ξ to X */
1282 TransInfo(0x039F, 0x00004f), /* Ο to O */
1283 TransInfo(0x03A0, 0x000050), /* Π to P */
1284 TransInfo(0x03A1, 0x000052), /* Ρ to R */
1285 TransInfo(0x03A3, 0x000053), /* Σ to S */
1286 TransInfo(0x03A4, 0x000054), /* Τ to T */
1287 TransInfo(0x03A5, 0x000059), /* Υ to Y */
1288 TransInfo(0x03A6, 0x000046), /* Φ to F */
1289 TransInfo(0x03A7, 0x006843), /* Χ to Ch */
1290 TransInfo(0x03A8, 0x007350), /* Ψ to Ps */
1291 TransInfo(0x03A9, 0x00004f), /* Ω to O */
1292 TransInfo(0x03AA, 0x000049), /* Ϊ to I */
1293 TransInfo(0x03AB, 0x000059), /* Ϋ to Y */
1294 TransInfo(0x03AC, 0x000061), /* ά to a */
1295 TransInfo(0x03AD, 0x000065), /* έ to e */
1296 TransInfo(0x03AE, 0x000069), /* ή to i */
1297 TransInfo(0x03AF, 0x000069), /* ί to i */
1298 TransInfo(0x03B1, 0x000061), /* α to a */
1299 TransInfo(0x03B2, 0x000062), /* β to b */
1300 TransInfo(0x03B3, 0x000067), /* γ to g */
1301 TransInfo(0x03B4, 0x000064), /* δ to d */
1302 TransInfo(0x03B5, 0x000065), /* ε to e */
1303 TransInfo(0x03B6, 0x00007a), /* ζ to z */
1304 TransInfo(0x03B7, 0x000069), /* η to i */
1305 TransInfo(0x03B8, 0x006874), /* θ to th */
1306 TransInfo(0x03B9, 0x000069), /* ι to i */
1307 TransInfo(0x03BA, 0x00006b), /* κ to k */
1308 TransInfo(0x03BB, 0x00006c), /* λ to l */
1309 TransInfo(0x03BC, 0x00006d), /* μ to m */
1310 TransInfo(0x03BD, 0x00006e), /* ν to n */
1311 TransInfo(0x03BE, 0x000078), /* ξ to x */
1312 TransInfo(0x03BF, 0x00006f), /* ο to o */
1313 TransInfo(0x03C0, 0x000070), /* π to p */
1314 TransInfo(0x03C1, 0x000072), /* ρ to r */
1315 TransInfo(0x03C3, 0x000073), /* σ to s */
1316 TransInfo(0x03C4, 0x000074), /* τ to t */
1317 TransInfo(0x03C5, 0x000079), /* υ to y */
1318 TransInfo(0x03C6, 0x000066), /* φ to f */
1319 TransInfo(0x03C7, 0x006863), /* χ to ch */
1320 TransInfo(0x03C8, 0x007370), /* ψ to ps */
1321 TransInfo(0x03C9, 0x00006f), /* ω to o */
1322 TransInfo(0x03CA, 0x000069), /* ϊ to i */
1323 TransInfo(0x03CB, 0x000079), /* ϋ to y */
1324 TransInfo(0x03CC, 0x00006f), /* ό to o */
1325 TransInfo(0x03CD, 0x000079), /* ύ to y */
1326 TransInfo(0x03CE, 0x000069), /* ώ to i */
1327 TransInfo(0x0400, 0x004549), /* Ѐ to IE */
1328 TransInfo(0x0401, 0x004f59), /* Ё to YO */
1329 TransInfo(0x0402, 0x000044), /* Ђ to D */
1330 TransInfo(0x0403, 0x000047), /* Ѓ to G */
1331 TransInfo(0x0404, 0x000045), /* Є to E */
1332 TransInfo(0x0405, 0x00005a), /* Ѕ to Z */
1333 TransInfo(0x0406, 0x000049), /* І to I */
1334 TransInfo(0x0407, 0x004959), /* Ї to YI */
1335 TransInfo(0x0408, 0x00004a), /* Ј to J */
1336 TransInfo(0x0409, 0x000049), /* Љ to I */
1337 TransInfo(0x040A, 0x00004e), /* Њ to N */
1338 TransInfo(0x040B, 0x000044), /* Ћ to D */
1339 TransInfo(0x040C, 0x00004b), /* Ќ to K */
1340 TransInfo(0x040D, 0x000049), /* Ѝ to I */
1341 TransInfo(0x040E, 0x000056), /* Ў to V */
1342 TransInfo(0x040F, 0x000044), /* Џ to D */
1343 TransInfo(0x0410, 0x000041), /* А to A */
1344 TransInfo(0x0411, 0x000042), /* Б to B */
1345 TransInfo(0x0412, 0x000056), /* В to V */
1346 TransInfo(0x0413, 0x000047), /* Г to G */
1347 TransInfo(0x0414, 0x000044), /* Д to D */
1348 TransInfo(0x0415, 0x000045), /* Е to E */
1349 TransInfo(0x0416, 0x00485a), /* Ж to ZH */
1350 TransInfo(0x0417, 0x00005a), /* З to Z */
1351 TransInfo(0x0418, 0x000049), /* И to I */
1352 TransInfo(0x0419, 0x00004a), /* Й to J */
1353 TransInfo(0x041A, 0x00004b), /* К to K */
1354 TransInfo(0x041B, 0x00004c), /* Л to L */
1355 TransInfo(0x041C, 0x00004d), /* М to M */
1356 TransInfo(0x041D, 0x00004e), /* Н to N */
1357 TransInfo(0x041E, 0x00004f), /* О to O */
1358 TransInfo(0x041F, 0x000050), /* П to P */
1359 TransInfo(0x0420, 0x000052), /* Р to R */
1360 TransInfo(0x0421, 0x000053), /* С to S */
1361 TransInfo(0x0422, 0x000054), /* Т to T */
1362 TransInfo(0x0423, 0x000055), /* У to U */
1363 TransInfo(0x0424, 0x000046), /* Ф to F */
1364 TransInfo(0x0425, 0x000048), /* Х to H */
1365 TransInfo(0x0426, 0x004354), /* Ц to TC */
1366 TransInfo(0x0427, 0x004843), /* Ч to CH */
1367 TransInfo(0x0428, 0x004853), /* Ш to SH */
1368 TransInfo(0x0429, 0x484353), /* Щ to SCH */
1369 TransInfo(0x042A, 0x000060), /* Ъ to ` */
1370 TransInfo(0x042B, 0x000059), /* Ы to Y */
1371 TransInfo(0x042C, 0x000027), /* Ь to ' */
1372 TransInfo(0x042D, 0x000045), /* Э to E */
1373 TransInfo(0x042E, 0x00554a), /* Ю to JU */
1374 TransInfo(0x042F, 0x00414a), /* Я to JA */
1375 TransInfo(0x0430, 0x000061), /* а to a */
1376 TransInfo(0x0431, 0x000062), /* б to b */
1377 TransInfo(0x0432, 0x000076), /* в to v */
1378 TransInfo(0x0433, 0x000067), /* г to g */
1379 TransInfo(0x0434, 0x000064), /* д to d */
1380 TransInfo(0x0435, 0x000065), /* е to e */
1381 TransInfo(0x0436, 0x00687a), /* ж to zh */
1382 TransInfo(0x0437, 0x00007a), /* з to z */
1383 TransInfo(0x0438, 0x000069), /* и to i */
1384 TransInfo(0x0439, 0x00006a), /* й to j */
1385 TransInfo(0x043A, 0x00006b), /* к to k */
1386 TransInfo(0x043B, 0x00006c), /* л to l */
1387 TransInfo(0x043C, 0x00006d), /* м to m */
1388 TransInfo(0x043D, 0x00006e), /* н to n */
1389 TransInfo(0x043E, 0x00006f), /* о to o */
1390 TransInfo(0x043F, 0x000070), /* п to p */
1391 TransInfo(0x0440, 0x000072), /* р to r */
1392 TransInfo(0x0441, 0x000073), /* с to s */
1393 TransInfo(0x0442, 0x000074), /* т to t */
1394 TransInfo(0x0443, 0x000075), /* у to u */
1395 TransInfo(0x0444, 0x000066), /* ф to f */
1396 TransInfo(0x0445, 0x000068), /* х to h */
1397 TransInfo(0x0446, 0x006374), /* ц to tc */
1398 TransInfo(0x0447, 0x006863), /* ч to ch */
1399 TransInfo(0x0448, 0x006873), /* ш to sh */
1400 TransInfo(0x0449, 0x686373), /* щ to sch */
1401 TransInfo(0x044A, 0x000060), /* ъ to ` */
1402 TransInfo(0x044B, 0x000079), /* ы to y */
1403 TransInfo(0x044C, 0x000027), /* ь to ' */
1404 TransInfo(0x044D, 0x000065), /* э to e */
1405 TransInfo(0x044E, 0x00756a), /* ю to ju */
1406 TransInfo(0x044F, 0x00616a), /* я to ja */
1407 TransInfo(0x0450, 0x006569), /* ѐ to ie */
1408 TransInfo(0x0451, 0x006f79), /* ё to yo */
1409 TransInfo(0x0452, 0x000064), /* ђ to d */
1410 TransInfo(0x0453, 0x000067), /* ѓ to g */
1411 TransInfo(0x0454, 0x000065), /* є to e */
1412 TransInfo(0x0455, 0x00007a), /* ѕ to z */
1413 TransInfo(0x0456, 0x000069), /* і to i */
1414 TransInfo(0x0457, 0x006979), /* ї to yi */
1415 TransInfo(0x0458, 0x00006a), /* ј to j */
1416 TransInfo(0x0459, 0x000069), /* љ to i */
1417 TransInfo(0x045A, 0x00006e), /* њ to n */
1418 TransInfo(0x045B, 0x000064), /* ћ to d */
1419 TransInfo(0x045C, 0x00006b), /* ќ to k */
1420 TransInfo(0x045D, 0x000069), /* ѝ to i */
1421 TransInfo(0x045E, 0x000076), /* ў to v */
1422 TransInfo(0x045F, 0x000064), /* џ to d */
1423 TransInfo(0x1E02, 0x000042), /* Ḃ to B */
1424 TransInfo(0x1E03, 0x000062), /* ḃ to b */
1425 TransInfo(0x1E0A, 0x000044), /* Ḋ to D */
1426 TransInfo(0x1E0B, 0x000064), /* ḋ to d */
1427 TransInfo(0x1E1E, 0x000046), /* Ḟ to F */
1428 TransInfo(0x1E1F, 0x000066), /* ḟ to f */
1429 TransInfo(0x1E40, 0x00004D), /* Ṁ to M */
1430 TransInfo(0x1E41, 0x00006D), /* ṁ to m */
1431 TransInfo(0x1E56, 0x000050), /* Ṗ to P */
1432 TransInfo(0x1E57, 0x000070), /* ṗ to p */
1433 TransInfo(0x1E60, 0x000053), /* Ṡ to S */
1434 TransInfo(0x1E61, 0x000073), /* ṡ to s */
1435 TransInfo(0x1E6A, 0x000054), /* Ṫ to T */
1436 TransInfo(0x1E6B, 0x000074), /* ṫ to t */
1437 TransInfo(0x1E80, 0x000057), /* Ẁ to W */
1438 TransInfo(0x1E81, 0x000077), /* ẁ to w */
1439 TransInfo(0x1E82, 0x000057), /* Ẃ to W */
1440 TransInfo(0x1E83, 0x000077), /* ẃ to w */
1441 TransInfo(0x1E84, 0x000057), /* Ẅ to W */
1442 TransInfo(0x1E85, 0x000077), /* ẅ to w */
1443 TransInfo(0x1EF2, 0x000059), /* Ỳ to Y */
1444 TransInfo(0x1EF3, 0x000079), /* ỳ to y */
1445 TransInfo(0xFB00, 0x006666), /* ff to ff */
1446 TransInfo(0xFB01, 0x006966), /* fi to fi */
1447 TransInfo(0xFB02, 0x006C66), /* fl to fl */
1448 TransInfo(0xFB05, 0x007473), /* ſt to st */
1449 TransInfo(0xFB06, 0x007473), /* st to st */
/// Binary-searches the `translit` table for an entry whose `cFrom` equals `src`.
/// The search relies on the table being ordered by ascending `cFrom`.
/// Returns the packed replacement (`cTo`) of the matching entry, or 0 if there is none.
private uint findTranslation (const uint src) pure nothrow @trusted @nogc {
  immutable uint last = cast(uint)translit.length-1;
  // a code point outside the range spanned by the table cannot match
  if (src < translit.ptr[0].cFrom || src > translit.ptr[last].cFrom) return 0;
  uint lo = 0, hi = last;
  // shrink [lo, hi] until exactly one candidate is left
  while (lo < hi) {
    immutable uint mid = (lo+hi)/2; // lo <= mid < hi, so the range always shrinks
    if (src > translit.ptr[mid].cFrom) {
      lo = mid+1;
    } else {
      hi = mid;
    }
  }
  // equality is tested only once, after the search has converged
  if (translit.ptr[lo].cFrom != src) return 0;
  return translit.ptr[lo].cTo;
}
/**
 * Transliterate a UTF-8 string towards ASCII.
 *
 * Each non-ASCII code point that has an entry in the transliteration table
 * is replaced with its substitute of one to three characters. When a byte
 * >= 128 does not start a complete UTF-8 sequence, or the decoded code point
 * has no table entry, that byte is copied to the output unchanged and
 * processing resumes at the next byte.
 *
 * Params:
 *   s = input string, expected to be UTF-8
 *
 * Returns:
 *   `s` itself when it contains no bytes >= 128; otherwise a newly built
 *   string, which may contain more characters than the input.
 */
public T utfTranslit(T:const(char)[]) (T s) nothrow @trusted {
  static if (is(T == typeof(null))) {
    return null;
  } else {
    // fast path: a string without high bytes is returned untouched
    bool hasHighBytes = false;
    foreach (immutable char c; s) {
      if (c >= 128) { hasHighBytes = true; break; }
    }
    if (!hasHighBytes) return s;

    char[] outbuf;
    outbuf.reserve(s.length+16);
    usize pos = 0;
    while (pos < s.length) {
      immutable usize startPos = pos;
      immutable char lead = s.ptr[pos++];
      if (lead < 128) { outbuf ~= lead; continue; }
      // feed bytes to the decoder until it reports a complete or an invalid
      // sequence, or the input runs out
      Utf8DecoderFast decoder;
      usize scan = startPos;
      do {
        decoder.decode(cast(ubyte)s.ptr[scan++]);
      } while (scan < s.length && !decoder.completeOrInvalid);
      // 0 means "nothing to substitute": either decoding failed or the table has no entry
      uint packed = (decoder.complete ? findTranslation(decoder.codepoint) : 0);
      if (packed == 0) {
        // copy only the lead byte; the following bytes are visited by the next
        // iterations (original author's note: they will be invalid on their
        // own, and copied as-is)
        outbuf ~= lead;
        continue;
      }
      // the replacement is packed low byte first; the first character is always
      // emitted, up to two more follow until a zero byte is met
      outbuf ~= cast(char)(packed&0xff);
      foreach (immutable _; 0..2) {
        packed >>= 8;
        if ((packed&0xff) == 0) break;
        outbuf ~= cast(char)(packed&0xff);
      }
      // the whole source sequence has been consumed
      pos = scan;
    }
    return cast(T)outbuf; // it is safe to cast here
  }
}
1525 unittest {
1526 assert(utfTranslit("хУй и пИЗДа!") == "hUj i pIZDa!");
1527 assert(utfTranslit("вставай товаРИЩ, не время для ПОТЕХИ, ш-Ш-щ-Щ-ц-Ц-ё-Ё-й-Й") == "vstavaj tovaRISCH, ne vremja dlja POTEHI, sh-SH-sch-SCH-tc-TC-yo-YO-j-J");