src/utf8dec.c

   1 // ////////////////////////////////////////////////////////////////////////// //
   2 // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
   3 // for some recovery strategies it may be useful to determine the number of bytes expected.
   4 static const uint8_t sxed_utf8dfa[0x16c] = {
   5   // maps bytes to character classes
   6   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 00-0f
   7   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 10-1f
   8   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 20-2f
   9   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 30-3f
  10   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 40-4f
  11   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 50-5f
  12   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 60-6f
  13   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 70-7f
  14   0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, // 80-8f
  15   0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09, // 90-9f
  16   0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // a0-af
  17   0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // b0-bf
  18   0x08,0x08,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // c0-cf
  19   0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // d0-df
  20   0x0a,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x04,0x03,0x03, // e0-ef
  21   0x0b,0x06,0x06,0x06,0x05,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, // f0-ff
  22   // maps a combination of a state of the automaton and a character class to a state
  23   0x00,0x0c,0x18,0x24,0x3c,0x60,0x54,0x0c,0x0c,0x0c,0x30,0x48,0x0c,0x0c,0x0c,0x0c, // 100-10f
  24   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x00, // 110-11f
  25   0x0c,0x00,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x18,0x0c,0x0c, // 120-12f
  26   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c, // 130-13f
  27   0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x24, // 140-14f
  28   0x0c,0x24,0x0c,0x0c,0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x24,0x0c,0x24,0x0c,0x0c, // 150-15f
  29   0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c
  30 };
  31
  32
// replacement codepoint (U+FFFD); `sxed_utf8d_consume()` returns it in place of invalid utf-8 input
#define SXED_UTF8_REPLACEMENT_CP  (0xFFFDU)
  35
  36
  37 // ////////////////////////////////////////////////////////////////////////// //
  38 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
  39 // is the given codepoint valid?
  40 int sxed_utf8_valid_cp (uint32_t cp) { return (cp < 0xD800U || (cp > 0xDFFFU && cp <= 0x10FFFFU)); }
  41
  42 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
  43 // is the given codepoint considered printable?
  44 // i restrict it to some useful subset
  45 // unifuck is unifucked, but i hope that i sorted out all idiotic diactritics and control chars
  46 int sxed_utf8_printable_cp (uint32_t cp) {
  47   return
  48     // basic latin
  49     cp <= 0x024FU ||
  50     // some greek, and cyrillic w/o combiners
  51     (cp >= 0x0390U && cp <= 0x0482U) ||
  52     (cp >= 0x048AU && cp <= 0x052FU) ||
  53     // runic (just for fun)
  54     (cp >= 0x16A0U && cp <= 0x168FU) ||
  55     // latin extended additional
  56     (cp >= 0x1E00U && cp <= 0x1EFFU) ||
  57     // some general punctuation, extensions, etc.
  58     (cp >= 0x2000U && cp <= 0x2C7FU) ||
  59     // supplemental punctuation
  60     (cp >= 0x2E00U && cp <= 0x2E42U) ||
  61     // more latin extended
  62     (cp >= 0xAB30U && cp <= 0xAB65U);
  63 }
  64
  65
  66 // ////////////////////////////////////////////////////////////////////////// //
  67 // DFA-based utf-8 decoder with only 32 bits of state
  68 // "overlong" variants accept "zero overlong" (0xC0 0x80), but are slower
  69 // normal variants reject overlongs
  70 //
  71 // use like this:
  72 //
  73 //   uint32_t cp = 0; // 0 is important! (also, assigning zero can be used to reset the decoder)
  74 //   for (;;) {
  75 //     cp = sxed_utf8d_consume(cp, next_byte());
  76 //     if (sxed_utf8_valid_cp(cp)) {
  77 //        ...process unicode codepoint from `cp`...
  78 //     }
  79 //   }
  80 //
  81 // or:
  82 //
  83 //   uint32_t cp = 0; // 0 is important! (also, assigning zero can be used to reset the decoder)
  84 //   for (;;) {
  85 //     cp = sxed_utf8d_consume_ex(cp, next_byte());
  86 //     if (sxed_utf8_valid_cp(cp)) {
  87 //        ...process unicode codepoint from `cp`...
  88 //     } else if (sxed_utf8d_fuckedup(cp)) {
  89 //        ...we found an invalid utf-8 sequence...
  90 //     }
  91 //   }
  92
  93 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
  94 // never reaches `invalid` state, returns `replacement` for invalid chars
  95 // returns invalid codepoint while it is "in progress" (i.e. result > 0x10FFFFU)
  96 uint32_t sxed_utf8d_consume (uint32_t cp, const char ch) {
  97   uint8_t state = cp>>24;
  98   if (state == 12/*State.Reject*/) state = 0/*State.Accept*/; // invalid utf-8 sequence was hit, restart (just in case)
  99   const uint8_t type = sxed_utf8dfa[((uint8_t)ch)];
 100   cp = (state /*!= State.Accept*/ ? (((uint8_t)ch)&0x3fU)|((cp&~0xff000000U)<<6) : (0xff>>type)&((uint8_t)ch));
 101   if ((state = sxed_utf8dfa[256+state+type]) == 12/*State.Reject*/) return SXED_UTF8_REPLACEMENT_CP; // invalid utf-8 sequence
 102   return
 103     state ? cp|((uint32_t)state<<24) :
 104     sxed_utf8_valid_cp(cp) ? cp : //k8: i don't remember if this is required, but it's better be safe than sorry
 105     SXED_UTF8_REPLACEMENT_CP;
 106 }
 107
 108 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
 109 // can reach `invalid` state
 110 // returns invalid codepoint while it is "in progress" (i.e. result > 0x10FFFFU)
 111 // on invalid utf-8 sequence, returns special value that can be tested with `sxed_utf8d_fuckedup()`
 112 // codepoint is undefined in this case (but it won't pass `sxed_utf8_valid_cp()`)
 113 // next call will restart decoding
 114 uint32_t sxed_utf8d_consume_ex (uint32_t cp, const char ch) {
 115   uint8_t state = cp>>24;
 116   if (state == 12/*State.Reject*/) state = 0/*State.Accept*/; // invalid utf-8 sequence was hit, restart (just in case)
 117   const uint8_t type = sxed_utf8dfa[((uint8_t)ch)];
 118   cp = (state /*!= State.Accept*/ ? (((uint8_t)ch)&0x3fU)|((cp&~0xff000000U)<<6) : (0xff>>type)&((uint8_t)ch));
 119   if ((state = sxed_utf8dfa[256+state+type]) == 12/*State.Reject*/) return 12U<<24; // invalid utf-8 sequence
 120   return
 121     state ? cp|((uint32_t)state<<24) :
 122     sxed_utf8_valid_cp(cp) ? cp : //k8: i don't remember if this is required, but it's better be safe than sorry
 123     12U<<24; // invalid utf-8 sequence
 124 }
 125
 126 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
 127 // check value returned from `sxed_utf8d_consume_ex()` with this
 128 // returns non-zero if last decoding operation hit invalid utf-8 sequence
 129 // codepoint is undefined in this case (but it won't pass `sxed_utf8_valid_cp()`)
 130 int sxed_utf8d_fuckedup (const uint32_t cp) { return ((cp>>24) == 12); }