gcc/go/gofrontend/go-encode-id.cc

   1 // go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks
   2
   3 // Copyright 2016 The Go Authors. All rights reserved.
   4 // Use of this source code is governed by a BSD-style
   5 // license that can be found in the LICENSE file.
   6
   7 #include "go-system.h"
   8
   9 #include "gogo.h"
  10 #include "go-location.h"
  11 #include "go-linemap.h"
  12 #include "go-encode-id.h"
  13 #include "lex.h"
  14
  15 // Return whether the character c can appear in a name that we are
  16 // encoding.  We only permit ASCII alphanumeric characters.
  17
  18 static bool
  19 char_needs_encoding(char c)
  20 {
  21   switch (c)
  22     {
  23     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  24     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  25     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  26     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  27     case 'Y': case 'Z':
  28     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  29     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  30     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  31     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  32     case 'y': case 'z':
  33     case '0': case '1': case '2': case '3': case '4':
  34     case '5': case '6': case '7': case '8': case '9':
  35       return false;
  36     default:
  37       return true;
  38     }
  39 }
  40
  41 // Return whether the identifier needs to be translated because it
  42 // contains non-ASCII characters.
  43
  44 bool
  45 go_id_needs_encoding(const std::string& str)
  46 {
  47   for (std::string::const_iterator p = str.begin();
  48        p != str.end();
  49        ++p)
  50     if (char_needs_encoding(*p))
  51       return true;
  52   return false;
  53 }
  54
  55 // Map from characters to the underscore encoding for them.
  56
  57 class Special_char_code
  58 {
  59  public:
  60   Special_char_code();
  61
  62   // Return the simple underscore encoding for C, or 0 if none.
  63   char
  64   code_for(unsigned int c) const
  65   {
  66     if (c <= 127)
  67       return this->codes_[c];
  68     return 0;
  69   }
  70
  71  private:
  72   // Encodings for characters.
  73   char codes_[128];
  74 };
  75
  76 // Construct the underscore encoding map.
  77
  78 Special_char_code::Special_char_code()
  79 {
  80   memset(this->codes_, 0, sizeof this->codes_);
  81   this->codes_['_'] = '_';
  82   this->codes_['.'] = '0';
  83   this->codes_['/'] = '1';
  84   this->codes_['*'] = '2';
  85   this->codes_[','] = '3';
  86   this->codes_['{'] = '4';
  87   this->codes_['}'] = '5';
  88   this->codes_['['] = '6';
  89   this->codes_[']'] = '7';
  90   this->codes_['('] = '8';
  91   this->codes_[')'] = '9';
  92   this->codes_['"'] = 'a';
  93   this->codes_[' '] = 'b';
  94   this->codes_[';'] = 'c';
  95 }
  96
  97 // The singleton Special_char_code.
  98
  99 static const Special_char_code special_char_code;
 100
 101 // Pull the next UTF-8 character out of P and store it in *PC.  Return
 102 // the number of bytes read.
 103
 104 static size_t
 105 fetch_utf8_char(const char* p, unsigned int* pc)
 106 {
 107   unsigned char c = *p;
 108   if ((c & 0x80) == 0)
 109     {
 110       *pc = c;
 111       return 1;
 112     }
 113   size_t len = 0;
 114   while ((c & 0x80) != 0)
 115     {
 116       ++len;
 117       c <<= 1;
 118     }
 119   unsigned int rc = *p & ((1 << (7 - len)) - 1);
 120   for (size_t i = 1; i < len; i++)
 121     {
 122       unsigned int u = p[i];
 123       rc <<= 6;
 124       rc |= u & 0x3f;
 125     }
 126   *pc = rc;
 127   return len;
 128 }
 129
 130 // Encode an identifier using assembler-friendly characters.  The
 131 // encoding is described in detail near the end of the long comment at
 132 // the start of names.cc.
 133
 134 std::string
 135 go_encode_id(const std::string &id)
 136 {
 137   if (Lex::is_invalid_identifier(id))
 138     {
 139       go_assert(saw_errors());
 140       return id;
 141     }
 142
 143   std::string ret;
 144   const char* p = id.c_str();
 145   const char* pend = p + id.length();
 146
 147   // We encode a leading digit, to ensure that no identifier starts
 148   // with a digit.
 149   if (pend > p && p[0] >= '0' && p[0] <= '9')
 150     {
 151       char buf[8];
 152       snprintf(buf, sizeof buf, "_x%02x", p[0]);
 153       ret.append(buf);
 154       ++p;
 155     }
 156
 157   while (p < pend)
 158     {
 159       unsigned int c;
 160       size_t len = fetch_utf8_char(p, &c);
 161       if (len == 1)
 162         {
 163           if (!char_needs_encoding(c))
 164             ret.push_back(c);
 165           else
 166             {
 167               char code = special_char_code.code_for(c);
 168               if (code != 0)
 169                 {
 170                   ret.push_back('_');
 171                   ret.push_back(code);
 172                 }
 173               else
 174                 {
 175                   char buf[8];
 176                   snprintf(buf, sizeof buf, "_x%02x", c);
 177                   ret.append(buf);
 178                 }
 179             }
 180         }
 181       else
 182         {
 183           char buf[16];
 184           if (c < 0x10000)
 185             snprintf(buf, sizeof buf, "_u%04x", c);
 186           else
 187             snprintf(buf, sizeof buf, "_U%08x", c);
 188           ret.append(buf);
 189         }
 190
 191       p += len;
 192     }
 193
 194   return ret;
 195 }
 196
 197 // Convert a hex digit string to a unicode codepoint. No checking
 198 // to insure that the hex digit is meaningful.
 199
 200 static unsigned
 201 hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
 202 {
 203   unsigned result = 0;
 204   for (unsigned i = 0; i < ndig; ++i) {
 205     result <<= 4;
 206     result |= Lex::hex_val(digits[i]);
 207   }
 208   return result;
 209 }
 210
 211 // Decode/demangle a mangled string produced by go_encode_id(). Returns
 212 // empty string if demangling process fails in some way.  At the moment
 213 // this routine is unused; there is an equivalent routine in the runtime
 214 // used for demangling symbols appearing in stack traces.
 215
 216 std::string
 217 go_decode_id(const std::string &encoded)
 218 {
 219   std::string ret;
 220   const char* p = encoded.c_str();
 221   const char* pend = p + encoded.length();
 222   const Location loc = Linemap::predeclared_location();
 223
 224   while (p < pend)
 225     {
 226       if (*p != '_' || p + 1 == pend)
 227         {
 228           ret.push_back(*p);
 229           p++;
 230           continue;
 231         }
 232
 233       switch (p[1])
 234         {
 235         case '_':
 236           ret.push_back('_');
 237           p += 2;
 238           break;
 239         case '0':
 240           ret.push_back('.');
 241           p += 2;
 242           break;
 243         case '1':
 244           ret.push_back('/');
 245           p += 2;
 246           break;
 247         case '2':
 248           ret.push_back('*');
 249           p += 2;
 250           break;
 251         case '3':
 252           ret.push_back(',');
 253           p += 2;
 254           break;
 255         case '4':
 256           ret.push_back('{');
 257           p += 2;
 258           break;
 259         case '5':
 260           ret.push_back('}');
 261           p += 2;
 262           break;
 263         case '6':
 264           ret.push_back('[');
 265           p += 2;
 266           break;
 267         case '7':
 268           ret.push_back(']');
 269           p += 2;
 270           break;
 271         case '8':
 272           ret.push_back('(');
 273           p += 2;
 274           break;
 275         case '9':
 276           ret.push_back(')');
 277           p += 2;
 278           break;
 279         case 'a':
 280           ret.push_back('"');
 281           p += 2;
 282           break;
 283         case 'b':
 284           ret.push_back(' ');
 285           p += 2;
 286           break;
 287         case 'c':
 288           ret.push_back(';');
 289           p += 2;
 290           break;
 291         case 'x':
 292           {
 293             const char* digits = p + 2;
 294             if (strlen(digits) < 2)
 295               return "";
 296             unsigned int rune = hex_digits_to_unicode_codepoint(digits, 2);
 297             Lex::append_char(rune, true, &ret, loc);
 298             p += 4;
 299           }
 300           break;
 301         case 'u':
 302           {
 303             const char* digits = p + 2;
 304             if (strlen(digits) < 4)
 305               return "";
 306             unsigned int rune = hex_digits_to_unicode_codepoint(digits, 4);
 307             Lex::append_char(rune, true, &ret, loc);
 308             p += 6;
 309           }
 310           break;
 311         case 'U':
 312           {
 313             const char* digits = p + 2;
 314             if (strlen(digits) < 8)
 315               return "";
 316             unsigned int rune = hex_digits_to_unicode_codepoint(digits, 8);
 317             Lex::append_char(rune, true, &ret, loc);
 318             p += 10;
 319           }
 320           break;
 321         default:
 322           return "";
 323         }
 324     }
 325
 326   return ret;
 327 }
 328
 329 // Encode a struct field tag.  This is only used when we need to
 330 // create a type descriptor for an anonymous struct type with field
 331 // tags.  Underscore encoding will be applied to the returned string.
 332 // The tag will appear between curly braces, so that is all we have to
 333 // avoid.
 334
 335 std::string
 336 go_mangle_struct_tag(const std::string& tag)
 337 {
 338   std::string ret;
 339   const char* p = tag.c_str();
 340   const char* pend = p + tag.length();
 341   while (p < pend)
 342     {
 343       unsigned int c;
 344       size_t len = fetch_utf8_char(p, &c);
 345       if (len > 1)
 346         ret.append(p, len);
 347       else if (c != '{' && c != '}' && c != '\\')
 348         ret.push_back(c);
 349       else
 350         {
 351           ret.push_back('\\');
 352           ret.push_back(c);
 353         }
 354       p += len;
 355     }
 356   return ret;
 357 }