gcc/go/gofrontend/go-encode-id.cc

   1 // go-encode-id.cc -- Go identifier encoding hooks
   2
   3 // Copyright 2016 The Go Authors. All rights reserved.
   4 // Use of this source code is governed by a BSD-style
   5 // license that can be found in the LICENSE file.
   6
   7 #include "go-system.h"
   8
   9 #include "gogo.h"
  10 #include "go-location.h"
  11 #include "go-linemap.h"
  12 #include "go-encode-id.h"
  13 #include "lex.h"
  14
  15 // Return whether the character c is OK to use in the assembler.  We
  16 // only permit ASCII alphanumeric characters, underscore, and dot.
  17
  18 static bool
  19 char_needs_encoding(char c)
  20 {
  21   switch (c)
  22     {
  23     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  24     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  25     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  26     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  27     case 'Y': case 'Z':
  28     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  29     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  30     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  31     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  32     case 'y': case 'z':
  33     case '0': case '1': case '2': case '3': case '4':
  34     case '5': case '6': case '7': case '8': case '9':
  35     case '_': case '.':
  36       return false;
  37     default:
  38       return true;
  39     }
  40 }
  41
  42 // Return whether the identifier needs to be translated because it
  43 // contains non-ASCII characters.
  44
  45 bool
  46 go_id_needs_encoding(const std::string& str)
  47 {
  48   for (std::string::const_iterator p = str.begin();
  49        p != str.end();
  50        ++p)
  51     if (char_needs_encoding(*p))
  52       return true;
  53   return false;
  54 }
  55
  56 // Pull the next UTF-8 character out of P and store it in *PC.  Return
  57 // the number of bytes read.
  58
  59 static size_t
  60 fetch_utf8_char(const char* p, unsigned int* pc)
  61 {
  62   unsigned char c = *p;
  63   if ((c & 0x80) == 0)
  64     {
  65       *pc = c;
  66       return 1;
  67     }
  68   size_t len = 0;
  69   while ((c & 0x80) != 0)
  70     {
  71       ++len;
  72       c <<= 1;
  73     }
  74   unsigned int rc = *p & ((1 << (7 - len)) - 1);
  75   for (size_t i = 1; i < len; i++)
  76     {
  77       unsigned int u = p[i];
  78       rc <<= 6;
  79       rc |= u & 0x3f;
  80     }
  81   *pc = rc;
  82   return len;
  83 }
  84
  85 // Encode an identifier using ASCII characters.  The encoding is
  86 // described in detail near the end of the long comment at the start
  87 // of names.cc.  Short version: translate all non-ASCII-alphanumeric
  88 // characters into ..uXXXX or ..UXXXXXXXX.
  89
  90 std::string
  91 go_encode_id(const std::string &id)
  92 {
  93   if (Lex::is_invalid_identifier(id))
  94     {
  95       go_assert(saw_errors());
  96       return id;
  97     }
  98
  99   // The encoding is only unambiguous if the input string does not
 100   // contain ..u or ..U.
 101   go_assert(id.find("..u") == std::string::npos);
 102   go_assert(id.find("..U") == std::string::npos);
 103
 104   std::string ret;
 105   const char* p = id.c_str();
 106   const char* pend = p + id.length();
 107
 108   // A leading ".0" is a space introduced before a mangled type name
 109   // that starts with a 'u' or 'U', to avoid confusion with the
 110   // mangling used here.  We don't need a leading ".0", and we don't
 111   // want symbols that start with '.', so remove it.
 112   if (p[0] == '.' && p[1] == '0')
 113     p += 2;
 114
 115   while (p < pend)
 116     {
 117       unsigned int c;
 118       size_t len = fetch_utf8_char(p, &c);
 119       if (len == 1)
 120         {
 121           // At this point we should only be seeing alphanumerics or
 122           // underscore or dot.
 123           go_assert(!char_needs_encoding(c));
 124           ret += c;
 125         }
 126       else
 127         {
 128           char buf[16];
 129           if (c < 0x10000)
 130             snprintf(buf, sizeof buf, "..u%04x", c);
 131           else
 132             snprintf(buf, sizeof buf, "..U%08x", c);
 133
 134           // We don't want a symbol to start with '.', so add a prefix
 135           // if needed.
 136           if (ret.empty())
 137             ret += '_';
 138
 139           ret += buf;
 140         }
 141       p += len;
 142     }
 143   return ret;
 144 }
 145
 146 std::string
 147 go_selectively_encode_id(const std::string &id)
 148 {
 149   if (go_id_needs_encoding(id))
 150     return go_encode_id(id);
 151   return std::string();
 152 }
 153
 154 // Encode a struct field tag.  This is only used when we need to
 155 // create a type descriptor for an anonymous struct type with field
 156 // tags.  This mangling is applied before go_encode_id.  We skip
 157 // alphanumerics and underscore, replace every other single byte
 158 // character with .xNN, and leave larger UTF-8 characters for
 159 // go_encode_id.
 160
 161 std::string
 162 go_mangle_struct_tag(const std::string& tag)
 163 {
 164   std::string ret;
 165   const char* p = tag.c_str();
 166   const char* pend = p + tag.length();
 167   while (p < pend)
 168     {
 169       unsigned int c;
 170       size_t len = fetch_utf8_char(p, &c);
 171       if (len > 1)
 172         ret.append(p, len);
 173       else if (!char_needs_encoding(c) && c != '.')
 174         ret += c;
 175       else
 176         {
 177           char buf[16];
 178           snprintf(buf, sizeof buf, ".x%02x", c);
 179           ret += buf;
 180         }
 181       p += len;
 182     }
 183   return ret;
 184 }