Lib/encodings/idna.py

   1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
   2
   3 import stringprep, re, codecs
   4 from unicodedata import ucd_3_2_0 as unicodedata
   5
# IDNA section 3.1: characters recognized as label separators --
# U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth
# full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as bytes (compared against and
# prepended to encoded labels) and once as str (compared against labels
# that are still text).
ace_prefix = b"xn--"
sace_prefix = "xn--"
  12
  13 # This assumes query strings, so AllowUnassigned is true
  14 def nameprep(label):
  15     # Map
  16     newlabel = []
  17     for c in label:
  18         if stringprep.in_table_b1(c):
  19             # Map to nothing
  20             continue
  21         newlabel.append(stringprep.map_table_b2(c))
  22     label = "".join(newlabel)
  23
  24     # Normalize
  25     label = unicodedata.normalize("NFKC", label)
  26
  27     # Prohibit
  28     for c in label:
  29         if stringprep.in_table_c12(c) or \
  30            stringprep.in_table_c22(c) or \
  31            stringprep.in_table_c3(c) or \
  32            stringprep.in_table_c4(c) or \
  33            stringprep.in_table_c5(c) or \
  34            stringprep.in_table_c6(c) or \
  35            stringprep.in_table_c7(c) or \
  36            stringprep.in_table_c8(c) or \
  37            stringprep.in_table_c9(c):
  38             raise UnicodeError("Invalid character %r" % c)
  39
  40     # Check bidi
  41     RandAL = [stringprep.in_table_d1(x) for x in label]
  42     for c in RandAL:
  43         if c:
  44             # There is a RandAL char in the string. Must perform further
  45             # tests:
  46             # 1) The characters in section 5.8 MUST be prohibited.
  47             # This is table C.8, which was already checked
  48             # 2) If a string contains any RandALCat character, the string
  49             # MUST NOT contain any LCat character.
  50             if any(stringprep.in_table_d2(x) for x in label):
  51                 raise UnicodeError("Violation of BIDI requirement 2")
  52
  53             # 3) If a string contains any RandALCat character, a
  54             # RandALCat character MUST be the first character of the
  55             # string, and a RandALCat character MUST be the last
  56             # character of the string.
  57             if not RandAL[0] or not RandAL[-1]:
  58                 raise UnicodeError("Violation of BIDI requirement 3")
  59
  60     return label
  61
  62 def ToASCII(label):
  63     try:
  64         # Step 1: try ASCII
  65         label = label.encode("ascii")
  66     except UnicodeError:
  67         pass
  68     else:
  69         # Skip to step 3: UseSTD3ASCIIRules is false, so
  70         # Skip to step 8.
  71         if 0 < len(label) < 64:
  72             return label
  73         raise UnicodeError("label empty or too long")
  74
  75     # Step 2: nameprep
  76     label = nameprep(label)
  77
  78     # Step 3: UseSTD3ASCIIRules is false
  79     # Step 4: try ASCII
  80     try:
  81         label = label.encode("ascii")
  82     except UnicodeError:
  83         pass
  84     else:
  85         # Skip to step 8.
  86         if 0 < len(label) < 64:
  87             return label
  88         raise UnicodeError("label empty or too long")
  89
  90     # Step 5: Check ACE prefix
  91     if label.startswith(sace_prefix):
  92         raise UnicodeError("Label starts with ACE prefix")
  93
  94     # Step 6: Encode with PUNYCODE
  95     label = label.encode("punycode")
  96
  97     # Step 7: Prepend ACE prefix
  98     label = ace_prefix + label
  99
 100     # Step 8: Check size
 101     if 0 < len(label) < 64:
 102         return label
 103     raise UnicodeError("label empty or too long")
 104
 105 def ToUnicode(label):
 106     # Step 1: Check for ASCII
 107     if isinstance(label, bytes):
 108         pure_ascii = True
 109     else:
 110         try:
 111             label = label.encode("ascii")
 112             pure_ascii = True
 113         except UnicodeError:
 114             pure_ascii = False
 115     if not pure_ascii:
 116         # Step 2: Perform nameprep
 117         label = nameprep(label)
 118         # It doesn't say this, but apparently, it should be ASCII now
 119         try:
 120             label = label.encode("ascii")
 121         except UnicodeError:
 122             raise UnicodeError("Invalid character in IDN label")
 123     # Step 3: Check for ACE prefix
 124     if not label.startswith(ace_prefix):
 125         return str(label, "ascii")
 126
 127     # Step 4: Remove ACE prefix
 128     label1 = label[len(ace_prefix):]
 129
 130     # Step 5: Decode using PUNYCODE
 131     result = label1.decode("punycode")
 132
 133     # Step 6: Apply ToASCII
 134     label2 = ToASCII(result)
 135
 136     # Step 7: Compare the result of step 6 with the one of step 3
 137     # label2 will already be in lower case.
 138     if str(label, "ascii").lower() != str(label2, "ascii"):
 139         raise UnicodeError("IDNA does not round-trip", label, label2)
 140
 141     # Step 8: return the result of step 5
 142     return result
 143
 144 ### Codec APIs
 145
 146 class Codec(codecs.Codec):
 147     def encode(self, input, errors='strict'):
 148
 149         if errors != 'strict':
 150             # IDNA is quite clear that implementations must be strict
 151             raise UnicodeError("unsupported error handling "+errors)
 152
 153         if not input:
 154             return b'', 0
 155
 156         result = bytearray()
 157         labels = dots.split(input)
 158         if labels and not labels[-1]:
 159             trailing_dot = b'.'
 160             del labels[-1]
 161         else:
 162             trailing_dot = b''
 163         for label in labels:
 164             if result:
 165                 # Join with U+002E
 166                 result.extend(b'.')
 167             result.extend(ToASCII(label))
 168         return bytes(result+trailing_dot), len(input)
 169
 170     def decode(self, input, errors='strict'):
 171
 172         if errors != 'strict':
 173             raise UnicodeError("Unsupported error handling "+errors)
 174
 175         if not input:
 176             return "", 0
 177
 178         # IDNA allows decoding to operate on Unicode strings, too.
 179         if not isinstance(input, bytes):
 180             # XXX obviously wrong, see #3232
 181             input = bytes(input)
 182         labels = input.split(b".")
 183
 184         if labels and len(labels[-1]) == 0:
 185             trailing_dot = '.'
 186             del labels[-1]
 187         else:
 188             trailing_dot = ''
 189
 190         result = []
 191         for label in labels:
 192             result.append(ToUnicode(label))
 193
 194         return ".".join(result)+trailing_dot, len(input)
 195
 196 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
 197     def _buffer_encode(self, input, errors, final):
 198         if errors != 'strict':
 199             # IDNA is quite clear that implementations must be strict
 200             raise UnicodeError("unsupported error handling "+errors)
 201
 202         if not input:
 203             return (b'', 0)
 204
 205         labels = dots.split(input)
 206         trailing_dot = b''
 207         if labels:
 208             if not labels[-1]:
 209                 trailing_dot = b'.'
 210                 del labels[-1]
 211             elif not final:
 212                 # Keep potentially unfinished label until the next call
 213                 del labels[-1]
 214                 if labels:
 215                     trailing_dot = b'.'
 216
 217         result = bytearray()
 218         size = 0
 219         for label in labels:
 220             if size:
 221                 # Join with U+002E
 222                 result.extend(b'.')
 223                 size += 1
 224             result.extend(ToASCII(label))
 225             size += len(label)
 226
 227         result += trailing_dot
 228         size += len(trailing_dot)
 229         return (bytes(result), size)
 230
 231 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
 232     def _buffer_decode(self, input, errors, final):
 233         if errors != 'strict':
 234             raise UnicodeError("Unsupported error handling "+errors)
 235
 236         if not input:
 237             return ("", 0)
 238
 239         # IDNA allows decoding to operate on Unicode strings, too.
 240         if isinstance(input, str):
 241             labels = dots.split(input)
 242         else:
 243             # Must be ASCII string
 244             input = str(input, "ascii")
 245             labels = input.split(".")
 246
 247         trailing_dot = ''
 248         if labels:
 249             if not labels[-1]:
 250                 trailing_dot = '.'
 251                 del labels[-1]
 252             elif not final:
 253                 # Keep potentially unfinished label until the next call
 254                 del labels[-1]
 255                 if labels:
 256                     trailing_dot = '.'
 257
 258         result = []
 259         size = 0
 260         for label in labels:
 261             result.append(ToUnicode(label))
 262             if size:
 263                 size += 1
 264             size += len(label)
 265
 266         result = ".".join(result) + trailing_dot
 267         size += len(trailing_dot)
 268         return (result, size)
 269
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer; Codec is listed first so its encode()/decode() are used."""
    pass
 272
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader; Codec is listed first so its encode()/decode() are used."""
    pass
 275
 276 ### encodings module API
 277
 278 def getregentry():
 279     return codecs.CodecInfo(
 280         name='idna',
 281         encode=Codec().encode,
 282         decode=Codec().decode,
 283         incrementalencoder=IncrementalEncoder,
 284         incrementaldecoder=IncrementalDecoder,
 285         streamwriter=StreamWriter,
 286         streamreader=StreamReader,
 287     )