Lib/encodings/idna.py

   1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
   2
   3 import stringprep, re, codecs
   4 from unicodedata import ucd_3_2_0 as unicodedata
   5
# IDNA section 3.1: the characters recognized as label separators --
# U+002E (full stop), U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that marks a Punycode-encoded label.
# It is kept as a byte string (prepended to / stripped from encoded labels)
# and as a unicode string (compared against nameprepped unicode labels).
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
  12
  13 # This assumes query strings, so AllowUnassigned is true
  14 def nameprep(label):
  15     # Map
  16     newlabel = []
  17     for c in label:
  18         if stringprep.in_table_b1(c):
  19             # Map to nothing
  20             continue
  21         newlabel.append(stringprep.map_table_b2(c))
  22     label = u"".join(newlabel)
  23
  24     # Normalize
  25     label = unicodedata.normalize("NFKC", label)
  26
  27     # Prohibit
  28     for c in label:
  29         if stringprep.in_table_c12(c) or \
  30            stringprep.in_table_c22(c) or \
  31            stringprep.in_table_c3(c) or \
  32            stringprep.in_table_c4(c) or \
  33            stringprep.in_table_c5(c) or \
  34            stringprep.in_table_c6(c) or \
  35            stringprep.in_table_c7(c) or \
  36            stringprep.in_table_c8(c) or \
  37            stringprep.in_table_c9(c):
  38             raise UnicodeError("Invalid character %r" % c)
  39
  40     # Check bidi
  41     RandAL = map(stringprep.in_table_d1, label)
  42     for c in RandAL:
  43         if c:
  44             # There is a RandAL char in the string. Must perform further
  45             # tests:
  46             # 1) The characters in section 5.8 MUST be prohibited.
  47             # This is table C.8, which was already checked
  48             # 2) If a string contains any RandALCat character, the string
  49             # MUST NOT contain any LCat character.
  50             if filter(stringprep.in_table_d2, label):
  51                 raise UnicodeError("Violation of BIDI requirement 2")
  52
  53             # 3) If a string contains any RandALCat character, a
  54             # RandALCat character MUST be the first character of the
  55             # string, and a RandALCat character MUST be the last
  56             # character of the string.
  57             if not RandAL[0] or not RandAL[-1]:
  58                 raise UnicodeError("Violation of BIDI requirement 3")
  59
  60     return label
  61
  62 def ToASCII(label):
  63     try:
  64         # Step 1: try ASCII
  65         label = label.encode("ascii")
  66     except UnicodeError:
  67         pass
  68     else:
  69         # Skip to step 3: UseSTD3ASCIIRules is false, so
  70         # Skip to step 8.
  71         if 0 < len(label) < 64:
  72             return label
  73         raise UnicodeError("label empty or too long")
  74
  75     # Step 2: nameprep
  76     label = nameprep(label)
  77
  78     # Step 3: UseSTD3ASCIIRules is false
  79     # Step 4: try ASCII
  80     try:
  81         label = label.encode("ascii")
  82     except UnicodeError:
  83         pass
  84     else:
  85         # Skip to step 8.
  86         if 0 < len(label) < 64:
  87             return label
  88         raise UnicodeError("label empty or too long")
  89
  90     # Step 5: Check ACE prefix
  91     if label.startswith(uace_prefix):
  92         raise UnicodeError("Label starts with ACE prefix")
  93
  94     # Step 6: Encode with PUNYCODE
  95     label = label.encode("punycode")
  96
  97     # Step 7: Prepend ACE prefix
  98     label = ace_prefix + label
  99
 100     # Step 8: Check size
 101     if 0 < len(label) < 64:
 102         return label
 103     raise UnicodeError("label empty or too long")
 104
 105 def ToUnicode(label):
 106     # Step 1: Check for ASCII
 107     if isinstance(label, str):
 108         pure_ascii = True
 109     else:
 110         try:
 111             label = label.encode("ascii")
 112             pure_ascii = True
 113         except UnicodeError:
 114             pure_ascii = False
 115     if not pure_ascii:
 116         # Step 2: Perform nameprep
 117         label = nameprep(label)
 118         # It doesn't say this, but apparently, it should be ASCII now
 119         try:
 120             label = label.encode("ascii")
 121         except UnicodeError:
 122             raise UnicodeError("Invalid character in IDN label")
 123     # Step 3: Check for ACE prefix
 124     if not label.startswith(ace_prefix):
 125         return unicode(label, "ascii")
 126
 127     # Step 4: Remove ACE prefix
 128     label1 = label[len(ace_prefix):]
 129
 130     # Step 5: Decode using PUNYCODE
 131     result = label1.decode("punycode")
 132
 133     # Step 6: Apply ToASCII
 134     label2 = ToASCII(result)
 135
 136     # Step 7: Compare the result of step 6 with the one of step 3
 137     # label2 will already be in lower case.
 138     if label.lower() != label2:
 139         raise UnicodeError("IDNA does not round-trip", label, label2)
 140
 141     # Step 8: return the result of step 5
 142     return result
 143
 144 ### Codec APIs
 145
 146 class Codec(codecs.Codec):
 147     def encode(self,input,errors='strict'):
 148
 149         if errors != 'strict':
 150             # IDNA is quite clear that implementations must be strict
 151             raise UnicodeError("unsupported error handling "+errors)
 152
 153         if not input:
 154             return "", 0
 155
 156         result = []
 157         labels = dots.split(input)
 158         if labels and len(labels[-1])==0:
 159             trailing_dot = '.'
 160             del labels[-1]
 161         else:
 162             trailing_dot = ''
 163         for label in labels:
 164             result.append(ToASCII(label))
 165         # Join with U+002E
 166         return ".".join(result)+trailing_dot, len(input)
 167
 168     def decode(self,input,errors='strict'):
 169
 170         if errors != 'strict':
 171             raise UnicodeError("Unsupported error handling "+errors)
 172
 173         if not input:
 174             return u"", 0
 175
 176         # IDNA allows decoding to operate on Unicode strings, too.
 177         if isinstance(input, unicode):
 178             labels = dots.split(input)
 179         else:
 180             # Must be ASCII string
 181             input = str(input)
 182             unicode(input, "ascii")
 183             labels = input.split(".")
 184
 185         if labels and len(labels[-1]) == 0:
 186             trailing_dot = u'.'
 187             del labels[-1]
 188         else:
 189             trailing_dot = u''
 190
 191         result = []
 192         for label in labels:
 193             result.append(ToUnicode(label))
 194
 195         return u".".join(result)+trailing_dot, len(input)
 196
 197 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
 198     def _buffer_encode(self, input, errors, final):
 199         if errors != 'strict':
 200             # IDNA is quite clear that implementations must be strict
 201             raise UnicodeError("unsupported error handling "+errors)
 202
 203         if not input:
 204             return ("", 0)
 205
 206         labels = dots.split(input)
 207         trailing_dot = u''
 208         if labels:
 209             if not labels[-1]:
 210                 trailing_dot = '.'
 211                 del labels[-1]
 212             elif not final:
 213                 # Keep potentially unfinished label until the next call
 214                 del labels[-1]
 215                 if labels:
 216                     trailing_dot = '.'
 217
 218         result = []
 219         size = 0
 220         for label in labels:
 221             result.append(ToASCII(label))
 222             if size:
 223                 size += 1
 224             size += len(label)
 225
 226         # Join with U+002E
 227         result = ".".join(result) + trailing_dot
 228         size += len(trailing_dot)
 229         return (result, size)
 230
 231 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
 232     def _buffer_decode(self, input, errors, final):
 233         if errors != 'strict':
 234             raise UnicodeError("Unsupported error handling "+errors)
 235
 236         if not input:
 237             return (u"", 0)
 238
 239         # IDNA allows decoding to operate on Unicode strings, too.
 240         if isinstance(input, unicode):
 241             labels = dots.split(input)
 242         else:
 243             # Must be ASCII string
 244             input = str(input)
 245             unicode(input, "ascii")
 246             labels = input.split(".")
 247
 248         trailing_dot = u''
 249         if labels:
 250             if not labels[-1]:
 251                 trailing_dot = u'.'
 252                 del labels[-1]
 253             elif not final:
 254                 # Keep potentially unfinished label until the next call
 255                 del labels[-1]
 256                 if labels:
 257                     trailing_dot = u'.'
 258
 259         result = []
 260         size = 0
 261         for label in labels:
 262             result.append(ToUnicode(label))
 263             if size:
 264                 size += 1
 265             size += len(label)
 266
 267         result = u".".join(result) + trailing_dot
 268         size += len(trailing_dot)
 269         return (result, size)
 270
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer for IDNA; adds nothing to Codec and codecs.StreamWriter."""
    pass
 273
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader for IDNA; adds nothing to Codec and codecs.StreamReader."""
    pass
 276
 277 ### encodings module API
 278
 279 def getregentry():
 280     return codecs.CodecInfo(
 281         name='idna',
 282         encode=Codec().encode,
 283         decode=Codec().decode,
 284         incrementalencoder=IncrementalEncoder,
 285         incrementaldecoder=IncrementalDecoder,
 286         streamwriter=StreamWriter,
 287         streamreader=StreamReader,
 288     )