1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
3 import stringprep
, re
, codecs
4 from unicodedata
import ucd_3_2_0
as unicodedata
7 dots
= re
.compile(u
"[\u002E\u3002\uFF0E\uFF61]")
11 uace_prefix
= unicode(ace_prefix
, "ascii")
13 # This assumes query strings, so AllowUnassigned is true
18 if stringprep
.in_table_b1(c
):
21 newlabel
.append(stringprep
.map_table_b2(c
))
22 label
= u
"".join(newlabel
)
25 label
= unicodedata
.normalize("NFKC", label
)
29 if stringprep
.in_table_c12(c
) or \
30 stringprep
.in_table_c22(c
) or \
31 stringprep
.in_table_c3(c
) or \
32 stringprep
.in_table_c4(c
) or \
33 stringprep
.in_table_c5(c
) or \
34 stringprep
.in_table_c6(c
) or \
35 stringprep
.in_table_c7(c
) or \
36 stringprep
.in_table_c8(c
) or \
37 stringprep
.in_table_c9(c
):
38 raise UnicodeError("Invalid character %r" % c
)
41 RandAL
= map(stringprep
.in_table_d1
, label
)
44 # There is a RandAL char in the string. Must perform further
46 # 1) The characters in section 5.8 MUST be prohibited.
47 # This is table C.8, which was already checked
48 # 2) If a string contains any RandALCat character, the string
49 # MUST NOT contain any LCat character.
50 if filter(stringprep
.in_table_d2
, label
):
51 raise UnicodeError("Violation of BIDI requirement 2")
53 # 3) If a string contains any RandALCat character, a
54 # RandALCat character MUST be the first character of the
55 # string, and a RandALCat character MUST be the last
56 # character of the string.
57 if not RandAL
[0] or not RandAL
[-1]:
58 raise UnicodeError("Violation of BIDI requirement 3")
65 label
= label
.encode("ascii")
69 # Skip to step 3: UseSTD3ASCIIRules is false, so
71 if 0 < len(label
) < 64:
73 raise UnicodeError("label empty or too long")
76 label
= nameprep(label
)
78 # Step 3: UseSTD3ASCIIRules is false
81 label
= label
.encode("ascii")
86 if 0 < len(label
) < 64:
88 raise UnicodeError("label empty or too long")
90 # Step 5: Check ACE prefix
91 if label
.startswith(uace_prefix
):
92 raise UnicodeError("Label starts with ACE prefix")
94 # Step 6: Encode with PUNYCODE
95 label
= label
.encode("punycode")
97 # Step 7: Prepend ACE prefix
98 label
= ace_prefix
+ label
101 if 0 < len(label
) < 64:
103 raise UnicodeError("label empty or too long")
105 def ToUnicode(label
):
106 # Step 1: Check for ASCII
107 if isinstance(label
, str):
111 label
= label
.encode("ascii")
116 # Step 2: Perform nameprep
117 label
= nameprep(label
)
118 # It doesn't say this, but apparently, it should be ASCII now
120 label
= label
.encode("ascii")
122 raise UnicodeError("Invalid character in IDN label")
123 # Step 3: Check for ACE prefix
124 if not label
.startswith(ace_prefix
):
125 return unicode(label
, "ascii")
127 # Step 4: Remove ACE prefix
128 label1
= label
[len(ace_prefix
):]
130 # Step 5: Decode using PUNYCODE
131 result
= label1
.decode("punycode")
133 # Step 6: Apply ToASCII
134 label2
= ToASCII(result
)
136 # Step 7: Compare the result of step 6 with the one of step 3
137 # label2 will already be in lower case.
138 if label
.lower() != label2
:
139 raise UnicodeError("IDNA does not round-trip", label
, label2
)
141 # Step 8: return the result of step 5
146 class Codec(codecs
.Codec
):
147 def encode(self
,input,errors
='strict'):
149 if errors
!= 'strict':
150 # IDNA is quite clear that implementations must be strict
151 raise UnicodeError("unsupported error handling "+errors
)
157 labels
= dots
.split(input)
158 if labels
and len(labels
[-1])==0:
164 result
.append(ToASCII(label
))
166 return ".".join(result
)+trailing_dot
, len(input)
168 def decode(self
,input,errors
='strict'):
170 if errors
!= 'strict':
171 raise UnicodeError("Unsupported error handling "+errors
)
176 # IDNA allows decoding to operate on Unicode strings, too.
177 if isinstance(input, unicode):
178 labels
= dots
.split(input)
180 # Must be ASCII string
182 unicode(input, "ascii")
183 labels
= input.split(".")
185 if labels
and len(labels
[-1]) == 0:
193 result
.append(ToUnicode(label
))
195 return u
".".join(result
)+trailing_dot
, len(input)
197 class IncrementalEncoder(codecs
.BufferedIncrementalEncoder
):
198 def _buffer_encode(self
, input, errors
, final
):
199 if errors
!= 'strict':
200 # IDNA is quite clear that implementations must be strict
201 raise UnicodeError("unsupported error handling "+errors
)
206 labels
= dots
.split(input)
213 # Keep potentially unfinished label until the next call
221 result
.append(ToASCII(label
))
227 result
= ".".join(result
) + trailing_dot
228 size
+= len(trailing_dot
)
229 return (result
, size
)
231 class IncrementalDecoder(codecs
.BufferedIncrementalDecoder
):
232 def _buffer_decode(self
, input, errors
, final
):
233 if errors
!= 'strict':
234 raise UnicodeError("Unsupported error handling "+errors
)
239 # IDNA allows decoding to operate on Unicode strings, too.
240 if isinstance(input, unicode):
241 labels
= dots
.split(input)
243 # Must be ASCII string
245 unicode(input, "ascii")
246 labels
= input.split(".")
254 # Keep potentially unfinished label until the next call
262 result
.append(ToUnicode(label
))
267 result
= u
".".join(result
) + trailing_dot
268 size
+= len(trailing_dot
)
269 return (result
, size
)
271 class StreamWriter(Codec
,codecs
.StreamWriter
):
274 class StreamReader(Codec
,codecs
.StreamReader
):
277 ### encodings module API
280 return codecs
.CodecInfo(
282 encode
=Codec().encode
,
283 decode
=Codec().decode
,
284 incrementalencoder
=IncrementalEncoder
,
285 incrementaldecoder
=IncrementalDecoder
,
286 streamwriter
=StreamWriter
,
287 streamreader
=StreamReader
,