Merged revisions 81656 via svnmerge from
[python/dscho.git] / Lib / encodings / idna.py
blob583bdf130a5089050490811fb5a29cf05c86288d
# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
3 import stringprep, re, codecs
4 from unicodedata import ucd_3_2_0 as unicodedata
# IDNA section 3.1: characters that separate labels -- the ASCII full stop
# plus the ideographic, fullwidth and halfwidth ideographic full stops.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as bytes (for encoded labels) and as str
# (for labels that have not been encoded yet).
ace_prefix = b"xn--"
sace_prefix = ace_prefix.decode("ascii")
13 # This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    Unassigned code points are not rejected: the input is treated as a
    query string, i.e. AllowUnassigned is true.

    label -- the label as a str.

    Returns the mapped and NFKC-normalized label.  Raises UnicodeError if
    the result contains a prohibited character or violates one of the
    bidirectional-text requirements.
    """
    # Map: characters in table B.1 are mapped to nothing (dropped), all
    # others go through the table B.2 mapping.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The requirements below concern the label as a whole, so they are
    # checked once if any RandALCat character is present, not once per
    # such character (which would make the check quadratic).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
def ToASCII(label):
    """Convert one label to its ASCII form (IDNA ToASCII operation).

    UseSTD3ASCIIRules is treated as false.

    label -- the label as a str.

    Returns the label as bytes: unchanged if it is already ASCII (before
    or after nameprep), otherwise Punycode-encoded with the ACE prefix
    prepended.  Raises UnicodeError if the resulting label is empty or
    longer than 63 bytes, or if a non-ASCII label already starts with the
    ACE prefix.
    """
    size_error = "label empty or too long"

    # Step 1: try ASCII
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        ascii_label = None
    if ascii_label is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if not 0 < len(ascii_label) < 64:
            raise UnicodeError(size_error)
        return ascii_label

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        ascii_label = prepared.encode("ascii")
    except UnicodeError:
        ascii_label = None
    if ascii_label is not None:
        # Skip to step 8.
        if not 0 < len(ascii_label) < 64:
            raise UnicodeError(size_error)
        return ascii_label

    # Step 5: Check ACE prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    ace_label = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    if not 0 < len(ace_label) < 64:
        raise UnicodeError(size_error)
    return ace_label
def ToUnicode(label):
    """Convert one label to its Unicode form (IDNA ToUnicode operation).

    label -- the label as bytes (taken to be ASCII) or as a str.

    Returns a str.  A label that does not start with the ACE prefix is
    returned as is (after nameprep, if it was a non-ASCII str); otherwise
    the Punycode-decoded label is returned.  Raises UnicodeError if a str
    label is still non-ASCII after nameprep, or if re-encoding the decoded
    label with ToASCII does not reproduce the input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    # The prefix is "xn--" in any capitalization, so compare it
    # case-insensitively; label is bytes at this point.
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
144 ### Codec APIs
class Codec(codecs.Codec):
    """Stateless IDNA codec working on complete, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the str *input* label by label with ToASCII.

        Labels are split on any of the IDNA separators and joined with
        U+002E; a trailing separator is kept as a trailing b'.'.

        Returns (encoded bytes, number of characters consumed).  Raises
        UnicodeError for any error handler other than 'strict', and
        whatever ToASCII raises for an invalid label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        labels = dots.split(input)
        if labels and not labels[-1]:
            # The name ends with a separator: keep it, drop the empty label
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''

        # Join with U+002E
        encoded = b'.'.join(ToASCII(label) for label in labels)
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode *input* label by label with ToUnicode.

        *input* may be bytes, another bytes-like object, or a str.  A str
        is split on any of the IDNA separators, bytes on b'.'.

        Returns (decoded str, length of the input consumed).  Raises
        UnicodeError for any error handler other than 'strict', and
        whatever ToUnicode raises for an invalid label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # bytes(str) without an encoding raises TypeError, so a str
            # is split directly; ToUnicode accepts str labels.
            labels = dots.split(input)
        else:
            if not isinstance(input, bytes):
                # e.g. bytearray or memoryview
                input = bytes(input)
            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        decoded = ".".join(ToUnicode(label) for label in labels)
        return decoded + trailing_dot, len(input)
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Unless *final* is true, a last label that is not followed by a
        separator is left unconsumed so it can be completed by the next
        call.

        Returns (encoded bytes, number of characters consumed).  Raises
        UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        parts = dots.split(input)
        suffix = b''
        if parts:
            if not parts[-1]:
                # Input ends with a separator
                suffix = b'.'
                parts.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                parts.pop()
                if parts:
                    suffix = b'.'

        encoded = [ToASCII(part) for part in parts]
        # Consumed input: every label, one separator between each pair of
        # consecutive labels, and the trailing separator if there is one.
        consumed = sum(len(part) for part in parts)
        consumed += max(len(parts) - 1, 0)
        consumed += len(suffix)

        # Join with U+002E
        return (b'.'.join(encoded) + suffix, consumed)
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        *input* may be a str (split on any IDNA separator) or an ASCII
        bytes-like object (split on '.').  Unless *final* is true, a last
        label that is not followed by a separator is left unconsumed so
        it can be completed by the next call.

        Returns (decoded str, amount of input consumed).  Raises
        UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = ".".join(ToUnicode(label) for label in labels) + trailing_dot

        # Consumed input: every label, one separator before each label
        # after the first, and the trailing separator.  The separators are
        # counted by position, not by "anything consumed so far", so that
        # empty leading labels (e.g. ".a.") are accounted for and no
        # already-decoded separator is left behind in the buffer.
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)
        size += len(trailing_dot)
        return (result, size)
class StreamWriter(Codec, codecs.StreamWriter):
    """IDNA stream writer: Codec combined with codecs.StreamWriter.

    Adds no behaviour of its own.
    """
class StreamReader(Codec, codecs.StreamReader):
    """IDNA stream reader: Codec combined with codecs.StreamReader.

    Adds no behaviour of its own.
    """
276 ### encodings module API
278 def getregentry():
279 return codecs.CodecInfo(
280 name='idna',
281 encode=Codec().encode,
282 decode=Codec().decode,
283 incrementalencoder=IncrementalEncoder,
284 incrementaldecoder=IncrementalDecoder,
285 streamwriter=StreamWriter,
286 streamreader=StreamReader,