Add PyErr_WarnEx()
[pytest.git] / Lib / encodings / punycode.py
blobd97200fd35b11fa5a0bc68af79904e5b11751494
1 # -*- coding: iso-8859-1 -*-
2 """ Codec for the Punycode encoding, as specified in RFC 3492
4 Written by Martin v. Löwis.
5 """
7 import codecs
9 ##################### Encoding #####################################
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ordinal below 128) and extended parts.

    Returns a pair ``(base, extended)``: *base* is the basic characters
    of *str* in their original order, joined and encoded with the
    "ascii" codec; *extended* is a sorted list of the distinct
    characters whose ordinal is 128 or above.
    """
    base = []
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            # A set removes duplicates without the dummy-value dict idiom.
            extended.add(c)
    # sorted() always produces a real list, so this does not depend on
    # keys() returning something that has a .sort() method.
    return "".join(base).encode("ascii"), sorted(extended)
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at pos + 1.  While scanning, index is advanced once
    for every character that compares below char, and once more for the
    matching character itself, so it counts only characters up to and
    including char.

    Returns the pair (index, pos) for the match, where pos is the
    position in the full string, or (-1, -1) when the end of str is
    reached without a match.
    """
    end = len(str)
    pos += 1
    while pos != end:
        here = str[pos]
        if here == char:
            return index + 1, pos
        if here < char:
            index += 1
        pos += 1
    return (-1, -1)
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character in extended (taken in the given order) and every
    occurrence of that character in str, emit one integer delta.

    Returns the list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for c in extended:
        code = ord(c)
        # Starting delta for this character: one full pass over the
        # characters below it for every code point skipped since the
        # previously handled character.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, c, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            pending = 0
            index, pos = selective_find(str, c, index, pos)
        prev_code = code
    return deltas
def T(j, bias):
    """Return 36 * (j + 1) - bias, clamped to the range 1..26."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return max(1, min(26, 36 * (j + 1) - bias))
# Digit alphabet indexed by digit value: 0-25 map to a-z, 26-35 map to 0-9.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the non-negative integer N as a sequence of characters taken
    from the module-level digits alphabet, using thresholds from T().

    Returns the list of digit characters, least significant first.
    """
    out = []
    position = 0
    while True:
        threshold = T(position, bias)
        if N < threshold:
            # Final digit: the remaining value fits below the threshold.
            out.append(digits[N])
            return out
        N, remainder = divmod(N - threshold, 36 - threshold)
        out.append(digits[threshold + remainder])
        position += 1
def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just processed.

    delta is divided by 700 when first is true and by 2 otherwise, then
    increased by its own quotient with numchars.  It is then repeatedly
    divided by 35 while it exceeds 455, adding 36 to the result for each
    division.

    Returns the new bias as an integer.
    """
    damping = 700 if first else 2
    scaled = delta // damping
    scaled += scaled // numchars
    # ((base - tmin) * tmax) // 2 == 455
    offset = 0
    while scaled > 455:
        scaled //= 35  # base - tmin
        offset += 36
    return offset + (36 * scaled // (scaled + 38))
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer,
    re-adapting the bias after each one.

    Returns the concatenation of all produced digit characters.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    for count, delta in enumerate(deltas):
        pieces += generate_generalized_integer(delta, bias)
        # Only the very first delta uses the "first time" damping.
        bias = adapt(delta, count == 0, baselen + count + 1)
    return "".join(pieces)
def punycode_encode(text):
    """Encode text with the punycode algorithm.

    Returns the basic characters of text, followed by "-" and the
    encoded extended part; when text has no basic characters only the
    encoded extended part is returned.
    """
    # segregate() already returns the basic part ASCII-encoded, so it must
    # not be encoded a second time here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
128 ##################### Decoding #####################################
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one variable-length integer from extended, starting at index
    extpos.  Only the characters A-Z (values 0-25) and 0-9 (values
    26-35) are accepted as digits.

    Returns a pair (newpos, value) where newpos is the index just past
    the consumed characters.  When errors is not "strict" and the input
    ends early or contains an invalid character, value is None.

    Raises UnicodeError when errors is "strict" and the input ends early
    or contains an invalid character.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so it lives at extpos - 1.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding (decoding direction)

    Repeatedly decode one delta from extended and insert the character
    it designates into base.

    Returns the resulting string.  When errors is not "strict" and a
    delta cannot be decoded, the string built so far is returned.

    Raises UnicodeError when errors is "strict" and a decoded code point
    exceeds 0x10FFFF (decode_generalized_number may also raise).
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Each full pass over the len(base) + 1 insertion slots moves on
        # to the next code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
def punycode_decode(text, errors):
    """Decode punycode text.

    Everything before the last "-" is the basic part and everything
    after it is the extended part; without any "-" the whole text is the
    extended part.  The basic part is decoded as ASCII using errors, the
    extended part is upper-cased, and both are passed to
    insertion_sort().
    """
    # rpartition yields ("", "", text) when there is no "-", which gives
    # an empty basic part and the full text as extended part.
    head, _, tail = text.rpartition("-")
    base = unicode(head, "ascii", errors)
    extended = tail.upper()
    return insertion_sort(base, extended, errors)
197 ### Codec APIs
class Codec(codecs.Codec):
    """Stateless punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Encode input with punycode.

        errors is accepted but not consulted.
        Returns (encoded, len(input)).
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Decode punycode input.

        Returns (decoded, len(input)).
        Raises UnicodeError when errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + errors)
        res = punycode_decode(input, errors)
        return res, len(input)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that punycode-encodes each input it is given."""

    def encode(self, input, final=False):
        """Return the punycode encoding of input; final is not consulted."""
        encoded = punycode_encode(input)
        return encoded
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that punycode-decodes each input it is given."""

    def decode(self, input, final=False):
        """Return the decoded form of input; final is not consulted.

        Raises UnicodeError when self.errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + self.errors)
        return punycode_decode(input, self.errors)
# Stream writer for punycode: combines this module's Codec with
# codecs.StreamWriter and adds nothing of its own.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
# Stream reader for punycode: combines this module's Codec with
# codecs.StreamReader and adds nothing of its own.
class StreamReader(Codec,codecs.StreamReader):
    pass
227 ### encodings module API
229 def getregentry():
230 return codecs.CodecInfo(
231 name='punycode',
232 encode=Codec().encode,
233 decode=Codec().decode,
234 incrementalencoder=IncrementalEncoder,
235 incrementaldecoder=IncrementalDecoder,
236 streamwriter=StreamWriter,
237 streamreader=StreamReader,