1 # -*- coding: iso-8859-1 -*-
2 """ Codec for the Punycode encoding, as specified in RFC 3492
4 Written by Martin v. Löwis.
9 ##################### Encoding #####################################
12 """3.1 Basic code point segregation"""
20 extended
= extended
.keys()
22 return "".join(base
).encode("ascii"),extended
24 def selective_len(str, max):
25 """Return the length of str, considering only characters below max."""
32 def selective_find(str, char
, index
, pos
):
33 """Return a pair (index, pos), indicating the next occurrence of
34 char in str. index is the position of the character considering
35 only ordinals up to and including char, and pos is the position in
36 the full string. index/pos is the starting position in the full
50 def insertion_unsort(str, extended
):
51 """3.2 Insertion unsort coding"""
58 curlen
= selective_len(str, char
)
59 delta
= (curlen
+1) * (char
- oldchar
)
61 index
,pos
= selective_find(str,c
,index
,pos
)
64 delta
+= index
- oldindex
65 result
.append(delta
-1)
73 # Punycode parameters: tmin = 1, tmax = 26, base = 36
74 res
= 36 * (j
+ 1) - bias
76 if res
> 26: return 26
# Punycode digit alphabet, indexed by digit value: 'a'-'z' stand for the
# values 0-25 and '0'-'9' for the values 26-35.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
80 def generate_generalized_integer(N
, bias
):
81 """3.3 Generalized variable-length integers"""
87 result
.append(digits
[N
])
89 result
.append(digits
[t
+ ((N
- t
) % (36 - t
))])
90 N
= (N
- t
) // (36 - t
)
93 def adapt(delta
, first
, numchars
):
98 delta
+= delta
// numchars
99 # ((base - tmin) * tmax) // 2 == 455
102 delta
= delta
// 35 # base - tmin
104 bias
= divisions
+ (36 * delta
// (delta
+ 38))
108 def generate_integers(baselen
, deltas
):
109 """3.4 Bias adaptation"""
110 # Punycode parameters: initial bias = 72, damp = 700, skew = 38
113 for points
, delta
in enumerate(deltas
):
114 s
= generate_generalized_integer(delta
, bias
)
116 bias
= adapt(delta
, points
==0, baselen
+points
+1)
117 return "".join(result
)
119 def punycode_encode(text
):
120 base
, extended
= segregate(text
)
121 base
= base
.encode("ascii")
122 deltas
= insertion_unsort(text
, extended
)
123 extended
= generate_integers(len(base
), deltas
)
125 return base
+ "-" + extended
128 ##################### Decoding #####################################
130 def decode_generalized_number(extended
, extpos
, bias
, errors
):
131 """3.3 Generalized variable-length integers"""
137 char
= ord(extended
[extpos
])
139 if errors
== "strict":
140 raise UnicodeError, "incomplete punicode string"
141 return extpos
+ 1, None
143 if 0x41 <= char
<= 0x5A: # A-Z
145 elif 0x30 <= char
<= 0x39:
146 digit
= char
- 22 # 0x30-26
147 elif errors
== "strict":
148 raise UnicodeError("Invalid extended code point '%s'"
155 return extpos
, result
160 def insertion_sort(base
, extended
, errors
):
161 """3.2 Insertion unsort coding"""
166 while extpos
< len(extended
):
167 newpos
, delta
= decode_generalized_number(extended
, extpos
,
170 # There was an error in decoding. We can't continue because
171 # synchronization is lost.
174 char
+= pos
// (len(base
) + 1)
176 if errors
== "strict":
177 raise UnicodeError, ("Invalid character U+%x" % char
)
179 pos
= pos
% (len(base
) + 1)
180 base
= base
[:pos
] + unichr(char
) + base
[pos
:]
181 bias
= adapt(delta
, (extpos
== 0), len(base
))
185 def punycode_decode(text
, errors
):
186 pos
= text
.rfind("-")
192 extended
= text
[pos
+1:]
193 base
= unicode(base
, "ascii", errors
)
194 extended
= extended
.upper()
195 return insertion_sort(base
, extended
, errors
)
class Codec(codecs.Codec):
    """Stateless Punycode codec delegating to the module-level helpers."""

    def encode(self, input, errors='strict'):
        """Encode *input* with punycode_encode().

        Returns a ``(encoded, consumed)`` pair where ``consumed`` is
        ``len(input)``.  *errors* is accepted for interface compatibility
        but is not consulted by this method.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Decode *input* with punycode_decode().

        Returns a ``(decoded, consumed)`` pair where ``consumed`` is
        ``len(input)``.

        Raises UnicodeError when *errors* is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            # Call-style raise: same exception type and message as the
            # statement form ``raise UnicodeError, msg``, but it parses on
            # both Python 2 and Python 3.
            raise UnicodeError("Unsupported error handling " + errors)
        res = punycode_decode(input, errors)
        return res, len(input)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each chunk with punycode_encode()."""

    def encode(self, input, final=False):
        """Return the Punycode encoding of *input*.

        *final* is part of the incremental interface; it is not consulted
        here, so every call encodes its input independently.
        """
        encoded = punycode_encode(input)
        return encoded
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each chunk with punycode_decode()."""

    def decode(self, input, final=False):
        """Return the decoding of *input* under the policy in ``self.errors``.

        *final* is part of the incremental interface; it is not consulted
        here, so every call decodes its input independently.

        Raises UnicodeError when ``self.errors`` is not one of 'strict',
        'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            # Call-style raise: same exception type and message as the
            # statement form ``raise UnicodeError, msg``, but it parses on
            # both Python 2 and Python 3.
            raise UnicodeError("Unsupported error handling " + self.errors)
        return punycode_decode(input, self.errors)
221 class StreamWriter(Codec
,codecs
.StreamWriter
):
224 class StreamReader(Codec
,codecs
.StreamReader
):
227 ### encodings module API
230 return codecs
.CodecInfo(
232 encode
=Codec().encode
,
233 decode
=Codec().decode
,
234 incrementalencoder
=IncrementalEncoder
,
235 incrementaldecoder
=IncrementalDecoder
,
236 streamwriter
=StreamWriter
,
237 streamreader
=StreamReader
,