2 prototypes define: #DecoderStream &parents: {StreamProcessor}.
3 prototypes define: #EncoderStream &parents: {WriteStream} &slots: {#target}.
5 u@(EncoderStream traits) on: target
11 u@(EncoderStream traits) contents
16 u@(EncoderStream traits) close
23 Unicode define: #ReadingNonCharacter &parents: {SeriousCondition}.
24 Unicode define: #WritingNonCharacter &parents: {SeriousCondition}.
26 c@(Unicode ReadingNonCharacter traits) describeOn: out
27 [out ; 'Unicode character illegal for exhange in input.'].
29 c@(Unicode WritingNonCharacter traits) describeOn: out
30 [out ; 'Trying to export a Unicode character that is illegal for exchange.'].
34 prototypes ensureNamespace: #UTF8.
36 UTF8 define: #Malformed &parents: {SeriousCondition}.
38 c@(UTF8 Malformed traits) describeOn: out
39 [out ; 'Malformed UTF-8 encoded data.'].
41 UTF8 define: #Decoder &parents: {DecoderStream}.
43 u@(UTF8 Decoder traits) nextCode
44 "This was adapted from Squeak m17n package."
45 [| character1 value1 character2 value2 unicode character3 value3 codesets char character4 value4 |
46 character1 := u source next ifNil: [^ Nil].
47 (value1 := character1 code) <= 127
48 ifTrue: ["1-byte character"
50 (value1 bitAnd: 16rE0) = 192
51 ifTrue: ["2-byte character"
52 character2 := u source next ifNil: [UTF8 Malformed signal].
53 value2 := character2 code.
54 ^ (((value1 bitAnd: 31) bitShift: 6) + (value2 bitAnd: 63))].
55 (value1 bitAnd: 16rF0) = 224
56 ifTrue: ["3-byte character"
57 character2 := u source next ifNil: [UTF8 Malformed signal].
58 value2 := character2 code.
59 character3 := u source next ifNil: [UTF8 Malformed signal].
60 value3 := character3 code.
61 unicode := ((value1 bitAnd: 15) bitShift: 12) + ((value2 bitAnd: 63) bitShift: 6)
62 + (value3 bitAnd: 63)].
63 (value1 bitAnd: 16rF8) = 240
64 ifTrue: ["4-byte character"
65 character2 := u source next ifNil: [UTF8 Malformed signal].
66 value2 := character2 code.
67 character3 := u source next ifNil: [UTF8 Malformed signal].
68 value3 := character3 code.
69 character4 := u source next ifNil: [UTF8 Malformed signal].
70 value4 := character4 code.
71 unicode := ((value1 bitAnd: 16r7) bitShift: 18) +
72 ((value2 bitAnd: 63) bitShift: 12) +
73 ((value3 bitAnd: 63) bitShift: 6) +
75 "unicode ifNil: [^ (16777323)]." "FIXME: what is this?"
79 u@(UTF8 Decoder traits) next
81 (next := u nextCode as: UnicodeCharacter)
82 isNonCharacter ifTrue: [Unicode ReadingNonCharacter signal].
86 UTF8 define: #Encoder &parents: {EncoderStream}.
88 u@(UTF8 Encoder traits) nextPut: aCharacter
89 "This was adapted from Squeak Unicode package."
90 [| codePoint rest byteStream increment mask numberOfMaskBits
91 sixthBitOn counterBit counterIncrement forbiddenForLeadingOctet |
92 aCharacter isNonCharacter ifTrue: [Unicode WritingNonCharacter signal].
93 byteStream := (Array newSize: 7) writer.
94 (codePoint := aCharacter code) < 16r80
95 ifTrue: [byteStream nextPut: codePoint]
96 ifFalse: [mask := 2r111111.
97 numberOfMaskBits := 6.
98 counterBit := 2r10000000.
99 counterIncrement := 2r01000000.
100 forbiddenForLeadingOctet := 2r11000000.
101 increment := 2r00100000.
104 [rest ~= 0 \/ sixthBitOn]
106 [rest < 16r100 /\ [(rest bitAnd: forbiddenForLeadingOctet) = 0]
108 [byteStream nextPut: ((rest bitAnd: mask) bitOr: counterBit).
112 [byteStream nextPut: ((rest bitAnd: mask) bitOr: 2r10000000).
113 sixthBitOn := (rest bitAnd: 2r100000) ~= 0.
114 rest := rest bitShift: numberOfMaskBits negated.
115 forbiddenForLeadingOctet := forbiddenForLeadingOctet bitOr: increment.
116 increment := increment bitShift: -1.
117 counterBit := counterBit bitOr: counterIncrement.
118 counterIncrement := counterIncrement bitShift: -1]]].
119 (byteStream contents reversed select: #isNotNil `er)
120 do: [| :byte | u target nextPut: (byte as: ASCIIString Character)].
125 prototypes ensureNamespace: #UTF16.
126 UTF16 ensureNamespace: #Abstract.
128 "UTF16 Abstract encoder/decoder uses 16-bit values. The endianness
129 issues are handled at lower level."
131 UTF16 Abstract define: #Malformed &parents: {SeriousCondition}.
133 c@(UTF16 Abstract Malformed traits) describeOn: out
134 [out ; 'Malformed UTF-16 encoded data.'].
136 UTF16 Abstract define: #Decoder &parents: {DecoderStream}.
138 u@(UTF16 Abstract Decoder traits) nextCode
139 "FIXME: are the bitAnd: 16r3FF's needed? Don't the range
140 checks handle this as a side effect? Endianness?"
141 [| character1 character2 result |
142 character1 := u source next ifNil: [^ Nil].
143 (character1 < 16rD800 or: [character1 > 16rDFFF])
144 ifTrue: [^ character1 code].
145 (character1 between: 16rD800 and: 16rDBFF)
146 ifFalse: [u Malformed signal].
147 character2 := u source next ifNil: [^ Nil].
148 (character2 between: 16rDC00 and: 16rDFFF)
149 ifFalse: [u Malformed signal].
150 result := ((character1 bitAnd: 16r3FF) bitShift: 10)
151 bitOr: (character2 bitAnd: 16r3FF).
155 u@(UTF16 Abstract Decoder traits) next
157 next := u nextCode as: UnicodeCharacter.
158 next isNonCharacter ifTrue: [Unicode ReadingNonCharacter signal].
162 UTF16 Abstract define: #Encoder &parents: {EncoderStream}.
164 u@(UTF16 Abstract Encoder traits) on: target
170 u@(UTF16 Abstract Encoder traits) nextPut: aCharacter
171 [| codePoint U2 half1 half2 |
172 aCharacter isNonCharacter ifTrue: [Unicode WritingNonCharacter signal].
173 (codePoint := aCharacter code) < 16r10000
174 ifTrue: [u target nextPut: codePoint.
176 U2 := codePoint - 16r10000.
177 half1 := 16rD800 bitOr: (U2 bitShift: -10).
178 half2 := 16rDC00 bitOr: (U2 bitAnd: 16r3FF).
179 u target nextPut: half1.
180 u target nextPut: half2.
183 prototypes ensureNamespace: #UTF16LE.
185 UTF16LE define: #Malformed &parents: {SeriousCondition}.
186 UTF16LE define: #Decoder &parents: {UTF16 Abstract Decoder}.
187 UTF16LE define: #Encoder &parents: {UTF16 Abstract Encoder}.
189 c@(UTF16LE Malformed traits) describeOn: out
190 [out ; 'Malformed UTF-16LE encoded data.'].
192 u@(UTF16LE Decoder traits) newOn: source
194 UTF16 Abstract Decoder newOn: (Int16ReadStream LittleEndian newOn: source)
197 u@(UTF16LE Encoder traits) newOn: target
199 UTF16 Abstract Encoder newOn: (Int16WriteStream LittleEndian newOn: target)
202 prototypes ensureNamespace: #UTF16BE.
204 UTF16BE define: #Malformed &parents: {SeriousCondition}.
205 UTF16BE define: #Decoder &parents: {DecoderStream}.
206 UTF16BE define: #Encoder &parents: {EncoderStream}.
208 c@(UTF16BE Malformed traits) describeOn: out
209 [out ; 'Malformed UTF-16BE encoded data.'].
211 u@(UTF16BE Decoder traits) newOn: source
213 UTF16 Abstract Decoder newOn: (Int16ReadStream BigEndian newOn: source)
216 u@(UTF16BE Encoder traits) newOn: target
218 UTF16 Abstract Encoder newOn: (Int16WriteStream BigEndian newOn: target)
221 "UTF-16 streams (with no BE/LE) start with a byte order mark."
223 UTF16 define: #Decoder &parents: {DecoderStream}.
224 UTF16 define: #Encoder &parents: {EncoderStream}.
226 UTF16 define: #NoByteOrderMark &parents: {SeriousCondition}.
228 c@(UTF16 NoByteOrderMark traits) describeOn: out
229 [out ; 'UTF-16 stream doesn\'t contain byte order mark. Use UTF-16BE or UTF-16LE.'].
231 s@(UTF16 Decoder traits) newOn: source
232 [| stream byteOrderMark |
233 stream := Int16ReadStream BigEndian newOn: source.
234 (byteOrderMark := stream next) caseOf: {
235 16rFEFF -> [UTF16 Abstract Decoder newOn: stream].
236 16rFFFE -> [UTF16LE Decoder newOn: source]
237 } otherwise: [UTF16 NoByteOrderMark signal]
240 s@(UTF16 Encoder traits) newOn: target &littleEndian: little
242 little `defaultsTo: False.
243 stream := (little ifTrue: [UTF16LE] ifFalse: [UTF16BE]) Encoder newOn: target.
244 stream nextPut: (16rFEFF as: UnicodeCharacter).