1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
10 maxCombiningChars
= 30
11 maxBufferSize
= maxCombiningChars
+ 2 // +1 to hold starter +1 to hold CGJ
12 maxBackRunes
= maxCombiningChars
- 1
13 maxNFCExpansion
= 3 // NFC(0x1D160)
14 maxNFKCExpansion
= 18 // NFKC(0xFDFA)
16 maxByteBufferSize
= utf8
.UTFMax
* maxBufferSize
// 128
19 // reorderBuffer is used to normalize a single segment. Characters inserted with
20 // insert are decomposed and reordered based on CCC. The compose method can
21 // be used to recombine characters. Note that the byte buffer does not hold
22 // the UTF-8 characters in order. Only the rune array is maintained in sorted
23 // order. flush writes the resulting segment to a byte array.
24 type reorderBuffer
struct {
25 rune
[maxBufferSize
]runeInfo
// Per character info.
26 byte [maxByteBufferSize
]byte // UTF-8 buffer. Referenced by runeInfo.pos.
27 nrune
int // Number of runeInfos.
28 nbyte
uint8 // Number of bytes.
38 func (rb
*reorderBuffer
) init(f Form
, src
[]byte) {
40 rb
.srcBytes
= inputBytes(src
)
45 func (rb
*reorderBuffer
) initString(f Form
, src
string) {
47 rb
.srcString
= inputString(src
)
48 rb
.src
= &rb
.srcString
52 // reset discards all characters from the buffer.
53 func (rb
*reorderBuffer
) reset() {
58 // flush appends the normalized segment to out and resets rb.
59 func (rb
*reorderBuffer
) flush(out
[]byte) []byte {
60 for i
:= 0; i
< rb
.nrune
; i
++ {
61 start
:= rb
.rune
[i
].pos
62 end
:= start
+ rb
.rune
[i
].size
63 out
= append(out
, rb
.byte[start
:end
]...)
69 // flushCopy copies the normalized segment to buf and resets rb.
70 // It returns the number of bytes written to buf.
71 func (rb
*reorderBuffer
) flushCopy(buf
[]byte) int {
73 for i
:= 0; i
< rb
.nrune
; i
++ {
75 p
+= copy(buf
[p
:], rb
.byte[runep
.pos
:runep
.pos
+runep
.size
])
81 // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
82 // It returns false if the buffer is not large enough to hold the rune.
83 // It is used internally by insertDecomposed and insertSingle only.
84 func (rb
*reorderBuffer
) insertOrdered(info runeInfo
) bool {
86 if n
>= maxCombiningChars
+1 {
92 // Find insertion position + move elements to make room.
101 pos
:= uint8(rb
.nbyte
)
102 rb
.nbyte
+= utf8
.UTFMax
108 // insert inserts the given rune in the buffer ordered by CCC.
109 // It returns true if the buffer was large enough to hold the decomposed rune.
110 func (rb
*reorderBuffer
) insert(src input
, i
int, info runeInfo
) bool {
111 if rune
:= src
.hangul(i
); rune
!= 0 {
112 return rb
.decomposeHangul(rune
)
114 if info
.hasDecomposition() {
115 return rb
.insertDecomposed(info
.decomposition())
117 return rb
.insertSingle(src
, i
, info
)
120 // insertDecomposed inserts an entry into the reorderBuffer for each rune
121 // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
122 func (rb
*reorderBuffer
) insertDecomposed(dcomp
[]byte) bool {
123 saveNrune
, saveNbyte
:= rb
.nrune
, rb
.nbyte
124 rb
.tmpBytes
= inputBytes(dcomp
)
125 for i
:= 0; i
< len(dcomp
); {
126 info
:= rb
.f
.info(&rb
.tmpBytes
, i
)
128 if !rb
.insertOrdered(info
) {
129 rb
.nrune
, rb
.nbyte
= saveNrune
, saveNbyte
132 i
+= copy(rb
.byte[pos
:], dcomp
[i
:i
+int(info
.size
)])
137 // insertSingle inserts an entry in the reorderBuffer for the rune at
138 // position i. info is the runeInfo for the rune at position i.
139 func (rb
*reorderBuffer
) insertSingle(src input
, i
int, info runeInfo
) bool {
140 // insertOrdered changes nbyte
142 if !rb
.insertOrdered(info
) {
145 src
.copySlice(rb
.byte[pos
:], i
, i
+int(info
.size
))
149 // appendRune inserts a rune at the end of the buffer. It is used for Hangul.
150 func (rb
*reorderBuffer
) appendRune(r rune
) {
152 sz
:= utf8
.EncodeRune(rb
.byte[bn
:], rune(r
))
153 rb
.nbyte
+= utf8
.UTFMax
154 rb
.rune
[rb
.nrune
] = runeInfo
{pos
: bn
, size
: uint8(sz
)}
158 // assignRune sets a rune at position pos. It is used for Hangul and recomposition.
159 func (rb
*reorderBuffer
) assignRune(pos
int, r rune
) {
160 bn
:= rb
.rune
[pos
].pos
161 sz
:= utf8
.EncodeRune(rb
.byte[bn
:], rune(r
))
162 rb
.rune
[pos
] = runeInfo
{pos
: bn
, size
: uint8(sz
)}
165 // runeAt returns the rune at position n. It is used for Hangul and recomposition.
166 func (rb
*reorderBuffer
) runeAt(n
int) rune
{
168 r
, _
:= utf8
.DecodeRune(rb
.byte[inf
.pos
: inf
.pos
+inf
.size
])
172 // bytesAt returns the UTF-8 encoding of the rune at position n.
173 // It is used for Hangul and recomposition.
174 func (rb
*reorderBuffer
) bytesAt(n
int) []byte {
176 return rb
.byte[inf
.pos
: int(inf
.pos
)+int(inf
.size
)]
179 // For Hangul we combine algorithmically, instead of using tables.
181 hangulBase
= 0xAC00 // UTF-8(hangulBase) -> EA B0 80
186 hangulEnd
= hangulBase
+ jamoLVTCount
// UTF-8(0xD7A4) -> ED 9E A4
191 jamoLBase
= 0x1100 // UTF-8(jamoLBase) -> E1 84 80
202 jamoVTCount
= 21 * 28
203 jamoLVTCount
= 19 * 21 * 28
206 const hangulUTF8Size
= 3
208 func isHangul(b
[]byte) bool {
209 if len(b
) < hangulUTF8Size
{
213 if b0
< hangulBase0
{
218 case b0
== hangulBase0
:
219 return b1
>= hangulBase1
220 case b0
< hangulEnd0
:
222 case b0
> hangulEnd0
:
224 case b1
< hangulEnd1
:
227 return b1
== hangulEnd1
&& b
[2] < hangulEnd2
230 func isHangulString(b
string) bool {
231 if len(b
) < hangulUTF8Size
{
235 if b0
< hangulBase0
{
240 case b0
== hangulBase0
:
241 return b1
>= hangulBase1
242 case b0
< hangulEnd0
:
244 case b0
> hangulEnd0
:
246 case b1
< hangulEnd1
:
249 return b1
== hangulEnd1
&& b
[2] < hangulEnd2
252 // Caller must ensure len(b) >= 2.
253 func isJamoVT(b
[]byte) bool {
254 // True if (rune & 0xff00) == jamoLBase
255 return b
[0] == jamoLBase0
&& (b
[1]&0xFC) == jamoLBase1
258 func isHangulWithoutJamoT(b
[]byte) bool {
259 c
, _
:= utf8
.DecodeRune(b
)
261 return c
< jamoLVTCount
&& c%jamoTCount
== 0
264 // decomposeHangul writes the decomposed Hangul to buf and returns the number
265 // of bytes written. len(buf) should be at least 9.
266 func decomposeHangul(buf
[]byte, r rune
) int {
267 const JamoUTF8Len
= 3
271 utf8
.EncodeRune(buf
, jamoLBase
+r
/jamoVCount
)
272 utf8
.EncodeRune(buf
[JamoUTF8Len
:], jamoVBase
+r%jamoVCount
)
274 utf8
.EncodeRune(buf
[2*JamoUTF8Len
:], jamoTBase
+x
)
275 return 3 * JamoUTF8Len
277 return 2 * JamoUTF8Len
280 // decomposeHangul algorithmically decomposes a Hangul rune into
281 // its Jamo components.
282 // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
283 func (rb
*reorderBuffer
) decomposeHangul(r rune
) bool {
292 rb
.appendRune(jamoLBase
+ r
/jamoVCount
)
293 rb
.appendRune(jamoVBase
+ r%jamoVCount
)
295 rb
.appendRune(jamoTBase
+ x
)
300 // combineHangul algorithmically combines Jamo character components into Hangul.
301 // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
302 func (rb
*reorderBuffer
) combineHangul(s
, i
, k
int) {
311 if s
!= k
-1 && cccB
>= cccC
{
312 // b[i] is blocked by greater-equal cccX below it
316 l
:= rb
.runeAt(s
) // also used to compare to hangulBase
317 v
:= rb
.runeAt(i
) // also used to compare to jamoT
319 case jamoLBase
<= l
&& l
< jamoLEnd
&&
320 jamoVBase
<= v
&& v
< jamoVEnd
:
321 // 11xx plus 116x to LV
322 rb
.assignRune(s
, hangulBase
+
323 (l
-jamoLBase
)*jamoVTCount
+(v
-jamoVBase
)*jamoTCount
)
324 case hangulBase
<= l
&& l
< hangulEnd
&&
325 jamoTBase
< v
&& v
< jamoTEnd
&&
326 ((l
-hangulBase
)%jamoTCount
) == 0:
327 // ACxx plus 11Ax to LVT
328 rb
.assignRune(s
, l
+v
-jamoTBase
)
338 // compose recombines the runes in the buffer.
339 // It should only be used to recompose a single segment, as it will not
340 // handle alternations between Hangul and non-Hangul characters correctly.
341 func (rb
*reorderBuffer
) compose() {
342 // UAX #15, section X5, including Corrigendum #5
343 // "In any character sequence beginning with starter S, a character C is
344 // blocked from S if and only if there is some character B between S
345 // and C, and either B is a starter or it has the same or higher
346 // combining class as C."
353 for s
, i
:= 0, 1; i
< bn
; i
++ {
354 if isJamoVT(rb
.bytesAt(i
)) {
355 // Redo from start in Hangul mode. Necessary to support
356 // U+320E..U+321E in NFKC mode.
357 rb
.combineHangul(s
, i
, k
)
361 // We can only use combineForward as a filter if we later
362 // get the info for the combined character. This is more
363 // expensive than using the filter. Using combinesBackward()
365 if ii
.combinesBackward() {
368 blocked
:= false // b[i] blocked by starter or greater or equal CCC?
372 blocked
= s
!= k
-1 && cccB
>= cccC
375 combined
:= combine(rb
.runeAt(s
), rb
.runeAt(i
))
377 rb
.assignRune(s
, combined
)