stop obsoleting events once the query is done
[debiancodesearch.git] / regexp / utf.go
blobd587eaaf8487fa1d0aaa75b4c303a26d79f25e32
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package regexp
7 import (
8 "regexp/syntax"
9 "unicode"
10 "unicode/utf8"
13 const (
14 instFail = syntax.InstFail
15 instAlt = syntax.InstAlt
16 instByteRange = syntax.InstRune | 0x80 // local opcode
18 argFold = 1 << 16
21 func toByteProg(prog *syntax.Prog) error {
22 var b runeBuilder
23 for pc := range prog.Inst {
24 i := &prog.Inst[pc]
25 switch i.Op {
26 case syntax.InstRune, syntax.InstRune1:
27 // General rune range. PIA.
28 // TODO: Pick off single-byte case.
29 if lo, hi, fold, ok := oneByteRange(i); ok {
30 i.Op = instByteRange
31 i.Arg = uint32(lo)<<8 | uint32(hi)
32 if fold {
33 i.Arg |= argFold
35 break
38 r := i.Rune
39 if syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
40 // Build folded list.
41 var rr []rune
42 if len(r) == 1 {
43 rr = appendFoldedRange(rr, r[0], r[0])
44 } else {
45 for j := 0; j < len(r); j += 2 {
46 rr = appendFoldedRange(rr, r[j], r[j+1])
49 r = rr
52 b.init(prog, uint32(pc), i.Out)
53 if len(r) == 1 {
54 b.addRange(r[0], r[0], false)
55 } else {
56 for j := 0; j < len(r); j += 2 {
57 b.addRange(r[j], r[j+1], false)
61 case syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
62 // All runes.
63 // AnyNotNL should exclude \n but the line-at-a-time
64 // execution takes care of that for us.
65 b.init(prog, uint32(pc), i.Out)
66 b.addRange(0, unicode.MaxRune, false)
69 return nil
72 func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) {
73 if i.Op == syntax.InstRune1 {
74 r := i.Rune[0]
75 if r < utf8.RuneSelf {
76 return byte(r), byte(r), false, true
79 if i.Op != syntax.InstRune {
80 return
82 fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0
83 if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] {
84 r := i.Rune[0]
85 if r >= utf8.RuneSelf {
86 return
88 if fold && !asciiFold(r) {
89 return
91 return byte(r), byte(r), fold, true
93 if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf {
94 if fold {
95 for r := i.Rune[0]; r <= i.Rune[1]; r++ {
96 if asciiFold(r) {
97 return
101 return byte(i.Rune[0]), byte(i.Rune[1]), fold, true
103 if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] {
104 return byte(i.Rune[0]), byte(i.Rune[0]), true, true
107 return
110 func asciiFold(r rune) bool {
111 if r >= utf8.RuneSelf {
112 return false
114 r1 := unicode.SimpleFold(r)
115 if r1 >= utf8.RuneSelf {
116 return false
118 if r1 == r {
119 return true
121 return unicode.SimpleFold(r1) == r
124 func maxRune(n int) rune {
125 b := 0
126 if n == 1 {
127 b = 7
128 } else {
129 b = 8 - (n + 1) + 6*(n-1)
131 return 1<<uint(b) - 1
134 type cacheKey struct {
135 lo, hi uint8
136 fold bool
137 next uint32
140 type runeBuilder struct {
141 begin uint32
142 out uint32
143 cache map[cacheKey]uint32
144 p *syntax.Prog
147 func (b *runeBuilder) init(p *syntax.Prog, begin, out uint32) {
148 // We will rewrite p.Inst[begin] to hold the accumulated
149 // machine. For now, there is no match.
150 p.Inst[begin].Op = instFail
152 b.begin = begin
153 b.out = out
154 if b.cache == nil {
155 b.cache = make(map[cacheKey]uint32)
157 for k := range b.cache {
158 delete(b.cache, k)
160 b.p = p
163 func (b *runeBuilder) uncachedSuffix(lo, hi byte, fold bool, next uint32) uint32 {
164 if next == 0 {
165 next = b.out
167 pc := len(b.p.Inst)
168 i := syntax.Inst{Op: instByteRange, Arg: uint32(lo)<<8 | uint32(hi), Out: next}
169 if fold {
170 i.Arg |= argFold
172 b.p.Inst = append(b.p.Inst, i)
173 return uint32(pc)
176 func (b *runeBuilder) suffix(lo, hi byte, fold bool, next uint32) uint32 {
177 if lo < 0x80 || hi > 0xbf {
178 // Not a continuation byte, no need to cache.
179 return b.uncachedSuffix(lo, hi, fold, next)
182 key := cacheKey{lo, hi, fold, next}
183 if pc, ok := b.cache[key]; ok {
184 return pc
187 pc := b.uncachedSuffix(lo, hi, fold, next)
188 b.cache[key] = pc
189 return pc
192 func (b *runeBuilder) addBranch(pc uint32) {
193 // Add pc to the branch at the beginning.
194 i := &b.p.Inst[b.begin]
195 switch i.Op {
196 case syntax.InstFail:
197 i.Op = syntax.InstNop
198 i.Out = pc
199 return
200 case syntax.InstNop:
201 i.Op = syntax.InstAlt
202 i.Arg = pc
203 return
204 case syntax.InstAlt:
205 apc := uint32(len(b.p.Inst))
206 b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc})
207 i = &b.p.Inst[b.begin]
208 i.Arg = apc
209 b.begin = apc
213 func (b *runeBuilder) addRange(lo, hi rune, fold bool) {
214 if lo > hi {
215 return
218 // TODO: Pick off 80-10FFFF for special handling?
219 if lo == 0x80 && hi == 0x10FFFF {
222 // Split range into same-length sized ranges.
223 for i := 1; i < utf8.UTFMax; i++ {
224 max := maxRune(i)
225 if lo <= max && max < hi {
226 b.addRange(lo, max, fold)
227 b.addRange(max+1, hi, fold)
228 return
232 // ASCII range is special.
233 if hi < utf8.RuneSelf {
234 b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0))
235 return
238 // Split range into sections that agree on leading bytes.
239 for i := 1; i < utf8.UTFMax; i++ {
240 m := rune(1)<<uint(6*i) - 1 // last i bytes of UTF-8 sequence
241 if lo&^m != hi&^m {
242 if lo&m != 0 {
243 b.addRange(lo, lo|m, fold)
244 b.addRange((lo|m)+1, hi, fold)
245 return
247 if hi&m != m {
248 b.addRange(lo, hi&^m-1, fold)
249 b.addRange(hi&^m, hi, fold)
250 return
255 // Finally. Generate byte matching equivalent for lo-hi.
256 var ulo, uhi [utf8.UTFMax]byte
257 n := utf8.EncodeRune(ulo[:], lo)
258 m := utf8.EncodeRune(uhi[:], hi)
259 if n != m {
260 panic("codesearch/regexp: bad utf-8 math")
263 pc := uint32(0)
264 for i := n - 1; i >= 0; i-- {
265 pc = b.suffix(ulo[i], uhi[i], false, pc)
267 b.addBranch(pc)