1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 instFail
= syntax
.InstFail
15 instAlt
= syntax
.InstAlt
16 instByteRange
= syntax
.InstRune |
0x80 // local opcode
21 func toByteProg(prog
*syntax
.Prog
) error
{
23 for pc
:= range prog
.Inst
{
26 case syntax
.InstRune
, syntax
.InstRune1
:
27 // General rune range. PIA.
28 // TODO: Pick off single-byte case.
29 if lo
, hi
, fold
, ok
:= oneByteRange(i
); ok
{
31 i
.Arg
= uint32(lo
)<<8 |
uint32(hi
)
39 if syntax
.Flags(i
.Arg
)&syntax
.FoldCase
!= 0 {
43 rr
= appendFoldedRange(rr
, r
[0], r
[0])
45 for j
:= 0; j
< len(r
); j
+= 2 {
46 rr
= appendFoldedRange(rr
, r
[j
], r
[j
+1])
52 b
.init(prog
, uint32(pc
), i
.Out
)
54 b
.addRange(r
[0], r
[0], false)
56 for j
:= 0; j
< len(r
); j
+= 2 {
57 b
.addRange(r
[j
], r
[j
+1], false)
61 case syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
63 // AnyNotNL should exclude \n but the line-at-a-time
64 // execution takes care of that for us.
65 b
.init(prog
, uint32(pc
), i
.Out
)
66 b
.addRange(0, unicode
.MaxRune
, false)
72 func oneByteRange(i
*syntax
.Inst
) (lo
, hi
byte, fold
, ok
bool) {
73 if i
.Op
== syntax
.InstRune1
{
75 if r
< utf8
.RuneSelf
{
76 return byte(r
), byte(r
), false, true
79 if i
.Op
!= syntax
.InstRune
{
82 fold
= syntax
.Flags(i
.Arg
)&syntax
.FoldCase
!= 0
83 if len(i
.Rune
) == 1 ||
len(i
.Rune
) == 2 && i
.Rune
[0] == i
.Rune
[1] {
85 if r
>= utf8
.RuneSelf
{
88 if fold
&& !asciiFold(r
) {
91 return byte(r
), byte(r
), fold
, true
93 if len(i
.Rune
) == 2 && i
.Rune
[1] < utf8
.RuneSelf
{
95 for r
:= i
.Rune
[0]; r
<= i
.Rune
[1]; r
++ {
101 return byte(i
.Rune
[0]), byte(i
.Rune
[1]), fold
, true
103 if len(i
.Rune
) == 4 && i
.Rune
[0] == i
.Rune
[1] && i
.Rune
[2] == i
.Rune
[3] && unicode
.SimpleFold(i
.Rune
[0]) == i
.Rune
[2] && unicode
.SimpleFold(i
.Rune
[2]) == i
.Rune
[0] {
104 return byte(i
.Rune
[0]), byte(i
.Rune
[0]), true, true
110 func asciiFold(r rune
) bool {
111 if r
>= utf8
.RuneSelf
{
114 r1
:= unicode
.SimpleFold(r
)
115 if r1
>= utf8
.RuneSelf
{
121 return unicode
.SimpleFold(r1
) == r
124 func maxRune(n
int) rune
{
129 b
= 8 - (n
+ 1) + 6*(n
-1)
131 return 1<<uint(b
) - 1
134 type cacheKey
struct {
140 type runeBuilder
struct {
143 cache
map[cacheKey
]uint32
147 func (b
*runeBuilder
) init(p
*syntax
.Prog
, begin
, out
uint32) {
148 // We will rewrite p.Inst[begin] to hold the accumulated
149 // machine. For now, there is no match.
150 p
.Inst
[begin
].Op
= instFail
155 b
.cache
= make(map[cacheKey
]uint32)
157 for k
:= range b
.cache
{
163 func (b
*runeBuilder
) uncachedSuffix(lo
, hi
byte, fold
bool, next
uint32) uint32 {
168 i
:= syntax
.Inst
{Op
: instByteRange
, Arg
: uint32(lo
)<<8 |
uint32(hi
), Out
: next
}
172 b
.p
.Inst
= append(b
.p
.Inst
, i
)
176 func (b
*runeBuilder
) suffix(lo
, hi
byte, fold
bool, next
uint32) uint32 {
177 if lo
< 0x80 || hi
> 0xbf {
178 // Not a continuation byte, no need to cache.
179 return b
.uncachedSuffix(lo
, hi
, fold
, next
)
182 key
:= cacheKey
{lo
, hi
, fold
, next
}
183 if pc
, ok
:= b
.cache
[key
]; ok
{
187 pc
:= b
.uncachedSuffix(lo
, hi
, fold
, next
)
192 func (b
*runeBuilder
) addBranch(pc
uint32) {
193 // Add pc to the branch at the beginning.
194 i
:= &b
.p
.Inst
[b
.begin
]
196 case syntax
.InstFail
:
197 i
.Op
= syntax
.InstNop
201 i
.Op
= syntax
.InstAlt
205 apc
:= uint32(len(b
.p
.Inst
))
206 b
.p
.Inst
= append(b
.p
.Inst
, syntax
.Inst
{Op
: instAlt
, Out
: i
.Arg
, Arg
: pc
})
207 i
= &b
.p
.Inst
[b
.begin
]
213 func (b
*runeBuilder
) addRange(lo
, hi rune
, fold
bool) {
218 // TODO: Pick off 80-10FFFF for special handling?
219 if lo
== 0x80 && hi
== 0x10FFFF {
222 // Split range into same-length sized ranges.
223 for i
:= 1; i
< utf8
.UTFMax
; i
++ {
225 if lo
<= max
&& max
< hi
{
226 b
.addRange(lo
, max
, fold
)
227 b
.addRange(max
+1, hi
, fold
)
232 // ASCII range is special.
233 if hi
< utf8
.RuneSelf
{
234 b
.addBranch(b
.suffix(byte(lo
), byte(hi
), fold
, 0))
238 // Split range into sections that agree on leading bytes.
239 for i
:= 1; i
< utf8
.UTFMax
; i
++ {
240 m
:= rune(1)<<uint(6*i
) - 1 // last i bytes of UTF-8 sequence
243 b
.addRange(lo
, lo|m
, fold
)
244 b
.addRange((lo|m
)+1, hi
, fold
)
248 b
.addRange(lo
, hi
&^m
-1, fold
)
249 b
.addRange(hi
&^m
, hi
, fold
)
255 // Finally. Generate byte matching equivalent for lo-hi.
256 var ulo
, uhi
[utf8
.UTFMax
]byte
257 n
:= utf8
.EncodeRune(ulo
[:], lo
)
258 m
:= utf8
.EncodeRune(uhi
[:], hi
)
260 panic("codesearch/regexp: bad utf-8 math")
264 for i
:= n
- 1; i
>= 0; i
-- {
265 pc
= b
.suffix(ulo
[i
], uhi
[i
], false, pc
)