/*--------------------------------------------------------------------*/
/*--- Asm-only TransTab stuff.              pub_core_transtab_asm.h ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2000-2017 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/
#ifndef __PUB_CORE_TRANSTAB_ASM_H
#define __PUB_CORE_TRANSTAB_ASM_H
/* Constants for the fast translation lookup cache.  It is a 4-way
   set-associative cache with more-or-less LRU replacement.  It contains
   2^VG_TT_FAST_BITS sets.

   On all targets, the set number is computed from the least significant
   2 * VG_TT_FAST_BITS bits of the guest address.  This is a bit unusual,
   in that it is more common to use just a VG_TT_FAST_BITS-sized slice of
   the address as the set number.  Using twice as many bits (the two
   chunks are xor'd) spreads entries out (reduces aliasing) and
   significantly reduces the overall miss rate.  The cost is two extra
   cycles on the fast lookup path, to perform an extra shift and an xor.

   For each set there are 4 ways: way0, way1, way2 and way3.  way0 is
   intended to be the MRU and way3 the LRU.  Most lookups hit way0 and
   involve no modification of the line.  A hit at way1 causes way0 and
   way1 to be swapped.  A hit at way2 causes way1 and way2 to be swapped;
   that is, way2 is moved one step closer to the front, but not all the
   way to the front.  Similarly, a hit at way3 causes way2 and way3 to be
   swapped.

   See VG_(lookupInFastCache) for a C implementation of this logic and
   dispatch-*-*.S, label VG_(disp_cp_xindir), for the handcoded assembly
   equivalents for each target.  Note that VG_(lookupInFastCache) is used
   in C land for some administrative lookups but isn't really performance
   critical.  The dispatch-*-*.S implementations are used to process all
   indirect branches in the simulator and so *are* performance critical.

   Updates to the cache are rare.  These are performed by
   setFastCacheEntry.  New entries are put into way0 and all others are
   shifted down one slot, so that the contents of way3 fall out of the
   cache.  (The hit-promotion and insertion policy is spelled out in the
   illustrative sketch just after this comment.)

   On x86/amd64, the cache index is computed as
     (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1 : 0].

   On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of instruction
   addresses are zero, which means the above function would cause only
   1/4 of the sets ever to be used.  So instead the function is
     (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+2 : 0+2].

   On arm32, the minimum instruction size is 2 bytes, so we discard only
   the least significant bit of the address, hence:
     (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+1 : 0+1].

   On s390x the rightmost bit of an instruction address is zero, so the
   arm32 scheme is used.  (A sketch of this per-target computation
   follows the #defines below.)
*/
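/* What follows is an illustrative sketch only -- it sits inside "#if 0"
   and is never compiled.  It spells out in C the hit-promotion and
   insertion policy described above.  The real implementations are
   VG_(lookupInFastCache) and setFastCacheEntry (C) and the handcoded
   VG_(disp_cp_xindir) sequences in dispatch-*-*.S; the struct shape and
   the example_* names below are assumptions made for illustration and
   are not part of this interface. */
#if 0
/* Assumed shape of one set, consistent with the FCS_* offsets defined
   further down; Addr/Bool come from pub_core_basics.h. */
typedef struct {
   Addr guest0; Addr host0;   /* way0: most recently used   */
   Addr guest1; Addr host1;
   Addr guest2; Addr host2;
   Addr guest3; Addr host3;   /* way3: least recently used  */
} ExampleFastCacheSet;

/* Lookup with partial-LRU promotion: a hit at way N > 0 swaps ways N-1
   and N, so the entry moves one step towards the front per hit. */
static Bool example_lookup ( ExampleFastCacheSet* set,
                             Addr guest, /*OUT*/Addr* host )
{
   Addr tg, th;
   if (set->guest0 == guest) {              /* way0 hit: no movement   */
      *host = set->host0; return True;
   }
   if (set->guest1 == guest) {              /* way1 hit: swap 0 and 1  */
      tg = set->guest0;          th = set->host0;
      set->guest0 = set->guest1; set->host0 = set->host1;
      set->guest1 = tg;          set->host1 = th;
      *host = set->host0; return True;
   }
   if (set->guest2 == guest) {              /* way2 hit: swap 1 and 2  */
      tg = set->guest1;          th = set->host1;
      set->guest1 = set->guest2; set->host1 = set->host2;
      set->guest2 = tg;          set->host2 = th;
      *host = set->host1; return True;
   }
   if (set->guest3 == guest) {              /* way3 hit: swap 2 and 3  */
      tg = set->guest2;          th = set->host2;
      set->guest2 = set->guest3; set->host2 = set->host3;
      set->guest3 = tg;          set->host3 = th;
      *host = set->host2; return True;
   }
   return False;                            /* miss                    */
}

/* Insertion (cf. setFastCacheEntry): the new pair goes into way0, the
   existing entries shift down one slot, and the old way3 contents fall
   out of the cache. */
static void example_insert ( ExampleFastCacheSet* set,
                             Addr guest, Addr host )
{
   set->guest3 = set->guest2;  set->host3 = set->host2;
   set->guest2 = set->guest1;  set->host2 = set->host1;
   set->guest1 = set->guest0;  set->host1 = set->host0;
   set->guest0 = guest;        set->host0 = host;
}
#endif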
#define VG_TT_FAST_BITS 13
#define VG_TT_FAST_SETS (1 << VG_TT_FAST_BITS)
#define VG_TT_FAST_MASK ((VG_TT_FAST_SETS) - 1)
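/* Illustrative sketch only -- inside "#if 0", never compiled.  It shows
   the per-target set-number computation described in the comment above,
   expressed with the macros just defined.  The helper name and the
   kAlignBits constant are made up for illustration; the real computation
   lives in VG_(lookupInFastCache) and in dispatch-*-*.S. */
#if 0
static UWord example_set_no ( Addr guest )
{
   /* Number of always-zero low bits in a guest instruction address,
      following the enumeration in the comment above: 0 on x86/amd64,
      1 on arm32/s390x, 2 on ppc32/ppc64/mips32/mips64/arm64. */
#  if defined(VGA_x86) || defined(VGA_amd64)
   const UWord kAlignBits = 0;
#  elif defined(VGA_arm) || defined(VGA_s390x)
   const UWord kAlignBits = 1;
#  else
   const UWord kAlignBits = 2;
#  endif
   UWord a = (UWord)guest;
   /* xor the low VG_TT_FAST_BITS-sized chunk with the next-higher one to
      spread entries across the sets, then take VG_TT_FAST_BITS bits
      starting just above the always-zero alignment bits. */
   return ((a ^ (a >> VG_TT_FAST_BITS)) >> kAlignBits) & VG_TT_FAST_MASK;
}
#endif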
// Log2(sizeof(FastCacheSet)).  This is needed in the handwritten assembly.

#if defined(VGA_amd64) || defined(VGA_arm64) \
    || defined(VGA_ppc64be) || defined(VGA_ppc64le) \
    || (defined(VGA_mips64) && defined(VGABI_64)) \
    || defined(VGA_s390x)
  // And all other 64-bit hosts
# define VG_FAST_CACHE_SET_BITS 6
  // These FCS_{g,h}{0,1,2,3} are the values of
  // offsetof(FastCacheSet, {guest,host}{0,1,2,3}).
# define FCS_g0  0
# define FCS_h0  8
# define FCS_g1 16
# define FCS_h1 24
# define FCS_g2 32
# define FCS_h2 40
# define FCS_g3 48
# define FCS_h3 56

#elif defined(VGA_x86) || defined(VGA_arm) || defined(VGA_ppc32) \
      || defined(VGA_mips32) || defined(VGP_nanomips_linux) \
      || (defined(VGA_mips64) && defined(VGABI_N32))
  // And all other 32-bit hosts
# define VG_FAST_CACHE_SET_BITS 5
# define FCS_g0  0
# define FCS_h0  4
# define FCS_g1  8
# define FCS_h1 12
# define FCS_g2 16
# define FCS_h2 20
# define FCS_g3 24
# define FCS_h3 28

#else
# error "VG_FAST_CACHE_SET_BITS not known"
#endif
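/* Illustrative sketch only -- inside "#if 0", never compiled.  It shows
   how the handwritten dispatch-*-*.S code uses the constants above: the
   set number is scaled by sizeof(FastCacheSet) (hence the shift by
   VG_FAST_CACHE_SET_BITS) to reach the set's base address inside the
   fast-cache array, and way0 is probed first via the FCS_g0/FCS_h0
   offsets.  Taking the cache base as a plain byte pointer is an
   assumption for illustration; the address arithmetic is the point. */
#if 0
static Addr example_way0_probe ( UChar* fast_cache_base,
                                 UWord setNo, Addr guest )
{
   /* &set == fast_cache_base + setNo * sizeof(FastCacheSet) */
   UChar* set = fast_cache_base + (setNo << VG_FAST_CACHE_SET_BITS);
   /* Compare the guest address cached in way0 ...                     */
   if (*(Addr*)(set + FCS_g0) == guest)
      /* ... and on a hit, return the corresponding host code address. */
      return *(Addr*)(set + FCS_h0);
   /* Miss at way0: the assembly falls back to ways 1..3 and finally to
      the slow C path. */
   return 0;
}
#endif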
#endif // __PUB_CORE_TRANSTAB_ASM_H

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/