Optimize vswap()
vswap() is called often enough to show up in profiles, and it was easy to
hand-optimize the swapping of vtop[-1] and vtop[0]: instead of a large
(28 bytes on i386) tmp variable and memory-to-memory copies, let's swap
the areas by longs through registers with streamlined assembly.
For
$ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c
before:
# Overhead Command Shared Object Symbol
# ........ ........... ................... ..............................................
#
15.19% tcc tcc [.] next_nomacro1
5.19% tcc libc-2.13.so [.] _int_malloc
4.57% tcc tcc [.] next
3.36% tcc tcc [.] tok_str_add2
3.03% tcc tcc [.] macro_subst_tok
2.93% tcc tcc [.] macro_subst
2.53% tcc tcc [.] next_nomacro_spc
2.49% tcc tcc [.] vswap
2.36% tcc libc-2.13.so [.] _int_free
│ ST_FUNC void vswap(void)
│ {
1,96 │ push %edi
2,65 │ push %esi
1,08 │ sub $0x20,%esp
│ SValue tmp;
│
│ /* cannot let cpu flags if other instruction are generated. Also
│ avoid leaving VT_JMP anywhere except on the top of the stack
│ because it would complicate the code generator. */
│ if (vtop >= vstack) {
0,98 │ mov 0x8078cac,%eax
│ cmp $0x8078d3c,%eax
1,18 │ ┌──jb 24
│ │ int v = vtop->r & VT_VALMASK;
1,08 │ │ mov 0x8(%eax),%edx
0,78 │ │ and $0x3f,%edx
│ │ if (v == VT_CMP || (v & ~1) == VT_JMP)
0,78 │ │ cmp $0x33,%edx
0,69 │ │↓ je 54
0,59 │ │ and $0xfffffffe,%edx
0,49 │ │ cmp $0x34,%edx
0,29 │ │↓ je 54
│ │ gv(RC_INT);
│ │ }
│ │ tmp = vtop[0];
1,08 │24:└─→lea 0x4(%esp),%edi
0,39 │ mov $0x7,%ecx
│ mov %eax,%esi
14,41 │ rep movsl %ds:(%esi),%es:(%edi)
│ vtop[0] = vtop[-1];
9,51 │ lea -0x1c(%eax),%esi
1,96 │ mov $0x7,%cl
│ mov %eax,%edi
17,06 │ rep movsl %ds:(%esi),%es:(%edi)
│ vtop[-1] = tmp;
10,20 │ mov 0x8078cac,%edi
2,35 │ sub $0x1c,%edi
0,78 │ lea 0x4(%esp),%esi
│ mov $0x7,%cl
15,20 │ rep movsl %ds:(%esi),%es:(%edi)
│ }
9,90 │ add $0x20,%esp
2,25 │ pop %esi
1,67 │ pop %edi
0,69 │ ret
after:
# Overhead Command Shared Object Symbol
# ........ ........... ................... ..............................................
#
15.27% tcc tcc [.] next_nomacro1
5.08% tcc libc-2.13.so [.] _int_malloc
4.57% tcc tcc [.] next
3.17% tcc tcc [.] tok_str_add2
3.12% tcc tcc [.] macro_subst
2.99% tcc tcc [.] macro_subst_tok
2.43% tcc tcc [.] next_nomacro_spc
2.32% tcc libc-2.13.so [.] _int_free
. . .
0.71% tcc tcc [.] vswap
│ ST_FUNC void vswap(void)
│ {
7,22 │ push %eax
│ /* cannot let cpu flags if other instruction are generated. Also
│ avoid leaving VT_JMP anywhere except on the top of the stack
│ because it would complicate the code generator. */
│ if (vtop >= vstack) {
11,34 │ mov 0x8078cac,%eax
2,75 │ cmp $0x8078d3c,%eax
0,34 │ ┌──jb 20
│ │ int v = vtop->r & VT_VALMASK;
0,34 │ │ mov 0x8(%eax),%edx
8,93 │ │ and $0x3f,%edx
│ │ if (v == VT_CMP || (v & ~1) == VT_JMP)
2,06 │ │ cmp $0x33,%edx
2,41 │ │↓ je 74
2,41 │ │ and $0xfffffffe,%edx
0,34 │ │ cmp $0x34,%edx
2,41 │ │↓ je 74
│ │ vtopl[-1*VSIZEL + i] = tmpl; \
│ │ } do {} while (0)
│ │
│ │ VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
│ │ VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
│ │ VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
2,06 │20:└─→mov 0x18(%eax),%edx
1,37 │ mov -0x4(%eax),%ecx
2,06 │ mov %ecx,0x18(%eax)
1,37 │ mov %edx,-0x4(%eax)
2,06 │ mov 0x14(%eax),%edx
2,06 │ mov -0x8(%eax),%ecx
2,41 │ mov %ecx,0x14(%eax)
3,09 │ mov %edx,-0x8(%eax)
3,09 │ mov 0x10(%eax),%edx
1,72 │ mov -0xc(%eax),%ecx
2,75 │ mov %ecx,0x10(%eax)
1,72 │ mov %edx,-0xc(%eax)
│ VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
2,41 │ mov 0xc(%eax),%edx
2,41 │ mov -0x10(%eax),%ecx
2,41 │ mov %ecx,0xc(%eax)
0,69 │ mov %edx,-0x10(%eax)
1,72 │ mov 0x8(%eax),%edx
0,69 │ mov -0x14(%eax),%ecx
1,03 │ mov %ecx,0x8(%eax)
1,37 │ mov %edx,-0x14(%eax)
1,37 │ mov 0x4(%eax),%edx
0,69 │ mov -0x18(%eax),%ecx
3,09 │ mov %ecx,0x4(%eax)
2,06 │ mov %edx,-0x18(%eax)
1,37 │ mov (%eax),%edx
2,41 │ mov -0x1c(%eax),%ecx
1,37 │ mov %ecx,(%eax)
4,12 │ mov %edx,-0x1c(%eax)
│ }
│
│ # undef VSWAPL
│ # undef VSIZEL
│ }
1,03 │ pop %eax
3,44 │ ret
Overall speedup:
# best of 5 runs
before: 8268 idents, 47203 lines,
1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
after: 8273 idents, 47231 lines,
1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s
The static ASSERT macro is taken from CCAN's[1] build_assert[2], which is
in the public domain.
[1] http://ccodearchive.net/
[2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=
24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD