amd64 pipeline: improve performance of cvtdq2ps and cvtps2dq (128 and 256 bit version...
[valgrind.git] / perf / bigcode.c
blob02e069d9f6587139b966981f8f6a272a3217dea3
1 // This artificial program runs a lot of code. The exact amount depends on
2 // the command line -- if an arg "0" is given, it does exactly
3 // the same amount of work, but using four times as much code.
4 // If an arg >= 1 is given, the amount of code is multiplied by this arg.
5 //
6 // It's a stress test for Valgrind's translation speed; natively the two
7 // modes run in about the same time (the I-cache effects aren't big enough
8 // to make a difference), but under Valgrind the one running more code is
9 // significantly slower due to the extra translation time.
11 // 31 Aug 2015: this only "works" on x86/amd64/s390 by accident; the
12 // test is essentially kludged. This "generates" code into memory
13 // (the mmap'd area) and then executes it. But historically and even
14 // after this commit (r15601), the test has been run without
15 // --smc-check=all or all-non-file. That just happens to work because
16 // the "generated" code is never modified, so there's never a
17 // translated-vs-reality coherence problem. Really we ought to run
18 // with the new-as-of-r15601 default --smc-check=all-non-file, but that
19 // hugely slows it down and makes the results non-comparable with
20 // pre r15601 results, so instead the .vgperf files now specify the
21 // old default value --smc-check=stack explicitly.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>
#if defined(__mips__)
#include <asm/cachectl.h>
#include <sys/syscall.h>
#endif
#include "tests/sys_mman.h"
34 #define FN_SIZE 1280 // Bytes copied from f() for each copy; must be big enough to hold the compiled f()
35 // and any literal pool that might be used
36 #define N_LOOPS 20000 // Copies of f() made for argument "0"; should be divisible by four (copies are called four at a time)
37 #define RATIO 4 // Ratio of code sizes between the two modes
// The function whose machine code main() copies into the mmap'd area and
// then calls through those copies.
//
// Runs 5000 iterations of a switch on (x % 8) that updates y, and returns
// the final value of y.  There are no breaks: each case falls through into
// the ones below it, so case 1 also performs the work of cases 2, 3 and
// the default.
//
// NOTE(review): because it is executed from copies at other addresses, it
// presumably has to stay free of calls and external data references --
// confirm before adding any.
int f(int x, int y)
{
   // Do the arithmetic on unsigned values.  The doubling in case 3 exceeds
   // the range of int after a few dozen iterations; signed overflow is
   // undefined behaviour, whereas unsigned arithmetic wraps.  Converting
   // the result back to int is implementation-defined rather than
   // undefined, and yields the usual two's-complement value on the
   // compilers this is built with.
   const unsigned ux = (unsigned)x;
   unsigned acc = (unsigned)y;
   int i;

   for (i = 0; i < 5000; i++) {
      switch (x % 8) {
         case 1:  acc += 3;   /* fall through */
         case 2:  acc += ux;  /* fall through */
         case 3:  acc *= 2;   /* fall through */
         default: acc--;
      }
   }

   return (int)acc;
}
53 int main(int argc, char* argv[])
55 int h, i, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;
56 int n_fns, n_reps;
58 if (argc <= 1) {
59 // Mode 1: not so much code
60 n_fns = N_LOOPS / RATIO;
61 n_reps = RATIO;
62 printf("mode 1: ");
63 } else {
64 // Mode 2: lots of code
65 const int mul = atoi(argv[1]);
66 if (mul == 0)
67 n_fns = N_LOOPS;
68 else
69 n_fns = N_LOOPS * mul;
70 n_reps = 1;
71 printf("mode 1: ");
73 printf("%d copies of f(), %d reps\n", n_fns, n_reps);
75 char* a = mmap(0, FN_SIZE * n_fns,
76 PROT_EXEC|PROT_WRITE|PROT_READ,
77 MAP_PRIVATE|MAP_ANONYMOUS, -1,0);
78 assert(a != (char*)MAP_FAILED);
80 // Make a whole lot of copies of f(). FN_SIZE is much bigger than f()
81 // will ever be (we hope).
82 for (i = 0; i < n_fns; i++) {
83 memcpy(&a[FN_SIZE*i], f, FN_SIZE);
86 #if defined(__mips__)
87 syscall(__NR_cacheflush, a, FN_SIZE * n_fns, ICACHE);
88 #endif
90 for (h = 0; h < n_reps; h += 1) {
91 for (i = 0; i < n_fns; i += 4) {
92 int(*f1)(int,int) = (void*)&a[FN_SIZE*(i+0)];
93 int(*f2)(int,int) = (void*)&a[FN_SIZE*(i+1)];
94 int(*f3)(int,int) = (void*)&a[FN_SIZE*(i+2)];
95 int(*f4)(int,int) = (void*)&a[FN_SIZE*(i+3)];
96 sum1 += f1(i+0, n_fns-i+0);
97 sum2 += f2(i+1, n_fns-i+1);
98 sum3 += f3(i+2, n_fns-i+2);
99 sum4 += f4(i+3, n_fns-i+3);
100 if (i % 1000 == 0)
101 printf(".");
104 printf("result = %d\n", sum1 + sum2 + sum3 + sum4);
105 return 0;