2 This code originally comes from the output of gcc
3 (gcc -S) [ Don't forget to also specify -O3 !!!]
5 The inner loop was then hand-optimized
7 Hmm, it is a bit faster... but not enough to satisfy me :)
// 32-bit loads from fixed displacements off the `ikse` and `igrek` symbols,
// each indexed by the current value of %esi.
// NOTE(review): `ikse` / `igrek` are defined outside this excerpt; they read
// like phonetic French spellings of the letters x and y, so presumably these
// are the x- and y-related data -- confirm against the definitions.
// NOTE(review): the displacements 4, 164 and 168 are unnamed constants whose
// meaning is not visible here; the same three offsets are used on both symbols.
// The first load from each symbol overwrites its own index register (%esi).
40 movl ikse+4(%esi),%esi
43 movl ikse+168(%esi),%ecx
47 movl igrek+4(%esi),%esi
50 movl igrek+164(%esi),%edx
51 movl igrek+168(%esi),%ebx
52 movl ikse+164(%esi),%eax
// Reload two 32-bit values from the %ebp-relative frame slots -56 and -24.
// The trailing notes record hand-made register swaps (ebx<->esi, ecx<->edi).
// NOTE(review): presumably relative to the register assignment in the gcc
// output mentioned in the file header -- confirm.
88 movl -56(%ebp),%esi // swapped ebx and esi
91 movl -24(%ebp),%ecx // swapped ecx and edi
// The store of %esi to frame slot -72 below is commented out (disabled).
93 // movl %esi,-72(%ebp)
// Each instruction loads the 16-bit word at address %edx + %eax*2 into %ax:
// %eax acts as an index into 2-byte entries based at %edx, and its low half is
// then replaced by the fetched entry. Only the low 16 bits of %eax are
// written; the upper 16 bits keep whatever they held before the load.
// The identical instruction appears eight times in this excerpt.
// NOTE(review): presumably one load per step of the unrolled, hand-optimized
// inner loop mentioned in the file header; the instructions between the loads
// are not visible here, so confirm against the full listing.
112 movw (%edx,%eax,2),%ax
119 movw (%edx,%eax,2),%ax
126 movw (%edx,%eax,2),%ax
133 movw (%edx,%eax,2),%ax
140 movw (%edx,%eax,2),%ax
147 movw (%edx,%eax,2),%ax
154 movw (%edx,%eax,2),%ax
161 movw (%edx,%eax,2),%ax