1 ;uInt longest_match_x64(
3 ; IPos cur_match); /* current match */
5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
6 ; (AMD64 on Athlon 64, Opteron, Phenom
7 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
10 ; File written by Gilles Vollant, by converting to assembly the longest_match
11 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
13 ; and by taking inspiration on asm686 with masm, optimised assembly code
14 ; from Brian Raiter, written 1998
16 ; This software is provided 'as-is', without any express or implied
17 ; warranty. In no event will the authors be held liable for any damages
18 ; arising from the use of this software.
20 ; Permission is granted to anyone to use this software for any purpose,
21 ; including commercial applications, and to alter it and redistribute it
22 ; freely, subject to the following restrictions:
24 ; 1. The origin of this software must not be misrepresented; you must not
25 ; claim that you wrote the original software. If you use this software
26 ; in a product, an acknowledgment in the product documentation would be
27 ; appreciated but is not required.
28 ; 2. Altered source versions must be plainly marked as such, and must not be
29 ; misrepresented as being the original software
30 ; 3. This notice may not be removed or altered from any source distribution.
35 ; http://www.winimage.com/zLibDll
36 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
38 ; to compile this file for infozip Zip, I use option:
39 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
41 ; to compile this file for zLib, I use option:
42 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43 ; Be carrefull to adapt zlib1222add below to your version of zLib
44 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
45 ; value of zlib1222add later)
47 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
49 ; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
51 ; (you can get Windows WDK with ml64 for AMD64 from
52 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
56 ;uInt longest_match(s, cur_match)
58 ; IPos cur_match; /* current match */
66 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
67 ; free register : r14,r15
68 ; register can be saved : rsp
70 chainlenwmask
equ rsp
+ 8 - LocalVarsSize
; high word: current chain len
72 ;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
73 ;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
74 ;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
75 ;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
76 ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
77 ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
78 ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
81 nicematch
equ (rsp
+ 16 - LocalVarsSize
) ; a good enough match size
84 save_rdi
equ rsp
+ 24 - LocalVarsSize
85 save_rsi
equ rsp
+ 32 - LocalVarsSize
86 save_rbx
equ rsp
+ 40 - LocalVarsSize
87 save_rbp
equ rsp
+ 48 - LocalVarsSize
88 save_r12
equ rsp
+ 56 - LocalVarsSize
89 save_r13
equ rsp
+ 64 - LocalVarsSize
90 ;save_r14 equ rsp + 72 - LocalVarsSize
91 ;save_r15 equ rsp + 80 - LocalVarsSize
94 ; summary of register usage
114 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib
115 ; in the deflate_state structure since the asm code was first written
116 ; (if you compile with zlib 1.0.4 or older, remove the +4).
117 ; Note : these value are good with a 8 bytes boundary pack structure
122 MIN_LOOKAHEAD
equ (MAX_MATCH
+MIN_MATCH
+1)
125 ;;; Offsets for fields in the deflate_state structure. These numbers
126 ;;; are calculated from the definition of deflate_state, with the
127 ;;; assumption that the compiler will dword-align the fields. (Thus,
128 ;;; changing the definition of deflate_state could easily cause this
129 ;;; program to crash horribly, without so much as a warning at
130 ;;; compile time. Sigh.)
132 ; all the +zlib1222add offsets are due to the addition of fields
133 ; in zlib in the deflate_state structure since the asm code was first written
134 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136 ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
142 COMM
window_size:DWORD
144 COMM
window:BYTE:010040H
145 COMM
prev:WORD:08000H
149 COMM
match_start:DWORD
151 COMM
prev_length:DWORD ; PrevLen
152 COMM
max_chain_length:DWORD
153 COMM
good_match:DWORD
154 COMM
nice_match:DWORD
155 prev_ad
equ OFFSET prev
156 window_ad
equ OFFSET window
157 nicematch
equ nice_match
166 dsWSize
equ 56+zlib1222add
+(zlib1222add
/2)
167 dsWMask
equ 64+zlib1222add
+(zlib1222add
/2)
168 dsWindow
equ 72+zlib1222add
169 dsPrev
equ 88+zlib1222add
170 dsMatchLen
equ 128+zlib1222add
171 dsPrevMatch
equ 132+zlib1222add
172 dsStrStart
equ 140+zlib1222add
173 dsMatchStart
equ 144+zlib1222add
174 dsLookahead
equ 148+zlib1222add
175 dsPrevLen
equ 152+zlib1222add
176 dsMaxChainLen
equ 156+zlib1222add
177 dsGoodMatch
equ 172+zlib1222add
178 dsNiceMatch
equ 176+zlib1222add
180 window_size
equ [ rcx
+ dsWSize
]
181 WMask
equ [ rcx
+ dsWMask
]
182 window_ad
equ [ rcx
+ dsWindow
]
183 prev_ad
equ [ rcx
+ dsPrev
]
184 strstart
equ [ rcx
+ dsStrStart
]
185 match_start
equ [ rcx
+ dsMatchStart
]
186 Lookahead
equ [ rcx
+ dsLookahead
] ; 0ffffffffh on infozip
187 prev_length
equ [ rcx
+ dsPrevLen
]
188 max_chain_length
equ [ rcx
+ dsMaxChainLen
]
189 good_match
equ [ rcx
+ dsGoodMatch
]
190 nice_match
equ [ rcx
+ dsNiceMatch
]
193 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
195 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
198 ; All registers must be preserved across the call, except for
199 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
203 ;;; Save registers that the compiler may be using, and adjust esp to
204 ;;; make room for our stack frame.
207 ;;; Retrieve the function arguments. r8d will hold cur_match
208 ;;; throughout the entire function. edx will hold the pointer to the
209 ;;; deflate_state structure during the function's setup (before
210 ;;; entering the main loop.
212 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
214 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
231 ;;; uInt wmask = s->w_mask;
232 ;;; unsigned chain_length = s->max_chain_length;
233 ;;; if (s->prev_length >= s->good_match) {
234 ;;; chain_length >>= 2;
240 mov ebx, max_chain_length
246 ;;; chainlen is decremented once beforehand so that the function can
247 ;;; use the sign flag instead of the zero flag for the exit test.
248 ;;; It is then shifted into the high word, to make room for the wmask
249 ;;; value, which it will always accompany.
256 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
259 mov [chainlenwmask
], ebx
260 ; on infozip nice_match = [nice_match]
263 mov [chainlenwmask
], ebx
270 ;;; register Bytef *scan = s->window + s->strstart;
275 ;;; Determine how many bytes the scan ptr is off from being
282 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
285 mov eax,07efah
; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
288 sub eax, MIN_LOOKAHEAD
293 mov r11d
, prev_length
297 ;;; int best_len = s->prev_length;
300 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
304 ;;; register ush scan_start = *(ushf*)scan;
305 ;;; register ush scan_end = *(ushf*)(scan+best_len-1);
306 ;;; Posf *prev = s->prev;
308 movzx r12d
,word ptr [r9
]
309 movzx ebx, word ptr [r9
+ r11
- 1]
313 ;;; Jump into the main loop.
315 mov edx, [chainlenwmask
]
317 cmp bx,word ptr [rsi
+ r8
- 1]
323 movzx r8d
, word ptr [rdi
+ r8
*2]
330 cmp bx,word ptr [rsi
+ r8
- 1]
336 movzx r8d
, word ptr [rdi
+ r8
*2]
343 cmp bx,word ptr [rsi
+ r8
- 1]
349 movzx r8d
, word ptr [rdi
+ r8
*2]
357 cmp bx,word ptr [rsi
+ r8
- 1]
363 ;;; match = s->window + cur_match;
364 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
365 ;;; *(ushf*)match != scan_start) continue;
367 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368 ;;; && --chain_length != 0);
370 ;;; Here is the inner loop of the function. The function will spend the
371 ;;; majority of its time in this loop, and majority of that time will
372 ;;; be spent in the first ten instructions.
374 ;;; Within this loop:
377 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378 ;;; esi = windowbestlen - i.e., (window + bestlen)
385 movzx r8d
, word ptr [rdi
+ r8
*2]
393 cmp bx,word ptr [rsi
+ r8
- 1]
396 cmp r12w
, word ptr [r10
+ r8
]
400 ;;; Store the current value of chainlen.
401 mov [chainlenwmask
], edx
403 ;;; Point edi to the string under scrutiny, and esi to the string we
404 ;;; are hoping to match it up with. In actuality, esi and edi are
405 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
406 ;;; initialized to -(MAX_MATCH_8 - scanalign).
409 mov rdx
, 0fffffffffffffef8h
; -(MAX_MATCH_8)
410 lea rsi
, [rsi
+ r13
+ 0108h] ;MAX_MATCH_8]
411 lea rdi
, [r9
+ r13
+ 0108h] ;MAX_MATCH_8]
417 ;;; Test the strings for equality, 8 bytes at a time. At the end,
418 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
420 ;;; We already know at this point that the first three bytes of the
421 ;;; strings match each other, and they can be safely passed over before
422 ;;; starting the compare loop. So what this code does is skip over 0-3
423 ;;; bytes, as much as necessary in order to dword-align the edi
424 ;;; pointer. (rsi will still be misaligned three times out of four.)
426 ;;; It should be confessed that this loop usually does not represent
427 ;;; much of the total running time. Replacing it with a more
428 ;;; straightforward "rep cmpsb" would not drastically degrade
437 mov rax
, [rsi
+ rdx
+ 8]
438 xor rax
, [rdi
+ rdx
+ 8]
442 mov rax
, [rsi
+ rdx
+ 8+8]
443 xor rax
, [rdi
+ rdx
+ 8+8]
450 LeaveLoopCmps16: add rdx
,8
451 LeaveLoopCmps8: add rdx
,8
471 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
472 ;;; then automatically accept it as the best possible match and leave.
479 ;;; If the length of the match is not longer than the best match we
480 ;;; have so far, then forget it and return to the lookup loop.
481 ;///////////////////////////////////
489 mov edx, [chainlenwmask
]
492 ;;; s->match_start = cur_match;
494 ;;; if (len >= nice_match) break;
495 ;;; scan_end = *(ushf*)(scan+best_len-1);
505 movzx ebx, word ptr [r9
+ rax
- 1]
507 mov edx, [chainlenwmask
]
510 ;;; Accept the current string, with the maximum possible length.
516 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517 ;;; return s->lookahead;
528 ;;; Restore the stack and return from whence we came.
542 ; please don't remove this string !
543 ; Your can freely use gvmat64 in any free or commercial app
544 ; but it is far better don't remove the string in the binary!
545 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0