2 ; match.asm -- Pentium-Pro optimized version of longest_match()
4 ; Updated for zlib 1.1.3 and converted to MASM 6.1x
5 ; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>
6 ; and Chuck Walbourn <chuckw@kinesoft.com>
7 ; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>
9 ; This is free software; you can redistribute it and/or modify it
10 ; under the terms of the GNU General Public License.
13 ; Written for zlib 1.1.2
14 ; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
19 ;===========================================================================
21 ;===========================================================================
25 MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)
26 MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))
28 ;===========================================================================
30 ;===========================================================================
32 ; This STRUCT assumes a 4-byte alignment
38 ds_pending_buf_size dd ?
60 ds_match_length dd ? ; used
61 ds_prev_match dd ? ; used
62 ds_match_available dd ?
63 ds_strstart dd ? ; used
64 ds_match_start dd ? ; used
65 ds_lookahead dd ? ; used
66 ds_prev_length dd ? ; used
67 ds_max_chain_length dd ? ; used
68 ds_max_laxy_match dd ?
71 ds_good_match dd ? ; used
72 ds_nice_match dd ? ; used
74 ; Don't need anymore of the struct for match
77 ;===========================================================================
79 ;===========================================================================
82 ;---------------------------------------------------------------------------
84 ;---------------------------------------------------------------------------
88 ; no initialization needed
92 ;---------------------------------------------------------------------------
93 ; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
94 ;---------------------------------------------------------------------------
100 ; Since this code uses EBP for a scratch register, the stack frame must
101 ; be manually constructed and referenced relative to the ESP register.
105 chainlenwmask = 0 ; high word: current chain len
107 window = 4 ; local copy of s->window
108 windowbestlen = 8 ; s->window + bestlen
109 scanend = 12 ; last two bytes of string
110 scanstart = 16 ; first two bytes of string
111 scanalign = 20 ; dword-misalignment of string
112 nicematch = 24 ; a good enough match size
113 bestlen = 28 ; size of best match so far
114 scan = 32 ; ptr to string wanting match
115 varsize = 36 ; number of bytes (also offset to last saved register)
117 ; Saved Registers (actually pushed into place)
128 ; Save registers that the compiler may be using
134 ; Allocate local variable space
137 ; Retrieve the function arguments. ecx will hold cur_match
138 ; throughout the entire function. edx will hold the pointer to the
139 ; deflate_state structure during the function's setup (before
140 ; entering the main loop).
142 mov edx, [esp+deflatestate]
143 ASSUME edx:PTR DEFLATE_STATE
145 mov ecx, [esp+curmatch]
147 ; uInt wmask = s->w_mask;
148 ; unsigned chain_length = s->max_chain_length;
149 ; if (s->prev_length >= s->good_match) {
150 ; chain_length >>= 2;
153 mov eax, [edx].ds_prev_length
154 mov ebx, [edx].ds_good_match
156 mov eax, [edx].ds_w_mask
157 mov ebx, [edx].ds_max_chain_length
158 jl SHORT LastMatchGood
162 ; chainlen is decremented once beforehand so that the function can
163 ; use the sign flag instead of the zero flag for the exit test.
164 ; It is then shifted into the high word, to make room for the wmask
165 ; value, which it will always accompany.
170 mov [esp+chainlenwmask], ebx
172 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
174 mov eax, [edx].ds_nice_match
175 mov ebx, [edx].ds_lookahead
177 jl SHORT LookaheadLess
180 mov [esp+nicematch], ebx
182 ;/* register Bytef *scan = s->window + s->strstart; */
184 mov esi, [edx].ds_window
185 mov [esp+window], esi
186 mov ebp, [edx].ds_strstart
190 ;/* Determine how many bytes the scan ptr is off from being */
191 ;/* dword-aligned. */
196 mov [esp+scanalign], eax
198 ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
199 ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
201 mov eax, [edx].ds_w_size
202 sub eax, MIN_LOOKAHEAD
204 jg SHORT LimitPositive
208 ;/* int best_len = s->prev_length; */
210 mov eax, [edx].ds_prev_length
211 mov [esp+bestlen], eax
213 ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
216 mov [esp+windowbestlen], esi
218 ;/* register ush scan_start = *(ushf*)scan; */
219 ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */
220 ;/* Posf *prev = s->prev; */
222 movzx ebx, WORD PTR[edi]
223 mov [esp+scanstart], ebx
224 movzx ebx, WORD PTR[eax+edi-1]
225 mov [esp+scanend], ebx
226 mov edi, [edx].ds_prev
228 ;/* Jump into the main loop. */
230 mov edx, [esp+chainlenwmask]
234 ; * match = s->window + cur_match;
235 ; * if (*(ushf*)(match+best_len-1) != scan_end ||
236 ; * *(ushf*)match != scan_start) continue;
238 ; * } while ((cur_match = prev[cur_match & wmask]) > limit
239 ; * && --chain_length != 0);
241 ; * Here is the inner loop of the function. The function will spend the
242 ; * majority of its time in this loop, and majority of that time will
243 ; * be spent in the first ten instructions.
245 ; * Within this loop:
248 ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
249 ; * %esi = windowbestlen - i.e., (window + bestlen)
257 movzx ecx, WORD PTR[edi+ecx*2]
264 movzx eax, WORD PTR[esi+ecx-1]
268 mov eax, [esp+window]
269 movzx eax, WORD PTR[eax+ecx]
270 cmp eax, [esp+scanstart]
273 ;/* Store the current value of chainlen. */
275 mov [esp+chainlenwmask], edx
277 ;/* Point %edi to the string under scrutiny, and %esi to the string we */
278 ;/* are hoping to match it up with. In actuality, %esi and %edi are */
279 ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
280 ;/* initialized to -(MAX_MATCH_8 - scanalign). */
282 mov esi, [esp+window]
285 mov eax, [esp+scanalign]
286 mov edx, -MAX_MATCH_8
287 lea edi, [edi+eax+MAX_MATCH_8]
288 lea esi, [esi+eax+MAX_MATCH_8]
290 ;/* Test the strings for equality, 8 bytes at a time. At the end,
291 ; * adjust %edx so that it is offset to the exact byte that mismatched.
293 ; * We already know at this point that the first three bytes of the
294 ; * strings match each other, and they can be safely passed over before
295 ; * starting the compare loop. So what this code does is skip over 0-3
296 ; * bytes, as much as necessary in order to dword-align the %edi
297 ; * pointer. (%esi will still be misaligned three times out of four.)
299 ; * It should be confessed that this loop usually does not represent
300 ; * much of the total running time. Replacing it with a more
301 ; * straightforward "rep cmpsb" would not drastically degrade
306 mov eax, DWORD PTR[esi+edx]
307 xor eax, DWORD PTR[edi+edx]
308 jnz SHORT LeaveLoopCmps
310 mov eax, DWORD PTR[esi+edx+4]
311 xor eax, DWORD PTR[edi+edx+4]
312 jnz SHORT LeaveLoopCmps4
333 ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */
334 ;/* then automatically accept it as the best possible match and leave. */
342 ;/* If the length of the match is not longer than the best match we */
343 ;/* have so far, then forget it and return to the lookup loop. */
345 mov edx, [esp+deflatestate]
346 mov ebx, [esp+bestlen]
349 mov esi, [esp+windowbestlen]
350 mov edi, [edx].ds_prev
351 mov ebx, [esp+scanend]
352 mov edx, [esp+chainlenwmask]
356 ;/* s->match_start = cur_match; */
357 ;/* best_len = len; */
358 ;/* if (len >= nice_match) break; */
359 ;/* scan_end = *(ushf*)(scan+best_len-1); */
362 mov ebx, [esp+nicematch]
363 mov [esp+bestlen], eax
364 mov [edx].ds_match_start, ecx
367 mov esi, [esp+window]
369 mov [esp+windowbestlen], esi
370 movzx ebx, WORD PTR[edi+eax-1]
371 mov edi, [edx].ds_prev
372 mov [esp+scanend], ebx
373 mov edx, [esp+chainlenwmask]
377 ;/* Accept the current string, with the maximum possible length. */
380 mov edx, [esp+deflatestate]
381 mov DWORD PTR[esp+bestlen], MAX_MATCH
382 mov [edx].ds_match_start, ecx
384 ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
385 ;/* return s->lookahead; */
388 mov edx, [esp+deflatestate]
389 mov ebx, [esp+bestlen]
390 mov eax, [edx].ds_lookahead
392 jg SHORT LookaheadRet
396 ; Restore the stack and return from whence we came.