1 dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
3 dnl Copyright
2011 Free Software Foundation
, Inc.
5 dnl Contributed to the GNU project by Torbjorn Granlund
and Marco Bodrato.
7 dnl
This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
10 dnl it under the terms of
either:
12 dnl
* the GNU Lesser General
Public License as published by the Free
13 dnl Software Foundation
; either version 3 of the License, or (at your
14 dnl option
) any later version.
18 dnl
* the GNU General
Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any later version.
22 dnl
or both
in parallel
, as here.
24 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
25 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
29 dnl You should have received copies of the GNU General
Public License
and the
30 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
35 C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
36 C The inner loop is 2*3-way unrolled, which is the best we can do with the available
37 C registers. It seems tricky to use the same structure for rsblsh1_n, since we
38 C cannot feed carry between operations there.
44 C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift)
45 C P4 model 0 (Willamette)
47 C P4 model 2 (Northwood)
48 C P4 model 3 (Prescott)
55 C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
56 C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
57 C that means we need an initial magic multiply.
59 C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
60 C cannot do rsblsh1_n since we feed carry from the shift blocks to the
61 C add/subtract blocks, which is right for addition but reversed for
62 C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
63 C without losing any time, since we're not issue limited but bound by carry recurrency latency.
66 C Breaking carry recurrency might be a good idea. We would then need separate
67 C registers for the shift carry
and add/subtract carry
, which
in turn would
68 C force us to 2*2-way unrolling.
70 defframe
(PARAM_SIZE
, 16)
71 defframe
(PARAM_DBLD
, 12)
72 defframe
(PARAM_SRC
, 8)
73 defframe
(PARAM_DST
, 4)
75 dnl re-use parameter space
76 define
(VAR_COUNT
,`PARAM_DST
')
77 define(VAR_TMP,`PARAM_DBLD')
82 PROLOGUE
(mpn_addlsh1_n
)
91 push %ebx FRAME_pushl()
92 mov PARAM_SIZE, %ebx C size
102 not %edx C count = -(size\6)-1
105 push vp FRAME_pushl()
108 lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
110 lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
116 shr %edx ') C restore 2nd saved carry bit
119 rcr %edx C restore 1st saved carry bit
123 adc %edx, %edx C save a carry bit in edx
125 adc %edx, %edx ') C save another carry bit in edx
138 shr %edx ') C restore 2nd saved carry bit
146 rcr %edx C restore 1st saved carry bit
163 adc %edx, %edx C save a carry bit in edx
174 adc %edx, %edx ') C save another carry bit in edx
193 pop %ebx FRAME_popl
()