source/libs/gmp/gmp-src/mpn/x86/k7/aors_n.asm

   1 dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
   2
   3 dnl  Copyright 1999-2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K7: 1.64 cycles/limb (at 16 limbs/loop).
  35
  36
  37
  38 dnl  K7: UNROLL_COUNT cycles/limb
  39 dnl           8           1.9
  40 dnl          16           1.64
  41 dnl          32           1.7
  42 dnl          64           2.0
  43 dnl  Maximum possible with the current code is 64.
  44
  45 deflit(UNROLL_COUNT, 16)
  46
  47
  48 ifdef(`OPERATION_add_n', `
  49         define(M4_inst,        adcl)
  50         define(M4_function_n,  mpn_add_n)
  51         define(M4_function_nc, mpn_add_nc)
  52         define(M4_description, add)
  53 ',`ifdef(`OPERATION_sub_n', `
  54         define(M4_inst,        sbbl)
  55         define(M4_function_n,  mpn_sub_n)
  56         define(M4_function_nc, mpn_sub_nc)
  57         define(M4_description, subtract)
  58 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
  59 ')')')
  60
  61 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
  62
  63
  64 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  65 C                         mp_size_t size);
  66 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  67 C                          mp_size_t size, mp_limb_t carry);
  68 C
  69 C Calculate src1,size M4_description src2,size, and store the result in
  70 C dst,size.  The return value is the carry bit from the top of the result (1
  71 C or 0).
  72 C
  73 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
  74 C the calculation.  Note values other than 1 or 0 here will lead to garbage
  75 C results.
  76 C
  77 C This code runs at 1.64 cycles/limb, which might be the best possible with
  78 C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
  79 C which can be done each cycle, leading to 1.5 c/l.
  80
  81 dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
  82 ifdef(`PIC',`
  83 deflit(UNROLL_THRESHOLD, 8)
  84 ',`
  85 deflit(UNROLL_THRESHOLD, 8)
  86 ')
  87
  88 defframe(PARAM_CARRY,20)
  89 defframe(PARAM_SIZE, 16)
  90 defframe(PARAM_SRC2, 12)
  91 defframe(PARAM_SRC1, 8)
  92 defframe(PARAM_DST,  4)
  93
  94 defframe(SAVE_EBP, -4)
  95 defframe(SAVE_ESI, -8)
  96 defframe(SAVE_EBX, -12)
  97 defframe(SAVE_EDI, -16)
  98 deflit(STACK_SPACE, 16)
  99
 100         TEXT
 101         ALIGN(32)
 102 deflit(`FRAME',0)
 103
  104 PROLOGUE(M4_function_nc)
  105         movl    PARAM_CARRY, %eax       C eax = caller-supplied carry-in
  106         jmp     L(start)                C continue in the body shared with M4_function_n
  107 EPILOGUE()
 108
 109 PROLOGUE(M4_function_n)
 110
 111         xorl    %eax, %eax      C carry
 112 L(start):
 113         movl    PARAM_SIZE, %ecx
 114         subl    $STACK_SPACE, %esp
 115 deflit(`FRAME',STACK_SPACE)
 116
 117         movl    %edi, SAVE_EDI
 118         movl    %ebx, SAVE_EBX
 119         cmpl    $UNROLL_THRESHOLD, %ecx
 120
 121         movl    PARAM_SRC2, %edx
 122         movl    PARAM_SRC1, %ebx
 123         jae     L(unroll)
 124
 125         movl    PARAM_DST, %edi
 126         leal    (%ebx,%ecx,4), %ebx
 127         leal    (%edx,%ecx,4), %edx
 128
 129         leal    (%edi,%ecx,4), %edi
 130         negl    %ecx
 131         shrl    %eax
 132
 133         C This loop in in a single 16 byte code block already, so no
 134         C alignment necessary.
 135 L(simple):
 136         C eax   scratch
 137         C ebx   src1
 138         C ecx   counter
 139         C edx   src2
 140         C esi
 141         C edi   dst
 142         C ebp
 143
 144         movl    (%ebx,%ecx,4), %eax
 145         M4_inst (%edx,%ecx,4), %eax
 146         movl    %eax, (%edi,%ecx,4)
 147         incl    %ecx
 148         jnz     L(simple)
 149
 150         movl    $0, %eax
 151         movl    SAVE_EDI, %edi
 152
 153         movl    SAVE_EBX, %ebx
 154         setc    %al
 155         addl    $STACK_SPACE, %esp
 156
 157         ret
 158
 159
 160 C -----------------------------------------------------------------------------
 161         C This is at 0x55, close enough to aligned.
 162 L(unroll):
 163 deflit(`FRAME',STACK_SPACE)
 164         movl    %ebp, SAVE_EBP
 165         andl    $-2, %ecx               C size low bit masked out
 166         andl    $1, PARAM_SIZE          C size low bit kept
 167
 168         movl    %ecx, %edi
 169         decl    %ecx
 170         movl    PARAM_DST, %ebp
 171
 172         shrl    $UNROLL_LOG2, %ecx
 173         negl    %edi
 174         movl    %esi, SAVE_ESI
 175
 176         andl    $UNROLL_MASK, %edi
 177
 178 ifdef(`PIC',`
 179         call    L(pic_calc)
 180 L(here):
 181 ',`
 182         leal    L(entry) (%edi,%edi,8), %esi    C 9 bytes per
 183 ')
 184         negl    %edi
 185         shrl    %eax
 186
 187         leal    ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
 188         leal    ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
 189         leal    ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
 190
 191         jmp     *%esi
 192
 193
 194 ifdef(`PIC',`
 195 L(pic_calc):
 196         C See mpn/x86/README about old gas bugs
 197         leal    (%edi,%edi,8), %esi
 198         addl    $L(entry)-L(here), %esi
 199         addl    (%esp), %esi
 200         ret_internal
 201 ')
 202
 203
 204 C -----------------------------------------------------------------------------
 205         ALIGN(32)
 206 L(top):
 207         C eax   zero
 208         C ebx   src1
 209         C ecx   counter
 210         C edx   src2
 211         C esi   scratch (was computed jump)
 212         C edi   dst
 213         C ebp   scratch
 214
 215         leal    UNROLL_BYTES(%edx), %edx
 216
 217 L(entry):
 218 deflit(CHUNK_COUNT, 2)
 219 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 220         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 221         deflit(`disp1', eval(disp0 + 4))
 222
 223 Zdisp(  movl,   disp0,(%ebx), %esi)
 224         movl    disp1(%ebx), %ebp
 225 Zdisp(  M4_inst,disp0,(%edx), %esi)
 226 Zdisp(  movl,   %esi, disp0,(%edi))
 227         M4_inst disp1(%edx), %ebp
 228         movl    %ebp, disp1(%edi)
 229 ')
 230
 231         decl    %ecx
 232         leal    UNROLL_BYTES(%ebx), %ebx
 233         leal    UNROLL_BYTES(%edi), %edi
 234         jns     L(top)
 235
 236
 237         mov     PARAM_SIZE, %esi
 238         movl    SAVE_EBP, %ebp
 239         movl    $0, %eax
 240
 241         decl    %esi
 242         js      L(even)
 243
 244         movl    (%ebx), %ecx
 245         M4_inst UNROLL_BYTES(%edx), %ecx
 246         movl    %ecx, (%edi)
 247 L(even):
 248
 249         movl    SAVE_EDI, %edi
 250         movl    SAVE_EBX, %ebx
 251         setc    %al
 252
 253         movl    SAVE_ESI, %esi
 254         addl    $STACK_SPACE, %esp
 255
 256         ret
 257
 258 EPILOGUE()