source/libs/gmp/gmp-src/mpn/x86/p6/aorsmul_1.asm

   1 dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
   2
   3 dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C                           cycles/limb
  35 C P5
  36 C P6 model 0-8,10-12             6.44
  37 C P6 model 9  (Banias)           6.15
  38 C P6 model 13 (Dothan)           6.11
  39 C P4 model 0  (Willamette)
  40 C P4 model 1  (?)
  41 C P4 model 2  (Northwood)
  42 C P4 model 3  (Prescott)
  43 C P4 model 4  (Nocona)
  44 C AMD K6
  45 C AMD K7
  46 C AMD K8
  47
  48
  49 dnl  P6 UNROLL_COUNT cycles/limb
  50 dnl          8           6.7
  51 dnl         16           6.35
  52 dnl         32           6.3
  53 dnl         64           6.3
  54 dnl  Maximum possible with the current code is 64.
   55 dnl  16 is within 0.05 c/l of the 32 and 64 figures and expands to less code.
   56 deflit(UNROLL_COUNT, 16)
  57
   58 dnl  Per-operation names: instruction, entry points, and words used in comments.
   59 ifdef(`OPERATION_addmul_1', `
   60         define(M4_inst,        addl)
   61         define(M4_function_1,  mpn_addmul_1)
   62         define(M4_function_1c, mpn_addmul_1c)
   63         define(M4_description, add it to)
   64         define(M4_desc_retval, carry)
   65 ',`ifdef(`OPERATION_submul_1', `
   66         define(M4_inst,        subl)
   67         define(M4_function_1,  mpn_submul_1)
   68         define(M4_function_1c, mpn_submul_1c)
   69         define(M4_description, subtract it from)
   70         define(M4_desc_retval, borrow)
   71 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
   72 ')')')
   73 dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file -- confirm its role.
   74 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
  75
  76
  77 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  78 C                            mp_limb_t mult);
  79 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  80 C                             mp_limb_t mult, mp_limb_t carry);
  81 C
  82 C Calculate src,size multiplied by mult and M4_description dst,size.
  83 C Return the M4_desc_retval limb from the top of the result.
  84 C
   85 C This code is pretty much the same as the K6 code.  The unrolled loop is
   86 C the same; there are just a few scheduling tweaks in the setups and the
   87 C simple loop.
  88 C
  89 C A number of variations have been tried for the unrolled loop, with one or
  90 C two carries, and with loads scheduled earlier, but nothing faster than 6
  91 C cycles/limb has been found.
  92
   93 ifdef(`PIC',`
   94 deflit(UNROLL_THRESHOLD, 5)     C sizes at or above this take the unrolled loop
   95 ',`
   96 deflit(UNROLL_THRESHOLD, 5)     C currently the same as the PIC value
   97 ')
   98 C Argument offsets, in signature order.  Only M4_function_1c reads PARAM_CARRY.
   99 defframe(PARAM_CARRY,     20)
  100 defframe(PARAM_MULTIPLIER,16)
  101 defframe(PARAM_SIZE,      12)
  102 defframe(PARAM_SRC,       8)
  103 defframe(PARAM_DST,       4)
  104
  105         TEXT
  106         ALIGN(32)
  107 C Entry point taking a carry-in limb: load it into ebx, then join the common code.
  108 PROLOGUE(M4_function_1c)
  109         pushl   %ebx
  110 deflit(`FRAME',4)
  111         movl    PARAM_CARRY, %ebx       C ebx = carry-in limb
  112         jmp     L(start_nc)             C continue inside M4_function_1
  113 EPILOGUE()
 114
 115 PROLOGUE(M4_function_1)
 116         push    %ebx
 117 deflit(`FRAME',4)
 118         xorl    %ebx, %ebx      C initial carry
 119
 120 L(start_nc):
 121         movl    PARAM_SIZE, %ecx
 122         pushl   %esi
 123 deflit(`FRAME',8)
 124
 125         movl    PARAM_SRC, %esi
 126         pushl   %edi
 127 deflit(`FRAME',12)
 128
 129         movl    PARAM_DST, %edi
 130         pushl   %ebp
 131 deflit(`FRAME',16)
 132         cmpl    $UNROLL_THRESHOLD, %ecx
 133
 134         movl    PARAM_MULTIPLIER, %ebp
 135         jae     L(unroll)
 136
 137
  138         C simple loop, one limb per pass, for sizes below UNROLL_THRESHOLD
  139         C this is offset 0x22, so close enough to aligned
  140 L(simple):
  141         C eax   scratch
  142         C ebx   carry
  143         C ecx   counter
  144         C edx   scratch
  145         C esi   src
  146         C edi   dst
  147         C ebp   multiplier
  148         C NOTE(review): size 0 is not handled (decl would wrap); presumably callers pass size >= 1
  149         movl    (%esi), %eax            C eax = src limb
  150         addl    $4, %edi                C dst advanced early; the store below uses -4
  151
  152         mull    %ebp                    C edx:eax = src limb * multiplier
  153
  154         addl    %ebx, %eax              C add the incoming carry limb to the low half
  155         adcl    $0, %edx
  156
  157         M4_inst %eax, -4(%edi)          C apply the low half to dst, leaving carry or borrow
  158         movl    %edx, %ebx              C high half becomes the next carry limb
  159
  160         adcl    $0, %ebx                C plus the carry or borrow from M4_inst
  161         decl    %ecx
  162
  163         leal    4(%esi), %esi           C lea advances src and keeps the decl flags
  164         jnz     L(simple)
  165
  166
  167         popl    %ebp
  168         popl    %edi
  169
  170         popl    %esi
  171         movl    %ebx, %eax              C return the final carry limb
  172
  173         popl    %ebx
  174         ret
 175
 176
 177
 178 C------------------------------------------------------------------------------
  179 C VAR_JUMP holds the computed jump temporarily because there are not enough
  180 C registers when doing the mul for the initial two carry limbs.
 181 C
 182 C The add/adc for the initial carry in %ebx is necessary only for the
 183 C mpn_add/submul_1c entry points.  Duplicating the startup code to
 184 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
 185 C idea.
 186
  187 dnl  Stack temporaries reuse the slots of parameters already copied to registers
  188 define(VAR_COUNTER,`PARAM_SIZE')        C size was loaded into ecx at entry
  189 define(VAR_JUMP,   `PARAM_DST')         C dst was loaded into edi at entry
 190
  191         C this is offset 0x43, so close enough to aligned
  192 L(unroll):
  193         C eax
  194         C ebx   initial carry
  195         C ecx   size
  196         C edx
  197         C esi   src
  198         C edi   dst
  199         C ebp   multiplier
  200         C The first limb is multiplied in this setup; the loop does the other size-1.
  201         movl    %ecx, %edx
  202         decl    %ecx                    C ecx = size-1
  203
  204         subl    $2, %edx                C edx = size-2
  205         negl    %ecx                    C ecx = -(size-1)
  206
  207         shrl    $UNROLL_LOG2, %edx      C presumably loop passes minus one, see jns at the loop end
  208         andl    $UNROLL_MASK, %ecx      C ecx = skip, the limbs of unrolled code to bypass on entry
  209
  210         movl    %edx, VAR_COUNTER
  211         movl    %ecx, %edx
  212
  213         C 15 code bytes per limb, so the entry point is L(entry) + 16*skip - skip
  214 ifdef(`PIC',`
  215         call    L(pic_calc)
  216 L(here):
  217 ',`
  218         shll    $4, %edx                C edx = 16*skip
  219         negl    %ecx                    C ecx = -skip
  220
  221         leal    L(entry) (%edx,%ecx,1), %edx
  222 ')
  223         movl    (%esi), %eax            C src low limb
  224
  225         movl    %edx, VAR_JUMP          C saved because mull overwrites edx
  226         leal    ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
  227         C esi = src+4 - 4*skip; edi gets dst - 4*skip below (plus 128 if UNROLL_BYTES is 256)
  228         mull    %ebp                    C edx:eax = src low limb * multiplier
  229
  230         addl    %ebx, %eax      C initial carry (from _1c)
  231         adcl    $0, %edx
  232
  233         movl    %edx, %ebx      C high carry
  234         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
  235
  236         movl    VAR_JUMP, %edx
  237         testl   $1, %ecx                C odd skip lands on the second limb of a pair
  238         movl    %eax, %ecx      C low carry
  239         C The second limb of each pair expects low in ebx and high in ecx.
  240         cmovnz( %ebx, %ecx)     C high,low carry other way around
  241         cmovnz( %eax, %ebx)
  242
  243         jmp     *%edx
 244
 245
  246 ifdef(`PIC',`
  247 L(pic_calc):
  248         shll    $4, %edx                C in: edx = ecx = limbs to skip
  249         negl    %ecx                    C out: ecx negated
  250         C edx becomes 15 * limbs to skip, then the entry label is added relative to the return address
  251         C See mpn/x86/README about old gas bugs
  252         leal    (%edx,%ecx,1), %edx
  253         addl    $L(entry)-L(here), %edx
  254         C (%esp) holds the return address, which is where the here label sits
  255         addl    (%esp), %edx            C edx = jump target inside the unrolled code
  256
  257         ret_internal
  258 ')
 259
 260
  261 C -----------------------------------------------------------
  262         ALIGN(32)
  263 L(top):
  264 deflit(`FRAME',16)
  265         C eax   scratch
  266         C ebx   carry hi
  267         C ecx   carry lo
  268         C edx   scratch
  269         C esi   src
  270         C edi   dst
  271         C ebp   multiplier
  272         C
  273         C VAR_COUNTER   loop counter
  274         C
  275         C 15 code bytes per limb
  276         C ebx and ecx hold the roles above at each pair start and exchange them for the second limb.
  277         addl    $UNROLL_BYTES, %edi     C advance dst; src advances at the bottom
  278         C The setup code jumps to L(entry) plus 15 bytes per skipped limb.
  279 L(entry):
  280 deflit(CHUNK_COUNT,2)
  281 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
  282         deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
  283         deflit(`disp1', eval(disp0 + 4))
  284         C Each limb: multiply the next src limb, apply the pending low limb to dst.
  285 Zdisp(  movl,   disp0,(%esi), %eax)
  286         mull    %ebp                    C edx:eax = src limb * multiplier
  287 Zdisp(  M4_inst,%ecx, disp0,(%edi))
  288         adcl    %eax, %ebx              C ebx = old high + new low + carry from M4_inst
  289         movl    %edx, %ecx
  290         adcl    $0, %ecx                C ecx = new high plus carry
  291
  292         movl    disp1(%esi), %eax
  293         mull    %ebp
  294         M4_inst %ebx, disp1(%edi)
  295         adcl    %eax, %ecx              C same steps with ebx and ecx exchanged
  296         movl    %edx, %ebx
  297         adcl    $0, %ebx
  298 ')
  299
  300         decl    VAR_COUNTER
  301         leal    UNROLL_BYTES(%esi), %esi        C lea keeps the decl flags for jns
  302
  303         jns     L(top)                  C loop until VAR_COUNTER goes negative
  304
  305
  306 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
  307         C Final limb: apply the last low limb, then return high plus carry or borrow.
  308         M4_inst %ecx, disp0(%edi)
  309         movl    %ebx, %eax
  310
  311         popl    %ebp
  312         popl    %edi
  313
  314         popl    %esi
  315         popl    %ebx
  316         adcl    $0, %eax                C flags from M4_inst survive the movl and popl
  317
  318         ret
  319
  320 EPILOGUE()