source/libs/gmp/gmp-src/mpn/x86/k6/mul_1.asm

   1 dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
   2
   3 dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C                           cycles/limb
  35 C P5
  36 C P6 model 0-8,10-12             5.5
  37 C P6 model 9  (Banias)
  38 C P6 model 13 (Dothan)           4.87
  39 C P4 model 0  (Willamette)
  40 C P4 model 1  (?)
  41 C P4 model 2  (Northwood)
  42 C P4 model 3  (Prescott)
  43 C P4 model 4  (Nocona)
  44 C AMD K6                         6.25
  45 C AMD K7
  46 C AMD K8
  47
  48
  49 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  50 C                      mp_limb_t multiplier);
  51 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  52 C                       mp_limb_t multiplier, mp_limb_t carry);
  53 C
  54 C Multiply src,size by multiplier and store the result in dst,size.
  55 C Return the carry limb from the top of the result.
  56 C
  57 C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
  58 C the low limb of the result.
  59
  60 defframe(PARAM_CARRY,     20)        C fifth argument, read only by mpn_mul_1c
  61 defframe(PARAM_MULTIPLIER,16)        C fourth argument
  62 defframe(PARAM_SIZE,      12)        C third argument
  63 defframe(PARAM_SRC,       8)         C second argument
  64 defframe(PARAM_DST,       4)         C first argument
  65 dnl  NOTE(review): the offsets above look like bytes from the entry stack pointer -- confirm in defframe
  66 dnl  minimum 5 because the unrolled code can't handle less
  67 deflit(UNROLL_THRESHOLD, 5)          C sizes >= this take L(unroll), smaller ones L(simple)
  68
  69         TEXT
  70         ALIGN(32)
  71
  72 PROLOGUE(mpn_mul_1c)            C entry point taking an initial carry limb
  73         pushl   %esi            C save esi; the shared exit paths pop it again
  74 deflit(`FRAME',4)
  75         movl    PARAM_CARRY, %esi       C esi = initial carry supplied by the caller
  76         jmp     L(start_nc)     C join mpn_mul_1 just past its carry clear
  77 EPILOGUE()
  78
  79
  80 PROLOGUE(mpn_mul_1)             C entry point with no carry-in
  81         push    %esi            C save esi, which holds the carry between limbs
  82 deflit(`FRAME',4)
  83         xorl    %esi, %esi      C initial carry
  84         C mpn_mul_1c joins below with esi = PARAM_CARRY and esi already pushed
  85 L(start_nc):
  86         mov     PARAM_SIZE, %ecx        C ecx = size
  87         push    %ebx
  88 FRAME_pushl()
  89         C ebx, edi and ebp are pushed here and popped again on both exit paths
  90         movl    PARAM_SRC, %ebx         C ebx = src
  91         push    %edi
  92 FRAME_pushl()
  93         C NOTE(review): FRAME_pushl presumably adds 4 to FRAME after each push so PARAM_* stay valid -- confirm
  94         movl    PARAM_DST, %edi         C edi = dst
  95         pushl   %ebp
  96 FRAME_pushl()
  97
  98         cmpl    $UNROLL_THRESHOLD, %ecx
  99         movl    PARAM_MULTIPLIER, %ebp  C ebp = multiplier; movl keeps the cmpl flags
 100
 101         jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
 102
 103
 104         C code offset 0x22 here, close enough to aligned
 105 L(simple):
 106         C eax   scratch
 107         C ebx   src
 108         C ecx   counter
 109         C edx   scratch
 110         C esi   carry
 111         C edi   dst
 112         C ebp   multiplier
 113         C
 114         C this loop 8 cycles/limb
 115         C one limb per pass, used for sizes below UNROLL_THRESHOLD
 116         movl    (%ebx), %eax    C eax = next source limb
 117         addl    $4, %ebx        C advance src
 118
 119         mull    %ebp            C edx:eax = limb * multiplier
 120
 121         addl    %esi, %eax      C low half plus carry-in, sets CF
 122         movl    $0, %esi        C movl rather than xorl: CF from the addl must survive
 123
 124         adcl    %edx, %esi      C next carry = high half + CF
 125
 126         movl    %eax, (%edi)    C store result limb
 127         addl    $4, %edi        C advance dst
 128
 129         loop    L(simple)       C decrement ecx and repeat while it is nonzero
 130         C NOTE(review): assumes size >= 1; entering with ecx = 0 would make loop wrap -- confirm callers
 131         C restore saved registers and return the final carry in eax
 132         popl    %ebp
 133
 134         popl    %edi
 135         popl    %ebx
 136
 137         movl    %esi, %eax      C return value = carry out of the top limb
 138         popl    %esi
 139
 140         ret
 141
 142
 143 C -----------------------------------------------------------------------------
 144 C The code for each limb is 6 cycles, with instruction decoding being the
 145 C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
 146 C cycles/limb in total.
 147 C
 148 C The secret ingredient to get 6.25 is to start the loop with the mul and
 149 C have the load/store pair at the end.  Rotating the load/store to the top
 150 C is an 0.5 c/l slowdown.  (Some address generation effect probably.)
 151 C
 152 C The whole unrolled loop fits nicely in exactly 80 bytes.
 153
 154
 155         ALIGN(16)       C already aligned to 16 here actually
 156 L(unroll):
 157         movl    (%ebx), %eax            C eax = src[0], the first limb for L(top)
 158         leal    -16(%ebx,%ecx,4), %ebx  C ebx = &src[size-4]
 159
 160         leal    -16(%edi,%ecx,4), %edi  C edi = &dst[size-4]
 161         subl    $4, %ecx                C ecx = size-4
 162
 163         negl    %ecx                    C ecx = -(size-4), negative because size >= 5
 164         C indexing ebx and edi by the negative ecx*4 starts at limb 0 and counts up towards zero
 165
 166         ALIGN(16)       C one byte nop for this alignment
 167 L(top):
 168         C eax   scratch
 169         C ebx   &src[size-4]
 170         C ecx   counter
 171         C edx   scratch
 172         C esi   carry
 173         C edi   &dst[size-4]
 174         C ebp   multiplier
 175         C limb 0 of the group of four; eax was loaded before entry or by the previous pass
 176         mull    %ebp                    C edx:eax = limb * multiplier
 177
 178         addl    %esi, %eax              C low half plus carry-in, sets CF
 179         movl    $0, %esi                C clear esi while keeping CF
 180
 181         adcl    %edx, %esi              C next carry = high half + CF
 182
 183         movl    %eax, (%edi,%ecx,4)     C store result limb
 184         movl    4(%ebx,%ecx,4), %eax    C fetch the next source limb
 185         C limb 1, same pattern
 186
 187         mull    %ebp
 188
 189         addl    %esi, %eax
 190         movl    $0, %esi
 191
 192         adcl    %edx, %esi
 193
 194         movl    %eax, 4(%edi,%ecx,4)
 195         movl    8(%ebx,%ecx,4), %eax
 196         C limb 2, same pattern
 197
 198         mull    %ebp
 199
 200         addl    %esi, %eax
 201         movl    $0, %esi
 202
 203         adcl    %edx, %esi
 204
 205         movl    %eax, 8(%edi,%ecx,4)
 206         movl    12(%ebx,%ecx,4), %eax
 207         C limb 3, same pattern
 208
 209         mull    %ebp
 210
 211         addl    %esi, %eax
 212         movl    $0, %esi
 213
 214         adcl    %edx, %esi
 215
 216         movl    %eax, 12(%edi,%ecx,4)
 217         movl    16(%ebx,%ecx,4), %eax   C first limb for the next pass or for the finish code
 218
 219
 220         addl    $4, %ecx                C four limbs done
 221         js      L(top)                  C repeat while the counter is still negative
 222
 223
 224
 225         C eax   next src limb
 226         C ebx   &src[size-4]
 227         C ecx   0 to 3 representing respectively 4 to 1 further limbs
 228         C edx
 229         C esi   carry
 230         C edi   &dst[size-4]
 231         C bit 1 of ecx set means only 2 or 1 limbs remain, so the pair below is skipped
 232         testb   $2, %cl
 233         jnz     L(finish_not_two)
 234
 235         mull    %ebp
 236
 237         addl    %esi, %eax
 238         movl    $0, %esi
 239
 240         adcl    %edx, %esi
 241
 242         movl    %eax, (%edi,%ecx,4)
 243         movl    4(%ebx,%ecx,4), %eax
 244
 245
 246         mull    %ebp
 247
 248         addl    %esi, %eax
 249         movl    $0, %esi
 250
 251         adcl    %edx, %esi
 252
 253         movl    %eax, 4(%edi,%ecx,4)
 254         movl    8(%ebx,%ecx,4), %eax
 255
 256         addl    $2, %ecx                C account for the two limbs just done
 257 L(finish_not_two):
 258         C ecx is 2 or 3 here, so the remaining limbs sit at fixed offsets from ebx and edi
 259         C bit 0 of ecx set means a single limb remains
 260         testb   $1, %cl
 261         jnz     L(finish_not_one)
 262
 263         mull    %ebp
 264
 265         addl    %esi, %eax
 266         movl    $0, %esi
 267
 268         adcl    %edx, %esi
 269
 270         movl    %eax, 8(%edi)           C store dst[size-2]
 271         movl    12(%ebx), %eax          C eax = src[size-1]
 272 L(finish_not_one):
 273         C last limb: eax holds src[size-1]; register restores are interleaved with the arithmetic
 274
 275         mull    %ebp
 276
 277         addl    %esi, %eax
 278         popl    %ebp                    C popl leaves CF from the addl for the adcl below
 279
 280         adcl    $0, %edx                C edx = high half + CF, the carry out of the top limb
 281
 282         movl    %eax, 12(%edi)          C store dst[size-1]
 283         popl    %edi
 284
 285         popl    %ebx
 286         movl    %edx, %eax              C return value = carry limb
 287
 288         popl    %esi
 289
 290         ret
 291
 292 EPILOGUE()