source/libs/gmp/gmp-src/mpn/x86/mul_basecase.asm

   1 dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
   2 dnl  in a third limb vector.
   3
   4 dnl  Copyright 1996-2002 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34
  35 C     cycles/crossproduct
  36 C P5      15
  37 C P6       7.5
  38 C K6      12.5
  39 C K7       5.5
  40 C P4      24
  41
  42
  43 C void mpn_mul_basecase (mp_ptr wp,
  44 C                        mp_srcptr xp, mp_size_t xsize,
  45 C                        mp_srcptr yp, mp_size_t ysize);
  46 C
  47 C This was written in haste since the Pentium optimized code that was used
  48 C for all x86 machines was slow for the Pentium II.  This code would benefit
  49 C from some cleanup.
  50 C
  51 C To shave off some percentage of the run-time, one should make 4 variants
  52 C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
  53 C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
  54 C part of the function, but since it is not very large, that would be
  55 C acceptable.
  56 C
  57 C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
  58 C unknown.
  59
  60 defframe(PARAM_YSIZE,20)        C arg 5: ysize, limb count of yp
  61 defframe(PARAM_YP,   16)        C arg 4: yp, second source operand
  62 defframe(PARAM_XSIZE,12)        C arg 3: xsize, limb count of xp
  63 defframe(PARAM_XP,   8)         C arg 2: xp, first source operand
  64 defframe(PARAM_WP,   4)         C arg 1: wp, destination
  65 C Locals at negative offsets (presumably below the entry esp; defframe is defined in config.m4)
  66 defframe(VAR_MULTIPLIER, -4)    C y limb used as multiplier for the current row
  67 defframe(VAR_COUNTER,    -8)    C number of y limbs still to be processed
  68 deflit(VAR_STACK_SPACE,  8)     C bytes subtracted from esp for the two locals above
  69
  70         TEXT
  71         ALIGN(8)
  72 C Schoolbook product: wp[0 .. xsize+ysize-1] = xp[0 .. xsize-1] * yp[0 .. ysize-1].
  73 PROLOGUE(mpn_mul_basecase)
  74 deflit(`FRAME',0)
  75 C Registers: esi = xp, edi = wp, ebp = yp (scratch in L(oopX)), ebx = carry limb.
  76         subl    $VAR_STACK_SPACE,%esp   C room for VAR_MULTIPLIER and VAR_COUNTER
  77         pushl   %esi                    C save registers; popped at L(skip)/L(done)
  78         pushl   %ebp
  79         pushl   %edi
  80 deflit(`FRAME',eval(VAR_STACK_SPACE+12))
  81
  82         movl    PARAM_XP,%esi           C esi = xp
  83         movl    PARAM_WP,%edi           C edi = wp
  84         movl    PARAM_YP,%ebp           C ebp = yp
  85
  86         movl    (%esi),%eax             C load xp[0]
  87         mull    (%ebp)                  C multiply by yp[0]
  88         movl    %eax,(%edi)             C store to wp[0]
  89         movl    PARAM_XSIZE,%ecx        C xsize
  90         decl    %ecx                    C If xsize = 1, ysize = 1 too
  91         jz      L(done)                 C 1x1 product: only wp[1] left to store
  92
  93         pushl   %ebx                    C saved only on this path; L(done) never pops it
  94 FRAME_pushl()
  95         movl    %edx,%ebx               C ebx = carry limb = high half of xp[0]*yp[0]
  96
  97         leal    4(%esi),%esi            C esi -> xp[1]
  98         leal    4(%edi),%edi            C edi -> wp[1]
  99
 100 L(oopM):
 101         movl    (%esi),%eax             C load next limb at xp[j]
 102         leal    4(%esi),%esi
 103         mull    (%ebp)                  C edx:eax = xp[j] * yp[0]
 104         addl    %ebx,%eax               C low half + carry limb
 105         movl    %edx,%ebx
 106         adcl    $0,%ebx                 C new carry limb = high half + carry flag
 107         movl    %eax,(%edi)             C store result limb
 108         leal    4(%edi),%edi
 109         decl    %ecx
 110         jnz     L(oopM)                 C xsize-1 iterations in total
 111
 112         movl    %ebx,(%edi)             C most significant limb of product
 113         addl    $4,%edi                 C increment wp
 114         movl    PARAM_XSIZE,%eax
 115         shll    $2,%eax                 C eax = xsize limbs in bytes
 116         subl    %eax,%edi               C edi -> wp[1], start of the next row
 117         subl    %eax,%esi               C esi -> xp[0]
 118
 119         movl    PARAM_YSIZE,%eax        C ysize
 120         decl    %eax
 121         jz      L(skip)                 C ysize = 1: product is complete
 122         movl    %eax,VAR_COUNTER        C remaining rows = ysize-1
 123
 124 L(outer):
 125         movl    PARAM_YP,%ebp           C yp
 126         addl    $4,%ebp                 C make ebp point to next v limb
 127         movl    %ebp,PARAM_YP           C keep advanced yp in the parameter slot
 128         movl    (%ebp),%eax             C copy y limb ...
 129         movl    %eax,VAR_MULTIPLIER     C ... to stack slot
 130         movl    PARAM_XSIZE,%ecx
 131
 132         xorl    %ebx,%ebx               C cylimb = 0 at the start of each row
 133         andl    $3,%ecx                 C ecx = xsize mod 4 leftover limbs
 134         jz      L(end0)
 135
 136 L(oop0):
 137         movl    (%esi),%eax
 138         mull    VAR_MULTIPLIER          C edx:eax = xp[j] * current y limb
 139         leal    4(%esi),%esi
 140         addl    %ebx,%eax               C lo += cylimb
 141         movl    $0,%ebx                 C movl leaves the carry flag intact for adcl
 142         adcl    %ebx,%edx               C hi += carry
 143         addl    %eax,(%edi)             C wp[j] += lo
 144         adcl    %edx,%ebx               C propagate carry into cylimb
 145
 146         leal    4(%edi),%edi
 147         decl    %ecx
 148         jnz     L(oop0)
 149
 150 L(end0):
 151         movl    PARAM_XSIZE,%ecx
 152         shrl    $2,%ecx                 C ecx = xsize/4 unrolled iterations
 153         jz      L(endX)
 154
 155         ALIGN(8)
 156 L(oopX):
 157         movl    (%esi),%eax
 158         mull    VAR_MULTIPLIER
 159         addl    %eax,%ebx               C ebx = lo0 + cylimb, pending for wp[j]
 160         movl    $0,%ebp                 C movl, not xorl: carry flag must survive
 161         adcl    %edx,%ebp               C ebp = hi0 + carry
 162
 163         movl    4(%esi),%eax
 164         mull    VAR_MULTIPLIER
 165         addl    %ebx,(%edi)             C wp[j] += pending limb
 166         adcl    %eax,%ebp       C new lo + cylimb
 167         movl    $0,%ebx
 168         adcl    %edx,%ebx
 169
 170         movl    8(%esi),%eax
 171         mull    VAR_MULTIPLIER
 172         addl    %ebp,4(%edi)            C wp[j+1] += pending limb
 173         adcl    %eax,%ebx       C new lo + cylimb
 174         movl    $0,%ebp
 175         adcl    %edx,%ebp
 176
 177         movl    12(%esi),%eax
 178         mull    VAR_MULTIPLIER
 179         addl    %ebx,8(%edi)            C wp[j+2] += pending limb
 180         adcl    %eax,%ebp       C new lo + cylimb
 181         movl    $0,%ebx
 182         adcl    %edx,%ebx
 183
 184         addl    %ebp,12(%edi)           C wp[j+3] += pending limb
 185         adcl    $0,%ebx         C propagate carry into cylimb
 186
 187         leal    16(%esi),%esi           C advance xp by 4 limbs
 188         leal    16(%edi),%edi           C advance wp by 4 limbs
 189         decl    %ecx
 190         jnz     L(oopX)
 191
 192 L(endX):
 193         movl    %ebx,(%edi)             C final cylimb becomes the top limb of this row
 194         addl    $4,%edi
 195
 196         C we incremented wp and xp in the loop above; compensate
 197         movl    PARAM_XSIZE,%eax
 198         shll    $2,%eax
 199         subl    %eax,%edi               C net effect: wp advanced by one limb
 200         subl    %eax,%esi               C xp back at xp[0]
 201
 202         movl    VAR_COUNTER,%eax
 203         decl    %eax
 204         movl    %eax,VAR_COUNTER
 205         jnz     L(outer)                C more y limbs to process
 206
 207 L(skip):
 208         popl    %ebx                    C restore in reverse order of the pushes
 209         popl    %edi
 210         popl    %ebp
 211         popl    %esi
 212         addl    $VAR_STACK_SPACE,%esp   C release locals; matches the subl at entry
 213         ret
 214
 215 L(done):
 216         movl    %edx,4(%edi)       C store to wp[1]
 217         popl    %edi                    C ebx was never pushed on this path
 218         popl    %ebp
 219         popl    %esi
 220         addl    $VAR_STACK_SPACE,%esp   C release locals; matches the subl at entry
 221         ret
 222
 223 EPILOGUE()