source/libs/gmp/gmp-src/mpn/x86/pentium/mode1o.asm

   1 dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P5: 23.0 cycles/limb
  35
  36
  37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  38 C                               mp_limb_t divisor);
  39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  40 C                                mp_limb_t divisor, mp_limb_t carry);
  41 C
  42 C There seems no way to pair up the two lone instructions in the main loop.
  43 C
  44 C The special case for size==1 saves about 20 cycles (non-PIC), making it
  45 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
  46 C all sizes.
  47 C
  48 C Alternatives:
  49 C
  50 C Using mmx for the multiplies might be possible, with pmullw and pmulhw
  51 C having just 3 cycle latencies, but carry bit handling would probably be
  52 C complicated.
  53
  54 defframe(PARAM_CARRY,  16)      C carry, read by mpn_modexact_1c_odd only
  55 defframe(PARAM_DIVISOR,12)      C divisor
  56 defframe(PARAM_SIZE,   8)       C size
  57 defframe(PARAM_SRC,    4)       C src
  58
  59 dnl  re-using parameter space; size is already in ebx when this is stored
  60 define(VAR_INVERSE,`PARAM_SIZE')
  61
  62         TEXT
  63
  64         ALIGN(16)
  65 PROLOGUE(mpn_modexact_1c_odd)
  66 deflit(`FRAME',0)
  67 C Entry with a carry-in: load eax=divisor, edx=carry, then share the code below.
  68         movl    PARAM_DIVISOR, %eax     C divisor
  69         movl    PARAM_CARRY, %edx       C carry-in
  70
  71         jmp     L(start_1c)             C label inside mpn_modexact_1_odd
  72
  73 EPILOGUE()
  74
  75         ALIGN(16)
  76 PROLOGUE(mpn_modexact_1_odd)
  77 deflit(`FRAME',0)
  78 C Entry without a carry-in.  mpn_modexact_1c_odd jumps in at L(start_1c).
  79         movl    PARAM_DIVISOR, %eax     C divisor
  80         xorl    %edx, %edx              C carry
  81
  82 L(start_1c):
  83 C Here eax = divisor and edx = carry-in.
  84 ifdef(`PIC',`
  85 ifdef(`DARWIN',`
  86         shrl    %eax                    C d/2
  87         LEA(    binvert_limb_table, %ecx)   C ecx = table address
  88         pushl   %ebx            FRAME_pushl()
  89         movl    PARAM_SIZE, %ebx
  90
  91         andl    $127, %eax              C table index, low 7 bits of d/2
  92         subl    $2, %ebx                C size-2, borrow if size<2
  93
  94         movb    (%eax,%ecx), %cl        C inv 8 bits
  95         jc      L(one_limb)             C borrow from the subl, movb keeps flags
  96 ',`
  97         call    L(here)         FRAME_pushl()
  98 L(here):
  99
 100         shrl    %eax                    C d/2
 101         movl    (%esp), %ecx            C eip
 102
 103         addl    $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx  C GOT address
 104         movl    %ebx, (%esp)            C push ebx, over the eip left by the call
 105
 106         andl    $127, %eax              C table index, low 7 bits of d/2
 107         movl    PARAM_SIZE, %ebx
 108
 109         movl    binvert_limb_table@GOT(%ecx), %ecx     C table address from GOT
 110         subl    $2, %ebx                C size-2, borrow if size<2
 111
 112         movb    (%eax,%ecx), %cl                        C inv 8 bits
 113         jc      L(one_limb)             C borrow from the subl, movb keeps flags
 114 ')
 115 ',`
 116 dnl non-PIC
 117         shrl    %eax                    C d/2
 118         pushl   %ebx            FRAME_pushl()
 119
 120         movl    PARAM_SIZE, %ebx
 121         andl    $127, %eax              C table index, low 7 bits of d/2
 122
 123         subl    $2, %ebx                C size-2, borrow if size<2
 124         jc      L(one_limb)
 125
 126         movb    binvert_limb_table(%eax), %cl           C inv 8 bits
 127 ')
 128 C Only cl was loaded above.  Each step below doubles the correct low bits.
 129         movl    %ecx, %eax
 130         addl    %ecx, %ecx              C 2*inv
 131
 132         imull   %eax, %eax              C inv*inv
 133
 134         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 135
 136         subl    %eax, %ecx              C inv = 2*inv - inv*inv*d
 137
 138         movl    %ecx, %eax
 139         addl    %ecx, %ecx              C 2*inv
 140
 141         imull   %eax, %eax              C inv*inv
 142
 143         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 144
 145         subl    %eax, %ecx              C inv = 2*inv - inv*inv*d
 146         pushl   %esi            FRAME_pushl()
 147
 148         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 149         movl    %ecx, %eax
 150         imull   PARAM_DIVISOR, %eax
 151         cmpl    $1, %eax')
 152
 153         movl    PARAM_SRC, %esi
 154         movl    %ecx, VAR_INVERSE       C kept in the PARAM_SIZE slot
 155
 156         movl    (%esi), %eax            C src[0]
 157         leal    4(%esi,%ebx,4), %esi    C &src[size-1]
 158
 159         xorl    $-1, %ebx               C -(size-1)
 160         ASSERT(nz)
 161         jmp     L(entry)                C first step subtracts the carry-in in edx
 162
 163
 164 C The use of VAR_INVERSE means only a store is needed for that value, rather
 165 C than a push and pop of say %edi.
 166
 167         ALIGN(16)
 168 L(top):
 169         C eax   scratch, low product
 170         C ebx   counter, limbs, negative
 171         C ecx   carry, negated (0 or -1)
 172         C edx   scratch, high product
 173         C esi   &src[size-1]
 174         C edi
 175         C ebp
 176
 177         mull    PARAM_DIVISOR           C h:dummy = q*d
 178
 179         movl    (%esi,%ebx,4), %eax     C src[i]
 180         subl    %ecx, %edx              C h -= -c
 181
 182 L(entry):
 183         subl    %edx, %eax              C s = src[i] - h
 184
 185         sbbl    %ecx, %ecx              C new -c (0 or -1)
 186
 187         imull   VAR_INVERSE, %eax       C q = s*i
 188
 189         incl    %ebx                    C reaches 0 once src[size-2] is done
 190         jnz     L(top)
 191
 192 C High limb src[size-1] is handled here, outside the loop.
 193         mull    PARAM_DIVISOR           C h:dummy = q*d
 194
 195         movl    (%esi), %eax            C src high
 196         subl    %ecx, %edx              C h -= -c
 197
 198         cmpl    PARAM_DIVISOR, %eax
 199
 200         jbe     L(skip_last)            C if src high <= divisor
 201 deflit(FRAME_LAST,FRAME)
 202
 203
 204         subl    %edx, %eax              C s = src[i] - h
 205         popl    %esi            FRAME_popl()
 206
 207         sbbl    %ecx, %ecx              C -c (0 or -1)
 208         popl    %ebx            FRAME_popl()
 209
 210         imull   VAR_INVERSE, %eax       C q = s*i
 211
 212         mull    PARAM_DIVISOR           C h:dummy = q*d
 213
 214         movl    %edx, %eax              C h
 215
 216         subl    %ecx, %eax              C return h + c, ecx being -c
 217
 218         ret
 219
 220
 221 C When high<=divisor can skip last step.
 222
 223 L(skip_last):
 224 deflit(`FRAME',FRAME_LAST)
 225         C eax   src high
 226         C ebx
 227         C ecx
 228         C edx   r
 229         C esi
 230
 231         subl    %eax, %edx      C r-s
 232         popl    %esi            FRAME_popl()
 233
 234         sbbl    %eax, %eax      C -1 if underflow
 235         movl    PARAM_DIVISOR, %ebx  C scratch, ebx is restored just below
 236
 237         andl    %ebx, %eax      C divisor if underflow
 238         popl    %ebx            FRAME_popl()
 239
 240         addl    %edx, %eax      C addback if underflow
 241
 242         ret
 243
 244
 245 C Special case for size==1 using a division for r = c-a mod d.
 246 C Could look for a-c<d and save a division sometimes, but that doesn't seem
 247 C worth bothering about.
 248
 249 L(one_limb):
 250 deflit(`FRAME',4)
 251         C eax
 252         C ebx   size-2 (==-1), caller ebx is the only thing pushed
 253         C ecx
 254         C edx   carry
 255         C esi   (not loaded on any path reaching here)
 256         C edi
 257         C ebp
 258
 259         movl    %edx, %eax              C c
 260         movl    PARAM_SRC, %edx
 261
 262         movl    PARAM_DIVISOR, %ecx
 263         popl    %ebx            FRAME_popl()
 264
 265         subl    (%edx), %eax            C c-a
 266
 267         sbbl    %edx, %edx              C -1 if c<a, else 0
 268         decl    %ecx                    C d-1
 269
 270         andl    %ecx, %edx              C edx:eax = b*d+c-a if c<a, or c-a if c>=a
 271
 272         divl    PARAM_DIVISOR           C remainder of edx:eax by d, into edx
 273
 274         movl    %edx, %eax              C return the remainder
 275
 276         ret
 277
 278 EPILOGUE()
 279 ASM_END()