source/libs/gmp/gmp-src/mpn/x86/k7/mode1o.asm

   1 dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C          cycles/limb
  35 C Athlon:     11.0
  36 C Hammer:      7.0
  37
  38
  39 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  40 C                               mp_limb_t divisor);
  41 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  42 C                                mp_limb_t divisor, mp_limb_t carry);
  43 C
  44 C With the loop running at just 11 cycles it doesn't seem worth bothering to
  45 C check for high<divisor to save one step.
  46 C
  47 C Using a divl for size==1 measures slower than the modexact method, which
  48 C is not too surprising since for the latter it's only about 24 cycles to
  49 C calculate the modular inverse.
  50
  51 defframe(PARAM_CARRY,  16)
  52 defframe(PARAM_DIVISOR,12)
  53 defframe(PARAM_SIZE,   8)
  54 defframe(PARAM_SRC,    4)
  55
  56 defframe(SAVE_EBX,     -4)
  57 defframe(SAVE_ESI,     -8)
  58 defframe(SAVE_EDI,    -12)
  59 defframe(SAVE_EBP,    -16)
  60
  61 deflit(STACK_SPACE, 16)
  62
  63         TEXT
  64
  65         ALIGN(16)
  66 PROLOGUE(mpn_modexact_1c_odd)
  67 deflit(`FRAME',0)
  68
  69         movl    PARAM_CARRY, %ecx
  70         jmp     L(start_1c)
  71
  72 EPILOGUE()
  73
  74
  75         ALIGN(16)
  76 PROLOGUE(mpn_modexact_1_odd)
  77 deflit(`FRAME',0)
  78
  79         xorl    %ecx, %ecx
  80 L(start_1c):
  81         movl    PARAM_DIVISOR, %eax
  82         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
  83
  84         movl    %esi, SAVE_ESI
  85         movl    PARAM_DIVISOR, %esi
  86
  87         movl    %edi, SAVE_EDI
  88
  89         shrl    %eax                    C d/2
  90
  91         andl    $127, %eax
  92
  93 ifdef(`PIC',`
  94         LEA(    binvert_limb_table, %edi)
  95         movzbl  (%eax,%edi), %edi               C inv 8 bits
  96 ',`
  97         movzbl  binvert_limb_table(%eax), %edi  C inv 8 bits
  98 ')
  99
 100         xorl    %edx, %edx              C initial extra carry
 101         leal    (%edi,%edi), %eax       C 2*inv
 102
 103         imull   %edi, %edi              C inv*inv
 104
 105         movl    %ebp, SAVE_EBP
 106         movl    PARAM_SIZE, %ebp
 107
 108         movl    %ebx, SAVE_EBX
 109         movl    PARAM_SRC, %ebx
 110
 111         imull   %esi, %edi              C inv*inv*d
 112
 113         subl    %edi, %eax              C inv = 2*inv - inv*inv*d
 114         leal    (%eax,%eax), %edi       C 2*inv
 115
 116         imull   %eax, %eax              C inv*inv
 117
 118         imull   %esi, %eax              C inv*inv*d
 119
 120         leal    (%ebx,%ebp,4), %ebx     C src end
 121         negl    %ebp                    C -size
 122
 123         subl    %eax, %edi              C inv = 2*inv - inv*inv*d
 124
 125         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 126         movl    %esi, %eax
 127         imull   %edi, %eax
 128         cmpl    $1, %eax')
 129
 130
 131 C The dependent chain here is
 132 C
 133 C                            cycles
 134 C       subl    %edx, %eax      1
 135 C       imull   %edi, %eax      4
 136 C       mull    %esi            6  (high limb)
 137 C                             ----
 138 C       total                  11
 139 C
 140 C Out of order execution hides the load latency for the source data, so no
 141 C special scheduling is required.
 142
 143 L(top):
 144         C eax   src limb
 145         C ebx   src end ptr
 146         C ecx   next carry bit, 0 or 1 (or initial carry param)
 147         C edx   carry limb, high of last product
 148         C esi   divisor
 149         C edi   inverse
 150         C ebp   counter, limbs, negative
 151
 152         movl    (%ebx,%ebp,4), %eax
 153
 154         subl    %ecx, %eax              C apply carry bit
 155         movl    $0, %ecx
 156
 157         setc    %cl                     C new carry bit
 158
 159         subl    %edx, %eax              C apply carry limb
 160         adcl    $0, %ecx
 161
 162         imull   %edi, %eax
 163
 164         mull    %esi
 165
 166         incl    %ebp
 167         jnz     L(top)
 168
 169
 170         movl    SAVE_ESI, %esi
 171         movl    SAVE_EDI, %edi
 172         leal    (%ecx,%edx), %eax
 173
 174         movl    SAVE_EBX, %ebx
 175         movl    SAVE_EBP, %ebp
 176         addl    $STACK_SPACE, %esp
 177
 178         ret
 179
 180 EPILOGUE()
 181 ASM_END()