source/libs/gmp/gmp-src/mpn/x86/p6/mode1o.asm

   1 dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P6: 10.0 cycles/limb
  35
  36
  37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  38 C                               mp_limb_t divisor);
  39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  40 C                                mp_limb_t divisor, mp_limb_t carry);
  41 C
  42 C It's not worth skipping a step at the end when high<divisor since the main
  43 C loop is only 10 cycles.
  44
  45 defframe(PARAM_CARRY,  16)      C 4th prototype argument - carry (read only by mpn_modexact_1c_odd)
  46 defframe(PARAM_DIVISOR,12)      C 3rd prototype argument - divisor
  47 defframe(PARAM_SIZE,   8)       C 2nd prototype argument - size in limbs
  48 defframe(PARAM_SRC,    4)       C 1st prototype argument - src pointer
  49 C NOTE(review): offsets are presumably relative to esp at function entry and adjusted by FRAME - defframe is defined outside this file.
  50 dnl  Not enough room under modexact_1 to make these re-use the parameter
  51 dnl  space, unfortunately.
  52 defframe(SAVE_EBX,     -4)      C slot holding the caller ebx until it is restored before ret
  53 defframe(SAVE_ESI,     -8)      C slot holding the caller esi until it is restored before ret
  54 defframe(SAVE_EDI,    -12)      C slot holding the caller edi until it is restored before ret
  55 deflit(STACK_SPACE, 12)         C bytes taken from esp for the three SAVE_ slots
  56
  57         TEXT
  58
  59         ALIGN(16)
  60 PROLOGUE(mpn_modexact_1c_odd)
  61 deflit(`FRAME',0)
  62 C Carry-in entry point: load the carry argument into ecx and join the common code inside mpn_modexact_1_odd.
  63         movl    PARAM_CARRY, %ecx       C ecx = caller supplied carry
  64         jmp     L(start_1c)             C label lives in mpn_modexact_1_odd below
  65 C esp is untouched before the jump so the stack arguments sit at the same esp offsets when the shared code reads them.
  66 EPILOGUE()
  67
  68         ALIGN(16)
  69 PROLOGUE(mpn_modexact_1_odd)
  70 deflit(`FRAME',0)
  71 C No carry-in entry point: clear ecx and fall into the body shared with mpn_modexact_1c_odd.
  72         xorl    %ecx, %ecx              C carry-in = 0
  73 L(start_1c):                            C shared body - ecx holds the initial carry here
  74         movl    PARAM_DIVISOR, %eax     C eax = d
  75 C Reserve the SAVE_ area. NOTE(review): FRAME_subl_esp presumably keeps the PARAM_ offsets valid after esp moves - it is defined outside this file.
  76         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
  77
  78         movl    %esi, SAVE_ESI
  79         movl    PARAM_SRC, %esi         C esi = src
  80
  81         shrl    %eax                    C d/2
  82         movl    %edi, SAVE_EDI
  83
  84         andl    $127, %eax              C table index = (d/2) mod 128
  85 C Seed an 8 bit inverse of d from binvert_limb_table then refine it twice with inv = 2*inv - inv*inv*d to reach a full 32 bit limb inverse.
  86 ifdef(`PIC',`
  87         LEA(    binvert_limb_table, %edi)
  88         movzbl  (%eax,%edi), %edi               C inv 8 bits
  89 ',`
  90         movzbl  binvert_limb_table(%eax), %edi  C inv 8 bits
  91 ')
  92
  93         xorl    %edx, %edx              C initial extra carry
  94         leal    (%edi,%edi), %eax       C 2*inv
  95
  96         imull   %edi, %edi              C inv*inv
  97
  98         movl    %ebx, SAVE_EBX
  99         movl    PARAM_SIZE, %ebx        C ebx = size
 100
 101         imull   PARAM_DIVISOR, %edi     C inv*inv*d
 102
 103         subl    %edi, %eax              C inv = 2*inv - inv*inv*d (first refinement - result in eax)
 104         leal    (%eax,%eax), %edi       C 2*inv
 105
 106         imull   %eax, %eax              C inv*inv
 107
 108         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 109
 110         leal    (%esi,%ebx,4), %esi     C src end
 111         negl    %ebx                    C -size - the loop counts this up to zero
 112
 113         subl    %eax, %edi              C inv = 2*inv - inv*inv*d (second refinement - result in edi)
 114
 115         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 116         movl    PARAM_DIVISOR, %eax
 117         imull   %edi, %eax
 118         cmpl    $1, %eax')
 119 C Each pass forms q = (src limb - carry bit - carry limb) * inverse mod 2^32 and takes the high half of q*d as the next carry limb.
 120
 121 C The dependent chain here is
 122 C
 123 C       subl    %edx, %eax       1
 124 C       imull   %edi, %eax       4
 125 C       mull    PARAM_DIVISOR    5
 126 C                              ----
 127 C       total                   10
 128 C
 129 C and this is the measured speed.  No special scheduling is necessary, out
 130 C of order execution hides the load latency.
 131 C NOTE(review): the loop body runs before the counter is tested so size is presumably required to be at least 1 - confirm against callers.
 132 L(top):
 133         C eax   scratch (src limb)
 134         C ebx   counter, limbs, negative
 135         C ecx   carry bit, 0 or 1 (on the first pass from the 1c entry this is the caller carry limb instead)
 136         C edx   carry limb, high of last product
 137         C esi   &src[size]
 138         C edi   inverse
 139         C ebp
 140
 141         movl    (%esi,%ebx,4), %eax     C next src limb
 142         subl    %ecx, %eax              C subtract the carry bit
 143
 144         sbbl    %ecx, %ecx              C ecx = 0 or -1 according to the borrow
 145         subl    %edx, %eax              C subtract the carry limb
 146
 147         sbbl    $0, %ecx                C fold in the borrow from the second subtract
 148
 149         imull   %edi, %eax              C q = difference * inverse (low limb)
 150
 151         negl    %ecx                    C turn the negated borrow into a non-negative carry
 152
 153         mull    PARAM_DIVISOR           C edx:eax = q*d - edx becomes the next carry limb
 154
 155         incl    %ebx                    C step the negative counter towards zero
 156         jnz     L(top)
 157
 158
 159         movl    SAVE_ESI, %esi
 160         leal    (%ecx,%edx), %eax       C result in eax = carry bit + carry limb
 161
 162         movl    SAVE_EDI, %edi
 163
 164         movl    SAVE_EBX, %ebx
 165         addl    $STACK_SPACE, %esp      C release the SAVE_ area
 166
 167         ret
 168
 169 EPILOGUE()
 170 ASM_END()