source/libs/gmp/gmp-src/mpn/x86/k7/bdiv_q_1.asm

   1 dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
   2
   3 dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
   4
   5 dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C          cycles/limb
  37 C Athlon:     11.0
  38 C Hammer:      9.0
  39
  40
  41 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  42 C                         mp_limb_t divisor);
  43 C
  44 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
  45 C achieved with no special effort.  The load and shrld latencies are hidden
  46 C by out of order execution.
  47 C
  48 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
  49
  50 defframe(PARAM_SHIFT,  24)   C read by mpn_pi1_bdiv_q_1 only
  51 defframe(PARAM_INVERSE,20)   C read by mpn_pi1_bdiv_q_1 only
  52 defframe(PARAM_DIVISOR,16)   C mpn_bdiv_q_1 overwrites this slot with the odd part of d
  53 defframe(PARAM_SIZE,   12)
  54 defframe(PARAM_SRC,    8)
  55 defframe(PARAM_DST,    4)
  56 C Register save slots and locals, inside the STACK_SPACE bytes each prologue reserves.
  57 defframe(SAVE_EBX,     -4)
  58 defframe(SAVE_ESI,     -8)
  59 defframe(SAVE_EDI,    -12)
  60 defframe(SAVE_EBP,    -16)
  61 defframe(VAR_INVERSE, -20)
  62 defframe(VAR_DST_END, -24)
  63 C Six 4-byte slots, offsets -4 down to -24.
  64 deflit(STACK_SPACE, 24)
  65
  66         TEXT
  67
  68 C mp_limb_t mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  69 C                             mp_limb_t divisor, mp_limb_t inverse, int shift)
  70 C Stores size quotient limbs at dst, low limb first, using the supplied inverse.
  71         ALIGN(16)
  72 PROLOGUE(mpn_pi1_bdiv_q_1)
  73 deflit(`FRAME',0)
  74         C Same frame layout as mpn_bdiv_q_1, which jumps into L(common) below.
  75         subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
  76         movl    PARAM_SHIFT, %ecx       C shift count
  77
  78         movl    %ebp, SAVE_EBP
  79         movl    PARAM_SIZE, %ebp        C size
  80
  81         movl    %esi, SAVE_ESI
  82         movl    PARAM_SRC, %esi         C src
  83
  84         movl    %edi, SAVE_EDI
  85         movl    PARAM_DST, %edi         C dst
  86
  87         movl    %ebx, SAVE_EBX
  88
  89         leal    (%esi,%ebp,4), %esi     C src end
  90         leal    (%edi,%ebp,4), %edi     C dst end
  91         negl    %ebp                    C -size
  92
  93         movl    PARAM_INVERSE, %eax     C inv
  94         C L(common) entry: eax inverse, ecx shift, esi src end, edi dst end, ebp -size.
  95 L(common):
  96         movl    %eax, VAR_INVERSE       C eax is needed for limb data from here on
  97         movl    (%esi,%ebp,4), %eax     C src[0]
  98
  99         incl    %ebp                    C 1-size
 100         jz      L(one)                  C size == 1
 101
 102         movl    (%esi,%ebp,4), %edx     C src[1]
 103
 104         shrdl(  %cl, %edx, %eax)        C low limb of src >> shift
 105
 106         movl    %edi, VAR_DST_END       C the loop borrows edi as scratch
 107         xorl    %ebx, %ebx              C no carry bit yet
 108         jmp     L(entry)
 109
 110         ALIGN(8)
 111 L(top):
 112         C eax   q
 113         C ebx   carry bit, 0 or 1
 114         C ecx   shift
 115         C edx   scratch, carry limb after the mull
 116         C esi   src end
 117         C edi   dst end
 118         C ebp   counter, limbs, negative
 119         C Next q = (shifted src limb - carry bit - carry limb) * inverse.
 120         mull    PARAM_DIVISOR           C carry limb in edx
 121
 122         movl    -4(%esi,%ebp,4), %eax   C src limb for this step
 123         movl    (%esi,%ebp,4), %edi     C limb above it, edi as scratch
 124
 125         shrdl(  %cl, %edi, %eax)        C shifted src limb
 126
 127         subl    %ebx, %eax              C apply carry bit
 128         setc    %bl                     C borrow from that subtract
 129         movl    VAR_DST_END, %edi       C dst end again
 130
 131         subl    %edx, %eax              C apply carry limb
 132         adcl    $0, %ebx                C plus borrow from this subtract
 133
 134 L(entry):
 135         imull   VAR_INVERSE, %eax       C q, low 32 bits of the product
 136
 137         movl    %eax, -4(%edi,%ebp,4)   C store quotient limb
 138         incl    %ebp
 139         jnz     L(top)
 140
 141         C High limb: nothing above it to shift in, and no carry out is recorded.
 142         mull    PARAM_DIVISOR           C carry limb in edx
 143
 144         movl    -4(%esi), %eax          C src high limb
 145         shrl    %cl, %eax
 146         movl    SAVE_ESI, %esi
 147
 148         subl    %ebx, %eax              C apply carry bit
 149         movl    SAVE_EBX, %ebx
 150         movl    SAVE_EBP, %ebp
 151
 152         subl    %edx, %eax              C apply carry limb
 153
 154         imull   VAR_INVERSE, %eax
 155
 156         movl    %eax, -4(%edi)          C dst high limb
 157         movl    SAVE_EDI, %edi
 158         addl    $STACK_SPACE, %esp
 159         C eax still holds the limb just stored; nothing else is set up as a return value.
 160         ret
 161         C size == 1: the only quotient limb is (src[0] >> shift) * inverse.
 162 L(one):
 163         shrl    %cl, %eax
 164         movl    SAVE_ESI, %esi
 165         movl    SAVE_EBX, %ebx
 166
 167         imull   VAR_INVERSE, %eax
 168
 169         movl    SAVE_EBP, %ebp
 170
 171         movl    %eax, -4(%edi)          C dst[0]
 172         movl    SAVE_EDI, %edi
 173         addl    $STACK_SPACE, %esp
 174
 175         ret
 176 EPILOGUE()
 177
 178 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 179 C                         mp_limb_t divisor);
 180 C Strips trailing zero bits from divisor, computes the limb inverse of the odd
 181 C part, then continues at L(common) in mpn_pi1_bdiv_q_1.
 182         ALIGN(16)
 183 PROLOGUE(mpn_bdiv_q_1)
 184 deflit(`FRAME',0)
 185
 186         movl    PARAM_DIVISOR, %eax     C d
 187         subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
 188         movl    $-1, %ecx               C shift count
 189
 190         movl    %ebp, SAVE_EBP
 191         movl    PARAM_SIZE, %ebp        C size
 192
 193         movl    %esi, SAVE_ESI
 194         movl    %edi, SAVE_EDI
 195         C Count trailing zero bits of d in ecx; d == 0 never sets carry, so d must be nonzero.
 196         C If there's usually only one or two trailing zero bits then this
 197         C should be faster than bsfl.
 198 L(strip_twos):
 199         incl    %ecx
 200         shrl    %eax                    C carry = bit shifted out
 201         jnc     L(strip_twos)           C until a 1 bit comes out
 202
 203         movl    %ebx, SAVE_EBX
 204         leal    1(%eax,%eax), %ebx      C d without twos
 205         andl    $127, %eax              C d/2, 7 bits
 206
 207 ifdef(`PIC',`
 208         LEA(    binvert_limb_table, %edx)
 209         movzbl  (%eax,%edx), %eax               C inv 8 bits
 210 ',`
 211         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
 212 ')
 213         C Two rounds of inv = 2*inv - inv*inv*d, interleaved with pointer setup.
 214         leal    (%eax,%eax), %edx       C 2*inv
 215         movl    %ebx, PARAM_DIVISOR     C d without twos, replaces the argument
 216
 217         imull   %eax, %eax              C inv*inv
 218
 219         movl    PARAM_SRC, %esi         C src
 220         movl    PARAM_DST, %edi         C dst
 221
 222         imull   %ebx, %eax              C inv*inv*d
 223
 224         subl    %eax, %edx              C inv = 2*inv - inv*inv*d
 225         leal    (%edx,%edx), %eax       C 2*inv
 226
 227         imull   %edx, %edx              C inv*inv
 228
 229         leal    (%esi,%ebp,4), %esi     C src end
 230         leal    (%edi,%ebp,4), %edi     C dst end
 231         negl    %ebp                    C -size
 232
 233         imull   %ebx, %edx              C inv*inv*d
 234
 235         subl    %edx, %eax              C inv = 2*inv - inv*inv*d
 236
 237         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 238         pushl   %eax    FRAME_pushl()
 239         imull   PARAM_DIVISOR, %eax
 240         cmpl    $1, %eax
 241         popl    %eax    FRAME_popl()')
 242
 243         jmp     L(common)               C eax = inv, ecx = shift
 244 EPILOGUE()
 245 ASM_END()