source/libs/gmp/gmp-src/mpn/x86/k7/dive_1.asm

   1 dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C          cycles/limb
  35 C Athlon:     11.0
  36 C Hammer:      9.0
  37
  38
  39 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                      mp_limb_t divisor);
  41 C
  42 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
  43 C achieved with no special effort.  The load and shrdl latencies are hidden
  44 C by out of order execution.
  45 C
  46 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
  47
  48 defframe(PARAM_DIVISOR,16)      C divisor; slot is later overwritten with d without twos
  49 defframe(PARAM_SIZE,   12)      C number of limbs
  50 defframe(PARAM_SRC,    8)       C source limb pointer
  51 defframe(PARAM_DST,    4)       C destination limb pointer
  52 C Slots the function writes itself: four saved registers, then two variables.
  53 defframe(SAVE_EBX,     -4)
  54 defframe(SAVE_ESI,     -8)
  55 defframe(SAVE_EDI,    -12)
  56 defframe(SAVE_EBP,    -16)
  57 defframe(VAR_INVERSE, -20)      C inverse of the odd part of d, used by every imull
  58 defframe(VAR_DST_END, -24)      C dst end pointer, reloaded inside the loop
  59 C Bytes taken off esp by the subl at function entry, covering offsets -4 to -24.
  60 deflit(STACK_SPACE, 24)
  61
  62         TEXT
  63         C Exact division of the size-limb number at src by divisor, quotient to dst.
  64         ALIGN(16)
  65 PROLOGUE(mpn_divexact_1)
  66 deflit(`FRAME',0)
  67         C Requires divisor != 0 and size >= 1; neither is checked here.
  68         movl    PARAM_DIVISOR, %eax     C d, the divisor limb
  69         subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
  70         movl    $-1, %ecx               C shift count, -1 so the first incl makes it 0
  71         C ebp, esi, edi and ebx are saved in the frame and restored before each ret.
  72         movl    %ebp, SAVE_EBP
  73         movl    PARAM_SIZE, %ebp        C size in limbs
  74
  75         movl    %esi, SAVE_ESI
  76         movl    %edi, SAVE_EDI
  77
  78         C If there's usually only one or two trailing zero bits then this
  79         C should be faster than bsfl.
  80 L(strip_twos):
  81         incl    %ecx                    C zero bits stripped so far
  82         shrl    %eax                    C low bit of d goes to the carry flag
  83         jnc     L(strip_twos)           C repeat until a 1 bit is shifted out (never if d == 0)
  84         C Now ecx = trailing zero count of d, eax = (odd part of d) >> 1.
  85         movl    %ebx, SAVE_EBX
  86         leal    1(%eax,%eax), %ebx      C d without twos
  87         andl    $127, %eax              C d/2, 7 bits, offset into the table
  88         C Start from an 8-bit inverse of the odd d, read from binvert_limb_table.
  89 ifdef(`PIC',`
  90         LEA(    binvert_limb_table, %edx)
  91         movzbl  (%eax,%edx), %eax               C inv 8 bits
  92 ',`
  93         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
  94 ')
  95         C Two steps of inv = 2*inv - inv*inv*d extend the inverse: 8 -> 16 -> 32 bits.
  96         leal    (%eax,%eax), %edx       C 2*inv
  97         movl    %ebx, PARAM_DIVISOR     C d without twos, kept in the arg slot
  98
  99         imull   %eax, %eax              C inv*inv
 100
 101         movl    PARAM_SRC, %esi
 102         movl    PARAM_DST, %edi
 103
 104         imull   %ebx, %eax              C inv*inv*d
 105
 106         subl    %eax, %edx              C inv = 2*inv - inv*inv*d, 16 bits
 107         leal    (%edx,%edx), %eax       C 2*inv
 108
 109         imull   %edx, %edx              C inv*inv
 110
 111         leal    (%esi,%ebp,4), %esi     C src end
 112         leal    (%edi,%ebp,4), %edi     C dst end
 113         negl    %ebp                    C -size
 114
 115         imull   %ebx, %edx              C inv*inv*d
 116
 117         subl    %edx, %eax              C inv = 2*inv - inv*inv*d, 32 bits
 118
 119         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 120         pushl   %eax    FRAME_pushl()
 121         imull   PARAM_DIVISOR, %eax
 122         cmpl    $1, %eax
 123         popl    %eax    FRAME_popl()')
 124
 125         movl    %eax, VAR_INVERSE       C inverse of the odd d, used by every imull below
 126         movl    (%esi,%ebp,4), %eax     C src[0]
 127
 128         incl    %ebp                    C ebp = 1-size
 129         jz      L(one)                  C size == 1
 130
 131         movl    (%esi,%ebp,4), %edx     C src[1]
 132
 133         shrdl(  %cl, %edx, %eax)        C src[0] >> shift, top bits from src[1]
 134
 135         movl    %edi, VAR_DST_END       C edi doubles as scratch in the loop
 136         xorl    %ebx, %ebx              C carry bit = 0
 137         jmp     L(entry)
 138
 139         ALIGN(8)
 140 L(top):
 141         C eax   q, the quotient limb just stored
 142         C ebx   carry bit, 0 or 1
 143         C ecx   shift
 144         C edx   scratch, carry limb after the mull
 145         C esi   src end
 146         C edi   dst end, also scratch and reloaded from VAR_DST_END
 147         C ebp   counter, limbs, negative
 148
 149         mull    PARAM_DIVISOR           C carry limb in edx (high half of q*d)
 150
 151         movl    -4(%esi,%ebp,4), %eax   C next src limb
 152         movl    (%esi,%ebp,4), %edi     C limb above it, edi as scratch
 153
 154         shrdl(  %cl, %edi, %eax)        C next limb of src >> shift
 155
 156         subl    %ebx, %eax              C apply carry bit
 157         setc    %bl                     C ebx = borrow from the carry bit
 158         movl    VAR_DST_END, %edi       C edi = dst end again
 159
 160         subl    %edx, %eax              C apply carry limb
 161         adcl    $0, %ebx                C plus borrow from the carry limb
 162
 163 L(entry):
 164         imull   VAR_INVERSE, %eax       C q = eax * inverse, low 32 bits
 165
 166         movl    %eax, -4(%edi,%ebp,4)   C store q
 167         incl    %ebp
 168         jnz     L(top)                  C loop until only the high limb is left
 169
 170         C High limb: no limb above to shift in, and no carry out is kept.
 171         mull    PARAM_DIVISOR           C carry limb in edx
 172
 173         movl    -4(%esi), %eax          C src high limb
 174         shrl    %cl, %eax
 175         movl    SAVE_ESI, %esi
 176
 177         subl    %ebx, %eax              C apply carry bit
 178         movl    SAVE_EBX, %ebx
 179         movl    SAVE_EBP, %ebp
 180
 181         subl    %edx, %eax              C apply carry limb
 182
 183         imull   VAR_INVERSE, %eax
 184
 185         movl    %eax, -4(%edi)          C dst[size-1]
 186         movl    SAVE_EDI, %edi
 187         addl    $STACK_SPACE, %esp      C release the frame
 188
 189         ret
 190
 191         C size == 1: quotient is (src[0] >> shift) * inverse, low 32 bits.
 192 L(one):
 193         shrl    %cl, %eax
 194         movl    SAVE_ESI, %esi
 195         movl    SAVE_EBX, %ebx
 196
 197         imull   VAR_INVERSE, %eax
 198
 199         movl    SAVE_EBP, %ebp
 200         movl    %eax, -4(%edi)          C dst[0]
 201
 202         movl    SAVE_EDI, %edi
 203         addl    $STACK_SPACE, %esp      C release the frame
 204
 205         ret
 206
 207 EPILOGUE()
 208 ASM_END()