source/libs/gmp/gmp-src/mpn/x86/p6/dive_1.asm

   1 dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C       odd  even  divisor
  35 C P6:  10.0  12.0  cycles/limb
  36
  37
  38 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  39 C                      mp_limb_t divisor);
  40 C
  41 C The odd case is basically the same as mpn_modexact_1_odd, just with an
  42 C extra store, and it runs at the same 10 cycles which is the dependent
  43 C chain.
  44 C
  45 C The shifts for the even case aren't on the dependent chain so in principle
  46 C it could run the same too, but nothing running at 10 has been found.
  47 C Perhaps there are too many uops (an extra 4 over the odd case).
  48
  49 defframe(PARAM_DIVISOR,16)         C divisor argument, later overwritten with the odd part of d
  50 defframe(PARAM_SIZE,   12)         C size argument (limb count)
  51 defframe(PARAM_SRC,     8)         C src argument
  52 defframe(PARAM_DST,     4)         C dst argument, later overwritten with &dst[size]
  53
  54 defframe(SAVE_EBX,     -4)         C slot where the function saves ebx
  55 defframe(SAVE_ESI,     -8)         C slot where the function saves esi
  56 defframe(SAVE_EDI,    -12)         C slot where the function saves edi
  57 defframe(SAVE_EBP,    -16)         C slot where the function saves ebp
  58 defframe(VAR_INVERSE, -20)         C inverse of the odd part of d, read by both loops
  59 deflit(STACK_SPACE, 20)            C bytes subtracted from esp: five 4-byte slots above
  60
  61         TEXT
  62
  63         ALIGN(16)
  64 PROLOGUE(mpn_divexact_1)
  65 deflit(`FRAME',0)
  66         C Setup: split d into a shift count and an odd part, then build the inverse of the odd part
  67         movl    PARAM_DIVISOR, %eax     C eax = d
  68         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
  69
  70         movl    %esi, SAVE_ESI
  71         movl    PARAM_SRC, %esi         C esi = src
  72
  73         movl    %ebx, SAVE_EBX
  74         movl    PARAM_SIZE, %ebx        C ebx = size
  75
  76         bsfl    %eax, %ecx              C trailing twos: ecx = shift count (bsf result is undefined for d == 0)
  77
  78         movl    %ebp, SAVE_EBP
  79
  80         shrl    %cl, %eax               C d without twos
  81
  82         movl    %eax, %edx              C edx = odd d
  83         shrl    %eax                    C d/2 without twos
  84
  85         movl    %edx, PARAM_DIVISOR     C argument slot now holds odd d, used by every mull below
  86         andl    $127, %eax              C 7-bit index into binvert_limb_table
  87
  88 ifdef(`PIC',`
  89         LEA(    binvert_limb_table, %ebp)
  90         movzbl  (%eax,%ebp), %ebp               C inv 8 bits
  91 ',`
  92         movzbl  binvert_limb_table(%eax), %ebp  C inv 8 bits
  93 ')
  94
  95         leal    (%ebp,%ebp), %eax       C 2*inv
  96
  97         imull   %ebp, %ebp              C inv*inv
  98
  99         movl    %edi, SAVE_EDI
 100         movl    PARAM_DST, %edi         C edi = dst
 101
 102         leal    (%esi,%ebx,4), %esi     C src end: esi = &src[size]
 103
 104         imull   PARAM_DIVISOR, %ebp     C inv*inv*d
 105
 106         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d (first Newton step, 8 -> 16 bits)
 107         leal    (%eax,%eax), %ebp       C 2*inv
 108
 109         imull   %eax, %eax              C inv*inv
 110
 111         leal    (%edi,%ebx,4), %edi     C dst end: edi = &dst[size]
 112         negl    %ebx                    C -size, counts up to zero
 113
 114         movl    %edi, PARAM_DST         C argument slot now holds &dst[size], reloaded in the even loop
 115
 116         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 117
 118         subl    %eax, %ebp              C inv = 2*inv - inv*inv*d (second Newton step, 16 -> 32 bits)
 119
 120         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 121         movl    PARAM_DIVISOR, %eax
 122         imull   %ebp, %eax
 123         cmpl    $1, %eax')
 124
 125         movl    %ebp, VAR_INVERSE       C keep the inverse in memory, ebp is reused in the even case
 126         movl    (%esi,%ebx,4), %eax     C src[0]
 127
 128         orl     %ecx, %ecx              C set flags from the shift count
 129         jnz     L(even)                 C nonzero shift: d had trailing zero bits
 130
 131         C ecx initial carry is zero
 132         jmp     L(odd_entry)
 133
 134
 135 C The dependent chain here is
 136 C
 137 C       subl    %edx, %eax         1
 138 C       imull   VAR_INVERSE, %eax  4
 139 C       mull    PARAM_DIVISOR      5
 140 C                                ----
 141 C       total                     10
 142 C
 143 C and this is the measured speed.  No special scheduling is necessary, out
 144 C of order execution hides the load latency.
 145
 146 L(odd_top):
 147         C eax   scratch (src limb), previous quotient limb on entry
 148         C ebx   counter, limbs, negative
 149         C ecx   carry bit
 150         C edx   carry limb, high of last product
 151         C esi   &src[size]
 152         C edi   &dst[size]
 153         C ebp   inverse (not read here, the loop uses VAR_INVERSE)
 154
 155         mull    PARAM_DIVISOR           C edx:eax = previous quotient limb * odd d
 156
 157         movl    (%esi,%ebx,4), %eax     C next src limb
 158         subl    %ecx, %eax              C subtract carry bit
 159
 160         sbbl    %ecx, %ecx              C ecx = -borrow from that subtract
 161         subl    %edx, %eax              C subtract carry limb
 162
 163         sbbl    $0, %ecx                C ecx = -(borrows from both subtracts)
 164
 165 L(odd_entry):
 166         imull   VAR_INVERSE, %eax       C quotient limb = eax * inverse, low 32 bits
 167
 168         movl    %eax, (%edi,%ebx,4)     C store quotient limb
 169         negl    %ecx                    C carry bit for the next limb, non-negative again
 170
 171         incl    %ebx                    C advance counter toward zero
 172         jnz     L(odd_top)
 173
 174         C Odd case done: restore the saved registers and release the stack space
 175         movl    SAVE_ESI, %esi
 176
 177         movl    SAVE_EDI, %edi
 178
 179         movl    SAVE_EBP, %ebp
 180
 181         movl    SAVE_EBX, %ebx
 182         addl    $STACK_SPACE, %esp
 183
 184         ret
 185
 186
 187 L(even):
 188         C eax   src[0]
 189         C ebx   counter, limbs, negative
 190         C ecx   shift
 191         C edx   odd d on entry, cleared below to be the carry limb
 192         C esi   &src[size]
 193         C edi   &dst[size]
 194         C ebp   inverse on entry, cleared below to be the carry bit
 195
 196         xorl    %ebp, %ebp              C initial carry bit
 197         xorl    %edx, %edx              C initial carry limb (for size==1)
 198
 199         incl    %ebx                    C counter is now -(size-1)
 200         jz      L(even_one)             C size==1: only the top limb to do
 201
 202         movl    (%esi,%ebx,4), %edi     C src[1]
 203
 204         shrdl(  %cl, %edi, %eax)        C eax = src[0] >> cl with low bits of src[1] shifted in (assumes the shrdl macro emits a plain shrdl)
 205
 206         jmp     L(even_entry)
 207
 208
 209 L(even_top):
 210         C eax   scratch
 211         C ebx   counter, limbs, negative
 212         C ecx   shift
 213         C edx   scratch
 214         C esi   &src[size]
 215         C edi   &dst[size] and scratch
 216         C ebp   carry bit
 217
 218         movl    (%esi,%ebx,4), %edi     C src limb above the current one, supplies the bits shifted in
 219
 220         mull    PARAM_DIVISOR           C edx:eax = previous quotient limb * odd d, edx = carry limb
 221
 222         movl    -4(%esi,%ebx,4), %eax   C current src limb
 223         shrdl(  %cl, %edi, %eax)        C shift current limb right by cl, filling from edi
 224
 225         subl    %ebp, %eax              C subtract carry bit
 226
 227         sbbl    %ebp, %ebp              C ebp = -borrow from that subtract
 228         subl    %edx, %eax              C subtract carry limb
 229
 230         sbbl    $0, %ebp                C ebp = -(borrows from both subtracts)
 231
 232 L(even_entry):
 233         imull   VAR_INVERSE, %eax       C quotient limb = eax * inverse, low 32 bits
 234
 235         movl    PARAM_DST, %edi         C reload &dst[size], edi was used as scratch
 236         negl    %ebp                    C carry bit for the next limb, non-negative again
 237
 238         movl    %eax, -4(%edi,%ebx,4)   C store quotient limb (counter runs one ahead here)
 239         incl    %ebx
 240         jnz     L(even_top)
 241
 242
 243         C Top limb: there is no higher src limb, so a plain shift is used below
 244         mull    PARAM_DIVISOR           C edx = carry limb from the last quotient limb
 245
 246         movl    -4(%esi), %eax          C src[size-1]
 247
 248 L(even_one):
 249         shrl    %cl, %eax               C shift the top limb, zeros come in at the top
 250         movl    SAVE_ESI, %esi
 251
 252         subl    %ebp, %eax              C subtract carry bit
 253         movl    SAVE_EBP, %ebp
 254
 255         subl    %edx, %eax              C subtract carry limb
 256         movl    SAVE_EBX, %ebx
 257
 258         imull   VAR_INVERSE, %eax       C top quotient limb
 259
 260         movl    %eax, -4(%edi)          C dst[size-1], edi is &dst[size] on both paths into here
 261         movl    SAVE_EDI, %edi
 262         addl    $STACK_SPACE, %esp
 263
 264         ret
 265
 266 EPILOGUE()
 267 ASM_END()