source/libs/gmp/gmp-src/mpn/x86/pentium/dive_1.asm

   1 dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C         divisor
  35 C       odd   even
  36 C P54:  24.5  30.5   cycles/limb
  37 C P55:  23.0  28.0
  38
  39
  40 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  41 C                      mp_limb_t divisor);
  42 C
  43 C Plain divl is used for small sizes, since the inverse takes a while to
  44 C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
  45 C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
  46 C size==3 for even are about the same speed with either divl or mul, but
  47 C divl is used since it will use up less code cache.
  48 C
  49 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
  50 C expected.  On P54 in the even case the shrdl pairing nonsense (see
  51 C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
  52 C further 1.5 slowdown for both odd and even.
  53
  54 defframe(PARAM_DIVISOR,16)
  55 defframe(PARAM_SIZE,   12)
  56 defframe(PARAM_SRC,    8)
  57 defframe(PARAM_DST,    4)
     dnl  The four names above match the arguments of mpn_divexact_1, listed
     dnl  here in reverse prototype order (divisor, size, src, dst).  defframe
     dnl  is defined outside this file; the numbers are presumably byte offsets
     dnl  from the stack pointer at function entry -- TODO confirm.
  58
  59 dnl  re-use parameter space
     dnl  VAR_INVERSE names the same slot as PARAM_DST.  The function copies
     dnl  PARAM_DST into edi before any store to VAR_INVERSE is made.
  60 define(VAR_INVERSE,`PARAM_DST')
  61
  62         TEXT
  63
  64         ALIGN(32)
     C mpn_divexact_1 (dst, src, size, divisor)
     C
     C Stores size quotient limbs of src divided by divisor at dst.  Two code
     C paths follow.  The divl loop walks from the high limb down and is
     C entered with ecx = size, testing for zero only after the decrement, so
     C size is assumed to be nonzero.  The mul_by_inverse path strips the low
     C zero bits from the divisor, builds an inverse of the remaining odd
     C value, and produces quotient limbs from the low limb up.
     C
     C esi and edi are saved on entry, ebx and ebp only on the mul_by_inverse
     C path; each exit pops what its own path pushed.
  65 PROLOGUE(mpn_divexact_1)
  66 deflit(`FRAME',0)
  67
  68         movl    PARAM_DIVISOR, %eax
  69         movl    PARAM_SIZE, %ecx
  70
  71         pushl   %esi            FRAME_pushl()
  72         push    %edi            FRAME_pushl()
  73
  74         movl    PARAM_SRC, %esi
  75         andl    $1, %eax        C 1 if divisor is odd, else 0
  76
  77         movl    PARAM_DST, %edi
  78         addl    %ecx, %eax      C size if even, size+1 if odd
  79
  80         cmpl    $4, %eax        C size>=3 odd or size>=4 even go to the inverse
  81         jae     L(mul_by_inverse)
  82
  83
             C Plain division, high limb first.  divl leaves the remainder in
             C edx, which is then the high half of the next dividend, so edx
             C is cleared only once before the loop.
  84         xorl    %edx, %edx      C nothing above the high limb
  85 L(div_top):
  86         movl    -4(%esi,%ecx,4), %eax   C src limb, index ecx-1
  87
  88         divl    PARAM_DIVISOR   C eax = quotient limb, edx = remainder
  89
  90         movl    %eax, -4(%edi,%ecx,4)   C dst limb, same index
  91         decl    %ecx
  92
  93         jnz     L(div_top)      C flags from decl
  94
  95         popl    %edi
  96         popl    %esi
  97
  98         ret
  99
 100
 101
 102 L(mul_by_inverse):
 103         movl    PARAM_DIVISOR, %eax     C reload, eax was reused above
 104         movl    $-1, %ecx       C becomes 0 on the first incl
 105
             C Shift the divisor right until a 1 bit falls out.  ecx ends up
             C as the number of low zero bits, and eax as the odd part of the
             C divisor shifted right by one.
 106 L(strip_twos):
 107         ASSERT(nz, `orl %eax, %eax')
 108         shrl    %eax            C low bit goes to the carry flag
 109         incl    %ecx                    C shift count
 110
 111         jnc     L(strip_twos)   C incl leaves the carry from shrl intact
 112
 113         leal    1(%eax,%eax), %edx      C d
 114         andl    $127, %eax              C d/2, 7 bits
 115
 116         pushl   %ebx            FRAME_pushl()
 117         pushl   %ebp            FRAME_pushl()
 118
 119 ifdef(`PIC',`dnl
 120         LEA(    binvert_limb_table, %ebp)
 121         movzbl  (%eax,%ebp), %eax               C inv 8 bits
 122 ',`
 123         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
 124 ')
 125
             C Two rounds of inv = 2*inv - inv*inv*d follow.  The ASSERT
             C further down expects the result to satisfy d*inv == 1 over
             C the whole limb.  The load of size into ebx is interleaved.
 126         movl    %eax, %ebp              C inv
 127         addl    %eax, %eax              C 2*inv
 128
 129         imull   %ebp, %ebp              C inv*inv
 130
 131         imull   %edx, %ebp              C inv*inv*d
 132
 133         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
 134         movl    PARAM_SIZE, %ebx        C ebx = size
 135
 136         movl    %eax, %ebp              C second round, same steps
 137         addl    %eax, %eax              C 2*inv
 138
 139         imull   %ebp, %ebp              C inv*inv
 140
 141         imull   %edx, %ebp              C inv*inv*d
 142
 143         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
 144         movl    %edx, PARAM_DIVISOR     C d without twos
 145
 146         leal    (%esi,%ebx,4), %esi     C src end
 147         leal    (%edi,%ebx,4), %edi     C dst end
 148
 149         negl    %ebx                    C -size
 150
 151         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 152         pushl   %eax    FRAME_pushl()
 153         imull   PARAM_DIVISOR, %eax
 154         cmpl    $1, %eax
 155         popl    %eax    FRAME_popl()')
 156
 157         movl    %eax, VAR_INVERSE       C into the dst slot, dst is in edi
 158         xorl    %ebp, %ebp              C initial carry bit
 159
             C An odd divisor (shift 0) enters the odd loop with the low src
             C limb in eax.  Otherwise the low limb is shifted first and the
             C even loop is entered.
 160         movl    (%esi,%ebx,4), %eax     C src low limb
 161         orl     %ecx, %ecx              C shift
 162
 163         movl    4(%esi,%ebx,4), %edx    C src second limb (for even)
 164         jz      L(odd_entry)            C flags from orl, movl keeps them
 165
             C shrdl() is a macro from outside this file, presumably emitting
             C a shrdl by cl: eax shifted right, filled from edx at the top.
 166         shrdl(  %cl, %edx, %eax)
 167
 168         incl    %ebx                    C even loop indexes with a -4 offset
 169         jmp     L(even_entry)
 170
 171
 172         ALIGN(8)
 173 L(odd_top):
 174         C eax   scratch
 175         C ebx   counter, limbs, negative
 176         C ecx
 177         C edx
 178         C esi   src end
 179         C edi   dst end
 180         C ebp   carry bit, 0 or -1
 181
             C On arrival eax holds the quotient limb just stored.
 182         mull    PARAM_DIVISOR           C edx = high half of eax * d
 183
 184         movl    (%esi,%ebx,4), %eax     C next src limb
 185         subl    %ebp, %edx              C ebp is 0 or -1, so adds the borrow
 186
 187         subl    %edx, %eax              C src limb minus carry limb
 188
 189         sbbl    %ebp, %ebp              C ebp = -1 on borrow, else 0
 190
 191 L(odd_entry):
 192         imull   VAR_INVERSE, %eax       C quotient limb
 193
 194         movl    %eax, (%edi,%ebx,4)
 195
 196         incl    %ebx
 197         jnz     L(odd_top)              C until the negative counter hits 0
 198
 199
 200         popl    %ebp
 201         popl    %ebx
 202
 203         popl    %edi
 204         popl    %esi
 205
 206         ret
 207
 208
 209 L(even_top):
 210         C eax   scratch
 211         C ebx   counter, limbs, negative
 212         C ecx   twos
 213         C edx
 214         C esi   src end
 215         C edi   dst end
 216         C ebp   carry bit, 0 or -1
 217
             C On arrival eax holds the quotient limb just stored.
 218         mull    PARAM_DIVISOR           C edx = high half of eax * d
 219
 220         subl    %ebp, %edx              C carry bit
 221         movl    -4(%esi,%ebx,4), %eax   C src limb
 222
             C The carry in ebp was consumed by the subl above, so ebp can
             C serve as a temporary until sbbl sets it again.
 223         movl    (%esi,%ebx,4), %ebp     C and one above it
 224
 225         shrdl(  %cl, %ebp, %eax)
 226
 227         subl    %edx, %eax              C carry limb
 228
 229         sbbl    %ebp, %ebp              C ebp = -1 on borrow, else 0
 230
 231 L(even_entry):
 232         imull   VAR_INVERSE, %eax       C quotient limb
 233
 234         movl    %eax, -4(%edi,%ebx,4)
 235         incl    %ebx
 236
 237         jnz     L(even_top)             C exits with the high limb still to do
 238
 239
 240
             C Last limb of the even case.  There is no limb above the high
             C limb, so a plain shrl takes the place of shrdl.
 241         mull    PARAM_DIVISOR           C edx = high half of eax * d
 242
 243         movl    -4(%esi), %eax          C src high limb
 244         subl    %ebp, %edx              C ebp is 0 or -1, so adds the borrow
 245
 246         shrl    %cl, %eax
 247
 248         subl    %edx, %eax              C no carry if division is exact
 249
 250         imull   VAR_INVERSE, %eax       C quotient limb
 251
 252         movl    %eax, -4(%edi)          C dst high limb
 253         nop                             C protect against cache bank clash
 254
 255         popl    %ebp
 256         popl    %ebx
 257
 258         popl    %edi
 259         popl    %esi
 260
 261         ret
 262
 263 EPILOGUE()
 264 ASM_END()