source/libs/gmp/gmp-src/mpn/x86/pentium4/sse2/dive_1.asm

   1 dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P4: 19.0 cycles/limb
  35
  36
  37 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  38 C                      mp_limb_t divisor);
  39 C
  40 C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
  41 C being on the dependent chain and there being plenty of cycles available,
  42 C using an unaligned movq on every second iteration measured about 23 c/l.
  43 C
  44 C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
  45 C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
  46 C that might be hidden by out-of-order execution, whereas divl is around 60.
  47 C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
  48 C faster.
  49
  50 defframe(PARAM_DIVISOR,16)      C mp_limb_t divisor
  51 defframe(PARAM_SIZE,   12)      C mp_size_t size
  52 defframe(PARAM_SRC,    8)       C mp_srcptr src
  53 defframe(PARAM_DST,    4)       C mp_ptr dst
  54
  55         TEXT
  56         C Exact division: dst[0..size-1] = src[0..size-1] / divisor.
  57         ALIGN(16)
  58 PROLOGUE(mpn_divexact_1)
  59 deflit(`FRAME',0)
  60         C Fetch the arguments.  size==1 is done with one divl and no MMX.
  61         movl    PARAM_SIZE, %edx        C edx = size
  62
  63         movl    PARAM_SRC, %eax         C eax = src
  64
  65         movl    PARAM_DIVISOR, %ecx     C ecx = divisor
  66         subl    $1, %edx                C edx = size-1
  67         jnz     L(two_or_more)          C size-1 nonzero, use the inverse
  68
  69         movl    (%eax), %eax            C eax = src[0]
  70         xorl    %edx, %edx              C high half of dividend = 0
  71
  72         divl    %ecx                    C eax = src[0] / divisor
  73         movl    PARAM_DST, %ecx         C ecx = dst
  74
  75         movl    %eax, (%ecx)            C dst[0] = quotient
  76         ret
  77
  78         C Two or more limbs: build an inverse of the odd part of divisor.
  79 L(two_or_more):
  80         C eax   src
  81         C ebx
  82         C ecx   divisor
  83         C edx   size-1
  84
  85         movl    %ecx, %eax              C eax = divisor, src reloaded later
  86         bsfl    %ecx, %ecx              C ecx = count of trailing twos
  87
  88         shrl    %cl, %eax               C d = divisor without twos
  89         movd    %eax, %mm6              C mm6 = d
  90         movd    %ecx, %mm7              C shift
  91         C d is odd, so bit 0 is implied; bits 1-7 of d index the table.
  92         shrl    %eax                    C d/2
  93
  94         andl    $127, %eax              C d/2, 7 bits
  95
  96 ifdef(`PIC',`
  97         LEA(    binvert_limb_table, %ecx)
  98         movzbl  (%eax,%ecx), %eax               C inv 8 bits
  99 ',`
 100         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
 101 ')
 102
 103         C Newton step 1: inv = 2*inv - inv*inv*d, from 8 bits to 16.
 104
 105         movd    %eax, %mm5              C inv
 106
 107         movd    %eax, %mm0              C inv
 108
 109         pmuludq %mm5, %mm5              C inv*inv
 110
 111         C
 112
 113         pmuludq %mm6, %mm5              C inv*inv*d
 114         paddd   %mm0, %mm0              C 2*inv
 115
 116         C
 117
 118         psubd   %mm5, %mm0              C inv = 2*inv - inv*inv*d
 119         pxor    %mm5, %mm5              C mm5 = 0
 120         C Newton step 2 on the 16-bit inverse, giving 32 bits in mm5.
 121         paddd   %mm0, %mm5              C mm5 = inv, a copy via add
 122         pmuludq %mm0, %mm0              C inv*inv
 123
 124         pcmpeqd %mm4, %mm4              C mm4 = all ones
 125         psrlq   $32, %mm4               C 0x00000000FFFFFFFF
 126
 127         C
 128
 129         pmuludq %mm6, %mm0              C inv*inv*d
 130         paddd   %mm5, %mm5              C 2*inv
 131
 132         movl    PARAM_SRC, %eax         C eax = src again
 133         movl    PARAM_DST, %ecx         C ecx = dst
 134         pxor    %mm1, %mm1              C initial carry limb
 135
 136         C
 137
 138         psubd   %mm0, %mm5              C inv = 2*inv - inv*inv*d
 139
 140         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 141         pushl   %eax    FRAME_pushl()
 142         movq    %mm6, %mm0
 143         pmuludq %mm5, %mm0
 144         movd    %mm0, %eax
 145         cmpl    $1, %eax
 146         popl    %eax    FRAME_popl()')
 147
 148         pxor    %mm0, %mm0              C initial carry bit
 149
 150
 151 C The dependent chain here is as follows.
 152 C
 153 C                                       latency
 154 C       psubq    s = (src-cbit) - climb    2
 155 C       pmuludq  q = s*inverse             8
 156 C       pmuludq  prod = q*divisor          8
 157 C       psrlq    climb = high(prod)        2
 158 C                                         --
 159 C                                         20
 160 C
 161 C Yet the loop measures 19.0 c/l, so obviously there's something gained
 162 C there over a straight reading of the chip documentation.
 163
 164 L(top):
 165         C eax   src, incrementing
 166         C ebx
 167         C ecx   dst, incrementing
 168         C edx   counter, size-1 iterations
 169         C
 170         C mm0   carry bit
 171         C mm1   carry limb
 172         C mm4   0x00000000FFFFFFFF
 173         C mm5   inverse of mm6
 174         C mm6   divisor without twos
 175         C mm7   shift, count of twos removed
 176
 177         movd    (%eax), %mm2            C mm2 = src[i]
 178         movd    4(%eax), %mm3           C mm3 = src[i+1]
 179         addl    $4, %eax                C advance src
 180         punpckldq %mm3, %mm2            C mm2 = src[i+1]:src[i]
 181
 182         psrlq   %mm7, %mm2              C drop twos, next limb fills the top
 183         pand    %mm4, %mm2              C src
 184         psubq   %mm0, %mm2              C src - cbit
 185
 186         psubq   %mm1, %mm2              C src - cbit - climb
 187         movq    %mm2, %mm0              C keep the 64-bit difference
 188         psrlq   $63, %mm0               C new cbit
 189         C Bit 63 of the difference is the borrow; the low 32 bits feed q.
 190         pmuludq %mm5, %mm2              C s*inverse
 191         movd    %mm2, (%ecx)            C q
 192         addl    $4, %ecx                C advance dst
 193
 194         movq    %mm6, %mm1              C mm1 = d
 195         pmuludq %mm2, %mm1              C q*divisor
 196         psrlq   $32, %mm1               C new climb
 197
 198         subl    $1, %edx                C one iteration fewer to go
 199         jnz     L(top)
 200
 201         C Last limb: nothing above it to shift in, and no carry out is kept.
 202 L(done):
 203         movd    (%eax), %mm2            C mm2 = src[size-1]
 204         psrlq   %mm7, %mm2              C src
 205         psubq   %mm0, %mm2              C src - cbit
 206
 207         psubq   %mm1, %mm2              C src - cbit - climb
 208
 209         pmuludq %mm5, %mm2              C s*inverse
 210         movd    %mm2, (%ecx)            C q
 211
 212         emms                            C empty MMX state before returning
 213         ret
 214
 215 EPILOGUE()
 216 ASM_END()