source/libs/gmp/gmp-src/mpn/x86_64/pentium4/mod_34lsub1.asm

   1 dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1 (congruent value, not fully reduced).
   2
   3 dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
   4 dnl  Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34
  35 C            cycles/limb
  36 C AMD K8,K9      1.0
  37 C AMD K10        1.12
  38 C Intel P4       3.25
  39 C Intel core2    1.5
  40 C Intel corei    1.5
  41 C Intel atom     2.5
  42 C VIA nano       1.75
  43
  44
  45 C INPUT PARAMETERS
     C   ap   pointer to the source limbs, held in %rdi
     C   n    limb count, held in %rsi (the code below refers to %rsi directly
     C        and later reuses it as scratch, so this macro is documentation only)
  46 define(`ap',    %rdi)
  47 define(`n',     %rsi)
  48
  49 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)
  50
  51 C TODO
  52 C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
  53 C    sbb to placate Pentium4.
  54 C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
  55 C    without the dual loop exits.
  56
     C Calling conventions this file is declared to support.
  57 ABI_SUPPORT(DOS64)
  58 ABI_SUPPORT(STD64)
  59
  60 ASM_START()
  61         TEXT
  62         ALIGN(32)
     C mpn_mod_34lsub1 (ap, n): return in %rax a value congruent to the n-limb
     C number at ap modulo 2^48-1.  The value is not fully reduced: it is a plain
     C sum of partial fields and may exceed 2^48-1.  For n < 2 the code simply
     C returns ap[0] unchanged (n = 0 is not treated specially and still loads
     C ap[0]).  NOTE(review): callers presumably guarantee n >= 1 - confirm.
     C
     C Method: with 64-bit limbs, 2^192 = (2^48)^4 is congruent to 1, so limb i
     C has weight 1, 2^16 or 2^32 according to i mod 3.  Limbs are summed into
     C three accumulators by residue class; each sum is then split at the bit
     C where its weight reaches 2^48 and both pieces are added into %rax.
     C
     C Register use in the n > 2 path:
     C   %rax, %rcx, %rdx   sums of the limbs with index 0, 1, 2 mod 3
     C   %r10, %r8,  %r9    counts of carries out of %rax, %rcx, %rdx
     C   %r11               2^48-1, mask for the low 48 bits
     C   %rsi               (limbs not yet summed) - 2; scratch in L(combine)
     C   %rdi               ap; reused as scratch in L(combine)
     C
     C The wind-down code relies on mov, inc, dec and lea leaving the carry flag
     C untouched between the adc/sbb instructions; statement order matters.
     C
     C FUNC_ENTRY, FUNC_EXIT and R32 are defined outside this file.  R32(reg) is
     C assumed to name the 32-bit form of reg, and FUNC_ENTRY/FUNC_EXIT presumably
     C adapt the DOS64 argument registers - confirm against the macro definitions.
  63 PROLOGUE(mpn_mod_34lsub1)
  64         FUNC_ENTRY(2)
  65
  66         mov     $0x0000FFFFFFFFFFFF, %r11       C r11 = 2^48-1
  67
  68         sub     $2, %rsi                C rsi = n-2; flags are tested twice below
  69         ja      L(gt2)                  C n > 2: general summation path
  70
  71         mov     (ap), %rax
  72         nop
  73         jb      L(1)                    C flags still from the sub: n < 2, return ap[0]
  74
     C n == 2: fold ap[0] (weight 1) and ap[1] (weight 2^16) directly.
  75         mov     8(ap), %rsi
  76         mov     %rax, %rdx
  77         shr     $48, %rax               C src[0] high 16 bits (weight 2^48 = 1)
  78
  79         and     %r11, %rdx              C src[0] low 48 bits
  80         add     %rdx, %rax
  81         mov     R32(%rsi), R32(%rdx)    C src[1] low 32 bits
  82
  83         shr     $32, %rsi               C src[1] high
  84         add     %rsi, %rax
  85
  86         shl     $16, %rdx               C src[1] low
  87         add     %rdx, %rax
  88
  89 L(1):   FUNC_EXIT()
  90         ret
  91
  92
  93         ALIGN(16)
     C n > 2: clear the three sums and the three carry counters.
  94 L(gt2): xor     R32(%rax), R32(%rax)
  95         xor     R32(%rcx), R32(%rcx)
  96         xor     R32(%rdx), R32(%rdx)
  97         xor     %r8, %r8
  98         xor     %r9, %r9
  99         xor     %r10, %r10
 100
     C Main loop, six limbs per pass with an exit after each group of three.
     C At L(top) at least three limbs remain (rsi = remaining - 2 > 0).
 101 L(top): add     (ap), %rax
 102         adc     $0, %r10
 103         add     8(ap), %rcx
 104         adc     $0, %r8
 105         add     16(ap), %rdx
 106         adc     $0, %r9
 107
 108         sub     $3, %rsi
 109         jng     L(end)                  C at most 2 limbs left; ap not advanced
 110
 111         add     24(ap), %rax
 112         adc     $0, %r10
 113         add     32(ap), %rcx
 114         adc     $0, %r8
 115         add     40(ap), %rdx
 116         lea     48(ap), ap              C lea leaves CF intact for the adc below
 117         adc     $0, %r9
 118
 119         sub     $3, %rsi
 120         jg      L(top)                  C loop while at least 3 limbs remain
 121
 122
     C Undo half of the pointer advance so that, as at the other loop exit,
     C 24(ap) addresses the first limb not yet summed.  The flags this add sets
     C are overwritten by the add at L(end).
 123         add     $-24, ap
     C Fold each carry counter into the accumulator of the next weight class;
     C carries out of the 2mod3 sum wrap around to weight 1.
 124 L(end): add     %r9, %rax
 125         adc     %r10, %rcx
 126         adc     %r8, %rdx               C CF out of here stays live below
 127
     C Zero, one or two limbs are left.  r10 is set to the weight of whatever
     C carry is pending in CF when L(combine) is reached.
 128         inc     %rsi                    C rsi = leftover - 1; inc keeps CF
 129         mov     $0x1, R32(%r10)         C carry out of 2mod3 has weight 1
 130         js      L(combine)              C no limbs left
 131
 132         mov     $0x10000, R32(%r10)     C carry out of 0mod3 has weight 2^16
 133         adc     24(ap), %rax            C next limb plus pending carry
 134         dec     %rsi                    C rsi = leftover - 2; dec keeps CF
 135         js      L(combine)              C exactly one limb was left
 136
 137         adc     32(ap), %rcx            C last limb plus pending carry
 138         mov     $0x100000000, %r10      C carry out of 1mod3 has weight 2^32
 139
     C Split each sum at the bit where its weight reaches 2^48 and add every
     C piece, plus the pending carry, into rax.
 140 L(combine):
 141         sbb     %rsi, %rsi              C carry: rsi = -1 if CF set, else 0
 142         mov     %rax, %rdi              C 0mod3
 143         shr     $48, %rax               C 0mod3 high
 144
 145         and     %r10, %rsi              C carry masked: its weight, or 0
 146         and     %r11, %rdi              C 0mod3 low
 147         mov     R32(%rcx), R32(%r10)    C 1mod3
 148
 149         add     %rsi, %rax              C apply carry
 150         shr     $32, %rcx               C 1mod3 high
 151
 152         add     %rdi, %rax              C apply 0mod3 low
 153         movzwl  %dx, R32(%rdi)          C 2mod3
 154         shl     $16, %r10               C 1mod3 low
 155
 156         add     %rcx, %rax              C apply 1mod3 high
 157         shr     $16, %rdx               C 2mod3 high
 158
 159         add     %r10, %rax              C apply 1mod3 low
 160         shl     $32, %rdi               C 2mod3 low
 161
 162         add     %rdx, %rax              C apply 2mod3 high
 163         add     %rdi, %rax              C apply 2mod3 low
 164
 165         FUNC_EXIT()
 166         ret
 167 EPILOGUE()