source/libs/gmp/gmp-src/mpn/x86_64/k8/div_qr_1n_pi1.asm

   1 dnl  x86-64 mpn_div_qr_1n_pi1
   2 dnl  -- Divide an mpn number by a normalized single-limb number,
   3 dnl     using a single-limb inverse.
   4
   5 dnl  Contributed to the GNU project by Niels Möller
   6
   7 dnl  Copyright 2013 Free Software Foundation, Inc.
   8
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  12 dnl  it under the terms of either:
  13 dnl
  14 dnl    * the GNU Lesser General Public License as published by the Free
  15 dnl      Software Foundation; either version 3 of the License, or (at your
  16 dnl      option) any later version.
  17 dnl
  18 dnl  or
  19 dnl
  20 dnl    * the GNU General Public License as published by the Free Software
  21 dnl      Foundation; either version 2 of the License, or (at your option) any
  22 dnl      later version.
  23 dnl
  24 dnl  or both in parallel, as here.
  25 dnl
  26 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  27 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 dnl  for more details.
  30 dnl
  31 dnl  You should have received copies of the GNU General Public License and the
  32 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 dnl  see https://www.gnu.org/licenses/.
  34
  35 include(`../config.m4')
  36
  37
  38 C               c/l
  39 C AMD K8,K9     11
  40 C AMD K10       11
  41 C AMD bull      16
  42 C AMD pile      14.25
  43 C AMD steam      ?
  44 C AMD bobcat    16
  45 C AMD jaguar     ?
  46 C Intel P4      47.5    poor
  47 C Intel core    28.5    very poor
  48 C Intel NHM     29      very poor
  49 C Intel SBR     16      poor
  50 C Intel IBR     13.5
  51 C Intel HWL     12
  52 C Intel BWL      ?
  53 C Intel atom    53      very poor
  54 C VIA nano      19
  55
  56
   57 C INPUT Parameters
      C The six arguments, bound to the registers the code below reads them from.
   58 define(`QP', `%rdi')    C Quotient limbs are stored through this pointer
   59 define(`UP', `%rsi')    C Dividend limbs are loaded through this pointer
   60 define(`UN_INPUT', `%rdx')      C Limb count at UP; decremented on entry
   61 define(`U1', `%rcx')    C Also in %rax
      C U1 enters as the limb above UP[UN_INPUT-1] and is then reused as the
      C running high remainder limb; the loop keeps an extra copy in %rax.
   62 define(`D', `%r8')      C Divisor limb (normalized, per the file header)
   63 define(`DINV', `%r9')   C Single-limb inverse of D (per the file header)
   64
   65 C Invariants
      C Both are computed once on the multi-limb path and live in callee-saved
      C registers, which that path pushes and pops.
   66 define(`B2', `%rbp')    C -(D * DINV) mod 2^64
   67 define(`B2md', `%rbx')  C B2 - D
   68
   69 C Variables
   70 define(`UN', `%r8')     C Overlaps D input
      C While UN is in use as the limb index, D is parked on the stack.
   71 define(`T', `%r10')     C Scratch; also the quotient limb of a 2/1 step
   72 define(`U0', `%r11')    C Low limb of the running remainder
   73 define(`U2', `%r12')    C Carry out of U1, kept as a mask: 0 or -1
   74 define(`Q0', `%r13')    C Low product limb pending for the next quotient limb
   75 define(`Q1', `%r14')    C Quotient limb being accumulated
   76 define(`Q2', `%r15')    C Carry limb added into the quotient limb above
  77
   78 ABI_SUPPORT(STD64)
   79
   80         ASM_START()
   81         TEXT
   82         ALIGN(16)
      C mpn_div_qr_1n_pi1 (QP, UP, UN_INPUT, U1, D, DINV)
      C
      C Divides the number made of U1 (most significant limb) followed by the
      C UN_INPUT limbs at UP by the single limb D, using multiplications by
      C DINV instead of a hardware divide.  UN_INPUT quotient limbs are stored
      C at QP and the remainder is left in %rax.
      C
      C Two paths:
      C   UN_INPUT = 1   one 2/1 division step; writes only %rax, %rdx, %r10,
      C                  %r11 and QP[0], so nothing is pushed.
      C   UN_INPUT > 1   L(first) .. L(done); pushes and restores %r15, %r14,
      C                  %r13, %r12, %rbx and %rbp, and parks D on the stack
      C                  while its register serves as the limb index UN.
      C
      C NOTE(review): nothing here checks the preconditions.  The code looks
      C like it expects U1 < D, D with its top bit set, and
      C DINV = floor((B^2 - 1) / D) - B with B = 2^64 -- confirm with callers.
   83 PROLOGUE(mpn_div_qr_1n_pi1)
   84         FUNC_ENTRY(6)
              C Under IFDOS, D and DINV are fetched from the stack.
              C NOTE(review): presumably FUNC_ENTRY has pushed two registers,
              C which would explain the 56 and 64 byte offsets -- confirm in
              C the macro definitions.
   85 IFDOS(` mov     56(%rsp), %r8   ')
   86 IFDOS(` mov     64(%rsp), %r9   ')
              C UN_INPUT <-- UN_INPUT - 1, the offset of the top limb at UP.
              C Zero means there is a single limb below U1.
   87         dec     UN_INPUT
   88         jnz     L(first)
   89
   90         C Just a single 2/1 division.
   91         C T, U0 are allocated in scratch registers
              C {%rdx, %rax} <-- U1 * DINV + {U1 + 1, UP[0]}.  %rdx is the
              C quotient candidate q, %rax the low limb used in the test below.
   92         lea     1(U1), T
   93         mov     U1, %rax
   94         mul     DINV
   95         mov     (UP), U0
   96         add     U0, %rax
   97         adc     T, %rdx
              C Remainder candidate: U0 <-- UP[0] - q * D (mod B).
   98         mov     %rdx, T
   99         imul    D, %rdx
  100         sub     %rdx, U0
              C If the candidate is above the low limb in %rax, q was one too
              C large: take candidate + D as remainder and decrement q.  lea
              C and cmov do not write flags, so sbb still sees the carry
              C produced by cmp.
  101         cmp     U0, %rax
  102         lea     (U0, D), %rax
  103         cmovnc  U0, %rax
  104         sbb     $0, T
              C Second adjustment, taken when the remainder is still >= D.
  105         cmp     D, %rax
  106         jc      L(single_div_done)
  107         sub     D, %rax
  108         add     $1, T
  109 L(single_div_done):
              C Store the quotient limb; the remainder is returned in %rax.
  110         mov     T, (QP)
  111         FUNC_EXIT
  112         ret
  113 L(first):
  114         C FIXME: Could delay some of these until we enter the loop.
              C Multi-limb path.  Save the callee-saved registers that hold
              C B2, B2md, U2 and Q0-Q2; L(done) pops them in reverse order.
  115         push    %r15
  116         push    %r14
  117         push    %r13
  118         push    %r12
  119         push    %rbx
  120         push    %rbp
  121
              C B2 <-- -(D * DINV) mod B and B2md <-- B2 - D, with B = 2^64.
              C NOTE(review): with DINV = floor((B^2 - 1) / D) - B this B2 is
              C congruent to B^2 mod D, which is how the loop comments use it
              C -- confirm the DINV convention.
  122         mov     D, B2
  123         imul    DINV, B2
  124         neg     B2
  125         mov     B2, B2md
  126         sub     D, B2md
  127
  128         C D not needed until final reduction
  129         push    D
  130         mov     UN_INPUT, UN    C Clobbers D
  131
              C Top quotient limb estimate: T <-- U1 + high limb of DINV * U1.
              C The low product limb stays in Q0 and is added into the next
              C lower quotient limb later.
  132         mov     DINV, %rax
  133         mul     U1
  134         mov     %rax, Q0
  135         add     U1, %rdx
  136         mov     %rdx, T
  137
              C Fold the old U1 into the two top limbs at UP:
              C {U2, U1, U0} <-- {UP[UN], UP[UN-1]} + B2 * U1.
              C sbb turns the carry out into the mask U2 (0 or -1) and leaves
              C the carry flag equal to it.  T is stored as QP[UN].
  138         mov     B2, %rax
  139         mul     U1
  140         mov     -8(UP, UN, 8), U0
  141         mov     (UP, UN, 8), U1
  142         mov     T, (QP, UN, 8)
  143         add     %rax, U0
  144         adc     %rdx, U1
  145         sbb     U2, U2
              C dec does not write the carry flag and mov writes no flags, so
              C jz tests the result of dec while the carry from sbb stays live
              C for the cmovc at the loop entry.
  146         dec     UN
  147         mov     U1, %rax
  148         jz      L(final)
              C Q1 is cleared with mov; an xor would destroy the carry flag.
  149         mov     $0, R32(Q1)
  150
  151         ALIGN(16)
  152
  153         C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
  154         C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
  155         C is zero, and carry holds an extra copy of U2.
              C Each pass stores one quotient limb at QP[UN], may add a carry
              C into QP[UN+1], and consumes one more limb from UP.
  156 L(loop):
  157         C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
  158         C Remains to add in B (U1 + c)
              C cmovc supplies the U2 DINV term: Q1 was zero and becomes DINV
              C when the carry (U2) is set.  neg turns the mask U2 into the
              C value 0 or 1 for Q2.
  159         cmovc   DINV, Q1
  160         mov     U2, Q2
  161         neg     Q2
  162         mul     DINV
  163         add     %rdx, Q1
  164         adc     $0, Q2
  165         add     Q0, Q1
              C The two mov and the lea below write no flags, so the adc
              C after them still collects the carry of add Q0, Q1.
              C T <-- U0 + B2 - D, prepared for the carry case further down.
  166         mov     %rax, Q0
  167         mov     B2, %rax
  168         lea     (B2md, U0), T
  169         adc     $0, Q2
  170
  171         C {U2, U1, U0} <-- (U0 + U2 B2 - c D) B + U1 B2 + u
              C Here c is the carry out of U0 + U2 B2 and u is the next limb
              C loaded from UP.  and turns the mask U2 into 0 or B2.  With no
              C carry T becomes the plain sum; with carry T keeps the value
              C U0 + B2 - D computed by the lea above.
  172         mul     U1
  173         and     B2, U2
  174         add     U2, U0
  175         cmovnc  U0, T
  176
  177         C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
              C cmovnc and the load write no flags: the first adc consumes c,
              C the second adds Q2 plus the carry of the first into QP[UN+1].
              C A carry out of that limb is propagated out of line at L(q_incr).
  178         adc     U1, Q1
  179         mov     -8(UP, UN, 8), U0
  180         adc     Q2, 8(QP, UN, 8)
  181         jc      L(q_incr)
  182 L(q_incr_done):
              C Finish the remainder update: U0 <-- u + low limb of U1 B2,
              C %rax <-- T + high limb of U1 B2 + carry, U2 <-- mask of the
              C carry out.  Q1 is stored as QP[UN] and then cleared with mov
              C so that the carry flag survives; dec leaves the carry alone
              C too, giving the loop entry state described above L(loop).
  183         add     %rax, U0
  184         mov     T, %rax
  185         adc     %rdx, %rax
  186         mov     Q1, (QP, UN, 8)
  187         mov     $0, R32(Q1)
  188         sbb     U2, U2
  189         dec     UN
  190         mov     %rax, U1
  191         jnz     L(loop)
  192
  193 L(final):
              C Here UN is zero, %rax holds U1, U0 the low remainder limb, U2
              C the carry mask and Q0 the pending low quotient limb.
              C Restore the divisor into its register.
  194         pop     D
  195
              C Reduce {U2, U1} to a single limb below D, counting the
              C subtractions in Q1.  If U2 is set, subtract D once and start
              C Q1 at 1, otherwise at 0.
  196         mov     U2, Q1
  197         and     D, U2
  198         sub     U2, %rax
  199         neg     Q1
  200
              C Try one more subtraction of D.  On borrow cmovc restores the
              C old value; sbb $-1 adds 1 - borrow to Q1, so Q1 is bumped only
              C when the subtraction was kept.
  201         mov     %rax, U1
  202         sub     D, %rax
  203         cmovc   U1, %rax
  204         sbb     $-1, Q1
  205
              C Last 2/1 division of {%rax, U0} by D, the same sequence as on
              C the single-limb path: quotient limb in T, remainder in %rax.
  206         lea     1(%rax), T
  207         mul     DINV
  208         add     U0, %rax
  209         adc     T, %rdx
  210         mov     %rdx, T
  211         imul    D, %rdx
  212         sub     %rdx, U0
  213         cmp     U0, %rax
  214         lea     (U0, D), %rax
  215         cmovnc  U0, %rax
  216         sbb     $0, T
  217         cmp     D, %rax
  218         jc      L(div_done)
  219         sub     D, %rax
  220         add     $1, T
  221 L(div_done):
              C QP[0] <-- Q0 + T, then QP[1] += Q1 + carry.  The store between
              C the add and the adc writes no flags.
  222         add     T, Q0
  223         mov     Q0, (QP)
  224         adc     Q1, 8(QP)
  225         jnc     L(done)
  226 L(final_q_incr):
              C Propagate the carry upwards from QP[2].  lea advances QP
              C without touching the carry produced by addq.
              C NOTE(review): there is no explicit upper bound; presumably
              C the carry always stops inside the quotient -- confirm.
  227         addq    $1, 16(QP)
  228         lea     8(QP), QP
  229         jc      L(final_q_incr)
  230
  231 L(done):
              C Restore the callee-saved registers in reverse push order; the
              C remainder stays in %rax.
  232         pop     %rbp
  233         pop     %rbx
  234         pop     %r12
  235         pop     %r13
  236         pop     %r14
  237         pop     %r15
  238         FUNC_EXIT
  239         ret
  240
  241 L(q_incr):
              C Out of line carry propagation for the loop: the adc into
              C QP[UN+1] carried out, so add 1 to QP[UN+2] and upwards until
              C the carry stops, then resume at L(q_incr_done).
              C NOTE(review): no explicit upper bound here either -- confirm
              C that the carry cannot run past the top quotient limb.
  242         C U1 is not live, so use it for indexing
  243         lea     16(QP, UN, 8), U1
  244 L(q_incr_loop):
  245         addq    $1, (U1)
  246         jnc     L(q_incr_done)
  247         lea     8(U1), U1
  248         jmp     L(q_incr_loop)
  249 EPILOGUE()