source/libs/gmp/gmp-src/mpn/x86_64/div_qr_1n_pi1.asm

   1 dnl  x86-64 mpn_div_qr_1n_pi1
   2 dnl  -- Divide an mpn number by a normalized single-limb number,
   3 dnl     using a single-limb inverse.
   4
   5 dnl  Contributed to the GNU project by Niels Möller
   6
   7 dnl  Copyright 2013 Free Software Foundation, Inc.
   8
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  12 dnl  it under the terms of either:
  13 dnl
  14 dnl    * the GNU Lesser General Public License as published by the Free
  15 dnl      Software Foundation; either version 3 of the License, or (at your
  16 dnl      option) any later version.
  17 dnl
  18 dnl  or
  19 dnl
  20 dnl    * the GNU General Public License as published by the Free Software
  21 dnl      Foundation; either version 2 of the License, or (at your option) any
  22 dnl      later version.
  23 dnl
  24 dnl  or both in parallel, as here.
  25 dnl
  26 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  27 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 dnl  for more details.
  30 dnl
  31 dnl  You should have received copies of the GNU General Public License and the
  32 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 dnl  see https://www.gnu.org/licenses/.
  34
  35 include(`../config.m4')
  36
  37
  38 C               c/l
  39 C AMD K8,K9     13
  40 C AMD K10       13
  41 C AMD bull      16.5
  42 C AMD pile      15
  43 C AMD steam      ?
  44 C AMD bobcat    16
  45 C AMD jaguar     ?
  46 C Intel P4      47      poor
  47 C Intel core    19.25
  48 C Intel NHM     18
  49 C Intel SBR     15      poor
  50 C Intel IBR     13
  51 C Intel HWL     11.7
  52 C Intel BWL      ?
  53 C Intel atom    52      very poor
  54 C VIA nano      19
  55
  56
   57 C INPUT Parameters
              C Symbolic names for the six argument registers as the code below
              C uses them.
   58 define(`QP', `%rdi')    C Quotient limbs are stored and carry-adjusted through this pointer
   59 define(`UP', `%rsi')    C Dividend limbs are loaded through this pointer
   60 define(`UN_INPUT', `%rdx')      C Limb count; decremented once on entry, then copied to UN
   61 define(`U1', `%rcx')    C Also in %rax
   62 define(`D', `%r8')      C Divisor limb; spilled to the stack while %r8 serves as UN
   63 define(`DINV', `%r9')   C Single-limb inverse of D
   64
   65 C Invariants
              C Both are computed once at L(first) and only read afterwards.
   66 define(`B2', `%rbp')    C -(D * DINV) mod 2^64
   67 define(`B2md', `%rbx')  C B2 - D
   68
   69 C Variables
   70 define(`UN', `%r8')     C Overlaps D input
   71 define(`T', `%r10')     C Scratch; also carries the quotient word of each 2/1 division
   72 define(`U0', `%r11')    C Low limb of the running partial remainder
   73 define(`U2', `%r12')    C Carry out of the remainder update, kept as 0 or all ones
   74 define(`Q0', `%r13')    C Low quotient word carried over to the next step
   75 define(`Q1', `%r14')    C Middle word of the quotient accumulator
   76 define(`Q2', `%r15')    C Top word of the quotient accumulator
  77
   78 ABI_SUPPORT(STD64)
   79
   80         ASM_START()
   81         TEXT
   82         ALIGN(16)
              C mpn_div_qr_1n_pi1
              C
              C Divides the limbs at UP, with U1 as an extra incoming high limb,
              C by the single limb D, using the single-limb inverse DINV.
              C Quotient limbs are written to QP.
              C
              C Register roles on entry, as named by the defines at the top of
              C the file:
              C   QP  UP  UN_INPUT  U1  D  DINV
              C
              C With UN_INPUT = n the code reads UP[0] .. UP[n-1] and writes
              C QP[0] .. QP[n-1].  For n = 1 a single 2/1 division is done and no
              C callee-saved register is touched.
              C
              C At both ret instructions %rax holds the partial remainder after
              C the last correction step.
              C NOTE(review): presumably this is the function return value and
              C lies below D; confirm against the C prototype.
   83 PROLOGUE(mpn_div_qr_1n_pi1)
   84         FUNC_ENTRY(6)
              C Under IFDOS the last two arguments, D and DINV, are fetched from
              C the stack.
   85 IFDOS(` mov     56(%rsp), %r8   ')
   86 IFDOS(` mov     64(%rsp), %r9   ')
   87         dec     UN_INPUT
   88         jnz     L(first)        C count - 1 is nonzero: take the multi-limb path
   89
   90         C Just a single 2/1 division.
   91         C T, U0 are allocated in scratch registers
   92         lea     1(U1), T        C T = U1 + 1
   93         mov     U1, %rax
   94         mul     DINV            C {rdx, rax} = U1 * DINV
   95         mov     (UP), U0
   96         add     U0, %rax
   97         adc     T, %rdx         C {rdx, rax} += {U1 + 1, U0}; rdx is the candidate quotient
   98         mov     %rdx, T
   99         imul    D, %rdx
  100         sub     %rdx, U0        C U0 = candidate remainder, U0 - T * D mod 2^64
  101         cmp     U0, %rax        C carry set iff rax is below the candidate remainder
  102         lea     (U0, D), %rax   C rax = U0 + D; lea leaves the flags alone
  103         cmovnc  U0, %rax        C no carry: keep U0 itself
  104         sbb     $0, T           C carry: quotient is one less
  105         cmp     D, %rax
  106         jc      L(single_div_done)      C remainder already below D
  107         sub     D, %rax         C one more correction step
  108         add     $1, T
  109 L(single_div_done):
  110         mov     T, (QP)
  111         FUNC_EXIT
  112         ret
  113 L(first):
  114         C FIXME: Could delay some of these until we enter the loop.
              C Save the callee-saved registers used as B2, B2md, U2, Q0, Q1, Q2.
  115         push    %r15
  116         push    %r14
  117         push    %r13
  118         push    %r12
  119         push    %rbx
  120         push    %rbp
  121
              C B2 = -(D * DINV) mod 2^64, B2md = B2 - D
  122         mov     D, B2
  123         imul    DINV, B2
  124         neg     B2
  125         mov     B2, B2md
  126         sub     D, B2md
  127
  128         C D not needed until final reduction
  129         push    D
  130         mov     UN_INPUT, UN    C Clobbers D
  131
              C First quotient word: Q0 = low (DINV * U1), T = high (DINV * U1) + U1
  132         mov     DINV, %rax
  133         mul     U1
  134         mov     %rax, Q0
  135         add     U1, %rdx
  136         mov     %rdx, T
  137
              C Fold U1 * B2 into the two top limbs at UP; the carry out is kept
              C in U2 as 0 or all ones.
  138         mov     B2, %rax
  139         mul     U1
  140         mov     -8(UP, UN, 8), U0
  141         mov     (UP, UN, 8), U1
  142         mov     T, (QP, UN, 8)
  143         add     %rax, U0
  144         adc     %rdx, U1
  145         sbb     U2, U2
  146         dec     UN              C dec leaves the carry flag alone and sets ZF for the jz below
  147         mov     U1, %rax        C mov does not touch the flags
  148         jz      L(final)
  149
  150         ALIGN(16)
  151
  152         C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
  153         C At entry, %rax holds an extra copy of U1
  154 L(loop):
  155         C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
  156         C Remains to add in B (U1 + c)
  157         mov     DINV, Q1
  158         mov     U2, Q2
  159         and     U2, Q1          C Q1 = DINV if U2 is all ones, else 0
  160         neg     Q2              C Q2 = 1 if U2 is all ones, else 0
  161         mul     DINV            C {rdx, rax} = U1 * DINV, via the copy of U1 in rax
  162         add     %rdx, Q1
  163         adc     $0, Q2
  164         add     Q0, Q1          C carry from this add is consumed by the adc below
  165         mov     %rax, Q0
  166         mov     B2, %rax
  167         lea     (B2md, U0), T   C T = U0 + B2 - D, computed without touching the flags
  168         adc     $0, Q2
  169
  170         C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
  171         mul     U1              C {rdx, rax} = B2 * U1
  172         and     B2, U2          C U2 = B2 if the mask was set, else 0
  173         add     U2, U0
  174         cmovnc  U0, T           C no carry: T = U0; carry: T stays old U0 + B2 - D
  175
  176         C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
  177         adc     U1, Q1          C c is the carry from the add into U0; cmov kept it
  178         mov     -8(UP, UN, 8), U0       C next lower limb from UP
  179         adc     Q2, 8(QP, UN, 8)
  180         jc      L(q_incr)       C carry out: propagate into higher quotient limbs
  181 L(q_incr_done):
  182         add     %rax, U0        C U0 += low (B2 * U1)
  183         mov     T, %rax
  184         adc     %rdx, %rax      C rax = T + high (B2 * U1) + carry
  185         mov     Q1, (QP, UN, 8)
  186         sbb     U2, U2          C U2 = all ones if the adc carried, else 0
  187         dec     UN
  188         mov     %rax, U1
  189         jnz     L(loop)         C ZF comes from the dec; the mov does not change it
  190
  191 L(final):
  192         pop     D               C restore the divisor pushed at L(first)
  193
              C Reduce the high limb, held in rax, using the pending carry mask
              C in U2; Q1 collects the matching quotient adjustment.
  194         mov     U2, Q1
  195         and     D, U2           C U2 = D if the mask was set, else 0
  196         sub     U2, %rax
  197         neg     Q1              C Q1 = 1 if the mask was set, else 0
  198
  199         mov     %rax, U1
  200         sub     D, %rax
  201         cmovc   U1, %rax        C borrow: undo the subtraction of D
  202         sbb     $-1, Q1         C Q1 += 1 - borrow
  203
              C Final 2/1 division of {rax, U0} by D, the same sequence as the
              C single-limb case above.  Quotient word in T, remainder in rax.
  204         lea     1(%rax), T
  205         mul     DINV
  206         add     U0, %rax
  207         adc     T, %rdx
  208         mov     %rdx, T
  209         imul    D, %rdx
  210         sub     %rdx, U0
  211         cmp     U0, %rax
  212         lea     (U0, D), %rax
  213         cmovnc  U0, %rax
  214         sbb     $0, T
  215         cmp     D, %rax
  216         jc      L(div_done)
  217         sub     D, %rax
  218         add     $1, T
  219 L(div_done):
              C Merge the last quotient word with the pending Q0 and Q1.
  220         add     T, Q0
  221         mov     Q0, (QP)
  222         adc     Q1, 8(QP)
  223         jnc     L(done)
  224 L(final_q_incr):
              C Ripple the carry upwards through the quotient one limb at a time.
  225         addq    $1, 16(QP)
  226         lea     8(QP), QP       C advance; lea preserves the carry from the addq
  227         jc      L(final_q_incr)
  228
  229 L(done):
              C Restore the registers pushed at L(first), in reverse order.
  230         pop     %rbp
  231         pop     %rbx
  232         pop     %r12
  233         pop     %r13
  234         pop     %r14
  235         pop     %r15
  236         FUNC_EXIT
  237         ret
  238
              C Out-of-line carry propagation for the loop: add 1 to successive
              C quotient limbs starting at QP[UN + 2] until no carry remains,
              C then rejoin the loop at L(q_incr_done).
  239 L(q_incr):
  240         C U1 is not live, so use it for indexing
  241         lea     16(QP, UN, 8), U1
  242 L(q_incr_loop):
  243         addq    $1, (U1)
  244         jnc     L(q_incr_done)
  245         lea     8(U1), U1
  246         jmp     L(q_incr_loop)
  247 EPILOGUE()