sysdeps/x86_64/mul_1.S

   1 /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
   2    the result in a second limb vector.
   3    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   4    This file is part of the GNU MP Library.
   5
   6    The GNU MP Library is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU Lesser General Public License as published by
   8    the Free Software Foundation; either version 2.1 of the License, or (at your
   9    option) any later version.
  10
  11    The GNU MP Library is distributed in the hope that it will be useful, but
  12    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with the GNU MP Library; see the file COPYING.LIB.  If not,
  18    see <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21 #include "asm-syntax.h"
  22
  23 #define rp      %rdi            /* result pointer: every store in the routine goes through it */
  24 #define up      %rsi            /* source pointer: every limb is loaded through it */
  25 #define n_param %rdx            /* limb count as passed in, copied away because mul writes %rdx */
  26 #define vl      %rcx            /* the single multiplier limb, operand of every mul */
  27
  28 #define n       %r11            /* working copy of the count, negated later and used as the loop index */
  29
  30         .text
     /* __mpn_mul_1 (rp, up, n_param, vl)

        Multiplies each 64-bit limb read through up by the single limb vl,
        adds the high half of every product into the next result limb, and
        stores one result limb per source limb through rp.

        In:       rp = %rdi, up = %rsi, n_param = %rdx (limb count), vl = %rcx.
        Out:      %rax = the limb above the last one stored, that is the
                  final high half plus carry.
        Clobbers: %rdx, %rsi, %rdi, %r8-%r11 and the flags.  %rbx is used as
                  scratch but is saved on entry and restored before return.

        The main loop handles four limbs per pass.  The count modulo 4 picks
        one of four set-up stubs (b0-b3).  Each stub biases up and rp towards
        the end of the vectors, negates n so that it counts upwards, and
        jumps into the loop at the stage with the same number (L0-L3).
        %r10, %r9, %r8 and %rbx take turns as the limb waiting to be stored
        at stages top, L1, L0 and L3 respectively.

        NOTE(review): the first limb is loaded and multiplied before the
        count is looked at, so the code appears to assume n_param >= 1 --
        confirm against the callers.  */
  31 ENTRY (__mpn_mul_1)
  32         push    %rbx                    /* callee-saved register, used as scratch below */
  33         cfi_adjust_cfa_offset (8)       /* unwind annotations for the push (macros are defined outside this file) */
  34         cfi_rel_offset (%rbx, 0)
  35         xor     %r10, %r10              /* %r10 = 0 */
  36         mov     (up), %rax              /* read first u limb early */
  37         mov     n_param, %rbx           /* move away n from rdx, mul uses it */
  38         mul     vl                      /* %rdx:%rax = first limb * vl */
  39         mov     %rbx, %r11              /* n = limb count */
  40
  41         add     %r10, %rax              /* %r10 is still zero, so this add/adc pair leaves %rdx:%rax unchanged */
  42         adc     $0, %rdx                /* NOTE(review): looks like a leftover carry-in hook -- confirm */
  43
  44         and     $3, %ebx                /* dispatch on count modulo 4 */
  45         jz      L(b0)                   /* count % 4 == 0 */
  46         cmp     $2, %ebx
  47         jz      L(b2)                   /* count % 4 == 2 */
  48         jg      L(b3)                   /* count % 4 == 3 */
  49
  50 L(b1):  dec     n                       /* count % 4 == 1: the first limb is already multiplied */
  51         jne     L(gt1)                  /* more limbs remain */
  52         mov     %rax, (rp)              /* count was 1: store the low half ... */
  53         jmp     L(ret)                  /* ... and return the high half left in %rdx */
  54 L(gt1): lea     8(up,n,8), up           /* n is count - 1 here: up = one past the last source limb */
  55         lea     -8(rp,n,8), rp          /* biased so that 8(rp,n,8) at L1 is the first result limb */
  56         neg     n                       /* n = -(count - 1), stepped towards zero by the loop */
  57         xor     %r10, %r10              /* L0 adds a high half plus carry into %r10 */
  58         xor     %ebx, %ebx              /* clears all of %rbx, L1 adds a high half plus carry into it */
  59         mov     %rax, %r9               /* low half of the first product, stored at L1 */
  60         mov     (up,n,8), %rax          /* second source limb, multiplied at L1 */
  61         mov     %rdx, %r8               /* high half of the first product */
  62         jmp     L(L1)
  63
  64 L(b0):  lea     (up,n,8), up            /* up = one past the last source limb */
  65         lea     -16(rp,n,8), rp         /* biased so that 16(rp,n,8) at L0 is the first result limb */
  66         neg     n                       /* n = -count */
  67         xor     %r10, %r10              /* L0 adds a high half plus carry into %r10 */
  68         mov     %rax, %r8               /* low half of the first product, stored at L0 */
  69         mov     %rdx, %rbx              /* high half of the first product */
  70         jmp     L(L0)
  71
  72 L(b3):  lea     -8(up,n,8), up          /* biased so that 16(up,n,8) at L3 is the second source limb */
  73         lea     -24(rp,n,8), rp         /* biased so that 24(rp,n,8) at L3 is the first result limb */
  74         neg     n                       /* n = -count */
  75         mov     %rax, %rbx              /* low half of the first product, stored at L3 */
  76         mov     %rdx, %r10              /* high half of the first product */
  77         jmp     L(L3)
  78
  79 L(b2):  lea     -16(up,n,8), up         /* biased so that 24(up,n,8) below is the second source limb */
  80         lea     -32(rp,n,8), rp         /* biased so that (rp,n,8) is the first result limb once L2 has added 4 to n */
  81         neg     n                       /* n = -count */
  82         xor     %r8, %r8                /* top and the wind-down both rely on %r8 being zero after L2 */
  83         xor     %ebx, %ebx              /* clears all of %rbx, L1 adds a high half plus carry into it */
  84         mov     %rax, %r10              /* low half of the first product, stored after L2 */
  85         mov     24(up,n,8), %rax        /* second source limb, multiplied at L2 */
  86         mov     %rdx, %r9               /* high half of the first product */
  87         jmp     L(L2)
  88
  89         .p2align 4                      /* align the loop head on a 16-byte boundary */
  90 L(top): mov     %r10, (rp,n,8)          /* store a finished limb */
  91         add     %rax, %r9               /* add the low half of the product made at L2 */
  92         mov     (up,n,8), %rax          /* next source limb (mov leaves the carry from the add intact) */
  93         adc     %rdx, %r8               /* %r8 was zero: now high half plus carry */
  94         mov     $0, %r10d               /* %r10 = 0, the 32-bit write clears the upper half too */
  95 L(L1):  mul     vl
  96         mov     %r9, 8(rp,n,8)          /* store a finished limb */
  97         add     %rax, %r8
  98         adc     %rdx, %rbx              /* %rbx was zero: now high half plus carry */
  99 L(L0):  mov     8(up,n,8), %rax         /* next source limb */
 100         mul     vl
 101         mov     %r8, 16(rp,n,8)         /* store a finished limb */
 102         add     %rax, %rbx
 103         adc     %rdx, %r10              /* %r10 was zero: now high half plus carry */
 104 L(L3):  mov     16(up,n,8), %rax        /* next source limb */
 105         mul     vl
 106         mov     %rbx, 24(rp,n,8)        /* store a finished limb */
 107         mov     $0, %r8d                # zero
 108         mov     %r8, %rbx               # zero
     /* Only mov instructions sit between the add below and its adc, and mov
        leaves the flags alone, so the carry out of the add reaches the adc.  */
 109         add     %rax, %r10
 110         mov     24(up,n,8), %rax
 111         mov     %r8, %r9                # zero
 112         adc     %rdx, %r9
 113 L(L2):  mul     vl
 114         add     $4, n                   /* four limbs per pass */
 115         js      L(top)                  /* loop while the index is still negative */
 116
 117         mov     %r10, (rp,n,8)          /* wind-down: store the second-to-last result limb */
 118         add     %rax, %r9               /* add the low half of the last product */
 119         adc     %r8, %rdx               /* %r8 is zero: fold the carry into the top limb */
 120         mov     %r9, 8(rp,n,8)          /* store the last result limb */
 121         add     %r8, %rdx               /* %r8 is still zero, so %rdx is unchanged */
 122 L(ret): mov     %rdx, %rax              /* return the top limb */
 123
 124         pop     %rbx                    /* restore the saved register */
 125         cfi_adjust_cfa_offset (-8)
 126         cfi_restore (%rbx)
 127         ret
 128 END (__mpn_mul_1)