sysdeps/x86_64/mul_1.S

   1 /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
   2    the result in a second limb vector.
   3    Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
   4    This file is part of the GNU MP Library.
   5
   6    The GNU MP Library is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU Lesser General Public License as published by
   8    the Free Software Foundation; either version 2.1 of the License, or (at your
   9    option) any later version.
  10
  11    The GNU MP Library is distributed in the hope that it will be useful, but
  12    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  19    MA 02111-1307, USA. */
  20
  21 #include <sysdep.h>
  22 #include "asm-syntax.h"
  23
  24 #define rp      %rdi         /* destination limb pointer; result limbs are stored through it */
  25 #define up      %rsi         /* source limb pointer; limbs to multiply are loaded through it */
  26 #define n_param %rdx         /* limb count as passed in; copied out before the first mul */
  27 #define vl      %rcx         /* the single limb every source limb is multiplied by */
  28
  29 #define n       %r11         /* working copy of the count; later negated and used as an index */
  30
  31         .text
  /* __mpn_mul_1: multiply the n-limb vector at up by the single limb vl,
     store the resulting limbs at rp, and return in %rax the high limb
     that is left over after the last store.

     In:   rp (%rdi), up (%rsi), n_param (%rdx), vl (%rcx).
     Out:  %rax = final high limb (carry out of the top position).
     Uses: %rax, %rdx, %r8-%r11 and the flags as scratch; rp and up are
           modified; %rbx is pushed on entry and popped before return.

     The main loop handles four limbs per pass.  n mod 4 selects one of
     four feed-in paths (b0..b3); each biases up and rp, negates n so it
     becomes a negative index that counts up towards zero, and jumps into
     the loop at the matching point.  %r8, %r9, %r10 and %rbx take turns
     as accumulators: the low half of each product is added into one of
     them, the high half (plus carry) into the next, and a finished
     accumulator is stored through rp.

     NOTE(review): the first source limb is read unconditionally and no
     path tests for n == 0, so callers presumably must pass n >= 1 --
     confirm against the callers.  */
  32 ENTRY (__mpn_mul_1)
  33         push    %rbx                    /* %rbx is used as an accumulator below */
  34         cfi_adjust_cfa_offset (8)
  35         cfi_rel_offset (%rbx, 0)
  36         xor     %r10, %r10              /* %r10 = 0 */
  37         mov     (up), %rax              /* read first u limb early */
  38         mov     n_param, %rbx           /* move away n from rdx, mul uses it */
  39         mul     vl                      /* %rdx:%rax = first limb * vl */
  40         mov     %rbx, %r11              /* n (%r11) = limb count */
  41
  42         add     %r10, %rax              /* %r10 is still zero, so this pair ... */
  43         adc     $0, %rdx                /* ... leaves the product unchanged */
  44
  45         and     $3, %ebx                /* %ebx = n mod 4 */
  46         jz      L(b0)                   /* n mod 4 == 0 */
  47         cmp     $2, %ebx
  48         jz      L(b2)                   /* n mod 4 == 2 */
  49         jg      L(b3)                   /* n mod 4 == 3; otherwise fall through with n mod 4 == 1 */
  50
  51 L(b1):  dec     n
  52         jne     L(gt1)                  /* more than one limb to process */
  53         mov     %rax, (rp)              /* n == 1: store the low half; the high half in %rdx is returned */
  54         jmp     L(ret)
  55 L(gt1): lea     8(up,n,8), up           /* bias up and rp so the negative index n addresses the remaining limbs */
  56         lea     -8(rp,n,8), rp
  57         neg     n
  58         xor     %r10, %r10              /* clear the accumulators that are not seeded here */
  59         xor     %ebx, %ebx
  60         mov     %rax, %r9               /* low half of the first product */
  61         mov     (up,n,8), %rax          /* second source limb, multiplied at L(L1) */
  62         mov     %rdx, %r8               /* high half of the first product */
  63         jmp     L(L1)
  64
  65 L(b0):  lea     (up,n,8), up            /* n mod 4 == 0 feed-in: bias pointers, negate index */
  66         lea     -16(rp,n,8), rp
  67         neg     n
  68         xor     %r10, %r10
  69         mov     %rax, %r8               /* low half of the first product */
  70         mov     %rdx, %rbx              /* high half of the first product */
  71         jmp     L(L0)
  72
  73 L(b3):  lea     -8(up,n,8), up          /* n mod 4 == 3 feed-in: bias pointers, negate index */
  74         lea     -24(rp,n,8), rp
  75         neg     n
  76         mov     %rax, %rbx              /* low half of the first product */
  77         mov     %rdx, %r10              /* high half of the first product */
  78         jmp     L(L3)
  79
  80 L(b2):  lea     -16(up,n,8), up         /* n mod 4 == 2 feed-in: bias pointers, negate index */
  81         lea     -32(rp,n,8), rp
  82         neg     n
  83         xor     %r8, %r8                /* %r8 must be zero at L(L2); the loop tail adds it as a zero */
  84         xor     %ebx, %ebx
  85         mov     %rax, %r10              /* low half of the first product */
  86         mov     24(up,n,8), %rax        /* second source limb, multiplied at L(L2) */
  87         mov     %rdx, %r9               /* high half of the first product */
  88         jmp     L(L2)
  89
  90         .p2align 4
  91 L(top): mov     %r10, (rp,n,8)          /* store the finished limb held in %r10 */
  92         add     %rax, %r9               /* add low half of the product from L(L2) */
  93         mov     (up,n,8), %rax          /* next source limb */
  94         adc     %rdx, %r8               /* high half plus carry into the next accumulator */
  95         mov     $0, %r10d               /* clear %r10 (a 32-bit write zeroes the whole register) */
  96 L(L1):  mul     vl
  97         mov     %r9, 8(rp,n,8)          /* store the finished limb held in %r9 */
  98         add     %rax, %r8
  99         adc     %rdx, %rbx
 100 L(L0):  mov     8(up,n,8), %rax
 101         mul     vl
 102         mov     %r8, 16(rp,n,8)         /* store the finished limb held in %r8 */
 103         add     %rax, %rbx
 104         adc     %rdx, %r10
 105 L(L3):  mov     16(up,n,8), %rax
 106         mul     vl
 107         mov     %rbx, 24(rp,n,8)        /* store the finished limb held in %rbx */
             /* The zeroing below is done with mov, which leaves the flags
                alone, so the carry from the add reaches the adc even though
                a register is cleared in between.  */
 108         mov     $0, %r8d                # zero
 109         mov     %r8, %rbx               # zero
 110         add     %rax, %r10
 111         mov     24(up,n,8), %rax
 112         mov     %r8, %r9                # zero
 113         adc     %rdx, %r9
 114 L(L2):  mul     vl
 115         add     $4, n                   /* four limbs handled per pass */
 116         js      L(top)                  /* loop while the index is still negative */
 117
 118         mov     %r10, (rp,n,8)          /* wind down: store the pending limb in %r10 */
 119         add     %rax, %r9               /* low half of the last product */
 120         adc     %r8, %rdx               /* %r8 is zero here, so this only folds the carry into the high half */
 121         mov     %r9, 8(rp,n,8)          /* last stored result limb */
 122         add     %r8, %rdx               /* %r8 is zero here; %rdx is unchanged */
 123 L(ret): mov     %rdx, %rax              /* return the remaining high limb */
 124
 125         pop     %rbx                    /* restore the register pushed on entry */
 126         cfi_adjust_cfa_offset (-8)
 127         cfi_restore (%rbx)
 128         ret
 129 END (__mpn_mul_1)