sysdeps/x86_64/add_n.S

   1 /* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
   2    sum in a third limb vector.
   3    Copyright (C) 2006-2016 Free Software Foundation, Inc.
   4    This file is part of the GNU MP Library.
   5
   6    The GNU MP Library is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU Lesser General Public License as published by
   8    the Free Software Foundation; either version 2.1 of the License, or (at your
   9    option) any later version.
  10
  11    The GNU MP Library is distributed in the hope that it will be useful, but
  12    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with the GNU MP Library; see the file COPYING.LIB.  If not,
  18    see <http://www.gnu.org/licenses/>.  */
  19
  20 #include "sysdep.h"
  21 #include "asm-syntax.h"
  22
   23 #define rp      %rdi    /* result vector; the code only stores through it */
   24 #define up      %rsi    /* first source vector; only loaded from */
   25 #define vp      %rdx    /* second source vector; only loaded from */
   26 #define n       %rcx    /* element count; negated and used as a scaled-by-8 index */
   27 #define cy      %r8     /* alias is not used by name in ENTRY (func); that code writes %r8 directly */
   28
      /* Defaults used when `func' has not been defined before this point:
         the entry point is named __mpn_add_n and ADCSBB expands to adc.
         NOTE(review): the guard presumably lets another file predefine func
         and ADCSBB to reuse this body -- confirm against the includers.  */
   29 #ifndef func
   30 # define func __mpn_add_n
   31 # define ADCSBB adc
   32 #endif
  33
  34         .text
   35 ENTRY (func)
              /* Combine the n-element vectors at up and vp element by element
                 with ADCSBB (adc by default), propagating the carry flag from
                 each element to the next, and store the results at rp.
                 Elements are 8 bytes.  n must be > 0: element 0 of both
                 sources is loaded unconditionally.

                 In:   rp = %rdi, up = %rsi, vp = %rdx, n = %rcx.
                 Out:  %eax = carry flag left by the last ADCSBB (0 or 1).
                 Writes %rax, %rcx, %rsi, %rdi, %rdx, %r8-%r11 and the flags;
                 the stack is not used.

                 Register roles:
                   up, vp, rp   biased in the setup so that one index (n)
                                addresses all three vectors;
                   n            negated count, a multiple of 4 that is <= 0,
                                stepped by 4 towards zero;
                   %r8/%r9 and %r10/%r11
                                alternating operand pairs (element from up,
                                element from vp);
                   %eax         n mod 4 during dispatch, then the result.

                 CF is live from the dispatch stubs up to L(end).  Apart from
                 ADCSBB, everything executed in between is mov, lea, jmp or
                 jrcxz, none of which modify the flags; do not replace them
                 with flag-writing equivalents.  */

              /* %r8 = 0, so that the later `shr %r8' yields CF = 0.  */
   36         xor     %r8, %r8
              /* Preload element 0 of each source before the pointers move.  */
   37         mov     (up), %r10
   38         mov     (vp), %r11
   39
              /* up and vp now point at their last element, rp at its
                 next-to-last one.  Together with the negated n this makes
                 disp(ptr,n,8) walk the vectors upwards as n approaches 0.  */
   40         lea     -8(up,n,8), up
   41         lea     -8(vp,n,8), vp
   42         lea     -16(rp,n,8), rp
              /* Dispatch on n mod 4 (only the low two bits of %ecx matter) to
                 enter the 4-way unrolled loop at the matching point.  */
   43         mov     %ecx, %eax
   44         neg     n
   45         and     $3, %eax
   46         je      L(b00)
   47         add     %rax, n         /* clear low rcx bits for jrcxz */
   48         cmp     $2, %eax
   49         jl      L(b01)
   50         je      L(b10)
   51
              /* Entry stubs.  The and/cmp above left arbitrary flags, so each
                 stub runs `shr %r8' on the still-zero %r8, which shifts a 0
                 bit into CF.  Stubs whose entry point consumes %r8/%r9 first
                 copy the preloaded element-0 pair there from %r10/%r11.  */

              /* n mod 4 == 3 (fall-through case).  */
   52 L(b11): shr     %r8             /* set cy */
   53         jmp     L(e11)
   54
              /* n mod 4 == 0.  The `add %rax, n' was skipped, so n is still
                 -count; advance it by 4 to match the displacements used from
                 L(e00) on.  lea keeps the CF established by shr.  */
   55 L(b00): shr     %r8             /* set cy */
   56         mov     %r10, %r8
   57         mov     %r11, %r9
   58         lea     4(n), n
   59         jmp     L(e00)
   60
              /* n mod 4 == 1.  */
   61 L(b01): shr     %r8             /* set cy */
   62         jmp     L(e01)
   63
              /* n mod 4 == 2.  */
   64 L(b10): shr     %r8             /* set cy */
   65         mov     %r10, %r8
   66         mov     %r11, %r9
   67         jmp     L(e10)
   68
              /* Exit, reached only through the jrcxz at L(e01), i.e. with
                 n == 0: combine the last pending pair and store it as the
                 final element (8(rp) is the last element, since rp was set
                 to the next-to-last one).  */
   69 L(end): ADCSBB  %r11, %r10
   70         mov     %r10, 8(rp)
   71         mov     %ecx, %eax      /* clear eax, ecx contains 0 */
              /* %eax = 0 + 0 + CF.  Plain adc rather than ADCSBB: this only
                 turns the final carry flag into the return value.  */
   72         adc     %eax, %eax
   73         ret
   74
              /* Main loop, four elements per iteration.  Each step loads the
                 next operand pair into one register pair, then combines and
                 stores the pair loaded by the previous step, so loads run
                 one element ahead of the ADCSBB/store.  */
   75         .p2align 4
   76 L(top):
   77         mov     -24(up,n,8), %r8
   78         mov     -24(vp,n,8), %r9
   79         ADCSBB  %r11, %r10
   80         mov     %r10, -24(rp,n,8)
   81 L(e00):
   82         mov     -16(up,n,8), %r10
   83         mov     -16(vp,n,8), %r11
   84         ADCSBB  %r9, %r8
   85         mov     %r8, -16(rp,n,8)
   86 L(e11):
   87         mov     -8(up,n,8), %r8
   88         mov     -8(vp,n,8), %r9
   89         ADCSBB  %r11, %r10
   90         mov     %r10, -8(rp,n,8)
   91 L(e10):
   92         mov     (up,n,8), %r10
   93         mov     (vp,n,8), %r11
   94         ADCSBB  %r9, %r8
   95         mov     %r8, (rp,n,8)
   96 L(e01):
              /* Loop control must not disturb CF: jrcxz tests %rcx without
                 reading or writing flags, and lea (not add) steps n.  One
                 pair is still pending in %r10/%r11 here; it is handled by
                 L(top) or by L(end).  */
   97         jrcxz   L(end)
   98         lea     4(n), n
   99         jmp     L(top)
  100 END (func)