source/libs/gmp/gmp-src/mpn/x86_64/pentium4/rsh1aors_n.asm

   1 dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
   2
   3 dnl  Contributed to the GNU project by Torbjorn Granlund.
   4
   5 dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C            cycles/limb
  37 C AMD K8,K9      4.13
  38 C AMD K10        4.13
  39 C Intel P4       5.70
  40 C Intel core2    4.75
  41 C Intel corei    5
  42 C Intel atom     8.75
  43 C VIA nano       5.25
  44
  45 C TODO
  46 C  * Try to make this smaller, 746 bytes seem excessive for this 2nd class
  47 C    function.  Less sw pipelining would help, and since we now probably
  48 C    pipeline somewhat too deeply, it might not affect performance too much.
  49 C  * A separate small-n loop might speed things as well as make things smaller.
  50 C    That loop should be selected before pushing registers.
  51
  52 C INPUT PARAMETERS
  53 define(`rp',    `%rdi')                 C result limbs are stored through this pointer
  54 define(`up',    `%rsi')                 C first source operand limbs
  55 define(`vp',    `%rdx')                 C second source operand limbs
  56 define(`n',     `%rcx')                 C limb count; consumed four at a time by the main loop
  57 define(`cy',    `%r8')                  C carry-in for the _nc entry; the code below names %r8 directly
  58 C Pick ADDSUB and the two entry-point names according to which OPERATION_* macro is defined.
  59 ifdef(`OPERATION_rsh1add_n', `
  60         define(ADDSUB,        add)
  61         define(func,          mpn_rsh1add_n)
  62         define(func_nc,       mpn_rsh1add_nc)')
  63 ifdef(`OPERATION_rsh1sub_n', `
  64         define(ADDSUB,        sub)
  65         define(func,          mpn_rsh1sub_n)
  66         define(func_nc,       mpn_rsh1sub_nc)')
  67 C NOTE(review): the macros below are defined outside this file; presumably they declare the supported ABIs and the entry points this file can build - confirm
  68 ABI_SUPPORT(DOS64)
  69 ABI_SUPPORT(STD64)
  70
  71 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
  72
  73 ASM_START()
  74         TEXT
  75 PROLOGUE(func)                          C entry without a carry-in argument
  76         FUNC_ENTRY(4)
  77         xor     %r8, %r8                C carry-in = 0
  78 IFDOS(` jmp     L(ent)          ')
  79 EPILOGUE()                              C NOTE(review): no jump is emitted outside IFDOS so this presumably falls through into func_nc - confirm
  80 PROLOGUE(func_nc)                       C entry with carry-in in %r8; stores (up ADDSUB vp ADDSUB carry-in) >> 1 at rp
  81         FUNC_ENTRY(4)
  82 IFDOS(` mov     56(%rsp), %r8   ')
  83 L(ent): push    %rbx                    C save the callee-saved registers used as scratch below
  84         push    %r12
  85         push    %r13
  86         push    %r14
  87         push    %r15
  88 C Preload limb 0 of both operands; r15 ends up holding the limb 0 result and is read again for the return value.
  89         mov     (vp), %r9               C r9 = vp[0]
  90         mov     (up), %r15              C r15 = up[0]
  91 C Dispatch on n mod 4 so that each residue enters the 4-way unrolled loop at a different quarter.
  92         mov     R32(n), R32(%rax)
  93         and     $3, R32(%rax)           C eax = n mod 4
  94         jne     L(n00)                  C residues 1, 2, 3 are handled further down
  95 C Residue 0: finish limbs 0 and 1, start limb 2, preload limb 3, then enter the loop at L(L00).
  96         mov     R32(%r8), R32(%rbx)     C n = 0, 4, 8, ...
  97         mov     8(up), %r10             C r10 = up[1]
  98         ADDSUB  %r9, %r15               C limb 0: up[0] ADDSUB vp[0]
  99         mov     8(vp), %r9              C r9 = vp[1]
 100         setc    R8(%rax)                C al = carry/borrow out of limb 0
 101         ADDSUB  %rbx, %r15              C apply carry-in; bit 0 of r15 is the return bit
 102         jnc     1f
 103         mov     $1, R8(%rax)            C applying the carry-in carried too, so record a carry out
 104 1:      mov     16(up), %r12            C r12 = up[2]
 105         ADDSUB  %r9, %r10               C limb 1
 106         mov     16(vp), %r9             C r9 = vp[2]
 107         setc    R8(%rbx)                C bl = carry/borrow out of limb 1
 108         mov     %r15, %r13              C r13 = copy of limb 0, shifted right below
 109         ADDSUB  %rax, %r10              C apply the carry from limb 0
 110         jnc     1f
 111         mov     $1, R8(%rbx)            C ripple: carry out of limb 1
 112 1:      mov     24(up), %r11            C r11 = up[3]
 113         ADDSUB  %r9, %r12               C limb 2; its incoming carry is applied at L(L00)
 114         lea     32(up), up              C advance both sources by four limbs
 115         mov     24(vp), %r9             C r9 = vp[3]
 116         lea     32(vp), vp
 117         setc    R8(%rax)                C al = carry/borrow out of limb 2
 118         mov     %r10, %r14              C r14 = copy of limb 1
 119         shl     $63, %r10               C bit 0 of limb 1 becomes bit 63 of result limb 0
 120         shr     %r13                    C limb 0 >> 1
 121         jmp     L(L00)                  C NOTE(review): n = 0 would also take this path and process four limbs; presumably callers guarantee n >= 1 - confirm
 122 C Residues 1..3: eax still holds n mod 4.
 123 L(n00): cmp     $2, R32(%rax)
 124         jnc     L(n01)                  C residue 2 or 3
 125         xor     R32(%rbx), R32(%rbx)    C n = 1, 5, 9, ...
 126         lea     -24(rp), rp             C bias rp so the first store on this path, at 24(rp), lands on the original rp[0]
 127         mov     R32(%r8), R32(%rax)     C eax = carry-in
 128         dec     n
 129         jnz     L(gt1)                  C more than one limb
 130         ADDSUB  %r9, %r15               C n = 1: the only limb
 131         setc    R8(%rbx)
 132         ADDSUB  %rax, %r15              C apply carry-in
 133         jnc     1f
 134         mov     $1, R8(%rbx)
 135 1:      mov     %r15, %r14
 136         shl     $63, %rbx               C carry/borrow out becomes bit 63 of the result limb
 137         shr     %r14
 138         jmp     L(cj1)                  C combine and store the single result limb
 139 L(gt1): mov     8(up), %r8              C n = 5, 9, ...: r8 = up[1]
 140         ADDSUB  %r9, %r15               C limb 0
 141         mov     8(vp), %r9
 142         setc    R8(%rbx)
 143         ADDSUB  %rax, %r15              C apply carry-in
 144         jnc     1f
 145         mov     $1, R8(%rbx)
 146 1:      mov     16(up), %r10            C r10 = up[2]
 147         ADDSUB  %r9, %r8                C limb 1
 148         mov     16(vp), %r9
 149         setc    R8(%rax)
 150         mov     %r15, %r14              C r14 = copy of limb 0
 151         ADDSUB  %rbx, %r8               C apply the carry from limb 0
 152         jnc     1f
 153         mov     $1, R8(%rax)
 154 1:      mov     24(up), %r12            C r12 = up[3]
 155         ADDSUB  %r9, %r10               C limb 2; its incoming carry is applied at L(L01)
 156         mov     24(vp), %r9
 157         setc    R8(%rbx)
 158         mov     %r8, %r13
 159         shl     $63, %r8
 160         shr     %r14
 161         lea     8(up), up               C step the sources past the one extra limb
 162         lea     8(vp), vp
 163         jmp     L(L01)
 164 C The flags still come from the cmp at L(n00): equal means residue 2, otherwise residue 3.
 165 L(n01): jne     L(n10)
 166         lea     -16(rp), rp             C n = 2, 6, 10, ...
 167         mov     R32(%r8), R32(%rbx)     C ebx = carry-in
 168         mov     8(up), %r11             C r11 = up[1]
 169         ADDSUB  %r9, %r15               C limb 0
 170         mov     8(vp), %r9
 171         setc    R8(%rax)
 172         ADDSUB  %rbx, %r15              C apply carry-in
 173         jnc     1f
 174         mov     $1, R8(%rax)
 175 1:      sub     $2, n
 176         jnz     L(gt2)                  C more than two limbs
 177         ADDSUB  %r9, %r11               C n = 2: limb 1 is the last limb
 178         setc    R8(%rbx)
 179         mov     %r15, %r13
 180         ADDSUB  %rax, %r11              C apply the carry from limb 0
 181         jnc     1f
 182         mov     $1, R8(%rbx)
 183 1:      mov     %r11, %r14
 184         shl     $63, %r11
 185         shr     %r13
 186         jmp     L(cj2)                  C store the two result limbs
 187 L(gt2): mov     16(up), %r8             C n = 6, 10, ...: r8 = up[2]
 188         ADDSUB  %r9, %r11               C limb 1
 189         mov     16(vp), %r9
 190         setc    R8(%rbx)
 191         mov     %r15, %r13
 192         ADDSUB  %rax, %r11              C apply the carry from limb 0
 193         jnc     1f
 194         mov     $1, R8(%rbx)
 195 1:      mov     24(up), %r10            C r10 = up[3]
 196         ADDSUB  %r9, %r8                C limb 2; its incoming carry is applied at L(L10)
 197         mov     24(vp), %r9
 198         setc    R8(%rax)
 199         mov     %r11, %r14
 200         shl     $63, %r11
 201         shr     %r13
 202         lea     16(up), up              C step the sources past the two extra limbs
 203         lea     16(vp), vp
 204         jmp     L(L10)
 205 C Residue 3: finish limbs 0 and 1 before testing whether only three limbs exist.
 206 L(n10): xor     R32(%rbx), R32(%rbx)    C n = 3, 7, 11, ...
 207         lea     -8(rp), rp              C bias rp so the first store on this path, at 8(rp), lands on the original rp[0]
 208         mov     R32(%r8), R32(%rax)     C eax = carry-in
 209         mov     8(up), %r12             C r12 = up[1]
 210         ADDSUB  %r9, %r15               C limb 0
 211         mov     8(vp), %r9
 212         setc    R8(%rbx)
 213         ADDSUB  %rax, %r15              C apply carry-in
 214         jnc     1f
 215         mov     $1, R8(%rbx)
 216 1:      mov     16(up), %r11            C r11 = up[2]
 217         ADDSUB  %r9, %r12               C limb 1
 218         mov     16(vp), %r9
 219         setc    R8(%rax)
 220         mov     %r15, %r14
 221         ADDSUB  %rbx, %r12              C apply the carry from limb 0
 222         jnc     1f
 223         mov     $1, R8(%rax)
 224 1:      sub     $3, n
 225         jnz     L(gt3)                  C more than three limbs
 226         ADDSUB  %r9, %r11               C n = 3: limb 2 is the last limb; its incoming carry is applied at L(cj3)
 227         setc    R8(%rbx)
 228         mov     %r12, %r13
 229         shl     $63, %r12
 230         shr     %r14
 231         jmp     L(cj3)
 232 L(gt3): mov     24(up), %r8             C n = 7, 11, ...: r8 = up[3]
 233         ADDSUB  %r9, %r11               C limb 2; its incoming carry is applied at L(L11)
 234         mov     24(vp), %r9
 235         setc    R8(%rbx)
 236         mov     %r12, %r13
 237         shl     $63, %r12
 238         shr     %r14
 239         lea     24(up), up              C step the sources past the three extra limbs
 240         lea     24(vp), vp
 241         jmp     L(L11)
 242 C Out-of-line ripple handlers: applying the previous carry itself carried, so force the saved carry to 1 and resume.
 243 L(c0):  mov     $1, R8(%rbx)            C reached from L(L11)
 244         jmp     L(rc0)
 245 L(c1):  mov     $1, R8(%rax)            C reached from L(L10)
 246         jmp     L(rc1)
 247 L(c2):  mov     $1, R8(%rbx)            C reached from L(L01)
 248         jmp     L(rc2)
 249 C Main loop, four limbs per pass: rax and rbx alternate as the saved carry, r13 and r14 alternate as the limb being shifted right.
 250         ALIGN(16)
 251 L(top): mov     (up), %r8       C not on critical path
 252         or      %r13, %r10      C complete a result limb: (newer << 63) | (older >> 1)
 253         ADDSUB  %r9, %r11       C not on critical path
 254         mov     (vp), %r9       C not on critical path
 255         setc    R8(%rbx)        C save carry out
 256         mov     %r12, %r13      C new for later
 257         shl     $63, %r12       C bit 0 of the newer limb moves to bit 63
 258         shr     %r14            C older limb >> 1
 259         mov     %r10, (rp)
 260 L(L11): ADDSUB  %rax, %r11      C apply previous carry out
 261         jc      L(c0)           C jump if ripple
 262 L(rc0): mov     8(up), %r10     C second quarter: same steps with the registers rotated
 263         or      %r14, %r12
 264         ADDSUB  %r9, %r8
 265         mov     8(vp), %r9
 266         setc    R8(%rax)
 267         mov     %r11, %r14
 268         shl     $63, %r11
 269         shr     %r13
 270         mov     %r12, 8(rp)
 271 L(L10): ADDSUB  %rbx, %r8       C apply previous carry out
 272         jc      L(c1)
 273 L(rc1): mov     16(up), %r12    C third quarter
 274         or      %r13, %r11
 275         ADDSUB  %r9, %r10
 276         mov     16(vp), %r9
 277         setc    R8(%rbx)
 278         mov     %r8, %r13
 279         shl     $63, %r8
 280         shr     %r14
 281         mov     %r11, 16(rp)
 282 L(L01): ADDSUB  %rax, %r10      C apply previous carry out
 283         jc      L(c2)
 284 L(rc2): mov     24(up), %r11    C fourth quarter; also advances up, vp and rp by four limbs
 285         or      %r14, %r8
 286         ADDSUB  %r9, %r12
 287         lea     32(up), up
 288         mov     24(vp), %r9
 289         lea     32(vp), vp
 290         setc    R8(%rax)
 291         mov     %r10, %r14
 292         shl     $63, %r10
 293         shr     %r13
 294         mov     %r8, 24(rp)
 295         lea     32(rp), rp
 296 L(L00): ADDSUB  %rbx, %r12      C apply previous carry out
 297         jc      L(c3)
 298 L(rc3): sub     $4, n           C one pass of four limbs accounted for
 299         ja      L(top)          C loop only while the count stays above zero (unsigned)
 300 C Wind-down: finish the last limb and store the final four result limbs; cj3, cj2 and cj1 are also the tails for n = 3, 2 and 1.
 301 L(end): or      %r13, %r10
 302         ADDSUB  %r9, %r11       C last limb
 303         setc    R8(%rbx)        C rbx = final carry/borrow out
 304         mov     %r12, %r13
 305         shl     $63, %r12
 306         shr     %r14
 307         mov     %r10, (rp)
 308 L(cj3): ADDSUB  %rax, %r11      C apply previous carry to the last limb
 309         jnc     1f
 310         mov     $1, R8(%rbx)
 311 1:      or      %r14, %r12
 312         mov     %r11, %r14
 313         shl     $63, %r11
 314         shr     %r13
 315         mov     %r12, 8(rp)
 316 L(cj2): or      %r13, %r11
 317         shl     $63, %rbx       C final carry/borrow becomes bit 63 of the top result limb
 318         shr     %r14
 319         mov     %r11, 16(rp)
 320 L(cj1): or      %r14, %rbx
 321         mov     %rbx, 24(rp)    C store the top result limb
 322 C Return bit 0 of the limb 0 result kept in r15, which is the bit dropped by the right shift.
 323         mov     R32(%r15), R32(%rax)
 324         and     $1, R32(%rax)
 325         pop     %r15            C restore saved registers in reverse push order
 326         pop     %r14
 327         pop     %r13
 328         pop     %r12
 329         pop     %rbx
 330         FUNC_EXIT()
 331         ret
 332 L(c3):  mov     $1, R8(%rax)    C ripple handler reached from L(L00)
 333         jmp     L(rc3)
 334 EPILOGUE()