source/libs/gmp/gmp-src/mpn/pa32/hppa1_1/submul_1.asm

   1 dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
   2 dnl  the result from a second limb vector.
   3
   4 dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34 C INPUT PARAMETERS
  35 C res_ptr       r26
  36 C s1_ptr        r25
  37 C size          r24
  38 C s2_limb       r23
  39
   40 C This runs at 12 cycles/limb on a PA7000.  With the instructions used, it
   41 C cannot become faster due to data cache contention after a store.  On the
   42 C PA7100 it runs at 11 cycles/limb.
  43
   44 C There are some ideas described in mul_1.asm that apply to this code too.
  45
  46 C It seems possible to make this run as fast as mpn_addmul_1, if we use
  47 C       sub,>>= %r29,%r19,%r22
  48 C       addi    1,%r28,%r28
  49 C but that requires reworking the hairy software pipeline...
  50
   51 ASM_START()
   52 PROLOGUE(mpn_submul_1)
   53 C       .callinfo       frame=64,no_calls
   54 C Subtract s1_ptr[0..size-1] * s2_limb from res_ptr[]; set up the first two products.
   55         ldo             64(%r30),%r30           C reserve 64-byte frame; scratch is -16(%r30)
   56         fldws,ma        4(%r25),%fr5            C fr5 = first s1 limb, s1_ptr += 4
   57         stw             %r23,-16(%r30)          C move s2_limb ...
   58         addib,=         -1,%r24,L(just_one_limb) C size--; taken when size was 1
   59          fldws          -16(%r30),%fr4          C ... into fr4 (delay slot: runs on both paths)
   60         add             %r0,%r0,%r0             C clear carry
   61         xmpyu           %fr4,%fr5,%fr6          C fr6 = 64-bit product for limb 0
   62         fldws,ma        4(%r25),%fr7            C fr7 = second s1 limb, s1_ptr += 4
   63         fstds           %fr6,-16(%r30)          C spill product 0 to the scratch slot
   64         xmpyu           %fr4,%fr7,%fr8          C fr8 = 64-bit product for limb 1
   65         ldw             -12(%r30),%r19          C least significant limb in product
   66         ldw             -16(%r30),%r28          C most significant limb of product 0
   67 C Invariant from here on: r19 = limb to subtract next, r28 = high limb not yet folded in.
   68         fstds           %fr8,-16(%r30)          C spill product 1 over product 0
   69         addib,=         -1,%r24,L(end)          C size--; a size of 2 skips the loop
   70          ldw            -12(%r30),%r1           C delay slot: r1 = low limb of product 1
   71 C Each pass retires one result limb and starts the multiply two limbs ahead.
   72 C Main loop
   73 LDEF(loop)
   74         ldws            0(%r26),%r29            C r29 = current res limb
   75         fldws,ma        4(%r25),%fr5            C fetch next s1 limb, s1_ptr += 4
   76         sub             %r29,%r19,%r22          C r22 = res limb - pending product limb
   77         add             %r22,%r19,%r0           C sum discarded; carry out = borrow of the sub
   78         stws,ma         %r22,4(%r26)            C store result limb, res_ptr += 4
   79         addc            %r28,%r1,%r19           C new pending = prev high + next low + borrow
   80         xmpyu           %fr4,%fr5,%fr6          C fr6 = product for the limb just fetched
   81         ldw             -16(%r30),%r28          C high limb of the product still in scratch
   82         fstds           %fr6,-16(%r30)          C only now overwrite scratch with new product
   83         addc            %r0,%r28,%r28           C fold carry of the addc above into high limb
   84         addib,<>        -1,%r24,L(loop)         C size--; repeat while nonzero
   85          ldw            -12(%r30),%r1           C delay slot: r1 = low limb of new product
   86 C Drain the pipeline: two result limbs remain, no further s1 loads or multiplies.
   87 LDEF(end)
   88         ldw             0(%r26),%r29            C r29 = second-to-last res limb
   89         sub             %r29,%r19,%r22          C subtract the pending product limb
   90         add             %r22,%r19,%r0           C carry out = borrow of the sub
   91         stws,ma         %r22,4(%r26)            C store it, res_ptr += 4
   92         addc            %r28,%r1,%r19           C last pending = prev high + last low + borrow
   93         ldw             -16(%r30),%r28          C high limb of the last product
   94         ldws            0(%r26),%r29            C r29 = last res limb
   95         addc            %r0,%r28,%r28           C fold carry of the addc above into it
   96         sub             %r29,%r19,%r22          C subtract the last pending limb
   97         add             %r22,%r19,%r0           C carry out = borrow of the sub
   98         stws,ma         %r22,4(%r26)            C store the last result limb
   99         addc            %r0,%r28,%r28           C r28 = high limb + final borrow (presumably the return value)
  100         bv              0(%r2)                  C return through r2
  101          ldo            -64(%r30),%r30          C delay slot: release the frame
  102 C Size 1: a single product and a single subtract, no pipelining.
  103 LDEF(just_one_limb)
  104         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * the only s1 limb
  105         ldw             0(%r26),%r29            C r29 = the only res limb
  106         fstds           %fr6,-16(%r30)          C spill product to the scratch slot
  107         ldw             -12(%r30),%r1           C r1 = low limb of product
  108         ldw             -16(%r30),%r28          C r28 = high limb of product
  109         sub             %r29,%r1,%r22           C r22 = res limb - low limb
  110         add             %r22,%r1,%r0            C carry out = borrow of the sub
  111         stw             %r22,0(%r26)            C store result limb
  112         addc            %r0,%r28,%r28           C r28 = high limb + borrow (presumably the return value)
  113         bv              0(%r2)                  C return through r2
  114          ldo            -64(%r30),%r30          C delay slot: release the frame
  115 EPILOGUE()