source/libs/gmp/gmp-src/mpn/pa32/hppa1_1/mul_1.asm

   1 dnl  HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
   2 dnl  result in a second limb vector.
   3
   4 dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34 C INPUT PARAMETERS
  35 C res_ptr       r26
  36 C s1_ptr        r25
  37 C size          r24
  38 C s2_limb       r23
  39
  40 C This runs at 9 cycles/limb on a PA7000.  With the instructions used, it
  41 C cannot become faster due to data cache contention after a store.  On the
  42 C PA7100 it runs at 7 cycles/limb.
  43
  44 C We could use fldds to read two limbs at a time from the S1 array, and that
  45 C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
  46 C PA7100, respectively.  We don't do that since it does not seem worth the
  47 C (alignment) troubles...
  48
  49 C At least the PA7100 is rumored to be able to deal with cache-misses without
  50 C stalling instruction issue.  If this is true, and the cache is actually also
  51 C lockup-free, we should use a deeper software pipeline, and load from S1 very
  52 C early!  (The loads and stores to -12(sp) will surely be in the cache.)
  53
  54 ASM_START()
  55 PROLOGUE(mpn_mul_1)
  56 C       .callinfo       frame=64,no_calls
  57 C Start-up: get s2_limb into fr4 via a stack slot and prime the first two products.
  58         ldo             64(%r30),%r30           C allocate the 64-byte frame (r30 += 64)
  59         fldws,ma        4(%r25),%fr5            C fr5 = first s1 limb; s1_ptr += 4
  60         stw             %r23,-16(%r30)          C move s2_limb ...
  61         addib,=         -1,%r24,L(just_one_limb)  C size -= 1; branch if it reached zero
  62          fldws          -16(%r30),%fr4          C ... into fr4 (delay slot)
  63         add             %r0,%r0,%r0             C clear carry
  64         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * limb 0 (two-limb product)
  65         fldws,ma        4(%r25),%fr7            C fr7 = second s1 limb; s1_ptr += 4
  66         fstds           %fr6,-16(%r30)          C spill product 0 to the stack scratch slot
  67         xmpyu           %fr4,%fr7,%fr8          C fr8 = s2_limb * limb 1
  68         ldw             -12(%r30),%r19          C least significant limb in product
  69         ldw             -16(%r30),%r28          C most significant limb of product 0
  70
  71         fstds           %fr8,-16(%r30)          C spill product 1 over product 0
  72         addib,=         -1,%r24,L(end)          C size -= 1; skip the loop if no limbs remain
  73          ldw            -12(%r30),%r1           C (delay slot) r1 = low limb of product 1
  74 C Main loop.  At the top: r19 = result limb awaiting its store, r28 = high limb
  75 C of the previous product, r1 = low limb of the newest product.
  76 LDEF(loop)
  77         fldws,ma        4(%r25),%fr5            C fetch next s1 limb; s1_ptr += 4
  78         stws,ma         %r19,4(%r26)            C store pending result limb; res_ptr += 4
  79         addc            %r28,%r1,%r19           C r19 = previous high + newest low + carry
  80         xmpyu           %fr4,%fr5,%fr6          C multiply the limb just fetched
  81         ldw             -16(%r30),%r28          C high limb of the product still in scratch
  82         fstds           %fr6,-16(%r30)          C ... then overwrite scratch with the new product
  83         addib,<>        -1,%r24,L(loop)         C size -= 1; repeat until it reaches zero
  84          ldw            -12(%r30),%r1           C (delay slot) r1 = low limb of the new product
  85 C Drain the pipeline: two result limbs are still pending, then the carry-out limb.
  86 LDEF(end)
  87         stws,ma         %r19,4(%r26)            C store pending result limb; res_ptr += 4
  88         addc            %r28,%r1,%r19           C last sum: previous high + last low + carry
  89         ldw             -16(%r30),%r28          C high limb of the last product
  90         stws,ma         %r19,4(%r26)            C store the final result limb
  91         addc            %r0,%r28,%r28           C r28 = high limb + final carry (carry-out limb)
  92         bv              0(%r2)                  C branch to the address in r2 (return)
  93          ldo            -64(%r30),%r30          C (delay slot) release the frame
  94 C size == 1: a single product, so no carry chain is needed.
  95 LDEF(just_one_limb)
  96         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * the only s1 limb
  97         fstds           %fr6,-16(%r30)          C spill the product so its high limb can be read
  98         ldw             -16(%r30),%r28          C r28 = high limb of the product
  99         ldo             -64(%r30),%r30          C release the frame
 100         bv              0(%r2)                  C branch to the address in r2 (return)
 101          fstws          %fr6R,0(%r26)           C (delay slot) store right half of fr6 (low limb) to res_ptr[0]
 102 EPILOGUE()