sysdeps/alpha/lshift.s

   1  # Alpha 21064 __mpn_lshift --
   2
   3  # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
   4
   5  # This file is part of the GNU MP Library.
   6
   7  # The GNU MP Library is free software; you can redistribute it and/or modify
   8  # it under the terms of the GNU Library General Public License as published by
   9  # the Free Software Foundation; either version 2 of the License, or (at your
  10  # option) any later version.
  11
  12  # The GNU MP Library is distributed in the hope that it will be useful, but
  13  # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14  # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  15  # License for more details.
  16
  17  # You should have received a copy of the GNU Library General Public License
  18  # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  19  # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21
  22  # INPUT PARAMETERS
  23  # res_ptr      r16
  24  # s1_ptr       r17
  25  # size         r18
  26  # cnt          r19
  27
  28  # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
  29  # it would take 4 cycles/limb.  It should be possible to get down to 3
  30  # cycles/limb since both ldq and stq can be paired with the other used
  31  # instructions.  But there are many restrictions in the 21064 pipeline that
  32  # make it hard, if not impossible, to get down to 3 cycles/limb:
  33
  34  # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
  35  # 2. Only aligned instruction pairs can be paired.
  36  # 3. The store buffer or silo might not be able to deal with the bandwidth.
  37
  38         .set    noreorder       # keep the hand-scheduled instruction order
  39         .set    noat            # assembler must not use $at as a scratch register
  40 .text
  41         .align  3               # 2^3 = 8-byte boundary (pairing needs aligned pairs)
      #
      # __mpn_lshift: shift the size-limb number at s1_ptr left by cnt bits and
      # store the size-limb result at res_ptr.  Limbs are 64-bit quadwords; they
      # are processed from the highest address down to the lowest.
      #
      # Result in r0: the highest-addressed source limb shifted right by
      # (-cnt mod 64), i.e. the bits pushed out of the top of the number.
      #
      # Register roles inside the routine:
      #   r4 (alternating with r3 in the unrolled loop)  limb being shifted left
      #   r7                                             -cnt, the right-shift count
      #   r20                                            iterations left in Loop0
      #   r18                                            limbs left for Loop
      # Registers written: r0-r8, r16-r18 and r20.  r19 is only read.  The stack
      # is not touched.
      #
      # NOTE(review): the top limb is loaded before size is examined and cnt is
      # never range-checked, so callers presumably guarantee size >= 1 and
      # 0 < cnt < 64 -- confirm against the callers.
      #
  42         .globl  __mpn_lshift
  43         .ent    __mpn_lshift
  44 __mpn_lshift:
  45         .frame  $30,0,$26,0     # zero-size frame, return address in r26
  46
  47         s8addq  $18,$17,$17     # make r17 point at end of s1
  48         ldq     $4,-8($17)      # load first limb
  49         subq    $17,8,$17       # r17 now points at the limb just loaded
  50         subq    $31,$19,$7      # r7 = 0 - cnt; srl uses it modulo 64
  51         s8addq  $18,$16,$16     # make r16 point at end of RES
  52         subq    $18,1,$18       # r18 = size-1 = limbs still to be loaded
  53         and     $18,4-1,$20     # number of limbs in first loop
  54         srl     $4,$7,$0        # compute function result
  55
  56         beq     $20,L0          # (size-1) is a multiple of 4: skip Loop0
  57         subq    $18,$20,$18     # r18 = limbs left for the 4-way loop
  58
  59         .align  3
  60 Loop0:                          # one result limb per pass, r20 passes
  61         ldq     $3,-8($17)      # r3 = next lower source limb
  62         subq    $16,8,$16       # step result pointer down one limb
  63         subq    $17,8,$17       # step source pointer down one limb
  64         subq    $20,1,$20
  65         sll     $4,$19,$5       # high part: current limb << cnt
  66         srl     $3,$7,$6        # low part: top bits of the next lower limb
  67         bis     $3,$3,$4        # r4 = r3: lower limb becomes the current one
  68         bis     $5,$6,$8        # merge high and low parts
  69         stq     $8,0($16)
  70         bne     $20,Loop0
  71
  72 L0:     beq     $18,Lend        # no full group of 4 left
  73
  74         .align  3
  75 Loop:   ldq     $3,-8($17)      # 4 result limbs per pass; r3/r4 alternate
  76         subq    $16,32,$16      # result pointer down 4 limbs; stores use +24..+0
  77         subq    $18,4,$18       # 4 fewer limbs to go
  78         sll     $4,$19,$5
  79         srl     $3,$7,$6
  80
  81         ldq     $4,-16($17)
  82         sll     $3,$19,$1
  83         bis     $5,$6,$8
  84         stq     $8,24($16)      # highest result limb of this group
  85         srl     $4,$7,$2
  86
  87         ldq     $3,-24($17)
  88         sll     $4,$19,$5
  89         bis     $1,$2,$8
  90         stq     $8,16($16)
  91         srl     $3,$7,$6
  92
  93         ldq     $4,-32($17)     # r4 carries over to the next pass or to Lend
  94         sll     $3,$19,$1
  95         bis     $5,$6,$8
  96         stq     $8,8($16)
  97         srl     $4,$7,$2
  98
  99         subq    $17,32,$17      # source pointer down 4 limbs
 100         bis     $1,$2,$8
 101         stq     $8,0($16)       # lowest result limb of this group
 102
 103         bgt     $18,Loop
 104
 105 Lend:   sll     $4,$19,$8       # last limb: nothing below it to merge in
 106         stq     $8,-8($16)      # store the final (lowest-addressed) result limb
 107         ret     $31,($26),1     # return to the address in r26
 108         .end    __mpn_lshift