libgcc/config/cris/umulsidi3.S

   1 ;; Copyright (C) 2001-2024 Free Software Foundation, Inc.
   2 ;;
   3 ;; This file is part of GCC.
   4 ;;
   5 ;; GCC is free software; you can redistribute it and/or modify it under
   6 ;; the terms of the GNU General Public License as published by the Free
   7 ;; Software Foundation; either version 3, or (at your option) any later
   8 ;; version.
   9 ;;
  10 ;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  11 ;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 ;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 ;; for more details.
  14 ;;
  15 ;; Under Section 7 of GPL version 3, you are granted additional
  16 ;; permissions described in the GCC Runtime Library Exception, version
  17 ;; 3.1, as published by the Free Software Foundation.
  18 ;;
  19 ;; You should have received a copy of the GNU General Public License and
  20 ;; a copy of the GCC Runtime Library Exception along with this program;
  21 ;; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  22 ;; <http://www.gnu.org/licenses/>.
  23 ;;
  24 ;; This code is derived from mulsi3.S, observing that the mstep*16-based
  25 ;; multiplications there, from which it is formed, are actually
  26 ;; zero-extending; in gcc-speak "umulhisi3".  The difference to *this*
  27 ;; function is just a missing top mstep*16 sequence and shifts and 64-bit
  28 ;; additions for the high part.  Compared to an implementation based on
  29 ;; calling __Mul four times (see default implementation of umul_ppmm in
  30 ;; longlong.h), this will complete in a time between a fourth and a third
  31 ;; of that, assuming the value-based optimizations don't strike.  If they
  32 ;; all strike there (very often) but none here, we still win, though by a
  33 ;; lesser margin, due to lesser total overhead.
  34
     ;; Helper macros.  L prefixes a label name with a dot.  CONCAT1 goes
     ;; through CONCAT2 so that its arguments are macro-expanded before
     ;; they are pasted together.
  35 #define L(x) .x
  36 #define CONCAT1(a, b) CONCAT2(a, b)
  37 #define CONCAT2(a, b) a ## b
  38
     ;; SYM yields the assembler-level name of a global symbol: the given
     ;; name with the user label prefix pasted in front when the prefix
     ;; macro is defined, and the plain name otherwise.
  39 #ifdef __USER_LABEL_PREFIX__
  40 # define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
  41 #else
  42 # define SYM(x) x
  43 #endif
  44
     ;; __umulsidi3 -- unsigned 32 x 32 -> 64-bit multiplication.
     ;;
     ;; In:   $r10 and $r11 hold the two 32-bit operands.  In the full path
     ;;       the comments name their 16-bit halves ab ($r10) and cd ($r11).
     ;; Out:  $r10 = low 32 bits of the product, $r11 = high 32 bits.
     ;; Uses: $r9, $r12 and $r13 as scratch in the mstep-based variant.
     ;;
     ;; For arch version 10 and up a single mulu.d does the work and the high
     ;; part is fetched from $mof.  Otherwise the product is assembled from
     ;; 16 x 16 -> 32-bit partial products, each formed by a run of 16 mstep
     ;; insns whose destination starts out as one factor shifted left 16.
     ;; Three cases are told apart up front: both operands above 65535 (full
     ;; four-product path), only the smaller one at most 65535 (at L3), and
     ;; both at most 65535 (at L5).
     ;;
     ;; The insn right after each branch and ret sits in the delay slot and
     ;; is executed before control is transferred, taken or not.
  45         .global SYM(__umulsidi3)
  46         .type   SYM(__umulsidi3),@function
  47 SYM(__umulsidi3):
  48 #if defined (__CRIS_arch_version) && __CRIS_arch_version >= 10
  49 ;; Can't have the mulu.d last on a cache-line, due to a hardware bug.  See
  50 ;; the documentation for -mmul-bug-workaround.
  51 ;; Not worthwhile to conditionalize here.
  52         .p2alignw 2,0x050f
  53         mulu.d $r11,$r10
  54         ret
  55         move $mof,$r11 ; Delay slot: high 32 bits of the product.
  56 #else
     ;; $r9 = min ($r10, $r11).  If the smaller operand fits in 16 bits, take
     ;; the cheaper paths at L3.  The delay-slot move keeps a copy of ab
     ;; in $r12 for the full path.
  57         move.d $r11,$r9
  58         bound.d $r10,$r9
  59         cmpu.w 65535,$r9
  60         bls L(L3)
  61         move.d $r10,$r12
  62
  63         move.d $r10,$r13
  64         movu.w $r11,$r9 ; ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d
  65
  66 ;; We're called for floating point numbers very often with the "low" 16
  67 ;; bits zero, so it's worthwhile to optimize for that.
  68
  69         beq L(L6)       ; d == 0?
  70         lslq 16,$r13 ; Delay slot: $r13 = b<<16; also the flags for the b test.
  71
  72         beq L(L7)       ; b == 0?
  73         clear.w $r10 ; Delay slot: $r10 = a<<16.
  74
  75         mstep $r9,$r13  ; d*b
  76         mstep $r9,$r13
  77         mstep $r9,$r13
  78         mstep $r9,$r13
  79         mstep $r9,$r13
  80         mstep $r9,$r13
  81         mstep $r9,$r13
  82         mstep $r9,$r13
  83         mstep $r9,$r13
  84         mstep $r9,$r13
  85         mstep $r9,$r13
  86         mstep $r9,$r13
  87         mstep $r9,$r13
  88         mstep $r9,$r13
  89         mstep $r9,$r13
  90         mstep $r9,$r13
  91
  92 L(L7):
  93         test.d $r10 ; Set the flags from a<<16 ahead of the first mstep.
  94         mstep $r9,$r10  ; d*a
  95         mstep $r9,$r10
  96         mstep $r9,$r10
  97         mstep $r9,$r10
  98         mstep $r9,$r10
  99         mstep $r9,$r10
 100         mstep $r9,$r10
 101         mstep $r9,$r10
 102         mstep $r9,$r10
 103         mstep $r9,$r10
 104         mstep $r9,$r10
 105         mstep $r9,$r10
 106         mstep $r9,$r10
 107         mstep $r9,$r10
 108         mstep $r9,$r10
 109         mstep $r9,$r10
 110
 111 ;; d*a in $r10, d*b in $r13, ab in $r12 and cd in $r11
 112 ;; $r9 = d, need to do b*c and a*c; we can drop d.
 113 ;; so $r9 is up for use and we can shift down $r11 as the mstep
 114 ;; source for the next mstep-part.
 115
 116 L(L8):
 117         lsrq 16,$r11 ; $r11 = c
 118         move.d $r12,$r9
 119         lslq 16,$r9 ; $r9 = b<<16; also the flags for the b test.
 120         beq L(L9)       ; b == 0?
 121         mstep $r11,$r9 ; Delay slot: first of the 16 steps of b*c.
 122
 123         mstep $r11,$r9  ; b*c
 124         mstep $r11,$r9
 125         mstep $r11,$r9
 126         mstep $r11,$r9
 127         mstep $r11,$r9
 128         mstep $r11,$r9
 129         mstep $r11,$r9
 130         mstep $r11,$r9
 131         mstep $r11,$r9
 132         mstep $r11,$r9
 133         mstep $r11,$r9
 134         mstep $r11,$r9
 135         mstep $r11,$r9
 136         mstep $r11,$r9
 137         mstep $r11,$r9
 138 L(L9):
 139
 140 ;; d*a in $r10, d*b in $r13, c*b in $r9, ab in $r12 and c in $r11,
 141 ;; need to do a*c.  We want that to end up in $r11, so we shift up $r11 to
 142 ;; now use as the destination operand.  We'd need a test insn to update N
 143 ;; to do it the other way round.
 144
 145         lsrq 16,$r12 ; $r12 = a
 146         lslq 16,$r11 ; $r11 = c<<16
 147         mstep $r12,$r11
 148         mstep $r12,$r11
 149         mstep $r12,$r11
 150         mstep $r12,$r11
 151         mstep $r12,$r11
 152         mstep $r12,$r11
 153         mstep $r12,$r11
 154         mstep $r12,$r11
 155         mstep $r12,$r11
 156         mstep $r12,$r11
 157         mstep $r12,$r11
 158         mstep $r12,$r11
 159         mstep $r12,$r11
 160         mstep $r12,$r11
 161         mstep $r12,$r11
 162         mstep $r12,$r11
 163
 164 ;; d*a in $r10, d*b in $r13, c*b in $r9, a*c in $r11 ($r12 free).
 165 ;; Need (a*d + b*c)<<16 + b*d into $r10 and
 166 ;; a*c + (a*d + b*c)>>16 plus carry from the additions into $r11.
 167
 168         add.d $r9,$r10  ; (a*d + b*c) - may produce a carry.
 169         scs $r12        ; The carry corresponds to bit 16 of $r11.
 170         lslq 16,$r12
 171         add.d $r12,$r11 ; $r11 = a*c + carry from (a*d + b*c).
 172
 173 #if defined (__CRIS_arch_version) && __CRIS_arch_version >= 8
 174         swapw $r10
 175         addu.w $r10,$r11 ; $r11 = a*c + (a*d + b*c) >> 16 including carry.
 176         clear.w $r10    ; $r10 = (a*d + b*c) << 16
 177 #else
 178         move.d $r10,$r9
 179         lsrq 16,$r9
 180         add.d $r9,$r11  ; $r11 = a*c + (a*d + b*c) >> 16 including carry.
 181         lslq 16,$r10    ; $r10 = (a*d + b*c) << 16
 182 #endif
 183         add.d $r13,$r10 ; $r10 = (a*d + b*c) << 16 + b*d - may produce a carry.
 184         scs $r9
 185         ret
 186         add.d $r9,$r11  ; Last carry added to the high-order 32 bits.
 187
     ;; d == 0 in the full path: both d*b and d*a are zero, so clear their
     ;; registers and go straight to the b*c and a*c products.
 188 L(L6):
 189         clear.d $r13
 190         ba L(L8)
 191         clear.d $r10 ; Delay slot.
 192
     ;; Reached from the L3 path when the smaller operand is zero:
     ;; the whole 64-bit product is zero.
 193 L(L11):
 194         clear.d $r10
 195         ret
 196         clear.d $r11 ; Delay slot.
 197
 198 L(L3):
 199 ;; Form the maximum in $r10, by knowing the minimum, $r9.
 200 ;; (We don't know which one of $r10 or $r11 it is.)
 201 ;; Check if the largest operand is still just 16 bits.
 202
 203         xor $r9,$r10
 204         xor $r11,$r10 ; $r10 ^ min ^ $r11 leaves the maximum in $r10.
 205         cmpu.w 65535,$r10
 206         bls L(L5)
 207         movu.w $r9,$r13 ; Delay slot: $r13 = min; also the flags for the zero test.
 208
 209 ;; We have ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d, but c==0
 210 ;; so we only need (a*d)<<16 + b*d with d = $r13, ab = $r10.
 211 ;; Remember that the upper part of (a*d)<<16 goes into the lower part
 212 ;; of $r11 and there may be a carry from adding the low 32 parts.
 213         beq L(L11)      ; d == 0?
 214         move.d $r10,$r9 ; Delay slot: copy ab for the b*d product.
 215
 216         lslq 16,$r9 ; $r9 = b<<16; also the flags for the b test.
 217         beq L(L10)      ; b == 0?
 218         clear.w $r10 ; Delay slot: $r10 = a<<16.
 219
 220         mstep $r13,$r9  ; b*d
 221         mstep $r13,$r9
 222         mstep $r13,$r9
 223         mstep $r13,$r9
 224         mstep $r13,$r9
 225         mstep $r13,$r9
 226         mstep $r13,$r9
 227         mstep $r13,$r9
 228         mstep $r13,$r9
 229         mstep $r13,$r9
 230         mstep $r13,$r9
 231         mstep $r13,$r9
 232         mstep $r13,$r9
 233         mstep $r13,$r9
 234         mstep $r13,$r9
 235         mstep $r13,$r9
 236 L(L10):
 237         test.d $r10 ; Set the flags from a<<16 ahead of the first mstep.
 238         mstep $r13,$r10 ; a*d
 239         mstep $r13,$r10
 240         mstep $r13,$r10
 241         mstep $r13,$r10
 242         mstep $r13,$r10
 243         mstep $r13,$r10
 244         mstep $r13,$r10
 245         mstep $r13,$r10
 246         mstep $r13,$r10
 247         mstep $r13,$r10
 248         mstep $r13,$r10
 249         mstep $r13,$r10
 250         mstep $r13,$r10
 251         mstep $r13,$r10
 252         mstep $r13,$r10
 253         mstep $r13,$r10
 254         move.d $r10,$r11
 255         lsrq 16,$r11 ; $r11 = (a*d) >> 16, the high-part contribution.
 256         lslq 16,$r10 ; $r10 = (a*d) << 16
 257         add.d $r9,$r10 ; $r10 = (a*d) << 16 + b*d - may produce a carry.
 258         scs $r12 ; $r12 = that carry.
 259         ret
 260         add.d $r12,$r11 ; Delay slot: carry added to the high-order 32 bits.
 261
 262 L(L5):
 263 ;; We have ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d, but a and c==0
 264 ;; so b*d (with min=b=$r13, max=d=$r10) it is.  As it won't overflow the
 265 ;; 32-bit part, just set $r11 to 0.
 266
 267         lslq 16,$r10
 268         clear.d $r11
 269
 270         mstep $r13,$r10
 271         mstep $r13,$r10
 272         mstep $r13,$r10
 273         mstep $r13,$r10
 274         mstep $r13,$r10
 275         mstep $r13,$r10
 276         mstep $r13,$r10
 277         mstep $r13,$r10
 278         mstep $r13,$r10
 279         mstep $r13,$r10
 280         mstep $r13,$r10
 281         mstep $r13,$r10
 282         mstep $r13,$r10
 283         mstep $r13,$r10
 284         mstep $r13,$r10
 285         ret
 286         mstep $r13,$r10 ; Delay slot: the 16th and last step.
 287 #endif
 288 L(Lfe1):
 289         .size   SYM(__umulsidi3),L(Lfe1)-SYM(__umulsidi3)