1 /* Copyright
2010, 2011 Free Software Foundation
, Inc.
2 Contributed by Bernd Schmidt
<bernds
@codesourcery.com
>.
4 This file is free software
; you can redistribute it and/or modify it
5 under the terms of the GNU General
Public License as published by the
6 Free Software Foundation
; either version 3, or (at your option) any
9 This file is distributed
in the hope that it will be useful
, but
10 WITHOUT ANY WARRANTY
; without even the implied warranty of
11 MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General
Public License for more details.
14 Under
Section 7 of GPL version
3, you are granted additional
15 permissions described
in the GCC Runtime Library Exception
, version
16 3.1, as published by the Free Software Foundation.
18 You should have received a copy of the GNU General
Public License
and
19 a copy of the GCC Runtime Library Exception along with
this program
;
20 see the files COPYING3
and COPYING.RUNTIME respectively. If
not, see
21 <http://www.gnu.
org/licenses
/>.
*/
23 ;; ABI considerations for the divide functions
24 ;; The following registers are call-used:
25 ;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5
26 ;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4
27 ;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4
28 ;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4
30 ;; In our implementation, divu and remu are leaf functions,
31 ;; while both divi and remi call into divu.
32 ;; A0 is not clobbered by any of the functions.
33 ;; divu does not clobber B2 either, which is taken advantage of
35 ;; divi uses B5 to hold the original return address during
37 ;; remi uses B2 and A5 to hold the input values during the
38 ;; call to divu. It stores B3 on the stack.
;; __c6xabi_divi (fragment): signed divide, implemented on top of
;; __c6xabi_divu.
;; NOTE(review): this chunk is a lossy extraction -- the function
;; label and several original lines (46-47, 49, 52-53, 56-57 per the
;; embedded numbering) are missing; comments describe only the
;; visible instructions.
45 .
type __c6xabi_divi
, STT_FUNC
;; Branch to the unsigned divide helper (delay slots filled below).
48 call .s2 __c6xabi_divu
;; In the delay slots: A1 = (A4 < 0), B1 = (B4 < 0) -- sign flags of
;; the two inputs (cmpgt 0,x sets the predicate when x is negative).
50 || cmpgt .l1
0, A4
, A1
51 || cmpgt .l2
0, B4
, B1
;; Negate the divisor when it was negative, and XOR the two sign
;; flags into A1 = "quotient must be negated" (the matching
;; conditional neg of A4 is on a line missing from this chunk).
54 ||
[B1
] neg .l2 B4
, B4
55 ||
xor .s1x A1
, B1
, A1
;; If the result needs negating, point the return address B3 at the
;; local label 1f -- presumably a negate-and-return epilogue, not
;; visible in this chunk -- so divu "returns" there instead.
58 [A1
] addkpc .s2
1f
, B3
, 4
71 #if defined L_modsi3 || defined L_divmodsi4
74 #define MOD_OUTPUT_REG A4
77 .
type __c6xabi_remi
, STT_FUNC
79 #define MOD_OUTPUT_REG A5
80 .
global __c6xabi_divremi
81 .hidden __c6xabi_divremi
82 .
type __c6xabi_divremi
, STT_FUNC
87 stw .d2t2 B3
, *B15
--[2]
88 || cmpgt .l1
0, A4
, A1
89 || cmpgt .l2
0, B4
, B2
91 ||
call .s2 __c6xabi_divu
94 ||
[B2
] neg .l2 B4
, B4
95 ||
xor .s2x B2
, A1
, B0
99 [B0
] addkpc .s2
1f
, B3
, 1
100 [!B0
] addkpc .s2
2f
, B3
, 1
112 ldw .d2t2
*++B15
[2], B3
114 #ifdef _TMS320C6400_PLUS
115 mpy32 .m1x A4
, B2
, A6
118 sub .l1 A5
, A6
, MOD_OUTPUT_REG
123 mpylhu .m1x A4
, B2
, A6
124 || mpylhu .m2x B2
, A4
, B2
130 sub .l1 A5
, A6
, MOD_OUTPUT_REG
136 #if defined L_udivsi3 || defined L_udivmodsi4
139 .
global __c6xabi_divu
140 .hidden __c6xabi_divu
141 .
type __c6xabi_divu
, STT_FUNC
144 .
global __c6xabi_divremu
145 .hidden __c6xabi_divremu
146 .
type __c6xabi_divremu
, STT_FUNC
149 ;; We use a series of up to 31 subc instructions. First, we find
150 ;; out how many leading zero bits there are in the divisor. This
151 ;; gives us both a shift count for aligning (shifting) the divisor
152 ;; to the dividend, and the number of times we have to execute subc.
154 ;; At the end, we have both the remainder and most of the quotient
155 ;; in A4. The top bit of the quotient is computed first and is
158 ;; Return immediately if the dividend is zero. Setting B4 to 1
159 ;; is a trick to allow us to leave the following insns in the jump
160 ;; delay slot without affecting the result.
166 [b1
] lmbd .l2
1, B4
, B1
167 ||
[!b1
] b .s2 B3
; RETURN A
169 ||
[!b1
] mvk .d2
1, B4
175 ||
shl .s2 B4
, B1
, B4
177 ;; The loop performs a maximum of 28 steps, so we do the
179 cmpltu .l1x A4
, B4
, A2
180 [!A2
] sub .l1x A4
, B4
, A4
181 || shru .s2 B4
, 1, B4
185 ||
[b1
] subc .l1x A4
,B4
,A4
186 ||
[b1
] add .s2
-1, B1
, B1
187 [b1
] subc .l1x A4
,B4
,A4
188 ||
[b1
] add .s2
-1, B1
, B1
190 ;; RETURN A may happen here (note: must happen before the next branch)
193 ||
[b1
] subc .l1x A4
,B4
,A4
194 ||
[b1
] add .s2
-1, B1
, B1
195 [b1
] subc .l1x A4
,B4
,A4
196 ||
[b1
] add .s2
-1, B1
, B1
198 [b1
] subc .l1x A4
,B4
,A4
199 ||
[b1
] add .s2
-1, B1
, B1
200 [b1
] subc .l1x A4
,B4
,A4
201 ||
[b1
] add .s2
-1, B1
, B1
202 [b1
] subc .l1x A4
,B4
,A4
203 ||
[b1
] add .s2
-1, B1
, B1
204 [b1
] subc .l1x A4
,B4
,A4
205 ||
[b1
] add .s2
-1, B1
, B1
206 [b1
] subc .l1x A4
,B4
,A4
207 ||
[b1
] add .s2
-1, B1
, B1
208 ;; loop backwards branch happens here
214 || extu .s1 A4
, A6
, A5
227 .
global __c6xabi_remu
228 .hidden __c6xabi_remu
229 .
type __c6xabi_remu
, STT_FUNC
231 ;; The ABI seems designed to prevent these functions calling each other,
232 ;; so we duplicate most of the divsi3 code here.
238 ||
[!b1
] b .s2 B3
; RETURN A
240 ||
[!b1
] mvk .d2
1, B4
244 ||
shl .s2 B4
, B1
, B4
246 cmpltu .l1x A4
, B4
, A1
247 [!a1
] sub .l1x A4
, B4
, A4
252 ||
[b1
] subc .l1x A4
,B4
,A4
253 ||
[b1
] add .s2
-1, B1
, B1
254 ;; RETURN A may happen here (note: must happen before the next branch)
255 [b1
] subc .l1x A4
,B4
,A4
256 ||
[b1
] add .s2
-1, B1
, B1
258 [b1
] subc .l1x A4
,B4
,A4
259 ||
[b1
] add .s2
-1, B1
, B1
260 [b1
] subc .l1x A4
,B4
,A4
261 ||
[b1
] add .s2
-1, B1
, B1
262 [b1
] subc .l1x A4
,B4
,A4
263 ||
[b1
] add .s2
-1, B1
, B1
264 [b1
] subc .l1x A4
,B4
,A4
265 ||
[b1
] add .s2
-1, B1
, B1
266 [b1
] subc .l1x A4
,B4
,A4
267 ||
[b1
] add .s2
-1, B1
, B1
268 ;; loop backwards branch happens here
271 [b1
] subc .l1x A4
,B4
,A4
272 ||
[b1
] add .s2
-1, B1
, B1
273 [b1
] subc .l1x A4
,B4
,A4
279 #if defined L_strasgi_64plus
&& defined _TMS320C6400_PLUS
282 .
global __c6xabi_strasgi_64plus
283 .hidden __c6xabi_strasgi_64plus
284 .
type __c6xabi_strasgi_64plus
, STT_FUNC
285 __c6xabi_strasgi_64plus:
294 ldw .d2t2
*b30
++, b31
298 || stw .d1t1 a31
, *a30
++
305 .
global __c6xabi_strasgi
306 .
type __c6xabi_strasgi
, STT_FUNC
308 ;; This is essentially memcpy, with alignment known to be at least
309 ;; 4, and the size a multiple of 4 greater than or equal to 28.
314 ||
sub .d1 A6
, 24, A6
321 || cmpltu .l2 B2
, B7
, B0
325 ||
[b0
] ldw .d2t1
*B4
++, A0
329 [b0
] sub .d2 B6
, 24, B7
331 || cmpltu .l2 B1
, B6
, B0
333 [b0
] ldw .d2t1
*B4
++, A1
334 || stw .d1t2 B5
, *A4
++
336 || cmpltu .l2
12, B6
, B0
338 [b0
] ldw .d2t1
*B4
++, A5
339 || stw .d1t2 B5
, *A4
++
341 || cmpltu .l2
8, B6
, B0
343 [b0
] ldw .d2t1
*B4
++, A7
344 || stw .d1t2 B5
, *A4
++
346 || cmpltu .l2
4, B6
, B0
348 [b0
] ldw .d2t1
*B4
++, A8
349 || stw .d1t2 B5
, *A4
++
351 || cmpltu .l2
0, B6
, B0
353 [b0
] ldw .d2t1
*B4
++, A9
354 || stw .d1t2 B5
, *A4
++
356 || cmpltu .l2 B2
, B7
, B0
358 ;; loop back branch happens here
360 cmpltu .l2 B1
, B6
, B0
363 [b0
] stw .d1t1 A1
, *A4
++
364 || cmpltu .l2
12, B6
, B0
365 [b0
] stw .d1t1 A5
, *A4
++
366 || cmpltu .l2
8, B6
, B0
367 [b0
] stw .d1t1 A7
, *A4
++
368 || cmpltu .l2
4, B6
, B0
369 [b0
] stw .d1t1 A8
, *A4
++
370 || cmpltu .l2
0, B6
, B0
371 [b0
] stw .d1t1 A9
, *A4
++
373 ;; return happens here
#ifdef _TMS320C6400_PLUS
;; __c6xabi_push_rts (fragment): spills the callee-saved register set
;; through B15 (the stack pointer in the C6x EABI) using
;; post-decrement addressing.
;; NOTE(review): the function label and the final branch/return are
;; not visible here (original lines 378-379, 383, 386 and everything
;; after 391 are missing from this chunk).
380 .
global __c6xabi_push_rts
381 .hidden __c6xabi_push_rts
382 .
type __c6xabi_push_rts
, STT_FUNC
;; Store B14 first; the [2]-word post-decrement leaves a pad word,
;; presumably keeping B15 doubleword-aligned for the stdw stores
;; below -- TODO confirm against the full source.
384 stw .d2t2 B14
, *B15
--[2]
385 stdw .d2t1
A15:A14
, *B15
--
387 stdw .d2t2
B13:B12
, *B15
--
388 stdw .d2t1
A13:A12
, *B15
--
389 stdw .d2t2
B11:B10
, *B15
--
390 stdw .d2t1
A11:A10
, *B15
--
391 stdw .d2t2
B3:B2
, *B15
--
;; __c6xabi_pop_rts (fragment): reloads, in reverse order, the register
;; pairs that __c6xabi_push_rts stored, using pre-increment loads
;; through the stack pointer B15.
;; NOTE(review): original line 406 and everything after 407 --
;; presumably the return branch and its delay slots -- are missing
;; from this chunk.
396 .
global __c6xabi_pop_rts
397 .hidden __c6xabi_pop_rts
398 .
type __c6xabi_pop_rts
, STT_FUNC
400 lddw .d2t2
*++B15
, B3:B2
401 lddw .d2t1
*++B15
, A11:A10
402 lddw .d2t2
*++B15
, B11:B10
403 lddw .d2t1
*++B15
, A13:A12
404 lddw .d2t2
*++B15
, B13:B12
405 lddw .d2t1
*++B15
, A15:A14
;; Last, reload B14; the [2]-word pre-increment mirrors push_rts's
;; stw *B15--[2], skipping the pad word it left.
407 ldw .d2t2
*++B15
[2], B14
;; __c6xabi_call_stub (fragment): saves A2 plus the register pairs
;; A7:A6, A1:A0, B7:B6, B5:B4, B1:B0 and B3:B2 on the stack (B15),
;; arranges for control to come back to the local label 1f, then
;; restores everything in reverse order.
;; NOTE(review): this chunk is missing the function label (original
;; line 415), the instruction the final "||" store/addkpc pair
;; executes with, the "1:" label itself (around original line 425),
;; and the return sequence after line 433 -- the comments below cover
;; only the visible save/restore ladder.
413 .
global __c6xabi_call_stub
414 .
type __c6xabi_call_stub
, STT_FUNC
;; Save A2 first; the [2]-word post-decrement leaves a pad word,
;; presumably to keep B15 aligned for the stdw stores -- confirm.
416 stw .d2t1 A2
, *B15
--[2]
417 stdw .d2t1
A7:A6
, *B15
--
419 stdw .d2t1
A1:A0
, *B15
--
420 stdw .d2t2
B7:B6
, *B15
--
421 stdw .d2t2
B5:B4
, *B15
--
422 stdw .d2t2
B1:B0
, *B15
--
;; Final push executes in parallel with addkpc, which loads B3 with
;; the address of 1f (0 NOPs requested) so the callee returns to the
;; restore sequence below.
423 stdw .d2t2
B3:B2
, *B15
--
424 || addkpc .s2
1f
, B3
, 0
;; Restore path (entered via B3 = 1f): pop the pairs back in reverse
;; order of the saves above.
426 lddw .d2t2
*++B15
, B3:B2
427 lddw .d2t2
*++B15
, B1:B0
428 lddw .d2t2
*++B15
, B5:B4
429 lddw .d2t2
*++B15
, B7:B6
430 lddw .d2t1
*++B15
, A1:A0
431 lddw .d2t1
*++B15
, A7:A6
;; Reload A2, skipping the pad word with the [2] pre-increment.
433 ldw .d2t1
*++B15
[2], A2