sysdeps/powerpc/powerpc64/power4/memcmp.S

   1 /* Optimized memcmp implementation for PowerPC64.
   2    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
  25
  26         .machine power4
  27 EALIGN (BP_SYM(memcmp), 4, 0)
  28         CALL_MCOUNT 3
  29
  30 #define rTMP    r0
  31 #define rRTN    r3
  32 #define rSTR1   r3      /* first string arg */
  33 #define rSTR2   r4      /* second string arg */
  34 #define rN      r5      /* max string length */
  35 /* Note:  The Bounded pointer support in this code is broken.  This code
  36    was inherited from PPC32 and that support was never completed.
  37    Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
  38 #define rWORD1  r6      /* current word in s1 */
  39 #define rWORD2  r7      /* current word in s2 */
  40 #define rWORD3  r8      /* next word in s1 */
  41 #define rWORD4  r9      /* next word in s2 */
  42 #define rWORD5  r10     /* next word in s1 */
  43 #define rWORD6  r11     /* next word in s2 */
  44 #define rBITDIF r12     /* bits that differ in s1 & s2 words */
  45 #define rWORD7  r30     /* next word in s1 */
  46 #define rWORD8  r31     /* next word in s2 */
  47
  48         xor     rTMP, rSTR2, rSTR1
  49         cmpldi  cr6, rN, 0
  50         cmpldi  cr1, rN, 12
  51         clrldi. rTMP, rTMP, 61
  52         clrldi  rBITDIF, rSTR1, 61
  53         cmpldi  cr5, rBITDIF, 0
  54         beq-    cr6, L(zeroLength)
  55         dcbt    0,rSTR1
  56         dcbt    0,rSTR2
  57 /* If less than 12 bytes, use the byte loop.  Otherwise, strings with
  58    different alignment are handled at L(unaligned).  */
  59         blt     cr1, L(bytealigned)
  60         std     rWORD8,-8(r1)
  61         cfi_offset(rWORD8,-8)
  62         std     rWORD7,-16(r1)
  63         cfi_offset(rWORD7,-16)
  64         bne     L(unaligned)
  65 /* At this point we know both strings have the same alignment and the
  66    compare length is at least 8 bytes.  rBITDIF contains the low order
  67    3 bits of rSTR1 and cr5 contains the result of the logical compare
  68    of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
  69    aligned and can perform the DWaligned loop.
  70
  71    Otherwise we know the two strings have the same alignment (but not
  72    yet DW).  So we can force the string addresses to the next lower DW
  73    boundary and special case this first DW using shift left to
  74    eliminate bits preceding the first byte.  Since we want to join the
  75    normal (DWaligned) compare loop, starting at the second double word,
  76    we need to adjust the length (rN) and special case the loop
  77    versioning for the first DW.  This ensures that the loop count is
  78    correct and the first DW (shifted) is in the expected register pair.  */
  79         .align 4
  80 L(samealignment):
  81         clrrdi  rSTR1, rSTR1, 3
  82         clrrdi  rSTR2, rSTR2, 3
  83         beq     cr5, L(DWaligned)
  84         add     rN, rN, rBITDIF
  85         sldi    r11, rBITDIF, 3
  86         srdi    rTMP, rN, 5     /* Divide by 32 */
  87         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
  88         ld      rWORD1, 0(rSTR1)
  89         ld      rWORD2, 0(rSTR2)
  90         cmpldi  cr1, rBITDIF, 16
  91         cmpldi  cr7, rN, 32
  92         clrldi  rN, rN, 61
  93         beq     L(dPs4)
  94         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
  95         bgt     cr1, L(dPs3)
  96         beq     cr1, L(dPs2)
  97
  98 /* Remainder is 8 */
  99         .align 3
 100 L(dsP1):
 101         sld     rWORD5, rWORD1, r11
 102         sld     rWORD6, rWORD2, r11
 103         cmpld   cr5, rWORD5, rWORD6
 104         blt     cr7, L(dP1x)
 105 /* Do something useful in this cycle since we have to branch anyway.  */
 106         ld      rWORD1, 8(rSTR1)
 107         ld      rWORD2, 8(rSTR2)
 108         cmpld   cr0, rWORD1, rWORD2
 109         b       L(dP1e)
 110 /* Remainder is 16 */
 111         .align 4
 112 L(dPs2):
 113         sld     rWORD5, rWORD1, r11
 114         sld     rWORD6, rWORD2, r11
 115         cmpld   cr6, rWORD5, rWORD6
 116         blt     cr7, L(dP2x)
 117 /* Do something useful in this cycle since we have to branch anyway.  */
 118         ld      rWORD7, 8(rSTR1)
 119         ld      rWORD8, 8(rSTR2)
 120         cmpld   cr5, rWORD7, rWORD8
 121         b       L(dP2e)
 122 /* Remainder is 24 */
 123         .align 4
 124 L(dPs3):
 125         sld     rWORD3, rWORD1, r11
 126         sld     rWORD4, rWORD2, r11
 127         cmpld   cr1, rWORD3, rWORD4
 128         b       L(dP3e)
 129 /* Count is a multiple of 32, remainder is 0 */
 130         .align 4
 131 L(dPs4):
 132         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 133         sld     rWORD1, rWORD1, r11
 134         sld     rWORD2, rWORD2, r11
 135         cmpld   cr0, rWORD1, rWORD2
 136         b       L(dP4e)
 137
 138 /* At this point we know both strings are double word aligned and the
 139    compare length is at least 8 bytes.  */
 140         .align 4
 141 L(DWaligned):
 142         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 143         srdi    rTMP, rN, 5     /* Divide by 32 */
 144         cmpldi  cr1, rBITDIF, 16
 145         cmpldi  cr7, rN, 32
 146         clrldi  rN, rN, 61
 147         beq     L(dP4)
 148         bgt     cr1, L(dP3)
 149         beq     cr1, L(dP2)
 150
 151 /* Remainder is 8 */
 152         .align 4
 153 L(dP1):
 154         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 155 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
 156    (8-15 byte compare), we want to use only volatile registers.  This
 157    means we can avoid restoring non-volatile registers since we did not
 158    change any on the early exit path.  The key here is the non-early
 159    exit path only cares about the condition code (cr5), not about which
 160    register pair was used.  */
 161         ld      rWORD5, 0(rSTR1)
 162         ld      rWORD6, 0(rSTR2)
 163         cmpld   cr5, rWORD5, rWORD6
 164         blt     cr7, L(dP1x)
 165         ld      rWORD1, 8(rSTR1)
 166         ld      rWORD2, 8(rSTR2)
 167         cmpld   cr0, rWORD1, rWORD2
 168 L(dP1e):
 169         ld      rWORD3, 16(rSTR1)
 170         ld      rWORD4, 16(rSTR2)
 171         cmpld   cr1, rWORD3, rWORD4
 172         ld      rWORD5, 24(rSTR1)
 173         ld      rWORD6, 24(rSTR2)
 174         cmpld   cr6, rWORD5, rWORD6
 175         bne     cr5, L(dLcr5)
 176         bne     cr0, L(dLcr0)
 177
 178         ldu     rWORD7, 32(rSTR1)
 179         ldu     rWORD8, 32(rSTR2)
 180         bne     cr1, L(dLcr1)
 181         cmpld   cr5, rWORD7, rWORD8
 182         bdnz    L(dLoop)
 183         bne     cr6, L(dLcr6)
 184         ld      rWORD8,-8(r1)
 185         ld      rWORD7,-16(r1)
 186         .align 3
 187 L(dP1x):
 188         sldi.   r12, rN, 3
 189         bne     cr5, L(dLcr5)
 190         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 191         bne     L(d00)
 192         li      rRTN, 0
 193         blr
 194
 195 /* Remainder is 16 */
 196         .align 4
 197 L(dP2):
 198         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 199         ld      rWORD5, 0(rSTR1)
 200         ld      rWORD6, 0(rSTR2)
 201         cmpld   cr6, rWORD5, rWORD6
 202         blt     cr7, L(dP2x)
 203         ld      rWORD7, 8(rSTR1)
 204         ld      rWORD8, 8(rSTR2)
 205         cmpld   cr5, rWORD7, rWORD8
 206 L(dP2e):
 207         ld      rWORD1, 16(rSTR1)
 208         ld      rWORD2, 16(rSTR2)
 209         cmpld   cr0, rWORD1, rWORD2
 210         ld      rWORD3, 24(rSTR1)
 211         ld      rWORD4, 24(rSTR2)
 212         cmpld   cr1, rWORD3, rWORD4
 213         addi    rSTR1, rSTR1, 8
 214         addi    rSTR2, rSTR2, 8
 215         bne     cr6, L(dLcr6)
 216         bne     cr5, L(dLcr5)
 217         b       L(dLoop2)
 218 /* Again we are on an early exit path (16-23 byte compare), we want to
 219    only use volatile registers and avoid restoring non-volatile
 220    registers.  */
 221         .align 4
 222 L(dP2x):
 223         ld      rWORD3, 8(rSTR1)
 224         ld      rWORD4, 8(rSTR2)
 225         cmpld   cr5, rWORD3, rWORD4
 226         sldi.   r12, rN, 3
 227         bne     cr6, L(dLcr6)
 228         addi    rSTR1, rSTR1, 8
 229         addi    rSTR2, rSTR2, 8
 230         bne     cr5, L(dLcr5)
 231         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 232         bne     L(d00)
 233         li      rRTN, 0
 234         blr
 235
 236 /* Remainder is 24 */
 237         .align 4
 238 L(dP3):
 239         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 240         ld      rWORD3, 0(rSTR1)
 241         ld      rWORD4, 0(rSTR2)
 242         cmpld   cr1, rWORD3, rWORD4
 243 L(dP3e):
 244         ld      rWORD5, 8(rSTR1)
 245         ld      rWORD6, 8(rSTR2)
 246         cmpld   cr6, rWORD5, rWORD6
 247         blt     cr7, L(dP3x)
 248         ld      rWORD7, 16(rSTR1)
 249         ld      rWORD8, 16(rSTR2)
 250         cmpld   cr5, rWORD7, rWORD8
 251         ld      rWORD1, 24(rSTR1)
 252         ld      rWORD2, 24(rSTR2)
 253         cmpld   cr0, rWORD1, rWORD2
 254         addi    rSTR1, rSTR1, 16
 255         addi    rSTR2, rSTR2, 16
 256         bne     cr1, L(dLcr1)
 257         bne     cr6, L(dLcr6)
 258         b       L(dLoop1)
 259 /* Again we are on an early exit path (24-31 byte compare), we want to
 260    only use volatile registers and avoid restoring non-volatile
 261    registers.  */
 262         .align 4
 263 L(dP3x):
 264         ld      rWORD1, 16(rSTR1)
 265         ld      rWORD2, 16(rSTR2)
 266         cmpld   cr5, rWORD1, rWORD2
 267         sldi.   r12, rN, 3
 268         bne     cr1, L(dLcr1)
 269         addi    rSTR1, rSTR1, 16
 270         addi    rSTR2, rSTR2, 16
 271         bne     cr6, L(dLcr6)
 272         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 273         bne     cr5, L(dLcr5)
 274         bne     L(d00)
 275         li      rRTN, 0
 276         blr
 277
 278 /* Count is a multiple of 32, remainder is 0 */
 279         .align 4
 280 L(dP4):
 281         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 282         ld      rWORD1, 0(rSTR1)
 283         ld      rWORD2, 0(rSTR2)
 284         cmpld   cr0, rWORD1, rWORD2
 285 L(dP4e):
 286         ld      rWORD3, 8(rSTR1)
 287         ld      rWORD4, 8(rSTR2)
 288         cmpld   cr1, rWORD3, rWORD4
 289         ld      rWORD5, 16(rSTR1)
 290         ld      rWORD6, 16(rSTR2)
 291         cmpld   cr6, rWORD5, rWORD6
 292         ldu     rWORD7, 24(rSTR1)
 293         ldu     rWORD8, 24(rSTR2)
 294         cmpld   cr5, rWORD7, rWORD8
 295         bne     cr0, L(dLcr0)
 296         bne     cr1, L(dLcr1)
 297         bdz-    L(d24)          /* Adjust CTR as we start with +4 */
 298 /* This is the primary loop */
 299         .align 4
 300 L(dLoop):
 301         ld      rWORD1, 8(rSTR1)
 302         ld      rWORD2, 8(rSTR2)
 303         cmpld   cr1, rWORD3, rWORD4
 304         bne     cr6, L(dLcr6)
 305 L(dLoop1):
 306         ld      rWORD3, 16(rSTR1)
 307         ld      rWORD4, 16(rSTR2)
 308         cmpld   cr6, rWORD5, rWORD6
 309         bne     cr5, L(dLcr5)
 310 L(dLoop2):
 311         ld      rWORD5, 24(rSTR1)
 312         ld      rWORD6, 24(rSTR2)
 313         cmpld   cr5, rWORD7, rWORD8
 314         bne     cr0, L(dLcr0)
 315 L(dLoop3):
 316         ldu     rWORD7, 32(rSTR1)
 317         ldu     rWORD8, 32(rSTR2)
 318         bne-    cr1, L(dLcr1)
 319         cmpld   cr0, rWORD1, rWORD2
 320         bdnz+   L(dLoop)
 321
 322 L(dL4):
 323         cmpld   cr1, rWORD3, rWORD4
 324         bne     cr6, L(dLcr6)
 325         cmpld   cr6, rWORD5, rWORD6
 326         bne     cr5, L(dLcr5)
 327         cmpld   cr5, rWORD7, rWORD8
 328 L(d44):
 329         bne     cr0, L(dLcr0)
 330 L(d34):
 331         bne     cr1, L(dLcr1)
 332 L(d24):
 333         bne     cr6, L(dLcr6)
 334 L(d14):
 335         sldi.   r12, rN, 3
 336         bne     cr5, L(dLcr5)
 337 L(d04):
 338         ld      rWORD8,-8(r1)
 339         ld      rWORD7,-16(r1)
 340         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 341         beq     L(zeroLength)
 342 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
 343    we are aligned it is safe to load the whole double word, and use
 344    shift right double to eliminate bits beyond the compare length.  */
 345 L(d00):
 346         ld      rWORD1, 8(rSTR1)
 347         ld      rWORD2, 8(rSTR2)
 348         srd     rWORD1, rWORD1, rN
 349         srd     rWORD2, rWORD2, rN
 350         cmpld   cr5, rWORD1, rWORD2
 351         bne     cr5, L(dLcr5x)
 352         li      rRTN, 0
 353         blr
 354         .align 4
 355 L(dLcr0):
 356         ld      rWORD8,-8(r1)
 357         ld      rWORD7,-16(r1)
 358         li      rRTN, 1
 359         bgtlr   cr0
 360         li      rRTN, -1
 361         blr
 362         .align 4
 363 L(dLcr1):
 364         ld      rWORD8,-8(r1)
 365         ld      rWORD7,-16(r1)
 366         li      rRTN, 1
 367         bgtlr   cr1
 368         li      rRTN, -1
 369         blr
 370         .align 4
 371 L(dLcr6):
 372         ld      rWORD8,-8(r1)
 373         ld      rWORD7,-16(r1)
 374         li      rRTN, 1
 375         bgtlr   cr6
 376         li      rRTN, -1
 377         blr
 378         .align 4
 379 L(dLcr5):
 380         ld      rWORD8,-8(r1)
 381         ld      rWORD7,-16(r1)
 382 L(dLcr5x):
 383         li      rRTN, 1
 384         bgtlr   cr5
 385         li      rRTN, -1
 386         blr
 387
 388         .align 4
 389 L(bytealigned):
 390         mtctr   rN      /* Power4 wants mtctr 1st in dispatch group */
 391         beq-    cr6, L(zeroLength)
 392
 393 /* We need to prime this loop.  This loop is swing modulo scheduled
 394    to avoid pipe delays.  The dependent instruction latency (load to
 395    compare to conditional branch) is 2 to 3 cycles.  In this loop each
 396    dispatch group ends in a branch and takes 1 cycle.  Effectively
 397    the first iteration of the loop only serves to load operands and
 398    branches based on compares are delayed until the next loop.
 399
 400    So we must precondition some registers and condition codes so that
 401    we don't exit the loop early on the first iteration.  */
 402
 403         lbz     rWORD1, 0(rSTR1)
 404         lbz     rWORD2, 0(rSTR2)
 405         bdz-    L(b11)
 406         cmpld   cr0, rWORD1, rWORD2
 407         lbz     rWORD3, 1(rSTR1)
 408         lbz     rWORD4, 1(rSTR2)
 409         bdz-    L(b12)
 410         cmpld   cr1, rWORD3, rWORD4
 411         lbzu    rWORD5, 2(rSTR1)
 412         lbzu    rWORD6, 2(rSTR2)
 413         bdz-    L(b13)
 414         .align 4
 415 L(bLoop):
 416         lbzu    rWORD1, 1(rSTR1)
 417         lbzu    rWORD2, 1(rSTR2)
 418         bne-    cr0, L(bLcr0)
 419
 420         cmpld   cr6, rWORD5, rWORD6
 421         bdz-    L(b3i)
 422
 423         lbzu    rWORD3, 1(rSTR1)
 424         lbzu    rWORD4, 1(rSTR2)
 425         bne-    cr1, L(bLcr1)
 426
 427         cmpld   cr0, rWORD1, rWORD2
 428         bdz-    L(b2i)
 429
 430         lbzu    rWORD5, 1(rSTR1)
 431         lbzu    rWORD6, 1(rSTR2)
 432         bne-    cr6, L(bLcr6)
 433
 434         cmpld   cr1, rWORD3, rWORD4
 435         bdnz+   L(bLoop)
 436
 437 /* We speculatively load bytes before we have tested the previous
 438    bytes.  But we must avoid overrunning the length (in the ctr) to
 439    prevent these speculative loads from causing a segfault.  In this
 440    case the loop will exit early (before all the pending bytes are
 441    tested).  In this case we must complete the pending operations
 442    before returning.  */
 443 L(b1i):
 444         bne-    cr0, L(bLcr0)
 445         bne-    cr1, L(bLcr1)
 446         b       L(bx56)
 447         .align 4
 448 L(b2i):
 449         bne-    cr6, L(bLcr6)
 450         bne-    cr0, L(bLcr0)
 451         b       L(bx34)
 452         .align 4
 453 L(b3i):
 454         bne-    cr1, L(bLcr1)
 455         bne-    cr6, L(bLcr6)
 456         b       L(bx12)
 457         .align 4
 458 L(bLcr0):
 459         li      rRTN, 1
 460         bgtlr   cr0
 461         li      rRTN, -1
 462         blr
 463 L(bLcr1):
 464         li      rRTN, 1
 465         bgtlr   cr1
 466         li      rRTN, -1
 467         blr
 468 L(bLcr6):
 469         li      rRTN, 1
 470         bgtlr   cr6
 471         li      rRTN, -1
 472         blr
 473
 474 L(b13):
 475         bne-    cr0, L(bx12)
 476         bne-    cr1, L(bx34)
 477 L(bx56):
 478         sub     rRTN, rWORD5, rWORD6
 479         blr
 480         nop
 481 L(b12):
 482         bne-    cr0, L(bx12)
 483 L(bx34):
 484         sub     rRTN, rWORD3, rWORD4
 485         blr
 486 L(b11):
 487 L(bx12):
 488         sub     rRTN, rWORD1, rWORD2
 489         blr
 490         .align 4
 491 L(zeroLengthReturn):
 492         ld      rWORD8,-8(r1)
 493         ld      rWORD7,-16(r1)
 494 L(zeroLength):
 495         li      rRTN, 0
 496         blr
 497
 498         .align 4
 499 /* At this point we know the strings have different alignment and the
 500    compare length is at least 8 bytes.  rBITDIF contains the low order
 501    3 bits of rSTR1 and cr5 contains the result of the logical compare
 502    of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
 503    aligned and can perform the DWunaligned loop.
 504
 505    Otherwise we know that rSTR1 is not already DW aligned.
 506    So we can force the string addresses to the next lower DW
 507    boundary and special case this first DW using shift left to
 508    eliminate bits preceding the first byte.  Since we want to join the
 509    normal (DWunaligned) compare loop, starting at the second double word,
 510    we need to adjust the length (rN) and special case the loop
 511    versioning for the first DW.  This ensures that the loop count is
 512    correct and the first DW (shifted) is in the expected register pair.  */
 513 #define rSHL    r29     /* Unaligned shift left count.  */
 514 #define rSHR    r28     /* Unaligned shift right count.  */
 515 #define rB              r27     /* Left rotation temp for rWORD2.  */
 516 #define rD              r26     /* Left rotation temp for rWORD4.  */
 517 #define rF              r25     /* Left rotation temp for rWORD6.  */
 518 #define rH              r24     /* Left rotation temp for rWORD8.  */
 519 #define rA              r0      /* Right rotation temp for rWORD2.  */
 520 #define rC              r12     /* Right rotation temp for rWORD4.  */
 521 #define rE              r0      /* Right rotation temp for rWORD6.  */
 522 #define rG              r12     /* Right rotation temp for rWORD8.  */
 523 L(unaligned):
 524         std     r29,-24(r1)
 525         cfi_offset(r29,-24)
 526         clrldi  rSHL, rSTR2, 61
 527         beq-    cr6, L(duzeroLength)
 528         std     r28,-32(r1)
 529         cfi_offset(r28,-32)
 530         beq     cr5, L(DWunaligned)
 531         std     r27,-40(r1)
 532         cfi_offset(r27,-40)
 533 /* Adjust the logical start of rSTR2 to compensate for the extra bits
 534    in the 1st rSTR1 DW.  */
 535         sub     r27, rSTR2, rBITDIF
 536 /* But do not attempt to address the DW before the DW that contains
 537    the actual start of rSTR2.  */
 538         clrrdi  rSTR2, rSTR2, 3
 539         std     r26,-48(r1)
 540         cfi_offset(r26,-48)
 541 /* Compute the left/right shift counts for the unaligned rSTR2,
 542    compensating for the logical (DW aligned) start of rSTR1.  */
 543         clrldi  rSHL, r27, 61
 544         clrrdi  rSTR1, rSTR1, 3
 545         std     r25,-56(r1)
 546         cfi_offset(r25,-56)
 547         sldi    rSHL, rSHL, 3
 548         cmpld   cr5, r27, rSTR2
 549         add     rN, rN, rBITDIF
 550         sldi    r11, rBITDIF, 3
 551         std     r24,-64(r1)
 552         cfi_offset(r24,-64)
 553         subfic  rSHR, rSHL, 64
 554         srdi    rTMP, rN, 5     /* Divide by 32 */
 555         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 556 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
 557    this special case those bits may be discarded anyway.  Also we
 558    must avoid loading a DW where none of the bits are part of rSTR2 as
 559    this may cross a page boundary and cause a page fault.  */
 560         li      rWORD8, 0
 561         blt     cr5, L(dus0)
 562         ld      rWORD8, 0(rSTR2)
 563         la      rSTR2, 8(rSTR2)
 564         sld     rWORD8, rWORD8, rSHL
 565
 566 L(dus0):
 567         ld      rWORD1, 0(rSTR1)
 568         ld      rWORD2, 0(rSTR2)
 569         cmpldi  cr1, rBITDIF, 16
 570         cmpldi  cr7, rN, 32
 571         srd     rG, rWORD2, rSHR
 572         clrldi  rN, rN, 61
 573         beq     L(duPs4)
 574         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 575         or      rWORD8, rG, rWORD8
 576         bgt     cr1, L(duPs3)
 577         beq     cr1, L(duPs2)
 578
 579 /* Remainder is 8 */
 580         .align 4
 581 L(dusP1):
 582         sld     rB, rWORD2, rSHL
 583         sld     rWORD7, rWORD1, r11
 584         sld     rWORD8, rWORD8, r11
 585         bge     cr7, L(duP1e)
 586 /* At this point we exit early with the first double word compare
 587    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
 588    how we handle the remaining bytes.  */
 589         cmpld   cr5, rWORD7, rWORD8
 590         sldi.   rN, rN, 3
 591         bne     cr5, L(duLcr5)
 592         cmpld   cr7, rN, rSHR
 593         beq     L(duZeroReturn)
 594         li      rA, 0
 595         ble     cr7, L(dutrim)
 596         ld      rWORD2, 8(rSTR2)
 597         srd     rA, rWORD2, rSHR
 598         b       L(dutrim)
 599 /* Remainder is 16 */
 600         .align 4
 601 L(duPs2):
 602         sld     rH, rWORD2, rSHL
 603         sld     rWORD5, rWORD1, r11
 604         sld     rWORD6, rWORD8, r11
 605         b       L(duP2e)
 606 /* Remainder is 24 */
 607         .align 4
 608 L(duPs3):
 609         sld     rF, rWORD2, rSHL
 610         sld     rWORD3, rWORD1, r11
 611         sld     rWORD4, rWORD8, r11
 612         b       L(duP3e)
 613 /* Count is a multiple of 32, remainder is 0 */
 614         .align 4
 615 L(duPs4):
 616         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 617         or      rWORD8, rG, rWORD8
 618         sld     rD, rWORD2, rSHL
 619         sld     rWORD1, rWORD1, r11
 620         sld     rWORD2, rWORD8, r11
 621         b       L(duP4e)
 622
 623 /* At this point we know rSTR1 is double word aligned and the
 624    compare length is at least 8 bytes.  */
 625         .align 4
 626 L(DWunaligned):
 627         std     r27,-40(r1)
 628         cfi_offset(r27,-40)
 629         clrrdi  rSTR2, rSTR2, 3
 630         std     r26,-48(r1)
 631         cfi_offset(r26,-48)
 632         srdi    rTMP, rN, 5     /* Divide by 32 */
 633         std     r25,-56(r1)
 634         cfi_offset(r25,-56)
 635         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 636         std     r24,-64(r1)
 637         cfi_offset(r24,-64)
 638         sldi    rSHL, rSHL, 3
 639         ld      rWORD6, 0(rSTR2)
 640         ldu     rWORD8, 8(rSTR2)
 641         cmpldi  cr1, rBITDIF, 16
 642         cmpldi  cr7, rN, 32
 643         clrldi  rN, rN, 61
 644         subfic  rSHR, rSHL, 64
 645         sld     rH, rWORD6, rSHL
 646         beq     L(duP4)
 647         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 648         bgt     cr1, L(duP3)
 649         beq     cr1, L(duP2)
 650
 651 /* Remainder is 8 */
 652         .align 4
 653 L(duP1):
 654         srd     rG, rWORD8, rSHR
 655         ld      rWORD7, 0(rSTR1)
 656         sld     rB, rWORD8, rSHL
 657         or      rWORD8, rG, rH
 658         blt     cr7, L(duP1x)
 659 L(duP1e):
 660         ld      rWORD1, 8(rSTR1)
 661         ld      rWORD2, 8(rSTR2)
 662         cmpld   cr5, rWORD7, rWORD8
 663         srd     rA, rWORD2, rSHR
 664         sld     rD, rWORD2, rSHL
 665         or      rWORD2, rA, rB
 666         ld      rWORD3, 16(rSTR1)
 667         ld      rWORD4, 16(rSTR2)
 668         cmpld   cr0, rWORD1, rWORD2
 669         srd     rC, rWORD4, rSHR
 670         sld     rF, rWORD4, rSHL
 671         bne     cr5, L(duLcr5)
 672         or      rWORD4, rC, rD
 673         ld      rWORD5, 24(rSTR1)
 674         ld      rWORD6, 24(rSTR2)
 675         cmpld   cr1, rWORD3, rWORD4
 676         srd     rE, rWORD6, rSHR
 677         sld     rH, rWORD6, rSHL
 678         bne     cr0, L(duLcr0)
 679         or      rWORD6, rE, rF
 680         cmpld   cr6, rWORD5, rWORD6
 681         b       L(duLoop3)
 682         .align 4
 683 /* At this point we exit early with the first double word compare
 684    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
 685    how we handle the remaining bytes.  */
 686 L(duP1x):
 687         cmpld   cr5, rWORD7, rWORD8
 688         sldi.   rN, rN, 3
 689         bne     cr5, L(duLcr5)
 690         cmpld   cr7, rN, rSHR
 691         beq     L(duZeroReturn)
 692         li      rA, 0
 693         ble     cr7, L(dutrim)
 694         ld      rWORD2, 8(rSTR2)
 695         srd     rA, rWORD2, rSHR
 696         b       L(dutrim)
 697 /* Remainder is 16 */
 698         .align 4
 699 L(duP2):
 700         srd     rE, rWORD8, rSHR
 701         ld      rWORD5, 0(rSTR1)
 702         or      rWORD6, rE, rH
 703         sld     rH, rWORD8, rSHL
 704 L(duP2e):
 705         ld      rWORD7, 8(rSTR1)
 706         ld      rWORD8, 8(rSTR2)
 707         cmpld   cr6, rWORD5, rWORD6
 708         srd     rG, rWORD8, rSHR
 709         sld     rB, rWORD8, rSHL
 710         or      rWORD8, rG, rH
 711         blt     cr7, L(duP2x)
 712         ld      rWORD1, 16(rSTR1)
 713         ld      rWORD2, 16(rSTR2)
 714         cmpld   cr5, rWORD7, rWORD8
 715         bne     cr6, L(duLcr6)
 716         srd     rA, rWORD2, rSHR
 717         sld     rD, rWORD2, rSHL
 718         or      rWORD2, rA, rB
 719         ld      rWORD3, 24(rSTR1)
 720         ld      rWORD4, 24(rSTR2)
 721         cmpld   cr0, rWORD1, rWORD2
 722         bne     cr5, L(duLcr5)
 723         srd     rC, rWORD4, rSHR
 724         sld     rF, rWORD4, rSHL
 725         or      rWORD4, rC, rD
 726         addi    rSTR1, rSTR1, 8
 727         addi    rSTR2, rSTR2, 8
 728         cmpld   cr1, rWORD3, rWORD4
 729         b       L(duLoop2)
 730         .align 4
 731 L(duP2x):
 732         cmpld   cr5, rWORD7, rWORD8
 733         addi    rSTR1, rSTR1, 8
 734         addi    rSTR2, rSTR2, 8
 735         bne     cr6, L(duLcr6)
 736         sldi.   rN, rN, 3
 737         bne     cr5, L(duLcr5)
 738         cmpld   cr7, rN, rSHR
 739         beq     L(duZeroReturn)
 740         li      rA, 0
 741         ble     cr7, L(dutrim)
 742         ld      rWORD2, 8(rSTR2)
 743         srd     rA, rWORD2, rSHR
 744         b       L(dutrim)
 745
 746 /* Remainder is 24 */
 747         .align 4
 748 L(duP3):
 749         srd     rC, rWORD8, rSHR
 750         ld      rWORD3, 0(rSTR1)
 751         sld     rF, rWORD8, rSHL
 752         or      rWORD4, rC, rH
 753 L(duP3e):
 754         ld      rWORD5, 8(rSTR1)
 755         ld      rWORD6, 8(rSTR2)
 756         cmpld   cr1, rWORD3, rWORD4
 757         srd     rE, rWORD6, rSHR
 758         sld     rH, rWORD6, rSHL
 759         or      rWORD6, rE, rF
 760         ld      rWORD7, 16(rSTR1)
 761         ld      rWORD8, 16(rSTR2)
 762         cmpld   cr6, rWORD5, rWORD6
 763         bne     cr1, L(duLcr1)
 764         srd     rG, rWORD8, rSHR
 765         sld     rB, rWORD8, rSHL
 766         or      rWORD8, rG, rH
 767         blt     cr7, L(duP3x)
 768         ld      rWORD1, 24(rSTR1)
 769         ld      rWORD2, 24(rSTR2)
 770         cmpld   cr5, rWORD7, rWORD8
 771         bne     cr6, L(duLcr6)
 772         srd     rA, rWORD2, rSHR
 773         sld     rD, rWORD2, rSHL
 774         or      rWORD2, rA, rB
 775         addi    rSTR1, rSTR1, 16
 776         addi    rSTR2, rSTR2, 16
 777         cmpld   cr0, rWORD1, rWORD2
 778         b       L(duLoop1)
 779         .align 4
 780 L(duP3x):
 781         addi    rSTR1, rSTR1, 16
 782         addi    rSTR2, rSTR2, 16
 783         bne     cr1, L(duLcr1)
 784         cmpld   cr5, rWORD7, rWORD8
 785         bne     cr6, L(duLcr6)
 786         sldi.   rN, rN, 3
 787         bne     cr5, L(duLcr5)
 788         cmpld   cr7, rN, rSHR
 789         beq     L(duZeroReturn)
 790         li      rA, 0
 791         ble     cr7, L(dutrim)
 792         ld      rWORD2, 8(rSTR2)
 793         srd     rA, rWORD2, rSHR
 794         b       L(dutrim)
 795
 796 /* Count is a multiple of 32, remainder is 0 */
 797         .align 4
 798 L(duP4):
 799         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 800         srd     rA, rWORD8, rSHR
 801         ld      rWORD1, 0(rSTR1)
 802         sld     rD, rWORD8, rSHL
 803         or      rWORD2, rA, rH
 804 L(duP4e):
 805         ld      rWORD3, 8(rSTR1)
 806         ld      rWORD4, 8(rSTR2)
 807         cmpld   cr0, rWORD1, rWORD2
 808         srd     rC, rWORD4, rSHR
 809         sld     rF, rWORD4, rSHL
 810         or      rWORD4, rC, rD
 811         ld      rWORD5, 16(rSTR1)
 812         ld      rWORD6, 16(rSTR2)
 813         cmpld   cr1, rWORD3, rWORD4
 814         bne     cr0, L(duLcr0)
 815         srd     rE, rWORD6, rSHR
 816         sld     rH, rWORD6, rSHL
 817         or      rWORD6, rE, rF
 818         ldu     rWORD7, 24(rSTR1)
 819         ldu     rWORD8, 24(rSTR2)
 820         cmpld   cr6, rWORD5, rWORD6
 821         bne     cr1, L(duLcr1)
 822         srd     rG, rWORD8, rSHR
 823         sld     rB, rWORD8, rSHL
 824         or      rWORD8, rG, rH
 825         cmpld   cr5, rWORD7, rWORD8
 826         bdz-    L(du24)         /* Adjust CTR as we start with +4 */
 827 /* This is the primary loop: four doubleword pairs per pass; the ldu pair at the bottom advances both pointers by 32.  */
 828         .align 4
 829 L(duLoop):
 830         ld      rWORD1, 8(rSTR1)
 831         ld      rWORD2, 8(rSTR2)
 832         cmpld   cr1, rWORD3, rWORD4     /* Compare the rWORD3/rWORD4 pair merged earlier; tested at the bne- cr1 below.  */
 833         bne     cr6, L(duLcr6)          /* Act on the rWORD5/rWORD6 compare made before entering this pass.  */
 834         srd     rA, rWORD2, rSHR
 835         sld     rD, rWORD2, rSHL        /* rD carries into the rWORD4 merge below.  */
 836         or      rWORD2, rA, rB          /* rB was produced from the previous rWORD8 load.  */
 837 L(duLoop1):        /* Mid-loop label; presumably an entry point for setup code earlier in the file.  */
 838         ld      rWORD3, 16(rSTR1)
 839         ld      rWORD4, 16(rSTR2)
 840         cmpld   cr6, rWORD5, rWORD6     /* Tested at the top of the next pass, or in L(duL4) on exit.  */
 841         bne     cr5, L(duLcr5)          /* Act on the previous rWORD7/rWORD8 compare.  */
 842         srd     rC, rWORD4, rSHR
 843         sld     rF, rWORD4, rSHL        /* rF carries into the rWORD6 merge below.  */
 844         or      rWORD4, rC, rD
 845 L(duLoop2):        /* Mid-loop label; presumably an entry point for setup code earlier in the file.  */
 846         ld      rWORD5, 24(rSTR1)
 847         ld      rWORD6, 24(rSTR2)
 848         cmpld   cr5, rWORD7, rWORD8     /* Tested at the bne cr5 in the next pass, or in L(duL4) on exit.  */
 849         bne     cr0, L(duLcr0)          /* Act on the rWORD1/rWORD2 compare from the previous pass (or from the priming code).  */
 850         srd     rE, rWORD6, rSHR
 851         sld     rH, rWORD6, rSHL        /* rH carries into the rWORD8 merge below.  */
 852         or      rWORD6, rE, rF
 853 L(duLoop3):        /* Mid-loop label; presumably an entry point for setup code earlier in the file.  */
 854         ldu     rWORD7, 32(rSTR1)       /* Load with update: rSTR1 advances by 32.  */
 855         ldu     rWORD8, 32(rSTR2)       /* Likewise rSTR2 advances by 32.  */
 856         cmpld   cr0, rWORD1, rWORD2     /* Compare the pair loaded at the top of this pass.  */
 857         bne-    cr1, L(duLcr1)          /* Act on the cr1 compare made at the top of this pass.  */
 858         srd     rG, rWORD8, rSHR
 859         sld     rB, rWORD8, rSHL        /* rB carries into the next rWORD2 merge.  */
 860         or      rWORD8, rG, rH
 861         bdnz+   L(duLoop)               /* Decrement CTR and repeat while it is nonzero.  */
 862
 863 L(duL4):           /* Loop drain: test the compares still outstanding and compare the last three merged pairs.  */
 864         bne     cr1, L(duLcr1)          /* NOTE(review): on fall-through from the loop cr1 was just tested; presumably needed only if L(duL4) is branched to directly.  */
 865         cmpld   cr1, rWORD3, rWORD4     /* Pair merged in the last pass but not yet compared.  */
 866         bne     cr6, L(duLcr6)          /* Test the older rWORD5/rWORD6 compare before cr6 is reused.  */
 867         cmpld   cr6, rWORD5, rWORD6
 868         bne     cr5, L(duLcr5)          /* Test the older rWORD7/rWORD8 compare before cr5 is reused.  */
 869         cmpld   cr5, rWORD7, rWORD8
 870 L(du44):           /* Four compare results pending: cr0, cr1, cr6, cr5.  */
 871         bne     cr0, L(duLcr0)
 872 L(du34):           /* Three pending: cr1, cr6, cr5.  */
 873         bne     cr1, L(duLcr1)
 874 L(du24):           /* Two pending: cr6, cr5.  */
 875         bne     cr6, L(duLcr6)
 876 L(du14):           /* One pending: cr5.  */
 877         sldi.   rN, rN, 3               /* rN = rN * 8 (bytes to bits); the record form sets cr0 for the beq to L(duZeroReturn) that follows.  */
 878         bne     cr5, L(duLcr5)
 879 /* At this point we have a remainder of 0 to 7 bytes to compare.  We use
 880    shift right doubleword (srd) to eliminate bits beyond the compare
 881    length, so that one unsigned doubleword compare gives the final
 882    result.  A zero remainder returns 0 directly.
 883
 884    However it may not be safe to load rWORD2 which may be beyond the
 885    string length.  So we compare the bit length of the remainder to
 886    the right shift count (rSHR).  If the bit count is less than or equal
 887    we do not need to load rWORD2 (all significant bits are already in
 888    rB).  */
 889         cmpld   cr7, rN, rSHR           /* Remaining bit count versus the right shift count.  */
 890         beq     L(duZeroReturn)         /* Tests cr0 from the sldi. at L(du14): nothing left to compare.  */
 891         li      rA, 0                   /* Default: no bits contributed by a further s2 doubleword.  */
 892         ble     cr7, L(dutrim)          /* rB already holds every bit needed; skip the extra s2 load.  */
 893         ld      rWORD2, 8(rSTR2)
 894         srd     rA, rWORD2, rSHR
 895         .align 4
 896 L(dutrim):         /* Form the final partial pair, trim it to the remaining length, compare and return.  */
 897         ld      rWORD1, 8(rSTR1)
 898         ld      rWORD8,-8(r1)           /* Register reloads from below the stack pointer are interleaved with the compare from here on.  */
 899         subfic  rN, rN, 64      /* Shift count is 64 - remaining bits (rN was already scaled by 8 at L(du14)).  */
 900         or      rWORD2, rA, rB          /* Final s2 doubleword: carried bits in rB plus any bits from the extra load.  */
 901         ld      rWORD7,-16(r1)
 902         ld      r29,-24(r1)
 903         srd     rWORD1, rWORD1, rN      /* Discard the bits beyond the compare length...  */
 904         srd     rWORD2, rWORD2, rN      /* ...in both operands.  */
 905         ld      r28,-32(r1)
 906         ld      r27,-40(r1)
 907         li      rRTN, 0                 /* rRTN shares r3 with rSTR1; the last load through rSTR1 is already done.  */
 908         cmpld   cr0, rWORD1, rWORD2
 909         ld      r26,-48(r1)
 910         ld      r25,-56(r1)
 911         beq     cr0, L(dureturn24)      /* Equal: reload r24 there and return 0.  */
 912         li      rRTN, 1
 913         ld      r24,-64(r1)
 914         bgtlr   cr0                     /* s1 remainder greater (unsigned): return 1.  */
 915         li      rRTN, -1                /* Otherwise it is smaller: return -1.  */
 916         blr
 917         .align 4
 918 L(duLcr0):         /* Mismatch recorded in cr0 (rWORD1 vs rWORD2): return 1 or -1.  */
 919         ld      rWORD8,-8(r1)           /* Reload rWORD8 and rWORD7 from below the stack pointer.  */
 920         ld      rWORD7,-16(r1)
 921         li      rRTN, 1                 /* Result if the s1 doubleword is the larger (unsigned).  */
 922         bgt     cr0, L(dureturn29)      /* Greater: the shared tail reloads r29..r24 and returns.  */
 923         ld      r29,-24(r1)
 924         ld      r28,-32(r1)
 925         li      rRTN, -1                /* Not greater, and we got here on not-equal: s1 is smaller.  */
 926         b       L(dureturn27)           /* Shared tail reloads r27..r24 and returns.  */
 927         .align 4
 928 L(duLcr1):         /* Mismatch recorded in cr1 (rWORD3 vs rWORD4): return 1 or -1.  */
 929         ld      rWORD8,-8(r1)           /* Reload rWORD8 and rWORD7 from below the stack pointer.  */
 930         ld      rWORD7,-16(r1)
 931         li      rRTN, 1                 /* Result if the s1 doubleword is the larger (unsigned).  */
 932         bgt     cr1, L(dureturn29)      /* Greater: the shared tail reloads r29..r24 and returns.  */
 933         ld      r29,-24(r1)
 934         ld      r28,-32(r1)
 935         li      rRTN, -1                /* Not greater, and we got here on not-equal: s1 is smaller.  */
 936         b       L(dureturn27)           /* Shared tail reloads r27..r24 and returns.  */
 937         .align 4
 938 L(duLcr6):         /* Mismatch recorded in cr6 (rWORD5 vs rWORD6): return 1 or -1.  */
 939         ld      rWORD8,-8(r1)           /* Reload rWORD8 and rWORD7 from below the stack pointer.  */
 940         ld      rWORD7,-16(r1)
 941         li      rRTN, 1                 /* Result if the s1 doubleword is the larger (unsigned).  */
 942         bgt     cr6, L(dureturn29)      /* Greater: the shared tail reloads r29..r24 and returns.  */
 943         ld      r29,-24(r1)
 944         ld      r28,-32(r1)
 945         li      rRTN, -1                /* Not greater, and we got here on not-equal: s1 is smaller.  */
 946         b       L(dureturn27)           /* Shared tail reloads r27..r24 and returns.  */
 947         .align 4
 948 L(duLcr5):         /* Mismatch recorded in cr5 (rWORD7 vs rWORD8): return 1 or -1.  */
 949         ld      rWORD8,-8(r1)           /* Reload rWORD8 and rWORD7 from below the stack pointer; cr5 already holds the result.  */
 950         ld      rWORD7,-16(r1)
 951         li      rRTN, 1                 /* Result if the s1 doubleword is the larger (unsigned).  */
 952         bgt     cr5, L(dureturn29)      /* Greater: the shared tail reloads r29..r24 and returns.  */
 953         ld      r29,-24(r1)
 954         ld      r28,-32(r1)
 955         li      rRTN, -1                /* Not greater, and we got here on not-equal: s1 is smaller.  */
 956         b       L(dureturn27)           /* Shared tail reloads r27..r24 and returns.  */
 957         .align  3
 958 L(duZeroReturn):   /* Every doubleword matched and the remainder is empty: return 0.  */
 959         li      rRTN,0
 960         .align  4                       /* Aligns L(dureturn); execution falls through from the li above.  */
 961 L(dureturn):       /* Shared exit tail: reload registers from below the stack pointer, then return with rRTN as already set.  */
 962         ld      rWORD8,-8(r1)
 963         ld      rWORD7,-16(r1)
 964 L(dureturn29):     /* Entry for callers that have already reloaded rWORD8 and rWORD7.  */
 965         ld      r29,-24(r1)
 966         ld      r28,-32(r1)
 967 L(dureturn27):     /* Entry for callers that have also reloaded r29 and r28.  */
 968         ld      r27,-40(r1)
 969 L(dureturn26):     /* Entry that skips the r27 reload as well.  */
 970         ld      r26,-48(r1)
 971 L(dureturn25):     /* Entry that skips the r26 reload as well.  */
 972         ld      r25,-56(r1)
 973 L(dureturn24):     /* Entry for callers that only need r24 reloaded.  */
 974         ld      r24,-64(r1)
 975         blr
 976 L(duzeroLength):   /* Return 0 without reloading any register.  NOTE(review): confirm every branch to this label is taken before any register reloaded by the other exits has been modified.  */
 977         li      rRTN,0
 978         blr
 979
 980 END (BP_SYM (memcmp))
 981 libc_hidden_builtin_def (memcmp)
 982 weak_alias (memcmp, bcmp)