sysdeps/powerpc/powerpc64/power4/memcmp.S

   1 /* Optimized memcmp implementation for PowerPC64.
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include <bp-sym.h>
  21 #include <bp-asm.h>
  22
  23 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
  24
  25         .machine power4
  26 EALIGN (BP_SYM(memcmp), 4, 0)
  27         CALL_MCOUNT 3
  28
  29 #define rTMP    r0
  30 #define rRTN    r3
  31 #define rSTR1   r3      /* first string arg */
  32 #define rSTR2   r4      /* second string arg */
  33 #define rN      r5      /* max string length */
  34 /* Note:  The Bounded pointer support in this code is broken.  This code
  35    was inherited from PPC32 and that support was never completed.
  36    Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
  37 #define rWORD1  r6      /* current word in s1 */
  38 #define rWORD2  r7      /* current word in s2 */
  39 #define rWORD3  r8      /* next word in s1 */
  40 #define rWORD4  r9      /* next word in s2 */
  41 #define rWORD5  r10     /* next word in s1 */
  42 #define rWORD6  r11     /* next word in s2 */
  43 #define rBITDIF r12     /* bits that differ in s1 & s2 words */
  44 #define rWORD7  r30     /* next word in s1 */
  45 #define rWORD8  r31     /* next word in s2 */
  46
  47         xor     rTMP, rSTR2, rSTR1
  48         cmpldi  cr6, rN, 0
  49         cmpldi  cr1, rN, 12
  50         clrldi. rTMP, rTMP, 61
  51         clrldi  rBITDIF, rSTR1, 61
  52         cmpldi  cr5, rBITDIF, 0
  53         beq-    cr6, L(zeroLength)
  54         dcbt    0,rSTR1
  55         dcbt    0,rSTR2
  56 /* If the length is less than 12 bytes, use the simple
  57    byte-at-a-time loop.  */
  58         blt     cr1, L(bytealigned)
  59         std     rWORD8,-8(r1)
  60         cfi_offset(rWORD8,-8)
  61         std     rWORD7,-16(r1)
  62         cfi_offset(rWORD7,-16)
  63         bne     L(unaligned)
  64 /* At this point we know both strings have the same alignment and the
  65    compare length is at least 8 bytes.  rBITDIF contains the low order
  66    3 bits of rSTR1 and cr5 contains the result of the logical compare
  67    of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
  68    aligned and can perform the DWaligned loop.
  69
  70    Otherwise we know the two strings have the same alignment (but not
  71    yet DW).  So we can force the string addresses to the next lower DW
  72    boundary and special case this first DW word using shift left to
  73    eliminate bits preceding the first byte.  Since we want to join the
  74    normal (DWaligned) compare loop, starting at the second double word,
  75    we need to adjust the length (rN) and special case the loop
  76    versioning for the first DW.  This ensures that the loop count is
  77    correct and the first DW (shifted) is in the expected register pair.  */
  78         .align 4
  79 L(samealignment):
  80         clrrdi  rSTR1, rSTR1, 3
  81         clrrdi  rSTR2, rSTR2, 3
  82         beq     cr5, L(DWaligned)
  83         add     rN, rN, rBITDIF
  84         sldi    r11, rBITDIF, 3
  85         srdi    rTMP, rN, 5     /* Divide by 32 */
  86         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
  87         ld      rWORD1, 0(rSTR1)
  88         ld      rWORD2, 0(rSTR2)
  89         cmpldi  cr1, rBITDIF, 16
  90         cmpldi  cr7, rN, 32
  91         clrldi  rN, rN, 61
  92         beq     L(dPs4)
  93         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
  94         bgt     cr1, L(dPs3)
  95         beq     cr1, L(dPs2)
  96
  97 /* Remainder is 8 */
  98         .align 3
  99 L(dsP1):
 100         sld     rWORD5, rWORD1, r11
 101         sld     rWORD6, rWORD2, r11
 102         cmpld   cr5, rWORD5, rWORD6
 103         blt     cr7, L(dP1x)
 104 /* Do something useful in this cycle since we have to branch anyway.  */
 105         ld      rWORD1, 8(rSTR1)
 106         ld      rWORD2, 8(rSTR2)
 107         cmpld   cr0, rWORD1, rWORD2
 108         b       L(dP1e)
 109 /* Remainder is 16 */
 110         .align 4
 111 L(dPs2):
 112         sld     rWORD5, rWORD1, r11
 113         sld     rWORD6, rWORD2, r11
 114         cmpld   cr6, rWORD5, rWORD6
 115         blt     cr7, L(dP2x)
 116 /* Do something useful in this cycle since we have to branch anyway.  */
 117         ld      rWORD7, 8(rSTR1)
 118         ld      rWORD8, 8(rSTR2)
 119         cmpld   cr5, rWORD7, rWORD8
 120         b       L(dP2e)
 121 /* Remainder is 24 */
 122         .align 4
 123 L(dPs3):
 124         sld     rWORD3, rWORD1, r11
 125         sld     rWORD4, rWORD2, r11
 126         cmpld   cr1, rWORD3, rWORD4
 127         b       L(dP3e)
 128 /* Count is a multiple of 32, remainder is 0 */
 129         .align 4
 130 L(dPs4):
 131         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 132         sld     rWORD1, rWORD1, r11
 133         sld     rWORD2, rWORD2, r11
 134         cmpld   cr0, rWORD1, rWORD2
 135         b       L(dP4e)
 136
 137 /* At this point we know both strings are double word aligned and the
 138    compare length is at least 8 bytes.  */
 139         .align 4
 140 L(DWaligned):
 141         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 142         srdi    rTMP, rN, 5     /* Divide by 32 */
 143         cmpldi  cr1, rBITDIF, 16
 144         cmpldi  cr7, rN, 32
 145         clrldi  rN, rN, 61
 146         beq     L(dP4)
 147         bgt     cr1, L(dP3)
 148         beq     cr1, L(dP2)
 149
 150 /* Remainder is 8 */
 151         .align 4
 152 L(dP1):
 153         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 154 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
 155    (8-15 byte compare), we want to use only volatile registers.  This
 156    means we can avoid restoring non-volatile registers since we did not
 157    change any on the early exit path.  The key here is the non-early
 158    exit path only cares about the condition code (cr5), not about which
 159    register pair was used.  */
 160         ld      rWORD5, 0(rSTR1)
 161         ld      rWORD6, 0(rSTR2)
 162         cmpld   cr5, rWORD5, rWORD6
 163         blt     cr7, L(dP1x)
 164         ld      rWORD1, 8(rSTR1)
 165         ld      rWORD2, 8(rSTR2)
 166         cmpld   cr0, rWORD1, rWORD2
 167 L(dP1e):
 168         ld      rWORD3, 16(rSTR1)
 169         ld      rWORD4, 16(rSTR2)
 170         cmpld   cr1, rWORD3, rWORD4
 171         ld      rWORD5, 24(rSTR1)
 172         ld      rWORD6, 24(rSTR2)
 173         cmpld   cr6, rWORD5, rWORD6
 174         bne     cr5, L(dLcr5)
 175         bne     cr0, L(dLcr0)
 176
 177         ldu     rWORD7, 32(rSTR1)
 178         ldu     rWORD8, 32(rSTR2)
 179         bne     cr1, L(dLcr1)
 180         cmpld   cr5, rWORD7, rWORD8
 181         bdnz    L(dLoop)
 182         bne     cr6, L(dLcr6)
 183         ld      rWORD8,-8(r1)
 184         ld      rWORD7,-16(r1)
 185         .align 3
 186 L(dP1x):
 187         sldi.   r12, rN, 3
 188         bne     cr5, L(dLcr5)
 189         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 190         bne     L(d00)
 191         li      rRTN, 0
 192         blr
 193
 194 /* Remainder is 16 */
 195         .align 4
 196 L(dP2):
 197         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 198         ld      rWORD5, 0(rSTR1)
 199         ld      rWORD6, 0(rSTR2)
 200         cmpld   cr6, rWORD5, rWORD6
 201         blt     cr7, L(dP2x)
 202         ld      rWORD7, 8(rSTR1)
 203         ld      rWORD8, 8(rSTR2)
 204         cmpld   cr5, rWORD7, rWORD8
 205 L(dP2e):
 206         ld      rWORD1, 16(rSTR1)
 207         ld      rWORD2, 16(rSTR2)
 208         cmpld   cr0, rWORD1, rWORD2
 209         ld      rWORD3, 24(rSTR1)
 210         ld      rWORD4, 24(rSTR2)
 211         cmpld   cr1, rWORD3, rWORD4
 212         addi    rSTR1, rSTR1, 8
 213         addi    rSTR2, rSTR2, 8
 214         bne     cr6, L(dLcr6)
 215         bne     cr5, L(dLcr5)
 216         b       L(dLoop2)
 217 /* Again we are on an early exit path (16-23 byte compare), so we want
 218    to use only volatile registers and avoid restoring non-volatile
 219    registers.  */
 220         .align 4
 221 L(dP2x):
 222         ld      rWORD3, 8(rSTR1)
 223         ld      rWORD4, 8(rSTR2)
 224         cmpld   cr5, rWORD3, rWORD4
 225         sldi.   r12, rN, 3
 226         bne     cr6, L(dLcr6)
 227         addi    rSTR1, rSTR1, 8
 228         addi    rSTR2, rSTR2, 8
 229         bne     cr5, L(dLcr5)
 230         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 231         bne     L(d00)
 232         li      rRTN, 0
 233         blr
 234
 235 /* Remainder is 24 */
 236         .align 4
 237 L(dP3):
 238         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 239         ld      rWORD3, 0(rSTR1)
 240         ld      rWORD4, 0(rSTR2)
 241         cmpld   cr1, rWORD3, rWORD4
 242 L(dP3e):
 243         ld      rWORD5, 8(rSTR1)
 244         ld      rWORD6, 8(rSTR2)
 245         cmpld   cr6, rWORD5, rWORD6
 246         blt     cr7, L(dP3x)
 247         ld      rWORD7, 16(rSTR1)
 248         ld      rWORD8, 16(rSTR2)
 249         cmpld   cr5, rWORD7, rWORD8
 250         ld      rWORD1, 24(rSTR1)
 251         ld      rWORD2, 24(rSTR2)
 252         cmpld   cr0, rWORD1, rWORD2
 253         addi    rSTR1, rSTR1, 16
 254         addi    rSTR2, rSTR2, 16
 255         bne     cr1, L(dLcr1)
 256         bne     cr6, L(dLcr6)
 257         b       L(dLoop1)
 258 /* Again we are on an early exit path (24-31 byte compare), so we want
 259    to use only volatile registers and avoid restoring non-volatile
 260    registers.  */
 261         .align 4
 262 L(dP3x):
 263         ld      rWORD1, 16(rSTR1)
 264         ld      rWORD2, 16(rSTR2)
 265         cmpld   cr5, rWORD1, rWORD2
 266         sldi.   r12, rN, 3
 267         bne     cr1, L(dLcr1)
 268         addi    rSTR1, rSTR1, 16
 269         addi    rSTR2, rSTR2, 16
 270         bne     cr6, L(dLcr6)
 271         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 272         bne     cr5, L(dLcr5)
 273         bne     L(d00)
 274         li      rRTN, 0
 275         blr
 276
 277 /* Count is a multiple of 32, remainder is 0 */
 278         .align 4
 279 L(dP4):
 280         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 281         ld      rWORD1, 0(rSTR1)
 282         ld      rWORD2, 0(rSTR2)
 283         cmpld   cr0, rWORD1, rWORD2
 284 L(dP4e):
 285         ld      rWORD3, 8(rSTR1)
 286         ld      rWORD4, 8(rSTR2)
 287         cmpld   cr1, rWORD3, rWORD4
 288         ld      rWORD5, 16(rSTR1)
 289         ld      rWORD6, 16(rSTR2)
 290         cmpld   cr6, rWORD5, rWORD6
 291         ldu     rWORD7, 24(rSTR1)
 292         ldu     rWORD8, 24(rSTR2)
 293         cmpld   cr5, rWORD7, rWORD8
 294         bne     cr0, L(dLcr0)
 295         bne     cr1, L(dLcr1)
 296         bdz-    L(d24)          /* Adjust CTR as we start with +4 */
 297 /* This is the primary loop */
 298         .align 4
 299 L(dLoop):
 300         ld      rWORD1, 8(rSTR1)
 301         ld      rWORD2, 8(rSTR2)
 302         cmpld   cr1, rWORD3, rWORD4
 303         bne     cr6, L(dLcr6)
 304 L(dLoop1):
 305         ld      rWORD3, 16(rSTR1)
 306         ld      rWORD4, 16(rSTR2)
 307         cmpld   cr6, rWORD5, rWORD6
 308         bne     cr5, L(dLcr5)
 309 L(dLoop2):
 310         ld      rWORD5, 24(rSTR1)
 311         ld      rWORD6, 24(rSTR2)
 312         cmpld   cr5, rWORD7, rWORD8
 313         bne     cr0, L(dLcr0)
 314 L(dLoop3):
 315         ldu     rWORD7, 32(rSTR1)
 316         ldu     rWORD8, 32(rSTR2)
 317         bne-    cr1, L(dLcr1)
 318         cmpld   cr0, rWORD1, rWORD2
 319         bdnz+   L(dLoop)
 320
 321 L(dL4):
 322         cmpld   cr1, rWORD3, rWORD4
 323         bne     cr6, L(dLcr6)
 324         cmpld   cr6, rWORD5, rWORD6
 325         bne     cr5, L(dLcr5)
 326         cmpld   cr5, rWORD7, rWORD8
 327 L(d44):
 328         bne     cr0, L(dLcr0)
 329 L(d34):
 330         bne     cr1, L(dLcr1)
 331 L(d24):
 332         bne     cr6, L(dLcr6)
 333 L(d14):
 334         sldi.   r12, rN, 3
 335         bne     cr5, L(dLcr5)
 336 L(d04):
 337         ld      rWORD8,-8(r1)
 338         ld      rWORD7,-16(r1)
 339         subfic  rN, r12, 64     /* Shift count is 64 - (rN * 8).  */
 340         beq     L(zeroLength)
 341 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
 342    we are aligned it is safe to load the whole double word, and use
 343    shift right double to eliminate bits beyond the compare length.  */
 344 L(d00):
 345         ld      rWORD1, 8(rSTR1)
 346         ld      rWORD2, 8(rSTR2)
 347         srd     rWORD1, rWORD1, rN
 348         srd     rWORD2, rWORD2, rN
 349         cmpld   cr5, rWORD1, rWORD2
 350         bne     cr5, L(dLcr5x)
 351         li      rRTN, 0
 352         blr
 353         .align 4
 354 L(dLcr0):
 355         ld      rWORD8,-8(r1)
 356         ld      rWORD7,-16(r1)
 357         li      rRTN, 1
 358         bgtlr   cr0
 359         li      rRTN, -1
 360         blr
 361         .align 4
 362 L(dLcr1):
 363         ld      rWORD8,-8(r1)
 364         ld      rWORD7,-16(r1)
 365         li      rRTN, 1
 366         bgtlr   cr1
 367         li      rRTN, -1
 368         blr
 369         .align 4
 370 L(dLcr6):
 371         ld      rWORD8,-8(r1)
 372         ld      rWORD7,-16(r1)
 373         li      rRTN, 1
 374         bgtlr   cr6
 375         li      rRTN, -1
 376         blr
 377         .align 4
 378 L(dLcr5):
 379         ld      rWORD8,-8(r1)
 380         ld      rWORD7,-16(r1)
 381 L(dLcr5x):
 382         li      rRTN, 1
 383         bgtlr   cr5
 384         li      rRTN, -1
 385         blr
 386
 387         .align 4
 388 L(bytealigned):
 389         mtctr   rN      /* Power4 wants mtctr 1st in dispatch group */
 390         beq-    cr6, L(zeroLength)
 391
 392 /* We need to prime this loop.  This loop is swing modulo scheduled
 393    to avoid pipe delays.  The dependent instruction latency (load to
 394    compare to conditional branch) is 2 to 3 cycles.  In this loop each
 395    dispatch group ends in a branch and takes 1 cycle.  Effectively
 396    the first iteration of the loop only serves to load operands and
 397    branches based on compares are delayed until the next loop.
 398
 399    So we must precondition some registers and condition codes so that
 400    we don't exit the loop early on the first iteration.  */
 401
 402         lbz     rWORD1, 0(rSTR1)
 403         lbz     rWORD2, 0(rSTR2)
 404         bdz-    L(b11)
 405         cmpld   cr0, rWORD1, rWORD2
 406         lbz     rWORD3, 1(rSTR1)
 407         lbz     rWORD4, 1(rSTR2)
 408         bdz-    L(b12)
 409         cmpld   cr1, rWORD3, rWORD4
 410         lbzu    rWORD5, 2(rSTR1)
 411         lbzu    rWORD6, 2(rSTR2)
 412         bdz-    L(b13)
 413         .align 4
 414 L(bLoop):
 415         lbzu    rWORD1, 1(rSTR1)
 416         lbzu    rWORD2, 1(rSTR2)
 417         bne-    cr0, L(bLcr0)
 418
 419         cmpld   cr6, rWORD5, rWORD6
 420         bdz-    L(b3i)
 421
 422         lbzu    rWORD3, 1(rSTR1)
 423         lbzu    rWORD4, 1(rSTR2)
 424         bne-    cr1, L(bLcr1)
 425
 426         cmpld   cr0, rWORD1, rWORD2
 427         bdz-    L(b2i)
 428
 429         lbzu    rWORD5, 1(rSTR1)
 430         lbzu    rWORD6, 1(rSTR2)
 431         bne-    cr6, L(bLcr6)
 432
 433         cmpld   cr1, rWORD3, rWORD4
 434         bdnz+   L(bLoop)
 435
 436 /* We speculatively load bytes before we have tested the previous
 437    bytes.  But we must avoid overrunning the length (in the ctr) to
 438    prevent these speculative loads from causing a segfault.  In this
 439    case the loop will exit early (before all the pending bytes are
 440    tested).  In this case we must complete the pending operations
 441    before returning.  */
 442 L(b1i):
 443         bne-    cr0, L(bLcr0)
 444         bne-    cr1, L(bLcr1)
 445         b       L(bx56)
 446         .align 4
 447 L(b2i):
 448         bne-    cr6, L(bLcr6)
 449         bne-    cr0, L(bLcr0)
 450         b       L(bx34)
 451         .align 4
 452 L(b3i):
 453         bne-    cr1, L(bLcr1)
 454         bne-    cr6, L(bLcr6)
 455         b       L(bx12)
 456         .align 4
 457 L(bLcr0):
 458         li      rRTN, 1
 459         bgtlr   cr0
 460         li      rRTN, -1
 461         blr
 462 L(bLcr1):
 463         li      rRTN, 1
 464         bgtlr   cr1
 465         li      rRTN, -1
 466         blr
 467 L(bLcr6):
 468         li      rRTN, 1
 469         bgtlr   cr6
 470         li      rRTN, -1
 471         blr
 472
 473 L(b13):
 474         bne-    cr0, L(bx12)
 475         bne-    cr1, L(bx34)
 476 L(bx56):
 477         sub     rRTN, rWORD5, rWORD6
 478         blr
 479         nop
 480 L(b12):
 481         bne-    cr0, L(bx12)
 482 L(bx34):
 483         sub     rRTN, rWORD3, rWORD4
 484         blr
 485 L(b11):
 486 L(bx12):
 487         sub     rRTN, rWORD1, rWORD2
 488         blr
 489         .align 4
 490 L(zeroLengthReturn):
 491         ld      rWORD8,-8(r1)
 492         ld      rWORD7,-16(r1)
 493 L(zeroLength):
 494         li      rRTN, 0
 495         blr
 496
 497         .align 4
 498 /* At this point we know the strings have different alignment and the
 499    compare length is at least 8 bytes.  rBITDIF contains the low order
 500    3 bits of rSTR1 and cr5 contains the result of the logical compare
 501    of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
 502    aligned and can perform the DWunaligned loop.
 503
 504    Otherwise we know that rSTR1 is not DW aligned yet.
 505    So we can force the string addresses to the next lower DW
 506    boundary and special case this first DW word using shift left to
 507    eliminate bits preceding the first byte.  Since we want to join the
 508    normal (DWunaligned) compare loop, starting at the second double word,
 509    we need to adjust the length (rN) and special case the loop
 510    versioning for the first DW.  This ensures that the loop count is
 511    correct and the first DW (shifted) is in the expected register pair.  */
 512 #define rSHL    r29     /* Unaligned shift left count.  */
 513 #define rSHR    r28     /* Unaligned shift right count.  */
 514 #define rB              r27     /* Left rotation temp for rWORD2.  */
 515 #define rD              r26     /* Left rotation temp for rWORD4.  */
 516 #define rF              r25     /* Left rotation temp for rWORD6.  */
 517 #define rH              r24     /* Left rotation temp for rWORD8.  */
 518 #define rA              r0      /* Right rotation temp for rWORD2.  */
 519 #define rC              r12     /* Right rotation temp for rWORD4.  */
 520 #define rE              r0      /* Right rotation temp for rWORD6.  */
 521 #define rG              r12     /* Right rotation temp for rWORD8.  */
 522 L(unaligned):
 523         std     r29,-24(r1)
 524         cfi_offset(r29,-24)
 525         clrldi  rSHL, rSTR2, 61
 526         beq-    cr6, L(duzeroLength)
 527         std     r28,-32(r1)
 528         cfi_offset(r28,-32)
 529         beq     cr5, L(DWunaligned)
 530         std     r27,-40(r1)
 531         cfi_offset(r27,-40)
 532 /* Adjust the logical start of rSTR2 to compensate for the extra bits
 533    in the 1st rSTR1 DW.  */
 534         sub     r27, rSTR2, rBITDIF
 535 /* But do not attempt to address the DW before the DW that contains
 536    the actual start of rSTR2.  */
 537         clrrdi  rSTR2, rSTR2, 3
 538         std     r26,-48(r1)
 539         cfi_offset(r26,-48)
 540 /* Compute the left/right shift counts for the unaligned rSTR2,
 541    compensating for the logical (DW aligned) start of rSTR1.  */
 542         clrldi  rSHL, r27, 61
 543         clrrdi  rSTR1, rSTR1, 3
 544         std     r25,-56(r1)
 545         cfi_offset(r25,-56)
 546         sldi    rSHL, rSHL, 3
 547         cmpld   cr5, r27, rSTR2
 548         add     rN, rN, rBITDIF
 549         sldi    r11, rBITDIF, 3
 550         std     r24,-64(r1)
 551         cfi_offset(r24,-64)
 552         subfic  rSHR, rSHL, 64
 553         srdi    rTMP, rN, 5     /* Divide by 32 */
 554         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 555 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
 556    this special case those bits may be discarded anyway.  Also we
 557    must avoid loading a DW where none of the bits are part of rSTR2 as
 558    this may cross a page boundary and cause a page fault.  */
 559         li      rWORD8, 0
 560         blt     cr5, L(dus0)
 561         ld      rWORD8, 0(rSTR2)
 562         la      rSTR2, 8(rSTR2)
 563         sld     rWORD8, rWORD8, rSHL
 564
 565 L(dus0):
 566         ld      rWORD1, 0(rSTR1)
 567         ld      rWORD2, 0(rSTR2)
 568         cmpldi  cr1, rBITDIF, 16
 569         cmpldi  cr7, rN, 32
 570         srd     rG, rWORD2, rSHR
 571         clrldi  rN, rN, 61
 572         beq     L(duPs4)
 573         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 574         or      rWORD8, rG, rWORD8
 575         bgt     cr1, L(duPs3)
 576         beq     cr1, L(duPs2)
 577
 578 /* Remainder is 8 */
 579         .align 4
 580 L(dusP1):
 581         sld     rB, rWORD2, rSHL
 582         sld     rWORD7, rWORD1, r11
 583         sld     rWORD8, rWORD8, r11
 584         bge     cr7, L(duP1e)
 585 /* At this point we exit early with the first double word compare
 586    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
 587    how we handle the remaining bytes.  */
 588         cmpld   cr5, rWORD7, rWORD8
 589         sldi.   rN, rN, 3
 590         bne     cr5, L(duLcr5)
 591         cmpld   cr7, rN, rSHR
 592         beq     L(duZeroReturn)
 593         li      rA, 0
 594         ble     cr7, L(dutrim)
 595         ld      rWORD2, 8(rSTR2)
 596         srd     rA, rWORD2, rSHR
 597         b       L(dutrim)
 598 /* Remainder is 16 */
 599         .align 4
 600 L(duPs2):
 601         sld     rH, rWORD2, rSHL
 602         sld     rWORD5, rWORD1, r11
 603         sld     rWORD6, rWORD8, r11
 604         b       L(duP2e)
 605 /* Remainder is 24 */
 606         .align 4
 607 L(duPs3):
 608         sld     rF, rWORD2, rSHL
 609         sld     rWORD3, rWORD1, r11
 610         sld     rWORD4, rWORD8, r11
 611         b       L(duP3e)
 612 /* Count is a multiple of 32, remainder is 0 */
 613         .align 4
 614 L(duPs4):
 615         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 616         or      rWORD8, rG, rWORD8
 617         sld     rD, rWORD2, rSHL
 618         sld     rWORD1, rWORD1, r11
 619         sld     rWORD2, rWORD8, r11
 620         b       L(duP4e)
 621
 622 /* At this point we know rSTR1 is double word aligned and the
 623    compare length is at least 8 bytes.  */
 624         .align 4
 625 L(DWunaligned):
 626         std     r27,-40(r1)
 627         cfi_offset(r27,-40)
 628         clrrdi  rSTR2, rSTR2, 3
 629         std     r26,-48(r1)
 630         cfi_offset(r26,-48)
 631         srdi    rTMP, rN, 5     /* Divide by 32 */
 632         std     r25,-56(r1)
 633         cfi_offset(r25,-56)
 634         andi.   rBITDIF, rN, 24 /* Get the DW remainder */
 635         std     r24,-64(r1)
 636         cfi_offset(r24,-64)
 637         sldi    rSHL, rSHL, 3
 638         ld      rWORD6, 0(rSTR2)
 639         ldu     rWORD8, 8(rSTR2)
 640         cmpldi  cr1, rBITDIF, 16
 641         cmpldi  cr7, rN, 32
 642         clrldi  rN, rN, 61
 643         subfic  rSHR, rSHL, 64
 644         sld     rH, rWORD6, rSHL
 645         beq     L(duP4)
 646         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 647         bgt     cr1, L(duP3)
 648         beq     cr1, L(duP2)
 649
 650 /* Remainder is 8 */
 651         .align 4
 652 L(duP1):
 653         srd     rG, rWORD8, rSHR
 654         ld      rWORD7, 0(rSTR1)
 655         sld     rB, rWORD8, rSHL
 656         or      rWORD8, rG, rH
 657         blt     cr7, L(duP1x)
 658 L(duP1e):
 659         ld      rWORD1, 8(rSTR1)
 660         ld      rWORD2, 8(rSTR2)
 661         cmpld   cr5, rWORD7, rWORD8
 662         srd     rA, rWORD2, rSHR
 663         sld     rD, rWORD2, rSHL
 664         or      rWORD2, rA, rB
 665         ld      rWORD3, 16(rSTR1)
 666         ld      rWORD4, 16(rSTR2)
 667         cmpld   cr0, rWORD1, rWORD2
 668         srd     rC, rWORD4, rSHR
 669         sld     rF, rWORD4, rSHL
 670         bne     cr5, L(duLcr5)
 671         or      rWORD4, rC, rD
 672         ld      rWORD5, 24(rSTR1)
 673         ld      rWORD6, 24(rSTR2)
 674         cmpld   cr1, rWORD3, rWORD4
 675         srd     rE, rWORD6, rSHR
 676         sld     rH, rWORD6, rSHL
 677         bne     cr0, L(duLcr0)
 678         or      rWORD6, rE, rF
 679         cmpld   cr6, rWORD5, rWORD6
 680         b       L(duLoop3)
 681         .align 4
 682 /* At this point we exit early with the first double word compare
 683    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
 684    how we handle the remaining bytes.  */
 685 L(duP1x):
 686         cmpld   cr5, rWORD7, rWORD8
 687         sldi.   rN, rN, 3
 688         bne     cr5, L(duLcr5)
 689         cmpld   cr7, rN, rSHR
 690         beq     L(duZeroReturn)
 691         li      rA, 0
 692         ble     cr7, L(dutrim)
 693         ld      rWORD2, 8(rSTR2)
 694         srd     rA, rWORD2, rSHR
 695         b       L(dutrim)
 696 /* Remainder is 16 */
 697         .align 4
 698 L(duP2):
 699         srd     rE, rWORD8, rSHR
 700         ld      rWORD5, 0(rSTR1)
 701         or      rWORD6, rE, rH
 702         sld     rH, rWORD8, rSHL
 703 L(duP2e):
 704         ld      rWORD7, 8(rSTR1)
 705         ld      rWORD8, 8(rSTR2)
 706         cmpld   cr6, rWORD5, rWORD6
 707         srd     rG, rWORD8, rSHR
 708         sld     rB, rWORD8, rSHL
 709         or      rWORD8, rG, rH
 710         blt     cr7, L(duP2x)
 711         ld      rWORD1, 16(rSTR1)
 712         ld      rWORD2, 16(rSTR2)
 713         cmpld   cr5, rWORD7, rWORD8
 714         bne     cr6, L(duLcr6)
 715         srd     rA, rWORD2, rSHR
 716         sld     rD, rWORD2, rSHL
 717         or      rWORD2, rA, rB
 718         ld      rWORD3, 24(rSTR1)
 719         ld      rWORD4, 24(rSTR2)
 720         cmpld   cr0, rWORD1, rWORD2
 721         bne     cr5, L(duLcr5)
 722         srd     rC, rWORD4, rSHR
 723         sld     rF, rWORD4, rSHL
 724         or      rWORD4, rC, rD
 725         addi    rSTR1, rSTR1, 8
 726         addi    rSTR2, rSTR2, 8
 727         cmpld   cr1, rWORD3, rWORD4
 728         b       L(duLoop2)
 729         .align 4
 730 L(duP2x):
 731         cmpld   cr5, rWORD7, rWORD8
 732         addi    rSTR1, rSTR1, 8
 733         addi    rSTR2, rSTR2, 8
 734         bne     cr6, L(duLcr6)
 735         sldi.   rN, rN, 3
 736         bne     cr5, L(duLcr5)
 737         cmpld   cr7, rN, rSHR
 738         beq     L(duZeroReturn)
 739         li      rA, 0
 740         ble     cr7, L(dutrim)
 741         ld      rWORD2, 8(rSTR2)
 742         srd     rA, rWORD2, rSHR
 743         b       L(dutrim)
 744
 745 /* Remainder is 24 */
 746         .align 4
 747 L(duP3):
 748         srd     rC, rWORD8, rSHR
 749         ld      rWORD3, 0(rSTR1)
 750         sld     rF, rWORD8, rSHL
 751         or      rWORD4, rC, rH
 752 L(duP3e):
 753         ld      rWORD5, 8(rSTR1)
 754         ld      rWORD6, 8(rSTR2)
 755         cmpld   cr1, rWORD3, rWORD4
 756         srd     rE, rWORD6, rSHR
 757         sld     rH, rWORD6, rSHL
 758         or      rWORD6, rE, rF
 759         ld      rWORD7, 16(rSTR1)
 760         ld      rWORD8, 16(rSTR2)
 761         cmpld   cr6, rWORD5, rWORD6
 762         bne     cr1, L(duLcr1)
 763         srd     rG, rWORD8, rSHR
 764         sld     rB, rWORD8, rSHL
 765         or      rWORD8, rG, rH
 766         blt     cr7, L(duP3x)
 767         ld      rWORD1, 24(rSTR1)
 768         ld      rWORD2, 24(rSTR2)
 769         cmpld   cr5, rWORD7, rWORD8
 770         bne     cr6, L(duLcr6)
 771         srd     rA, rWORD2, rSHR
 772         sld     rD, rWORD2, rSHL
 773         or      rWORD2, rA, rB
 774         addi    rSTR1, rSTR1, 16
 775         addi    rSTR2, rSTR2, 16
 776         cmpld   cr0, rWORD1, rWORD2
 777         b       L(duLoop1)
 778         .align 4
 779 L(duP3x):
 780         addi    rSTR1, rSTR1, 16
 781         addi    rSTR2, rSTR2, 16
 782         bne     cr1, L(duLcr1)
 783         cmpld   cr5, rWORD7, rWORD8
 784         bne     cr6, L(duLcr6)
 785         sldi.   rN, rN, 3
 786         bne     cr5, L(duLcr5)
 787         cmpld   cr7, rN, rSHR
 788         beq     L(duZeroReturn)
 789         li      rA, 0
 790         ble     cr7, L(dutrim)
 791         ld      rWORD2, 8(rSTR2)
 792         srd     rA, rWORD2, rSHR
 793         b       L(dutrim)
 794
 795 /* Count is a multiple of 32, remainder is 0 */
 796         .align 4
 797 L(duP4):
 798         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 799         srd     rA, rWORD8, rSHR
 800         ld      rWORD1, 0(rSTR1)
 801         sld     rD, rWORD8, rSHL
 802         or      rWORD2, rA, rH
 803 L(duP4e):
 804         ld      rWORD3, 8(rSTR1)
 805         ld      rWORD4, 8(rSTR2)
 806         cmpld   cr0, rWORD1, rWORD2
 807         srd     rC, rWORD4, rSHR
 808         sld     rF, rWORD4, rSHL
 809         or      rWORD4, rC, rD
 810         ld      rWORD5, 16(rSTR1)
 811         ld      rWORD6, 16(rSTR2)
 812         cmpld   cr1, rWORD3, rWORD4
 813         bne     cr0, L(duLcr0)
 814         srd     rE, rWORD6, rSHR
 815         sld     rH, rWORD6, rSHL
 816         or      rWORD6, rE, rF
 817         ldu     rWORD7, 24(rSTR1)
 818         ldu     rWORD8, 24(rSTR2)
 819         cmpld   cr6, rWORD5, rWORD6
 820         bne     cr1, L(duLcr1)
 821         srd     rG, rWORD8, rSHR
 822         sld     rB, rWORD8, rSHL
 823         or      rWORD8, rG, rH
 824         cmpld   cr5, rWORD7, rWORD8
 825         bdz-    L(du24)         /* Adjust CTR as we start with +4 */
 826 /* This is the primary loop.  Each iteration handles four doubleword
        pairs in four stages (duLoop, duLoop1, duLoop2, duLoop3).  Every stage
        loads the next pair, issues the compare for a pair loaded earlier,
        branches on a compare issued by an earlier stage, and then rebuilds
        the s2 doubleword from the new load (>> rSHR) and the carry of the
        previous load (<< rSHL).  The duLoop1/2/3 labels are additional entry
        points; their users are outside this view.  bdnz at the bottom
        decrements CTR and repeats while it is non-zero.  */
 827         .align 4
 828 L(duLoop):
 829         ld      rWORD1, 8(rSTR1)
 830         ld      rWORD2, 8(rSTR2)
 831         cmpld   cr1, rWORD3, rWORD4     /* tested in the duLoop3 stage */
 832         bne     cr6, L(duLcr6)          /* cr6 was set by the previous cmpld cr6 */
 833         srd     rA, rWORD2, rSHR
 834         sld     rD, rWORD2, rSHL        /* rD = carry for rWORD4 */
 835         or      rWORD2, rA, rB          /* rB = carry left by the last rWORD8 */
 836 L(duLoop1):
 837         ld      rWORD3, 16(rSTR1)
 838         ld      rWORD4, 16(rSTR2)
 839         cmpld   cr6, rWORD5, rWORD6     /* tested at the top of the next pass or after the loop */
 840         bne     cr5, L(duLcr5)          /* cr5 was set by the previous cmpld cr5 */
 841         srd     rC, rWORD4, rSHR
 842         sld     rF, rWORD4, rSHL        /* rF = carry for rWORD6 */
 843         or      rWORD4, rC, rD
 844 L(duLoop2):
 845         ld      rWORD5, 24(rSTR1)
 846         ld      rWORD6, 24(rSTR2)
 847         cmpld   cr5, rWORD7, rWORD8     /* tested in the next duLoop1 stage or after the loop */
 848         bne     cr0, L(duLcr0)          /* cr0 was set by the previous cmpld cr0 */
 849         srd     rE, rWORD6, rSHR
 850         sld     rH, rWORD6, rSHL        /* rH = carry for rWORD8 */
 851         or      rWORD6, rE, rF
 852 L(duLoop3):
 853         ldu     rWORD7, 32(rSTR1)       /* load with update: rSTR1 advances by 32 */
 854         ldu     rWORD8, 32(rSTR2)       /* load with update: rSTR2 advances by 32 */
 855         cmpld   cr0, rWORD1, rWORD2     /* tested in the next duLoop2 stage or at L(du44) */
 856         bne-    cr1, L(duLcr1)          /* cr1 was set at the top of this pass */
 857         srd     rG, rWORD8, rSHR
 858         sld     rB, rWORD8, rSHL        /* rB = carry for the next rWORD2 (also used by L(dutrim)) */
 859         or      rWORD8, rG, rH
 860         bdnz+   L(duLoop)               /* decrement CTR; loop while non-zero */
 861
 862 L(duL4):
     /* Loop exit: drain the pipeline.  Alternately branch on a compare that
        is still pending and issue the compare for a pair that has been
        loaded but not yet compared, then fall through the du44/du34/du24/du14
        chain, which tests cr0, cr1, cr6 and cr5 in turn.  L(du24) is also
        reached from L(duP4e); users of the other labels are outside this
        view.  */
 863         bne     cr1, L(duLcr1)
 864         cmpld   cr1, rWORD3, rWORD4
 865         bne     cr6, L(duLcr6)
 866         cmpld   cr6, rWORD5, rWORD6
 867         bne     cr5, L(duLcr5)
 868         cmpld   cr5, rWORD7, rWORD8
 869 L(du44):
 870         bne     cr0, L(duLcr0)
 871 L(du34):
 872         bne     cr1, L(duLcr1)
 873 L(du24):
 874         bne     cr6, L(duLcr6)
 875 L(du14):
 876         sldi.   rN, rN, 3               /* rN = remainder in bits; the record form sets cr0 for the beq after the next comment */
 877         bne     cr5, L(duLcr5)
 878 /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
 879    shift right double to eliminate bits beyond the compare length.
 880    This allows the use of double word subtract to compute the final
 881    result.
 882
 883    However it may not be safe to load rWORD2 which may be beyond the
 884    string length. So we compare the bit length of the remainder to
 885    the right shift count (rSHR). If the bit count is less than or equal
 886    we do not need to load rWORD2 (all significant bits are already in
 887    rB).  */
 888         cmpld   cr7, rN, rSHR           /* remainder bits vs. right shift count */
 889         beq     L(duZeroReturn)         /* cr0 from the sldi. above: no remainder, so the result is 0 */
 890         li      rA, 0                   /* default: no bits contributed by a further s2 load */
 891         ble     cr7, L(dutrim)          /* remainder fits in rB: skip the extra s2 load */
 892         ld      rWORD2, 8(rSTR2)
 893         srd     rA, rWORD2, rSHR
 894         .align 4
 895 L(dutrim):
     /* Compare the final partial doubleword.  The register reloads from
        -8(r1) .. -64(r1) are interleaved with the compare; presumably those
        slots were filled by saves earlier in the function (outside this
        view).  */
 896         ld      rWORD1, 8(rSTR1)        /* must precede li rRTN below: rSTR1 and rRTN are both r3 */
 897         ld      rWORD8,-8(r1)
 898         subfic  rN, rN, 64      /* Shift count is 64 - (rN * 8).  */
 899         or      rWORD2, rA, rB          /* last use of rB before the register reloads below */
 900         ld      rWORD7,-16(r1)
 901         ld      r29,-24(r1)
 902         srd     rWORD1, rWORD1, rN      /* drop the s1 bytes beyond the compare length */
 903         srd     rWORD2, rWORD2, rN      /* drop the s2 bytes beyond the compare length */
 904         ld      r28,-32(r1)
 905         ld      r27,-40(r1)
 906         li      rRTN, 0                 /* result if the remainders are equal */
 907         cmpld   cr0, rWORD1, rWORD2     /* unsigned doubleword compare */
 908         ld      r26,-48(r1)
 909         ld      r25,-56(r1)
 910         beq     cr0, L(dureturn24)      /* equal: reload r24 there and return 0 */
 911         li      rRTN, 1
 912         ld      r24,-64(r1)
 913         bgtlr   cr0                     /* s1 remainder greater: return 1 */
 914         li      rRTN, -1                /* otherwise s1 remainder is less: return -1 */
 915         blr
 916         .align 4
 917 L(duLcr0):
     /* Mismatch exit for a compare recorded in cr0.  Reloads rWORD8/rWORD7
        from -8(r1)/-16(r1), then returns 1 through L(dureturn29) when cr0
        says greater-than (unsigned, from cmpld); otherwise reloads r29/r28
        here, sets the result to -1 and finishes through L(dureturn27).  */
 918         ld      rWORD8,-8(r1)
 919         ld      rWORD7,-16(r1)
 920         li      rRTN, 1
 921         bgt     cr0, L(dureturn29)
 922         ld      r29,-24(r1)
 923         ld      r28,-32(r1)
 924         li      rRTN, -1
 925         b       L(dureturn27)
 926         .align 4
 927 L(duLcr1):
     /* Mismatch exit for a compare recorded in cr1.  Reloads rWORD8/rWORD7
        from -8(r1)/-16(r1), then returns 1 through L(dureturn29) when cr1
        says greater-than (unsigned, from cmpld); otherwise reloads r29/r28
        here, sets the result to -1 and finishes through L(dureturn27).  */
 928         ld      rWORD8,-8(r1)
 929         ld      rWORD7,-16(r1)
 930         li      rRTN, 1
 931         bgt     cr1, L(dureturn29)
 932         ld      r29,-24(r1)
 933         ld      r28,-32(r1)
 934         li      rRTN, -1
 935         b       L(dureturn27)
 936         .align 4
 937 L(duLcr6):
     /* Mismatch exit for a compare recorded in cr6.  Reloads rWORD8/rWORD7
        from -8(r1)/-16(r1), then returns 1 through L(dureturn29) when cr6
        says greater-than (unsigned, from cmpld); otherwise reloads r29/r28
        here, sets the result to -1 and finishes through L(dureturn27).  */
 938         ld      rWORD8,-8(r1)
 939         ld      rWORD7,-16(r1)
 940         li      rRTN, 1
 941         bgt     cr6, L(dureturn29)
 942         ld      r29,-24(r1)
 943         ld      r28,-32(r1)
 944         li      rRTN, -1
 945         b       L(dureturn27)
 946         .align 4
 947 L(duLcr5):
     /* Mismatch exit for a compare recorded in cr5.  Reloads rWORD8/rWORD7
        from -8(r1)/-16(r1), then returns 1 through L(dureturn29) when cr5
        says greater-than (unsigned, from cmpld); otherwise reloads r29/r28
        here, sets the result to -1 and finishes through L(dureturn27).  */
 948         ld      rWORD8,-8(r1)
 949         ld      rWORD7,-16(r1)
 950         li      rRTN, 1
 951         bgt     cr5, L(dureturn29)
 952         ld      r29,-24(r1)
 953         ld      r28,-32(r1)
 954         li      rRTN, -1
 955         b       L(dureturn27)
 956         .align  3
 957 L(duZeroReturn):
     /* Equal result: set the return value to 0 and fall into the full
        restore chain.  */
 958         li      rRTN,0
 959         .align  4
 960 L(dureturn):
     /* Staggered exit chain.  Each label below is an entry point that
        reloads only the registers the jumping code has not already
        reloaded itself, then falls through to the blr.  The return value
        in rRTN is left untouched.  Callers of L(dureturn), L(dureturn26)
        and L(dureturn25) are outside this view.  */
 961         ld      rWORD8,-8(r1)
 962         ld      rWORD7,-16(r1)
 963 L(dureturn29):
 964         ld      r29,-24(r1)
 965         ld      r28,-32(r1)
 966 L(dureturn27):
 967         ld      r27,-40(r1)
 968 L(dureturn26):
 969         ld      r26,-48(r1)
 970 L(dureturn25):
 971         ld      r25,-56(r1)
 972 L(dureturn24):
 973         ld      r24,-64(r1)
 974         blr
 975 L(duzeroLength):
     /* Return 0 without reloading any registers.  The branch to this label
        is outside this view; presumably it is taken before the higher
        numbered registers have been saved -- TODO confirm against the
        caller.  */
 976         li      rRTN,0
 977         blr
 978
 979 END (BP_SYM (memcmp))   /* closes the function opened by EALIGN (BP_SYM(memcmp), 4, 0) */
 980 libc_hidden_builtin_def (memcmp)        /* NOTE(review): macro defined elsewhere; presumably sets up the hidden internal alias for memcmp */
 981 weak_alias (memcmp, bcmp)               /* NOTE(review): macro defined elsewhere; presumably makes bcmp a weak alias of memcmp */