powerpc-cpu/sysdeps/powerpc/powerpc32/power4/memcmp.S

   1 /* Optimized memcmp implementation for PowerPC32.
   2    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
  25
  26 EALIGN (BP_SYM(memcmp), 4, 0)
  27         CALL_MCOUNT
  28
  29 #define rTMP    r0      /* scratch register */
  30 #define rRTN    r3      /* return value; same register as rSTR1 */
  31 #define rSTR1   r3      /* first string arg */
  32 #define rSTR2   r4      /* second string arg */
  33 #define rN      r5      /* max string length */
  34 #define rWORD1  r6      /* current word in s1 */
  35 #define rWORD2  r7      /* current word in s2 */
  36 #define rWORD3  r8      /* next word in s1 */
  37 #define rWORD4  r9      /* next word in s2 */
  38 #define rWORD5  r10     /* next word in s1 */
  39 #define rWORD6  r11     /* next word in s2 */
  40 #define rBITDIF r12     /* bits that differ in s1 & s2 words */
  41 #define rWORD7  r30     /* next word in s1 */
  42 #define rWORD8  r31     /* next word in s2 */
  43
  44         xor     rTMP, rSTR2, rSTR1      /* Address bits that differ.  */
  45         cmplwi  cr6, rN, 0              /* cr6: length == 0?  */
  46         cmplwi  cr1, rN, 12             /* cr1: length vs. 12.  */
  47         clrlwi. rTMP, rTMP, 30          /* cr0: low 2 address bits equal?  */
  48         clrlwi  rBITDIF, rSTR1, 30      /* Low 2 bits of rSTR1.  */
  49         cmplwi  cr5, rBITDIF, 0         /* cr5: rSTR1 word aligned?  */
  50         beq-    cr6, L(zeroLength)
  51         dcbt    0,rSTR1
  52         dcbt    0,rSTR2
  53 /* If the length is less than 12 bytes, use the byte loop.  Strings
  54    with differing alignment are handled at L(unaligned) below.  */
  55         blt     cr1, L(bytealigned)
  56         stwu    1,-64(1)                /* Allocate a 64-byte stack frame.  */
  57         cfi_adjust_cfa_offset(64)
  58         stw     r31,48(1)               /* Save r31 (rWORD8).  */
  59         cfi_offset(31,(48-64))
  60         stw     r30,44(1)               /* Save r30 (rWORD7).  */
  61         cfi_offset(30,(44-64))
  62         bne     L(unaligned)            /* cr0 still from clrlwi. above.  */
  63 /* At this point we know both strings have the same alignment and the
  64    compare length is at least 12 bytes.  rBITDIF contains the low order
  65    2 bits of rSTR1 and cr5 contains the result of the logical compare
  66    of rBITDIF to 0.  If rBITDIF == 0 then we are already word
  67    aligned and can perform the word aligned loop.
  68
  69    Otherwise we know the two strings have the same alignment (but not
  70    yet word aligned).  So we force the string addresses to the next lower
  71    word boundary and special case this first word using shift left to
  72    eliminate bits preceding the first byte.  Since we want to join the
  73    normal (word aligned) compare loop, starting at the second word,
  74    we need to adjust the length (rN) and special case the loop
  75    versioning for the first word. This ensures that the loop count is
  76    correct and the first word (shifted) is in the expected register pair. */
  77         .align 4
  78 L(samealignment):
  79         clrrwi  rSTR1, rSTR1, 2         /* Round down to a word boundary.  */
  80         clrrwi  rSTR2, rSTR2, 2
  81         beq     cr5, L(Waligned)        /* rSTR1 was already word aligned.  */
  82         add     rN, rN, rBITDIF         /* Length now counts the leading pad.  */
  83         slwi    r11, rBITDIF, 3         /* r11 = leading pad in bits.  */
  84         srwi    rTMP, rN, 4      /* Divide by 16 */
  85         andi.   rBITDIF, rN, 12  /* Get the word remainder */
  86         lwz     rWORD1, 0(rSTR1)
  87         lwz     rWORD2, 0(rSTR2)
  88         cmplwi  cr1, rBITDIF, 8         /* cr1: word remainder vs. 8.  */
  89         cmplwi  cr7, rN, 16             /* cr7: adjusted length vs. 16.  */
  90         clrlwi  rN, rN, 30              /* rN = trailing byte count (0-3).  */
  91         beq     L(dPs4)                 /* cr0 from andi.: remainder is 0.  */
  92         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
  93         bgt     cr1, L(dPs3)
  94         beq     cr1, L(dPs2)
  95
  96 /* Remainder is 4 */
  97         .align 3
  98 L(dsP1):
  99         slw     rWORD5, rWORD1, r11     /* Shift out the leading pad bytes.  */
 100         slw     rWORD6, rWORD2, r11     /* rWORD6 is r11: count is consumed.  */
 101         cmplw   cr5, rWORD5, rWORD6
 102         blt     cr7, L(dP1x)
 103 /* Do something useful in this cycle since we have to branch anyway.  */
 104         lwz     rWORD1, 4(rSTR1)
 105         lwz     rWORD2, 4(rSTR2)
 106         cmplw   cr0, rWORD1, rWORD2
 107         b       L(dP1e)
 108 /* Remainder is 8 */
 109         .align 4
 110 L(dPs2):
 111         slw     rWORD5, rWORD1, r11     /* Shift out the leading pad bytes.  */
 112         slw     rWORD6, rWORD2, r11     /* rWORD6 is r11: count is consumed.  */
 113         cmplw   cr6, rWORD5, rWORD6
 114         blt     cr7, L(dP2x)
 115 /* Do something useful in this cycle since we have to branch anyway.  */
 116         lwz     rWORD7, 4(rSTR1)
 117         lwz     rWORD8, 4(rSTR2)
 118         cmplw   cr5, rWORD7, rWORD8
 119         b       L(dP2e)
 120 /* Remainder is 12 */
 121         .align 4
 122 L(dPs3):
 123         slw     rWORD3, rWORD1, r11     /* Shift out the leading pad bytes.  */
 124         slw     rWORD4, rWORD2, r11
 125         cmplw   cr1, rWORD3, rWORD4
 126         b       L(dP3e)
 127 /* Count is a multiple of 16, remainder is 0 */
 128         .align 4
 129 L(dPs4):
 130         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 131         slw     rWORD1, rWORD1, r11     /* Shift out the leading pad bytes.  */
 132         slw     rWORD2, rWORD2, r11
 133         cmplw   cr0, rWORD1, rWORD2
 134         b       L(dP4e)
 135
 136 /* At this point we know both strings are word aligned and the
 137    compare length is at least 12 bytes.  */
 138         .align 4
 139 L(Waligned):
 140         andi.   rBITDIF, rN, 12  /* Get the word remainder */
 141         srwi    rTMP, rN, 4      /* Divide by 16 */
 142         cmplwi  cr1, rBITDIF, 8         /* cr1: word remainder vs. 8.  */
 143         cmplwi  cr7, rN, 16             /* cr7: length vs. 16.  */
 144         clrlwi  rN, rN, 30              /* rN = trailing byte count (0-3).  */
 145         beq     L(dP4)                  /* cr0 from andi.: remainder is 0.  */
 146         bgt     cr1, L(dP3)             /* Remainder is 12.  */
 147         beq     cr1, L(dP2)             /* Remainder is 8.  */
 148
 149 /* Remainder is 4 */
 150         .align 4
 151 L(dP1):
 152         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 153 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
 154    (rN below 16), we want to use only volatile registers.  This
 155    means we can avoid restoring non-volatile registers since we did not
 156    change any on the early exit path.  The key here is the non-early
 157    exit path only cares about the condition code (cr5), not about which
 158    register pair was used.  */
 159         lwz     rWORD5, 0(rSTR1)
 160         lwz     rWORD6, 0(rSTR2)
 161         cmplw   cr5, rWORD5, rWORD6
 162         blt     cr7, L(dP1x)
 163         lwz     rWORD1, 4(rSTR1)
 164         lwz     rWORD2, 4(rSTR2)
 165         cmplw   cr0, rWORD1, rWORD2
 166 L(dP1e):
 167         lwz     rWORD3, 8(rSTR1)
 168         lwz     rWORD4, 8(rSTR2)
 169         cmplw   cr1, rWORD3, rWORD4
 170         lwz     rWORD5, 12(rSTR1)
 171         lwz     rWORD6, 12(rSTR2)
 172         cmplw   cr6, rWORD5, rWORD6
 173         bne     cr5, L(dLcr5)
 174         bne     cr0, L(dLcr0)
 175
 176         lwzu    rWORD7, 16(rSTR1)       /* Load and advance rSTR1 by 16.  */
 177         lwzu    rWORD8, 16(rSTR2)       /* Load and advance rSTR2 by 16.  */
 178         bne     cr1, L(dLcr1)
 179         cmplw   cr5, rWORD7, rWORD8
 180         bdnz    L(dLoop)
 181         bne     cr6, L(dLcr6)
 182         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 183         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 184         .align 3
 185 L(dP1x):
 186         slwi.   r12, rN, 3              /* r12 = trailing bytes * 8; sets cr0.  */
 187         bne     cr5, L(dLcr5)
 188         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 189         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 190         bne     L(d00)                  /* cr0 from slwi.: bytes remain.  */
 191         li      rRTN, 0
 192         blr
 193
 194 /* Remainder is 8 */
 195         .align 4
 196 L(dP2):
 197         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 198         lwz     rWORD5, 0(rSTR1)
 199         lwz     rWORD6, 0(rSTR2)
 200         cmplw   cr6, rWORD5, rWORD6
 201         blt     cr7, L(dP2x)
 202         lwz     rWORD7, 4(rSTR1)
 203         lwz     rWORD8, 4(rSTR2)
 204         cmplw   cr5, rWORD7, rWORD8
 205 L(dP2e):
 206         lwz     rWORD1, 8(rSTR1)
 207         lwz     rWORD2, 8(rSTR2)
 208         cmplw   cr0, rWORD1, rWORD2
 209         lwz     rWORD3, 12(rSTR1)
 210         lwz     rWORD4, 12(rSTR2)
 211         cmplw   cr1, rWORD3, rWORD4
 212         addi    rSTR1, rSTR1, 4
 213         addi    rSTR2, rSTR2, 4
 214         bne     cr6, L(dLcr6)
 215         bne     cr5, L(dLcr5)
 216         b       L(dLoop2)
 217 /* Again we are on an early exit path (rN below 16 with a word
 218    remainder of 8), we want to only use volatile registers and avoid
 219    restoring non-volatile registers.  */
 220         .align 4
 221 L(dP2x):
 222         lwz     rWORD3, 4(rSTR1)
 223         lwz     rWORD4, 4(rSTR2)
 224         cmplw   cr5, rWORD3, rWORD4
 225         slwi.   r12, rN, 3              /* r12 = trailing bytes * 8; sets cr0.  */
 226         bne     cr6, L(dLcr6)
 227         addi    rSTR1, rSTR1, 4         /* So L(d00) loads the next word.  */
 228         addi    rSTR2, rSTR2, 4
 229         bne     cr5, L(dLcr5)
 230         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 231         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 232         bne     L(d00)                  /* cr0 from slwi.: bytes remain.  */
 233         li      rRTN, 0
 234         blr
 235
 236 /* Remainder is 12 */
 237         .align 4
 238 L(dP3):
 239         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 240         lwz     rWORD3, 0(rSTR1)
 241         lwz     rWORD4, 0(rSTR2)
 242         cmplw   cr1, rWORD3, rWORD4
 243 L(dP3e):
 244         lwz     rWORD5, 4(rSTR1)
 245         lwz     rWORD6, 4(rSTR2)
 246         cmplw   cr6, rWORD5, rWORD6
 247         blt     cr7, L(dP3x)
 248         lwz     rWORD7, 8(rSTR1)
 249         lwz     rWORD8, 8(rSTR2)
 250         cmplw   cr5, rWORD7, rWORD8
 251         lwz     rWORD1, 12(rSTR1)
 252         lwz     rWORD2, 12(rSTR2)
 253         cmplw   cr0, rWORD1, rWORD2
 254         addi    rSTR1, rSTR1, 8
 255         addi    rSTR2, rSTR2, 8
 256         bne     cr1, L(dLcr1)
 257         bne     cr6, L(dLcr6)
 258         b       L(dLoop1)
 259 /* Again we are on an early exit path (rN below 16 with a word
 260    remainder of 12), we want to only use volatile registers and avoid
 261    restoring non-volatile registers.  */
 262         .align 4
 263 L(dP3x):
 264         lwz     rWORD1, 8(rSTR1)
 265         lwz     rWORD2, 8(rSTR2)
 266         cmplw   cr5, rWORD1, rWORD2
 267         slwi.   r12, rN, 3              /* r12 = trailing bytes * 8; sets cr0.  */
 268         bne     cr1, L(dLcr1)
 269         addi    rSTR1, rSTR1, 8         /* So L(d00) loads the next word.  */
 270         addi    rSTR2, rSTR2, 8
 271         bne     cr6, L(dLcr6)
 272         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 273         bne     cr5, L(dLcr5)
 274         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 275         bne     L(d00)                  /* cr0 from slwi.: bytes remain.  */
 276         li      rRTN, 0
 277         blr
 278
 279 /* Count is a multiple of 16, remainder is 0 */
 280         .align 4
 281 L(dP4):
 282         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 283         lwz     rWORD1, 0(rSTR1)
 284         lwz     rWORD2, 0(rSTR2)
 285         cmplw   cr0, rWORD1, rWORD2
 286 L(dP4e):
 287         lwz     rWORD3, 4(rSTR1)
 288         lwz     rWORD4, 4(rSTR2)
 289         cmplw   cr1, rWORD3, rWORD4
 290         lwz     rWORD5, 8(rSTR1)
 291         lwz     rWORD6, 8(rSTR2)
 292         cmplw   cr6, rWORD5, rWORD6
 293         lwzu    rWORD7, 12(rSTR1)       /* Load and advance rSTR1 by 12.  */
 294         lwzu    rWORD8, 12(rSTR2)       /* Load and advance rSTR2 by 12.  */
 295         cmplw   cr5, rWORD7, rWORD8
 296         bne     cr0, L(dLcr0)
 297         bne     cr1, L(dLcr1)
 298         bdz-    L(d24)          /* Adjust CTR as we start with +4 */
 299 /* This is the primary loop */
 300         .align 4
 301 L(dLoop):
 302         lwz     rWORD1, 4(rSTR1)
 303         lwz     rWORD2, 4(rSTR2)
 304         cmplw   cr1, rWORD3, rWORD4
 305         bne     cr6, L(dLcr6)
 306 L(dLoop1):
 307         lwz     rWORD3, 8(rSTR1)
 308         lwz     rWORD4, 8(rSTR2)
 309         cmplw   cr6, rWORD5, rWORD6
 310         bne     cr5, L(dLcr5)
 311 L(dLoop2):
 312         lwz     rWORD5, 12(rSTR1)
 313         lwz     rWORD6, 12(rSTR2)
 314         cmplw   cr5, rWORD7, rWORD8
 315         bne     cr0, L(dLcr0)
 316 L(dLoop3):
 317         lwzu    rWORD7, 16(rSTR1)       /* Load and advance rSTR1 by 16.  */
 318         lwzu    rWORD8, 16(rSTR2)       /* Load and advance rSTR2 by 16.  */
 319         bne-    cr1, L(dLcr1)
 320         cmplw   cr0, rWORD1, rWORD2
 321         bdnz+   L(dLoop)
 322
 323 L(dL4):
 324         cmplw   cr1, rWORD3, rWORD4
 325         bne     cr6, L(dLcr6)
 326         cmplw   cr6, rWORD5, rWORD6
 327         bne     cr5, L(dLcr5)
 328         cmplw   cr5, rWORD7, rWORD8
 329 L(d44):
 330         bne     cr0, L(dLcr0)
 331 L(d34):
 332         bne     cr1, L(dLcr1)
 333 L(d24):
 334         bne     cr6, L(dLcr6)
 335 L(d14):
 336         slwi.   r12, rN, 3              /* r12 = trailing bytes * 8; sets cr0.  */
 337         bne     cr5, L(dLcr5)
 338 L(d04):
 339         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 340         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 341         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 342         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 343         beq     L(zeroLength)           /* cr0 from slwi.: nothing remains.  */
 344 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
 345    we are aligned it is safe to load the whole word, and use
 346    shift right to eliminate bits beyond the compare length. */
 347 L(d00):
 348         lwz     rWORD1, 4(rSTR1)
 349         lwz     rWORD2, 4(rSTR2)
 350         srw     rWORD1, rWORD1, rN
 351         srw     rWORD2, rWORD2, rN
 352         cmplw   rWORD1,rWORD2
 353         li      rRTN,0
 354         beqlr                           /* Equal: return 0.  */
 355         li      rRTN,1
 356         bgtlr                           /* s1 word is greater: return 1.  */
 357         li      rRTN,-1
 358         blr
 359
 360         .align 4
 361 L(dLcr0):                               /* Words compared in cr0 differ.  */
 362         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 363         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 364         li      rRTN, 1
 365         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 366         bgtlr   cr0                     /* s1 word is greater: return 1.  */
 367         li      rRTN, -1
 368         blr
 369         .align 4
 370 L(dLcr1):                               /* Words compared in cr1 differ.  */
 371         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 372         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 373         li      rRTN, 1
 374         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 375         bgtlr   cr1                     /* s1 word is greater: return 1.  */
 376         li      rRTN, -1
 377         blr
 378         .align 4
 379 L(dLcr6):                               /* Words compared in cr6 differ.  */
 380         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 381         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 382         li      rRTN, 1
 383         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 384         bgtlr   cr6                     /* s1 word is greater: return 1.  */
 385         li      rRTN, -1
 386         blr
 387         .align 4
 388 L(dLcr5):                               /* Words compared in cr5 differ.  */
 389         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 390         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 391 L(dLcr5x):
 392         li      rRTN, 1
 393         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 394         bgtlr   cr5                     /* s1 word is greater: return 1.  */
 395         li      rRTN, -1
 396         blr
 397
 398         .align 4
 399 L(bytealigned):
 400         cfi_adjust_cfa_offset(-64)
 401         mtctr   rN      /* Power4 wants mtctr 1st in dispatch group */
 402
 403 /* We need to prime this loop.  This loop is swing modulo scheduled
 404    to avoid pipe delays.  The dependent instruction latencies (load to
 405    compare to conditional branch) is 2 to 3 cycles.  In this loop each
 406    dispatch group ends in a branch and takes 1 cycle.  Effectively
 407    the first iteration of the loop only serves to load operands and
 408    branches based on compares are delayed until the next loop.
 409
 410    So we must precondition some registers and condition codes so that
 411    we don't exit the loop early on the first iteration.  */
 412
 413         lbz     rWORD1, 0(rSTR1)
 414         lbz     rWORD2, 0(rSTR2)
 415         bdz-    L(b11)                  /* Length was 1.  */
 416         cmplw   cr0, rWORD1, rWORD2
 417         lbz     rWORD3, 1(rSTR1)
 418         lbz     rWORD4, 1(rSTR2)
 419         bdz-    L(b12)                  /* Length was 2.  */
 420         cmplw   cr1, rWORD3, rWORD4
 421         lbzu    rWORD5, 2(rSTR1)
 422         lbzu    rWORD6, 2(rSTR2)
 423         bdz-    L(b13)                  /* Length was 3.  */
 424         .align 4
 425 L(bLoop):
 426         lbzu    rWORD1, 1(rSTR1)
 427         lbzu    rWORD2, 1(rSTR2)
 428         bne-    cr0, L(bLcr0)
 429
 430         cmplw   cr6, rWORD5, rWORD6
 431         bdz-    L(b3i)
 432
 433         lbzu    rWORD3, 1(rSTR1)
 434         lbzu    rWORD4, 1(rSTR2)
 435         bne-    cr1, L(bLcr1)
 436
 437         cmplw   cr0, rWORD1, rWORD2
 438         bdz-    L(b2i)
 439
 440         lbzu    rWORD5, 1(rSTR1)
 441         lbzu    rWORD6, 1(rSTR2)
 442         bne-    cr6, L(bLcr6)
 443
 444         cmplw   cr1, rWORD3, rWORD4
 445         bdnz+   L(bLoop)
 446
 447 /* We speculatively load bytes before we have tested the previous
 448    bytes.  But we must avoid overrunning the length (in the ctr) to
 449    prevent these speculative loads from causing a segfault.  In this
 450    case the loop will exit early (before all the pending bytes are
 451    tested).  In this case we must complete the pending operations
 452    before returning.  */
 453 L(b1i):
 454         bne-    cr0, L(bLcr0)
 455         bne-    cr1, L(bLcr1)
 456         b       L(bx56)
 457         .align 4
 458 L(b2i):
 459         bne-    cr6, L(bLcr6)
 460         bne-    cr0, L(bLcr0)
 461         b       L(bx34)
 462         .align 4
 463 L(b3i):
 464         bne-    cr1, L(bLcr1)
 465         bne-    cr6, L(bLcr6)
 466         b       L(bx12)
 467         .align 4
 468 L(bLcr0):                               /* Bytes compared in cr0 differ.  */
 469         li      rRTN, 1
 470         bgtlr   cr0                     /* s1 byte is greater: return 1.  */
 471         li      rRTN, -1
 472         blr
 473 L(bLcr1):                               /* Bytes compared in cr1 differ.  */
 474         li      rRTN, 1
 475         bgtlr   cr1                     /* s1 byte is greater: return 1.  */
 476         li      rRTN, -1
 477         blr
 478 L(bLcr6):                               /* Bytes compared in cr6 differ.  */
 479         li      rRTN, 1
 480         bgtlr   cr6                     /* s1 byte is greater: return 1.  */
 481         li      rRTN, -1
 482         blr
 483
 484 L(b13):
 485         bne-    cr0, L(bx12)
 486         bne-    cr1, L(bx34)
 487 L(bx56):
 488         sub     rRTN, rWORD5, rWORD6    /* Return the byte difference.  */
 489         blr
 490         nop
 491 L(b12):
 492         bne-    cr0, L(bx12)
 493 L(bx34):
 494         sub     rRTN, rWORD3, rWORD4    /* Return the byte difference.  */
 495         blr
 496
 497 L(b11):
 498 L(bx12):
 499         sub     rRTN, rWORD1, rWORD2    /* Return the byte difference.  */
 500         blr
 501
 502         .align 4
 503 L(zeroLengthReturn):
 504
 505 L(zeroLength):
 506         li      rRTN, 0
 507         blr
 508
 509         cfi_adjust_cfa_offset(64)
 510         .align 4
 511 /* At this point we know the strings have different alignment and the
 512    compare length is at least 12 bytes.  rBITDIF contains the low order
 513    2 bits of rSTR1 and cr5 contains the result of the logical compare
 514    of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is word aligned and can
 515    perform the Wunaligned loop.
 516
 517    Otherwise we know that rSTR1 is not already word aligned yet.
 518    So we can force the string addresses to the next lower word
 519    boundary and special case this first word using shift left to
 520    eliminate bits preceding the first byte.  Since we want to join the
 521    normal (Wunaligned) compare loop, starting at the second word,
 522    we need to adjust the length (rN) and special case the loop
 523    versioning for the first W. This ensures that the loop count is
 524    correct and the first W (shifted) is in the expected register pair.  */
 525 #define rSHL            r29     /* Unaligned shift left count.  */
 526 #define rSHR            r28     /* Unaligned shift right count.  */
 527 #define rB              r27     /* Left rotation temp for rWORD2.  */
 528 #define rD              r26     /* Left rotation temp for rWORD4.  */
 529 #define rF              r25     /* Left rotation temp for rWORD6.  */
 530 #define rH              r24     /* Left rotation temp for rWORD8.  */
 531 #define rA              r0      /* Right rotation temp for rWORD2.  */
 532 #define rC              r12     /* Right rotation temp for rWORD4.  */
 533 #define rE              r0      /* Right rotation temp for rWORD6.  */
 534 #define rG              r12     /* Right rotation temp for rWORD8.  */
 535 L(unaligned):
 536         stw     r29,40(r1)              /* Save r29 (rSHL).  */
 537         cfi_offset(r29,(40-64))
 538         clrlwi  rSHL, rSTR2, 30         /* Low 2 bits of rSTR2.  */
 539         stw     r28,36(r1)              /* Save r28 (rSHR).  */
 540         cfi_offset(r28,(36-64))
 541         beq     cr5, L(Wunaligned)      /* rSTR1 is word aligned.  */
 542         stw     r27,32(r1)              /* Save r27 (rB).  */
 543         cfi_offset(r27,(32-64))
 544 /* Adjust the logical start of rSTR2 to compensate for the extra bits
 545    in the 1st rSTR1 W.  */
 546         sub     r27, rSTR2, rBITDIF
 547 /* But do not attempt to address the W before that W that contains
 548    the actual start of rSTR2.  */
 549         clrrwi  rSTR2, rSTR2, 2
 550         stw     r26,28(r1)              /* Save r26 (rD).  */
 551         cfi_offset(r26,(28-64))
 552 /* Compute the left/right shift counts for the unalign rSTR2,
 553    compensating for the logical (W aligned) start of rSTR1.  */
 554         clrlwi  rSHL, r27, 30
 555         clrrwi  rSTR1, rSTR1, 2
 556         stw     r25,24(r1)              /* Save r25 (rF).  */
 557         cfi_offset(r25,(24-64))
 558         slwi    rSHL, rSHL, 3           /* Convert bytes to bits.  */
 559         cmplw   cr5, r27, rSTR2         /* cr5: logical start vs. 1st W.  */
 560         add     rN, rN, rBITDIF         /* Length now counts the leading pad.  */
 561         slwi    r11, rBITDIF, 3         /* r11 = leading pad in bits.  */
 562         stw     r24,20(r1)              /* Save r24 (rH).  */
 563         cfi_offset(r24,(20-64))
 564         subfic  rSHR, rSHL, 32          /* rSHR = 32 - rSHL.  */
 565         srwi    rTMP, rN, 4      /* Divide by 16 */
 566         andi.   rBITDIF, rN, 12  /* Get the W remainder */
 567 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
 568    this special case those bits may be discarded anyway.  Also we
 569    must avoid loading a W where none of the bits are part of rSTR2 as
 570    this may cross a page boundary and cause a page fault.  */
 571         li      rWORD8, 0
 572         blt     cr5, L(dus0)
 573         lwz     rWORD8, 0(rSTR2)
 574         la      rSTR2, 4(rSTR2)
 575         slw     rWORD8, rWORD8, rSHL
 576
 577 L(dus0):
 578         lwz     rWORD1, 0(rSTR1)
 579         lwz     rWORD2, 0(rSTR2)
 580         cmplwi  cr1, rBITDIF, 8         /* cr1: W remainder vs. 8.  */
 581         cmplwi  cr7, rN, 16             /* cr7: adjusted length vs. 16.  */
 582         srw     rG, rWORD2, rSHR
 583         clrlwi  rN, rN, 30              /* rN = trailing byte count (0-3).  */
 584         beq     L(duPs4)                /* cr0 from andi.: remainder is 0.  */
 585         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 586         or      rWORD8, rG, rWORD8
 587         bgt     cr1, L(duPs3)
 588         beq     cr1, L(duPs2)
 589
 590 /* Remainder is 4 */
 591         .align 4
 592 L(dusP1):
 593         slw     rB, rWORD2, rSHL
 594         slw     rWORD7, rWORD1, r11     /* Shift out the leading pad bytes.  */
 595         slw     rWORD8, rWORD8, r11
 596         bge     cr7, L(duP1e)
 597 /* At this point we exit early with the first word compare
 598    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
 599    how we handle the remaining bytes.  */
 600         cmplw   cr5, rWORD7, rWORD8
 601         slwi.   rN, rN, 3               /* rN = trailing bits; sets cr0.  */
 602         bne     cr5, L(duLcr5)
 603         cmplw   cr7, rN, rSHR
 604         beq     L(duZeroReturn)         /* cr0 from slwi.: nothing remains.  */
 605         li      rA, 0
 606         ble     cr7, L(dutrim)
 607         lwz     rWORD2, 4(rSTR2)
 608         srw     rA, rWORD2, rSHR
 609         b       L(dutrim)
 610 /* Remainder is 8 */
 611         .align 4
 612 L(duPs2):
 613         slw     rH, rWORD2, rSHL
 614         slw     rWORD5, rWORD1, r11     /* Shift out the leading pad bytes.  */
 615         slw     rWORD6, rWORD8, r11     /* rWORD6 is r11: count is consumed.  */
 616         b       L(duP2e)
 617 /* Remainder is 12 */
 618         .align 4
 619 L(duPs3):
 620         slw     rF, rWORD2, rSHL
 621         slw     rWORD3, rWORD1, r11     /* Shift out the leading pad bytes.  */
 622         slw     rWORD4, rWORD8, r11
 623         b       L(duP3e)
 624 /* Count is a multiple of 16, remainder is 0 */
 625         .align 4
 626 L(duPs4):
 627         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 628         or      rWORD8, rG, rWORD8
 629         slw     rD, rWORD2, rSHL
 630         slw     rWORD1, rWORD1, r11     /* Shift out the leading pad bytes.  */
 631         slw     rWORD2, rWORD8, r11
 632         b       L(duP4e)
 633
 634 /* At this point we know rSTR1 is word aligned and the
 635    compare length is at least 12 bytes.  */
 636         .align 4
 637 L(Wunaligned):
 638         stw     r27,32(r1)              /* Save r27 (rB).  */
 639         cfi_offset(r27,(32-64))
 640         clrrwi  rSTR2, rSTR2, 2         /* Round rSTR2 down to a W boundary.  */
 641         stw     r26,28(r1)              /* Save r26 (rD).  */
 642         cfi_offset(r26,(28-64))
 643         srwi    rTMP, rN, 4      /* Divide by 16 */
 644         stw     r25,24(r1)              /* Save r25 (rF).  */
 645         cfi_offset(r25,(24-64))
 646         andi.   rBITDIF, rN, 12  /* Get the W remainder */
 647         stw     r24,20(r1)              /* Save r24 (rH) at offset 20.  */
 648         cfi_offset(r24,(20-64))
 649         slwi    rSHL, rSHL, 3           /* Convert bytes to bits.  */
 650         lwz     rWORD6, 0(rSTR2)
 651         lwzu    rWORD8, 4(rSTR2)        /* Load and advance rSTR2 by 4.  */
 652         cmplwi  cr1, rBITDIF, 8         /* cr1: W remainder vs. 8.  */
 653         cmplwi  cr7, rN, 16             /* cr7: length vs. 16.  */
 654         clrlwi  rN, rN, 30              /* rN = trailing byte count (0-3).  */
 655         subfic  rSHR, rSHL, 32          /* rSHR = 32 - rSHL.  */
 656         slw     rH, rWORD6, rSHL
 657         beq     L(duP4)                 /* cr0 from andi.: remainder is 0.  */
 658         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 659         bgt     cr1, L(duP3)
 660         beq     cr1, L(duP2)
 661
 662 /* Remainder is 4 */
 663         .align 4
 664 L(duP1):
 665         srw     rG, rWORD8, rSHR
 666         lwz     rWORD7, 0(rSTR1)
 667         slw     rB, rWORD8, rSHL
 668         or      rWORD8, rG, rH          /* Merge the two s2 pieces.  */
 669         blt     cr7, L(duP1x)
 670 L(duP1e):
 671         lwz     rWORD1, 4(rSTR1)
 672         lwz     rWORD2, 4(rSTR2)
 673         cmplw   cr5, rWORD7, rWORD8
 674         srw     rA, rWORD2, rSHR
 675         slw     rD, rWORD2, rSHL
 676         or      rWORD2, rA, rB          /* Merge the two s2 pieces.  */
 677         lwz     rWORD3, 8(rSTR1)
 678         lwz     rWORD4, 8(rSTR2)
 679         cmplw   cr0, rWORD1, rWORD2
 680         srw     rC, rWORD4, rSHR
 681         slw     rF, rWORD4, rSHL
 682         bne     cr5, L(duLcr5)
 683         or      rWORD4, rC, rD          /* Merge the two s2 pieces.  */
 684         lwz     rWORD5, 12(rSTR1)
 685         lwz     rWORD6, 12(rSTR2)
 686         cmplw   cr1, rWORD3, rWORD4
 687         srw     rE, rWORD6, rSHR
 688         slw     rH, rWORD6, rSHL
 689         bne     cr0, L(duLcr0)
 690         or      rWORD6, rE, rF          /* Merge the two s2 pieces.  */
 691         cmplw   cr6, rWORD5, rWORD6
 692         b       L(duLoop3)
 693         .align 4
 694 /* At this point we exit early with the first word compare
 695    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
 696    how we handle the remaining bytes.  */
 697 L(duP1x):
 698         cmplw   cr5, rWORD7, rWORD8
 699         slwi.   rN, rN, 3               /* rN = trailing bits; sets cr0.  */
 700         bne     cr5, L(duLcr5)
 701         cmplw   cr7, rN, rSHR
 702         beq     L(duZeroReturn)         /* cr0 from slwi.: nothing remains.  */
 703         li      rA, 0
 704         ble     cr7, L(dutrim)          /* All needed bits are already in rB.  */
 705         lwz     rWORD2, 4(rSTR2)        /* Next 32-bit W of s2 (not ld/8).  */
 706         srw     rA, rWORD2, rSHR
 707         b       L(dutrim)
 708 /* Remainder is 8 */
 709         .align 4
 710 L(duP2):
 711         srw     rE, rWORD8, rSHR
 712         lwz     rWORD5, 0(rSTR1)
 713         or      rWORD6, rE, rH          /* Merge the two s2 pieces.  */
 714         slw     rH, rWORD8, rSHL
 715 L(duP2e):
 716         lwz     rWORD7, 4(rSTR1)
 717         lwz     rWORD8, 4(rSTR2)
 718         cmplw   cr6, rWORD5, rWORD6
 719         srw     rG, rWORD8, rSHR
 720         slw     rB, rWORD8, rSHL
 721         or      rWORD8, rG, rH          /* Merge the two s2 pieces.  */
 722         blt     cr7, L(duP2x)
 723         lwz     rWORD1, 8(rSTR1)
 724         lwz     rWORD2, 8(rSTR2)
 725         cmplw   cr5, rWORD7, rWORD8
 726         bne     cr6, L(duLcr6)
 727         srw     rA, rWORD2, rSHR
 728         slw     rD, rWORD2, rSHL
 729         or      rWORD2, rA, rB          /* Merge the two s2 pieces.  */
 730         lwz     rWORD3, 12(rSTR1)
 731         lwz     rWORD4, 12(rSTR2)
 732         cmplw   cr0, rWORD1, rWORD2
 733         bne     cr5, L(duLcr5)
 734         srw     rC, rWORD4, rSHR
 735         slw     rF, rWORD4, rSHL
 736         or      rWORD4, rC, rD          /* Merge the two s2 pieces.  */
 737         addi    rSTR1, rSTR1, 4
 738         addi    rSTR2, rSTR2, 4
 739         cmplw   cr1, rWORD3, rWORD4
 740         b       L(duLoop2)
 741         .align 4
 742 L(duP2x):
 743         cmplw   cr5, rWORD7, rWORD8
 744         addi    rSTR1, rSTR1, 4         /* So L(dutrim) loads the next W.  */
 745         addi    rSTR2, rSTR2, 4
 746         bne     cr6, L(duLcr6)
 747         slwi.   rN, rN, 3               /* rN = trailing bits; sets cr0.  */
 748         bne     cr5, L(duLcr5)
 749         cmplw   cr7, rN, rSHR
 750         beq     L(duZeroReturn)         /* cr0 from slwi.: nothing remains.  */
 751         li      rA, 0
 752         ble     cr7, L(dutrim)
 753         lwz     rWORD2, 4(rSTR2)
 754         srw     rA, rWORD2, rSHR
 755         b       L(dutrim)
 756
 757 /* Remainder is 12 */
 758         .align 4
 759 L(duP3):
 760         srw     rC, rWORD8, rSHR
 761         lwz     rWORD3, 0(rSTR1)
 762         slw     rF, rWORD8, rSHL
 763         or      rWORD4, rC, rH          /* Merge the two s2 pieces.  */
 764 L(duP3e):
 765         lwz     rWORD5, 4(rSTR1)
 766         lwz     rWORD6, 4(rSTR2)
 767         cmplw   cr1, rWORD3, rWORD4
 768         srw     rE, rWORD6, rSHR
 769         slw     rH, rWORD6, rSHL
 770         or      rWORD6, rE, rF          /* Merge the two s2 pieces.  */
 771         lwz     rWORD7, 8(rSTR1)
 772         lwz     rWORD8, 8(rSTR2)
 773         cmplw   cr6, rWORD5, rWORD6
 774         bne     cr1, L(duLcr1)
 775         srw     rG, rWORD8, rSHR
 776         slw     rB, rWORD8, rSHL
 777         or      rWORD8, rG, rH          /* Merge the two s2 pieces.  */
 778         blt     cr7, L(duP3x)
 779         lwz     rWORD1, 12(rSTR1)
 780         lwz     rWORD2, 12(rSTR2)
 781         cmplw   cr5, rWORD7, rWORD8
 782         bne     cr6, L(duLcr6)
 783         srw     rA, rWORD2, rSHR
 784         slw     rD, rWORD2, rSHL
 785         or      rWORD2, rA, rB          /* Merge the two s2 pieces.  */
 786         addi    rSTR1, rSTR1, 8
 787         addi    rSTR2, rSTR2, 8
 788         cmplw   cr0, rWORD1, rWORD2
 789         b       L(duLoop1)
 790         .align 4
 791 L(duP3x):
 792         addi    rSTR1, rSTR1, 8         /* So L(dutrim) loads the next W.  */
 793         addi    rSTR2, rSTR2, 8
 794         bne     cr1, L(duLcr1)
 795         cmplw   cr5, rWORD7, rWORD8
 796         bne     cr6, L(duLcr6)
 797         slwi.   rN, rN, 3               /* rN = trailing bits; sets cr0.  */
 798         bne     cr5, L(duLcr5)
 799         cmplw   cr7, rN, rSHR
 800         beq     L(duZeroReturn)         /* cr0 from slwi.: nothing remains.  */
 801         li      rA, 0
 802         ble     cr7, L(dutrim)
 803         lwz     rWORD2, 4(rSTR2)
 804         srw     rA, rWORD2, rSHR
 805         b       L(dutrim)
 806
 807 /* Count is a multiple of 16, remainder is 0 */
 808         .align 4
 809 L(duP4):
 810         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 811         srw     rA, rWORD8, rSHR
 812         lwz     rWORD1, 0(rSTR1)
 813         slw     rD, rWORD8, rSHL
 814         or      rWORD2, rA, rH          /* Merge the two s2 pieces.  */
 815 L(duP4e):
 816         lwz     rWORD3, 4(rSTR1)
 817         lwz     rWORD4, 4(rSTR2)
 818         cmplw   cr0, rWORD1, rWORD2
 819         srw     rC, rWORD4, rSHR
 820         slw     rF, rWORD4, rSHL
 821         or      rWORD4, rC, rD          /* Merge the two s2 pieces.  */
 822         lwz     rWORD5, 8(rSTR1)
 823         lwz     rWORD6, 8(rSTR2)
 824         cmplw   cr1, rWORD3, rWORD4
 825         bne     cr0, L(duLcr0)
 826         srw     rE, rWORD6, rSHR
 827         slw     rH, rWORD6, rSHL
 828         or      rWORD6, rE, rF          /* Merge the two s2 pieces.  */
 829         lwzu    rWORD7, 12(rSTR1)       /* Load and advance rSTR1 by 12.  */
 830         lwzu    rWORD8, 12(rSTR2)       /* Load and advance rSTR2 by 12.  */
 831         cmplw   cr6, rWORD5, rWORD6
 832         bne     cr1, L(duLcr1)
 833         srw     rG, rWORD8, rSHR
 834         slw     rB, rWORD8, rSHL
 835         or      rWORD8, rG, rH          /* Merge the two s2 pieces.  */
 836         cmplw   cr5, rWORD7, rWORD8
 837         bdz-    L(du24)         /* Adjust CTR as we start with +4 */
 838 /* This is the primary loop */
 839         .align 4
 840 L(duLoop):
 841         lwz     rWORD1, 4(rSTR1)
 842         lwz     rWORD2, 4(rSTR2)
 843         cmplw   cr1, rWORD3, rWORD4
 844         bne     cr6, L(duLcr6)
 845         srw     rA, rWORD2, rSHR
 846         slw     rD, rWORD2, rSHL
 847         or      rWORD2, rA, rB          /* Merge the two s2 pieces.  */
 848 L(duLoop1):
 849         lwz     rWORD3, 8(rSTR1)
 850         lwz     rWORD4, 8(rSTR2)
 851         cmplw   cr6, rWORD5, rWORD6
 852         bne     cr5, L(duLcr5)
 853         srw     rC, rWORD4, rSHR
 854         slw     rF, rWORD4, rSHL
 855         or      rWORD4, rC, rD          /* Merge the two s2 pieces.  */
 856 L(duLoop2):
 857         lwz     rWORD5, 12(rSTR1)
 858         lwz     rWORD6, 12(rSTR2)
 859         cmplw   cr5, rWORD7, rWORD8
 860         bne     cr0, L(duLcr0)
 861         srw     rE, rWORD6, rSHR
 862         slw     rH, rWORD6, rSHL
 863         or      rWORD6, rE, rF          /* Merge the two s2 pieces.  */
 864 L(duLoop3):
 865         lwzu    rWORD7, 16(rSTR1)       /* Load and advance rSTR1 by 16.  */
 866         lwzu    rWORD8, 16(rSTR2)       /* Load and advance rSTR2 by 16.  */
 867         cmplw   cr0, rWORD1, rWORD2
 868         bne-    cr1, L(duLcr1)
 869         srw     rG, rWORD8, rSHR
 870         slw     rB, rWORD8, rSHL
 871         or      rWORD8, rG, rH          /* Merge the two s2 pieces.  */
 872         bdnz+   L(duLoop)
 873
 874 L(duL4):
 875         bne     cr1, L(duLcr1)
 876         cmplw   cr1, rWORD3, rWORD4
 877         bne     cr6, L(duLcr6)
 878         cmplw   cr6, rWORD5, rWORD6
 879         bne     cr5, L(duLcr5)
 880         cmplw   cr5, rWORD7, rWORD8
 881 L(du44):
 882         bne     cr0, L(duLcr0)
 883 L(du34):
 884         bne     cr1, L(duLcr1)
 885 L(du24):
 886         bne     cr6, L(duLcr6)
 887 L(du14):
 888         slwi.   rN, rN, 3               /* rN = trailing bits; sets cr0.  */
 889         bne     cr5, L(duLcr5)
 890 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
 891    shift right to eliminate bits beyond the compare length.
 892
 893    However it may not be safe to load rWORD2 which may be beyond the
 894    string length. So we compare the bit length of the remainder to
 895    the right shift count (rSHR). If the bit count is less than or equal
 896    we do not need to load rWORD2 (all significant bits are already in
 897    rB).  */
 898         cmplw   cr7, rN, rSHR
 899         beq     L(duZeroReturn)         /* cr0 from slwi.: nothing remains.  */
 900         li      rA, 0
 901         ble     cr7, L(dutrim)
 902         lwz     rWORD2, 4(rSTR2)
 903         srw     rA, rWORD2, rSHR
 904         .align 4
 905 L(dutrim):
 906         lwz     rWORD1, 4(rSTR1)
 907         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 908         subfic  rN, rN, 32      /* Shift count is 32 - (rN * 8).  */
 909         or      rWORD2, rA, rB          /* Merge the two s2 pieces.  */
 910         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 911         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 912         srw     rWORD1, rWORD1, rN      /* Drop bits beyond the length.  */
 913         srw     rWORD2, rWORD2, rN
 914         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 915         lwz     r27,32(r1)              /* Restore r27 (rB).  */
 916         cmplw   rWORD1,rWORD2
 917         li      rRTN,0
 918         beq     L(dureturn26)           /* r26-r24 and r1 restored there.  */
 919         li      rRTN,1
 920         bgt     L(dureturn26)
 921         li      rRTN,-1
 922         b    L(dureturn26)
 923         .align 4
 924 L(duLcr0):                              /* Words compared in cr0 differ.  */
 925         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 926         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 927         li      rRTN, 1
 928         bgt     cr0, L(dureturn29)      /* s1 word is greater: return 1.  */
 929         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 930         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 931         li      rRTN, -1
 932         b       L(dureturn27)
 933         .align 4
 934 L(duLcr1):                              /* Words compared in cr1 differ.  */
 935         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 936         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 937         li      rRTN, 1
 938         bgt     cr1, L(dureturn29)      /* s1 word is greater: return 1.  */
 939         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 940         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 941         li      rRTN, -1
 942         b       L(dureturn27)
 943         .align 4
 944 L(duLcr6):                              /* Words compared in cr6 differ.  */
 945         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 946         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 947         li      rRTN, 1
 948         bgt     cr6, L(dureturn29)      /* s1 word is greater: return 1.  */
 949         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 950         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 951         li      rRTN, -1
 952         b       L(dureturn27)
 953         .align 4
 954 L(duLcr5):                              /* Words compared in cr5 differ.  */
 955         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 956         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 957         li      rRTN, 1
 958         bgt     cr5, L(dureturn29)      /* s1 word is greater: return 1.  */
 959         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 960         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 961         li      rRTN, -1
 962         b       L(dureturn27)
 963         .align  3
 964 L(duZeroReturn):
 965         li      rRTN,0
 966         .align  4
 967 L(dureturn):
 968         lwz     r31,48(1)               /* Restore r31 (rWORD8).  */
 969         lwz     r30,44(1)               /* Restore r30 (rWORD7).  */
 970 L(dureturn29):
 971         lwz     r29,40(r1)              /* Restore r29 (rSHL).  */
 972         lwz     r28,36(r1)              /* Restore r28 (rSHR).  */
 973 L(dureturn27):
 974         lwz     r27,32(r1)              /* Restore r27 (rB).  */
 975 L(dureturn26):
 976         lwz     r26,28(r1)              /* Restore r26 (rD).  */
 977 L(dureturn25):
 978         lwz     r25,24(r1)              /* Restore r25 (rF).  */
 979         lwz     r24,20(r1)              /* Restore r24 (rH).  */
 980         lwz     1,0(1)                  /* Reload r1 from the back chain.  */
 981         blr
 982 END (BP_SYM (memcmp))
 983
 984 libc_hidden_builtin_def (memcmp)
 985 weak_alias (memcmp, bcmp)