sysdeps/powerpc/powerpc32/power4/memcmp.S

   1 /* Optimized memcmp implementation for PowerPC32.
   2    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
  25
  26         .machine power4
  27 EALIGN (BP_SYM(memcmp), 4, 0)
  28         CALL_MCOUNT
  29 /* Register roles.  Note that rRTN and rSTR1 are both r3.  */
  30 #define rTMP    r0      /* scratch */
  31 #define rRTN    r3      /* return value */
  32 #define rSTR1   r3      /* first string arg */
  33 #define rSTR2   r4      /* second string arg */
  34 #define rN      r5      /* max string length */
  35 #define rWORD1  r6      /* current word in s1 */
  36 #define rWORD2  r7      /* current word in s2 */
  37 #define rWORD3  r8      /* next word in s1 */
  38 #define rWORD4  r9      /* next word in s2 */
  39 #define rWORD5  r10     /* next word in s1 */
  40 #define rWORD6  r11     /* next word in s2 */
  41 #define rBITDIF r12     /* bits that differ in s1 & s2 words */
  42 #define rWORD7  r30     /* next word in s1 */
  43 #define rWORD8  r31     /* next word in s2 */
  44
  45         xor     rTMP, rSTR2, rSTR1      /* address bits that differ */
  46         cmplwi  cr6, rN, 0              /* cr6: is the length zero?  */
  47         cmplwi  cr1, rN, 12             /* cr1: is the length below 12?  */
  48         clrlwi. rTMP, rTMP, 30          /* cr0: EQ if the low 2 address bits match */
  49         clrlwi  rBITDIF, rSTR1, 30      /* rBITDIF = low 2 bits of rSTR1 */
  50         cmplwi  cr5, rBITDIF, 0         /* cr5: EQ if rSTR1 is word aligned */
  51         beq-    cr6, L(zeroLength)
  52         dcbt    0,rSTR1
  53         dcbt    0,rSTR2
  54 /* If the length is less than 12 bytes, use the simple
  55    byte-at-a-time loop.  */
  56         blt     cr1, L(bytealigned)
  57         stwu    1,-64(1)                /* allocate a 64-byte stack frame */
  58         cfi_adjust_cfa_offset(64)
  59         stw     r31,48(1)               /* save rWORD8 */
  60         cfi_offset(31,(48-64))
  61         stw     r30,44(1)               /* save rWORD7 */
  62         cfi_offset(30,(44-64))
  63         bne     L(unaligned)            /* cr0 still from clrlwi.: alignments differ */
  64 /* At this point we know both strings have the same alignment and the
  65    compare length is at least 12 bytes.  rBITDIF contains the low order
  66    2 bits of rSTR1 and cr5 contains the result of the logical compare
  67    of rBITDIF to 0.  If rBITDIF == 0 then we are already word
  68    aligned and can perform the word aligned loop.
  69
  70    Otherwise we know the two strings have the same alignment (but not
  71    yet word aligned).  So we force the string addresses to the next lower
  72    word boundary and special case this first word using shift left to
  73    eliminate bits preceding the first byte.  Since we want to join the
  74    normal (word aligned) compare loop, starting at the second word,
  75    we need to adjust the length (rN) and special case the loop
  76    versioning for the first word. This ensures that the loop count is
  77    correct and the first word (shifted) is in the expected register pair. */
  78         .align 4
  79 L(samealignment):
  80         clrrwi  rSTR1, rSTR1, 2  /* Round down to a word boundary */
  81         clrrwi  rSTR2, rSTR2, 2  /* Round down to a word boundary */
  82         beq     cr5, L(Waligned)
  83         add     rN, rN, rBITDIF  /* Count the leading pad bytes in the length */
  84         slwi    r11, rBITDIF, 3  /* r11 = number of leading pad bits */
  85         srwi    rTMP, rN, 4      /* Divide by 16 */
  86         andi.   rBITDIF, rN, 12  /* Get the word remainder */
  87         lwz     rWORD1, 0(rSTR1)
  88         lwz     rWORD2, 0(rSTR2)
  89         cmplwi  cr1, rBITDIF, 8
  90         cmplwi  cr7, rN, 16
  91         clrlwi  rN, rN, 30       /* rN = trailing byte count (0 to 3) */
  92         beq     L(dPs4)          /* cr0 from andi.: word remainder is 0 */
  93         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
  94         bgt     cr1, L(dPs3)
  95         beq     cr1, L(dPs2)
  96
  97 /* Remainder is 4 */
  98         .align 3
  99 L(dsP1):
 100         slw     rWORD5, rWORD1, r11     /* shift out the leading pad bits */
 101         slw     rWORD6, rWORD2, r11
 102         cmplw   cr5, rWORD5, rWORD6
 103         blt     cr7, L(dP1x)
 104 /* Do something useful in this cycle since we have to branch anyway.  */
 105         lwz     rWORD1, 4(rSTR1)
 106         lwz     rWORD2, 4(rSTR2)
 107         cmplw   cr0, rWORD1, rWORD2
 108         b       L(dP1e)
 109 /* Remainder is 8 */
 110         .align 4
 111 L(dPs2):
 112         slw     rWORD5, rWORD1, r11     /* shift out the leading pad bits */
 113         slw     rWORD6, rWORD2, r11
 114         cmplw   cr6, rWORD5, rWORD6
 115         blt     cr7, L(dP2x)
 116 /* Do something useful in this cycle since we have to branch anyway.  */
 117         lwz     rWORD7, 4(rSTR1)
 118         lwz     rWORD8, 4(rSTR2)
 119         cmplw   cr5, rWORD7, rWORD8
 120         b       L(dP2e)
 121 /* Remainder is 12 */
 122         .align 4
 123 L(dPs3):
 124         slw     rWORD3, rWORD1, r11     /* shift out the leading pad bits */
 125         slw     rWORD4, rWORD2, r11
 126         cmplw   cr1, rWORD3, rWORD4
 127         b       L(dP3e)
 128 /* Count is a multiple of 16, remainder is 0 */
 129         .align 4
 130 L(dPs4):
 131         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 132         slw     rWORD1, rWORD1, r11     /* shift out the leading pad bits */
 133         slw     rWORD2, rWORD2, r11
 134         cmplw   cr0, rWORD1, rWORD2
 135         b       L(dP4e)
 136
 137 /* At this point we know both strings are word aligned and the
 138    compare length is at least 12 bytes.  */
 139         .align 4
 140 L(Waligned):
 141         andi.   rBITDIF, rN, 12  /* Get the word remainder */
 142         srwi    rTMP, rN, 4      /* Divide by 16 */
 143         cmplwi  cr1, rBITDIF, 8
 144         cmplwi  cr7, rN, 16
 145         clrlwi  rN, rN, 30       /* rN = trailing byte count (0 to 3) */
 146         beq     L(dP4)           /* cr0 from andi.: word remainder is 0 */
 147         bgt     cr1, L(dP3)
 148         beq     cr1, L(dP2)
 149
 150 /* Remainder is 4 */
 151         .align 4
 152 L(dP1):
 153         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 154 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
 155    (cr7 says rN < 16), we want to use only volatile registers.  This
 156    means we can avoid restoring non-volatile registers since we did not
 157    change any on the early exit path.  The key here is the non-early
 158    exit path only cares about the condition code (cr5), not about which
 159    register pair was used.  */
 160         lwz     rWORD5, 0(rSTR1)
 161         lwz     rWORD6, 0(rSTR2)
 162         cmplw   cr5, rWORD5, rWORD6
 163         blt     cr7, L(dP1x)
 164         lwz     rWORD1, 4(rSTR1)
 165         lwz     rWORD2, 4(rSTR2)
 166         cmplw   cr0, rWORD1, rWORD2
 167 L(dP1e):
 168         lwz     rWORD3, 8(rSTR1)
 169         lwz     rWORD4, 8(rSTR2)
 170         cmplw   cr1, rWORD3, rWORD4
 171         lwz     rWORD5, 12(rSTR1)
 172         lwz     rWORD6, 12(rSTR2)
 173         cmplw   cr6, rWORD5, rWORD6
 174         bne     cr5, L(dLcr5)
 175         bne     cr0, L(dLcr0)
 176
 177         lwzu    rWORD7, 16(rSTR1)
 178         lwzu    rWORD8, 16(rSTR2)
 179         bne     cr1, L(dLcr1)
 180         cmplw   cr5, rWORD7, rWORD8
 181         bdnz    L(dLoop)
 182         bne     cr6, L(dLcr6)
 183         lwz     r30,44(1)       /* restore rWORD7 before falling into the exit path */
 184         lwz     r31,48(1)       /* restore rWORD8 */
 185         .align 3
 186 L(dP1x):
 187         slwi.   r12, rN, 3      /* r12 = trailing bits; cr0 EQ if there are none */
 188         bne     cr5, L(dLcr5)
 189         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 190         lwz     1,0(1)          /* reload r1 from the back chain (drops the frame) */
 191         bne     L(d00)          /* cr0 from slwi.: trailing bytes remain */
 192         li      rRTN, 0
 193         blr
 194
 195 /* Remainder is 8 */
 196         .align 4
 197 L(dP2):
 198         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 199         lwz     rWORD5, 0(rSTR1)
 200         lwz     rWORD6, 0(rSTR2)
 201         cmplw   cr6, rWORD5, rWORD6
 202         blt     cr7, L(dP2x)
 203         lwz     rWORD7, 4(rSTR1)
 204         lwz     rWORD8, 4(rSTR2)
 205         cmplw   cr5, rWORD7, rWORD8
 206 L(dP2e):
 207         lwz     rWORD1, 8(rSTR1)
 208         lwz     rWORD2, 8(rSTR2)
 209         cmplw   cr0, rWORD1, rWORD2
 210         lwz     rWORD3, 12(rSTR1)
 211         lwz     rWORD4, 12(rSTR2)
 212         cmplw   cr1, rWORD3, rWORD4
 213         addi    rSTR1, rSTR1, 4
 214         addi    rSTR2, rSTR2, 4
 215         bne     cr6, L(dLcr6)
 216         bne     cr5, L(dLcr5)
 217         b       L(dLoop2)
 218 /* Again we are on an early exit path (cr7 says rN < 16), we want to
 219    only use volatile registers and avoid restoring non-volatile
 220    registers.  */
 221         .align 4
 222 L(dP2x):
 223         lwz     rWORD3, 4(rSTR1)
 224         lwz     rWORD4, 4(rSTR2)
 225         cmplw   cr5, rWORD3, rWORD4
 226         slwi.   r12, rN, 3      /* r12 = trailing bits; cr0 EQ if there are none */
 227         bne     cr6, L(dLcr6)
 228         addi    rSTR1, rSTR1, 4
 229         addi    rSTR2, rSTR2, 4
 230         bne     cr5, L(dLcr5)
 231         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 232         lwz     1,0(1)          /* reload r1 from the back chain (drops the frame) */
 233         bne     L(d00)          /* cr0 from slwi.: trailing bytes remain */
 234         li      rRTN, 0
 235         blr
 236
 237 /* Remainder is 12 */
 238         .align 4
 239 L(dP3):
 240         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 241         lwz     rWORD3, 0(rSTR1)
 242         lwz     rWORD4, 0(rSTR2)
 243         cmplw   cr1, rWORD3, rWORD4
 244 L(dP3e):
 245         lwz     rWORD5, 4(rSTR1)
 246         lwz     rWORD6, 4(rSTR2)
 247         cmplw   cr6, rWORD5, rWORD6
 248         blt     cr7, L(dP3x)
 249         lwz     rWORD7, 8(rSTR1)
 250         lwz     rWORD8, 8(rSTR2)
 251         cmplw   cr5, rWORD7, rWORD8
 252         lwz     rWORD1, 12(rSTR1)
 253         lwz     rWORD2, 12(rSTR2)
 254         cmplw   cr0, rWORD1, rWORD2
 255         addi    rSTR1, rSTR1, 8
 256         addi    rSTR2, rSTR2, 8
 257         bne     cr1, L(dLcr1)
 258         bne     cr6, L(dLcr6)
 259         b       L(dLoop1)
 260 /* Again we are on an early exit path (cr7 says rN < 16), we want to
 261    only use volatile registers and avoid restoring non-volatile
 262    registers.  */
 263         .align 4
 264 L(dP3x):
 265         lwz     rWORD1, 8(rSTR1)
 266         lwz     rWORD2, 8(rSTR2)
 267         cmplw   cr5, rWORD1, rWORD2
 268         slwi.   r12, rN, 3      /* r12 = trailing bits; cr0 EQ if there are none */
 269         bne     cr1, L(dLcr1)
 270         addi    rSTR1, rSTR1, 8
 271         addi    rSTR2, rSTR2, 8
 272         bne     cr6, L(dLcr6)
 273         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 274         bne     cr5, L(dLcr5)
 275         lwz     1,0(1)          /* reload r1 from the back chain (drops the frame) */
 276         bne     L(d00)          /* cr0 from slwi.: trailing bytes remain */
 277         li      rRTN, 0
 278         blr
 279
 280 /* Count is a multiple of 16, remainder is 0 */
 281         .align 4
 282 L(dP4):
 283         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 284         lwz     rWORD1, 0(rSTR1)
 285         lwz     rWORD2, 0(rSTR2)
 286         cmplw   cr0, rWORD1, rWORD2
 287 L(dP4e):
 288         lwz     rWORD3, 4(rSTR1)
 289         lwz     rWORD4, 4(rSTR2)
 290         cmplw   cr1, rWORD3, rWORD4
 291         lwz     rWORD5, 8(rSTR1)
 292         lwz     rWORD6, 8(rSTR2)
 293         cmplw   cr6, rWORD5, rWORD6
 294         lwzu    rWORD7, 12(rSTR1)
 295         lwzu    rWORD8, 12(rSTR2)
 296         cmplw   cr5, rWORD7, rWORD8
 297         bne     cr0, L(dLcr0)
 298         bne     cr1, L(dLcr1)
 299         bdz-    L(d24)          /* Adjust CTR as we start with +4 */
 300 /* This is the primary loop.  Each pass loads four word pairs; the
 301    compare and branch for each pair trail the load that fetched it.  */
 302 L(dLoop):
 303         lwz     rWORD1, 4(rSTR1)
 304         lwz     rWORD2, 4(rSTR2)
 305         cmplw   cr1, rWORD3, rWORD4
 306         bne     cr6, L(dLcr6)
 307 L(dLoop1):
 308         lwz     rWORD3, 8(rSTR1)
 309         lwz     rWORD4, 8(rSTR2)
 310         cmplw   cr6, rWORD5, rWORD6
 311         bne     cr5, L(dLcr5)
 312 L(dLoop2):
 313         lwz     rWORD5, 12(rSTR1)
 314         lwz     rWORD6, 12(rSTR2)
 315         cmplw   cr5, rWORD7, rWORD8
 316         bne     cr0, L(dLcr0)
 317 L(dLoop3):
 318         lwzu    rWORD7, 16(rSTR1)       /* load with update: advance rSTR1 by 16 */
 319         lwzu    rWORD8, 16(rSTR2)       /* load with update: advance rSTR2 by 16 */
 320         bne-    cr1, L(dLcr1)
 321         cmplw   cr0, rWORD1, rWORD2
 322         bdnz+   L(dLoop)
 323 /* Loop finished: complete the compares that are still pending.  */
 324 L(dL4):
 325         cmplw   cr1, rWORD3, rWORD4
 326         bne     cr6, L(dLcr6)
 327         cmplw   cr6, rWORD5, rWORD6
 328         bne     cr5, L(dLcr5)
 329         cmplw   cr5, rWORD7, rWORD8
 330 L(d44):
 331         bne     cr0, L(dLcr0)
 332 L(d34):
 333         bne     cr1, L(dLcr1)
 334 L(d24):
 335         bne     cr6, L(dLcr6)
 336 L(d14):
 337         slwi.   r12, rN, 3      /* r12 = trailing bits; cr0 EQ if there are none */
 338         bne     cr5, L(dLcr5)
 339 L(d04):
 340         lwz     r30,44(1)       /* restore rWORD7 */
 341         lwz     r31,48(1)       /* restore rWORD8 */
 342         lwz     1,0(1)          /* reload r1 from the back chain (drops the frame) */
 343         subfic  rN, r12, 32     /* Shift count is 32 - (rN * 8).  */
 344         beq     L(zeroLength)   /* cr0 from slwi.: nothing left, strings are equal */
 345 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
 346    we are aligned it is safe to load the whole word, and use
 347    shift right to eliminate bits beyond the compare length. */
 348 L(d00):
 349         lwz     rWORD1, 4(rSTR1)
 350         lwz     rWORD2, 4(rSTR2)
 351         srw     rWORD1, rWORD1, rN
 352         srw     rWORD2, rWORD2, rN
 353         cmplw   rWORD1,rWORD2
 354         li      rRTN,0
 355         beqlr
 356         li      rRTN,1
 357         bgtlr
 358         li      rRTN,-1
 359         blr
 360
 361         .align 4
 362 L(dLcr0):                       /* mismatch recorded in cr0 */
 363         lwz     r30,44(1)       /* restore rWORD7 */
 364         lwz     r31,48(1)       /* restore rWORD8 */
 365         li      rRTN, 1
 366         lwz     1,0(1)          /* reload r1 from the back chain (drops the frame) */
 367         bgtlr   cr0             /* s1 word > s2 word: return 1 */
 368         li      rRTN, -1
 369         blr
 370         .align 4
 371 L(dLcr1):                       /* mismatch recorded in cr1 */
 372         lwz     r30,44(1)
 373         lwz     r31,48(1)
 374         li      rRTN, 1
 375         lwz     1,0(1)
 376         bgtlr   cr1
 377         li      rRTN, -1
 378         blr
 379         .align 4
 380 L(dLcr6):                       /* mismatch recorded in cr6 */
 381         lwz     r30,44(1)
 382         lwz     r31,48(1)
 383         li      rRTN, 1
 384         lwz     1,0(1)
 385         bgtlr   cr6
 386         li      rRTN, -1
 387         blr
 388         .align 4
 389 L(dLcr5):                       /* mismatch recorded in cr5 */
 390         lwz     r30,44(1)
 391         lwz     r31,48(1)
 392 L(dLcr5x):
 393         li      rRTN, 1
 394         lwz     1,0(1)
 395         bgtlr   cr5
 396         li      rRTN, -1
 397         blr
 398
 399         .align 4
 400 L(bytealigned):
 401         cfi_adjust_cfa_offset(-64)
 402         mtctr   rN      /* Power4 wants mtctr 1st in dispatch group */
 403
 404 /* We need to prime this loop.  This loop is swing modulo scheduled
 405    to avoid pipe delays.  The dependent instruction latencies (load to
 406    compare to conditional branch) is 2 to 3 cycles.  In this loop each
 407    dispatch group ends in a branch and takes 1 cycle.  Effectively
 408    the first iteration of the loop only serves to load operands and
 409    branches based on compares are delayed until the next loop.
 410
 411    So we must precondition some registers and condition codes so that
 412    we don't exit the loop early on the first iteration.  */
 413
 414         lbz     rWORD1, 0(rSTR1)
 415         lbz     rWORD2, 0(rSTR2)
 416         bdz-    L(b11)          /* only one byte to compare */
 417         cmplw   cr0, rWORD1, rWORD2
 418         lbz     rWORD3, 1(rSTR1)
 419         lbz     rWORD4, 1(rSTR2)
 420         bdz-    L(b12)          /* only two bytes to compare */
 421         cmplw   cr1, rWORD3, rWORD4
 422         lbzu    rWORD5, 2(rSTR1)
 423         lbzu    rWORD6, 2(rSTR2)
 424         bdz-    L(b13)          /* only three bytes to compare */
 425         .align 4
 426 L(bLoop):
 427         lbzu    rWORD1, 1(rSTR1)
 428         lbzu    rWORD2, 1(rSTR2)
 429         bne-    cr0, L(bLcr0)
 430
 431         cmplw   cr6, rWORD5, rWORD6
 432         bdz-    L(b3i)
 433
 434         lbzu    rWORD3, 1(rSTR1)
 435         lbzu    rWORD4, 1(rSTR2)
 436         bne-    cr1, L(bLcr1)
 437
 438         cmplw   cr0, rWORD1, rWORD2
 439         bdz-    L(b2i)
 440
 441         lbzu    rWORD5, 1(rSTR1)
 442         lbzu    rWORD6, 1(rSTR2)
 443         bne-    cr6, L(bLcr6)
 444
 445         cmplw   cr1, rWORD3, rWORD4
 446         bdnz+   L(bLoop)
 447
 448 /* We speculatively load bytes before we have tested the previous
 449    bytes.  But we must avoid overrunning the length (in the ctr) to
 450    prevent these speculative loads from causing a segfault.  In this
 451    case the loop will exit early (before all the pending bytes are
 452    tested).  In this case we must complete the pending operations
 453    before returning.  */
 454 L(b1i):
 455         bne-    cr0, L(bLcr0)
 456         bne-    cr1, L(bLcr1)
 457         b       L(bx56)
 458         .align 4
 459 L(b2i):
 460         bne-    cr6, L(bLcr6)
 461         bne-    cr0, L(bLcr0)
 462         b       L(bx34)
 463         .align 4
 464 L(b3i):
 465         bne-    cr1, L(bLcr1)
 466         bne-    cr6, L(bLcr6)
 467         b       L(bx12)
 468         .align 4
 469 L(bLcr0):                       /* mismatch recorded in cr0: return 1 or -1 */
 470         li      rRTN, 1
 471         bgtlr   cr0
 472         li      rRTN, -1
 473         blr
 474 L(bLcr1):                       /* mismatch recorded in cr1: return 1 or -1 */
 475         li      rRTN, 1
 476         bgtlr   cr1
 477         li      rRTN, -1
 478         blr
 479 L(bLcr6):                       /* mismatch recorded in cr6: return 1 or -1 */
 480         li      rRTN, 1
 481         bgtlr   cr6
 482         li      rRTN, -1
 483         blr
 484 /* The exits below return the difference of the deciding byte pair.  */
 485 L(b13):
 486         bne-    cr0, L(bx12)
 487         bne-    cr1, L(bx34)
 488 L(bx56):
 489         sub     rRTN, rWORD5, rWORD6
 490         blr
 491         nop
 492 L(b12):
 493         bne-    cr0, L(bx12)
 494 L(bx34):
 495         sub     rRTN, rWORD3, rWORD4
 496         blr
 497
 498 L(b11):
 499 L(bx12):
 500         sub     rRTN, rWORD1, rWORD2
 501         blr
 502
 503         .align 4
 504 L(zeroLengthReturn):
 505
 506 L(zeroLength):
 507         li      rRTN, 0         /* nothing (left) to compare: equal */
 508         blr
 509
 510         cfi_adjust_cfa_offset(64)
 511         .align 4
 512 /* At this point we know the strings have different alignment and the
 513    compare length is at least 12 bytes.  rBITDIF contains the low order
 514    2 bits of rSTR1 and cr5 contains the result of the logical compare
 515    of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is word aligned and can
 516    perform the Wunaligned loop.
 517
 518    Otherwise we know that rSTR1 is not already word aligned.
 519    So we can force the string addresses to the next lower word
 520    boundary and special case this first word using shift left to
 521    eliminate bits preceding the first byte.  Since we want to join the
 522    normal (Wunaligned) compare loop, starting at the second word,
 523    we need to adjust the length (rN) and special case the loop
 524    versioning for the first W. This ensures that the loop count is
 525    correct and the first W (shifted) is in the expected register pair.  */
 526 #define rSHL            r29     /* Unaligned shift left count.  */
 527 #define rSHR            r28     /* Unaligned shift right count.  */
 528 #define rB              r27     /* Left rotation temp for rWORD2.  */
 529 #define rD              r26     /* Left rotation temp for rWORD4.  */
 530 #define rF              r25     /* Left rotation temp for rWORD6.  */
 531 #define rH              r24     /* Left rotation temp for rWORD8.  */
 532 #define rA              r0      /* Right rotation temp for rWORD2.  */
 533 #define rC              r12     /* Right rotation temp for rWORD4.  */
 534 #define rE              r0      /* Right rotation temp for rWORD6.  */
 535 #define rG              r12     /* Right rotation temp for rWORD8.  */
 536 L(unaligned):
 537         stw     r29,40(r1)
 538         cfi_offset(r29,(40-64))
 539         clrlwi  rSHL, rSTR2, 30         /* rSHL = low 2 bits of rSTR2 */
 540         stw     r28,36(r1)
 541         cfi_offset(r28,(36-64))
 542         beq     cr5, L(Wunaligned)
 543         stw     r27,32(r1)
 544         cfi_offset(r27,(32-64))
 545 /* Adjust the logical start of rSTR2 to compensate for the extra bits
 546    in the 1st rSTR1 W.  */
 547         sub     r27, rSTR2, rBITDIF
 548 /* But do not attempt to address the W before that W that contains
 549    the actual start of rSTR2.  */
 550         clrrwi  rSTR2, rSTR2, 2
 551         stw     r26,28(r1)
 552         cfi_offset(r26,(28-64))
 553 /* Compute the left/right shift counts for the unalign rSTR2,
 554    compensating for the logical (W aligned) start of rSTR1.  */
 555         clrlwi  rSHL, r27, 30
 556         clrrwi  rSTR1, rSTR1, 2
 557         stw     r25,24(r1)
 558         cfi_offset(r25,(24-64))
 559         slwi    rSHL, rSHL, 3           /* convert the byte offset to a bit count */
 560         cmplw   cr5, r27, rSTR2         /* cr5: LT if the logical start is below the first real W */
 561         add     rN, rN, rBITDIF         /* count the leading pad bytes in the length */
 562         slwi    r11, rBITDIF, 3         /* r11 = number of leading pad bits */
 563         stw     r24,20(r1)
 564         cfi_offset(r24,(20-64))
 565         subfic  rSHR, rSHL, 32          /* rSHR = 32 - rSHL */
 566         srwi    rTMP, rN, 4      /* Divide by 16 */
 567         andi.   rBITDIF, rN, 12  /* Get the W remainder */
 568 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
 569    this special case those bits may be discarded anyway.  Also we
 570    must avoid loading a W where none of the bits are part of rSTR2 as
 571    this may cross a page boundary and cause a page fault.  */
 572         li      rWORD8, 0
 573         blt     cr5, L(dus0)
 574         lwz     rWORD8, 0(rSTR2)
 575         la      rSTR2, 4(rSTR2)
 576         slw     rWORD8, rWORD8, rSHL
 577
 578 L(dus0):
 579         lwz     rWORD1, 0(rSTR1)
 580         lwz     rWORD2, 0(rSTR2)
 581         cmplwi  cr1, rBITDIF, 8
 582         cmplwi  cr7, rN, 16
 583         srw     rG, rWORD2, rSHR
 584         clrlwi  rN, rN, 30              /* rN = trailing byte count (0 to 3) */
 585         beq     L(duPs4)                /* cr0 from andi.: W remainder is 0 */
 586         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 587         or      rWORD8, rG, rWORD8
 588         bgt     cr1, L(duPs3)
 589         beq     cr1, L(duPs2)
 590
 591 /* Remainder is 4 */
 592         .align 4
 593 L(dusP1):
 594         slw     rB, rWORD2, rSHL
 595         slw     rWORD7, rWORD1, r11     /* shift out the leading pad bits */
 596         slw     rWORD8, rWORD8, r11
 597         bge     cr7, L(duP1e)
 598 /* At this point we exit early with the first word compare
 599    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
 600    how we handle the remaining bytes.  */
 601         cmplw   cr5, rWORD7, rWORD8
 602         slwi.   rN, rN, 3               /* rN = trailing bits; cr0 EQ if there are none */
 603         bne     cr5, L(duLcr5)
 604         cmplw   cr7, rN, rSHR
 605         beq     L(duZeroReturn)
 606         li      rA, 0
 607         ble     cr7, L(dutrim)
 608         lwz     rWORD2, 4(rSTR2)
 609         srw     rA, rWORD2, rSHR
 610         b       L(dutrim)
 611 /* Remainder is 8 */
 612         .align 4
 613 L(duPs2):
 614         slw     rH, rWORD2, rSHL
 615         slw     rWORD5, rWORD1, r11     /* shift out the leading pad bits */
 616         slw     rWORD6, rWORD8, r11
 617         b       L(duP2e)
 618 /* Remainder is 12 */
 619         .align 4
 620 L(duPs3):
 621         slw     rF, rWORD2, rSHL
 622         slw     rWORD3, rWORD1, r11     /* shift out the leading pad bits */
 623         slw     rWORD4, rWORD8, r11
 624         b       L(duP3e)
 625 /* Count is a multiple of 16, remainder is 0 */
 626         .align 4
 627 L(duPs4):
 628         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 629         or      rWORD8, rG, rWORD8
 630         slw     rD, rWORD2, rSHL
 631         slw     rWORD1, rWORD1, r11     /* shift out the leading pad bits */
 632         slw     rWORD2, rWORD8, r11
 633         b       L(duP4e)
 634
 635 /* At this point we know rSTR1 is word aligned and the
 636    compare length is at least 12 bytes.  */
 637         .align 4
 638 L(Wunaligned):
 639         stw     r27,32(r1)
 640         cfi_offset(r27,(32-64))
 641         clrrwi  rSTR2, rSTR2, 2         /* round rSTR2 down to a word boundary */
 642         stw     r26,28(r1)
 643         cfi_offset(r26,(28-64))
 644         srwi    rTMP, rN, 4      /* Divide by 16 */
 645         stw     r25,24(r1)
 646         cfi_offset(r25,(24-64))
 647         andi.   rBITDIF, rN, 12  /* Get the W remainder */
 648         stw     r24,20(r1)
 649         cfi_offset(r24,(20-64))         /* must match the 20(r1) save slot above */
 650         slwi    rSHL, rSHL, 3           /* convert the byte offset to a bit count */
 651         lwz     rWORD6, 0(rSTR2)
 652         lwzu    rWORD8, 4(rSTR2)        /* load with update: rSTR2 now addresses this W */
 653         cmplwi  cr1, rBITDIF, 8
 654         cmplwi  cr7, rN, 16
 655         clrlwi  rN, rN, 30              /* rN = trailing byte count (0 to 3) */
 656         subfic  rSHR, rSHL, 32          /* rSHR = 32 - rSHL */
 657         slw     rH, rWORD6, rSHL
 658         beq     L(duP4)                 /* cr0 from andi.: W remainder is 0 */
 659         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 660         bgt     cr1, L(duP3)
 661         beq     cr1, L(duP2)
 662
 663 /* Remainder is 4 */
 664         .align 4
 665 L(duP1):
 666         srw     rG, rWORD8, rSHR
 667         lwz     rWORD7, 0(rSTR1)
 668         slw     rB, rWORD8, rSHL
 669         or      rWORD8, rG, rH          /* merge the two shifted halves into one s2 word */
 670         blt     cr7, L(duP1x)
 671 L(duP1e):
 672         lwz     rWORD1, 4(rSTR1)
 673         lwz     rWORD2, 4(rSTR2)
 674         cmplw   cr5, rWORD7, rWORD8
 675         srw     rA, rWORD2, rSHR
 676         slw     rD, rWORD2, rSHL
 677         or      rWORD2, rA, rB
 678         lwz     rWORD3, 8(rSTR1)
 679         lwz     rWORD4, 8(rSTR2)
 680         cmplw   cr0, rWORD1, rWORD2
 681         srw     rC, rWORD4, rSHR
 682         slw     rF, rWORD4, rSHL
 683         bne     cr5, L(duLcr5)
 684         or      rWORD4, rC, rD
 685         lwz     rWORD5, 12(rSTR1)
 686         lwz     rWORD6, 12(rSTR2)
 687         cmplw   cr1, rWORD3, rWORD4
 688         srw     rE, rWORD6, rSHR
 689         slw     rH, rWORD6, rSHL
 690         bne     cr0, L(duLcr0)
 691         or      rWORD6, rE, rF
 692         cmplw   cr6, rWORD5, rWORD6
 693         b       L(duLoop3)
 694         .align 4
 695 /* At this point we exit early with the first word compare
 696    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
 697    how we handle the remaining bytes.  */
 698 L(duP1x):
 699         cmplw   cr5, rWORD7, rWORD8
 700         slwi.   rN, rN, 3               /* rN = trailing bits; cr0 EQ if there are none */
 701         bne     cr5, L(duLcr5)
 702         cmplw   cr7, rN, rSHR
 703         beq     L(duZeroReturn)
 704         li      rA, 0
 705         ble     cr7, L(dutrim)
 706         lwz     rWORD2, 4(rSTR2)        /* 32-bit load of the W after the one held in rB */
 707         srw     rA, rWORD2, rSHR
 708         b       L(dutrim)
 709 /* Remainder is 8 */
 710         .align 4
 711 L(duP2):
 712         srw     rE, rWORD8, rSHR
 713         lwz     rWORD5, 0(rSTR1)
 714         or      rWORD6, rE, rH          /* merge the two shifted halves into one s2 word */
 715         slw     rH, rWORD8, rSHL
 716 L(duP2e):
 717         lwz     rWORD7, 4(rSTR1)
 718         lwz     rWORD8, 4(rSTR2)
 719         cmplw   cr6, rWORD5, rWORD6
 720         srw     rG, rWORD8, rSHR
 721         slw     rB, rWORD8, rSHL
 722         or      rWORD8, rG, rH
 723         blt     cr7, L(duP2x)
 724         lwz     rWORD1, 8(rSTR1)
 725         lwz     rWORD2, 8(rSTR2)
 726         cmplw   cr5, rWORD7, rWORD8
 727         bne     cr6, L(duLcr6)
 728         srw     rA, rWORD2, rSHR
 729         slw     rD, rWORD2, rSHL
 730         or      rWORD2, rA, rB
 731         lwz     rWORD3, 12(rSTR1)
 732         lwz     rWORD4, 12(rSTR2)
 733         cmplw   cr0, rWORD1, rWORD2
 734         bne     cr5, L(duLcr5)
 735         srw     rC, rWORD4, rSHR
 736         slw     rF, rWORD4, rSHL
 737         or      rWORD4, rC, rD
 738         addi    rSTR1, rSTR1, 4
 739         addi    rSTR2, rSTR2, 4
 740         cmplw   cr1, rWORD3, rWORD4
 741         b       L(duLoop2)
 742         .align 4
 743 L(duP2x):                               /* early exit: cr7 says rN < 16 */
 744         cmplw   cr5, rWORD7, rWORD8
 745         addi    rSTR1, rSTR1, 4
 746         addi    rSTR2, rSTR2, 4
 747         bne     cr6, L(duLcr6)
 748         slwi.   rN, rN, 3               /* rN = trailing bits; cr0 EQ if there are none */
 749         bne     cr5, L(duLcr5)
 750         cmplw   cr7, rN, rSHR
 751         beq     L(duZeroReturn)
 752         li      rA, 0
 753         ble     cr7, L(dutrim)
 754         lwz     rWORD2, 4(rSTR2)
 755         srw     rA, rWORD2, rSHR
 756         b       L(dutrim)
 757
 758 /* Remainder is 12 */
 759         .align 4
 760 L(duP3):
 761         srw     rC, rWORD8, rSHR
 762         lwz     rWORD3, 0(rSTR1)
 763         slw     rF, rWORD8, rSHL
 764         or      rWORD4, rC, rH          /* merge the two shifted halves into one s2 word */
 765 L(duP3e):
 766         lwz     rWORD5, 4(rSTR1)
 767         lwz     rWORD6, 4(rSTR2)
 768         cmplw   cr1, rWORD3, rWORD4
 769         srw     rE, rWORD6, rSHR
 770         slw     rH, rWORD6, rSHL
 771         or      rWORD6, rE, rF
 772         lwz     rWORD7, 8(rSTR1)
 773         lwz     rWORD8, 8(rSTR2)
 774         cmplw   cr6, rWORD5, rWORD6
 775         bne     cr1, L(duLcr1)
 776         srw     rG, rWORD8, rSHR
 777         slw     rB, rWORD8, rSHL
 778         or      rWORD8, rG, rH
 779         blt     cr7, L(duP3x)
 780         lwz     rWORD1, 12(rSTR1)
 781         lwz     rWORD2, 12(rSTR2)
 782         cmplw   cr5, rWORD7, rWORD8
 783         bne     cr6, L(duLcr6)
 784         srw     rA, rWORD2, rSHR
 785         slw     rD, rWORD2, rSHL
 786         or      rWORD2, rA, rB
 787         addi    rSTR1, rSTR1, 8
 788         addi    rSTR2, rSTR2, 8
 789         cmplw   cr0, rWORD1, rWORD2
 790         b       L(duLoop1)
 791         .align 4
 792 L(duP3x):                               /* early exit: cr7 says rN < 16 */
 793         addi    rSTR1, rSTR1, 8
 794         addi    rSTR2, rSTR2, 8
 795         bne     cr1, L(duLcr1)
 796         cmplw   cr5, rWORD7, rWORD8
 797         bne     cr6, L(duLcr6)
 798         slwi.   rN, rN, 3               /* rN = trailing bits; cr0 EQ if there are none */
 799         bne     cr5, L(duLcr5)
 800         cmplw   cr7, rN, rSHR
 801         beq     L(duZeroReturn)
 802         li      rA, 0
 803         ble     cr7, L(dutrim)
 804         lwz     rWORD2, 4(rSTR2)
 805         srw     rA, rWORD2, rSHR
 806         b       L(dutrim)
 807
 808 /* Count is a multiple of 16, remainder is 0 */
 809         .align 4
 810 L(duP4):
 811         mtctr   rTMP    /* Power4 wants mtctr 1st in dispatch group */
 812         srw     rA, rWORD8, rSHR
 813         lwz     rWORD1, 0(rSTR1)
 814         slw     rD, rWORD8, rSHL
 815         or      rWORD2, rA, rH          /* merge the two shifted halves into one s2 word */
 816 L(duP4e):
 817         lwz     rWORD3, 4(rSTR1)
 818         lwz     rWORD4, 4(rSTR2)
 819         cmplw   cr0, rWORD1, rWORD2
 820         srw     rC, rWORD4, rSHR
 821         slw     rF, rWORD4, rSHL
 822         or      rWORD4, rC, rD
 823         lwz     rWORD5, 8(rSTR1)
 824         lwz     rWORD6, 8(rSTR2)
 825         cmplw   cr1, rWORD3, rWORD4
 826         bne     cr0, L(duLcr0)
 827         srw     rE, rWORD6, rSHR
 828         slw     rH, rWORD6, rSHL
 829         or      rWORD6, rE, rF
 830         lwzu    rWORD7, 12(rSTR1)
 831         lwzu    rWORD8, 12(rSTR2)
 832         cmplw   cr6, rWORD5, rWORD6
 833         bne     cr1, L(duLcr1)
 834         srw     rG, rWORD8, rSHR
 835         slw     rB, rWORD8, rSHL
 836         or      rWORD8, rG, rH
 837         cmplw   cr5, rWORD7, rWORD8
 838         bdz-    L(du24)         /* Adjust CTR as we start with +4 */
 839 /* This is the primary loop.  Each s2 word is rebuilt from the left-shifted
 840    bits of the previous load and the right-shifted bits of this load.  */
 841 L(duLoop):
 842         lwz     rWORD1, 4(rSTR1)
 843         lwz     rWORD2, 4(rSTR2)
 844         cmplw   cr1, rWORD3, rWORD4
 845         bne     cr6, L(duLcr6)
 846         srw     rA, rWORD2, rSHR
 847         slw     rD, rWORD2, rSHL
 848         or      rWORD2, rA, rB
 849 L(duLoop1):
 850         lwz     rWORD3, 8(rSTR1)
 851         lwz     rWORD4, 8(rSTR2)
 852         cmplw   cr6, rWORD5, rWORD6
 853         bne     cr5, L(duLcr5)
 854         srw     rC, rWORD4, rSHR
 855         slw     rF, rWORD4, rSHL
 856         or      rWORD4, rC, rD
 857 L(duLoop2):
 858         lwz     rWORD5, 12(rSTR1)
 859         lwz     rWORD6, 12(rSTR2)
 860         cmplw   cr5, rWORD7, rWORD8
 861         bne     cr0, L(duLcr0)
 862         srw     rE, rWORD6, rSHR
 863         slw     rH, rWORD6, rSHL
 864         or      rWORD6, rE, rF
 865 L(duLoop3):
 866         lwzu    rWORD7, 16(rSTR1)       /* load with update: advance rSTR1 by 16 */
 867         lwzu    rWORD8, 16(rSTR2)       /* load with update: advance rSTR2 by 16 */
 868         cmplw   cr0, rWORD1, rWORD2
 869         bne-    cr1, L(duLcr1)
 870         srw     rG, rWORD8, rSHR
 871         slw     rB, rWORD8, rSHL
 872         or      rWORD8, rG, rH
 873         bdnz+   L(duLoop)
 874
 875 L(duL4):                                /* loop finished: complete the pending compares */
 876         bne     cr1, L(duLcr1)
 877         cmplw   cr1, rWORD3, rWORD4
 878         bne     cr6, L(duLcr6)
 879         cmplw   cr6, rWORD5, rWORD6
 880         bne     cr5, L(duLcr5)
 881         cmplw   cr5, rWORD7, rWORD8
 882 L(du44):
 883         bne     cr0, L(duLcr0)
 884 L(du34):
 885         bne     cr1, L(duLcr1)
 886 L(du24):
 887         bne     cr6, L(duLcr6)
 888 L(du14):
 889         slwi.   rN, rN, 3               /* rN = trailing bits; cr0 EQ if there are none */
 890         bne     cr5, L(duLcr5)
 891 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
 892    shift right to eliminate bits beyond the compare length.
 893
 894    However it may not be safe to load rWORD2 which may be beyond the
 895    string length. So we compare the bit length of the remainder to
 896    the right shift count (rSHR). If the bit count is less than or equal
 897    we do not need to load rWORD2 (all significant bits are already in
 898    rB).  */
 899         cmplw   cr7, rN, rSHR
 900         beq     L(duZeroReturn)         /* cr0 from slwi.: no trailing bytes */
 901         li      rA, 0
 902         ble     cr7, L(dutrim)
 903         lwz     rWORD2, 4(rSTR2)
 904         srw     rA, rWORD2, rSHR
 905         .align 4
 906 L(dutrim):
 907         lwz     rWORD1, 4(rSTR1)
 908         lwz     r31,48(1)
 909         subfic  rN, rN, 32      /* Shift count is 32 - (rN * 8).  */
 910         or      rWORD2, rA, rB
 911         lwz     r30,44(1)
 912         lwz     r29,40(r1)
 913         srw     rWORD1, rWORD1, rN      /* drop the bits beyond the compare length */
 914         srw     rWORD2, rWORD2, rN
 915         lwz     r28,36(r1)
 916         lwz     r27,32(r1)
 917         cmplw   rWORD1,rWORD2
 918         li      rRTN,0
 919         beq     L(dureturn26)           /* r27 to r31 are already restored here */
 920         li      rRTN,1
 921         bgt     L(dureturn26)
 922         li      rRTN,-1
 923         b    L(dureturn26)
 924         .align 4
 925 L(duLcr0):                              /* mismatch recorded in cr0 */
 926         lwz     r31,48(1)
 927         lwz     r30,44(1)
 928         li      rRTN, 1
 929         bgt     cr0, L(dureturn29)      /* s1 word > s2 word: return 1 */
 930         lwz     r29,40(r1)
 931         lwz     r28,36(r1)
 932         li      rRTN, -1
 933         b       L(dureturn27)
 934         .align 4
 935 L(duLcr1):                              /* mismatch recorded in cr1 */
 936         lwz     r31,48(1)
 937         lwz     r30,44(1)
 938         li      rRTN, 1
 939         bgt     cr1, L(dureturn29)
 940         lwz     r29,40(r1)
 941         lwz     r28,36(r1)
 942         li      rRTN, -1
 943         b       L(dureturn27)
 944         .align 4
 945 L(duLcr6):                              /* mismatch recorded in cr6 */
 946         lwz     r31,48(1)
 947         lwz     r30,44(1)
 948         li      rRTN, 1
 949         bgt     cr6, L(dureturn29)
 950         lwz     r29,40(r1)
 951         lwz     r28,36(r1)
 952         li      rRTN, -1
 953         b       L(dureturn27)
 954         .align 4
 955 L(duLcr5):                              /* mismatch recorded in cr5 */
 956         lwz     r31,48(1)
 957         lwz     r30,44(1)
 958         li      rRTN, 1
 959         bgt     cr5, L(dureturn29)
 960         lwz     r29,40(r1)
 961         lwz     r28,36(r1)
 962         li      rRTN, -1
 963         b       L(dureturn27)
 964         .align  3
 965 L(duZeroReturn):
 966         li      rRTN,0
 967         .align  4
 968 L(dureturn):                            /* each entry below restores the registers still pending */
 969         lwz     r31,48(1)
 970         lwz     r30,44(1)
 971 L(dureturn29):
 972         lwz     r29,40(r1)
 973         lwz     r28,36(r1)
 974 L(dureturn27):
 975         lwz     r27,32(r1)
 976 L(dureturn26):
 977         lwz     r26,28(r1)
 978 L(dureturn25):
 979         lwz     r25,24(r1)
 980         lwz     r24,20(r1)
 981         lwz     1,0(1)                  /* reload r1 from the back chain (drops the frame) */
 982         blr
 983 END (BP_SYM (memcmp))
 984
 985 libc_hidden_builtin_def (memcmp)
 986 weak_alias (memcmp, bcmp)