sysdeps/powerpc/powerpc64/le/power9/strcmp.S

   1 /* Optimized strcmp implementation for PowerPC64/POWER9.
   2    Copyright (C) 2016-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18 #include <sysdep.h>
  19
  20 #ifndef STRCMP
  21 # define STRCMP strcmp
  22 #endif
  23
  24 /* Implements the function
  25
  26    int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
  27
  28    The implementation uses unaligned doubleword access for first 16 bytes
  29    as in POWER8 patch and uses vectorised loops after that.  */
  30
  31 /* TODO: Change this to actual instructions when minimum binutils is upgraded
  32    to 2.27.  Macros are defined below for these newer instructions in order
  33    to maintain compatibility.
        Each macro emits one 32-bit instruction word with .long: a fixed
        opcode constant ORed with the operand numbers shifted into place
        (first operand << 21, then << 16 and/or << 11).  Because the
        operands take part in integer shifts, names such as r6 or v7 passed
        at the call sites must expand to plain numbers (presumably done by
        sysdep.h -- not visible in this file).
        The code below branches on cr6 right after every VCMPNEZB, i.e. it
        is used as a record-form compare.  */
  34 #define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))
  35
  36 #define VEXTUBRX(t,a,b) .long (0x1000070d \
  37                                 | ((t)<<(32-11))  \
  38                                 | ((a)<<(32-16))  \
  39                                 | ((b)<<(32-21)) )
  40
  41 #define VCMPNEZB(t,a,b) .long (0x10000507 \
  42                                 | ((t)<<(32-11))  \
  43                                 | ((a)<<(32-16))  \
  44                                 | ((b)<<(32-21)) )
  45
  46 /* Get 16 bytes for unaligned case.
  47    reg1: Vector to hold next 16 bytes.
  48    reg2: Address to read from.
  49    reg3: Permute control vector.
        Requires v0 = all 0x00 bytes and v2 = all 0xFF bytes (both are set
        up at L(align)).  Clobbers v8, v9, r6 and cr6, and defines the
        numeric local labels 1 and 2.
        Steps: load the quadword at reg2 into reg1, permute it with v2 by
        reg3 into v8 and compare v8 with zero.  If no byte of v8 is zero
        the quadword at reg2 + 16 is loaded into v9; otherwise v9 is set
        to zero and that second load is skipped (apparently so that memory
        after a terminator is never touched -- intent inferred).  Finally
        reg1 = vperm (v9, reg1, reg3).  */
  50 #define GET16BYTES(reg1, reg2, reg3) \
  51         lvx     reg1, 0, reg2; \
  52         vperm   v8, v2, reg1, reg3; \
  53         vcmpequb.       v8, v0, v8; \
  54         beq     cr6, 1f; \
  55         vspltisb        v9, 0; \
  56         b       2f; \
  57         .align 4; \
  58 1: \
  59         addi    r6, reg2, 16; \
  60         lvx     v9, 0, r6; \
  61 2: \
  62         vperm   reg1, v9, reg1, reg3;
  63
  64 /* TODO: change this to .machine power9 when the minimum required binutils
  65    allows it.  */
  66
  67         .machine  power7
  68 ENTRY_TOCLESS (STRCMP, 4)
  69         li      r0, 0           /* r0 = 0; all-zero operand for the cmpb NUL tests below.  */
  70
  71         /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
  72            the code:
  73
  74             (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
  75
  76            with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
  77
  78         rldicl  r7, r3, 0, 52   /* r7 = s1 & 0xFFF (offset of s1 inside its 4K page).  */
  79         rldicl  r9, r4, 0, 52   /* r9 = s2 & 0xFFF (offset of s2 inside its 4K page).  */
  80         cmpldi  cr7, r7, 4096-16
  81         bgt     cr7, L(pagecross_check) /* A 16-byte read of s1 would cross the page.  */
  82         cmpldi  cr5, r9, 4096-16
  83         bgt     cr5, L(pagecross_check) /* A 16-byte read of s2 would cross the page.  */
  84
  85         /* For short strings up to 16 bytes,  load both s1 and s2 using
  86            unaligned dwords and compare.  */
  87         ld      r8, 0(r3)       /* r8 = s1[0..7].  */
  88         ld      r10, 0(r4)      /* r10 = s2[0..7].  */
  89         cmpb    r12, r8, r0     /* r12: 0xFF in every byte where s1 holds NUL.  */
  90         cmpb    r11, r8, r10    /* r11: 0xFF in every byte where s1 == s2.  */
  91         orc.    r9, r12, r11    /* r9 = r12 | ~r11: non-zero on a NUL or a mismatch.  */
  92         bne     cr0, L(different_nocmpb)        /* Expects r8, r10, r9 as set here.  */
  93
  94         ld      r8, 8(r3)       /* Same test for bytes 8..15.  */
  95         ld      r10, 8(r4)
  96         cmpb    r12, r8, r0
  97         cmpb    r11, r8, r10
  98         orc.    r9, r12, r11
  99         bne     cr0, L(different_nocmpb)
 100
 101         addi    r7, r3, 16      /* r7 = s1 + 16; s1 is tracked in r7 from here on.  */
 102         addi    r4, r4, 16      /* r4 = s2 + 16.  */
 103
 104 L(align):
 105         /* First 16 bytes (or the page-cross byte loop) are done; r7 = s1, r4 = s2.  */
 106         vspltisb        v0, 0   /* v0 = sixteen 0x00 bytes (needed by GET16BYTES).  */
 107         vspltisb        v2, -1  /* v2 = sixteen 0xFF bytes (needed by GET16BYTES).  */
 108         lvsr    v6, 0, r4   /* Permute control for the alignment of s2.  */
 109         or      r5, r4, r7
 110         andi.   r5, r5, 0xF     /* Low 4 bits of either pointer set?  */
 111         beq     cr0, L(aligned) /* No: both pointers are 16-byte aligned.  */
 112         andi.   r5, r7, 0xF
 113         beq     cr0, L(s1_align)        /* Only s1 is 16-byte aligned.  */
 114         lvsr    v10, 0, r7   /* Permute control for the alignment of s1.  */
 115
 116         /* s1 is unaligned here; s2 may or may not be.  */
 117         GET16BYTES(v4, r7, v10) /* v4 = 16 bytes of s1.  */
 118         GET16BYTES(v5, r4, v6)  /* v5 = 16 bytes of s2.  */
 119         VCMPNEZB(v7, v5, v4)    /* v7 = per-byte compare result; cr6 is tested next.  */
 120         beq     cr6, L(match)   /* No mismatch and no NUL in these 16 bytes.  */
 121         b       L(different)
 122
 123         /* Align s1 to qw and adjust s2 address.  */
 124         .align  4
 125 L(match):
 126         clrldi  r6, r7, 60      /* r6 = s1 & 0xF.  */
 127         subfic  r5, r6, 16      /* r5 = 16 - r6 = bytes up to the next 16-byte boundary of s1.  */
 128         add     r7, r7, r5      /* s1 is now 16-byte aligned.  */
 129         add     r4, r4, r5      /* Advance s2 by the same amount.  */
 130         andi.   r5, r4, 0xF
 131         beq     cr0, L(aligned) /* s2 ended up aligned as well.  */
 132         lvsr    v6, 0, r4       /* New permute control for s2; falls through to L(s1_align).  */
 133         /* There are 2 loops depending on the input alignment.
 134            Each loop gets 16 bytes from s1 and s2 and compares.
 135            Loop until a mismatch or null occurs.
             This loop handles s1 (r7) 16-byte aligned and s2 (r4) unaligned,
             unrolled four times.  s1 is loaded with "lvx v4, 0, r7", the same
             form the L(aligned) loop uses, so the address does not depend on
             r0 still holding zero.  */
 136 L(s1_align):
 137         lvx     v4, 0, r7       /* v4 = 16 bytes of s1.  */
 138         GET16BYTES(v5, r4, v6)  /* v5 = 16 bytes of s2 (unaligned fetch).  */
 139         VCMPNEZB(v7, v5, v4)    /* v7 = per-byte compare result; cr6 is tested below.  */
 140         addi    r7, r7, 16
 141         addi    r4, r4, 16
 142         bne     cr6, L(different)       /* Mismatch or NUL found.  */
 143
 144         lvx     v4, 0, r7
 145         GET16BYTES(v5, r4, v6)
 146         VCMPNEZB(v7, v5, v4)
 147         addi    r7, r7, 16
 148         addi    r4, r4, 16
 149         bne     cr6, L(different)
 150
 151         lvx     v4, 0, r7
 152         GET16BYTES(v5, r4, v6)
 153         VCMPNEZB(v7, v5, v4)
 154         addi    r7, r7, 16
 155         addi    r4, r4, 16
 156         bne     cr6, L(different)
 157
 158         lvx     v4, 0, r7
 159         GET16BYTES(v5, r4, v6)
 160         VCMPNEZB(v7, v5, v4)
 161         addi    r7, r7, 16
 162         addi    r4, r4, 16
 163         beq     cr6, L(s1_align)        /* Still equal and no NUL: next 64 bytes.  */
 164         b       L(different)
 165
 166         .align  4
 167 L(aligned):                     /* Both s1 (r7) and s2 (r4) are 16-byte aligned; unrolled x4.  */
 168         lvx     v4, 0, r7       /* v4 = 16 bytes of s1.  */
 169         lvx     v5, 0, r4       /* v5 = 16 bytes of s2.  */
 170         VCMPNEZB(v7, v5, v4)    /* v7 = per-byte compare result; cr6 is tested below.  */
 171         addi    r7, r7, 16
 172         addi    r4, r4, 16
 173         bne     cr6, L(different)       /* Mismatch or NUL found.  */
 174
 175         lvx     v4, 0, r7
 176         lvx     v5, 0, r4
 177         VCMPNEZB(v7, v5, v4)
 178         addi    r7, r7, 16
 179         addi    r4, r4, 16
 180         bne     cr6, L(different)
 181
 182         lvx     v4, 0, r7
 183         lvx     v5, 0, r4
 184         VCMPNEZB(v7, v5, v4)
 185         addi    r7, r7, 16
 186         addi    r4, r4, 16
 187         bne     cr6, L(different)
 188
 189         lvx     v4, 0, r7
 190         lvx     v5, 0, r4
 191         VCMPNEZB(v7, v5, v4)
 192         addi    r7, r7, 16
 193         addi    r4, r4, 16
 194         beq     cr6, L(aligned) /* Loop while equal; otherwise fall through to L(different).  */
 195
 196         /* Calculate and return the difference.  In: v4 = s1 bytes, v5 = s2 bytes, v7 = compare result.  */
 197 L(different):
 198         VCTZLSBB(r6, v7)        /* r6 = byte index derived from v7 (presumably the first flagged byte).  */
 199         VEXTUBRX(r5, r6, v4)    /* r5 = byte of v4 (s1) selected by r6.  */
 200         VEXTUBRX(r4, r6, v5)    /* r4 = byte of v5 (s2) selected by r6.  */
 201         subf    r3, r4, r5      /* r3 = r5 - r4.  */
 202         extsw   r3, r3          /* Sign-extend the int result to 64 bits.  */
 203         blr
 204
 205         .align  4
 206 L(different_nocmpb):            /* In: r8 = s1 dword, r10 = s2 dword, r9 = non-zero NUL/mismatch mask.  */
 207         neg     r11, r9         /* r11 = -mask.  */
 208         and     r11, r9, r11    /* Keep only the lowest set bit of the mask.  */
 209         cntlzd  r11, r11        /* Leading zeros in front of that bit...  */
 210         subfic  r11, r11, 63    /* ...turned into its bit position.  */
 211         srd     r8, r8, r11     /* Shift the deciding s1 byte down to bits 0..7.  */
 212         srd     r10, r10, r11   /* Same for the s2 byte.  */
 213         clrldi  r8, r8, 56      /* Keep only the low byte of s1.  */
 214         clrldi  r10, r10, 56    /* Keep only the low byte of s2.  */
 215         subf    r3, r10, r8     /* r3 = s1 byte - s2 byte.  */
 216         extsw   r3, r3          /* Sign-extend the int result to 64 bits.  */
 217         blr
 218
 219         .align  4
 220 L(pagecross_check):             /* In: r7 / r9 = page offsets of s1 / s2 from the entry code.  */
 221         subfic  r9, r9, 4096    /* r9 = bytes from s2 to its next 4K boundary.  */
 222         subfic  r7, r7, 4096    /* r7 = bytes from s1 to its next 4K boundary.  */
 223         cmpld   cr7, r7, r9
 224         bge     cr7, L(pagecross)       /* r7 is already the larger distance.  */
 225         mr      r7, r9          /* Otherwise take the s2 distance.  */
 226
 227         /* If an unaligned 16-byte read would cross a 4K page boundary, use
 228            a simple byte-by-byte comparison for r7 bytes (the larger of the
 229            two distances to the next page boundary), then join L(align).  */
 230 L(pagecross):
 231         add     r7, r3, r7      /* r7 = s1 + count: where s1 stands once the loop is done.  */
 232         subf    r9, r3, r7      /* r9 = r7 - s1 = count.  */
 233         mtctr   r9              /* CTR = number of bytes to compare one at a time.  */
 234
 235         .align  4
 236 L(pagecross_loop):
 237         /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
 238            and if *s1 is '\0'.  */
 239         lbz     r9, 0(r3)       /* r9 = *s1.  */
 240         lbz     r10, 0(r4)      /* r10 = *s2.  */
 241         addi    r3, r3, 1
 242         addi    r4, r4, 1
 243         cmplw   cr7, r9, r10    /* cr7: *s1 vs *s2.  */
 244         cmpdi   cr5, r9, 0      /* cr5: is *s1 NUL?  cmpdi takes an immediate, so spell it 0.  */
 245         bne     cr7, L(pagecross_ne)
 246         beq     cr5, L(pagecross_nullfound)
 247         bdnz    L(pagecross_loop)
 248         b       L(align)        /* Count exhausted: r3 == r7, continue with the vector code.  */
 249
 250         .align  4
 251 L(pagecross_ne):                /* In: r9 = *s1, r10 = *s2 (bytes loaded with lbz), r9 != r10.  */
 252         extsw   r3, r9          /* r3 = s1 byte.  */
 253         mr      r9, r10         /* r9 = s2 byte.  */
 254 L(pagecross_retdiff):           /* In: r3 = minuend, r9 = subtrahend.  */
 255         subf    r9, r9, r3      /* r9 = r3 - r9.  */
 256         extsw   r3, r9          /* Return value: the difference sign-extended from 32 bits.  */
 257         blr
 258
 259         .align  4
 260 L(pagecross_nullfound):         /* Bytes were equal and *s1 was NUL, so r9 == 0 here.  */
 261         li      r3, 0
 262         b       L(pagecross_retdiff)    /* Yields 0 - 0 = 0.  */
 263 END (STRCMP)
 264 libc_hidden_builtin_def (strcmp)