/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
          && ((bytes >= 16 && align >= 128)
              || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
        {
          clear_bytes = 16;
          mode = V4SImode;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          clear_bytes = 8;
          mode = DImode;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        {                       /* move 4 bytes */
          clear_bytes = 4;
          mode = SImode;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        {                       /* move 2 bytes */
          clear_bytes = 2;
          mode = HImode;
        }
      else /* move 1 byte at a time */
        {
          clear_bytes = 1;
          mode = QImode;
        }

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
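
/* Worked example (illustrative figures, not from the original sources):
   clearing 22 bytes with TARGET_POWERPC64, 64-bit alignment and the
   Altivec path not taken gives clear_step = 8, and the loop above
   emits stores of 8+8+4+2 zero bytes: two DImode stores, one SImode
   store and one HImode store.  */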

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
        {
        case E_V16QImode:
          if (!BYTES_BIG_ENDIAN)
            {
              if (TARGET_P9_VECTOR)
                emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
              else
                {
                  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
                                                      V16QImode, 0);
                  gcc_assert (MEM_P (mem));
                  rtx addr = XEXP (mem, 0);
                  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
                  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
                  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
                  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
                }
            }
          else
            emit_insn (gen_vsx_movv2di_64bit (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;
    case E_DImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqidi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhidi2 (reg, src));
            break;
          }
        case E_SImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (SImode);
                emit_insn (gen_bswapsi2 (src, mem));
              }
            emit_insn (gen_zero_extendsidi2 (reg, src));
            break;
          }
        case E_DImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapdi2 (reg, mem));
          else
            emit_insn (gen_movdi (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;
    case E_SImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqisi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhisi2 (reg, src));
            break;
          }
        case E_SImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapsi2 (reg, mem));
          else
            emit_insn (gen_movsi (reg, mem));
          break;
        case E_DImode:
          /* DImode is larger than the destination reg so is not expected.  */
          gcc_unreachable ();
          break;
        default:
          gcc_unreachable ();
        }
      break;
    case E_QImode:
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
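
/* A note on the byte swaps above: memcmp ordering is equivalent to an
   unsigned comparison of the data read as big-endian integers, so on
   little-endian targets the loads are byte-reversed (bswap, or the
   ld_elemrev patterns for vectors) so that a plain unsigned register
   compare yields the memcmp result.  */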

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared
   in bytes.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
                           unsigned HOST_WIDE_INT bytes,
                           unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte.  */
  return QImode;
}
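
/* Illustrative trace (example values, not from the original sources):
   on a 64-bit target with UNITS_PER_WORD == 8, a 13-byte compare picks
   DImode for the first chunk (13 >= 8).  For the remaining 5 bytes at
   offset 8, on a TARGET_EFFICIENT_OVERLAPPING_UNALIGNED (P8/P9) target
   the overlap case applies since offset 8 >= 8 - 5, so DImode is
   chosen again for a read that ends exactly at the end of the block.  */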

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
                           unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}
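
/* For example: offset & -offset isolates the lowest set bit of OFFSET,
   so with BASE_ALIGN 8 and OFFSET 12 this returns MIN (8, 4) = 4;
   after advancing 12 bytes only 4-byte alignment can still be
   guaranteed.  */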

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
                               rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is null_rtx, then a new register of CMPMODE is generated and
   used for the comparison.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
           rtx a, rtx b, rtx cr, rtx true_label)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
              || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
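
/* Usage sketches (patterned on the calls later in this file):

     do_ifelse (CCmode, GT, len, GEN_INT (n), NULL_RTX, lab);

   emits a compare of LEN against N plus a conditional branch to LAB,
   while

     do_ifelse (CCmode, NE, NULL_RTX, NULL_RTX, cr, lab);

   emits only the branch, reusing CR as set by a preceding record-form
   (dot) instruction.  */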

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx controlling the isel.
   SRC_T is the isel source if CMP is true.
   SRC_F is the isel source if CMP is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}
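
/* cmpb compares SRC1 and SRC2 byte by byte, setting each byte of DEST
   to 0xff where the corresponding bytes are equal and to 0x00 where
   they differ; the strcmp/strncmp expansion below uses it to locate
   the first zero or differing byte within a chunk.  */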

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the rotate input.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
                      rtx dcond, rtx src1_addr, rtx src2_addr,
                      rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
                              gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
                              gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
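
/* Worked example (illustrative): with load_mode DImode and cmp_rem 3,
   shift_amount becomes (8 - 3) * 8 = 40 bits, so the logical shift
   right discards the five bytes that lie beyond the block and only the
   three valid bytes (left-justified by the load) take part in the
   compare.  */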

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
                         HOST_WIDE_INT bytes_rem, rtx diff,
                         rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
                         rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
        emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
        {
          rtx reg_lms = gen_reg_rtx (word_mode);
          emit_move_insn (reg_lms, GEN_INT (load_mode_size));
          do_sub3 (adj_reg, cmp_rem, reg_lms);
        }

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
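
/* Worked example (illustrative): with load_mode DImode and BYTES_REM 5,
   addr_adj is 3, so both loads start 3 bytes before the current
   position and end exactly at the end of the block.  In the
   constant-length case those 3 leading bytes were already compared
   equal by a preceding full-word step, so they cannot introduce a
   spurious difference.  */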

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
        if (minalign < 8)
          max_bytes = 0;
        else
          max_bytes = 128;
      else
        if (minalign < 8)
          max_bytes = 32;
        else
          max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
        max_bytes = 0;
      else
        if (minalign < 8)
          max_bytes = 128;
        else
          max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
        max_bytes = 191;
      else
        max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;
  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
        /* Do not expect length longer than word_mode.  */
        return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
        {
          bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
          bytes_rtx = force_reg (word_mode,
                                 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
                                                bytes_rtx));
        }
      else
        /* Make sure it's in a register before we get started.  */
        bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* Max iterations and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);
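
  /* For the common 64-bit case (illustrative numbers): load_mode_size
     is 8, so loop_bytes is 16 and l2lb is 4; with max_bytes == 128 the
     unrolled loop runs at most 8 iterations covering 128 bytes before
     the remainder code takes over.  */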

  if (bytes_is_const && (max_bytes < load_mode_size
                         || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx j;

  /* Example of generated code for 35 bytes aligned 1 byte.

             mtctr 8
             li 6,0
             li 5,8
     .L13:
             ldbrx 7,3,6
             ldbrx 9,10,6
             ldbrx 0,3,5
             ldbrx 4,10,5
             addi 6,6,16
             addi 5,5,16
             subfc. 9,9,7
             bne 0,.L10
             subfc. 9,4,0
             bdnzt 2,.L13
             bne 0,.L10
             add 3,3,6
             add 10,10,6
             addi 9,3,-5
             ldbrx 7,0,9
             addi 9,10,-5
             ldbrx 9,0,9
             subfc 9,9,7
             .p2align 4,,15
     .L10:
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
        call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
        remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
        load 8B, shift off bytes past length, compare
        load 8B ending at last byte and compare
        load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
        {
          no_remainder_code = true;
          niter = max_loop_iter;
          library_call_label = gen_label_rtx ();
        }
      else
        {
          niter = bytes / loop_bytes;
        }
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
         possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
                 NULL_RTX, library_call_label);

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
                 NULL_RTX, cleanup_label);

      /* Loop compare bytes and iterations if bytes > max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
        emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
        emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
        {
          emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
        }
      else
        {
          emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
        }

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
        {
          /* P9 has fast isel so we use one compare and two isel.  */
          cr = gen_reg_rtx (CCmode);
          rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
                                             GEN_INT (max_bytes));
          emit_move_insn (cr, compare_rtx);
          rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
          do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
          do_isel (iter, cmp_rtx, iter, mi_reg, cr);
        }
      else
        {
          rtx lab_after = gen_label_rtx ();
          do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
                     NULL_RTX, lab_after);
          emit_move_insn (loop_cmp, mb_reg);
          emit_move_insn (iter, mi_reg);
          emit_label (lab_after);
        }

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if the remaining byte
         count is < 16 in the runtime case either.  Compute number of
         loop iterations.  We compare 2*word_mode per iteration so 16B
         for 64-bit code and 8B for 32-bit.  Set up two induction
         variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
         pseudo, cse will get rid of it and then the allocator will
         see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
                                     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
                                     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
                                     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
                                     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
        {
          /* Generate a compare, and convert with a setb later.  */
          rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
          emit_insn (gen_rtx_SET (dcond, cmp));
        }
      else
        {
          dcond = gen_reg_rtx (CCmode);
          if (word_mode == DImode)
            emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
          else
            emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
        }

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
                 dcond, diff_label);

      if (TARGET_P9_MISC)
        {
          /* Generate a compare, and convert with a setb later.  */
          rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
          emit_insn (gen_rtx_SET (dcond, cmp));
        }
      else
        {
          dcond = gen_reg_rtx (CCmode);
          if (word_mode == DImode)
            emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
          else
            emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
        }

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
        j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
                                           eqrtx, dcond));
      else
        j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
                                           eqrtx, dcond));
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
         have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
        emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
        emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
        {
          /* If we're dealing with runtime length, we have to check if
             it's zero after the loop.  When length is known at compile
             time the no-remainder condition is dealt with above.  By
             doing this after cleanup_label, we also deal with the
             case where length is 0 at the start and we bypass the
             loop with a branch to cleanup_label.  */
          emit_move_insn (target, const0_rtx);
          do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
                     NULL_RTX, final_label);
        }

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
        {
          /* If remainder length < word length, branch to final
             cleanup compare.  */
          if (!bytes_is_const)
            do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
                       NULL_RTX, final_cleanup);

          /* load and compare 8B */
          do_load_for_compare_from_addr (load_mode, d1_1,
                                         src1_addr, orig_src1);
          do_load_for_compare_from_addr (load_mode, d2_1,
                                         src2_addr, orig_src2);

          /* Compare the word, see if we need to do the last partial.  */
          if (TARGET_P9_MISC)
            {
              /* Generate a compare, and convert with a setb later.  */
              rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
              emit_insn (gen_rtx_SET (dcond, cmp));
            }
          else
            {
              dcond = gen_reg_rtx (CCmode);
              if (word_mode == DImode)
                emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
              else
                emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
            }

          do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
                     dcond, diff_label);

          do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
          do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
          emit_move_insn (cmp_rem_before, cmp_rem);
          do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
          if (bytes_is_const)
            bytes_remaining -= load_mode_size;
          else
            /* See if remaining length is now zero.  We previously set
               target to 0 so we can just jump to the end.  */
            do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
                       NULL_RTX, final_label);
        }

      /* Cases:
         bytes_is_const
           We can always shift back to do an overlapping compare
           of the last chunk because we know length >= 8.

         !bytes_is_const
           align>=load_mode_size
             Read word_mode and mask
           align<load_mode_size
             avoid stepping past end

         Three strategies:
         * decrement address and do overlapping compare
         * read word_mode and mask
         * carefully avoid crossing 4k boundary  */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
          && align1 >= load_mode_size && align2 >= load_mode_size)
        {
          /* Alignment is larger than word_mode so we do not need to be
             concerned with extra page crossings.  But, we do not know
             that the length is larger than load_mode_size so we might
             end up comparing against data before the block if we try
             an overlapping compare.  Also we use this on P7 for fixed length
             remainder because P7 doesn't like overlapping unaligned.
             Strategy: load 8B, shift off bytes past length, and compare.  */
          emit_label (final_cleanup);
          do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
                                src1_addr, src2_addr, orig_src1, orig_src2);
        }
      else if (bytes_remaining && bytes_is_const)
        {
          /* We do not do loop expand if length < 32 so we know at the
             end we can do an overlapping compare.
             Strategy: shift address back and do word_mode load that
             ends at the end of the block.  */
          emit_label (final_cleanup);
          do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
                                   cmp_rem, dcond, src1_addr, src2_addr,
                                   orig_src1, orig_src2);
        }
      else if (!bytes_is_const)
        {
          rtx handle4k_label = gen_label_rtx ();
          rtx nonconst_overlap = gen_label_rtx ();
          emit_label (nonconst_overlap);

          /* Here we have to handle the case where we have runtime
             length which may be too short for overlap compare, and
             alignment is not at least load_mode_size so we have to
             tread carefully to avoid stepping across 4k boundaries.  */

          /* If the length after the loop was larger than word_mode
             size, we can just do an overlapping compare and we're
             done.  We fall through to this code from the word_mode
             compare that precedes this.  */
          do_overlap_load_compare (load_mode, false, 0, diff,
                                   cmp_rem, dcond, src1_addr, src2_addr,
                                   orig_src1, orig_src2);

          rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
          j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
          JUMP_LABEL (j) = diff_label;
          LABEL_NUSES (diff_label) += 1;
          emit_barrier ();

          /* If we couldn't do the overlap compare we have to be more
             careful of the 4k boundary.  Test to see if either
             address is less than word_mode_size away from a 4k
             boundary.  If not, then we can do a load/shift/compare
             and we are done.  We come to this code if length was less
             than word_mode_size.  */

          emit_label (final_cleanup);

          /* We can still avoid the slow case if the length was larger
             than one loop iteration, in which case go do the overlap
             load compare path.  */
          do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
                     NULL_RTX, nonconst_overlap);

          rtx rem4k = gen_reg_rtx (word_mode);
          rtx dist1 = gen_reg_rtx (word_mode);
          rtx dist2 = gen_reg_rtx (word_mode);
          do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
          if (word_mode == SImode)
            emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
          else
            emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
          do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
          if (word_mode == SImode)
            emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
          else
            emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
          do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);

          /* We don't have a 4k boundary to deal with, so do
             a load/shift/compare and jump to diff.  */

          do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
                                src1_addr, src2_addr, orig_src1, orig_src2);

          j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
          JUMP_LABEL (j) = diff_label;
          LABEL_NUSES (diff_label) += 1;
          emit_barrier ();

          /* Finally in the unlikely case we are inching up to a
             4k boundary we use a compact lbzx/compare loop to do
             it a byte at a time.  */

          emit_label (handle4k_label);

          rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
          emit_move_insn (ctr, cmp_rem);
          rtx ixreg = gen_reg_rtx (Pmode);
          emit_move_insn (ixreg, const0_rtx);

          rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
          rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
          rtx d1 = gen_reg_rtx (word_mode);
          rtx d2 = gen_reg_rtx (word_mode);

          rtx fc_loop = gen_label_rtx ();
          emit_label (fc_loop);

          do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
          do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

          do_add3 (ixreg, ixreg, const1_rtx);

          rtx cond = gen_reg_rtx (CCmode);
          rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
          rs6000_emit_dot_insn (diff, subexpr, 2, cond);

          rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
          if (TARGET_64BIT)
            j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
                                               eqrtx, cond));
          else
            j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
                                               eqrtx, cond));
          JUMP_LABEL (j) = fc_loop;
          LABEL_NUSES (fc_loop) += 1;

          if (TARGET_64BIT)
            emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
          else
            emit_move_insn (target, diff);

          /* Since we are comparing bytes, the difference can be used
             as the final result and we are done here.  */
          j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
          JUMP_LABEL (j) = final_label;
          LABEL_NUSES (final_label) += 1;
          emit_barrier ();
        }
    }

  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */
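
  /* Our reading of why this preserves the ordering, stated for clarity:
     the subtract leaves L == 0 iff the words were equal, and popcntd
     maps any nonzero L to a value in [1, 64], so the low result is
     positive exactly when a difference was found; the subfe result H
     is 0 or all-ones depending on the carry, and OR-ing it in forces
     the result negative when the word from the first source compares
     below that of the second.  */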

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
        {
          rtx tmp_reg_ca = gen_reg_rtx (DImode);
          emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
          emit_insn (gen_popcntddi2 (diff, diff));
          emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
          emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
        }
      else
        {
          rtx tmp_reg_ca = gen_reg_rtx (SImode);
          emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
          emit_insn (gen_popcntdsi2 (diff, diff));
          emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
        }
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the number of bytes not
         yet compared, which is the amount memcmp is expected to compare.
         If we don't find a difference in the loop compare, do the
         library call directly instead of doing a small compare just
         to get to an arbitrary boundary before calling it anyway.
         Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
        {
          emit_move_insn (len_rtx, cmp_rem);
          do_add3 (src1_addr, src1_addr, iv1);
          do_add3 (src2_addr, src2_addr, iv1);
        }
      else
        emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                               target, LCT_NORMAL, GET_MODE (target),
                               src1_addr, Pmode,
                               src2_addr, Pmode,
                               len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
          || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
             ldbrx 10,31,8
             ldbrx 9,7,8
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,8
             addi 5,11,8
             ldbrx 10,0,9
             ldbrx 9,0,5
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,16
             lhbrx 10,0,9
             addi 9,11,16
             lhbrx 9,0,9
             subf 9,9,10
             b .L6488
             .p2align 4,,15
     .L6487: #convert_label
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10
     .L6488: #final_label
             extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
        {
          /* Target is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed 64->32 conversion but
             the current block does not, so a label is needed to jump
             to the end.  */
          if (generate_6432_conversion && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the result
                 of this subtract is not zero.  */
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (target,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (target,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if target size == load_mode size because
             the subtract generates one extra bit.  */
          generate_6432_conversion = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                /* Generate a subfc. and use the longer
                   sequence for conversion.  */
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
                else
                  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last block
                 the convert code will be generated immediately following.  */
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
                else
                  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
        emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
         while maintaining <0 / ==0 / >0 properties.  This sequence works:
         subfc L,A,B
         subfe H,H,H
         popcntd L,L
         rldimi L,H,6,0

         This is an alternate one Segher cooked up if somebody
         wants to expand this for something that doesn't have popcntd:
         subfc L,a,b
         subfe H,x,x
         addic t,L,-1
         subfe v,t,L
         or z,v,H

         And finally, p9 can just do this:
         cmpld A,B
         setb r */

      if (TARGET_P9_MISC)
        {
          emit_insn (gen_setb_unsigned (target, cond));
        }
      else
        {
          if (TARGET_64BIT)
            {
              rtx tmp_reg_ca = gen_reg_rtx (DImode);
              emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
              emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
            }
          else
            {
              rtx tmp_reg_ca = gen_reg_rtx (SImode);
              emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
            }
        }
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}

/* Generate page crossing check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC_ADDR is the string address to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src_addr,
                            HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
                                         GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     lab_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}
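
/* Numeric sketch (illustrative): with BYTES == 8 the branch is taken
   when (SRC_ADDR & 0xfff) >= 4096 - 8, i.e. when an 8-byte load at
   SRC_ADDR could run past the end of the current 4 KiB page into a
   possibly unmapped one.  */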

/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignment of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                             unsigned int base_align,
                             rtx orig_src1, rtx orig_src2,
                             rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
                             bool equality_compare_rest, rtx *p_cleanup_label,
                             rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
         check each 8B with: ld/ld/cmpb/cmpb/orc./bne

         cleanup code at end:
         cntlzd        get bit of first zero/diff byte
         subfic        convert for rldcl use
         rldcl rldcl   extract diff/zero byte
         subf          subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes_to_compare = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;
1801 rtx offset_rtx;
1802 if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
1803 offset_rtx = GEN_INT (offset);
1804 else
1806 offset_rtx = gen_reg_rtx (Pmode);
1807 emit_move_insn (offset_rtx, GEN_INT (offset));
1809 rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
1810 rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
1812 do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
1813 do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
1815 /* We must always left-align the data we read, and
1816 clear any bytes to the right that are beyond the string.
1817 Otherwise the cmpb sequence won't produce the correct
1818 results. However if there is only one byte left, we
1819 can just subtract to get the final result so the shifts
1820 and clears are not needed. */
1822 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
1824 /* Loading just a single byte is a special case. If we are
1825 loading more than that, we have to check whether we are
1826 looking at the entire chunk of data. If not, rotate left and
1827 clear right so that bytes we aren't supposed to look at are
1828 zeroed, and the first byte we are supposed to compare is
1829 leftmost. */
1830 if (load_mode_size != 1)
1832 if (load_mode_size < word_mode_size)
1834 /* Rotate left first. */
1835 rtx sh = GEN_INT (BITS_PER_UNIT
1836 * (word_mode_size - load_mode_size));
1837 do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
1838 do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
1841 if (cmp_bytes < word_mode_size)
1843 /* Now clear right. This plus the rotate can be
1844 turned into a rldicr instruction. */
1845 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
1846 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
1847 do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
1848 do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
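
      /* To make the rotate-and-clear concrete (an illustrative case,
         not from the original comment): for a 4-byte load into a
         64-bit word with only cmp_bytes == 3 bytes valid, the rotate
         amount is 8 * (8 - 4) = 32 bits, which moves the loaded bytes
         into the most significant half, and the mask is
         ~0 << (8 * (8 - 3)), which keeps only the top three bytes, so
         data past the end of the string cannot perturb the cmpb
         results.  */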
      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, call strcmp/strncmp
         3: compared requested N bytes:
         A == B: branch to result 0.
         A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
        {
          /* Branch to cleanup code, otherwise fall through to do
             more compares.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }
      else
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

      if (load_mode_size == 1)
        {
          /* Special case for comparing just single byte.  */
          if (equality_compare_rest)
            {
              /* Use subf./bne to branch to final_move_label if the
                 byte differs, otherwise fall through to the strncmp
                 call.  We must also check for a zero byte here as we
                 must not make the library call if this is the end of
                 the string.  */

              rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx cond = gen_reg_rtx (CCmode);
              rtx diff_rtx = gen_rtx_MINUS (word_mode,
                                            tmp_reg_src1, tmp_reg_src2);
              rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
              rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                                 lab_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;

              /* Check for zero byte here before fall through to
                 library call.  This catches the case where the
                 strings are equal and end in a zero byte at this
                 position.  */

              rtx cond0 = gen_reg_rtx (CCmode);
              emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
                                                      const0_rtx));

              rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

              rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
                                                  lab_ref, pc_rtx);
              rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
              JUMP_LABEL (j0) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
            }
          else
            {
              /* This is the last byte to be compared so we can use
                 subf to compute the final result and branch
                 unconditionally to final_move_label.  */

              do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
              emit_barrier ();
            }
        }
      else
        {
          rtx cmpb_zero = gen_reg_rtx (word_mode);
          rtx cmpb_diff = gen_reg_rtx (word_mode);
          rtx zero_reg = gen_reg_rtx (word_mode);
          rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
          rtx cond = gen_reg_rtx (CCmode);

          emit_move_insn (zero_reg, GEN_INT (0));
          do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
          do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
          rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
          rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

          rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

          rtx cmp_rtx;
          if (remain == 0 && !equality_compare_rest)
            cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
          else
            cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

          rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                             lab_ref, pc_rtx);
          rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
          JUMP_LABEL (j) = dst_label;
          LABEL_NUSES (dst_label) += 1;
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}
/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
   instructions.

   BYTES_TO_COMPARE is the number of bytes to be compared.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   OFF_REG is the register to use for the string offset for loads.
   S1DATA is the register for loading the first string.
   S2DATA is the register for loading the second string.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                             rtx orig_src1, rtx orig_src2,
                             rtx s1addr, rtx s2addr, rtx off_reg,
                             rtx s1data, rtx s2data,
                             rtx vec_result, bool equality_compare_rest,
                             rtx *p_cleanup_label, rtx final_move_label)
{
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;

  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));

  unsigned int i;
  rtx zr[16];
  for (i = 0; i < 16; i++)
    zr[i] = GEN_INT (0);
  rtvec zv = gen_rtvec_v (16, zr);
  rtx zero_reg = gen_reg_rtx (V16QImode);
  rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
  while (bytes_to_compare > 0)
    {
      /* VEC/VSX compare sequence for P8:
         check each 16B with:
         lxvd2x 32,28,8
         lxvd2x 33,29,8
         vcmpequb 2,0,1  # compare strings
         vcmpequb 4,0,3  # compare w/ 0
         xxlorc 37,36,34 # first FF byte is either mismatch or end of string
         vcmpequb. 7,5,3 # reg 7 contains 0
         bnl 6,.Lmismatch

         For the P8 LE case, we use lxvd2x and compare full 16 bytes
         but then use vgbbd and a shift to get two bytes with the
         information we need in the correct order.

         VEC/VSX compare sequence if TARGET_P9_VECTOR:
         lxvb16x/lxvb16x   # load 16B of each string
         vcmpnezb.         # produces difference location or zero byte location
         bne 6,.Lmismatch

         Use the overlapping compare trick for the last block if it is
         less than 16 bytes.  */
      load_mode = V16QImode;
      load_mode_size = GET_MODE_SIZE (load_mode);

      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else
        {
          /* Move this load back so it doesn't go past the end.  P8/P9
             can do this efficiently.  This is never called with less
             than 16 bytes so we should always be able to do this.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          gcc_assert (offset > extra_bytes);
          offset -= extra_bytes;
          cmp_bytes = load_mode_size;
          bytes_to_compare = cmp_bytes;
        }

      /* The offset currently used is always kept in off_reg so that the
         cleanup code on P8 can use it to extract the differing byte.  */
      emit_move_insn (off_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
         A != B: branch to cleanup code to compute result.
         A == B: next block
         2: End of the inline comparison:
         A != B: branch to cleanup code to compute result.
         A == B: call strcmp/strncmp
         3: compared requested N bytes:
         A == B: branch to result 0.
         A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      if (TARGET_P9_VECTOR)
        emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
      else
        {
          /* Emit instructions to do comparison and zero check.  */
          rtx cmp_res = gen_reg_rtx (load_mode);
          rtx cmp_zero = gen_reg_rtx (load_mode);
          rtx cmp_combined = gen_reg_rtx (load_mode);
          emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
          emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
          emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
          emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
        }

      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
      rtx dst_label;
      rtx cmp_rtx;
      if (branch_to_cleanup)
        {
          /* Branch to cleanup code, otherwise fall through to do more
             compares.  P8 and P9 use different CR bits because on P8
             we are looking at the result of a comparison vs a
             register of zeroes so the all-true condition means no
             difference or zero was found.  On P9, vcmpnezb sets a byte
             to 0xff if there is a mismatch or zero, so the all-false
             condition indicates we found no difference or zero.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
          if (TARGET_P9_VECTOR)
            cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
          else
            cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
        }
      else
        {
          /* Branch to final return or fall through to cleanup,
             result is already set to 0.  */
          dst_label = final_move_label;
          if (TARGET_P9_VECTOR)
            cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
          else
            cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
        }

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                         lab_ref, pc_rtx);
      rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j2) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}
/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cntlzd       get bit of first zero/diff byte
   addi         convert for rldcl use
   rldcl rldcl  extract diff/zero byte
   subf         subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.  */

static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode m = GET_MODE (str1);
  rtx rot_amt = gen_reg_rtx (m);

  rtx rot1_1 = gen_reg_rtx (m);
  rtx rot1_2 = gen_reg_rtx (m);
  rtx rot2_1 = gen_reg_rtx (m);
  rtx rot2_2 = gen_reg_rtx (m);

  if (m == SImode)
    {
      emit_insn (gen_clzsi2 (rot_amt, result));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
    }
  else if (m == DImode)
    {
      emit_insn (gen_clzdi2 (rot_amt, result));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
    }
  else
    gcc_unreachable ();

  return;
}
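
/* A note on why the rotate amount above is clz (result) + 8 (our
   illustration, not from the original comments): after the earlier
   left-alignment, RESULT holds 0xff in the byte of the first
   difference or zero, so clz (result) is 8 * k for byte index k, and
   rotating left by 8 * k + 8 wraps that whole byte around into the
   least significant position, where the 0xff mask isolates it for the
   final subtract.  */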
/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
   vgbbd 0,0
   vsldoi 0,0,0,9
   mfvsrd 9,32
   addi 10,9,-1    # count trailing zero bits
   andc 9,10,9
   popcntd 9,9
   lbzx 10,28,9    # use that offset to load differing byte
   lbzx 3,29,9
   subf 3,3,10     # subtract for final result

   P9:
   vclzlsbb        # counts trailing bytes with lsb=0
   vextublx        # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte
   difference.  */

static void
emit_final_str_compare_vec (rtx str1, rtx str2, rtx result,
                            rtx s1addr, rtx s2addr,
                            rtx orig_src1, rtx orig_src2,
                            rtx off_reg, rtx vec_result)
{
  if (TARGET_P9_VECTOR)
    {
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
         dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
         For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
                                           result_gbbd, GEN_INT (shift_amt)));

      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      if (BYTES_BIG_ENDIAN)
        emit_insn (gen_clzdi2 (count, diffix));
      else
        emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
         a vsx reg like vextublx on P9 so we just compute the offset
         of the differing byte and load it from each string.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }

  return;
}
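
/* For reference (an illustrative aside, not from the original
   comments): the addi/andc/popcntd lines in the P8 sequence above are
   the usual expansion of ctzdi2 on that processor; for x != 0,
   (x - 1) & ~x sets exactly the bits below the least significant 1
   bit of x, so its population count equals the number of trailing
   zero bits.  */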
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */

  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
                 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
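
  /* Illustrative data points for this test (ours, not from the
     original comments): a 32-byte compare on a 64-bit POWER9 target
     takes the vector path below, while a compare of fewer than 16
     bytes, a 32-bit target, or a target without efficient unaligned
     VSX all fall back to the GPR sequence.  */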
  if (use_vec)
    required_align = 16;

  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
        compare_length = bytes;
      else
        equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
         rldicl r8,r3,0,52
         cmpldi cr7,r8,4096-16
         bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }
      else
        {
          align_test = ROUND_UP (align_test, required_align);
          base_align = required_align;
        }
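
      /* For example (illustrative numbers, not from the original
         comments): with required_align == 8, compare_length == 5
         gives align_test = 1 << ceil_log2 (5) == 8 and base_align = 8,
         while compare_length == 20 is instead rounded up to
         align_test == 24 with base_align left at the required 8.  */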
      if (align1 < required_align)
        expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
        expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode);
        }
      else
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though otherwise it is 32-bit.  The length arg to strncmp
             is a size_t which will be the same size as pointers.  */
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode,
                                   len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_strncmp_vec_sequence (compare_length,
                                   orig_src1, orig_src2,
                                   s1addr, s2addr, off_reg,
                                   tmp_reg_src1, tmp_reg_src2,
                                   vec_result,
                                   equality_compare_rest,
                                   &cleanup_label, final_move_label);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
                                 orig_src1, orig_src2,
                                 tmp_reg_src1, tmp_reg_src2,
                                 result_reg,
                                 equality_compare_rest,
                                 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode);
        }
      else
        {
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode, len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  if (use_vec)
    emit_final_str_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
                                s1addr, s2addr, orig_src1, orig_src2,
                                off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
        rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
         when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC
          && bytes >= 16
          && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
        {
          move_bytes = 16;
          mode = V4SImode;
          gen_func.mov = gen_movv4si;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          move_bytes = 8;
          mode = DImode;
          gen_func.mov = gen_movdi;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per load and/or store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
              addr = XEXP (orig_src, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_src = replace_equiv_address (orig_src, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        {                       /* move 4 bytes */
          move_bytes = 4;
          mode = SImode;
          gen_func.mov = gen_movsi;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        {                       /* move 2 bytes */
          move_bytes = 2;
          mode = HImode;
          gen_func.mov = gen_movhi;
        }
      else /* move 1 byte at a time */
        {
          move_bytes = 1;
          mode = QImode;
          gen_func.mov = gen_movqi;
        }
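
      /* As an example of how this chain plays out (illustrative, not
         from the original comments): a 23-byte move with 16-byte
         alignment on a target with efficient unaligned VSX is emitted
         as 16 (V4SImode) + 4 (SImode) + 2 (HImode) + 1 (QImode)
         register moves rather than a memcpy call.  */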
      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
        {
          rtx tmp_reg = gen_reg_rtx (mode);

          emit_insn ((*gen_func.mov) (tmp_reg, src));
          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
        }

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }

      if (mode == BLKmode)
        {
          /* Move the address into scratch registers.  The movmemsi
             patterns require zero offset.  */
          if (!REG_P (XEXP (src, 0)))
            {
              rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
              src = replace_equiv_address (src, src_reg);
            }
          set_mem_size (src, move_bytes);

          if (!REG_P (XEXP (dest, 0)))
            {
              rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
              dest = replace_equiv_address (dest, dest_reg);
            }
          set_mem_size (dest, move_bytes);

          emit_insn ((*gen_func.movmemsi) (dest, src,
                                           GEN_INT (move_bytes & 31),
                                           align_rtx));
        }
    }

  return 1;
}