/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */
int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC
          && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
        {
          clear_bytes = 16;
          mode = V4SImode;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          clear_bytes = 8;
          mode = DImode;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        { /* move 4 bytes */
          clear_bytes = 4;
          mode = SImode;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        { /* move 2 bytes */
          clear_bytes = 2;
          mode = HImode;
        }
      else /* move 1 byte at a time */
        {
          clear_bytes = 1;
          mode = QImode;
        }

      dest = adjust_address (orig_dest, mode, offset);
      emit_move_insn (dest, CONST0_RTX (mode));
    }
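
  /* To illustrate the descent through the modes above: with
     TARGET_ALTIVEC and 128-bit alignment, a 35 byte clear is emitted
     as two V4SImode stores, then one HImode store and one QImode
     store (16+16+2+1).  */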

  return 1;
}

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqidi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhidi2 (reg, src));
            break;
          }
        case E_SImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (SImode);
                emit_insn (gen_bswapsi2 (src, mem));
              }
            emit_insn (gen_zero_extendsidi2 (reg, src));
          }
          break;
        case E_DImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapdi2 (reg, mem));
          else
            emit_insn (gen_movdi (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;

    case E_SImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqisi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhisi2 (reg, src));
            break;
          }
        case E_SImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapsi2 (reg, mem));
          else
            emit_insn (gen_movsi (reg, mem));
          break;
        case E_DImode:
          /* DImode is larger than the destination reg so is not expected.  */
          gcc_unreachable ();
          break;
        default:
          gcc_unreachable ();
        }
      break;

    default:
      gcc_unreachable ();
      break;
    }
}

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
                           unsigned HOST_WIDE_INT bytes,
                           unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;
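
  /* For example, bytes == 3 at offset >= 1 with
     TARGET_EFFICIENT_OVERLAPPING_UNALIGNED selects SImode above: the
     read is backed up one byte so the 4-byte load ends exactly at the
     end of the block and no shift is needed.  */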

  /* final fallback is do one byte */
  return QImode;
}

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
                           unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
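
  /* E.g. with BASE_ALIGN 8 and OFFSET 6, offset & -offset isolates the
     lowest set bit of the offset (2), so only 2-byte alignment can be
     assumed for pointer+6.  */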
  return MIN (base_align, offset & -offset);
}

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
                               rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is null_rtx, then a new register of CMPMODE is generated and
   used for the comparison.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */
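
/* For example, do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
   final_label) compares cmp_rem against zero in a fresh CCmode
   register and branches to final_label when they are equal.  */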
static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
           rtx a, rtx b, rtx cr, rtx true_label)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
              || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx applied to CR.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
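
/* cmpb sets each byte of DEST to 0xff where the corresponding bytes of
   SRC1 and SRC2 are equal, and to 0x00 where they differ.  */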
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the rotate input.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
                      rtx dcond, rtx src1_addr, rtx src2_addr,
                      rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
                              gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
                              gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }
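
  /* At this point the bytes past the remaining length have been
     shifted out: e.g. with cmp_rem == 3 and an 8-byte load,
     shift_amount is (8 - 3) * 8 == 40 bits, leaving only the 3
     remaining bytes of each block in d1 and d2.  */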

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
                         HOST_WIDE_INT bytes_rem, rtx diff,
                         rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
                         rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
        emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
        {
          rtx reg_lms = gen_reg_rtx (word_mode);
          emit_move_insn (reg_lms, GEN_INT (load_mode_size));
          do_sub3 (adj_reg, cmp_rem, reg_lms);
        }

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }
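
  /* In the constant case, e.g. bytes_rem == 3 with an 8-byte load,
     addr_adj is 5: the load starts 5 bytes before the current position
     and ends exactly at the end of the block.  The leading 5 bytes were
     already compared equal, so only the final 3 bytes can produce a
     difference.  */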

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
        if (minalign < 8)
          max_bytes = 0;
        else
          max_bytes = 128;
      else
        if (minalign < 8)
          max_bytes = 32;
        else
          max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
        max_bytes = 0;
      else
        if (minalign < 8)
          max_bytes = 128;
        else
          max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
        max_bytes = 191;
      else
        max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
        /* Do not expect length longer than word_mode.  */
        return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
        {
          bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
          bytes_rtx = force_reg (word_mode,
                                 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
                                                bytes_rtx));
        }
      else
        /* Make sure it's in a register before we get started.  */
        bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
                         || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx j;

  /* Example of generated code for 35 bytes aligned 1 byte.

       mtctr 8
       li 6,0
       li 5,8
     .L13:
       ldbrx 7,3,6
       ldbrx 9,10,6
       ldbrx 0,3,5
       ldbrx 4,10,5
       addi 6,6,16
       addi 5,5,16
       subfc. 9,9,7
       bne 0,.L10
       subfc. 9,4,0
       bdnzt 2,.L13
       bne 0,.L10
       add 3,3,6
       add 10,10,6
       addi 9,3,-5
       ldbrx 7,0,9
       addi 9,10,-5
       ldbrx 9,0,9
       subfc 9,9,7
       .p2align 4,,15
     .L10:
       popcntd 9,9
       subfe 10,10,10
       or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
       call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
       remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
       load 8B, shift off bytes past length, compare
       load 8B ending at last byte and compare
       load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
        {
          no_remainder_code = true;
          niter = max_loop_iter;
          library_call_label = gen_label_rtx ();
        }
      else
        {
          niter = bytes / loop_bytes;
        }
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
         possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
                 NULL_RTX, library_call_label);

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
                 NULL_RTX, cleanup_label);

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
        emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
        emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
        {
          emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
        }
      else
        {
          emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
        }

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
        {
          /* P9 has fast isel so we use one compare and two isel.  */
          cr = gen_reg_rtx (CCmode);
          rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
                                             GEN_INT (max_bytes));
          emit_move_insn (cr, compare_rtx);
          rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
          do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
          do_isel (iter, cmp_rtx, iter, mi_reg, cr);
        }
      else
        {
          rtx lab_after = gen_label_rtx ();
          do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
                     NULL_RTX, lab_after);
          emit_move_insn (loop_cmp, mb_reg);
          emit_move_insn (iter, mi_reg);
          emit_label (lab_after);
        }

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
         < 16 in the runtime case either.  Compute number of loop
         iterations.  We compare 2*word_mode per iteration so 16B for
         64-bit code and 8B for 32-bit.  Set up two induction
         variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
         pseudo, cse will get rid of it and then the allocator will
         see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
                                     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
                                     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
                                     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
                                     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
        {
          /* Generate a compare, and convert with a setb later.  */
          rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
          emit_insn (gen_rtx_SET (dcond, cmp));
        }
      else
        {
          dcond = gen_reg_rtx (CCmode);
          if (word_mode == DImode)
            emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
          else
            emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
        }

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
                 dcond, diff_label);

      if (TARGET_P9_MISC)
        {
          /* Generate a compare, and convert with a setb later.  */
          rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
          emit_insn (gen_rtx_SET (dcond, cmp));
        }
      else
        {
          dcond = gen_reg_rtx (CCmode);
          if (word_mode == DImode)
            emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
          else
            emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
        }

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
        j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
                                           eqrtx, dcond));
      else
        j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
                                           eqrtx, dcond));
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
         have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0 so just
         return 0.  */
      if (TARGET_64BIT)
        emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
        emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
        {
          /* If we're dealing with runtime length, we have to check if
             it's zero after the loop.  When length is known at compile
             time the no-remainder condition is dealt with above.  By
             doing this after cleanup_label, we also deal with the
             case where length is 0 at the start and we bypass the
             loop with a branch to cleanup_label.  */
          emit_move_insn (target, const0_rtx);
          do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
                     NULL_RTX, final_label);
        }

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
        {
          /* If remainder length < word length, branch to final
             cleanup compare.  */
          if (!bytes_is_const)
            do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
                       NULL_RTX, final_cleanup);

          /* load and compare 8B */
          do_load_for_compare_from_addr (load_mode, d1_1,
                                         src1_addr, orig_src1);
          do_load_for_compare_from_addr (load_mode, d2_1,
                                         src2_addr, orig_src2);

          /* Compare the word, see if we need to do the last partial.  */
          if (TARGET_P9_MISC)
            {
              /* Generate a compare, and convert with a setb later.  */
              rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
              emit_insn (gen_rtx_SET (dcond, cmp));
            }
          else
            {
              dcond = gen_reg_rtx (CCmode);
              if (word_mode == DImode)
                emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
              else
                emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
            }

          do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
                     dcond, diff_label);

          do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
          do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
          emit_move_insn (cmp_rem_before, cmp_rem);
          do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
          if (bytes_is_const)
            bytes_remaining -= load_mode_size;
          else
            /* See if remaining length is now zero.  We previously set
               target to 0 so we can just jump to the end.  */
            do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
                       NULL_RTX, final_label);
        }

      /* Cases:
         bytes_is_const
           We can always shift back to do an overlapping compare
           of the last chunk because we know length >= 8.

         !bytes_is_const
           align>=load_mode_size
             Read word_mode and mask
           align<load_mode_size
             avoid stepping past end

         Three strategies:
         * decrement address and do overlapping compare
         * read word_mode and mask
         * carefully avoid crossing 4k boundary  */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
          && align1 >= load_mode_size && align2 >= load_mode_size)
        {
          /* Alignment is larger than word_mode so we do not need to be
             concerned with extra page crossings.  But, we do not know
             that the length is larger than load_mode_size so we might
             end up comparing against data before the block if we try
             an overlapping compare.  Also we use this on P7 for fixed length
             remainder because P7 doesn't like overlapping unaligned.
             Strategy: load 8B, shift off bytes past length, and compare.  */
          emit_label (final_cleanup);
          do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
                                src1_addr, src2_addr, orig_src1, orig_src2);
        }
      else if (bytes_remaining && bytes_is_const)
        {
          /* We do not do loop expand if length < 32 so we know at the
             end we can do an overlapping compare.
             Strategy: shift address back and do word_mode load that
             ends at the end of the block.  */
          emit_label (final_cleanup);
          do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
                                   cmp_rem, dcond, src1_addr, src2_addr,
                                   orig_src1, orig_src2);
        }
      else if (!bytes_is_const)
        {
          rtx handle4k_label = gen_label_rtx ();
          rtx nonconst_overlap = gen_label_rtx ();
          emit_label (nonconst_overlap);

          /* Here we have to handle the case where we have runtime
             length which may be too short for overlap compare, and
             alignment is not at least load_mode_size so we have to
             tread carefully to avoid stepping across 4k boundaries.  */

          /* If the length after the loop was larger than word_mode
             size, we can just do an overlapping compare and we're
             done.  We fall through to this code from the word_mode
             compare that precedes this.  */
          do_overlap_load_compare (load_mode, false, 0, diff,
                                   cmp_rem, dcond, src1_addr, src2_addr,
                                   orig_src1, orig_src2);

          rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
          j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
          JUMP_LABEL (j) = diff_label;
          LABEL_NUSES (diff_label) += 1;
          emit_barrier ();

          /* If we couldn't do the overlap compare we have to be more
             careful of the 4k boundary.  Test to see if either
             address is less than word_mode_size away from a 4k
             boundary.  If not, then we can do a load/shift/compare
             and we are done.  We come to this code if length was less
             than word_mode_size.  */

          emit_label (final_cleanup);

          /* We can still avoid the slow case if the length was larger
             than one loop iteration, in which case go do the overlap
             load compare path.  */
          do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
                     NULL_RTX, nonconst_overlap);

          rtx rem4k = gen_reg_rtx (word_mode);
          rtx dist1 = gen_reg_rtx (word_mode);
          rtx dist2 = gen_reg_rtx (word_mode);
          do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
          if (word_mode == SImode)
            emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
          else
            emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
          do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
          if (word_mode == SImode)
            emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
          else
            emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
          do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);

          /* We don't have a 4k boundary to deal with, so do
             a load/shift/compare and jump to diff.  */

          do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
                                src1_addr, src2_addr, orig_src1, orig_src2);

          j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
          JUMP_LABEL (j) = diff_label;
          LABEL_NUSES (diff_label) += 1;
          emit_barrier ();

          /* Finally in the unlikely case we are inching up to a
             4k boundary we use a compact lbzx/compare loop to do
             it a byte at a time.  */

          emit_label (handle4k_label);

          rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
          emit_move_insn (ctr, cmp_rem);
          rtx ixreg = gen_reg_rtx (Pmode);
          emit_move_insn (ixreg, const0_rtx);

          rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
          rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
          rtx d1 = gen_reg_rtx (word_mode);
          rtx d2 = gen_reg_rtx (word_mode);

          rtx fc_loop = gen_label_rtx ();
          emit_label (fc_loop);

          do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
          do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

          do_add3 (ixreg, ixreg, const1_rtx);

          rtx cond = gen_reg_rtx (CCmode);
          rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
          rs6000_emit_dot_insn (diff, subexpr, 2, cond);

          rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
          if (TARGET_64BIT)
            j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
                                               eqrtx, cond));
          else
            j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
                                               eqrtx, cond));
          JUMP_LABEL (j) = fc_loop;
          LABEL_NUSES (fc_loop) += 1;

          if (TARGET_64BIT)
            emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
          else
            emit_move_insn (target, diff);

          /* Since we are comparing bytes, the difference can be used
             as the final result and we are done here.  */
          j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
          JUMP_LABEL (j) = final_label;
          LABEL_NUSES (final_label) += 1;
          emit_barrier ();
        }
    }

  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
        {
          rtx tmp_reg_ca = gen_reg_rtx (DImode);
          emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
          emit_insn (gen_popcntddi2 (diff, diff));
          emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
          emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
        }
      else
        {
          rtx tmp_reg_ca = gen_reg_rtx (SImode);
          emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
          emit_insn (gen_popcntdsi2 (diff, diff));
          emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
        }
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the number of bytes not
         yet compared, which is what memcmp needs as its length.  If we
         don't find a difference in the loop compare, do the library
         call directly instead of doing a small compare just to get to
         an arbitrary boundary before calling it anyway.
         Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
        {
          emit_move_insn (len_rtx, cmp_rem);
          do_add3 (src1_addr, src1_addr, iv1);
          do_add3 (src2_addr, src2_addr, iv1);
        }
      else
        emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                               target, LCT_NORMAL, GET_MODE (target),
                               src1_addr, Pmode,
                               src2_addr, Pmode,
                               len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
          || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
       ldbrx 10,31,8
       ldbrx 9,7,8
       subfc. 9,9,10
       bne 0,.L6487
       addi 9,12,8
       addi 5,11,8
       ldbrx 10,0,9
       ldbrx 9,0,5
       subfc. 9,9,10
       bne 0,.L6487
       addi 9,12,16
       lhbrx 10,0,9
       addi 9,11,16
       lhbrx 9,0,9
       subf 9,9,10
       b .L6488
       .p2align 4,,15
     .L6487: #convert_label
       popcntd 9,9
       subfe 10,10,10
       or 9,9,10
     .L6488: #final_label
       extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes,
                                             align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
        {
          /* Target is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed 64->32 conversion but
             the current block does not, so a label is needed to jump
             to the end.  */
          if (generate_6432_conversion && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the result
                 of this subtract is not zero.  */
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (target,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (target,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if target size == load_mode size because
             the subtract generates one extra bit.  */
          generate_6432_conversion = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                /* Generate a subfc. and use the longer
                   sequence for conversion.  */
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
                else
                  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last block
                 the convert code will be generated immediately following.  */
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
                else
                  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
        emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
         while maintaining <0 / ==0 / >0 properties.  This sequence works:
         subfc L,A,B
         subfe H,H,H
         popcntd L,L
         rldimi L,H,6,0

         This is an alternate one Segher cooked up if somebody
         wants to expand this for something that doesn't have popcntd:
         subfc L,a,b
         subfe H,x,x
         addic t,L,-1
         subfe v,t,L
         or z,v,H

         And finally, p9 can just do this:
         cmpld A,B
         setb r */

      if (TARGET_P9_MISC)
        {
          emit_insn (gen_setb_unsigned (target, cond));
        }
      else
        {
          if (TARGET_64BIT)
            {
              rtx tmp_reg_ca = gen_reg_rtx (DImode);
              emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
              emit_insn (gen_movsi (target,
                                    gen_lowpart (SImode, tmp_reg_src2)));
            }
          else
            {
              rtx tmp_reg_ca = gen_reg_rtx (SImode);
              emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
            }
        }
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}

/* Generate page crossing check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch if there is a page crossing.
   SRC_ADDR is the string address to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src_addr,
                            HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
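
  /* E.g. with BYTES == 16, this branches to STRNCMP_LABEL whenever
     (SRC_ADDR & 0xfff) >= 4096 - 16, i.e. whenever a 16-byte read
     starting at SRC_ADDR might cross into the next 4k page.  */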
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
                                         GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     lab_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cmpb cmpb_result1, src1, src2
   cmpb cmpb_result2, src1, zero
   orc cmpb_result1, cmpb_result1, cmpb_result2
   cntlzd get bit of first zero/diff byte
   addi convert for rldcl use
   rldcl rldcl extract diff/zero byte
   subf subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.  */
static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode m = GET_MODE (str1);
  rtx cmpb_diff = gen_reg_rtx (m);
  rtx cmpb_zero = gen_reg_rtx (m);
  rtx rot_amt = gen_reg_rtx (m);
  rtx zero_reg = gen_reg_rtx (m);

  rtx rot1_1 = gen_reg_rtx (m);
  rtx rot1_2 = gen_reg_rtx (m);
  rtx rot2_1 = gen_reg_rtx (m);
  rtx rot2_2 = gen_reg_rtx (m);

  if (m == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, str1, str2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, str1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
    }
  else if (m == DImode)
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, str1, str2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, str1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
    }
  else
    gcc_unreachable ();

  return;
}

1769 /* Expand a string compare operation with length, and return
1770 true if successful. Return false if we should let the
1771 compiler generate normal code, probably a strncmp call.
1773 OPERANDS[0] is the target (result).
1774 OPERANDS[1] is the first source.
1775 OPERANDS[2] is the second source.
1776 If NO_LENGTH is zero, then:
1777 OPERANDS[3] is the length.
1778 OPERANDS[4] is the alignment in bytes.
1779 If NO_LENGTH is nonzero, then:
1780 OPERANDS[3] is the alignment in bytes. */
1781 bool
1782 expand_strn_compare (rtx operands[], int no_length)
1784 rtx target = operands[0];
1785 rtx orig_src1 = operands[1];
1786 rtx orig_src2 = operands[2];
1787 rtx bytes_rtx, align_rtx;
1788 if (no_length)
1790 bytes_rtx = NULL;
1791 align_rtx = operands[3];
1793 else
1795 bytes_rtx = operands[3];
1796 align_rtx = operands[4];
1798 unsigned HOST_WIDE_INT cmp_bytes = 0;
1799 rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
1800 rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
1802 /* If we have a length, it must be constant. This simplifies things
1803 a bit as we need not generate code to check whether we have
1804 exceeded the length; variable lengths could be handled later. */
1805 if (!no_length && !CONST_INT_P (bytes_rtx))
1806 return false;
1808 /* This must be a fixed size alignment. */
1809 if (!CONST_INT_P (align_rtx))
1810 return false;
1812 unsigned int base_align = UINTVAL (align_rtx);
1813 unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
1814 unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
1816 /* Punt if targetm.slow_unaligned_access says unaligned word_mode accesses are slow. */
1817 if (targetm.slow_unaligned_access (word_mode, align1)
1818 || targetm.slow_unaligned_access (word_mode, align2))
1819 return false;
1821 gcc_assert (GET_MODE (target) == SImode);
1823 /* If we have an LE target without ldbrx and word_mode is DImode,
1824 then we must avoid using word_mode. */
1825 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
1826 && word_mode == DImode);
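/* The subtract-based compare below is lexicographic only if the first
   string byte ends up in the most-significant byte of each loaded
   word; e.g. "b" must compare greater than "a\377".  A plain
   little-endian load puts the first byte at the least-significant
   end, so without ldbrx every word-sized load would need extra
   byte-reversal code.  */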
1828 unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
1830 unsigned HOST_WIDE_INT offset = 0;
1831 unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
1832 unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
1833 if (no_length)
1834 /* Use this as a stand-in to determine the mode to use. */
1835 bytes = rs6000_string_compare_inline_limit * word_mode_size;
1836 else
1837 bytes = UINTVAL (bytes_rtx);
1839 machine_mode load_mode =
1840 select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
1841 unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
1842 compare_length = rs6000_string_compare_inline_limit * load_mode_size;
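/* E.g. with an inline limit of 8 and 8-byte loads, up to 64 bytes
   are compared inline before falling back to a library call
   (illustrative numbers; the limit is a tunable parameter).  */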
1844 /* If we have equality at the end of the last compare and we have not
1845 found the end of the string, we need to call strcmp/strncmp to
1846 compare the remainder. */
1847 bool equality_compare_rest = false;
1849 if (no_length)
1851 bytes = compare_length;
1852 equality_compare_rest = true;
1854 else
1856 if (bytes <= compare_length)
1857 compare_length = bytes;
1858 else
1859 equality_compare_rest = true;
1862 rtx result_reg = gen_reg_rtx (word_mode);
1863 rtx final_move_label = gen_label_rtx ();
1864 rtx final_label = gen_label_rtx ();
1865 rtx begin_compare_label = NULL;
1866 unsigned int required_align = 8;
1868 if (base_align < required_align)
1870 /* Generate code that checks the distance to a 4k boundary for this case. */
1871 begin_compare_label = gen_label_rtx ();
1872 rtx strncmp_label = gen_label_rtx ();
1873 rtx jmp;
1875 /* Strncmp for power8 in glibc does this:
1876 rldicl r8,r3,0,52
1877 cmpldi cr7,r8,4096-16
1878 bgt cr7,L(pagecross) */
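/* In C-like terms the test emitted by expand_strncmp_align_check is
   approximately (a sketch assuming 4k pages):

     if (((uintptr_t) addr & 0xfff) > 4096 - align_test)
       goto strncmp_label;

   i.e. take the library-call path when the inline load sequence
   might cross into an unmapped page.  */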
1880 /* Make sure that the length we use for the alignment test and
1881 the subsequent code generation are in agreement so we do not
1882 go past the length we tested for a 4k boundary crossing. */
1883 unsigned HOST_WIDE_INT align_test = compare_length;
1884 if (align_test < 8)
1886 align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
1887 base_align = align_test;
1889 else
1891 align_test = ROUND_UP (align_test, required_align);
1892 base_align = required_align;
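/* Illustrative values: compare_length 3 gives align_test 4 and
   base_align 4 through the ceil_log2 branch, while compare_length 20
   gives align_test 24 and base_align 8 through the ROUND_UP branch.  */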
1895 if (align1 < required_align)
1896 expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
1897 if (align2 < required_align)
1898 expand_strncmp_align_check (strncmp_label, src2_addr, align_test);
1900 /* Now generate the following sequence:
1901 - branch to begin_compare
1902 - strncmp_label
1903 - call to strncmp
1904 - branch to final_label
1905 - begin_compare_label */
1907 rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
1908 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
1909 JUMP_LABEL (jmp) = begin_compare_label;
1910 LABEL_NUSES (begin_compare_label) += 1;
1911 emit_barrier ();
1913 emit_label (strncmp_label);
1915 if (no_length)
1917 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
1918 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1919 target, LCT_NORMAL, GET_MODE (target),
1920 force_reg (Pmode, src1_addr), Pmode,
1921 force_reg (Pmode, src2_addr), Pmode);
1923 else
1925 /* -m32 -mpowerpc64 results in word_mode being DImode even
1926 though otherwise it is 32-bit. The length arg to strncmp
1927 is a size_t which will be the same size as pointers. */
1928 rtx len_rtx = gen_reg_rtx (Pmode);
1929 emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
1931 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
1932 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1933 target, LCT_NORMAL, GET_MODE (target),
1934 force_reg (Pmode, src1_addr), Pmode,
1935 force_reg (Pmode, src2_addr), Pmode,
1936 len_rtx, Pmode);
1939 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1940 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
1941 JUMP_LABEL (jmp) = final_label;
1942 LABEL_NUSES (final_label) += 1;
1943 emit_barrier ();
1944 emit_label (begin_compare_label);
1947 rtx cleanup_label = NULL;
1948 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1949 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1951 /* Generate a sequence of GPR or VEC/VSX instructions to compare out
1952 to the length specified. */
1953 unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
1954 while (bytes_to_compare > 0)
1956 /* GPR compare sequence:
1957 check each 8B with: ld/ld cmpd bne
1958 If equal, use rldicr/cmpb to check for zero byte.
1959 cleanup code at end:
1960 cmpb get byte that differs
1961 cmpb look for zero byte
1962 orc combine
1963 cntlzd get bit of first zero/diff byte
1964 addi convert for rldcl use
1965 rldcl rldcl extract diff/zero byte
1966 subf subtract for final result
1968 The last compare can branch around the cleanup code if the
1969 result is zero because the strings are exactly equal. */
1971 unsigned int align = compute_current_alignment (base_align, offset);
1972 load_mode = select_block_compare_mode (offset, bytes_to_compare,
1973 align, word_mode_ok);
1974 load_mode_size = GET_MODE_SIZE (load_mode);
1975 if (bytes_to_compare >= load_mode_size)
1976 cmp_bytes = load_mode_size;
1977 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1979 /* Move this load back so it doesn't go past the end.
1980 P8/P9 can do this efficiently. */
1981 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
1982 cmp_bytes = bytes_to_compare;
1983 if (extra_bytes < offset)
1985 offset -= extra_bytes;
1986 cmp_bytes = load_mode_size;
1987 bytes_to_compare = cmp_bytes;
1990 else
1991 /* P7 and earlier can't do the overlapping load trick fast,
1992 so this forces a non-overlapping load and a shift to get
1993 rid of the extra bytes. */
1994 cmp_bytes = bytes_to_compare;
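/* Worked example (illustrative): with 8-byte loads, offset 16 and
   3 bytes left, extra_bytes is 5, so the load is moved back to
   offset 11; the first 5 bytes it rereads already compared equal,
   and the load now ends exactly at the requested length.  */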
1996 rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
1997 do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
1998 rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
1999 do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
2001 /* We must always left-align the data we read, and
2002 clear any bytes to the right that are beyond the string.
2003 Otherwise the cmpb sequence won't produce the correct
2004 results. The beginning of the compare is done in full
2005 word_mode, so it will not need any extra shifts or
2006 right-clears. */
2008 if (load_mode_size < word_mode_size)
2010 /* Rotate left first. */
2011 rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
2012 do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
2013 do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
2016 if (cmp_bytes < word_mode_size)
2018 /* Now clear right. This plus the rotate can be
2019 turned into a rldicr instruction. */
2020 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2021 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2022 do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
2023 do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
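/* Illustration: a 4-byte load into a 64-bit word is rotated left by
   32 so the data sits in the high bytes; if only cmp_bytes == 3 of
   them are valid, the mask 0xffffff0000000000 clears the rest so
   stale low bytes cannot affect the cmpb results.  */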
2026 /* Cases to handle. A and B are chunks of the two strings.
2027 1: Not end of comparison:
2028 A != B: branch to cleanup code to compute result.
2029 A == B: check for 0 byte, next block if not found.
2030 2: End of the inline comparison:
2031 A != B: branch to cleanup code to compute result.
2032 A == B: check for 0 byte, call strcmp/strncmp
2033 3: Compared the requested N bytes:
2034 A == B: branch to result 0.
2035 A != B: cleanup code to compute result. */
2037 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
2039 rtx dst_label;
2040 if (remain > 0 || equality_compare_rest)
2042 /* Branch to the cleanup code; otherwise fall through to
2043 do more compares. */
2044 if (!cleanup_label)
2045 cleanup_label = gen_label_rtx ();
2046 dst_label = cleanup_label;
2048 else
2049 /* Branch to end and produce result of 0. */
2050 dst_label = final_move_label;
2052 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2053 rtx cond = gen_reg_rtx (CCmode);
2055 /* Always produce the 0 result; it is needed if
2056 cmpb finds a 0 byte in this chunk. */
2057 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
2058 rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
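/* The record ("dot") form emitted here computes the subtraction and
   sets the CR field in a single instruction, so COND can feed the
   conditional branch below without a separate compare.  */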
2060 rtx cmp_rtx;
2061 if (remain == 0 && !equality_compare_rest)
2062 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
2063 else
2064 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2066 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2067 lab_ref, pc_rtx);
2068 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2069 JUMP_LABEL (j) = dst_label;
2070 LABEL_NUSES (dst_label) += 1;
2072 if (remain > 0 || equality_compare_rest)
2074 /* Generate a cmpb to test for a 0 byte and branch
2075 to final result if found. */
2076 rtx cmpb_zero = gen_reg_rtx (word_mode);
2077 rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2078 rtx condz = gen_reg_rtx (CCmode);
2079 rtx zero_reg = gen_reg_rtx (word_mode);
2080 emit_move_insn (zero_reg, GEN_INT (0));
2081 do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
2083 if (cmp_bytes < word_mode_size)
2085 /* Ignore apparent zero bytes past the end of the valid data. */
2086 HOST_WIDE_INT mb =
2087 BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2088 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2089 do_and3 (cmpb_zero, cmpb_zero, mask);
2092 emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
2093 rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
2094 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
2095 lab_ref_fin, pc_rtx);
2096 rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2097 JUMP_LABEL (j2) = final_move_label;
2098 LABEL_NUSES (final_move_label) += 1;
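/* Example: if this chunk of string 1 holds 'a', 'b', 0, ... then the
   cmpb against zero_reg puts 0xff in the third byte position, the
   masked value is nonzero, and the branch is taken with the 0 result
   already in result_reg.  */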
2102 offset += cmp_bytes;
2103 bytes_to_compare -= cmp_bytes;
2106 if (equality_compare_rest)
2108 /* Update pointers past what has been compared already. */
2109 rtx src1 = force_reg (Pmode,
2110 gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
2111 rtx src2 = force_reg (Pmode,
2112 gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));
2114 /* Construct call to strcmp/strncmp to compare the rest of the string. */
2115 if (no_length)
2117 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2118 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2119 target, LCT_NORMAL, GET_MODE (target),
2120 src1, Pmode, src2, Pmode);
2122 else
2124 rtx len_rtx = gen_reg_rtx (Pmode);
2125 emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
2126 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2127 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2128 target, LCT_NORMAL, GET_MODE (target),
2129 src1, Pmode, src2, Pmode, len_rtx, Pmode);
2132 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2133 rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2134 JUMP_LABEL (jmp) = final_label;
2135 LABEL_NUSES (final_label) += 1;
2136 emit_barrier ();
2139 if (cleanup_label)
2140 emit_label (cleanup_label);
2142 emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
2144 emit_label (final_move_label);
2145 emit_insn (gen_movsi (target,
2146 gen_lowpart (SImode, result_reg)));
2147 emit_label (final_label);
2148 return true;
2151 /* Expand a block move operation, and return 1 if successful. Return 0
2152 if we should let the compiler generate normal code.
2154 operands[0] is the destination
2155 operands[1] is the source
2156 operands[2] is the length
2157 operands[3] is the alignment */
2159 #define MAX_MOVE_REG 4
2161 int
2162 expand_block_move (rtx operands[])
2164 rtx orig_dest = operands[0];
2165 rtx orig_src = operands[1];
2166 rtx bytes_rtx = operands[2];
2167 rtx align_rtx = operands[3];
2168 int constp = (GET_CODE (bytes_rtx) == CONST_INT);
2169 int align;
2170 int bytes;
2171 int offset;
2172 int move_bytes;
2173 rtx stores[MAX_MOVE_REG];
2174 int num_reg = 0;
2176 /* If this is not a fixed size move, just call memcpy */
2177 if (! constp)
2178 return 0;
2180 /* This must be a fixed size alignment */
2181 gcc_assert (GET_CODE (align_rtx) == CONST_INT);
2182 align = INTVAL (align_rtx) * BITS_PER_UNIT;
2184 /* Anything to move? */
2185 bytes = INTVAL (bytes_rtx);
2186 if (bytes <= 0)
2187 return 1;
2189 if (bytes > rs6000_block_move_inline_limit)
2190 return 0;
2192 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2194 union {
2195 rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2196 rtx (*mov) (rtx, rtx);
2197 } gen_func;
2198 machine_mode mode = BLKmode;
2199 rtx src, dest;
2201 /* Altivec first, since it will be faster than a string move
2202 when it applies, and usually not significantly larger. */
2203 if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
2205 move_bytes = 16;
2206 mode = V4SImode;
2207 gen_func.mov = gen_movv4si;
2209 else if (bytes >= 8 && TARGET_POWERPC64
2210 && (align >= 64 || !STRICT_ALIGNMENT))
2212 move_bytes = 8;
2213 mode = DImode;
2214 gen_func.mov = gen_movdi;
2215 if (offset == 0 && align < 64)
2217 rtx addr;
2219 /* If the address form is reg+offset with offset not a
2220 multiple of four, reload into reg indirect form here
2221 rather than waiting for reload. This way we get one
2222 reload, not one per load and/or store. */
2223 addr = XEXP (orig_dest, 0);
2224 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2225 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2226 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2228 addr = copy_addr_to_reg (addr);
2229 orig_dest = replace_equiv_address (orig_dest, addr);
2231 addr = XEXP (orig_src, 0);
2232 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2233 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2234 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2236 addr = copy_addr_to_reg (addr);
2237 orig_src = replace_equiv_address (orig_src, addr);
2241 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2242 { /* move 4 bytes */
2243 move_bytes = 4;
2244 mode = SImode;
2245 gen_func.mov = gen_movsi;
2247 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2248 { /* move 2 bytes */
2249 move_bytes = 2;
2250 mode = HImode;
2251 gen_func.mov = gen_movhi;
2253 else /* move 1 byte at a time */
2255 move_bytes = 1;
2256 mode = QImode;
2257 gen_func.mov = gen_movqi;
2260 src = adjust_address (orig_src, mode, offset);
2261 dest = adjust_address (orig_dest, mode, offset);
2263 if (mode != BLKmode)
2265 rtx tmp_reg = gen_reg_rtx (mode);
2267 emit_insn ((*gen_func.mov) (tmp_reg, src));
2268 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
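/* Loads are emitted immediately, but the matching stores are queued
   in STORES and flushed in groups of up to MAX_MOVE_REG, giving the
   scheduler several independent loads whose latency can be hidden.  */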
2271 if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2273 int i;
2274 for (i = 0; i < num_reg; i++)
2275 emit_insn (stores[i]);
2276 num_reg = 0;
2279 if (mode == BLKmode)
2281 /* Move the address into scratch registers. The movmemsi
2282 patterns require zero offset. */
2283 if (!REG_P (XEXP (src, 0)))
2285 rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
2286 src = replace_equiv_address (src, src_reg);
2288 set_mem_size (src, move_bytes);
2290 if (!REG_P (XEXP (dest, 0)))
2292 rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
2293 dest = replace_equiv_address (dest, dest_reg);
2295 set_mem_size (dest, move_bytes);
2297 emit_insn ((*gen_func.movmemsi) (dest, src,
2298 GEN_INT (move_bytes & 31),
2299 align_rtx));
2303 return 1;