1 /* Subroutines used to expand string and block move, clear,
2 compare and other operations for PowerPC.
3 Copyright (C) 1991-2018 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
25 #include "coretypes.h"
32 #include "print-tree.h"
39 /* Expand a block clear operation, and return 1 if successful. Return 0
40 if we should let the compiler generate normal code.
42 operands[0] is the destination
43 operands[1] is the length
44 operands[3] is the alignment */
47 expand_block_clear (rtx operands
[])
49 rtx orig_dest
= operands
[0];
50 rtx bytes_rtx
= operands
[1];
51 rtx align_rtx
= operands
[3];
52 bool constp
= (GET_CODE (bytes_rtx
) == CONST_INT
);
59 /* If this is not a fixed size move, just call memcpy */
63 /* This must be a fixed size alignment */
64 gcc_assert (GET_CODE (align_rtx
) == CONST_INT
);
65 align
= INTVAL (align_rtx
) * BITS_PER_UNIT
;
67 /* Anything to clear? */
68 bytes
= INTVAL (bytes_rtx
);
72 /* Use the builtin memset after a point, to avoid huge code bloat.
73 When optimize_size, avoid any significant code bloat; calling
74 memset is about 4 instructions, so allow for one instruction to
75 load zero and three to do clearing. */
76 if (TARGET_ALTIVEC
&& (align
>= 128 || TARGET_EFFICIENT_UNALIGNED_VSX
))
78 else if (TARGET_POWERPC64
&& (align
>= 64 || !STRICT_ALIGNMENT
))
83 if (optimize_size
&& bytes
> 3 * clear_step
)
85 if (! optimize_size
&& bytes
> 8 * clear_step
)
88 for (offset
= 0; bytes
> 0; offset
+= clear_bytes
, bytes
-= clear_bytes
)
90 machine_mode mode
= BLKmode
;
94 && ((bytes
>= 16 && align
>= 128)
95 || (bytes
>= 32 && TARGET_EFFICIENT_UNALIGNED_VSX
)))
100 else if (bytes
>= 8 && TARGET_POWERPC64
101 && (align
>= 64 || !STRICT_ALIGNMENT
))
105 if (offset
== 0 && align
< 64)
109 /* If the address form is reg+offset with offset not a
110 multiple of four, reload into reg indirect form here
111 rather than waiting for reload. This way we get one
112 reload, not one per store. */
113 addr
= XEXP (orig_dest
, 0);
114 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
115 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
116 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
118 addr
= copy_addr_to_reg (addr
);
119 orig_dest
= replace_equiv_address (orig_dest
, addr
);
123 else if (bytes
>= 4 && (align
>= 32 || !STRICT_ALIGNMENT
))
128 else if (bytes
>= 2 && (align
>= 16 || !STRICT_ALIGNMENT
))
133 else /* move 1 byte at a time */
139 dest
= adjust_address (orig_dest
, mode
, offset
);
141 emit_move_insn (dest
, CONST0_RTX (mode
));
147 /* Figure out the correct instructions to generate to load data for
148 block compare. MODE is used for the read from memory, and
149 data is zero extended if REG is wider than MODE. If LE code
150 is being generated, bswap loads are used.
152 REG is the destination register to move the data into.
153 MEM is the memory block being read.
154 MODE is the mode of memory to use for the read. */
156 do_load_for_compare (rtx reg
, rtx mem
, machine_mode mode
)
158 switch (GET_MODE (reg
))
164 if (!BYTES_BIG_ENDIAN
)
166 if (TARGET_P9_VECTOR
)
167 emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg
, mem
));
170 rtx reg_v2di
= simplify_gen_subreg (V2DImode
, reg
,
172 gcc_assert (MEM_P (mem
));
173 rtx addr
= XEXP (mem
, 0);
174 rtx mem_v2di
= gen_rtx_MEM (V2DImode
, addr
);
175 MEM_COPY_ATTRIBUTES (mem_v2di
, mem
);
176 set_mem_size (mem
, GET_MODE_SIZE (V2DImode
));
177 emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di
, mem_v2di
));
181 emit_insn (gen_vsx_movv2di_64bit (reg
, mem
));
191 emit_insn (gen_zero_extendqidi2 (reg
, mem
));
196 if (!BYTES_BIG_ENDIAN
)
198 src
= gen_reg_rtx (HImode
);
199 emit_insn (gen_bswaphi2 (src
, mem
));
201 emit_insn (gen_zero_extendhidi2 (reg
, src
));
207 if (!BYTES_BIG_ENDIAN
)
209 src
= gen_reg_rtx (SImode
);
210 emit_insn (gen_bswapsi2 (src
, mem
));
212 emit_insn (gen_zero_extendsidi2 (reg
, src
));
216 if (!BYTES_BIG_ENDIAN
)
217 emit_insn (gen_bswapdi2 (reg
, mem
));
219 emit_insn (gen_movdi (reg
, mem
));
230 emit_insn (gen_zero_extendqisi2 (reg
, mem
));
235 if (!BYTES_BIG_ENDIAN
)
237 src
= gen_reg_rtx (HImode
);
238 emit_insn (gen_bswaphi2 (src
, mem
));
240 emit_insn (gen_zero_extendhisi2 (reg
, src
));
244 if (!BYTES_BIG_ENDIAN
)
245 emit_insn (gen_bswapsi2 (reg
, mem
));
247 emit_insn (gen_movsi (reg
, mem
));
250 /* DImode is larger than the destination reg so is not expected. */
259 gcc_assert (mode
== E_QImode
);
260 emit_move_insn (reg
, mem
);
269 /* Select the mode to be used for reading the next chunk of bytes
272 OFFSET is the current read offset from the beginning of the block.
273 BYTES is the number of bytes remaining to be read.
274 ALIGN is the minimum alignment of the memory blocks being compared in bytes. */
276 select_block_compare_mode (unsigned HOST_WIDE_INT offset
,
277 unsigned HOST_WIDE_INT bytes
,
278 unsigned HOST_WIDE_INT align
)
280 /* First see if we can do a whole load unit
281 as that will be more efficient than a larger load + shift. */
283 /* If big, use biggest chunk.
284 If exactly chunk size, use that size.
285 If remainder can be done in one piece with shifting, do that.
286 Do largest chunk possible without violating alignment rules. */
288 /* The most we can read without potential page crossing. */
289 unsigned HOST_WIDE_INT maxread
= ROUND_UP (bytes
, align
);
291 /* If we have an LE target without ldbrx and word_mode is DImode,
292 then we must avoid using word_mode. */
293 int word_mode_ok
= !(!BYTES_BIG_ENDIAN
&& !TARGET_LDBRX
294 && word_mode
== DImode
);
296 if (word_mode_ok
&& bytes
>= UNITS_PER_WORD
)
298 else if (bytes
== GET_MODE_SIZE (SImode
))
300 else if (bytes
== GET_MODE_SIZE (HImode
))
302 else if (bytes
== GET_MODE_SIZE (QImode
))
304 else if (bytes
< GET_MODE_SIZE (SImode
)
305 && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
306 && offset
>= GET_MODE_SIZE (SImode
) - bytes
)
307 /* This matches the case were we have SImode and 3 bytes
308 and offset >= 1 and permits us to move back one and overlap
309 with the previous read, thus avoiding having to shift
310 unwanted bytes off of the input. */
312 else if (word_mode_ok
&& bytes
< UNITS_PER_WORD
313 && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
314 && offset
>= UNITS_PER_WORD
-bytes
)
315 /* Similarly, if we can use DImode it will get matched here and
316 can do an overlapping read that ends at the end of the block. */
318 else if (word_mode_ok
&& maxread
>= UNITS_PER_WORD
)
319 /* It is safe to do all remaining in one load of largest size,
320 possibly with a shift to get rid of unwanted bytes. */
322 else if (maxread
>= GET_MODE_SIZE (SImode
))
323 /* It is safe to do all remaining in one SImode load,
324 possibly with a shift to get rid of unwanted bytes. */
326 else if (bytes
> GET_MODE_SIZE (SImode
))
328 else if (bytes
> GET_MODE_SIZE (HImode
))
331 /* final fallback is do one byte */
335 /* Compute the alignment of pointer+OFFSET where the original alignment
336 of pointer was BASE_ALIGN. */
337 static unsigned HOST_WIDE_INT
338 compute_current_alignment (unsigned HOST_WIDE_INT base_align
,
339 unsigned HOST_WIDE_INT offset
)
343 return MIN (base_align
, offset
& -offset
);
346 /* Prepare address and then do a load.
348 MODE is the mode to use for the load.
349 DEST is the destination register for the data.
350 ADDR is the address to be loaded.
351 ORIG_ADDR is the original address expression. */
353 do_load_for_compare_from_addr (machine_mode mode
, rtx dest
, rtx addr
,
356 rtx mem
= gen_rtx_MEM (mode
, addr
);
357 MEM_COPY_ATTRIBUTES (mem
, orig_addr
);
358 set_mem_size (mem
, GET_MODE_SIZE (mode
));
359 do_load_for_compare (dest
, mem
, mode
);
363 /* Do a branch for an if/else decision.
365 CMPMODE is the mode to use for the comparison.
366 COMPARISON is the rtx code for the compare needed.
367 A is the first thing to be compared.
368 B is the second thing to be compared.
369 CR is the condition code reg input, or NULL_RTX.
370 TRUE_LABEL is the label to branch to if the condition is true.
372 The return value is the CR used for the comparison.
373 If CR is null_rtx, then a new register of CMPMODE is generated.
374 If A and B are both null_rtx, then CR must not be null, and the
375 compare is not generated so you can use this with a dot form insn. */
378 do_ifelse (machine_mode cmpmode
, rtx_code comparison
,
379 rtx a
, rtx b
, rtx cr
, rtx true_label
)
381 gcc_assert ((a
== NULL_RTX
&& b
== NULL_RTX
&& cr
!= NULL_RTX
)
382 || (a
!= NULL_RTX
&& b
!= NULL_RTX
));
385 gcc_assert (GET_MODE (cr
) == cmpmode
);
387 cr
= gen_reg_rtx (cmpmode
);
389 rtx label_ref
= gen_rtx_LABEL_REF (VOIDmode
, true_label
);
392 emit_move_insn (cr
, gen_rtx_COMPARE (cmpmode
, a
, b
));
394 rtx cmp_rtx
= gen_rtx_fmt_ee (comparison
, VOIDmode
, cr
, const0_rtx
);
396 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
, label_ref
, pc_rtx
);
397 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
398 JUMP_LABEL (j
) = true_label
;
399 LABEL_NUSES (true_label
) += 1;
402 /* Emit an isel of the proper mode for DEST.
404 DEST is the isel destination register.
405 SRC1 is the isel source if CR is true.
406 SRC2 is the isel source if CR is false.
407 CR is the condition for the isel. */
409 do_isel (rtx dest
, rtx cmp
, rtx src_t
, rtx src_f
, rtx cr
)
411 if (GET_MODE (dest
) == DImode
)
412 emit_insn (gen_isel_signed_di (dest
, cmp
, src_t
, src_f
, cr
));
414 emit_insn (gen_isel_signed_si (dest
, cmp
, src_t
, src_f
, cr
));
417 /* Emit a subtract of the proper mode for DEST.
419 DEST is the destination register for the subtract.
420 SRC1 is the first subtract input.
421 SRC2 is the second subtract input.
423 Computes DEST = SRC1-SRC2. */
425 do_sub3 (rtx dest
, rtx src1
, rtx src2
)
427 if (GET_MODE (dest
) == DImode
)
428 emit_insn (gen_subdi3 (dest
, src1
, src2
));
430 emit_insn (gen_subsi3 (dest
, src1
, src2
));
433 /* Emit an add of the proper mode for DEST.
435 DEST is the destination register for the add.
436 SRC1 is the first add input.
437 SRC2 is the second add input.
439 Computes DEST = SRC1+SRC2. */
441 do_add3 (rtx dest
, rtx src1
, rtx src2
)
443 if (GET_MODE (dest
) == DImode
)
444 emit_insn (gen_adddi3 (dest
, src1
, src2
));
446 emit_insn (gen_addsi3 (dest
, src1
, src2
));
449 /* Emit an and of the proper mode for DEST.
451 DEST is the destination register for the and.
452 SRC1 is the first and input.
453 SRC2 is the second and input.
455 Computes DEST = SRC1&SRC2. */
457 do_and3 (rtx dest
, rtx src1
, rtx src2
)
459 if (GET_MODE (dest
) == DImode
)
460 emit_insn (gen_anddi3 (dest
, src1
, src2
));
462 emit_insn (gen_andsi3 (dest
, src1
, src2
));
465 /* Emit an cmpb of the proper mode for DEST.
467 DEST is the destination register for the cmpb.
468 SRC1 is the first input.
469 SRC2 is the second input.
471 Computes cmpb of SRC1, SRC2. */
473 do_cmpb3 (rtx dest
, rtx src1
, rtx src2
)
475 if (GET_MODE (dest
) == DImode
)
476 emit_insn (gen_cmpbdi3 (dest
, src1
, src2
));
478 emit_insn (gen_cmpbsi3 (dest
, src1
, src2
));
481 /* Emit a rotl of the proper mode for DEST.
483 DEST is the destination register for the and.
484 SRC1 is the first and input.
485 SRC2 is the second and input.
487 Computes DEST = SRC1 rotated left by SRC2. */
489 do_rotl3 (rtx dest
, rtx src1
, rtx src2
)
491 if (GET_MODE (dest
) == DImode
)
492 emit_insn (gen_rotldi3 (dest
, src1
, src2
));
494 emit_insn (gen_rotlsi3 (dest
, src1
, src2
));
497 /* Generate rtl for a load, shift, and compare of less than a full word.
499 LOAD_MODE is the machine mode for the loads.
500 DIFF is the reg for the difference.
501 CMP_REM is the reg containing the remaining bytes to compare.
502 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
503 SRC1_ADDR is the first source address.
504 SRC2_ADDR is the second source address.
505 ORIG_SRC1 is the original first source block's address rtx.
506 ORIG_SRC2 is the original second source block's address rtx. */
508 do_load_mask_compare (const machine_mode load_mode
, rtx diff
, rtx cmp_rem
, rtx dcond
,
509 rtx src1_addr
, rtx src2_addr
, rtx orig_src1
, rtx orig_src2
)
511 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
512 rtx shift_amount
= gen_reg_rtx (word_mode
);
513 rtx d1
= gen_reg_rtx (word_mode
);
514 rtx d2
= gen_reg_rtx (word_mode
);
516 do_load_for_compare_from_addr (load_mode
, d1
, src1_addr
, orig_src1
);
517 do_load_for_compare_from_addr (load_mode
, d2
, src2_addr
, orig_src2
);
518 do_sub3 (shift_amount
, GEN_INT (load_mode_size
), cmp_rem
);
520 if (word_mode
== DImode
)
522 emit_insn (gen_ashldi3 (shift_amount
, shift_amount
,
523 GEN_INT (LOG2_BITS_PER_UNIT
)));
524 emit_insn (gen_lshrdi3 (d1
, d1
,
525 gen_lowpart (SImode
, shift_amount
)));
526 emit_insn (gen_lshrdi3 (d2
, d2
,
527 gen_lowpart (SImode
, shift_amount
)));
531 emit_insn (gen_ashlsi3 (shift_amount
, shift_amount
,
532 GEN_INT (LOG2_BITS_PER_UNIT
)));
533 emit_insn (gen_lshrsi3 (d1
, d1
, shift_amount
));
534 emit_insn (gen_lshrsi3 (d2
, d2
, shift_amount
));
539 /* Generate a compare, and convert with a setb later. */
540 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1
, d2
);
541 emit_insn (gen_rtx_SET (dcond
, cmp
));
545 if (word_mode
== DImode
)
546 emit_insn (gen_subfdi3_carry (diff
, d2
, d1
));
548 emit_insn (gen_subfsi3_carry (diff
, d2
, d1
));
552 /* Generate rtl for an overlapping load and compare of less than a
553 full load_mode. This assumes that the previous word is part of the
554 block being compared so it's ok to back up part of a word so we can
555 compare the last unaligned full word that ends at the end of the block.
557 LOAD_MODE is the machine mode for the loads.
558 ISCONST tells whether the remaining length is a constant or in a register.
559 BYTES_REM is the remaining length if ISCONST is true.
560 DIFF is the reg for the difference.
561 CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
562 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
563 SRC1_ADDR is the first source address.
564 SRC2_ADDR is the second source address.
565 ORIG_SRC1 is the original first source block's address rtx.
566 ORIG_SRC2 is the original second source block's address rtx. */
568 do_overlap_load_compare (machine_mode load_mode
, bool isConst
,
569 HOST_WIDE_INT bytes_rem
, rtx diff
,
570 rtx cmp_rem
, rtx dcond
, rtx src1_addr
, rtx src2_addr
,
571 rtx orig_src1
, rtx orig_src2
)
573 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
574 HOST_WIDE_INT addr_adj
= load_mode_size
- bytes_rem
;
575 rtx d1
= gen_reg_rtx (word_mode
);
576 rtx d2
= gen_reg_rtx (word_mode
);
579 if (!isConst
|| addr_adj
)
581 rtx adj_reg
= gen_reg_rtx (word_mode
);
583 emit_move_insn (adj_reg
, GEN_INT (-addr_adj
));
586 rtx reg_lms
= gen_reg_rtx (word_mode
);
587 emit_move_insn (reg_lms
, GEN_INT (load_mode_size
));
588 do_sub3 (adj_reg
, cmp_rem
, reg_lms
);
591 addr1
= gen_rtx_PLUS (word_mode
, src1_addr
, adj_reg
);
592 addr2
= gen_rtx_PLUS (word_mode
, src2_addr
, adj_reg
);
600 do_load_for_compare_from_addr (load_mode
, d1
, addr1
, orig_src1
);
601 do_load_for_compare_from_addr (load_mode
, d2
, addr2
, orig_src2
);
605 /* Generate a compare, and convert with a setb later. */
606 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1
, d2
);
607 emit_insn (gen_rtx_SET (dcond
, cmp
));
611 if (word_mode
== DImode
)
612 emit_insn (gen_subfdi3_carry (diff
, d2
, d1
));
614 emit_insn (gen_subfsi3_carry (diff
, d2
, d1
));
618 /* Expand a block compare operation using loop code, and return true
619 if successful. Return false if we should let the compiler generate
620 normal code, probably a memcmp call.
622 OPERANDS[0] is the target (result).
623 OPERANDS[1] is the first source.
624 OPERANDS[2] is the second source.
625 OPERANDS[3] is the length.
626 OPERANDS[4] is the alignment. */
628 expand_compare_loop (rtx operands
[])
630 rtx target
= operands
[0];
631 rtx orig_src1
= operands
[1];
632 rtx orig_src2
= operands
[2];
633 rtx bytes_rtx
= operands
[3];
634 rtx align_rtx
= operands
[4];
636 /* This case is complicated to handle because the subtract
637 with carry instructions do not generate the 64-bit
638 carry and so we must emit code to calculate it ourselves.
639 We choose not to implement this yet. */
640 if (TARGET_32BIT
&& TARGET_POWERPC64
)
643 /* Allow non-const length. */
644 int bytes_is_const
= CONST_INT_P (bytes_rtx
);
646 /* This must be a fixed size alignment. */
647 if (!CONST_INT_P (align_rtx
))
650 HOST_WIDE_INT align1
= MEM_ALIGN (orig_src1
) / BITS_PER_UNIT
;
651 HOST_WIDE_INT align2
= MEM_ALIGN (orig_src2
) / BITS_PER_UNIT
;
652 HOST_WIDE_INT minalign
= MIN (align1
, align2
);
654 bool isP7
= (rs6000_tune
== PROCESSOR_POWER7
);
656 gcc_assert (GET_MODE (target
) == SImode
);
658 /* Anything to move? */
659 HOST_WIDE_INT bytes
= 0;
661 bytes
= INTVAL (bytes_rtx
);
663 if (bytes_is_const
&& bytes
== 0)
666 /* Limit the amount we compare, if known statically. */
667 HOST_WIDE_INT max_bytes
;
670 case PROCESSOR_POWER7
:
682 case PROCESSOR_POWER8
:
691 case PROCESSOR_POWER9
:
701 /* Allow the option to override the default. */
702 if (rs6000_block_compare_inline_loop_limit
>= 0)
703 max_bytes
= (unsigned HOST_WIDE_INT
) rs6000_block_compare_inline_loop_limit
;
708 rtx cmp_rem
= gen_reg_rtx (word_mode
); /* Remainder for library call. */
709 rtx loop_cmp
= gen_reg_rtx (word_mode
); /* Actual amount compared by loop. */
711 rtx iter
= gen_reg_rtx (word_mode
);
712 rtx iv1
= gen_reg_rtx (word_mode
);
713 rtx iv2
= gen_reg_rtx (word_mode
);
714 rtx d1_1
= gen_reg_rtx (word_mode
); /* Addr expression src1+iv1 */
715 rtx d1_2
= gen_reg_rtx (word_mode
); /* Addr expression src1+iv2 */
716 rtx d2_1
= gen_reg_rtx (word_mode
); /* Addr expression src2+iv1 */
717 rtx d2_2
= gen_reg_rtx (word_mode
); /* Addr expression src2+iv2 */
719 /* Strip unneeded subreg from length if there is one. */
720 if (SUBREG_P (bytes_rtx
) && subreg_lowpart_p (bytes_rtx
))
721 bytes_rtx
= SUBREG_REG (bytes_rtx
);
722 /* Extend bytes_rtx to word_mode if needed. But, we expect only to
723 maybe have to deal with the case were bytes_rtx is SImode and
724 word_mode is DImode. */
727 if (GET_MODE_SIZE (GET_MODE (bytes_rtx
)) > GET_MODE_SIZE (word_mode
))
728 /* Do not expect length longer than word_mode. */
730 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx
)) < GET_MODE_SIZE (word_mode
))
732 bytes_rtx
= force_reg (GET_MODE (bytes_rtx
), bytes_rtx
);
733 bytes_rtx
= force_reg (word_mode
,
734 gen_rtx_fmt_e (ZERO_EXTEND
, word_mode
,
738 /* Make sure it's in a register before we get started. */
739 bytes_rtx
= force_reg (GET_MODE (bytes_rtx
), bytes_rtx
);
742 machine_mode load_mode
= word_mode
;
743 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
745 /* Number of bytes per iteration of the unrolled loop. */
746 HOST_WIDE_INT loop_bytes
= 2 * load_mode_size
;
747 /* max iters and bytes compared in the loop. */
748 HOST_WIDE_INT max_loop_iter
= max_bytes
/ loop_bytes
;
749 HOST_WIDE_INT max_loop_bytes
= max_loop_iter
* loop_bytes
;
750 int l2lb
= floor_log2 (loop_bytes
);
752 if (bytes_is_const
&& (max_bytes
< load_mode_size
753 || !IN_RANGE (bytes
, load_mode_size
, max_bytes
)))
756 bool no_remainder_code
= false;
757 rtx final_label
= gen_label_rtx ();
758 rtx final_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
759 rtx diff_label
= gen_label_rtx ();
760 rtx library_call_label
= NULL
;
761 rtx cleanup_label
= gen_label_rtx ();
765 rtx src1_addr
= copy_addr_to_reg (XEXP (orig_src1
, 0));
766 rtx src2_addr
= copy_addr_to_reg (XEXP (orig_src2
, 0));
768 /* Difference found is stored here before jump to diff_label. */
769 rtx diff
= gen_reg_rtx (word_mode
);
772 /* Example of generated code for 35 bytes aligned 1 byte.
802 Compiled with -fno-reorder-blocks for clarity. */
804 /* Structure of what we're going to do:
805 Two separate lengths: what we will compare before bailing to library
806 call (max_bytes), and the total length to be checked.
807 if length <= 16, branch to linear cleanup code starting with
808 remainder length check (length not known at compile time)
809 set up 2 iv's and load count reg, compute remainder length
810 unrollx2 compare loop
811 if loop exit due to a difference, branch to difference handling code
812 if remainder length < 8, branch to final cleanup compare
814 final cleanup comparison (depends on alignment and length)
815 load 8B, shift off bytes past length, compare
816 load 8B ending at last byte and compare
817 load/compare 1 byte at a time (short block abutting 4k boundary)
818 difference handling, 64->32 conversion
820 branch around memcmp call
824 /* If bytes is not const, compare length and branch directly
825 to the cleanup code that can handle 0-16 bytes if length
826 is >= 16. Stash away bytes-max_bytes for the library call. */
829 /* These need to be set for some of the places we may jump to. */
830 if (bytes
> max_bytes
)
832 no_remainder_code
= true;
833 niter
= max_loop_iter
;
834 library_call_label
= gen_label_rtx ();
838 niter
= bytes
/ loop_bytes
;
840 emit_move_insn (iter
, GEN_INT (niter
));
841 emit_move_insn (loop_cmp
, GEN_INT (niter
* loop_bytes
));
842 emit_move_insn (cmp_rem
, GEN_INT (bytes
- niter
* loop_bytes
));
846 library_call_label
= gen_label_rtx ();
848 /* If we go to the cleanup code, it expects length to be in cmp_rem. */
849 emit_move_insn (cmp_rem
, bytes_rtx
);
851 /* Check for > max_bytes bytes. We want to bail out as quickly as
852 possible if we have to go over to memcmp. */
853 do_ifelse (CCmode
, GT
, bytes_rtx
, GEN_INT (max_bytes
),
854 NULL_RTX
, library_call_label
);
856 /* Check for < loop_bytes bytes. */
857 do_ifelse (CCmode
, LT
, bytes_rtx
, GEN_INT (loop_bytes
),
858 NULL_RTX
, cleanup_label
);
860 /* Loop compare bytes and iterations if bytes>max_bytes. */
861 rtx mb_reg
= gen_reg_rtx (word_mode
);
862 emit_move_insn (mb_reg
, GEN_INT (max_loop_bytes
));
863 rtx mi_reg
= gen_reg_rtx (word_mode
);
864 emit_move_insn (mi_reg
, GEN_INT (max_loop_iter
));
866 /* Compute number of loop iterations if bytes <= max_bytes. */
867 if (word_mode
== DImode
)
868 emit_insn (gen_lshrdi3 (iter
, bytes_rtx
, GEN_INT (l2lb
)));
870 emit_insn (gen_lshrsi3 (iter
, bytes_rtx
, GEN_INT (l2lb
)));
872 /* Compute bytes to compare in loop if bytes <= max_bytes. */
873 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< l2lb
);
874 if (word_mode
== DImode
)
876 emit_insn (gen_anddi3 (loop_cmp
, bytes_rtx
, mask
));
880 emit_insn (gen_andsi3 (loop_cmp
, bytes_rtx
, mask
));
883 /* Check for bytes <= max_bytes. */
886 /* P9 has fast isel so we use one compare and two isel. */
887 cr
= gen_reg_rtx (CCmode
);
888 rtx compare_rtx
= gen_rtx_COMPARE (CCmode
, bytes_rtx
,
889 GEN_INT (max_bytes
));
890 emit_move_insn (cr
, compare_rtx
);
891 rtx cmp_rtx
= gen_rtx_LE (VOIDmode
, cr
, const0_rtx
);
892 do_isel (loop_cmp
, cmp_rtx
, loop_cmp
, mb_reg
, cr
);
893 do_isel (iter
, cmp_rtx
, iter
, mi_reg
, cr
);
897 rtx lab_after
= gen_label_rtx ();
898 do_ifelse (CCmode
, LE
, bytes_rtx
, GEN_INT (max_bytes
),
899 NULL_RTX
, lab_after
);
900 emit_move_insn (loop_cmp
, mb_reg
);
901 emit_move_insn (iter
, mi_reg
);
902 emit_label (lab_after
);
905 /* Now compute remainder bytes which isn't used until after the loop. */
906 do_sub3 (cmp_rem
, bytes_rtx
, loop_cmp
);
909 rtx dcond
= NULL_RTX
; /* Used for when we jump to diff_label. */
910 /* For p9 we need to have just one of these as multiple places define
911 it and it gets used by the setb at the end. */
913 dcond
= gen_reg_rtx (CCUNSmode
);
915 if (!bytes_is_const
|| bytes
>= loop_bytes
)
917 /* It should not be possible to come here if remaining bytes is
918 < 16 in the runtime case either. Compute number of loop
919 iterations. We compare 2*word_mode per iteration so 16B for
920 64-bit code and 8B for 32-bit. Set up two induction
921 variables and load count register. */
923 /* HACK ALERT: create hard reg for CTR here. If we just use a
924 pseudo, cse will get rid of it and then the allocator will
925 see it used in the lshr above and won't give us ctr. */
926 rtx ctr
= gen_rtx_REG (Pmode
, CTR_REGNO
);
927 emit_move_insn (ctr
, iter
);
928 emit_move_insn (diff
, GEN_INT (0));
929 emit_move_insn (iv1
, GEN_INT (0));
930 emit_move_insn (iv2
, GEN_INT (load_mode_size
));
932 /* inner loop to compare 2*word_mode */
933 rtx loop_top_label
= gen_label_rtx ();
934 emit_label (loop_top_label
);
936 rtx src1_ix1
= gen_rtx_PLUS (word_mode
, src1_addr
, iv1
);
937 rtx src2_ix1
= gen_rtx_PLUS (word_mode
, src2_addr
, iv1
);
939 do_load_for_compare_from_addr (load_mode
, d1_1
,
940 src1_ix1
, orig_src1
);
941 do_load_for_compare_from_addr (load_mode
, d2_1
,
942 src2_ix1
, orig_src2
);
943 do_add3 (iv1
, iv1
, GEN_INT (loop_bytes
));
945 rtx src1_ix2
= gen_rtx_PLUS (word_mode
, src1_addr
, iv2
);
946 rtx src2_ix2
= gen_rtx_PLUS (word_mode
, src2_addr
, iv2
);
948 do_load_for_compare_from_addr (load_mode
, d1_2
,
949 src1_ix2
, orig_src1
);
950 do_load_for_compare_from_addr (load_mode
, d2_2
,
951 src2_ix2
, orig_src2
);
952 do_add3 (iv2
, iv2
, GEN_INT (loop_bytes
));
956 /* Generate a compare, and convert with a setb later. */
957 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_1
, d2_1
);
958 emit_insn (gen_rtx_SET (dcond
, cmp
));
962 dcond
= gen_reg_rtx (CCmode
);
963 if (word_mode
== DImode
)
964 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
966 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
969 do_ifelse (GET_MODE (dcond
), NE
, NULL_RTX
, NULL_RTX
,
974 /* Generate a compare, and convert with a setb later. */
975 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_2
, d2_2
);
976 emit_insn (gen_rtx_SET (dcond
, cmp
));
980 dcond
= gen_reg_rtx (CCmode
);
981 if (word_mode
== DImode
)
982 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_2
, d1_2
, dcond
));
984 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_2
, d1_2
, dcond
));
987 rtx eqrtx
= gen_rtx_EQ (VOIDmode
, d1_2
, d2_2
);
989 j
= emit_jump_insn (gen_bdnztf_di (loop_top_label
, ctr
, ctr
,
992 j
= emit_jump_insn (gen_bdnztf_si (loop_top_label
, ctr
, ctr
,
994 JUMP_LABEL (j
) = loop_top_label
;
995 LABEL_NUSES (loop_top_label
) += 1;
998 HOST_WIDE_INT bytes_remaining
= 0;
1000 bytes_remaining
= (bytes
% loop_bytes
);
1002 /* If diff is nonzero, branch to difference handling
1003 code. If we exit here with a nonzero diff, it is
1004 because the second word differed. */
1006 do_ifelse (CCUNSmode
, NE
, NULL_RTX
, NULL_RTX
, dcond
, diff_label
);
1008 do_ifelse (CCmode
, NE
, diff
, const0_rtx
, NULL_RTX
, diff_label
);
1010 if (library_call_label
!= NULL
&& bytes_is_const
&& bytes
> max_bytes
)
1012 /* If the length is known at compile time, then we will always
1013 have a remainder to go to the library call with. */
1014 rtx library_call_ref
= gen_rtx_LABEL_REF (VOIDmode
, library_call_label
);
1015 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, library_call_ref
));
1016 JUMP_LABEL (j
) = library_call_label
;
1017 LABEL_NUSES (library_call_label
) += 1;
1021 if (bytes_is_const
&& bytes_remaining
== 0)
1023 /* No remainder and if we are here then diff is 0 so just return 0 */
1025 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
1027 emit_move_insn (target
, diff
);
1028 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
1029 JUMP_LABEL (j
) = final_label
;
1030 LABEL_NUSES (final_label
) += 1;
1033 else if (!no_remainder_code
)
1035 /* Update addresses to point to the next word to examine. */
1036 do_add3 (src1_addr
, src1_addr
, iv1
);
1037 do_add3 (src2_addr
, src2_addr
, iv1
);
1039 emit_label (cleanup_label
);
1041 if (!bytes_is_const
)
1043 /* If we're dealing with runtime length, we have to check if
1044 it's zero after the loop. When length is known at compile
1045 time the no-remainder condition is dealt with above. By
1046 doing this after cleanup_label, we also deal with the
1047 case where length is 0 at the start and we bypass the
1048 loop with a branch to cleanup_label. */
1049 emit_move_insn (target
, const0_rtx
);
1050 do_ifelse (CCmode
, EQ
, cmp_rem
, const0_rtx
,
1051 NULL_RTX
, final_label
);
1054 rtx final_cleanup
= gen_label_rtx ();
1055 rtx cmp_rem_before
= gen_reg_rtx (word_mode
);
1056 /* Compare one more word_mode chunk if needed. */
1057 if (!bytes_is_const
|| bytes_remaining
>= load_mode_size
)
1059 /* If remainder length < word length, branch to final
1061 if (!bytes_is_const
)
1062 do_ifelse (CCmode
, LT
, cmp_rem
, GEN_INT (load_mode_size
),
1063 NULL_RTX
, final_cleanup
);
1065 /* load and compare 8B */
1066 do_load_for_compare_from_addr (load_mode
, d1_1
,
1067 src1_addr
, orig_src1
);
1068 do_load_for_compare_from_addr (load_mode
, d2_1
,
1069 src2_addr
, orig_src2
);
1071 /* Compare the word, see if we need to do the last partial. */
1074 /* Generate a compare, and convert with a setb later. */
1075 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_1
, d2_1
);
1076 emit_insn (gen_rtx_SET (dcond
, cmp
));
1080 dcond
= gen_reg_rtx (CCmode
);
1081 if (word_mode
== DImode
)
1082 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
1084 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
1087 do_ifelse (GET_MODE (dcond
), NE
, NULL_RTX
, NULL_RTX
,
1090 do_add3 (src1_addr
, src1_addr
, GEN_INT (load_mode_size
));
1091 do_add3 (src2_addr
, src2_addr
, GEN_INT (load_mode_size
));
1092 emit_move_insn (cmp_rem_before
, cmp_rem
);
1093 do_add3 (cmp_rem
, cmp_rem
, GEN_INT (-load_mode_size
));
1095 bytes_remaining
-= load_mode_size
;
1097 /* See if remaining length is now zero. We previously set
1098 target to 0 so we can just jump to the end. */
1099 do_ifelse (CCmode
, EQ
, cmp_rem
, const0_rtx
,
1100 NULL_RTX
, final_label
);
1106 We can always shift back to do an overlapping compare
1107 of the last chunk because we know length >= 8.
1110 align>=load_mode_size
1111 Read word_mode and mask
1112 align<load_mode_size
1113 avoid stepping past end
1116 * decrement address and do overlapping compare
1117 * read word_mode and mask
1118 * carefully avoid crossing 4k boundary
1121 if ((!bytes_is_const
|| (bytes_is_const
&& bytes_remaining
&& isP7
))
1122 && align1
>= load_mode_size
&& align2
>= load_mode_size
)
1124 /* Alignment is larger than word_mode so we do not need to be
1125 concerned with extra page crossings. But, we do not know
1126 that the length is larger than load_mode_size so we might
1127 end up compareing against data before the block if we try
1128 an overlapping compare. Also we use this on P7 for fixed length
1129 remainder because P7 doesn't like overlapping unaligned.
1130 Strategy: load 8B, shift off bytes past length, and compare. */
1131 emit_label (final_cleanup
);
1132 do_load_mask_compare (load_mode
, diff
, cmp_rem
, dcond
,
1133 src1_addr
, src2_addr
, orig_src1
, orig_src2
);
1135 else if (bytes_remaining
&& bytes_is_const
)
1137 /* We do not do loop expand if length < 32 so we know at the
1138 end we can do an overlapping compare.
1139 Strategy: shift address back and do word_mode load that
1140 ends at the end of the block. */
1141 emit_label (final_cleanup
);
1142 do_overlap_load_compare (load_mode
, true, bytes_remaining
, diff
,
1143 cmp_rem
, dcond
, src1_addr
, src2_addr
,
1144 orig_src1
, orig_src2
);
1146 else if (!bytes_is_const
)
1148 rtx handle4k_label
= gen_label_rtx ();
1149 rtx nonconst_overlap
= gen_label_rtx ();
1150 emit_label (nonconst_overlap
);
1152 /* Here we have to handle the case where whe have runtime
1153 length which may be too short for overlap compare, and
1154 alignment is not at least load_mode_size so we have to
1155 tread carefully to avoid stepping across 4k boundaries. */
1157 /* If the length after the loop was larger than word_mode
1158 size, we can just do an overlapping compare and we're
1159 done. We fall through to this code from the word_mode
1160 compare that precedes this. */
1161 do_overlap_load_compare (load_mode
, false, 0, diff
,
1162 cmp_rem
, dcond
, src1_addr
, src2_addr
,
1163 orig_src1
, orig_src2
);
1165 rtx diff_ref
= gen_rtx_LABEL_REF (VOIDmode
, diff_label
);
1166 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, diff_ref
));
1167 JUMP_LABEL (j
) = diff_label
;
1168 LABEL_NUSES (diff_label
) += 1;
1171 /* If we couldn't do the overlap compare we have to be more
1172 careful of the 4k boundary. Test to see if either
1173 address is less than word_mode_size away from a 4k
1174 boundary. If not, then we can do a load/shift/compare
1175 and we are done. We come to this code if length was less
1176 than word_mode_size. */
1178 emit_label (final_cleanup
);
1180 /* We can still avoid the slow case if the length was larger
1181 than one loop iteration, in which case go do the overlap
1182 load compare path. */
1183 do_ifelse (CCmode
, GT
, bytes_rtx
, GEN_INT (loop_bytes
),
1184 NULL_RTX
, nonconst_overlap
);
1186 rtx rem4k
= gen_reg_rtx (word_mode
);
1187 rtx dist1
= gen_reg_rtx (word_mode
);
1188 rtx dist2
= gen_reg_rtx (word_mode
);
1189 do_sub3 (rem4k
, GEN_INT (4096), cmp_rem
);
1190 if (word_mode
== SImode
)
1191 emit_insn (gen_andsi3 (dist1
, src1_addr
, GEN_INT (0xfff)));
1193 emit_insn (gen_anddi3 (dist1
, src1_addr
, GEN_INT (0xfff)));
1194 do_ifelse (CCmode
, LE
, dist1
, rem4k
, NULL_RTX
, handle4k_label
);
1195 if (word_mode
== SImode
)
1196 emit_insn (gen_andsi3 (dist2
, src2_addr
, GEN_INT (0xfff)));
1198 emit_insn (gen_anddi3 (dist2
, src2_addr
, GEN_INT (0xfff)));
1199 do_ifelse (CCmode
, LE
, dist2
, rem4k
, NULL_RTX
, handle4k_label
);
1201 /* We don't have a 4k boundary to deal with, so do
1202 a load/shift/compare and jump to diff. */
1204 do_load_mask_compare (load_mode
, diff
, cmp_rem
, dcond
,
1205 src1_addr
, src2_addr
, orig_src1
, orig_src2
);
1207 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, diff_ref
));
1208 JUMP_LABEL (j
) = diff_label
;
1209 LABEL_NUSES (diff_label
) += 1;
1212 /* Finally in the unlikely case we are inching up to a
1213 4k boundary we use a compact lbzx/compare loop to do
1214 it a byte at a time. */
1216 emit_label (handle4k_label
);
1218 rtx ctr
= gen_rtx_REG (Pmode
, CTR_REGNO
);
1219 emit_move_insn (ctr
, cmp_rem
);
1220 rtx ixreg
= gen_reg_rtx (Pmode
);
1221 emit_move_insn (ixreg
, const0_rtx
);
1223 rtx src1_ix
= gen_rtx_PLUS (word_mode
, src1_addr
, ixreg
);
1224 rtx src2_ix
= gen_rtx_PLUS (word_mode
, src2_addr
, ixreg
);
1225 rtx d1
= gen_reg_rtx (word_mode
);
1226 rtx d2
= gen_reg_rtx (word_mode
);
1228 rtx fc_loop
= gen_label_rtx ();
1229 emit_label (fc_loop
);
1231 do_load_for_compare_from_addr (QImode
, d1
, src1_ix
, orig_src1
);
1232 do_load_for_compare_from_addr (QImode
, d2
, src2_ix
, orig_src2
);
1234 do_add3 (ixreg
, ixreg
, const1_rtx
);
1236 rtx cond
= gen_reg_rtx (CCmode
);
1237 rtx subexpr
= gen_rtx_MINUS (word_mode
, d1
, d2
);
1238 rs6000_emit_dot_insn (diff
, subexpr
, 2, cond
);
1240 rtx eqrtx
= gen_rtx_EQ (VOIDmode
, d1
, d2
);
1242 j
= emit_jump_insn (gen_bdnztf_di (fc_loop
, ctr
, ctr
,
1245 j
= emit_jump_insn (gen_bdnztf_si (fc_loop
, ctr
, ctr
,
1247 JUMP_LABEL (j
) = fc_loop
;
1248 LABEL_NUSES (fc_loop
) += 1;
1251 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
1253 emit_move_insn (target
, diff
);
1255 /* Since we are comparing bytes, the difference can be used
1256 as the final result and we are done here. */
1257 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
1258 JUMP_LABEL (j
) = final_label
;
1259 LABEL_NUSES (final_label
) += 1;
1264 emit_label (diff_label
);
1265 /* difference handling, 64->32 conversion */
1267 /* We need to produce DI result from sub, then convert to target SI
1268 while maintaining <0 / ==0 / >0 properties. This sequence works:
1274 This is an alternate one Segher cooked up if somebody
1275 wants to expand this for something that doesn't have popcntd:
1282 And finally, p9 can just do this:
1287 emit_insn (gen_setb_unsigned (target
, dcond
));
1292 rtx tmp_reg_ca
= gen_reg_rtx (DImode
);
1293 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca
));
1294 emit_insn (gen_popcntddi2 (diff
, diff
));
1295 emit_insn (gen_iordi3 (diff
, diff
, tmp_reg_ca
));
1296 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
1300 rtx tmp_reg_ca
= gen_reg_rtx (SImode
);
1301 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca
));
1302 emit_insn (gen_popcntdsi2 (diff
, diff
));
1303 emit_insn (gen_iorsi3 (target
, diff
, tmp_reg_ca
));
1307 if (library_call_label
!= NULL
)
1309 /* Branch around memcmp call. */
1310 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
1311 JUMP_LABEL (j
) = final_label
;
1312 LABEL_NUSES (final_label
) += 1;
1315 /* Make memcmp library call. cmp_rem is the remaining bytes that
1316 were compared and cmp_rem is the expected amount to be compared
1317 by memcmp. If we don't find a difference in the loop compare, do
1318 the library call directly instead of doing a small compare just
1319 to get to an arbitrary boundary before calling it anyway.
1320 Also, update addresses to point to the next word to examine. */
1321 emit_label (library_call_label
);
1323 rtx len_rtx
= gen_reg_rtx (word_mode
);
1326 emit_move_insn (len_rtx
, cmp_rem
);
1327 do_add3 (src1_addr
, src1_addr
, iv1
);
1328 do_add3 (src2_addr
, src2_addr
, iv1
);
1331 emit_move_insn (len_rtx
, bytes_rtx
);
1333 tree fun
= builtin_decl_explicit (BUILT_IN_MEMCMP
);
1334 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
1335 target
, LCT_NORMAL
, GET_MODE (target
),
1338 len_rtx
, GET_MODE (len_rtx
));
1341 /* emit final_label */
1342 emit_label (final_label
);
1346 /* Expand a block compare operation, and return true if successful.
1347 Return false if we should let the compiler generate normal code,
1348 probably a memcmp call.
1350 OPERANDS[0] is the target (result).
1351 OPERANDS[1] is the first source.
1352 OPERANDS[2] is the second source.
1353 OPERANDS[3] is the length.
1354 OPERANDS[4] is the alignment. */
1356 expand_block_compare (rtx operands
[])
1358 rtx target
= operands
[0];
1359 rtx orig_src1
= operands
[1];
1360 rtx orig_src2
= operands
[2];
1361 rtx bytes_rtx
= operands
[3];
1362 rtx align_rtx
= operands
[4];
1363 HOST_WIDE_INT cmp_bytes
= 0;
1364 rtx src1
= orig_src1
;
1365 rtx src2
= orig_src2
;
1367 /* This case is complicated to handle because the subtract
1368 with carry instructions do not generate the 64-bit
1369 carry and so we must emit code to calculate it ourselves.
1370 We choose not to implement this yet. */
1371 if (TARGET_32BIT
&& TARGET_POWERPC64
)
1374 bool isP7
= (rs6000_tune
== PROCESSOR_POWER7
);
1376 /* Allow this param to shut off all expansion. */
1377 if (rs6000_block_compare_inline_limit
== 0)
1380 /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1381 However slow_unaligned_access returns true on P7 even though the
1382 performance of this code is good there. */
1384 && (targetm
.slow_unaligned_access (word_mode
, MEM_ALIGN (orig_src1
))
1385 || targetm
.slow_unaligned_access (word_mode
, MEM_ALIGN (orig_src2
))))
1388 /* Unaligned l*brx traps on P7 so don't do this. However this should
1389 not affect much because LE isn't really supported on P7 anyway. */
1390 if (isP7
&& !BYTES_BIG_ENDIAN
)
1393 /* If this is not a fixed size compare, try generating loop code and
1394 if that fails just call memcmp. */
1395 if (!CONST_INT_P (bytes_rtx
))
1396 return expand_compare_loop (operands
);
1398 /* This must be a fixed size alignment. */
1399 if (!CONST_INT_P (align_rtx
))
1402 unsigned int base_align
= UINTVAL (align_rtx
) / BITS_PER_UNIT
;
1404 gcc_assert (GET_MODE (target
) == SImode
);
1406 /* Anything to move? */
1407 unsigned HOST_WIDE_INT bytes
= UINTVAL (bytes_rtx
);
1411 rtx tmp_reg_src1
= gen_reg_rtx (word_mode
);
1412 rtx tmp_reg_src2
= gen_reg_rtx (word_mode
);
1413 /* P7/P8 code uses cond for subfc. but P9 uses
1414 it for cmpld which needs CCUNSmode. */
1417 cond
= gen_reg_rtx (CCUNSmode
);
1419 cond
= gen_reg_rtx (CCmode
);
1421 /* Strategy phase. How many ops will this take and should we expand it? */
1423 unsigned HOST_WIDE_INT offset
= 0;
1424 machine_mode load_mode
=
1425 select_block_compare_mode (offset
, bytes
, base_align
);
1426 unsigned int load_mode_size
= GET_MODE_SIZE (load_mode
);
1428 /* We don't want to generate too much code. The loop code can take
1429 over for lengths greater than 31 bytes. */
1430 unsigned HOST_WIDE_INT max_bytes
= rs6000_block_compare_inline_limit
;
1431 if (!IN_RANGE (bytes
, 1, max_bytes
))
1432 return expand_compare_loop (operands
);
1434 /* The code generated for p7 and older is not faster than glibc
1435 memcmp if alignment is small and length is not short, so bail
1436 out to avoid those conditions. */
1437 if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
1438 && ((base_align
== 1 && bytes
> 16)
1439 || (base_align
== 2 && bytes
> 32)))
1442 bool generate_6432_conversion
= false;
1443 rtx convert_label
= NULL
;
1444 rtx final_label
= NULL
;
1446 /* Example of generated code for 18 bytes aligned 1 byte.
1447 Compiled with -fno-reorder-blocks for clarity.
1465 .L6487: #convert_label
1469 .L6488: #final_label
1472 We start off with DImode for two blocks that jump to the DI->SI conversion
1473 if the difference is found there, then a final block of HImode that skips
1474 the DI->SI conversion. */
1478 unsigned int align
= compute_current_alignment (base_align
, offset
);
1479 load_mode
= select_block_compare_mode (offset
, bytes
, align
);
1480 load_mode_size
= GET_MODE_SIZE (load_mode
);
1481 if (bytes
>= load_mode_size
)
1482 cmp_bytes
= load_mode_size
;
1483 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1485 /* Move this load back so it doesn't go past the end.
1486 P8/P9 can do this efficiently. */
1487 unsigned int extra_bytes
= load_mode_size
- bytes
;
1489 if (extra_bytes
< offset
)
1491 offset
-= extra_bytes
;
1492 cmp_bytes
= load_mode_size
;
1497 /* P7 and earlier can't do the overlapping load trick fast,
1498 so this forces a non-overlapping load and a shift to get
1499 rid of the extra bytes. */
1502 src1
= adjust_address (orig_src1
, load_mode
, offset
);
1503 src2
= adjust_address (orig_src2
, load_mode
, offset
);
1505 if (!REG_P (XEXP (src1
, 0)))
1507 rtx src1_reg
= copy_addr_to_reg (XEXP (src1
, 0));
1508 src1
= replace_equiv_address (src1
, src1_reg
);
1510 set_mem_size (src1
, load_mode_size
);
1512 if (!REG_P (XEXP (src2
, 0)))
1514 rtx src2_reg
= copy_addr_to_reg (XEXP (src2
, 0));
1515 src2
= replace_equiv_address (src2
, src2_reg
);
1517 set_mem_size (src2
, load_mode_size
);
1519 do_load_for_compare (tmp_reg_src1
, src1
, load_mode
);
1520 do_load_for_compare (tmp_reg_src2
, src2
, load_mode
);
1522 if (cmp_bytes
< load_mode_size
)
1524 /* Shift unneeded bytes off. */
1525 rtx sh
= GEN_INT (BITS_PER_UNIT
* (load_mode_size
- cmp_bytes
));
1526 if (word_mode
== DImode
)
1528 emit_insn (gen_lshrdi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1529 emit_insn (gen_lshrdi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1533 emit_insn (gen_lshrsi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1534 emit_insn (gen_lshrsi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1538 int remain
= bytes
- cmp_bytes
;
1539 if (GET_MODE_SIZE (GET_MODE (target
)) > GET_MODE_SIZE (load_mode
))
1541 /* Target is larger than load size so we don't need to
1542 reduce result size. */
1544 /* We previously did a block that need 64->32 conversion but
1545 the current block does not, so a label is needed to jump
1547 if (generate_6432_conversion
&& !final_label
)
1548 final_label
= gen_label_rtx ();
1552 /* This is not the last block, branch to the end if the result
1553 of this subtract is not zero. */
1555 final_label
= gen_label_rtx ();
1556 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
1557 rtx tmp
= gen_rtx_MINUS (word_mode
, tmp_reg_src1
, tmp_reg_src2
);
1558 rtx cr
= gen_reg_rtx (CCmode
);
1559 rs6000_emit_dot_insn (tmp_reg_src2
, tmp
, 2, cr
);
1560 emit_insn (gen_movsi (target
,
1561 gen_lowpart (SImode
, tmp_reg_src2
)));
1562 rtx ne_rtx
= gen_rtx_NE (VOIDmode
, cr
, const0_rtx
);
1563 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, ne_rtx
,
1565 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1566 JUMP_LABEL (j
) = final_label
;
1567 LABEL_NUSES (final_label
) += 1;
1571 if (word_mode
== DImode
)
1573 emit_insn (gen_subdi3 (tmp_reg_src2
, tmp_reg_src1
,
1575 emit_insn (gen_movsi (target
,
1576 gen_lowpart (SImode
, tmp_reg_src2
)));
1579 emit_insn (gen_subsi3 (target
, tmp_reg_src1
, tmp_reg_src2
));
1583 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
1584 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
1585 JUMP_LABEL (j
) = final_label
;
1586 LABEL_NUSES (final_label
) += 1;
1593 /* Do we need a 64->32 conversion block? We need the 64->32
1594 conversion even if target size == load_mode size because
1595 the subtract generates one extra bit. */
1596 generate_6432_conversion
= true;
1601 convert_label
= gen_label_rtx ();
1603 /* Compare to zero and branch to convert_label if not zero. */
1604 rtx cvt_ref
= gen_rtx_LABEL_REF (VOIDmode
, convert_label
);
1607 /* Generate a compare, and convert with a setb later. */
1608 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, tmp_reg_src1
,
1610 emit_insn (gen_rtx_SET (cond
, cmp
));
1613 /* Generate a subfc. and use the longer
1614 sequence for conversion. */
1616 emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2
, tmp_reg_src2
,
1617 tmp_reg_src1
, cond
));
1619 emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2
, tmp_reg_src2
,
1620 tmp_reg_src1
, cond
));
1621 rtx ne_rtx
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
1622 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, ne_rtx
,
1624 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1625 JUMP_LABEL (j
) = convert_label
;
1626 LABEL_NUSES (convert_label
) += 1;
1630 /* Just do the subtract/compare. Since this is the last block
1631 the convert code will be generated immediately following. */
1634 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, tmp_reg_src1
,
1636 emit_insn (gen_rtx_SET (cond
, cmp
));
1640 emit_insn (gen_subfdi3_carry (tmp_reg_src2
, tmp_reg_src2
,
1643 emit_insn (gen_subfsi3_carry (tmp_reg_src2
, tmp_reg_src2
,
1648 offset
+= cmp_bytes
;
1652 if (generate_6432_conversion
)
1655 emit_label (convert_label
);
1657 /* We need to produce DI result from sub, then convert to target SI
1658 while maintaining <0 / ==0 / >0 properties. This sequence works:
1664 This is an alternate one Segher cooked up if somebody
1665 wants to expand this for something that doesn't have popcntd:
1672 And finally, p9 can just do this:
1678 emit_insn (gen_setb_unsigned (target
, cond
));
1684 rtx tmp_reg_ca
= gen_reg_rtx (DImode
);
1685 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca
));
1686 emit_insn (gen_popcntddi2 (tmp_reg_src2
, tmp_reg_src2
));
1687 emit_insn (gen_iordi3 (tmp_reg_src2
, tmp_reg_src2
, tmp_reg_ca
));
1688 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, tmp_reg_src2
)));
1692 rtx tmp_reg_ca
= gen_reg_rtx (SImode
);
1693 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca
));
1694 emit_insn (gen_popcntdsi2 (tmp_reg_src2
, tmp_reg_src2
));
1695 emit_insn (gen_iorsi3 (target
, tmp_reg_src2
, tmp_reg_ca
));
1701 emit_label (final_label
);
1703 gcc_assert (bytes
== 0);
1707 /* Generate page crossing check and branch code to set up for
1708 strncmp when we don't have DI alignment.
1709 STRNCMP_LABEL is the label to branch if there is a page crossing.
1710 SRC_ADDR is the string address to be examined.
1711 BYTES is the max number of bytes to compare. */
/* NOTE(review): mangled extraction -- original line numbers are fused in
   and braces/return type were dropped; only comments changed here.  */
1713 expand_strncmp_align_check (rtx strncmp_label
, rtx src_addr
, HOST_WIDE_INT bytes
)
1715 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, strncmp_label
);
/* src_pgoff = low 12 bits of the address, i.e. the offset of SRC_ADDR
   within its 4k page (0xfff mask; compared against 4096 below).  */
1716 rtx src_pgoff
= gen_reg_rtx (GET_MODE (src_addr
));
1717 do_and3 (src_pgoff
, src_addr
, GEN_INT (0xfff));
1718 rtx cond
= gen_reg_rtx (CCmode
);
/* Branch to STRNCMP_LABEL when the page offset is >= 4096 - bytes,
   i.e. reading BYTES from SRC_ADDR could cross into the next page.  */
1719 emit_move_insn (cond
, gen_rtx_COMPARE (CCmode
, src_pgoff
,
1720 GEN_INT (4096 - bytes
)));
1722 rtx cmp_rtx
= gen_rtx_GE (VOIDmode
, cond
, const0_rtx
);
1724 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
1726 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1727 JUMP_LABEL (j
) = strncmp_label
;
1728 LABEL_NUSES (strncmp_label
) += 1;
1731 /* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
1732 BYTES_TO_COMPARE is the number of bytes to be compared.
1733 BASE_ALIGN is the smaller of the alignment of the two strings.
1734 ORIG_SRC1 is the unmodified rtx for the first string.
1735 ORIG_SRC2 is the unmodified rtx for the second string.
1736 TMP_REG_SRC1 is the register for loading the first string.
1737 TMP_REG_SRC2 is the register for loading the second string.
1738 RESULT_REG is the rtx for the result register.
1739 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
1740 to strcmp/strncmp if we have equality at the end of the inline comparison.
1741 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
1742 to clean up and generate the final comparison result.
1743 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
1744 set the final result. */
/* NOTE(review): mangled extraction -- original line numbers are fused into
   the text and some tokens (braces, `else` keywords, declarations such as
   `rtx offset_rtx`) were dropped.  Only comments changed here.  */
1746 expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare
,
1747 unsigned int base_align
,
1748 rtx orig_src1
, rtx orig_src2
,
1749 rtx tmp_reg_src1
, rtx tmp_reg_src2
, rtx result_reg
,
1750 bool equality_compare_rest
, rtx
*p_cleanup_label
,
1751 rtx final_move_label
)
1753 unsigned int word_mode_size
= GET_MODE_SIZE (word_mode
);
1754 machine_mode load_mode
;
1755 unsigned int load_mode_size
;
1756 unsigned HOST_WIDE_INT cmp_bytes
= 0;
1757 unsigned HOST_WIDE_INT offset
= 0;
1758 rtx src1_addr
= force_reg (Pmode
, XEXP (orig_src1
, 0));
1759 rtx src2_addr
= force_reg (Pmode
, XEXP (orig_src2
, 0));
1760 gcc_assert (p_cleanup_label
!= NULL
);
1761 rtx cleanup_label
= *p_cleanup_label
;
1763 while (bytes_to_compare
> 0)
1765 /* GPR compare sequence:
1766 check each 8B with: ld/ld/cmpb/cmpb/orc./bne
1768 cleanup code at end:
1769 cntlzd get bit of first zero/diff byte
1770 subfic convert for rldcl use
1771 rldcl rldcl extract diff/zero byte
1772 subf subtract for final result
1774 The last compare can branch around the cleanup code if the
1775 result is zero because the strings are exactly equal. */
1777 unsigned int align
= compute_current_alignment (base_align
, offset
);
1778 load_mode
= select_block_compare_mode (offset
, bytes_to_compare
, align
);
1779 load_mode_size
= GET_MODE_SIZE (load_mode
);
1780 if (bytes_to_compare
>= load_mode_size
)
1781 cmp_bytes
= load_mode_size
;
1782 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1784 /* Move this load back so it doesn't go past the end.
1785 P8/P9 can do this efficiently. */
1786 unsigned int extra_bytes
= load_mode_size
- bytes_to_compare
;
1787 cmp_bytes
= bytes_to_compare
;
1788 if (extra_bytes
< offset
)
1790 offset
-= extra_bytes
;
1791 cmp_bytes
= load_mode_size
;
1792 bytes_to_compare
= cmp_bytes
;
1796 /* P7 and earlier can't do the overlapping load trick fast,
1797 so this forces a non-overlapping load and a shift to get
1798 rid of the extra bytes. */
1799 cmp_bytes
= bytes_to_compare
;
/* NOTE(review): `offset_rtx` is used below without a visible declaration;
   its declaration line was presumably dropped by the extraction.  */
1802 if (BYTES_BIG_ENDIAN
|| TARGET_AVOID_XFORM
)
1803 offset_rtx
= GEN_INT (offset
);
1806 offset_rtx
= gen_reg_rtx (Pmode
);
1807 emit_move_insn (offset_rtx
, GEN_INT (offset
));
1809 rtx addr1
= gen_rtx_PLUS (Pmode
, src1_addr
, offset_rtx
);
1810 rtx addr2
= gen_rtx_PLUS (Pmode
, src2_addr
, offset_rtx
);
1812 do_load_for_compare_from_addr (load_mode
, tmp_reg_src1
, addr1
, orig_src1
);
1813 do_load_for_compare_from_addr (load_mode
, tmp_reg_src2
, addr2
, orig_src2
);
1815 /* We must always left-align the data we read, and
1816 clear any bytes to the right that are beyond the string.
1817 Otherwise the cmpb sequence won't produce the correct
1818 results. However if there is only one byte left, we
1819 can just subtract to get the final result so the shifts
1820 and clears are not needed. */
1822 unsigned HOST_WIDE_INT remain
= bytes_to_compare
- cmp_bytes
;
1824 /* Loading just a single byte is a special case. If we are
1825 loading more than that, we have to check whether we are
1826 looking at the entire chunk of data. If not, rotate left and
1827 clear right so that bytes we aren't supposed to look at are
1828 zeroed, and the first byte we are supposed to compare is
1830 if (load_mode_size
!= 1)
1832 if (load_mode_size
< word_mode_size
)
1834 /* Rotate left first. */
1835 rtx sh
= GEN_INT (BITS_PER_UNIT
1836 * (word_mode_size
- load_mode_size
));
1837 do_rotl3 (tmp_reg_src1
, tmp_reg_src1
, sh
);
1838 do_rotl3 (tmp_reg_src2
, tmp_reg_src2
, sh
);
1841 if (cmp_bytes
< word_mode_size
)
1843 /* Now clear right. This plus the rotate can be
1844 turned into a rldicr instruction. */
1845 HOST_WIDE_INT mb
= BITS_PER_UNIT
* (word_mode_size
- cmp_bytes
);
1846 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< mb
);
1847 do_and3 (tmp_reg_src1
, tmp_reg_src1
, mask
);
1848 do_and3 (tmp_reg_src2
, tmp_reg_src2
, mask
);
1852 /* Cases to handle. A and B are chunks of the two strings.
1853 1: Not end of comparison:
1854 A != B: branch to cleanup code to compute result.
1855 A == B: check for 0 byte, next block if not found.
1856 2: End of the inline comparison:
1857 A != B: branch to cleanup code to compute result.
1858 A == B: check for 0 byte, call strcmp/strncmp
1859 3: compared requested N bytes:
1860 A == B: branch to result 0.
1861 A != B: cleanup code to compute result. */
/* NOTE(review): `dst_label` below also lacks a visible declaration --
   presumably dropped by the extraction.  */
1864 if (remain
> 0 || equality_compare_rest
)
1866 /* Branch to cleanup code, otherwise fall through to do
1869 cleanup_label
= gen_label_rtx ();
1870 dst_label
= cleanup_label
;
1873 /* Branch to end and produce result of 0. */
1874 dst_label
= final_move_label
;
1876 if (load_mode_size
== 1)
1878 /* Special case for comparing just single byte. */
1879 if (equality_compare_rest
)
1881 /* Use subf./bne to branch to final_move_label if the
1882 byte differs, otherwise fall through to the strncmp
1883 call. We must also check for a zero byte here as we
1884 must not make the library call if this is the end of
1887 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_move_label
);
1888 rtx cond
= gen_reg_rtx (CCmode
);
1889 rtx diff_rtx
= gen_rtx_MINUS (word_mode
,
1890 tmp_reg_src1
, tmp_reg_src2
);
1891 rs6000_emit_dot_insn (result_reg
, diff_rtx
, 2, cond
);
1892 rtx cmp_rtx
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
1894 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
1896 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1897 JUMP_LABEL (j
) = final_move_label
;
1898 LABEL_NUSES (final_move_label
) += 1;
1900 /* Check for zero byte here before fall through to
1901 library call. This catches the case where the
1902 strings are equal and end in a zero byte at this
1905 rtx cond0
= gen_reg_rtx (CCmode
);
1906 emit_move_insn (cond0
, gen_rtx_COMPARE (CCmode
, tmp_reg_src1
,
1909 rtx cmp0eq_rtx
= gen_rtx_EQ (VOIDmode
, cond0
, const0_rtx
);
1911 rtx ifelse0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp0eq_rtx
,
1913 rtx j0
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse0
));
1914 JUMP_LABEL (j0
) = final_move_label
;
1915 LABEL_NUSES (final_move_label
) += 1;
1919 /* This is the last byte to be compared so we can use
1920 subf to compute the final result and branch
1921 unconditionally to final_move_label. */
1923 do_sub3 (result_reg
, tmp_reg_src1
, tmp_reg_src2
);
1925 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_move_label
);
1926 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
1927 JUMP_LABEL (j
) = final_move_label
;
1928 LABEL_NUSES (final_move_label
) += 1;
1934 rtx cmpb_zero
= gen_reg_rtx (word_mode
);
1935 rtx cmpb_diff
= gen_reg_rtx (word_mode
);
1936 rtx zero_reg
= gen_reg_rtx (word_mode
);
1937 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, dst_label
);
1938 rtx cond
= gen_reg_rtx (CCmode
);
/* cmpb_diff marks bytes equal between the strings; cmpb_zero marks
   zero bytes in src1; orc. of (~diff | zero) is nonzero iff some byte
   differs or the string ends in this chunk.  */
1940 emit_move_insn (zero_reg
, GEN_INT (0));
1941 do_cmpb3 (cmpb_diff
, tmp_reg_src1
, tmp_reg_src2
);
1942 do_cmpb3 (cmpb_zero
, tmp_reg_src1
, zero_reg
);
1943 rtx not_diff
= gen_rtx_NOT (word_mode
, cmpb_diff
);
1944 rtx orc_rtx
= gen_rtx_IOR (word_mode
, not_diff
, cmpb_zero
);
1946 rs6000_emit_dot_insn (result_reg
, orc_rtx
, 2, cond
);
1949 if (remain
== 0 && !equality_compare_rest
)
1950 cmp_rtx
= gen_rtx_EQ (VOIDmode
, cond
, const0_rtx
);
1952 cmp_rtx
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
1954 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
1956 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1957 JUMP_LABEL (j
) = dst_label
;
1958 LABEL_NUSES (dst_label
) += 1;
1961 offset
+= cmp_bytes
;
1962 bytes_to_compare
-= cmp_bytes
;
1965 *p_cleanup_label
= cleanup_label
;
1969 /* Generate the sequence of compares for strcmp/strncmp using vec/vsx
1972 BYTES_TO_COMPARE is the number of bytes to be compared.
1973 ORIG_SRC1 is the unmodified rtx for the first string.
1974 ORIG_SRC2 is the unmodified rtx for the second string.
1975 S1ADDR is the register to use for the base address of the first string.
1976 S2ADDR is the register to use for the base address of the second string.
1977 OFF_REG is the register to use for the string offset for loads.
1978 S1DATA is the register for loading the first string.
1979 S2DATA is the register for loading the second string.
1980 VEC_RESULT is the rtx for the vector result indicating the byte difference.
1981 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
1982 to strcmp/strncmp if we have equality at the end of the inline comparison.
1983 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up
1984 and generate the final comparison result.
1985 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
1986 set the final result. */
/* NOTE(review): mangled extraction -- original line numbers are fused into
   the text and some declarations (e.g. `int i`, `rtx zr[16]`, `dst_label`,
   `cmp_rtx`) were dropped.  Only comments changed here.  */
1988 expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare
,
1989 rtx orig_src1
, rtx orig_src2
,
1990 rtx s1addr
, rtx s2addr
, rtx off_reg
,
1991 rtx s1data
, rtx s2data
,
1992 rtx vec_result
, bool equality_compare_rest
,
1993 rtx
*p_cleanup_label
, rtx final_move_label
)
1995 machine_mode load_mode
;
1996 unsigned int load_mode_size
;
1997 unsigned HOST_WIDE_INT cmp_bytes
= 0;
1998 unsigned HOST_WIDE_INT offset
= 0;
2000 gcc_assert (p_cleanup_label
!= NULL
);
2001 rtx cleanup_label
= *p_cleanup_label
;
2003 emit_move_insn (s1addr
, force_reg (Pmode
, XEXP (orig_src1
, 0)));
2004 emit_move_insn (s2addr
, force_reg (Pmode
, XEXP (orig_src2
, 0)));
/* Build an all-zero V16QI register for the end-of-string (zero byte)
   comparisons below.  NOTE(review): the declarations of `i` and `zr`
   are not visible -- presumably dropped by the extraction.  */
2008 for (i
= 0; i
< 16; i
++)
2009 zr
[i
] = GEN_INT (0);
2010 rtvec zv
= gen_rtvec_v (16, zr
);
2011 rtx zero_reg
= gen_reg_rtx (V16QImode
);
2012 rs6000_expand_vector_init (zero_reg
, gen_rtx_PARALLEL (V16QImode
, zv
));
2014 while (bytes_to_compare
> 0)
2016 /* VEC/VSX compare sequence for P8:
2017 check each 16B with:
2020 vcmpequb 2,0,1 # compare strings
2021 vcmpequb 4,0,3 # compare w/ 0
2022 xxlorc 37,36,34 # first FF byte is either mismatch or end of string
2023 vcmpequb. 7,5,3 # reg 7 contains 0
2026 For the P8 LE case, we use lxvd2x and compare full 16 bytes
2027 but then use vgbbd and a shift to get two bytes with the
2028 information we need in the correct order.
2030 VEC/VSX compare sequence if TARGET_P9_VECTOR:
2031 lxvb16x/lxvb16x # load 16B of each string
2032 vcmpnezb. # produces difference location or zero byte location
2035 Use the overlapping compare trick for the last block if it is
2039 load_mode
= V16QImode
;
2040 load_mode_size
= GET_MODE_SIZE (load_mode
);
2042 if (bytes_to_compare
>= load_mode_size
)
2043 cmp_bytes
= load_mode_size
;
2046 /* Move this load back so it doesn't go past the end. P8/P9
2047 can do this efficiently. This is never called with less
2048 than 16 bytes so we should always be able to do this. */
2049 unsigned int extra_bytes
= load_mode_size
- bytes_to_compare
;
2050 cmp_bytes
= bytes_to_compare
;
2051 gcc_assert (offset
> extra_bytes
);
2052 offset
-= extra_bytes
;
2053 cmp_bytes
= load_mode_size
;
2054 bytes_to_compare
= cmp_bytes
;
2057 /* The offset currently used is always kept in off_reg so that the
2058 cleanup code on P8 can use it to extract the differing byte. */
2059 emit_move_insn (off_reg
, GEN_INT (offset
));
2061 rtx addr1
= gen_rtx_PLUS (Pmode
, s1addr
, off_reg
);
2062 do_load_for_compare_from_addr (load_mode
, s1data
, addr1
, orig_src1
);
2063 rtx addr2
= gen_rtx_PLUS (Pmode
, s2addr
, off_reg
);
2064 do_load_for_compare_from_addr (load_mode
, s2data
, addr2
, orig_src2
);
2066 /* Cases to handle. A and B are chunks of the two strings.
2067 1: Not end of comparison:
2068 A != B: branch to cleanup code to compute result.
2070 2: End of the inline comparison:
2071 A != B: branch to cleanup code to compute result.
2072 A == B: call strcmp/strncmp
2073 3: compared requested N bytes:
2074 A == B: branch to result 0.
2075 A != B: cleanup code to compute result. */
2077 unsigned HOST_WIDE_INT remain
= bytes_to_compare
- cmp_bytes
;
2079 if (TARGET_P9_VECTOR
)
2080 emit_insn (gen_vcmpnezb_p (vec_result
, s1data
, s2data
));
2083 /* Emit instructions to do comparison and zero check. */
2084 rtx cmp_res
= gen_reg_rtx (load_mode
);
2085 rtx cmp_zero
= gen_reg_rtx (load_mode
);
2086 rtx cmp_combined
= gen_reg_rtx (load_mode
);
2087 emit_insn (gen_altivec_eqv16qi (cmp_res
, s1data
, s2data
));
2088 emit_insn (gen_altivec_eqv16qi (cmp_zero
, s1data
, zero_reg
));
2089 emit_insn (gen_orcv16qi3 (vec_result
, cmp_zero
, cmp_res
));
2090 emit_insn (gen_altivec_vcmpequb_p (cmp_combined
, vec_result
, zero_reg
));
2093 bool branch_to_cleanup
= (remain
> 0 || equality_compare_rest
);
2094 rtx cr6
= gen_rtx_REG (CCmode
, CR6_REGNO
);
2097 if (branch_to_cleanup
)
2099 /* Branch to cleanup code, otherwise fall through to do more
2100 compares. P8 and P9 use different CR bits because on P8
2101 we are looking at the result of a comparison vs a
2102 register of zeroes so the all-true condition means no
2103 difference or zero was found. On P9, vcmpnezb sets a byte
2104 to 0xff if there is a mismatch or zero, so the all-false
2105 condition indicates we found no difference or zero. */
2107 cleanup_label
= gen_label_rtx ();
2108 dst_label
= cleanup_label
;
2109 if (TARGET_P9_VECTOR
)
2110 cmp_rtx
= gen_rtx_NE (VOIDmode
, cr6
, const0_rtx
);
2112 cmp_rtx
= gen_rtx_GE (VOIDmode
, cr6
, const0_rtx
);
2116 /* Branch to final return or fall through to cleanup,
2117 result is already set to 0. */
2118 dst_label
= final_move_label
;
2119 if (TARGET_P9_VECTOR
)
2120 cmp_rtx
= gen_rtx_EQ (VOIDmode
, cr6
, const0_rtx
);
2122 cmp_rtx
= gen_rtx_LT (VOIDmode
, cr6
, const0_rtx
);
2125 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, dst_label
);
2126 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
2128 rtx j2
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
2129 JUMP_LABEL (j2
) = dst_label
;
2130 LABEL_NUSES (dst_label
) += 1;
2132 offset
+= cmp_bytes
;
2133 bytes_to_compare
-= cmp_bytes
;
2135 *p_cleanup_label
= cleanup_label
;
2139 /* Generate the final sequence that identifies the differing
2140 byte and generates the final result, taking into account
2143 cntlzd get bit of first zero/diff byte
2144 addi convert for rldcl use
2145 rldcl rldcl extract diff/zero byte
2146 subf subtract for final result
2148 STR1 is the reg rtx for data from string 1.
2149 STR2 is the reg rtx for data from string 2.
2150 RESULT is the reg rtx for the comparison result. */
/* NOTE(review): mangled extraction -- original line numbers are fused in
   and braces were dropped.  Only comments changed here.  */
2153 emit_final_str_compare_gpr (rtx str1
, rtx str2
, rtx result
)
2155 machine_mode m
= GET_MODE (str1
);
2156 rtx rot_amt
= gen_reg_rtx (m
);
2158 rtx rot1_1
= gen_reg_rtx (m
);
2159 rtx rot1_2
= gen_reg_rtx (m
);
2160 rtx rot2_1
= gen_reg_rtx (m
);
2161 rtx rot2_2
= gen_reg_rtx (m
);
/* SImode arm: count leading zeros of the orc. result to locate the first
   zero/diff byte, rotate both strings so that byte lands in the low 8
   bits, mask to one byte, and subtract for the signed result.
   NOTE(review): the leading `if (m == SImode)` test is not visible --
   presumably dropped by the extraction (the `else if (m == DImode)`
   below implies it).  */
2165 emit_insn (gen_clzsi2 (rot_amt
, result
));
2166 emit_insn (gen_addsi3 (rot_amt
, rot_amt
, GEN_INT (8)));
2167 emit_insn (gen_rotlsi3 (rot1_1
, str1
,
2168 gen_lowpart (SImode
, rot_amt
)));
2169 emit_insn (gen_andsi3_mask (rot1_2
, rot1_1
, GEN_INT (0xff)));
2170 emit_insn (gen_rotlsi3 (rot2_1
, str2
,
2171 gen_lowpart (SImode
, rot_amt
)));
2172 emit_insn (gen_andsi3_mask (rot2_2
, rot2_1
, GEN_INT (0xff)));
2173 emit_insn (gen_subsi3 (result
, rot1_2
, rot2_2
));
2175 else if (m
== DImode
)
2177 emit_insn (gen_clzdi2 (rot_amt
, result
));
2178 emit_insn (gen_adddi3 (rot_amt
, rot_amt
, GEN_INT (8)));
2179 emit_insn (gen_rotldi3 (rot1_1
, str1
,
2180 gen_lowpart (SImode
, rot_amt
)));
2181 emit_insn (gen_anddi3_mask (rot1_2
, rot1_1
, GEN_INT (0xff)));
2182 emit_insn (gen_rotldi3 (rot2_1
, str2
,
2183 gen_lowpart (SImode
, rot_amt
)));
2184 emit_insn (gen_anddi3_mask (rot2_2
, rot2_1
, GEN_INT (0xff)));
2185 emit_insn (gen_subdi3 (result
, rot1_2
, rot2_2
));
2193 /* Generate the final sequence that identifies the differing
2194 byte and generates the final result, taking into account
2201 addi 10,9,-1 # count trailing zero bits
2204 lbzx 10,28,9 # use that offset to load differing byte
2206 subf 3,3,10 # subtract for final result
2209 vclzlsbb # counts trailing bytes with lsb=0
2210 vextublx # extract differing byte
2212 STR1 is the reg rtx for data from string 1.
2213 STR2 is the reg rtx for data from string 2.
2214 RESULT is the reg rtx for the comparison result.
2215 S1ADDR is the register to use for the base address of the first string.
2216 S2ADDR is the register to use for the base address of the second string.
2217 ORIG_SRC1 is the unmodified rtx for the first string.
2218 ORIG_SRC2 is the unmodified rtx for the second string.
2219 OFF_REG is the register to use for the string offset for loads.
2220 VEC_RESULT is the rtx for the vector result indicating the byte difference.
2224 emit_final_str_compare_vec (rtx str1
, rtx str2
, rtx result
,
2225 rtx s1addr
, rtx s2addr
,
2226 rtx orig_src1
, rtx orig_src2
,
2227 rtx off_reg
, rtx vec_result
)
2229 if (TARGET_P9_VECTOR
)
2231 rtx diffix
= gen_reg_rtx (SImode
);
2232 rtx chr1
= gen_reg_rtx (SImode
);
2233 rtx chr2
= gen_reg_rtx (SImode
);
2234 rtx chr1_di
= simplify_gen_subreg (DImode
, chr1
, SImode
, 0);
2235 rtx chr2_di
= simplify_gen_subreg (DImode
, chr2
, SImode
, 0);
2236 emit_insn (gen_vclzlsbb_v16qi (diffix
, vec_result
));
2237 emit_insn (gen_vextublx (chr1
, diffix
, str1
));
2238 emit_insn (gen_vextublx (chr2
, diffix
, str2
));
2239 do_sub3 (result
, chr1_di
, chr2_di
);
2243 gcc_assert (TARGET_P8_VECTOR
);
2244 rtx diffix
= gen_reg_rtx (DImode
);
2245 rtx result_gbbd
= gen_reg_rtx (V16QImode
);
2246 /* Since each byte of the input is either 00 or FF, the bytes in
2247 dw0 and dw1 after vgbbd are all identical to each other. */
2248 emit_insn (gen_p8v_vgbbd (result_gbbd
, vec_result
));
2249 /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
2250 For BE, we shift by 7 and get AB in the high two bytes then CLZ. */
2251 rtx result_shifted
= gen_reg_rtx (V16QImode
);
2252 int shift_amt
= (BYTES_BIG_ENDIAN
) ? 7 : 9;
2253 emit_insn (gen_altivec_vsldoi_v16qi (result_shifted
,result_gbbd
,result_gbbd
, GEN_INT (shift_amt
)));
2255 rtx diffix_df
= simplify_gen_subreg (DFmode
, diffix
, DImode
, 0);
2256 emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df
, result_shifted
));
2257 rtx count
= gen_reg_rtx (DImode
);
2259 if (BYTES_BIG_ENDIAN
)
2260 emit_insn (gen_clzdi2 (count
, diffix
));
2262 emit_insn (gen_ctzdi2 (count
, diffix
));
2264 /* P8 doesn't have a good solution for extracting one byte from
2265 a vsx reg like vextublx on P9 so we just compute the offset
2266 of the differing byte and load it from each string. */
2267 do_add3 (off_reg
, off_reg
, count
);
2269 rtx chr1
= gen_reg_rtx (QImode
);
2270 rtx chr2
= gen_reg_rtx (QImode
);
2271 rtx addr1
= gen_rtx_PLUS (Pmode
, s1addr
, off_reg
);
2272 do_load_for_compare_from_addr (QImode
, chr1
, addr1
, orig_src1
);
2273 rtx addr2
= gen_rtx_PLUS (Pmode
, s2addr
, off_reg
);
2274 do_load_for_compare_from_addr (QImode
, chr2
, addr2
, orig_src2
);
2275 machine_mode rmode
= GET_MODE (result
);
2276 rtx chr1_rm
= simplify_gen_subreg (rmode
, chr1
, QImode
, 0);
2277 rtx chr2_rm
= simplify_gen_subreg (rmode
, chr2
, QImode
, 0);
2278 do_sub3 (result
, chr1_rm
, chr2_rm
);
2284 /* Expand a string compare operation with length, and return
2285 true if successful. Return false if we should let the
2286 compiler generate normal code, probably a strncmp call.
2288 OPERANDS[0] is the target (result).
2289 OPERANDS[1] is the first source.
2290 OPERANDS[2] is the second source.
2291 If NO_LENGTH is zero, then:
2292 OPERANDS[3] is the length.
2293 OPERANDS[4] is the alignment in bytes.
2294 If NO_LENGTH is nonzero, then:
2295 OPERANDS[3] is the alignment in bytes. */
2297 expand_strn_compare (rtx operands
[], int no_length
)
2299 rtx target
= operands
[0];
2300 rtx orig_src1
= operands
[1];
2301 rtx orig_src2
= operands
[2];
2302 rtx bytes_rtx
, align_rtx
;
2306 align_rtx
= operands
[3];
2310 bytes_rtx
= operands
[3];
2311 align_rtx
= operands
[4];
2314 rtx src1_addr
= force_reg (Pmode
, XEXP (orig_src1
, 0));
2315 rtx src2_addr
= force_reg (Pmode
, XEXP (orig_src2
, 0));
2317 /* If we have a length, it must be constant. This simplifies things
2318 a bit as we don't have to generate code to check if we've exceeded
2319 the length. Later this could be expanded to handle this case. */
2320 if (!no_length
&& !CONST_INT_P (bytes_rtx
))
2323 /* This must be a fixed size alignment. */
2324 if (!CONST_INT_P (align_rtx
))
2327 unsigned int base_align
= UINTVAL (align_rtx
);
2328 unsigned int align1
= MEM_ALIGN (orig_src1
) / BITS_PER_UNIT
;
2329 unsigned int align2
= MEM_ALIGN (orig_src2
) / BITS_PER_UNIT
;
2331 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */
2332 if (targetm
.slow_unaligned_access (word_mode
, align1
)
2333 || targetm
.slow_unaligned_access (word_mode
, align2
))
2336 gcc_assert (GET_MODE (target
) == SImode
);
2338 unsigned int required_align
= 8;
2340 unsigned HOST_WIDE_INT offset
= 0;
2341 unsigned HOST_WIDE_INT bytes
; /* N from the strncmp args if available. */
2342 unsigned HOST_WIDE_INT compare_length
; /* How much to compare inline. */
2345 bytes
= rs6000_string_compare_inline_limit
;
2347 bytes
= UINTVAL (bytes_rtx
);
2349 /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at
2350 least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2351 at least POWER8. That way we can rely on overlapping compares to
2352 do the final comparison of less than 16 bytes. Also I do not
2353 want to deal with making this work for 32 bits. In addition, we
2354 have to make sure that we have at least P8_VECTOR (we don't allow
2355 P9_VECTOR without P8_VECTOR). */
2356 int use_vec
= (bytes
>= 16 && !TARGET_32BIT
2357 && TARGET_EFFICIENT_UNALIGNED_VSX
&& TARGET_P8_VECTOR
);
2360 required_align
= 16;
2362 machine_mode load_mode
;
2363 rtx tmp_reg_src1
, tmp_reg_src2
;
2366 load_mode
= V16QImode
;
2367 tmp_reg_src1
= gen_reg_rtx (V16QImode
);
2368 tmp_reg_src2
= gen_reg_rtx (V16QImode
);
2372 load_mode
= select_block_compare_mode (0, bytes
, base_align
);
2373 tmp_reg_src1
= gen_reg_rtx (word_mode
);
2374 tmp_reg_src2
= gen_reg_rtx (word_mode
);
2377 compare_length
= rs6000_string_compare_inline_limit
;
2379 /* If we have equality at the end of the last compare and we have not
2380 found the end of the string, we need to call strcmp/strncmp to
2381 compare the remainder. */
2382 bool equality_compare_rest
= false;
2386 bytes
= compare_length
;
2387 equality_compare_rest
= true;
2391 if (bytes
<= compare_length
)
2392 compare_length
= bytes
;
2394 equality_compare_rest
= true;
2397 rtx result_reg
= gen_reg_rtx (word_mode
);
2398 rtx final_move_label
= gen_label_rtx ();
2399 rtx final_label
= gen_label_rtx ();
2400 rtx begin_compare_label
= NULL
;
2402 if (base_align
< required_align
)
2404 /* Generate code that checks distance to 4k boundary for this case. */
2405 begin_compare_label
= gen_label_rtx ();
2406 rtx strncmp_label
= gen_label_rtx ();
2409 /* Strncmp for power8 in glibc does this:
2411 cmpldi cr7,r8,4096-16
2412 bgt cr7,L(pagecross) */
2414 /* Make sure that the length we use for the alignment test and
2415 the subsequent code generation are in agreement so we do not
2416 go past the length we tested for a 4k boundary crossing. */
2417 unsigned HOST_WIDE_INT align_test
= compare_length
;
2418 if (align_test
< required_align
)
2420 align_test
= HOST_WIDE_INT_1U
<< ceil_log2 (align_test
);
2421 base_align
= align_test
;
2425 align_test
= ROUND_UP (align_test
, required_align
);
2426 base_align
= required_align
;
2429 if (align1
< required_align
)
2430 expand_strncmp_align_check (strncmp_label
, src1_addr
, align_test
);
2431 if (align2
< required_align
)
2432 expand_strncmp_align_check (strncmp_label
, src2_addr
, align_test
);
2434 /* Now generate the following sequence:
2435 - branch to begin_compare
2438 - branch to final_label
2439 - begin_compare_label */
2441 rtx cmp_ref
= gen_rtx_LABEL_REF (VOIDmode
, begin_compare_label
);
2442 jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, cmp_ref
));
2443 JUMP_LABEL (jmp
) = begin_compare_label
;
2444 LABEL_NUSES (begin_compare_label
) += 1;
2447 emit_label (strncmp_label
);
2451 tree fun
= builtin_decl_explicit (BUILT_IN_STRCMP
);
2452 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2453 target
, LCT_NORMAL
, GET_MODE (target
),
2454 force_reg (Pmode
, src1_addr
), Pmode
,
2455 force_reg (Pmode
, src2_addr
), Pmode
);
2459 /* -m32 -mpowerpc64 results in word_mode being DImode even
2460 though otherwise it is 32-bit. The length arg to strncmp
2461 is a size_t which will be the same size as pointers. */
2462 rtx len_rtx
= gen_reg_rtx (Pmode
);
2463 emit_move_insn (len_rtx
, gen_int_mode (bytes
, Pmode
));
2465 tree fun
= builtin_decl_explicit (BUILT_IN_STRNCMP
);
2466 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2467 target
, LCT_NORMAL
, GET_MODE (target
),
2468 force_reg (Pmode
, src1_addr
), Pmode
,
2469 force_reg (Pmode
, src2_addr
), Pmode
,
2473 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
2474 jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
2475 JUMP_LABEL (jmp
) = final_label
;
2476 LABEL_NUSES (final_label
) += 1;
2478 emit_label (begin_compare_label
);
2481 rtx cleanup_label
= NULL
;
2482 rtx s1addr
= NULL
, s2addr
= NULL
, off_reg
= NULL
, vec_result
= NULL
;
2484 /* Generate a sequence of GPR or VEC/VSX instructions to compare out
2485 to the length specified. */
2488 s1addr
= gen_reg_rtx (Pmode
);
2489 s2addr
= gen_reg_rtx (Pmode
);
2490 off_reg
= gen_reg_rtx (Pmode
);
2491 vec_result
= gen_reg_rtx (load_mode
);
2492 emit_move_insn (result_reg
, GEN_INT (0));
2493 expand_strncmp_vec_sequence (compare_length
,
2494 orig_src1
, orig_src2
,
2495 s1addr
, s2addr
, off_reg
,
2496 tmp_reg_src1
, tmp_reg_src2
,
2498 equality_compare_rest
,
2499 &cleanup_label
, final_move_label
);
2502 expand_strncmp_gpr_sequence (compare_length
, base_align
,
2503 orig_src1
, orig_src2
,
2504 tmp_reg_src1
, tmp_reg_src2
,
2506 equality_compare_rest
,
2507 &cleanup_label
, final_move_label
);
2509 offset
= compare_length
;
2511 if (equality_compare_rest
)
2513 /* Update pointers past what has been compared already. */
2514 rtx src1
= force_reg (Pmode
,
2515 gen_rtx_PLUS (Pmode
, src1_addr
, GEN_INT (offset
)));
2516 rtx src2
= force_reg (Pmode
,
2517 gen_rtx_PLUS (Pmode
, src2_addr
, GEN_INT (offset
)));
2519 /* Construct call to strcmp/strncmp to compare the rest of the string. */
2522 tree fun
= builtin_decl_explicit (BUILT_IN_STRCMP
);
2523 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2524 target
, LCT_NORMAL
, GET_MODE (target
),
2525 src1
, Pmode
, src2
, Pmode
);
2529 rtx len_rtx
= gen_reg_rtx (Pmode
);
2530 emit_move_insn (len_rtx
, gen_int_mode (bytes
- compare_length
, Pmode
));
2531 tree fun
= builtin_decl_explicit (BUILT_IN_STRNCMP
);
2532 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2533 target
, LCT_NORMAL
, GET_MODE (target
),
2534 src1
, Pmode
, src2
, Pmode
, len_rtx
, Pmode
);
2537 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
2538 rtx jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
2539 JUMP_LABEL (jmp
) = final_label
;
2540 LABEL_NUSES (final_label
) += 1;
2545 emit_label (cleanup_label
);
2548 emit_final_str_compare_vec (tmp_reg_src1
, tmp_reg_src2
, result_reg
,
2549 s1addr
, s2addr
, orig_src1
, orig_src2
,
2550 off_reg
, vec_result
);
2552 emit_final_str_compare_gpr (tmp_reg_src1
, tmp_reg_src2
, result_reg
);
2554 emit_label (final_move_label
);
2555 emit_insn (gen_movsi (target
,
2556 gen_lowpart (SImode
, result_reg
)));
2557 emit_label (final_label
);
2561 /* Expand a block move operation, and return 1 if successful. Return 0
2562 if we should let the compiler generate normal code.
2564 operands[0] is the destination
2565 operands[1] is the source
2566 operands[2] is the length
2567 operands[3] is the alignment */
2569 #define MAX_MOVE_REG 4
2572 expand_block_move (rtx operands
[])
2574 rtx orig_dest
= operands
[0];
2575 rtx orig_src
= operands
[1];
2576 rtx bytes_rtx
= operands
[2];
2577 rtx align_rtx
= operands
[3];
2578 int constp
= (GET_CODE (bytes_rtx
) == CONST_INT
);
2583 rtx stores
[MAX_MOVE_REG
];
2586 /* If this is not a fixed size move, just call memcpy */
2590 /* This must be a fixed size alignment */
2591 gcc_assert (GET_CODE (align_rtx
) == CONST_INT
);
2592 align
= INTVAL (align_rtx
) * BITS_PER_UNIT
;
2594 /* Anything to move? */
2595 bytes
= INTVAL (bytes_rtx
);
2599 if (bytes
> rs6000_block_move_inline_limit
)
2602 for (offset
= 0; bytes
> 0; offset
+= move_bytes
, bytes
-= move_bytes
)
2605 rtx (*movmemsi
) (rtx
, rtx
, rtx
, rtx
);
2606 rtx (*mov
) (rtx
, rtx
);
2608 machine_mode mode
= BLKmode
;
2611 /* Altivec first, since it will be faster than a string move
2612 when it applies, and usually not significantly larger. */
2613 if (TARGET_ALTIVEC
&& bytes
>= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX
|| align
>= 128))
2617 gen_func
.mov
= gen_movv4si
;
2619 else if (bytes
>= 8 && TARGET_POWERPC64
2620 && (align
>= 64 || !STRICT_ALIGNMENT
))
2624 gen_func
.mov
= gen_movdi
;
2625 if (offset
== 0 && align
< 64)
2629 /* If the address form is reg+offset with offset not a
2630 multiple of four, reload into reg indirect form here
2631 rather than waiting for reload. This way we get one
2632 reload, not one per load and/or store. */
2633 addr
= XEXP (orig_dest
, 0);
2634 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
2635 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
2636 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
2638 addr
= copy_addr_to_reg (addr
);
2639 orig_dest
= replace_equiv_address (orig_dest
, addr
);
2641 addr
= XEXP (orig_src
, 0);
2642 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
2643 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
2644 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
2646 addr
= copy_addr_to_reg (addr
);
2647 orig_src
= replace_equiv_address (orig_src
, addr
);
2651 else if (bytes
>= 4 && (align
>= 32 || !STRICT_ALIGNMENT
))
2652 { /* move 4 bytes */
2655 gen_func
.mov
= gen_movsi
;
2657 else if (bytes
>= 2 && (align
>= 16 || !STRICT_ALIGNMENT
))
2658 { /* move 2 bytes */
2661 gen_func
.mov
= gen_movhi
;
2663 else /* move 1 byte at a time */
2667 gen_func
.mov
= gen_movqi
;
2670 src
= adjust_address (orig_src
, mode
, offset
);
2671 dest
= adjust_address (orig_dest
, mode
, offset
);
2673 if (mode
!= BLKmode
)
2675 rtx tmp_reg
= gen_reg_rtx (mode
);
2677 emit_insn ((*gen_func
.mov
) (tmp_reg
, src
));
2678 stores
[num_reg
++] = (*gen_func
.mov
) (dest
, tmp_reg
);
2681 if (mode
== BLKmode
|| num_reg
>= MAX_MOVE_REG
|| bytes
== move_bytes
)
2684 for (i
= 0; i
< num_reg
; i
++)
2685 emit_insn (stores
[i
]);
2689 if (mode
== BLKmode
)
2691 /* Move the address into scratch registers. The movmemsi
2692 patterns require zero offset. */
2693 if (!REG_P (XEXP (src
, 0)))
2695 rtx src_reg
= copy_addr_to_reg (XEXP (src
, 0));
2696 src
= replace_equiv_address (src
, src_reg
);
2698 set_mem_size (src
, move_bytes
);
2700 if (!REG_P (XEXP (dest
, 0)))
2702 rtx dest_reg
= copy_addr_to_reg (XEXP (dest
, 0));
2703 dest
= replace_equiv_address (dest
, dest_reg
);
2705 set_mem_size (dest
, move_bytes
);
2707 emit_insn ((*gen_func
.movmemsi
) (dest
, src
,
2708 GEN_INT (move_bytes
& 31),