From 47bc734289bd1af7f7e99a3c49620241ca79b5f4 Mon Sep 17 00:00:00 2001
From: ams
Date: Thu, 17 Jan 2019 12:31:28 +0000
Subject: [PATCH] GCN machine description

This patch contains the machine description portion of the GCN back-end.
I've broken it out mainly to avoid the mailing list size limit.

2019-01-17  Andrew Stubbs
	    Kwok Cheung Yeung
	    Julian Brown
	    Tom de Vries
	    Jan Hubicka
	    Martin Jambor

	gcc/
	* config/gcn/constraints.md: New file.
	* config/gcn/gcn-valu.md: New file.
	* config/gcn/gcn.md: New file.
	* config/gcn/predicates.md: New file.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@268022 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog                 |   12 +
 gcc/config/gcn/constraints.md |  139 ++
 gcc/config/gcn/gcn-valu.md    | 3049 +++++++++++++++++++++++++++++++++++++++++
 gcc/config/gcn/gcn.md         | 1857 +++++++++++++++++++++++++
 gcc/config/gcn/predicates.md  |  199 +++
 5 files changed, 5256 insertions(+)
 create mode 100644 gcc/config/gcn/constraints.md
 create mode 100644 gcc/config/gcn/gcn-valu.md
 create mode 100644 gcc/config/gcn/gcn.md
 create mode 100644 gcc/config/gcn/predicates.md

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d5dde3f90d1..12489cdf838 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2019-01-17  Andrew Stubbs
+	    Kwok Cheung Yeung
+	    Julian Brown
+	    Tom de Vries
+	    Jan Hubicka
+	    Martin Jambor
+
+	* config/gcn/constraints.md: New file.
+	* config/gcn/gcn-valu.md: New file.
+	* config/gcn/gcn.md: New file.
+	* config/gcn/predicates.md: New file.
+
 2019-01-17  Eric Botcazou
 
 	* gimple-ssa-isolate-paths.c (stmt_uses_name_in_undefined_way): Replace
diff --git a/gcc/config/gcn/constraints.md b/gcc/config/gcn/constraints.md
new file mode 100644
index 00000000000..08ba76afc00
--- /dev/null
+++ b/gcc/config/gcn/constraints.md
@@ -0,0 +1,139 @@
+;; Constraint definitions for GCN.
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
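+
+;; A note on usage: the constraint letters defined below are combined into
+;; operand constraint strings by the insn patterns in gcn.md and
+;; gcn-valu.md.  For example, the "vSvB" string on the second source
+;; operand of addv64si3 (in gcn-valu.md) accepts a VGPR ("v"), an SGPR
+;; usable as a VOP3A source ("Sv"), or a 32-bit immediate ("B").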
+
+(define_constraint "I"
+  "Inline integer constant"
+  (and (match_code "const_int")
+       (match_test "ival >= -16 && ival <= 64")))
+
+(define_constraint "J"
+  "Signed integer 16-bit inline constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival + 0x8000) < 0x10000")))
+
+(define_constraint "Kf"
+  "Immediate constant -1"
+  (and (match_code "const_int")
+       (match_test "ival == -1")))
+
+(define_constraint "L"
+  "Unsigned integer 15-bit constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival) < 0x8000")))
+
+(define_constraint "A"
+  "Inline immediate parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant_p (op)")))
+
+(define_constraint "B"
+  "Immediate 32-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_constant_p (op)")))
+
+(define_constraint "C"
+  "Immediate 32-bit parameter zero-extended to 64 bits"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_constant64_p (op)")))
+
+(define_constraint "DA"
+  "Splittable inline immediate 64-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant64_p (op)")))
+
+(define_constraint "DB"
+  "Splittable immediate 64-bit parameter"
+  (match_code "const_int,const_double,const_vector"))
+
+(define_constraint "U"
+  "Unspecified value"
+  (match_code "unspec"))
+
+(define_constraint "Y"
+  "Symbol or label for relative calls"
+  (match_code "symbol_ref,label_ref"))
+
+(define_register_constraint "v" "VGPR_REGS"
+  "VGPR registers")
+
+(define_register_constraint "Sg" "SGPR_REGS"
+  "SGPR registers")
+
+(define_register_constraint "SD" "SGPR_DST_REGS"
+  "registers usable as the destination of a scalar operation")
+
+(define_register_constraint "SS" "SGPR_SRC_REGS"
+  "registers usable as the source of a scalar operation")
+
+(define_register_constraint "Sm" "SGPR_MEM_SRC_REGS"
+  "registers usable as the source of a scalar memory operation")
+
+(define_register_constraint "Sv" "SGPR_VOP_SRC_REGS"
+  "registers usable as the source of a VOP3A instruction")
+
+(define_register_constraint "ca" "ALL_CONDITIONAL_REGS"
+  "SCC, VCCZ or EXECZ")
+
+(define_register_constraint "cs" "SCC_CONDITIONAL_REG"
+  "SCC")
+
+(define_register_constraint "cV" "VCC_CONDITIONAL_REG"
+  "VCC")
+
+(define_register_constraint "e" "EXEC_MASK_REG"
+  "EXEC")
+
+(define_special_memory_constraint "RB"
+  "Buffer memory address to scratch memory."
+  (and (match_code "mem")
+       (match_test "AS_SCRATCH_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RF"
+  "Buffer memory address to flat memory."
+  (and (match_code "mem")
+       (match_test "AS_FLAT_P (MEM_ADDR_SPACE (op))
+		    && gcn_flat_address_p (XEXP (op, 0), mode)")))
+
+(define_special_memory_constraint "RS"
+  "Buffer memory address to scalar flat memory."
+  (and (match_code "mem")
+       (match_test "AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op))
+		    && gcn_scalar_flat_mem_p (op)")))
+
+(define_special_memory_constraint "RL"
+  "Buffer memory address to LDS memory."
+  (and (match_code "mem")
+       (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RG"
+  "Buffer memory address to GDS memory."
+  (and (match_code "mem")
+       (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RD"
+  "Buffer memory address to GDS or LDS memory."
+  (and (match_code "mem")
+       (ior (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")
+	    (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))"))))
+
+(define_special_memory_constraint "RM"
+  "Memory address to global (main) memory."
+  (and (match_code "mem")
+       (match_test "AS_GLOBAL_P (MEM_ADDR_SPACE (op))
+		    && gcn_global_address_p (XEXP (op, 0))")))
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
new file mode 100644
index 00000000000..3cc59dd1cd6
--- /dev/null
+++ b/gcc/config/gcn/gcn-valu.md
@@ -0,0 +1,3049 @@
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; {{{ Vector iterators
+
+; Vector modes for one vector register
+(define_mode_iterator VEC_1REG_MODE
+		      [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator VEC_1REG_ALT
+		      [V64QI V64HI V64SI V64HF V64SF])
+
+(define_mode_iterator VEC_1REG_INT_MODE
+		      [V64QI V64HI V64SI])
+(define_mode_iterator VEC_1REG_INT_ALT
+		      [V64QI V64HI V64SI])
+
+; Vector modes for two vector registers
+(define_mode_iterator VEC_2REG_MODE
+		      [V64DI V64DF])
+
+; All of the above
+(define_mode_iterator VEC_REG_MODE
+		      [V64QI V64HI V64SI V64HF V64SF   ; Single reg
+		       V64DI V64DF])		       ; Double reg
+
+(define_mode_attr scalar_mode
+  [(V64QI "qi") (V64HI "hi") (V64SI "si")
+   (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
+
+(define_mode_attr SCALAR_MODE
+  [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+   (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
+
+;; }}}
+;; {{{ Substitutions
+
+(define_subst_attr "exec" "vec_merge"
+		   "" "_exec")
+(define_subst_attr "exec_clobber" "vec_merge_with_clobber"
+		   "" "_exec")
+(define_subst_attr "exec_vcc" "vec_merge_with_vcc"
+		   "" "_exec")
+(define_subst_attr "exec_scatter" "scatter_store"
+		   "" "_exec")
+
+(define_subst "vec_merge"
+  [(set (match_operand:VEC_REG_MODE 0)
+	(match_operand:VEC_REG_MODE 1))]
+  ""
+  [(set (match_dup 0)
+	(vec_merge:VEC_REG_MODE
+	  (match_dup 1)
+	  (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+	  (match_operand:DI 4 "gcn_exec_reg_operand" "e")))])
+
+(define_subst "vec_merge_with_clobber"
+  [(set (match_operand:VEC_REG_MODE 0)
+	(match_operand:VEC_REG_MODE 1))
+   (clobber (match_operand 2))]
+  ""
+  [(set (match_dup 0)
+	(vec_merge:VEC_REG_MODE
+	  (match_dup 1)
+	  (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+	  (match_operand:DI 4 "gcn_exec_reg_operand" "e")))
+   (clobber (match_dup 2))])
+
+(define_subst "vec_merge_with_vcc"
+  [(set (match_operand:VEC_REG_MODE 0)
+	(match_operand:VEC_REG_MODE 1))
+   (set (match_operand:DI 2)
+	(match_operand:DI 3))]
+  ""
+  [(parallel
+     [(set (match_dup 0)
+	   (vec_merge:VEC_REG_MODE
+	     (match_dup 1)
+	     (match_operand:VEC_REG_MODE 4
+					 "gcn_register_or_unspec_operand" "U0")
+	     (match_operand:DI 5 "gcn_exec_reg_operand" "e")))
+      (set (match_dup 2)
+	   (and:DI (match_dup 3)
+		   (reg:DI EXEC_REG)))])])
+
+(define_subst "scatter_store" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand 0) + 
(match_operand 1) + (match_operand 2) + (match_operand 3)] + UNSPEC_SCATTER))] + "" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_dup 0) + (match_dup 1) + (match_dup 2) + (match_dup 3) + (match_operand:DI 4 "gcn_exec_reg_operand" "e")] + UNSPEC_SCATTER))]) + +;; }}} +;; {{{ Vector moves + +; This is the entry point for all vector register moves. Memory accesses can +; come this way also, but will more usually use the reload_in/out, +; gather/scatter, maskload/store, etc. + +(define_expand "mov" + [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand") + (match_operand:VEC_REG_MODE 1 "general_operand"))] + "" + { + if (MEM_P (operands[0]) && !lra_in_progress && !reload_completed) + { + operands[1] = force_reg (mode, operands[1]); + rtx scratch = gen_rtx_SCRATCH (V64DImode); + rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0])); + rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0])); + rtx expr = gcn_expand_scalar_to_vector_address (mode, NULL, + operands[0], + scratch); + emit_insn (gen_scatter_expr (expr, operands[1], a, v)); + DONE; + } + else if (MEM_P (operands[1]) && !lra_in_progress && !reload_completed) + { + rtx scratch = gen_rtx_SCRATCH (V64DImode); + rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1])); + rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1])); + rtx expr = gcn_expand_scalar_to_vector_address (mode, NULL, + operands[1], + scratch); + emit_insn (gen_gather_expr (operands[0], expr, a, v)); + DONE; + } + else if ((MEM_P (operands[0]) || MEM_P (operands[1]))) + { + gcc_assert (!reload_completed); + rtx scratch = gen_reg_rtx (V64DImode); + emit_insn (gen_mov_sgprbase (operands[0], operands[1], scratch)); + DONE; + } + }) + +; A pseudo instruction that helps LRA use the "U0" constraint. + +(define_insn "mov_unspec" + [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand" "=v") + (match_operand:VEC_REG_MODE 1 "gcn_unspec_operand" " U"))] + "" + "" + [(set_attr "type" "unknown") + (set_attr "length" "0")]) + +(define_insn "*mov" + [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v") + (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B"))] + "" + "v_mov_b32\t%0, %1" + [(set_attr "type" "vop1,vop1") + (set_attr "length" "4,8")]) + +(define_insn "mov_exec" + [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" + "=v, v, v, v, v, m") + (vec_merge:VEC_1REG_MODE + (match_operand:VEC_1REG_MODE 1 "general_operand" + "vA, B, v,vA, m, v") + (match_operand:VEC_1REG_MODE 3 "gcn_alu_or_unspec_operand" + "U0,U0,vA,vA,U0,U0") + (match_operand:DI 2 "register_operand" " e, e,cV,Sv, e, e"))) + (clobber (match_scratch:V64DI 4 "=X, X, X, X,&v,&v"))] + "!MEM_P (operands[0]) || REG_P (operands[1])" + "@ + v_mov_b32\t%0, %1 + v_mov_b32\t%0, %1 + v_cndmask_b32\t%0, %3, %1, vcc + v_cndmask_b32\t%0, %3, %1, %2 + # + #" + [(set_attr "type" "vop1,vop1,vop2,vop3a,*,*") + (set_attr "length" "4,8,4,8,16,16")]) + +; This variant does not accept an unspec, but does permit MEM +; read/modify/write which is necessary for maskstore. 
+ +;(define_insn "*mov_exec_match" +; [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v, v, m") +; (vec_merge:VEC_1REG_MODE +; (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B, m, v") +; (match_dup 0) +; (match_operand:DI 2 "gcn_exec_reg_operand" " e,e, e, e"))) +; (clobber (match_scratch:V64DI 3 "=X,X,&v,&v"))] +; "!MEM_P (operands[0]) || REG_P (operands[1])" +; "@ +; v_mov_b32\t%0, %1 +; v_mov_b32\t%0, %1 +; # +; #" +; [(set_attr "type" "vop1,vop1,*,*") +; (set_attr "length" "4,8,16,16")]) + +(define_insn "*mov" + [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v") + (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB"))] + "" + { + if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) + return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"; + else + return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1"; + } + [(set_attr "type" "vmult") + (set_attr "length" "16")]) + +(define_insn "mov_exec" + [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" + "= v, v, v, v, m") + (vec_merge:VEC_2REG_MODE + (match_operand:VEC_2REG_MODE 1 "general_operand" + "vDB, v0, v0, m, v") + (match_operand:VEC_2REG_MODE 3 "gcn_alu_or_unspec_operand" + " U0,vDA0,vDA0,U0,U0") + (match_operand:DI 2 "register_operand" " e, cV, Sv, e, e"))) + (clobber (match_scratch:V64DI 4 "= X, X, X,&v,&v"))] + "!MEM_P (operands[0]) || REG_P (operands[1])" + { + if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) + switch (which_alternative) + { + case 0: + return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"; + case 1: + return "v_cndmask_b32\t%L0, %L3, %L1, vcc\;" + "v_cndmask_b32\t%H0, %H3, %H1, vcc"; + case 2: + return "v_cndmask_b32\t%L0, %L3, %L1, %2\;" + "v_cndmask_b32\t%H0, %H3, %H1, %2"; + } + else + switch (which_alternative) + { + case 0: + return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1"; + case 1: + return "v_cndmask_b32\t%H0, %H3, %H1, vcc\;" + "v_cndmask_b32\t%L0, %L3, %L1, vcc"; + case 2: + return "v_cndmask_b32\t%H0, %H3, %H1, %2\;" + "v_cndmask_b32\t%L0, %L3, %L1, %2"; + } + + return "#"; + } + [(set_attr "type" "vmult,vmult,vmult,*,*") + (set_attr "length" "16,16,16,16,16")]) + +; This variant does not accept an unspec, but does permit MEM +; read/modify/write which is necessary for maskstore. + +;(define_insn "*mov_exec_match" +; [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v, v, m") +; (vec_merge:VEC_2REG_MODE +; (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v") +; (match_dup 0) +; (match_operand:DI 2 "gcn_exec_reg_operand" " e, e, e"))) +; (clobber (match_scratch:V64DI 3 "=X,&v,&v"))] +; "!MEM_P (operands[0]) || REG_P (operands[1])" +; "@ +; * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \ +; return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \ +; else \ +; return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\"; +; # +; #" +; [(set_attr "type" "vmult,*,*") +; (set_attr "length" "16,16,16")]) + +; A SGPR-base load looks like: +; v, Sv +; +; There's no hardware instruction that corresponds to this, but vector base +; addresses are placed in an SGPR because it is easier to add to a vector. +; We also have a temporary vT, and the vector v1 holding numbered lanes. 
+; +; Rewrite as: +; vT = v1 << log2(element-size) +; vT += Sv +; flat_load v, vT + +(define_insn "mov_sgprbase" + [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "= v, v, v, m") + (unspec:VEC_1REG_MODE + [(match_operand:VEC_1REG_MODE 1 "general_operand" " vA,vB, m, v")] + UNSPEC_SGPRBASE)) + (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v,&v"))] + "lra_in_progress || reload_completed" + "@ + v_mov_b32\t%0, %1 + v_mov_b32\t%0, %1 + # + #" + [(set_attr "type" "vop1,vop1,*,*") + (set_attr "length" "4,8,12,12")]) + +(define_insn "mov_sgprbase" + [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "= v, v, m") + (unspec:VEC_2REG_MODE + [(match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v")] + UNSPEC_SGPRBASE)) + (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v"))] + "lra_in_progress || reload_completed" + "@ + * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \ + return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \ + else \ + return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\"; + # + #" + [(set_attr "type" "vmult,*,*") + (set_attr "length" "8,12,12")]) + +; reload_in was once a standard name, but here it's only referenced by +; gcn_secondary_reload. It allows a reload with a scratch register. + +(define_expand "reload_in" + [(set (match_operand:VEC_REG_MODE 0 "register_operand" "= v") + (match_operand:VEC_REG_MODE 1 "memory_operand" " m")) + (clobber (match_operand:V64DI 2 "register_operand" "=&v"))] + "" + { + emit_insn (gen_mov_sgprbase (operands[0], operands[1], operands[2])); + DONE; + }) + +; reload_out is similar to reload_in, above. + +(define_expand "reload_out" + [(set (match_operand:VEC_REG_MODE 0 "memory_operand" "= m") + (match_operand:VEC_REG_MODE 1 "register_operand" " v")) + (clobber (match_operand:V64DI 2 "register_operand" "=&v"))] + "" + { + emit_insn (gen_mov_sgprbase (operands[0], operands[1], operands[2])); + DONE; + }) + +; Expand scalar addresses into gather/scatter patterns + +(define_split + [(set (match_operand:VEC_REG_MODE 0 "memory_operand") + (unspec:VEC_REG_MODE + [(match_operand:VEC_REG_MODE 1 "general_operand")] + UNSPEC_SGPRBASE)) + (clobber (match_scratch:V64DI 2))] + "" + [(set (mem:BLK (scratch)) + (unspec:BLK [(match_dup 5) (match_dup 1) (match_dup 6) (match_dup 7)] + UNSPEC_SCATTER))] + { + operands[5] = gcn_expand_scalar_to_vector_address (mode, NULL, + operands[0], + operands[2]); + operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0])); + operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0])); + }) + +(define_split + [(set (match_operand:VEC_REG_MODE 0 "memory_operand") + (vec_merge:VEC_REG_MODE + (match_operand:VEC_REG_MODE 1 "general_operand") + (match_operand:VEC_REG_MODE 2 "") + (match_operand:DI 3 "gcn_exec_reg_operand"))) + (clobber (match_scratch:V64DI 4))] + "" + [(set (mem:BLK (scratch)) + (unspec:BLK [(match_dup 5) (match_dup 1) + (match_dup 6) (match_dup 7) (match_dup 3)] + UNSPEC_SCATTER))] + { + operands[5] = gcn_expand_scalar_to_vector_address (mode, + operands[3], + operands[0], + operands[4]); + operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0])); + operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0])); + }) + +(define_split + [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand") + (unspec:VEC_REG_MODE + [(match_operand:VEC_REG_MODE 1 "memory_operand")] + UNSPEC_SGPRBASE)) + (clobber (match_scratch:V64DI 2))] + "" + [(set (match_dup 0) + (unspec:VEC_REG_MODE [(match_dup 5) 
(match_dup 6) (match_dup 7) + (mem:BLK (scratch))] + UNSPEC_GATHER))] + { + operands[5] = gcn_expand_scalar_to_vector_address (mode, NULL, + operands[1], + operands[2]); + operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1])); + operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1])); + }) + +(define_split + [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand") + (vec_merge:VEC_REG_MODE + (match_operand:VEC_REG_MODE 1 "memory_operand") + (match_operand:VEC_REG_MODE 2 "") + (match_operand:DI 3 "gcn_exec_reg_operand"))) + (clobber (match_scratch:V64DI 4))] + "" + [(set (match_dup 0) + (vec_merge:VEC_REG_MODE + (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7) + (mem:BLK (scratch))] + UNSPEC_GATHER) + (match_dup 2) + (match_dup 3)))] + { + operands[5] = gcn_expand_scalar_to_vector_address (mode, + operands[3], + operands[1], + operands[4]); + operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1])); + operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1])); + }) + +; TODO: Add zero/sign extending variants. + +;; }}} +;; {{{ Lane moves + +; v_writelane and v_readlane work regardless of exec flags. +; We allow source to be scratch. +; +; FIXME these should take A immediates + +(define_insn "*vec_set" + [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "= v") + (vec_merge:VEC_1REG_MODE + (vec_duplicate:VEC_1REG_MODE + (match_operand: 1 "register_operand" " Sv")) + (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand" + " U0") + (ashift (const_int 1) + (match_operand:SI 2 "gcn_alu_operand" "SvB"))))] + "" + "v_writelane_b32 %0, %1, %2" + [(set_attr "type" "vop3a") + (set_attr "length" "8") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +; FIXME: 64bit operations really should be splitters, but I am not sure how +; to represent vertical subregs. 
+(define_insn "*vec_set" + [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v") + (vec_merge:VEC_2REG_MODE + (vec_duplicate:VEC_2REG_MODE + (match_operand: 1 "register_operand" " Sv")) + (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand" + " U0") + (ashift (const_int 1) + (match_operand:SI 2 "gcn_alu_operand" "SvB"))))] + "" + "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2" + [(set_attr "type" "vmult") + (set_attr "length" "16") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +(define_expand "vec_set" + [(set (match_operand:VEC_REG_MODE 0 "register_operand") + (vec_merge:VEC_REG_MODE + (vec_duplicate:VEC_REG_MODE + (match_operand: 1 "register_operand")) + (match_dup 0) + (ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))] + "") + +(define_insn "*vec_set_1" + [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v") + (vec_merge:VEC_1REG_MODE + (vec_duplicate:VEC_1REG_MODE + (match_operand: 1 "register_operand" "Sv")) + (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand" + "U0") + (match_operand:SI 2 "const_int_operand" " i")))] + "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)" + { + operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2]))); + return "v_writelane_b32 %0, %1, %2"; + } + [(set_attr "type" "vop3a") + (set_attr "length" "8") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +(define_insn "*vec_set_1" + [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=v") + (vec_merge:VEC_2REG_MODE + (vec_duplicate:VEC_2REG_MODE + (match_operand: 1 "register_operand" "Sv")) + (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand" + "U0") + (match_operand:SI 2 "const_int_operand" " i")))] + "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)" + { + operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2]))); + return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"; + } + [(set_attr "type" "vmult") + (set_attr "length" "16") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +(define_insn "vec_duplicate" + [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v") + (vec_duplicate:VEC_1REG_MODE + (match_operand: 1 "gcn_alu_operand" "SvB")))] + "" + "v_mov_b32\t%0, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "vec_duplicate" + [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v") + (vec_duplicate:VEC_2REG_MODE + (match_operand: 1 "gcn_alu_operand" "SvDB")))] + "" + "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1" + [(set_attr "type" "vop3a") + (set_attr "length" "16")]) + +(define_insn "vec_extract" + [(set (match_operand: 0 "register_operand" "=Sg") + (vec_select: + (match_operand:VEC_1REG_MODE 1 "register_operand" " v") + (parallel [(match_operand:SI 2 "gcn_alu_operand" "SvB")])))] + "" + "v_readlane_b32 %0, %1, %2" + [(set_attr "type" "vop3a") + (set_attr "length" "8") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +(define_insn "vec_extract" + [(set (match_operand: 0 "register_operand" "=Sg") + (vec_select: + (match_operand:VEC_2REG_MODE 1 "register_operand" " v") + (parallel [(match_operand:SI 2 "gcn_alu_operand" "SvB")])))] + "" + "v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2" + [(set_attr "type" "vmult") + (set_attr "length" "16") + (set_attr "exec" "none") + (set_attr "laneselect" "yes")]) + +(define_expand "vec_init" + [(match_operand:VEC_REG_MODE 0 "register_operand") + (match_operand 1)] + "" + { + gcn_expand_vector_init (operands[0], operands[1]); + DONE; + }) + +;; }}} +;; {{{ 
Scatter / Gather + +;; GCN does not have an instruction for loading a vector from contiguous +;; memory so *all* loads and stores are eventually converted to scatter +;; or gather. +;; +;; GCC does not permit MEM to hold vectors of addresses, so we must use an +;; unspec. The unspec formats are as follows: +;; +;; (unspec:V64?? +;; [(
) +;; () +;; () +;; (mem:BLK (scratch))] +;; UNSPEC_GATHER) +;; +;; (unspec:BLK +;; [(
) +;; () +;; () +;; () +;; ()] +;; UNSPEC_SCATTER) +;; +;; - Loads are expected to be wrapped in a vec_merge, so do not need . +;; - The mem:BLK does not contain any real information, but indicates that an +;; unknown memory read is taking place. Stores are expected to use a similar +;; mem:BLK outside the unspec. +;; - The address space and glc (volatile) fields are there to replace the +;; fields normally found in a MEM. +;; - Multiple forms of address expression are supported, below. + +(define_expand "gather_load" + [(match_operand:VEC_REG_MODE 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand 2 "register_operand") + (match_operand 3 "immediate_operand") + (match_operand:SI 4 "gcn_alu_operand")] + "" + { + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1], + operands[2], operands[4], + INTVAL (operands[3]), NULL); + + if (GET_MODE (addr) == V64DImode) + emit_insn (gen_gather_insn_1offset (operands[0], addr, const0_rtx, + const0_rtx, const0_rtx)); + else + emit_insn (gen_gather_insn_2offsets (operands[0], operands[1], + addr, const0_rtx, const0_rtx, + const0_rtx)); + DONE; + }) + +(define_expand "gather_exec" + [(match_operand:VEC_REG_MODE 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:V64SI 2 "register_operand") + (match_operand 3 "immediate_operand") + (match_operand:SI 4 "gcn_alu_operand") + (match_operand:DI 5 "gcn_exec_reg_operand")] + "" + { + rtx undefmode = gcn_gen_undef (mode); + + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1], + operands[2], operands[4], + INTVAL (operands[3]), operands[5]); + + if (GET_MODE (addr) == V64DImode) + emit_insn (gen_gather_insn_1offset_exec (operands[0], addr, + const0_rtx, const0_rtx, + const0_rtx, undefmode, + operands[5])); + else + emit_insn (gen_gather_insn_2offsets_exec (operands[0], operands[1], + addr, const0_rtx, + const0_rtx, const0_rtx, + undefmode, operands[5])); + DONE; + }) + +; Allow any address expression +(define_expand "gather_expr" + [(set (match_operand:VEC_REG_MODE 0 "register_operand") + (unspec:VEC_REG_MODE + [(match_operand 1 "") + (match_operand 2 "immediate_operand") + (match_operand 3 "immediate_operand") + (mem:BLK (scratch))] + UNSPEC_GATHER))] + "" + {}) + +(define_insn "gather_insn_1offset" + [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v") + (unspec:VEC_REG_MODE + [(plus:V64DI (match_operand:V64DI 1 "register_operand" " v") + (vec_duplicate:V64DI + (match_operand 2 "immediate_operand" " n"))) + (match_operand 3 "immediate_operand" " n") + (match_operand 4 "immediate_operand" " n") + (mem:BLK (scratch))] + UNSPEC_GATHER))] + "(AS_FLAT_P (INTVAL (operands[3])) + && ((TARGET_GCN3 && INTVAL(operands[2]) == 0) + || ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000))) + || (AS_GLOBAL_P (INTVAL (operands[3])) + && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))" + { + addr_space_t as = INTVAL (operands[3]); + const char *glc = INTVAL (operands[4]) ? 
" glc" : ""; + + static char buf[200]; + if (AS_FLAT_P (as)) + { + if (TARGET_GCN5_PLUS) + sprintf (buf, "flat_load%%s0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0", + glc); + else + sprintf (buf, "flat_load%%s0\t%%0, %%1%s\;s_waitcnt\t0", glc); + } + else if (AS_GLOBAL_P (as)) + sprintf (buf, "global_load%%s0\t%%0, %%1, off offset:%%2%s\;" + "s_waitcnt\tvmcnt(0)", glc); + else + gcc_unreachable (); + + return buf; + } + [(set_attr "type" "flat") + (set_attr "length" "12")]) + +(define_insn "gather_insn_1offset_ds" + [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v") + (unspec:VEC_REG_MODE + [(plus:V64SI (match_operand:V64SI 1 "register_operand" " v") + (vec_duplicate:V64SI + (match_operand 2 "immediate_operand" " n"))) + (match_operand 3 "immediate_operand" " n") + (match_operand 4 "immediate_operand" " n") + (mem:BLK (scratch))] + UNSPEC_GATHER))] + "(AS_ANY_DS_P (INTVAL (operands[3])) + && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))" + { + addr_space_t as = INTVAL (operands[3]); + static char buf[200]; + sprintf (buf, "ds_read%%b0\t%%0, %%1 offset:%%2%s\;s_waitcnt\tlgkmcnt(0)", + (AS_GDS_P (as) ? " gds" : "")); + return buf; + } + [(set_attr "type" "ds") + (set_attr "length" "12")]) + +(define_insn "gather_insn_2offsets" + [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v") + (unspec:VEC_REG_MODE + [(plus:V64DI + (plus:V64DI + (vec_duplicate:V64DI + (match_operand:DI 1 "register_operand" "Sv")) + (sign_extend:V64DI + (match_operand:V64SI 2 "register_operand" " v"))) + (vec_duplicate:V64DI (match_operand 3 "immediate_operand" " n"))) + (match_operand 4 "immediate_operand" " n") + (match_operand 5 "immediate_operand" " n") + (mem:BLK (scratch))] + UNSPEC_GATHER))] + "(AS_GLOBAL_P (INTVAL (operands[4])) + && (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))" + { + addr_space_t as = INTVAL (operands[4]); + const char *glc = INTVAL (operands[5]) ? " glc" : ""; + + static char buf[200]; + if (AS_GLOBAL_P (as)) + { + /* Work around assembler bug in which a 64-bit register is expected, + but a 32-bit value would be correct. 
*/ + int reg = REGNO (operands[2]) - FIRST_VGPR_REG; + sprintf (buf, "global_load%%s0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); + } + else + gcc_unreachable (); + + return buf; + } + [(set_attr "type" "flat") + (set_attr "length" "12")]) + +(define_expand "scatter_store" + [(match_operand:DI 0 "register_operand") + (match_operand 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand:SI 3 "gcn_alu_operand") + (match_operand:VEC_REG_MODE 4 "register_operand")] + "" + { + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0], + operands[1], operands[3], + INTVAL (operands[2]), NULL); + + if (GET_MODE (addr) == V64DImode) + emit_insn (gen_scatter_insn_1offset (addr, const0_rtx, operands[4], + const0_rtx, const0_rtx)); + else + emit_insn (gen_scatter_insn_2offsets (operands[0], addr, + const0_rtx, operands[4], + const0_rtx, const0_rtx)); + DONE; + }) + +(define_expand "scatter_exec" + [(match_operand:DI 0 "register_operand") + (match_operand 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand:SI 3 "gcn_alu_operand") + (match_operand:VEC_REG_MODE 4 "register_operand") + (match_operand:DI 5 "gcn_exec_reg_operand")] + "" + { + operands[5] = force_reg (DImode, operands[5]); + + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0], + operands[1], operands[3], + INTVAL (operands[2]), operands[5]); + + if (GET_MODE (addr) == V64DImode) + emit_insn (gen_scatter_insn_1offset_exec (addr, const0_rtx, + operands[4], const0_rtx, + const0_rtx, + operands[5])); + else + emit_insn (gen_scatter_insn_2offsets_exec (operands[0], addr, + const0_rtx, operands[4], + const0_rtx, const0_rtx, + operands[5])); + DONE; + }) + +; Allow any address expression +(define_expand "scatter_expr" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:V64DI 0 "") + (match_operand:VEC_REG_MODE 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand 3 "immediate_operand")] + UNSPEC_SCATTER))] + "" + {}) + +(define_insn "scatter_insn_1offset" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(plus:V64DI (match_operand:V64DI 0 "register_operand" "v") + (vec_duplicate:V64DI + (match_operand 1 "immediate_operand" "n"))) + (match_operand:VEC_REG_MODE 2 "register_operand" "v") + (match_operand 3 "immediate_operand" "n") + (match_operand 4 "immediate_operand" "n")] + UNSPEC_SCATTER))] + "(AS_FLAT_P (INTVAL (operands[3])) + && (INTVAL(operands[1]) == 0 + || (TARGET_GCN5_PLUS + && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000))) + || (AS_GLOBAL_P (INTVAL (operands[3])) + && (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))" + { + addr_space_t as = INTVAL (operands[3]); + const char *glc = INTVAL (operands[4]) ? 
" glc" : ""; + + static char buf[200]; + if (AS_FLAT_P (as)) + { + if (TARGET_GCN5_PLUS) + sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;" + "s_waitcnt\texpcnt(0)", glc); + else + sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)", + glc); + } + else if (AS_GLOBAL_P (as)) + sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;" + "s_waitcnt\texpcnt(0)", glc); + else + gcc_unreachable (); + + return buf; + } + [(set_attr "type" "flat") + (set_attr "length" "12")]) + +(define_insn "scatter_insn_1offset_ds" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(plus:V64SI (match_operand:V64SI 0 "register_operand" "v") + (vec_duplicate:V64SI + (match_operand 1 "immediate_operand" "n"))) + (match_operand:VEC_REG_MODE 2 "register_operand" "v") + (match_operand 3 "immediate_operand" "n") + (match_operand 4 "immediate_operand" "n")] + UNSPEC_SCATTER))] + "(AS_ANY_DS_P (INTVAL (operands[3])) + && ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))" + { + addr_space_t as = INTVAL (operands[3]); + static char buf[200]; + sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)", + (AS_GDS_P (as) ? " gds" : "")); + return buf; + } + [(set_attr "type" "ds") + (set_attr "length" "12")]) + +(define_insn "scatter_insn_2offsets" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(plus:V64DI + (plus:V64DI + (vec_duplicate:V64DI + (match_operand:DI 0 "register_operand" "Sv")) + (sign_extend:V64DI + (match_operand:V64SI 1 "register_operand" " v"))) + (vec_duplicate:V64DI (match_operand 2 "immediate_operand" + " n"))) + (match_operand:VEC_REG_MODE 3 "register_operand" " v") + (match_operand 4 "immediate_operand" " n") + (match_operand 5 "immediate_operand" " n")] + UNSPEC_SCATTER))] + "(AS_GLOBAL_P (INTVAL (operands[4])) + && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))" + { + addr_space_t as = INTVAL (operands[4]); + const char *glc = INTVAL (operands[5]) ? " glc" : ""; + + static char buf[200]; + if (AS_GLOBAL_P (as)) + { + /* Work around assembler bug in which a 64-bit register is expected, + but a 32-bit value would be correct. 
*/ + int reg = REGNO (operands[1]) - FIRST_VGPR_REG; + sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;" + "s_waitcnt\texpcnt(0)", reg, reg + 1, glc); + } + else + gcc_unreachable (); + + return buf; + } + [(set_attr "type" "flat") + (set_attr "length" "12")]) + +;; }}} +;; {{{ Permutations + +(define_insn "ds_bpermute" + [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v") + (unspec:VEC_1REG_MODE + [(match_operand:VEC_1REG_MODE 2 "register_operand" " v") + (match_operand:V64SI 1 "register_operand" " v") + (match_operand:DI 3 "gcn_exec_reg_operand" " e")] + UNSPEC_BPERMUTE))] + "" + "ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)" + [(set_attr "type" "vop2") + (set_attr "length" "12")]) + +(define_insn_and_split "ds_bpermute" + [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=&v") + (unspec:VEC_2REG_MODE + [(match_operand:VEC_2REG_MODE 2 "register_operand" " v0") + (match_operand:V64SI 1 "register_operand" " v") + (match_operand:DI 3 "gcn_exec_reg_operand" " e")] + UNSPEC_BPERMUTE))] + "" + "#" + "reload_completed" + [(set (match_dup 4) (unspec:V64SI [(match_dup 6) (match_dup 1) (match_dup 3)] + UNSPEC_BPERMUTE)) + (set (match_dup 5) (unspec:V64SI [(match_dup 7) (match_dup 1) (match_dup 3)] + UNSPEC_BPERMUTE))] + { + operands[4] = gcn_operand_part (mode, operands[0], 0); + operands[5] = gcn_operand_part (mode, operands[0], 1); + operands[6] = gcn_operand_part (mode, operands[2], 0); + operands[7] = gcn_operand_part (mode, operands[2], 1); + } + [(set_attr "type" "vmult") + (set_attr "length" "24")]) + +;; }}} +;; {{{ ALU special case: add/sub + +(define_insn "addv64si3" + [(set (match_operand:V64SI 0 "register_operand" "= v") + (plus:V64SI + (match_operand:V64SI 1 "register_operand" "% v") + (match_operand:V64SI 2 "gcn_alu_operand" "vSvB"))) + (clobber (reg:DI VCC_REG))] + "" + "v_add%^_u32\t%0, vcc, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8")]) + +(define_insn "addv64si3_dup" + [(set (match_operand:V64SI 0 "register_operand" "= v") + (plus:V64SI + (vec_duplicate:V64SI + (match_operand:SI 2 "gcn_alu_operand" "SvB")) + (match_operand:V64SI 1 "register_operand" " v"))) + (clobber (reg:DI VCC_REG))] + "" + "v_add%^_u32\t%0, vcc, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8")]) + +(define_insn "addv64si3_vcc" + [(set (match_operand:V64SI 0 "register_operand" "= v, v") + (plus:V64SI + (match_operand:V64SI 1 "register_operand" "% v, v") + (match_operand:V64SI 2 "gcn_alu_operand" "vSvB,vSvB"))) + (set (match_operand:DI 3 "register_operand" "= cV, Sg") + (ltu:DI (plus:V64SI (match_dup 1) (match_dup 2)) + (match_dup 1)))] + "" + "v_add%^_u32\t%0, %3, %2, %1" + [(set_attr "type" "vop2,vop3b") + (set_attr "length" "8")]) + +; This pattern only changes the VCC bits when the corresponding lane is +; enabled, so the set must be described as an ior. + +(define_insn "addv64si3_vcc_dup" + [(set (match_operand:V64SI 0 "register_operand" "= v, v") + (plus:V64SI + (vec_duplicate:V64SI + (match_operand:SI 1 "gcn_alu_operand" "SvB,SvB")) + (match_operand:V64SI 2 "register_operand" " v, v"))) + (set (match_operand:DI 3 "register_operand" "=cV, Sg") + (ltu:DI (plus:V64SI (vec_duplicate:V64SI (match_dup 2)) + (match_dup 1)) + (vec_duplicate:V64SI (match_dup 2))))] + "" + "v_add%^_u32\t%0, %3, %2, %1" + [(set_attr "type" "vop2,vop3b") + (set_attr "length" "8,8")]) + +; This pattern does not accept SGPR because VCC read already counts as an +; SGPR use and number of SGPR operands is limited to 1. 
+ +(define_insn "addcv64si3" + [(set (match_operand:V64SI 0 "register_operand" "=v,v") + (plus:V64SI + (plus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_operand:DI 3 "register_operand" " cV,Sv")) + (match_operand:V64SI 1 "gcn_alu_operand" "%vA,vA")) + (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB"))) + (set (match_operand:DI 4 "register_operand" "=cV,Sg") + (ior:DI (ltu:DI (plus:V64SI + (plus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (match_dup 2)) + (match_dup 2)) + (ltu:DI (plus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (match_dup 1))))] + "" + "v_addc%^_u32\t%0, %4, %1, %2, %3" + [(set_attr "type" "vop2,vop3b") + (set_attr "length" "4,8")]) + +(define_insn "addcv64si3_dup" + [(set (match_operand:V64SI 0 "register_operand" "=v,v") + (plus:V64SI + (plus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_operand:DI 3 "register_operand" " cV, Sv")) + (match_operand:V64SI 1 "gcn_alu_operand" "%vA, vA")) + (vec_duplicate:V64SI + (match_operand:SI 2 "gcn_alu_operand" "SvB,SvB")))) + (set (match_operand:DI 4 "register_operand" "=cV, Sg") + (ior:DI (ltu:DI (plus:V64SI (plus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (vec_duplicate:V64SI + (match_dup 2))) + (vec_duplicate:V64SI + (match_dup 2))) + (ltu:DI (plus:V64SI (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (match_dup 1))))] + "" + "v_addc%^_u32\t%0, %4, %1, %2, %3" + [(set_attr "type" "vop2,vop3b") + (set_attr "length" "4,8")]) + +(define_insn "subv64si3" + [(set (match_operand:V64SI 0 "register_operand" "= v, v") + (minus:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" "vSvB, v") + (match_operand:V64SI 2 "gcn_alu_operand" " v,vSvB"))) + (clobber (reg:DI VCC_REG))] + "" + "@ + v_sub%^_u32\t%0, vcc, %1, %2 + v_subrev%^_u32\t%0, vcc, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8,8")]) + +(define_insn "subv64si3_vcc" + [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v") + (minus:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" "vSvB,vSvB, v, v") + (match_operand:V64SI 2 "gcn_alu_operand" " v, v,vSvB,vSvB"))) + (set (match_operand:DI 3 "register_operand" "= cV, Sg, cV, Sg") + (gtu:DI (minus:V64SI (match_dup 1) (match_dup 2)) + (match_dup 1)))] + "" + "@ + v_sub%^_u32\t%0, %3, %1, %2 + v_sub%^_u32\t%0, %3, %1, %2 + v_subrev%^_u32\t%0, %3, %2, %1 + v_subrev%^_u32\t%0, %3, %2, %1" + [(set_attr "type" "vop2,vop3b,vop2,vop3b") + (set_attr "length" "8")]) + +; This pattern does not accept SGPR because VCC read already counts +; as a SGPR use and number of SGPR operands is limited to 1. 
+ +(define_insn "subcv64si3" + [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v") + (minus:V64SI + (minus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_operand:DI 3 "gcn_alu_operand" " cV,Sv,cV,Sv")) + (match_operand:V64SI 1 "gcn_alu_operand" " vA,vA,vB,vB")) + (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB,vA,vA"))) + (set (match_operand:DI 4 "register_operand" "=cV,Sg,cV,Sg") + (ior:DI (gtu:DI (minus:V64SI (minus:V64SI + (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (match_dup 2)) + (match_dup 2)) + (ltu:DI (minus:V64SI (vec_merge:V64SI + (vec_duplicate:V64SI (const_int 1)) + (vec_duplicate:V64SI (const_int 0)) + (match_dup 3)) + (match_dup 1)) + (match_dup 1))))] + "" + "@ + v_subb%^_u32\t%0, %4, %1, %2, %3 + v_subb%^_u32\t%0, %4, %1, %2, %3 + v_subbrev%^_u32\t%0, %4, %2, %1, %3 + v_subbrev%^_u32\t%0, %4, %2, %1, %3" + [(set_attr "type" "vop2,vop3b,vop2,vop3b") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (plus:V64DI + (match_operand:V64DI 1 "register_operand" "% v0") + (match_operand:V64DI 2 "gcn_alu_operand" "vSvB0"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && gcn_can_split_p (V64DImode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (V64DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc)); + emit_insn (gen_addcv64si3 + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (V64DImode, operands[2], 1), + vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_exec" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (vec_merge:V64DI + (plus:V64DI + (match_operand:V64DI 1 "register_operand" "% v0") + (match_operand:V64DI 2 "gcn_alu_operand" "vSvB0")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && gcn_can_split_p (V64DImode, operands[2]) + && gcn_can_split_p (V64DImode, operands[4])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_exec + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (V64DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + emit_insn (gen_addcv64si3_exec + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (V64DImode, operands[2], 1), + vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "subv64di3" + [(set (match_operand:V64DI 0 "register_operand" "= &v, &v") + (minus:V64DI + (match_operand:V64DI 1 "gcn_alu_operand" "vSvB0, v0") + (match_operand:V64DI 2 "gcn_alu_operand" " v0,vSvB0"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && 
gcn_can_split_p (V64DImode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_subv64si3_vcc + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (V64DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc)); + emit_insn (gen_subcv64si3 + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (V64DImode, operands[2], 1), + vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8,8")]) + +(define_insn_and_split "subv64di3_exec" + [(set (match_operand:V64DI 0 "register_operand" "= &v, &v") + (vec_merge:V64DI + (minus:V64DI + (match_operand:V64DI 1 "gcn_alu_operand" "vSvB0, v0") + (match_operand:V64DI 2 "gcn_alu_operand" " v0,vSvB0")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" + " U0, U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e, e"))) + (clobber (reg:DI VCC_REG))] + "register_operand (operands[1], VOIDmode) + || register_operand (operands[2], VOIDmode)" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && gcn_can_split_p (V64DImode, operands[2]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_subv64si3_vcc_exec + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (V64DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + emit_insn (gen_subcv64si3_exec + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (V64DImode, operands[2], 1), + vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8,8")]) + +(define_insn_and_split "addv64di3_dup" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (plus:V64DI + (match_operand:V64DI 1 "register_operand" " v0") + (vec_duplicate:V64DI + (match_operand:DI 2 "gcn_alu_operand" "SvDB")))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && gcn_can_split_p (V64DImode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + gcn_operand_part (V64DImode, operands[1], 0), + vcc)); + emit_insn (gen_addcv64si3_dup + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (DImode, operands[2], 1), + vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_dup_exec" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (vec_merge:V64DI + (plus:V64DI + (match_operand:V64DI 1 "register_operand" " v0") + (vec_duplicate:V64DI + (match_operand:DI 2 "gcn_alu_operand" "SvDB"))) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[1]) + && gcn_can_split_p (V64DImode, operands[2]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup_exec + (gcn_operand_part (V64DImode, 
operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + gcn_operand_part (V64DImode, operands[1], 0), + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + emit_insn (gen_addcv64si3_dup_exec + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[1], 1), + gcn_operand_part (DImode, operands[2], 1), + vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_zext" + [(set (match_operand:V64DI 0 "register_operand" "=&v,&v") + (plus:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB")) + (match_operand:V64DI 2 "gcn_alu_operand" "0vB,0vA"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc + (gcn_operand_part (V64DImode, operands[0], 0), + operands[1], + gcn_operand_part (V64DImode, operands[2], 0), + vcc)); + emit_insn (gen_addcv64si3 + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[2], 1), + const0_rtx, vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8,8")]) + +(define_insn_and_split "addv64di3_zext_exec" + [(set (match_operand:V64DI 0 "register_operand" "=&v,&v") + (vec_merge:V64DI + (plus:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB")) + (match_operand:V64DI 2 "gcn_alu_operand" "0vB,0vA")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0, U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e, e"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[2]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_exec + (gcn_operand_part (V64DImode, operands[0], 0), + operands[1], + gcn_operand_part (V64DImode, operands[2], 0), + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + emit_insn (gen_addcv64si3_exec + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[2], 1), + const0_rtx, vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8,8")]) + +(define_insn_and_split "addv64di3_zext_dup" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (plus:V64DI + (zero_extend:V64DI + (vec_duplicate:V64SI + (match_operand:SI 1 "gcn_alu_operand" "BSv"))) + (match_operand:V64DI 2 "gcn_alu_operand" "vA0"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc)); + emit_insn (gen_addcv64si3 + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[2], 1), + const0_rtx, vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_zext_dup_exec" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (vec_merge:V64DI + (plus:V64DI + (zero_extend:V64DI + (vec_duplicate:V64SI + (match_operand:SI 1 
"gcn_alu_operand" "BSv"))) + (match_operand:V64DI 2 "gcn_alu_operand" "vA0")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[2]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup_exec + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (V64DImode, operands[2], 0), + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + emit_insn (gen_addcv64si3_exec + (gcn_operand_part (V64DImode, operands[0], 1), + gcn_operand_part (V64DImode, operands[2], 1), + const0_rtx, vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_zext_dup2" + [(set (match_operand:V64DI 0 "register_operand" "= v") + (plus:V64DI + (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA")) + (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + operands[1], + vcc)); + rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1); + emit_insn (gen_vec_duplicatev64si + (dsthi, gcn_operand_part (DImode, operands[2], 1))); + emit_insn (gen_addcv64si3 (dsthi, dsthi, const0_rtx, vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_zext_dup2_exec" + [(set (match_operand:V64DI 0 "register_operand" "= v") + (vec_merge:V64DI + (plus:V64DI + (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" + " vA")) + (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv"))) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_addv64si3_vcc_dup_exec + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + operands[1], + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1); + emit_insn (gen_vec_duplicatev64si_exec + (dsthi, gcn_operand_part (DImode, operands[2], 1), + gcn_gen_undef (V64SImode), operands[4])); + emit_insn (gen_addcv64si3_exec + (dsthi, dsthi, const0_rtx, vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_sext_dup2" + [(set (match_operand:V64DI 0 "register_operand" "= v") + (plus:V64DI + (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA")) + (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))) + (clobber (match_scratch:V64SI 3 "=&v")) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_ashrv64si3 (operands[3], 
operands[1], GEN_INT (31))); + emit_insn (gen_addv64si3_vcc_dup + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + operands[1], + vcc)); + rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1); + emit_insn (gen_vec_duplicatev64si + (dsthi, gcn_operand_part (DImode, operands[2], 1))); + emit_insn (gen_addcv64si3 (dsthi, dsthi, operands[3], vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +(define_insn_and_split "addv64di3_sext_dup2_exec" + [(set (match_operand:V64DI 0 "register_operand" "= v") + (vec_merge:V64DI + (plus:V64DI + (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" + " vA")) + (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv"))) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch:V64SI 5 "=&v")) + (clobber (reg:DI VCC_REG))] + "" + "#" + "gcn_can_split_p (V64DImode, operands[0]) + && gcn_can_split_p (V64DImode, operands[3])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_ashrv64si3_exec (operands[5], operands[1], GEN_INT (31), + gcn_gen_undef (V64SImode), operands[4])); + emit_insn (gen_addv64si3_vcc_dup_exec + (gcn_operand_part (V64DImode, operands[0], 0), + gcn_operand_part (DImode, operands[2], 0), + operands[1], + vcc, + gcn_operand_part (V64DImode, operands[3], 0), + operands[4])); + rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1); + emit_insn (gen_vec_duplicatev64si_exec + (dsthi, gcn_operand_part (DImode, operands[2], 1), + gcn_gen_undef (V64SImode), operands[4])); + emit_insn (gen_addcv64si3_exec + (dsthi, dsthi, operands[5], vcc, vcc, + gcn_operand_part (V64DImode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + +;; }}} +;; {{{ DS memory ALU: add/sub + +(define_mode_iterator DS_ARITH_MODE [V64SI V64SF V64DI]) +(define_mode_iterator DS_ARITH_SCALAR_MODE [SI SF DI]) + +;; FIXME: the vector patterns probably need RD expanded to a vector of +;; addresses. For now, the only way a vector can get into LDS is +;; if the user puts it there manually. +;; +;; FIXME: the scalar patterns are probably fine in themselves, but need to be +;; checked to see if anything can ever use them. 
+ +(define_insn "add3_ds" + [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD") + (plus:DS_ARITH_MODE + (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" "%RD") + (match_operand:DS_ARITH_MODE 2 "register_operand" " v")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_add%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +(define_insn "add3_ds_scalar" + [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD") + (plus:DS_ARITH_SCALAR_MODE + (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand" + "%RD") + (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_add%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +(define_insn "sub3_ds" + [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD") + (minus:DS_ARITH_MODE + (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD") + (match_operand:DS_ARITH_MODE 2 "register_operand" " v")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_sub%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +(define_insn "sub3_ds_scalar" + [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD") + (minus:DS_ARITH_SCALAR_MODE + (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand" + " RD") + (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_sub%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +(define_insn "subr3_ds" + [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD") + (minus:DS_ARITH_MODE + (match_operand:DS_ARITH_MODE 2 "register_operand" " v") + (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_rsub%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +(define_insn "subr3_ds_scalar" + [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD") + (minus:DS_ARITH_SCALAR_MODE + (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v") + (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand" + " RD")))] + "rtx_equal_p (operands[0], operands[1])" + "ds_rsub%u0\t%A0, %2%O0" + [(set_attr "type" "ds") + (set_attr "length" "8")]) + +;; }}} +;; {{{ ALU special case: mult + +(define_insn "mulv64si3_highpart" + [(set (match_operand:V64SI 0 "register_operand" "= v") + (truncate:V64SI + (lshiftrt:V64DI + (mult:V64DI + (any_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" " %v")) + (any_extend:V64DI + (match_operand:V64SI 2 "gcn_alu_operand" "vSvA"))) + (const_int 32))))] + "" + "v_mul_hi0\t%0, %2, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "mulv64si3" + [(set (match_operand:V64SI 0 "register_operand" "= v") + (mult:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA") + (match_operand:V64SI 2 "gcn_alu_operand" " vSvA")))] + "" + "v_mul_lo_u32\t%0, %1, %2" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "mulv64si3_dup" + [(set (match_operand:V64SI 0 "register_operand" "= v") + (mult:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA") + (vec_duplicate:V64SI + (match_operand:SI 2 "gcn_alu_operand" " SvA"))))] + "" + "v_mul_lo_u32\t%0, %1, %2" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn_and_split "mulv64di3" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (mult:V64DI + (match_operand:V64DI 1 "gcn_alu_operand" "% v") + (match_operand:V64DI 2 
"gcn_alu_operand" "vDA"))) + (clobber (match_scratch:V64SI 3 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx tmp = operands[3]; + + emit_insn (gen_mulv64si3 (out_lo, left_lo, right_lo)); + emit_insn (gen_umulv64si3_highpart (out_hi, left_lo, right_lo)); + emit_insn (gen_mulv64si3 (tmp, left_hi, right_lo)); + emit_insn (gen_addv64si3 (out_hi, out_hi, tmp)); + emit_insn (gen_mulv64si3 (tmp, left_lo, right_hi)); + emit_insn (gen_addv64si3 (out_hi, out_hi, tmp)); + emit_insn (gen_mulv64si3 (tmp, left_hi, right_hi)); + emit_insn (gen_addv64si3 (out_hi, out_hi, tmp)); + DONE; + }) + +(define_insn_and_split "mulv64di3_exec" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (vec_merge:V64DI + (mult:V64DI + (match_operand:V64DI 1 "gcn_alu_operand" "% v") + (match_operand:V64DI 2 "gcn_alu_operand" "vDA")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch:V64SI 5 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx exec = operands[4]; + rtx tmp = operands[5]; + + rtx old_lo, old_hi; + if (GET_CODE (operands[3]) == UNSPEC) + { + old_lo = old_hi = gcn_gen_undef (V64SImode); + } + else + { + old_lo = gcn_operand_part (V64DImode, operands[3], 0); + old_hi = gcn_operand_part (V64DImode, operands[3], 1); + } + + rtx undef = gcn_gen_undef (V64SImode); + + emit_insn (gen_mulv64si3_exec (out_lo, left_lo, right_lo, old_lo, exec)); + emit_insn (gen_umulv64si3_highpart_exec (out_hi, left_lo, right_lo, + old_hi, exec)); + emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_lo, undef, exec)); + emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec)); + emit_insn (gen_mulv64si3_exec (tmp, left_lo, right_hi, undef, exec)); + emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec)); + emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_hi, undef, exec)); + emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec)); + DONE; + }) + +(define_insn_and_split "mulv64di3_zext" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (mult:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" " v")) + (match_operand:V64DI 2 "gcn_alu_operand" "vDA"))) + (clobber (match_scratch:V64SI 3 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left = operands[1]; + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx tmp = operands[3]; + + emit_insn (gen_mulv64si3 (out_lo, left, right_lo)); + emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo)); + emit_insn (gen_mulv64si3 (tmp, left, right_hi)); + 
emit_insn (gen_addv64si3 (out_hi, out_hi, tmp)); + DONE; + }) + +(define_insn_and_split "mulv64di3_zext_exec" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (vec_merge:V64DI + (mult:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" " v")) + (match_operand:V64DI 2 "gcn_alu_operand" "vDA")) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch:V64SI 5 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left = operands[1]; + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx exec = operands[4]; + rtx tmp = operands[5]; + + rtx old_lo, old_hi; + if (GET_CODE (operands[3]) == UNSPEC) + { + old_lo = old_hi = gcn_gen_undef (V64SImode); + } + else + { + old_lo = gcn_operand_part (V64DImode, operands[3], 0); + old_hi = gcn_operand_part (V64DImode, operands[3], 1); + } + + rtx undef = gcn_gen_undef (V64SImode); + + emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, exec)); + emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo, + old_hi, exec)); + emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec)); + emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec)); + DONE; + }) + +(define_insn_and_split "mulv64di3_zext_dup2" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (mult:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" " v")) + (vec_duplicate:V64DI + (match_operand:DI 2 "gcn_alu_operand" "SvDA")))) + (clobber (match_scratch:V64SI 3 "= &v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left = operands[1]; + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx tmp = operands[3]; + + emit_insn (gen_mulv64si3 (out_lo, left, right_lo)); + emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo)); + emit_insn (gen_mulv64si3 (tmp, left, right_hi)); + emit_insn (gen_addv64si3 (out_hi, out_hi, tmp)); + DONE; + }) + +(define_insn_and_split "mulv64di3_zext_dup2_exec" + [(set (match_operand:V64DI 0 "register_operand" "= &v") + (vec_merge:V64DI + (mult:V64DI + (zero_extend:V64DI + (match_operand:V64SI 1 "gcn_alu_operand" " v")) + (vec_duplicate:V64DI + (match_operand:DI 2 "gcn_alu_operand" "SvDA"))) + (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch:V64SI 5 "= &v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0); + rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1); + rtx left = operands[1]; + rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0); + rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1); + rtx exec = operands[4]; + rtx tmp = operands[5]; + + rtx old_lo, old_hi; + if (GET_CODE (operands[3]) == UNSPEC) + { + old_lo = old_hi = gcn_gen_undef (V64SImode); + } + else + { + old_lo = gcn_operand_part (V64DImode, operands[3], 0); + old_hi = gcn_operand_part (V64DImode, operands[3], 1); + } + + rtx undef = gcn_gen_undef (V64SImode); + + emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, 
exec)); + emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo, + old_hi, exec)); + emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec)); + emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec)); + DONE; + }) + +;; }}} +;; {{{ ALU generic case + +(define_mode_iterator VEC_INT_MODE [V64QI V64HI V64SI V64DI]) + +(define_code_iterator bitop [and ior xor]) +(define_code_iterator shiftop [ashift lshiftrt ashiftrt]) +(define_code_iterator minmaxop [smin smax umin umax]) + +(define_insn "2" + [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v") + (bitunop:VEC_1REG_INT_MODE + (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand" "vSvB")))] + "" + "v_0\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +(define_insn "3" + [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD") + (bitop:VEC_1REG_INT_MODE + (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand" + "% v, 0") + (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand" + "vSvB, v")))] + "" + "@ + v_0\t%0, %2, %1 + ds_0\t%A0, %2%O0" + [(set_attr "type" "vop2,ds") + (set_attr "length" "8,8")]) + +(define_insn_and_split "v64di3" + [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD") + (bitop:V64DI + (match_operand:V64DI 1 "gcn_valu_src0_operand" "% v,RD") + (match_operand:V64DI 2 "gcn_valu_src1com_operand" "vSvB, v")))] + "" + "@ + # + ds_0\t%A0, %2%O0" + "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))" + [(set (match_dup 3) + (bitop:V64SI (match_dup 5) (match_dup 7))) + (set (match_dup 4) + (bitop:V64SI (match_dup 6) (match_dup 8)))] + { + operands[3] = gcn_operand_part (V64DImode, operands[0], 0); + operands[4] = gcn_operand_part (V64DImode, operands[0], 1); + operands[5] = gcn_operand_part (V64DImode, operands[1], 0); + operands[6] = gcn_operand_part (V64DImode, operands[1], 1); + operands[7] = gcn_operand_part (V64DImode, operands[2], 0); + operands[8] = gcn_operand_part (V64DImode, operands[2], 1); + } + [(set_attr "type" "vmult,ds") + (set_attr "length" "16,8")]) + +(define_insn_and_split "v64di3_exec" + [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD") + (vec_merge:V64DI + (bitop:V64DI + (match_operand:V64DI 1 "gcn_valu_src0_operand" "% v,RD") + (match_operand:V64DI 2 "gcn_valu_src1com_operand" "vSvB, v")) + (match_operand:V64DI 3 "gcn_register_ds_or_unspec_operand" + " U0,U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e, e")))] + "!memory_operand (operands[0], VOIDmode) + || (rtx_equal_p (operands[0], operands[1]) + && register_operand (operands[2], VOIDmode))" + "@ + # + ds_0\t%A0, %2%O0" + "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))" + [(set (match_dup 5) + (vec_merge:V64SI + (bitop:V64SI (match_dup 7) (match_dup 9)) + (match_dup 11) + (match_dup 4))) + (set (match_dup 6) + (vec_merge:V64SI + (bitop:V64SI (match_dup 8) (match_dup 10)) + (match_dup 12) + (match_dup 4)))] + { + operands[5] = gcn_operand_part (V64DImode, operands[0], 0); + operands[6] = gcn_operand_part (V64DImode, operands[0], 1); + operands[7] = gcn_operand_part (V64DImode, operands[1], 0); + operands[8] = gcn_operand_part (V64DImode, operands[1], 1); + operands[9] = gcn_operand_part (V64DImode, operands[2], 0); + operands[10] = gcn_operand_part (V64DImode, operands[2], 1); + operands[11] = gcn_operand_part (V64DImode, operands[3], 0); + operands[12] = gcn_operand_part (V64DImode, operands[3], 1); + } + [(set_attr "type" "vmult,ds") + (set_attr "length" "16,8")]) + +(define_insn "v64si3" + 
[(set (match_operand:V64SI 0 "register_operand" "= v") + (shiftop:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" " v") + (vec_duplicate:V64SI + (match_operand:SI 2 "gcn_alu_operand" "SvB"))))] + "" + "v_0\t%0, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8")]) + +(define_insn "vv64si3" + [(set (match_operand:V64SI 0 "register_operand" "=v") + (shiftop:V64SI + (match_operand:V64SI 1 "gcn_alu_operand" " v") + (match_operand:V64SI 2 "gcn_alu_operand" "vB")))] + "" + "v_0\t%0, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8")]) + +(define_insn "3" + [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD") + (minmaxop:VEC_1REG_INT_MODE + (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand" + "% v, 0") + (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand" + "vSvB, v")))] + "" + "@ + v_0\t%0, %2, %1 + ds_0\t%A0, %2%O0" + [(set_attr "type" "vop2,ds") + (set_attr "length" "8,8")]) + +;; }}} +;; {{{ FP binops - special cases + +; GCN does not directly provide a DFmode subtract instruction, so we do it by +; adding the negated second operand to the first. + +(define_insn "subv64df3" + [(set (match_operand:V64DF 0 "register_operand" "= v, v") + (minus:V64DF + (match_operand:V64DF 1 "gcn_alu_operand" "vSvB, v") + (match_operand:V64DF 2 "gcn_alu_operand" " v,vSvB")))] + "" + "@ + v_add_f64\t%0, %1, -%2 + v_add_f64\t%0, -%2, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8,8")]) + +(define_insn "subdf" + [(set (match_operand:DF 0 "register_operand" "= v, v") + (minus:DF + (match_operand:DF 1 "gcn_alu_operand" "vSvB, v") + (match_operand:DF 2 "gcn_alu_operand" " v,vSvB")))] + "" + "@ + v_add_f64\t%0, %1, -%2 + v_add_f64\t%0, -%2, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8,8")]) + +;; }}} +;; {{{ FP binops - generic + +(define_mode_iterator VEC_FP_MODE [V64HF V64SF V64DF]) +(define_mode_iterator VEC_FP_1REG_MODE [V64HF V64SF]) +(define_mode_iterator FP_MODE [HF SF DF]) +(define_mode_iterator FP_1REG_MODE [HF SF]) + +(define_code_iterator comm_fp [plus mult smin smax]) +(define_code_iterator nocomm_fp [minus]) +(define_code_iterator all_fp [plus mult minus smin smax]) + +(define_insn "3" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v") + (comm_fp:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% v") + (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "vSvB")))] + "" + "v_0\t%0, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8")]) + +(define_insn "3" + [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand" "= v, RL") + (comm_fp:FP_MODE + (match_operand:FP_MODE 1 "gcn_valu_src0_operand" "% v, 0") + (match_operand:FP_MODE 2 "gcn_valu_src1_operand" "vSvB,vSvB")))] + "" + "@ + v_0\t%0, %2, %1 + v_0\t%0, %1%O0" + [(set_attr "type" "vop2,ds") + (set_attr "length" "8")]) + +(define_insn "3" + [(set (match_operand:VEC_FP_1REG_MODE 0 "register_operand" "= v, v") + (nocomm_fp:VEC_FP_1REG_MODE + (match_operand:VEC_FP_1REG_MODE 1 "gcn_alu_operand" "vSvB, v") + (match_operand:VEC_FP_1REG_MODE 2 "gcn_alu_operand" " v,vSvB")))] + "" + "@ + v_0\t%0, %1, %2 + v_0\t%0, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8,8")]) + +(define_insn "3" + [(set (match_operand:FP_1REG_MODE 0 "register_operand" "= v, v") + (nocomm_fp:FP_1REG_MODE + (match_operand:FP_1REG_MODE 1 "gcn_alu_operand" "vSvB, v") + (match_operand:FP_1REG_MODE 2 "gcn_alu_operand" " v,vSvB")))] + "" + "@ + v_0\t%0, %1, %2 + v_0\t%0, %2, %1" + [(set_attr "type" "vop2") + (set_attr "length" "8,8")]) + +;; }}} +;; {{{ FP unops + 
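+
+; FP abs and neg are implemented by applying the VALU source modifiers |x|
+; and -x to an addition of zero, so, informally:
+;
+;   abs: dst = 0 + |src|
+;   neg: dst = 0 + (-src)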
+(define_insn "abs2" + [(set (match_operand:FP_MODE 0 "register_operand" "=v") + (abs:FP_MODE (match_operand:FP_MODE 1 "register_operand" " v")))] + "" + "v_add%i0\t%0, 0, |%1|" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "abs2" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v") + (abs:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "register_operand" " v")))] + "" + "v_add%i0\t%0, 0, |%1|" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "neg2" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v") + (neg:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "register_operand" " v")))] + "" + "v_add%i0\t%0, 0, -%1" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "sqrt2" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v") + (sqrt:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSvB")))] + "flag_unsafe_math_optimizations" + "v_sqrt%i0\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +(define_insn "sqrt2" + [(set (match_operand:FP_MODE 0 "register_operand" "= v") + (sqrt:FP_MODE + (match_operand:FP_MODE 1 "gcn_alu_operand" "vSvB")))] + "flag_unsafe_math_optimizations" + "v_sqrt%i0\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +;; }}} +;; {{{ FP fused multiply and add + +(define_insn "fma4" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v") + (fma:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% vA, vA") + (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" " vA,vSvA") + (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSvA, vA")))] + "" + "v_fma%i0\t%0, %1, %2, %3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fma4_negop2" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v, v") + (fma:VEC_FP_MODE + (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" " vA, vA,vSvA") + (neg:VEC_FP_MODE + (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" " vA,vSvA, vA")) + (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSvA, vA, vA")))] + "" + "v_fma%i0\t%0, %1, -%2, %3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fma4" + [(set (match_operand:FP_MODE 0 "register_operand" "= v, v") + (fma:FP_MODE + (match_operand:FP_MODE 1 "gcn_alu_operand" "% vA, vA") + (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSvA") + (match_operand:FP_MODE 3 "gcn_alu_operand" "vSvA, vA")))] + "" + "v_fma%i0\t%0, %1, %2, %3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fma4_negop2" + [(set (match_operand:FP_MODE 0 "register_operand" "= v, v, v") + (fma:FP_MODE + (match_operand:FP_MODE 1 "gcn_alu_operand" " vA, vA,vSvA") + (neg:FP_MODE + (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSvA, vA")) + (match_operand:FP_MODE 3 "gcn_alu_operand" "vSvA, vA, vA")))] + "" + "v_fma%i0\t%0, %1, -%2, %3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +;; }}} +;; {{{ FP division + +(define_insn "recip2" + [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v") + (div:VEC_FP_MODE + (vec_duplicate:VEC_FP_MODE (float: (const_int 1))) + (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSvB")))] + "" + "v_rcp%i0\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +(define_insn "recip2" + [(set (match_operand:FP_MODE 0 "register_operand" "= v") + (div:FP_MODE + (float:FP_MODE (const_int 1)) + (match_operand:FP_MODE 1 "gcn_alu_operand" "vSvB")))] + "" + "v_rcp%i0\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +;; Do division via a = b * 1/c +;; The 
v_rcp_* instructions are not sufficiently accurate on their own, +;; so we use 2 v_fma_* instructions to do one round of Newton-Raphson +;; which the ISA manual says is enough to improve the reciprocal accuracy. +;; +;; FIXME: This does not handle denormals, NaNs, division-by-zero etc. + +(define_expand "div3" + [(match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand") + (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand") + (match_operand:VEC_FP_MODE 2 "gcn_valu_src0_operand")] + "flag_reciprocal_math" + { + rtx two = gcn_vec_constant (mode, + const_double_from_real_value (dconst2, mode)); + rtx initrcp = gen_reg_rtx (mode); + rtx fma = gen_reg_rtx (mode); + rtx rcp; + + bool is_rcp = (GET_CODE (operands[1]) == CONST_VECTOR + && real_identical + (CONST_DOUBLE_REAL_VALUE + (CONST_VECTOR_ELT (operands[1], 0)), &dconstm1)); + + if (is_rcp) + rcp = operands[0]; + else + rcp = gen_reg_rtx (mode); + + emit_insn (gen_recip2 (initrcp, operands[2])); + emit_insn (gen_fma4_negop2 (fma, initrcp, operands[2], two)); + emit_insn (gen_mul3 (rcp, initrcp, fma)); + + if (!is_rcp) + emit_insn (gen_mul3 (operands[0], operands[1], rcp)); + + DONE; + }) + +(define_expand "div3" + [(match_operand:FP_MODE 0 "gcn_valu_dst_operand") + (match_operand:FP_MODE 1 "gcn_valu_src0_operand") + (match_operand:FP_MODE 2 "gcn_valu_src0_operand")] + "flag_reciprocal_math" + { + rtx two = const_double_from_real_value (dconst2, mode); + rtx initrcp = gen_reg_rtx (mode); + rtx fma = gen_reg_rtx (mode); + rtx rcp; + + bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE + && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]), + &dconstm1)); + + if (is_rcp) + rcp = operands[0]; + else + rcp = gen_reg_rtx (mode); + + emit_insn (gen_recip2 (initrcp, operands[2])); + emit_insn (gen_fma4_negop2 (fma, initrcp, operands[2], two)); + emit_insn (gen_mul3 (rcp, initrcp, fma)); + + if (!is_rcp) + emit_insn (gen_mul3 (operands[0], operands[1], rcp)); + + DONE; + }) + +;; }}} +;; {{{ Int/FP conversions + +(define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF]) +(define_mode_iterator CVT_TO_MODE [HI SI HF SF DF]) + +(define_mode_iterator VCVT_FROM_MODE [V64HI V64SI V64HF V64SF V64DF]) +(define_mode_iterator VCVT_TO_MODE [V64HI V64SI V64HF V64SF V64DF]) + +(define_code_iterator cvt_op [fix unsigned_fix + float unsigned_float + float_extend float_truncate]) +(define_code_attr cvt_name [(fix "fix_trunc") (unsigned_fix "fixuns_trunc") + (float "float") (unsigned_float "floatuns") + (float_extend "extend") (float_truncate "trunc")]) +(define_code_attr cvt_operands [(fix "%i0%i1") (unsigned_fix "%u0%i1") + (float "%i0%i1") (unsigned_float "%i0%u1") + (float_extend "%i0%i1") + (float_truncate "%i0%i1")]) + +(define_insn "2" + [(set (match_operand:CVT_TO_MODE 0 "register_operand" "= v") + (cvt_op:CVT_TO_MODE + (match_operand:CVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))] + "gcn_valid_cvt_p (mode, mode, + _cvt)" + "v_cvt\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +(define_insn "2" + [(set (match_operand:VCVT_TO_MODE 0 "register_operand" "= v") + (cvt_op:VCVT_TO_MODE + (match_operand:VCVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))] + "gcn_valid_cvt_p (mode, mode, + _cvt)" + "v_cvt\t%0, %1" + [(set_attr "type" "vop1") + (set_attr "length" "8")]) + +;; }}} +;; {{{ Int/int conversions + +;; GCC can already do these for scalar types, but not for vector types. +;; Unfortunately you can't just do SUBREG on a vector to select the low part, +;; so there must be a few tricks here. 
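+
+;; (Returning to the division expanders above: the refinement is one
+;; Newton-Raphson step applied to the v_rcp estimate.  Per lane it
+;; corresponds to this illustrative C, in which approx_rcp stands for the
+;; hardware v_rcp result and is not a real function:
+;;
+;;   float recip_nr (float c)
+;;   {
+;;     float r0 = approx_rcp (c);      /* initial estimate of 1/c */
+;;     float e  = fmaf (-c, r0, 2.0f); /* e = 2 - c*r0 */
+;;     return r0 * e;                  /* refined estimate */
+;;   }
+;;
+;; and the division is then completed as b * recip_nr (c).)
+;;
+;; The first trick below: after reload, the low half of a 64-bit lane
+;; register pair is itself a valid 32-bit vector register, so truncation is
+;; simply a move of (or a no-op on) the low part.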
+ +(define_insn_and_split "vec_truncatev64div64si" + [(set (match_operand:V64SI 0 "register_operand" "=v,&v") + (truncate:V64SI + (match_operand:V64DI 1 "register_operand" " 0, v")))] + "" + "#" + "reload_completed" + [(set (match_dup 0) (match_dup 1))] + { + operands[1] = gcn_operand_part (V64SImode, operands[1], 0); + } + [(set_attr "type" "vop2") + (set_attr "length" "0,4")]) + +(define_insn_and_split "vec_truncatev64div64si_exec" + [(set (match_operand:V64SI 0 "register_operand" "=v,&v") + (vec_merge:V64SI + (truncate:V64SI + (match_operand:V64DI 1 "register_operand" " 0, v")) + (match_operand:V64SI 2 "gcn_alu_or_unspec_operand" "U0,U0") + (match_operand:DI 3 "gcn_exec_operand" " e, e")))] + "" + "#" + "reload_completed" + [(parallel [(set (match_dup 0) + (vec_merge:V64SI (match_dup 1) (match_dup 2) (match_dup 3))) + (clobber (scratch:V64DI))])] + { + operands[1] = gcn_operand_part (V64SImode, operands[1], 0); + } + [(set_attr "type" "vop2") + (set_attr "length" "0,4")]) + +;; }}} +;; {{{ Vector comparison/merge + +(define_insn "vec_cmpdi" + [(set (match_operand:DI 0 "register_operand" "=cV,cV, e, e,Sg,Sg") + (match_operator 1 "comparison_operator" + [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand" + "vSv, B,vSv, B, v,vA") + (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand" + " v, v, v, v,vA, v")])) + (clobber (match_scratch:DI 4 "= X, X, cV,cV, X, X"))] + "" + "@ + v_cmp%E1\tvcc, %2, %3 + v_cmp%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmp%E1\t%0, %2, %3 + v_cmp%E1\t%0, %2, %3" + [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a") + (set_attr "length" "4,8,4,8,8,8")]) + +(define_expand "vec_cmpudi" + [(match_operand:DI 0 "register_operand") + (match_operator 1 "comparison_operator" + [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand") + (match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])] + "" + { + /* Unsigned comparisons use the same patterns as signed comparisons, + except that they use unsigned operators (e.g. LTU vs LT). + The '%E1' directive then does the Right Thing. 
*/ + emit_insn (gen_vec_cmpdi (operands[0], operands[1], operands[2], + operands[3])); + DONE; + }) + +(define_insn "vec_cmpdi_exec" + [(set (match_operand:DI 0 "register_operand" "=cV,cV, e, e,Sg,Sg") + (and:DI + (match_operator 1 "comparison_operator" + [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand" + "vSv, B,vSv, B, v,vA") + (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand" + " v, v, v, v,vA, v")]) + (match_operand:DI 4 "gcn_exec_reg_operand" " e, e, e, e, e, e"))) + (clobber (match_scratch:DI 5 "= X, X, cV,cV, X, X"))] + "" + "@ + v_cmp%E1\tvcc, %2, %3 + v_cmp%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmp%E1\t%0, %2, %3 + v_cmp%E1\t%0, %2, %3" + [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a") + (set_attr "length" "4,8,4,8,8,8")]) + +(define_insn "vec_cmpdi_dup" + [(set (match_operand:DI 0 "register_operand" "=cV,cV, e,e,Sg") + (match_operator 1 "comparison_operator" + [(vec_duplicate:VEC_1REG_MODE + (match_operand: 2 "gcn_alu_operand" + " Sv, B,Sv,B, A")) + (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand" + " v, v, v,v, v")])) + (clobber (match_scratch:DI 4 "= X,X,cV,cV, X"))] + "" + "@ + v_cmp%E1\tvcc, %2, %3 + v_cmp%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmp%E1\t%0, %2, %3" + [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a") + (set_attr "length" "4,8,4,8,8")]) + +(define_insn "vec_cmpdi_dup_exec" + [(set (match_operand:DI 0 "register_operand" "=cV,cV, e,e,Sg") + (and:DI + (match_operator 1 "comparison_operator" + [(vec_duplicate:VEC_1REG_MODE + (match_operand: 2 "gcn_alu_operand" + " Sv, B,Sv,B, A")) + (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand" + " v, v, v,v, v")]) + (match_operand:DI 4 "gcn_exec_reg_operand" " e, e, e,e, e"))) + (clobber (match_scratch:DI 5 "= X,X,cV,cV, X"))] + "" + "@ + v_cmp%E1\tvcc, %2, %3 + v_cmp%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmpx%E1\tvcc, %2, %3 + v_cmp%E1\t%0, %2, %3" + [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a") + (set_attr "length" "4,8,4,8,8")]) + +(define_expand "vcond_mask_di" + [(parallel + [(set (match_operand:VEC_REG_MODE 0 "register_operand" "") + (vec_merge:VEC_REG_MODE + (match_operand:VEC_REG_MODE 1 "gcn_vop3_operand" "") + (match_operand:VEC_REG_MODE 2 "gcn_alu_operand" "") + (match_operand:DI 3 "register_operand" ""))) + (clobber (scratch:V64DI))])] + "" + "") + +(define_expand "vcond" + [(match_operand:VEC_1REG_MODE 0 "register_operand") + (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand") + (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand") + (match_operator 3 "comparison_operator" + [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand") + (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])] + "" + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_vec_cmpdi (tmp, operands[3], operands[4], + operands[5])); + emit_insn (gen_vcond_mask_di (operands[0], operands[1], operands[2], + tmp)); + DONE; + }) + +(define_expand "vcond_exec" + [(match_operand:VEC_1REG_MODE 0 "register_operand") + (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand") + (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand") + (match_operator 3 "comparison_operator" + [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand") + (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")]) + (match_operand:DI 6 "gcn_exec_reg_operand" "e")] + "" + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_vec_cmpdi_exec (tmp, operands[3], operands[4], + operands[5], operands[6])); + emit_insn (gen_vcond_mask_di (operands[0], operands[1], operands[2], + tmp)); + DONE; + }) + +(define_expand "vcondu" + 
[(match_operand:VEC_1REG_INT_MODE 0 "register_operand") + (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand") + (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand") + (match_operator 3 "comparison_operator" + [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand") + (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])] + "" + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_vec_cmpdi (tmp, operands[3], operands[4], + operands[5])); + emit_insn (gen_vcond_mask_di (operands[0], operands[1], operands[2], + tmp)); + DONE; + }) + +(define_expand "vcondu_exec" + [(match_operand:VEC_1REG_INT_MODE 0 "register_operand") + (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand") + (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand") + (match_operator 3 "comparison_operator" + [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand") + (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")]) + (match_operand:DI 6 "gcn_exec_reg_operand" "e")] + "" + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_vec_cmpdi_exec (tmp, operands[3], operands[4], + operands[5], operands[6])); + emit_insn (gen_vcond_mask_di (operands[0], operands[1], operands[2], + tmp)); + DONE; + }) + +;; }}} +;; {{{ Fully masked loop support + +(define_expand "while_ultsidi" + [(match_operand:DI 0 "register_operand") + (match_operand:SI 1 "") + (match_operand:SI 2 "")] + "" + { + if (GET_CODE (operands[1]) != CONST_INT + || GET_CODE (operands[2]) != CONST_INT) + { + rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + rtx tmp = _0_1_2_3; + if (GET_CODE (operands[1]) != CONST_INT + || INTVAL (operands[1]) != 0) + { + tmp = gen_reg_rtx (V64SImode); + emit_insn (gen_addv64si3_dup (tmp, _0_1_2_3, operands[1])); + } + emit_insn (gen_vec_cmpv64sidi_dup (operands[0], + gen_rtx_GT (VOIDmode, 0, 0), + operands[2], tmp)); + } + else + { + HOST_WIDE_INT diff = INTVAL (operands[2]) - INTVAL (operands[1]); + HOST_WIDE_INT mask = (diff >= 64 ? 
-1 + : ~((unsigned HOST_WIDE_INT)-1 << diff)); + emit_move_insn (operands[0], gen_rtx_CONST_INT (VOIDmode, mask)); + } + DONE; + }) + +(define_expand "maskloaddi" + [(match_operand:VEC_REG_MODE 0 "register_operand") + (match_operand:VEC_REG_MODE 1 "memory_operand") + (match_operand 2 "")] + "" + { + rtx exec = force_reg (DImode, operands[2]); + rtx addr = gcn_expand_scalar_to_vector_address + (mode, exec, operands[1], gen_rtx_SCRATCH (V64DImode)); + rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1])); + rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1])); + rtx undef = gcn_gen_undef (mode); + emit_insn (gen_gather_expr_exec (operands[0], addr, as, v, undef, + exec)); + DONE; + }) + +(define_expand "maskstoredi" + [(match_operand:VEC_REG_MODE 0 "memory_operand") + (match_operand:VEC_REG_MODE 1 "register_operand") + (match_operand 2 "")] + "" + { + rtx exec = force_reg (DImode, operands[2]); + rtx addr = gcn_expand_scalar_to_vector_address + (mode, exec, operands[0], gen_rtx_SCRATCH (V64DImode)); + rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0])); + rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0])); + emit_insn (gen_scatter_expr_exec (addr, operands[1], as, v, exec)); + DONE; + }) + +(define_expand "mask_gather_load" + [(match_operand:VEC_REG_MODE 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand 2 "register_operand") + (match_operand 3 "immediate_operand") + (match_operand:SI 4 "gcn_alu_operand") + (match_operand:DI 5 "")] + "" + { + rtx exec = force_reg (DImode, operands[5]); + + /* TODO: more conversions will be needed when more types are vectorized. */ + if (GET_MODE (operands[2]) == V64DImode) + { + rtx tmp = gen_reg_rtx (V64SImode); + emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[2], + gcn_gen_undef (V64SImode), + exec)); + operands[2] = tmp; + } + + emit_insn (gen_gather_exec (operands[0], operands[1], operands[2], + operands[3], operands[4], exec)); + DONE; + }) + +(define_expand "mask_scatter_store" + [(match_operand:DI 0 "register_operand") + (match_operand 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand:SI 3 "gcn_alu_operand") + (match_operand:VEC_REG_MODE 4 "register_operand") + (match_operand:DI 5 "")] + "" + { + rtx exec = force_reg (DImode, operands[5]); + + /* TODO: more conversions will be needed when more types are vectorized. */ + if (GET_MODE (operands[1]) == V64DImode) + { + rtx tmp = gen_reg_rtx (V64SImode); + emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[1], + gcn_gen_undef (V64SImode), + exec)); + operands[1] = tmp; + } + + emit_insn (gen_scatter_exec (operands[0], operands[1], operands[2], + operands[3], operands[4], exec)); + DONE; + }) + +; FIXME this should be VEC_REG_MODE, but not all dependencies are implemented. 
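+
+; (All of the masked operations above share one model: a DImode value holds
+; one mask bit per lane, and only lanes whose bit is set participate.  For
+; instance, while_ultsidi computes, illustratively:
+;
+;   uint64_t while_ult (uint32_t i, uint32_t n)
+;   {
+;     uint64_t mask = 0;
+;     for (int lane = 0; lane < 64; lane++)
+;       if (i + (uint32_t) lane < n)
+;         mask |= (uint64_t) 1 << lane;
+;     return mask;
+;   }
+;
+; which, for constant bounds, collapses to the diff/mask computation in its
+; expander.)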
+(define_mode_iterator COND_MODE [V64SI V64DI V64SF V64DF]) +(define_mode_iterator COND_INT_MODE [V64SI V64DI]) + +(define_code_iterator cond_op [plus minus]) + +(define_expand "cond_" + [(match_operand:COND_MODE 0 "register_operand") + (match_operand:DI 1 "register_operand") + (cond_op:COND_MODE + (match_operand:COND_MODE 2 "gcn_alu_operand") + (match_operand:COND_MODE 3 "gcn_alu_operand")) + (match_operand:COND_MODE 4 "register_operand")] + "" + { + operands[1] = force_reg (DImode, operands[1]); + operands[2] = force_reg (mode, operands[2]); + + emit_insn (gen_3_exec (operands[0], operands[2], + operands[3], operands[4], + operands[1])); + DONE; + }) + +(define_code_iterator cond_bitop [and ior xor]) + +(define_expand "cond_" + [(match_operand:COND_INT_MODE 0 "register_operand") + (match_operand:DI 1 "register_operand") + (cond_bitop:COND_INT_MODE + (match_operand:COND_INT_MODE 2 "gcn_alu_operand") + (match_operand:COND_INT_MODE 3 "gcn_alu_operand")) + (match_operand:COND_INT_MODE 4 "register_operand")] + "" + { + operands[1] = force_reg (DImode, operands[1]); + operands[2] = force_reg (mode, operands[2]); + + emit_insn (gen_3_exec (operands[0], operands[2], + operands[3], operands[4], + operands[1])); + DONE; + }) + +;; }}} +;; {{{ Vector reductions + +(define_int_iterator REDUC_UNSPEC [UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR + UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR + UNSPEC_PLUS_DPP_SHR + UNSPEC_AND_DPP_SHR + UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR]) + +(define_int_iterator REDUC_2REG_UNSPEC [UNSPEC_PLUS_DPP_SHR + UNSPEC_AND_DPP_SHR + UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR]) + +; FIXME: Isn't there a better way of doing this? +(define_int_attr reduc_unspec [(UNSPEC_SMIN_DPP_SHR "UNSPEC_SMIN_DPP_SHR") + (UNSPEC_SMAX_DPP_SHR "UNSPEC_SMAX_DPP_SHR") + (UNSPEC_UMIN_DPP_SHR "UNSPEC_UMIN_DPP_SHR") + (UNSPEC_UMAX_DPP_SHR "UNSPEC_UMAX_DPP_SHR") + (UNSPEC_PLUS_DPP_SHR "UNSPEC_PLUS_DPP_SHR") + (UNSPEC_AND_DPP_SHR "UNSPEC_AND_DPP_SHR") + (UNSPEC_IOR_DPP_SHR "UNSPEC_IOR_DPP_SHR") + (UNSPEC_XOR_DPP_SHR "UNSPEC_XOR_DPP_SHR")]) + +(define_int_attr reduc_op [(UNSPEC_SMIN_DPP_SHR "smin") + (UNSPEC_SMAX_DPP_SHR "smax") + (UNSPEC_UMIN_DPP_SHR "umin") + (UNSPEC_UMAX_DPP_SHR "umax") + (UNSPEC_PLUS_DPP_SHR "plus") + (UNSPEC_AND_DPP_SHR "and") + (UNSPEC_IOR_DPP_SHR "ior") + (UNSPEC_XOR_DPP_SHR "xor")]) + +(define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0") + (UNSPEC_SMAX_DPP_SHR "v_max%i0") + (UNSPEC_UMIN_DPP_SHR "v_min%u0") + (UNSPEC_UMAX_DPP_SHR "v_max%u0") + (UNSPEC_PLUS_DPP_SHR "v_add%u0") + (UNSPEC_AND_DPP_SHR "v_and%b0") + (UNSPEC_IOR_DPP_SHR "v_or%b0") + (UNSPEC_XOR_DPP_SHR "v_xor%b0")]) + +(define_expand "reduc__scal_" + [(set (match_operand: 0 "register_operand") + (unspec: + [(match_operand:VEC_1REG_MODE 1 "register_operand")] + REDUC_UNSPEC))] + "" + { + rtx tmp = gcn_expand_reduc_scalar (mode, operands[1], + ); + + /* The result of the reduction is in lane 63 of tmp. */ + emit_insn (gen_mov_from_lane63_ (operands[0], tmp)); + + DONE; + }) + +(define_expand "reduc__scal_v64di" + [(set (match_operand:DI 0 "register_operand") + (unspec:DI + [(match_operand:V64DI 1 "register_operand")] + REDUC_2REG_UNSPEC))] + "" + { + rtx tmp = gcn_expand_reduc_scalar (V64DImode, operands[1], + ); + + /* The result of the reduction is in lane 63 of tmp. 
*/ + emit_insn (gen_mov_from_lane63_v64di (operands[0], tmp)); + + DONE; + }) + +(define_insn "*_dpp_shr_" + [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v") + (unspec:VEC_1REG_MODE + [(match_operand:VEC_1REG_MODE 1 "register_operand" "v") + (match_operand:VEC_1REG_MODE 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n")] + REDUC_UNSPEC))] + "!(TARGET_GCN3 && SCALAR_INT_MODE_P (mode) + && == UNSPEC_PLUS_DPP_SHR)" + { + return gcn_expand_dpp_shr_insn (mode, "", + , INTVAL (operands[3])); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "8")]) + +(define_insn_and_split "*_dpp_shr_v64di" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (unspec:V64DI + [(match_operand:V64DI 1 "register_operand" "v0") + (match_operand:V64DI 2 "register_operand" "v0") + (match_operand:SI 3 "const_int_operand" "n")] + REDUC_2REG_UNSPEC))] + "" + "#" + "reload_completed" + [(set (match_dup 4) + (unspec:V64SI + [(match_dup 6) (match_dup 8) (match_dup 3)] REDUC_2REG_UNSPEC)) + (set (match_dup 5) + (unspec:V64SI + [(match_dup 7) (match_dup 9) (match_dup 3)] REDUC_2REG_UNSPEC))] + { + operands[4] = gcn_operand_part (V64DImode, operands[0], 0); + operands[5] = gcn_operand_part (V64DImode, operands[0], 1); + operands[6] = gcn_operand_part (V64DImode, operands[1], 0); + operands[7] = gcn_operand_part (V64DImode, operands[1], 1); + operands[8] = gcn_operand_part (V64DImode, operands[2], 0); + operands[9] = gcn_operand_part (V64DImode, operands[2], 1); + } + [(set_attr "type" "vmult") + (set_attr "length" "16")]) + +; Special cases for addition. + +(define_insn "*plus_carry_dpp_shr_" + [(set (match_operand:VEC_1REG_INT_MODE 0 "register_operand" "=v") + (unspec:VEC_1REG_INT_MODE + [(match_operand:VEC_1REG_INT_MODE 1 "register_operand" "v") + (match_operand:VEC_1REG_INT_MODE 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n")] + UNSPEC_PLUS_CARRY_DPP_SHR)) + (clobber (reg:DI VCC_REG))] + "" + { + const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0"; + return gcn_expand_dpp_shr_insn (mode, insn, + UNSPEC_PLUS_CARRY_DPP_SHR, + INTVAL (operands[3])); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "8")]) + +(define_insn "*plus_carry_in_dpp_shr_v64si" + [(set (match_operand:V64SI 0 "register_operand" "=v") + (unspec:V64SI + [(match_operand:V64SI 1 "register_operand" "v") + (match_operand:V64SI 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n") + (match_operand:DI 4 "register_operand" "cV")] + UNSPEC_PLUS_CARRY_IN_DPP_SHR)) + (clobber (reg:DI VCC_REG))] + "" + { + const char *insn = TARGET_GCN3 ? 
"v_addc%u0" : "v_addc_co%u0"; + return gcn_expand_dpp_shr_insn (V64SImode, insn, + UNSPEC_PLUS_CARRY_IN_DPP_SHR, + INTVAL (operands[3])); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "8")]) + +(define_insn_and_split "*plus_carry_dpp_shr_v64di" + [(set (match_operand:V64DI 0 "register_operand" "=&v") + (unspec:V64DI + [(match_operand:V64DI 1 "register_operand" "v0") + (match_operand:V64DI 2 "register_operand" "v0") + (match_operand:SI 3 "const_int_operand" "n")] + UNSPEC_PLUS_CARRY_DPP_SHR)) + (clobber (reg:DI VCC_REG))] + "" + "#" + "reload_completed" + [(parallel [(set (match_dup 4) + (unspec:V64SI + [(match_dup 6) (match_dup 8) (match_dup 3)] + UNSPEC_PLUS_CARRY_DPP_SHR)) + (clobber (reg:DI VCC_REG))]) + (parallel [(set (match_dup 5) + (unspec:V64SI + [(match_dup 7) (match_dup 9) (match_dup 3) (reg:DI VCC_REG)] + UNSPEC_PLUS_CARRY_IN_DPP_SHR)) + (clobber (reg:DI VCC_REG))])] + { + operands[4] = gcn_operand_part (V64DImode, operands[0], 0); + operands[5] = gcn_operand_part (V64DImode, operands[0], 1); + operands[6] = gcn_operand_part (V64DImode, operands[1], 0); + operands[7] = gcn_operand_part (V64DImode, operands[1], 1); + operands[8] = gcn_operand_part (V64DImode, operands[2], 0); + operands[9] = gcn_operand_part (V64DImode, operands[2], 1); + } + [(set_attr "type" "vmult") + (set_attr "length" "16")]) + +; Instructions to move a scalar value from lane 63 of a vector register. +(define_insn "mov_from_lane63_" + [(set (match_operand: 0 "register_operand" "=Sg,v") + (unspec: + [(match_operand:VEC_1REG_MODE 1 "register_operand" "v,v")] + UNSPEC_MOV_FROM_LANE63))] + "" + "@ + v_readlane_b32\t%0, %1, 63 + v_mov_b32\t%0, %1 wave_ror:1" + [(set_attr "type" "vop3a,vop_dpp") + (set_attr "exec" "none,*") + (set_attr "length" "8")]) + +(define_insn "mov_from_lane63_v64di" + [(set (match_operand:DI 0 "register_operand" "=Sg,v") + (unspec:DI + [(match_operand:V64DI 1 "register_operand" "v,v")] + UNSPEC_MOV_FROM_LANE63))] + "" + "@ + v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63 + * if (REGNO (operands[0]) <= REGNO (operands[1])) \ + return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \ + \"v_mov_b32\t%H0, %H1 wave_ror:1\"; \ + else \ + return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \ + \"v_mov_b32\t%L0, %L1 wave_ror:1\";" + [(set_attr "type" "vop3a,vop_dpp") + (set_attr "exec" "none,*") + (set_attr "length" "8")]) + +;; }}} +;; {{{ Miscellaneous + +(define_expand "vec_seriesv64si" + [(match_operand:V64SI 0 "register_operand") + (match_operand:SI 1 "gcn_alu_operand") + (match_operand:SI 2 "gcn_alu_operand")] + "" + { + rtx tmp = gen_reg_rtx (V64SImode); + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + + emit_insn (gen_mulv64si3_dup (tmp, v1, operands[2])); + emit_insn (gen_addv64si3_dup (operands[0], tmp, operands[1])); + DONE; + }) + +(define_expand "vec_seriesv64di" + [(match_operand:V64DI 0 "register_operand") + (match_operand:DI 1 "gcn_alu_operand") + (match_operand:DI 2 "gcn_alu_operand")] + "" + { + rtx tmp = gen_reg_rtx (V64DImode); + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + + emit_insn (gen_mulv64di3_zext_dup2 (tmp, v1, operands[2])); + emit_insn (gen_addv64di3_dup (operands[0], tmp, operands[1])); + DONE; + }) + +;; }}} diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md new file mode 100644 index 00000000000..4573a4ce32f --- /dev/null +++ b/gcc/config/gcn/gcn.md @@ -0,0 +1,1857 @@ +;; Copyright (C) 2016-2019 Free Software Foundation, Inc. 
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;- See file "rtl.def" for documentation on define_insn, match_*, et al.
+
+(include "predicates.md")
+(include "constraints.md")
+
+;; {{{ Constants and enums
+
+; Named registers
+(define_constants
+  [(FIRST_SGPR_REG 0)
+   (LAST_SGPR_REG 101)
+   (FLAT_SCRATCH_REG 102)
+   (FLAT_SCRATCH_LO_REG 102)
+   (FLAT_SCRATCH_HI_REG 103)
+   (XNACK_MASK_REG 104)
+   (XNACK_MASK_LO_REG 104)
+   (XNACK_MASK_HI_REG 105)
+   (VCC_REG 106)
+   (VCC_LO_REG 106)
+   (VCC_HI_REG 107)
+   (VCCZ_REG 108)
+   (TBA_REG 109)
+   (TBA_LO_REG 109)
+   (TBA_HI_REG 110)
+   (TMA_REG 111)
+   (TMA_LO_REG 111)
+   (TMA_HI_REG 112)
+   (TTMP0_REG 113)
+   (TTMP11_REG 124)
+   (M0_REG 125)
+   (EXEC_REG 126)
+   (EXEC_LO_REG 126)
+   (EXEC_HI_REG 127)
+   (EXECZ_REG 128)
+   (SCC_REG 129)
+   (FIRST_VGPR_REG 160)
+   (LAST_VGPR_REG 415)])
+
+(define_constants
+  [(SP_REGNUM 16)
+   (LR_REGNUM 18)
+   (AP_REGNUM 416)
+   (FP_REGNUM 418)])
+
+(define_c_enum "unspecv" [
+  UNSPECV_PROLOGUE_USE
+  UNSPECV_KERNEL_RETURN
+  UNSPECV_BARRIER
+  UNSPECV_ATOMIC
+  UNSPECV_ICACHE_INV])
+
+(define_c_enum "unspec" [
+  UNSPEC_VECTOR
+  UNSPEC_BPERMUTE
+  UNSPEC_SGPRBASE
+  UNSPEC_MEMORY_BARRIER
+  UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+  UNSPEC_PLUS_DPP_SHR
+  UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
+  UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+  UNSPEC_MOV_FROM_LANE63
+  UNSPEC_GATHER
+  UNSPEC_SCATTER])
+
+;; }}}
+;; {{{ Attributes
+
+; Instruction type (encoding) as described in the ISA specification.
+; The following table summarizes possible operands of individual instruction
+; types and the corresponding constraints.
+;
+; sop2 - scalar, two inputs, one output
+;        ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;                     vccz,execz,scc,inline immediate,fp inline immediate
+;        sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;
+;        Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
+;
+; sopk - scalar, inline constant input, one output
+;        simm16: 16-bit inline constant
+;        sdst: same as sop2/ssrc0
+;
+;        Constraints "=SD", "J"
+;
+; sop1 - scalar, one input, one output
+;        ssrc0: same as sop2/ssrc0.  FIXME: the manual omits VCCZ
+;        sdst: same as sop2/sdst
+;
+;        Constraints "=SD", "SSA"
+;
+; sopc - scalar, two inputs, one comparison
+;        ssrc0: same as sop2/ssrc0.
+;
+;        Constraints "SSI,SSA","SSA,SSI"
+;
+; sopp - scalar, one constant input, one special
+;        simm16
+;
+; smem - scalar memory
+;        sbase: aligned pair of sgprs.  Specify {size[15:0], base[47:0]} in
+;               dwords
+;        sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
+;        offset: sgpr or 20-bit unsigned byte offset
+;
+; vop2 - vector, two inputs, one output
+;        vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
+;               inline constant -16 to -64, fp inline immediate, vccz, execz,
+;               scc, lds, literal constant, vgpr0-255
+;        vsrc1: vgpr0-255
+;        vdst: vgpr0-255
+;        Limitations: At most one SGPR, at most one constant
+;                     if constant is used, SGPR must be M0
+;                     Only SRC0 can be LDS_DIRECT
+;
+;        constraints: "=v", "vBSv", "v"
+;
+; vop1 - vector, one input, one output
+;        vsrc0: same as vop2/src0
+;        vdst: vgpr0-255
+;
+;        constraints: "=v", "vBSv"
+;
+; vopc - vector, two inputs, one comparison output
+;        vsrc0: same as vop2/src0
+;        vsrc1: vgpr0-255
+;        vdst:
+;
+;        constraints: "vASv", "v"
+;
+; vop3a - vector, three inputs, one output
+;        vdst: vgpr0-255, for v_cmp sgpr or vcc
+;        abs,clamp
+;        vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
+;               inline constant -16 to -64, fp inline immediate, vccz, execz,
+;               scc, lds_direct
+;        FIXME: really missing 1/pi? really 104 SGPRs
+;
+; vop3b - vector, three inputs, one vector output, one scalar output
+;        vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
+;        vdst: vgpr0-255
+;        sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
+;
+; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
+;        src0: vgpr0-255
+;        dst_sel: BYTE_0-3, WORD_0-1, DWORD
+;        dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
+;        clamp: true/false
+;        src0_sel: BYTE_0-3, WORD_0-1, DWORD
+;        flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
+;               src1_abs
+;
+; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
+;        src0: vgpr0-255
+;        dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
+;                  wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
+;                  bcast15, bcast31
+;        flags: src0_neg, src0_abs, src1_neg, src1_abs
+;        bank_mask: 4-bit mask
+;        row_mask: 4-bit mask
+;
+; ds - Local and global data share instructions.
+;        offset0: 8-bit constant
+;        offset1: 8-bit constant
+;        flag: gds
+;        addr: vgpr0-255
+;        data0: vgpr0-255
+;        data1: vgpr0-255
+;        vdst: vgpr0-255
+;
+; mubuf - Untyped memory buffer operation.  First word with LDS, second word
+;         non-LDS.
+;        offset: 12-bit constant
+;        vaddr: vgpr0-255
+;        vdata: vgpr0-255
+;        srsrc: sgpr0-102
+;        soffset: sgpr0-102
+;        flags: offen, idxen, glc, lds, slc, tfe
+;
+; mtbuf - Typed memory buffer operation.  Two words.
+;        offset: 12-bit constant
+;        dfmt: 4-bit constant
+;        nfmt: 3-bit constant
+;        vaddr: vgpr0-255
+;        vdata: vgpr0-255
+;        srsrc: sgpr0-102
+;        soffset: sgpr0-102
+;        flags: offen, idxen, glc, lds, slc, tfe
+;
+; flat - flat or global memory operations
+;        flags: glc, slc
+;        addr: vgpr0-255
+;        data: vgpr0-255
+;        vdst: vgpr0-255
+;
+; mult - expands to multiple instructions (pseudo encoding)
+;
+; vmult - as mult, when a vector instruction is used.
+
+(define_attr "type"
+  "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
+   vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+  (const_string "unknown"))
+
+; Set if the instruction is executed in the scalar or the vector unit.
+
+(define_attr "unit" "unknown,scalar,vector"
+  (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
+           (const_string "scalar")
+         (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+                          vop_sdwa,vop_dpp,flat,vmult")
+           (const_string "vector")]
+        (const_string "unknown")))
+
+; All vector instructions run as 64 threads, as predicated by the EXEC
+; register.  Scalar operations in vector registers require a single lane
+; enabled, vector moves require a full set of lanes enabled, and most vector
+; operations handle the lane masking themselves.
+; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
+; according to the following settings:
+;   auto   - md_reorg will inspect def/use to determine what to do.
+;   none   - exec is not needed.
+;   single - disable all but lane zero.
+;   full   - enable all lanes.
+
+(define_attr "exec" "auto,none,single,full"
+  (const_string "auto"))
+
+; Infer the (worst-case) length from the instruction type by default.  Many
+; types can have an optional immediate word following, which we include here.
+; "Multiple" types are counted as two 64-bit instructions.  This is just a
+; default fallback: it can be overridden per-alternative in insn patterns for
+; greater accuracy.
+
+(define_attr "length" ""
+  (cond [(eq_attr "type" "sop1") (const_int 8)
+         (eq_attr "type" "sop2") (const_int 8)
+         (eq_attr "type" "sopk") (const_int 8)
+         (eq_attr "type" "sopc") (const_int 8)
+         (eq_attr "type" "sopp") (const_int 4)
+         (eq_attr "type" "smem") (const_int 8)
+         (eq_attr "type" "ds") (const_int 8)
+         (eq_attr "type" "vop1") (const_int 8)
+         (eq_attr "type" "vop2") (const_int 8)
+         (eq_attr "type" "vopc") (const_int 8)
+         (eq_attr "type" "vop3a") (const_int 8)
+         (eq_attr "type" "vop3b") (const_int 8)
+         (eq_attr "type" "vop_sdwa") (const_int 8)
+         (eq_attr "type" "vop_dpp") (const_int 8)
+         (eq_attr "type" "flat") (const_int 8)
+         (eq_attr "type" "mult") (const_int 16)
+         (eq_attr "type" "vmult") (const_int 16)]
+        (const_int 4)))
+
+; Disable alternatives that only apply to specific ISA variants.
+
+(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+
+(define_attr "enabled" ""
+  (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
+         (and (eq_attr "gcn_version" "gcn5")
+              (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+           (const_int 1)]
+        (const_int 0)))
+
+; We need to be able to identify v_readlane and v_writelane with
+; SGPR lane selection in order to handle "Manually Inserted Wait States".
+
+(define_attr "laneselect" "yes,no" (const_string "no"))
+
+;; }}}
+;; {{{ Iterators useful across the whole machine description
+
+(define_mode_iterator SIDI [SI DI])
+(define_mode_iterator SFDF [SF DF])
+(define_mode_iterator SISF [SI SF])
+(define_mode_iterator QIHI [QI HI])
+(define_mode_iterator DIDF [DI DF])
+
+;; }}}
+;; {{{ Attributes.
+
+; Translate RTX code into GCN instruction mnemonics with and without
+; suffixes such as _b32, etc.
+
+(define_code_attr mnemonic
+  [(minus "sub%i")
+   (plus "add%i")
+   (ashift "lshl%b")
+   (lshiftrt "lshr%b")
+   (ashiftrt "ashr%i")
+   (and "and%B")
+   (ior "or%B")
+   (xor "xor%B")
+   (mult "mul%i")
+   (smin "min%i")
+   (smax "max%i")
+   (umin "min%u")
+   (umax "max%u")
+   (not "not%b")
+   (popcount "bcnt_u32%b")])
+
+(define_code_attr bare_mnemonic
+  [(plus "add")
+   (minus "sub")
+   (and "and")
+   (ior "or")
+   (xor "xor")])
+
+(define_code_attr s_mnemonic
+  [(not "not%b")
+   (popcount "bcnt1_i32%b")])
+
+(define_code_attr revmnemonic
+  [(minus "subrev%i")
+   (ashift "lshlrev%b")
+   (lshiftrt "lshrrev%b")
+   (ashiftrt "ashrrev%i")])
+
+; Translate RTX code into the corresponding expander name.
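+; For example, a pattern named "<expander><mode>3" instantiated with PLUS and
+; V64SImode yields the "addv64si3" used by the gen_addv64si3 calls earlier in
+; this patch, while "<mnemonic>" in its output template becomes "add%i",
+; which the output routines complete with a mode-dependent type suffix such
+; as _i32 or _f64.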
+ +(define_code_attr expander + [(and "and") + (ior "ior") + (xor "xor") + (plus "add") + (minus "sub") + (ashift "ashl") + (lshiftrt "lshr") + (ashiftrt "ashr") + (mult "mul") + (smin "smin") + (smax "smax") + (umin "umin") + (umax "umax") + (not "one_cmpl") + (popcount "popcount")]) + +;; }}} +;; {{{ Miscellaneous instructions + +(define_insn "nop" + [(const_int 0)] + "" + "s_nop\t0x0" + [(set_attr "type" "sopp")]) + +; FIXME: What should the value of the immediate be? Zero is disallowed, so +; pick 1 for now. +(define_insn "trap" + [(trap_if (const_int 1) (const_int 0))] + "" + "s_trap\t1" + [(set_attr "type" "sopp")]) + +;; }}} +;; {{{ Moves + +;; All scalar modes we support moves in. +(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF]) + +; This is the entry point for creating all kinds of scalar moves, +; including reloads and symbols. + +(define_expand "mov" + [(set (match_operand:MOV_MODE 0 "nonimmediate_operand") + (match_operand:MOV_MODE 1 "general_operand"))] + "" + { + if (MEM_P (operands[0])) + operands[1] = force_reg (mode, operands[1]); + + if (!lra_in_progress && !reload_completed + && !gcn_valid_move_p (mode, operands[0], operands[1])) + { + /* Something is probably trying to generate a move + which can only work indirectly. + E.g. Move from LDS memory to SGPR hardreg + or MEM:QI to SGPR. */ + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_mov (tmpreg, operands[1])); + emit_insn (gen_mov (operands[0], tmpreg)); + DONE; + } + + if (mode == DImode + && (GET_CODE (operands[1]) == SYMBOL_REF + || GET_CODE (operands[1]) == LABEL_REF)) + { + emit_insn (gen_movdi_symbol (operands[0], operands[1])); + DONE; + } + }) + +; Split invalid moves into two valid moves + +(define_split + [(set (match_operand:MOV_MODE 0 "nonimmediate_operand") + (match_operand:MOV_MODE 1 "general_operand"))] + "!reload_completed && !lra_in_progress + && !gcn_valid_move_p (mode, operands[0], operands[1])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))] + { + operands[2] = gen_reg_rtx(mode); + }) + +; We need BImode move so we can reload flags registers. + +(define_insn "*movbi" + [(set (match_operand:BI 0 "nonimmediate_operand" + "=Sg, v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM") + (match_operand:BI 1 "gcn_load_operand" + "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))] + "" + { + /* SCC as an operand is currently not accepted by the LLVM assembler, so + we emit bytes directly as a workaround. 
*/ + switch (which_alternative) { + case 0: + if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG) + return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;" + ".byte\t0xfd\;" + ".byte\t0x0\;" + ".byte\t0x80|%R0\;" + ".byte\t0xbe"; + else + return "s_mov_b32\t%0, %1"; + case 1: + if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG) + return "; v_mov_b32\t%0, %1\;" + ".byte\t0xfd\;" + ".byte\t0x2\;" + ".byte\t((%V0<<1)&0xff)\;" + ".byte\t0x7e|(%V0>>7)"; + else + return "v_mov_b32\t%0, %1"; + case 2: + return "v_readlane_b32\t%0, %1, 0"; + case 3: + return "s_cmpk_lg_u32\t%1, 0"; + case 4: + return "v_cmp_ne_u32\tvcc, 0, %1"; + case 5: + if (REGNO (operands[1]) == SCC_REG) + return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;" + ".byte\t0xfd\;" + ".byte\t0x0\;" + ".byte\t0xea\;" + ".byte\t0xbe\;" + "s_mov_b32\tvcc_hi, 0"; + else + return "s_mov_b32\tvcc_lo, %1\;" + "s_mov_b32\tvcc_hi, 0"; + case 6: + return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)"; + case 7: + return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)"; + case 8: + return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0"; + case 9: + return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"; + case 10: + return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)"; + case 11: + return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat, + flat,flat") + (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*") + (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")]) + +; 32bit move pattern + +(define_insn "*mov_insn" + [(set (match_operand:SISF 0 "nonimmediate_operand" + "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG, v,SD, v,RM") + (match_operand:SISF 1 "gcn_load_operand" + "SSA, J, B,RB,Sm,RS,Sm,v, v,Sv,RF, v,B, v,RLRG, Y,RM, v"))] + "" + "@ + s_mov_b32\t%0, %1 + s_movk_i32\t%0, %1 + s_mov_b32\t%0, %1 + s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) + s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0) + s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0) + v_mov_b32\t%0, %1 + v_readlane_b32\t%0, %1, 0 + v_writelane_b32\t%0, %1, 0 + flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 + flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + v_mov_b32\t%0, %1 + ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + s_mov_b32\t%0, %1 + global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)" + [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat, + flat,vop1,ds,ds,sop1,flat,flat") + (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*") + (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")]) + +; 8/16bit move pattern + +(define_insn "*mov_insn" + [(set (match_operand:QIHI 0 "nonimmediate_operand" + "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG, v, v,RM") + (match_operand:QIHI 1 "gcn_load_operand" + "SSA, J, B,v, v,Sv,RF, v,B, v,RLRG,RM, v"))] + "gcn_valid_move_p (mode, operands[0], operands[1])" + "@ + s_mov_b32\t%0, %1 + s_movk_i32\t%0, %1 + s_mov_b32\t%0, %1 + v_mov_b32\t%0, %1 + v_readlane_b32\t%0, %1, 0 + v_writelane_b32\t%0, %1, 0 + flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 + flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + v_mov_b32\t%0, %1 + ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + 
+ global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+ [(set_attr "type"
+ "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
+ (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
+ (set_attr "length" "4,4,8,4,4,4,12,12,8,12,12,12,12")])
+
+; 64bit move pattern
+
+(define_insn_and_split "*mov<mode>_insn"
+ [(set (match_operand:DIDF 0 "nonimmediate_operand"
+ "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG, v, v,RM")
+ (match_operand:DIDF 1 "general_operand"
+ "SSA, C,DB,Sm,RS,v,DB, v,Sv,RF, v, v,RLRG,RM, v"))]
+ "GET_CODE(operands[1]) != SYMBOL_REF"
+ "@
+ s_mov_b64\t%0, %1
+ s_mov_b64\t%0, %1
+ #
+ s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+ s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ #
+ #
+ #
+ #
+ flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+ ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+ "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
+ && !gcn_sgpr_move_p (operands[0], operands[1]))
+ || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))]
+ {
+ rtx inlo = gen_lowpart (SImode, operands[1]);
+ rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
+ rtx outlo = gen_lowpart (SImode, operands[0]);
+ rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
+
+ /* Ensure that overlapping registers aren't corrupted. */
+ if (REGNO (outlo) == REGNO (inhi))
+ {
+ operands[0] = outhi;
+ operands[1] = inhi;
+ operands[2] = outlo;
+ operands[3] = inlo;
+ }
+ else
+ {
+ operands[0] = outlo;
+ operands[1] = inlo;
+ operands[2] = outhi;
+ operands[3] = inhi;
+ }
+ }
+ [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
+ flat,ds,ds,flat,flat")
+ (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
+
+; 128-bit move.
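+; (The post-reload splitter below moves the value one 32-bit part at a
+; time. The parts are assigned from index 3 down to 0 so that
+; gcn_operand_part is applied to the original operands[0]/operands[1]
+; while they are still intact; they are only overwritten with their own
+; part-0 values in the final step.)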
+ +(define_insn_and_split "*movti_insn" + [(set (match_operand:TI 0 "nonimmediate_operand" + "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v") + (match_operand:TI 1 "general_operand" + "SSB,Sm,RS, v,RF,v,Sv, v, v,RM, v,RL"))] + "" + "@ + # + s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0) + s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 + # + # + # + global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)" + "reload_completed + && REG_P (operands[0]) + && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5)) + (set (match_dup 6) (match_dup 7))] + { + operands[6] = gcn_operand_part (TImode, operands[0], 3); + operands[7] = gcn_operand_part (TImode, operands[1], 3); + operands[4] = gcn_operand_part (TImode, operands[0], 2); + operands[5] = gcn_operand_part (TImode, operands[1], 2); + operands[2] = gcn_operand_part (TImode, operands[0], 1); + operands[3] = gcn_operand_part (TImode, operands[1], 1); + operands[0] = gcn_operand_part (TImode, operands[0], 0); + operands[1] = gcn_operand_part (TImode, operands[1], 0); + } + [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\ + ds,ds") + (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")]) + +;; }}} +;; {{{ Prologue/Epilogue + +(define_insn "prologue_use" + [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)] + "" + "" + [(set_attr "length" "0")]) + +(define_expand "prologue" + [(const_int 0)] + "" + { + gcn_expand_prologue (); + DONE; + }) + +(define_expand "epilogue" + [(const_int 0)] + "" + { + gcn_expand_epilogue (); + DONE; + }) + +;; }}} +;; {{{ Control flow + +; This pattern must satisfy simplejump_p, which means it cannot be a parallel +; that clobbers SCC. Thus, we must preserve SCC if we're generating a long +; branch sequence. + +(define_insn "jump" + [(set (pc) + (label_ref (match_operand 0)))] + "" + { + if (get_attr_length (insn) == 4) + return "s_branch\t%0"; + else + /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG. */ + return "; s_mov_b32\ts22, scc is not supported by the assembler.\;" + ".long\t0xbe9600fd\;" + "s_getpc_b64\ts[20:21]\;" + "s_add_u32\ts20, s20, %0@rel32@lo+4\;" + "s_addc_u32\ts21, s21, %0@rel32@hi+4\;" + "s_cmpk_lg_u32\ts22, 0\;" + "s_setpc_b64\ts[20:21]"; + } + [(set_attr "type" "sopp") + (set (attr "length") + (if_then_else (and (ge (minus (match_dup 0) (pc)) + (const_int -131072)) + (lt (minus (match_dup 0) (pc)) + (const_int 131072))) + (const_int 4) + (const_int 32)))]) + +(define_insn "indirect_jump" + [(set (pc) + (match_operand:DI 0 "register_operand" "Sg"))] + "" + "s_setpc_b64\t%0" + [(set_attr "type" "sop1") + (set_attr "length" "4")]) + +(define_insn "cjump" + [(set (pc) + (if_then_else + (match_operator:BI 1 "gcn_conditional_operator" + [(match_operand:BI 2 "gcn_conditional_register_operand" "ca,cV") + (const_int 0)]) + (label_ref (match_operand 0)) + (pc)))] + "" + { + if (get_attr_length (insn) == 4) + return "s_cbranch%C1\t%0"; + else + { + /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG but + restores SCC. 
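+ After the branch-over, the value of SCC is known from the
+ direction taken, so the SCC_REG cases below rebuild it with
+ s_cmp_lg_u32 0, 0 or s_cmp_eq_u32 0, 0; the other cases copy
+ SCC to s22 with hand-encoded bytes and restore it with
+ s_cmpk_lg_u32.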
*/ + if (REGNO (operands[2]) == SCC_REG) + { + if (GET_CODE (operands[1]) == EQ) + return "s_cbranch%c1\t.Lskip%=\;" + "s_getpc_b64\ts[20:21]\;" + "s_add_u32\ts20, s20, %0@rel32@lo+4\;" + "s_addc_u32\ts21, s21, %0@rel32@hi+4\;" + "s_cmp_lg_u32\t0, 0\;" + "s_setpc_b64\ts[20:21]\n" + ".Lskip%=:"; + else + return "s_cbranch%c1\t.Lskip%=\;" + "s_getpc_b64\ts[20:21]\;" + "s_add_u32\ts20, s20, %0@rel32@lo+4\;" + "s_addc_u32\ts21, s21, %0@rel32@hi+4\;" + "s_cmp_eq_u32\t0, 0\;" + "s_setpc_b64\ts[20:21]\n" + ".Lskip%=:"; + } + else + return "s_cbranch%c1\t.Lskip%=\;" + "; s_mov_b32\ts22, scc is not supported by the assembler.\;" + ".byte\t0xfd\;" + ".byte\t0x0\;" + ".byte\t0x80|22\;" + ".byte\t0xbe\;" + "s_getpc_b64\ts[20:21]\;" + "s_add_u32\ts20, s20, %0@rel32@lo+4\;" + "s_addc_u32\ts21, s21, %0@rel32@hi+4\;" + "s_cmpk_lg_u32\ts22, 0\;" + "s_setpc_b64\ts[20:21]\n" + ".Lskip%=:"; + } + } + [(set_attr "type" "sopp") + (set (attr "length") + (if_then_else (and (ge (minus (match_dup 0) (pc)) + (const_int -131072)) + (lt (minus (match_dup 0) (pc)) + (const_int 131072))) + (const_int 4) + (const_int 36)))]) + +; Returning from a normal function is different to returning from a +; kernel function. + +(define_insn "gcn_return" + [(return)] + "" + { + if (cfun && cfun->machine && cfun->machine->normal_function) + return "s_setpc_b64\ts[18:19]"; + else + return "s_dcache_wb\;s_endpgm"; + } + [(set_attr "type" "sop1") + (set_attr "length" "8")]) + +(define_expand "call" + [(parallel [(call (match_operand 0 "") + (match_operand 1 "")) + (clobber (reg:DI LR_REGNUM)) + (clobber (match_scratch:DI 2))])] + "" + {}) + +(define_insn "gcn_simple_call" + [(call (mem (match_operand 0 "immediate_operand" "Y,B")) + (match_operand 1 "const_int_operand")) + (clobber (reg:DI LR_REGNUM)) + (clobber (match_scratch:DI 2 "=&Sg,X"))] + "" + "@ + s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2 + s_swappc_b64\ts[18:19], %0" + [(set_attr "type" "mult,sop1") + (set_attr "length" "24,4")]) + +(define_insn "movdi_symbol" + [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg") + (match_operand:DI 1 "general_operand" "Y")) + (clobber (reg:BI SCC_REG))] + "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF" + { + if (SYMBOL_REF_P (operands[1]) + && SYMBOL_REF_WEAK (operands[1])) + return "s_getpc_b64\t%0\;" + "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;" + "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;" + "s_load_dwordx2\t%0, %0\;" + "s_waitcnt\tlgkmcnt(0)"; + + return "s_getpc_b64\t%0\;" + "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;" + "s_addc_u32\t%H0, %H0, %1@rel32@hi+4"; + } + [(set_attr "type" "mult") + (set_attr "length" "32")]) + +(define_insn "gcn_indirect_call" + [(call (mem (match_operand:DI 0 "register_operand" "Sg")) + (match_operand 1 "" "")) + (clobber (reg:DI LR_REGNUM)) + (clobber (match_scratch:DI 2 "=X"))] + "" + "s_swappc_b64\ts[18:19], %0" + [(set_attr "type" "sop1") + (set_attr "length" "4")]) + +(define_expand "call_value" + [(parallel [(set (match_operand 0 "") + (call (match_operand 1 "") + (match_operand 2 ""))) + (clobber (reg:DI LR_REGNUM)) + (clobber (match_scratch:DI 3))])] + "" + {}) + +(define_insn "gcn_call_value" + [(set (match_operand 0 "register_operand" "=Sg,Sg") + (call (mem (match_operand 1 "immediate_operand" "Y,B")) + (match_operand 2 "const_int_operand"))) + (clobber (reg:DI LR_REGNUM)) + (clobber (match_scratch:DI 3 "=&Sg,X"))] + "" + "@ + s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, 
%H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
+ s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "24")])
+
+(define_insn "gcn_call_value_indirect"
+ [(set (match_operand 0 "register_operand" "=Sg")
+ (call (mem (match_operand:DI 1 "register_operand" "Sg"))
+ (match_operand 2 "" "")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3 "=X"))]
+ ""
+ "s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+; GCN does not have an instruction to clear only part of the instruction
+; cache, so the operands are ignored.
+
+(define_insn "clear_icache"
+ [(unspec_volatile
+ [(match_operand 0 "") (match_operand 1 "")]
+ UNSPECV_ICACHE_INV)]
+ ""
+ "s_icache_inv"
+ [(set_attr "type" "sopp")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ Conditionals
+
+; 32-bit compare, scalar unit only
+
+(define_insn "cstoresi4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand"
+ "=cs, cs, cs, cs")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:SI 2 "gcn_alu_operand" "SSA,SSA,SSB, SS")
+ (match_operand:SI 3 "gcn_alu_operand" "SSA,SSL, SS,SSB")]))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ s_cmpk%D1\t%2, %3
+ s_cmp%D1\t%2, %3
+ s_cmp%D1\t%2, %3"
+ [(set_attr "type" "sopc,sopk,sopk,sopk")
+ (set_attr "length" "4,4,8,8")])
+
+(define_expand "cbranchsi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; 64-bit compare; either unit, but scalar allows limited operators
+
+(define_expand "cstoredi4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand")
+ (match_operand:DI 3 "gcn_alu_operand")]))]
+ ""
+ {})
+
+(define_insn "cstoredi4_vec_and_scalar"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs, cV")
+ (match_operator:BI 1 "gcn_compare_64bit_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "%SSA,vSvC")
+ (match_operand:DI 3 "gcn_alu_operand" " SSC, v")]))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "sopc,vopc")
+ (set_attr "length" "8")])
+
+(define_insn "cstoredi4_vector"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "vSvB")
+ (match_operand:DI 3 "gcn_alu_operand" " v")]))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+(define_expand "cbranchdi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:DI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; FP compare; vector unit only
+
+(define_insn "cstore<mode>4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
+ (match_operator:BI 1 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 2 "gcn_alu_operand" "vB")
+ (match_operand:SFDF 3 "gcn_alu_operand" "v")]))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+(define_expand "cbranch<mode>4"
+ [(match_operator 0 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 1 "gcn_alu_operand")
+ (match_operand:SFDF 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+;; }}}
+;; {{{ ALU special cases: Plus
+
+(define_insn "addsi3"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, Sg, Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ, B,vBSv")))
+ (clobber (match_scratch:BI 3 "= cs, cs, cs, X"))
+ (clobber (match_scratch:DI 4 "= X, X, X, cV"))]
+ ""
+ "@
+ s_add_i32\t%0, %1, %2
+ s_addk_i32\t%0, %2
+ s_add_i32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,sopk,sop2,vop2")
+ (set_attr "length" "4,4,8,8")])
+
+(define_expand "addsi3_scc"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (clobber (reg:BI SCC_REG))
+ (clobber (scratch:DI))])]
+ ""
+ {})
+
+; Having this as an insn_and_split allows us to keep DImode adds together
+; through some RTL optimisation passes, and means the CC reg we set isn't
+; dependent on the constraint alternative (which doesn't seem to work well).
+
+; There's an early clobber in the case where "v[0:1]=v[1:2]+?" but
+; "v[0:1]=v[0:1]+?" is fine (as is "v[1:2]=v[0:1]+?", but that's trickier).
+
+; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
+; used as an operand due to the read of VCC, so we restrict constants to the
+; inlinable range for that alternative.
+
+(define_insn_and_split "adddi3"
+ [(set (match_operand:DI 0 "register_operand"
+ "=&Sg,&Sg,&Sg,&Sg,&v,&v,&v,&v")
+ (plus:DI (match_operand:DI 1 "register_operand"
+ " Sg, 0, 0, Sg, v, 0, 0, v")
+ (match_operand:DI 2 "nonmemory_operand"
+ " 0,SgB, 0,SgB, 0,vA, 0,vA")))
+ (clobber (match_scratch:BI 3 "= cs, cs, cs, cs, X, X, X, X"))
+ (clobber (match_scratch:DI 4 "= X, X, X, X,cV,cV,cV,cV"))]
+ ""
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+ {
+ rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
+ DImode)
+ ? VCC_REG : SCC_REG);
+
+ emit_insn (gen_addsi3_scalar_carry
+ (gcn_operand_part (DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ cc));
+ rtx val = gcn_operand_part (DImode, operands[2], 1);
+ if (val != const0_rtx)
+ emit_insn (gen_addcsi3_scalar
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1),
+ cc, cc));
+ else
+ emit_insn (gen_addcsi3_scalar_zero
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1),
+ cc));
+ DONE;
+ }
+ [(set_attr "type" "mult,mult,mult,mult,vmult,vmult,vmult,vmult")
+ (set_attr "length" "8")])
+
+(define_expand "adddi3_scc"
+ [(parallel [(set (match_operand:DI 0 "register_operand")
+ (plus:DI (match_operand:DI 1 "register_operand")
+ (match_operand:DI 2 "nonmemory_operand")))
+ (clobber (reg:BI SCC_REG))
+ (clobber (scratch:DI))])]
+ ""
+ {})
+
+;; Add with carry.
+ +(define_insn "addsi3_scalar_carry" + [(set (match_operand:SI 0 "register_operand" "= Sg, v") + (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v") + (match_operand:SI 2 "gcn_alu_operand" " SgB,vB"))) + (set (match_operand:BI 3 "register_operand" "= cs,cV") + (ltu:BI (plus:SI (match_dup 1) + (match_dup 2)) + (match_dup 1)))] + "" + "@ + s_add_u32\t%0, %1, %2 + v_add%^_u32\t%0, vcc, %2, %1" + [(set_attr "type" "sop2,vop2") + (set_attr "length" "8,8")]) + +(define_insn "addsi3_scalar_carry_cst" + [(set (match_operand:SI 0 "register_operand" "=Sg, v") + (plus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA, v") + (match_operand:SI 2 "const_int_operand" " n, n"))) + (set (match_operand:BI 4 "register_operand" "=cs,cV") + (geu:BI (plus:SI (match_dup 1) + (match_dup 2)) + (match_operand:SI 3 "const_int_operand" " n, n")))] + "INTVAL (operands[2]) == -INTVAL (operands[3])" + "@ + s_add_u32\t%0, %1, %2 + v_add%^_u32\t%0, vcc, %2, %1" + [(set_attr "type" "sop2,vop2") + (set_attr "length" "4")]) + +(define_insn "addcsi3_scalar" + [(set (match_operand:SI 0 "register_operand" "= Sg, v") + (plus:SI (plus:SI (zero_extend:SI + (match_operand:BI 3 "register_operand" "= cs,cV")) + (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")) + (match_operand:SI 2 "gcn_alu_operand" " SgB,vA"))) + (set (match_operand:BI 4 "register_operand" "= 3, 3") + (ior:BI (ltu:BI (plus:SI + (plus:SI + (zero_extend:SI (match_dup 3)) + (match_dup 1)) + (match_dup 2)) + (match_dup 2)) + (ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1)) + (match_dup 1))))] + "" + "@ + s_addc_u32\t%0, %1, %2 + v_addc%^_u32\t%0, vcc, %2, %1, vcc" + [(set_attr "type" "sop2,vop2") + (set_attr "length" "8,4")]) + +(define_insn "addcsi3_scalar_zero" + [(set (match_operand:SI 0 "register_operand" "=Sg, v") + (plus:SI (zero_extend:SI + (match_operand:BI 2 "register_operand" "=cs,cV")) + (match_operand:SI 1 "gcn_alu_operand" "SgA, v"))) + (set (match_dup 2) + (ltu:BI (plus:SI (zero_extend:SI (match_dup 2)) + (match_dup 1)) + (match_dup 1)))] + "" + "@ + s_addc_u32\t%0, %1, 0 + v_addc%^_u32\t%0, vcc, 0, %1, vcc" + [(set_attr "type" "sop2,vop2") + (set_attr "length" "4")]) + +; "addptr" is the same as "add" except that it must not write to VCC or SCC +; as a side-effect. Unfortunately GCN does not have a suitable instruction +; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp. +; Note that it is not safe to save/clobber/restore SCC because doing so will +; break data-flow analysis, so this must use vector registers. 
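+;
+; Illustration only (C sketch, not part of this patch): the carry
+; semantics used by the add-with-carry patterns above and by the
+; "addptrdi3" sequence below -- a 64-bit add built from 32-bit halves,
+; where carry-out means "the partial sum wrapped" (the ltu tests in the
+; RTL above), and addptr keeps that bit in the scratch CC_SAVE_REG
+; rather than in VCC or SCC:
+;
+;   unsigned long long
+;   add64_parts (unsigned lo1, unsigned hi1, unsigned lo2, unsigned hi2)
+;   {
+;     unsigned lo = lo1 + lo2;
+;     unsigned carry = lo < lo1;        /* carry-out: the ltu test */
+;     unsigned hi = hi1 + hi2 + carry;  /* second add consumes the carry */
+;     return ((unsigned long long) hi << 32) | lo;
+;   }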
+ +(define_insn "addptrdi3" + [(set (match_operand:DI 0 "register_operand" "= &v") + (plus:DI (match_operand:DI 1 "register_operand" " v0") + (match_operand:DI 2 "nonmemory_operand" "vDA0")))] + "" + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (DImode, CC_SAVE_REG) }; + + output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands); + output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands); + + return ""; + } + [(set_attr "type" "vmult") + (set_attr "length" "16")]) + +;; }}} +;; {{{ ALU special cases: Minus + +(define_insn "subsi3" + [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v, v") + (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA, v,vBSv") + (match_operand:SI 2 "gcn_alu_operand" "SgA, B, vBSv, v"))) + (clobber (match_scratch:BI 3 "=cs, cs, X, X")) + (clobber (match_scratch:DI 4 "= X, X, cV, cV"))] + "" + "@ + s_sub_i32\t%0, %1, %2 + s_sub_i32\t%0, %1, %2 + v_subrev%^_u32\t%0, vcc, %2, %1 + v_sub%^_u32\t%0, vcc, %1, %2" + [(set_attr "type" "sop2,sop2,vop2,vop2") + (set_attr "length" "4,8,8,8")]) + +(define_insn_and_split "subdi3" + [(set (match_operand:DI 0 "register_operand" "=Sg, Sg") + (minus:DI + (match_operand:DI 1 "gcn_alu_operand" "SgA,SgB") + (match_operand:DI 2 "gcn_alu_operand" "SgB,SgA"))) + (clobber (reg:BI SCC_REG))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + emit_insn (gen_subsi3_scalar_carry + (gcn_operand_part (DImode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (DImode, operands[2], 0))); + rtx val = gcn_operand_part (DImode, operands[2], 1); + if (val != const0_rtx) + emit_insn (gen_subcsi3_scalar + (gcn_operand_part (DImode, operands[0], 1), + gcn_operand_part (DImode, operands[1], 1), + gcn_operand_part (DImode, operands[2], 1))); + else + emit_insn (gen_subcsi3_scalar_zero + (gcn_operand_part (DImode, operands[0], 1), + gcn_operand_part (DImode, operands[1], 1))); + DONE; + } + [(set_attr "length" "8")]) + +(define_insn "subsi3_scalar_carry" + [(set (match_operand:SI 0 "register_operand" "=Sg, Sg") + (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB") + (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA"))) + (set (reg:BI SCC_REG) + (gtu:BI (minus:SI (match_dup 1) + (match_dup 2)) + (match_dup 1)))] + "" + "s_sub_u32\t%0, %1, %2" + [(set_attr "type" "sop2") + (set_attr "length" "8")]) + +(define_insn "subsi3_scalar_carry_cst" + [(set (match_operand:SI 0 "register_operand" "=Sg") + (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA") + (match_operand:SI 2 "const_int_operand" " n"))) + (set (reg:BI SCC_REG) + (leu:BI (minus:SI (match_dup 1) + (match_dup 2)) + (match_operand:SI 3 "const_int_operand" " n")))] + "INTVAL (operands[2]) == -INTVAL (operands[3])" + "s_sub_u32\t%0, %1, %2" + [(set_attr "type" "sop2") + (set_attr "length" "4")]) + +(define_insn "subcsi3_scalar" + [(set (match_operand:SI 0 "register_operand" "=Sg, Sg") + (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) + (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")) + (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA"))) + (set (reg:BI SCC_REG) + (ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) + (match_dup 1)) + (match_dup 2)) + (match_dup 1)) + (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) + (match_dup 1)) + (match_dup 1))))] + "" + "s_subb_u32\t%0, %1, %2" + [(set_attr "type" "sop2") + (set_attr "length" "8")]) + +(define_insn "subcsi3_scalar_zero" + [(set (match_operand:SI 0 "register_operand" "=Sg") + (minus:SI (zero_extend:SI (reg:BI SCC_REG)) + 
(match_operand:SI 1 "gcn_alu_operand" "SgA")))
+ (set (reg:BI SCC_REG)
+ (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
+ (match_dup 1)))]
+ ""
+ "s_subb_u32\t%0, %1, 0"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ ALU: mult
+
+; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
+; immediate.
+(define_insn "mulsi3"
+ [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, v")
+ (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASv")))]
+ ""
+ "@
+ s_mul_i32\t%0, %1, %2
+ s_mulk_i32\t%0, %2
+ s_mul_i32\t%0, %1, %2
+ v_mul_lo_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sopk,sop2,vop3a")
+ (set_attr "length" "4,4,8,4")])
+
+(define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
+(define_code_attr su [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
+(define_code_attr e [(sign_extend "e") (zero_extend "")])
+
+(define_insn "<su>mulsi3_highpart"
+ [(set (match_operand:SI 0 "register_operand" "= v")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI
+ (any_extend:DI
+ (match_operand:SI 1 "register_operand" "% v"))
+ (any_extend:DI
+ (match_operand:SI 2 "register_operand" "vSv")))
+ (const_int 32))))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "<u>mulhisi3"
+ [(set (match_operand:SI 0 "register_operand" "=v")
+ (mult:SI
+ (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
+ (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %1, %2 src0_sel:WORD_0 src1_sel:WORD_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+(define_insn "<u>mulqihi3_scalar"
+ [(set (match_operand:HI 0 "register_operand" "=v")
+ (mult:HI
+ (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
+ (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %1, %2 src0_sel:BYTE_0 src1_sel:BYTE_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit unop
+
+(define_code_iterator bitunop [not popcount])
+(define_code_attr popcount_extra_op [(not "") (popcount ", 0")])
+
+(define_insn "<expander>si2"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (bitunop:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
+ (clobber (match_scratch:BI 2 "=cs, X"))]
+ ""
+ "@
+ s_<s_mnemonic>0\t%0, %1
+ v_<mnemonic>0\t%0, %1<popcount_extra_op>"
+ [(set_attr "type" "sop1,vop1")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit binop
+
+; No plus and mult - they have variants with a 16-bit immediate,
+; and thus are defined later.
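+;
+; Rough C analogy (illustration only, not part of this patch): the code
+; iterators below let a single define_insn stamp out andsi3, iorsi3,
+; xorsi3, and so on, much as an X-macro stamps out one function per
+; operation:
+;
+;   #define BINOPS(X) X (and, &) X (ior, |) X (xor, ^)
+;   #define DEF_BINOP(NAME, OP) \
+;     static unsigned NAME##si3 (unsigned a, unsigned b) { return a OP b; }
+;   BINOPS (DEF_BINOP)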
+(define_code_iterator binop [and ior xor smin smax umin umax
+ ashift lshiftrt ashiftrt])
+(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
+(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
+
+(define_insn "<expander>si3"
+ [(set (match_operand:SI 0 "gcn_valu_dst_operand" "= Sg, v,RD")
+ (vec_and_scalar_com:SI
+ (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB, v, v")))
+ (clobber (match_scratch:BI 3 "= cs, X, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ v_<mnemonic>0\t%0, %1, %2
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "sop2,vop2,ds")
+ (set_attr "length" "8")])
+
+(define_insn "<expander>si3"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
+ (clobber (match_scratch:BI 3 "=cs, cs, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+(define_expand "<expander>si3_scc"
+ [(parallel [(set (match_operand:SI 0 "gcn_valu_dst_operand")
+ (binop:SI
+ (match_operand:SI 1 "gcn_valu_src0_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {})
+
+;; }}}
+;; {{{ ALU: generic 64-bit
+
+(define_code_iterator vec_and_scalar64_com [and ior xor])
+
+(define_insn_and_split "<expander>di3"
+ [(set (match_operand:DI 0 "register_operand" "= Sg, &v, &v")
+ (vec_and_scalar64_com:DI
+ (match_operand:DI 1 "gcn_alu_operand" "%SgA,vSvDB,vSvDB")
+ (match_operand:DI 2 "gcn_alu_operand" " SgC, v, 0")))
+ (clobber (match_scratch:BI 3 "= cs, X, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ #
+ #"
+ "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
+ [(parallel [(set (match_dup 4)
+ (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
+ (clobber (match_dup 3))])
+ (parallel [(set (match_dup 7)
+ (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
+ (clobber (match_dup 3))])]
+ {
+ operands[4] = gcn_operand_part (DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (DImode, operands[1], 0);
+ operands[6] = gcn_operand_part (DImode, operands[2], 0);
+ operands[7] = gcn_operand_part (DImode, operands[0], 1);
+ operands[8] = gcn_operand_part (DImode, operands[1], 1);
+ operands[9] = gcn_operand_part (DImode, operands[2], 1);
+ }
+ [(set_attr "type" "sop2,vop2,vop2")
+ (set_attr "length" "8")])
+
+(define_insn "<expander>di3"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC,vSvC")))
+ (clobber (match_scratch:BI 3 "=cs, cs, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Atomics
+
+; Each compute unit has its own L1 cache. The L2 cache is shared between
+; all the compute units. Any load or store instruction can skip L1 and
+; access L2 directly using the "glc" flag. Atomic instructions also skip
+; L1. The L1 cache can be flushed and invalidated using instructions.
+;
+; Therefore, in order for "acquire" and "release" atomic modes to work
+; correctly across compute units we must flush before each "release"
+; and invalidate the cache after each "acquire". It might seem like
+; invalidation could be safely done before an "acquire", but since each
+; compute unit can run up to 40 threads simultaneously, all reading values
+; into the L1 cache, this is not actually safe.
+;
+; Additionally, scalar flat instructions access L2 via a different cache
+; (the "constant cache"), so they have separate control instructions. We
+; do not attempt to invalidate both caches at once; instead, atomics
+; operating on scalar flat pointers will flush the constant cache, and
+; atomics operating on flat or global pointers will flush L1. It is up to
+; the programmer to get this right.
+
+(define_code_iterator atomicops [plus minus and ior xor])
+(define_mode_attr X [(SI "") (DI "_X2")])
+
+;; TODO compare_and_swap test_and_set inc dec
+;; Hardware also supports min and max, but GCC does not.
+
+(define_expand "memory_barrier"
+ [(set (match_dup 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+(define_insn "*memory_barrier"
+ [(set (match_operand:BLK 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ "buffer_wbinvl1_vol"
+ [(set_attr "type" "mubuf")
+ (set_attr "length" "4")])
+
+; FIXME: These patterns have been disabled as they do not seem to work
+; reliably - they can cause hangs or incorrect results.
+; TODO: flush caches according to memory model
+(define_insn "atomic_fetch_<bare_mnemonic><mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 1)
+ (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "const_int_operand"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; FIXME: These patterns are disabled because the instructions don't
+; seem to work as advertised. Specifically, OMP "team distribute"
+; reductions apparently "lose" some of the writes, similar to what
+; you might expect from a concurrent non-atomic read-modify-write.
+; TODO: flush caches according to memory model
+(define_insn "atomic_<bare_mnemonic><mode>"
+ [(set (match_operand:SIDI 0 "memory_operand" "+RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 0)
+ (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 2 "const_int_operand"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_mode_attr x2 [(SI "DI") (DI "TI")])
+(define_mode_attr size [(SI "4") (DI "8")])
+(define_mode_attr bitsize [(SI "32") (DI "64")])
+
+(define_expand "sync_compare_and_swap<mode>"
+ [(match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand")
+ (match_operand:SIDI 2 "register_operand")
+ (match_operand:SIDI 3 "register_operand")]
+ ""
+ {
+ if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
+ {
+ emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
+ operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+ }
+
+ /* Operands 2 and 3 must be placed in consecutive registers, and passed
+ as a combined value. */
+ rtx src_cmp = gen_reg_rtx (<x2>mode);
+ emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
+ emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
+ emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
+ operands[1],
+ src_cmp));
+ DONE;
+ })
+
+(define_insn "sync_compare_and_swap<mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))]
+ ""
+ "@
+ s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "sync_compare_and_swap<mode>_lds_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "= v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" "+RL")]
+ UNSPECV_ATOMIC))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " v")
+ (match_operand:SIDI 3 "register_operand" " v")]
+ UNSPECV_ATOMIC))]
+ ""
+ "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")])
+
+(define_insn "atomic_load<mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
+ "buffer_wbinvl1_vol";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_store<mode>"
+ [(set (match_operand:SIDI 0 "memory_operand" "=RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_exchange<mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "immediate_operand"))]
+ ""
+ {
+ switch (INTVAL (operands[3]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol\;s_dcache_inv_vol";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+;; }}}
+;; {{{ OpenACC / OpenMP
+
+(define_expand "oacc_dim_size"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
+ emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
+ DONE;
+ })
+
+(define_expand "oacc_dim_pos"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
+ DONE;
+ })
+
+(define_expand "gcn_wavefront_barrier"
+ [(set (match_dup 0)
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+(define_insn "*gcn_wavefront_barrier"
+ [(set (match_operand:BLK 0 "")
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ "s_barrier"
+ [(set_attr "type" "sopp")])
+
+(define_expand "oacc_fork"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ /* We need to have oacc_fork/oacc_join named patterns as a pair,
+ but the fork isn't actually used. */
+ gcc_unreachable ();
+ })
+
+(define_expand "oacc_join"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ emit_insn (gen_gcn_wavefront_barrier ());
+ DONE;
+ })
+
+;; }}}
+
+(include "gcn-valu.md")
diff --git a/gcc/config/gcn/predicates.md b/gcc/config/gcn/predicates.md
new file mode 100644
index 00000000000..5b54f49f3cd
--- /dev/null
+++ b/gcc/config/gcn/predicates.md
@@ -0,0 +1,199 @@
+;; Predicate definitions for GCN.
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+;; Return true if VALUE can be stored in a sign extended immediate field.
+
+(define_predicate "gcn_conditional_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op) || GET_MODE (op) != BImode)
+ return 0;
+
+ return REGNO (op) == VCCZ_REG
+ || REGNO (op) == VCC_REG /* Implied VCCZ. */
+ || REGNO (op) == SCC_REG
+ || REGNO (op) == EXECZ_REG
+ || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_ssrc_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return SSRC_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_sdst_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return SDST_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_vgpr_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return VGPR_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_inline_immediate_operand"
+ (match_code "const_int,const_double,const_vector")
+{
+ return gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vop3_operand"
+ (ior (match_operand 0 "gcn_inline_immediate_operand")
+ (match_operand 0 "register_operand")))
+
+(define_predicate "gcn_vec0_operand"
+ (match_code "const_vector")
+{
+ return CONST_VECTOR_ELT (op, 0) == const0_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1_operand"
+ (match_code "const_vector")
+{
+ return CONST_VECTOR_ELT (op, 0) == const1_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1d_operand"
+ (match_code "const_vector")
+{
+ if (!gcn_inline_constant_p (op))
+ return false;
+
+ rtx elem = CONST_VECTOR_ELT (op, 0);
+ if (!CONST_DOUBLE_P (elem))
+ return false;
+ return real_identical (CONST_DOUBLE_REAL_VALUE (elem), &dconst1);
+})
+
+(define_predicate "gcn_const1d_operand"
+ (match_code "const_double")
+{
+ return gcn_inline_constant_p (op)
+ && real_identical (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+})
+
+(define_predicate "gcn_32bit_immediate_operand"
+ (match_code "const_int,const_double,const_vector,symbol_ref,label_ref")
+{
+ return gcn_constant_p (op);
+})
+
+; LRA works more smoothly when exec values are immediate constants
+; prior to register allocation.
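+;
+; Illustration only (a rough C rendering, not part of this patch):
+; gcn_exec_operand below accepts either a register or a CONST_INT, so a
+; known EXEC mask such as -1 (all lanes) can stay an immediate until
+; reload:
+;
+;   static bool
+;   exec_operand_ok (const_rtx op)
+;   {
+;     return REG_P (op) || CONST_INT_P (op);
+;   }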
+(define_predicate "gcn_exec_operand" + (ior (match_operand 0 "register_operand") + (match_code "const_int"))) + +(define_predicate "gcn_exec_reg_operand" + (match_operand 0 "register_operand")) + +(define_predicate "gcn_load_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "gcn_32bit_immediate_operand"))) + +(define_predicate "gcn_alu_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "gcn_32bit_immediate_operand"))) + +(define_predicate "gcn_ds_memory_operand" + (and (match_code "mem") + (and (match_test "AS_ANY_DS_P (MEM_ADDR_SPACE (op))") + (match_operand 0 "memory_operand")))) + +(define_predicate "gcn_valu_dst_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "gcn_ds_memory_operand"))) + +(define_predicate "gcn_valu_src0_operand" + (ior (match_operand 0 "register_operand") + (ior (match_operand 0 "gcn_32bit_immediate_operand") + (match_operand 0 "gcn_ds_memory_operand")))) + +(define_predicate "gcn_valu_src1_operand" + (match_operand 0 "register_operand")) + +(define_predicate "gcn_valu_src1com_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "gcn_32bit_immediate_operand"))) + +(define_predicate "gcn_conditional_operator" + (match_code "eq,ne")) + +(define_predicate "gcn_compare_64bit_operator" + (match_code "eq,ne")) + +(define_predicate "gcn_compare_operator" + (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu")) + +(define_predicate "gcn_fp_compare_operator" + (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu,ordered,unordered")) + +(define_predicate "unary_operator" + (match_code "not,popcount")) + +(define_predicate "binary_operator" + (match_code "and,ior,xor,ashift,lshiftrt,ashiftrt,smin,smax,umin,umax")) + +(define_predicate "gcn_unspec_operand" + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_VECTOR"))) + +(define_predicate "general_or_unspec_operand" + (ior (match_operand 0 "general_operand") + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_VECTOR")))) + +(define_predicate "gcn_register_or_unspec_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_VECTOR")))) + +(define_predicate "gcn_alu_or_unspec_operand" + (ior (match_operand 0 "gcn_alu_operand") + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_VECTOR")))) + +(define_predicate "gcn_register_ds_or_unspec_operand" + (ior (match_operand 0 "register_operand") + (ior (match_operand 0 "gcn_ds_memory_operand") + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))) -- 2.11.4.GIT