gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55
  56 /* Return true if load- or store-lanes optab OPTAB is implemented for
  57    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  58
  59 static bool
  60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  61                               tree vectype, unsigned HOST_WIDE_INT count)
  62 {
  63   machine_mode mode;
  64   scalar_int_mode array_mode;
  65   bool limit_p;
  66
  67   mode = TYPE_MODE (vectype);
  68   limit_p = !targetm.array_mode_supported_p (mode, count);
  69   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  70                           limit_p).exists (&array_mode))
  71     {
  72       if (dump_enabled_p ())
  73         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  75                          GET_MODE_NAME (mode), count);
  76       return false;
  77     }
  78
  79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  80     {
  81       if (dump_enabled_p ())
  82         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                          "cannot use %s<%s><%s>\n", name,
  84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  85       return false;
  86     }
  87
  88   if (dump_enabled_p ())
  89     dump_printf_loc (MSG_NOTE, vect_location,
  90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  91                      GET_MODE_NAME (mode));
  92
  93   return true;
  94 }
  95
  96
  97 /* Return the smallest scalar part of STMT.
  98    This is used to determine the vectype of the stmt.  We generally set the
  99    vectype according to the type of the result (lhs).  For stmts whose
 100    result-type is different than the type of the arguments (e.g., demotion,
 101    promotion), vectype will be reset appropriately (later).  Note that we have
 102    to visit the smallest datatype in this function, because that determines the
 103    VF.  If the smallest datatype in the loop is present only as the rhs of a
 104    promotion operation - we'd miss it.
 105    Such a case, where a variable of this datatype does not appear in the lhs
 106    anywhere in the loop, can only occur if it's an invariant: e.g.:
 107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 108    invariant motion.  However, we cannot rely on invariant motion to always
 109    take invariants out of the loop, and so in the case of promotion we also
 110    have to check the rhs.
 111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 112    types.  */
 113
 114 tree
 115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 116                                HOST_WIDE_INT *rhs_size_unit)
 117 {
 118   tree scalar_type = gimple_expr_type (stmt);
 119   HOST_WIDE_INT lhs, rhs;
 120
 121   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 122
 123   if (is_gimple_assign (stmt)
 124       && (gimple_assign_cast_p (stmt)
 125           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 126           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 127           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 128     {
 129       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 130
 131       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 132       if (rhs < lhs)
 133         scalar_type = rhs_type;
 134     }
 135
 136   *lhs_size_unit = lhs;
 137   *rhs_size_unit = rhs;
 138   return scalar_type;
 139 }
 140
 141
 142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 143    tested at run-time.  Return TRUE if DDR was successfully inserted.
 144    Return false if versioning is not supported.  */
 145
 146 static bool
 147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 148 {
 149   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 150
 151   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 152     return false;
 153
 154   if (!runtime_alias_check_p (ddr, loop,
 155                               optimize_loop_nest_for_speed_p (loop)))
 156     return false;
 157
 158   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 159   return true;
 160 }
 161
 162
 163 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 164    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 165    distances.  These distances are conservatively correct but they don't
 166    reflect a guaranteed dependence.
 167
 168    Return true if this function does all the work necessary to avoid
 169    an alias or false if the caller should use the dependence distances
 170    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 171    the depth of the loop described by LOOP_VINFO and the other arguments
 172    are as for vect_analyze_data_ref_dependence.  */
 173
 174 static bool
 175 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 176                                        loop_vec_info loop_vinfo,
 177                                        int loop_depth, int *max_vf)
 178 {
 179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 180   lambda_vector dist_v;
 181   unsigned int i;
 182   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 183     {
 184       int dist = dist_v[loop_depth];
 185       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 186         {
 187           /* If the user asserted safelen >= DIST consecutive iterations
 188              can be executed concurrently, assume independence.
 189
 190              ??? An alternative would be to add the alias check even
 191              in this case, and vectorize the fallback loop with the
 192              maximum VF set to safelen.  However, if the user has
 193              explicitly given a length, it's less likely that that
 194              would be a win.  */
 195           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 196             {
 197               if (loop->safelen < *max_vf)
 198                 *max_vf = loop->safelen;
 199               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 200               continue;
 201             }
 202
 203           /* For dependence distances of 2 or more, we have the option
 204              of limiting VF or checking for an alias at runtime.
 205              Prefer to check at runtime if we can, to avoid limiting
 206              the VF unnecessarily when the bases are in fact independent.
 207
 208              Note that the alias checks will be removed if the VF ends up
 209              being small enough.  */
 210           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 211         }
 212     }
 213   return true;
 214 }
 215
 216
 217 /* Function vect_analyze_data_ref_dependence.
 218
 219    Return TRUE if there (might) exist a dependence between a memory-reference
 220    DRA and a memory-reference DRB.  When versioning for alias may check a
 221    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 222    the data dependence.  */
 223
 224 static bool
 225 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 226                                   loop_vec_info loop_vinfo, int *max_vf)
 227 {
 228   unsigned int i;
 229   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 230   struct data_reference *dra = DDR_A (ddr);
 231   struct data_reference *drb = DDR_B (ddr);
 232   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 233   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 234   lambda_vector dist_v;
 235   unsigned int loop_depth;
 236
 237   /* In loop analysis all data references should be vectorizable.  */
 238   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 239       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 240     gcc_unreachable ();
 241
 242   /* Independent data accesses.  */
 243   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 244     return false;
 245
 246   if (dra == drb
 247       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 248     return false;
 249
 250   /* We do not have to consider dependences between accesses that belong
 251      to the same group.  */
 252   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 253       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 254     return false;
 255
 256   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 257      least two scalar iterations, there is always also a true dependence.
 258      As the vectorizer does not re-order loads and stores we can ignore
 259      the anti-dependence if TBAA can disambiguate both DRs similar to the
 260      case with known negative distance anti-dependences (positive
 261      distance anti-dependences would violate TBAA constraints).  */
 262   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 263        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 264       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 265                                  get_alias_set (DR_REF (drb))))
 266     return false;
 267
 268   /* Unknown data dependence.  */
 269   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 270     {
 271       /* If user asserted safelen consecutive iterations can be
 272          executed concurrently, assume independence.  */
 273       if (loop->safelen >= 2)
 274         {
 275           if (loop->safelen < *max_vf)
 276             *max_vf = loop->safelen;
 277           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 278           return false;
 279         }
 280
 281       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 282           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 283         {
 284           if (dump_enabled_p ())
 285             {
 286               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 287                                "versioning for alias not supported for: "
 288                                "can't determine dependence between ");
 289               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 290                                  DR_REF (dra));
 291               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 292               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 293                                  DR_REF (drb));
 294               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 295             }
 296           return true;
 297         }
 298
 299       if (dump_enabled_p ())
 300         {
 301           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 302                            "versioning for alias required: "
 303                            "can't determine dependence between ");
 304           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 305                              DR_REF (dra));
 306           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 307           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 308                              DR_REF (drb));
 309           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 310         }
 311
 312       /* Add to list of ddrs that need to be tested at run-time.  */
 313       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 314     }
 315
 316   /* Known data dependence.  */
 317   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 318     {
 319       /* If user asserted safelen consecutive iterations can be
 320          executed concurrently, assume independence.  */
 321       if (loop->safelen >= 2)
 322         {
 323           if (loop->safelen < *max_vf)
 324             *max_vf = loop->safelen;
 325           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 326           return false;
 327         }
 328
 329       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 330           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 331         {
 332           if (dump_enabled_p ())
 333             {
 334               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 335                                "versioning for alias not supported for: "
 336                                "bad dist vector for ");
 337               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 338                                  DR_REF (dra));
 339               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 340               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 341                                  DR_REF (drb));
 342               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 343             }
 344           return true;
 345         }
 346
 347       if (dump_enabled_p ())
 348         {
 349           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 350                            "versioning for alias required: "
 351                            "bad dist vector for ");
 352           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 353           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 354           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 355           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 356         }
 357       /* Add to list of ddrs that need to be tested at run-time.  */
 358       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 359     }
 360
 361   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 362
 363   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 364       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 365                                                 loop_depth, max_vf))
 366     return false;
 367
 368   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 369     {
 370       int dist = dist_v[loop_depth];
 371
 372       if (dump_enabled_p ())
 373         dump_printf_loc (MSG_NOTE, vect_location,
 374                          "dependence distance  = %d.\n", dist);
 375
 376       if (dist == 0)
 377         {
 378           if (dump_enabled_p ())
 379             {
 380               dump_printf_loc (MSG_NOTE, vect_location,
 381                                "dependence distance == 0 between ");
 382               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 383               dump_printf (MSG_NOTE, " and ");
 384               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 385               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 386             }
 387
 388           /* When we perform grouped accesses and perform implicit CSE
 389              by detecting equal accesses and doing disambiguation with
 390              runtime alias tests like for
 391                 .. = a[i];
 392                 .. = a[i+1];
 393                 a[i] = ..;
 394                 a[i+1] = ..;
 395                 *p = ..;
 396                 .. = a[i];
 397                 .. = a[i+1];
 398              where we will end up loading { a[i], a[i+1] } once, make
 399              sure that inserting group loads before the first load and
 400              stores after the last store will do the right thing.
 401              Similar for groups like
 402                 a[i] = ...;
 403                 ... = a[i];
 404                 a[i+1] = ...;
 405              where loads from the group interleave with the store.  */
 406           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 407               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 408             {
 409               gimple *earlier_stmt;
 410               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 411               if (DR_IS_WRITE
 412                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 413                 {
 414                   if (dump_enabled_p ())
 415                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 416                                      "READ_WRITE dependence in interleaving."
 417                                      "\n");
 418                   return true;
 419                 }
 420             }
 421
 422           continue;
 423         }
 424
 425       if (dist > 0 && DDR_REVERSED_P (ddr))
 426         {
 427           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 428              reversed (to make distance vector positive), and the actual
 429              distance is negative.  */
 430           if (dump_enabled_p ())
 431             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 432                              "dependence distance negative.\n");
 433           /* Record a negative dependence distance to later limit the
 434              amount of stmt copying / unrolling we can perform.
 435              Only need to handle read-after-write dependence.  */
 436           if (DR_IS_READ (drb)
 437               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 438                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 439             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 440           continue;
 441         }
 442
 443       if (abs (dist) >= 2
 444           && abs (dist) < *max_vf)
 445         {
 446           /* The dependence distance requires reduction of the maximal
 447              vectorization factor.  */
 448           *max_vf = abs (dist);
 449           if (dump_enabled_p ())
 450             dump_printf_loc (MSG_NOTE, vect_location,
 451                              "adjusting maximal vectorization factor to %i\n",
 452                              *max_vf);
 453         }
 454
 455       if (abs (dist) >= *max_vf)
 456         {
 457           /* Dependence distance does not create dependence, as far as
 458              vectorization is concerned, in this case.  */
 459           if (dump_enabled_p ())
 460             dump_printf_loc (MSG_NOTE, vect_location,
 461                              "dependence distance >= VF.\n");
 462           continue;
 463         }
 464
 465       if (dump_enabled_p ())
 466         {
 467           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 468                        "not vectorized, possible dependence "
 469                        "between data-refs ");
 470           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 471           dump_printf (MSG_NOTE,  " and ");
 472           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 473           dump_printf (MSG_NOTE,  "\n");
 474         }
 475
 476       return true;
 477     }
 478
 479   return false;
 480 }
 481
 482 /* Function vect_analyze_data_ref_dependences.
 483
 484    Examine all the data references in the loop, and make sure there do not
 485    exist any data dependences between them.  Set *MAX_VF according to
 486    the maximum vectorization factor the data dependences allow.  */
 487
 488 bool
 489 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 490 {
 491   unsigned int i;
 492   struct data_dependence_relation *ddr;
 493
 494   if (dump_enabled_p ())
 495     dump_printf_loc (MSG_NOTE, vect_location,
 496                      "=== vect_analyze_data_ref_dependences ===\n");
 497
 498   LOOP_VINFO_DDRS (loop_vinfo)
 499     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 500              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 501   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 502   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 503   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 504                                 &LOOP_VINFO_DDRS (loop_vinfo),
 505                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 506     return false;
 507
 508   /* For epilogues we either have no aliases or alias versioning
 509      was applied to original loop.  Therefore we may just get max_vf
 510      using VF of original loop.  */
 511   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 512     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 513   else
 514     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 515       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 516         return false;
 517
 518   return true;
 519 }
 520
 521
 522 /* Function vect_slp_analyze_data_ref_dependence.
 523
 524    Return TRUE if there (might) exist a dependence between a memory-reference
 525    DRA and a memory-reference DRB.  When versioning for alias may check a
 526    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 527    the data dependence.  */
 528
 529 static bool
 530 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 531 {
 532   struct data_reference *dra = DDR_A (ddr);
 533   struct data_reference *drb = DDR_B (ddr);
 534
 535   /* We need to check dependences of statements marked as unvectorizable
 536      as well, they still can prohibit vectorization.  */
 537
 538   /* Independent data accesses.  */
 539   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 540     return false;
 541
 542   if (dra == drb)
 543     return false;
 544
 545   /* Read-read is OK.  */
 546   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 547     return false;
 548
 549   /* If dra and drb are part of the same interleaving chain consider
 550      them independent.  */
 551   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 552       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 553           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 554     return false;
 555
 556   /* Unknown data dependence.  */
 557   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 558     {
 559       if  (dump_enabled_p ())
 560         {
 561           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 562                            "can't determine dependence between ");
 563           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 564           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 565           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 566           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 567         }
 568     }
 569   else if (dump_enabled_p ())
 570     {
 571       dump_printf_loc (MSG_NOTE, vect_location,
 572                        "determined dependence between ");
 573       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 574       dump_printf (MSG_NOTE, " and ");
 575       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 576       dump_printf (MSG_NOTE,  "\n");
 577     }
 578
 579   return true;
 580 }
 581
 582
 583 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 584    contain the vector of scalar stores of this instance if we are
 585    disambiguating the loads.  */
 586
 587 static bool
 588 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 589                                    vec<gimple *> stores, gimple *last_store)
 590 {
 591   /* This walks over all stmts involved in the SLP load/store done
 592      in NODE verifying we can sink them up to the last stmt in the
 593      group.  */
 594   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 595   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 596     {
 597       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 598       if (access == last_access)
 599         continue;
 600       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 601       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 602            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 603         {
 604           gimple *stmt = gsi_stmt (gsi);
 605           if (! gimple_vuse (stmt)
 606               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 607             continue;
 608
 609           /* If we couldn't record a (single) data reference for this
 610              stmt we have to give up.  */
 611           /* ???  Here and below if dependence analysis fails we can resort
 612              to the alias oracle which can handle more kinds of stmts.  */
 613           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 614           if (!dr_b)
 615             return false;
 616
 617           bool dependent = false;
 618           /* If we run into a store of this same instance (we've just
 619              marked those) then delay dependence checking until we run
 620              into the last store because this is where it will have
 621              been sunk to (and we verify if we can do that as well).  */
 622           if (gimple_visited_p (stmt))
 623             {
 624               if (stmt != last_store)
 625                 continue;
 626               unsigned i;
 627               gimple *store;
 628               FOR_EACH_VEC_ELT (stores, i, store)
 629                 {
 630                   data_reference *store_dr
 631                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 632                   ddr_p ddr = initialize_data_dependence_relation
 633                                 (dr_a, store_dr, vNULL);
 634                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 635                   free_dependence_relation (ddr);
 636                   if (dependent)
 637                     break;
 638                 }
 639             }
 640           else
 641             {
 642               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 643                                                                dr_b, vNULL);
 644               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 645               free_dependence_relation (ddr);
 646             }
 647           if (dependent)
 648             return false;
 649         }
 650     }
 651   return true;
 652 }
 653
 654
 655 /* Function vect_analyze_data_ref_dependences.
 656
 657    Examine all the data references in the basic-block, and make sure there
 658    do not exist any data dependences between them.  Set *MAX_VF according to
 659    the maximum vectorization factor the data dependences allow.  */
 660
 661 bool
 662 vect_slp_analyze_instance_dependence (slp_instance instance)
 663 {
 664   if (dump_enabled_p ())
 665     dump_printf_loc (MSG_NOTE, vect_location,
 666                      "=== vect_slp_analyze_instance_dependence ===\n");
 667
 668   /* The stores of this instance are at the root of the SLP tree.  */
 669   slp_tree store = SLP_INSTANCE_TREE (instance);
 670   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 671     store = NULL;
 672
 673   /* Verify we can sink stores to the vectorized stmt insert location.  */
 674   gimple *last_store = NULL;
 675   if (store)
 676     {
 677       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 678         return false;
 679
 680       /* Mark stores in this instance and remember the last one.  */
 681       last_store = vect_find_last_scalar_stmt_in_slp (store);
 682       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 683         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 684     }
 685
 686   bool res = true;
 687
 688   /* Verify we can sink loads to the vectorized stmt insert location,
 689      special-casing stores of this instance.  */
 690   slp_tree load;
 691   unsigned int i;
 692   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 693     if (! vect_slp_analyze_node_dependences (instance, load,
 694                                              store
 695                                              ? SLP_TREE_SCALAR_STMTS (store)
 696                                              : vNULL, last_store))
 697       {
 698         res = false;
 699         break;
 700       }
 701
 702   /* Unset the visited flag.  */
 703   if (store)
 704     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 705       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 706
 707   return res;
 708 }
 709
 710 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 711    the statement that contains DRB, which is useful for recording in the
 712    dump file.  */
 713
 714 static void
 715 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 716                             innermost_loop_behavior *drb)
 717 {
 718   bool existed;
 719   innermost_loop_behavior *&entry
 720     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 721   if (!existed || entry->base_alignment < drb->base_alignment)
 722     {
 723       entry = drb;
 724       if (dump_enabled_p ())
 725         {
 726           dump_printf_loc (MSG_NOTE, vect_location,
 727                            "recording new base alignment for ");
 728           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 729           dump_printf (MSG_NOTE, "\n");
 730           dump_printf_loc (MSG_NOTE, vect_location,
 731                            "  alignment:    %d\n", drb->base_alignment);
 732           dump_printf_loc (MSG_NOTE, vect_location,
 733                            "  misalignment: %d\n", drb->base_misalignment);
 734           dump_printf_loc (MSG_NOTE, vect_location,
 735                            "  based on:     ");
 736           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 737         }
 738     }
 739 }
 740
 741 /* If the region we're going to vectorize is reached, all unconditional
 742    data references occur at least once.  We can therefore pool the base
 743    alignment guarantees from each unconditional reference.  Do this by
 744    going through all the data references in VINFO and checking whether
 745    the containing statement makes the reference unconditionally.  If so,
 746    record the alignment of the base address in VINFO so that it can be
 747    used for all other references with the same base.  */
 748
 749 void
 750 vect_record_base_alignments (vec_info *vinfo)
 751 {
 752   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 753   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 754   data_reference *dr;
 755   unsigned int i;
 756   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 757     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 758       {
 759         gimple *stmt = DR_STMT (dr);
 760         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 761
 762         /* If DR is nested in the loop that is being vectorized, we can also
 763            record the alignment of the base wrt the outer loop.  */
 764         if (loop && nested_in_vect_loop_p (loop, stmt))
 765           {
 766             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 767             vect_record_base_alignment
 768               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 769           }
 770       }
 771 }
 772
 773 /* Function vect_compute_data_ref_alignment
 774
 775    Compute the misalignment of the data reference DR.
 776
 777    Output:
 778    1. If during the misalignment computation it is found that the data reference
 779       cannot be vectorized then false is returned.
 780    2. DR_MISALIGNMENT (DR) is defined.
 781
 782    FOR NOW: No analysis is actually performed. Misalignment is calculated
 783    only for trivial cases. TODO.  */
 784
 785 bool
 786 vect_compute_data_ref_alignment (struct data_reference *dr)
 787 {
 788   gimple *stmt = DR_STMT (dr);
 789   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 790   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 791   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 792   struct loop *loop = NULL;
 793   tree ref = DR_REF (dr);
 794   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 795
 796   if (dump_enabled_p ())
 797     dump_printf_loc (MSG_NOTE, vect_location,
 798                      "vect_compute_data_ref_alignment:\n");
 799
 800   if (loop_vinfo)
 801     loop = LOOP_VINFO_LOOP (loop_vinfo);
 802
 803   /* Initialize misalignment to unknown.  */
 804   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 805
 806   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 807   bool step_preserves_misalignment_p;
 808
 809   /* No step for BB vectorization.  */
 810   if (!loop)
 811     {
 812       gcc_assert (integer_zerop (drb->step));
 813       step_preserves_misalignment_p = true;
 814     }
 815
 816   /* In case the dataref is in an inner-loop of the loop that is being
 817      vectorized (LOOP), we use the base and misalignment information
 818      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 819      stays the same throughout the execution of the inner-loop, which is why
 820      we have to check that the stride of the dataref in the inner-loop evenly
 821      divides by the vector size.  */
 822   else if (nested_in_vect_loop_p (loop, stmt))
 823     {
 824       step_preserves_misalignment_p
 825         = (DR_STEP_ALIGNMENT (dr)
 826            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 827
 828       if (dump_enabled_p ())
 829         {
 830           if (step_preserves_misalignment_p)
 831             dump_printf_loc (MSG_NOTE, vect_location,
 832                              "inner step divides the vector-size.\n");
 833           else
 834             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 835                              "inner step doesn't divide the vector-size.\n");
 836         }
 837     }
 838
 839   /* Similarly we can only use base and misalignment information relative to
 840      an innermost loop if the misalignment stays the same throughout the
 841      execution of the loop.  As above, this is the case if the stride of
 842      the dataref evenly divides by the vector size.  */
 843   else
 844     {
 845       unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 846       step_preserves_misalignment_p
 847         = ((DR_STEP_ALIGNMENT (dr) * vf)
 848            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 849
 850       if (!step_preserves_misalignment_p && dump_enabled_p ())
 851         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 852                          "step doesn't divide the vector-size.\n");
 853     }
 854
 855   unsigned int base_alignment = drb->base_alignment;
 856   unsigned int base_misalignment = drb->base_misalignment;
 857   unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
 858
 859   /* Calculate the maximum of the pooled base address alignment and the
 860      alignment that we can compute for DR itself.  */
 861   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 862   if (entry && base_alignment < (*entry)->base_alignment)
 863     {
 864       base_alignment = (*entry)->base_alignment;
 865       base_misalignment = (*entry)->base_misalignment;
 866     }
 867
 868   if (drb->offset_alignment < vector_alignment
 869       || !step_preserves_misalignment_p
 870       /* We need to know whether the step wrt the vectorized loop is
 871          negative when computing the starting misalignment below.  */
 872       || TREE_CODE (drb->step) != INTEGER_CST)
 873     {
 874       if (dump_enabled_p ())
 875         {
 876           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 877                            "Unknown alignment for access: ");
 878           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 879           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 880         }
 881       return true;
 882     }
 883
 884   if (base_alignment < vector_alignment)
 885     {
 886       tree base = drb->base_address;
 887       if (TREE_CODE (base) == ADDR_EXPR)
 888         base = TREE_OPERAND (base, 0);
 889       if (!vect_can_force_dr_alignment_p (base,
 890                                           vector_alignment * BITS_PER_UNIT))
 891         {
 892           if (dump_enabled_p ())
 893             {
 894               dump_printf_loc (MSG_NOTE, vect_location,
 895                                "can't force alignment of ref: ");
 896               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 897               dump_printf (MSG_NOTE, "\n");
 898             }
 899           return true;
 900         }
 901
 902       if (DECL_USER_ALIGN (base))
 903         {
 904           if (dump_enabled_p ())
 905             {
 906               dump_printf_loc (MSG_NOTE, vect_location,
 907                                "not forcing alignment of user-aligned "
 908                                "variable: ");
 909               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 910               dump_printf (MSG_NOTE, "\n");
 911             }
 912           return true;
 913         }
 914
 915       /* Force the alignment of the decl.
 916          NOTE: This is the only change to the code we make during
 917          the analysis phase, before deciding to vectorize the loop.  */
 918       if (dump_enabled_p ())
 919         {
 920           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 921           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 922           dump_printf (MSG_NOTE, "\n");
 923         }
 924
 925       DR_VECT_AUX (dr)->base_decl = base;
 926       DR_VECT_AUX (dr)->base_misaligned = true;
 927       base_misalignment = 0;
 928     }
 929   unsigned int misalignment = (base_misalignment
 930                                + TREE_INT_CST_LOW (drb->init));
 931
 932   /* If this is a backward running DR then first access in the larger
 933      vectype actually is N-1 elements before the address in the DR.
 934      Adjust misalign accordingly.  */
 935   if (tree_int_cst_sgn (drb->step) < 0)
 936     /* PLUS because STEP is negative.  */
 937     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 938                      * TREE_INT_CST_LOW (drb->step));
 939
 940   SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
 941
 942   if (dump_enabled_p ())
 943     {
 944       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 945                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 946       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 947       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 948     }
 949
 950   return true;
 951 }
 952
 953
 954 /* Function vect_update_misalignment_for_peel.
 955    Sets DR's misalignment
 956    - to 0 if it has the same alignment as DR_PEEL,
 957    - to the misalignment computed using NPEEL if DR's salignment is known,
 958    - to -1 (unknown) otherwise.
 959
 960    DR - the data reference whose misalignment is to be adjusted.
 961    DR_PEEL - the data reference whose misalignment is being made
 962              zero in the vector loop by the peel.
 963    NPEEL - the number of iterations in the peel loop if the misalignment
 964            of DR_PEEL is known at compile time.  */
 965
 966 static void
 967 vect_update_misalignment_for_peel (struct data_reference *dr,
 968                                    struct data_reference *dr_peel, int npeel)
 969 {
 970   unsigned int i;
 971   vec<dr_p> same_aligned_drs;
 972   struct data_reference *current_dr;
 973   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 974   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 975   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 976   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 977
 978  /* For interleaved data accesses the step in the loop must be multiplied by
 979      the size of the interleaving group.  */
 980   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 981     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 982   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 983     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 984
 985   /* It can be assumed that the data refs with the same alignment as dr_peel
 986      are aligned in the vector loop.  */
 987   same_aligned_drs
 988     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 989   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
 990     {
 991       if (current_dr != dr)
 992         continue;
 993       gcc_assert (!known_alignment_for_access_p (dr)
 994                   || !known_alignment_for_access_p (dr_peel)
 995                   || (DR_MISALIGNMENT (dr) / dr_size
 996                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
 997       SET_DR_MISALIGNMENT (dr, 0);
 998       return;
 999     }
1000
1001   if (known_alignment_for_access_p (dr)
1002       && known_alignment_for_access_p (dr_peel))
1003     {
1004       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1005       int misal = DR_MISALIGNMENT (dr);
1006       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1007       misal += negative ? -npeel * dr_size : npeel * dr_size;
1008       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1009       SET_DR_MISALIGNMENT (dr, misal);
1010       return;
1011     }
1012
1013   if (dump_enabled_p ())
1014     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1015                      "to unknown (-1).\n");
1016   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1017 }
1018
1019
1020 /* Function verify_data_ref_alignment
1021
1022    Return TRUE if DR can be handled with respect to alignment.  */
1023
1024 static bool
1025 verify_data_ref_alignment (data_reference_p dr)
1026 {
1027   enum dr_alignment_support supportable_dr_alignment
1028     = vect_supportable_dr_alignment (dr, false);
1029   if (!supportable_dr_alignment)
1030     {
1031       if (dump_enabled_p ())
1032         {
1033           if (DR_IS_READ (dr))
1034             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035                              "not vectorized: unsupported unaligned load.");
1036           else
1037             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038                              "not vectorized: unsupported unaligned "
1039                              "store.");
1040
1041           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1042                              DR_REF (dr));
1043           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1044         }
1045       return false;
1046     }
1047
1048   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1049     dump_printf_loc (MSG_NOTE, vect_location,
1050                      "Vectorizing an unaligned access.\n");
1051
1052   return true;
1053 }
1054
1055 /* Function vect_verify_datarefs_alignment
1056
1057    Return TRUE if all data references in the loop can be
1058    handled with respect to alignment.  */
1059
1060 bool
1061 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1062 {
1063   vec<data_reference_p> datarefs = vinfo->datarefs;
1064   struct data_reference *dr;
1065   unsigned int i;
1066
1067   FOR_EACH_VEC_ELT (datarefs, i, dr)
1068     {
1069       gimple *stmt = DR_STMT (dr);
1070       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1071
1072       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1073         continue;
1074
1075       /* For interleaving, only the alignment of the first access matters.   */
1076       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1077           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1078         continue;
1079
1080       /* Strided accesses perform only component accesses, alignment is
1081          irrelevant for them.  */
1082       if (STMT_VINFO_STRIDED_P (stmt_info)
1083           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1084         continue;
1085
1086       if (! verify_data_ref_alignment (dr))
1087         return false;
1088     }
1089
1090   return true;
1091 }
1092
1093 /* Given an memory reference EXP return whether its alignment is less
1094    than its size.  */
1095
1096 static bool
1097 not_size_aligned (tree exp)
1098 {
1099   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1100     return true;
1101
1102   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1103           > get_object_alignment (exp));
1104 }
1105
1106 /* Function vector_alignment_reachable_p
1107
1108    Return true if vector alignment for DR is reachable by peeling
1109    a few loop iterations.  Return false otherwise.  */
1110
1111 static bool
1112 vector_alignment_reachable_p (struct data_reference *dr)
1113 {
1114   gimple *stmt = DR_STMT (dr);
1115   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1116   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1117
1118   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1119     {
1120       /* For interleaved access we peel only if number of iterations in
1121          the prolog loop ({VF - misalignment}), is a multiple of the
1122          number of the interleaved accesses.  */
1123       int elem_size, mis_in_elements;
1124       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1125
1126       /* FORNOW: handle only known alignment.  */
1127       if (!known_alignment_for_access_p (dr))
1128         return false;
1129
1130       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1131       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1132
1133       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1134         return false;
1135     }
1136
1137   /* If misalignment is known at the compile time then allow peeling
1138      only if natural alignment is reachable through peeling.  */
1139   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1140     {
1141       HOST_WIDE_INT elmsize =
1142                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1143       if (dump_enabled_p ())
1144         {
1145           dump_printf_loc (MSG_NOTE, vect_location,
1146                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1147           dump_printf (MSG_NOTE,
1148                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1149         }
1150       if (DR_MISALIGNMENT (dr) % elmsize)
1151         {
1152           if (dump_enabled_p ())
1153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154                              "data size does not divide the misalignment.\n");
1155           return false;
1156         }
1157     }
1158
1159   if (!known_alignment_for_access_p (dr))
1160     {
1161       tree type = TREE_TYPE (DR_REF (dr));
1162       bool is_packed = not_size_aligned (DR_REF (dr));
1163       if (dump_enabled_p ())
1164         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                          "Unknown misalignment, %snaturally aligned\n",
1166                          is_packed ? "not " : "");
1167       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1168     }
1169
1170   return true;
1171 }
1172
1173
1174 /* Calculate the cost of the memory access represented by DR.  */
1175
1176 static void
1177 vect_get_data_access_cost (struct data_reference *dr,
1178                            unsigned int *inside_cost,
1179                            unsigned int *outside_cost,
1180                            stmt_vector_for_cost *body_cost_vec)
1181 {
1182   gimple *stmt = DR_STMT (dr);
1183   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1185   int ncopies;
1186
1187   if (PURE_SLP_STMT (stmt_info))
1188     ncopies = 1;
1189   else
1190     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1191
1192   if (DR_IS_READ (dr))
1193     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1194                         NULL, body_cost_vec, false);
1195   else
1196     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1197
1198   if (dump_enabled_p ())
1199     dump_printf_loc (MSG_NOTE, vect_location,
1200                      "vect_get_data_access_cost: inside_cost = %d, "
1201                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1202 }
1203
1204
1205 typedef struct _vect_peel_info
1206 {
1207   struct data_reference *dr;
1208   int npeel;
1209   unsigned int count;
1210 } *vect_peel_info;
1211
1212 typedef struct _vect_peel_extended_info
1213 {
1214   struct _vect_peel_info peel_info;
1215   unsigned int inside_cost;
1216   unsigned int outside_cost;
1217 } *vect_peel_extended_info;
1218
1219
1220 /* Peeling hashtable helpers.  */
1221
1222 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1223 {
1224   static inline hashval_t hash (const _vect_peel_info *);
1225   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1226 };
1227
1228 inline hashval_t
1229 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1230 {
1231   return (hashval_t) peel_info->npeel;
1232 }
1233
1234 inline bool
1235 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1236 {
1237   return (a->npeel == b->npeel);
1238 }
1239
1240
1241 /* Insert DR into peeling hash table with NPEEL as key.  */
1242
1243 static void
1244 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1245                           loop_vec_info loop_vinfo, struct data_reference *dr,
1246                           int npeel)
1247 {
1248   struct _vect_peel_info elem, *slot;
1249   _vect_peel_info **new_slot;
1250   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1251
1252   elem.npeel = npeel;
1253   slot = peeling_htab->find (&elem);
1254   if (slot)
1255     slot->count++;
1256   else
1257     {
1258       slot = XNEW (struct _vect_peel_info);
1259       slot->npeel = npeel;
1260       slot->dr = dr;
1261       slot->count = 1;
1262       new_slot = peeling_htab->find_slot (slot, INSERT);
1263       *new_slot = slot;
1264     }
1265
1266   if (!supportable_dr_alignment
1267       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1268     slot->count += VECT_MAX_COST;
1269 }
1270
1271
1272 /* Traverse peeling hash table to find peeling option that aligns maximum
1273    number of data accesses.  */
1274
1275 int
1276 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1277                                      _vect_peel_extended_info *max)
1278 {
1279   vect_peel_info elem = *slot;
1280
1281   if (elem->count > max->peel_info.count
1282       || (elem->count == max->peel_info.count
1283           && max->peel_info.npeel > elem->npeel))
1284     {
1285       max->peel_info.npeel = elem->npeel;
1286       max->peel_info.count = elem->count;
1287       max->peel_info.dr = elem->dr;
1288     }
1289
1290   return 1;
1291 }
1292
1293 /* Get the costs of peeling NPEEL iterations checking data access costs
1294    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1295    misalignment will be zero after peeling.  */
1296
1297 static void
1298 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1299                                 struct data_reference *dr0,
1300                                 unsigned int *inside_cost,
1301                                 unsigned int *outside_cost,
1302                                 stmt_vector_for_cost *body_cost_vec,
1303                                 unsigned int npeel,
1304                                 bool unknown_misalignment)
1305 {
1306   unsigned i;
1307   data_reference *dr;
1308
1309   FOR_EACH_VEC_ELT (datarefs, i, dr)
1310     {
1311       gimple *stmt = DR_STMT (dr);
1312       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1313       /* For interleaving, only the alignment of the first access
1314          matters.  */
1315       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1316           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1317         continue;
1318
1319       /* Strided accesses perform only component accesses, alignment is
1320          irrelevant for them.  */
1321       if (STMT_VINFO_STRIDED_P (stmt_info)
1322           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1323         continue;
1324
1325       int save_misalignment;
1326       save_misalignment = DR_MISALIGNMENT (dr);
1327       if (npeel == 0)
1328         ;
1329       else if (unknown_misalignment && dr == dr0)
1330         SET_DR_MISALIGNMENT (dr, 0);
1331       else
1332         vect_update_misalignment_for_peel (dr, dr0, npeel);
1333       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1334                                  body_cost_vec);
1335       SET_DR_MISALIGNMENT (dr, save_misalignment);
1336     }
1337 }
1338
1339 /* Traverse peeling hash table and calculate cost for each peeling option.
1340    Find the one with the lowest cost.  */
1341
1342 int
1343 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1344                                    _vect_peel_extended_info *min)
1345 {
1346   vect_peel_info elem = *slot;
1347   int dummy;
1348   unsigned int inside_cost = 0, outside_cost = 0;
1349   gimple *stmt = DR_STMT (elem->dr);
1350   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1351   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1352   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1353                        epilogue_cost_vec;
1354
1355   prologue_cost_vec.create (2);
1356   body_cost_vec.create (2);
1357   epilogue_cost_vec.create (2);
1358
1359   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1360                                   elem->dr, &inside_cost, &outside_cost,
1361                                   &body_cost_vec, elem->npeel, false);
1362
1363   body_cost_vec.release ();
1364
1365   outside_cost += vect_get_known_peeling_cost
1366     (loop_vinfo, elem->npeel, &dummy,
1367      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1368      &prologue_cost_vec, &epilogue_cost_vec);
1369
1370   /* Prologue and epilogue costs are added to the target model later.
1371      These costs depend only on the scalar iteration cost, the
1372      number of peeling iterations finally chosen, and the number of
1373      misaligned statements.  So discard the information found here.  */
1374   prologue_cost_vec.release ();
1375   epilogue_cost_vec.release ();
1376
1377   if (inside_cost < min->inside_cost
1378       || (inside_cost == min->inside_cost
1379           && outside_cost < min->outside_cost))
1380     {
1381       min->inside_cost = inside_cost;
1382       min->outside_cost = outside_cost;
1383       min->peel_info.dr = elem->dr;
1384       min->peel_info.npeel = elem->npeel;
1385       min->peel_info.count = elem->count;
1386     }
1387
1388   return 1;
1389 }
1390
1391
1392 /* Choose best peeling option by traversing peeling hash table and either
1393    choosing an option with the lowest cost (if cost model is enabled) or the
1394    option that aligns as many accesses as possible.  */
1395
1396 static struct _vect_peel_extended_info
1397 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1398                                        loop_vec_info loop_vinfo)
1399 {
1400    struct _vect_peel_extended_info res;
1401
1402    res.peel_info.dr = NULL;
1403
1404    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1405      {
1406        res.inside_cost = INT_MAX;
1407        res.outside_cost = INT_MAX;
1408        peeling_htab->traverse <_vect_peel_extended_info *,
1409                                vect_peeling_hash_get_lowest_cost> (&res);
1410      }
1411    else
1412      {
1413        res.peel_info.count = 0;
1414        peeling_htab->traverse <_vect_peel_extended_info *,
1415                                vect_peeling_hash_get_most_frequent> (&res);
1416        res.inside_cost = 0;
1417        res.outside_cost = 0;
1418      }
1419
1420    return res;
1421 }
1422
1423 /* Return true if the new peeling NPEEL is supported.  */
1424
1425 static bool
1426 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1427                           unsigned npeel)
1428 {
1429   unsigned i;
1430   struct data_reference *dr = NULL;
1431   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1432   gimple *stmt;
1433   stmt_vec_info stmt_info;
1434   enum dr_alignment_support supportable_dr_alignment;
1435
1436   /* Ensure that all data refs can be vectorized after the peel.  */
1437   FOR_EACH_VEC_ELT (datarefs, i, dr)
1438     {
1439       int save_misalignment;
1440
1441       if (dr == dr0)
1442         continue;
1443
1444       stmt = DR_STMT (dr);
1445       stmt_info = vinfo_for_stmt (stmt);
1446       /* For interleaving, only the alignment of the first access
1447          matters.  */
1448       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1449           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1450         continue;
1451
1452       /* Strided accesses perform only component accesses, alignment is
1453          irrelevant for them.  */
1454       if (STMT_VINFO_STRIDED_P (stmt_info)
1455           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1456         continue;
1457
1458       save_misalignment = DR_MISALIGNMENT (dr);
1459       vect_update_misalignment_for_peel (dr, dr0, npeel);
1460       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1461       SET_DR_MISALIGNMENT (dr, save_misalignment);
1462
1463       if (!supportable_dr_alignment)
1464         return false;
1465     }
1466
1467   return true;
1468 }
1469
1470 /* Function vect_enhance_data_refs_alignment
1471
1472    This pass will use loop versioning and loop peeling in order to enhance
1473    the alignment of data references in the loop.
1474
1475    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1476    original loop is to be vectorized.  Any other loops that are created by
1477    the transformations performed in this pass - are not supposed to be
1478    vectorized.  This restriction will be relaxed.
1479
1480    This pass will require a cost model to guide it whether to apply peeling
1481    or versioning or a combination of the two.  For example, the scheme that
1482    intel uses when given a loop with several memory accesses, is as follows:
1483    choose one memory access ('p') which alignment you want to force by doing
1484    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1485    other accesses are not necessarily aligned, or (2) use loop versioning to
1486    generate one loop in which all accesses are aligned, and another loop in
1487    which only 'p' is necessarily aligned.
1488
1489    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1490    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1491    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1492
1493    Devising a cost model is the most critical aspect of this work.  It will
1494    guide us on which access to peel for, whether to use loop versioning, how
1495    many versions to create, etc.  The cost model will probably consist of
1496    generic considerations as well as target specific considerations (on
1497    powerpc for example, misaligned stores are more painful than misaligned
1498    loads).
1499
1500    Here are the general steps involved in alignment enhancements:
1501
1502      -- original loop, before alignment analysis:
1503         for (i=0; i<N; i++){
1504           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1505           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1506         }
1507
1508      -- After vect_compute_data_refs_alignment:
1509         for (i=0; i<N; i++){
1510           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1511           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1512         }
1513
1514      -- Possibility 1: we do loop versioning:
1515      if (p is aligned) {
1516         for (i=0; i<N; i++){    # loop 1A
1517           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1518           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1519         }
1520      }
1521      else {
1522         for (i=0; i<N; i++){    # loop 1B
1523           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1524           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1525         }
1526      }
1527
1528      -- Possibility 2: we do loop peeling:
1529      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1530         x = q[i];
1531         p[i] = y;
1532      }
1533      for (i = 3; i < N; i++){   # loop 2A
1534         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1535         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1536      }
1537
1538      -- Possibility 3: combination of loop peeling and versioning:
1539      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1540         x = q[i];
1541         p[i] = y;
1542      }
1543      if (p is aligned) {
1544         for (i = 3; i<N; i++){  # loop 3A
1545           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1546           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1547         }
1548      }
1549      else {
1550         for (i = 3; i<N; i++){  # loop 3B
1551           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1552           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1553         }
1554      }
1555
1556      These loops are later passed to loop_transform to be vectorized.  The
1557      vectorizer will use the alignment information to guide the transformation
1558      (whether to generate regular loads/stores, or with special handling for
1559      misalignment).  */
1560
1561 bool
1562 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1563 {
1564   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   enum dr_alignment_support supportable_dr_alignment;
1567   struct data_reference *dr0 = NULL, *first_store = NULL;
1568   struct data_reference *dr;
1569   unsigned int i, j;
1570   bool do_peeling = false;
1571   bool do_versioning = false;
1572   bool stat;
1573   gimple *stmt;
1574   stmt_vec_info stmt_info;
1575   unsigned int npeel = 0;
1576   bool one_misalignment_known = false;
1577   bool one_misalignment_unknown = false;
1578   bool one_dr_unsupportable = false;
1579   struct data_reference *unsupportable_dr = NULL;
1580   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1581   unsigned possible_npeel_number = 1;
1582   tree vectype;
1583   unsigned int nelements, mis, same_align_drs_max = 0;
1584   hash_table<peel_info_hasher> peeling_htab (1);
1585
1586   if (dump_enabled_p ())
1587     dump_printf_loc (MSG_NOTE, vect_location,
1588                      "=== vect_enhance_data_refs_alignment ===\n");
1589
1590   /* Reset data so we can safely be called multiple times.  */
1591   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1592   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1593
1594   /* While cost model enhancements are expected in the future, the high level
1595      view of the code at this time is as follows:
1596
1597      A) If there is a misaligned access then see if peeling to align
1598         this access can make all data references satisfy
1599         vect_supportable_dr_alignment.  If so, update data structures
1600         as needed and return true.
1601
1602      B) If peeling wasn't possible and there is a data reference with an
1603         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1604         then see if loop versioning checks can be used to make all data
1605         references satisfy vect_supportable_dr_alignment.  If so, update
1606         data structures as needed and return true.
1607
1608      C) If neither peeling nor versioning were successful then return false if
1609         any data reference does not satisfy vect_supportable_dr_alignment.
1610
1611      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1612
1613      Note, Possibility 3 above (which is peeling and versioning together) is not
1614      being done at this time.  */
1615
1616   /* (1) Peeling to force alignment.  */
1617
1618   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1619      Considerations:
1620      + How many accesses will become aligned due to the peeling
1621      - How many accesses will become unaligned due to the peeling,
1622        and the cost of misaligned accesses.
1623      - The cost of peeling (the extra runtime checks, the increase
1624        in code size).  */
1625
1626   FOR_EACH_VEC_ELT (datarefs, i, dr)
1627     {
1628       stmt = DR_STMT (dr);
1629       stmt_info = vinfo_for_stmt (stmt);
1630
1631       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1632         continue;
1633
1634       /* For interleaving, only the alignment of the first access
1635          matters.  */
1636       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1637           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1638         continue;
1639
1640       /* For invariant accesses there is nothing to enhance.  */
1641       if (integer_zerop (DR_STEP (dr)))
1642         continue;
1643
1644       /* Strided accesses perform only component accesses, alignment is
1645          irrelevant for them.  */
1646       if (STMT_VINFO_STRIDED_P (stmt_info)
1647           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1648         continue;
1649
1650       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1651       do_peeling = vector_alignment_reachable_p (dr);
1652       if (do_peeling)
1653         {
1654           if (known_alignment_for_access_p (dr))
1655             {
1656               unsigned int npeel_tmp = 0;
1657               bool negative = tree_int_cst_compare (DR_STEP (dr),
1658                                                     size_zero_node) < 0;
1659
1660               vectype = STMT_VINFO_VECTYPE (stmt_info);
1661               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1662               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1663                                                 TREE_TYPE (DR_REF (dr))));
1664               if (DR_MISALIGNMENT (dr) != 0)
1665                 npeel_tmp = (negative ? (mis - nelements)
1666                              : (nelements - mis)) & (nelements - 1);
1667
1668               /* For multiple types, it is possible that the bigger type access
1669                  will have more than one peeling option.  E.g., a loop with two
1670                  types: one of size (vector size / 4), and the other one of
1671                  size (vector size / 8).  Vectorization factor will 8.  If both
1672                  accesses are misaligned by 3, the first one needs one scalar
1673                  iteration to be aligned, and the second one needs 5.  But the
1674                  first one will be aligned also by peeling 5 scalar
1675                  iterations, and in that case both accesses will be aligned.
1676                  Hence, except for the immediate peeling amount, we also want
1677                  to try to add full vector size, while we don't exceed
1678                  vectorization factor.
1679                  We do this automatically for cost model, since we calculate
1680                  cost for every peeling option.  */
1681               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1682                 {
1683                   if (STMT_SLP_TYPE (stmt_info))
1684                     possible_npeel_number
1685                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1686                   else
1687                     possible_npeel_number = vf / nelements;
1688
1689                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1690                      allow peeling NELEMENTS.  */
1691                   if (DR_MISALIGNMENT (dr) == 0)
1692                     possible_npeel_number++;
1693                 }
1694
1695               /* Save info about DR in the hash table.  Also include peeling
1696                  amounts according to the explanation above.  */
1697               for (j = 0; j < possible_npeel_number; j++)
1698                 {
1699                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1700                                             dr, npeel_tmp);
1701                   npeel_tmp += nelements;
1702                 }
1703
1704               one_misalignment_known = true;
1705             }
1706           else
1707             {
1708               /* If we don't know any misalignment values, we prefer
1709                  peeling for data-ref that has the maximum number of data-refs
1710                  with the same alignment, unless the target prefers to align
1711                  stores over load.  */
1712               unsigned same_align_drs
1713                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1714               if (!dr0
1715                   || same_align_drs_max < same_align_drs)
1716                 {
1717                   same_align_drs_max = same_align_drs;
1718                   dr0 = dr;
1719                 }
1720               /* For data-refs with the same number of related
1721                  accesses prefer the one where the misalign
1722                  computation will be invariant in the outermost loop.  */
1723               else if (same_align_drs_max == same_align_drs)
1724                 {
1725                   struct loop *ivloop0, *ivloop;
1726                   ivloop0 = outermost_invariant_loop_for_expr
1727                     (loop, DR_BASE_ADDRESS (dr0));
1728                   ivloop = outermost_invariant_loop_for_expr
1729                     (loop, DR_BASE_ADDRESS (dr));
1730                   if ((ivloop && !ivloop0)
1731                       || (ivloop && ivloop0
1732                           && flow_loop_nested_p (ivloop, ivloop0)))
1733                     dr0 = dr;
1734                 }
1735
1736               one_misalignment_unknown = true;
1737
1738               /* Check for data refs with unsupportable alignment that
1739                  can be peeled.  */
1740               if (!supportable_dr_alignment)
1741               {
1742                 one_dr_unsupportable = true;
1743                 unsupportable_dr = dr;
1744               }
1745
1746               if (!first_store && DR_IS_WRITE (dr))
1747                 first_store = dr;
1748             }
1749         }
1750       else
1751         {
1752           if (!aligned_access_p (dr))
1753             {
1754               if (dump_enabled_p ())
1755                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756                                  "vector alignment may not be reachable\n");
1757               break;
1758             }
1759         }
1760     }
1761
1762   /* Check if we can possibly peel the loop.  */
1763   if (!vect_can_advance_ivs_p (loop_vinfo)
1764       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1765       || loop->inner)
1766     do_peeling = false;
1767
1768   struct _vect_peel_extended_info peel_for_known_alignment;
1769   struct _vect_peel_extended_info peel_for_unknown_alignment;
1770   struct _vect_peel_extended_info best_peel;
1771
1772   peel_for_unknown_alignment.inside_cost = INT_MAX;
1773   peel_for_unknown_alignment.outside_cost = INT_MAX;
1774   peel_for_unknown_alignment.peel_info.count = 0;
1775
1776   if (do_peeling
1777       && one_misalignment_unknown)
1778     {
1779       /* Check if the target requires to prefer stores over loads, i.e., if
1780          misaligned stores are more expensive than misaligned loads (taking
1781          drs with same alignment into account).  */
1782       unsigned int load_inside_cost = 0;
1783       unsigned int load_outside_cost = 0;
1784       unsigned int store_inside_cost = 0;
1785       unsigned int store_outside_cost = 0;
1786
1787       stmt_vector_for_cost dummy;
1788       dummy.create (2);
1789       vect_get_peeling_costs_all_drs (datarefs, dr0,
1790                                       &load_inside_cost,
1791                                       &load_outside_cost,
1792                                       &dummy, vf / 2, true);
1793       dummy.release ();
1794
1795       if (first_store)
1796         {
1797           dummy.create (2);
1798           vect_get_peeling_costs_all_drs (datarefs, first_store,
1799                                           &store_inside_cost,
1800                                           &store_outside_cost,
1801                                           &dummy, vf / 2, true);
1802           dummy.release ();
1803         }
1804       else
1805         {
1806           store_inside_cost = INT_MAX;
1807           store_outside_cost = INT_MAX;
1808         }
1809
1810       if (load_inside_cost > store_inside_cost
1811           || (load_inside_cost == store_inside_cost
1812               && load_outside_cost > store_outside_cost))
1813         {
1814           dr0 = first_store;
1815           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1816           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1817         }
1818       else
1819         {
1820           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1821           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1822         }
1823
1824       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1825       prologue_cost_vec.create (2);
1826       epilogue_cost_vec.create (2);
1827
1828       int dummy2;
1829       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1830         (loop_vinfo, vf / 2, &dummy2,
1831          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1832          &prologue_cost_vec, &epilogue_cost_vec);
1833
1834       prologue_cost_vec.release ();
1835       epilogue_cost_vec.release ();
1836
1837       peel_for_unknown_alignment.peel_info.count = 1
1838         + STMT_VINFO_SAME_ALIGN_REFS
1839         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1840     }
1841
1842   peel_for_unknown_alignment.peel_info.npeel = 0;
1843   peel_for_unknown_alignment.peel_info.dr = dr0;
1844
1845   best_peel = peel_for_unknown_alignment;
1846
1847   peel_for_known_alignment.inside_cost = INT_MAX;
1848   peel_for_known_alignment.outside_cost = INT_MAX;
1849   peel_for_known_alignment.peel_info.count = 0;
1850   peel_for_known_alignment.peel_info.dr = NULL;
1851
1852   if (do_peeling && one_misalignment_known)
1853     {
1854       /* Peeling is possible, but there is no data access that is not supported
1855          unless aligned.  So we try to choose the best possible peeling from
1856          the hash table.  */
1857       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1858         (&peeling_htab, loop_vinfo);
1859     }
1860
1861   /* Compare costs of peeling for known and unknown alignment. */
1862   if (peel_for_known_alignment.peel_info.dr != NULL
1863       && peel_for_unknown_alignment.inside_cost
1864       >= peel_for_known_alignment.inside_cost)
1865     {
1866       best_peel = peel_for_known_alignment;
1867
1868       /* If the best peeling for known alignment has NPEEL == 0, perform no
1869          peeling at all except if there is an unsupportable dr that we can
1870          align.  */
1871       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1872         do_peeling = false;
1873     }
1874
1875   /* If there is an unsupportable data ref, prefer this over all choices so far
1876      since we'd have to discard a chosen peeling except when it accidentally
1877      aligned the unsupportable data ref.  */
1878   if (one_dr_unsupportable)
1879     dr0 = unsupportable_dr;
1880   else if (do_peeling)
1881     {
1882       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1883          TODO: Use nopeel_outside_cost or get rid of it?  */
1884       unsigned nopeel_inside_cost = 0;
1885       unsigned nopeel_outside_cost = 0;
1886
1887       stmt_vector_for_cost dummy;
1888       dummy.create (2);
1889       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1890                                       &nopeel_outside_cost, &dummy, 0, false);
1891       dummy.release ();
1892
1893       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1894          costs will be recorded.  */
1895       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1896       prologue_cost_vec.create (2);
1897       epilogue_cost_vec.create (2);
1898
1899       int dummy2;
1900       nopeel_outside_cost += vect_get_known_peeling_cost
1901         (loop_vinfo, 0, &dummy2,
1902          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1903          &prologue_cost_vec, &epilogue_cost_vec);
1904
1905       prologue_cost_vec.release ();
1906       epilogue_cost_vec.release ();
1907
1908       npeel = best_peel.peel_info.npeel;
1909       dr0 = best_peel.peel_info.dr;
1910
1911       /* If no peeling is not more expensive than the best peeling we
1912          have so far, don't perform any peeling.  */
1913       if (nopeel_inside_cost <= best_peel.inside_cost)
1914         do_peeling = false;
1915     }
1916
1917   if (do_peeling)
1918     {
1919       stmt = DR_STMT (dr0);
1920       stmt_info = vinfo_for_stmt (stmt);
1921       vectype = STMT_VINFO_VECTYPE (stmt_info);
1922       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1923
1924       if (known_alignment_for_access_p (dr0))
1925         {
1926           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1927                                                 size_zero_node) < 0;
1928           if (!npeel)
1929             {
1930               /* Since it's known at compile time, compute the number of
1931                  iterations in the peeled loop (the peeling factor) for use in
1932                  updating DR_MISALIGNMENT values.  The peeling factor is the
1933                  vectorization factor minus the misalignment as an element
1934                  count.  */
1935               mis = DR_MISALIGNMENT (dr0);
1936               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1937               npeel = ((negative ? mis - nelements : nelements - mis)
1938                        & (nelements - 1));
1939             }
1940
1941           /* For interleaved data access every iteration accesses all the
1942              members of the group, therefore we divide the number of iterations
1943              by the group size.  */
1944           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1945           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946             npeel /= GROUP_SIZE (stmt_info);
1947
1948           if (dump_enabled_p ())
1949             dump_printf_loc (MSG_NOTE, vect_location,
1950                              "Try peeling by %d\n", npeel);
1951         }
1952
1953       /* Ensure that all datarefs can be vectorized after the peel.  */
1954       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1955         do_peeling = false;
1956
1957       /* Check if all datarefs are supportable and log.  */
1958       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1959         {
1960           stat = vect_verify_datarefs_alignment (loop_vinfo);
1961           if (!stat)
1962             do_peeling = false;
1963           else
1964             return stat;
1965         }
1966
1967       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1968       if (do_peeling)
1969         {
1970           unsigned max_allowed_peel
1971             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1972           if (max_allowed_peel != (unsigned)-1)
1973             {
1974               unsigned max_peel = npeel;
1975               if (max_peel == 0)
1976                 {
1977                   gimple *dr_stmt = DR_STMT (dr0);
1978                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1979                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1980                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1981                 }
1982               if (max_peel > max_allowed_peel)
1983                 {
1984                   do_peeling = false;
1985                   if (dump_enabled_p ())
1986                     dump_printf_loc (MSG_NOTE, vect_location,
1987                         "Disable peeling, max peels reached: %d\n", max_peel);
1988                 }
1989             }
1990         }
1991
1992       /* Cost model #2 - if peeling may result in a remaining loop not
1993          iterating enough to be vectorized then do not peel.  */
1994       if (do_peeling
1995           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1996         {
1997           unsigned max_peel
1998             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1999           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2000               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2001             do_peeling = false;
2002         }
2003
2004       if (do_peeling)
2005         {
2006           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2007              If the misalignment of DR_i is identical to that of dr0 then set
2008              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2009              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2010              by the peeling factor times the element size of DR_i (MOD the
2011              vectorization factor times the size).  Otherwise, the
2012              misalignment of DR_i must be set to unknown.  */
2013           FOR_EACH_VEC_ELT (datarefs, i, dr)
2014             if (dr != dr0)
2015               {
2016                 /* Strided accesses perform only component accesses, alignment
2017                    is irrelevant for them.  */
2018                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2019                 if (STMT_VINFO_STRIDED_P (stmt_info)
2020                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021                   continue;
2022
2023                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2024               }
2025
2026           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2027           if (npeel)
2028             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2029           else
2030             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2031               = DR_MISALIGNMENT (dr0);
2032           SET_DR_MISALIGNMENT (dr0, 0);
2033           if (dump_enabled_p ())
2034             {
2035               dump_printf_loc (MSG_NOTE, vect_location,
2036                                "Alignment of access forced using peeling.\n");
2037               dump_printf_loc (MSG_NOTE, vect_location,
2038                                "Peeling for alignment will be applied.\n");
2039             }
2040
2041           /* The inside-loop cost will be accounted for in vectorizable_load
2042              and vectorizable_store correctly with adjusted alignments.
2043              Drop the body_cst_vec on the floor here.  */
2044           stat = vect_verify_datarefs_alignment (loop_vinfo);
2045           gcc_assert (stat);
2046           return stat;
2047         }
2048     }
2049
2050   /* (2) Versioning to force alignment.  */
2051
2052   /* Try versioning if:
2053      1) optimize loop for speed
2054      2) there is at least one unsupported misaligned data ref with an unknown
2055         misalignment, and
2056      3) all misaligned data refs with a known misalignment are supported, and
2057      4) the number of runtime alignment checks is within reason.  */
2058
2059   do_versioning =
2060         optimize_loop_nest_for_speed_p (loop)
2061         && (!loop->inner); /* FORNOW */
2062
2063   if (do_versioning)
2064     {
2065       FOR_EACH_VEC_ELT (datarefs, i, dr)
2066         {
2067           stmt = DR_STMT (dr);
2068           stmt_info = vinfo_for_stmt (stmt);
2069
2070           /* For interleaving, only the alignment of the first access
2071              matters.  */
2072           if (aligned_access_p (dr)
2073               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2074                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2075             continue;
2076
2077           if (STMT_VINFO_STRIDED_P (stmt_info))
2078             {
2079               /* Strided loads perform only component accesses, alignment is
2080                  irrelevant for them.  */
2081               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2082                 continue;
2083               do_versioning = false;
2084               break;
2085             }
2086
2087           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2088
2089           if (!supportable_dr_alignment)
2090             {
2091               gimple *stmt;
2092               int mask;
2093               tree vectype;
2094
2095               if (known_alignment_for_access_p (dr)
2096                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2097                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2098                 {
2099                   do_versioning = false;
2100                   break;
2101                 }
2102
2103               stmt = DR_STMT (dr);
2104               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2105               gcc_assert (vectype);
2106
2107               /* The rightmost bits of an aligned address must be zeros.
2108                  Construct the mask needed for this test.  For example,
2109                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2110                  mask must be 15 = 0xf. */
2111               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2112
2113               /* FORNOW: use the same mask to test all potentially unaligned
2114                  references in the loop.  The vectorizer currently supports
2115                  a single vector size, see the reference to
2116                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2117                  vectorization factor is computed.  */
2118               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2119                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2120               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2121               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2122                       DR_STMT (dr));
2123             }
2124         }
2125
2126       /* Versioning requires at least one misaligned data reference.  */
2127       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2128         do_versioning = false;
2129       else if (!do_versioning)
2130         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2131     }
2132
2133   if (do_versioning)
2134     {
2135       vec<gimple *> may_misalign_stmts
2136         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2137       gimple *stmt;
2138
2139       /* It can now be assumed that the data references in the statements
2140          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2141          of the loop being vectorized.  */
2142       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2143         {
2144           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2145           dr = STMT_VINFO_DATA_REF (stmt_info);
2146           SET_DR_MISALIGNMENT (dr, 0);
2147           if (dump_enabled_p ())
2148             dump_printf_loc (MSG_NOTE, vect_location,
2149                              "Alignment of access forced using versioning.\n");
2150         }
2151
2152       if (dump_enabled_p ())
2153         dump_printf_loc (MSG_NOTE, vect_location,
2154                          "Versioning for alignment will be applied.\n");
2155
2156       /* Peeling and versioning can't be done together at this time.  */
2157       gcc_assert (! (do_peeling && do_versioning));
2158
2159       stat = vect_verify_datarefs_alignment (loop_vinfo);
2160       gcc_assert (stat);
2161       return stat;
2162     }
2163
2164   /* This point is reached if neither peeling nor versioning is being done.  */
2165   gcc_assert (! (do_peeling || do_versioning));
2166
2167   stat = vect_verify_datarefs_alignment (loop_vinfo);
2168   return stat;
2169 }
2170
2171
2172 /* Function vect_find_same_alignment_drs.
2173
2174    Update group and alignment relations according to the chosen
2175    vectorization factor.  */
2176
2177 static void
2178 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2179 {
2180   struct data_reference *dra = DDR_A (ddr);
2181   struct data_reference *drb = DDR_B (ddr);
2182   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2183   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2184
2185   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2186     return;
2187
2188   if (dra == drb)
2189     return;
2190
2191   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2192       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2193       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2194     return;
2195
2196   /* Two references with distance zero have the same alignment.  */
2197   offset_int diff = (wi::to_offset (DR_INIT (dra))
2198                      - wi::to_offset (DR_INIT (drb)));
2199   if (diff != 0)
2200     {
2201       /* Get the wider of the two alignments.  */
2202       unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2203       unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2204       unsigned int max_align = MAX (align_a, align_b);
2205
2206       /* Require the gap to be a multiple of the larger vector alignment.  */
2207       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2208         return;
2209     }
2210
2211   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2212   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2213   if (dump_enabled_p ())
2214     {
2215       dump_printf_loc (MSG_NOTE, vect_location,
2216                        "accesses have the same alignment: ");
2217       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2218       dump_printf (MSG_NOTE,  " and ");
2219       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2220       dump_printf (MSG_NOTE, "\n");
2221     }
2222 }
2223
2224
2225 /* Function vect_analyze_data_refs_alignment
2226
2227    Analyze the alignment of the data-references in the loop.
2228    Return FALSE if a data reference is found that cannot be vectorized.  */
2229
2230 bool
2231 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2232 {
2233   if (dump_enabled_p ())
2234     dump_printf_loc (MSG_NOTE, vect_location,
2235                      "=== vect_analyze_data_refs_alignment ===\n");
2236
2237   /* Mark groups of data references with same alignment using
2238      data dependence information.  */
2239   vec<ddr_p> ddrs = vinfo->ddrs;
2240   struct data_dependence_relation *ddr;
2241   unsigned int i;
2242
2243   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2244     vect_find_same_alignment_drs (ddr);
2245
2246   vec<data_reference_p> datarefs = vinfo->datarefs;
2247   struct data_reference *dr;
2248
2249   vect_record_base_alignments (vinfo);
2250   FOR_EACH_VEC_ELT (datarefs, i, dr)
2251     {
2252       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2253       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2254           && !vect_compute_data_ref_alignment (dr))
2255         {
2256           /* Strided accesses perform only component accesses, misalignment
2257              information is irrelevant for them.  */
2258           if (STMT_VINFO_STRIDED_P (stmt_info)
2259               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2260             continue;
2261
2262           if (dump_enabled_p ())
2263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264                              "not vectorized: can't calculate alignment "
2265                              "for data ref.\n");
2266
2267           return false;
2268         }
2269     }
2270
2271   return true;
2272 }
2273
2274
2275 /* Analyze alignment of DRs of stmts in NODE.  */
2276
2277 static bool
2278 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2279 {
2280   /* We vectorize from the first scalar stmt in the node unless
2281      the node is permuted in which case we start from the first
2282      element in the group.  */
2283   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2284   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2285   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2286     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2287
2288   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2289   if (! vect_compute_data_ref_alignment (dr)
2290       /* For creating the data-ref pointer we need alignment of the
2291          first element anyway.  */
2292       || (dr != first_dr
2293           && ! vect_compute_data_ref_alignment (first_dr))
2294       || ! verify_data_ref_alignment (dr))
2295     {
2296       if (dump_enabled_p ())
2297         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298                          "not vectorized: bad data alignment in basic "
2299                          "block.\n");
2300       return false;
2301     }
2302
2303   return true;
2304 }
2305
2306 /* Function vect_slp_analyze_instance_alignment
2307
2308    Analyze the alignment of the data-references in the SLP instance.
2309    Return FALSE if a data reference is found that cannot be vectorized.  */
2310
2311 bool
2312 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2313 {
2314   if (dump_enabled_p ())
2315     dump_printf_loc (MSG_NOTE, vect_location,
2316                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2317
2318   slp_tree node;
2319   unsigned i;
2320   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2321     if (! vect_slp_analyze_and_verify_node_alignment (node))
2322       return false;
2323
2324   node = SLP_INSTANCE_TREE (instance);
2325   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2326       && ! vect_slp_analyze_and_verify_node_alignment
2327              (SLP_INSTANCE_TREE (instance)))
2328     return false;
2329
2330   return true;
2331 }
2332
2333
2334 /* Analyze groups of accesses: check that DR belongs to a group of
2335    accesses of legal size, step, etc.  Detect gaps, single element
2336    interleaving, and other special cases. Set grouped access info.
2337    Collect groups of strided stores for further use in SLP analysis.
2338    Worker for vect_analyze_group_access.  */
2339
2340 static bool
2341 vect_analyze_group_access_1 (struct data_reference *dr)
2342 {
2343   tree step = DR_STEP (dr);
2344   tree scalar_type = TREE_TYPE (DR_REF (dr));
2345   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2346   gimple *stmt = DR_STMT (dr);
2347   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2348   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2349   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2350   HOST_WIDE_INT dr_step = -1;
2351   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2352   bool slp_impossible = false;
2353
2354   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2355      size of the interleaving group (including gaps).  */
2356   if (tree_fits_shwi_p (step))
2357     {
2358       dr_step = tree_to_shwi (step);
2359       /* Check that STEP is a multiple of type size.  Otherwise there is
2360          a non-element-sized gap at the end of the group which we
2361          cannot represent in GROUP_GAP or GROUP_SIZE.
2362          ???  As we can handle non-constant step fine here we should
2363          simply remove uses of GROUP_GAP between the last and first
2364          element and instead rely on DR_STEP.  GROUP_SIZE then would
2365          simply not include that gap.  */
2366       if ((dr_step % type_size) != 0)
2367         {
2368           if (dump_enabled_p ())
2369             {
2370               dump_printf_loc (MSG_NOTE, vect_location,
2371                                "Step ");
2372               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2373               dump_printf (MSG_NOTE,
2374                            " is not a multiple of the element size for ");
2375               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2376               dump_printf (MSG_NOTE, "\n");
2377             }
2378           return false;
2379         }
2380       groupsize = absu_hwi (dr_step) / type_size;
2381     }
2382   else
2383     groupsize = 0;
2384
2385   /* Not consecutive access is possible only if it is a part of interleaving.  */
2386   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2387     {
2388       /* Check if it this DR is a part of interleaving, and is a single
2389          element of the group that is accessed in the loop.  */
2390
2391       /* Gaps are supported only for loads. STEP must be a multiple of the type
2392          size.  The size of the group must be a power of 2.  */
2393       if (DR_IS_READ (dr)
2394           && (dr_step % type_size) == 0
2395           && groupsize > 0
2396           && pow2p_hwi (groupsize))
2397         {
2398           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2399           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2400           GROUP_GAP (stmt_info) = groupsize - 1;
2401           if (dump_enabled_p ())
2402             {
2403               dump_printf_loc (MSG_NOTE, vect_location,
2404                                "Detected single element interleaving ");
2405               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2406               dump_printf (MSG_NOTE, " step ");
2407               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2408               dump_printf (MSG_NOTE, "\n");
2409             }
2410
2411           return true;
2412         }
2413
2414       if (dump_enabled_p ())
2415         {
2416           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417                            "not consecutive access ");
2418           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2419         }
2420
2421       if (bb_vinfo)
2422         {
2423           /* Mark the statement as unvectorizable.  */
2424           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2425           return true;
2426         }
2427
2428       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2429       STMT_VINFO_STRIDED_P (stmt_info) = true;
2430       return true;
2431     }
2432
2433   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2434     {
2435       /* First stmt in the interleaving chain. Check the chain.  */
2436       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2437       struct data_reference *data_ref = dr;
2438       unsigned int count = 1;
2439       tree prev_init = DR_INIT (data_ref);
2440       gimple *prev = stmt;
2441       HOST_WIDE_INT diff, gaps = 0;
2442
2443       while (next)
2444         {
2445           /* Skip same data-refs.  In case that two or more stmts share
2446              data-ref (supported only for loads), we vectorize only the first
2447              stmt, and the rest get their vectorized loads from the first
2448              one.  */
2449           if (!tree_int_cst_compare (DR_INIT (data_ref),
2450                                      DR_INIT (STMT_VINFO_DATA_REF (
2451                                                    vinfo_for_stmt (next)))))
2452             {
2453               if (DR_IS_WRITE (data_ref))
2454                 {
2455                   if (dump_enabled_p ())
2456                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2457                                      "Two store stmts share the same dr.\n");
2458                   return false;
2459                 }
2460
2461               if (dump_enabled_p ())
2462                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2463                                  "Two or more load stmts share the same dr.\n");
2464
2465               /* For load use the same data-ref load.  */
2466               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2467
2468               prev = next;
2469               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2470               continue;
2471             }
2472
2473           prev = next;
2474           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2475
2476           /* All group members have the same STEP by construction.  */
2477           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2478
2479           /* Check that the distance between two accesses is equal to the type
2480              size. Otherwise, we have gaps.  */
2481           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2482                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2483           if (diff != 1)
2484             {
2485               /* FORNOW: SLP of accesses with gaps is not supported.  */
2486               slp_impossible = true;
2487               if (DR_IS_WRITE (data_ref))
2488                 {
2489                   if (dump_enabled_p ())
2490                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2491                                      "interleaved store with gaps\n");
2492                   return false;
2493                 }
2494
2495               gaps += diff - 1;
2496             }
2497
2498           last_accessed_element += diff;
2499
2500           /* Store the gap from the previous member of the group. If there is no
2501              gap in the access, GROUP_GAP is always 1.  */
2502           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2503
2504           prev_init = DR_INIT (data_ref);
2505           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506           /* Count the number of data-refs in the chain.  */
2507           count++;
2508         }
2509
2510       if (groupsize == 0)
2511         groupsize = count + gaps;
2512
2513       /* This could be UINT_MAX but as we are generating code in a very
2514          inefficient way we have to cap earlier.  See PR78699 for example.  */
2515       if (groupsize > 4096)
2516         {
2517           if (dump_enabled_p ())
2518             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2519                              "group is too large\n");
2520           return false;
2521         }
2522
2523       /* Check that the size of the interleaving is equal to count for stores,
2524          i.e., that there are no gaps.  */
2525       if (groupsize != count
2526           && !DR_IS_READ (dr))
2527         {
2528           if (dump_enabled_p ())
2529             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2530                              "interleaved store with gaps\n");
2531           return false;
2532         }
2533
2534       /* If there is a gap after the last load in the group it is the
2535          difference between the groupsize and the last accessed
2536          element.
2537          When there is no gap, this difference should be 0.  */
2538       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2539
2540       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2541       if (dump_enabled_p ())
2542         {
2543           dump_printf_loc (MSG_NOTE, vect_location,
2544                            "Detected interleaving ");
2545           if (DR_IS_READ (dr))
2546             dump_printf (MSG_NOTE, "load ");
2547           else
2548             dump_printf (MSG_NOTE, "store ");
2549           dump_printf (MSG_NOTE, "of size %u starting with ",
2550                        (unsigned)groupsize);
2551           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2552           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2553             dump_printf_loc (MSG_NOTE, vect_location,
2554                              "There is a gap of %u elements after the group\n",
2555                              GROUP_GAP (vinfo_for_stmt (stmt)));
2556         }
2557
2558       /* SLP: create an SLP data structure for every interleaving group of
2559          stores for further analysis in vect_analyse_slp.  */
2560       if (DR_IS_WRITE (dr) && !slp_impossible)
2561         {
2562           if (loop_vinfo)
2563             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2564           if (bb_vinfo)
2565             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2566         }
2567     }
2568
2569   return true;
2570 }
2571
2572 /* Analyze groups of accesses: check that DR belongs to a group of
2573    accesses of legal size, step, etc.  Detect gaps, single element
2574    interleaving, and other special cases. Set grouped access info.
2575    Collect groups of strided stores for further use in SLP analysis.  */
2576
2577 static bool
2578 vect_analyze_group_access (struct data_reference *dr)
2579 {
2580   if (!vect_analyze_group_access_1 (dr))
2581     {
2582       /* Dissolve the group if present.  */
2583       gimple *next;
2584       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2585       while (stmt)
2586         {
2587           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2588           next = GROUP_NEXT_ELEMENT (vinfo);
2589           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2590           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2591           stmt = next;
2592         }
2593       return false;
2594     }
2595   return true;
2596 }
2597
2598 /* Analyze the access pattern of the data-reference DR.
2599    In case of non-consecutive accesses call vect_analyze_group_access() to
2600    analyze groups of accesses.  */
2601
2602 static bool
2603 vect_analyze_data_ref_access (struct data_reference *dr)
2604 {
2605   tree step = DR_STEP (dr);
2606   tree scalar_type = TREE_TYPE (DR_REF (dr));
2607   gimple *stmt = DR_STMT (dr);
2608   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2609   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2610   struct loop *loop = NULL;
2611
2612   if (loop_vinfo)
2613     loop = LOOP_VINFO_LOOP (loop_vinfo);
2614
2615   if (loop_vinfo && !step)
2616     {
2617       if (dump_enabled_p ())
2618         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2619                          "bad data-ref access in loop\n");
2620       return false;
2621     }
2622
2623   /* Allow loads with zero step in inner-loop vectorization.  */
2624   if (loop_vinfo && integer_zerop (step))
2625     {
2626       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2627       if (!nested_in_vect_loop_p (loop, stmt))
2628         return DR_IS_READ (dr);
2629       /* Allow references with zero step for outer loops marked
2630          with pragma omp simd only - it guarantees absence of
2631          loop-carried dependencies between inner loop iterations.  */
2632       if (!loop->force_vectorize)
2633         {
2634           if (dump_enabled_p ())
2635             dump_printf_loc (MSG_NOTE, vect_location,
2636                              "zero step in inner loop of nest\n");
2637           return false;
2638         }
2639     }
2640
2641   if (loop && nested_in_vect_loop_p (loop, stmt))
2642     {
2643       /* Interleaved accesses are not yet supported within outer-loop
2644         vectorization for references in the inner-loop.  */
2645       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2646
2647       /* For the rest of the analysis we use the outer-loop step.  */
2648       step = STMT_VINFO_DR_STEP (stmt_info);
2649       if (integer_zerop (step))
2650         {
2651           if (dump_enabled_p ())
2652             dump_printf_loc (MSG_NOTE, vect_location,
2653                              "zero step in outer loop.\n");
2654           return DR_IS_READ (dr);
2655         }
2656     }
2657
2658   /* Consecutive?  */
2659   if (TREE_CODE (step) == INTEGER_CST)
2660     {
2661       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2662       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2663           || (dr_step < 0
2664               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2665         {
2666           /* Mark that it is not interleaving.  */
2667           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2668           return true;
2669         }
2670     }
2671
2672   if (loop && nested_in_vect_loop_p (loop, stmt))
2673     {
2674       if (dump_enabled_p ())
2675         dump_printf_loc (MSG_NOTE, vect_location,
2676                          "grouped access in outer loop.\n");
2677       return false;
2678     }
2679
2680
2681   /* Assume this is a DR handled by non-constant strided load case.  */
2682   if (TREE_CODE (step) != INTEGER_CST)
2683     return (STMT_VINFO_STRIDED_P (stmt_info)
2684             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2685                 || vect_analyze_group_access (dr)));
2686
2687   /* Not consecutive access - check if it's a part of interleaving group.  */
2688   return vect_analyze_group_access (dr);
2689 }
2690
2691 /* Compare two data-references DRA and DRB to group them into chunks
2692    suitable for grouping.  */
2693
2694 static int
2695 dr_group_sort_cmp (const void *dra_, const void *drb_)
2696 {
2697   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2698   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2699   int cmp;
2700
2701   /* Stabilize sort.  */
2702   if (dra == drb)
2703     return 0;
2704
2705   /* DRs in different loops never belong to the same group.  */
2706   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2707   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2708   if (loopa != loopb)
2709     return loopa->num < loopb->num ? -1 : 1;
2710
2711   /* Ordering of DRs according to base.  */
2712   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2713     {
2714       cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2715                                    DR_BASE_ADDRESS (drb));
2716       if (cmp != 0)
2717         return cmp;
2718     }
2719
2720   /* And according to DR_OFFSET.  */
2721   if (!dr_equal_offsets_p (dra, drb))
2722     {
2723       cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2724       if (cmp != 0)
2725         return cmp;
2726     }
2727
2728   /* Put reads before writes.  */
2729   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2730     return DR_IS_READ (dra) ? -1 : 1;
2731
2732   /* Then sort after access size.  */
2733   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2734                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2735     {
2736       cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2737                                    TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2738       if (cmp != 0)
2739         return cmp;
2740     }
2741
2742   /* And after step.  */
2743   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2744     {
2745       cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2746       if (cmp != 0)
2747         return cmp;
2748     }
2749
2750   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2751   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2752   if (cmp == 0)
2753     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2754   return cmp;
2755 }
2756
2757 /* Function vect_analyze_data_ref_accesses.
2758
2759    Analyze the access pattern of all the data references in the loop.
2760
2761    FORNOW: the only access pattern that is considered vectorizable is a
2762            simple step 1 (consecutive) access.
2763
2764    FORNOW: handle only arrays and pointer accesses.  */
2765
2766 bool
2767 vect_analyze_data_ref_accesses (vec_info *vinfo)
2768 {
2769   unsigned int i;
2770   vec<data_reference_p> datarefs = vinfo->datarefs;
2771   struct data_reference *dr;
2772
2773   if (dump_enabled_p ())
2774     dump_printf_loc (MSG_NOTE, vect_location,
2775                      "=== vect_analyze_data_ref_accesses ===\n");
2776
2777   if (datarefs.is_empty ())
2778     return true;
2779
2780   /* Sort the array of datarefs to make building the interleaving chains
2781      linear.  Don't modify the original vector's order, it is needed for
2782      determining what dependencies are reversed.  */
2783   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2784   datarefs_copy.qsort (dr_group_sort_cmp);
2785
2786   /* Build the interleaving chains.  */
2787   for (i = 0; i < datarefs_copy.length () - 1;)
2788     {
2789       data_reference_p dra = datarefs_copy[i];
2790       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2791       stmt_vec_info lastinfo = NULL;
2792       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2793         {
2794           ++i;
2795           continue;
2796         }
2797       for (i = i + 1; i < datarefs_copy.length (); ++i)
2798         {
2799           data_reference_p drb = datarefs_copy[i];
2800           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2801           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2802             break;
2803
2804           /* ???  Imperfect sorting (non-compatible types, non-modulo
2805              accesses, same accesses) can lead to a group to be artificially
2806              split here as we don't just skip over those.  If it really
2807              matters we can push those to a worklist and re-iterate
2808              over them.  The we can just skip ahead to the next DR here.  */
2809
2810           /* DRs in a different loop should not be put into the same
2811              interleaving group.  */
2812           if (gimple_bb (DR_STMT (dra))->loop_father
2813               != gimple_bb (DR_STMT (drb))->loop_father)
2814             break;
2815
2816           /* Check that the data-refs have same first location (except init)
2817              and they are both either store or load (not load and store,
2818              not masked loads or stores).  */
2819           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2820               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2821                                    DR_BASE_ADDRESS (drb), 0)
2822               || !dr_equal_offsets_p (dra, drb)
2823               || !gimple_assign_single_p (DR_STMT (dra))
2824               || !gimple_assign_single_p (DR_STMT (drb)))
2825             break;
2826
2827           /* Check that the data-refs have the same constant size.  */
2828           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2829           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2830           if (!tree_fits_uhwi_p (sza)
2831               || !tree_fits_uhwi_p (szb)
2832               || !tree_int_cst_equal (sza, szb))
2833             break;
2834
2835           /* Check that the data-refs have the same step.  */
2836           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2837             break;
2838
2839           /* Do not place the same access in the interleaving chain twice.  */
2840           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2841             break;
2842
2843           /* Check the types are compatible.
2844              ???  We don't distinguish this during sorting.  */
2845           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2846                                    TREE_TYPE (DR_REF (drb))))
2847             break;
2848
2849           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2850           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2851           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2852           gcc_assert (init_a <= init_b);
2853
2854           /* If init_b == init_a + the size of the type * k, we have an
2855              interleaving, and DRA is accessed before DRB.  */
2856           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2857           if (type_size_a == 0
2858               || (init_b - init_a) % type_size_a != 0)
2859             break;
2860
2861           /* If we have a store, the accesses are adjacent.  This splits
2862              groups into chunks we support (we don't support vectorization
2863              of stores with gaps).  */
2864           if (!DR_IS_READ (dra)
2865               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2866                                              (DR_INIT (datarefs_copy[i-1]))
2867                   != type_size_a))
2868             break;
2869
2870           /* If the step (if not zero or non-constant) is greater than the
2871              difference between data-refs' inits this splits groups into
2872              suitable sizes.  */
2873           if (tree_fits_shwi_p (DR_STEP (dra)))
2874             {
2875               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2876               if (step != 0 && step <= (init_b - init_a))
2877                 break;
2878             }
2879
2880           if (dump_enabled_p ())
2881             {
2882               dump_printf_loc (MSG_NOTE, vect_location,
2883                                "Detected interleaving ");
2884               if (DR_IS_READ (dra))
2885                 dump_printf (MSG_NOTE, "load ");
2886               else
2887                 dump_printf (MSG_NOTE, "store ");
2888               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2889               dump_printf (MSG_NOTE,  " and ");
2890               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2891               dump_printf (MSG_NOTE, "\n");
2892             }
2893
2894           /* Link the found element into the group list.  */
2895           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2896             {
2897               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2898               lastinfo = stmtinfo_a;
2899             }
2900           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2901           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2902           lastinfo = stmtinfo_b;
2903         }
2904     }
2905
2906   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2907     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2908         && !vect_analyze_data_ref_access (dr))
2909       {
2910         if (dump_enabled_p ())
2911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912                            "not vectorized: complicated access pattern.\n");
2913
2914         if (is_a <bb_vec_info> (vinfo))
2915           {
2916             /* Mark the statement as not vectorizable.  */
2917             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2918             continue;
2919           }
2920         else
2921           {
2922             datarefs_copy.release ();
2923             return false;
2924           }
2925       }
2926
2927   datarefs_copy.release ();
2928   return true;
2929 }
2930
2931 /* Function vect_vfa_segment_size.
2932
2933    Create an expression that computes the size of segment
2934    that will be accessed for a data reference.  The functions takes into
2935    account that realignment loads may access one more vector.
2936
2937    Input:
2938      DR: The data reference.
2939      LENGTH_FACTOR: segment length to consider.
2940
2941    Return an expression whose value is the size of segment which will be
2942    accessed by DR.  */
2943
2944 static tree
2945 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2946 {
2947   tree segment_length;
2948
2949   if (integer_zerop (DR_STEP (dr)))
2950     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2951   else
2952     segment_length = size_binop (MULT_EXPR,
2953                                  fold_convert (sizetype, DR_STEP (dr)),
2954                                  fold_convert (sizetype, length_factor));
2955
2956   if (vect_supportable_dr_alignment (dr, false)
2957         == dr_explicit_realign_optimized)
2958     {
2959       tree vector_size = TYPE_SIZE_UNIT
2960                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2961
2962       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2963     }
2964   return segment_length;
2965 }
2966
2967 /* Function vect_no_alias_p.
2968
2969    Given data references A and B with equal base and offset, the alias
2970    relation can be decided at compilation time, return TRUE if they do
2971    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2972    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2973    respectively.  */
2974
2975 static bool
2976 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2977                  tree segment_length_a, tree segment_length_b)
2978 {
2979   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2980               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2981   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2982     return false;
2983
2984   tree seg_a_min = DR_INIT (a);
2985   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2986                                 seg_a_min, segment_length_a);
2987   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2988      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2989      [a, a+12) */
2990   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2991     {
2992       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2993       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2994                                seg_a_max, unit_size);
2995       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2996                                DR_INIT (a), unit_size);
2997     }
2998   tree seg_b_min = DR_INIT (b);
2999   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3000                                 seg_b_min, segment_length_b);
3001   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3002     {
3003       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3004       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3005                                seg_b_max, unit_size);
3006       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3007                                DR_INIT (b), unit_size);
3008     }
3009
3010   if (tree_int_cst_le (seg_a_max, seg_b_min)
3011       || tree_int_cst_le (seg_b_max, seg_a_min))
3012     return true;
3013
3014   return false;
3015 }
3016
3017 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3018    in DDR is >= VF.  */
3019
3020 static bool
3021 dependence_distance_ge_vf (data_dependence_relation *ddr,
3022                            unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3023 {
3024   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3025       || DDR_NUM_DIST_VECTS (ddr) == 0)
3026     return false;
3027
3028   /* If the dependence is exact, we should have limited the VF instead.  */
3029   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3030
3031   unsigned int i;
3032   lambda_vector dist_v;
3033   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3034     {
3035       HOST_WIDE_INT dist = dist_v[loop_depth];
3036       if (dist != 0
3037           && !(dist > 0 && DDR_REVERSED_P (ddr))
3038           && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3039         return false;
3040     }
3041
3042   if (dump_enabled_p ())
3043     {
3044       dump_printf_loc (MSG_NOTE, vect_location,
3045                        "dependence distance between ");
3046       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3047       dump_printf (MSG_NOTE,  " and ");
3048       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3049       dump_printf (MSG_NOTE,  " is >= VF\n");
3050     }
3051
3052   return true;
3053 }
3054
3055 /* Function vect_prune_runtime_alias_test_list.
3056
3057    Prune a list of ddrs to be tested at run-time by versioning for alias.
3058    Merge several alias checks into one if possible.
3059    Return FALSE if resulting list of ddrs is longer then allowed by
3060    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3061
3062 bool
3063 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3064 {
3065   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3066   hash_set <tree_pair_hash> compared_objects;
3067
3068   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3069   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3070     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3071   vec<vec_object_pair> &check_unequal_addrs
3072     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3073   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3074   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3075
3076   ddr_p ddr;
3077   unsigned int i;
3078   tree length_factor;
3079
3080   if (dump_enabled_p ())
3081     dump_printf_loc (MSG_NOTE, vect_location,
3082                      "=== vect_prune_runtime_alias_test_list ===\n");
3083
3084   if (may_alias_ddrs.is_empty ())
3085     return true;
3086
3087   comp_alias_ddrs.create (may_alias_ddrs.length ());
3088
3089   unsigned int loop_depth
3090     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3091                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3092
3093   /* First, we collect all data ref pairs for aliasing checks.  */
3094   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3095     {
3096       int comp_res;
3097       struct data_reference *dr_a, *dr_b;
3098       gimple *dr_group_first_a, *dr_group_first_b;
3099       tree segment_length_a, segment_length_b;
3100       gimple *stmt_a, *stmt_b;
3101
3102       /* Ignore the alias if the VF we chose ended up being no greater
3103          than the dependence distance.  */
3104       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3105         continue;
3106
3107       if (DDR_OBJECT_A (ddr))
3108         {
3109           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3110           if (!compared_objects.add (new_pair))
3111             {
3112               if (dump_enabled_p ())
3113                 {
3114                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3115                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3116                   dump_printf (MSG_NOTE, " and ");
3117                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3118                   dump_printf (MSG_NOTE, " have different addresses\n");
3119                 }
3120               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3121             }
3122           continue;
3123         }
3124
3125       dr_a = DDR_A (ddr);
3126       stmt_a = DR_STMT (DDR_A (ddr));
3127       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3128       if (dr_group_first_a)
3129         {
3130           stmt_a = dr_group_first_a;
3131           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3132         }
3133
3134       dr_b = DDR_B (ddr);
3135       stmt_b = DR_STMT (DDR_B (ddr));
3136       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3137       if (dr_group_first_b)
3138         {
3139           stmt_b = dr_group_first_b;
3140           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3141         }
3142
3143       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3144         length_factor = scalar_loop_iters;
3145       else
3146         length_factor = size_int (vect_factor);
3147       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3148       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3149
3150       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3151                                         DR_BASE_ADDRESS (dr_b));
3152       if (comp_res == 0)
3153         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3154                                           DR_OFFSET (dr_b));
3155
3156       /* Alias is known at compilation time.  */
3157       if (comp_res == 0
3158           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3159           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3160           && TREE_CODE (segment_length_a) == INTEGER_CST
3161           && TREE_CODE (segment_length_b) == INTEGER_CST)
3162         {
3163           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3164             continue;
3165
3166           if (dump_enabled_p ())
3167             dump_printf_loc (MSG_NOTE, vect_location,
3168                              "not vectorized: compilation time alias.\n");
3169
3170           return false;
3171         }
3172
3173       dr_with_seg_len_pair_t dr_with_seg_len_pair
3174           (dr_with_seg_len (dr_a, segment_length_a),
3175            dr_with_seg_len (dr_b, segment_length_b));
3176
3177       /* Canonicalize pairs by sorting the two DR members.  */
3178       if (comp_res > 0)
3179         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3180
3181       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3182     }
3183
3184   prune_runtime_alias_test_list (&comp_alias_ddrs,
3185                                  (unsigned HOST_WIDE_INT) vect_factor);
3186
3187   unsigned int count = (comp_alias_ddrs.length ()
3188                         + check_unequal_addrs.length ());
3189   dump_printf_loc (MSG_NOTE, vect_location,
3190                    "improved number of alias checks from %d to %d\n",
3191                    may_alias_ddrs.length (), count);
3192   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3193     {
3194       if (dump_enabled_p ())
3195         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196                          "number of versioning for alias "
3197                          "run-time tests exceeds %d "
3198                          "(--param vect-max-version-for-alias-checks)\n",
3199                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3200       return false;
3201     }
3202
3203   return true;
3204 }
3205
3206 /* Return true if a non-affine read or write in STMT is suitable for a
3207    gather load or scatter store.  Describe the operation in *INFO if so.  */
3208
3209 bool
3210 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3211                            gather_scatter_info *info)
3212 {
3213   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3214   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3215   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3216   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3217   tree offtype = NULL_TREE;
3218   tree decl, base, off;
3219   machine_mode pmode;
3220   int punsignedp, reversep, pvolatilep = 0;
3221
3222   base = DR_REF (dr);
3223   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3224      see if we can use the def stmt of the address.  */
3225   if (is_gimple_call (stmt)
3226       && gimple_call_internal_p (stmt)
3227       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3228           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3229       && TREE_CODE (base) == MEM_REF
3230       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3231       && integer_zerop (TREE_OPERAND (base, 1))
3232       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3233     {
3234       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3235       if (is_gimple_assign (def_stmt)
3236           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3237         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3238     }
3239
3240   /* The gather and scatter builtins need address of the form
3241      loop_invariant + vector * {1, 2, 4, 8}
3242      or
3243      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3244      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3245      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3246      multiplications and additions in it.  To get a vector, we need
3247      a single SSA_NAME that will be defined in the loop and will
3248      contain everything that is not loop invariant and that can be
3249      vectorized.  The following code attempts to find such a preexistng
3250      SSA_NAME OFF and put the loop invariants into a tree BASE
3251      that can be gimplified before the loop.  */
3252   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3253                               &punsignedp, &reversep, &pvolatilep);
3254   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3255
3256   if (TREE_CODE (base) == MEM_REF)
3257     {
3258       if (!integer_zerop (TREE_OPERAND (base, 1)))
3259         {
3260           if (off == NULL_TREE)
3261             {
3262               offset_int moff = mem_ref_offset (base);
3263               off = wide_int_to_tree (sizetype, moff);
3264             }
3265           else
3266             off = size_binop (PLUS_EXPR, off,
3267                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3268         }
3269       base = TREE_OPERAND (base, 0);
3270     }
3271   else
3272     base = build_fold_addr_expr (base);
3273
3274   if (off == NULL_TREE)
3275     off = size_zero_node;
3276
3277   /* If base is not loop invariant, either off is 0, then we start with just
3278      the constant offset in the loop invariant BASE and continue with base
3279      as OFF, otherwise give up.
3280      We could handle that case by gimplifying the addition of base + off
3281      into some SSA_NAME and use that as off, but for now punt.  */
3282   if (!expr_invariant_in_loop_p (loop, base))
3283     {
3284       if (!integer_zerop (off))
3285         return false;
3286       off = base;
3287       base = size_int (pbitpos / BITS_PER_UNIT);
3288     }
3289   /* Otherwise put base + constant offset into the loop invariant BASE
3290      and continue with OFF.  */
3291   else
3292     {
3293       base = fold_convert (sizetype, base);
3294       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3295     }
3296
3297   /* OFF at this point may be either a SSA_NAME or some tree expression
3298      from get_inner_reference.  Try to peel off loop invariants from it
3299      into BASE as long as possible.  */
3300   STRIP_NOPS (off);
3301   while (offtype == NULL_TREE)
3302     {
3303       enum tree_code code;
3304       tree op0, op1, add = NULL_TREE;
3305
3306       if (TREE_CODE (off) == SSA_NAME)
3307         {
3308           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3309
3310           if (expr_invariant_in_loop_p (loop, off))
3311             return false;
3312
3313           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3314             break;
3315
3316           op0 = gimple_assign_rhs1 (def_stmt);
3317           code = gimple_assign_rhs_code (def_stmt);
3318           op1 = gimple_assign_rhs2 (def_stmt);
3319         }
3320       else
3321         {
3322           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3323             return false;
3324           code = TREE_CODE (off);
3325           extract_ops_from_tree (off, &code, &op0, &op1);
3326         }
3327       switch (code)
3328         {
3329         case POINTER_PLUS_EXPR:
3330         case PLUS_EXPR:
3331           if (expr_invariant_in_loop_p (loop, op0))
3332             {
3333               add = op0;
3334               off = op1;
3335             do_add:
3336               add = fold_convert (sizetype, add);
3337               if (scale != 1)
3338                 add = size_binop (MULT_EXPR, add, size_int (scale));
3339               base = size_binop (PLUS_EXPR, base, add);
3340               continue;
3341             }
3342           if (expr_invariant_in_loop_p (loop, op1))
3343             {
3344               add = op1;
3345               off = op0;
3346               goto do_add;
3347             }
3348           break;
3349         case MINUS_EXPR:
3350           if (expr_invariant_in_loop_p (loop, op1))
3351             {
3352               add = fold_convert (sizetype, op1);
3353               add = size_binop (MINUS_EXPR, size_zero_node, add);
3354               off = op0;
3355               goto do_add;
3356             }
3357           break;
3358         case MULT_EXPR:
3359           if (scale == 1 && tree_fits_shwi_p (op1))
3360             {
3361               scale = tree_to_shwi (op1);
3362               off = op0;
3363               continue;
3364             }
3365           break;
3366         case SSA_NAME:
3367           off = op0;
3368           continue;
3369         CASE_CONVERT:
3370           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3371               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3372             break;
3373           if (TYPE_PRECISION (TREE_TYPE (op0))
3374               == TYPE_PRECISION (TREE_TYPE (off)))
3375             {
3376               off = op0;
3377               continue;
3378             }
3379           if (TYPE_PRECISION (TREE_TYPE (op0))
3380               < TYPE_PRECISION (TREE_TYPE (off)))
3381             {
3382               off = op0;
3383               offtype = TREE_TYPE (off);
3384               STRIP_NOPS (off);
3385               continue;
3386             }
3387           break;
3388         default:
3389           break;
3390         }
3391       break;
3392     }
3393
3394   /* If at the end OFF still isn't a SSA_NAME or isn't
3395      defined in the loop, punt.  */
3396   if (TREE_CODE (off) != SSA_NAME
3397       || expr_invariant_in_loop_p (loop, off))
3398     return false;
3399
3400   if (offtype == NULL_TREE)
3401     offtype = TREE_TYPE (off);
3402
3403   if (DR_IS_READ (dr))
3404     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3405                                              offtype, scale);
3406   else
3407     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3408                                               offtype, scale);
3409
3410   if (decl == NULL_TREE)
3411     return false;
3412
3413   info->decl = decl;
3414   info->base = base;
3415   info->offset = off;
3416   info->offset_dt = vect_unknown_def_type;
3417   info->offset_vectype = NULL_TREE;
3418   info->scale = scale;
3419   return true;
3420 }
3421
3422 /* Function vect_analyze_data_refs.
3423
3424   Find all the data references in the loop or basic block.
3425
3426    The general structure of the analysis of data refs in the vectorizer is as
3427    follows:
3428    1- vect_analyze_data_refs(loop/bb): call
3429       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3430       in the loop/bb and their dependences.
3431    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3432    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3433    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3434
3435 */
3436
3437 bool
3438 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3439 {
3440   struct loop *loop = NULL;
3441   unsigned int i;
3442   struct data_reference *dr;
3443   tree scalar_type;
3444
3445   if (dump_enabled_p ())
3446     dump_printf_loc (MSG_NOTE, vect_location,
3447                      "=== vect_analyze_data_refs ===\n");
3448
3449   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3450     loop = LOOP_VINFO_LOOP (loop_vinfo);
3451
3452   /* Go through the data-refs, check that the analysis succeeded.  Update
3453      pointer from stmt_vec_info struct to DR and vectype.  */
3454
3455   vec<data_reference_p> datarefs = vinfo->datarefs;
3456   FOR_EACH_VEC_ELT (datarefs, i, dr)
3457     {
3458       gimple *stmt;
3459       stmt_vec_info stmt_info;
3460       tree base, offset, init;
3461       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3462       bool simd_lane_access = false;
3463       int vf;
3464
3465 again:
3466       if (!dr || !DR_REF (dr))
3467         {
3468           if (dump_enabled_p ())
3469             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3470                              "not vectorized: unhandled data-ref\n");
3471           return false;
3472         }
3473
3474       stmt = DR_STMT (dr);
3475       stmt_info = vinfo_for_stmt (stmt);
3476
3477       /* Discard clobbers from the dataref vector.  We will remove
3478          clobber stmts during vectorization.  */
3479       if (gimple_clobber_p (stmt))
3480         {
3481           free_data_ref (dr);
3482           if (i == datarefs.length () - 1)
3483             {
3484               datarefs.pop ();
3485               break;
3486             }
3487           datarefs.ordered_remove (i);
3488           dr = datarefs[i];
3489           goto again;
3490         }
3491
3492       /* Check that analysis of the data-ref succeeded.  */
3493       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3494           || !DR_STEP (dr))
3495         {
3496           bool maybe_gather
3497             = DR_IS_READ (dr)
3498               && !TREE_THIS_VOLATILE (DR_REF (dr))
3499               && targetm.vectorize.builtin_gather != NULL;
3500           bool maybe_scatter
3501             = DR_IS_WRITE (dr)
3502               && !TREE_THIS_VOLATILE (DR_REF (dr))
3503               && targetm.vectorize.builtin_scatter != NULL;
3504           bool maybe_simd_lane_access
3505             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3506
3507           /* If target supports vector gather loads or scatter stores, or if
3508              this might be a SIMD lane access, see if they can't be used.  */
3509           if (is_a <loop_vec_info> (vinfo)
3510               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3511               && !nested_in_vect_loop_p (loop, stmt))
3512             {
3513               struct data_reference *newdr
3514                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3515                                    DR_REF (dr), stmt, !maybe_scatter,
3516                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3517               gcc_assert (newdr != NULL && DR_REF (newdr));
3518               if (DR_BASE_ADDRESS (newdr)
3519                   && DR_OFFSET (newdr)
3520                   && DR_INIT (newdr)
3521                   && DR_STEP (newdr)
3522                   && integer_zerop (DR_STEP (newdr)))
3523                 {
3524                   if (maybe_simd_lane_access)
3525                     {
3526                       tree off = DR_OFFSET (newdr);
3527                       STRIP_NOPS (off);
3528                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3529                           && TREE_CODE (off) == MULT_EXPR
3530                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3531                         {
3532                           tree step = TREE_OPERAND (off, 1);
3533                           off = TREE_OPERAND (off, 0);
3534                           STRIP_NOPS (off);
3535                           if (CONVERT_EXPR_P (off)
3536                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3537                                                                           0)))
3538                                  < TYPE_PRECISION (TREE_TYPE (off)))
3539                             off = TREE_OPERAND (off, 0);
3540                           if (TREE_CODE (off) == SSA_NAME)
3541                             {
3542                               gimple *def = SSA_NAME_DEF_STMT (off);
3543                               tree reft = TREE_TYPE (DR_REF (newdr));
3544                               if (is_gimple_call (def)
3545                                   && gimple_call_internal_p (def)
3546                                   && (gimple_call_internal_fn (def)
3547                                       == IFN_GOMP_SIMD_LANE))
3548                                 {
3549                                   tree arg = gimple_call_arg (def, 0);
3550                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3551                                   arg = SSA_NAME_VAR (arg);
3552                                   if (arg == loop->simduid
3553                                       /* For now.  */
3554                                       && tree_int_cst_equal
3555                                            (TYPE_SIZE_UNIT (reft),
3556                                             step))
3557                                     {
3558                                       DR_OFFSET (newdr) = ssize_int (0);
3559                                       DR_STEP (newdr) = step;
3560                                       DR_OFFSET_ALIGNMENT (newdr)
3561                                         = BIGGEST_ALIGNMENT;
3562                                       DR_STEP_ALIGNMENT (newdr)
3563                                         = highest_pow2_factor (step);
3564                                       dr = newdr;
3565                                       simd_lane_access = true;
3566                                     }
3567                                 }
3568                             }
3569                         }
3570                     }
3571                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3572                     {
3573                       dr = newdr;
3574                       if (maybe_gather)
3575                         gatherscatter = GATHER;
3576                       else
3577                         gatherscatter = SCATTER;
3578                     }
3579                 }
3580               if (gatherscatter == SG_NONE && !simd_lane_access)
3581                 free_data_ref (newdr);
3582             }
3583
3584           if (gatherscatter == SG_NONE && !simd_lane_access)
3585             {
3586               if (dump_enabled_p ())
3587                 {
3588                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589                                    "not vectorized: data ref analysis "
3590                                    "failed ");
3591                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592                 }
3593
3594               if (is_a <bb_vec_info> (vinfo))
3595                 break;
3596
3597               return false;
3598             }
3599         }
3600
3601       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3602         {
3603           if (dump_enabled_p ())
3604             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605                              "not vectorized: base addr of dr is a "
3606                              "constant\n");
3607
3608           if (is_a <bb_vec_info> (vinfo))
3609             break;
3610
3611           if (gatherscatter != SG_NONE || simd_lane_access)
3612             free_data_ref (dr);
3613           return false;
3614         }
3615
3616       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3617         {
3618           if (dump_enabled_p ())
3619             {
3620               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621                                "not vectorized: volatile type ");
3622               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623             }
3624
3625           if (is_a <bb_vec_info> (vinfo))
3626             break;
3627
3628           return false;
3629         }
3630
3631       if (stmt_can_throw_internal (stmt))
3632         {
3633           if (dump_enabled_p ())
3634             {
3635               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636                                "not vectorized: statement can throw an "
3637                                "exception ");
3638               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3639             }
3640
3641           if (is_a <bb_vec_info> (vinfo))
3642             break;
3643
3644           if (gatherscatter != SG_NONE || simd_lane_access)
3645             free_data_ref (dr);
3646           return false;
3647         }
3648
3649       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3650           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3651         {
3652           if (dump_enabled_p ())
3653             {
3654               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655                                "not vectorized: statement is bitfield "
3656                                "access ");
3657               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658             }
3659
3660           if (is_a <bb_vec_info> (vinfo))
3661             break;
3662
3663           if (gatherscatter != SG_NONE || simd_lane_access)
3664             free_data_ref (dr);
3665           return false;
3666         }
3667
3668       base = unshare_expr (DR_BASE_ADDRESS (dr));
3669       offset = unshare_expr (DR_OFFSET (dr));
3670       init = unshare_expr (DR_INIT (dr));
3671
3672       if (is_gimple_call (stmt)
3673           && (!gimple_call_internal_p (stmt)
3674               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3675                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3676         {
3677           if (dump_enabled_p ())
3678             {
3679               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3680                                "not vectorized: dr in a call ");
3681               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682             }
3683
3684           if (is_a <bb_vec_info> (vinfo))
3685             break;
3686
3687           if (gatherscatter != SG_NONE || simd_lane_access)
3688             free_data_ref (dr);
3689           return false;
3690         }
3691
3692       /* Update DR field in stmt_vec_info struct.  */
3693
3694       /* If the dataref is in an inner-loop of the loop that is considered for
3695          for vectorization, we also want to analyze the access relative to
3696          the outer-loop (DR contains information only relative to the
3697          inner-most enclosing loop).  We do that by building a reference to the
3698          first location accessed by the inner-loop, and analyze it relative to
3699          the outer-loop.  */
3700       if (loop && nested_in_vect_loop_p (loop, stmt))
3701         {
3702           /* Build a reference to the first location accessed by the
3703              inner loop: *(BASE + INIT + OFFSET).  By construction,
3704              this address must be invariant in the inner loop, so we
3705              can consider it as being used in the outer loop.  */
3706           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3707                                           init, offset);
3708           tree init_addr = fold_build_pointer_plus (base, init_offset);
3709           tree init_ref = build_fold_indirect_ref (init_addr);
3710
3711           if (dump_enabled_p ())
3712             {
3713               dump_printf_loc (MSG_NOTE, vect_location,
3714                                "analyze in outer loop: ");
3715               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3716               dump_printf (MSG_NOTE, "\n");
3717             }
3718
3719           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3720                                      init_ref, loop))
3721             /* dr_analyze_innermost already explained the failure.  */
3722             return false;
3723
3724           if (dump_enabled_p ())
3725             {
3726               dump_printf_loc (MSG_NOTE, vect_location,
3727                                "\touter base_address: ");
3728               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3729                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3730               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3731               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3732                                  STMT_VINFO_DR_OFFSET (stmt_info));
3733               dump_printf (MSG_NOTE,
3734                            "\n\touter constant offset from base address: ");
3735               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3736                                  STMT_VINFO_DR_INIT (stmt_info));
3737               dump_printf (MSG_NOTE, "\n\touter step: ");
3738               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3739                                  STMT_VINFO_DR_STEP (stmt_info));
3740               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3741                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3742               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3743                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3744               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3745                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3746               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3747                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3748             }
3749         }
3750
3751       if (STMT_VINFO_DATA_REF (stmt_info))
3752         {
3753           if (dump_enabled_p ())
3754             {
3755               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3756                                "not vectorized: more than one data ref "
3757                                "in stmt: ");
3758               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3759             }
3760
3761           if (is_a <bb_vec_info> (vinfo))
3762             break;
3763
3764           if (gatherscatter != SG_NONE || simd_lane_access)
3765             free_data_ref (dr);
3766           return false;
3767         }
3768
3769       STMT_VINFO_DATA_REF (stmt_info) = dr;
3770       if (simd_lane_access)
3771         {
3772           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3773           free_data_ref (datarefs[i]);
3774           datarefs[i] = dr;
3775         }
3776
3777       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3778           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3779           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3780         {
3781           if (dump_enabled_p ())
3782             {
3783               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784                                "not vectorized: base object not addressable "
3785                                "for stmt: ");
3786               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3787             }
3788           if (is_a <bb_vec_info> (vinfo))
3789             {
3790               /* In BB vectorization the ref can still participate
3791                  in dependence analysis, we just can't vectorize it.  */
3792               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3793               continue;
3794             }
3795           return false;
3796         }
3797
3798       /* Set vectype for STMT.  */
3799       scalar_type = TREE_TYPE (DR_REF (dr));
3800       STMT_VINFO_VECTYPE (stmt_info)
3801         = get_vectype_for_scalar_type (scalar_type);
3802       if (!STMT_VINFO_VECTYPE (stmt_info))
3803         {
3804           if (dump_enabled_p ())
3805             {
3806               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807                                "not vectorized: no vectype for stmt: ");
3808               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3809               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3810               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3811                                  scalar_type);
3812               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3813             }
3814
3815           if (is_a <bb_vec_info> (vinfo))
3816             {
3817               /* No vector type is fine, the ref can still participate
3818                  in dependence analysis, we just can't vectorize it.  */
3819               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3820               continue;
3821             }
3822
3823           if (gatherscatter != SG_NONE || simd_lane_access)
3824             {
3825               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3826               if (gatherscatter != SG_NONE)
3827                 free_data_ref (dr);
3828             }
3829           return false;
3830         }
3831       else
3832         {
3833           if (dump_enabled_p ())
3834             {
3835               dump_printf_loc (MSG_NOTE, vect_location,
3836                                "got vectype for stmt: ");
3837               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3838               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3839                                  STMT_VINFO_VECTYPE (stmt_info));
3840               dump_printf (MSG_NOTE, "\n");
3841             }
3842         }
3843
3844       /* Adjust the minimal vectorization factor according to the
3845          vector type.  */
3846       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3847       if (vf > *min_vf)
3848         *min_vf = vf;
3849
3850       if (gatherscatter != SG_NONE)
3851         {
3852           gather_scatter_info gs_info;
3853           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3854                                           &gs_info)
3855               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3856             {
3857               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3858               free_data_ref (dr);
3859               if (dump_enabled_p ())
3860                 {
3861                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3862                                    (gatherscatter == GATHER) ?
3863                                    "not vectorized: not suitable for gather "
3864                                    "load " :
3865                                    "not vectorized: not suitable for scatter "
3866                                    "store ");
3867                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3868                 }
3869               return false;
3870             }
3871
3872           free_data_ref (datarefs[i]);
3873           datarefs[i] = dr;
3874           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3875         }
3876
3877       else if (is_a <loop_vec_info> (vinfo)
3878                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3879         {
3880           if (nested_in_vect_loop_p (loop, stmt))
3881             {
3882               if (dump_enabled_p ())
3883                 {
3884                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3885                                    "not vectorized: not suitable for strided "
3886                                    "load ");
3887                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3888                 }
3889               return false;
3890             }
3891           STMT_VINFO_STRIDED_P (stmt_info) = true;
3892         }
3893     }
3894
3895   /* If we stopped analysis at the first dataref we could not analyze
3896      when trying to vectorize a basic-block mark the rest of the datarefs
3897      as not vectorizable and truncate the vector of datarefs.  That
3898      avoids spending useless time in analyzing their dependence.  */
3899   if (i != datarefs.length ())
3900     {
3901       gcc_assert (is_a <bb_vec_info> (vinfo));
3902       for (unsigned j = i; j < datarefs.length (); ++j)
3903         {
3904           data_reference_p dr = datarefs[j];
3905           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3906           free_data_ref (dr);
3907         }
3908       datarefs.truncate (i);
3909     }
3910
3911   return true;
3912 }
3913
3914
3915 /* Function vect_get_new_vect_var.
3916
3917    Returns a name for a new variable.  The current naming scheme appends the
3918    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3919    the name of vectorizer generated variables, and appends that to NAME if
3920    provided.  */
3921
3922 tree
3923 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3924 {
3925   const char *prefix;
3926   tree new_vect_var;
3927
3928   switch (var_kind)
3929   {
3930   case vect_simple_var:
3931     prefix = "vect";
3932     break;
3933   case vect_scalar_var:
3934     prefix = "stmp";
3935     break;
3936   case vect_mask_var:
3937     prefix = "mask";
3938     break;
3939   case vect_pointer_var:
3940     prefix = "vectp";
3941     break;
3942   default:
3943     gcc_unreachable ();
3944   }
3945
3946   if (name)
3947     {
3948       char* tmp = concat (prefix, "_", name, NULL);
3949       new_vect_var = create_tmp_reg (type, tmp);
3950       free (tmp);
3951     }
3952   else
3953     new_vect_var = create_tmp_reg (type, prefix);
3954
3955   return new_vect_var;
3956 }
3957
3958 /* Like vect_get_new_vect_var but return an SSA name.  */
3959
3960 tree
3961 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3962 {
3963   const char *prefix;
3964   tree new_vect_var;
3965
3966   switch (var_kind)
3967   {
3968   case vect_simple_var:
3969     prefix = "vect";
3970     break;
3971   case vect_scalar_var:
3972     prefix = "stmp";
3973     break;
3974   case vect_pointer_var:
3975     prefix = "vectp";
3976     break;
3977   default:
3978     gcc_unreachable ();
3979   }
3980
3981   if (name)
3982     {
3983       char* tmp = concat (prefix, "_", name, NULL);
3984       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3985       free (tmp);
3986     }
3987   else
3988     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3989
3990   return new_vect_var;
3991 }
3992
3993 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3994
3995 static void
3996 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3997                                   stmt_vec_info stmt_info)
3998 {
3999   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4000   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4001   int misalign = DR_MISALIGNMENT (dr);
4002   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4003     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4004   else
4005     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4006 }
4007
4008 /* Function vect_create_addr_base_for_vector_ref.
4009
4010    Create an expression that computes the address of the first memory location
4011    that will be accessed for a data reference.
4012
4013    Input:
4014    STMT: The statement containing the data reference.
4015    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4016    OFFSET: Optional. If supplied, it is be added to the initial address.
4017    LOOP:    Specify relative to which loop-nest should the address be computed.
4018             For example, when the dataref is in an inner-loop nested in an
4019             outer-loop that is now being vectorized, LOOP can be either the
4020             outer-loop, or the inner-loop.  The first memory location accessed
4021             by the following dataref ('in' points to short):
4022
4023                 for (i=0; i<N; i++)
4024                    for (j=0; j<M; j++)
4025                      s += in[i+j]
4026
4027             is as follows:
4028             if LOOP=i_loop:     &in             (relative to i_loop)
4029             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4030    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4031             initial address.  Unlike OFFSET, which is number of elements to
4032             be added, BYTE_OFFSET is measured in bytes.
4033
4034    Output:
4035    1. Return an SSA_NAME whose value is the address of the memory location of
4036       the first vector of the data reference.
4037    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4038       these statement(s) which define the returned SSA_NAME.
4039
4040    FORNOW: We are only handling array accesses with step 1.  */
4041
4042 tree
4043 vect_create_addr_base_for_vector_ref (gimple *stmt,
4044                                       gimple_seq *new_stmt_list,
4045                                       tree offset,
4046                                       tree byte_offset)
4047 {
4048   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4049   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4050   const char *base_name;
4051   tree addr_base;
4052   tree dest;
4053   gimple_seq seq = NULL;
4054   tree vect_ptr_type;
4055   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4056   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4057   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4058
4059   tree data_ref_base = unshare_expr (drb->base_address);
4060   tree base_offset = unshare_expr (drb->offset);
4061   tree init = unshare_expr (drb->init);
4062
4063   if (loop_vinfo)
4064     base_name = get_name (data_ref_base);
4065   else
4066     {
4067       base_offset = ssize_int (0);
4068       init = ssize_int (0);
4069       base_name = get_name (DR_REF (dr));
4070     }
4071
4072   /* Create base_offset */
4073   base_offset = size_binop (PLUS_EXPR,
4074                             fold_convert (sizetype, base_offset),
4075                             fold_convert (sizetype, init));
4076
4077   if (offset)
4078     {
4079       offset = fold_build2 (MULT_EXPR, sizetype,
4080                             fold_convert (sizetype, offset), step);
4081       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4082                                  base_offset, offset);
4083     }
4084   if (byte_offset)
4085     {
4086       byte_offset = fold_convert (sizetype, byte_offset);
4087       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4088                                  base_offset, byte_offset);
4089     }
4090
4091   /* base + base_offset */
4092   if (loop_vinfo)
4093     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4094   else
4095     {
4096       addr_base = build1 (ADDR_EXPR,
4097                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4098                           unshare_expr (DR_REF (dr)));
4099     }
4100
4101   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4102   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4103   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4104   gimple_seq_add_seq (new_stmt_list, seq);
4105
4106   if (DR_PTR_INFO (dr)
4107       && TREE_CODE (addr_base) == SSA_NAME
4108       && !SSA_NAME_PTR_INFO (addr_base))
4109     {
4110       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4111       if (offset || byte_offset)
4112         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4113     }
4114
4115   if (dump_enabled_p ())
4116     {
4117       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4118       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4119       dump_printf (MSG_NOTE, "\n");
4120     }
4121
4122   return addr_base;
4123 }
4124
4125
4126 /* Function vect_create_data_ref_ptr.
4127
4128    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4129    location accessed in the loop by STMT, along with the def-use update
4130    chain to appropriately advance the pointer through the loop iterations.
4131    Also set aliasing information for the pointer.  This pointer is used by
4132    the callers to this function to create a memory reference expression for
4133    vector load/store access.
4134
4135    Input:
4136    1. STMT: a stmt that references memory. Expected to be of the form
4137          GIMPLE_ASSIGN <name, data-ref> or
4138          GIMPLE_ASSIGN <data-ref, name>.
4139    2. AGGR_TYPE: the type of the reference, which should be either a vector
4140         or an array.
4141    3. AT_LOOP: the loop where the vector memref is to be created.
4142    4. OFFSET (optional): an offset to be added to the initial address accessed
4143         by the data-ref in STMT.
4144    5. BSI: location where the new stmts are to be placed if there is no loop
4145    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4146         pointing to the initial address.
4147    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4148         to the initial address accessed by the data-ref in STMT.  This is
4149         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4150         in bytes.
4151
4152    Output:
4153    1. Declare a new ptr to vector_type, and have it point to the base of the
4154       data reference (initial addressed accessed by the data reference).
4155       For example, for vector of type V8HI, the following code is generated:
4156
4157       v8hi *ap;
4158       ap = (v8hi *)initial_address;
4159
4160       if OFFSET is not supplied:
4161          initial_address = &a[init];
4162       if OFFSET is supplied:
4163          initial_address = &a[init + OFFSET];
4164       if BYTE_OFFSET is supplied:
4165          initial_address = &a[init] + BYTE_OFFSET;
4166
4167       Return the initial_address in INITIAL_ADDRESS.
4168
4169    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4170       update the pointer in each iteration of the loop.
4171
4172       Return the increment stmt that updates the pointer in PTR_INCR.
4173
4174    3. Set INV_P to true if the access pattern of the data reference in the
4175       vectorized loop is invariant.  Set it to false otherwise.
4176
4177    4. Return the pointer.  */
4178
4179 tree
4180 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4181                           tree offset, tree *initial_address,
4182                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4183                           bool only_init, bool *inv_p, tree byte_offset)
4184 {
4185   const char *base_name;
4186   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4187   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4188   struct loop *loop = NULL;
4189   bool nested_in_vect_loop = false;
4190   struct loop *containing_loop = NULL;
4191   tree aggr_ptr_type;
4192   tree aggr_ptr;
4193   tree new_temp;
4194   gimple_seq new_stmt_list = NULL;
4195   edge pe = NULL;
4196   basic_block new_bb;
4197   tree aggr_ptr_init;
4198   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4199   tree aptr;
4200   gimple_stmt_iterator incr_gsi;
4201   bool insert_after;
4202   tree indx_before_incr, indx_after_incr;
4203   gimple *incr;
4204   tree step;
4205   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4206
4207   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4208               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4209
4210   if (loop_vinfo)
4211     {
4212       loop = LOOP_VINFO_LOOP (loop_vinfo);
4213       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4214       containing_loop = (gimple_bb (stmt))->loop_father;
4215       pe = loop_preheader_edge (loop);
4216     }
4217   else
4218     {
4219       gcc_assert (bb_vinfo);
4220       only_init = true;
4221       *ptr_incr = NULL;
4222     }
4223
4224   /* Check the step (evolution) of the load in LOOP, and record
4225      whether it's invariant.  */
4226   step = vect_dr_behavior (dr)->step;
4227   if (integer_zerop (step))
4228     *inv_p = true;
4229   else
4230     *inv_p = false;
4231
4232   /* Create an expression for the first address accessed by this load
4233      in LOOP.  */
4234   base_name = get_name (DR_BASE_ADDRESS (dr));
4235
4236   if (dump_enabled_p ())
4237     {
4238       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4239       dump_printf_loc (MSG_NOTE, vect_location,
4240                        "create %s-pointer variable to type: ",
4241                        get_tree_code_name (TREE_CODE (aggr_type)));
4242       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4243       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4244         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4245       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4246         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4247       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4248         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4249       else
4250         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4251       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4252       dump_printf (MSG_NOTE, "\n");
4253     }
4254
4255   /* (1) Create the new aggregate-pointer variable.
4256      Vector and array types inherit the alias set of their component
4257      type by default so we need to use a ref-all pointer if the data
4258      reference does not conflict with the created aggregated data
4259      reference because it is not addressable.  */
4260   bool need_ref_all = false;
4261   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4262                               get_alias_set (DR_REF (dr))))
4263     need_ref_all = true;
4264   /* Likewise for any of the data references in the stmt group.  */
4265   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4266     {
4267       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4268       do
4269         {
4270           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4271           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4272           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4273                                       get_alias_set (DR_REF (sdr))))
4274             {
4275               need_ref_all = true;
4276               break;
4277             }
4278           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4279         }
4280       while (orig_stmt);
4281     }
4282   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4283                                                need_ref_all);
4284   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4285
4286
4287   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4288      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4289      def-use update cycles for the pointer: one relative to the outer-loop
4290      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4291      to the inner-loop (which is the inner-most loop containing the dataref),
4292      and this is done be step (5) below.
4293
4294      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4295      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4296      redundant.  Steps (3),(4) create the following:
4297
4298         vp0 = &base_addr;
4299         LOOP:   vp1 = phi(vp0,vp2)
4300                 ...
4301                 ...
4302                 vp2 = vp1 + step
4303                 goto LOOP
4304
4305      If there is an inner-loop nested in loop, then step (5) will also be
4306      applied, and an additional update in the inner-loop will be created:
4307
4308         vp0 = &base_addr;
4309         LOOP:   vp1 = phi(vp0,vp2)
4310                 ...
4311         inner:     vp3 = phi(vp1,vp4)
4312                    vp4 = vp3 + inner_step
4313                    if () goto inner
4314                 ...
4315                 vp2 = vp1 + step
4316                 if () goto LOOP   */
4317
4318   /* (2) Calculate the initial address of the aggregate-pointer, and set
4319      the aggregate-pointer to point to it before the loop.  */
4320
4321   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4322
4323   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4324                                                    offset, byte_offset);
4325   if (new_stmt_list)
4326     {
4327       if (pe)
4328         {
4329           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4330           gcc_assert (!new_bb);
4331         }
4332       else
4333         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4334     }
4335
4336   *initial_address = new_temp;
4337   aggr_ptr_init = new_temp;
4338
4339   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4340      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4341      inner-loop nested in LOOP (during outer-loop vectorization).  */
4342
4343   /* No update in loop is required.  */
4344   if (only_init && (!loop_vinfo || at_loop == loop))
4345     aptr = aggr_ptr_init;
4346   else
4347     {
4348       /* The step of the aggregate pointer is the type size.  */
4349       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4350       /* One exception to the above is when the scalar step of the load in
4351          LOOP is zero. In this case the step here is also zero.  */
4352       if (*inv_p)
4353         iv_step = size_zero_node;
4354       else if (tree_int_cst_sgn (step) == -1)
4355         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4356
4357       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4358
4359       create_iv (aggr_ptr_init,
4360                  fold_convert (aggr_ptr_type, iv_step),
4361                  aggr_ptr, loop, &incr_gsi, insert_after,
4362                  &indx_before_incr, &indx_after_incr);
4363       incr = gsi_stmt (incr_gsi);
4364       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4365
4366       /* Copy the points-to information if it exists. */
4367       if (DR_PTR_INFO (dr))
4368         {
4369           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4370           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4371         }
4372       if (ptr_incr)
4373         *ptr_incr = incr;
4374
4375       aptr = indx_before_incr;
4376     }
4377
4378   if (!nested_in_vect_loop || only_init)
4379     return aptr;
4380
4381
4382   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4383      nested in LOOP, if exists.  */
4384
4385   gcc_assert (nested_in_vect_loop);
4386   if (!only_init)
4387     {
4388       standard_iv_increment_position (containing_loop, &incr_gsi,
4389                                       &insert_after);
4390       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4391                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4392                  &indx_after_incr);
4393       incr = gsi_stmt (incr_gsi);
4394       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4395
4396       /* Copy the points-to information if it exists. */
4397       if (DR_PTR_INFO (dr))
4398         {
4399           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4400           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4401         }
4402       if (ptr_incr)
4403         *ptr_incr = incr;
4404
4405       return indx_before_incr;
4406     }
4407   else
4408     gcc_unreachable ();
4409 }
4410
4411
4412 /* Function bump_vector_ptr
4413
4414    Increment a pointer (to a vector type) by vector-size. If requested,
4415    i.e. if PTR-INCR is given, then also connect the new increment stmt
4416    to the existing def-use update-chain of the pointer, by modifying
4417    the PTR_INCR as illustrated below:
4418
4419    The pointer def-use update-chain before this function:
4420                         DATAREF_PTR = phi (p_0, p_2)
4421                         ....
4422         PTR_INCR:       p_2 = DATAREF_PTR + step
4423
4424    The pointer def-use update-chain after this function:
4425                         DATAREF_PTR = phi (p_0, p_2)
4426                         ....
4427                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4428                         ....
4429         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4430
4431    Input:
4432    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4433                  in the loop.
4434    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4435               the loop.  The increment amount across iterations is expected
4436               to be vector_size.
4437    BSI - location where the new update stmt is to be placed.
4438    STMT - the original scalar memory-access stmt that is being vectorized.
4439    BUMP - optional. The offset by which to bump the pointer. If not given,
4440           the offset is assumed to be vector_size.
4441
4442    Output: Return NEW_DATAREF_PTR as illustrated above.
4443
4444 */
4445
4446 tree
4447 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4448                  gimple *stmt, tree bump)
4449 {
4450   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4451   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4452   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4453   tree update = TYPE_SIZE_UNIT (vectype);
4454   gassign *incr_stmt;
4455   ssa_op_iter iter;
4456   use_operand_p use_p;
4457   tree new_dataref_ptr;
4458
4459   if (bump)
4460     update = bump;
4461
4462   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4463     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4464   else
4465     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4466   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4467                                    dataref_ptr, update);
4468   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4469
4470   /* Copy the points-to information if it exists. */
4471   if (DR_PTR_INFO (dr))
4472     {
4473       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4474       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4475     }
4476
4477   if (!ptr_incr)
4478     return new_dataref_ptr;
4479
4480   /* Update the vector-pointer's cross-iteration increment.  */
4481   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4482     {
4483       tree use = USE_FROM_PTR (use_p);
4484
4485       if (use == dataref_ptr)
4486         SET_USE (use_p, new_dataref_ptr);
4487       else
4488         gcc_assert (tree_int_cst_compare (use, update) == 0);
4489     }
4490
4491   return new_dataref_ptr;
4492 }
4493
4494
4495 /* Function vect_create_destination_var.
4496
4497    Create a new temporary of type VECTYPE.  */
4498
4499 tree
4500 vect_create_destination_var (tree scalar_dest, tree vectype)
4501 {
4502   tree vec_dest;
4503   const char *name;
4504   char *new_name;
4505   tree type;
4506   enum vect_var_kind kind;
4507
4508   kind = vectype
4509     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4510     ? vect_mask_var
4511     : vect_simple_var
4512     : vect_scalar_var;
4513   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4514
4515   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4516
4517   name = get_name (scalar_dest);
4518   if (name)
4519     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4520   else
4521     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4522   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4523   free (new_name);
4524
4525   return vec_dest;
4526 }
4527
4528 /* Function vect_grouped_store_supported.
4529
4530    Returns TRUE if interleave high and interleave low permutations
4531    are supported, and FALSE otherwise.  */
4532
4533 bool
4534 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4535 {
4536   machine_mode mode = TYPE_MODE (vectype);
4537
4538   /* vect_permute_store_chain requires the group size to be equal to 3 or
4539      be a power of two.  */
4540   if (count != 3 && exact_log2 (count) == -1)
4541     {
4542       if (dump_enabled_p ())
4543         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4544                          "the size of the group of accesses"
4545                          " is not a power of 2 or not eqaul to 3\n");
4546       return false;
4547     }
4548
4549   /* Check that the permutation is supported.  */
4550   if (VECTOR_MODE_P (mode))
4551     {
4552       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4553       auto_vec_perm_indices sel (nelt);
4554       sel.quick_grow (nelt);
4555
4556       if (count == 3)
4557         {
4558           unsigned int j0 = 0, j1 = 0, j2 = 0;
4559           unsigned int i, j;
4560
4561           for (j = 0; j < 3; j++)
4562             {
4563               int nelt0 = ((3 - j) * nelt) % 3;
4564               int nelt1 = ((3 - j) * nelt + 1) % 3;
4565               int nelt2 = ((3 - j) * nelt + 2) % 3;
4566               for (i = 0; i < nelt; i++)
4567                 {
4568                   if (3 * i + nelt0 < nelt)
4569                     sel[3 * i + nelt0] = j0++;
4570                   if (3 * i + nelt1 < nelt)
4571                     sel[3 * i + nelt1] = nelt + j1++;
4572                   if (3 * i + nelt2 < nelt)
4573                     sel[3 * i + nelt2] = 0;
4574                 }
4575               if (!can_vec_perm_p (mode, false, &sel))
4576                 {
4577                   if (dump_enabled_p ())
4578                     dump_printf (MSG_MISSED_OPTIMIZATION,
4579                                  "permutaion op not supported by target.\n");
4580                   return false;
4581                 }
4582
4583               for (i = 0; i < nelt; i++)
4584                 {
4585                   if (3 * i + nelt0 < nelt)
4586                     sel[3 * i + nelt0] = 3 * i + nelt0;
4587                   if (3 * i + nelt1 < nelt)
4588                     sel[3 * i + nelt1] = 3 * i + nelt1;
4589                   if (3 * i + nelt2 < nelt)
4590                     sel[3 * i + nelt2] = nelt + j2++;
4591                 }
4592               if (!can_vec_perm_p (mode, false, &sel))
4593                 {
4594                   if (dump_enabled_p ())
4595                     dump_printf (MSG_MISSED_OPTIMIZATION,
4596                                  "permutaion op not supported by target.\n");
4597                   return false;
4598                 }
4599             }
4600           return true;
4601         }
4602       else
4603         {
4604           /* If length is not equal to 3 then only power of 2 is supported.  */
4605           gcc_assert (pow2p_hwi (count));
4606
4607           for (i = 0; i < nelt / 2; i++)
4608             {
4609               sel[i * 2] = i;
4610               sel[i * 2 + 1] = i + nelt;
4611             }
4612           if (can_vec_perm_p (mode, false, &sel))
4613             {
4614               for (i = 0; i < nelt; i++)
4615                 sel[i] += nelt / 2;
4616               if (can_vec_perm_p (mode, false, &sel))
4617                 return true;
4618             }
4619         }
4620     }
4621
4622   if (dump_enabled_p ())
4623     dump_printf (MSG_MISSED_OPTIMIZATION,
4624                  "permutaion op not supported by target.\n");
4625   return false;
4626 }
4627
4628
4629 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4630    type VECTYPE.  */
4631
4632 bool
4633 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4634 {
4635   return vect_lanes_optab_supported_p ("vec_store_lanes",
4636                                        vec_store_lanes_optab,
4637                                        vectype, count);
4638 }
4639
4640
4641 /* Function vect_permute_store_chain.
4642
4643    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4644    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4645    the data correctly for the stores.  Return the final references for stores
4646    in RESULT_CHAIN.
4647
4648    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4649    The input is 4 vectors each containing 8 elements.  We assign a number to
4650    each element, the input sequence is:
4651
4652    1st vec:   0  1  2  3  4  5  6  7
4653    2nd vec:   8  9 10 11 12 13 14 15
4654    3rd vec:  16 17 18 19 20 21 22 23
4655    4th vec:  24 25 26 27 28 29 30 31
4656
4657    The output sequence should be:
4658
4659    1st vec:  0  8 16 24  1  9 17 25
4660    2nd vec:  2 10 18 26  3 11 19 27
4661    3rd vec:  4 12 20 28  5 13 21 30
4662    4th vec:  6 14 22 30  7 15 23 31
4663
4664    i.e., we interleave the contents of the four vectors in their order.
4665
4666    We use interleave_high/low instructions to create such output.  The input of
4667    each interleave_high/low operation is two vectors:
4668    1st vec    2nd vec
4669    0 1 2 3    4 5 6 7
4670    the even elements of the result vector are obtained left-to-right from the
4671    high/low elements of the first vector.  The odd elements of the result are
4672    obtained left-to-right from the high/low elements of the second vector.
4673    The output of interleave_high will be:   0 4 1 5
4674    and of interleave_low:                   2 6 3 7
4675
4676
4677    The permutation is done in log LENGTH stages.  In each stage interleave_high
4678    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4679    where the first argument is taken from the first half of DR_CHAIN and the
4680    second argument from it's second half.
4681    In our example,
4682
4683    I1: interleave_high (1st vec, 3rd vec)
4684    I2: interleave_low (1st vec, 3rd vec)
4685    I3: interleave_high (2nd vec, 4th vec)
4686    I4: interleave_low (2nd vec, 4th vec)
4687
4688    The output for the first stage is:
4689
4690    I1:  0 16  1 17  2 18  3 19
4691    I2:  4 20  5 21  6 22  7 23
4692    I3:  8 24  9 25 10 26 11 27
4693    I4: 12 28 13 29 14 30 15 31
4694
4695    The output of the second stage, i.e. the final result is:
4696
4697    I1:  0  8 16 24  1  9 17 25
4698    I2:  2 10 18 26  3 11 19 27
4699    I3:  4 12 20 28  5 13 21 30
4700    I4:  6 14 22 30  7 15 23 31.  */
4701
4702 void
4703 vect_permute_store_chain (vec<tree> dr_chain,
4704                           unsigned int length,
4705                           gimple *stmt,
4706                           gimple_stmt_iterator *gsi,
4707                           vec<tree> *result_chain)
4708 {
4709   tree vect1, vect2, high, low;
4710   gimple *perm_stmt;
4711   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4712   tree perm_mask_low, perm_mask_high;
4713   tree data_ref;
4714   tree perm3_mask_low, perm3_mask_high;
4715   unsigned int i, n, log_length = exact_log2 (length);
4716   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4717
4718   auto_vec_perm_indices sel (nelt);
4719   sel.quick_grow (nelt);
4720
4721   result_chain->quick_grow (length);
4722   memcpy (result_chain->address (), dr_chain.address (),
4723           length * sizeof (tree));
4724
4725   if (length == 3)
4726     {
4727       unsigned int j0 = 0, j1 = 0, j2 = 0;
4728
4729       for (j = 0; j < 3; j++)
4730         {
4731           int nelt0 = ((3 - j) * nelt) % 3;
4732           int nelt1 = ((3 - j) * nelt + 1) % 3;
4733           int nelt2 = ((3 - j) * nelt + 2) % 3;
4734
4735           for (i = 0; i < nelt; i++)
4736             {
4737               if (3 * i + nelt0 < nelt)
4738                 sel[3 * i + nelt0] = j0++;
4739               if (3 * i + nelt1 < nelt)
4740                 sel[3 * i + nelt1] = nelt + j1++;
4741               if (3 * i + nelt2 < nelt)
4742                 sel[3 * i + nelt2] = 0;
4743             }
4744           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4745
4746           for (i = 0; i < nelt; i++)
4747             {
4748               if (3 * i + nelt0 < nelt)
4749                 sel[3 * i + nelt0] = 3 * i + nelt0;
4750               if (3 * i + nelt1 < nelt)
4751                 sel[3 * i + nelt1] = 3 * i + nelt1;
4752               if (3 * i + nelt2 < nelt)
4753                 sel[3 * i + nelt2] = nelt + j2++;
4754             }
4755           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4756
4757           vect1 = dr_chain[0];
4758           vect2 = dr_chain[1];
4759
4760           /* Create interleaving stmt:
4761              low = VEC_PERM_EXPR <vect1, vect2,
4762                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4763                                    j + 2, nelt + j + 2, *, ...}>  */
4764           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4765           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4766                                            vect2, perm3_mask_low);
4767           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4768
4769           vect1 = data_ref;
4770           vect2 = dr_chain[2];
4771           /* Create interleaving stmt:
4772              low = VEC_PERM_EXPR <vect1, vect2,
4773                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4774                                    6, 7, nelt + j + 2, ...}>  */
4775           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4776           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4777                                            vect2, perm3_mask_high);
4778           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4779           (*result_chain)[j] = data_ref;
4780         }
4781     }
4782   else
4783     {
4784       /* If length is not equal to 3 then only power of 2 is supported.  */
4785       gcc_assert (pow2p_hwi (length));
4786
4787       for (i = 0, n = nelt / 2; i < n; i++)
4788         {
4789           sel[i * 2] = i;
4790           sel[i * 2 + 1] = i + nelt;
4791         }
4792         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4793
4794         for (i = 0; i < nelt; i++)
4795           sel[i] += nelt / 2;
4796         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4797
4798         for (i = 0, n = log_length; i < n; i++)
4799           {
4800             for (j = 0; j < length/2; j++)
4801               {
4802                 vect1 = dr_chain[j];
4803                 vect2 = dr_chain[j+length/2];
4804
4805                 /* Create interleaving stmt:
4806                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4807                                                         ...}>  */
4808                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4809                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4810                                                  vect2, perm_mask_high);
4811                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4812                 (*result_chain)[2*j] = high;
4813
4814                 /* Create interleaving stmt:
4815                    low = VEC_PERM_EXPR <vect1, vect2,
4816                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4817                                          ...}>  */
4818                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4819                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4820                                                  vect2, perm_mask_low);
4821                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4822                 (*result_chain)[2*j+1] = low;
4823               }
4824             memcpy (dr_chain.address (), result_chain->address (),
4825                     length * sizeof (tree));
4826           }
4827     }
4828 }
4829
4830 /* Function vect_setup_realignment
4831
4832    This function is called when vectorizing an unaligned load using
4833    the dr_explicit_realign[_optimized] scheme.
4834    This function generates the following code at the loop prolog:
4835
4836       p = initial_addr;
4837    x  msq_init = *(floor(p));   # prolog load
4838       realignment_token = call target_builtin;
4839     loop:
4840    x  msq = phi (msq_init, ---)
4841
4842    The stmts marked with x are generated only for the case of
4843    dr_explicit_realign_optimized.
4844
4845    The code above sets up a new (vector) pointer, pointing to the first
4846    location accessed by STMT, and a "floor-aligned" load using that pointer.
4847    It also generates code to compute the "realignment-token" (if the relevant
4848    target hook was defined), and creates a phi-node at the loop-header bb
4849    whose arguments are the result of the prolog-load (created by this
4850    function) and the result of a load that takes place in the loop (to be
4851    created by the caller to this function).
4852
4853    For the case of dr_explicit_realign_optimized:
4854    The caller to this function uses the phi-result (msq) to create the
4855    realignment code inside the loop, and sets up the missing phi argument,
4856    as follows:
4857     loop:
4858       msq = phi (msq_init, lsq)
4859       lsq = *(floor(p'));        # load in loop
4860       result = realign_load (msq, lsq, realignment_token);
4861
4862    For the case of dr_explicit_realign:
4863     loop:
4864       msq = *(floor(p));        # load in loop
4865       p' = p + (VS-1);
4866       lsq = *(floor(p'));       # load in loop
4867       result = realign_load (msq, lsq, realignment_token);
4868
4869    Input:
4870    STMT - (scalar) load stmt to be vectorized. This load accesses
4871           a memory location that may be unaligned.
4872    BSI - place where new code is to be inserted.
4873    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4874                               is used.
4875
4876    Output:
4877    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4878                        target hook, if defined.
4879    Return value - the result of the loop-header phi node.  */
4880
4881 tree
4882 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4883                         tree *realignment_token,
4884                         enum dr_alignment_support alignment_support_scheme,
4885                         tree init_addr,
4886                         struct loop **at_loop)
4887 {
4888   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4889   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4890   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4891   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4892   struct loop *loop = NULL;
4893   edge pe = NULL;
4894   tree scalar_dest = gimple_assign_lhs (stmt);
4895   tree vec_dest;
4896   gimple *inc;
4897   tree ptr;
4898   tree data_ref;
4899   basic_block new_bb;
4900   tree msq_init = NULL_TREE;
4901   tree new_temp;
4902   gphi *phi_stmt;
4903   tree msq = NULL_TREE;
4904   gimple_seq stmts = NULL;
4905   bool inv_p;
4906   bool compute_in_loop = false;
4907   bool nested_in_vect_loop = false;
4908   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4909   struct loop *loop_for_initial_load = NULL;
4910
4911   if (loop_vinfo)
4912     {
4913       loop = LOOP_VINFO_LOOP (loop_vinfo);
4914       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4915     }
4916
4917   gcc_assert (alignment_support_scheme == dr_explicit_realign
4918               || alignment_support_scheme == dr_explicit_realign_optimized);
4919
4920   /* We need to generate three things:
4921      1. the misalignment computation
4922      2. the extra vector load (for the optimized realignment scheme).
4923      3. the phi node for the two vectors from which the realignment is
4924       done (for the optimized realignment scheme).  */
4925
4926   /* 1. Determine where to generate the misalignment computation.
4927
4928      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4929      calculation will be generated by this function, outside the loop (in the
4930      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4931      caller, inside the loop.
4932
4933      Background: If the misalignment remains fixed throughout the iterations of
4934      the loop, then both realignment schemes are applicable, and also the
4935      misalignment computation can be done outside LOOP.  This is because we are
4936      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4937      are a multiple of VS (the Vector Size), and therefore the misalignment in
4938      different vectorized LOOP iterations is always the same.
4939      The problem arises only if the memory access is in an inner-loop nested
4940      inside LOOP, which is now being vectorized using outer-loop vectorization.
4941      This is the only case when the misalignment of the memory access may not
4942      remain fixed throughout the iterations of the inner-loop (as explained in
4943      detail in vect_supportable_dr_alignment).  In this case, not only is the
4944      optimized realignment scheme not applicable, but also the misalignment
4945      computation (and generation of the realignment token that is passed to
4946      REALIGN_LOAD) have to be done inside the loop.
4947
4948      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4949      or not, which in turn determines if the misalignment is computed inside
4950      the inner-loop, or outside LOOP.  */
4951
4952   if (init_addr != NULL_TREE || !loop_vinfo)
4953     {
4954       compute_in_loop = true;
4955       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4956     }
4957
4958
4959   /* 2. Determine where to generate the extra vector load.
4960
4961      For the optimized realignment scheme, instead of generating two vector
4962      loads in each iteration, we generate a single extra vector load in the
4963      preheader of the loop, and in each iteration reuse the result of the
4964      vector load from the previous iteration.  In case the memory access is in
4965      an inner-loop nested inside LOOP, which is now being vectorized using
4966      outer-loop vectorization, we need to determine whether this initial vector
4967      load should be generated at the preheader of the inner-loop, or can be
4968      generated at the preheader of LOOP.  If the memory access has no evolution
4969      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4970      to be generated inside LOOP (in the preheader of the inner-loop).  */
4971
4972   if (nested_in_vect_loop)
4973     {
4974       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4975       bool invariant_in_outerloop =
4976             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4977       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4978     }
4979   else
4980     loop_for_initial_load = loop;
4981   if (at_loop)
4982     *at_loop = loop_for_initial_load;
4983
4984   if (loop_for_initial_load)
4985     pe = loop_preheader_edge (loop_for_initial_load);
4986
4987   /* 3. For the case of the optimized realignment, create the first vector
4988       load at the loop preheader.  */
4989
4990   if (alignment_support_scheme == dr_explicit_realign_optimized)
4991     {
4992       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4993       gassign *new_stmt;
4994
4995       gcc_assert (!compute_in_loop);
4996       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4997       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4998                                       NULL_TREE, &init_addr, NULL, &inc,
4999                                       true, &inv_p);
5000       if (TREE_CODE (ptr) == SSA_NAME)
5001         new_temp = copy_ssa_name (ptr);
5002       else
5003         new_temp = make_ssa_name (TREE_TYPE (ptr));
5004       new_stmt = gimple_build_assign
5005                    (new_temp, BIT_AND_EXPR, ptr,
5006                     build_int_cst (TREE_TYPE (ptr),
5007                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5008       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5009       gcc_assert (!new_bb);
5010       data_ref
5011         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5012                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5013       new_stmt = gimple_build_assign (vec_dest, data_ref);
5014       new_temp = make_ssa_name (vec_dest, new_stmt);
5015       gimple_assign_set_lhs (new_stmt, new_temp);
5016       if (pe)
5017         {
5018           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5019           gcc_assert (!new_bb);
5020         }
5021       else
5022          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5023
5024       msq_init = gimple_assign_lhs (new_stmt);
5025     }
5026
5027   /* 4. Create realignment token using a target builtin, if available.
5028       It is done either inside the containing loop, or before LOOP (as
5029       determined above).  */
5030
5031   if (targetm.vectorize.builtin_mask_for_load)
5032     {
5033       gcall *new_stmt;
5034       tree builtin_decl;
5035
5036       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5037       if (!init_addr)
5038         {
5039           /* Generate the INIT_ADDR computation outside LOOP.  */
5040           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5041                                                             NULL_TREE);
5042           if (loop)
5043             {
5044               pe = loop_preheader_edge (loop);
5045               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5046               gcc_assert (!new_bb);
5047             }
5048           else
5049              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5050         }
5051
5052       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5053       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5054       vec_dest =
5055         vect_create_destination_var (scalar_dest,
5056                                      gimple_call_return_type (new_stmt));
5057       new_temp = make_ssa_name (vec_dest, new_stmt);
5058       gimple_call_set_lhs (new_stmt, new_temp);
5059
5060       if (compute_in_loop)
5061         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5062       else
5063         {
5064           /* Generate the misalignment computation outside LOOP.  */
5065           pe = loop_preheader_edge (loop);
5066           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5067           gcc_assert (!new_bb);
5068         }
5069
5070       *realignment_token = gimple_call_lhs (new_stmt);
5071
5072       /* The result of the CALL_EXPR to this builtin is determined from
5073          the value of the parameter and no global variables are touched
5074          which makes the builtin a "const" function.  Requiring the
5075          builtin to have the "const" attribute makes it unnecessary
5076          to call mark_call_clobbered.  */
5077       gcc_assert (TREE_READONLY (builtin_decl));
5078     }
5079
5080   if (alignment_support_scheme == dr_explicit_realign)
5081     return msq;
5082
5083   gcc_assert (!compute_in_loop);
5084   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5085
5086
5087   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5088
5089   pe = loop_preheader_edge (containing_loop);
5090   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5091   msq = make_ssa_name (vec_dest);
5092   phi_stmt = create_phi_node (msq, containing_loop->header);
5093   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5094
5095   return msq;
5096 }
5097
5098
5099 /* Function vect_grouped_load_supported.
5100
5101    COUNT is the size of the load group (the number of statements plus the
5102    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5103    only one statement, with a gap of COUNT - 1.
5104
5105    Returns true if a suitable permute exists.  */
5106
5107 bool
5108 vect_grouped_load_supported (tree vectype, bool single_element_p,
5109                              unsigned HOST_WIDE_INT count)
5110 {
5111   machine_mode mode = TYPE_MODE (vectype);
5112
5113   /* If this is single-element interleaving with an element distance
5114      that leaves unused vector loads around punt - we at least create
5115      very sub-optimal code in that case (and blow up memory,
5116      see PR65518).  */
5117   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5118     {
5119       if (dump_enabled_p ())
5120         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5121                          "single-element interleaving not supported "
5122                          "for not adjacent vector loads\n");
5123       return false;
5124     }
5125
5126   /* vect_permute_load_chain requires the group size to be equal to 3 or
5127      be a power of two.  */
5128   if (count != 3 && exact_log2 (count) == -1)
5129     {
5130       if (dump_enabled_p ())
5131         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132                          "the size of the group of accesses"
5133                          " is not a power of 2 or not equal to 3\n");
5134       return false;
5135     }
5136
5137   /* Check that the permutation is supported.  */
5138   if (VECTOR_MODE_P (mode))
5139     {
5140       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5141       auto_vec_perm_indices sel (nelt);
5142       sel.quick_grow (nelt);
5143
5144       if (count == 3)
5145         {
5146           unsigned int k;
5147           for (k = 0; k < 3; k++)
5148             {
5149               for (i = 0; i < nelt; i++)
5150                 if (3 * i + k < 2 * nelt)
5151                   sel[i] = 3 * i + k;
5152                 else
5153                   sel[i] = 0;
5154               if (!can_vec_perm_p (mode, false, &sel))
5155                 {
5156                   if (dump_enabled_p ())
5157                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158                                      "shuffle of 3 loads is not supported by"
5159                                      " target\n");
5160                   return false;
5161                 }
5162               for (i = 0, j = 0; i < nelt; i++)
5163                 if (3 * i + k < 2 * nelt)
5164                   sel[i] = i;
5165                 else
5166                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5167               if (!can_vec_perm_p (mode, false, &sel))
5168                 {
5169                   if (dump_enabled_p ())
5170                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5171                                      "shuffle of 3 loads is not supported by"
5172                                      " target\n");
5173                   return false;
5174                 }
5175             }
5176           return true;
5177         }
5178       else
5179         {
5180           /* If length is not equal to 3 then only power of 2 is supported.  */
5181           gcc_assert (pow2p_hwi (count));
5182           for (i = 0; i < nelt; i++)
5183             sel[i] = i * 2;
5184           if (can_vec_perm_p (mode, false, &sel))
5185             {
5186               for (i = 0; i < nelt; i++)
5187                 sel[i] = i * 2 + 1;
5188               if (can_vec_perm_p (mode, false, &sel))
5189                 return true;
5190             }
5191         }
5192     }
5193
5194   if (dump_enabled_p ())
5195     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5196                      "extract even/odd not supported by target\n");
5197   return false;
5198 }
5199
5200 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5201    type VECTYPE.  */
5202
5203 bool
5204 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5205 {
5206   return vect_lanes_optab_supported_p ("vec_load_lanes",
5207                                        vec_load_lanes_optab,
5208                                        vectype, count);
5209 }
5210
5211 /* Function vect_permute_load_chain.
5212
5213    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5214    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5215    the input data correctly.  Return the final references for loads in
5216    RESULT_CHAIN.
5217
5218    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5219    The input is 4 vectors each containing 8 elements. We assign a number to each
5220    element, the input sequence is:
5221
5222    1st vec:   0  1  2  3  4  5  6  7
5223    2nd vec:   8  9 10 11 12 13 14 15
5224    3rd vec:  16 17 18 19 20 21 22 23
5225    4th vec:  24 25 26 27 28 29 30 31
5226
5227    The output sequence should be:
5228
5229    1st vec:  0 4  8 12 16 20 24 28
5230    2nd vec:  1 5  9 13 17 21 25 29
5231    3rd vec:  2 6 10 14 18 22 26 30
5232    4th vec:  3 7 11 15 19 23 27 31
5233
5234    i.e., the first output vector should contain the first elements of each
5235    interleaving group, etc.
5236
5237    We use extract_even/odd instructions to create such output.  The input of
5238    each extract_even/odd operation is two vectors
5239    1st vec    2nd vec
5240    0 1 2 3    4 5 6 7
5241
5242    and the output is the vector of extracted even/odd elements.  The output of
5243    extract_even will be:   0 2 4 6
5244    and of extract_odd:     1 3 5 7
5245
5246
5247    The permutation is done in log LENGTH stages.  In each stage extract_even
5248    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5249    their order.  In our example,
5250
5251    E1: extract_even (1st vec, 2nd vec)
5252    E2: extract_odd (1st vec, 2nd vec)
5253    E3: extract_even (3rd vec, 4th vec)
5254    E4: extract_odd (3rd vec, 4th vec)
5255
5256    The output for the first stage will be:
5257
5258    E1:  0  2  4  6  8 10 12 14
5259    E2:  1  3  5  7  9 11 13 15
5260    E3: 16 18 20 22 24 26 28 30
5261    E4: 17 19 21 23 25 27 29 31
5262
5263    In order to proceed and create the correct sequence for the next stage (or
5264    for the correct output, if the second stage is the last one, as in our
5265    example), we first put the output of extract_even operation and then the
5266    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5267    The input for the second stage is:
5268
5269    1st vec (E1):  0  2  4  6  8 10 12 14
5270    2nd vec (E3): 16 18 20 22 24 26 28 30
5271    3rd vec (E2):  1  3  5  7  9 11 13 15
5272    4th vec (E4): 17 19 21 23 25 27 29 31
5273
5274    The output of the second stage:
5275
5276    E1: 0 4  8 12 16 20 24 28
5277    E2: 2 6 10 14 18 22 26 30
5278    E3: 1 5  9 13 17 21 25 29
5279    E4: 3 7 11 15 19 23 27 31
5280
5281    And RESULT_CHAIN after reordering:
5282
5283    1st vec (E1):  0 4  8 12 16 20 24 28
5284    2nd vec (E3):  1 5  9 13 17 21 25 29
5285    3rd vec (E2):  2 6 10 14 18 22 26 30
5286    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5287
5288 static void
5289 vect_permute_load_chain (vec<tree> dr_chain,
5290                          unsigned int length,
5291                          gimple *stmt,
5292                          gimple_stmt_iterator *gsi,
5293                          vec<tree> *result_chain)
5294 {
5295   tree data_ref, first_vect, second_vect;
5296   tree perm_mask_even, perm_mask_odd;
5297   tree perm3_mask_low, perm3_mask_high;
5298   gimple *perm_stmt;
5299   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5300   unsigned int i, j, log_length = exact_log2 (length);
5301   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5302
5303   auto_vec_perm_indices sel (nelt);
5304   sel.quick_grow (nelt);
5305
5306   result_chain->quick_grow (length);
5307   memcpy (result_chain->address (), dr_chain.address (),
5308           length * sizeof (tree));
5309
5310   if (length == 3)
5311     {
5312       unsigned int k;
5313
5314       for (k = 0; k < 3; k++)
5315         {
5316           for (i = 0; i < nelt; i++)
5317             if (3 * i + k < 2 * nelt)
5318               sel[i] = 3 * i + k;
5319             else
5320               sel[i] = 0;
5321           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5322
5323           for (i = 0, j = 0; i < nelt; i++)
5324             if (3 * i + k < 2 * nelt)
5325               sel[i] = i;
5326             else
5327               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5328
5329           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5330
5331           first_vect = dr_chain[0];
5332           second_vect = dr_chain[1];
5333
5334           /* Create interleaving stmt (low part of):
5335              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5336                                                              ...}>  */
5337           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5338           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5339                                            second_vect, perm3_mask_low);
5340           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5341
5342           /* Create interleaving stmt (high part of):
5343              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5344                                                               ...}>  */
5345           first_vect = data_ref;
5346           second_vect = dr_chain[2];
5347           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5348           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5349                                            second_vect, perm3_mask_high);
5350           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5351           (*result_chain)[k] = data_ref;
5352         }
5353     }
5354   else
5355     {
5356       /* If length is not equal to 3 then only power of 2 is supported.  */
5357       gcc_assert (pow2p_hwi (length));
5358
5359       for (i = 0; i < nelt; ++i)
5360         sel[i] = i * 2;
5361       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5362
5363       for (i = 0; i < nelt; ++i)
5364         sel[i] = i * 2 + 1;
5365       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5366
5367       for (i = 0; i < log_length; i++)
5368         {
5369           for (j = 0; j < length; j += 2)
5370             {
5371               first_vect = dr_chain[j];
5372               second_vect = dr_chain[j+1];
5373
5374               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5375               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5376               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5377                                                first_vect, second_vect,
5378                                                perm_mask_even);
5379               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380               (*result_chain)[j/2] = data_ref;
5381
5382               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5383               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5384               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5385                                                first_vect, second_vect,
5386                                                perm_mask_odd);
5387               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5388               (*result_chain)[j/2+length/2] = data_ref;
5389             }
5390           memcpy (dr_chain.address (), result_chain->address (),
5391                   length * sizeof (tree));
5392         }
5393     }
5394 }
5395
5396 /* Function vect_shift_permute_load_chain.
5397
5398    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5399    sequence of stmts to reorder the input data accordingly.
5400    Return the final references for loads in RESULT_CHAIN.
5401    Return true if successed, false otherwise.
5402
5403    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5404    The input is 3 vectors each containing 8 elements.  We assign a
5405    number to each element, the input sequence is:
5406
5407    1st vec:   0  1  2  3  4  5  6  7
5408    2nd vec:   8  9 10 11 12 13 14 15
5409    3rd vec:  16 17 18 19 20 21 22 23
5410
5411    The output sequence should be:
5412
5413    1st vec:  0 3 6  9 12 15 18 21
5414    2nd vec:  1 4 7 10 13 16 19 22
5415    3rd vec:  2 5 8 11 14 17 20 23
5416
5417    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5418
5419    First we shuffle all 3 vectors to get correct elements order:
5420
5421    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5422    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5423    3rd vec:  (16 19 22) (17 20 23) (18 21)
5424
5425    Next we unite and shift vector 3 times:
5426
5427    1st step:
5428      shift right by 6 the concatenation of:
5429      "1st vec" and  "2nd vec"
5430        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5431      "2nd vec" and  "3rd vec"
5432        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5433      "3rd vec" and  "1st vec"
5434        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5435                              | New vectors                   |
5436
5437      So that now new vectors are:
5438
5439      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5440      2nd vec:  (10 13) (16 19 22) (17 20 23)
5441      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5442
5443    2nd step:
5444      shift right by 5 the concatenation of:
5445      "1st vec" and  "3rd vec"
5446        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5447      "2nd vec" and  "1st vec"
5448        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5449      "3rd vec" and  "2nd vec"
5450        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5451                           | New vectors                   |
5452
5453      So that now new vectors are:
5454
5455      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5456      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5457      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5458
5459    3rd step:
5460      shift right by 5 the concatenation of:
5461      "1st vec" and  "1st vec"
5462        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5463      shift right by 3 the concatenation of:
5464      "2nd vec" and  "2nd vec"
5465                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5466                           | New vectors                   |
5467
5468      So that now all vectors are READY:
5469      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5470      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5471      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5472
5473    This algorithm is faster than one in vect_permute_load_chain if:
5474      1.  "shift of a concatination" is faster than general permutation.
5475          This is usually so.
5476      2.  The TARGET machine can't execute vector instructions in parallel.
5477          This is because each step of the algorithm depends on previous.
5478          The algorithm in vect_permute_load_chain is much more parallel.
5479
5480    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5481 */
5482
5483 static bool
5484 vect_shift_permute_load_chain (vec<tree> dr_chain,
5485                                unsigned int length,
5486                                gimple *stmt,
5487                                gimple_stmt_iterator *gsi,
5488                                vec<tree> *result_chain)
5489 {
5490   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5491   tree perm2_mask1, perm2_mask2, perm3_mask;
5492   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5493   gimple *perm_stmt;
5494
5495   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5496   unsigned int i;
5497   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5498   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5499   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5500
5501   auto_vec_perm_indices sel (nelt);
5502   sel.quick_grow (nelt);
5503
5504   result_chain->quick_grow (length);
5505   memcpy (result_chain->address (), dr_chain.address (),
5506           length * sizeof (tree));
5507
5508   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5509     {
5510       unsigned int j, log_length = exact_log2 (length);
5511       for (i = 0; i < nelt / 2; ++i)
5512         sel[i] = i * 2;
5513       for (i = 0; i < nelt / 2; ++i)
5514         sel[nelt / 2 + i] = i * 2 + 1;
5515       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5516         {
5517           if (dump_enabled_p ())
5518             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5519                              "shuffle of 2 fields structure is not \
5520                               supported by target\n");
5521           return false;
5522         }
5523       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5524
5525       for (i = 0; i < nelt / 2; ++i)
5526         sel[i] = i * 2 + 1;
5527       for (i = 0; i < nelt / 2; ++i)
5528         sel[nelt / 2 + i] = i * 2;
5529       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5530         {
5531           if (dump_enabled_p ())
5532             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533                              "shuffle of 2 fields structure is not \
5534                               supported by target\n");
5535           return false;
5536         }
5537       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5538
5539       /* Generating permutation constant to shift all elements.
5540          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5541       for (i = 0; i < nelt; i++)
5542         sel[i] = nelt / 2 + i;
5543       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5544         {
5545           if (dump_enabled_p ())
5546             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5547                              "shift permutation is not supported by target\n");
5548           return false;
5549         }
5550       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5551
5552       /* Generating permutation constant to select vector from 2.
5553          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5554       for (i = 0; i < nelt / 2; i++)
5555         sel[i] = i;
5556       for (i = nelt / 2; i < nelt; i++)
5557         sel[i] = nelt + i;
5558       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5559         {
5560           if (dump_enabled_p ())
5561             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562                              "select is not supported by target\n");
5563           return false;
5564         }
5565       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5566
5567       for (i = 0; i < log_length; i++)
5568         {
5569           for (j = 0; j < length; j += 2)
5570             {
5571               first_vect = dr_chain[j];
5572               second_vect = dr_chain[j + 1];
5573
5574               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5575               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5576                                                first_vect, first_vect,
5577                                                perm2_mask1);
5578               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579               vect[0] = data_ref;
5580
5581               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5582               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5583                                                second_vect, second_vect,
5584                                                perm2_mask2);
5585               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5586               vect[1] = data_ref;
5587
5588               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5589               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5590                                                vect[0], vect[1], shift1_mask);
5591               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5592               (*result_chain)[j/2 + length/2] = data_ref;
5593
5594               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5595               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5596                                                vect[0], vect[1], select_mask);
5597               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598               (*result_chain)[j/2] = data_ref;
5599             }
5600           memcpy (dr_chain.address (), result_chain->address (),
5601                   length * sizeof (tree));
5602         }
5603       return true;
5604     }
5605   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5606     {
5607       unsigned int k = 0, l = 0;
5608
5609       /* Generating permutation constant to get all elements in rigth order.
5610          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5611       for (i = 0; i < nelt; i++)
5612         {
5613           if (3 * k + (l % 3) >= nelt)
5614             {
5615               k = 0;
5616               l += (3 - (nelt % 3));
5617             }
5618           sel[i] = 3 * k + (l % 3);
5619           k++;
5620         }
5621       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5622         {
5623           if (dump_enabled_p ())
5624             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5625                              "shuffle of 3 fields structure is not \
5626                               supported by target\n");
5627           return false;
5628         }
5629       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5630
5631       /* Generating permutation constant to shift all elements.
5632          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5633       for (i = 0; i < nelt; i++)
5634         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5635       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5636         {
5637           if (dump_enabled_p ())
5638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639                              "shift permutation is not supported by target\n");
5640           return false;
5641         }
5642       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5643
5644       /* Generating permutation constant to shift all elements.
5645          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5646       for (i = 0; i < nelt; i++)
5647         sel[i] = 2 * (nelt / 3) + 1 + i;
5648       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5649         {
5650           if (dump_enabled_p ())
5651             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5652                              "shift permutation is not supported by target\n");
5653           return false;
5654         }
5655       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5656
5657       /* Generating permutation constant to shift all elements.
5658          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5659       for (i = 0; i < nelt; i++)
5660         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5661       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5662         {
5663           if (dump_enabled_p ())
5664             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665                              "shift permutation is not supported by target\n");
5666           return false;
5667         }
5668       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5669
5670       /* Generating permutation constant to shift all elements.
5671          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5672       for (i = 0; i < nelt; i++)
5673         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5674       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5675         {
5676           if (dump_enabled_p ())
5677             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5678                              "shift permutation is not supported by target\n");
5679           return false;
5680         }
5681       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5682
5683       for (k = 0; k < 3; k++)
5684         {
5685           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5686           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5687                                            dr_chain[k], dr_chain[k],
5688                                            perm3_mask);
5689           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690           vect[k] = data_ref;
5691         }
5692
5693       for (k = 0; k < 3; k++)
5694         {
5695           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5696           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5697                                            vect[k % 3], vect[(k + 1) % 3],
5698                                            shift1_mask);
5699           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700           vect_shift[k] = data_ref;
5701         }
5702
5703       for (k = 0; k < 3; k++)
5704         {
5705           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5706           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5707                                            vect_shift[(4 - k) % 3],
5708                                            vect_shift[(3 - k) % 3],
5709                                            shift2_mask);
5710           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5711           vect[k] = data_ref;
5712         }
5713
5714       (*result_chain)[3 - (nelt % 3)] = vect[2];
5715
5716       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5717       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5718                                        vect[0], shift3_mask);
5719       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5720       (*result_chain)[nelt % 3] = data_ref;
5721
5722       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5723       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5724                                        vect[1], shift4_mask);
5725       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5726       (*result_chain)[0] = data_ref;
5727       return true;
5728     }
5729   return false;
5730 }
5731
5732 /* Function vect_transform_grouped_load.
5733
5734    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5735    to perform their permutation and ascribe the result vectorized statements to
5736    the scalar statements.
5737 */
5738
5739 void
5740 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5741                              gimple_stmt_iterator *gsi)
5742 {
5743   machine_mode mode;
5744   vec<tree> result_chain = vNULL;
5745
5746   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5747      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5748      vectors, that are ready for vector computation.  */
5749   result_chain.create (size);
5750
5751   /* If reassociation width for vector type is 2 or greater target machine can
5752      execute 2 or more vector instructions in parallel.  Otherwise try to
5753      get chain for loads group using vect_shift_permute_load_chain.  */
5754   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5755   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5756       || pow2p_hwi (size)
5757       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5758                                          gsi, &result_chain))
5759     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5760   vect_record_grouped_load_vectors (stmt, result_chain);
5761   result_chain.release ();
5762 }
5763
5764 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5765    generated as part of the vectorization of STMT.  Assign the statement
5766    for each vector to the associated scalar statement.  */
5767
5768 void
5769 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5770 {
5771   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5772   gimple *next_stmt, *new_stmt;
5773   unsigned int i, gap_count;
5774   tree tmp_data_ref;
5775
5776   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5777      Since we scan the chain starting from it's first node, their order
5778      corresponds the order of data-refs in RESULT_CHAIN.  */
5779   next_stmt = first_stmt;
5780   gap_count = 1;
5781   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5782     {
5783       if (!next_stmt)
5784         break;
5785
5786       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5787        code elimination pass later.  No need to check for the first stmt in
5788        the group, since it always exists.
5789        GROUP_GAP is the number of steps in elements from the previous
5790        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5791        correspond to the gaps.  */
5792       if (next_stmt != first_stmt
5793           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5794       {
5795         gap_count++;
5796         continue;
5797       }
5798
5799       while (next_stmt)
5800         {
5801           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5802           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5803              copies, and we put the new vector statement in the first available
5804              RELATED_STMT.  */
5805           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5806             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5807           else
5808             {
5809               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5810                 {
5811                   gimple *prev_stmt =
5812                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5813                   gimple *rel_stmt =
5814                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5815                   while (rel_stmt)
5816                     {
5817                       prev_stmt = rel_stmt;
5818                       rel_stmt =
5819                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5820                     }
5821
5822                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5823                     new_stmt;
5824                 }
5825             }
5826
5827           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5828           gap_count = 1;
5829           /* If NEXT_STMT accesses the same DR as the previous statement,
5830              put the same TMP_DATA_REF as its vectorized statement; otherwise
5831              get the next data-ref from RESULT_CHAIN.  */
5832           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5833             break;
5834         }
5835     }
5836 }
5837
5838 /* Function vect_force_dr_alignment_p.
5839
5840    Returns whether the alignment of a DECL can be forced to be aligned
5841    on ALIGNMENT bit boundary.  */
5842
5843 bool
5844 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5845 {
5846   if (!VAR_P (decl))
5847     return false;
5848
5849   if (decl_in_symtab_p (decl)
5850       && !symtab_node::get (decl)->can_increase_alignment_p ())
5851     return false;
5852
5853   if (TREE_STATIC (decl))
5854     return (alignment <= MAX_OFILE_ALIGNMENT);
5855   else
5856     return (alignment <= MAX_STACK_ALIGNMENT);
5857 }
5858
5859
5860 /* Return whether the data reference DR is supported with respect to its
5861    alignment.
5862    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5863    it is aligned, i.e., check if it is possible to vectorize it with different
5864    alignment.  */
5865
5866 enum dr_alignment_support
5867 vect_supportable_dr_alignment (struct data_reference *dr,
5868                                bool check_aligned_accesses)
5869 {
5870   gimple *stmt = DR_STMT (dr);
5871   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5872   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5873   machine_mode mode = TYPE_MODE (vectype);
5874   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5875   struct loop *vect_loop = NULL;
5876   bool nested_in_vect_loop = false;
5877
5878   if (aligned_access_p (dr) && !check_aligned_accesses)
5879     return dr_aligned;
5880
5881   /* For now assume all conditional loads/stores support unaligned
5882      access without any special code.  */
5883   if (is_gimple_call (stmt)
5884       && gimple_call_internal_p (stmt)
5885       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5886           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5887     return dr_unaligned_supported;
5888
5889   if (loop_vinfo)
5890     {
5891       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5892       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5893     }
5894
5895   /* Possibly unaligned access.  */
5896
5897   /* We can choose between using the implicit realignment scheme (generating
5898      a misaligned_move stmt) and the explicit realignment scheme (generating
5899      aligned loads with a REALIGN_LOAD).  There are two variants to the
5900      explicit realignment scheme: optimized, and unoptimized.
5901      We can optimize the realignment only if the step between consecutive
5902      vector loads is equal to the vector size.  Since the vector memory
5903      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5904      is guaranteed that the misalignment amount remains the same throughout the
5905      execution of the vectorized loop.  Therefore, we can create the
5906      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5907      at the loop preheader.
5908
5909      However, in the case of outer-loop vectorization, when vectorizing a
5910      memory access in the inner-loop nested within the LOOP that is now being
5911      vectorized, while it is guaranteed that the misalignment of the
5912      vectorized memory access will remain the same in different outer-loop
5913      iterations, it is *not* guaranteed that is will remain the same throughout
5914      the execution of the inner-loop.  This is because the inner-loop advances
5915      with the original scalar step (and not in steps of VS).  If the inner-loop
5916      step happens to be a multiple of VS, then the misalignment remains fixed
5917      and we can use the optimized realignment scheme.  For example:
5918
5919       for (i=0; i<N; i++)
5920         for (j=0; j<M; j++)
5921           s += a[i+j];
5922
5923      When vectorizing the i-loop in the above example, the step between
5924      consecutive vector loads is 1, and so the misalignment does not remain
5925      fixed across the execution of the inner-loop, and the realignment cannot
5926      be optimized (as illustrated in the following pseudo vectorized loop):
5927
5928       for (i=0; i<N; i+=4)
5929         for (j=0; j<M; j++){
5930           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5931                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5932                          // (assuming that we start from an aligned address).
5933           }
5934
5935      We therefore have to use the unoptimized realignment scheme:
5936
5937       for (i=0; i<N; i+=4)
5938           for (j=k; j<M; j+=4)
5939           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5940                            // that the misalignment of the initial address is
5941                            // 0).
5942
5943      The loop can then be vectorized as follows:
5944
5945       for (k=0; k<4; k++){
5946         rt = get_realignment_token (&vp[k]);
5947         for (i=0; i<N; i+=4){
5948           v1 = vp[i+k];
5949           for (j=k; j<M; j+=4){
5950             v2 = vp[i+j+VS-1];
5951             va = REALIGN_LOAD <v1,v2,rt>;
5952             vs += va;
5953             v1 = v2;
5954           }
5955         }
5956     } */
5957
5958   if (DR_IS_READ (dr))
5959     {
5960       bool is_packed = false;
5961       tree type = (TREE_TYPE (DR_REF (dr)));
5962
5963       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5964           && (!targetm.vectorize.builtin_mask_for_load
5965               || targetm.vectorize.builtin_mask_for_load ()))
5966         {
5967           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5968
5969           /* If we are doing SLP then the accesses need not have the
5970              same alignment, instead it depends on the SLP group size.  */
5971           if (loop_vinfo
5972               && STMT_SLP_TYPE (stmt_info)
5973               && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5974                   * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5975                   % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5976             ;
5977           else if (!loop_vinfo
5978                    || (nested_in_vect_loop
5979                        && (TREE_INT_CST_LOW (DR_STEP (dr))
5980                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5981             return dr_explicit_realign;
5982           else
5983             return dr_explicit_realign_optimized;
5984         }
5985       if (!known_alignment_for_access_p (dr))
5986         is_packed = not_size_aligned (DR_REF (dr));
5987
5988       if (targetm.vectorize.support_vector_misalignment
5989             (mode, type, DR_MISALIGNMENT (dr), is_packed))
5990         /* Can't software pipeline the loads, but can at least do them.  */
5991         return dr_unaligned_supported;
5992     }
5993   else
5994     {
5995       bool is_packed = false;
5996       tree type = (TREE_TYPE (DR_REF (dr)));
5997
5998       if (!known_alignment_for_access_p (dr))
5999         is_packed = not_size_aligned (DR_REF (dr));
6000
6001      if (targetm.vectorize.support_vector_misalignment
6002            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6003        return dr_unaligned_supported;
6004     }
6005
6006   /* Unsupported.  */
6007   return dr_unaligned_unsupported;
6008 }