gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54
  55 /* Return true if load- or store-lanes optab OPTAB is implemented for
  56    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  57
  58 static bool
  59 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  60                               tree vectype, unsigned HOST_WIDE_INT count)
  61 {
  62   machine_mode mode, array_mode;
  63   bool limit_p;
  64
  65   mode = TYPE_MODE (vectype);
  66   limit_p = !targetm.array_mode_supported_p (mode, count);
  67   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  68                               MODE_INT, limit_p);
  69
  70   if (array_mode == BLKmode)
  71     {
  72       if (dump_enabled_p ())
  73         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  75                          GET_MODE_NAME (mode), count);
  76       return false;
  77     }
  78
  79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  80     {
  81       if (dump_enabled_p ())
  82         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                          "cannot use %s<%s><%s>\n", name,
  84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  85       return false;
  86     }
  87
  88   if (dump_enabled_p ())
  89     dump_printf_loc (MSG_NOTE, vect_location,
  90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  91                      GET_MODE_NAME (mode));
  92
  93   return true;
  94 }
  95
  96
  97 /* Return the smallest scalar part of STMT.
  98    This is used to determine the vectype of the stmt.  We generally set the
  99    vectype according to the type of the result (lhs).  For stmts whose
 100    result-type is different than the type of the arguments (e.g., demotion,
 101    promotion), vectype will be reset appropriately (later).  Note that we have
 102    to visit the smallest datatype in this function, because that determines the
 103    VF.  If the smallest datatype in the loop is present only as the rhs of a
 104    promotion operation - we'd miss it.
 105    Such a case, where a variable of this datatype does not appear in the lhs
 106    anywhere in the loop, can only occur if it's an invariant: e.g.:
 107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 108    invariant motion.  However, we cannot rely on invariant motion to always
 109    take invariants out of the loop, and so in the case of promotion we also
 110    have to check the rhs.
 111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 112    types.  */
 113
 114 tree
 115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 116                                HOST_WIDE_INT *rhs_size_unit)
 117 {
 118   tree scalar_type = gimple_expr_type (stmt);
 119   HOST_WIDE_INT lhs, rhs;
 120
 121   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 122
 123   if (is_gimple_assign (stmt)
 124       && (gimple_assign_cast_p (stmt)
 125           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 126           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 127           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 128     {
 129       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 130
 131       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 132       if (rhs < lhs)
 133         scalar_type = rhs_type;
 134     }
 135
 136   *lhs_size_unit = lhs;
 137   *rhs_size_unit = rhs;
 138   return scalar_type;
 139 }
 140
 141
 142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 143    tested at run-time.  Return TRUE if DDR was successfully inserted.
 144    Return false if versioning is not supported.  */
 145
 146 static bool
 147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 148 {
 149   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 150
 151   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 152     return false;
 153
 154   if (!runtime_alias_check_p (ddr, loop,
 155                               optimize_loop_nest_for_speed_p (loop)))
 156     return false;
 157
 158   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 159   return true;
 160 }
 161
 162
 163 /* Function vect_analyze_data_ref_dependence.
 164
 165    Return TRUE if there (might) exist a dependence between a memory-reference
 166    DRA and a memory-reference DRB.  When versioning for alias may check a
 167    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 168    the data dependence.  */
 169
 170 static bool
 171 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 172                                   loop_vec_info loop_vinfo, int *max_vf)
 173 {
 174   unsigned int i;
 175   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 176   struct data_reference *dra = DDR_A (ddr);
 177   struct data_reference *drb = DDR_B (ddr);
 178   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 179   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 180   lambda_vector dist_v;
 181   unsigned int loop_depth;
 182
 183   /* In loop analysis all data references should be vectorizable.  */
 184   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 185       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 186     gcc_unreachable ();
 187
 188   /* Independent data accesses.  */
 189   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 190     return false;
 191
 192   if (dra == drb
 193       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 194     return false;
 195
 196   /* We do not have to consider dependences between accesses that belong
 197      to the same group.  */
 198   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 199       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 200     return false;
 201
 202   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 203      least two scalar iterations, there is always also a true dependence.
 204      As the vectorizer does not re-order loads and stores we can ignore
 205      the anti-dependence if TBAA can disambiguate both DRs similar to the
 206      case with known negative distance anti-dependences (positive
 207      distance anti-dependences would violate TBAA constraints).  */
 208   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 209        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 210       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 211                                  get_alias_set (DR_REF (drb))))
 212     return false;
 213
 214   /* Unknown data dependence.  */
 215   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 216     {
 217       /* If user asserted safelen consecutive iterations can be
 218          executed concurrently, assume independence.  */
 219       if (loop->safelen >= 2)
 220         {
 221           if (loop->safelen < *max_vf)
 222             *max_vf = loop->safelen;
 223           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 224           return false;
 225         }
 226
 227       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 228           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 229         {
 230           if (dump_enabled_p ())
 231             {
 232               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 233                                "versioning for alias not supported for: "
 234                                "can't determine dependence between ");
 235               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 236                                  DR_REF (dra));
 237               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 238               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 239                                  DR_REF (drb));
 240               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 241             }
 242           return true;
 243         }
 244
 245       if (dump_enabled_p ())
 246         {
 247           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 248                            "versioning for alias required: "
 249                            "can't determine dependence between ");
 250           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 251                              DR_REF (dra));
 252           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 253           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 254                              DR_REF (drb));
 255           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 256         }
 257
 258       /* Add to list of ddrs that need to be tested at run-time.  */
 259       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 260     }
 261
 262   /* Known data dependence.  */
 263   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 264     {
 265       /* If user asserted safelen consecutive iterations can be
 266          executed concurrently, assume independence.  */
 267       if (loop->safelen >= 2)
 268         {
 269           if (loop->safelen < *max_vf)
 270             *max_vf = loop->safelen;
 271           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 272           return false;
 273         }
 274
 275       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 276           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 277         {
 278           if (dump_enabled_p ())
 279             {
 280               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 281                                "versioning for alias not supported for: "
 282                                "bad dist vector for ");
 283               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 284                                  DR_REF (dra));
 285               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 286               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 287                                  DR_REF (drb));
 288               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 289             }
 290           return true;
 291         }
 292
 293       if (dump_enabled_p ())
 294         {
 295           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 296                            "versioning for alias required: "
 297                            "bad dist vector for ");
 298           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 299           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 300           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 301           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302         }
 303       /* Add to list of ddrs that need to be tested at run-time.  */
 304       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 305     }
 306
 307   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 308   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 309     {
 310       int dist = dist_v[loop_depth];
 311
 312       if (dump_enabled_p ())
 313         dump_printf_loc (MSG_NOTE, vect_location,
 314                          "dependence distance  = %d.\n", dist);
 315
 316       if (dist == 0)
 317         {
 318           if (dump_enabled_p ())
 319             {
 320               dump_printf_loc (MSG_NOTE, vect_location,
 321                                "dependence distance == 0 between ");
 322               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 323               dump_printf (MSG_NOTE, " and ");
 324               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 325               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 326             }
 327
 328           /* When we perform grouped accesses and perform implicit CSE
 329              by detecting equal accesses and doing disambiguation with
 330              runtime alias tests like for
 331                 .. = a[i];
 332                 .. = a[i+1];
 333                 a[i] = ..;
 334                 a[i+1] = ..;
 335                 *p = ..;
 336                 .. = a[i];
 337                 .. = a[i+1];
 338              where we will end up loading { a[i], a[i+1] } once, make
 339              sure that inserting group loads before the first load and
 340              stores after the last store will do the right thing.
 341              Similar for groups like
 342                 a[i] = ...;
 343                 ... = a[i];
 344                 a[i+1] = ...;
 345              where loads from the group interleave with the store.  */
 346           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 347               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 348             {
 349               gimple *earlier_stmt;
 350               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 351               if (DR_IS_WRITE
 352                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 353                 {
 354                   if (dump_enabled_p ())
 355                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 356                                      "READ_WRITE dependence in interleaving."
 357                                      "\n");
 358                   return true;
 359                 }
 360             }
 361
 362           continue;
 363         }
 364
 365       if (dist > 0 && DDR_REVERSED_P (ddr))
 366         {
 367           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 368              reversed (to make distance vector positive), and the actual
 369              distance is negative.  */
 370           if (dump_enabled_p ())
 371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 372                              "dependence distance negative.\n");
 373           /* Record a negative dependence distance to later limit the
 374              amount of stmt copying / unrolling we can perform.
 375              Only need to handle read-after-write dependence.  */
 376           if (DR_IS_READ (drb)
 377               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 378                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 379             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 380           continue;
 381         }
 382
 383       if (abs (dist) >= 2
 384           && abs (dist) < *max_vf)
 385         {
 386           /* The dependence distance requires reduction of the maximal
 387              vectorization factor.  */
 388           *max_vf = abs (dist);
 389           if (dump_enabled_p ())
 390             dump_printf_loc (MSG_NOTE, vect_location,
 391                              "adjusting maximal vectorization factor to %i\n",
 392                              *max_vf);
 393         }
 394
 395       if (abs (dist) >= *max_vf)
 396         {
 397           /* Dependence distance does not create dependence, as far as
 398              vectorization is concerned, in this case.  */
 399           if (dump_enabled_p ())
 400             dump_printf_loc (MSG_NOTE, vect_location,
 401                              "dependence distance >= VF.\n");
 402           continue;
 403         }
 404
 405       if (dump_enabled_p ())
 406         {
 407           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                        "not vectorized, possible dependence "
 409                        "between data-refs ");
 410           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 411           dump_printf (MSG_NOTE,  " and ");
 412           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 413           dump_printf (MSG_NOTE,  "\n");
 414         }
 415
 416       return true;
 417     }
 418
 419   return false;
 420 }
 421
 422 /* Function vect_analyze_data_ref_dependences.
 423
 424    Examine all the data references in the loop, and make sure there do not
 425    exist any data dependences between them.  Set *MAX_VF according to
 426    the maximum vectorization factor the data dependences allow.  */
 427
 428 bool
 429 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 430 {
 431   unsigned int i;
 432   struct data_dependence_relation *ddr;
 433
 434   if (dump_enabled_p ())
 435     dump_printf_loc (MSG_NOTE, vect_location,
 436                      "=== vect_analyze_data_ref_dependences ===\n");
 437
 438   LOOP_VINFO_DDRS (loop_vinfo)
 439     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 440              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 441   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 442   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 443   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 444                                 &LOOP_VINFO_DDRS (loop_vinfo),
 445                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 446     return false;
 447
 448   /* For epilogues we either have no aliases or alias versioning
 449      was applied to original loop.  Therefore we may just get max_vf
 450      using VF of original loop.  */
 451   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 452     *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
 453   else
 454     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 455       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 456         return false;
 457
 458   return true;
 459 }
 460
 461
 462 /* Function vect_slp_analyze_data_ref_dependence.
 463
 464    Return TRUE if there (might) exist a dependence between a memory-reference
 465    DRA and a memory-reference DRB.  When versioning for alias may check a
 466    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 467    the data dependence.  */
 468
 469 static bool
 470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 471 {
 472   struct data_reference *dra = DDR_A (ddr);
 473   struct data_reference *drb = DDR_B (ddr);
 474
 475   /* We need to check dependences of statements marked as unvectorizable
 476      as well, they still can prohibit vectorization.  */
 477
 478   /* Independent data accesses.  */
 479   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 480     return false;
 481
 482   if (dra == drb)
 483     return false;
 484
 485   /* Read-read is OK.  */
 486   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 487     return false;
 488
 489   /* If dra and drb are part of the same interleaving chain consider
 490      them independent.  */
 491   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 492       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 493           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 494     return false;
 495
 496   /* Unknown data dependence.  */
 497   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 498     {
 499       if  (dump_enabled_p ())
 500         {
 501           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                            "can't determine dependence between ");
 503           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 504           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 505           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 506           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 507         }
 508     }
 509   else if (dump_enabled_p ())
 510     {
 511       dump_printf_loc (MSG_NOTE, vect_location,
 512                        "determined dependence between ");
 513       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 514       dump_printf (MSG_NOTE, " and ");
 515       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 516       dump_printf (MSG_NOTE,  "\n");
 517     }
 518
 519   return true;
 520 }
 521
 522
 523 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 524    contain the vector of scalar stores of this instance if we are
 525    disambiguating the loads.  */
 526
 527 static bool
 528 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 529                                    vec<gimple *> stores, gimple *last_store)
 530 {
 531   /* This walks over all stmts involved in the SLP load/store done
 532      in NODE verifying we can sink them up to the last stmt in the
 533      group.  */
 534   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 535   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 536     {
 537       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 538       if (access == last_access)
 539         continue;
 540       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 541       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 542            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 543         {
 544           gimple *stmt = gsi_stmt (gsi);
 545           if (! gimple_vuse (stmt)
 546               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 547             continue;
 548
 549           /* If we couldn't record a (single) data reference for this
 550              stmt we have to give up.  */
 551           /* ???  Here and below if dependence analysis fails we can resort
 552              to the alias oracle which can handle more kinds of stmts.  */
 553           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 554           if (!dr_b)
 555             return false;
 556
 557           bool dependent = false;
 558           /* If we run into a store of this same instance (we've just
 559              marked those) then delay dependence checking until we run
 560              into the last store because this is where it will have
 561              been sunk to (and we verify if we can do that as well).  */
 562           if (gimple_visited_p (stmt))
 563             {
 564               if (stmt != last_store)
 565                 continue;
 566               unsigned i;
 567               gimple *store;
 568               FOR_EACH_VEC_ELT (stores, i, store)
 569                 {
 570                   data_reference *store_dr
 571                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 572                   ddr_p ddr = initialize_data_dependence_relation
 573                                 (dr_a, store_dr, vNULL);
 574                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 575                   free_dependence_relation (ddr);
 576                   if (dependent)
 577                     break;
 578                 }
 579             }
 580           else
 581             {
 582               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 583                                                                dr_b, vNULL);
 584               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 585               free_dependence_relation (ddr);
 586             }
 587           if (dependent)
 588             return false;
 589         }
 590     }
 591   return true;
 592 }
 593
 594
 595 /* Function vect_analyze_data_ref_dependences.
 596
 597    Examine all the data references in the basic-block, and make sure there
 598    do not exist any data dependences between them.  Set *MAX_VF according to
 599    the maximum vectorization factor the data dependences allow.  */
 600
 601 bool
 602 vect_slp_analyze_instance_dependence (slp_instance instance)
 603 {
 604   if (dump_enabled_p ())
 605     dump_printf_loc (MSG_NOTE, vect_location,
 606                      "=== vect_slp_analyze_instance_dependence ===\n");
 607
 608   /* The stores of this instance are at the root of the SLP tree.  */
 609   slp_tree store = SLP_INSTANCE_TREE (instance);
 610   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 611     store = NULL;
 612
 613   /* Verify we can sink stores to the vectorized stmt insert location.  */
 614   gimple *last_store = NULL;
 615   if (store)
 616     {
 617       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 618         return false;
 619
 620       /* Mark stores in this instance and remember the last one.  */
 621       last_store = vect_find_last_scalar_stmt_in_slp (store);
 622       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 623         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 624     }
 625
 626   bool res = true;
 627
 628   /* Verify we can sink loads to the vectorized stmt insert location,
 629      special-casing stores of this instance.  */
 630   slp_tree load;
 631   unsigned int i;
 632   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 633     if (! vect_slp_analyze_node_dependences (instance, load,
 634                                              store
 635                                              ? SLP_TREE_SCALAR_STMTS (store)
 636                                              : vNULL, last_store))
 637       {
 638         res = false;
 639         break;
 640       }
 641
 642   /* Unset the visited flag.  */
 643   if (store)
 644     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 645       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 646
 647   return res;
 648 }
 649
 650 /* Function vect_compute_data_ref_alignment
 651
 652    Compute the misalignment of the data reference DR.
 653
 654    Output:
 655    1. If during the misalignment computation it is found that the data reference
 656       cannot be vectorized then false is returned.
 657    2. DR_MISALIGNMENT (DR) is defined.
 658
 659    FOR NOW: No analysis is actually performed. Misalignment is calculated
 660    only for trivial cases. TODO.  */
 661
 662 bool
 663 vect_compute_data_ref_alignment (struct data_reference *dr)
 664 {
 665   gimple *stmt = DR_STMT (dr);
 666   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 667   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 668   struct loop *loop = NULL;
 669   tree ref = DR_REF (dr);
 670   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 671
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "vect_compute_data_ref_alignment:\n");
 675
 676   if (loop_vinfo)
 677     loop = LOOP_VINFO_LOOP (loop_vinfo);
 678
 679   /* Initialize misalignment to unknown.  */
 680   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 681
 682   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 683   bool step_preserves_misalignment_p;
 684
 685   /* No step for BB vectorization.  */
 686   if (!loop)
 687     {
 688       gcc_assert (integer_zerop (drb->step));
 689       step_preserves_misalignment_p = true;
 690     }
 691
 692   /* In case the dataref is in an inner-loop of the loop that is being
 693      vectorized (LOOP), we use the base and misalignment information
 694      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 695      stays the same throughout the execution of the inner-loop, which is why
 696      we have to check that the stride of the dataref in the inner-loop evenly
 697      divides by the vector size.  */
 698   else if (nested_in_vect_loop_p (loop, stmt))
 699     {
 700       step_preserves_misalignment_p
 701         = (DR_STEP_ALIGNMENT (dr)
 702            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 703
 704       if (dump_enabled_p ())
 705         {
 706           if (step_preserves_misalignment_p)
 707             dump_printf_loc (MSG_NOTE, vect_location,
 708                              "inner step divides the vector-size.\n");
 709           else
 710             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 711                              "inner step doesn't divide the vector-size.\n");
 712         }
 713     }
 714
 715   /* Similarly we can only use base and misalignment information relative to
 716      an innermost loop if the misalignment stays the same throughout the
 717      execution of the loop.  As above, this is the case if the stride of
 718      the dataref evenly divides by the vector size.  */
 719   else
 720     {
 721       unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 722       step_preserves_misalignment_p
 723         = ((DR_STEP_ALIGNMENT (dr) * vf)
 724            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 725
 726       if (!step_preserves_misalignment_p && dump_enabled_p ())
 727         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 728                          "step doesn't divide the vector-size.\n");
 729     }
 730
 731   unsigned int base_alignment = drb->base_alignment;
 732   unsigned int base_misalignment = drb->base_misalignment;
 733   unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
 734   unsigned HOST_WIDE_INT element_alignment
 735     = TYPE_ALIGN_UNIT (TREE_TYPE (vectype));
 736
 737   if (base_alignment >= element_alignment
 738       && (base_misalignment & (element_alignment - 1)) == 0)
 739     DR_VECT_AUX (dr)->base_element_aligned = true;
 740
 741   if (drb->offset_alignment < vector_alignment
 742       || !step_preserves_misalignment_p
 743       /* We need to know whether the step wrt the vectorized loop is
 744          negative when computing the starting misalignment below.  */
 745       || TREE_CODE (drb->step) != INTEGER_CST)
 746     {
 747       if (dump_enabled_p ())
 748         {
 749           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 750                            "Unknown alignment for access: ");
 751           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 752           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 753         }
 754       return true;
 755     }
 756
 757   if (base_alignment < vector_alignment)
 758     {
 759       tree base = drb->base_address;
 760       if (TREE_CODE (base) == ADDR_EXPR)
 761         base = TREE_OPERAND (base, 0);
 762       if (!vect_can_force_dr_alignment_p (base,
 763                                           vector_alignment * BITS_PER_UNIT))
 764         {
 765           if (dump_enabled_p ())
 766             {
 767               dump_printf_loc (MSG_NOTE, vect_location,
 768                                "can't force alignment of ref: ");
 769               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 770               dump_printf (MSG_NOTE, "\n");
 771             }
 772           return true;
 773         }
 774
 775       if (DECL_USER_ALIGN (base))
 776         {
 777           if (dump_enabled_p ())
 778             {
 779               dump_printf_loc (MSG_NOTE, vect_location,
 780                                "not forcing alignment of user-aligned "
 781                                "variable: ");
 782               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 783               dump_printf (MSG_NOTE, "\n");
 784             }
 785           return true;
 786         }
 787
 788       /* Force the alignment of the decl.
 789          NOTE: This is the only change to the code we make during
 790          the analysis phase, before deciding to vectorize the loop.  */
 791       if (dump_enabled_p ())
 792         {
 793           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 794           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 795           dump_printf (MSG_NOTE, "\n");
 796         }
 797
 798       DR_VECT_AUX (dr)->base_decl = base;
 799       DR_VECT_AUX (dr)->base_misaligned = true;
 800       DR_VECT_AUX (dr)->base_element_aligned = true;
 801       base_misalignment = 0;
 802     }
 803   unsigned int misalignment = (base_misalignment
 804                                + TREE_INT_CST_LOW (drb->init));
 805
 806   /* If this is a backward running DR then first access in the larger
 807      vectype actually is N-1 elements before the address in the DR.
 808      Adjust misalign accordingly.  */
 809   if (tree_int_cst_sgn (drb->step) < 0)
 810     /* PLUS because STEP is negative.  */
 811     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 812                      * TREE_INT_CST_LOW (drb->step));
 813
 814   SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
 815
 816   if (dump_enabled_p ())
 817     {
 818       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 819                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 820       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 821       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 822     }
 823
 824   return true;
 825 }
 826
 827
 828 /* Function vect_update_misalignment_for_peel.
 829    Sets DR's misalignment
 830    - to 0 if it has the same alignment as DR_PEEL,
 831    - to the misalignment computed using NPEEL if DR's salignment is known,
 832    - to -1 (unknown) otherwise.
 833
 834    DR - the data reference whose misalignment is to be adjusted.
 835    DR_PEEL - the data reference whose misalignment is being made
 836              zero in the vector loop by the peel.
 837    NPEEL - the number of iterations in the peel loop if the misalignment
 838            of DR_PEEL is known at compile time.  */
 839
 840 static void
 841 vect_update_misalignment_for_peel (struct data_reference *dr,
 842                                    struct data_reference *dr_peel, int npeel)
 843 {
 844   unsigned int i;
 845   vec<dr_p> same_aligned_drs;
 846   struct data_reference *current_dr;
 847   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 848   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 849   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 850   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 851
 852  /* For interleaved data accesses the step in the loop must be multiplied by
 853      the size of the interleaving group.  */
 854   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 855     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 856   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 857     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 858
 859   /* It can be assumed that the data refs with the same alignment as dr_peel
 860      are aligned in the vector loop.  */
 861   same_aligned_drs
 862     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 863   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
 864     {
 865       if (current_dr != dr)
 866         continue;
 867       gcc_assert (!known_alignment_for_access_p (dr)
 868                   || !known_alignment_for_access_p (dr_peel)
 869                   || (DR_MISALIGNMENT (dr) / dr_size
 870                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
 871       SET_DR_MISALIGNMENT (dr, 0);
 872       return;
 873     }
 874
 875   if (known_alignment_for_access_p (dr)
 876       && known_alignment_for_access_p (dr_peel))
 877     {
 878       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 879       int misal = DR_MISALIGNMENT (dr);
 880       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 881       misal += negative ? -npeel * dr_size : npeel * dr_size;
 882       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 883       SET_DR_MISALIGNMENT (dr, misal);
 884       return;
 885     }
 886
 887   if (dump_enabled_p ())
 888     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
 889                      "to unknown (-1).\n");
 890   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 891 }
 892
 893
 894 /* Function verify_data_ref_alignment
 895
 896    Return TRUE if DR can be handled with respect to alignment.  */
 897
 898 static bool
 899 verify_data_ref_alignment (data_reference_p dr)
 900 {
 901   enum dr_alignment_support supportable_dr_alignment
 902     = vect_supportable_dr_alignment (dr, false);
 903   if (!supportable_dr_alignment)
 904     {
 905       if (dump_enabled_p ())
 906         {
 907           if (DR_IS_READ (dr))
 908             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 909                              "not vectorized: unsupported unaligned load.");
 910           else
 911             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 912                              "not vectorized: unsupported unaligned "
 913                              "store.");
 914
 915           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 916                              DR_REF (dr));
 917           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 918         }
 919       return false;
 920     }
 921
 922   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 923     dump_printf_loc (MSG_NOTE, vect_location,
 924                      "Vectorizing an unaligned access.\n");
 925
 926   return true;
 927 }
 928
 929 /* Function vect_verify_datarefs_alignment
 930
 931    Return TRUE if all data references in the loop can be
 932    handled with respect to alignment.  */
 933
 934 bool
 935 vect_verify_datarefs_alignment (loop_vec_info vinfo)
 936 {
 937   vec<data_reference_p> datarefs = vinfo->datarefs;
 938   struct data_reference *dr;
 939   unsigned int i;
 940
 941   FOR_EACH_VEC_ELT (datarefs, i, dr)
 942     {
 943       gimple *stmt = DR_STMT (dr);
 944       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 945
 946       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 947         continue;
 948
 949       /* For interleaving, only the alignment of the first access matters.   */
 950       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 951           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 952         continue;
 953
 954       /* Strided accesses perform only component accesses, alignment is
 955          irrelevant for them.  */
 956       if (STMT_VINFO_STRIDED_P (stmt_info)
 957           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 958         continue;
 959
 960       if (! verify_data_ref_alignment (dr))
 961         return false;
 962     }
 963
 964   return true;
 965 }
 966
 967 /* Given an memory reference EXP return whether its alignment is less
 968    than its size.  */
 969
 970 static bool
 971 not_size_aligned (tree exp)
 972 {
 973   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 974     return true;
 975
 976   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 977           > get_object_alignment (exp));
 978 }
 979
 980 /* Function vector_alignment_reachable_p
 981
 982    Return true if vector alignment for DR is reachable by peeling
 983    a few loop iterations.  Return false otherwise.  */
 984
 985 static bool
 986 vector_alignment_reachable_p (struct data_reference *dr)
 987 {
 988   gimple *stmt = DR_STMT (dr);
 989   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 990   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 991
 992   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 993     {
 994       /* For interleaved access we peel only if number of iterations in
 995          the prolog loop ({VF - misalignment}), is a multiple of the
 996          number of the interleaved accesses.  */
 997       int elem_size, mis_in_elements;
 998       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 999
1000       /* FORNOW: handle only known alignment.  */
1001       if (!known_alignment_for_access_p (dr))
1002         return false;
1003
1004       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1005       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1006
1007       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1008         return false;
1009     }
1010
1011   /* If misalignment is known at the compile time then allow peeling
1012      only if natural alignment is reachable through peeling.  */
1013   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1014     {
1015       HOST_WIDE_INT elmsize =
1016                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1017       if (dump_enabled_p ())
1018         {
1019           dump_printf_loc (MSG_NOTE, vect_location,
1020                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1021           dump_printf (MSG_NOTE,
1022                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1023         }
1024       if (DR_MISALIGNMENT (dr) % elmsize)
1025         {
1026           if (dump_enabled_p ())
1027             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1028                              "data size does not divide the misalignment.\n");
1029           return false;
1030         }
1031     }
1032
1033   if (!known_alignment_for_access_p (dr))
1034     {
1035       tree type = TREE_TYPE (DR_REF (dr));
1036       bool is_packed = not_size_aligned (DR_REF (dr));
1037       if (dump_enabled_p ())
1038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039                          "Unknown misalignment, %snaturally aligned\n",
1040                          is_packed ? "not " : "");
1041       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1042     }
1043
1044   return true;
1045 }
1046
1047
1048 /* Calculate the cost of the memory access represented by DR.  */
1049
1050 static void
1051 vect_get_data_access_cost (struct data_reference *dr,
1052                            unsigned int *inside_cost,
1053                            unsigned int *outside_cost,
1054                            stmt_vector_for_cost *body_cost_vec)
1055 {
1056   gimple *stmt = DR_STMT (dr);
1057   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1058   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1059   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1060   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1061   int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly  */
1062
1063   if (DR_IS_READ (dr))
1064     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1065                         NULL, body_cost_vec, false);
1066   else
1067     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1068
1069   if (dump_enabled_p ())
1070     dump_printf_loc (MSG_NOTE, vect_location,
1071                      "vect_get_data_access_cost: inside_cost = %d, "
1072                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1073 }
1074
1075
1076 typedef struct _vect_peel_info
1077 {
1078   struct data_reference *dr;
1079   int npeel;
1080   unsigned int count;
1081 } *vect_peel_info;
1082
1083 typedef struct _vect_peel_extended_info
1084 {
1085   struct _vect_peel_info peel_info;
1086   unsigned int inside_cost;
1087   unsigned int outside_cost;
1088   stmt_vector_for_cost body_cost_vec;
1089 } *vect_peel_extended_info;
1090
1091
1092 /* Peeling hashtable helpers.  */
1093
1094 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1095 {
1096   static inline hashval_t hash (const _vect_peel_info *);
1097   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1098 };
1099
1100 inline hashval_t
1101 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1102 {
1103   return (hashval_t) peel_info->npeel;
1104 }
1105
1106 inline bool
1107 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1108 {
1109   return (a->npeel == b->npeel);
1110 }
1111
1112
1113 /* Insert DR into peeling hash table with NPEEL as key.  */
1114
1115 static void
1116 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1117                           loop_vec_info loop_vinfo, struct data_reference *dr,
1118                           int npeel)
1119 {
1120   struct _vect_peel_info elem, *slot;
1121   _vect_peel_info **new_slot;
1122   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1123
1124   elem.npeel = npeel;
1125   slot = peeling_htab->find (&elem);
1126   if (slot)
1127     slot->count++;
1128   else
1129     {
1130       slot = XNEW (struct _vect_peel_info);
1131       slot->npeel = npeel;
1132       slot->dr = dr;
1133       slot->count = 1;
1134       new_slot = peeling_htab->find_slot (slot, INSERT);
1135       *new_slot = slot;
1136     }
1137
1138   if (!supportable_dr_alignment
1139       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1140     slot->count += VECT_MAX_COST;
1141 }
1142
1143
1144 /* Traverse peeling hash table to find peeling option that aligns maximum
1145    number of data accesses.  */
1146
1147 int
1148 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1149                                      _vect_peel_extended_info *max)
1150 {
1151   vect_peel_info elem = *slot;
1152
1153   if (elem->count > max->peel_info.count
1154       || (elem->count == max->peel_info.count
1155           && max->peel_info.npeel > elem->npeel))
1156     {
1157       max->peel_info.npeel = elem->npeel;
1158       max->peel_info.count = elem->count;
1159       max->peel_info.dr = elem->dr;
1160     }
1161
1162   return 1;
1163 }
1164
1165 /* Get the costs of peeling NPEEL iterations checking data access costs
1166    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1167    misalignment will be zero after peeling.  */
1168
1169 static void
1170 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1171                                 unsigned int *inside_cost,
1172                                 unsigned int *outside_cost,
1173                                 stmt_vector_for_cost *body_cost_vec,
1174                                 unsigned int npeel,
1175                                 bool unknown_misalignment)
1176 {
1177   gimple *stmt = DR_STMT (dr0);
1178   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1179   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1180   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1181
1182   unsigned i;
1183   data_reference *dr;
1184
1185   FOR_EACH_VEC_ELT (datarefs, i, dr)
1186     {
1187       stmt = DR_STMT (dr);
1188       stmt_info = vinfo_for_stmt (stmt);
1189       /* For interleaving, only the alignment of the first access
1190          matters.  */
1191       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1192           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1193         continue;
1194
1195       /* Strided accesses perform only component accesses, alignment is
1196          irrelevant for them.  */
1197       if (STMT_VINFO_STRIDED_P (stmt_info)
1198           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1199         continue;
1200
1201       int save_misalignment;
1202       save_misalignment = DR_MISALIGNMENT (dr);
1203       if (unknown_misalignment && dr == dr0)
1204         SET_DR_MISALIGNMENT (dr, 0);
1205       else
1206         vect_update_misalignment_for_peel (dr, dr0, npeel);
1207       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1208                                  body_cost_vec);
1209       SET_DR_MISALIGNMENT (dr, save_misalignment);
1210     }
1211 }
1212
1213 /* Traverse peeling hash table and calculate cost for each peeling option.
1214    Find the one with the lowest cost.  */
1215
1216 int
1217 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1218                                    _vect_peel_extended_info *min)
1219 {
1220   vect_peel_info elem = *slot;
1221   int dummy;
1222   unsigned int inside_cost = 0, outside_cost = 0;
1223   gimple *stmt = DR_STMT (elem->dr);
1224   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1225   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1226   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1227                        epilogue_cost_vec;
1228
1229   prologue_cost_vec.create (2);
1230   body_cost_vec.create (2);
1231   epilogue_cost_vec.create (2);
1232
1233   vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1234                                   &body_cost_vec, elem->npeel, false);
1235
1236   outside_cost += vect_get_known_peeling_cost
1237     (loop_vinfo, elem->npeel, &dummy,
1238      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1239      &prologue_cost_vec, &epilogue_cost_vec);
1240
1241   /* Prologue and epilogue costs are added to the target model later.
1242      These costs depend only on the scalar iteration cost, the
1243      number of peeling iterations finally chosen, and the number of
1244      misaligned statements.  So discard the information found here.  */
1245   prologue_cost_vec.release ();
1246   epilogue_cost_vec.release ();
1247
1248   if (inside_cost < min->inside_cost
1249       || (inside_cost == min->inside_cost
1250           && outside_cost < min->outside_cost))
1251     {
1252       min->inside_cost = inside_cost;
1253       min->outside_cost = outside_cost;
1254       min->body_cost_vec.release ();
1255       min->body_cost_vec = body_cost_vec;
1256       min->peel_info.dr = elem->dr;
1257       min->peel_info.npeel = elem->npeel;
1258       min->peel_info.count = elem->count;
1259     }
1260   else
1261     body_cost_vec.release ();
1262
1263   return 1;
1264 }
1265
1266
1267 /* Choose best peeling option by traversing peeling hash table and either
1268    choosing an option with the lowest cost (if cost model is enabled) or the
1269    option that aligns as many accesses as possible.  */
1270
1271 static struct _vect_peel_extended_info
1272 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1273                                        loop_vec_info loop_vinfo,
1274                                        unsigned int *npeel,
1275                                        stmt_vector_for_cost *body_cost_vec)
1276 {
1277    struct _vect_peel_extended_info res;
1278
1279    res.peel_info.dr = NULL;
1280    res.body_cost_vec = stmt_vector_for_cost ();
1281
1282    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1283      {
1284        res.inside_cost = INT_MAX;
1285        res.outside_cost = INT_MAX;
1286        peeling_htab->traverse <_vect_peel_extended_info *,
1287                                vect_peeling_hash_get_lowest_cost> (&res);
1288      }
1289    else
1290      {
1291        res.peel_info.count = 0;
1292        peeling_htab->traverse <_vect_peel_extended_info *,
1293                                vect_peeling_hash_get_most_frequent> (&res);
1294        res.inside_cost = 0;
1295        res.outside_cost = 0;
1296      }
1297
1298    *npeel = res.peel_info.npeel;
1299    *body_cost_vec = res.body_cost_vec;
1300    return res;
1301 }
1302
1303 /* Return true if the new peeling NPEEL is supported.  */
1304
1305 static bool
1306 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1307                           unsigned npeel)
1308 {
1309   unsigned i;
1310   struct data_reference *dr = NULL;
1311   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1312   gimple *stmt;
1313   stmt_vec_info stmt_info;
1314   enum dr_alignment_support supportable_dr_alignment;
1315
1316   /* Ensure that all data refs can be vectorized after the peel.  */
1317   FOR_EACH_VEC_ELT (datarefs, i, dr)
1318     {
1319       int save_misalignment;
1320
1321       if (dr == dr0)
1322         continue;
1323
1324       stmt = DR_STMT (dr);
1325       stmt_info = vinfo_for_stmt (stmt);
1326       /* For interleaving, only the alignment of the first access
1327          matters.  */
1328       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1329           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1330         continue;
1331
1332       /* Strided accesses perform only component accesses, alignment is
1333          irrelevant for them.  */
1334       if (STMT_VINFO_STRIDED_P (stmt_info)
1335           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1336         continue;
1337
1338       save_misalignment = DR_MISALIGNMENT (dr);
1339       vect_update_misalignment_for_peel (dr, dr0, npeel);
1340       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1341       SET_DR_MISALIGNMENT (dr, save_misalignment);
1342
1343       if (!supportable_dr_alignment)
1344         return false;
1345     }
1346
1347   return true;
1348 }
1349
1350 /* Function vect_enhance_data_refs_alignment
1351
1352    This pass will use loop versioning and loop peeling in order to enhance
1353    the alignment of data references in the loop.
1354
1355    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1356    original loop is to be vectorized.  Any other loops that are created by
1357    the transformations performed in this pass - are not supposed to be
1358    vectorized.  This restriction will be relaxed.
1359
1360    This pass will require a cost model to guide it whether to apply peeling
1361    or versioning or a combination of the two.  For example, the scheme that
1362    intel uses when given a loop with several memory accesses, is as follows:
1363    choose one memory access ('p') which alignment you want to force by doing
1364    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1365    other accesses are not necessarily aligned, or (2) use loop versioning to
1366    generate one loop in which all accesses are aligned, and another loop in
1367    which only 'p' is necessarily aligned.
1368
1369    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1370    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1371    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1372
1373    Devising a cost model is the most critical aspect of this work.  It will
1374    guide us on which access to peel for, whether to use loop versioning, how
1375    many versions to create, etc.  The cost model will probably consist of
1376    generic considerations as well as target specific considerations (on
1377    powerpc for example, misaligned stores are more painful than misaligned
1378    loads).
1379
1380    Here are the general steps involved in alignment enhancements:
1381
1382      -- original loop, before alignment analysis:
1383         for (i=0; i<N; i++){
1384           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1385           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1386         }
1387
1388      -- After vect_compute_data_refs_alignment:
1389         for (i=0; i<N; i++){
1390           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1391           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1392         }
1393
1394      -- Possibility 1: we do loop versioning:
1395      if (p is aligned) {
1396         for (i=0; i<N; i++){    # loop 1A
1397           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1398           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1399         }
1400      }
1401      else {
1402         for (i=0; i<N; i++){    # loop 1B
1403           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1404           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1405         }
1406      }
1407
1408      -- Possibility 2: we do loop peeling:
1409      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1410         x = q[i];
1411         p[i] = y;
1412      }
1413      for (i = 3; i < N; i++){   # loop 2A
1414         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1415         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1416      }
1417
1418      -- Possibility 3: combination of loop peeling and versioning:
1419      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1420         x = q[i];
1421         p[i] = y;
1422      }
1423      if (p is aligned) {
1424         for (i = 3; i<N; i++){  # loop 3A
1425           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1426           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1427         }
1428      }
1429      else {
1430         for (i = 3; i<N; i++){  # loop 3B
1431           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1432           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1433         }
1434      }
1435
1436      These loops are later passed to loop_transform to be vectorized.  The
1437      vectorizer will use the alignment information to guide the transformation
1438      (whether to generate regular loads/stores, or with special handling for
1439      misalignment).  */
1440
1441 bool
1442 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1443 {
1444   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1445   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1446   enum dr_alignment_support supportable_dr_alignment;
1447   struct data_reference *dr0 = NULL, *first_store = NULL;
1448   struct data_reference *dr;
1449   unsigned int i, j;
1450   bool do_peeling = false;
1451   bool do_versioning = false;
1452   bool stat;
1453   gimple *stmt;
1454   stmt_vec_info stmt_info;
1455   unsigned int npeel = 0;
1456   bool one_misalignment_known = false;
1457   bool one_misalignment_unknown = false;
1458   bool one_dr_unsupportable = false;
1459   struct data_reference *unsupportable_dr = NULL;
1460   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1461   unsigned possible_npeel_number = 1;
1462   tree vectype;
1463   unsigned int nelements, mis, same_align_drs_max = 0;
1464   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1465   hash_table<peel_info_hasher> peeling_htab (1);
1466
1467   if (dump_enabled_p ())
1468     dump_printf_loc (MSG_NOTE, vect_location,
1469                      "=== vect_enhance_data_refs_alignment ===\n");
1470
1471   /* Reset data so we can safely be called multiple times.  */
1472   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1473   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1474
1475   /* While cost model enhancements are expected in the future, the high level
1476      view of the code at this time is as follows:
1477
1478      A) If there is a misaligned access then see if peeling to align
1479         this access can make all data references satisfy
1480         vect_supportable_dr_alignment.  If so, update data structures
1481         as needed and return true.
1482
1483      B) If peeling wasn't possible and there is a data reference with an
1484         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1485         then see if loop versioning checks can be used to make all data
1486         references satisfy vect_supportable_dr_alignment.  If so, update
1487         data structures as needed and return true.
1488
1489      C) If neither peeling nor versioning were successful then return false if
1490         any data reference does not satisfy vect_supportable_dr_alignment.
1491
1492      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1493
1494      Note, Possibility 3 above (which is peeling and versioning together) is not
1495      being done at this time.  */
1496
1497   /* (1) Peeling to force alignment.  */
1498
1499   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1500      Considerations:
1501      + How many accesses will become aligned due to the peeling
1502      - How many accesses will become unaligned due to the peeling,
1503        and the cost of misaligned accesses.
1504      - The cost of peeling (the extra runtime checks, the increase
1505        in code size).  */
1506
1507   FOR_EACH_VEC_ELT (datarefs, i, dr)
1508     {
1509       stmt = DR_STMT (dr);
1510       stmt_info = vinfo_for_stmt (stmt);
1511
1512       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1513         continue;
1514
1515       /* For interleaving, only the alignment of the first access
1516          matters.  */
1517       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1518           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1519         continue;
1520
1521       /* For invariant accesses there is nothing to enhance.  */
1522       if (integer_zerop (DR_STEP (dr)))
1523         continue;
1524
1525       /* Strided accesses perform only component accesses, alignment is
1526          irrelevant for them.  */
1527       if (STMT_VINFO_STRIDED_P (stmt_info)
1528           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1529         continue;
1530
1531       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1532       do_peeling = vector_alignment_reachable_p (dr);
1533       if (do_peeling)
1534         {
1535           if (known_alignment_for_access_p (dr))
1536             {
1537               unsigned int npeel_tmp = 0;
1538               bool negative = tree_int_cst_compare (DR_STEP (dr),
1539                                                     size_zero_node) < 0;
1540
1541               vectype = STMT_VINFO_VECTYPE (stmt_info);
1542               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1543               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1544                                                 TREE_TYPE (DR_REF (dr))));
1545               if (DR_MISALIGNMENT (dr) != 0)
1546                 npeel_tmp = (negative ? (mis - nelements)
1547                              : (nelements - mis)) & (nelements - 1);
1548
1549               /* For multiple types, it is possible that the bigger type access
1550                  will have more than one peeling option.  E.g., a loop with two
1551                  types: one of size (vector size / 4), and the other one of
1552                  size (vector size / 8).  Vectorization factor will 8.  If both
1553                  accesses are misaligned by 3, the first one needs one scalar
1554                  iteration to be aligned, and the second one needs 5.  But the
1555                  first one will be aligned also by peeling 5 scalar
1556                  iterations, and in that case both accesses will be aligned.
1557                  Hence, except for the immediate peeling amount, we also want
1558                  to try to add full vector size, while we don't exceed
1559                  vectorization factor.
1560                  We do this automatically for cost model, since we calculate
1561                  cost for every peeling option.  */
1562               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1563                 {
1564                   if (STMT_SLP_TYPE (stmt_info))
1565                     possible_npeel_number
1566                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1567                   else
1568                     possible_npeel_number = vf / nelements;
1569
1570                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1571                      allow peeling NELEMENTS.  */
1572                   if (DR_MISALIGNMENT (dr) == 0)
1573                     possible_npeel_number++;
1574                 }
1575
1576               /* Save info about DR in the hash table.  Also include peeling
1577                  amounts according to the explanation above.  */
1578               for (j = 0; j < possible_npeel_number; j++)
1579                 {
1580                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1581                                             dr, npeel_tmp);
1582                   npeel_tmp += nelements;
1583                 }
1584
1585               one_misalignment_known = true;
1586             }
1587           else
1588             {
1589               /* If we don't know any misalignment values, we prefer
1590                  peeling for data-ref that has the maximum number of data-refs
1591                  with the same alignment, unless the target prefers to align
1592                  stores over load.  */
1593               unsigned same_align_drs
1594                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1595               if (!dr0
1596                   || same_align_drs_max < same_align_drs)
1597                 {
1598                   same_align_drs_max = same_align_drs;
1599                   dr0 = dr;
1600                 }
1601               /* For data-refs with the same number of related
1602                  accesses prefer the one where the misalign
1603                  computation will be invariant in the outermost loop.  */
1604               else if (same_align_drs_max == same_align_drs)
1605                 {
1606                   struct loop *ivloop0, *ivloop;
1607                   ivloop0 = outermost_invariant_loop_for_expr
1608                     (loop, DR_BASE_ADDRESS (dr0));
1609                   ivloop = outermost_invariant_loop_for_expr
1610                     (loop, DR_BASE_ADDRESS (dr));
1611                   if ((ivloop && !ivloop0)
1612                       || (ivloop && ivloop0
1613                           && flow_loop_nested_p (ivloop, ivloop0)))
1614                     dr0 = dr;
1615                 }
1616
1617               one_misalignment_unknown = true;
1618
1619               /* Check for data refs with unsupportable alignment that
1620                  can be peeled.  */
1621               if (!supportable_dr_alignment)
1622               {
1623                 one_dr_unsupportable = true;
1624                 unsupportable_dr = dr;
1625               }
1626
1627               if (!first_store && DR_IS_WRITE (dr))
1628                 first_store = dr;
1629             }
1630         }
1631       else
1632         {
1633           if (!aligned_access_p (dr))
1634             {
1635               if (dump_enabled_p ())
1636                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                                  "vector alignment may not be reachable\n");
1638               break;
1639             }
1640         }
1641     }
1642
1643   /* Check if we can possibly peel the loop.  */
1644   if (!vect_can_advance_ivs_p (loop_vinfo)
1645       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1646       || loop->inner)
1647     do_peeling = false;
1648
1649   struct _vect_peel_extended_info peel_for_known_alignment;
1650   struct _vect_peel_extended_info peel_for_unknown_alignment;
1651   struct _vect_peel_extended_info best_peel;
1652
1653   peel_for_unknown_alignment.inside_cost = INT_MAX;
1654   peel_for_unknown_alignment.outside_cost = INT_MAX;
1655   peel_for_unknown_alignment.peel_info.count = 0;
1656
1657   if (do_peeling
1658       && one_misalignment_unknown)
1659     {
1660       /* Check if the target requires to prefer stores over loads, i.e., if
1661          misaligned stores are more expensive than misaligned loads (taking
1662          drs with same alignment into account).  */
1663       unsigned int load_inside_cost = 0;
1664       unsigned int load_outside_cost = 0;
1665       unsigned int store_inside_cost = 0;
1666       unsigned int store_outside_cost = 0;
1667
1668       stmt_vector_for_cost dummy;
1669       dummy.create (2);
1670       vect_get_peeling_costs_all_drs (dr0,
1671                                       &load_inside_cost,
1672                                       &load_outside_cost,
1673                                       &dummy, vf / 2, true);
1674       dummy.release ();
1675
1676       if (first_store)
1677         {
1678           dummy.create (2);
1679           vect_get_peeling_costs_all_drs (first_store,
1680                                           &store_inside_cost,
1681                                           &store_outside_cost,
1682                                           &dummy, vf / 2, true);
1683           dummy.release ();
1684         }
1685       else
1686         {
1687           store_inside_cost = INT_MAX;
1688           store_outside_cost = INT_MAX;
1689         }
1690
1691       if (load_inside_cost > store_inside_cost
1692           || (load_inside_cost == store_inside_cost
1693               && load_outside_cost > store_outside_cost))
1694         {
1695           dr0 = first_store;
1696           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1697           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1698         }
1699       else
1700         {
1701           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1702           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1703         }
1704
1705       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1706       prologue_cost_vec.create (2);
1707       epilogue_cost_vec.create (2);
1708
1709       int dummy2;
1710       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1711         (loop_vinfo, vf / 2, &dummy2,
1712          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1713          &prologue_cost_vec, &epilogue_cost_vec);
1714
1715       prologue_cost_vec.release ();
1716       epilogue_cost_vec.release ();
1717
1718       peel_for_unknown_alignment.peel_info.count = 1
1719         + STMT_VINFO_SAME_ALIGN_REFS
1720         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1721     }
1722
1723   peel_for_unknown_alignment.peel_info.npeel = 0;
1724   peel_for_unknown_alignment.peel_info.dr = dr0;
1725
1726   best_peel = peel_for_unknown_alignment;
1727
1728   peel_for_known_alignment.inside_cost = INT_MAX;
1729   peel_for_known_alignment.outside_cost = INT_MAX;
1730   peel_for_known_alignment.peel_info.count = 0;
1731   peel_for_known_alignment.peel_info.dr = NULL;
1732
1733   if (do_peeling && one_misalignment_known)
1734     {
1735       /* Peeling is possible, but there is no data access that is not supported
1736          unless aligned.  So we try to choose the best possible peeling from
1737          the hash table.  */
1738       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1739         (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1740     }
1741
1742   /* Compare costs of peeling for known and unknown alignment. */
1743   if (peel_for_known_alignment.peel_info.dr != NULL
1744       && peel_for_unknown_alignment.inside_cost
1745       >= peel_for_known_alignment.inside_cost)
1746     {
1747       best_peel = peel_for_known_alignment;
1748
1749       /* If the best peeling for known alignment has NPEEL == 0, perform no
1750          peeling at all except if there is an unsupportable dr that we can
1751          align.  */
1752       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1753         do_peeling = false;
1754     }
1755
1756   /* If there is an unsupportable data ref, prefer this over all choices so far
1757      since we'd have to discard a chosen peeling except when it accidentally
1758      aligned the unsupportable data ref.  */
1759   if (one_dr_unsupportable)
1760     dr0 = unsupportable_dr;
1761   else if (do_peeling)
1762     {
1763       /* Calculate the penalty for no peeling, i.e. leaving everything
1764          unaligned.
1765          TODO: Adapt vect_get_peeling_costs_all_drs and use here.  */
1766       unsigned nopeel_inside_cost = 0;
1767       unsigned nopeel_outside_cost = 0;
1768
1769       stmt_vector_for_cost dummy;
1770       dummy.create (2);
1771       FOR_EACH_VEC_ELT (datarefs, i, dr)
1772         vect_get_data_access_cost (dr, &nopeel_inside_cost,
1773                                    &nopeel_outside_cost, &dummy);
1774       dummy.release ();
1775
1776       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1777          costs will be recorded.  */
1778       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1779       prologue_cost_vec.create (2);
1780       epilogue_cost_vec.create (2);
1781
1782       int dummy2;
1783       nopeel_outside_cost += vect_get_known_peeling_cost
1784         (loop_vinfo, 0, &dummy2,
1785          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1786          &prologue_cost_vec, &epilogue_cost_vec);
1787
1788       prologue_cost_vec.release ();
1789       epilogue_cost_vec.release ();
1790
1791       npeel = best_peel.peel_info.npeel;
1792       dr0 = best_peel.peel_info.dr;
1793
1794       /* If no peeling is not more expensive than the best peeling we
1795          have so far, don't perform any peeling.  */
1796       if (nopeel_inside_cost <= best_peel.inside_cost)
1797         do_peeling = false;
1798     }
1799
1800   if (do_peeling)
1801     {
1802       stmt = DR_STMT (dr0);
1803       stmt_info = vinfo_for_stmt (stmt);
1804       vectype = STMT_VINFO_VECTYPE (stmt_info);
1805       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1806
1807       if (known_alignment_for_access_p (dr0))
1808         {
1809           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1810                                                 size_zero_node) < 0;
1811           if (!npeel)
1812             {
1813               /* Since it's known at compile time, compute the number of
1814                  iterations in the peeled loop (the peeling factor) for use in
1815                  updating DR_MISALIGNMENT values.  The peeling factor is the
1816                  vectorization factor minus the misalignment as an element
1817                  count.  */
1818               mis = DR_MISALIGNMENT (dr0);
1819               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1820               npeel = ((negative ? mis - nelements : nelements - mis)
1821                        & (nelements - 1));
1822             }
1823
1824           /* For interleaved data access every iteration accesses all the
1825              members of the group, therefore we divide the number of iterations
1826              by the group size.  */
1827           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1828           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829             npeel /= GROUP_SIZE (stmt_info);
1830
1831           if (dump_enabled_p ())
1832             dump_printf_loc (MSG_NOTE, vect_location,
1833                              "Try peeling by %d\n", npeel);
1834         }
1835
1836       /* Ensure that all datarefs can be vectorized after the peel.  */
1837       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1838         do_peeling = false;
1839
1840       /* Check if all datarefs are supportable and log.  */
1841       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1842         {
1843           stat = vect_verify_datarefs_alignment (loop_vinfo);
1844           if (!stat)
1845             do_peeling = false;
1846           else
1847             {
1848               body_cost_vec.release ();
1849               return stat;
1850             }
1851         }
1852
1853       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1854       if (do_peeling)
1855         {
1856           unsigned max_allowed_peel
1857             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1858           if (max_allowed_peel != (unsigned)-1)
1859             {
1860               unsigned max_peel = npeel;
1861               if (max_peel == 0)
1862                 {
1863                   gimple *dr_stmt = DR_STMT (dr0);
1864                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1865                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1866                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1867                 }
1868               if (max_peel > max_allowed_peel)
1869                 {
1870                   do_peeling = false;
1871                   if (dump_enabled_p ())
1872                     dump_printf_loc (MSG_NOTE, vect_location,
1873                         "Disable peeling, max peels reached: %d\n", max_peel);
1874                 }
1875             }
1876         }
1877
1878       /* Cost model #2 - if peeling may result in a remaining loop not
1879          iterating enough to be vectorized then do not peel.  */
1880       if (do_peeling
1881           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1882         {
1883           unsigned max_peel
1884             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1885           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1886               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1887             do_peeling = false;
1888         }
1889
1890       if (do_peeling)
1891         {
1892           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1893              If the misalignment of DR_i is identical to that of dr0 then set
1894              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1895              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1896              by the peeling factor times the element size of DR_i (MOD the
1897              vectorization factor times the size).  Otherwise, the
1898              misalignment of DR_i must be set to unknown.  */
1899           FOR_EACH_VEC_ELT (datarefs, i, dr)
1900             if (dr != dr0)
1901               {
1902                 /* Strided accesses perform only component accesses, alignment
1903                    is irrelevant for them.  */
1904                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1905                 if (STMT_VINFO_STRIDED_P (stmt_info)
1906                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1907                   continue;
1908
1909                 vect_update_misalignment_for_peel (dr, dr0, npeel);
1910               }
1911
1912           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1913           if (npeel)
1914             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1915           else
1916             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1917               = DR_MISALIGNMENT (dr0);
1918           SET_DR_MISALIGNMENT (dr0, 0);
1919           if (dump_enabled_p ())
1920             {
1921               dump_printf_loc (MSG_NOTE, vect_location,
1922                                "Alignment of access forced using peeling.\n");
1923               dump_printf_loc (MSG_NOTE, vect_location,
1924                                "Peeling for alignment will be applied.\n");
1925             }
1926           /* The inside-loop cost will be accounted for in vectorizable_load
1927              and vectorizable_store correctly with adjusted alignments.
1928              Drop the body_cst_vec on the floor here.  */
1929           body_cost_vec.release ();
1930
1931           stat = vect_verify_datarefs_alignment (loop_vinfo);
1932           gcc_assert (stat);
1933           return stat;
1934         }
1935     }
1936
1937   body_cost_vec.release ();
1938
1939   /* (2) Versioning to force alignment.  */
1940
1941   /* Try versioning if:
1942      1) optimize loop for speed
1943      2) there is at least one unsupported misaligned data ref with an unknown
1944         misalignment, and
1945      3) all misaligned data refs with a known misalignment are supported, and
1946      4) the number of runtime alignment checks is within reason.  */
1947
1948   do_versioning =
1949         optimize_loop_nest_for_speed_p (loop)
1950         && (!loop->inner); /* FORNOW */
1951
1952   if (do_versioning)
1953     {
1954       FOR_EACH_VEC_ELT (datarefs, i, dr)
1955         {
1956           stmt = DR_STMT (dr);
1957           stmt_info = vinfo_for_stmt (stmt);
1958
1959           /* For interleaving, only the alignment of the first access
1960              matters.  */
1961           if (aligned_access_p (dr)
1962               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1963                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1964             continue;
1965
1966           if (STMT_VINFO_STRIDED_P (stmt_info))
1967             {
1968               /* Strided loads perform only component accesses, alignment is
1969                  irrelevant for them.  */
1970               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1971                 continue;
1972               do_versioning = false;
1973               break;
1974             }
1975
1976           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1977
1978           if (!supportable_dr_alignment)
1979             {
1980               gimple *stmt;
1981               int mask;
1982               tree vectype;
1983
1984               if (known_alignment_for_access_p (dr)
1985                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1986                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1987                 {
1988                   do_versioning = false;
1989                   break;
1990                 }
1991
1992               stmt = DR_STMT (dr);
1993               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1994               gcc_assert (vectype);
1995
1996               /* The rightmost bits of an aligned address must be zeros.
1997                  Construct the mask needed for this test.  For example,
1998                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1999                  mask must be 15 = 0xf. */
2000               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2001
2002               /* FORNOW: use the same mask to test all potentially unaligned
2003                  references in the loop.  The vectorizer currently supports
2004                  a single vector size, see the reference to
2005                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2006                  vectorization factor is computed.  */
2007               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2008                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2009               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2010               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2011                       DR_STMT (dr));
2012             }
2013         }
2014
2015       /* Versioning requires at least one misaligned data reference.  */
2016       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2017         do_versioning = false;
2018       else if (!do_versioning)
2019         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2020     }
2021
2022   if (do_versioning)
2023     {
2024       vec<gimple *> may_misalign_stmts
2025         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2026       gimple *stmt;
2027
2028       /* It can now be assumed that the data references in the statements
2029          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2030          of the loop being vectorized.  */
2031       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2032         {
2033           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2034           dr = STMT_VINFO_DATA_REF (stmt_info);
2035           SET_DR_MISALIGNMENT (dr, 0);
2036           if (dump_enabled_p ())
2037             dump_printf_loc (MSG_NOTE, vect_location,
2038                              "Alignment of access forced using versioning.\n");
2039         }
2040
2041       if (dump_enabled_p ())
2042         dump_printf_loc (MSG_NOTE, vect_location,
2043                          "Versioning for alignment will be applied.\n");
2044
2045       /* Peeling and versioning can't be done together at this time.  */
2046       gcc_assert (! (do_peeling && do_versioning));
2047
2048       stat = vect_verify_datarefs_alignment (loop_vinfo);
2049       gcc_assert (stat);
2050       return stat;
2051     }
2052
2053   /* This point is reached if neither peeling nor versioning is being done.  */
2054   gcc_assert (! (do_peeling || do_versioning));
2055
2056   stat = vect_verify_datarefs_alignment (loop_vinfo);
2057   return stat;
2058 }
2059
2060
2061 /* Function vect_find_same_alignment_drs.
2062
2063    Update group and alignment relations according to the chosen
2064    vectorization factor.  */
2065
2066 static void
2067 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2068 {
2069   struct data_reference *dra = DDR_A (ddr);
2070   struct data_reference *drb = DDR_B (ddr);
2071   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2072   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2073
2074   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2075     return;
2076
2077   if (dra == drb)
2078     return;
2079
2080   if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2081                         OEP_ADDRESS_OF)
2082       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2083       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2084     return;
2085
2086   /* Two references with distance zero have the same alignment.  */
2087   offset_int diff = (wi::to_offset (DR_INIT (dra))
2088                      - wi::to_offset (DR_INIT (drb)));
2089   if (diff != 0)
2090     {
2091       /* Get the wider of the two alignments.  */
2092       unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2093       unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2094       unsigned int max_align = MAX (align_a, align_b);
2095
2096       /* Require the gap to be a multiple of the larger vector alignment.  */
2097       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2098         return;
2099     }
2100
2101   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2102   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2103   if (dump_enabled_p ())
2104     {
2105       dump_printf_loc (MSG_NOTE, vect_location,
2106                        "accesses have the same alignment: ");
2107       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2108       dump_printf (MSG_NOTE,  " and ");
2109       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2110       dump_printf (MSG_NOTE, "\n");
2111     }
2112 }
2113
2114
2115 /* Function vect_analyze_data_refs_alignment
2116
2117    Analyze the alignment of the data-references in the loop.
2118    Return FALSE if a data reference is found that cannot be vectorized.  */
2119
2120 bool
2121 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2122 {
2123   if (dump_enabled_p ())
2124     dump_printf_loc (MSG_NOTE, vect_location,
2125                      "=== vect_analyze_data_refs_alignment ===\n");
2126
2127   /* Mark groups of data references with same alignment using
2128      data dependence information.  */
2129   vec<ddr_p> ddrs = vinfo->ddrs;
2130   struct data_dependence_relation *ddr;
2131   unsigned int i;
2132
2133   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2134     vect_find_same_alignment_drs (ddr);
2135
2136   vec<data_reference_p> datarefs = vinfo->datarefs;
2137   struct data_reference *dr;
2138
2139   FOR_EACH_VEC_ELT (datarefs, i, dr)
2140     {
2141       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2142       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2143           && !vect_compute_data_ref_alignment (dr))
2144         {
2145           /* Strided accesses perform only component accesses, misalignment
2146              information is irrelevant for them.  */
2147           if (STMT_VINFO_STRIDED_P (stmt_info)
2148               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2149             continue;
2150
2151           if (dump_enabled_p ())
2152             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153                              "not vectorized: can't calculate alignment "
2154                              "for data ref.\n");
2155
2156           return false;
2157         }
2158     }
2159
2160   return true;
2161 }
2162
2163
2164 /* Analyze alignment of DRs of stmts in NODE.  */
2165
2166 static bool
2167 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2168 {
2169   /* We vectorize from the first scalar stmt in the node unless
2170      the node is permuted in which case we start from the first
2171      element in the group.  */
2172   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2173   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2174   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2175     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2176
2177   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2178   if (! vect_compute_data_ref_alignment (dr)
2179       /* For creating the data-ref pointer we need alignment of the
2180          first element anyway.  */
2181       || (dr != first_dr
2182           && ! vect_compute_data_ref_alignment (first_dr))
2183       || ! verify_data_ref_alignment (dr))
2184     {
2185       if (dump_enabled_p ())
2186         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187                          "not vectorized: bad data alignment in basic "
2188                          "block.\n");
2189       return false;
2190     }
2191
2192   return true;
2193 }
2194
2195 /* Function vect_slp_analyze_instance_alignment
2196
2197    Analyze the alignment of the data-references in the SLP instance.
2198    Return FALSE if a data reference is found that cannot be vectorized.  */
2199
2200 bool
2201 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2202 {
2203   if (dump_enabled_p ())
2204     dump_printf_loc (MSG_NOTE, vect_location,
2205                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2206
2207   slp_tree node;
2208   unsigned i;
2209   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2210     if (! vect_slp_analyze_and_verify_node_alignment (node))
2211       return false;
2212
2213   node = SLP_INSTANCE_TREE (instance);
2214   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2215       && ! vect_slp_analyze_and_verify_node_alignment
2216              (SLP_INSTANCE_TREE (instance)))
2217     return false;
2218
2219   return true;
2220 }
2221
2222
2223 /* Analyze groups of accesses: check that DR belongs to a group of
2224    accesses of legal size, step, etc.  Detect gaps, single element
2225    interleaving, and other special cases. Set grouped access info.
2226    Collect groups of strided stores for further use in SLP analysis.
2227    Worker for vect_analyze_group_access.  */
2228
2229 static bool
2230 vect_analyze_group_access_1 (struct data_reference *dr)
2231 {
2232   tree step = DR_STEP (dr);
2233   tree scalar_type = TREE_TYPE (DR_REF (dr));
2234   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2235   gimple *stmt = DR_STMT (dr);
2236   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2237   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2238   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2239   HOST_WIDE_INT dr_step = -1;
2240   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2241   bool slp_impossible = false;
2242
2243   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2244      size of the interleaving group (including gaps).  */
2245   if (tree_fits_shwi_p (step))
2246     {
2247       dr_step = tree_to_shwi (step);
2248       /* Check that STEP is a multiple of type size.  Otherwise there is
2249          a non-element-sized gap at the end of the group which we
2250          cannot represent in GROUP_GAP or GROUP_SIZE.
2251          ???  As we can handle non-constant step fine here we should
2252          simply remove uses of GROUP_GAP between the last and first
2253          element and instead rely on DR_STEP.  GROUP_SIZE then would
2254          simply not include that gap.  */
2255       if ((dr_step % type_size) != 0)
2256         {
2257           if (dump_enabled_p ())
2258             {
2259               dump_printf_loc (MSG_NOTE, vect_location,
2260                                "Step ");
2261               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2262               dump_printf (MSG_NOTE,
2263                            " is not a multiple of the element size for ");
2264               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2265               dump_printf (MSG_NOTE, "\n");
2266             }
2267           return false;
2268         }
2269       groupsize = absu_hwi (dr_step) / type_size;
2270     }
2271   else
2272     groupsize = 0;
2273
2274   /* Not consecutive access is possible only if it is a part of interleaving.  */
2275   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2276     {
2277       /* Check if it this DR is a part of interleaving, and is a single
2278          element of the group that is accessed in the loop.  */
2279
2280       /* Gaps are supported only for loads. STEP must be a multiple of the type
2281          size.  The size of the group must be a power of 2.  */
2282       if (DR_IS_READ (dr)
2283           && (dr_step % type_size) == 0
2284           && groupsize > 0
2285           && pow2p_hwi (groupsize))
2286         {
2287           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2288           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2289           GROUP_GAP (stmt_info) = groupsize - 1;
2290           if (dump_enabled_p ())
2291             {
2292               dump_printf_loc (MSG_NOTE, vect_location,
2293                                "Detected single element interleaving ");
2294               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2295               dump_printf (MSG_NOTE, " step ");
2296               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2297               dump_printf (MSG_NOTE, "\n");
2298             }
2299
2300           return true;
2301         }
2302
2303       if (dump_enabled_p ())
2304         {
2305           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306                            "not consecutive access ");
2307           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2308         }
2309
2310       if (bb_vinfo)
2311         {
2312           /* Mark the statement as unvectorizable.  */
2313           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2314           return true;
2315         }
2316
2317       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2318       STMT_VINFO_STRIDED_P (stmt_info) = true;
2319       return true;
2320     }
2321
2322   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2323     {
2324       /* First stmt in the interleaving chain. Check the chain.  */
2325       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2326       struct data_reference *data_ref = dr;
2327       unsigned int count = 1;
2328       tree prev_init = DR_INIT (data_ref);
2329       gimple *prev = stmt;
2330       HOST_WIDE_INT diff, gaps = 0;
2331
2332       while (next)
2333         {
2334           /* Skip same data-refs.  In case that two or more stmts share
2335              data-ref (supported only for loads), we vectorize only the first
2336              stmt, and the rest get their vectorized loads from the first
2337              one.  */
2338           if (!tree_int_cst_compare (DR_INIT (data_ref),
2339                                      DR_INIT (STMT_VINFO_DATA_REF (
2340                                                    vinfo_for_stmt (next)))))
2341             {
2342               if (DR_IS_WRITE (data_ref))
2343                 {
2344                   if (dump_enabled_p ())
2345                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                                      "Two store stmts share the same dr.\n");
2347                   return false;
2348                 }
2349
2350               if (dump_enabled_p ())
2351                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352                                  "Two or more load stmts share the same dr.\n");
2353
2354               /* For load use the same data-ref load.  */
2355               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2356
2357               prev = next;
2358               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2359               continue;
2360             }
2361
2362           prev = next;
2363           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2364
2365           /* All group members have the same STEP by construction.  */
2366           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2367
2368           /* Check that the distance between two accesses is equal to the type
2369              size. Otherwise, we have gaps.  */
2370           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2371                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2372           if (diff != 1)
2373             {
2374               /* FORNOW: SLP of accesses with gaps is not supported.  */
2375               slp_impossible = true;
2376               if (DR_IS_WRITE (data_ref))
2377                 {
2378                   if (dump_enabled_p ())
2379                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380                                      "interleaved store with gaps\n");
2381                   return false;
2382                 }
2383
2384               gaps += diff - 1;
2385             }
2386
2387           last_accessed_element += diff;
2388
2389           /* Store the gap from the previous member of the group. If there is no
2390              gap in the access, GROUP_GAP is always 1.  */
2391           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2392
2393           prev_init = DR_INIT (data_ref);
2394           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2395           /* Count the number of data-refs in the chain.  */
2396           count++;
2397         }
2398
2399       if (groupsize == 0)
2400         groupsize = count + gaps;
2401
2402       /* This could be UINT_MAX but as we are generating code in a very
2403          inefficient way we have to cap earlier.  See PR78699 for example.  */
2404       if (groupsize > 4096)
2405         {
2406           if (dump_enabled_p ())
2407             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408                              "group is too large\n");
2409           return false;
2410         }
2411
2412       /* Check that the size of the interleaving is equal to count for stores,
2413          i.e., that there are no gaps.  */
2414       if (groupsize != count
2415           && !DR_IS_READ (dr))
2416         {
2417           if (dump_enabled_p ())
2418             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2419                              "interleaved store with gaps\n");
2420           return false;
2421         }
2422
2423       /* If there is a gap after the last load in the group it is the
2424          difference between the groupsize and the last accessed
2425          element.
2426          When there is no gap, this difference should be 0.  */
2427       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2428
2429       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2430       if (dump_enabled_p ())
2431         {
2432           dump_printf_loc (MSG_NOTE, vect_location,
2433                            "Detected interleaving ");
2434           if (DR_IS_READ (dr))
2435             dump_printf (MSG_NOTE, "load ");
2436           else
2437             dump_printf (MSG_NOTE, "store ");
2438           dump_printf (MSG_NOTE, "of size %u starting with ",
2439                        (unsigned)groupsize);
2440           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2441           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2442             dump_printf_loc (MSG_NOTE, vect_location,
2443                              "There is a gap of %u elements after the group\n",
2444                              GROUP_GAP (vinfo_for_stmt (stmt)));
2445         }
2446
2447       /* SLP: create an SLP data structure for every interleaving group of
2448          stores for further analysis in vect_analyse_slp.  */
2449       if (DR_IS_WRITE (dr) && !slp_impossible)
2450         {
2451           if (loop_vinfo)
2452             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2453           if (bb_vinfo)
2454             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2455         }
2456     }
2457
2458   return true;
2459 }
2460
2461 /* Analyze groups of accesses: check that DR belongs to a group of
2462    accesses of legal size, step, etc.  Detect gaps, single element
2463    interleaving, and other special cases. Set grouped access info.
2464    Collect groups of strided stores for further use in SLP analysis.  */
2465
2466 static bool
2467 vect_analyze_group_access (struct data_reference *dr)
2468 {
2469   if (!vect_analyze_group_access_1 (dr))
2470     {
2471       /* Dissolve the group if present.  */
2472       gimple *next;
2473       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2474       while (stmt)
2475         {
2476           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2477           next = GROUP_NEXT_ELEMENT (vinfo);
2478           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2479           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2480           stmt = next;
2481         }
2482       return false;
2483     }
2484   return true;
2485 }
2486
2487 /* Analyze the access pattern of the data-reference DR.
2488    In case of non-consecutive accesses call vect_analyze_group_access() to
2489    analyze groups of accesses.  */
2490
2491 static bool
2492 vect_analyze_data_ref_access (struct data_reference *dr)
2493 {
2494   tree step = DR_STEP (dr);
2495   tree scalar_type = TREE_TYPE (DR_REF (dr));
2496   gimple *stmt = DR_STMT (dr);
2497   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2498   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2499   struct loop *loop = NULL;
2500
2501   if (loop_vinfo)
2502     loop = LOOP_VINFO_LOOP (loop_vinfo);
2503
2504   if (loop_vinfo && !step)
2505     {
2506       if (dump_enabled_p ())
2507         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2508                          "bad data-ref access in loop\n");
2509       return false;
2510     }
2511
2512   /* Allow loads with zero step in inner-loop vectorization.  */
2513   if (loop_vinfo && integer_zerop (step))
2514     {
2515       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2516       if (!nested_in_vect_loop_p (loop, stmt))
2517         return DR_IS_READ (dr);
2518       /* Allow references with zero step for outer loops marked
2519          with pragma omp simd only - it guarantees absence of
2520          loop-carried dependencies between inner loop iterations.  */
2521       if (!loop->force_vectorize)
2522         {
2523           if (dump_enabled_p ())
2524             dump_printf_loc (MSG_NOTE, vect_location,
2525                              "zero step in inner loop of nest\n");
2526           return false;
2527         }
2528     }
2529
2530   if (loop && nested_in_vect_loop_p (loop, stmt))
2531     {
2532       /* Interleaved accesses are not yet supported within outer-loop
2533         vectorization for references in the inner-loop.  */
2534       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2535
2536       /* For the rest of the analysis we use the outer-loop step.  */
2537       step = STMT_VINFO_DR_STEP (stmt_info);
2538       if (integer_zerop (step))
2539         {
2540           if (dump_enabled_p ())
2541             dump_printf_loc (MSG_NOTE, vect_location,
2542                              "zero step in outer loop.\n");
2543           return DR_IS_READ (dr);
2544         }
2545     }
2546
2547   /* Consecutive?  */
2548   if (TREE_CODE (step) == INTEGER_CST)
2549     {
2550       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2551       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2552           || (dr_step < 0
2553               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2554         {
2555           /* Mark that it is not interleaving.  */
2556           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2557           return true;
2558         }
2559     }
2560
2561   if (loop && nested_in_vect_loop_p (loop, stmt))
2562     {
2563       if (dump_enabled_p ())
2564         dump_printf_loc (MSG_NOTE, vect_location,
2565                          "grouped access in outer loop.\n");
2566       return false;
2567     }
2568
2569
2570   /* Assume this is a DR handled by non-constant strided load case.  */
2571   if (TREE_CODE (step) != INTEGER_CST)
2572     return (STMT_VINFO_STRIDED_P (stmt_info)
2573             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2574                 || vect_analyze_group_access (dr)));
2575
2576   /* Not consecutive access - check if it's a part of interleaving group.  */
2577   return vect_analyze_group_access (dr);
2578 }
2579
2580 /* Compare two data-references DRA and DRB to group them into chunks
2581    suitable for grouping.  */
2582
2583 static int
2584 dr_group_sort_cmp (const void *dra_, const void *drb_)
2585 {
2586   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2587   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2588   int cmp;
2589
2590   /* Stabilize sort.  */
2591   if (dra == drb)
2592     return 0;
2593
2594   /* DRs in different loops never belong to the same group.  */
2595   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2596   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2597   if (loopa != loopb)
2598     return loopa->num < loopb->num ? -1 : 1;
2599
2600   /* Ordering of DRs according to base.  */
2601   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2602     {
2603       cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2604                                    DR_BASE_ADDRESS (drb));
2605       if (cmp != 0)
2606         return cmp;
2607     }
2608
2609   /* And according to DR_OFFSET.  */
2610   if (!dr_equal_offsets_p (dra, drb))
2611     {
2612       cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2613       if (cmp != 0)
2614         return cmp;
2615     }
2616
2617   /* Put reads before writes.  */
2618   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2619     return DR_IS_READ (dra) ? -1 : 1;
2620
2621   /* Then sort after access size.  */
2622   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2623                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2624     {
2625       cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2626                                    TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2627       if (cmp != 0)
2628         return cmp;
2629     }
2630
2631   /* And after step.  */
2632   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2633     {
2634       cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2635       if (cmp != 0)
2636         return cmp;
2637     }
2638
2639   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2640   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2641   if (cmp == 0)
2642     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2643   return cmp;
2644 }
2645
2646 /* Function vect_analyze_data_ref_accesses.
2647
2648    Analyze the access pattern of all the data references in the loop.
2649
2650    FORNOW: the only access pattern that is considered vectorizable is a
2651            simple step 1 (consecutive) access.
2652
2653    FORNOW: handle only arrays and pointer accesses.  */
2654
2655 bool
2656 vect_analyze_data_ref_accesses (vec_info *vinfo)
2657 {
2658   unsigned int i;
2659   vec<data_reference_p> datarefs = vinfo->datarefs;
2660   struct data_reference *dr;
2661
2662   if (dump_enabled_p ())
2663     dump_printf_loc (MSG_NOTE, vect_location,
2664                      "=== vect_analyze_data_ref_accesses ===\n");
2665
2666   if (datarefs.is_empty ())
2667     return true;
2668
2669   /* Sort the array of datarefs to make building the interleaving chains
2670      linear.  Don't modify the original vector's order, it is needed for
2671      determining what dependencies are reversed.  */
2672   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2673   datarefs_copy.qsort (dr_group_sort_cmp);
2674
2675   /* Build the interleaving chains.  */
2676   for (i = 0; i < datarefs_copy.length () - 1;)
2677     {
2678       data_reference_p dra = datarefs_copy[i];
2679       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2680       stmt_vec_info lastinfo = NULL;
2681       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2682         {
2683           ++i;
2684           continue;
2685         }
2686       for (i = i + 1; i < datarefs_copy.length (); ++i)
2687         {
2688           data_reference_p drb = datarefs_copy[i];
2689           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2690           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2691             break;
2692
2693           /* ???  Imperfect sorting (non-compatible types, non-modulo
2694              accesses, same accesses) can lead to a group to be artificially
2695              split here as we don't just skip over those.  If it really
2696              matters we can push those to a worklist and re-iterate
2697              over them.  The we can just skip ahead to the next DR here.  */
2698
2699           /* DRs in a different loop should not be put into the same
2700              interleaving group.  */
2701           if (gimple_bb (DR_STMT (dra))->loop_father
2702               != gimple_bb (DR_STMT (drb))->loop_father)
2703             break;
2704
2705           /* Check that the data-refs have same first location (except init)
2706              and they are both either store or load (not load and store,
2707              not masked loads or stores).  */
2708           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2709               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2710                                    DR_BASE_ADDRESS (drb), 0)
2711               || !dr_equal_offsets_p (dra, drb)
2712               || !gimple_assign_single_p (DR_STMT (dra))
2713               || !gimple_assign_single_p (DR_STMT (drb)))
2714             break;
2715
2716           /* Check that the data-refs have the same constant size.  */
2717           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2718           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2719           if (!tree_fits_uhwi_p (sza)
2720               || !tree_fits_uhwi_p (szb)
2721               || !tree_int_cst_equal (sza, szb))
2722             break;
2723
2724           /* Check that the data-refs have the same step.  */
2725           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2726             break;
2727
2728           /* Do not place the same access in the interleaving chain twice.  */
2729           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2730             break;
2731
2732           /* Check the types are compatible.
2733              ???  We don't distinguish this during sorting.  */
2734           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2735                                    TREE_TYPE (DR_REF (drb))))
2736             break;
2737
2738           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2739           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2740           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2741           gcc_assert (init_a <= init_b);
2742
2743           /* If init_b == init_a + the size of the type * k, we have an
2744              interleaving, and DRA is accessed before DRB.  */
2745           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2746           if (type_size_a == 0
2747               || (init_b - init_a) % type_size_a != 0)
2748             break;
2749
2750           /* If we have a store, the accesses are adjacent.  This splits
2751              groups into chunks we support (we don't support vectorization
2752              of stores with gaps).  */
2753           if (!DR_IS_READ (dra)
2754               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2755                                              (DR_INIT (datarefs_copy[i-1]))
2756                   != type_size_a))
2757             break;
2758
2759           /* If the step (if not zero or non-constant) is greater than the
2760              difference between data-refs' inits this splits groups into
2761              suitable sizes.  */
2762           if (tree_fits_shwi_p (DR_STEP (dra)))
2763             {
2764               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2765               if (step != 0 && step <= (init_b - init_a))
2766                 break;
2767             }
2768
2769           if (dump_enabled_p ())
2770             {
2771               dump_printf_loc (MSG_NOTE, vect_location,
2772                                "Detected interleaving ");
2773               if (DR_IS_READ (dra))
2774                 dump_printf (MSG_NOTE, "load ");
2775               else
2776                 dump_printf (MSG_NOTE, "store ");
2777               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2778               dump_printf (MSG_NOTE,  " and ");
2779               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2780               dump_printf (MSG_NOTE, "\n");
2781             }
2782
2783           /* Link the found element into the group list.  */
2784           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2785             {
2786               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2787               lastinfo = stmtinfo_a;
2788             }
2789           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2790           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2791           lastinfo = stmtinfo_b;
2792         }
2793     }
2794
2795   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2796     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2797         && !vect_analyze_data_ref_access (dr))
2798       {
2799         if (dump_enabled_p ())
2800           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801                            "not vectorized: complicated access pattern.\n");
2802
2803         if (is_a <bb_vec_info> (vinfo))
2804           {
2805             /* Mark the statement as not vectorizable.  */
2806             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2807             continue;
2808           }
2809         else
2810           {
2811             datarefs_copy.release ();
2812             return false;
2813           }
2814       }
2815
2816   datarefs_copy.release ();
2817   return true;
2818 }
2819
2820 /* Function vect_vfa_segment_size.
2821
2822    Create an expression that computes the size of segment
2823    that will be accessed for a data reference.  The functions takes into
2824    account that realignment loads may access one more vector.
2825
2826    Input:
2827      DR: The data reference.
2828      LENGTH_FACTOR: segment length to consider.
2829
2830    Return an expression whose value is the size of segment which will be
2831    accessed by DR.  */
2832
2833 static tree
2834 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2835 {
2836   tree segment_length;
2837
2838   if (integer_zerop (DR_STEP (dr)))
2839     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2840   else
2841     segment_length = size_binop (MULT_EXPR,
2842                                  fold_convert (sizetype, DR_STEP (dr)),
2843                                  fold_convert (sizetype, length_factor));
2844
2845   if (vect_supportable_dr_alignment (dr, false)
2846         == dr_explicit_realign_optimized)
2847     {
2848       tree vector_size = TYPE_SIZE_UNIT
2849                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2850
2851       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2852     }
2853   return segment_length;
2854 }
2855
2856 /* Function vect_no_alias_p.
2857
2858    Given data references A and B with equal base and offset, the alias
2859    relation can be decided at compilation time, return TRUE if they do
2860    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2861    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2862    respectively.  */
2863
2864 static bool
2865 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2866                  tree segment_length_a, tree segment_length_b)
2867 {
2868   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2869               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2870   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2871     return false;
2872
2873   tree seg_a_min = DR_INIT (a);
2874   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2875                                 seg_a_min, segment_length_a);
2876   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2877      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2878      [a, a+12) */
2879   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2880     {
2881       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2882       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2883                                seg_a_max, unit_size);
2884       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2885                                DR_INIT (a), unit_size);
2886     }
2887   tree seg_b_min = DR_INIT (b);
2888   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2889                                 seg_b_min, segment_length_b);
2890   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2891     {
2892       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2893       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2894                                seg_b_max, unit_size);
2895       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2896                                DR_INIT (b), unit_size);
2897     }
2898
2899   if (tree_int_cst_le (seg_a_max, seg_b_min)
2900       || tree_int_cst_le (seg_b_max, seg_a_min))
2901     return true;
2902
2903   return false;
2904 }
2905
2906 /* Function vect_prune_runtime_alias_test_list.
2907
2908    Prune a list of ddrs to be tested at run-time by versioning for alias.
2909    Merge several alias checks into one if possible.
2910    Return FALSE if resulting list of ddrs is longer then allowed by
2911    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2912
2913 bool
2914 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2915 {
2916   vec<ddr_p> may_alias_ddrs =
2917     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2918   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2919     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2920   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2921   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2922
2923   ddr_p ddr;
2924   unsigned int i;
2925   tree length_factor;
2926
2927   if (dump_enabled_p ())
2928     dump_printf_loc (MSG_NOTE, vect_location,
2929                      "=== vect_prune_runtime_alias_test_list ===\n");
2930
2931   if (may_alias_ddrs.is_empty ())
2932     return true;
2933
2934   comp_alias_ddrs.create (may_alias_ddrs.length ());
2935
2936   /* First, we collect all data ref pairs for aliasing checks.  */
2937   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2938     {
2939       int comp_res;
2940       struct data_reference *dr_a, *dr_b;
2941       gimple *dr_group_first_a, *dr_group_first_b;
2942       tree segment_length_a, segment_length_b;
2943       gimple *stmt_a, *stmt_b;
2944
2945       dr_a = DDR_A (ddr);
2946       stmt_a = DR_STMT (DDR_A (ddr));
2947       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2948       if (dr_group_first_a)
2949         {
2950           stmt_a = dr_group_first_a;
2951           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2952         }
2953
2954       dr_b = DDR_B (ddr);
2955       stmt_b = DR_STMT (DDR_B (ddr));
2956       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2957       if (dr_group_first_b)
2958         {
2959           stmt_b = dr_group_first_b;
2960           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2961         }
2962
2963       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2964         length_factor = scalar_loop_iters;
2965       else
2966         length_factor = size_int (vect_factor);
2967       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2968       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2969
2970       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2971                                         DR_BASE_ADDRESS (dr_b));
2972       if (comp_res == 0)
2973         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2974                                           DR_OFFSET (dr_b));
2975
2976       /* Alias is known at compilation time.  */
2977       if (comp_res == 0
2978           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2979           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2980           && TREE_CODE (segment_length_a) == INTEGER_CST
2981           && TREE_CODE (segment_length_b) == INTEGER_CST)
2982         {
2983           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2984             continue;
2985
2986           if (dump_enabled_p ())
2987             dump_printf_loc (MSG_NOTE, vect_location,
2988                              "not vectorized: compilation time alias.\n");
2989
2990           return false;
2991         }
2992
2993       dr_with_seg_len_pair_t dr_with_seg_len_pair
2994           (dr_with_seg_len (dr_a, segment_length_a),
2995            dr_with_seg_len (dr_b, segment_length_b));
2996
2997       /* Canonicalize pairs by sorting the two DR members.  */
2998       if (comp_res > 0)
2999         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3000
3001       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3002     }
3003
3004   prune_runtime_alias_test_list (&comp_alias_ddrs,
3005                                  (unsigned HOST_WIDE_INT) vect_factor);
3006   dump_printf_loc (MSG_NOTE, vect_location,
3007                    "improved number of alias checks from %d to %d\n",
3008                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
3009   if ((int) comp_alias_ddrs.length () >
3010       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3011     {
3012       if (dump_enabled_p ())
3013         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014                          "number of versioning for alias "
3015                          "run-time tests exceeds %d "
3016                          "(--param vect-max-version-for-alias-checks)\n",
3017                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3018       return false;
3019     }
3020
3021   /* All alias checks have been resolved at compilation time.  */
3022   if (!comp_alias_ddrs.length ())
3023     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3024
3025   return true;
3026 }
3027
3028 /* Return true if a non-affine read or write in STMT is suitable for a
3029    gather load or scatter store.  Describe the operation in *INFO if so.  */
3030
3031 bool
3032 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3033                            gather_scatter_info *info)
3034 {
3035   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3036   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3037   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3038   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3039   tree offtype = NULL_TREE;
3040   tree decl, base, off;
3041   machine_mode pmode;
3042   int punsignedp, reversep, pvolatilep = 0;
3043
3044   base = DR_REF (dr);
3045   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3046      see if we can use the def stmt of the address.  */
3047   if (is_gimple_call (stmt)
3048       && gimple_call_internal_p (stmt)
3049       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3050           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3051       && TREE_CODE (base) == MEM_REF
3052       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3053       && integer_zerop (TREE_OPERAND (base, 1))
3054       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3055     {
3056       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3057       if (is_gimple_assign (def_stmt)
3058           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3059         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3060     }
3061
3062   /* The gather and scatter builtins need address of the form
3063      loop_invariant + vector * {1, 2, 4, 8}
3064      or
3065      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3066      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3067      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3068      multiplications and additions in it.  To get a vector, we need
3069      a single SSA_NAME that will be defined in the loop and will
3070      contain everything that is not loop invariant and that can be
3071      vectorized.  The following code attempts to find such a preexistng
3072      SSA_NAME OFF and put the loop invariants into a tree BASE
3073      that can be gimplified before the loop.  */
3074   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3075                               &punsignedp, &reversep, &pvolatilep);
3076   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3077
3078   if (TREE_CODE (base) == MEM_REF)
3079     {
3080       if (!integer_zerop (TREE_OPERAND (base, 1)))
3081         {
3082           if (off == NULL_TREE)
3083             {
3084               offset_int moff = mem_ref_offset (base);
3085               off = wide_int_to_tree (sizetype, moff);
3086             }
3087           else
3088             off = size_binop (PLUS_EXPR, off,
3089                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3090         }
3091       base = TREE_OPERAND (base, 0);
3092     }
3093   else
3094     base = build_fold_addr_expr (base);
3095
3096   if (off == NULL_TREE)
3097     off = size_zero_node;
3098
3099   /* If base is not loop invariant, either off is 0, then we start with just
3100      the constant offset in the loop invariant BASE and continue with base
3101      as OFF, otherwise give up.
3102      We could handle that case by gimplifying the addition of base + off
3103      into some SSA_NAME and use that as off, but for now punt.  */
3104   if (!expr_invariant_in_loop_p (loop, base))
3105     {
3106       if (!integer_zerop (off))
3107         return false;
3108       off = base;
3109       base = size_int (pbitpos / BITS_PER_UNIT);
3110     }
3111   /* Otherwise put base + constant offset into the loop invariant BASE
3112      and continue with OFF.  */
3113   else
3114     {
3115       base = fold_convert (sizetype, base);
3116       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3117     }
3118
3119   /* OFF at this point may be either a SSA_NAME or some tree expression
3120      from get_inner_reference.  Try to peel off loop invariants from it
3121      into BASE as long as possible.  */
3122   STRIP_NOPS (off);
3123   while (offtype == NULL_TREE)
3124     {
3125       enum tree_code code;
3126       tree op0, op1, add = NULL_TREE;
3127
3128       if (TREE_CODE (off) == SSA_NAME)
3129         {
3130           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3131
3132           if (expr_invariant_in_loop_p (loop, off))
3133             return false;
3134
3135           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3136             break;
3137
3138           op0 = gimple_assign_rhs1 (def_stmt);
3139           code = gimple_assign_rhs_code (def_stmt);
3140           op1 = gimple_assign_rhs2 (def_stmt);
3141         }
3142       else
3143         {
3144           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3145             return false;
3146           code = TREE_CODE (off);
3147           extract_ops_from_tree (off, &code, &op0, &op1);
3148         }
3149       switch (code)
3150         {
3151         case POINTER_PLUS_EXPR:
3152         case PLUS_EXPR:
3153           if (expr_invariant_in_loop_p (loop, op0))
3154             {
3155               add = op0;
3156               off = op1;
3157             do_add:
3158               add = fold_convert (sizetype, add);
3159               if (scale != 1)
3160                 add = size_binop (MULT_EXPR, add, size_int (scale));
3161               base = size_binop (PLUS_EXPR, base, add);
3162               continue;
3163             }
3164           if (expr_invariant_in_loop_p (loop, op1))
3165             {
3166               add = op1;
3167               off = op0;
3168               goto do_add;
3169             }
3170           break;
3171         case MINUS_EXPR:
3172           if (expr_invariant_in_loop_p (loop, op1))
3173             {
3174               add = fold_convert (sizetype, op1);
3175               add = size_binop (MINUS_EXPR, size_zero_node, add);
3176               off = op0;
3177               goto do_add;
3178             }
3179           break;
3180         case MULT_EXPR:
3181           if (scale == 1 && tree_fits_shwi_p (op1))
3182             {
3183               scale = tree_to_shwi (op1);
3184               off = op0;
3185               continue;
3186             }
3187           break;
3188         case SSA_NAME:
3189           off = op0;
3190           continue;
3191         CASE_CONVERT:
3192           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3193               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3194             break;
3195           if (TYPE_PRECISION (TREE_TYPE (op0))
3196               == TYPE_PRECISION (TREE_TYPE (off)))
3197             {
3198               off = op0;
3199               continue;
3200             }
3201           if (TYPE_PRECISION (TREE_TYPE (op0))
3202               < TYPE_PRECISION (TREE_TYPE (off)))
3203             {
3204               off = op0;
3205               offtype = TREE_TYPE (off);
3206               STRIP_NOPS (off);
3207               continue;
3208             }
3209           break;
3210         default:
3211           break;
3212         }
3213       break;
3214     }
3215
3216   /* If at the end OFF still isn't a SSA_NAME or isn't
3217      defined in the loop, punt.  */
3218   if (TREE_CODE (off) != SSA_NAME
3219       || expr_invariant_in_loop_p (loop, off))
3220     return false;
3221
3222   if (offtype == NULL_TREE)
3223     offtype = TREE_TYPE (off);
3224
3225   if (DR_IS_READ (dr))
3226     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3227                                              offtype, scale);
3228   else
3229     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3230                                               offtype, scale);
3231
3232   if (decl == NULL_TREE)
3233     return false;
3234
3235   info->decl = decl;
3236   info->base = base;
3237   info->offset = off;
3238   info->offset_dt = vect_unknown_def_type;
3239   info->offset_vectype = NULL_TREE;
3240   info->scale = scale;
3241   return true;
3242 }
3243
3244 /* Function vect_analyze_data_refs.
3245
3246   Find all the data references in the loop or basic block.
3247
3248    The general structure of the analysis of data refs in the vectorizer is as
3249    follows:
3250    1- vect_analyze_data_refs(loop/bb): call
3251       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3252       in the loop/bb and their dependences.
3253    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3254    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3255    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3256
3257 */
3258
3259 bool
3260 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3261 {
3262   struct loop *loop = NULL;
3263   unsigned int i;
3264   struct data_reference *dr;
3265   tree scalar_type;
3266
3267   if (dump_enabled_p ())
3268     dump_printf_loc (MSG_NOTE, vect_location,
3269                      "=== vect_analyze_data_refs ===\n");
3270
3271   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3272     loop = LOOP_VINFO_LOOP (loop_vinfo);
3273
3274   /* Go through the data-refs, check that the analysis succeeded.  Update
3275      pointer from stmt_vec_info struct to DR and vectype.  */
3276
3277   vec<data_reference_p> datarefs = vinfo->datarefs;
3278   FOR_EACH_VEC_ELT (datarefs, i, dr)
3279     {
3280       gimple *stmt;
3281       stmt_vec_info stmt_info;
3282       tree base, offset, init;
3283       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3284       bool simd_lane_access = false;
3285       int vf;
3286
3287 again:
3288       if (!dr || !DR_REF (dr))
3289         {
3290           if (dump_enabled_p ())
3291             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3292                              "not vectorized: unhandled data-ref\n");
3293           return false;
3294         }
3295
3296       stmt = DR_STMT (dr);
3297       stmt_info = vinfo_for_stmt (stmt);
3298
3299       /* Discard clobbers from the dataref vector.  We will remove
3300          clobber stmts during vectorization.  */
3301       if (gimple_clobber_p (stmt))
3302         {
3303           free_data_ref (dr);
3304           if (i == datarefs.length () - 1)
3305             {
3306               datarefs.pop ();
3307               break;
3308             }
3309           datarefs.ordered_remove (i);
3310           dr = datarefs[i];
3311           goto again;
3312         }
3313
3314       /* Check that analysis of the data-ref succeeded.  */
3315       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3316           || !DR_STEP (dr))
3317         {
3318           bool maybe_gather
3319             = DR_IS_READ (dr)
3320               && !TREE_THIS_VOLATILE (DR_REF (dr))
3321               && targetm.vectorize.builtin_gather != NULL;
3322           bool maybe_scatter
3323             = DR_IS_WRITE (dr)
3324               && !TREE_THIS_VOLATILE (DR_REF (dr))
3325               && targetm.vectorize.builtin_scatter != NULL;
3326           bool maybe_simd_lane_access
3327             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3328
3329           /* If target supports vector gather loads or scatter stores, or if
3330              this might be a SIMD lane access, see if they can't be used.  */
3331           if (is_a <loop_vec_info> (vinfo)
3332               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3333               && !nested_in_vect_loop_p (loop, stmt))
3334             {
3335               struct data_reference *newdr
3336                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3337                                    DR_REF (dr), stmt, maybe_scatter ? false : true);
3338               gcc_assert (newdr != NULL && DR_REF (newdr));
3339               if (DR_BASE_ADDRESS (newdr)
3340                   && DR_OFFSET (newdr)
3341                   && DR_INIT (newdr)
3342                   && DR_STEP (newdr)
3343                   && integer_zerop (DR_STEP (newdr)))
3344                 {
3345                   if (maybe_simd_lane_access)
3346                     {
3347                       tree off = DR_OFFSET (newdr);
3348                       STRIP_NOPS (off);
3349                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3350                           && TREE_CODE (off) == MULT_EXPR
3351                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3352                         {
3353                           tree step = TREE_OPERAND (off, 1);
3354                           off = TREE_OPERAND (off, 0);
3355                           STRIP_NOPS (off);
3356                           if (CONVERT_EXPR_P (off)
3357                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3358                                                                           0)))
3359                                  < TYPE_PRECISION (TREE_TYPE (off)))
3360                             off = TREE_OPERAND (off, 0);
3361                           if (TREE_CODE (off) == SSA_NAME)
3362                             {
3363                               gimple *def = SSA_NAME_DEF_STMT (off);
3364                               tree reft = TREE_TYPE (DR_REF (newdr));
3365                               if (is_gimple_call (def)
3366                                   && gimple_call_internal_p (def)
3367                                   && (gimple_call_internal_fn (def)
3368                                       == IFN_GOMP_SIMD_LANE))
3369                                 {
3370                                   tree arg = gimple_call_arg (def, 0);
3371                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3372                                   arg = SSA_NAME_VAR (arg);
3373                                   if (arg == loop->simduid
3374                                       /* For now.  */
3375                                       && tree_int_cst_equal
3376                                            (TYPE_SIZE_UNIT (reft),
3377                                             step))
3378                                     {
3379                                       DR_OFFSET (newdr) = ssize_int (0);
3380                                       DR_STEP (newdr) = step;
3381                                       DR_OFFSET_ALIGNMENT (newdr)
3382                                         = BIGGEST_ALIGNMENT;
3383                                       DR_STEP_ALIGNMENT (newdr)
3384                                         = highest_pow2_factor (step);
3385                                       dr = newdr;
3386                                       simd_lane_access = true;
3387                                     }
3388                                 }
3389                             }
3390                         }
3391                     }
3392                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3393                     {
3394                       dr = newdr;
3395                       if (maybe_gather)
3396                         gatherscatter = GATHER;
3397                       else
3398                         gatherscatter = SCATTER;
3399                     }
3400                 }
3401               if (gatherscatter == SG_NONE && !simd_lane_access)
3402                 free_data_ref (newdr);
3403             }
3404
3405           if (gatherscatter == SG_NONE && !simd_lane_access)
3406             {
3407               if (dump_enabled_p ())
3408                 {
3409                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3410                                    "not vectorized: data ref analysis "
3411                                    "failed ");
3412                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3413                 }
3414
3415               if (is_a <bb_vec_info> (vinfo))
3416                 break;
3417
3418               return false;
3419             }
3420         }
3421
3422       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3423         {
3424           if (dump_enabled_p ())
3425             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3426                              "not vectorized: base addr of dr is a "
3427                              "constant\n");
3428
3429           if (is_a <bb_vec_info> (vinfo))
3430             break;
3431
3432           if (gatherscatter != SG_NONE || simd_lane_access)
3433             free_data_ref (dr);
3434           return false;
3435         }
3436
3437       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3438         {
3439           if (dump_enabled_p ())
3440             {
3441               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3442                                "not vectorized: volatile type ");
3443               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3444             }
3445
3446           if (is_a <bb_vec_info> (vinfo))
3447             break;
3448
3449           return false;
3450         }
3451
3452       if (stmt_can_throw_internal (stmt))
3453         {
3454           if (dump_enabled_p ())
3455             {
3456               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457                                "not vectorized: statement can throw an "
3458                                "exception ");
3459               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3460             }
3461
3462           if (is_a <bb_vec_info> (vinfo))
3463             break;
3464
3465           if (gatherscatter != SG_NONE || simd_lane_access)
3466             free_data_ref (dr);
3467           return false;
3468         }
3469
3470       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3471           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3472         {
3473           if (dump_enabled_p ())
3474             {
3475               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3476                                "not vectorized: statement is bitfield "
3477                                "access ");
3478               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3479             }
3480
3481           if (is_a <bb_vec_info> (vinfo))
3482             break;
3483
3484           if (gatherscatter != SG_NONE || simd_lane_access)
3485             free_data_ref (dr);
3486           return false;
3487         }
3488
3489       base = unshare_expr (DR_BASE_ADDRESS (dr));
3490       offset = unshare_expr (DR_OFFSET (dr));
3491       init = unshare_expr (DR_INIT (dr));
3492
3493       if (is_gimple_call (stmt)
3494           && (!gimple_call_internal_p (stmt)
3495               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3496                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3497         {
3498           if (dump_enabled_p ())
3499             {
3500               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3501                                "not vectorized: dr in a call ");
3502               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3503             }
3504
3505           if (is_a <bb_vec_info> (vinfo))
3506             break;
3507
3508           if (gatherscatter != SG_NONE || simd_lane_access)
3509             free_data_ref (dr);
3510           return false;
3511         }
3512
3513       /* Update DR field in stmt_vec_info struct.  */
3514
3515       /* If the dataref is in an inner-loop of the loop that is considered for
3516          for vectorization, we also want to analyze the access relative to
3517          the outer-loop (DR contains information only relative to the
3518          inner-most enclosing loop).  We do that by building a reference to the
3519          first location accessed by the inner-loop, and analyze it relative to
3520          the outer-loop.  */
3521       if (loop && nested_in_vect_loop_p (loop, stmt))
3522         {
3523           /* Build a reference to the first location accessed by the
3524              inner loop: *(BASE + INIT + OFFSET).  By construction,
3525              this address must be invariant in the inner loop, so we
3526              can consider it as being used in the outer loop.  */
3527           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3528                                           init, offset);
3529           tree init_addr = fold_build_pointer_plus (base, init_offset);
3530           tree init_ref = build_fold_indirect_ref (init_addr);
3531
3532           if (dump_enabled_p ())
3533             {
3534               dump_printf_loc (MSG_NOTE, vect_location,
3535                                "analyze in outer loop: ");
3536               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3537               dump_printf (MSG_NOTE, "\n");
3538             }
3539
3540           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3541                                      init_ref, loop))
3542             /* dr_analyze_innermost already explained the failure.  */
3543             return false;
3544
3545           if (dump_enabled_p ())
3546             {
3547               dump_printf_loc (MSG_NOTE, vect_location,
3548                                "\touter base_address: ");
3549               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3550                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3551               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3552               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3553                                  STMT_VINFO_DR_OFFSET (stmt_info));
3554               dump_printf (MSG_NOTE,
3555                            "\n\touter constant offset from base address: ");
3556               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3557                                  STMT_VINFO_DR_INIT (stmt_info));
3558               dump_printf (MSG_NOTE, "\n\touter step: ");
3559               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3560                                  STMT_VINFO_DR_STEP (stmt_info));
3561               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3562                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3563               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3564                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3565               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3566                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3567               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3568                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3569             }
3570         }
3571
3572       if (STMT_VINFO_DATA_REF (stmt_info))
3573         {
3574           if (dump_enabled_p ())
3575             {
3576               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3577                                "not vectorized: more than one data ref "
3578                                "in stmt: ");
3579               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3580             }
3581
3582           if (is_a <bb_vec_info> (vinfo))
3583             break;
3584
3585           if (gatherscatter != SG_NONE || simd_lane_access)
3586             free_data_ref (dr);
3587           return false;
3588         }
3589
3590       STMT_VINFO_DATA_REF (stmt_info) = dr;
3591       if (simd_lane_access)
3592         {
3593           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3594           free_data_ref (datarefs[i]);
3595           datarefs[i] = dr;
3596         }
3597
3598       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3599           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3600           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3601         {
3602           if (dump_enabled_p ())
3603             {
3604               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605                                "not vectorized: base object not addressable "
3606                                "for stmt: ");
3607               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3608             }
3609           if (is_a <bb_vec_info> (vinfo))
3610             {
3611               /* In BB vectorization the ref can still participate
3612                  in dependence analysis, we just can't vectorize it.  */
3613               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3614               continue;
3615             }
3616           return false;
3617         }
3618
3619       /* Set vectype for STMT.  */
3620       scalar_type = TREE_TYPE (DR_REF (dr));
3621       STMT_VINFO_VECTYPE (stmt_info)
3622         = get_vectype_for_scalar_type (scalar_type);
3623       if (!STMT_VINFO_VECTYPE (stmt_info))
3624         {
3625           if (dump_enabled_p ())
3626             {
3627               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628                                "not vectorized: no vectype for stmt: ");
3629               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3630               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3631               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3632                                  scalar_type);
3633               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3634             }
3635
3636           if (is_a <bb_vec_info> (vinfo))
3637             {
3638               /* No vector type is fine, the ref can still participate
3639                  in dependence analysis, we just can't vectorize it.  */
3640               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3641               continue;
3642             }
3643
3644           if (gatherscatter != SG_NONE || simd_lane_access)
3645             {
3646               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3647               if (gatherscatter != SG_NONE)
3648                 free_data_ref (dr);
3649             }
3650           return false;
3651         }
3652       else
3653         {
3654           if (dump_enabled_p ())
3655             {
3656               dump_printf_loc (MSG_NOTE, vect_location,
3657                                "got vectype for stmt: ");
3658               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3659               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660                                  STMT_VINFO_VECTYPE (stmt_info));
3661               dump_printf (MSG_NOTE, "\n");
3662             }
3663         }
3664
3665       /* Adjust the minimal vectorization factor according to the
3666          vector type.  */
3667       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3668       if (vf > *min_vf)
3669         *min_vf = vf;
3670
3671       if (gatherscatter != SG_NONE)
3672         {
3673           gather_scatter_info gs_info;
3674           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3675                                           &gs_info)
3676               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3677             {
3678               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3679               free_data_ref (dr);
3680               if (dump_enabled_p ())
3681                 {
3682                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3683                                    (gatherscatter == GATHER) ?
3684                                    "not vectorized: not suitable for gather "
3685                                    "load " :
3686                                    "not vectorized: not suitable for scatter "
3687                                    "store ");
3688                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689                 }
3690               return false;
3691             }
3692
3693           free_data_ref (datarefs[i]);
3694           datarefs[i] = dr;
3695           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3696         }
3697
3698       else if (is_a <loop_vec_info> (vinfo)
3699                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3700         {
3701           if (nested_in_vect_loop_p (loop, stmt))
3702             {
3703               if (dump_enabled_p ())
3704                 {
3705                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706                                    "not vectorized: not suitable for strided "
3707                                    "load ");
3708                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3709                 }
3710               return false;
3711             }
3712           STMT_VINFO_STRIDED_P (stmt_info) = true;
3713         }
3714     }
3715
3716   /* If we stopped analysis at the first dataref we could not analyze
3717      when trying to vectorize a basic-block mark the rest of the datarefs
3718      as not vectorizable and truncate the vector of datarefs.  That
3719      avoids spending useless time in analyzing their dependence.  */
3720   if (i != datarefs.length ())
3721     {
3722       gcc_assert (is_a <bb_vec_info> (vinfo));
3723       for (unsigned j = i; j < datarefs.length (); ++j)
3724         {
3725           data_reference_p dr = datarefs[j];
3726           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3727           free_data_ref (dr);
3728         }
3729       datarefs.truncate (i);
3730     }
3731
3732   return true;
3733 }
3734
3735
3736 /* Function vect_get_new_vect_var.
3737
3738    Returns a name for a new variable.  The current naming scheme appends the
3739    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3740    the name of vectorizer generated variables, and appends that to NAME if
3741    provided.  */
3742
3743 tree
3744 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3745 {
3746   const char *prefix;
3747   tree new_vect_var;
3748
3749   switch (var_kind)
3750   {
3751   case vect_simple_var:
3752     prefix = "vect";
3753     break;
3754   case vect_scalar_var:
3755     prefix = "stmp";
3756     break;
3757   case vect_mask_var:
3758     prefix = "mask";
3759     break;
3760   case vect_pointer_var:
3761     prefix = "vectp";
3762     break;
3763   default:
3764     gcc_unreachable ();
3765   }
3766
3767   if (name)
3768     {
3769       char* tmp = concat (prefix, "_", name, NULL);
3770       new_vect_var = create_tmp_reg (type, tmp);
3771       free (tmp);
3772     }
3773   else
3774     new_vect_var = create_tmp_reg (type, prefix);
3775
3776   return new_vect_var;
3777 }
3778
3779 /* Like vect_get_new_vect_var but return an SSA name.  */
3780
3781 tree
3782 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3783 {
3784   const char *prefix;
3785   tree new_vect_var;
3786
3787   switch (var_kind)
3788   {
3789   case vect_simple_var:
3790     prefix = "vect";
3791     break;
3792   case vect_scalar_var:
3793     prefix = "stmp";
3794     break;
3795   case vect_pointer_var:
3796     prefix = "vectp";
3797     break;
3798   default:
3799     gcc_unreachable ();
3800   }
3801
3802   if (name)
3803     {
3804       char* tmp = concat (prefix, "_", name, NULL);
3805       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3806       free (tmp);
3807     }
3808   else
3809     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3810
3811   return new_vect_var;
3812 }
3813
3814 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3815
3816 static void
3817 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3818                                   stmt_vec_info stmt_info)
3819 {
3820   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3821   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3822   int misalign = DR_MISALIGNMENT (dr);
3823   if (misalign == DR_MISALIGNMENT_UNKNOWN)
3824     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3825   else
3826     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3827 }
3828
3829 /* Function vect_create_addr_base_for_vector_ref.
3830
3831    Create an expression that computes the address of the first memory location
3832    that will be accessed for a data reference.
3833
3834    Input:
3835    STMT: The statement containing the data reference.
3836    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3837    OFFSET: Optional. If supplied, it is be added to the initial address.
3838    LOOP:    Specify relative to which loop-nest should the address be computed.
3839             For example, when the dataref is in an inner-loop nested in an
3840             outer-loop that is now being vectorized, LOOP can be either the
3841             outer-loop, or the inner-loop.  The first memory location accessed
3842             by the following dataref ('in' points to short):
3843
3844                 for (i=0; i<N; i++)
3845                    for (j=0; j<M; j++)
3846                      s += in[i+j]
3847
3848             is as follows:
3849             if LOOP=i_loop:     &in             (relative to i_loop)
3850             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3851    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3852             initial address.  Unlike OFFSET, which is number of elements to
3853             be added, BYTE_OFFSET is measured in bytes.
3854
3855    Output:
3856    1. Return an SSA_NAME whose value is the address of the memory location of
3857       the first vector of the data reference.
3858    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3859       these statement(s) which define the returned SSA_NAME.
3860
3861    FORNOW: We are only handling array accesses with step 1.  */
3862
3863 tree
3864 vect_create_addr_base_for_vector_ref (gimple *stmt,
3865                                       gimple_seq *new_stmt_list,
3866                                       tree offset,
3867                                       tree byte_offset)
3868 {
3869   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3870   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3871   const char *base_name;
3872   tree addr_base;
3873   tree dest;
3874   gimple_seq seq = NULL;
3875   tree vect_ptr_type;
3876   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3877   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3878   innermost_loop_behavior *drb = vect_dr_behavior (dr);
3879
3880   tree data_ref_base = unshare_expr (drb->base_address);
3881   tree base_offset = unshare_expr (drb->offset);
3882   tree init = unshare_expr (drb->init);
3883
3884   if (loop_vinfo)
3885     base_name = get_name (data_ref_base);
3886   else
3887     {
3888       base_offset = ssize_int (0);
3889       init = ssize_int (0);
3890       base_name = get_name (DR_REF (dr));
3891     }
3892
3893   /* Create base_offset */
3894   base_offset = size_binop (PLUS_EXPR,
3895                             fold_convert (sizetype, base_offset),
3896                             fold_convert (sizetype, init));
3897
3898   if (offset)
3899     {
3900       offset = fold_build2 (MULT_EXPR, sizetype,
3901                             fold_convert (sizetype, offset), step);
3902       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3903                                  base_offset, offset);
3904     }
3905   if (byte_offset)
3906     {
3907       byte_offset = fold_convert (sizetype, byte_offset);
3908       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3909                                  base_offset, byte_offset);
3910     }
3911
3912   /* base + base_offset */
3913   if (loop_vinfo)
3914     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3915   else
3916     {
3917       addr_base = build1 (ADDR_EXPR,
3918                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3919                           unshare_expr (DR_REF (dr)));
3920     }
3921
3922   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3923   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3924   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3925   gimple_seq_add_seq (new_stmt_list, seq);
3926
3927   if (DR_PTR_INFO (dr)
3928       && TREE_CODE (addr_base) == SSA_NAME
3929       && !SSA_NAME_PTR_INFO (addr_base))
3930     {
3931       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3932       if (offset || byte_offset)
3933         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3934     }
3935
3936   if (dump_enabled_p ())
3937     {
3938       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3939       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3940       dump_printf (MSG_NOTE, "\n");
3941     }
3942
3943   return addr_base;
3944 }
3945
3946
3947 /* Function vect_create_data_ref_ptr.
3948
3949    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3950    location accessed in the loop by STMT, along with the def-use update
3951    chain to appropriately advance the pointer through the loop iterations.
3952    Also set aliasing information for the pointer.  This pointer is used by
3953    the callers to this function to create a memory reference expression for
3954    vector load/store access.
3955
3956    Input:
3957    1. STMT: a stmt that references memory. Expected to be of the form
3958          GIMPLE_ASSIGN <name, data-ref> or
3959          GIMPLE_ASSIGN <data-ref, name>.
3960    2. AGGR_TYPE: the type of the reference, which should be either a vector
3961         or an array.
3962    3. AT_LOOP: the loop where the vector memref is to be created.
3963    4. OFFSET (optional): an offset to be added to the initial address accessed
3964         by the data-ref in STMT.
3965    5. BSI: location where the new stmts are to be placed if there is no loop
3966    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3967         pointing to the initial address.
3968    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3969         to the initial address accessed by the data-ref in STMT.  This is
3970         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3971         in bytes.
3972
3973    Output:
3974    1. Declare a new ptr to vector_type, and have it point to the base of the
3975       data reference (initial addressed accessed by the data reference).
3976       For example, for vector of type V8HI, the following code is generated:
3977
3978       v8hi *ap;
3979       ap = (v8hi *)initial_address;
3980
3981       if OFFSET is not supplied:
3982          initial_address = &a[init];
3983       if OFFSET is supplied:
3984          initial_address = &a[init + OFFSET];
3985       if BYTE_OFFSET is supplied:
3986          initial_address = &a[init] + BYTE_OFFSET;
3987
3988       Return the initial_address in INITIAL_ADDRESS.
3989
3990    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
3991       update the pointer in each iteration of the loop.
3992
3993       Return the increment stmt that updates the pointer in PTR_INCR.
3994
3995    3. Set INV_P to true if the access pattern of the data reference in the
3996       vectorized loop is invariant.  Set it to false otherwise.
3997
3998    4. Return the pointer.  */
3999
4000 tree
4001 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4002                           tree offset, tree *initial_address,
4003                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4004                           bool only_init, bool *inv_p, tree byte_offset)
4005 {
4006   const char *base_name;
4007   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4008   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4009   struct loop *loop = NULL;
4010   bool nested_in_vect_loop = false;
4011   struct loop *containing_loop = NULL;
4012   tree aggr_ptr_type;
4013   tree aggr_ptr;
4014   tree new_temp;
4015   gimple_seq new_stmt_list = NULL;
4016   edge pe = NULL;
4017   basic_block new_bb;
4018   tree aggr_ptr_init;
4019   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4020   tree aptr;
4021   gimple_stmt_iterator incr_gsi;
4022   bool insert_after;
4023   tree indx_before_incr, indx_after_incr;
4024   gimple *incr;
4025   tree step;
4026   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4027
4028   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4029               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4030
4031   if (loop_vinfo)
4032     {
4033       loop = LOOP_VINFO_LOOP (loop_vinfo);
4034       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4035       containing_loop = (gimple_bb (stmt))->loop_father;
4036       pe = loop_preheader_edge (loop);
4037     }
4038   else
4039     {
4040       gcc_assert (bb_vinfo);
4041       only_init = true;
4042       *ptr_incr = NULL;
4043     }
4044
4045   /* Check the step (evolution) of the load in LOOP, and record
4046      whether it's invariant.  */
4047   step = vect_dr_behavior (dr)->step;
4048   if (integer_zerop (step))
4049     *inv_p = true;
4050   else
4051     *inv_p = false;
4052
4053   /* Create an expression for the first address accessed by this load
4054      in LOOP.  */
4055   base_name = get_name (DR_BASE_ADDRESS (dr));
4056
4057   if (dump_enabled_p ())
4058     {
4059       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4060       dump_printf_loc (MSG_NOTE, vect_location,
4061                        "create %s-pointer variable to type: ",
4062                        get_tree_code_name (TREE_CODE (aggr_type)));
4063       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4064       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4065         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4066       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4067         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4068       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4069         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4070       else
4071         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4072       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4073       dump_printf (MSG_NOTE, "\n");
4074     }
4075
4076   /* (1) Create the new aggregate-pointer variable.
4077      Vector and array types inherit the alias set of their component
4078      type by default so we need to use a ref-all pointer if the data
4079      reference does not conflict with the created aggregated data
4080      reference because it is not addressable.  */
4081   bool need_ref_all = false;
4082   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4083                               get_alias_set (DR_REF (dr))))
4084     need_ref_all = true;
4085   /* Likewise for any of the data references in the stmt group.  */
4086   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4087     {
4088       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4089       do
4090         {
4091           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4092           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4093           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4094                                       get_alias_set (DR_REF (sdr))))
4095             {
4096               need_ref_all = true;
4097               break;
4098             }
4099           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4100         }
4101       while (orig_stmt);
4102     }
4103   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4104                                                need_ref_all);
4105   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4106
4107
4108   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4109      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4110      def-use update cycles for the pointer: one relative to the outer-loop
4111      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4112      to the inner-loop (which is the inner-most loop containing the dataref),
4113      and this is done be step (5) below.
4114
4115      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4116      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4117      redundant.  Steps (3),(4) create the following:
4118
4119         vp0 = &base_addr;
4120         LOOP:   vp1 = phi(vp0,vp2)
4121                 ...
4122                 ...
4123                 vp2 = vp1 + step
4124                 goto LOOP
4125
4126      If there is an inner-loop nested in loop, then step (5) will also be
4127      applied, and an additional update in the inner-loop will be created:
4128
4129         vp0 = &base_addr;
4130         LOOP:   vp1 = phi(vp0,vp2)
4131                 ...
4132         inner:     vp3 = phi(vp1,vp4)
4133                    vp4 = vp3 + inner_step
4134                    if () goto inner
4135                 ...
4136                 vp2 = vp1 + step
4137                 if () goto LOOP   */
4138
4139   /* (2) Calculate the initial address of the aggregate-pointer, and set
4140      the aggregate-pointer to point to it before the loop.  */
4141
4142   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4143
4144   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4145                                                    offset, byte_offset);
4146   if (new_stmt_list)
4147     {
4148       if (pe)
4149         {
4150           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4151           gcc_assert (!new_bb);
4152         }
4153       else
4154         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4155     }
4156
4157   *initial_address = new_temp;
4158   aggr_ptr_init = new_temp;
4159
4160   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4161      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4162      inner-loop nested in LOOP (during outer-loop vectorization).  */
4163
4164   /* No update in loop is required.  */
4165   if (only_init && (!loop_vinfo || at_loop == loop))
4166     aptr = aggr_ptr_init;
4167   else
4168     {
4169       /* The step of the aggregate pointer is the type size.  */
4170       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4171       /* One exception to the above is when the scalar step of the load in
4172          LOOP is zero. In this case the step here is also zero.  */
4173       if (*inv_p)
4174         iv_step = size_zero_node;
4175       else if (tree_int_cst_sgn (step) == -1)
4176         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4177
4178       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4179
4180       create_iv (aggr_ptr_init,
4181                  fold_convert (aggr_ptr_type, iv_step),
4182                  aggr_ptr, loop, &incr_gsi, insert_after,
4183                  &indx_before_incr, &indx_after_incr);
4184       incr = gsi_stmt (incr_gsi);
4185       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4186
4187       /* Copy the points-to information if it exists. */
4188       if (DR_PTR_INFO (dr))
4189         {
4190           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4191           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4192         }
4193       if (ptr_incr)
4194         *ptr_incr = incr;
4195
4196       aptr = indx_before_incr;
4197     }
4198
4199   if (!nested_in_vect_loop || only_init)
4200     return aptr;
4201
4202
4203   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4204      nested in LOOP, if exists.  */
4205
4206   gcc_assert (nested_in_vect_loop);
4207   if (!only_init)
4208     {
4209       standard_iv_increment_position (containing_loop, &incr_gsi,
4210                                       &insert_after);
4211       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4212                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4213                  &indx_after_incr);
4214       incr = gsi_stmt (incr_gsi);
4215       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4216
4217       /* Copy the points-to information if it exists. */
4218       if (DR_PTR_INFO (dr))
4219         {
4220           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4221           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4222         }
4223       if (ptr_incr)
4224         *ptr_incr = incr;
4225
4226       return indx_before_incr;
4227     }
4228   else
4229     gcc_unreachable ();
4230 }
4231
4232
4233 /* Function bump_vector_ptr
4234
4235    Increment a pointer (to a vector type) by vector-size. If requested,
4236    i.e. if PTR-INCR is given, then also connect the new increment stmt
4237    to the existing def-use update-chain of the pointer, by modifying
4238    the PTR_INCR as illustrated below:
4239
4240    The pointer def-use update-chain before this function:
4241                         DATAREF_PTR = phi (p_0, p_2)
4242                         ....
4243         PTR_INCR:       p_2 = DATAREF_PTR + step
4244
4245    The pointer def-use update-chain after this function:
4246                         DATAREF_PTR = phi (p_0, p_2)
4247                         ....
4248                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4249                         ....
4250         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4251
4252    Input:
4253    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4254                  in the loop.
4255    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4256               the loop.  The increment amount across iterations is expected
4257               to be vector_size.
4258    BSI - location where the new update stmt is to be placed.
4259    STMT - the original scalar memory-access stmt that is being vectorized.
4260    BUMP - optional. The offset by which to bump the pointer. If not given,
4261           the offset is assumed to be vector_size.
4262
4263    Output: Return NEW_DATAREF_PTR as illustrated above.
4264
4265 */
4266
4267 tree
4268 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4269                  gimple *stmt, tree bump)
4270 {
4271   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4272   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4273   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4274   tree update = TYPE_SIZE_UNIT (vectype);
4275   gassign *incr_stmt;
4276   ssa_op_iter iter;
4277   use_operand_p use_p;
4278   tree new_dataref_ptr;
4279
4280   if (bump)
4281     update = bump;
4282
4283   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4284     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4285   else
4286     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4287   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4288                                    dataref_ptr, update);
4289   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4290
4291   /* Copy the points-to information if it exists. */
4292   if (DR_PTR_INFO (dr))
4293     {
4294       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4295       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4296     }
4297
4298   if (!ptr_incr)
4299     return new_dataref_ptr;
4300
4301   /* Update the vector-pointer's cross-iteration increment.  */
4302   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4303     {
4304       tree use = USE_FROM_PTR (use_p);
4305
4306       if (use == dataref_ptr)
4307         SET_USE (use_p, new_dataref_ptr);
4308       else
4309         gcc_assert (tree_int_cst_compare (use, update) == 0);
4310     }
4311
4312   return new_dataref_ptr;
4313 }
4314
4315
4316 /* Function vect_create_destination_var.
4317
4318    Create a new temporary of type VECTYPE.  */
4319
4320 tree
4321 vect_create_destination_var (tree scalar_dest, tree vectype)
4322 {
4323   tree vec_dest;
4324   const char *name;
4325   char *new_name;
4326   tree type;
4327   enum vect_var_kind kind;
4328
4329   kind = vectype
4330     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4331     ? vect_mask_var
4332     : vect_simple_var
4333     : vect_scalar_var;
4334   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4335
4336   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4337
4338   name = get_name (scalar_dest);
4339   if (name)
4340     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4341   else
4342     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4343   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4344   free (new_name);
4345
4346   return vec_dest;
4347 }
4348
4349 /* Function vect_grouped_store_supported.
4350
4351    Returns TRUE if interleave high and interleave low permutations
4352    are supported, and FALSE otherwise.  */
4353
4354 bool
4355 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4356 {
4357   machine_mode mode = TYPE_MODE (vectype);
4358
4359   /* vect_permute_store_chain requires the group size to be equal to 3 or
4360      be a power of two.  */
4361   if (count != 3 && exact_log2 (count) == -1)
4362     {
4363       if (dump_enabled_p ())
4364         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4365                          "the size of the group of accesses"
4366                          " is not a power of 2 or not eqaul to 3\n");
4367       return false;
4368     }
4369
4370   /* Check that the permutation is supported.  */
4371   if (VECTOR_MODE_P (mode))
4372     {
4373       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4374       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4375
4376       if (count == 3)
4377         {
4378           unsigned int j0 = 0, j1 = 0, j2 = 0;
4379           unsigned int i, j;
4380
4381           for (j = 0; j < 3; j++)
4382             {
4383               int nelt0 = ((3 - j) * nelt) % 3;
4384               int nelt1 = ((3 - j) * nelt + 1) % 3;
4385               int nelt2 = ((3 - j) * nelt + 2) % 3;
4386               for (i = 0; i < nelt; i++)
4387                 {
4388                   if (3 * i + nelt0 < nelt)
4389                     sel[3 * i + nelt0] = j0++;
4390                   if (3 * i + nelt1 < nelt)
4391                     sel[3 * i + nelt1] = nelt + j1++;
4392                   if (3 * i + nelt2 < nelt)
4393                     sel[3 * i + nelt2] = 0;
4394                 }
4395               if (!can_vec_perm_p (mode, false, sel))
4396                 {
4397                   if (dump_enabled_p ())
4398                     dump_printf (MSG_MISSED_OPTIMIZATION,
4399                                  "permutaion op not supported by target.\n");
4400                   return false;
4401                 }
4402
4403               for (i = 0; i < nelt; i++)
4404                 {
4405                   if (3 * i + nelt0 < nelt)
4406                     sel[3 * i + nelt0] = 3 * i + nelt0;
4407                   if (3 * i + nelt1 < nelt)
4408                     sel[3 * i + nelt1] = 3 * i + nelt1;
4409                   if (3 * i + nelt2 < nelt)
4410                     sel[3 * i + nelt2] = nelt + j2++;
4411                 }
4412               if (!can_vec_perm_p (mode, false, sel))
4413                 {
4414                   if (dump_enabled_p ())
4415                     dump_printf (MSG_MISSED_OPTIMIZATION,
4416                                  "permutaion op not supported by target.\n");
4417                   return false;
4418                 }
4419             }
4420           return true;
4421         }
4422       else
4423         {
4424           /* If length is not equal to 3 then only power of 2 is supported.  */
4425           gcc_assert (pow2p_hwi (count));
4426
4427           for (i = 0; i < nelt / 2; i++)
4428             {
4429               sel[i * 2] = i;
4430               sel[i * 2 + 1] = i + nelt;
4431             }
4432             if (can_vec_perm_p (mode, false, sel))
4433               {
4434                 for (i = 0; i < nelt; i++)
4435                   sel[i] += nelt / 2;
4436                 if (can_vec_perm_p (mode, false, sel))
4437                   return true;
4438               }
4439         }
4440     }
4441
4442   if (dump_enabled_p ())
4443     dump_printf (MSG_MISSED_OPTIMIZATION,
4444                  "permutaion op not supported by target.\n");
4445   return false;
4446 }
4447
4448
4449 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4450    type VECTYPE.  */
4451
4452 bool
4453 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4454 {
4455   return vect_lanes_optab_supported_p ("vec_store_lanes",
4456                                        vec_store_lanes_optab,
4457                                        vectype, count);
4458 }
4459
4460
4461 /* Function vect_permute_store_chain.
4462
4463    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4464    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4465    the data correctly for the stores.  Return the final references for stores
4466    in RESULT_CHAIN.
4467
4468    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4469    The input is 4 vectors each containing 8 elements.  We assign a number to
4470    each element, the input sequence is:
4471
4472    1st vec:   0  1  2  3  4  5  6  7
4473    2nd vec:   8  9 10 11 12 13 14 15
4474    3rd vec:  16 17 18 19 20 21 22 23
4475    4th vec:  24 25 26 27 28 29 30 31
4476
4477    The output sequence should be:
4478
4479    1st vec:  0  8 16 24  1  9 17 25
4480    2nd vec:  2 10 18 26  3 11 19 27
4481    3rd vec:  4 12 20 28  5 13 21 30
4482    4th vec:  6 14 22 30  7 15 23 31
4483
4484    i.e., we interleave the contents of the four vectors in their order.
4485
4486    We use interleave_high/low instructions to create such output.  The input of
4487    each interleave_high/low operation is two vectors:
4488    1st vec    2nd vec
4489    0 1 2 3    4 5 6 7
4490    the even elements of the result vector are obtained left-to-right from the
4491    high/low elements of the first vector.  The odd elements of the result are
4492    obtained left-to-right from the high/low elements of the second vector.
4493    The output of interleave_high will be:   0 4 1 5
4494    and of interleave_low:                   2 6 3 7
4495
4496
4497    The permutation is done in log LENGTH stages.  In each stage interleave_high
4498    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4499    where the first argument is taken from the first half of DR_CHAIN and the
4500    second argument from it's second half.
4501    In our example,
4502
4503    I1: interleave_high (1st vec, 3rd vec)
4504    I2: interleave_low (1st vec, 3rd vec)
4505    I3: interleave_high (2nd vec, 4th vec)
4506    I4: interleave_low (2nd vec, 4th vec)
4507
4508    The output for the first stage is:
4509
4510    I1:  0 16  1 17  2 18  3 19
4511    I2:  4 20  5 21  6 22  7 23
4512    I3:  8 24  9 25 10 26 11 27
4513    I4: 12 28 13 29 14 30 15 31
4514
4515    The output of the second stage, i.e. the final result is:
4516
4517    I1:  0  8 16 24  1  9 17 25
4518    I2:  2 10 18 26  3 11 19 27
4519    I3:  4 12 20 28  5 13 21 30
4520    I4:  6 14 22 30  7 15 23 31.  */
4521
4522 void
4523 vect_permute_store_chain (vec<tree> dr_chain,
4524                           unsigned int length,
4525                           gimple *stmt,
4526                           gimple_stmt_iterator *gsi,
4527                           vec<tree> *result_chain)
4528 {
4529   tree vect1, vect2, high, low;
4530   gimple *perm_stmt;
4531   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4532   tree perm_mask_low, perm_mask_high;
4533   tree data_ref;
4534   tree perm3_mask_low, perm3_mask_high;
4535   unsigned int i, n, log_length = exact_log2 (length);
4536   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4537   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4538
4539   result_chain->quick_grow (length);
4540   memcpy (result_chain->address (), dr_chain.address (),
4541           length * sizeof (tree));
4542
4543   if (length == 3)
4544     {
4545       unsigned int j0 = 0, j1 = 0, j2 = 0;
4546
4547       for (j = 0; j < 3; j++)
4548         {
4549           int nelt0 = ((3 - j) * nelt) % 3;
4550           int nelt1 = ((3 - j) * nelt + 1) % 3;
4551           int nelt2 = ((3 - j) * nelt + 2) % 3;
4552
4553           for (i = 0; i < nelt; i++)
4554             {
4555               if (3 * i + nelt0 < nelt)
4556                 sel[3 * i + nelt0] = j0++;
4557               if (3 * i + nelt1 < nelt)
4558                 sel[3 * i + nelt1] = nelt + j1++;
4559               if (3 * i + nelt2 < nelt)
4560                 sel[3 * i + nelt2] = 0;
4561             }
4562           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4563
4564           for (i = 0; i < nelt; i++)
4565             {
4566               if (3 * i + nelt0 < nelt)
4567                 sel[3 * i + nelt0] = 3 * i + nelt0;
4568               if (3 * i + nelt1 < nelt)
4569                 sel[3 * i + nelt1] = 3 * i + nelt1;
4570               if (3 * i + nelt2 < nelt)
4571                 sel[3 * i + nelt2] = nelt + j2++;
4572             }
4573           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4574
4575           vect1 = dr_chain[0];
4576           vect2 = dr_chain[1];
4577
4578           /* Create interleaving stmt:
4579              low = VEC_PERM_EXPR <vect1, vect2,
4580                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4581                                    j + 2, nelt + j + 2, *, ...}>  */
4582           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4583           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4584                                            vect2, perm3_mask_low);
4585           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4586
4587           vect1 = data_ref;
4588           vect2 = dr_chain[2];
4589           /* Create interleaving stmt:
4590              low = VEC_PERM_EXPR <vect1, vect2,
4591                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4592                                    6, 7, nelt + j + 2, ...}>  */
4593           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4594           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4595                                            vect2, perm3_mask_high);
4596           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4597           (*result_chain)[j] = data_ref;
4598         }
4599     }
4600   else
4601     {
4602       /* If length is not equal to 3 then only power of 2 is supported.  */
4603       gcc_assert (pow2p_hwi (length));
4604
4605       for (i = 0, n = nelt / 2; i < n; i++)
4606         {
4607           sel[i * 2] = i;
4608           sel[i * 2 + 1] = i + nelt;
4609         }
4610         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4611
4612         for (i = 0; i < nelt; i++)
4613           sel[i] += nelt / 2;
4614         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4615
4616         for (i = 0, n = log_length; i < n; i++)
4617           {
4618             for (j = 0; j < length/2; j++)
4619               {
4620                 vect1 = dr_chain[j];
4621                 vect2 = dr_chain[j+length/2];
4622
4623                 /* Create interleaving stmt:
4624                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4625                                                         ...}>  */
4626                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4627                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4628                                                  vect2, perm_mask_high);
4629                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4630                 (*result_chain)[2*j] = high;
4631
4632                 /* Create interleaving stmt:
4633                    low = VEC_PERM_EXPR <vect1, vect2,
4634                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4635                                          ...}>  */
4636                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4637                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4638                                                  vect2, perm_mask_low);
4639                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4640                 (*result_chain)[2*j+1] = low;
4641               }
4642             memcpy (dr_chain.address (), result_chain->address (),
4643                     length * sizeof (tree));
4644           }
4645     }
4646 }
4647
4648 /* Function vect_setup_realignment
4649
4650    This function is called when vectorizing an unaligned load using
4651    the dr_explicit_realign[_optimized] scheme.
4652    This function generates the following code at the loop prolog:
4653
4654       p = initial_addr;
4655    x  msq_init = *(floor(p));   # prolog load
4656       realignment_token = call target_builtin;
4657     loop:
4658    x  msq = phi (msq_init, ---)
4659
4660    The stmts marked with x are generated only for the case of
4661    dr_explicit_realign_optimized.
4662
4663    The code above sets up a new (vector) pointer, pointing to the first
4664    location accessed by STMT, and a "floor-aligned" load using that pointer.
4665    It also generates code to compute the "realignment-token" (if the relevant
4666    target hook was defined), and creates a phi-node at the loop-header bb
4667    whose arguments are the result of the prolog-load (created by this
4668    function) and the result of a load that takes place in the loop (to be
4669    created by the caller to this function).
4670
4671    For the case of dr_explicit_realign_optimized:
4672    The caller to this function uses the phi-result (msq) to create the
4673    realignment code inside the loop, and sets up the missing phi argument,
4674    as follows:
4675     loop:
4676       msq = phi (msq_init, lsq)
4677       lsq = *(floor(p'));        # load in loop
4678       result = realign_load (msq, lsq, realignment_token);
4679
4680    For the case of dr_explicit_realign:
4681     loop:
4682       msq = *(floor(p));        # load in loop
4683       p' = p + (VS-1);
4684       lsq = *(floor(p'));       # load in loop
4685       result = realign_load (msq, lsq, realignment_token);
4686
4687    Input:
4688    STMT - (scalar) load stmt to be vectorized. This load accesses
4689           a memory location that may be unaligned.
4690    BSI - place where new code is to be inserted.
4691    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4692                               is used.
4693
4694    Output:
4695    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4696                        target hook, if defined.
4697    Return value - the result of the loop-header phi node.  */
4698
4699 tree
4700 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4701                         tree *realignment_token,
4702                         enum dr_alignment_support alignment_support_scheme,
4703                         tree init_addr,
4704                         struct loop **at_loop)
4705 {
4706   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4707   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4708   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4709   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4710   struct loop *loop = NULL;
4711   edge pe = NULL;
4712   tree scalar_dest = gimple_assign_lhs (stmt);
4713   tree vec_dest;
4714   gimple *inc;
4715   tree ptr;
4716   tree data_ref;
4717   basic_block new_bb;
4718   tree msq_init = NULL_TREE;
4719   tree new_temp;
4720   gphi *phi_stmt;
4721   tree msq = NULL_TREE;
4722   gimple_seq stmts = NULL;
4723   bool inv_p;
4724   bool compute_in_loop = false;
4725   bool nested_in_vect_loop = false;
4726   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4727   struct loop *loop_for_initial_load = NULL;
4728
4729   if (loop_vinfo)
4730     {
4731       loop = LOOP_VINFO_LOOP (loop_vinfo);
4732       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4733     }
4734
4735   gcc_assert (alignment_support_scheme == dr_explicit_realign
4736               || alignment_support_scheme == dr_explicit_realign_optimized);
4737
4738   /* We need to generate three things:
4739      1. the misalignment computation
4740      2. the extra vector load (for the optimized realignment scheme).
4741      3. the phi node for the two vectors from which the realignment is
4742       done (for the optimized realignment scheme).  */
4743
4744   /* 1. Determine where to generate the misalignment computation.
4745
4746      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4747      calculation will be generated by this function, outside the loop (in the
4748      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4749      caller, inside the loop.
4750
4751      Background: If the misalignment remains fixed throughout the iterations of
4752      the loop, then both realignment schemes are applicable, and also the
4753      misalignment computation can be done outside LOOP.  This is because we are
4754      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4755      are a multiple of VS (the Vector Size), and therefore the misalignment in
4756      different vectorized LOOP iterations is always the same.
4757      The problem arises only if the memory access is in an inner-loop nested
4758      inside LOOP, which is now being vectorized using outer-loop vectorization.
4759      This is the only case when the misalignment of the memory access may not
4760      remain fixed throughout the iterations of the inner-loop (as explained in
4761      detail in vect_supportable_dr_alignment).  In this case, not only is the
4762      optimized realignment scheme not applicable, but also the misalignment
4763      computation (and generation of the realignment token that is passed to
4764      REALIGN_LOAD) have to be done inside the loop.
4765
4766      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4767      or not, which in turn determines if the misalignment is computed inside
4768      the inner-loop, or outside LOOP.  */
4769
4770   if (init_addr != NULL_TREE || !loop_vinfo)
4771     {
4772       compute_in_loop = true;
4773       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4774     }
4775
4776
4777   /* 2. Determine where to generate the extra vector load.
4778
4779      For the optimized realignment scheme, instead of generating two vector
4780      loads in each iteration, we generate a single extra vector load in the
4781      preheader of the loop, and in each iteration reuse the result of the
4782      vector load from the previous iteration.  In case the memory access is in
4783      an inner-loop nested inside LOOP, which is now being vectorized using
4784      outer-loop vectorization, we need to determine whether this initial vector
4785      load should be generated at the preheader of the inner-loop, or can be
4786      generated at the preheader of LOOP.  If the memory access has no evolution
4787      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4788      to be generated inside LOOP (in the preheader of the inner-loop).  */
4789
4790   if (nested_in_vect_loop)
4791     {
4792       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4793       bool invariant_in_outerloop =
4794             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4795       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4796     }
4797   else
4798     loop_for_initial_load = loop;
4799   if (at_loop)
4800     *at_loop = loop_for_initial_load;
4801
4802   if (loop_for_initial_load)
4803     pe = loop_preheader_edge (loop_for_initial_load);
4804
4805   /* 3. For the case of the optimized realignment, create the first vector
4806       load at the loop preheader.  */
4807
4808   if (alignment_support_scheme == dr_explicit_realign_optimized)
4809     {
4810       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4811       gassign *new_stmt;
4812
4813       gcc_assert (!compute_in_loop);
4814       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4815       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4816                                       NULL_TREE, &init_addr, NULL, &inc,
4817                                       true, &inv_p);
4818       if (TREE_CODE (ptr) == SSA_NAME)
4819         new_temp = copy_ssa_name (ptr);
4820       else
4821         new_temp = make_ssa_name (TREE_TYPE (ptr));
4822       new_stmt = gimple_build_assign
4823                    (new_temp, BIT_AND_EXPR, ptr,
4824                     build_int_cst (TREE_TYPE (ptr),
4825                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4826       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4827       gcc_assert (!new_bb);
4828       data_ref
4829         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4830                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4831       new_stmt = gimple_build_assign (vec_dest, data_ref);
4832       new_temp = make_ssa_name (vec_dest, new_stmt);
4833       gimple_assign_set_lhs (new_stmt, new_temp);
4834       if (pe)
4835         {
4836           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4837           gcc_assert (!new_bb);
4838         }
4839       else
4840          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4841
4842       msq_init = gimple_assign_lhs (new_stmt);
4843     }
4844
4845   /* 4. Create realignment token using a target builtin, if available.
4846       It is done either inside the containing loop, or before LOOP (as
4847       determined above).  */
4848
4849   if (targetm.vectorize.builtin_mask_for_load)
4850     {
4851       gcall *new_stmt;
4852       tree builtin_decl;
4853
4854       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4855       if (!init_addr)
4856         {
4857           /* Generate the INIT_ADDR computation outside LOOP.  */
4858           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4859                                                             NULL_TREE);
4860           if (loop)
4861             {
4862               pe = loop_preheader_edge (loop);
4863               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4864               gcc_assert (!new_bb);
4865             }
4866           else
4867              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4868         }
4869
4870       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4871       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4872       vec_dest =
4873         vect_create_destination_var (scalar_dest,
4874                                      gimple_call_return_type (new_stmt));
4875       new_temp = make_ssa_name (vec_dest, new_stmt);
4876       gimple_call_set_lhs (new_stmt, new_temp);
4877
4878       if (compute_in_loop)
4879         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4880       else
4881         {
4882           /* Generate the misalignment computation outside LOOP.  */
4883           pe = loop_preheader_edge (loop);
4884           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4885           gcc_assert (!new_bb);
4886         }
4887
4888       *realignment_token = gimple_call_lhs (new_stmt);
4889
4890       /* The result of the CALL_EXPR to this builtin is determined from
4891          the value of the parameter and no global variables are touched
4892          which makes the builtin a "const" function.  Requiring the
4893          builtin to have the "const" attribute makes it unnecessary
4894          to call mark_call_clobbered.  */
4895       gcc_assert (TREE_READONLY (builtin_decl));
4896     }
4897
4898   if (alignment_support_scheme == dr_explicit_realign)
4899     return msq;
4900
4901   gcc_assert (!compute_in_loop);
4902   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4903
4904
4905   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4906
4907   pe = loop_preheader_edge (containing_loop);
4908   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4909   msq = make_ssa_name (vec_dest);
4910   phi_stmt = create_phi_node (msq, containing_loop->header);
4911   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4912
4913   return msq;
4914 }
4915
4916
4917 /* Function vect_grouped_load_supported.
4918
4919    COUNT is the size of the load group (the number of statements plus the
4920    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
4921    only one statement, with a gap of COUNT - 1.
4922
4923    Returns true if a suitable permute exists.  */
4924
4925 bool
4926 vect_grouped_load_supported (tree vectype, bool single_element_p,
4927                              unsigned HOST_WIDE_INT count)
4928 {
4929   machine_mode mode = TYPE_MODE (vectype);
4930
4931   /* If this is single-element interleaving with an element distance
4932      that leaves unused vector loads around punt - we at least create
4933      very sub-optimal code in that case (and blow up memory,
4934      see PR65518).  */
4935   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
4936     {
4937       if (dump_enabled_p ())
4938         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4939                          "single-element interleaving not supported "
4940                          "for not adjacent vector loads\n");
4941       return false;
4942     }
4943
4944   /* vect_permute_load_chain requires the group size to be equal to 3 or
4945      be a power of two.  */
4946   if (count != 3 && exact_log2 (count) == -1)
4947     {
4948       if (dump_enabled_p ())
4949         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950                          "the size of the group of accesses"
4951                          " is not a power of 2 or not equal to 3\n");
4952       return false;
4953     }
4954
4955   /* Check that the permutation is supported.  */
4956   if (VECTOR_MODE_P (mode))
4957     {
4958       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4959       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4960
4961       if (count == 3)
4962         {
4963           unsigned int k;
4964           for (k = 0; k < 3; k++)
4965             {
4966               for (i = 0; i < nelt; i++)
4967                 if (3 * i + k < 2 * nelt)
4968                   sel[i] = 3 * i + k;
4969                 else
4970                   sel[i] = 0;
4971               if (!can_vec_perm_p (mode, false, sel))
4972                 {
4973                   if (dump_enabled_p ())
4974                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4975                                      "shuffle of 3 loads is not supported by"
4976                                      " target\n");
4977                   return false;
4978                 }
4979               for (i = 0, j = 0; i < nelt; i++)
4980                 if (3 * i + k < 2 * nelt)
4981                   sel[i] = i;
4982                 else
4983                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4984               if (!can_vec_perm_p (mode, false, sel))
4985                 {
4986                   if (dump_enabled_p ())
4987                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988                                      "shuffle of 3 loads is not supported by"
4989                                      " target\n");
4990                   return false;
4991                 }
4992             }
4993           return true;
4994         }
4995       else
4996         {
4997           /* If length is not equal to 3 then only power of 2 is supported.  */
4998           gcc_assert (pow2p_hwi (count));
4999           for (i = 0; i < nelt; i++)
5000             sel[i] = i * 2;
5001           if (can_vec_perm_p (mode, false, sel))
5002             {
5003               for (i = 0; i < nelt; i++)
5004                 sel[i] = i * 2 + 1;
5005               if (can_vec_perm_p (mode, false, sel))
5006                 return true;
5007             }
5008         }
5009     }
5010
5011   if (dump_enabled_p ())
5012     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5013                      "extract even/odd not supported by target\n");
5014   return false;
5015 }
5016
5017 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5018    type VECTYPE.  */
5019
5020 bool
5021 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5022 {
5023   return vect_lanes_optab_supported_p ("vec_load_lanes",
5024                                        vec_load_lanes_optab,
5025                                        vectype, count);
5026 }
5027
5028 /* Function vect_permute_load_chain.
5029
5030    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5031    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5032    the input data correctly.  Return the final references for loads in
5033    RESULT_CHAIN.
5034
5035    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5036    The input is 4 vectors each containing 8 elements. We assign a number to each
5037    element, the input sequence is:
5038
5039    1st vec:   0  1  2  3  4  5  6  7
5040    2nd vec:   8  9 10 11 12 13 14 15
5041    3rd vec:  16 17 18 19 20 21 22 23
5042    4th vec:  24 25 26 27 28 29 30 31
5043
5044    The output sequence should be:
5045
5046    1st vec:  0 4  8 12 16 20 24 28
5047    2nd vec:  1 5  9 13 17 21 25 29
5048    3rd vec:  2 6 10 14 18 22 26 30
5049    4th vec:  3 7 11 15 19 23 27 31
5050
5051    i.e., the first output vector should contain the first elements of each
5052    interleaving group, etc.
5053
5054    We use extract_even/odd instructions to create such output.  The input of
5055    each extract_even/odd operation is two vectors
5056    1st vec    2nd vec
5057    0 1 2 3    4 5 6 7
5058
5059    and the output is the vector of extracted even/odd elements.  The output of
5060    extract_even will be:   0 2 4 6
5061    and of extract_odd:     1 3 5 7
5062
5063
5064    The permutation is done in log LENGTH stages.  In each stage extract_even
5065    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5066    their order.  In our example,
5067
5068    E1: extract_even (1st vec, 2nd vec)
5069    E2: extract_odd (1st vec, 2nd vec)
5070    E3: extract_even (3rd vec, 4th vec)
5071    E4: extract_odd (3rd vec, 4th vec)
5072
5073    The output for the first stage will be:
5074
5075    E1:  0  2  4  6  8 10 12 14
5076    E2:  1  3  5  7  9 11 13 15
5077    E3: 16 18 20 22 24 26 28 30
5078    E4: 17 19 21 23 25 27 29 31
5079
5080    In order to proceed and create the correct sequence for the next stage (or
5081    for the correct output, if the second stage is the last one, as in our
5082    example), we first put the output of extract_even operation and then the
5083    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5084    The input for the second stage is:
5085
5086    1st vec (E1):  0  2  4  6  8 10 12 14
5087    2nd vec (E3): 16 18 20 22 24 26 28 30
5088    3rd vec (E2):  1  3  5  7  9 11 13 15
5089    4th vec (E4): 17 19 21 23 25 27 29 31
5090
5091    The output of the second stage:
5092
5093    E1: 0 4  8 12 16 20 24 28
5094    E2: 2 6 10 14 18 22 26 30
5095    E3: 1 5  9 13 17 21 25 29
5096    E4: 3 7 11 15 19 23 27 31
5097
5098    And RESULT_CHAIN after reordering:
5099
5100    1st vec (E1):  0 4  8 12 16 20 24 28
5101    2nd vec (E3):  1 5  9 13 17 21 25 29
5102    3rd vec (E2):  2 6 10 14 18 22 26 30
5103    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5104
5105 static void
5106 vect_permute_load_chain (vec<tree> dr_chain,
5107                          unsigned int length,
5108                          gimple *stmt,
5109                          gimple_stmt_iterator *gsi,
5110                          vec<tree> *result_chain)
5111 {
5112   tree data_ref, first_vect, second_vect;
5113   tree perm_mask_even, perm_mask_odd;
5114   tree perm3_mask_low, perm3_mask_high;
5115   gimple *perm_stmt;
5116   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5117   unsigned int i, j, log_length = exact_log2 (length);
5118   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5119   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5120
5121   result_chain->quick_grow (length);
5122   memcpy (result_chain->address (), dr_chain.address (),
5123           length * sizeof (tree));
5124
5125   if (length == 3)
5126     {
5127       unsigned int k;
5128
5129       for (k = 0; k < 3; k++)
5130         {
5131           for (i = 0; i < nelt; i++)
5132             if (3 * i + k < 2 * nelt)
5133               sel[i] = 3 * i + k;
5134             else
5135               sel[i] = 0;
5136           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5137
5138           for (i = 0, j = 0; i < nelt; i++)
5139             if (3 * i + k < 2 * nelt)
5140               sel[i] = i;
5141             else
5142               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5143
5144           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5145
5146           first_vect = dr_chain[0];
5147           second_vect = dr_chain[1];
5148
5149           /* Create interleaving stmt (low part of):
5150              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5151                                                              ...}>  */
5152           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5153           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5154                                            second_vect, perm3_mask_low);
5155           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5156
5157           /* Create interleaving stmt (high part of):
5158              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5159                                                               ...}>  */
5160           first_vect = data_ref;
5161           second_vect = dr_chain[2];
5162           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5163           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5164                                            second_vect, perm3_mask_high);
5165           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5166           (*result_chain)[k] = data_ref;
5167         }
5168     }
5169   else
5170     {
5171       /* If length is not equal to 3 then only power of 2 is supported.  */
5172       gcc_assert (pow2p_hwi (length));
5173
5174       for (i = 0; i < nelt; ++i)
5175         sel[i] = i * 2;
5176       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5177
5178       for (i = 0; i < nelt; ++i)
5179         sel[i] = i * 2 + 1;
5180       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5181
5182       for (i = 0; i < log_length; i++)
5183         {
5184           for (j = 0; j < length; j += 2)
5185             {
5186               first_vect = dr_chain[j];
5187               second_vect = dr_chain[j+1];
5188
5189               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5190               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5191               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5192                                                first_vect, second_vect,
5193                                                perm_mask_even);
5194               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5195               (*result_chain)[j/2] = data_ref;
5196
5197               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5198               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5199               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5200                                                first_vect, second_vect,
5201                                                perm_mask_odd);
5202               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5203               (*result_chain)[j/2+length/2] = data_ref;
5204             }
5205           memcpy (dr_chain.address (), result_chain->address (),
5206                   length * sizeof (tree));
5207         }
5208     }
5209 }
5210
5211 /* Function vect_shift_permute_load_chain.
5212
5213    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5214    sequence of stmts to reorder the input data accordingly.
5215    Return the final references for loads in RESULT_CHAIN.
5216    Return true if successed, false otherwise.
5217
5218    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5219    The input is 3 vectors each containing 8 elements.  We assign a
5220    number to each element, the input sequence is:
5221
5222    1st vec:   0  1  2  3  4  5  6  7
5223    2nd vec:   8  9 10 11 12 13 14 15
5224    3rd vec:  16 17 18 19 20 21 22 23
5225
5226    The output sequence should be:
5227
5228    1st vec:  0 3 6  9 12 15 18 21
5229    2nd vec:  1 4 7 10 13 16 19 22
5230    3rd vec:  2 5 8 11 14 17 20 23
5231
5232    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5233
5234    First we shuffle all 3 vectors to get correct elements order:
5235
5236    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5237    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5238    3rd vec:  (16 19 22) (17 20 23) (18 21)
5239
5240    Next we unite and shift vector 3 times:
5241
5242    1st step:
5243      shift right by 6 the concatenation of:
5244      "1st vec" and  "2nd vec"
5245        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5246      "2nd vec" and  "3rd vec"
5247        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5248      "3rd vec" and  "1st vec"
5249        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5250                              | New vectors                   |
5251
5252      So that now new vectors are:
5253
5254      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5255      2nd vec:  (10 13) (16 19 22) (17 20 23)
5256      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5257
5258    2nd step:
5259      shift right by 5 the concatenation of:
5260      "1st vec" and  "3rd vec"
5261        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5262      "2nd vec" and  "1st vec"
5263        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5264      "3rd vec" and  "2nd vec"
5265        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5266                           | New vectors                   |
5267
5268      So that now new vectors are:
5269
5270      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5271      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5272      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5273
5274    3rd step:
5275      shift right by 5 the concatenation of:
5276      "1st vec" and  "1st vec"
5277        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5278      shift right by 3 the concatenation of:
5279      "2nd vec" and  "2nd vec"
5280                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5281                           | New vectors                   |
5282
5283      So that now all vectors are READY:
5284      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5285      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5286      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5287
5288    This algorithm is faster than one in vect_permute_load_chain if:
5289      1.  "shift of a concatination" is faster than general permutation.
5290          This is usually so.
5291      2.  The TARGET machine can't execute vector instructions in parallel.
5292          This is because each step of the algorithm depends on previous.
5293          The algorithm in vect_permute_load_chain is much more parallel.
5294
5295    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5296 */
5297
5298 static bool
5299 vect_shift_permute_load_chain (vec<tree> dr_chain,
5300                                unsigned int length,
5301                                gimple *stmt,
5302                                gimple_stmt_iterator *gsi,
5303                                vec<tree> *result_chain)
5304 {
5305   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5306   tree perm2_mask1, perm2_mask2, perm3_mask;
5307   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5308   gimple *perm_stmt;
5309
5310   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5311   unsigned int i;
5312   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5313   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5314   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5315   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5316
5317   result_chain->quick_grow (length);
5318   memcpy (result_chain->address (), dr_chain.address (),
5319           length * sizeof (tree));
5320
5321   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5322     {
5323       unsigned int j, log_length = exact_log2 (length);
5324       for (i = 0; i < nelt / 2; ++i)
5325         sel[i] = i * 2;
5326       for (i = 0; i < nelt / 2; ++i)
5327         sel[nelt / 2 + i] = i * 2 + 1;
5328       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5329         {
5330           if (dump_enabled_p ())
5331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5332                              "shuffle of 2 fields structure is not \
5333                               supported by target\n");
5334           return false;
5335         }
5336       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5337
5338       for (i = 0; i < nelt / 2; ++i)
5339         sel[i] = i * 2 + 1;
5340       for (i = 0; i < nelt / 2; ++i)
5341         sel[nelt / 2 + i] = i * 2;
5342       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5343         {
5344           if (dump_enabled_p ())
5345             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5346                              "shuffle of 2 fields structure is not \
5347                               supported by target\n");
5348           return false;
5349         }
5350       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5351
5352       /* Generating permutation constant to shift all elements.
5353          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5354       for (i = 0; i < nelt; i++)
5355         sel[i] = nelt / 2 + i;
5356       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5357         {
5358           if (dump_enabled_p ())
5359             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5360                              "shift permutation is not supported by target\n");
5361           return false;
5362         }
5363       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5364
5365       /* Generating permutation constant to select vector from 2.
5366          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5367       for (i = 0; i < nelt / 2; i++)
5368         sel[i] = i;
5369       for (i = nelt / 2; i < nelt; i++)
5370         sel[i] = nelt + i;
5371       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5372         {
5373           if (dump_enabled_p ())
5374             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375                              "select is not supported by target\n");
5376           return false;
5377         }
5378       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5379
5380       for (i = 0; i < log_length; i++)
5381         {
5382           for (j = 0; j < length; j += 2)
5383             {
5384               first_vect = dr_chain[j];
5385               second_vect = dr_chain[j + 1];
5386
5387               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5388               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5389                                                first_vect, first_vect,
5390                                                perm2_mask1);
5391               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392               vect[0] = data_ref;
5393
5394               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5395               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5396                                                second_vect, second_vect,
5397                                                perm2_mask2);
5398               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5399               vect[1] = data_ref;
5400
5401               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5402               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5403                                                vect[0], vect[1], shift1_mask);
5404               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405               (*result_chain)[j/2 + length/2] = data_ref;
5406
5407               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5408               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5409                                                vect[0], vect[1], select_mask);
5410               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5411               (*result_chain)[j/2] = data_ref;
5412             }
5413           memcpy (dr_chain.address (), result_chain->address (),
5414                   length * sizeof (tree));
5415         }
5416       return true;
5417     }
5418   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5419     {
5420       unsigned int k = 0, l = 0;
5421
5422       /* Generating permutation constant to get all elements in rigth order.
5423          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5424       for (i = 0; i < nelt; i++)
5425         {
5426           if (3 * k + (l % 3) >= nelt)
5427             {
5428               k = 0;
5429               l += (3 - (nelt % 3));
5430             }
5431           sel[i] = 3 * k + (l % 3);
5432           k++;
5433         }
5434       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5435         {
5436           if (dump_enabled_p ())
5437             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5438                              "shuffle of 3 fields structure is not \
5439                               supported by target\n");
5440           return false;
5441         }
5442       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5443
5444       /* Generating permutation constant to shift all elements.
5445          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5446       for (i = 0; i < nelt; i++)
5447         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5448       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5449         {
5450           if (dump_enabled_p ())
5451             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5452                              "shift permutation is not supported by target\n");
5453           return false;
5454         }
5455       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5456
5457       /* Generating permutation constant to shift all elements.
5458          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5459       for (i = 0; i < nelt; i++)
5460         sel[i] = 2 * (nelt / 3) + 1 + i;
5461       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5462         {
5463           if (dump_enabled_p ())
5464             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5465                              "shift permutation is not supported by target\n");
5466           return false;
5467         }
5468       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5469
5470       /* Generating permutation constant to shift all elements.
5471          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5472       for (i = 0; i < nelt; i++)
5473         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5474       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5475         {
5476           if (dump_enabled_p ())
5477             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5478                              "shift permutation is not supported by target\n");
5479           return false;
5480         }
5481       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5482
5483       /* Generating permutation constant to shift all elements.
5484          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5485       for (i = 0; i < nelt; i++)
5486         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5487       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5488         {
5489           if (dump_enabled_p ())
5490             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491                              "shift permutation is not supported by target\n");
5492           return false;
5493         }
5494       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5495
5496       for (k = 0; k < 3; k++)
5497         {
5498           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5499           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5500                                            dr_chain[k], dr_chain[k],
5501                                            perm3_mask);
5502           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5503           vect[k] = data_ref;
5504         }
5505
5506       for (k = 0; k < 3; k++)
5507         {
5508           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5509           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5510                                            vect[k % 3], vect[(k + 1) % 3],
5511                                            shift1_mask);
5512           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5513           vect_shift[k] = data_ref;
5514         }
5515
5516       for (k = 0; k < 3; k++)
5517         {
5518           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5519           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5520                                            vect_shift[(4 - k) % 3],
5521                                            vect_shift[(3 - k) % 3],
5522                                            shift2_mask);
5523           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5524           vect[k] = data_ref;
5525         }
5526
5527       (*result_chain)[3 - (nelt % 3)] = vect[2];
5528
5529       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5530       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5531                                        vect[0], shift3_mask);
5532       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5533       (*result_chain)[nelt % 3] = data_ref;
5534
5535       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5536       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5537                                        vect[1], shift4_mask);
5538       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5539       (*result_chain)[0] = data_ref;
5540       return true;
5541     }
5542   return false;
5543 }
5544
5545 /* Function vect_transform_grouped_load.
5546
5547    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5548    to perform their permutation and ascribe the result vectorized statements to
5549    the scalar statements.
5550 */
5551
5552 void
5553 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5554                              gimple_stmt_iterator *gsi)
5555 {
5556   machine_mode mode;
5557   vec<tree> result_chain = vNULL;
5558
5559   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5560      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5561      vectors, that are ready for vector computation.  */
5562   result_chain.create (size);
5563
5564   /* If reassociation width for vector type is 2 or greater target machine can
5565      execute 2 or more vector instructions in parallel.  Otherwise try to
5566      get chain for loads group using vect_shift_permute_load_chain.  */
5567   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5568   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5569       || pow2p_hwi (size)
5570       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5571                                          gsi, &result_chain))
5572     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5573   vect_record_grouped_load_vectors (stmt, result_chain);
5574   result_chain.release ();
5575 }
5576
5577 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5578    generated as part of the vectorization of STMT.  Assign the statement
5579    for each vector to the associated scalar statement.  */
5580
5581 void
5582 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5583 {
5584   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5585   gimple *next_stmt, *new_stmt;
5586   unsigned int i, gap_count;
5587   tree tmp_data_ref;
5588
5589   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5590      Since we scan the chain starting from it's first node, their order
5591      corresponds the order of data-refs in RESULT_CHAIN.  */
5592   next_stmt = first_stmt;
5593   gap_count = 1;
5594   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5595     {
5596       if (!next_stmt)
5597         break;
5598
5599       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5600        code elimination pass later.  No need to check for the first stmt in
5601        the group, since it always exists.
5602        GROUP_GAP is the number of steps in elements from the previous
5603        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5604        correspond to the gaps.  */
5605       if (next_stmt != first_stmt
5606           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5607       {
5608         gap_count++;
5609         continue;
5610       }
5611
5612       while (next_stmt)
5613         {
5614           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5615           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5616              copies, and we put the new vector statement in the first available
5617              RELATED_STMT.  */
5618           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5619             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5620           else
5621             {
5622               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5623                 {
5624                   gimple *prev_stmt =
5625                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5626                   gimple *rel_stmt =
5627                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5628                   while (rel_stmt)
5629                     {
5630                       prev_stmt = rel_stmt;
5631                       rel_stmt =
5632                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5633                     }
5634
5635                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5636                     new_stmt;
5637                 }
5638             }
5639
5640           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5641           gap_count = 1;
5642           /* If NEXT_STMT accesses the same DR as the previous statement,
5643              put the same TMP_DATA_REF as its vectorized statement; otherwise
5644              get the next data-ref from RESULT_CHAIN.  */
5645           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5646             break;
5647         }
5648     }
5649 }
5650
5651 /* Function vect_force_dr_alignment_p.
5652
5653    Returns whether the alignment of a DECL can be forced to be aligned
5654    on ALIGNMENT bit boundary.  */
5655
5656 bool
5657 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5658 {
5659   if (!VAR_P (decl))
5660     return false;
5661
5662   if (decl_in_symtab_p (decl)
5663       && !symtab_node::get (decl)->can_increase_alignment_p ())
5664     return false;
5665
5666   if (TREE_STATIC (decl))
5667     return (alignment <= MAX_OFILE_ALIGNMENT);
5668   else
5669     return (alignment <= MAX_STACK_ALIGNMENT);
5670 }
5671
5672
5673 /* Return whether the data reference DR is supported with respect to its
5674    alignment.
5675    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5676    it is aligned, i.e., check if it is possible to vectorize it with different
5677    alignment.  */
5678
5679 enum dr_alignment_support
5680 vect_supportable_dr_alignment (struct data_reference *dr,
5681                                bool check_aligned_accesses)
5682 {
5683   gimple *stmt = DR_STMT (dr);
5684   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5685   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5686   machine_mode mode = TYPE_MODE (vectype);
5687   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5688   struct loop *vect_loop = NULL;
5689   bool nested_in_vect_loop = false;
5690
5691   if (aligned_access_p (dr) && !check_aligned_accesses)
5692     return dr_aligned;
5693
5694   /* For now assume all conditional loads/stores support unaligned
5695      access without any special code.  */
5696   if (is_gimple_call (stmt)
5697       && gimple_call_internal_p (stmt)
5698       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5699           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5700     return dr_unaligned_supported;
5701
5702   if (loop_vinfo)
5703     {
5704       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5705       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5706     }
5707
5708   /* Possibly unaligned access.  */
5709
5710   /* We can choose between using the implicit realignment scheme (generating
5711      a misaligned_move stmt) and the explicit realignment scheme (generating
5712      aligned loads with a REALIGN_LOAD).  There are two variants to the
5713      explicit realignment scheme: optimized, and unoptimized.
5714      We can optimize the realignment only if the step between consecutive
5715      vector loads is equal to the vector size.  Since the vector memory
5716      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5717      is guaranteed that the misalignment amount remains the same throughout the
5718      execution of the vectorized loop.  Therefore, we can create the
5719      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5720      at the loop preheader.
5721
5722      However, in the case of outer-loop vectorization, when vectorizing a
5723      memory access in the inner-loop nested within the LOOP that is now being
5724      vectorized, while it is guaranteed that the misalignment of the
5725      vectorized memory access will remain the same in different outer-loop
5726      iterations, it is *not* guaranteed that is will remain the same throughout
5727      the execution of the inner-loop.  This is because the inner-loop advances
5728      with the original scalar step (and not in steps of VS).  If the inner-loop
5729      step happens to be a multiple of VS, then the misalignment remains fixed
5730      and we can use the optimized realignment scheme.  For example:
5731
5732       for (i=0; i<N; i++)
5733         for (j=0; j<M; j++)
5734           s += a[i+j];
5735
5736      When vectorizing the i-loop in the above example, the step between
5737      consecutive vector loads is 1, and so the misalignment does not remain
5738      fixed across the execution of the inner-loop, and the realignment cannot
5739      be optimized (as illustrated in the following pseudo vectorized loop):
5740
5741       for (i=0; i<N; i+=4)
5742         for (j=0; j<M; j++){
5743           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5744                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5745                          // (assuming that we start from an aligned address).
5746           }
5747
5748      We therefore have to use the unoptimized realignment scheme:
5749
5750       for (i=0; i<N; i+=4)
5751           for (j=k; j<M; j+=4)
5752           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5753                            // that the misalignment of the initial address is
5754                            // 0).
5755
5756      The loop can then be vectorized as follows:
5757
5758       for (k=0; k<4; k++){
5759         rt = get_realignment_token (&vp[k]);
5760         for (i=0; i<N; i+=4){
5761           v1 = vp[i+k];
5762           for (j=k; j<M; j+=4){
5763             v2 = vp[i+j+VS-1];
5764             va = REALIGN_LOAD <v1,v2,rt>;
5765             vs += va;
5766             v1 = v2;
5767           }
5768         }
5769     } */
5770
5771   if (DR_IS_READ (dr))
5772     {
5773       bool is_packed = false;
5774       tree type = (TREE_TYPE (DR_REF (dr)));
5775
5776       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5777           && (!targetm.vectorize.builtin_mask_for_load
5778               || targetm.vectorize.builtin_mask_for_load ()))
5779         {
5780           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5781
5782           /* If we are doing SLP then the accesses need not have the
5783              same alignment, instead it depends on the SLP group size.  */
5784           if (loop_vinfo
5785               && STMT_SLP_TYPE (stmt_info)
5786               && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5787                   * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5788                   % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5789             ;
5790           else if (!loop_vinfo
5791                    || (nested_in_vect_loop
5792                        && (TREE_INT_CST_LOW (DR_STEP (dr))
5793                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5794             return dr_explicit_realign;
5795           else
5796             return dr_explicit_realign_optimized;
5797         }
5798       if (!known_alignment_for_access_p (dr))
5799         is_packed = not_size_aligned (DR_REF (dr));
5800
5801       if (targetm.vectorize.support_vector_misalignment
5802             (mode, type, DR_MISALIGNMENT (dr), is_packed))
5803         /* Can't software pipeline the loads, but can at least do them.  */
5804         return dr_unaligned_supported;
5805     }
5806   else
5807     {
5808       bool is_packed = false;
5809       tree type = (TREE_TYPE (DR_REF (dr)));
5810
5811       if (!known_alignment_for_access_p (dr))
5812         is_packed = not_size_aligned (DR_REF (dr));
5813
5814      if (targetm.vectorize.support_vector_misalignment
5815            (mode, type, DR_MISALIGNMENT (dr), is_packed))
5816        return dr_unaligned_supported;
5817     }
5818
5819   /* Unsupported.  */
5820   return dr_unaligned_unsupported;
5821 }