gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54
  55 /* Return true if load- or store-lanes optab OPTAB is implemented for
  56    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  57
  58 static bool
  59 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  60                               tree vectype, unsigned HOST_WIDE_INT count)
  61 {
  62   machine_mode mode, array_mode;
  63   bool limit_p;
  64
  65   mode = TYPE_MODE (vectype);
  66   limit_p = !targetm.array_mode_supported_p (mode, count);
  67   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  68                               MODE_INT, limit_p);
  69
  70   if (array_mode == BLKmode)
  71     {
  72       if (dump_enabled_p ())
  73         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  75                          GET_MODE_NAME (mode), count);
  76       return false;
  77     }
  78
  79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  80     {
  81       if (dump_enabled_p ())
  82         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                          "cannot use %s<%s><%s>\n", name,
  84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  85       return false;
  86     }
  87
  88   if (dump_enabled_p ())
  89     dump_printf_loc (MSG_NOTE, vect_location,
  90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  91                      GET_MODE_NAME (mode));
  92
  93   return true;
  94 }
  95
  96
  97 /* Return the smallest scalar part of STMT.
  98    This is used to determine the vectype of the stmt.  We generally set the
  99    vectype according to the type of the result (lhs).  For stmts whose
 100    result-type is different than the type of the arguments (e.g., demotion,
 101    promotion), vectype will be reset appropriately (later).  Note that we have
 102    to visit the smallest datatype in this function, because that determines the
 103    VF.  If the smallest datatype in the loop is present only as the rhs of a
 104    promotion operation - we'd miss it.
 105    Such a case, where a variable of this datatype does not appear in the lhs
 106    anywhere in the loop, can only occur if it's an invariant: e.g.:
 107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 108    invariant motion.  However, we cannot rely on invariant motion to always
 109    take invariants out of the loop, and so in the case of promotion we also
 110    have to check the rhs.
 111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 112    types.  */
 113
 114 tree
 115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 116                                HOST_WIDE_INT *rhs_size_unit)
 117 {
 118   tree scalar_type = gimple_expr_type (stmt);
 119   HOST_WIDE_INT lhs, rhs;
 120
 121   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 122
 123   if (is_gimple_assign (stmt)
 124       && (gimple_assign_cast_p (stmt)
 125           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 126           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 127           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 128     {
 129       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 130
 131       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 132       if (rhs < lhs)
 133         scalar_type = rhs_type;
 134     }
 135
 136   *lhs_size_unit = lhs;
 137   *rhs_size_unit = rhs;
 138   return scalar_type;
 139 }
 140
 141
 142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 143    tested at run-time.  Return TRUE if DDR was successfully inserted.
 144    Return false if versioning is not supported.  */
 145
 146 static bool
 147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 148 {
 149   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 150
 151   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 152     return false;
 153
 154   if (!runtime_alias_check_p (ddr, loop,
 155                               optimize_loop_nest_for_speed_p (loop)))
 156     return false;
 157
 158   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 159   return true;
 160 }
 161
 162
 163 /* Function vect_analyze_data_ref_dependence.
 164
 165    Return TRUE if there (might) exist a dependence between a memory-reference
 166    DRA and a memory-reference DRB.  When versioning for alias may check a
 167    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 168    the data dependence.  */
 169
 170 static bool
 171 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 172                                   loop_vec_info loop_vinfo, int *max_vf)
 173 {
 174   unsigned int i;
 175   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 176   struct data_reference *dra = DDR_A (ddr);
 177   struct data_reference *drb = DDR_B (ddr);
 178   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 179   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 180   lambda_vector dist_v;
 181   unsigned int loop_depth;
 182
 183   /* In loop analysis all data references should be vectorizable.  */
 184   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 185       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 186     gcc_unreachable ();
 187
 188   /* Independent data accesses.  */
 189   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 190     return false;
 191
 192   if (dra == drb
 193       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 194     return false;
 195
 196   /* We do not have to consider dependences between accesses that belong
 197      to the same group.  */
 198   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 199       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 200     return false;
 201
 202   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 203      least two scalar iterations, there is always also a true dependence.
 204      As the vectorizer does not re-order loads and stores we can ignore
 205      the anti-dependence if TBAA can disambiguate both DRs similar to the
 206      case with known negative distance anti-dependences (positive
 207      distance anti-dependences would violate TBAA constraints).  */
 208   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 209        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 210       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 211                                  get_alias_set (DR_REF (drb))))
 212     return false;
 213
 214   /* Unknown data dependence.  */
 215   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 216     {
 217       /* If user asserted safelen consecutive iterations can be
 218          executed concurrently, assume independence.  */
 219       if (loop->safelen >= 2)
 220         {
 221           if (loop->safelen < *max_vf)
 222             *max_vf = loop->safelen;
 223           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 224           return false;
 225         }
 226
 227       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 228           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 229         {
 230           if (dump_enabled_p ())
 231             {
 232               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 233                                "versioning for alias not supported for: "
 234                                "can't determine dependence between ");
 235               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 236                                  DR_REF (dra));
 237               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 238               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 239                                  DR_REF (drb));
 240               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 241             }
 242           return true;
 243         }
 244
 245       if (dump_enabled_p ())
 246         {
 247           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 248                            "versioning for alias required: "
 249                            "can't determine dependence between ");
 250           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 251                              DR_REF (dra));
 252           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 253           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 254                              DR_REF (drb));
 255           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 256         }
 257
 258       /* Add to list of ddrs that need to be tested at run-time.  */
 259       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 260     }
 261
 262   /* Known data dependence.  */
 263   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 264     {
 265       /* If user asserted safelen consecutive iterations can be
 266          executed concurrently, assume independence.  */
 267       if (loop->safelen >= 2)
 268         {
 269           if (loop->safelen < *max_vf)
 270             *max_vf = loop->safelen;
 271           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 272           return false;
 273         }
 274
 275       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 276           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 277         {
 278           if (dump_enabled_p ())
 279             {
 280               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 281                                "versioning for alias not supported for: "
 282                                "bad dist vector for ");
 283               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 284                                  DR_REF (dra));
 285               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 286               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 287                                  DR_REF (drb));
 288               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 289             }
 290           return true;
 291         }
 292
 293       if (dump_enabled_p ())
 294         {
 295           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 296                            "versioning for alias required: "
 297                            "bad dist vector for ");
 298           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 299           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 300           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 301           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302         }
 303       /* Add to list of ddrs that need to be tested at run-time.  */
 304       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 305     }
 306
 307   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 308   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 309     {
 310       int dist = dist_v[loop_depth];
 311
 312       if (dump_enabled_p ())
 313         dump_printf_loc (MSG_NOTE, vect_location,
 314                          "dependence distance  = %d.\n", dist);
 315
 316       if (dist == 0)
 317         {
 318           if (dump_enabled_p ())
 319             {
 320               dump_printf_loc (MSG_NOTE, vect_location,
 321                                "dependence distance == 0 between ");
 322               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 323               dump_printf (MSG_NOTE, " and ");
 324               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 325               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 326             }
 327
 328           /* When we perform grouped accesses and perform implicit CSE
 329              by detecting equal accesses and doing disambiguation with
 330              runtime alias tests like for
 331                 .. = a[i];
 332                 .. = a[i+1];
 333                 a[i] = ..;
 334                 a[i+1] = ..;
 335                 *p = ..;
 336                 .. = a[i];
 337                 .. = a[i+1];
 338              where we will end up loading { a[i], a[i+1] } once, make
 339              sure that inserting group loads before the first load and
 340              stores after the last store will do the right thing.
 341              Similar for groups like
 342                 a[i] = ...;
 343                 ... = a[i];
 344                 a[i+1] = ...;
 345              where loads from the group interleave with the store.  */
 346           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 347               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 348             {
 349               gimple *earlier_stmt;
 350               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 351               if (DR_IS_WRITE
 352                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 353                 {
 354                   if (dump_enabled_p ())
 355                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 356                                      "READ_WRITE dependence in interleaving."
 357                                      "\n");
 358                   return true;
 359                 }
 360             }
 361
 362           continue;
 363         }
 364
 365       if (dist > 0 && DDR_REVERSED_P (ddr))
 366         {
 367           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 368              reversed (to make distance vector positive), and the actual
 369              distance is negative.  */
 370           if (dump_enabled_p ())
 371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 372                              "dependence distance negative.\n");
 373           /* Record a negative dependence distance to later limit the
 374              amount of stmt copying / unrolling we can perform.
 375              Only need to handle read-after-write dependence.  */
 376           if (DR_IS_READ (drb)
 377               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 378                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 379             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 380           continue;
 381         }
 382
 383       if (abs (dist) >= 2
 384           && abs (dist) < *max_vf)
 385         {
 386           /* The dependence distance requires reduction of the maximal
 387              vectorization factor.  */
 388           *max_vf = abs (dist);
 389           if (dump_enabled_p ())
 390             dump_printf_loc (MSG_NOTE, vect_location,
 391                              "adjusting maximal vectorization factor to %i\n",
 392                              *max_vf);
 393         }
 394
 395       if (abs (dist) >= *max_vf)
 396         {
 397           /* Dependence distance does not create dependence, as far as
 398              vectorization is concerned, in this case.  */
 399           if (dump_enabled_p ())
 400             dump_printf_loc (MSG_NOTE, vect_location,
 401                              "dependence distance >= VF.\n");
 402           continue;
 403         }
 404
 405       if (dump_enabled_p ())
 406         {
 407           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                        "not vectorized, possible dependence "
 409                        "between data-refs ");
 410           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 411           dump_printf (MSG_NOTE,  " and ");
 412           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 413           dump_printf (MSG_NOTE,  "\n");
 414         }
 415
 416       return true;
 417     }
 418
 419   return false;
 420 }
 421
 422 /* Function vect_analyze_data_ref_dependences.
 423
 424    Examine all the data references in the loop, and make sure there do not
 425    exist any data dependences between them.  Set *MAX_VF according to
 426    the maximum vectorization factor the data dependences allow.  */
 427
 428 bool
 429 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 430 {
 431   unsigned int i;
 432   struct data_dependence_relation *ddr;
 433
 434   if (dump_enabled_p ())
 435     dump_printf_loc (MSG_NOTE, vect_location,
 436                      "=== vect_analyze_data_ref_dependences ===\n");
 437
 438   LOOP_VINFO_DDRS (loop_vinfo)
 439     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 440              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 441   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 442   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 443   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 444                                 &LOOP_VINFO_DDRS (loop_vinfo),
 445                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 446     return false;
 447
 448   /* For epilogues we either have no aliases or alias versioning
 449      was applied to original loop.  Therefore we may just get max_vf
 450      using VF of original loop.  */
 451   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 452     *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
 453   else
 454     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 455       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 456         return false;
 457
 458   return true;
 459 }
 460
 461
 462 /* Function vect_slp_analyze_data_ref_dependence.
 463
 464    Return TRUE if there (might) exist a dependence between a memory-reference
 465    DRA and a memory-reference DRB.  When versioning for alias may check a
 466    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 467    the data dependence.  */
 468
 469 static bool
 470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 471 {
 472   struct data_reference *dra = DDR_A (ddr);
 473   struct data_reference *drb = DDR_B (ddr);
 474
 475   /* We need to check dependences of statements marked as unvectorizable
 476      as well, they still can prohibit vectorization.  */
 477
 478   /* Independent data accesses.  */
 479   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 480     return false;
 481
 482   if (dra == drb)
 483     return false;
 484
 485   /* Read-read is OK.  */
 486   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 487     return false;
 488
 489   /* If dra and drb are part of the same interleaving chain consider
 490      them independent.  */
 491   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 492       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 493           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 494     return false;
 495
 496   /* Unknown data dependence.  */
 497   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 498     {
 499       if  (dump_enabled_p ())
 500         {
 501           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                            "can't determine dependence between ");
 503           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 504           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 505           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 506           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 507         }
 508     }
 509   else if (dump_enabled_p ())
 510     {
 511       dump_printf_loc (MSG_NOTE, vect_location,
 512                        "determined dependence between ");
 513       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 514       dump_printf (MSG_NOTE, " and ");
 515       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 516       dump_printf (MSG_NOTE,  "\n");
 517     }
 518
 519   return true;
 520 }
 521
 522
 523 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 524    contain the vector of scalar stores of this instance if we are
 525    disambiguating the loads.  */
 526
 527 static bool
 528 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 529                                    vec<gimple *> stores, gimple *last_store)
 530 {
 531   /* This walks over all stmts involved in the SLP load/store done
 532      in NODE verifying we can sink them up to the last stmt in the
 533      group.  */
 534   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 535   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 536     {
 537       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 538       if (access == last_access)
 539         continue;
 540       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 541       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 542            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 543         {
 544           gimple *stmt = gsi_stmt (gsi);
 545           if (! gimple_vuse (stmt)
 546               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 547             continue;
 548
 549           /* If we couldn't record a (single) data reference for this
 550              stmt we have to give up.  */
 551           /* ???  Here and below if dependence analysis fails we can resort
 552              to the alias oracle which can handle more kinds of stmts.  */
 553           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 554           if (!dr_b)
 555             return false;
 556
 557           bool dependent = false;
 558           /* If we run into a store of this same instance (we've just
 559              marked those) then delay dependence checking until we run
 560              into the last store because this is where it will have
 561              been sunk to (and we verify if we can do that as well).  */
 562           if (gimple_visited_p (stmt))
 563             {
 564               if (stmt != last_store)
 565                 continue;
 566               unsigned i;
 567               gimple *store;
 568               FOR_EACH_VEC_ELT (stores, i, store)
 569                 {
 570                   data_reference *store_dr
 571                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 572                   ddr_p ddr = initialize_data_dependence_relation
 573                                 (dr_a, store_dr, vNULL);
 574                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 575                   free_dependence_relation (ddr);
 576                   if (dependent)
 577                     break;
 578                 }
 579             }
 580           else
 581             {
 582               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 583                                                                dr_b, vNULL);
 584               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 585               free_dependence_relation (ddr);
 586             }
 587           if (dependent)
 588             return false;
 589         }
 590     }
 591   return true;
 592 }
 593
 594
 595 /* Function vect_analyze_data_ref_dependences.
 596
 597    Examine all the data references in the basic-block, and make sure there
 598    do not exist any data dependences between them.  Set *MAX_VF according to
 599    the maximum vectorization factor the data dependences allow.  */
 600
 601 bool
 602 vect_slp_analyze_instance_dependence (slp_instance instance)
 603 {
 604   if (dump_enabled_p ())
 605     dump_printf_loc (MSG_NOTE, vect_location,
 606                      "=== vect_slp_analyze_instance_dependence ===\n");
 607
 608   /* The stores of this instance are at the root of the SLP tree.  */
 609   slp_tree store = SLP_INSTANCE_TREE (instance);
 610   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 611     store = NULL;
 612
 613   /* Verify we can sink stores to the vectorized stmt insert location.  */
 614   gimple *last_store = NULL;
 615   if (store)
 616     {
 617       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 618         return false;
 619
 620       /* Mark stores in this instance and remember the last one.  */
 621       last_store = vect_find_last_scalar_stmt_in_slp (store);
 622       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 623         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 624     }
 625
 626   bool res = true;
 627
 628   /* Verify we can sink loads to the vectorized stmt insert location,
 629      special-casing stores of this instance.  */
 630   slp_tree load;
 631   unsigned int i;
 632   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 633     if (! vect_slp_analyze_node_dependences (instance, load,
 634                                              store
 635                                              ? SLP_TREE_SCALAR_STMTS (store)
 636                                              : vNULL, last_store))
 637       {
 638         res = false;
 639         break;
 640       }
 641
 642   /* Unset the visited flag.  */
 643   if (store)
 644     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 645       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 646
 647   return res;
 648 }
 649
 650 /* Function vect_compute_data_ref_alignment
 651
 652    Compute the misalignment of the data reference DR.
 653
 654    Output:
 655    1. If during the misalignment computation it is found that the data reference
 656       cannot be vectorized then false is returned.
 657    2. DR_MISALIGNMENT (DR) is defined.
 658
 659    FOR NOW: No analysis is actually performed. Misalignment is calculated
 660    only for trivial cases. TODO.  */
 661
 662 bool
 663 vect_compute_data_ref_alignment (struct data_reference *dr)
 664 {
 665   gimple *stmt = DR_STMT (dr);
 666   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 667   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 668   struct loop *loop = NULL;
 669   tree ref = DR_REF (dr);
 670   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 671
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "vect_compute_data_ref_alignment:\n");
 675
 676   if (loop_vinfo)
 677     loop = LOOP_VINFO_LOOP (loop_vinfo);
 678
 679   /* Initialize misalignment to unknown.  */
 680   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 681
 682   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 683   bool step_preserves_misalignment_p;
 684
 685   /* No step for BB vectorization.  */
 686   if (!loop)
 687     {
 688       gcc_assert (integer_zerop (drb->step));
 689       step_preserves_misalignment_p = true;
 690     }
 691
 692   /* In case the dataref is in an inner-loop of the loop that is being
 693      vectorized (LOOP), we use the base and misalignment information
 694      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 695      stays the same throughout the execution of the inner-loop, which is why
 696      we have to check that the stride of the dataref in the inner-loop evenly
 697      divides by the vector size.  */
 698   else if (nested_in_vect_loop_p (loop, stmt))
 699     {
 700       step_preserves_misalignment_p
 701         = (DR_STEP_ALIGNMENT (dr)
 702            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 703
 704       if (dump_enabled_p ())
 705         {
 706           if (step_preserves_misalignment_p)
 707             dump_printf_loc (MSG_NOTE, vect_location,
 708                              "inner step divides the vector-size.\n");
 709           else
 710             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 711                              "inner step doesn't divide the vector-size.\n");
 712         }
 713     }
 714
 715   /* Similarly we can only use base and misalignment information relative to
 716      an innermost loop if the misalignment stays the same throughout the
 717      execution of the loop.  As above, this is the case if the stride of
 718      the dataref evenly divides by the vector size.  */
 719   else
 720     {
 721       unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 722       step_preserves_misalignment_p
 723         = ((DR_STEP_ALIGNMENT (dr) * vf)
 724            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 725
 726       if (!step_preserves_misalignment_p && dump_enabled_p ())
 727         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 728                          "step doesn't divide the vector-size.\n");
 729     }
 730
 731   unsigned int base_alignment = drb->base_alignment;
 732   unsigned int base_misalignment = drb->base_misalignment;
 733   unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
 734
 735   if (drb->offset_alignment < vector_alignment
 736       || !step_preserves_misalignment_p
 737       /* We need to know whether the step wrt the vectorized loop is
 738          negative when computing the starting misalignment below.  */
 739       || TREE_CODE (drb->step) != INTEGER_CST)
 740     {
 741       if (dump_enabled_p ())
 742         {
 743           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 744                            "Unknown alignment for access: ");
 745           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 746           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 747         }
 748       return true;
 749     }
 750
 751   if (base_alignment < vector_alignment)
 752     {
 753       tree base = drb->base_address;
 754       if (TREE_CODE (base) == ADDR_EXPR)
 755         base = TREE_OPERAND (base, 0);
 756       if (!vect_can_force_dr_alignment_p (base,
 757                                           vector_alignment * BITS_PER_UNIT))
 758         {
 759           if (dump_enabled_p ())
 760             {
 761               dump_printf_loc (MSG_NOTE, vect_location,
 762                                "can't force alignment of ref: ");
 763               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 764               dump_printf (MSG_NOTE, "\n");
 765             }
 766           return true;
 767         }
 768
 769       if (DECL_USER_ALIGN (base))
 770         {
 771           if (dump_enabled_p ())
 772             {
 773               dump_printf_loc (MSG_NOTE, vect_location,
 774                                "not forcing alignment of user-aligned "
 775                                "variable: ");
 776               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 777               dump_printf (MSG_NOTE, "\n");
 778             }
 779           return true;
 780         }
 781
 782       /* Force the alignment of the decl.
 783          NOTE: This is the only change to the code we make during
 784          the analysis phase, before deciding to vectorize the loop.  */
 785       if (dump_enabled_p ())
 786         {
 787           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 788           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 789           dump_printf (MSG_NOTE, "\n");
 790         }
 791
 792       DR_VECT_AUX (dr)->base_decl = base;
 793       DR_VECT_AUX (dr)->base_misaligned = true;
 794       base_misalignment = 0;
 795     }
 796   unsigned int misalignment = (base_misalignment
 797                                + TREE_INT_CST_LOW (drb->init));
 798
 799   /* If this is a backward running DR then first access in the larger
 800      vectype actually is N-1 elements before the address in the DR.
 801      Adjust misalign accordingly.  */
 802   if (tree_int_cst_sgn (drb->step) < 0)
 803     /* PLUS because STEP is negative.  */
 804     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 805                      * TREE_INT_CST_LOW (drb->step));
 806
 807   SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
 808
 809   if (dump_enabled_p ())
 810     {
 811       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 812                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 813       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 814       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 815     }
 816
 817   return true;
 818 }
 819
 820
 821 /* Function vect_update_misalignment_for_peel.
 822    Sets DR's misalignment
 823    - to 0 if it has the same alignment as DR_PEEL,
 824    - to the misalignment computed using NPEEL if DR's salignment is known,
 825    - to -1 (unknown) otherwise.
 826
 827    DR - the data reference whose misalignment is to be adjusted.
 828    DR_PEEL - the data reference whose misalignment is being made
 829              zero in the vector loop by the peel.
 830    NPEEL - the number of iterations in the peel loop if the misalignment
 831            of DR_PEEL is known at compile time.  */
 832
 833 static void
 834 vect_update_misalignment_for_peel (struct data_reference *dr,
 835                                    struct data_reference *dr_peel, int npeel)
 836 {
 837   unsigned int i;
 838   vec<dr_p> same_aligned_drs;
 839   struct data_reference *current_dr;
 840   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 841   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 842   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 843   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 844
 845  /* For interleaved data accesses the step in the loop must be multiplied by
 846      the size of the interleaving group.  */
 847   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 848     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 849   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 850     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 851
 852   /* It can be assumed that the data refs with the same alignment as dr_peel
 853      are aligned in the vector loop.  */
 854   same_aligned_drs
 855     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 856   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
 857     {
 858       if (current_dr != dr)
 859         continue;
 860       gcc_assert (!known_alignment_for_access_p (dr)
 861                   || !known_alignment_for_access_p (dr_peel)
 862                   || (DR_MISALIGNMENT (dr) / dr_size
 863                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
 864       SET_DR_MISALIGNMENT (dr, 0);
 865       return;
 866     }
 867
 868   if (known_alignment_for_access_p (dr)
 869       && known_alignment_for_access_p (dr_peel))
 870     {
 871       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 872       int misal = DR_MISALIGNMENT (dr);
 873       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 874       misal += negative ? -npeel * dr_size : npeel * dr_size;
 875       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 876       SET_DR_MISALIGNMENT (dr, misal);
 877       return;
 878     }
 879
 880   if (dump_enabled_p ())
 881     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
 882                      "to unknown (-1).\n");
 883   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 884 }
 885
 886
 887 /* Function verify_data_ref_alignment
 888
 889    Return TRUE if DR can be handled with respect to alignment.  */
 890
 891 static bool
 892 verify_data_ref_alignment (data_reference_p dr)
 893 {
 894   enum dr_alignment_support supportable_dr_alignment
 895     = vect_supportable_dr_alignment (dr, false);
 896   if (!supportable_dr_alignment)
 897     {
 898       if (dump_enabled_p ())
 899         {
 900           if (DR_IS_READ (dr))
 901             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 902                              "not vectorized: unsupported unaligned load.");
 903           else
 904             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 905                              "not vectorized: unsupported unaligned "
 906                              "store.");
 907
 908           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 909                              DR_REF (dr));
 910           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 911         }
 912       return false;
 913     }
 914
 915   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 916     dump_printf_loc (MSG_NOTE, vect_location,
 917                      "Vectorizing an unaligned access.\n");
 918
 919   return true;
 920 }
 921
 922 /* Function vect_verify_datarefs_alignment
 923
 924    Return TRUE if all data references in the loop can be
 925    handled with respect to alignment.  */
 926
 927 bool
 928 vect_verify_datarefs_alignment (loop_vec_info vinfo)
 929 {
 930   vec<data_reference_p> datarefs = vinfo->datarefs;
 931   struct data_reference *dr;
 932   unsigned int i;
 933
 934   FOR_EACH_VEC_ELT (datarefs, i, dr)
 935     {
 936       gimple *stmt = DR_STMT (dr);
 937       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 938
 939       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 940         continue;
 941
 942       /* For interleaving, only the alignment of the first access matters.   */
 943       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 944           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 945         continue;
 946
 947       /* Strided accesses perform only component accesses, alignment is
 948          irrelevant for them.  */
 949       if (STMT_VINFO_STRIDED_P (stmt_info)
 950           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 951         continue;
 952
 953       if (! verify_data_ref_alignment (dr))
 954         return false;
 955     }
 956
 957   return true;
 958 }
 959
 960 /* Given an memory reference EXP return whether its alignment is less
 961    than its size.  */
 962
 963 static bool
 964 not_size_aligned (tree exp)
 965 {
 966   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 967     return true;
 968
 969   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 970           > get_object_alignment (exp));
 971 }
 972
 973 /* Function vector_alignment_reachable_p
 974
 975    Return true if vector alignment for DR is reachable by peeling
 976    a few loop iterations.  Return false otherwise.  */
 977
 978 static bool
 979 vector_alignment_reachable_p (struct data_reference *dr)
 980 {
 981   gimple *stmt = DR_STMT (dr);
 982   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 983   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 984
 985   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 986     {
 987       /* For interleaved access we peel only if number of iterations in
 988          the prolog loop ({VF - misalignment}), is a multiple of the
 989          number of the interleaved accesses.  */
 990       int elem_size, mis_in_elements;
 991       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 992
 993       /* FORNOW: handle only known alignment.  */
 994       if (!known_alignment_for_access_p (dr))
 995         return false;
 996
 997       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
 998       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
 999
1000       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1001         return false;
1002     }
1003
1004   /* If misalignment is known at the compile time then allow peeling
1005      only if natural alignment is reachable through peeling.  */
1006   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1007     {
1008       HOST_WIDE_INT elmsize =
1009                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1010       if (dump_enabled_p ())
1011         {
1012           dump_printf_loc (MSG_NOTE, vect_location,
1013                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1014           dump_printf (MSG_NOTE,
1015                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1016         }
1017       if (DR_MISALIGNMENT (dr) % elmsize)
1018         {
1019           if (dump_enabled_p ())
1020             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021                              "data size does not divide the misalignment.\n");
1022           return false;
1023         }
1024     }
1025
1026   if (!known_alignment_for_access_p (dr))
1027     {
1028       tree type = TREE_TYPE (DR_REF (dr));
1029       bool is_packed = not_size_aligned (DR_REF (dr));
1030       if (dump_enabled_p ())
1031         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032                          "Unknown misalignment, %snaturally aligned\n",
1033                          is_packed ? "not " : "");
1034       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1035     }
1036
1037   return true;
1038 }
1039
1040
1041 /* Calculate the cost of the memory access represented by DR.  */
1042
1043 static void
1044 vect_get_data_access_cost (struct data_reference *dr,
1045                            unsigned int *inside_cost,
1046                            unsigned int *outside_cost,
1047                            stmt_vector_for_cost *body_cost_vec)
1048 {
1049   gimple *stmt = DR_STMT (dr);
1050   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1051   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1052   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1053   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1054   int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly  */
1055
1056   if (DR_IS_READ (dr))
1057     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1058                         NULL, body_cost_vec, false);
1059   else
1060     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1061
1062   if (dump_enabled_p ())
1063     dump_printf_loc (MSG_NOTE, vect_location,
1064                      "vect_get_data_access_cost: inside_cost = %d, "
1065                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1066 }
1067
1068
1069 typedef struct _vect_peel_info
1070 {
1071   struct data_reference *dr;
1072   int npeel;
1073   unsigned int count;
1074 } *vect_peel_info;
1075
1076 typedef struct _vect_peel_extended_info
1077 {
1078   struct _vect_peel_info peel_info;
1079   unsigned int inside_cost;
1080   unsigned int outside_cost;
1081   stmt_vector_for_cost body_cost_vec;
1082 } *vect_peel_extended_info;
1083
1084
1085 /* Peeling hashtable helpers.  */
1086
1087 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1088 {
1089   static inline hashval_t hash (const _vect_peel_info *);
1090   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1091 };
1092
1093 inline hashval_t
1094 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1095 {
1096   return (hashval_t) peel_info->npeel;
1097 }
1098
1099 inline bool
1100 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1101 {
1102   return (a->npeel == b->npeel);
1103 }
1104
1105
1106 /* Insert DR into peeling hash table with NPEEL as key.  */
1107
1108 static void
1109 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1110                           loop_vec_info loop_vinfo, struct data_reference *dr,
1111                           int npeel)
1112 {
1113   struct _vect_peel_info elem, *slot;
1114   _vect_peel_info **new_slot;
1115   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1116
1117   elem.npeel = npeel;
1118   slot = peeling_htab->find (&elem);
1119   if (slot)
1120     slot->count++;
1121   else
1122     {
1123       slot = XNEW (struct _vect_peel_info);
1124       slot->npeel = npeel;
1125       slot->dr = dr;
1126       slot->count = 1;
1127       new_slot = peeling_htab->find_slot (slot, INSERT);
1128       *new_slot = slot;
1129     }
1130
1131   if (!supportable_dr_alignment
1132       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1133     slot->count += VECT_MAX_COST;
1134 }
1135
1136
1137 /* Traverse peeling hash table to find peeling option that aligns maximum
1138    number of data accesses.  */
1139
1140 int
1141 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1142                                      _vect_peel_extended_info *max)
1143 {
1144   vect_peel_info elem = *slot;
1145
1146   if (elem->count > max->peel_info.count
1147       || (elem->count == max->peel_info.count
1148           && max->peel_info.npeel > elem->npeel))
1149     {
1150       max->peel_info.npeel = elem->npeel;
1151       max->peel_info.count = elem->count;
1152       max->peel_info.dr = elem->dr;
1153     }
1154
1155   return 1;
1156 }
1157
1158 /* Get the costs of peeling NPEEL iterations checking data access costs
1159    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1160    misalignment will be zero after peeling.  */
1161
1162 static void
1163 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1164                                 unsigned int *inside_cost,
1165                                 unsigned int *outside_cost,
1166                                 stmt_vector_for_cost *body_cost_vec,
1167                                 unsigned int npeel,
1168                                 bool unknown_misalignment)
1169 {
1170   gimple *stmt = DR_STMT (dr0);
1171   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1172   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1173   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1174
1175   unsigned i;
1176   data_reference *dr;
1177
1178   FOR_EACH_VEC_ELT (datarefs, i, dr)
1179     {
1180       stmt = DR_STMT (dr);
1181       stmt_info = vinfo_for_stmt (stmt);
1182       /* For interleaving, only the alignment of the first access
1183          matters.  */
1184       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1185           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1186         continue;
1187
1188       /* Strided accesses perform only component accesses, alignment is
1189          irrelevant for them.  */
1190       if (STMT_VINFO_STRIDED_P (stmt_info)
1191           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1192         continue;
1193
1194       int save_misalignment;
1195       save_misalignment = DR_MISALIGNMENT (dr);
1196       if (unknown_misalignment && dr == dr0)
1197         SET_DR_MISALIGNMENT (dr, 0);
1198       else
1199         vect_update_misalignment_for_peel (dr, dr0, npeel);
1200       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1201                                  body_cost_vec);
1202       SET_DR_MISALIGNMENT (dr, save_misalignment);
1203     }
1204 }
1205
1206 /* Traverse peeling hash table and calculate cost for each peeling option.
1207    Find the one with the lowest cost.  */
1208
1209 int
1210 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1211                                    _vect_peel_extended_info *min)
1212 {
1213   vect_peel_info elem = *slot;
1214   int dummy;
1215   unsigned int inside_cost = 0, outside_cost = 0;
1216   gimple *stmt = DR_STMT (elem->dr);
1217   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1220                        epilogue_cost_vec;
1221
1222   prologue_cost_vec.create (2);
1223   body_cost_vec.create (2);
1224   epilogue_cost_vec.create (2);
1225
1226   vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1227                                   &body_cost_vec, elem->npeel, false);
1228
1229   outside_cost += vect_get_known_peeling_cost
1230     (loop_vinfo, elem->npeel, &dummy,
1231      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1232      &prologue_cost_vec, &epilogue_cost_vec);
1233
1234   /* Prologue and epilogue costs are added to the target model later.
1235      These costs depend only on the scalar iteration cost, the
1236      number of peeling iterations finally chosen, and the number of
1237      misaligned statements.  So discard the information found here.  */
1238   prologue_cost_vec.release ();
1239   epilogue_cost_vec.release ();
1240
1241   if (inside_cost < min->inside_cost
1242       || (inside_cost == min->inside_cost
1243           && outside_cost < min->outside_cost))
1244     {
1245       min->inside_cost = inside_cost;
1246       min->outside_cost = outside_cost;
1247       min->body_cost_vec.release ();
1248       min->body_cost_vec = body_cost_vec;
1249       min->peel_info.dr = elem->dr;
1250       min->peel_info.npeel = elem->npeel;
1251       min->peel_info.count = elem->count;
1252     }
1253   else
1254     body_cost_vec.release ();
1255
1256   return 1;
1257 }
1258
1259
1260 /* Choose best peeling option by traversing peeling hash table and either
1261    choosing an option with the lowest cost (if cost model is enabled) or the
1262    option that aligns as many accesses as possible.  */
1263
1264 static struct _vect_peel_extended_info
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266                                        loop_vec_info loop_vinfo,
1267                                        unsigned int *npeel,
1268                                        stmt_vector_for_cost *body_cost_vec)
1269 {
1270    struct _vect_peel_extended_info res;
1271
1272    res.peel_info.dr = NULL;
1273    res.body_cost_vec = stmt_vector_for_cost ();
1274
1275    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1276      {
1277        res.inside_cost = INT_MAX;
1278        res.outside_cost = INT_MAX;
1279        peeling_htab->traverse <_vect_peel_extended_info *,
1280                                vect_peeling_hash_get_lowest_cost> (&res);
1281      }
1282    else
1283      {
1284        res.peel_info.count = 0;
1285        peeling_htab->traverse <_vect_peel_extended_info *,
1286                                vect_peeling_hash_get_most_frequent> (&res);
1287        res.inside_cost = 0;
1288        res.outside_cost = 0;
1289      }
1290
1291    *npeel = res.peel_info.npeel;
1292    *body_cost_vec = res.body_cost_vec;
1293    return res;
1294 }
1295
1296 /* Return true if the new peeling NPEEL is supported.  */
1297
1298 static bool
1299 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1300                           unsigned npeel)
1301 {
1302   unsigned i;
1303   struct data_reference *dr = NULL;
1304   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1305   gimple *stmt;
1306   stmt_vec_info stmt_info;
1307   enum dr_alignment_support supportable_dr_alignment;
1308
1309   /* Ensure that all data refs can be vectorized after the peel.  */
1310   FOR_EACH_VEC_ELT (datarefs, i, dr)
1311     {
1312       int save_misalignment;
1313
1314       if (dr == dr0)
1315         continue;
1316
1317       stmt = DR_STMT (dr);
1318       stmt_info = vinfo_for_stmt (stmt);
1319       /* For interleaving, only the alignment of the first access
1320          matters.  */
1321       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1322           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1323         continue;
1324
1325       /* Strided accesses perform only component accesses, alignment is
1326          irrelevant for them.  */
1327       if (STMT_VINFO_STRIDED_P (stmt_info)
1328           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1329         continue;
1330
1331       save_misalignment = DR_MISALIGNMENT (dr);
1332       vect_update_misalignment_for_peel (dr, dr0, npeel);
1333       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1334       SET_DR_MISALIGNMENT (dr, save_misalignment);
1335
1336       if (!supportable_dr_alignment)
1337         return false;
1338     }
1339
1340   return true;
1341 }
1342
1343 /* Function vect_enhance_data_refs_alignment
1344
1345    This pass will use loop versioning and loop peeling in order to enhance
1346    the alignment of data references in the loop.
1347
1348    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1349    original loop is to be vectorized.  Any other loops that are created by
1350    the transformations performed in this pass - are not supposed to be
1351    vectorized.  This restriction will be relaxed.
1352
1353    This pass will require a cost model to guide it whether to apply peeling
1354    or versioning or a combination of the two.  For example, the scheme that
1355    intel uses when given a loop with several memory accesses, is as follows:
1356    choose one memory access ('p') which alignment you want to force by doing
1357    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1358    other accesses are not necessarily aligned, or (2) use loop versioning to
1359    generate one loop in which all accesses are aligned, and another loop in
1360    which only 'p' is necessarily aligned.
1361
1362    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1363    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1364    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1365
1366    Devising a cost model is the most critical aspect of this work.  It will
1367    guide us on which access to peel for, whether to use loop versioning, how
1368    many versions to create, etc.  The cost model will probably consist of
1369    generic considerations as well as target specific considerations (on
1370    powerpc for example, misaligned stores are more painful than misaligned
1371    loads).
1372
1373    Here are the general steps involved in alignment enhancements:
1374
1375      -- original loop, before alignment analysis:
1376         for (i=0; i<N; i++){
1377           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1378           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1379         }
1380
1381      -- After vect_compute_data_refs_alignment:
1382         for (i=0; i<N; i++){
1383           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1384           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1385         }
1386
1387      -- Possibility 1: we do loop versioning:
1388      if (p is aligned) {
1389         for (i=0; i<N; i++){    # loop 1A
1390           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1391           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1392         }
1393      }
1394      else {
1395         for (i=0; i<N; i++){    # loop 1B
1396           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1397           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1398         }
1399      }
1400
1401      -- Possibility 2: we do loop peeling:
1402      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1403         x = q[i];
1404         p[i] = y;
1405      }
1406      for (i = 3; i < N; i++){   # loop 2A
1407         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1408         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1409      }
1410
1411      -- Possibility 3: combination of loop peeling and versioning:
1412      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1413         x = q[i];
1414         p[i] = y;
1415      }
1416      if (p is aligned) {
1417         for (i = 3; i<N; i++){  # loop 3A
1418           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1419           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1420         }
1421      }
1422      else {
1423         for (i = 3; i<N; i++){  # loop 3B
1424           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1425           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1426         }
1427      }
1428
1429      These loops are later passed to loop_transform to be vectorized.  The
1430      vectorizer will use the alignment information to guide the transformation
1431      (whether to generate regular loads/stores, or with special handling for
1432      misalignment).  */
1433
1434 bool
1435 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1436 {
1437   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1438   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1439   enum dr_alignment_support supportable_dr_alignment;
1440   struct data_reference *dr0 = NULL, *first_store = NULL;
1441   struct data_reference *dr;
1442   unsigned int i, j;
1443   bool do_peeling = false;
1444   bool do_versioning = false;
1445   bool stat;
1446   gimple *stmt;
1447   stmt_vec_info stmt_info;
1448   unsigned int npeel = 0;
1449   bool one_misalignment_known = false;
1450   bool one_misalignment_unknown = false;
1451   bool one_dr_unsupportable = false;
1452   struct data_reference *unsupportable_dr = NULL;
1453   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1454   unsigned possible_npeel_number = 1;
1455   tree vectype;
1456   unsigned int nelements, mis, same_align_drs_max = 0;
1457   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1458   hash_table<peel_info_hasher> peeling_htab (1);
1459
1460   if (dump_enabled_p ())
1461     dump_printf_loc (MSG_NOTE, vect_location,
1462                      "=== vect_enhance_data_refs_alignment ===\n");
1463
1464   /* Reset data so we can safely be called multiple times.  */
1465   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1466   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1467
1468   /* While cost model enhancements are expected in the future, the high level
1469      view of the code at this time is as follows:
1470
1471      A) If there is a misaligned access then see if peeling to align
1472         this access can make all data references satisfy
1473         vect_supportable_dr_alignment.  If so, update data structures
1474         as needed and return true.
1475
1476      B) If peeling wasn't possible and there is a data reference with an
1477         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1478         then see if loop versioning checks can be used to make all data
1479         references satisfy vect_supportable_dr_alignment.  If so, update
1480         data structures as needed and return true.
1481
1482      C) If neither peeling nor versioning were successful then return false if
1483         any data reference does not satisfy vect_supportable_dr_alignment.
1484
1485      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1486
1487      Note, Possibility 3 above (which is peeling and versioning together) is not
1488      being done at this time.  */
1489
1490   /* (1) Peeling to force alignment.  */
1491
1492   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1493      Considerations:
1494      + How many accesses will become aligned due to the peeling
1495      - How many accesses will become unaligned due to the peeling,
1496        and the cost of misaligned accesses.
1497      - The cost of peeling (the extra runtime checks, the increase
1498        in code size).  */
1499
1500   FOR_EACH_VEC_ELT (datarefs, i, dr)
1501     {
1502       stmt = DR_STMT (dr);
1503       stmt_info = vinfo_for_stmt (stmt);
1504
1505       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1506         continue;
1507
1508       /* For interleaving, only the alignment of the first access
1509          matters.  */
1510       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1511           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1512         continue;
1513
1514       /* For invariant accesses there is nothing to enhance.  */
1515       if (integer_zerop (DR_STEP (dr)))
1516         continue;
1517
1518       /* Strided accesses perform only component accesses, alignment is
1519          irrelevant for them.  */
1520       if (STMT_VINFO_STRIDED_P (stmt_info)
1521           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1522         continue;
1523
1524       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1525       do_peeling = vector_alignment_reachable_p (dr);
1526       if (do_peeling)
1527         {
1528           if (known_alignment_for_access_p (dr))
1529             {
1530               unsigned int npeel_tmp = 0;
1531               bool negative = tree_int_cst_compare (DR_STEP (dr),
1532                                                     size_zero_node) < 0;
1533
1534               vectype = STMT_VINFO_VECTYPE (stmt_info);
1535               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1536               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1537                                                 TREE_TYPE (DR_REF (dr))));
1538               if (DR_MISALIGNMENT (dr) != 0)
1539                 npeel_tmp = (negative ? (mis - nelements)
1540                              : (nelements - mis)) & (nelements - 1);
1541
1542               /* For multiple types, it is possible that the bigger type access
1543                  will have more than one peeling option.  E.g., a loop with two
1544                  types: one of size (vector size / 4), and the other one of
1545                  size (vector size / 8).  Vectorization factor will 8.  If both
1546                  accesses are misaligned by 3, the first one needs one scalar
1547                  iteration to be aligned, and the second one needs 5.  But the
1548                  first one will be aligned also by peeling 5 scalar
1549                  iterations, and in that case both accesses will be aligned.
1550                  Hence, except for the immediate peeling amount, we also want
1551                  to try to add full vector size, while we don't exceed
1552                  vectorization factor.
1553                  We do this automatically for cost model, since we calculate
1554                  cost for every peeling option.  */
1555               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1556                 {
1557                   if (STMT_SLP_TYPE (stmt_info))
1558                     possible_npeel_number
1559                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1560                   else
1561                     possible_npeel_number = vf / nelements;
1562
1563                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1564                      allow peeling NELEMENTS.  */
1565                   if (DR_MISALIGNMENT (dr) == 0)
1566                     possible_npeel_number++;
1567                 }
1568
1569               /* Save info about DR in the hash table.  Also include peeling
1570                  amounts according to the explanation above.  */
1571               for (j = 0; j < possible_npeel_number; j++)
1572                 {
1573                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1574                                             dr, npeel_tmp);
1575                   npeel_tmp += nelements;
1576                 }
1577
1578               one_misalignment_known = true;
1579             }
1580           else
1581             {
1582               /* If we don't know any misalignment values, we prefer
1583                  peeling for data-ref that has the maximum number of data-refs
1584                  with the same alignment, unless the target prefers to align
1585                  stores over load.  */
1586               unsigned same_align_drs
1587                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1588               if (!dr0
1589                   || same_align_drs_max < same_align_drs)
1590                 {
1591                   same_align_drs_max = same_align_drs;
1592                   dr0 = dr;
1593                 }
1594               /* For data-refs with the same number of related
1595                  accesses prefer the one where the misalign
1596                  computation will be invariant in the outermost loop.  */
1597               else if (same_align_drs_max == same_align_drs)
1598                 {
1599                   struct loop *ivloop0, *ivloop;
1600                   ivloop0 = outermost_invariant_loop_for_expr
1601                     (loop, DR_BASE_ADDRESS (dr0));
1602                   ivloop = outermost_invariant_loop_for_expr
1603                     (loop, DR_BASE_ADDRESS (dr));
1604                   if ((ivloop && !ivloop0)
1605                       || (ivloop && ivloop0
1606                           && flow_loop_nested_p (ivloop, ivloop0)))
1607                     dr0 = dr;
1608                 }
1609
1610               one_misalignment_unknown = true;
1611
1612               /* Check for data refs with unsupportable alignment that
1613                  can be peeled.  */
1614               if (!supportable_dr_alignment)
1615               {
1616                 one_dr_unsupportable = true;
1617                 unsupportable_dr = dr;
1618               }
1619
1620               if (!first_store && DR_IS_WRITE (dr))
1621                 first_store = dr;
1622             }
1623         }
1624       else
1625         {
1626           if (!aligned_access_p (dr))
1627             {
1628               if (dump_enabled_p ())
1629                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                                  "vector alignment may not be reachable\n");
1631               break;
1632             }
1633         }
1634     }
1635
1636   /* Check if we can possibly peel the loop.  */
1637   if (!vect_can_advance_ivs_p (loop_vinfo)
1638       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1639       || loop->inner)
1640     do_peeling = false;
1641
1642   struct _vect_peel_extended_info peel_for_known_alignment;
1643   struct _vect_peel_extended_info peel_for_unknown_alignment;
1644   struct _vect_peel_extended_info best_peel;
1645
1646   peel_for_unknown_alignment.inside_cost = INT_MAX;
1647   peel_for_unknown_alignment.outside_cost = INT_MAX;
1648   peel_for_unknown_alignment.peel_info.count = 0;
1649
1650   if (do_peeling
1651       && one_misalignment_unknown)
1652     {
1653       /* Check if the target requires to prefer stores over loads, i.e., if
1654          misaligned stores are more expensive than misaligned loads (taking
1655          drs with same alignment into account).  */
1656       unsigned int load_inside_cost = 0;
1657       unsigned int load_outside_cost = 0;
1658       unsigned int store_inside_cost = 0;
1659       unsigned int store_outside_cost = 0;
1660
1661       stmt_vector_for_cost dummy;
1662       dummy.create (2);
1663       vect_get_peeling_costs_all_drs (dr0,
1664                                       &load_inside_cost,
1665                                       &load_outside_cost,
1666                                       &dummy, vf / 2, true);
1667       dummy.release ();
1668
1669       if (first_store)
1670         {
1671           dummy.create (2);
1672           vect_get_peeling_costs_all_drs (first_store,
1673                                           &store_inside_cost,
1674                                           &store_outside_cost,
1675                                           &dummy, vf / 2, true);
1676           dummy.release ();
1677         }
1678       else
1679         {
1680           store_inside_cost = INT_MAX;
1681           store_outside_cost = INT_MAX;
1682         }
1683
1684       if (load_inside_cost > store_inside_cost
1685           || (load_inside_cost == store_inside_cost
1686               && load_outside_cost > store_outside_cost))
1687         {
1688           dr0 = first_store;
1689           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1690           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1691         }
1692       else
1693         {
1694           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1695           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1696         }
1697
1698       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1699       prologue_cost_vec.create (2);
1700       epilogue_cost_vec.create (2);
1701
1702       int dummy2;
1703       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1704         (loop_vinfo, vf / 2, &dummy2,
1705          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1706          &prologue_cost_vec, &epilogue_cost_vec);
1707
1708       prologue_cost_vec.release ();
1709       epilogue_cost_vec.release ();
1710
1711       peel_for_unknown_alignment.peel_info.count = 1
1712         + STMT_VINFO_SAME_ALIGN_REFS
1713         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1714     }
1715
1716   peel_for_unknown_alignment.peel_info.npeel = 0;
1717   peel_for_unknown_alignment.peel_info.dr = dr0;
1718
1719   best_peel = peel_for_unknown_alignment;
1720
1721   peel_for_known_alignment.inside_cost = INT_MAX;
1722   peel_for_known_alignment.outside_cost = INT_MAX;
1723   peel_for_known_alignment.peel_info.count = 0;
1724   peel_for_known_alignment.peel_info.dr = NULL;
1725
1726   if (do_peeling && one_misalignment_known)
1727     {
1728       /* Peeling is possible, but there is no data access that is not supported
1729          unless aligned.  So we try to choose the best possible peeling from
1730          the hash table.  */
1731       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1732         (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1733     }
1734
1735   /* Compare costs of peeling for known and unknown alignment. */
1736   if (peel_for_known_alignment.peel_info.dr != NULL
1737       && peel_for_unknown_alignment.inside_cost
1738       >= peel_for_known_alignment.inside_cost)
1739     {
1740       best_peel = peel_for_known_alignment;
1741
1742       /* If the best peeling for known alignment has NPEEL == 0, perform no
1743          peeling at all except if there is an unsupportable dr that we can
1744          align.  */
1745       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1746         do_peeling = false;
1747     }
1748
1749   /* If there is an unsupportable data ref, prefer this over all choices so far
1750      since we'd have to discard a chosen peeling except when it accidentally
1751      aligned the unsupportable data ref.  */
1752   if (one_dr_unsupportable)
1753     dr0 = unsupportable_dr;
1754   else if (do_peeling)
1755     {
1756       /* Calculate the penalty for no peeling, i.e. leaving everything
1757          unaligned.
1758          TODO: Adapt vect_get_peeling_costs_all_drs and use here.  */
1759       unsigned nopeel_inside_cost = 0;
1760       unsigned nopeel_outside_cost = 0;
1761
1762       stmt_vector_for_cost dummy;
1763       dummy.create (2);
1764       FOR_EACH_VEC_ELT (datarefs, i, dr)
1765         vect_get_data_access_cost (dr, &nopeel_inside_cost,
1766                                    &nopeel_outside_cost, &dummy);
1767       dummy.release ();
1768
1769       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1770          costs will be recorded.  */
1771       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1772       prologue_cost_vec.create (2);
1773       epilogue_cost_vec.create (2);
1774
1775       int dummy2;
1776       nopeel_outside_cost += vect_get_known_peeling_cost
1777         (loop_vinfo, 0, &dummy2,
1778          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1779          &prologue_cost_vec, &epilogue_cost_vec);
1780
1781       prologue_cost_vec.release ();
1782       epilogue_cost_vec.release ();
1783
1784       npeel = best_peel.peel_info.npeel;
1785       dr0 = best_peel.peel_info.dr;
1786
1787       /* If no peeling is not more expensive than the best peeling we
1788          have so far, don't perform any peeling.  */
1789       if (nopeel_inside_cost <= best_peel.inside_cost)
1790         do_peeling = false;
1791     }
1792
1793   if (do_peeling)
1794     {
1795       stmt = DR_STMT (dr0);
1796       stmt_info = vinfo_for_stmt (stmt);
1797       vectype = STMT_VINFO_VECTYPE (stmt_info);
1798       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1799
1800       if (known_alignment_for_access_p (dr0))
1801         {
1802           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1803                                                 size_zero_node) < 0;
1804           if (!npeel)
1805             {
1806               /* Since it's known at compile time, compute the number of
1807                  iterations in the peeled loop (the peeling factor) for use in
1808                  updating DR_MISALIGNMENT values.  The peeling factor is the
1809                  vectorization factor minus the misalignment as an element
1810                  count.  */
1811               mis = DR_MISALIGNMENT (dr0);
1812               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1813               npeel = ((negative ? mis - nelements : nelements - mis)
1814                        & (nelements - 1));
1815             }
1816
1817           /* For interleaved data access every iteration accesses all the
1818              members of the group, therefore we divide the number of iterations
1819              by the group size.  */
1820           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1821           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822             npeel /= GROUP_SIZE (stmt_info);
1823
1824           if (dump_enabled_p ())
1825             dump_printf_loc (MSG_NOTE, vect_location,
1826                              "Try peeling by %d\n", npeel);
1827         }
1828
1829       /* Ensure that all datarefs can be vectorized after the peel.  */
1830       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1831         do_peeling = false;
1832
1833       /* Check if all datarefs are supportable and log.  */
1834       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1835         {
1836           stat = vect_verify_datarefs_alignment (loop_vinfo);
1837           if (!stat)
1838             do_peeling = false;
1839           else
1840             {
1841               body_cost_vec.release ();
1842               return stat;
1843             }
1844         }
1845
1846       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1847       if (do_peeling)
1848         {
1849           unsigned max_allowed_peel
1850             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1851           if (max_allowed_peel != (unsigned)-1)
1852             {
1853               unsigned max_peel = npeel;
1854               if (max_peel == 0)
1855                 {
1856                   gimple *dr_stmt = DR_STMT (dr0);
1857                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1858                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1859                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1860                 }
1861               if (max_peel > max_allowed_peel)
1862                 {
1863                   do_peeling = false;
1864                   if (dump_enabled_p ())
1865                     dump_printf_loc (MSG_NOTE, vect_location,
1866                         "Disable peeling, max peels reached: %d\n", max_peel);
1867                 }
1868             }
1869         }
1870
1871       /* Cost model #2 - if peeling may result in a remaining loop not
1872          iterating enough to be vectorized then do not peel.  */
1873       if (do_peeling
1874           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1875         {
1876           unsigned max_peel
1877             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1878           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1879               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1880             do_peeling = false;
1881         }
1882
1883       if (do_peeling)
1884         {
1885           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1886              If the misalignment of DR_i is identical to that of dr0 then set
1887              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1888              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1889              by the peeling factor times the element size of DR_i (MOD the
1890              vectorization factor times the size).  Otherwise, the
1891              misalignment of DR_i must be set to unknown.  */
1892           FOR_EACH_VEC_ELT (datarefs, i, dr)
1893             if (dr != dr0)
1894               {
1895                 /* Strided accesses perform only component accesses, alignment
1896                    is irrelevant for them.  */
1897                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1898                 if (STMT_VINFO_STRIDED_P (stmt_info)
1899                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1900                   continue;
1901
1902                 vect_update_misalignment_for_peel (dr, dr0, npeel);
1903               }
1904
1905           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1906           if (npeel)
1907             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1908           else
1909             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1910               = DR_MISALIGNMENT (dr0);
1911           SET_DR_MISALIGNMENT (dr0, 0);
1912           if (dump_enabled_p ())
1913             {
1914               dump_printf_loc (MSG_NOTE, vect_location,
1915                                "Alignment of access forced using peeling.\n");
1916               dump_printf_loc (MSG_NOTE, vect_location,
1917                                "Peeling for alignment will be applied.\n");
1918             }
1919           /* The inside-loop cost will be accounted for in vectorizable_load
1920              and vectorizable_store correctly with adjusted alignments.
1921              Drop the body_cst_vec on the floor here.  */
1922           body_cost_vec.release ();
1923
1924           stat = vect_verify_datarefs_alignment (loop_vinfo);
1925           gcc_assert (stat);
1926           return stat;
1927         }
1928     }
1929
1930   body_cost_vec.release ();
1931
1932   /* (2) Versioning to force alignment.  */
1933
1934   /* Try versioning if:
1935      1) optimize loop for speed
1936      2) there is at least one unsupported misaligned data ref with an unknown
1937         misalignment, and
1938      3) all misaligned data refs with a known misalignment are supported, and
1939      4) the number of runtime alignment checks is within reason.  */
1940
1941   do_versioning =
1942         optimize_loop_nest_for_speed_p (loop)
1943         && (!loop->inner); /* FORNOW */
1944
1945   if (do_versioning)
1946     {
1947       FOR_EACH_VEC_ELT (datarefs, i, dr)
1948         {
1949           stmt = DR_STMT (dr);
1950           stmt_info = vinfo_for_stmt (stmt);
1951
1952           /* For interleaving, only the alignment of the first access
1953              matters.  */
1954           if (aligned_access_p (dr)
1955               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1956                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1957             continue;
1958
1959           if (STMT_VINFO_STRIDED_P (stmt_info))
1960             {
1961               /* Strided loads perform only component accesses, alignment is
1962                  irrelevant for them.  */
1963               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1964                 continue;
1965               do_versioning = false;
1966               break;
1967             }
1968
1969           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1970
1971           if (!supportable_dr_alignment)
1972             {
1973               gimple *stmt;
1974               int mask;
1975               tree vectype;
1976
1977               if (known_alignment_for_access_p (dr)
1978                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1979                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1980                 {
1981                   do_versioning = false;
1982                   break;
1983                 }
1984
1985               stmt = DR_STMT (dr);
1986               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1987               gcc_assert (vectype);
1988
1989               /* The rightmost bits of an aligned address must be zeros.
1990                  Construct the mask needed for this test.  For example,
1991                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1992                  mask must be 15 = 0xf. */
1993               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1994
1995               /* FORNOW: use the same mask to test all potentially unaligned
1996                  references in the loop.  The vectorizer currently supports
1997                  a single vector size, see the reference to
1998                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1999                  vectorization factor is computed.  */
2000               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2001                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2002               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2003               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2004                       DR_STMT (dr));
2005             }
2006         }
2007
2008       /* Versioning requires at least one misaligned data reference.  */
2009       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2010         do_versioning = false;
2011       else if (!do_versioning)
2012         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2013     }
2014
2015   if (do_versioning)
2016     {
2017       vec<gimple *> may_misalign_stmts
2018         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2019       gimple *stmt;
2020
2021       /* It can now be assumed that the data references in the statements
2022          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2023          of the loop being vectorized.  */
2024       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2025         {
2026           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2027           dr = STMT_VINFO_DATA_REF (stmt_info);
2028           SET_DR_MISALIGNMENT (dr, 0);
2029           if (dump_enabled_p ())
2030             dump_printf_loc (MSG_NOTE, vect_location,
2031                              "Alignment of access forced using versioning.\n");
2032         }
2033
2034       if (dump_enabled_p ())
2035         dump_printf_loc (MSG_NOTE, vect_location,
2036                          "Versioning for alignment will be applied.\n");
2037
2038       /* Peeling and versioning can't be done together at this time.  */
2039       gcc_assert (! (do_peeling && do_versioning));
2040
2041       stat = vect_verify_datarefs_alignment (loop_vinfo);
2042       gcc_assert (stat);
2043       return stat;
2044     }
2045
2046   /* This point is reached if neither peeling nor versioning is being done.  */
2047   gcc_assert (! (do_peeling || do_versioning));
2048
2049   stat = vect_verify_datarefs_alignment (loop_vinfo);
2050   return stat;
2051 }
2052
2053
2054 /* Function vect_find_same_alignment_drs.
2055
2056    Update group and alignment relations according to the chosen
2057    vectorization factor.  */
2058
2059 static void
2060 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2061 {
2062   struct data_reference *dra = DDR_A (ddr);
2063   struct data_reference *drb = DDR_B (ddr);
2064   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2065   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2066
2067   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2068     return;
2069
2070   if (dra == drb)
2071     return;
2072
2073   if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2074                         OEP_ADDRESS_OF)
2075       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2076       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2077     return;
2078
2079   /* Two references with distance zero have the same alignment.  */
2080   offset_int diff = (wi::to_offset (DR_INIT (dra))
2081                      - wi::to_offset (DR_INIT (drb)));
2082   if (diff != 0)
2083     {
2084       /* Get the wider of the two alignments.  */
2085       unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2086       unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2087       unsigned int max_align = MAX (align_a, align_b);
2088
2089       /* Require the gap to be a multiple of the larger vector alignment.  */
2090       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2091         return;
2092     }
2093
2094   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2095   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2096   if (dump_enabled_p ())
2097     {
2098       dump_printf_loc (MSG_NOTE, vect_location,
2099                        "accesses have the same alignment: ");
2100       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2101       dump_printf (MSG_NOTE,  " and ");
2102       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2103       dump_printf (MSG_NOTE, "\n");
2104     }
2105 }
2106
2107
2108 /* Function vect_analyze_data_refs_alignment
2109
2110    Analyze the alignment of the data-references in the loop.
2111    Return FALSE if a data reference is found that cannot be vectorized.  */
2112
2113 bool
2114 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2115 {
2116   if (dump_enabled_p ())
2117     dump_printf_loc (MSG_NOTE, vect_location,
2118                      "=== vect_analyze_data_refs_alignment ===\n");
2119
2120   /* Mark groups of data references with same alignment using
2121      data dependence information.  */
2122   vec<ddr_p> ddrs = vinfo->ddrs;
2123   struct data_dependence_relation *ddr;
2124   unsigned int i;
2125
2126   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2127     vect_find_same_alignment_drs (ddr);
2128
2129   vec<data_reference_p> datarefs = vinfo->datarefs;
2130   struct data_reference *dr;
2131
2132   FOR_EACH_VEC_ELT (datarefs, i, dr)
2133     {
2134       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2135       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2136           && !vect_compute_data_ref_alignment (dr))
2137         {
2138           /* Strided accesses perform only component accesses, misalignment
2139              information is irrelevant for them.  */
2140           if (STMT_VINFO_STRIDED_P (stmt_info)
2141               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2142             continue;
2143
2144           if (dump_enabled_p ())
2145             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146                              "not vectorized: can't calculate alignment "
2147                              "for data ref.\n");
2148
2149           return false;
2150         }
2151     }
2152
2153   return true;
2154 }
2155
2156
2157 /* Analyze alignment of DRs of stmts in NODE.  */
2158
2159 static bool
2160 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2161 {
2162   /* We vectorize from the first scalar stmt in the node unless
2163      the node is permuted in which case we start from the first
2164      element in the group.  */
2165   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2166   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2167   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2168     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2169
2170   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2171   if (! vect_compute_data_ref_alignment (dr)
2172       /* For creating the data-ref pointer we need alignment of the
2173          first element anyway.  */
2174       || (dr != first_dr
2175           && ! vect_compute_data_ref_alignment (first_dr))
2176       || ! verify_data_ref_alignment (dr))
2177     {
2178       if (dump_enabled_p ())
2179         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180                          "not vectorized: bad data alignment in basic "
2181                          "block.\n");
2182       return false;
2183     }
2184
2185   return true;
2186 }
2187
2188 /* Function vect_slp_analyze_instance_alignment
2189
2190    Analyze the alignment of the data-references in the SLP instance.
2191    Return FALSE if a data reference is found that cannot be vectorized.  */
2192
2193 bool
2194 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2195 {
2196   if (dump_enabled_p ())
2197     dump_printf_loc (MSG_NOTE, vect_location,
2198                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2199
2200   slp_tree node;
2201   unsigned i;
2202   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2203     if (! vect_slp_analyze_and_verify_node_alignment (node))
2204       return false;
2205
2206   node = SLP_INSTANCE_TREE (instance);
2207   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2208       && ! vect_slp_analyze_and_verify_node_alignment
2209              (SLP_INSTANCE_TREE (instance)))
2210     return false;
2211
2212   return true;
2213 }
2214
2215
2216 /* Analyze groups of accesses: check that DR belongs to a group of
2217    accesses of legal size, step, etc.  Detect gaps, single element
2218    interleaving, and other special cases. Set grouped access info.
2219    Collect groups of strided stores for further use in SLP analysis.
2220    Worker for vect_analyze_group_access.  */
2221
2222 static bool
2223 vect_analyze_group_access_1 (struct data_reference *dr)
2224 {
2225   tree step = DR_STEP (dr);
2226   tree scalar_type = TREE_TYPE (DR_REF (dr));
2227   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2228   gimple *stmt = DR_STMT (dr);
2229   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2230   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2231   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2232   HOST_WIDE_INT dr_step = -1;
2233   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2234   bool slp_impossible = false;
2235
2236   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2237      size of the interleaving group (including gaps).  */
2238   if (tree_fits_shwi_p (step))
2239     {
2240       dr_step = tree_to_shwi (step);
2241       /* Check that STEP is a multiple of type size.  Otherwise there is
2242          a non-element-sized gap at the end of the group which we
2243          cannot represent in GROUP_GAP or GROUP_SIZE.
2244          ???  As we can handle non-constant step fine here we should
2245          simply remove uses of GROUP_GAP between the last and first
2246          element and instead rely on DR_STEP.  GROUP_SIZE then would
2247          simply not include that gap.  */
2248       if ((dr_step % type_size) != 0)
2249         {
2250           if (dump_enabled_p ())
2251             {
2252               dump_printf_loc (MSG_NOTE, vect_location,
2253                                "Step ");
2254               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2255               dump_printf (MSG_NOTE,
2256                            " is not a multiple of the element size for ");
2257               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2258               dump_printf (MSG_NOTE, "\n");
2259             }
2260           return false;
2261         }
2262       groupsize = absu_hwi (dr_step) / type_size;
2263     }
2264   else
2265     groupsize = 0;
2266
2267   /* Not consecutive access is possible only if it is a part of interleaving.  */
2268   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2269     {
2270       /* Check if it this DR is a part of interleaving, and is a single
2271          element of the group that is accessed in the loop.  */
2272
2273       /* Gaps are supported only for loads. STEP must be a multiple of the type
2274          size.  The size of the group must be a power of 2.  */
2275       if (DR_IS_READ (dr)
2276           && (dr_step % type_size) == 0
2277           && groupsize > 0
2278           && pow2p_hwi (groupsize))
2279         {
2280           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2281           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2282           GROUP_GAP (stmt_info) = groupsize - 1;
2283           if (dump_enabled_p ())
2284             {
2285               dump_printf_loc (MSG_NOTE, vect_location,
2286                                "Detected single element interleaving ");
2287               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2288               dump_printf (MSG_NOTE, " step ");
2289               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2290               dump_printf (MSG_NOTE, "\n");
2291             }
2292
2293           return true;
2294         }
2295
2296       if (dump_enabled_p ())
2297         {
2298           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299                            "not consecutive access ");
2300           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2301         }
2302
2303       if (bb_vinfo)
2304         {
2305           /* Mark the statement as unvectorizable.  */
2306           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2307           return true;
2308         }
2309
2310       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2311       STMT_VINFO_STRIDED_P (stmt_info) = true;
2312       return true;
2313     }
2314
2315   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2316     {
2317       /* First stmt in the interleaving chain. Check the chain.  */
2318       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2319       struct data_reference *data_ref = dr;
2320       unsigned int count = 1;
2321       tree prev_init = DR_INIT (data_ref);
2322       gimple *prev = stmt;
2323       HOST_WIDE_INT diff, gaps = 0;
2324
2325       while (next)
2326         {
2327           /* Skip same data-refs.  In case that two or more stmts share
2328              data-ref (supported only for loads), we vectorize only the first
2329              stmt, and the rest get their vectorized loads from the first
2330              one.  */
2331           if (!tree_int_cst_compare (DR_INIT (data_ref),
2332                                      DR_INIT (STMT_VINFO_DATA_REF (
2333                                                    vinfo_for_stmt (next)))))
2334             {
2335               if (DR_IS_WRITE (data_ref))
2336                 {
2337                   if (dump_enabled_p ())
2338                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339                                      "Two store stmts share the same dr.\n");
2340                   return false;
2341                 }
2342
2343               if (dump_enabled_p ())
2344                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345                                  "Two or more load stmts share the same dr.\n");
2346
2347               /* For load use the same data-ref load.  */
2348               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2349
2350               prev = next;
2351               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2352               continue;
2353             }
2354
2355           prev = next;
2356           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2357
2358           /* All group members have the same STEP by construction.  */
2359           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2360
2361           /* Check that the distance between two accesses is equal to the type
2362              size. Otherwise, we have gaps.  */
2363           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2364                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2365           if (diff != 1)
2366             {
2367               /* FORNOW: SLP of accesses with gaps is not supported.  */
2368               slp_impossible = true;
2369               if (DR_IS_WRITE (data_ref))
2370                 {
2371                   if (dump_enabled_p ())
2372                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373                                      "interleaved store with gaps\n");
2374                   return false;
2375                 }
2376
2377               gaps += diff - 1;
2378             }
2379
2380           last_accessed_element += diff;
2381
2382           /* Store the gap from the previous member of the group. If there is no
2383              gap in the access, GROUP_GAP is always 1.  */
2384           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2385
2386           prev_init = DR_INIT (data_ref);
2387           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2388           /* Count the number of data-refs in the chain.  */
2389           count++;
2390         }
2391
2392       if (groupsize == 0)
2393         groupsize = count + gaps;
2394
2395       /* This could be UINT_MAX but as we are generating code in a very
2396          inefficient way we have to cap earlier.  See PR78699 for example.  */
2397       if (groupsize > 4096)
2398         {
2399           if (dump_enabled_p ())
2400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401                              "group is too large\n");
2402           return false;
2403         }
2404
2405       /* Check that the size of the interleaving is equal to count for stores,
2406          i.e., that there are no gaps.  */
2407       if (groupsize != count
2408           && !DR_IS_READ (dr))
2409         {
2410           if (dump_enabled_p ())
2411             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2412                              "interleaved store with gaps\n");
2413           return false;
2414         }
2415
2416       /* If there is a gap after the last load in the group it is the
2417          difference between the groupsize and the last accessed
2418          element.
2419          When there is no gap, this difference should be 0.  */
2420       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2421
2422       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2423       if (dump_enabled_p ())
2424         {
2425           dump_printf_loc (MSG_NOTE, vect_location,
2426                            "Detected interleaving ");
2427           if (DR_IS_READ (dr))
2428             dump_printf (MSG_NOTE, "load ");
2429           else
2430             dump_printf (MSG_NOTE, "store ");
2431           dump_printf (MSG_NOTE, "of size %u starting with ",
2432                        (unsigned)groupsize);
2433           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2434           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2435             dump_printf_loc (MSG_NOTE, vect_location,
2436                              "There is a gap of %u elements after the group\n",
2437                              GROUP_GAP (vinfo_for_stmt (stmt)));
2438         }
2439
2440       /* SLP: create an SLP data structure for every interleaving group of
2441          stores for further analysis in vect_analyse_slp.  */
2442       if (DR_IS_WRITE (dr) && !slp_impossible)
2443         {
2444           if (loop_vinfo)
2445             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2446           if (bb_vinfo)
2447             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2448         }
2449     }
2450
2451   return true;
2452 }
2453
2454 /* Analyze groups of accesses: check that DR belongs to a group of
2455    accesses of legal size, step, etc.  Detect gaps, single element
2456    interleaving, and other special cases. Set grouped access info.
2457    Collect groups of strided stores for further use in SLP analysis.  */
2458
2459 static bool
2460 vect_analyze_group_access (struct data_reference *dr)
2461 {
2462   if (!vect_analyze_group_access_1 (dr))
2463     {
2464       /* Dissolve the group if present.  */
2465       gimple *next;
2466       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2467       while (stmt)
2468         {
2469           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2470           next = GROUP_NEXT_ELEMENT (vinfo);
2471           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2472           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2473           stmt = next;
2474         }
2475       return false;
2476     }
2477   return true;
2478 }
2479
2480 /* Analyze the access pattern of the data-reference DR.
2481    In case of non-consecutive accesses call vect_analyze_group_access() to
2482    analyze groups of accesses.  */
2483
2484 static bool
2485 vect_analyze_data_ref_access (struct data_reference *dr)
2486 {
2487   tree step = DR_STEP (dr);
2488   tree scalar_type = TREE_TYPE (DR_REF (dr));
2489   gimple *stmt = DR_STMT (dr);
2490   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2491   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2492   struct loop *loop = NULL;
2493
2494   if (loop_vinfo)
2495     loop = LOOP_VINFO_LOOP (loop_vinfo);
2496
2497   if (loop_vinfo && !step)
2498     {
2499       if (dump_enabled_p ())
2500         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2501                          "bad data-ref access in loop\n");
2502       return false;
2503     }
2504
2505   /* Allow loads with zero step in inner-loop vectorization.  */
2506   if (loop_vinfo && integer_zerop (step))
2507     {
2508       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2509       if (!nested_in_vect_loop_p (loop, stmt))
2510         return DR_IS_READ (dr);
2511       /* Allow references with zero step for outer loops marked
2512          with pragma omp simd only - it guarantees absence of
2513          loop-carried dependencies between inner loop iterations.  */
2514       if (!loop->force_vectorize)
2515         {
2516           if (dump_enabled_p ())
2517             dump_printf_loc (MSG_NOTE, vect_location,
2518                              "zero step in inner loop of nest\n");
2519           return false;
2520         }
2521     }
2522
2523   if (loop && nested_in_vect_loop_p (loop, stmt))
2524     {
2525       /* Interleaved accesses are not yet supported within outer-loop
2526         vectorization for references in the inner-loop.  */
2527       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2528
2529       /* For the rest of the analysis we use the outer-loop step.  */
2530       step = STMT_VINFO_DR_STEP (stmt_info);
2531       if (integer_zerop (step))
2532         {
2533           if (dump_enabled_p ())
2534             dump_printf_loc (MSG_NOTE, vect_location,
2535                              "zero step in outer loop.\n");
2536           return DR_IS_READ (dr);
2537         }
2538     }
2539
2540   /* Consecutive?  */
2541   if (TREE_CODE (step) == INTEGER_CST)
2542     {
2543       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2544       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2545           || (dr_step < 0
2546               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2547         {
2548           /* Mark that it is not interleaving.  */
2549           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2550           return true;
2551         }
2552     }
2553
2554   if (loop && nested_in_vect_loop_p (loop, stmt))
2555     {
2556       if (dump_enabled_p ())
2557         dump_printf_loc (MSG_NOTE, vect_location,
2558                          "grouped access in outer loop.\n");
2559       return false;
2560     }
2561
2562
2563   /* Assume this is a DR handled by non-constant strided load case.  */
2564   if (TREE_CODE (step) != INTEGER_CST)
2565     return (STMT_VINFO_STRIDED_P (stmt_info)
2566             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2567                 || vect_analyze_group_access (dr)));
2568
2569   /* Not consecutive access - check if it's a part of interleaving group.  */
2570   return vect_analyze_group_access (dr);
2571 }
2572
2573 /* Compare two data-references DRA and DRB to group them into chunks
2574    suitable for grouping.  */
2575
2576 static int
2577 dr_group_sort_cmp (const void *dra_, const void *drb_)
2578 {
2579   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2580   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2581   int cmp;
2582
2583   /* Stabilize sort.  */
2584   if (dra == drb)
2585     return 0;
2586
2587   /* DRs in different loops never belong to the same group.  */
2588   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2589   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2590   if (loopa != loopb)
2591     return loopa->num < loopb->num ? -1 : 1;
2592
2593   /* Ordering of DRs according to base.  */
2594   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2595     {
2596       cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2597                                    DR_BASE_ADDRESS (drb));
2598       if (cmp != 0)
2599         return cmp;
2600     }
2601
2602   /* And according to DR_OFFSET.  */
2603   if (!dr_equal_offsets_p (dra, drb))
2604     {
2605       cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2606       if (cmp != 0)
2607         return cmp;
2608     }
2609
2610   /* Put reads before writes.  */
2611   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2612     return DR_IS_READ (dra) ? -1 : 1;
2613
2614   /* Then sort after access size.  */
2615   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2616                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2617     {
2618       cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2619                                    TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2620       if (cmp != 0)
2621         return cmp;
2622     }
2623
2624   /* And after step.  */
2625   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2626     {
2627       cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2628       if (cmp != 0)
2629         return cmp;
2630     }
2631
2632   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2633   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2634   if (cmp == 0)
2635     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2636   return cmp;
2637 }
2638
2639 /* Function vect_analyze_data_ref_accesses.
2640
2641    Analyze the access pattern of all the data references in the loop.
2642
2643    FORNOW: the only access pattern that is considered vectorizable is a
2644            simple step 1 (consecutive) access.
2645
2646    FORNOW: handle only arrays and pointer accesses.  */
2647
2648 bool
2649 vect_analyze_data_ref_accesses (vec_info *vinfo)
2650 {
2651   unsigned int i;
2652   vec<data_reference_p> datarefs = vinfo->datarefs;
2653   struct data_reference *dr;
2654
2655   if (dump_enabled_p ())
2656     dump_printf_loc (MSG_NOTE, vect_location,
2657                      "=== vect_analyze_data_ref_accesses ===\n");
2658
2659   if (datarefs.is_empty ())
2660     return true;
2661
2662   /* Sort the array of datarefs to make building the interleaving chains
2663      linear.  Don't modify the original vector's order, it is needed for
2664      determining what dependencies are reversed.  */
2665   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2666   datarefs_copy.qsort (dr_group_sort_cmp);
2667
2668   /* Build the interleaving chains.  */
2669   for (i = 0; i < datarefs_copy.length () - 1;)
2670     {
2671       data_reference_p dra = datarefs_copy[i];
2672       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2673       stmt_vec_info lastinfo = NULL;
2674       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2675         {
2676           ++i;
2677           continue;
2678         }
2679       for (i = i + 1; i < datarefs_copy.length (); ++i)
2680         {
2681           data_reference_p drb = datarefs_copy[i];
2682           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2683           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2684             break;
2685
2686           /* ???  Imperfect sorting (non-compatible types, non-modulo
2687              accesses, same accesses) can lead to a group to be artificially
2688              split here as we don't just skip over those.  If it really
2689              matters we can push those to a worklist and re-iterate
2690              over them.  The we can just skip ahead to the next DR here.  */
2691
2692           /* DRs in a different loop should not be put into the same
2693              interleaving group.  */
2694           if (gimple_bb (DR_STMT (dra))->loop_father
2695               != gimple_bb (DR_STMT (drb))->loop_father)
2696             break;
2697
2698           /* Check that the data-refs have same first location (except init)
2699              and they are both either store or load (not load and store,
2700              not masked loads or stores).  */
2701           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2702               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2703                                    DR_BASE_ADDRESS (drb), 0)
2704               || !dr_equal_offsets_p (dra, drb)
2705               || !gimple_assign_single_p (DR_STMT (dra))
2706               || !gimple_assign_single_p (DR_STMT (drb)))
2707             break;
2708
2709           /* Check that the data-refs have the same constant size.  */
2710           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2711           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2712           if (!tree_fits_uhwi_p (sza)
2713               || !tree_fits_uhwi_p (szb)
2714               || !tree_int_cst_equal (sza, szb))
2715             break;
2716
2717           /* Check that the data-refs have the same step.  */
2718           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2719             break;
2720
2721           /* Do not place the same access in the interleaving chain twice.  */
2722           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2723             break;
2724
2725           /* Check the types are compatible.
2726              ???  We don't distinguish this during sorting.  */
2727           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2728                                    TREE_TYPE (DR_REF (drb))))
2729             break;
2730
2731           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2732           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2733           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2734           gcc_assert (init_a <= init_b);
2735
2736           /* If init_b == init_a + the size of the type * k, we have an
2737              interleaving, and DRA is accessed before DRB.  */
2738           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2739           if (type_size_a == 0
2740               || (init_b - init_a) % type_size_a != 0)
2741             break;
2742
2743           /* If we have a store, the accesses are adjacent.  This splits
2744              groups into chunks we support (we don't support vectorization
2745              of stores with gaps).  */
2746           if (!DR_IS_READ (dra)
2747               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2748                                              (DR_INIT (datarefs_copy[i-1]))
2749                   != type_size_a))
2750             break;
2751
2752           /* If the step (if not zero or non-constant) is greater than the
2753              difference between data-refs' inits this splits groups into
2754              suitable sizes.  */
2755           if (tree_fits_shwi_p (DR_STEP (dra)))
2756             {
2757               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2758               if (step != 0 && step <= (init_b - init_a))
2759                 break;
2760             }
2761
2762           if (dump_enabled_p ())
2763             {
2764               dump_printf_loc (MSG_NOTE, vect_location,
2765                                "Detected interleaving ");
2766               if (DR_IS_READ (dra))
2767                 dump_printf (MSG_NOTE, "load ");
2768               else
2769                 dump_printf (MSG_NOTE, "store ");
2770               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2771               dump_printf (MSG_NOTE,  " and ");
2772               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2773               dump_printf (MSG_NOTE, "\n");
2774             }
2775
2776           /* Link the found element into the group list.  */
2777           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2778             {
2779               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2780               lastinfo = stmtinfo_a;
2781             }
2782           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2783           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2784           lastinfo = stmtinfo_b;
2785         }
2786     }
2787
2788   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2789     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2790         && !vect_analyze_data_ref_access (dr))
2791       {
2792         if (dump_enabled_p ())
2793           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2794                            "not vectorized: complicated access pattern.\n");
2795
2796         if (is_a <bb_vec_info> (vinfo))
2797           {
2798             /* Mark the statement as not vectorizable.  */
2799             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2800             continue;
2801           }
2802         else
2803           {
2804             datarefs_copy.release ();
2805             return false;
2806           }
2807       }
2808
2809   datarefs_copy.release ();
2810   return true;
2811 }
2812
2813 /* Function vect_vfa_segment_size.
2814
2815    Create an expression that computes the size of segment
2816    that will be accessed for a data reference.  The functions takes into
2817    account that realignment loads may access one more vector.
2818
2819    Input:
2820      DR: The data reference.
2821      LENGTH_FACTOR: segment length to consider.
2822
2823    Return an expression whose value is the size of segment which will be
2824    accessed by DR.  */
2825
2826 static tree
2827 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2828 {
2829   tree segment_length;
2830
2831   if (integer_zerop (DR_STEP (dr)))
2832     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2833   else
2834     segment_length = size_binop (MULT_EXPR,
2835                                  fold_convert (sizetype, DR_STEP (dr)),
2836                                  fold_convert (sizetype, length_factor));
2837
2838   if (vect_supportable_dr_alignment (dr, false)
2839         == dr_explicit_realign_optimized)
2840     {
2841       tree vector_size = TYPE_SIZE_UNIT
2842                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2843
2844       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2845     }
2846   return segment_length;
2847 }
2848
2849 /* Function vect_no_alias_p.
2850
2851    Given data references A and B with equal base and offset, the alias
2852    relation can be decided at compilation time, return TRUE if they do
2853    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2854    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2855    respectively.  */
2856
2857 static bool
2858 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2859                  tree segment_length_a, tree segment_length_b)
2860 {
2861   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2862               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2863   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2864     return false;
2865
2866   tree seg_a_min = DR_INIT (a);
2867   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2868                                 seg_a_min, segment_length_a);
2869   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2870      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2871      [a, a+12) */
2872   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2873     {
2874       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2875       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2876                                seg_a_max, unit_size);
2877       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2878                                DR_INIT (a), unit_size);
2879     }
2880   tree seg_b_min = DR_INIT (b);
2881   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2882                                 seg_b_min, segment_length_b);
2883   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2884     {
2885       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2886       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2887                                seg_b_max, unit_size);
2888       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2889                                DR_INIT (b), unit_size);
2890     }
2891
2892   if (tree_int_cst_le (seg_a_max, seg_b_min)
2893       || tree_int_cst_le (seg_b_max, seg_a_min))
2894     return true;
2895
2896   return false;
2897 }
2898
2899 /* Function vect_prune_runtime_alias_test_list.
2900
2901    Prune a list of ddrs to be tested at run-time by versioning for alias.
2902    Merge several alias checks into one if possible.
2903    Return FALSE if resulting list of ddrs is longer then allowed by
2904    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2905
2906 bool
2907 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2908 {
2909   vec<ddr_p> may_alias_ddrs =
2910     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2911   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2912     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2913   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2914   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2915
2916   ddr_p ddr;
2917   unsigned int i;
2918   tree length_factor;
2919
2920   if (dump_enabled_p ())
2921     dump_printf_loc (MSG_NOTE, vect_location,
2922                      "=== vect_prune_runtime_alias_test_list ===\n");
2923
2924   if (may_alias_ddrs.is_empty ())
2925     return true;
2926
2927   comp_alias_ddrs.create (may_alias_ddrs.length ());
2928
2929   /* First, we collect all data ref pairs for aliasing checks.  */
2930   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2931     {
2932       int comp_res;
2933       struct data_reference *dr_a, *dr_b;
2934       gimple *dr_group_first_a, *dr_group_first_b;
2935       tree segment_length_a, segment_length_b;
2936       gimple *stmt_a, *stmt_b;
2937
2938       dr_a = DDR_A (ddr);
2939       stmt_a = DR_STMT (DDR_A (ddr));
2940       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2941       if (dr_group_first_a)
2942         {
2943           stmt_a = dr_group_first_a;
2944           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2945         }
2946
2947       dr_b = DDR_B (ddr);
2948       stmt_b = DR_STMT (DDR_B (ddr));
2949       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2950       if (dr_group_first_b)
2951         {
2952           stmt_b = dr_group_first_b;
2953           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2954         }
2955
2956       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2957         length_factor = scalar_loop_iters;
2958       else
2959         length_factor = size_int (vect_factor);
2960       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2961       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2962
2963       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2964                                         DR_BASE_ADDRESS (dr_b));
2965       if (comp_res == 0)
2966         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2967                                           DR_OFFSET (dr_b));
2968
2969       /* Alias is known at compilation time.  */
2970       if (comp_res == 0
2971           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2972           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2973           && TREE_CODE (segment_length_a) == INTEGER_CST
2974           && TREE_CODE (segment_length_b) == INTEGER_CST)
2975         {
2976           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2977             continue;
2978
2979           if (dump_enabled_p ())
2980             dump_printf_loc (MSG_NOTE, vect_location,
2981                              "not vectorized: compilation time alias.\n");
2982
2983           return false;
2984         }
2985
2986       dr_with_seg_len_pair_t dr_with_seg_len_pair
2987           (dr_with_seg_len (dr_a, segment_length_a),
2988            dr_with_seg_len (dr_b, segment_length_b));
2989
2990       /* Canonicalize pairs by sorting the two DR members.  */
2991       if (comp_res > 0)
2992         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2993
2994       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2995     }
2996
2997   prune_runtime_alias_test_list (&comp_alias_ddrs,
2998                                  (unsigned HOST_WIDE_INT) vect_factor);
2999   dump_printf_loc (MSG_NOTE, vect_location,
3000                    "improved number of alias checks from %d to %d\n",
3001                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
3002   if ((int) comp_alias_ddrs.length () >
3003       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3004     {
3005       if (dump_enabled_p ())
3006         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3007                          "number of versioning for alias "
3008                          "run-time tests exceeds %d "
3009                          "(--param vect-max-version-for-alias-checks)\n",
3010                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3011       return false;
3012     }
3013
3014   /* All alias checks have been resolved at compilation time.  */
3015   if (!comp_alias_ddrs.length ())
3016     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3017
3018   return true;
3019 }
3020
3021 /* Return true if a non-affine read or write in STMT is suitable for a
3022    gather load or scatter store.  Describe the operation in *INFO if so.  */
3023
3024 bool
3025 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3026                            gather_scatter_info *info)
3027 {
3028   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3029   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3030   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3031   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3032   tree offtype = NULL_TREE;
3033   tree decl, base, off;
3034   machine_mode pmode;
3035   int punsignedp, reversep, pvolatilep = 0;
3036
3037   base = DR_REF (dr);
3038   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3039      see if we can use the def stmt of the address.  */
3040   if (is_gimple_call (stmt)
3041       && gimple_call_internal_p (stmt)
3042       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3043           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3044       && TREE_CODE (base) == MEM_REF
3045       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3046       && integer_zerop (TREE_OPERAND (base, 1))
3047       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3048     {
3049       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3050       if (is_gimple_assign (def_stmt)
3051           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3052         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3053     }
3054
3055   /* The gather and scatter builtins need address of the form
3056      loop_invariant + vector * {1, 2, 4, 8}
3057      or
3058      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3059      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3060      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3061      multiplications and additions in it.  To get a vector, we need
3062      a single SSA_NAME that will be defined in the loop and will
3063      contain everything that is not loop invariant and that can be
3064      vectorized.  The following code attempts to find such a preexistng
3065      SSA_NAME OFF and put the loop invariants into a tree BASE
3066      that can be gimplified before the loop.  */
3067   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3068                               &punsignedp, &reversep, &pvolatilep);
3069   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3070
3071   if (TREE_CODE (base) == MEM_REF)
3072     {
3073       if (!integer_zerop (TREE_OPERAND (base, 1)))
3074         {
3075           if (off == NULL_TREE)
3076             {
3077               offset_int moff = mem_ref_offset (base);
3078               off = wide_int_to_tree (sizetype, moff);
3079             }
3080           else
3081             off = size_binop (PLUS_EXPR, off,
3082                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3083         }
3084       base = TREE_OPERAND (base, 0);
3085     }
3086   else
3087     base = build_fold_addr_expr (base);
3088
3089   if (off == NULL_TREE)
3090     off = size_zero_node;
3091
3092   /* If base is not loop invariant, either off is 0, then we start with just
3093      the constant offset in the loop invariant BASE and continue with base
3094      as OFF, otherwise give up.
3095      We could handle that case by gimplifying the addition of base + off
3096      into some SSA_NAME and use that as off, but for now punt.  */
3097   if (!expr_invariant_in_loop_p (loop, base))
3098     {
3099       if (!integer_zerop (off))
3100         return false;
3101       off = base;
3102       base = size_int (pbitpos / BITS_PER_UNIT);
3103     }
3104   /* Otherwise put base + constant offset into the loop invariant BASE
3105      and continue with OFF.  */
3106   else
3107     {
3108       base = fold_convert (sizetype, base);
3109       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3110     }
3111
3112   /* OFF at this point may be either a SSA_NAME or some tree expression
3113      from get_inner_reference.  Try to peel off loop invariants from it
3114      into BASE as long as possible.  */
3115   STRIP_NOPS (off);
3116   while (offtype == NULL_TREE)
3117     {
3118       enum tree_code code;
3119       tree op0, op1, add = NULL_TREE;
3120
3121       if (TREE_CODE (off) == SSA_NAME)
3122         {
3123           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3124
3125           if (expr_invariant_in_loop_p (loop, off))
3126             return false;
3127
3128           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3129             break;
3130
3131           op0 = gimple_assign_rhs1 (def_stmt);
3132           code = gimple_assign_rhs_code (def_stmt);
3133           op1 = gimple_assign_rhs2 (def_stmt);
3134         }
3135       else
3136         {
3137           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3138             return false;
3139           code = TREE_CODE (off);
3140           extract_ops_from_tree (off, &code, &op0, &op1);
3141         }
3142       switch (code)
3143         {
3144         case POINTER_PLUS_EXPR:
3145         case PLUS_EXPR:
3146           if (expr_invariant_in_loop_p (loop, op0))
3147             {
3148               add = op0;
3149               off = op1;
3150             do_add:
3151               add = fold_convert (sizetype, add);
3152               if (scale != 1)
3153                 add = size_binop (MULT_EXPR, add, size_int (scale));
3154               base = size_binop (PLUS_EXPR, base, add);
3155               continue;
3156             }
3157           if (expr_invariant_in_loop_p (loop, op1))
3158             {
3159               add = op1;
3160               off = op0;
3161               goto do_add;
3162             }
3163           break;
3164         case MINUS_EXPR:
3165           if (expr_invariant_in_loop_p (loop, op1))
3166             {
3167               add = fold_convert (sizetype, op1);
3168               add = size_binop (MINUS_EXPR, size_zero_node, add);
3169               off = op0;
3170               goto do_add;
3171             }
3172           break;
3173         case MULT_EXPR:
3174           if (scale == 1 && tree_fits_shwi_p (op1))
3175             {
3176               scale = tree_to_shwi (op1);
3177               off = op0;
3178               continue;
3179             }
3180           break;
3181         case SSA_NAME:
3182           off = op0;
3183           continue;
3184         CASE_CONVERT:
3185           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3186               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3187             break;
3188           if (TYPE_PRECISION (TREE_TYPE (op0))
3189               == TYPE_PRECISION (TREE_TYPE (off)))
3190             {
3191               off = op0;
3192               continue;
3193             }
3194           if (TYPE_PRECISION (TREE_TYPE (op0))
3195               < TYPE_PRECISION (TREE_TYPE (off)))
3196             {
3197               off = op0;
3198               offtype = TREE_TYPE (off);
3199               STRIP_NOPS (off);
3200               continue;
3201             }
3202           break;
3203         default:
3204           break;
3205         }
3206       break;
3207     }
3208
3209   /* If at the end OFF still isn't a SSA_NAME or isn't
3210      defined in the loop, punt.  */
3211   if (TREE_CODE (off) != SSA_NAME
3212       || expr_invariant_in_loop_p (loop, off))
3213     return false;
3214
3215   if (offtype == NULL_TREE)
3216     offtype = TREE_TYPE (off);
3217
3218   if (DR_IS_READ (dr))
3219     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3220                                              offtype, scale);
3221   else
3222     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3223                                               offtype, scale);
3224
3225   if (decl == NULL_TREE)
3226     return false;
3227
3228   info->decl = decl;
3229   info->base = base;
3230   info->offset = off;
3231   info->offset_dt = vect_unknown_def_type;
3232   info->offset_vectype = NULL_TREE;
3233   info->scale = scale;
3234   return true;
3235 }
3236
3237 /* Function vect_analyze_data_refs.
3238
3239   Find all the data references in the loop or basic block.
3240
3241    The general structure of the analysis of data refs in the vectorizer is as
3242    follows:
3243    1- vect_analyze_data_refs(loop/bb): call
3244       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3245       in the loop/bb and their dependences.
3246    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3247    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3248    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3249
3250 */
3251
3252 bool
3253 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3254 {
3255   struct loop *loop = NULL;
3256   unsigned int i;
3257   struct data_reference *dr;
3258   tree scalar_type;
3259
3260   if (dump_enabled_p ())
3261     dump_printf_loc (MSG_NOTE, vect_location,
3262                      "=== vect_analyze_data_refs ===\n");
3263
3264   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3265     loop = LOOP_VINFO_LOOP (loop_vinfo);
3266
3267   /* Go through the data-refs, check that the analysis succeeded.  Update
3268      pointer from stmt_vec_info struct to DR and vectype.  */
3269
3270   vec<data_reference_p> datarefs = vinfo->datarefs;
3271   FOR_EACH_VEC_ELT (datarefs, i, dr)
3272     {
3273       gimple *stmt;
3274       stmt_vec_info stmt_info;
3275       tree base, offset, init;
3276       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3277       bool simd_lane_access = false;
3278       int vf;
3279
3280 again:
3281       if (!dr || !DR_REF (dr))
3282         {
3283           if (dump_enabled_p ())
3284             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285                              "not vectorized: unhandled data-ref\n");
3286           return false;
3287         }
3288
3289       stmt = DR_STMT (dr);
3290       stmt_info = vinfo_for_stmt (stmt);
3291
3292       /* Discard clobbers from the dataref vector.  We will remove
3293          clobber stmts during vectorization.  */
3294       if (gimple_clobber_p (stmt))
3295         {
3296           free_data_ref (dr);
3297           if (i == datarefs.length () - 1)
3298             {
3299               datarefs.pop ();
3300               break;
3301             }
3302           datarefs.ordered_remove (i);
3303           dr = datarefs[i];
3304           goto again;
3305         }
3306
3307       /* Check that analysis of the data-ref succeeded.  */
3308       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3309           || !DR_STEP (dr))
3310         {
3311           bool maybe_gather
3312             = DR_IS_READ (dr)
3313               && !TREE_THIS_VOLATILE (DR_REF (dr))
3314               && targetm.vectorize.builtin_gather != NULL;
3315           bool maybe_scatter
3316             = DR_IS_WRITE (dr)
3317               && !TREE_THIS_VOLATILE (DR_REF (dr))
3318               && targetm.vectorize.builtin_scatter != NULL;
3319           bool maybe_simd_lane_access
3320             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3321
3322           /* If target supports vector gather loads or scatter stores, or if
3323              this might be a SIMD lane access, see if they can't be used.  */
3324           if (is_a <loop_vec_info> (vinfo)
3325               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3326               && !nested_in_vect_loop_p (loop, stmt))
3327             {
3328               struct data_reference *newdr
3329                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3330                                    DR_REF (dr), stmt, maybe_scatter ? false : true);
3331               gcc_assert (newdr != NULL && DR_REF (newdr));
3332               if (DR_BASE_ADDRESS (newdr)
3333                   && DR_OFFSET (newdr)
3334                   && DR_INIT (newdr)
3335                   && DR_STEP (newdr)
3336                   && integer_zerop (DR_STEP (newdr)))
3337                 {
3338                   if (maybe_simd_lane_access)
3339                     {
3340                       tree off = DR_OFFSET (newdr);
3341                       STRIP_NOPS (off);
3342                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3343                           && TREE_CODE (off) == MULT_EXPR
3344                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3345                         {
3346                           tree step = TREE_OPERAND (off, 1);
3347                           off = TREE_OPERAND (off, 0);
3348                           STRIP_NOPS (off);
3349                           if (CONVERT_EXPR_P (off)
3350                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3351                                                                           0)))
3352                                  < TYPE_PRECISION (TREE_TYPE (off)))
3353                             off = TREE_OPERAND (off, 0);
3354                           if (TREE_CODE (off) == SSA_NAME)
3355                             {
3356                               gimple *def = SSA_NAME_DEF_STMT (off);
3357                               tree reft = TREE_TYPE (DR_REF (newdr));
3358                               if (is_gimple_call (def)
3359                                   && gimple_call_internal_p (def)
3360                                   && (gimple_call_internal_fn (def)
3361                                       == IFN_GOMP_SIMD_LANE))
3362                                 {
3363                                   tree arg = gimple_call_arg (def, 0);
3364                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3365                                   arg = SSA_NAME_VAR (arg);
3366                                   if (arg == loop->simduid
3367                                       /* For now.  */
3368                                       && tree_int_cst_equal
3369                                            (TYPE_SIZE_UNIT (reft),
3370                                             step))
3371                                     {
3372                                       DR_OFFSET (newdr) = ssize_int (0);
3373                                       DR_STEP (newdr) = step;
3374                                       DR_OFFSET_ALIGNMENT (newdr)
3375                                         = BIGGEST_ALIGNMENT;
3376                                       DR_STEP_ALIGNMENT (newdr)
3377                                         = highest_pow2_factor (step);
3378                                       dr = newdr;
3379                                       simd_lane_access = true;
3380                                     }
3381                                 }
3382                             }
3383                         }
3384                     }
3385                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3386                     {
3387                       dr = newdr;
3388                       if (maybe_gather)
3389                         gatherscatter = GATHER;
3390                       else
3391                         gatherscatter = SCATTER;
3392                     }
3393                 }
3394               if (gatherscatter == SG_NONE && !simd_lane_access)
3395                 free_data_ref (newdr);
3396             }
3397
3398           if (gatherscatter == SG_NONE && !simd_lane_access)
3399             {
3400               if (dump_enabled_p ())
3401                 {
3402                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3403                                    "not vectorized: data ref analysis "
3404                                    "failed ");
3405                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3406                 }
3407
3408               if (is_a <bb_vec_info> (vinfo))
3409                 break;
3410
3411               return false;
3412             }
3413         }
3414
3415       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3416         {
3417           if (dump_enabled_p ())
3418             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3419                              "not vectorized: base addr of dr is a "
3420                              "constant\n");
3421
3422           if (is_a <bb_vec_info> (vinfo))
3423             break;
3424
3425           if (gatherscatter != SG_NONE || simd_lane_access)
3426             free_data_ref (dr);
3427           return false;
3428         }
3429
3430       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3431         {
3432           if (dump_enabled_p ())
3433             {
3434               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3435                                "not vectorized: volatile type ");
3436               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437             }
3438
3439           if (is_a <bb_vec_info> (vinfo))
3440             break;
3441
3442           return false;
3443         }
3444
3445       if (stmt_can_throw_internal (stmt))
3446         {
3447           if (dump_enabled_p ())
3448             {
3449               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3450                                "not vectorized: statement can throw an "
3451                                "exception ");
3452               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3453             }
3454
3455           if (is_a <bb_vec_info> (vinfo))
3456             break;
3457
3458           if (gatherscatter != SG_NONE || simd_lane_access)
3459             free_data_ref (dr);
3460           return false;
3461         }
3462
3463       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3464           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3465         {
3466           if (dump_enabled_p ())
3467             {
3468               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3469                                "not vectorized: statement is bitfield "
3470                                "access ");
3471               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3472             }
3473
3474           if (is_a <bb_vec_info> (vinfo))
3475             break;
3476
3477           if (gatherscatter != SG_NONE || simd_lane_access)
3478             free_data_ref (dr);
3479           return false;
3480         }
3481
3482       base = unshare_expr (DR_BASE_ADDRESS (dr));
3483       offset = unshare_expr (DR_OFFSET (dr));
3484       init = unshare_expr (DR_INIT (dr));
3485
3486       if (is_gimple_call (stmt)
3487           && (!gimple_call_internal_p (stmt)
3488               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3489                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3490         {
3491           if (dump_enabled_p ())
3492             {
3493               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3494                                "not vectorized: dr in a call ");
3495               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3496             }
3497
3498           if (is_a <bb_vec_info> (vinfo))
3499             break;
3500
3501           if (gatherscatter != SG_NONE || simd_lane_access)
3502             free_data_ref (dr);
3503           return false;
3504         }
3505
3506       /* Update DR field in stmt_vec_info struct.  */
3507
3508       /* If the dataref is in an inner-loop of the loop that is considered for
3509          for vectorization, we also want to analyze the access relative to
3510          the outer-loop (DR contains information only relative to the
3511          inner-most enclosing loop).  We do that by building a reference to the
3512          first location accessed by the inner-loop, and analyze it relative to
3513          the outer-loop.  */
3514       if (loop && nested_in_vect_loop_p (loop, stmt))
3515         {
3516           /* Build a reference to the first location accessed by the
3517              inner loop: *(BASE + INIT + OFFSET).  By construction,
3518              this address must be invariant in the inner loop, so we
3519              can consider it as being used in the outer loop.  */
3520           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3521                                           init, offset);
3522           tree init_addr = fold_build_pointer_plus (base, init_offset);
3523           tree init_ref = build_fold_indirect_ref (init_addr);
3524
3525           if (dump_enabled_p ())
3526             {
3527               dump_printf_loc (MSG_NOTE, vect_location,
3528                                "analyze in outer loop: ");
3529               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3530               dump_printf (MSG_NOTE, "\n");
3531             }
3532
3533           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3534                                      init_ref, loop))
3535             /* dr_analyze_innermost already explained the failure.  */
3536             return false;
3537
3538           if (dump_enabled_p ())
3539             {
3540               dump_printf_loc (MSG_NOTE, vect_location,
3541                                "\touter base_address: ");
3542               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3543                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3544               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3545               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3546                                  STMT_VINFO_DR_OFFSET (stmt_info));
3547               dump_printf (MSG_NOTE,
3548                            "\n\touter constant offset from base address: ");
3549               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3550                                  STMT_VINFO_DR_INIT (stmt_info));
3551               dump_printf (MSG_NOTE, "\n\touter step: ");
3552               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3553                                  STMT_VINFO_DR_STEP (stmt_info));
3554               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3555                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3556               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3557                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3558               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3559                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3560               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3561                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3562             }
3563         }
3564
3565       if (STMT_VINFO_DATA_REF (stmt_info))
3566         {
3567           if (dump_enabled_p ())
3568             {
3569               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570                                "not vectorized: more than one data ref "
3571                                "in stmt: ");
3572               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3573             }
3574
3575           if (is_a <bb_vec_info> (vinfo))
3576             break;
3577
3578           if (gatherscatter != SG_NONE || simd_lane_access)
3579             free_data_ref (dr);
3580           return false;
3581         }
3582
3583       STMT_VINFO_DATA_REF (stmt_info) = dr;
3584       if (simd_lane_access)
3585         {
3586           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3587           free_data_ref (datarefs[i]);
3588           datarefs[i] = dr;
3589         }
3590
3591       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3592           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3593           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3594         {
3595           if (dump_enabled_p ())
3596             {
3597               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598                                "not vectorized: base object not addressable "
3599                                "for stmt: ");
3600               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3601             }
3602           if (is_a <bb_vec_info> (vinfo))
3603             {
3604               /* In BB vectorization the ref can still participate
3605                  in dependence analysis, we just can't vectorize it.  */
3606               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3607               continue;
3608             }
3609           return false;
3610         }
3611
3612       /* Set vectype for STMT.  */
3613       scalar_type = TREE_TYPE (DR_REF (dr));
3614       STMT_VINFO_VECTYPE (stmt_info)
3615         = get_vectype_for_scalar_type (scalar_type);
3616       if (!STMT_VINFO_VECTYPE (stmt_info))
3617         {
3618           if (dump_enabled_p ())
3619             {
3620               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621                                "not vectorized: no vectype for stmt: ");
3622               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3624               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3625                                  scalar_type);
3626               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3627             }
3628
3629           if (is_a <bb_vec_info> (vinfo))
3630             {
3631               /* No vector type is fine, the ref can still participate
3632                  in dependence analysis, we just can't vectorize it.  */
3633               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3634               continue;
3635             }
3636
3637           if (gatherscatter != SG_NONE || simd_lane_access)
3638             {
3639               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3640               if (gatherscatter != SG_NONE)
3641                 free_data_ref (dr);
3642             }
3643           return false;
3644         }
3645       else
3646         {
3647           if (dump_enabled_p ())
3648             {
3649               dump_printf_loc (MSG_NOTE, vect_location,
3650                                "got vectype for stmt: ");
3651               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3652               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653                                  STMT_VINFO_VECTYPE (stmt_info));
3654               dump_printf (MSG_NOTE, "\n");
3655             }
3656         }
3657
3658       /* Adjust the minimal vectorization factor according to the
3659          vector type.  */
3660       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3661       if (vf > *min_vf)
3662         *min_vf = vf;
3663
3664       if (gatherscatter != SG_NONE)
3665         {
3666           gather_scatter_info gs_info;
3667           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3668                                           &gs_info)
3669               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3670             {
3671               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3672               free_data_ref (dr);
3673               if (dump_enabled_p ())
3674                 {
3675                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3676                                    (gatherscatter == GATHER) ?
3677                                    "not vectorized: not suitable for gather "
3678                                    "load " :
3679                                    "not vectorized: not suitable for scatter "
3680                                    "store ");
3681                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682                 }
3683               return false;
3684             }
3685
3686           free_data_ref (datarefs[i]);
3687           datarefs[i] = dr;
3688           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3689         }
3690
3691       else if (is_a <loop_vec_info> (vinfo)
3692                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3693         {
3694           if (nested_in_vect_loop_p (loop, stmt))
3695             {
3696               if (dump_enabled_p ())
3697                 {
3698                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699                                    "not vectorized: not suitable for strided "
3700                                    "load ");
3701                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702                 }
3703               return false;
3704             }
3705           STMT_VINFO_STRIDED_P (stmt_info) = true;
3706         }
3707     }
3708
3709   /* If we stopped analysis at the first dataref we could not analyze
3710      when trying to vectorize a basic-block mark the rest of the datarefs
3711      as not vectorizable and truncate the vector of datarefs.  That
3712      avoids spending useless time in analyzing their dependence.  */
3713   if (i != datarefs.length ())
3714     {
3715       gcc_assert (is_a <bb_vec_info> (vinfo));
3716       for (unsigned j = i; j < datarefs.length (); ++j)
3717         {
3718           data_reference_p dr = datarefs[j];
3719           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3720           free_data_ref (dr);
3721         }
3722       datarefs.truncate (i);
3723     }
3724
3725   return true;
3726 }
3727
3728
3729 /* Function vect_get_new_vect_var.
3730
3731    Returns a name for a new variable.  The current naming scheme appends the
3732    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3733    the name of vectorizer generated variables, and appends that to NAME if
3734    provided.  */
3735
3736 tree
3737 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3738 {
3739   const char *prefix;
3740   tree new_vect_var;
3741
3742   switch (var_kind)
3743   {
3744   case vect_simple_var:
3745     prefix = "vect";
3746     break;
3747   case vect_scalar_var:
3748     prefix = "stmp";
3749     break;
3750   case vect_mask_var:
3751     prefix = "mask";
3752     break;
3753   case vect_pointer_var:
3754     prefix = "vectp";
3755     break;
3756   default:
3757     gcc_unreachable ();
3758   }
3759
3760   if (name)
3761     {
3762       char* tmp = concat (prefix, "_", name, NULL);
3763       new_vect_var = create_tmp_reg (type, tmp);
3764       free (tmp);
3765     }
3766   else
3767     new_vect_var = create_tmp_reg (type, prefix);
3768
3769   return new_vect_var;
3770 }
3771
3772 /* Like vect_get_new_vect_var but return an SSA name.  */
3773
3774 tree
3775 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3776 {
3777   const char *prefix;
3778   tree new_vect_var;
3779
3780   switch (var_kind)
3781   {
3782   case vect_simple_var:
3783     prefix = "vect";
3784     break;
3785   case vect_scalar_var:
3786     prefix = "stmp";
3787     break;
3788   case vect_pointer_var:
3789     prefix = "vectp";
3790     break;
3791   default:
3792     gcc_unreachable ();
3793   }
3794
3795   if (name)
3796     {
3797       char* tmp = concat (prefix, "_", name, NULL);
3798       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3799       free (tmp);
3800     }
3801   else
3802     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3803
3804   return new_vect_var;
3805 }
3806
3807 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3808
3809 static void
3810 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3811                                   stmt_vec_info stmt_info)
3812 {
3813   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3814   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3815   int misalign = DR_MISALIGNMENT (dr);
3816   if (misalign == DR_MISALIGNMENT_UNKNOWN)
3817     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3818   else
3819     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3820 }
3821
3822 /* Function vect_create_addr_base_for_vector_ref.
3823
3824    Create an expression that computes the address of the first memory location
3825    that will be accessed for a data reference.
3826
3827    Input:
3828    STMT: The statement containing the data reference.
3829    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3830    OFFSET: Optional. If supplied, it is be added to the initial address.
3831    LOOP:    Specify relative to which loop-nest should the address be computed.
3832             For example, when the dataref is in an inner-loop nested in an
3833             outer-loop that is now being vectorized, LOOP can be either the
3834             outer-loop, or the inner-loop.  The first memory location accessed
3835             by the following dataref ('in' points to short):
3836
3837                 for (i=0; i<N; i++)
3838                    for (j=0; j<M; j++)
3839                      s += in[i+j]
3840
3841             is as follows:
3842             if LOOP=i_loop:     &in             (relative to i_loop)
3843             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3844    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3845             initial address.  Unlike OFFSET, which is number of elements to
3846             be added, BYTE_OFFSET is measured in bytes.
3847
3848    Output:
3849    1. Return an SSA_NAME whose value is the address of the memory location of
3850       the first vector of the data reference.
3851    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3852       these statement(s) which define the returned SSA_NAME.
3853
3854    FORNOW: We are only handling array accesses with step 1.  */
3855
3856 tree
3857 vect_create_addr_base_for_vector_ref (gimple *stmt,
3858                                       gimple_seq *new_stmt_list,
3859                                       tree offset,
3860                                       tree byte_offset)
3861 {
3862   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3863   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3864   const char *base_name;
3865   tree addr_base;
3866   tree dest;
3867   gimple_seq seq = NULL;
3868   tree vect_ptr_type;
3869   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3870   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3871   innermost_loop_behavior *drb = vect_dr_behavior (dr);
3872
3873   tree data_ref_base = unshare_expr (drb->base_address);
3874   tree base_offset = unshare_expr (drb->offset);
3875   tree init = unshare_expr (drb->init);
3876
3877   if (loop_vinfo)
3878     base_name = get_name (data_ref_base);
3879   else
3880     {
3881       base_offset = ssize_int (0);
3882       init = ssize_int (0);
3883       base_name = get_name (DR_REF (dr));
3884     }
3885
3886   /* Create base_offset */
3887   base_offset = size_binop (PLUS_EXPR,
3888                             fold_convert (sizetype, base_offset),
3889                             fold_convert (sizetype, init));
3890
3891   if (offset)
3892     {
3893       offset = fold_build2 (MULT_EXPR, sizetype,
3894                             fold_convert (sizetype, offset), step);
3895       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3896                                  base_offset, offset);
3897     }
3898   if (byte_offset)
3899     {
3900       byte_offset = fold_convert (sizetype, byte_offset);
3901       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3902                                  base_offset, byte_offset);
3903     }
3904
3905   /* base + base_offset */
3906   if (loop_vinfo)
3907     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3908   else
3909     {
3910       addr_base = build1 (ADDR_EXPR,
3911                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3912                           unshare_expr (DR_REF (dr)));
3913     }
3914
3915   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3916   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3917   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3918   gimple_seq_add_seq (new_stmt_list, seq);
3919
3920   if (DR_PTR_INFO (dr)
3921       && TREE_CODE (addr_base) == SSA_NAME
3922       && !SSA_NAME_PTR_INFO (addr_base))
3923     {
3924       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3925       if (offset || byte_offset)
3926         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3927     }
3928
3929   if (dump_enabled_p ())
3930     {
3931       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3932       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3933       dump_printf (MSG_NOTE, "\n");
3934     }
3935
3936   return addr_base;
3937 }
3938
3939
3940 /* Function vect_create_data_ref_ptr.
3941
3942    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3943    location accessed in the loop by STMT, along with the def-use update
3944    chain to appropriately advance the pointer through the loop iterations.
3945    Also set aliasing information for the pointer.  This pointer is used by
3946    the callers to this function to create a memory reference expression for
3947    vector load/store access.
3948
3949    Input:
3950    1. STMT: a stmt that references memory. Expected to be of the form
3951          GIMPLE_ASSIGN <name, data-ref> or
3952          GIMPLE_ASSIGN <data-ref, name>.
3953    2. AGGR_TYPE: the type of the reference, which should be either a vector
3954         or an array.
3955    3. AT_LOOP: the loop where the vector memref is to be created.
3956    4. OFFSET (optional): an offset to be added to the initial address accessed
3957         by the data-ref in STMT.
3958    5. BSI: location where the new stmts are to be placed if there is no loop
3959    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3960         pointing to the initial address.
3961    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3962         to the initial address accessed by the data-ref in STMT.  This is
3963         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3964         in bytes.
3965
3966    Output:
3967    1. Declare a new ptr to vector_type, and have it point to the base of the
3968       data reference (initial addressed accessed by the data reference).
3969       For example, for vector of type V8HI, the following code is generated:
3970
3971       v8hi *ap;
3972       ap = (v8hi *)initial_address;
3973
3974       if OFFSET is not supplied:
3975          initial_address = &a[init];
3976       if OFFSET is supplied:
3977          initial_address = &a[init + OFFSET];
3978       if BYTE_OFFSET is supplied:
3979          initial_address = &a[init] + BYTE_OFFSET;
3980
3981       Return the initial_address in INITIAL_ADDRESS.
3982
3983    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
3984       update the pointer in each iteration of the loop.
3985
3986       Return the increment stmt that updates the pointer in PTR_INCR.
3987
3988    3. Set INV_P to true if the access pattern of the data reference in the
3989       vectorized loop is invariant.  Set it to false otherwise.
3990
3991    4. Return the pointer.  */
3992
3993 tree
3994 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
3995                           tree offset, tree *initial_address,
3996                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
3997                           bool only_init, bool *inv_p, tree byte_offset)
3998 {
3999   const char *base_name;
4000   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4001   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4002   struct loop *loop = NULL;
4003   bool nested_in_vect_loop = false;
4004   struct loop *containing_loop = NULL;
4005   tree aggr_ptr_type;
4006   tree aggr_ptr;
4007   tree new_temp;
4008   gimple_seq new_stmt_list = NULL;
4009   edge pe = NULL;
4010   basic_block new_bb;
4011   tree aggr_ptr_init;
4012   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4013   tree aptr;
4014   gimple_stmt_iterator incr_gsi;
4015   bool insert_after;
4016   tree indx_before_incr, indx_after_incr;
4017   gimple *incr;
4018   tree step;
4019   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4020
4021   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4022               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4023
4024   if (loop_vinfo)
4025     {
4026       loop = LOOP_VINFO_LOOP (loop_vinfo);
4027       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4028       containing_loop = (gimple_bb (stmt))->loop_father;
4029       pe = loop_preheader_edge (loop);
4030     }
4031   else
4032     {
4033       gcc_assert (bb_vinfo);
4034       only_init = true;
4035       *ptr_incr = NULL;
4036     }
4037
4038   /* Check the step (evolution) of the load in LOOP, and record
4039      whether it's invariant.  */
4040   step = vect_dr_behavior (dr)->step;
4041   if (integer_zerop (step))
4042     *inv_p = true;
4043   else
4044     *inv_p = false;
4045
4046   /* Create an expression for the first address accessed by this load
4047      in LOOP.  */
4048   base_name = get_name (DR_BASE_ADDRESS (dr));
4049
4050   if (dump_enabled_p ())
4051     {
4052       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4053       dump_printf_loc (MSG_NOTE, vect_location,
4054                        "create %s-pointer variable to type: ",
4055                        get_tree_code_name (TREE_CODE (aggr_type)));
4056       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4057       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4058         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4059       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4060         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4061       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4062         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4063       else
4064         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4065       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4066       dump_printf (MSG_NOTE, "\n");
4067     }
4068
4069   /* (1) Create the new aggregate-pointer variable.
4070      Vector and array types inherit the alias set of their component
4071      type by default so we need to use a ref-all pointer if the data
4072      reference does not conflict with the created aggregated data
4073      reference because it is not addressable.  */
4074   bool need_ref_all = false;
4075   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4076                               get_alias_set (DR_REF (dr))))
4077     need_ref_all = true;
4078   /* Likewise for any of the data references in the stmt group.  */
4079   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4080     {
4081       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4082       do
4083         {
4084           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4085           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4086           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4087                                       get_alias_set (DR_REF (sdr))))
4088             {
4089               need_ref_all = true;
4090               break;
4091             }
4092           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4093         }
4094       while (orig_stmt);
4095     }
4096   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4097                                                need_ref_all);
4098   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4099
4100
4101   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4102      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4103      def-use update cycles for the pointer: one relative to the outer-loop
4104      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4105      to the inner-loop (which is the inner-most loop containing the dataref),
4106      and this is done be step (5) below.
4107
4108      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4109      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4110      redundant.  Steps (3),(4) create the following:
4111
4112         vp0 = &base_addr;
4113         LOOP:   vp1 = phi(vp0,vp2)
4114                 ...
4115                 ...
4116                 vp2 = vp1 + step
4117                 goto LOOP
4118
4119      If there is an inner-loop nested in loop, then step (5) will also be
4120      applied, and an additional update in the inner-loop will be created:
4121
4122         vp0 = &base_addr;
4123         LOOP:   vp1 = phi(vp0,vp2)
4124                 ...
4125         inner:     vp3 = phi(vp1,vp4)
4126                    vp4 = vp3 + inner_step
4127                    if () goto inner
4128                 ...
4129                 vp2 = vp1 + step
4130                 if () goto LOOP   */
4131
4132   /* (2) Calculate the initial address of the aggregate-pointer, and set
4133      the aggregate-pointer to point to it before the loop.  */
4134
4135   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4136
4137   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4138                                                    offset, byte_offset);
4139   if (new_stmt_list)
4140     {
4141       if (pe)
4142         {
4143           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4144           gcc_assert (!new_bb);
4145         }
4146       else
4147         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4148     }
4149
4150   *initial_address = new_temp;
4151   aggr_ptr_init = new_temp;
4152
4153   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4154      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4155      inner-loop nested in LOOP (during outer-loop vectorization).  */
4156
4157   /* No update in loop is required.  */
4158   if (only_init && (!loop_vinfo || at_loop == loop))
4159     aptr = aggr_ptr_init;
4160   else
4161     {
4162       /* The step of the aggregate pointer is the type size.  */
4163       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4164       /* One exception to the above is when the scalar step of the load in
4165          LOOP is zero. In this case the step here is also zero.  */
4166       if (*inv_p)
4167         iv_step = size_zero_node;
4168       else if (tree_int_cst_sgn (step) == -1)
4169         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4170
4171       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4172
4173       create_iv (aggr_ptr_init,
4174                  fold_convert (aggr_ptr_type, iv_step),
4175                  aggr_ptr, loop, &incr_gsi, insert_after,
4176                  &indx_before_incr, &indx_after_incr);
4177       incr = gsi_stmt (incr_gsi);
4178       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4179
4180       /* Copy the points-to information if it exists. */
4181       if (DR_PTR_INFO (dr))
4182         {
4183           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4184           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4185         }
4186       if (ptr_incr)
4187         *ptr_incr = incr;
4188
4189       aptr = indx_before_incr;
4190     }
4191
4192   if (!nested_in_vect_loop || only_init)
4193     return aptr;
4194
4195
4196   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4197      nested in LOOP, if exists.  */
4198
4199   gcc_assert (nested_in_vect_loop);
4200   if (!only_init)
4201     {
4202       standard_iv_increment_position (containing_loop, &incr_gsi,
4203                                       &insert_after);
4204       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4205                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4206                  &indx_after_incr);
4207       incr = gsi_stmt (incr_gsi);
4208       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4209
4210       /* Copy the points-to information if it exists. */
4211       if (DR_PTR_INFO (dr))
4212         {
4213           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4214           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4215         }
4216       if (ptr_incr)
4217         *ptr_incr = incr;
4218
4219       return indx_before_incr;
4220     }
4221   else
4222     gcc_unreachable ();
4223 }
4224
4225
4226 /* Function bump_vector_ptr
4227
4228    Increment a pointer (to a vector type) by vector-size. If requested,
4229    i.e. if PTR-INCR is given, then also connect the new increment stmt
4230    to the existing def-use update-chain of the pointer, by modifying
4231    the PTR_INCR as illustrated below:
4232
4233    The pointer def-use update-chain before this function:
4234                         DATAREF_PTR = phi (p_0, p_2)
4235                         ....
4236         PTR_INCR:       p_2 = DATAREF_PTR + step
4237
4238    The pointer def-use update-chain after this function:
4239                         DATAREF_PTR = phi (p_0, p_2)
4240                         ....
4241                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4242                         ....
4243         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4244
4245    Input:
4246    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4247                  in the loop.
4248    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4249               the loop.  The increment amount across iterations is expected
4250               to be vector_size.
4251    BSI - location where the new update stmt is to be placed.
4252    STMT - the original scalar memory-access stmt that is being vectorized.
4253    BUMP - optional. The offset by which to bump the pointer. If not given,
4254           the offset is assumed to be vector_size.
4255
4256    Output: Return NEW_DATAREF_PTR as illustrated above.
4257
4258 */
4259
4260 tree
4261 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4262                  gimple *stmt, tree bump)
4263 {
4264   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4265   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4266   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4267   tree update = TYPE_SIZE_UNIT (vectype);
4268   gassign *incr_stmt;
4269   ssa_op_iter iter;
4270   use_operand_p use_p;
4271   tree new_dataref_ptr;
4272
4273   if (bump)
4274     update = bump;
4275
4276   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4277     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4278   else
4279     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4280   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4281                                    dataref_ptr, update);
4282   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4283
4284   /* Copy the points-to information if it exists. */
4285   if (DR_PTR_INFO (dr))
4286     {
4287       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4288       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4289     }
4290
4291   if (!ptr_incr)
4292     return new_dataref_ptr;
4293
4294   /* Update the vector-pointer's cross-iteration increment.  */
4295   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4296     {
4297       tree use = USE_FROM_PTR (use_p);
4298
4299       if (use == dataref_ptr)
4300         SET_USE (use_p, new_dataref_ptr);
4301       else
4302         gcc_assert (tree_int_cst_compare (use, update) == 0);
4303     }
4304
4305   return new_dataref_ptr;
4306 }
4307
4308
4309 /* Function vect_create_destination_var.
4310
4311    Create a new temporary of type VECTYPE.  */
4312
4313 tree
4314 vect_create_destination_var (tree scalar_dest, tree vectype)
4315 {
4316   tree vec_dest;
4317   const char *name;
4318   char *new_name;
4319   tree type;
4320   enum vect_var_kind kind;
4321
4322   kind = vectype
4323     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4324     ? vect_mask_var
4325     : vect_simple_var
4326     : vect_scalar_var;
4327   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4328
4329   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4330
4331   name = get_name (scalar_dest);
4332   if (name)
4333     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4334   else
4335     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4336   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4337   free (new_name);
4338
4339   return vec_dest;
4340 }
4341
4342 /* Function vect_grouped_store_supported.
4343
4344    Returns TRUE if interleave high and interleave low permutations
4345    are supported, and FALSE otherwise.  */
4346
4347 bool
4348 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4349 {
4350   machine_mode mode = TYPE_MODE (vectype);
4351
4352   /* vect_permute_store_chain requires the group size to be equal to 3 or
4353      be a power of two.  */
4354   if (count != 3 && exact_log2 (count) == -1)
4355     {
4356       if (dump_enabled_p ())
4357         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4358                          "the size of the group of accesses"
4359                          " is not a power of 2 or not eqaul to 3\n");
4360       return false;
4361     }
4362
4363   /* Check that the permutation is supported.  */
4364   if (VECTOR_MODE_P (mode))
4365     {
4366       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4367       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4368
4369       if (count == 3)
4370         {
4371           unsigned int j0 = 0, j1 = 0, j2 = 0;
4372           unsigned int i, j;
4373
4374           for (j = 0; j < 3; j++)
4375             {
4376               int nelt0 = ((3 - j) * nelt) % 3;
4377               int nelt1 = ((3 - j) * nelt + 1) % 3;
4378               int nelt2 = ((3 - j) * nelt + 2) % 3;
4379               for (i = 0; i < nelt; i++)
4380                 {
4381                   if (3 * i + nelt0 < nelt)
4382                     sel[3 * i + nelt0] = j0++;
4383                   if (3 * i + nelt1 < nelt)
4384                     sel[3 * i + nelt1] = nelt + j1++;
4385                   if (3 * i + nelt2 < nelt)
4386                     sel[3 * i + nelt2] = 0;
4387                 }
4388               if (!can_vec_perm_p (mode, false, sel))
4389                 {
4390                   if (dump_enabled_p ())
4391                     dump_printf (MSG_MISSED_OPTIMIZATION,
4392                                  "permutaion op not supported by target.\n");
4393                   return false;
4394                 }
4395
4396               for (i = 0; i < nelt; i++)
4397                 {
4398                   if (3 * i + nelt0 < nelt)
4399                     sel[3 * i + nelt0] = 3 * i + nelt0;
4400                   if (3 * i + nelt1 < nelt)
4401                     sel[3 * i + nelt1] = 3 * i + nelt1;
4402                   if (3 * i + nelt2 < nelt)
4403                     sel[3 * i + nelt2] = nelt + j2++;
4404                 }
4405               if (!can_vec_perm_p (mode, false, sel))
4406                 {
4407                   if (dump_enabled_p ())
4408                     dump_printf (MSG_MISSED_OPTIMIZATION,
4409                                  "permutaion op not supported by target.\n");
4410                   return false;
4411                 }
4412             }
4413           return true;
4414         }
4415       else
4416         {
4417           /* If length is not equal to 3 then only power of 2 is supported.  */
4418           gcc_assert (pow2p_hwi (count));
4419
4420           for (i = 0; i < nelt / 2; i++)
4421             {
4422               sel[i * 2] = i;
4423               sel[i * 2 + 1] = i + nelt;
4424             }
4425             if (can_vec_perm_p (mode, false, sel))
4426               {
4427                 for (i = 0; i < nelt; i++)
4428                   sel[i] += nelt / 2;
4429                 if (can_vec_perm_p (mode, false, sel))
4430                   return true;
4431               }
4432         }
4433     }
4434
4435   if (dump_enabled_p ())
4436     dump_printf (MSG_MISSED_OPTIMIZATION,
4437                  "permutaion op not supported by target.\n");
4438   return false;
4439 }
4440
4441
4442 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4443    type VECTYPE.  */
4444
4445 bool
4446 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4447 {
4448   return vect_lanes_optab_supported_p ("vec_store_lanes",
4449                                        vec_store_lanes_optab,
4450                                        vectype, count);
4451 }
4452
4453
4454 /* Function vect_permute_store_chain.
4455
4456    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4457    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4458    the data correctly for the stores.  Return the final references for stores
4459    in RESULT_CHAIN.
4460
4461    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4462    The input is 4 vectors each containing 8 elements.  We assign a number to
4463    each element, the input sequence is:
4464
4465    1st vec:   0  1  2  3  4  5  6  7
4466    2nd vec:   8  9 10 11 12 13 14 15
4467    3rd vec:  16 17 18 19 20 21 22 23
4468    4th vec:  24 25 26 27 28 29 30 31
4469
4470    The output sequence should be:
4471
4472    1st vec:  0  8 16 24  1  9 17 25
4473    2nd vec:  2 10 18 26  3 11 19 27
4474    3rd vec:  4 12 20 28  5 13 21 30
4475    4th vec:  6 14 22 30  7 15 23 31
4476
4477    i.e., we interleave the contents of the four vectors in their order.
4478
4479    We use interleave_high/low instructions to create such output.  The input of
4480    each interleave_high/low operation is two vectors:
4481    1st vec    2nd vec
4482    0 1 2 3    4 5 6 7
4483    the even elements of the result vector are obtained left-to-right from the
4484    high/low elements of the first vector.  The odd elements of the result are
4485    obtained left-to-right from the high/low elements of the second vector.
4486    The output of interleave_high will be:   0 4 1 5
4487    and of interleave_low:                   2 6 3 7
4488
4489
4490    The permutation is done in log LENGTH stages.  In each stage interleave_high
4491    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4492    where the first argument is taken from the first half of DR_CHAIN and the
4493    second argument from it's second half.
4494    In our example,
4495
4496    I1: interleave_high (1st vec, 3rd vec)
4497    I2: interleave_low (1st vec, 3rd vec)
4498    I3: interleave_high (2nd vec, 4th vec)
4499    I4: interleave_low (2nd vec, 4th vec)
4500
4501    The output for the first stage is:
4502
4503    I1:  0 16  1 17  2 18  3 19
4504    I2:  4 20  5 21  6 22  7 23
4505    I3:  8 24  9 25 10 26 11 27
4506    I4: 12 28 13 29 14 30 15 31
4507
4508    The output of the second stage, i.e. the final result is:
4509
4510    I1:  0  8 16 24  1  9 17 25
4511    I2:  2 10 18 26  3 11 19 27
4512    I3:  4 12 20 28  5 13 21 30
4513    I4:  6 14 22 30  7 15 23 31.  */
4514
4515 void
4516 vect_permute_store_chain (vec<tree> dr_chain,
4517                           unsigned int length,
4518                           gimple *stmt,
4519                           gimple_stmt_iterator *gsi,
4520                           vec<tree> *result_chain)
4521 {
4522   tree vect1, vect2, high, low;
4523   gimple *perm_stmt;
4524   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4525   tree perm_mask_low, perm_mask_high;
4526   tree data_ref;
4527   tree perm3_mask_low, perm3_mask_high;
4528   unsigned int i, n, log_length = exact_log2 (length);
4529   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4530   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4531
4532   result_chain->quick_grow (length);
4533   memcpy (result_chain->address (), dr_chain.address (),
4534           length * sizeof (tree));
4535
4536   if (length == 3)
4537     {
4538       unsigned int j0 = 0, j1 = 0, j2 = 0;
4539
4540       for (j = 0; j < 3; j++)
4541         {
4542           int nelt0 = ((3 - j) * nelt) % 3;
4543           int nelt1 = ((3 - j) * nelt + 1) % 3;
4544           int nelt2 = ((3 - j) * nelt + 2) % 3;
4545
4546           for (i = 0; i < nelt; i++)
4547             {
4548               if (3 * i + nelt0 < nelt)
4549                 sel[3 * i + nelt0] = j0++;
4550               if (3 * i + nelt1 < nelt)
4551                 sel[3 * i + nelt1] = nelt + j1++;
4552               if (3 * i + nelt2 < nelt)
4553                 sel[3 * i + nelt2] = 0;
4554             }
4555           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4556
4557           for (i = 0; i < nelt; i++)
4558             {
4559               if (3 * i + nelt0 < nelt)
4560                 sel[3 * i + nelt0] = 3 * i + nelt0;
4561               if (3 * i + nelt1 < nelt)
4562                 sel[3 * i + nelt1] = 3 * i + nelt1;
4563               if (3 * i + nelt2 < nelt)
4564                 sel[3 * i + nelt2] = nelt + j2++;
4565             }
4566           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4567
4568           vect1 = dr_chain[0];
4569           vect2 = dr_chain[1];
4570
4571           /* Create interleaving stmt:
4572              low = VEC_PERM_EXPR <vect1, vect2,
4573                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4574                                    j + 2, nelt + j + 2, *, ...}>  */
4575           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4576           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4577                                            vect2, perm3_mask_low);
4578           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4579
4580           vect1 = data_ref;
4581           vect2 = dr_chain[2];
4582           /* Create interleaving stmt:
4583              low = VEC_PERM_EXPR <vect1, vect2,
4584                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4585                                    6, 7, nelt + j + 2, ...}>  */
4586           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4587           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4588                                            vect2, perm3_mask_high);
4589           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4590           (*result_chain)[j] = data_ref;
4591         }
4592     }
4593   else
4594     {
4595       /* If length is not equal to 3 then only power of 2 is supported.  */
4596       gcc_assert (pow2p_hwi (length));
4597
4598       for (i = 0, n = nelt / 2; i < n; i++)
4599         {
4600           sel[i * 2] = i;
4601           sel[i * 2 + 1] = i + nelt;
4602         }
4603         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4604
4605         for (i = 0; i < nelt; i++)
4606           sel[i] += nelt / 2;
4607         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4608
4609         for (i = 0, n = log_length; i < n; i++)
4610           {
4611             for (j = 0; j < length/2; j++)
4612               {
4613                 vect1 = dr_chain[j];
4614                 vect2 = dr_chain[j+length/2];
4615
4616                 /* Create interleaving stmt:
4617                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4618                                                         ...}>  */
4619                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4620                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4621                                                  vect2, perm_mask_high);
4622                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4623                 (*result_chain)[2*j] = high;
4624
4625                 /* Create interleaving stmt:
4626                    low = VEC_PERM_EXPR <vect1, vect2,
4627                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4628                                          ...}>  */
4629                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4630                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4631                                                  vect2, perm_mask_low);
4632                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4633                 (*result_chain)[2*j+1] = low;
4634               }
4635             memcpy (dr_chain.address (), result_chain->address (),
4636                     length * sizeof (tree));
4637           }
4638     }
4639 }
4640
4641 /* Function vect_setup_realignment
4642
4643    This function is called when vectorizing an unaligned load using
4644    the dr_explicit_realign[_optimized] scheme.
4645    This function generates the following code at the loop prolog:
4646
4647       p = initial_addr;
4648    x  msq_init = *(floor(p));   # prolog load
4649       realignment_token = call target_builtin;
4650     loop:
4651    x  msq = phi (msq_init, ---)
4652
4653    The stmts marked with x are generated only for the case of
4654    dr_explicit_realign_optimized.
4655
4656    The code above sets up a new (vector) pointer, pointing to the first
4657    location accessed by STMT, and a "floor-aligned" load using that pointer.
4658    It also generates code to compute the "realignment-token" (if the relevant
4659    target hook was defined), and creates a phi-node at the loop-header bb
4660    whose arguments are the result of the prolog-load (created by this
4661    function) and the result of a load that takes place in the loop (to be
4662    created by the caller to this function).
4663
4664    For the case of dr_explicit_realign_optimized:
4665    The caller to this function uses the phi-result (msq) to create the
4666    realignment code inside the loop, and sets up the missing phi argument,
4667    as follows:
4668     loop:
4669       msq = phi (msq_init, lsq)
4670       lsq = *(floor(p'));        # load in loop
4671       result = realign_load (msq, lsq, realignment_token);
4672
4673    For the case of dr_explicit_realign:
4674     loop:
4675       msq = *(floor(p));        # load in loop
4676       p' = p + (VS-1);
4677       lsq = *(floor(p'));       # load in loop
4678       result = realign_load (msq, lsq, realignment_token);
4679
4680    Input:
4681    STMT - (scalar) load stmt to be vectorized. This load accesses
4682           a memory location that may be unaligned.
4683    BSI - place where new code is to be inserted.
4684    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4685                               is used.
4686
4687    Output:
4688    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4689                        target hook, if defined.
4690    Return value - the result of the loop-header phi node.  */
4691
4692 tree
4693 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4694                         tree *realignment_token,
4695                         enum dr_alignment_support alignment_support_scheme,
4696                         tree init_addr,
4697                         struct loop **at_loop)
4698 {
4699   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4700   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4701   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4702   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4703   struct loop *loop = NULL;
4704   edge pe = NULL;
4705   tree scalar_dest = gimple_assign_lhs (stmt);
4706   tree vec_dest;
4707   gimple *inc;
4708   tree ptr;
4709   tree data_ref;
4710   basic_block new_bb;
4711   tree msq_init = NULL_TREE;
4712   tree new_temp;
4713   gphi *phi_stmt;
4714   tree msq = NULL_TREE;
4715   gimple_seq stmts = NULL;
4716   bool inv_p;
4717   bool compute_in_loop = false;
4718   bool nested_in_vect_loop = false;
4719   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4720   struct loop *loop_for_initial_load = NULL;
4721
4722   if (loop_vinfo)
4723     {
4724       loop = LOOP_VINFO_LOOP (loop_vinfo);
4725       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4726     }
4727
4728   gcc_assert (alignment_support_scheme == dr_explicit_realign
4729               || alignment_support_scheme == dr_explicit_realign_optimized);
4730
4731   /* We need to generate three things:
4732      1. the misalignment computation
4733      2. the extra vector load (for the optimized realignment scheme).
4734      3. the phi node for the two vectors from which the realignment is
4735       done (for the optimized realignment scheme).  */
4736
4737   /* 1. Determine where to generate the misalignment computation.
4738
4739      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4740      calculation will be generated by this function, outside the loop (in the
4741      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4742      caller, inside the loop.
4743
4744      Background: If the misalignment remains fixed throughout the iterations of
4745      the loop, then both realignment schemes are applicable, and also the
4746      misalignment computation can be done outside LOOP.  This is because we are
4747      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4748      are a multiple of VS (the Vector Size), and therefore the misalignment in
4749      different vectorized LOOP iterations is always the same.
4750      The problem arises only if the memory access is in an inner-loop nested
4751      inside LOOP, which is now being vectorized using outer-loop vectorization.
4752      This is the only case when the misalignment of the memory access may not
4753      remain fixed throughout the iterations of the inner-loop (as explained in
4754      detail in vect_supportable_dr_alignment).  In this case, not only is the
4755      optimized realignment scheme not applicable, but also the misalignment
4756      computation (and generation of the realignment token that is passed to
4757      REALIGN_LOAD) have to be done inside the loop.
4758
4759      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4760      or not, which in turn determines if the misalignment is computed inside
4761      the inner-loop, or outside LOOP.  */
4762
4763   if (init_addr != NULL_TREE || !loop_vinfo)
4764     {
4765       compute_in_loop = true;
4766       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4767     }
4768
4769
4770   /* 2. Determine where to generate the extra vector load.
4771
4772      For the optimized realignment scheme, instead of generating two vector
4773      loads in each iteration, we generate a single extra vector load in the
4774      preheader of the loop, and in each iteration reuse the result of the
4775      vector load from the previous iteration.  In case the memory access is in
4776      an inner-loop nested inside LOOP, which is now being vectorized using
4777      outer-loop vectorization, we need to determine whether this initial vector
4778      load should be generated at the preheader of the inner-loop, or can be
4779      generated at the preheader of LOOP.  If the memory access has no evolution
4780      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4781      to be generated inside LOOP (in the preheader of the inner-loop).  */
4782
4783   if (nested_in_vect_loop)
4784     {
4785       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4786       bool invariant_in_outerloop =
4787             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4788       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4789     }
4790   else
4791     loop_for_initial_load = loop;
4792   if (at_loop)
4793     *at_loop = loop_for_initial_load;
4794
4795   if (loop_for_initial_load)
4796     pe = loop_preheader_edge (loop_for_initial_load);
4797
4798   /* 3. For the case of the optimized realignment, create the first vector
4799       load at the loop preheader.  */
4800
4801   if (alignment_support_scheme == dr_explicit_realign_optimized)
4802     {
4803       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4804       gassign *new_stmt;
4805
4806       gcc_assert (!compute_in_loop);
4807       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4808       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4809                                       NULL_TREE, &init_addr, NULL, &inc,
4810                                       true, &inv_p);
4811       if (TREE_CODE (ptr) == SSA_NAME)
4812         new_temp = copy_ssa_name (ptr);
4813       else
4814         new_temp = make_ssa_name (TREE_TYPE (ptr));
4815       new_stmt = gimple_build_assign
4816                    (new_temp, BIT_AND_EXPR, ptr,
4817                     build_int_cst (TREE_TYPE (ptr),
4818                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4819       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4820       gcc_assert (!new_bb);
4821       data_ref
4822         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4823                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4824       new_stmt = gimple_build_assign (vec_dest, data_ref);
4825       new_temp = make_ssa_name (vec_dest, new_stmt);
4826       gimple_assign_set_lhs (new_stmt, new_temp);
4827       if (pe)
4828         {
4829           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4830           gcc_assert (!new_bb);
4831         }
4832       else
4833          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4834
4835       msq_init = gimple_assign_lhs (new_stmt);
4836     }
4837
4838   /* 4. Create realignment token using a target builtin, if available.
4839       It is done either inside the containing loop, or before LOOP (as
4840       determined above).  */
4841
4842   if (targetm.vectorize.builtin_mask_for_load)
4843     {
4844       gcall *new_stmt;
4845       tree builtin_decl;
4846
4847       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4848       if (!init_addr)
4849         {
4850           /* Generate the INIT_ADDR computation outside LOOP.  */
4851           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4852                                                             NULL_TREE);
4853           if (loop)
4854             {
4855               pe = loop_preheader_edge (loop);
4856               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4857               gcc_assert (!new_bb);
4858             }
4859           else
4860              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4861         }
4862
4863       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4864       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4865       vec_dest =
4866         vect_create_destination_var (scalar_dest,
4867                                      gimple_call_return_type (new_stmt));
4868       new_temp = make_ssa_name (vec_dest, new_stmt);
4869       gimple_call_set_lhs (new_stmt, new_temp);
4870
4871       if (compute_in_loop)
4872         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4873       else
4874         {
4875           /* Generate the misalignment computation outside LOOP.  */
4876           pe = loop_preheader_edge (loop);
4877           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4878           gcc_assert (!new_bb);
4879         }
4880
4881       *realignment_token = gimple_call_lhs (new_stmt);
4882
4883       /* The result of the CALL_EXPR to this builtin is determined from
4884          the value of the parameter and no global variables are touched
4885          which makes the builtin a "const" function.  Requiring the
4886          builtin to have the "const" attribute makes it unnecessary
4887          to call mark_call_clobbered.  */
4888       gcc_assert (TREE_READONLY (builtin_decl));
4889     }
4890
4891   if (alignment_support_scheme == dr_explicit_realign)
4892     return msq;
4893
4894   gcc_assert (!compute_in_loop);
4895   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4896
4897
4898   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4899
4900   pe = loop_preheader_edge (containing_loop);
4901   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4902   msq = make_ssa_name (vec_dest);
4903   phi_stmt = create_phi_node (msq, containing_loop->header);
4904   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4905
4906   return msq;
4907 }
4908
4909
4910 /* Function vect_grouped_load_supported.
4911
4912    COUNT is the size of the load group (the number of statements plus the
4913    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
4914    only one statement, with a gap of COUNT - 1.
4915
4916    Returns true if a suitable permute exists.  */
4917
4918 bool
4919 vect_grouped_load_supported (tree vectype, bool single_element_p,
4920                              unsigned HOST_WIDE_INT count)
4921 {
4922   machine_mode mode = TYPE_MODE (vectype);
4923
4924   /* If this is single-element interleaving with an element distance
4925      that leaves unused vector loads around punt - we at least create
4926      very sub-optimal code in that case (and blow up memory,
4927      see PR65518).  */
4928   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
4929     {
4930       if (dump_enabled_p ())
4931         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4932                          "single-element interleaving not supported "
4933                          "for not adjacent vector loads\n");
4934       return false;
4935     }
4936
4937   /* vect_permute_load_chain requires the group size to be equal to 3 or
4938      be a power of two.  */
4939   if (count != 3 && exact_log2 (count) == -1)
4940     {
4941       if (dump_enabled_p ())
4942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4943                          "the size of the group of accesses"
4944                          " is not a power of 2 or not equal to 3\n");
4945       return false;
4946     }
4947
4948   /* Check that the permutation is supported.  */
4949   if (VECTOR_MODE_P (mode))
4950     {
4951       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4952       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4953
4954       if (count == 3)
4955         {
4956           unsigned int k;
4957           for (k = 0; k < 3; k++)
4958             {
4959               for (i = 0; i < nelt; i++)
4960                 if (3 * i + k < 2 * nelt)
4961                   sel[i] = 3 * i + k;
4962                 else
4963                   sel[i] = 0;
4964               if (!can_vec_perm_p (mode, false, sel))
4965                 {
4966                   if (dump_enabled_p ())
4967                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4968                                      "shuffle of 3 loads is not supported by"
4969                                      " target\n");
4970                   return false;
4971                 }
4972               for (i = 0, j = 0; i < nelt; i++)
4973                 if (3 * i + k < 2 * nelt)
4974                   sel[i] = i;
4975                 else
4976                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4977               if (!can_vec_perm_p (mode, false, sel))
4978                 {
4979                   if (dump_enabled_p ())
4980                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4981                                      "shuffle of 3 loads is not supported by"
4982                                      " target\n");
4983                   return false;
4984                 }
4985             }
4986           return true;
4987         }
4988       else
4989         {
4990           /* If length is not equal to 3 then only power of 2 is supported.  */
4991           gcc_assert (pow2p_hwi (count));
4992           for (i = 0; i < nelt; i++)
4993             sel[i] = i * 2;
4994           if (can_vec_perm_p (mode, false, sel))
4995             {
4996               for (i = 0; i < nelt; i++)
4997                 sel[i] = i * 2 + 1;
4998               if (can_vec_perm_p (mode, false, sel))
4999                 return true;
5000             }
5001         }
5002     }
5003
5004   if (dump_enabled_p ())
5005     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5006                      "extract even/odd not supported by target\n");
5007   return false;
5008 }
5009
5010 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5011    type VECTYPE.  */
5012
5013 bool
5014 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5015 {
5016   return vect_lanes_optab_supported_p ("vec_load_lanes",
5017                                        vec_load_lanes_optab,
5018                                        vectype, count);
5019 }
5020
5021 /* Function vect_permute_load_chain.
5022
5023    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5024    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5025    the input data correctly.  Return the final references for loads in
5026    RESULT_CHAIN.
5027
5028    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5029    The input is 4 vectors each containing 8 elements. We assign a number to each
5030    element, the input sequence is:
5031
5032    1st vec:   0  1  2  3  4  5  6  7
5033    2nd vec:   8  9 10 11 12 13 14 15
5034    3rd vec:  16 17 18 19 20 21 22 23
5035    4th vec:  24 25 26 27 28 29 30 31
5036
5037    The output sequence should be:
5038
5039    1st vec:  0 4  8 12 16 20 24 28
5040    2nd vec:  1 5  9 13 17 21 25 29
5041    3rd vec:  2 6 10 14 18 22 26 30
5042    4th vec:  3 7 11 15 19 23 27 31
5043
5044    i.e., the first output vector should contain the first elements of each
5045    interleaving group, etc.
5046
5047    We use extract_even/odd instructions to create such output.  The input of
5048    each extract_even/odd operation is two vectors
5049    1st vec    2nd vec
5050    0 1 2 3    4 5 6 7
5051
5052    and the output is the vector of extracted even/odd elements.  The output of
5053    extract_even will be:   0 2 4 6
5054    and of extract_odd:     1 3 5 7
5055
5056
5057    The permutation is done in log LENGTH stages.  In each stage extract_even
5058    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5059    their order.  In our example,
5060
5061    E1: extract_even (1st vec, 2nd vec)
5062    E2: extract_odd (1st vec, 2nd vec)
5063    E3: extract_even (3rd vec, 4th vec)
5064    E4: extract_odd (3rd vec, 4th vec)
5065
5066    The output for the first stage will be:
5067
5068    E1:  0  2  4  6  8 10 12 14
5069    E2:  1  3  5  7  9 11 13 15
5070    E3: 16 18 20 22 24 26 28 30
5071    E4: 17 19 21 23 25 27 29 31
5072
5073    In order to proceed and create the correct sequence for the next stage (or
5074    for the correct output, if the second stage is the last one, as in our
5075    example), we first put the output of extract_even operation and then the
5076    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5077    The input for the second stage is:
5078
5079    1st vec (E1):  0  2  4  6  8 10 12 14
5080    2nd vec (E3): 16 18 20 22 24 26 28 30
5081    3rd vec (E2):  1  3  5  7  9 11 13 15
5082    4th vec (E4): 17 19 21 23 25 27 29 31
5083
5084    The output of the second stage:
5085
5086    E1: 0 4  8 12 16 20 24 28
5087    E2: 2 6 10 14 18 22 26 30
5088    E3: 1 5  9 13 17 21 25 29
5089    E4: 3 7 11 15 19 23 27 31
5090
5091    And RESULT_CHAIN after reordering:
5092
5093    1st vec (E1):  0 4  8 12 16 20 24 28
5094    2nd vec (E3):  1 5  9 13 17 21 25 29
5095    3rd vec (E2):  2 6 10 14 18 22 26 30
5096    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5097
5098 static void
5099 vect_permute_load_chain (vec<tree> dr_chain,
5100                          unsigned int length,
5101                          gimple *stmt,
5102                          gimple_stmt_iterator *gsi,
5103                          vec<tree> *result_chain)
5104 {
5105   tree data_ref, first_vect, second_vect;
5106   tree perm_mask_even, perm_mask_odd;
5107   tree perm3_mask_low, perm3_mask_high;
5108   gimple *perm_stmt;
5109   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5110   unsigned int i, j, log_length = exact_log2 (length);
5111   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5112   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5113
5114   result_chain->quick_grow (length);
5115   memcpy (result_chain->address (), dr_chain.address (),
5116           length * sizeof (tree));
5117
5118   if (length == 3)
5119     {
5120       unsigned int k;
5121
5122       for (k = 0; k < 3; k++)
5123         {
5124           for (i = 0; i < nelt; i++)
5125             if (3 * i + k < 2 * nelt)
5126               sel[i] = 3 * i + k;
5127             else
5128               sel[i] = 0;
5129           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5130
5131           for (i = 0, j = 0; i < nelt; i++)
5132             if (3 * i + k < 2 * nelt)
5133               sel[i] = i;
5134             else
5135               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5136
5137           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5138
5139           first_vect = dr_chain[0];
5140           second_vect = dr_chain[1];
5141
5142           /* Create interleaving stmt (low part of):
5143              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5144                                                              ...}>  */
5145           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5146           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5147                                            second_vect, perm3_mask_low);
5148           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5149
5150           /* Create interleaving stmt (high part of):
5151              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5152                                                               ...}>  */
5153           first_vect = data_ref;
5154           second_vect = dr_chain[2];
5155           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5156           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5157                                            second_vect, perm3_mask_high);
5158           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5159           (*result_chain)[k] = data_ref;
5160         }
5161     }
5162   else
5163     {
5164       /* If length is not equal to 3 then only power of 2 is supported.  */
5165       gcc_assert (pow2p_hwi (length));
5166
5167       for (i = 0; i < nelt; ++i)
5168         sel[i] = i * 2;
5169       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5170
5171       for (i = 0; i < nelt; ++i)
5172         sel[i] = i * 2 + 1;
5173       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5174
5175       for (i = 0; i < log_length; i++)
5176         {
5177           for (j = 0; j < length; j += 2)
5178             {
5179               first_vect = dr_chain[j];
5180               second_vect = dr_chain[j+1];
5181
5182               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5183               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5184               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5185                                                first_vect, second_vect,
5186                                                perm_mask_even);
5187               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5188               (*result_chain)[j/2] = data_ref;
5189
5190               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5191               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5192               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5193                                                first_vect, second_vect,
5194                                                perm_mask_odd);
5195               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5196               (*result_chain)[j/2+length/2] = data_ref;
5197             }
5198           memcpy (dr_chain.address (), result_chain->address (),
5199                   length * sizeof (tree));
5200         }
5201     }
5202 }
5203
5204 /* Function vect_shift_permute_load_chain.
5205
5206    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5207    sequence of stmts to reorder the input data accordingly.
5208    Return the final references for loads in RESULT_CHAIN.
5209    Return true if successed, false otherwise.
5210
5211    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5212    The input is 3 vectors each containing 8 elements.  We assign a
5213    number to each element, the input sequence is:
5214
5215    1st vec:   0  1  2  3  4  5  6  7
5216    2nd vec:   8  9 10 11 12 13 14 15
5217    3rd vec:  16 17 18 19 20 21 22 23
5218
5219    The output sequence should be:
5220
5221    1st vec:  0 3 6  9 12 15 18 21
5222    2nd vec:  1 4 7 10 13 16 19 22
5223    3rd vec:  2 5 8 11 14 17 20 23
5224
5225    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5226
5227    First we shuffle all 3 vectors to get correct elements order:
5228
5229    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5230    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5231    3rd vec:  (16 19 22) (17 20 23) (18 21)
5232
5233    Next we unite and shift vector 3 times:
5234
5235    1st step:
5236      shift right by 6 the concatenation of:
5237      "1st vec" and  "2nd vec"
5238        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5239      "2nd vec" and  "3rd vec"
5240        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5241      "3rd vec" and  "1st vec"
5242        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5243                              | New vectors                   |
5244
5245      So that now new vectors are:
5246
5247      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5248      2nd vec:  (10 13) (16 19 22) (17 20 23)
5249      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5250
5251    2nd step:
5252      shift right by 5 the concatenation of:
5253      "1st vec" and  "3rd vec"
5254        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5255      "2nd vec" and  "1st vec"
5256        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5257      "3rd vec" and  "2nd vec"
5258        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5259                           | New vectors                   |
5260
5261      So that now new vectors are:
5262
5263      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5264      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5265      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5266
5267    3rd step:
5268      shift right by 5 the concatenation of:
5269      "1st vec" and  "1st vec"
5270        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5271      shift right by 3 the concatenation of:
5272      "2nd vec" and  "2nd vec"
5273                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5274                           | New vectors                   |
5275
5276      So that now all vectors are READY:
5277      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5278      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5279      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5280
5281    This algorithm is faster than one in vect_permute_load_chain if:
5282      1.  "shift of a concatination" is faster than general permutation.
5283          This is usually so.
5284      2.  The TARGET machine can't execute vector instructions in parallel.
5285          This is because each step of the algorithm depends on previous.
5286          The algorithm in vect_permute_load_chain is much more parallel.
5287
5288    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5289 */
5290
5291 static bool
5292 vect_shift_permute_load_chain (vec<tree> dr_chain,
5293                                unsigned int length,
5294                                gimple *stmt,
5295                                gimple_stmt_iterator *gsi,
5296                                vec<tree> *result_chain)
5297 {
5298   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5299   tree perm2_mask1, perm2_mask2, perm3_mask;
5300   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5301   gimple *perm_stmt;
5302
5303   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5304   unsigned int i;
5305   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5306   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5307   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5308   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5309
5310   result_chain->quick_grow (length);
5311   memcpy (result_chain->address (), dr_chain.address (),
5312           length * sizeof (tree));
5313
5314   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5315     {
5316       unsigned int j, log_length = exact_log2 (length);
5317       for (i = 0; i < nelt / 2; ++i)
5318         sel[i] = i * 2;
5319       for (i = 0; i < nelt / 2; ++i)
5320         sel[nelt / 2 + i] = i * 2 + 1;
5321       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5322         {
5323           if (dump_enabled_p ())
5324             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5325                              "shuffle of 2 fields structure is not \
5326                               supported by target\n");
5327           return false;
5328         }
5329       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5330
5331       for (i = 0; i < nelt / 2; ++i)
5332         sel[i] = i * 2 + 1;
5333       for (i = 0; i < nelt / 2; ++i)
5334         sel[nelt / 2 + i] = i * 2;
5335       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5336         {
5337           if (dump_enabled_p ())
5338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5339                              "shuffle of 2 fields structure is not \
5340                               supported by target\n");
5341           return false;
5342         }
5343       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5344
5345       /* Generating permutation constant to shift all elements.
5346          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5347       for (i = 0; i < nelt; i++)
5348         sel[i] = nelt / 2 + i;
5349       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5350         {
5351           if (dump_enabled_p ())
5352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5353                              "shift permutation is not supported by target\n");
5354           return false;
5355         }
5356       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5357
5358       /* Generating permutation constant to select vector from 2.
5359          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5360       for (i = 0; i < nelt / 2; i++)
5361         sel[i] = i;
5362       for (i = nelt / 2; i < nelt; i++)
5363         sel[i] = nelt + i;
5364       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5365         {
5366           if (dump_enabled_p ())
5367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368                              "select is not supported by target\n");
5369           return false;
5370         }
5371       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5372
5373       for (i = 0; i < log_length; i++)
5374         {
5375           for (j = 0; j < length; j += 2)
5376             {
5377               first_vect = dr_chain[j];
5378               second_vect = dr_chain[j + 1];
5379
5380               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5381               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5382                                                first_vect, first_vect,
5383                                                perm2_mask1);
5384               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5385               vect[0] = data_ref;
5386
5387               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5388               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5389                                                second_vect, second_vect,
5390                                                perm2_mask2);
5391               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392               vect[1] = data_ref;
5393
5394               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5395               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5396                                                vect[0], vect[1], shift1_mask);
5397               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5398               (*result_chain)[j/2 + length/2] = data_ref;
5399
5400               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5401               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5402                                                vect[0], vect[1], select_mask);
5403               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5404               (*result_chain)[j/2] = data_ref;
5405             }
5406           memcpy (dr_chain.address (), result_chain->address (),
5407                   length * sizeof (tree));
5408         }
5409       return true;
5410     }
5411   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5412     {
5413       unsigned int k = 0, l = 0;
5414
5415       /* Generating permutation constant to get all elements in rigth order.
5416          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5417       for (i = 0; i < nelt; i++)
5418         {
5419           if (3 * k + (l % 3) >= nelt)
5420             {
5421               k = 0;
5422               l += (3 - (nelt % 3));
5423             }
5424           sel[i] = 3 * k + (l % 3);
5425           k++;
5426         }
5427       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5428         {
5429           if (dump_enabled_p ())
5430             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5431                              "shuffle of 3 fields structure is not \
5432                               supported by target\n");
5433           return false;
5434         }
5435       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5436
5437       /* Generating permutation constant to shift all elements.
5438          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5439       for (i = 0; i < nelt; i++)
5440         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5441       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5442         {
5443           if (dump_enabled_p ())
5444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5445                              "shift permutation is not supported by target\n");
5446           return false;
5447         }
5448       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5449
5450       /* Generating permutation constant to shift all elements.
5451          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5452       for (i = 0; i < nelt; i++)
5453         sel[i] = 2 * (nelt / 3) + 1 + i;
5454       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5455         {
5456           if (dump_enabled_p ())
5457             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5458                              "shift permutation is not supported by target\n");
5459           return false;
5460         }
5461       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5462
5463       /* Generating permutation constant to shift all elements.
5464          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5465       for (i = 0; i < nelt; i++)
5466         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5467       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5468         {
5469           if (dump_enabled_p ())
5470             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5471                              "shift permutation is not supported by target\n");
5472           return false;
5473         }
5474       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5475
5476       /* Generating permutation constant to shift all elements.
5477          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5478       for (i = 0; i < nelt; i++)
5479         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5480       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5481         {
5482           if (dump_enabled_p ())
5483             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484                              "shift permutation is not supported by target\n");
5485           return false;
5486         }
5487       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5488
5489       for (k = 0; k < 3; k++)
5490         {
5491           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5492           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5493                                            dr_chain[k], dr_chain[k],
5494                                            perm3_mask);
5495           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5496           vect[k] = data_ref;
5497         }
5498
5499       for (k = 0; k < 3; k++)
5500         {
5501           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5502           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5503                                            vect[k % 3], vect[(k + 1) % 3],
5504                                            shift1_mask);
5505           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5506           vect_shift[k] = data_ref;
5507         }
5508
5509       for (k = 0; k < 3; k++)
5510         {
5511           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5512           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513                                            vect_shift[(4 - k) % 3],
5514                                            vect_shift[(3 - k) % 3],
5515                                            shift2_mask);
5516           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517           vect[k] = data_ref;
5518         }
5519
5520       (*result_chain)[3 - (nelt % 3)] = vect[2];
5521
5522       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5523       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5524                                        vect[0], shift3_mask);
5525       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5526       (*result_chain)[nelt % 3] = data_ref;
5527
5528       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5529       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5530                                        vect[1], shift4_mask);
5531       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5532       (*result_chain)[0] = data_ref;
5533       return true;
5534     }
5535   return false;
5536 }
5537
5538 /* Function vect_transform_grouped_load.
5539
5540    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5541    to perform their permutation and ascribe the result vectorized statements to
5542    the scalar statements.
5543 */
5544
5545 void
5546 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5547                              gimple_stmt_iterator *gsi)
5548 {
5549   machine_mode mode;
5550   vec<tree> result_chain = vNULL;
5551
5552   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5553      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5554      vectors, that are ready for vector computation.  */
5555   result_chain.create (size);
5556
5557   /* If reassociation width for vector type is 2 or greater target machine can
5558      execute 2 or more vector instructions in parallel.  Otherwise try to
5559      get chain for loads group using vect_shift_permute_load_chain.  */
5560   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5561   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5562       || pow2p_hwi (size)
5563       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5564                                          gsi, &result_chain))
5565     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5566   vect_record_grouped_load_vectors (stmt, result_chain);
5567   result_chain.release ();
5568 }
5569
5570 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5571    generated as part of the vectorization of STMT.  Assign the statement
5572    for each vector to the associated scalar statement.  */
5573
5574 void
5575 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5576 {
5577   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5578   gimple *next_stmt, *new_stmt;
5579   unsigned int i, gap_count;
5580   tree tmp_data_ref;
5581
5582   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5583      Since we scan the chain starting from it's first node, their order
5584      corresponds the order of data-refs in RESULT_CHAIN.  */
5585   next_stmt = first_stmt;
5586   gap_count = 1;
5587   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5588     {
5589       if (!next_stmt)
5590         break;
5591
5592       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5593        code elimination pass later.  No need to check for the first stmt in
5594        the group, since it always exists.
5595        GROUP_GAP is the number of steps in elements from the previous
5596        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5597        correspond to the gaps.  */
5598       if (next_stmt != first_stmt
5599           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5600       {
5601         gap_count++;
5602         continue;
5603       }
5604
5605       while (next_stmt)
5606         {
5607           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5608           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5609              copies, and we put the new vector statement in the first available
5610              RELATED_STMT.  */
5611           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5612             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5613           else
5614             {
5615               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5616                 {
5617                   gimple *prev_stmt =
5618                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5619                   gimple *rel_stmt =
5620                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5621                   while (rel_stmt)
5622                     {
5623                       prev_stmt = rel_stmt;
5624                       rel_stmt =
5625                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5626                     }
5627
5628                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5629                     new_stmt;
5630                 }
5631             }
5632
5633           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5634           gap_count = 1;
5635           /* If NEXT_STMT accesses the same DR as the previous statement,
5636              put the same TMP_DATA_REF as its vectorized statement; otherwise
5637              get the next data-ref from RESULT_CHAIN.  */
5638           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5639             break;
5640         }
5641     }
5642 }
5643
5644 /* Function vect_force_dr_alignment_p.
5645
5646    Returns whether the alignment of a DECL can be forced to be aligned
5647    on ALIGNMENT bit boundary.  */
5648
5649 bool
5650 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5651 {
5652   if (!VAR_P (decl))
5653     return false;
5654
5655   if (decl_in_symtab_p (decl)
5656       && !symtab_node::get (decl)->can_increase_alignment_p ())
5657     return false;
5658
5659   if (TREE_STATIC (decl))
5660     return (alignment <= MAX_OFILE_ALIGNMENT);
5661   else
5662     return (alignment <= MAX_STACK_ALIGNMENT);
5663 }
5664
5665
5666 /* Return whether the data reference DR is supported with respect to its
5667    alignment.
5668    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5669    it is aligned, i.e., check if it is possible to vectorize it with different
5670    alignment.  */
5671
5672 enum dr_alignment_support
5673 vect_supportable_dr_alignment (struct data_reference *dr,
5674                                bool check_aligned_accesses)
5675 {
5676   gimple *stmt = DR_STMT (dr);
5677   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5678   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5679   machine_mode mode = TYPE_MODE (vectype);
5680   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5681   struct loop *vect_loop = NULL;
5682   bool nested_in_vect_loop = false;
5683
5684   if (aligned_access_p (dr) && !check_aligned_accesses)
5685     return dr_aligned;
5686
5687   /* For now assume all conditional loads/stores support unaligned
5688      access without any special code.  */
5689   if (is_gimple_call (stmt)
5690       && gimple_call_internal_p (stmt)
5691       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5692           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5693     return dr_unaligned_supported;
5694
5695   if (loop_vinfo)
5696     {
5697       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5698       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5699     }
5700
5701   /* Possibly unaligned access.  */
5702
5703   /* We can choose between using the implicit realignment scheme (generating
5704      a misaligned_move stmt) and the explicit realignment scheme (generating
5705      aligned loads with a REALIGN_LOAD).  There are two variants to the
5706      explicit realignment scheme: optimized, and unoptimized.
5707      We can optimize the realignment only if the step between consecutive
5708      vector loads is equal to the vector size.  Since the vector memory
5709      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5710      is guaranteed that the misalignment amount remains the same throughout the
5711      execution of the vectorized loop.  Therefore, we can create the
5712      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5713      at the loop preheader.
5714
5715      However, in the case of outer-loop vectorization, when vectorizing a
5716      memory access in the inner-loop nested within the LOOP that is now being
5717      vectorized, while it is guaranteed that the misalignment of the
5718      vectorized memory access will remain the same in different outer-loop
5719      iterations, it is *not* guaranteed that is will remain the same throughout
5720      the execution of the inner-loop.  This is because the inner-loop advances
5721      with the original scalar step (and not in steps of VS).  If the inner-loop
5722      step happens to be a multiple of VS, then the misalignment remains fixed
5723      and we can use the optimized realignment scheme.  For example:
5724
5725       for (i=0; i<N; i++)
5726         for (j=0; j<M; j++)
5727           s += a[i+j];
5728
5729      When vectorizing the i-loop in the above example, the step between
5730      consecutive vector loads is 1, and so the misalignment does not remain
5731      fixed across the execution of the inner-loop, and the realignment cannot
5732      be optimized (as illustrated in the following pseudo vectorized loop):
5733
5734       for (i=0; i<N; i+=4)
5735         for (j=0; j<M; j++){
5736           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5737                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5738                          // (assuming that we start from an aligned address).
5739           }
5740
5741      We therefore have to use the unoptimized realignment scheme:
5742
5743       for (i=0; i<N; i+=4)
5744           for (j=k; j<M; j+=4)
5745           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5746                            // that the misalignment of the initial address is
5747                            // 0).
5748
5749      The loop can then be vectorized as follows:
5750
5751       for (k=0; k<4; k++){
5752         rt = get_realignment_token (&vp[k]);
5753         for (i=0; i<N; i+=4){
5754           v1 = vp[i+k];
5755           for (j=k; j<M; j+=4){
5756             v2 = vp[i+j+VS-1];
5757             va = REALIGN_LOAD <v1,v2,rt>;
5758             vs += va;
5759             v1 = v2;
5760           }
5761         }
5762     } */
5763
5764   if (DR_IS_READ (dr))
5765     {
5766       bool is_packed = false;
5767       tree type = (TREE_TYPE (DR_REF (dr)));
5768
5769       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5770           && (!targetm.vectorize.builtin_mask_for_load
5771               || targetm.vectorize.builtin_mask_for_load ()))
5772         {
5773           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5774
5775           /* If we are doing SLP then the accesses need not have the
5776              same alignment, instead it depends on the SLP group size.  */
5777           if (loop_vinfo
5778               && STMT_SLP_TYPE (stmt_info)
5779               && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5780                   * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5781                   % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5782             ;
5783           else if (!loop_vinfo
5784                    || (nested_in_vect_loop
5785                        && (TREE_INT_CST_LOW (DR_STEP (dr))
5786                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5787             return dr_explicit_realign;
5788           else
5789             return dr_explicit_realign_optimized;
5790         }
5791       if (!known_alignment_for_access_p (dr))
5792         is_packed = not_size_aligned (DR_REF (dr));
5793
5794       if (targetm.vectorize.support_vector_misalignment
5795             (mode, type, DR_MISALIGNMENT (dr), is_packed))
5796         /* Can't software pipeline the loads, but can at least do them.  */
5797         return dr_unaligned_supported;
5798     }
5799   else
5800     {
5801       bool is_packed = false;
5802       tree type = (TREE_TYPE (DR_REF (dr)));
5803
5804       if (!known_alignment_for_access_p (dr))
5805         is_packed = not_size_aligned (DR_REF (dr));
5806
5807      if (targetm.vectorize.support_vector_misalignment
5808            (mode, type, DR_MISALIGNMENT (dr), is_packed))
5809        return dr_unaligned_supported;
5810     }
5811
5812   /* Unsupported.  */
5813   return dr_unaligned_unsupported;
5814 }