gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55
  56 /* Return true if load- or store-lanes optab OPTAB is implemented for
  57    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  58
  59 static bool
  60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  61                               tree vectype, unsigned HOST_WIDE_INT count)
  62 {
  63   machine_mode mode;
  64   scalar_int_mode array_mode;
  65   bool limit_p;
  66
  67   mode = TYPE_MODE (vectype);
  68   limit_p = !targetm.array_mode_supported_p (mode, count);
  69   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  70                           limit_p).exists (&array_mode))
  71     {
  72       if (dump_enabled_p ())
  73         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  75                          GET_MODE_NAME (mode), count);
  76       return false;
  77     }
  78
  79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  80     {
  81       if (dump_enabled_p ())
  82         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                          "cannot use %s<%s><%s>\n", name,
  84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  85       return false;
  86     }
  87
  88   if (dump_enabled_p ())
  89     dump_printf_loc (MSG_NOTE, vect_location,
  90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  91                      GET_MODE_NAME (mode));
  92
  93   return true;
  94 }
  95
  96
  97 /* Return the smallest scalar part of STMT.
  98    This is used to determine the vectype of the stmt.  We generally set the
  99    vectype according to the type of the result (lhs).  For stmts whose
 100    result-type is different than the type of the arguments (e.g., demotion,
 101    promotion), vectype will be reset appropriately (later).  Note that we have
 102    to visit the smallest datatype in this function, because that determines the
 103    VF.  If the smallest datatype in the loop is present only as the rhs of a
 104    promotion operation - we'd miss it.
 105    Such a case, where a variable of this datatype does not appear in the lhs
 106    anywhere in the loop, can only occur if it's an invariant: e.g.:
 107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 108    invariant motion.  However, we cannot rely on invariant motion to always
 109    take invariants out of the loop, and so in the case of promotion we also
 110    have to check the rhs.
 111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 112    types.  */
 113
 114 tree
 115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 116                                HOST_WIDE_INT *rhs_size_unit)
 117 {
 118   tree scalar_type = gimple_expr_type (stmt);
 119   HOST_WIDE_INT lhs, rhs;
 120
 121   /* During the analysis phase, this function is called on arbitrary
 122      statements that might not have scalar results.  */
 123   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 124     return scalar_type;
 125
 126   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 127
 128   if (is_gimple_assign (stmt)
 129       && (gimple_assign_cast_p (stmt)
 130           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 131           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 132           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 133     {
 134       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 135
 136       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 137       if (rhs < lhs)
 138         scalar_type = rhs_type;
 139     }
 140
 141   *lhs_size_unit = lhs;
 142   *rhs_size_unit = rhs;
 143   return scalar_type;
 144 }
 145
 146
 147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 148    tested at run-time.  Return TRUE if DDR was successfully inserted.
 149    Return false if versioning is not supported.  */
 150
 151 static bool
 152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 153 {
 154   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 155
 156   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 157     return false;
 158
 159   if (!runtime_alias_check_p (ddr, loop,
 160                               optimize_loop_nest_for_speed_p (loop)))
 161     return false;
 162
 163   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 164   return true;
 165 }
 166
 167
 168 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 169    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 170    distances.  These distances are conservatively correct but they don't
 171    reflect a guaranteed dependence.
 172
 173    Return true if this function does all the work necessary to avoid
 174    an alias or false if the caller should use the dependence distances
 175    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 176    the depth of the loop described by LOOP_VINFO and the other arguments
 177    are as for vect_analyze_data_ref_dependence.  */
 178
 179 static bool
 180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 181                                        loop_vec_info loop_vinfo,
 182                                        int loop_depth, int *max_vf)
 183 {
 184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 185   lambda_vector dist_v;
 186   unsigned int i;
 187   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 188     {
 189       int dist = dist_v[loop_depth];
 190       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 191         {
 192           /* If the user asserted safelen >= DIST consecutive iterations
 193              can be executed concurrently, assume independence.
 194
 195              ??? An alternative would be to add the alias check even
 196              in this case, and vectorize the fallback loop with the
 197              maximum VF set to safelen.  However, if the user has
 198              explicitly given a length, it's less likely that that
 199              would be a win.  */
 200           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 201             {
 202               if (loop->safelen < *max_vf)
 203                 *max_vf = loop->safelen;
 204               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 205               continue;
 206             }
 207
 208           /* For dependence distances of 2 or more, we have the option
 209              of limiting VF or checking for an alias at runtime.
 210              Prefer to check at runtime if we can, to avoid limiting
 211              the VF unnecessarily when the bases are in fact independent.
 212
 213              Note that the alias checks will be removed if the VF ends up
 214              being small enough.  */
 215           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 216         }
 217     }
 218   return true;
 219 }
 220
 221
 222 /* Function vect_analyze_data_ref_dependence.
 223
 224    Return TRUE if there (might) exist a dependence between a memory-reference
 225    DRA and a memory-reference DRB.  When versioning for alias may check a
 226    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 227    the data dependence.  */
 228
 229 static bool
 230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 231                                   loop_vec_info loop_vinfo, int *max_vf)
 232 {
 233   unsigned int i;
 234   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 235   struct data_reference *dra = DDR_A (ddr);
 236   struct data_reference *drb = DDR_B (ddr);
 237   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 238   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 239   lambda_vector dist_v;
 240   unsigned int loop_depth;
 241
 242   /* In loop analysis all data references should be vectorizable.  */
 243   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 244       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 245     gcc_unreachable ();
 246
 247   /* Independent data accesses.  */
 248   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 249     return false;
 250
 251   if (dra == drb
 252       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 253     return false;
 254
 255   /* We do not have to consider dependences between accesses that belong
 256      to the same group.  */
 257   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 258       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 259     return false;
 260
 261   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 262      least two scalar iterations, there is always also a true dependence.
 263      As the vectorizer does not re-order loads and stores we can ignore
 264      the anti-dependence if TBAA can disambiguate both DRs similar to the
 265      case with known negative distance anti-dependences (positive
 266      distance anti-dependences would violate TBAA constraints).  */
 267   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 268        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 269       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 270                                  get_alias_set (DR_REF (drb))))
 271     return false;
 272
 273   /* Unknown data dependence.  */
 274   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 275     {
 276       /* If user asserted safelen consecutive iterations can be
 277          executed concurrently, assume independence.  */
 278       if (loop->safelen >= 2)
 279         {
 280           if (loop->safelen < *max_vf)
 281             *max_vf = loop->safelen;
 282           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 283           return false;
 284         }
 285
 286       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 287           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 288         {
 289           if (dump_enabled_p ())
 290             {
 291               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 292                                "versioning for alias not supported for: "
 293                                "can't determine dependence between ");
 294               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 295                                  DR_REF (dra));
 296               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 297               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 298                                  DR_REF (drb));
 299               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 300             }
 301           return true;
 302         }
 303
 304       if (dump_enabled_p ())
 305         {
 306           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 307                            "versioning for alias required: "
 308                            "can't determine dependence between ");
 309           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 310                              DR_REF (dra));
 311           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 312           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 313                              DR_REF (drb));
 314           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 315         }
 316
 317       /* Add to list of ddrs that need to be tested at run-time.  */
 318       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 319     }
 320
 321   /* Known data dependence.  */
 322   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 323     {
 324       /* If user asserted safelen consecutive iterations can be
 325          executed concurrently, assume independence.  */
 326       if (loop->safelen >= 2)
 327         {
 328           if (loop->safelen < *max_vf)
 329             *max_vf = loop->safelen;
 330           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 331           return false;
 332         }
 333
 334       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 335           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 336         {
 337           if (dump_enabled_p ())
 338             {
 339               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                "versioning for alias not supported for: "
 341                                "bad dist vector for ");
 342               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                  DR_REF (dra));
 344               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 345               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 346                                  DR_REF (drb));
 347               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 348             }
 349           return true;
 350         }
 351
 352       if (dump_enabled_p ())
 353         {
 354           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 355                            "versioning for alias required: "
 356                            "bad dist vector for ");
 357           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 358           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 360           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 361         }
 362       /* Add to list of ddrs that need to be tested at run-time.  */
 363       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 364     }
 365
 366   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 367
 368   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 369       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 370                                                 loop_depth, max_vf))
 371     return false;
 372
 373   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 374     {
 375       int dist = dist_v[loop_depth];
 376
 377       if (dump_enabled_p ())
 378         dump_printf_loc (MSG_NOTE, vect_location,
 379                          "dependence distance  = %d.\n", dist);
 380
 381       if (dist == 0)
 382         {
 383           if (dump_enabled_p ())
 384             {
 385               dump_printf_loc (MSG_NOTE, vect_location,
 386                                "dependence distance == 0 between ");
 387               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 388               dump_printf (MSG_NOTE, " and ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 390               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 391             }
 392
 393           /* When we perform grouped accesses and perform implicit CSE
 394              by detecting equal accesses and doing disambiguation with
 395              runtime alias tests like for
 396                 .. = a[i];
 397                 .. = a[i+1];
 398                 a[i] = ..;
 399                 a[i+1] = ..;
 400                 *p = ..;
 401                 .. = a[i];
 402                 .. = a[i+1];
 403              where we will end up loading { a[i], a[i+1] } once, make
 404              sure that inserting group loads before the first load and
 405              stores after the last store will do the right thing.
 406              Similar for groups like
 407                 a[i] = ...;
 408                 ... = a[i];
 409                 a[i+1] = ...;
 410              where loads from the group interleave with the store.  */
 411           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 412               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 413             {
 414               gimple *earlier_stmt;
 415               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 416               if (DR_IS_WRITE
 417                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 418                 {
 419                   if (dump_enabled_p ())
 420                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 421                                      "READ_WRITE dependence in interleaving."
 422                                      "\n");
 423                   return true;
 424                 }
 425             }
 426
 427           continue;
 428         }
 429
 430       if (dist > 0 && DDR_REVERSED_P (ddr))
 431         {
 432           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 433              reversed (to make distance vector positive), and the actual
 434              distance is negative.  */
 435           if (dump_enabled_p ())
 436             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 437                              "dependence distance negative.\n");
 438           /* Record a negative dependence distance to later limit the
 439              amount of stmt copying / unrolling we can perform.
 440              Only need to handle read-after-write dependence.  */
 441           if (DR_IS_READ (drb)
 442               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 443                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 444             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 445           continue;
 446         }
 447
 448       if (abs (dist) >= 2
 449           && abs (dist) < *max_vf)
 450         {
 451           /* The dependence distance requires reduction of the maximal
 452              vectorization factor.  */
 453           *max_vf = abs (dist);
 454           if (dump_enabled_p ())
 455             dump_printf_loc (MSG_NOTE, vect_location,
 456                              "adjusting maximal vectorization factor to %i\n",
 457                              *max_vf);
 458         }
 459
 460       if (abs (dist) >= *max_vf)
 461         {
 462           /* Dependence distance does not create dependence, as far as
 463              vectorization is concerned, in this case.  */
 464           if (dump_enabled_p ())
 465             dump_printf_loc (MSG_NOTE, vect_location,
 466                              "dependence distance >= VF.\n");
 467           continue;
 468         }
 469
 470       if (dump_enabled_p ())
 471         {
 472           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 473                        "not vectorized, possible dependence "
 474                        "between data-refs ");
 475           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 476           dump_printf (MSG_NOTE,  " and ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 478           dump_printf (MSG_NOTE,  "\n");
 479         }
 480
 481       return true;
 482     }
 483
 484   return false;
 485 }
 486
 487 /* Function vect_analyze_data_ref_dependences.
 488
 489    Examine all the data references in the loop, and make sure there do not
 490    exist any data dependences between them.  Set *MAX_VF according to
 491    the maximum vectorization factor the data dependences allow.  */
 492
 493 bool
 494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 495 {
 496   unsigned int i;
 497   struct data_dependence_relation *ddr;
 498
 499   if (dump_enabled_p ())
 500     dump_printf_loc (MSG_NOTE, vect_location,
 501                      "=== vect_analyze_data_ref_dependences ===\n");
 502
 503   LOOP_VINFO_DDRS (loop_vinfo)
 504     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 505              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 506   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 507   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 508   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 509                                 &LOOP_VINFO_DDRS (loop_vinfo),
 510                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 511     return false;
 512
 513   /* For epilogues we either have no aliases or alias versioning
 514      was applied to original loop.  Therefore we may just get max_vf
 515      using VF of original loop.  */
 516   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 517     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 518   else
 519     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 520       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 521         return false;
 522
 523   return true;
 524 }
 525
 526
 527 /* Function vect_slp_analyze_data_ref_dependence.
 528
 529    Return TRUE if there (might) exist a dependence between a memory-reference
 530    DRA and a memory-reference DRB.  When versioning for alias may check a
 531    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 532    the data dependence.  */
 533
 534 static bool
 535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 536 {
 537   struct data_reference *dra = DDR_A (ddr);
 538   struct data_reference *drb = DDR_B (ddr);
 539
 540   /* We need to check dependences of statements marked as unvectorizable
 541      as well, they still can prohibit vectorization.  */
 542
 543   /* Independent data accesses.  */
 544   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 545     return false;
 546
 547   if (dra == drb)
 548     return false;
 549
 550   /* Read-read is OK.  */
 551   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 552     return false;
 553
 554   /* If dra and drb are part of the same interleaving chain consider
 555      them independent.  */
 556   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 557       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 558           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 559     return false;
 560
 561   /* Unknown data dependence.  */
 562   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 563     {
 564       if  (dump_enabled_p ())
 565         {
 566           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 567                            "can't determine dependence between ");
 568           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 569           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 570           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 571           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 572         }
 573     }
 574   else if (dump_enabled_p ())
 575     {
 576       dump_printf_loc (MSG_NOTE, vect_location,
 577                        "determined dependence between ");
 578       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 579       dump_printf (MSG_NOTE, " and ");
 580       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 581       dump_printf (MSG_NOTE,  "\n");
 582     }
 583
 584   return true;
 585 }
 586
 587
 588 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 589    contain the vector of scalar stores of this instance if we are
 590    disambiguating the loads.  */
 591
 592 static bool
 593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 594                                    vec<gimple *> stores, gimple *last_store)
 595 {
 596   /* This walks over all stmts involved in the SLP load/store done
 597      in NODE verifying we can sink them up to the last stmt in the
 598      group.  */
 599   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 600   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 601     {
 602       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 603       if (access == last_access)
 604         continue;
 605       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 606       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 607            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 608         {
 609           gimple *stmt = gsi_stmt (gsi);
 610           if (! gimple_vuse (stmt)
 611               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 612             continue;
 613
 614           /* If we couldn't record a (single) data reference for this
 615              stmt we have to give up.  */
 616           /* ???  Here and below if dependence analysis fails we can resort
 617              to the alias oracle which can handle more kinds of stmts.  */
 618           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 619           if (!dr_b)
 620             return false;
 621
 622           bool dependent = false;
 623           /* If we run into a store of this same instance (we've just
 624              marked those) then delay dependence checking until we run
 625              into the last store because this is where it will have
 626              been sunk to (and we verify if we can do that as well).  */
 627           if (gimple_visited_p (stmt))
 628             {
 629               if (stmt != last_store)
 630                 continue;
 631               unsigned i;
 632               gimple *store;
 633               FOR_EACH_VEC_ELT (stores, i, store)
 634                 {
 635                   data_reference *store_dr
 636                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 637                   ddr_p ddr = initialize_data_dependence_relation
 638                                 (dr_a, store_dr, vNULL);
 639                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 640                   free_dependence_relation (ddr);
 641                   if (dependent)
 642                     break;
 643                 }
 644             }
 645           else
 646             {
 647               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 648                                                                dr_b, vNULL);
 649               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 650               free_dependence_relation (ddr);
 651             }
 652           if (dependent)
 653             return false;
 654         }
 655     }
 656   return true;
 657 }
 658
 659
 660 /* Function vect_analyze_data_ref_dependences.
 661
 662    Examine all the data references in the basic-block, and make sure there
 663    do not exist any data dependences between them.  Set *MAX_VF according to
 664    the maximum vectorization factor the data dependences allow.  */
 665
 666 bool
 667 vect_slp_analyze_instance_dependence (slp_instance instance)
 668 {
 669   if (dump_enabled_p ())
 670     dump_printf_loc (MSG_NOTE, vect_location,
 671                      "=== vect_slp_analyze_instance_dependence ===\n");
 672
 673   /* The stores of this instance are at the root of the SLP tree.  */
 674   slp_tree store = SLP_INSTANCE_TREE (instance);
 675   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 676     store = NULL;
 677
 678   /* Verify we can sink stores to the vectorized stmt insert location.  */
 679   gimple *last_store = NULL;
 680   if (store)
 681     {
 682       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 683         return false;
 684
 685       /* Mark stores in this instance and remember the last one.  */
 686       last_store = vect_find_last_scalar_stmt_in_slp (store);
 687       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 688         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 689     }
 690
 691   bool res = true;
 692
 693   /* Verify we can sink loads to the vectorized stmt insert location,
 694      special-casing stores of this instance.  */
 695   slp_tree load;
 696   unsigned int i;
 697   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 698     if (! vect_slp_analyze_node_dependences (instance, load,
 699                                              store
 700                                              ? SLP_TREE_SCALAR_STMTS (store)
 701                                              : vNULL, last_store))
 702       {
 703         res = false;
 704         break;
 705       }
 706
 707   /* Unset the visited flag.  */
 708   if (store)
 709     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 710       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 711
 712   return res;
 713 }
 714
 715 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 716    the statement that contains DRB, which is useful for recording in the
 717    dump file.  */
 718
 719 static void
 720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 721                             innermost_loop_behavior *drb)
 722 {
 723   bool existed;
 724   innermost_loop_behavior *&entry
 725     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 726   if (!existed || entry->base_alignment < drb->base_alignment)
 727     {
 728       entry = drb;
 729       if (dump_enabled_p ())
 730         {
 731           dump_printf_loc (MSG_NOTE, vect_location,
 732                            "recording new base alignment for ");
 733           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 734           dump_printf (MSG_NOTE, "\n");
 735           dump_printf_loc (MSG_NOTE, vect_location,
 736                            "  alignment:    %d\n", drb->base_alignment);
 737           dump_printf_loc (MSG_NOTE, vect_location,
 738                            "  misalignment: %d\n", drb->base_misalignment);
 739           dump_printf_loc (MSG_NOTE, vect_location,
 740                            "  based on:     ");
 741           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 742         }
 743     }
 744 }
 745
 746 /* If the region we're going to vectorize is reached, all unconditional
 747    data references occur at least once.  We can therefore pool the base
 748    alignment guarantees from each unconditional reference.  Do this by
 749    going through all the data references in VINFO and checking whether
 750    the containing statement makes the reference unconditionally.  If so,
 751    record the alignment of the base address in VINFO so that it can be
 752    used for all other references with the same base.  */
 753
 754 void
 755 vect_record_base_alignments (vec_info *vinfo)
 756 {
 757   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 758   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 759   data_reference *dr;
 760   unsigned int i;
 761   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 762     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 763       {
 764         gimple *stmt = DR_STMT (dr);
 765         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 766
 767         /* If DR is nested in the loop that is being vectorized, we can also
 768            record the alignment of the base wrt the outer loop.  */
 769         if (loop && nested_in_vect_loop_p (loop, stmt))
 770           {
 771             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 772             vect_record_base_alignment
 773               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 774           }
 775       }
 776 }
 777
 778 /* Function vect_compute_data_ref_alignment
 779
 780    Compute the misalignment of the data reference DR.
 781
 782    Output:
 783    1. If during the misalignment computation it is found that the data reference
 784       cannot be vectorized then false is returned.
 785    2. DR_MISALIGNMENT (DR) is defined.
 786
 787    FOR NOW: No analysis is actually performed. Misalignment is calculated
 788    only for trivial cases. TODO.  */
 789
 790 bool
 791 vect_compute_data_ref_alignment (struct data_reference *dr)
 792 {
 793   gimple *stmt = DR_STMT (dr);
 794   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 795   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 796   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 797   struct loop *loop = NULL;
 798   tree ref = DR_REF (dr);
 799   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 800
 801   if (dump_enabled_p ())
 802     dump_printf_loc (MSG_NOTE, vect_location,
 803                      "vect_compute_data_ref_alignment:\n");
 804
 805   if (loop_vinfo)
 806     loop = LOOP_VINFO_LOOP (loop_vinfo);
 807
 808   /* Initialize misalignment to unknown.  */
 809   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 810
 811   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 812   bool step_preserves_misalignment_p;
 813
 814   /* No step for BB vectorization.  */
 815   if (!loop)
 816     {
 817       gcc_assert (integer_zerop (drb->step));
 818       step_preserves_misalignment_p = true;
 819     }
 820
 821   /* In case the dataref is in an inner-loop of the loop that is being
 822      vectorized (LOOP), we use the base and misalignment information
 823      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 824      stays the same throughout the execution of the inner-loop, which is why
 825      we have to check that the stride of the dataref in the inner-loop evenly
 826      divides by the vector size.  */
 827   else if (nested_in_vect_loop_p (loop, stmt))
 828     {
 829       step_preserves_misalignment_p
 830         = (DR_STEP_ALIGNMENT (dr)
 831            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 832
 833       if (dump_enabled_p ())
 834         {
 835           if (step_preserves_misalignment_p)
 836             dump_printf_loc (MSG_NOTE, vect_location,
 837                              "inner step divides the vector-size.\n");
 838           else
 839             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 840                              "inner step doesn't divide the vector-size.\n");
 841         }
 842     }
 843
 844   /* Similarly we can only use base and misalignment information relative to
 845      an innermost loop if the misalignment stays the same throughout the
 846      execution of the loop.  As above, this is the case if the stride of
 847      the dataref evenly divides by the vector size.  */
 848   else
 849     {
 850       unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 851       step_preserves_misalignment_p
 852         = ((DR_STEP_ALIGNMENT (dr) * vf)
 853            % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
 854
 855       if (!step_preserves_misalignment_p && dump_enabled_p ())
 856         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 857                          "step doesn't divide the vector-size.\n");
 858     }
 859
 860   unsigned int base_alignment = drb->base_alignment;
 861   unsigned int base_misalignment = drb->base_misalignment;
 862   unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
 863
 864   /* Calculate the maximum of the pooled base address alignment and the
 865      alignment that we can compute for DR itself.  */
 866   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 867   if (entry && base_alignment < (*entry)->base_alignment)
 868     {
 869       base_alignment = (*entry)->base_alignment;
 870       base_misalignment = (*entry)->base_misalignment;
 871     }
 872
 873   if (drb->offset_alignment < vector_alignment
 874       || !step_preserves_misalignment_p
 875       /* We need to know whether the step wrt the vectorized loop is
 876          negative when computing the starting misalignment below.  */
 877       || TREE_CODE (drb->step) != INTEGER_CST)
 878     {
 879       if (dump_enabled_p ())
 880         {
 881           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 882                            "Unknown alignment for access: ");
 883           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 884           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 885         }
 886       return true;
 887     }
 888
 889   if (base_alignment < vector_alignment)
 890     {
 891       tree base = drb->base_address;
 892       if (TREE_CODE (base) == ADDR_EXPR)
 893         base = TREE_OPERAND (base, 0);
 894       if (!vect_can_force_dr_alignment_p (base,
 895                                           vector_alignment * BITS_PER_UNIT))
 896         {
 897           if (dump_enabled_p ())
 898             {
 899               dump_printf_loc (MSG_NOTE, vect_location,
 900                                "can't force alignment of ref: ");
 901               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 902               dump_printf (MSG_NOTE, "\n");
 903             }
 904           return true;
 905         }
 906
 907       if (DECL_USER_ALIGN (base))
 908         {
 909           if (dump_enabled_p ())
 910             {
 911               dump_printf_loc (MSG_NOTE, vect_location,
 912                                "not forcing alignment of user-aligned "
 913                                "variable: ");
 914               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 915               dump_printf (MSG_NOTE, "\n");
 916             }
 917           return true;
 918         }
 919
 920       /* Force the alignment of the decl.
 921          NOTE: This is the only change to the code we make during
 922          the analysis phase, before deciding to vectorize the loop.  */
 923       if (dump_enabled_p ())
 924         {
 925           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 926           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 927           dump_printf (MSG_NOTE, "\n");
 928         }
 929
 930       DR_VECT_AUX (dr)->base_decl = base;
 931       DR_VECT_AUX (dr)->base_misaligned = true;
 932       base_misalignment = 0;
 933     }
 934   unsigned int misalignment = (base_misalignment
 935                                + TREE_INT_CST_LOW (drb->init));
 936
 937   /* If this is a backward running DR then first access in the larger
 938      vectype actually is N-1 elements before the address in the DR.
 939      Adjust misalign accordingly.  */
 940   if (tree_int_cst_sgn (drb->step) < 0)
 941     /* PLUS because STEP is negative.  */
 942     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 943                      * TREE_INT_CST_LOW (drb->step));
 944
 945   SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
 946
 947   if (dump_enabled_p ())
 948     {
 949       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 950                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 951       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 952       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 953     }
 954
 955   return true;
 956 }
 957
 958
 959 /* Function vect_update_misalignment_for_peel.
 960    Sets DR's misalignment
 961    - to 0 if it has the same alignment as DR_PEEL,
 962    - to the misalignment computed using NPEEL if DR's salignment is known,
 963    - to -1 (unknown) otherwise.
 964
 965    DR - the data reference whose misalignment is to be adjusted.
 966    DR_PEEL - the data reference whose misalignment is being made
 967              zero in the vector loop by the peel.
 968    NPEEL - the number of iterations in the peel loop if the misalignment
 969            of DR_PEEL is known at compile time.  */
 970
 971 static void
 972 vect_update_misalignment_for_peel (struct data_reference *dr,
 973                                    struct data_reference *dr_peel, int npeel)
 974 {
 975   unsigned int i;
 976   vec<dr_p> same_aligned_drs;
 977   struct data_reference *current_dr;
 978   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 979   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 980   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 981   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 982
 983  /* For interleaved data accesses the step in the loop must be multiplied by
 984      the size of the interleaving group.  */
 985   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 986     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 987   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 988     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 989
 990   /* It can be assumed that the data refs with the same alignment as dr_peel
 991      are aligned in the vector loop.  */
 992   same_aligned_drs
 993     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 994   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
 995     {
 996       if (current_dr != dr)
 997         continue;
 998       gcc_assert (!known_alignment_for_access_p (dr)
 999                   || !known_alignment_for_access_p (dr_peel)
1000                   || (DR_MISALIGNMENT (dr) / dr_size
1001                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1002       SET_DR_MISALIGNMENT (dr, 0);
1003       return;
1004     }
1005
1006   if (known_alignment_for_access_p (dr)
1007       && known_alignment_for_access_p (dr_peel))
1008     {
1009       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1010       int misal = DR_MISALIGNMENT (dr);
1011       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1012       misal += negative ? -npeel * dr_size : npeel * dr_size;
1013       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1014       SET_DR_MISALIGNMENT (dr, misal);
1015       return;
1016     }
1017
1018   if (dump_enabled_p ())
1019     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1020                      "to unknown (-1).\n");
1021   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1022 }
1023
1024
1025 /* Function verify_data_ref_alignment
1026
1027    Return TRUE if DR can be handled with respect to alignment.  */
1028
1029 static bool
1030 verify_data_ref_alignment (data_reference_p dr)
1031 {
1032   enum dr_alignment_support supportable_dr_alignment
1033     = vect_supportable_dr_alignment (dr, false);
1034   if (!supportable_dr_alignment)
1035     {
1036       if (dump_enabled_p ())
1037         {
1038           if (DR_IS_READ (dr))
1039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1040                              "not vectorized: unsupported unaligned load.");
1041           else
1042             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043                              "not vectorized: unsupported unaligned "
1044                              "store.");
1045
1046           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1047                              DR_REF (dr));
1048           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1049         }
1050       return false;
1051     }
1052
1053   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1054     dump_printf_loc (MSG_NOTE, vect_location,
1055                      "Vectorizing an unaligned access.\n");
1056
1057   return true;
1058 }
1059
1060 /* Function vect_verify_datarefs_alignment
1061
1062    Return TRUE if all data references in the loop can be
1063    handled with respect to alignment.  */
1064
1065 bool
1066 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1067 {
1068   vec<data_reference_p> datarefs = vinfo->datarefs;
1069   struct data_reference *dr;
1070   unsigned int i;
1071
1072   FOR_EACH_VEC_ELT (datarefs, i, dr)
1073     {
1074       gimple *stmt = DR_STMT (dr);
1075       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1076
1077       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1078         continue;
1079
1080       /* For interleaving, only the alignment of the first access matters.   */
1081       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1082           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1083         continue;
1084
1085       /* Strided accesses perform only component accesses, alignment is
1086          irrelevant for them.  */
1087       if (STMT_VINFO_STRIDED_P (stmt_info)
1088           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1089         continue;
1090
1091       if (! verify_data_ref_alignment (dr))
1092         return false;
1093     }
1094
1095   return true;
1096 }
1097
1098 /* Given an memory reference EXP return whether its alignment is less
1099    than its size.  */
1100
1101 static bool
1102 not_size_aligned (tree exp)
1103 {
1104   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1105     return true;
1106
1107   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1108           > get_object_alignment (exp));
1109 }
1110
1111 /* Function vector_alignment_reachable_p
1112
1113    Return true if vector alignment for DR is reachable by peeling
1114    a few loop iterations.  Return false otherwise.  */
1115
1116 static bool
1117 vector_alignment_reachable_p (struct data_reference *dr)
1118 {
1119   gimple *stmt = DR_STMT (dr);
1120   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1121   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1122
1123   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1124     {
1125       /* For interleaved access we peel only if number of iterations in
1126          the prolog loop ({VF - misalignment}), is a multiple of the
1127          number of the interleaved accesses.  */
1128       int elem_size, mis_in_elements;
1129       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1130
1131       /* FORNOW: handle only known alignment.  */
1132       if (!known_alignment_for_access_p (dr))
1133         return false;
1134
1135       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1136       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1137
1138       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1139         return false;
1140     }
1141
1142   /* If misalignment is known at the compile time then allow peeling
1143      only if natural alignment is reachable through peeling.  */
1144   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1145     {
1146       HOST_WIDE_INT elmsize =
1147                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1148       if (dump_enabled_p ())
1149         {
1150           dump_printf_loc (MSG_NOTE, vect_location,
1151                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1152           dump_printf (MSG_NOTE,
1153                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1154         }
1155       if (DR_MISALIGNMENT (dr) % elmsize)
1156         {
1157           if (dump_enabled_p ())
1158             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159                              "data size does not divide the misalignment.\n");
1160           return false;
1161         }
1162     }
1163
1164   if (!known_alignment_for_access_p (dr))
1165     {
1166       tree type = TREE_TYPE (DR_REF (dr));
1167       bool is_packed = not_size_aligned (DR_REF (dr));
1168       if (dump_enabled_p ())
1169         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                          "Unknown misalignment, %snaturally aligned\n",
1171                          is_packed ? "not " : "");
1172       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1173     }
1174
1175   return true;
1176 }
1177
1178
1179 /* Calculate the cost of the memory access represented by DR.  */
1180
1181 static void
1182 vect_get_data_access_cost (struct data_reference *dr,
1183                            unsigned int *inside_cost,
1184                            unsigned int *outside_cost,
1185                            stmt_vector_for_cost *body_cost_vec)
1186 {
1187   gimple *stmt = DR_STMT (dr);
1188   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1189   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1190   int ncopies;
1191
1192   if (PURE_SLP_STMT (stmt_info))
1193     ncopies = 1;
1194   else
1195     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1196
1197   if (DR_IS_READ (dr))
1198     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1199                         NULL, body_cost_vec, false);
1200   else
1201     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1202
1203   if (dump_enabled_p ())
1204     dump_printf_loc (MSG_NOTE, vect_location,
1205                      "vect_get_data_access_cost: inside_cost = %d, "
1206                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1207 }
1208
1209
1210 typedef struct _vect_peel_info
1211 {
1212   struct data_reference *dr;
1213   int npeel;
1214   unsigned int count;
1215 } *vect_peel_info;
1216
1217 typedef struct _vect_peel_extended_info
1218 {
1219   struct _vect_peel_info peel_info;
1220   unsigned int inside_cost;
1221   unsigned int outside_cost;
1222 } *vect_peel_extended_info;
1223
1224
1225 /* Peeling hashtable helpers.  */
1226
1227 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1228 {
1229   static inline hashval_t hash (const _vect_peel_info *);
1230   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1231 };
1232
1233 inline hashval_t
1234 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1235 {
1236   return (hashval_t) peel_info->npeel;
1237 }
1238
1239 inline bool
1240 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1241 {
1242   return (a->npeel == b->npeel);
1243 }
1244
1245
1246 /* Insert DR into peeling hash table with NPEEL as key.  */
1247
1248 static void
1249 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1250                           loop_vec_info loop_vinfo, struct data_reference *dr,
1251                           int npeel)
1252 {
1253   struct _vect_peel_info elem, *slot;
1254   _vect_peel_info **new_slot;
1255   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1256
1257   elem.npeel = npeel;
1258   slot = peeling_htab->find (&elem);
1259   if (slot)
1260     slot->count++;
1261   else
1262     {
1263       slot = XNEW (struct _vect_peel_info);
1264       slot->npeel = npeel;
1265       slot->dr = dr;
1266       slot->count = 1;
1267       new_slot = peeling_htab->find_slot (slot, INSERT);
1268       *new_slot = slot;
1269     }
1270
1271   if (!supportable_dr_alignment
1272       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1273     slot->count += VECT_MAX_COST;
1274 }
1275
1276
1277 /* Traverse peeling hash table to find peeling option that aligns maximum
1278    number of data accesses.  */
1279
1280 int
1281 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1282                                      _vect_peel_extended_info *max)
1283 {
1284   vect_peel_info elem = *slot;
1285
1286   if (elem->count > max->peel_info.count
1287       || (elem->count == max->peel_info.count
1288           && max->peel_info.npeel > elem->npeel))
1289     {
1290       max->peel_info.npeel = elem->npeel;
1291       max->peel_info.count = elem->count;
1292       max->peel_info.dr = elem->dr;
1293     }
1294
1295   return 1;
1296 }
1297
1298 /* Get the costs of peeling NPEEL iterations checking data access costs
1299    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1300    misalignment will be zero after peeling.  */
1301
1302 static void
1303 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1304                                 struct data_reference *dr0,
1305                                 unsigned int *inside_cost,
1306                                 unsigned int *outside_cost,
1307                                 stmt_vector_for_cost *body_cost_vec,
1308                                 unsigned int npeel,
1309                                 bool unknown_misalignment)
1310 {
1311   unsigned i;
1312   data_reference *dr;
1313
1314   FOR_EACH_VEC_ELT (datarefs, i, dr)
1315     {
1316       gimple *stmt = DR_STMT (dr);
1317       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1318       /* For interleaving, only the alignment of the first access
1319          matters.  */
1320       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1321           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1322         continue;
1323
1324       /* Strided accesses perform only component accesses, alignment is
1325          irrelevant for them.  */
1326       if (STMT_VINFO_STRIDED_P (stmt_info)
1327           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1328         continue;
1329
1330       int save_misalignment;
1331       save_misalignment = DR_MISALIGNMENT (dr);
1332       if (npeel == 0)
1333         ;
1334       else if (unknown_misalignment && dr == dr0)
1335         SET_DR_MISALIGNMENT (dr, 0);
1336       else
1337         vect_update_misalignment_for_peel (dr, dr0, npeel);
1338       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1339                                  body_cost_vec);
1340       SET_DR_MISALIGNMENT (dr, save_misalignment);
1341     }
1342 }
1343
1344 /* Traverse peeling hash table and calculate cost for each peeling option.
1345    Find the one with the lowest cost.  */
1346
1347 int
1348 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1349                                    _vect_peel_extended_info *min)
1350 {
1351   vect_peel_info elem = *slot;
1352   int dummy;
1353   unsigned int inside_cost = 0, outside_cost = 0;
1354   gimple *stmt = DR_STMT (elem->dr);
1355   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1356   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1357   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1358                        epilogue_cost_vec;
1359
1360   prologue_cost_vec.create (2);
1361   body_cost_vec.create (2);
1362   epilogue_cost_vec.create (2);
1363
1364   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1365                                   elem->dr, &inside_cost, &outside_cost,
1366                                   &body_cost_vec, elem->npeel, false);
1367
1368   body_cost_vec.release ();
1369
1370   outside_cost += vect_get_known_peeling_cost
1371     (loop_vinfo, elem->npeel, &dummy,
1372      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1373      &prologue_cost_vec, &epilogue_cost_vec);
1374
1375   /* Prologue and epilogue costs are added to the target model later.
1376      These costs depend only on the scalar iteration cost, the
1377      number of peeling iterations finally chosen, and the number of
1378      misaligned statements.  So discard the information found here.  */
1379   prologue_cost_vec.release ();
1380   epilogue_cost_vec.release ();
1381
1382   if (inside_cost < min->inside_cost
1383       || (inside_cost == min->inside_cost
1384           && outside_cost < min->outside_cost))
1385     {
1386       min->inside_cost = inside_cost;
1387       min->outside_cost = outside_cost;
1388       min->peel_info.dr = elem->dr;
1389       min->peel_info.npeel = elem->npeel;
1390       min->peel_info.count = elem->count;
1391     }
1392
1393   return 1;
1394 }
1395
1396
1397 /* Choose best peeling option by traversing peeling hash table and either
1398    choosing an option with the lowest cost (if cost model is enabled) or the
1399    option that aligns as many accesses as possible.  */
1400
1401 static struct _vect_peel_extended_info
1402 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1403                                        loop_vec_info loop_vinfo)
1404 {
1405    struct _vect_peel_extended_info res;
1406
1407    res.peel_info.dr = NULL;
1408
1409    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1410      {
1411        res.inside_cost = INT_MAX;
1412        res.outside_cost = INT_MAX;
1413        peeling_htab->traverse <_vect_peel_extended_info *,
1414                                vect_peeling_hash_get_lowest_cost> (&res);
1415      }
1416    else
1417      {
1418        res.peel_info.count = 0;
1419        peeling_htab->traverse <_vect_peel_extended_info *,
1420                                vect_peeling_hash_get_most_frequent> (&res);
1421        res.inside_cost = 0;
1422        res.outside_cost = 0;
1423      }
1424
1425    return res;
1426 }
1427
1428 /* Return true if the new peeling NPEEL is supported.  */
1429
1430 static bool
1431 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1432                           unsigned npeel)
1433 {
1434   unsigned i;
1435   struct data_reference *dr = NULL;
1436   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1437   gimple *stmt;
1438   stmt_vec_info stmt_info;
1439   enum dr_alignment_support supportable_dr_alignment;
1440
1441   /* Ensure that all data refs can be vectorized after the peel.  */
1442   FOR_EACH_VEC_ELT (datarefs, i, dr)
1443     {
1444       int save_misalignment;
1445
1446       if (dr == dr0)
1447         continue;
1448
1449       stmt = DR_STMT (dr);
1450       stmt_info = vinfo_for_stmt (stmt);
1451       /* For interleaving, only the alignment of the first access
1452          matters.  */
1453       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455         continue;
1456
1457       /* Strided accesses perform only component accesses, alignment is
1458          irrelevant for them.  */
1459       if (STMT_VINFO_STRIDED_P (stmt_info)
1460           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1461         continue;
1462
1463       save_misalignment = DR_MISALIGNMENT (dr);
1464       vect_update_misalignment_for_peel (dr, dr0, npeel);
1465       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1466       SET_DR_MISALIGNMENT (dr, save_misalignment);
1467
1468       if (!supportable_dr_alignment)
1469         return false;
1470     }
1471
1472   return true;
1473 }
1474
1475 /* Function vect_enhance_data_refs_alignment
1476
1477    This pass will use loop versioning and loop peeling in order to enhance
1478    the alignment of data references in the loop.
1479
1480    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1481    original loop is to be vectorized.  Any other loops that are created by
1482    the transformations performed in this pass - are not supposed to be
1483    vectorized.  This restriction will be relaxed.
1484
1485    This pass will require a cost model to guide it whether to apply peeling
1486    or versioning or a combination of the two.  For example, the scheme that
1487    intel uses when given a loop with several memory accesses, is as follows:
1488    choose one memory access ('p') which alignment you want to force by doing
1489    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1490    other accesses are not necessarily aligned, or (2) use loop versioning to
1491    generate one loop in which all accesses are aligned, and another loop in
1492    which only 'p' is necessarily aligned.
1493
1494    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1495    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1496    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1497
1498    Devising a cost model is the most critical aspect of this work.  It will
1499    guide us on which access to peel for, whether to use loop versioning, how
1500    many versions to create, etc.  The cost model will probably consist of
1501    generic considerations as well as target specific considerations (on
1502    powerpc for example, misaligned stores are more painful than misaligned
1503    loads).
1504
1505    Here are the general steps involved in alignment enhancements:
1506
1507      -- original loop, before alignment analysis:
1508         for (i=0; i<N; i++){
1509           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1510           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1511         }
1512
1513      -- After vect_compute_data_refs_alignment:
1514         for (i=0; i<N; i++){
1515           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1516           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1517         }
1518
1519      -- Possibility 1: we do loop versioning:
1520      if (p is aligned) {
1521         for (i=0; i<N; i++){    # loop 1A
1522           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1523           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1524         }
1525      }
1526      else {
1527         for (i=0; i<N; i++){    # loop 1B
1528           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1529           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1530         }
1531      }
1532
1533      -- Possibility 2: we do loop peeling:
1534      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1535         x = q[i];
1536         p[i] = y;
1537      }
1538      for (i = 3; i < N; i++){   # loop 2A
1539         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1540         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1541      }
1542
1543      -- Possibility 3: combination of loop peeling and versioning:
1544      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1545         x = q[i];
1546         p[i] = y;
1547      }
1548      if (p is aligned) {
1549         for (i = 3; i<N; i++){  # loop 3A
1550           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1551           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1552         }
1553      }
1554      else {
1555         for (i = 3; i<N; i++){  # loop 3B
1556           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1557           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1558         }
1559      }
1560
1561      These loops are later passed to loop_transform to be vectorized.  The
1562      vectorizer will use the alignment information to guide the transformation
1563      (whether to generate regular loads/stores, or with special handling for
1564      misalignment).  */
1565
1566 bool
1567 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1568 {
1569   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1570   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1571   enum dr_alignment_support supportable_dr_alignment;
1572   struct data_reference *dr0 = NULL, *first_store = NULL;
1573   struct data_reference *dr;
1574   unsigned int i, j;
1575   bool do_peeling = false;
1576   bool do_versioning = false;
1577   bool stat;
1578   gimple *stmt;
1579   stmt_vec_info stmt_info;
1580   unsigned int npeel = 0;
1581   bool one_misalignment_known = false;
1582   bool one_misalignment_unknown = false;
1583   bool one_dr_unsupportable = false;
1584   struct data_reference *unsupportable_dr = NULL;
1585   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1586   unsigned possible_npeel_number = 1;
1587   tree vectype;
1588   unsigned int nelements, mis, same_align_drs_max = 0;
1589   hash_table<peel_info_hasher> peeling_htab (1);
1590
1591   if (dump_enabled_p ())
1592     dump_printf_loc (MSG_NOTE, vect_location,
1593                      "=== vect_enhance_data_refs_alignment ===\n");
1594
1595   /* Reset data so we can safely be called multiple times.  */
1596   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1597   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1598
1599   /* While cost model enhancements are expected in the future, the high level
1600      view of the code at this time is as follows:
1601
1602      A) If there is a misaligned access then see if peeling to align
1603         this access can make all data references satisfy
1604         vect_supportable_dr_alignment.  If so, update data structures
1605         as needed and return true.
1606
1607      B) If peeling wasn't possible and there is a data reference with an
1608         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1609         then see if loop versioning checks can be used to make all data
1610         references satisfy vect_supportable_dr_alignment.  If so, update
1611         data structures as needed and return true.
1612
1613      C) If neither peeling nor versioning were successful then return false if
1614         any data reference does not satisfy vect_supportable_dr_alignment.
1615
1616      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1617
1618      Note, Possibility 3 above (which is peeling and versioning together) is not
1619      being done at this time.  */
1620
1621   /* (1) Peeling to force alignment.  */
1622
1623   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1624      Considerations:
1625      + How many accesses will become aligned due to the peeling
1626      - How many accesses will become unaligned due to the peeling,
1627        and the cost of misaligned accesses.
1628      - The cost of peeling (the extra runtime checks, the increase
1629        in code size).  */
1630
1631   FOR_EACH_VEC_ELT (datarefs, i, dr)
1632     {
1633       stmt = DR_STMT (dr);
1634       stmt_info = vinfo_for_stmt (stmt);
1635
1636       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1637         continue;
1638
1639       /* For interleaving, only the alignment of the first access
1640          matters.  */
1641       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1642           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1643         continue;
1644
1645       /* For invariant accesses there is nothing to enhance.  */
1646       if (integer_zerop (DR_STEP (dr)))
1647         continue;
1648
1649       /* Strided accesses perform only component accesses, alignment is
1650          irrelevant for them.  */
1651       if (STMT_VINFO_STRIDED_P (stmt_info)
1652           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1653         continue;
1654
1655       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1656       do_peeling = vector_alignment_reachable_p (dr);
1657       if (do_peeling)
1658         {
1659           if (known_alignment_for_access_p (dr))
1660             {
1661               unsigned int npeel_tmp = 0;
1662               bool negative = tree_int_cst_compare (DR_STEP (dr),
1663                                                     size_zero_node) < 0;
1664
1665               vectype = STMT_VINFO_VECTYPE (stmt_info);
1666               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1667               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1668                                                 TREE_TYPE (DR_REF (dr))));
1669               if (DR_MISALIGNMENT (dr) != 0)
1670                 npeel_tmp = (negative ? (mis - nelements)
1671                              : (nelements - mis)) & (nelements - 1);
1672
1673               /* For multiple types, it is possible that the bigger type access
1674                  will have more than one peeling option.  E.g., a loop with two
1675                  types: one of size (vector size / 4), and the other one of
1676                  size (vector size / 8).  Vectorization factor will 8.  If both
1677                  accesses are misaligned by 3, the first one needs one scalar
1678                  iteration to be aligned, and the second one needs 5.  But the
1679                  first one will be aligned also by peeling 5 scalar
1680                  iterations, and in that case both accesses will be aligned.
1681                  Hence, except for the immediate peeling amount, we also want
1682                  to try to add full vector size, while we don't exceed
1683                  vectorization factor.
1684                  We do this automatically for cost model, since we calculate
1685                  cost for every peeling option.  */
1686               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1687                 {
1688                   if (STMT_SLP_TYPE (stmt_info))
1689                     possible_npeel_number
1690                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1691                   else
1692                     possible_npeel_number = vf / nelements;
1693
1694                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1695                      allow peeling NELEMENTS.  */
1696                   if (DR_MISALIGNMENT (dr) == 0)
1697                     possible_npeel_number++;
1698                 }
1699
1700               /* Save info about DR in the hash table.  Also include peeling
1701                  amounts according to the explanation above.  */
1702               for (j = 0; j < possible_npeel_number; j++)
1703                 {
1704                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1705                                             dr, npeel_tmp);
1706                   npeel_tmp += nelements;
1707                 }
1708
1709               one_misalignment_known = true;
1710             }
1711           else
1712             {
1713               /* If we don't know any misalignment values, we prefer
1714                  peeling for data-ref that has the maximum number of data-refs
1715                  with the same alignment, unless the target prefers to align
1716                  stores over load.  */
1717               unsigned same_align_drs
1718                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1719               if (!dr0
1720                   || same_align_drs_max < same_align_drs)
1721                 {
1722                   same_align_drs_max = same_align_drs;
1723                   dr0 = dr;
1724                 }
1725               /* For data-refs with the same number of related
1726                  accesses prefer the one where the misalign
1727                  computation will be invariant in the outermost loop.  */
1728               else if (same_align_drs_max == same_align_drs)
1729                 {
1730                   struct loop *ivloop0, *ivloop;
1731                   ivloop0 = outermost_invariant_loop_for_expr
1732                     (loop, DR_BASE_ADDRESS (dr0));
1733                   ivloop = outermost_invariant_loop_for_expr
1734                     (loop, DR_BASE_ADDRESS (dr));
1735                   if ((ivloop && !ivloop0)
1736                       || (ivloop && ivloop0
1737                           && flow_loop_nested_p (ivloop, ivloop0)))
1738                     dr0 = dr;
1739                 }
1740
1741               one_misalignment_unknown = true;
1742
1743               /* Check for data refs with unsupportable alignment that
1744                  can be peeled.  */
1745               if (!supportable_dr_alignment)
1746               {
1747                 one_dr_unsupportable = true;
1748                 unsupportable_dr = dr;
1749               }
1750
1751               if (!first_store && DR_IS_WRITE (dr))
1752                 first_store = dr;
1753             }
1754         }
1755       else
1756         {
1757           if (!aligned_access_p (dr))
1758             {
1759               if (dump_enabled_p ())
1760                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761                                  "vector alignment may not be reachable\n");
1762               break;
1763             }
1764         }
1765     }
1766
1767   /* Check if we can possibly peel the loop.  */
1768   if (!vect_can_advance_ivs_p (loop_vinfo)
1769       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1770       || loop->inner)
1771     do_peeling = false;
1772
1773   struct _vect_peel_extended_info peel_for_known_alignment;
1774   struct _vect_peel_extended_info peel_for_unknown_alignment;
1775   struct _vect_peel_extended_info best_peel;
1776
1777   peel_for_unknown_alignment.inside_cost = INT_MAX;
1778   peel_for_unknown_alignment.outside_cost = INT_MAX;
1779   peel_for_unknown_alignment.peel_info.count = 0;
1780
1781   if (do_peeling
1782       && one_misalignment_unknown)
1783     {
1784       /* Check if the target requires to prefer stores over loads, i.e., if
1785          misaligned stores are more expensive than misaligned loads (taking
1786          drs with same alignment into account).  */
1787       unsigned int load_inside_cost = 0;
1788       unsigned int load_outside_cost = 0;
1789       unsigned int store_inside_cost = 0;
1790       unsigned int store_outside_cost = 0;
1791
1792       stmt_vector_for_cost dummy;
1793       dummy.create (2);
1794       vect_get_peeling_costs_all_drs (datarefs, dr0,
1795                                       &load_inside_cost,
1796                                       &load_outside_cost,
1797                                       &dummy, vf / 2, true);
1798       dummy.release ();
1799
1800       if (first_store)
1801         {
1802           dummy.create (2);
1803           vect_get_peeling_costs_all_drs (datarefs, first_store,
1804                                           &store_inside_cost,
1805                                           &store_outside_cost,
1806                                           &dummy, vf / 2, true);
1807           dummy.release ();
1808         }
1809       else
1810         {
1811           store_inside_cost = INT_MAX;
1812           store_outside_cost = INT_MAX;
1813         }
1814
1815       if (load_inside_cost > store_inside_cost
1816           || (load_inside_cost == store_inside_cost
1817               && load_outside_cost > store_outside_cost))
1818         {
1819           dr0 = first_store;
1820           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1821           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1822         }
1823       else
1824         {
1825           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1826           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1827         }
1828
1829       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1830       prologue_cost_vec.create (2);
1831       epilogue_cost_vec.create (2);
1832
1833       int dummy2;
1834       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1835         (loop_vinfo, vf / 2, &dummy2,
1836          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1837          &prologue_cost_vec, &epilogue_cost_vec);
1838
1839       prologue_cost_vec.release ();
1840       epilogue_cost_vec.release ();
1841
1842       peel_for_unknown_alignment.peel_info.count = 1
1843         + STMT_VINFO_SAME_ALIGN_REFS
1844         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1845     }
1846
1847   peel_for_unknown_alignment.peel_info.npeel = 0;
1848   peel_for_unknown_alignment.peel_info.dr = dr0;
1849
1850   best_peel = peel_for_unknown_alignment;
1851
1852   peel_for_known_alignment.inside_cost = INT_MAX;
1853   peel_for_known_alignment.outside_cost = INT_MAX;
1854   peel_for_known_alignment.peel_info.count = 0;
1855   peel_for_known_alignment.peel_info.dr = NULL;
1856
1857   if (do_peeling && one_misalignment_known)
1858     {
1859       /* Peeling is possible, but there is no data access that is not supported
1860          unless aligned.  So we try to choose the best possible peeling from
1861          the hash table.  */
1862       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1863         (&peeling_htab, loop_vinfo);
1864     }
1865
1866   /* Compare costs of peeling for known and unknown alignment. */
1867   if (peel_for_known_alignment.peel_info.dr != NULL
1868       && peel_for_unknown_alignment.inside_cost
1869       >= peel_for_known_alignment.inside_cost)
1870     {
1871       best_peel = peel_for_known_alignment;
1872
1873       /* If the best peeling for known alignment has NPEEL == 0, perform no
1874          peeling at all except if there is an unsupportable dr that we can
1875          align.  */
1876       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1877         do_peeling = false;
1878     }
1879
1880   /* If there is an unsupportable data ref, prefer this over all choices so far
1881      since we'd have to discard a chosen peeling except when it accidentally
1882      aligned the unsupportable data ref.  */
1883   if (one_dr_unsupportable)
1884     dr0 = unsupportable_dr;
1885   else if (do_peeling)
1886     {
1887       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1888          TODO: Use nopeel_outside_cost or get rid of it?  */
1889       unsigned nopeel_inside_cost = 0;
1890       unsigned nopeel_outside_cost = 0;
1891
1892       stmt_vector_for_cost dummy;
1893       dummy.create (2);
1894       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1895                                       &nopeel_outside_cost, &dummy, 0, false);
1896       dummy.release ();
1897
1898       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1899          costs will be recorded.  */
1900       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1901       prologue_cost_vec.create (2);
1902       epilogue_cost_vec.create (2);
1903
1904       int dummy2;
1905       nopeel_outside_cost += vect_get_known_peeling_cost
1906         (loop_vinfo, 0, &dummy2,
1907          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1908          &prologue_cost_vec, &epilogue_cost_vec);
1909
1910       prologue_cost_vec.release ();
1911       epilogue_cost_vec.release ();
1912
1913       npeel = best_peel.peel_info.npeel;
1914       dr0 = best_peel.peel_info.dr;
1915
1916       /* If no peeling is not more expensive than the best peeling we
1917          have so far, don't perform any peeling.  */
1918       if (nopeel_inside_cost <= best_peel.inside_cost)
1919         do_peeling = false;
1920     }
1921
1922   if (do_peeling)
1923     {
1924       stmt = DR_STMT (dr0);
1925       stmt_info = vinfo_for_stmt (stmt);
1926       vectype = STMT_VINFO_VECTYPE (stmt_info);
1927       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1928
1929       if (known_alignment_for_access_p (dr0))
1930         {
1931           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1932                                                 size_zero_node) < 0;
1933           if (!npeel)
1934             {
1935               /* Since it's known at compile time, compute the number of
1936                  iterations in the peeled loop (the peeling factor) for use in
1937                  updating DR_MISALIGNMENT values.  The peeling factor is the
1938                  vectorization factor minus the misalignment as an element
1939                  count.  */
1940               mis = DR_MISALIGNMENT (dr0);
1941               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1942               npeel = ((negative ? mis - nelements : nelements - mis)
1943                        & (nelements - 1));
1944             }
1945
1946           /* For interleaved data access every iteration accesses all the
1947              members of the group, therefore we divide the number of iterations
1948              by the group size.  */
1949           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1950           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1951             npeel /= GROUP_SIZE (stmt_info);
1952
1953           if (dump_enabled_p ())
1954             dump_printf_loc (MSG_NOTE, vect_location,
1955                              "Try peeling by %d\n", npeel);
1956         }
1957
1958       /* Ensure that all datarefs can be vectorized after the peel.  */
1959       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1960         do_peeling = false;
1961
1962       /* Check if all datarefs are supportable and log.  */
1963       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1964         {
1965           stat = vect_verify_datarefs_alignment (loop_vinfo);
1966           if (!stat)
1967             do_peeling = false;
1968           else
1969             return stat;
1970         }
1971
1972       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1973       if (do_peeling)
1974         {
1975           unsigned max_allowed_peel
1976             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1977           if (max_allowed_peel != (unsigned)-1)
1978             {
1979               unsigned max_peel = npeel;
1980               if (max_peel == 0)
1981                 {
1982                   gimple *dr_stmt = DR_STMT (dr0);
1983                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1984                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1985                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1986                 }
1987               if (max_peel > max_allowed_peel)
1988                 {
1989                   do_peeling = false;
1990                   if (dump_enabled_p ())
1991                     dump_printf_loc (MSG_NOTE, vect_location,
1992                         "Disable peeling, max peels reached: %d\n", max_peel);
1993                 }
1994             }
1995         }
1996
1997       /* Cost model #2 - if peeling may result in a remaining loop not
1998          iterating enough to be vectorized then do not peel.  */
1999       if (do_peeling
2000           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2001         {
2002           unsigned max_peel
2003             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2004           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2005               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2006             do_peeling = false;
2007         }
2008
2009       if (do_peeling)
2010         {
2011           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2012              If the misalignment of DR_i is identical to that of dr0 then set
2013              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2014              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2015              by the peeling factor times the element size of DR_i (MOD the
2016              vectorization factor times the size).  Otherwise, the
2017              misalignment of DR_i must be set to unknown.  */
2018           FOR_EACH_VEC_ELT (datarefs, i, dr)
2019             if (dr != dr0)
2020               {
2021                 /* Strided accesses perform only component accesses, alignment
2022                    is irrelevant for them.  */
2023                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2024                 if (STMT_VINFO_STRIDED_P (stmt_info)
2025                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2026                   continue;
2027
2028                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2029               }
2030
2031           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2032           if (npeel)
2033             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2034           else
2035             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2036               = DR_MISALIGNMENT (dr0);
2037           SET_DR_MISALIGNMENT (dr0, 0);
2038           if (dump_enabled_p ())
2039             {
2040               dump_printf_loc (MSG_NOTE, vect_location,
2041                                "Alignment of access forced using peeling.\n");
2042               dump_printf_loc (MSG_NOTE, vect_location,
2043                                "Peeling for alignment will be applied.\n");
2044             }
2045
2046           /* The inside-loop cost will be accounted for in vectorizable_load
2047              and vectorizable_store correctly with adjusted alignments.
2048              Drop the body_cst_vec on the floor here.  */
2049           stat = vect_verify_datarefs_alignment (loop_vinfo);
2050           gcc_assert (stat);
2051           return stat;
2052         }
2053     }
2054
2055   /* (2) Versioning to force alignment.  */
2056
2057   /* Try versioning if:
2058      1) optimize loop for speed
2059      2) there is at least one unsupported misaligned data ref with an unknown
2060         misalignment, and
2061      3) all misaligned data refs with a known misalignment are supported, and
2062      4) the number of runtime alignment checks is within reason.  */
2063
2064   do_versioning =
2065         optimize_loop_nest_for_speed_p (loop)
2066         && (!loop->inner); /* FORNOW */
2067
2068   if (do_versioning)
2069     {
2070       FOR_EACH_VEC_ELT (datarefs, i, dr)
2071         {
2072           stmt = DR_STMT (dr);
2073           stmt_info = vinfo_for_stmt (stmt);
2074
2075           /* For interleaving, only the alignment of the first access
2076              matters.  */
2077           if (aligned_access_p (dr)
2078               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2079                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2080             continue;
2081
2082           if (STMT_VINFO_STRIDED_P (stmt_info))
2083             {
2084               /* Strided loads perform only component accesses, alignment is
2085                  irrelevant for them.  */
2086               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2087                 continue;
2088               do_versioning = false;
2089               break;
2090             }
2091
2092           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2093
2094           if (!supportable_dr_alignment)
2095             {
2096               gimple *stmt;
2097               int mask;
2098               tree vectype;
2099
2100               if (known_alignment_for_access_p (dr)
2101                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2102                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2103                 {
2104                   do_versioning = false;
2105                   break;
2106                 }
2107
2108               stmt = DR_STMT (dr);
2109               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2110               gcc_assert (vectype);
2111
2112               /* The rightmost bits of an aligned address must be zeros.
2113                  Construct the mask needed for this test.  For example,
2114                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2115                  mask must be 15 = 0xf. */
2116               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2117
2118               /* FORNOW: use the same mask to test all potentially unaligned
2119                  references in the loop.  The vectorizer currently supports
2120                  a single vector size, see the reference to
2121                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2122                  vectorization factor is computed.  */
2123               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2124                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2125               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2126               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2127                       DR_STMT (dr));
2128             }
2129         }
2130
2131       /* Versioning requires at least one misaligned data reference.  */
2132       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2133         do_versioning = false;
2134       else if (!do_versioning)
2135         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2136     }
2137
2138   if (do_versioning)
2139     {
2140       vec<gimple *> may_misalign_stmts
2141         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2142       gimple *stmt;
2143
2144       /* It can now be assumed that the data references in the statements
2145          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2146          of the loop being vectorized.  */
2147       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2148         {
2149           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2150           dr = STMT_VINFO_DATA_REF (stmt_info);
2151           SET_DR_MISALIGNMENT (dr, 0);
2152           if (dump_enabled_p ())
2153             dump_printf_loc (MSG_NOTE, vect_location,
2154                              "Alignment of access forced using versioning.\n");
2155         }
2156
2157       if (dump_enabled_p ())
2158         dump_printf_loc (MSG_NOTE, vect_location,
2159                          "Versioning for alignment will be applied.\n");
2160
2161       /* Peeling and versioning can't be done together at this time.  */
2162       gcc_assert (! (do_peeling && do_versioning));
2163
2164       stat = vect_verify_datarefs_alignment (loop_vinfo);
2165       gcc_assert (stat);
2166       return stat;
2167     }
2168
2169   /* This point is reached if neither peeling nor versioning is being done.  */
2170   gcc_assert (! (do_peeling || do_versioning));
2171
2172   stat = vect_verify_datarefs_alignment (loop_vinfo);
2173   return stat;
2174 }
2175
2176
2177 /* Function vect_find_same_alignment_drs.
2178
2179    Update group and alignment relations according to the chosen
2180    vectorization factor.  */
2181
2182 static void
2183 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2184 {
2185   struct data_reference *dra = DDR_A (ddr);
2186   struct data_reference *drb = DDR_B (ddr);
2187   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2188   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2189
2190   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2191     return;
2192
2193   if (dra == drb)
2194     return;
2195
2196   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2197       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2198       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2199     return;
2200
2201   /* Two references with distance zero have the same alignment.  */
2202   offset_int diff = (wi::to_offset (DR_INIT (dra))
2203                      - wi::to_offset (DR_INIT (drb)));
2204   if (diff != 0)
2205     {
2206       /* Get the wider of the two alignments.  */
2207       unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2208       unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2209       unsigned int max_align = MAX (align_a, align_b);
2210
2211       /* Require the gap to be a multiple of the larger vector alignment.  */
2212       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2213         return;
2214     }
2215
2216   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2217   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2218   if (dump_enabled_p ())
2219     {
2220       dump_printf_loc (MSG_NOTE, vect_location,
2221                        "accesses have the same alignment: ");
2222       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2223       dump_printf (MSG_NOTE,  " and ");
2224       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2225       dump_printf (MSG_NOTE, "\n");
2226     }
2227 }
2228
2229
2230 /* Function vect_analyze_data_refs_alignment
2231
2232    Analyze the alignment of the data-references in the loop.
2233    Return FALSE if a data reference is found that cannot be vectorized.  */
2234
2235 bool
2236 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2237 {
2238   if (dump_enabled_p ())
2239     dump_printf_loc (MSG_NOTE, vect_location,
2240                      "=== vect_analyze_data_refs_alignment ===\n");
2241
2242   /* Mark groups of data references with same alignment using
2243      data dependence information.  */
2244   vec<ddr_p> ddrs = vinfo->ddrs;
2245   struct data_dependence_relation *ddr;
2246   unsigned int i;
2247
2248   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2249     vect_find_same_alignment_drs (ddr);
2250
2251   vec<data_reference_p> datarefs = vinfo->datarefs;
2252   struct data_reference *dr;
2253
2254   vect_record_base_alignments (vinfo);
2255   FOR_EACH_VEC_ELT (datarefs, i, dr)
2256     {
2257       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2258       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2259           && !vect_compute_data_ref_alignment (dr))
2260         {
2261           /* Strided accesses perform only component accesses, misalignment
2262              information is irrelevant for them.  */
2263           if (STMT_VINFO_STRIDED_P (stmt_info)
2264               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2265             continue;
2266
2267           if (dump_enabled_p ())
2268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269                              "not vectorized: can't calculate alignment "
2270                              "for data ref.\n");
2271
2272           return false;
2273         }
2274     }
2275
2276   return true;
2277 }
2278
2279
2280 /* Analyze alignment of DRs of stmts in NODE.  */
2281
2282 static bool
2283 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2284 {
2285   /* We vectorize from the first scalar stmt in the node unless
2286      the node is permuted in which case we start from the first
2287      element in the group.  */
2288   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2289   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2290   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2291     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2292
2293   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2294   if (! vect_compute_data_ref_alignment (dr)
2295       /* For creating the data-ref pointer we need alignment of the
2296          first element anyway.  */
2297       || (dr != first_dr
2298           && ! vect_compute_data_ref_alignment (first_dr))
2299       || ! verify_data_ref_alignment (dr))
2300     {
2301       if (dump_enabled_p ())
2302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303                          "not vectorized: bad data alignment in basic "
2304                          "block.\n");
2305       return false;
2306     }
2307
2308   return true;
2309 }
2310
2311 /* Function vect_slp_analyze_instance_alignment
2312
2313    Analyze the alignment of the data-references in the SLP instance.
2314    Return FALSE if a data reference is found that cannot be vectorized.  */
2315
2316 bool
2317 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2318 {
2319   if (dump_enabled_p ())
2320     dump_printf_loc (MSG_NOTE, vect_location,
2321                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2322
2323   slp_tree node;
2324   unsigned i;
2325   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2326     if (! vect_slp_analyze_and_verify_node_alignment (node))
2327       return false;
2328
2329   node = SLP_INSTANCE_TREE (instance);
2330   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2331       && ! vect_slp_analyze_and_verify_node_alignment
2332              (SLP_INSTANCE_TREE (instance)))
2333     return false;
2334
2335   return true;
2336 }
2337
2338
2339 /* Analyze groups of accesses: check that DR belongs to a group of
2340    accesses of legal size, step, etc.  Detect gaps, single element
2341    interleaving, and other special cases. Set grouped access info.
2342    Collect groups of strided stores for further use in SLP analysis.
2343    Worker for vect_analyze_group_access.  */
2344
2345 static bool
2346 vect_analyze_group_access_1 (struct data_reference *dr)
2347 {
2348   tree step = DR_STEP (dr);
2349   tree scalar_type = TREE_TYPE (DR_REF (dr));
2350   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2351   gimple *stmt = DR_STMT (dr);
2352   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2353   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2354   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2355   HOST_WIDE_INT dr_step = -1;
2356   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2357   bool slp_impossible = false;
2358
2359   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2360      size of the interleaving group (including gaps).  */
2361   if (tree_fits_shwi_p (step))
2362     {
2363       dr_step = tree_to_shwi (step);
2364       /* Check that STEP is a multiple of type size.  Otherwise there is
2365          a non-element-sized gap at the end of the group which we
2366          cannot represent in GROUP_GAP or GROUP_SIZE.
2367          ???  As we can handle non-constant step fine here we should
2368          simply remove uses of GROUP_GAP between the last and first
2369          element and instead rely on DR_STEP.  GROUP_SIZE then would
2370          simply not include that gap.  */
2371       if ((dr_step % type_size) != 0)
2372         {
2373           if (dump_enabled_p ())
2374             {
2375               dump_printf_loc (MSG_NOTE, vect_location,
2376                                "Step ");
2377               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2378               dump_printf (MSG_NOTE,
2379                            " is not a multiple of the element size for ");
2380               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2381               dump_printf (MSG_NOTE, "\n");
2382             }
2383           return false;
2384         }
2385       groupsize = absu_hwi (dr_step) / type_size;
2386     }
2387   else
2388     groupsize = 0;
2389
2390   /* Not consecutive access is possible only if it is a part of interleaving.  */
2391   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2392     {
2393       /* Check if it this DR is a part of interleaving, and is a single
2394          element of the group that is accessed in the loop.  */
2395
2396       /* Gaps are supported only for loads. STEP must be a multiple of the type
2397          size.  The size of the group must be a power of 2.  */
2398       if (DR_IS_READ (dr)
2399           && (dr_step % type_size) == 0
2400           && groupsize > 0
2401           && pow2p_hwi (groupsize))
2402         {
2403           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2404           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2405           GROUP_GAP (stmt_info) = groupsize - 1;
2406           if (dump_enabled_p ())
2407             {
2408               dump_printf_loc (MSG_NOTE, vect_location,
2409                                "Detected single element interleaving ");
2410               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2411               dump_printf (MSG_NOTE, " step ");
2412               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2413               dump_printf (MSG_NOTE, "\n");
2414             }
2415
2416           return true;
2417         }
2418
2419       if (dump_enabled_p ())
2420         {
2421           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422                            "not consecutive access ");
2423           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2424         }
2425
2426       if (bb_vinfo)
2427         {
2428           /* Mark the statement as unvectorizable.  */
2429           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2430           return true;
2431         }
2432
2433       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2434       STMT_VINFO_STRIDED_P (stmt_info) = true;
2435       return true;
2436     }
2437
2438   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2439     {
2440       /* First stmt in the interleaving chain. Check the chain.  */
2441       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2442       struct data_reference *data_ref = dr;
2443       unsigned int count = 1;
2444       tree prev_init = DR_INIT (data_ref);
2445       gimple *prev = stmt;
2446       HOST_WIDE_INT diff, gaps = 0;
2447
2448       while (next)
2449         {
2450           /* Skip same data-refs.  In case that two or more stmts share
2451              data-ref (supported only for loads), we vectorize only the first
2452              stmt, and the rest get their vectorized loads from the first
2453              one.  */
2454           if (!tree_int_cst_compare (DR_INIT (data_ref),
2455                                      DR_INIT (STMT_VINFO_DATA_REF (
2456                                                    vinfo_for_stmt (next)))))
2457             {
2458               if (DR_IS_WRITE (data_ref))
2459                 {
2460                   if (dump_enabled_p ())
2461                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2462                                      "Two store stmts share the same dr.\n");
2463                   return false;
2464                 }
2465
2466               if (dump_enabled_p ())
2467                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468                                  "Two or more load stmts share the same dr.\n");
2469
2470               /* For load use the same data-ref load.  */
2471               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2472
2473               prev = next;
2474               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2475               continue;
2476             }
2477
2478           prev = next;
2479           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2480
2481           /* All group members have the same STEP by construction.  */
2482           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2483
2484           /* Check that the distance between two accesses is equal to the type
2485              size. Otherwise, we have gaps.  */
2486           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2487                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2488           if (diff != 1)
2489             {
2490               /* FORNOW: SLP of accesses with gaps is not supported.  */
2491               slp_impossible = true;
2492               if (DR_IS_WRITE (data_ref))
2493                 {
2494                   if (dump_enabled_p ())
2495                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496                                      "interleaved store with gaps\n");
2497                   return false;
2498                 }
2499
2500               gaps += diff - 1;
2501             }
2502
2503           last_accessed_element += diff;
2504
2505           /* Store the gap from the previous member of the group. If there is no
2506              gap in the access, GROUP_GAP is always 1.  */
2507           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2508
2509           prev_init = DR_INIT (data_ref);
2510           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2511           /* Count the number of data-refs in the chain.  */
2512           count++;
2513         }
2514
2515       if (groupsize == 0)
2516         groupsize = count + gaps;
2517
2518       /* This could be UINT_MAX but as we are generating code in a very
2519          inefficient way we have to cap earlier.  See PR78699 for example.  */
2520       if (groupsize > 4096)
2521         {
2522           if (dump_enabled_p ())
2523             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2524                              "group is too large\n");
2525           return false;
2526         }
2527
2528       /* Check that the size of the interleaving is equal to count for stores,
2529          i.e., that there are no gaps.  */
2530       if (groupsize != count
2531           && !DR_IS_READ (dr))
2532         {
2533           if (dump_enabled_p ())
2534             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535                              "interleaved store with gaps\n");
2536           return false;
2537         }
2538
2539       /* If there is a gap after the last load in the group it is the
2540          difference between the groupsize and the last accessed
2541          element.
2542          When there is no gap, this difference should be 0.  */
2543       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2544
2545       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2546       if (dump_enabled_p ())
2547         {
2548           dump_printf_loc (MSG_NOTE, vect_location,
2549                            "Detected interleaving ");
2550           if (DR_IS_READ (dr))
2551             dump_printf (MSG_NOTE, "load ");
2552           else
2553             dump_printf (MSG_NOTE, "store ");
2554           dump_printf (MSG_NOTE, "of size %u starting with ",
2555                        (unsigned)groupsize);
2556           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2557           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2558             dump_printf_loc (MSG_NOTE, vect_location,
2559                              "There is a gap of %u elements after the group\n",
2560                              GROUP_GAP (vinfo_for_stmt (stmt)));
2561         }
2562
2563       /* SLP: create an SLP data structure for every interleaving group of
2564          stores for further analysis in vect_analyse_slp.  */
2565       if (DR_IS_WRITE (dr) && !slp_impossible)
2566         {
2567           if (loop_vinfo)
2568             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2569           if (bb_vinfo)
2570             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2571         }
2572     }
2573
2574   return true;
2575 }
2576
2577 /* Analyze groups of accesses: check that DR belongs to a group of
2578    accesses of legal size, step, etc.  Detect gaps, single element
2579    interleaving, and other special cases. Set grouped access info.
2580    Collect groups of strided stores for further use in SLP analysis.  */
2581
2582 static bool
2583 vect_analyze_group_access (struct data_reference *dr)
2584 {
2585   if (!vect_analyze_group_access_1 (dr))
2586     {
2587       /* Dissolve the group if present.  */
2588       gimple *next;
2589       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2590       while (stmt)
2591         {
2592           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2593           next = GROUP_NEXT_ELEMENT (vinfo);
2594           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2595           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2596           stmt = next;
2597         }
2598       return false;
2599     }
2600   return true;
2601 }
2602
2603 /* Analyze the access pattern of the data-reference DR.
2604    In case of non-consecutive accesses call vect_analyze_group_access() to
2605    analyze groups of accesses.  */
2606
2607 static bool
2608 vect_analyze_data_ref_access (struct data_reference *dr)
2609 {
2610   tree step = DR_STEP (dr);
2611   tree scalar_type = TREE_TYPE (DR_REF (dr));
2612   gimple *stmt = DR_STMT (dr);
2613   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2614   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2615   struct loop *loop = NULL;
2616
2617   if (loop_vinfo)
2618     loop = LOOP_VINFO_LOOP (loop_vinfo);
2619
2620   if (loop_vinfo && !step)
2621     {
2622       if (dump_enabled_p ())
2623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624                          "bad data-ref access in loop\n");
2625       return false;
2626     }
2627
2628   /* Allow loads with zero step in inner-loop vectorization.  */
2629   if (loop_vinfo && integer_zerop (step))
2630     {
2631       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2632       if (!nested_in_vect_loop_p (loop, stmt))
2633         return DR_IS_READ (dr);
2634       /* Allow references with zero step for outer loops marked
2635          with pragma omp simd only - it guarantees absence of
2636          loop-carried dependencies between inner loop iterations.  */
2637       if (!loop->force_vectorize)
2638         {
2639           if (dump_enabled_p ())
2640             dump_printf_loc (MSG_NOTE, vect_location,
2641                              "zero step in inner loop of nest\n");
2642           return false;
2643         }
2644     }
2645
2646   if (loop && nested_in_vect_loop_p (loop, stmt))
2647     {
2648       /* Interleaved accesses are not yet supported within outer-loop
2649         vectorization for references in the inner-loop.  */
2650       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2651
2652       /* For the rest of the analysis we use the outer-loop step.  */
2653       step = STMT_VINFO_DR_STEP (stmt_info);
2654       if (integer_zerop (step))
2655         {
2656           if (dump_enabled_p ())
2657             dump_printf_loc (MSG_NOTE, vect_location,
2658                              "zero step in outer loop.\n");
2659           return DR_IS_READ (dr);
2660         }
2661     }
2662
2663   /* Consecutive?  */
2664   if (TREE_CODE (step) == INTEGER_CST)
2665     {
2666       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2667       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2668           || (dr_step < 0
2669               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2670         {
2671           /* Mark that it is not interleaving.  */
2672           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2673           return true;
2674         }
2675     }
2676
2677   if (loop && nested_in_vect_loop_p (loop, stmt))
2678     {
2679       if (dump_enabled_p ())
2680         dump_printf_loc (MSG_NOTE, vect_location,
2681                          "grouped access in outer loop.\n");
2682       return false;
2683     }
2684
2685
2686   /* Assume this is a DR handled by non-constant strided load case.  */
2687   if (TREE_CODE (step) != INTEGER_CST)
2688     return (STMT_VINFO_STRIDED_P (stmt_info)
2689             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2690                 || vect_analyze_group_access (dr)));
2691
2692   /* Not consecutive access - check if it's a part of interleaving group.  */
2693   return vect_analyze_group_access (dr);
2694 }
2695
2696 /* Compare two data-references DRA and DRB to group them into chunks
2697    suitable for grouping.  */
2698
2699 static int
2700 dr_group_sort_cmp (const void *dra_, const void *drb_)
2701 {
2702   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2703   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2704   int cmp;
2705
2706   /* Stabilize sort.  */
2707   if (dra == drb)
2708     return 0;
2709
2710   /* DRs in different loops never belong to the same group.  */
2711   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2712   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2713   if (loopa != loopb)
2714     return loopa->num < loopb->num ? -1 : 1;
2715
2716   /* Ordering of DRs according to base.  */
2717   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2718     {
2719       cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2720                                    DR_BASE_ADDRESS (drb));
2721       if (cmp != 0)
2722         return cmp;
2723     }
2724
2725   /* And according to DR_OFFSET.  */
2726   if (!dr_equal_offsets_p (dra, drb))
2727     {
2728       cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2729       if (cmp != 0)
2730         return cmp;
2731     }
2732
2733   /* Put reads before writes.  */
2734   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2735     return DR_IS_READ (dra) ? -1 : 1;
2736
2737   /* Then sort after access size.  */
2738   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2739                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2740     {
2741       cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2742                                    TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2743       if (cmp != 0)
2744         return cmp;
2745     }
2746
2747   /* And after step.  */
2748   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2749     {
2750       cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2751       if (cmp != 0)
2752         return cmp;
2753     }
2754
2755   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2756   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2757   if (cmp == 0)
2758     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2759   return cmp;
2760 }
2761
2762 /* Function vect_analyze_data_ref_accesses.
2763
2764    Analyze the access pattern of all the data references in the loop.
2765
2766    FORNOW: the only access pattern that is considered vectorizable is a
2767            simple step 1 (consecutive) access.
2768
2769    FORNOW: handle only arrays and pointer accesses.  */
2770
2771 bool
2772 vect_analyze_data_ref_accesses (vec_info *vinfo)
2773 {
2774   unsigned int i;
2775   vec<data_reference_p> datarefs = vinfo->datarefs;
2776   struct data_reference *dr;
2777
2778   if (dump_enabled_p ())
2779     dump_printf_loc (MSG_NOTE, vect_location,
2780                      "=== vect_analyze_data_ref_accesses ===\n");
2781
2782   if (datarefs.is_empty ())
2783     return true;
2784
2785   /* Sort the array of datarefs to make building the interleaving chains
2786      linear.  Don't modify the original vector's order, it is needed for
2787      determining what dependencies are reversed.  */
2788   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2789   datarefs_copy.qsort (dr_group_sort_cmp);
2790
2791   /* Build the interleaving chains.  */
2792   for (i = 0; i < datarefs_copy.length () - 1;)
2793     {
2794       data_reference_p dra = datarefs_copy[i];
2795       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2796       stmt_vec_info lastinfo = NULL;
2797       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2798         {
2799           ++i;
2800           continue;
2801         }
2802       for (i = i + 1; i < datarefs_copy.length (); ++i)
2803         {
2804           data_reference_p drb = datarefs_copy[i];
2805           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2806           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2807             break;
2808
2809           /* ???  Imperfect sorting (non-compatible types, non-modulo
2810              accesses, same accesses) can lead to a group to be artificially
2811              split here as we don't just skip over those.  If it really
2812              matters we can push those to a worklist and re-iterate
2813              over them.  The we can just skip ahead to the next DR here.  */
2814
2815           /* DRs in a different loop should not be put into the same
2816              interleaving group.  */
2817           if (gimple_bb (DR_STMT (dra))->loop_father
2818               != gimple_bb (DR_STMT (drb))->loop_father)
2819             break;
2820
2821           /* Check that the data-refs have same first location (except init)
2822              and they are both either store or load (not load and store,
2823              not masked loads or stores).  */
2824           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2825               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2826                                    DR_BASE_ADDRESS (drb), 0)
2827               || !dr_equal_offsets_p (dra, drb)
2828               || !gimple_assign_single_p (DR_STMT (dra))
2829               || !gimple_assign_single_p (DR_STMT (drb)))
2830             break;
2831
2832           /* Check that the data-refs have the same constant size.  */
2833           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2834           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2835           if (!tree_fits_uhwi_p (sza)
2836               || !tree_fits_uhwi_p (szb)
2837               || !tree_int_cst_equal (sza, szb))
2838             break;
2839
2840           /* Check that the data-refs have the same step.  */
2841           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2842             break;
2843
2844           /* Do not place the same access in the interleaving chain twice.  */
2845           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2846             break;
2847
2848           /* Check the types are compatible.
2849              ???  We don't distinguish this during sorting.  */
2850           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2851                                    TREE_TYPE (DR_REF (drb))))
2852             break;
2853
2854           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2855           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2856           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2857           gcc_assert (init_a <= init_b);
2858
2859           /* If init_b == init_a + the size of the type * k, we have an
2860              interleaving, and DRA is accessed before DRB.  */
2861           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2862           if (type_size_a == 0
2863               || (init_b - init_a) % type_size_a != 0)
2864             break;
2865
2866           /* If we have a store, the accesses are adjacent.  This splits
2867              groups into chunks we support (we don't support vectorization
2868              of stores with gaps).  */
2869           if (!DR_IS_READ (dra)
2870               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2871                                              (DR_INIT (datarefs_copy[i-1]))
2872                   != type_size_a))
2873             break;
2874
2875           /* If the step (if not zero or non-constant) is greater than the
2876              difference between data-refs' inits this splits groups into
2877              suitable sizes.  */
2878           if (tree_fits_shwi_p (DR_STEP (dra)))
2879             {
2880               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2881               if (step != 0 && step <= (init_b - init_a))
2882                 break;
2883             }
2884
2885           if (dump_enabled_p ())
2886             {
2887               dump_printf_loc (MSG_NOTE, vect_location,
2888                                "Detected interleaving ");
2889               if (DR_IS_READ (dra))
2890                 dump_printf (MSG_NOTE, "load ");
2891               else
2892                 dump_printf (MSG_NOTE, "store ");
2893               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2894               dump_printf (MSG_NOTE,  " and ");
2895               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2896               dump_printf (MSG_NOTE, "\n");
2897             }
2898
2899           /* Link the found element into the group list.  */
2900           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2901             {
2902               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2903               lastinfo = stmtinfo_a;
2904             }
2905           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2906           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2907           lastinfo = stmtinfo_b;
2908         }
2909     }
2910
2911   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2912     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2913         && !vect_analyze_data_ref_access (dr))
2914       {
2915         if (dump_enabled_p ())
2916           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2917                            "not vectorized: complicated access pattern.\n");
2918
2919         if (is_a <bb_vec_info> (vinfo))
2920           {
2921             /* Mark the statement as not vectorizable.  */
2922             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2923             continue;
2924           }
2925         else
2926           {
2927             datarefs_copy.release ();
2928             return false;
2929           }
2930       }
2931
2932   datarefs_copy.release ();
2933   return true;
2934 }
2935
2936 /* Function vect_vfa_segment_size.
2937
2938    Create an expression that computes the size of segment
2939    that will be accessed for a data reference.  The functions takes into
2940    account that realignment loads may access one more vector.
2941
2942    Input:
2943      DR: The data reference.
2944      LENGTH_FACTOR: segment length to consider.
2945
2946    Return an expression whose value is the size of segment which will be
2947    accessed by DR.  */
2948
2949 static tree
2950 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2951 {
2952   tree segment_length;
2953
2954   if (integer_zerop (DR_STEP (dr)))
2955     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2956   else
2957     segment_length = size_binop (MULT_EXPR,
2958                                  fold_convert (sizetype, DR_STEP (dr)),
2959                                  fold_convert (sizetype, length_factor));
2960
2961   if (vect_supportable_dr_alignment (dr, false)
2962         == dr_explicit_realign_optimized)
2963     {
2964       tree vector_size = TYPE_SIZE_UNIT
2965                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2966
2967       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2968     }
2969   return segment_length;
2970 }
2971
2972 /* Function vect_no_alias_p.
2973
2974    Given data references A and B with equal base and offset, the alias
2975    relation can be decided at compilation time, return TRUE if they do
2976    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2977    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2978    respectively.  */
2979
2980 static bool
2981 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2982                  tree segment_length_a, tree segment_length_b)
2983 {
2984   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2985               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2986   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2987     return false;
2988
2989   tree seg_a_min = DR_INIT (a);
2990   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2991                                 seg_a_min, segment_length_a);
2992   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2993      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2994      [a, a+12) */
2995   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2996     {
2997       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2998       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2999                                seg_a_max, unit_size);
3000       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3001                                DR_INIT (a), unit_size);
3002     }
3003   tree seg_b_min = DR_INIT (b);
3004   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3005                                 seg_b_min, segment_length_b);
3006   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3007     {
3008       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3009       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3010                                seg_b_max, unit_size);
3011       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3012                                DR_INIT (b), unit_size);
3013     }
3014
3015   if (tree_int_cst_le (seg_a_max, seg_b_min)
3016       || tree_int_cst_le (seg_b_max, seg_a_min))
3017     return true;
3018
3019   return false;
3020 }
3021
3022 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3023    in DDR is >= VF.  */
3024
3025 static bool
3026 dependence_distance_ge_vf (data_dependence_relation *ddr,
3027                            unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3028 {
3029   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3030       || DDR_NUM_DIST_VECTS (ddr) == 0)
3031     return false;
3032
3033   /* If the dependence is exact, we should have limited the VF instead.  */
3034   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3035
3036   unsigned int i;
3037   lambda_vector dist_v;
3038   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3039     {
3040       HOST_WIDE_INT dist = dist_v[loop_depth];
3041       if (dist != 0
3042           && !(dist > 0 && DDR_REVERSED_P (ddr))
3043           && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3044         return false;
3045     }
3046
3047   if (dump_enabled_p ())
3048     {
3049       dump_printf_loc (MSG_NOTE, vect_location,
3050                        "dependence distance between ");
3051       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3052       dump_printf (MSG_NOTE,  " and ");
3053       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3054       dump_printf (MSG_NOTE,  " is >= VF\n");
3055     }
3056
3057   return true;
3058 }
3059
3060 /* Function vect_prune_runtime_alias_test_list.
3061
3062    Prune a list of ddrs to be tested at run-time by versioning for alias.
3063    Merge several alias checks into one if possible.
3064    Return FALSE if resulting list of ddrs is longer then allowed by
3065    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3066
3067 bool
3068 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3069 {
3070   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3071   hash_set <tree_pair_hash> compared_objects;
3072
3073   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3074   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3075     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3076   vec<vec_object_pair> &check_unequal_addrs
3077     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3078   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3079   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3080
3081   ddr_p ddr;
3082   unsigned int i;
3083   tree length_factor;
3084
3085   if (dump_enabled_p ())
3086     dump_printf_loc (MSG_NOTE, vect_location,
3087                      "=== vect_prune_runtime_alias_test_list ===\n");
3088
3089   if (may_alias_ddrs.is_empty ())
3090     return true;
3091
3092   comp_alias_ddrs.create (may_alias_ddrs.length ());
3093
3094   unsigned int loop_depth
3095     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3096                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3097
3098   /* First, we collect all data ref pairs for aliasing checks.  */
3099   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3100     {
3101       int comp_res;
3102       struct data_reference *dr_a, *dr_b;
3103       gimple *dr_group_first_a, *dr_group_first_b;
3104       tree segment_length_a, segment_length_b;
3105       gimple *stmt_a, *stmt_b;
3106
3107       /* Ignore the alias if the VF we chose ended up being no greater
3108          than the dependence distance.  */
3109       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3110         continue;
3111
3112       if (DDR_OBJECT_A (ddr))
3113         {
3114           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3115           if (!compared_objects.add (new_pair))
3116             {
3117               if (dump_enabled_p ())
3118                 {
3119                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3120                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3121                   dump_printf (MSG_NOTE, " and ");
3122                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3123                   dump_printf (MSG_NOTE, " have different addresses\n");
3124                 }
3125               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3126             }
3127           continue;
3128         }
3129
3130       dr_a = DDR_A (ddr);
3131       stmt_a = DR_STMT (DDR_A (ddr));
3132       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3133       if (dr_group_first_a)
3134         {
3135           stmt_a = dr_group_first_a;
3136           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3137         }
3138
3139       dr_b = DDR_B (ddr);
3140       stmt_b = DR_STMT (DDR_B (ddr));
3141       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3142       if (dr_group_first_b)
3143         {
3144           stmt_b = dr_group_first_b;
3145           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3146         }
3147
3148       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3149         length_factor = scalar_loop_iters;
3150       else
3151         length_factor = size_int (vect_factor);
3152       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3153       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3154
3155       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3156                                         DR_BASE_ADDRESS (dr_b));
3157       if (comp_res == 0)
3158         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3159                                           DR_OFFSET (dr_b));
3160
3161       /* Alias is known at compilation time.  */
3162       if (comp_res == 0
3163           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3164           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3165           && TREE_CODE (segment_length_a) == INTEGER_CST
3166           && TREE_CODE (segment_length_b) == INTEGER_CST)
3167         {
3168           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3169             continue;
3170
3171           if (dump_enabled_p ())
3172             dump_printf_loc (MSG_NOTE, vect_location,
3173                              "not vectorized: compilation time alias.\n");
3174
3175           return false;
3176         }
3177
3178       dr_with_seg_len_pair_t dr_with_seg_len_pair
3179           (dr_with_seg_len (dr_a, segment_length_a),
3180            dr_with_seg_len (dr_b, segment_length_b));
3181
3182       /* Canonicalize pairs by sorting the two DR members.  */
3183       if (comp_res > 0)
3184         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3185
3186       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3187     }
3188
3189   prune_runtime_alias_test_list (&comp_alias_ddrs,
3190                                  (unsigned HOST_WIDE_INT) vect_factor);
3191
3192   unsigned int count = (comp_alias_ddrs.length ()
3193                         + check_unequal_addrs.length ());
3194   dump_printf_loc (MSG_NOTE, vect_location,
3195                    "improved number of alias checks from %d to %d\n",
3196                    may_alias_ddrs.length (), count);
3197   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3198     {
3199       if (dump_enabled_p ())
3200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201                          "number of versioning for alias "
3202                          "run-time tests exceeds %d "
3203                          "(--param vect-max-version-for-alias-checks)\n",
3204                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3205       return false;
3206     }
3207
3208   return true;
3209 }
3210
3211 /* Return true if a non-affine read or write in STMT is suitable for a
3212    gather load or scatter store.  Describe the operation in *INFO if so.  */
3213
3214 bool
3215 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3216                            gather_scatter_info *info)
3217 {
3218   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3219   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3220   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3221   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3222   tree offtype = NULL_TREE;
3223   tree decl, base, off;
3224   machine_mode pmode;
3225   int punsignedp, reversep, pvolatilep = 0;
3226
3227   base = DR_REF (dr);
3228   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3229      see if we can use the def stmt of the address.  */
3230   if (is_gimple_call (stmt)
3231       && gimple_call_internal_p (stmt)
3232       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3233           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3234       && TREE_CODE (base) == MEM_REF
3235       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3236       && integer_zerop (TREE_OPERAND (base, 1))
3237       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3238     {
3239       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3240       if (is_gimple_assign (def_stmt)
3241           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3242         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3243     }
3244
3245   /* The gather and scatter builtins need address of the form
3246      loop_invariant + vector * {1, 2, 4, 8}
3247      or
3248      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3249      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3250      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3251      multiplications and additions in it.  To get a vector, we need
3252      a single SSA_NAME that will be defined in the loop and will
3253      contain everything that is not loop invariant and that can be
3254      vectorized.  The following code attempts to find such a preexistng
3255      SSA_NAME OFF and put the loop invariants into a tree BASE
3256      that can be gimplified before the loop.  */
3257   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3258                               &punsignedp, &reversep, &pvolatilep);
3259   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3260
3261   if (TREE_CODE (base) == MEM_REF)
3262     {
3263       if (!integer_zerop (TREE_OPERAND (base, 1)))
3264         {
3265           if (off == NULL_TREE)
3266             {
3267               offset_int moff = mem_ref_offset (base);
3268               off = wide_int_to_tree (sizetype, moff);
3269             }
3270           else
3271             off = size_binop (PLUS_EXPR, off,
3272                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3273         }
3274       base = TREE_OPERAND (base, 0);
3275     }
3276   else
3277     base = build_fold_addr_expr (base);
3278
3279   if (off == NULL_TREE)
3280     off = size_zero_node;
3281
3282   /* If base is not loop invariant, either off is 0, then we start with just
3283      the constant offset in the loop invariant BASE and continue with base
3284      as OFF, otherwise give up.
3285      We could handle that case by gimplifying the addition of base + off
3286      into some SSA_NAME and use that as off, but for now punt.  */
3287   if (!expr_invariant_in_loop_p (loop, base))
3288     {
3289       if (!integer_zerop (off))
3290         return false;
3291       off = base;
3292       base = size_int (pbitpos / BITS_PER_UNIT);
3293     }
3294   /* Otherwise put base + constant offset into the loop invariant BASE
3295      and continue with OFF.  */
3296   else
3297     {
3298       base = fold_convert (sizetype, base);
3299       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3300     }
3301
3302   /* OFF at this point may be either a SSA_NAME or some tree expression
3303      from get_inner_reference.  Try to peel off loop invariants from it
3304      into BASE as long as possible.  */
3305   STRIP_NOPS (off);
3306   while (offtype == NULL_TREE)
3307     {
3308       enum tree_code code;
3309       tree op0, op1, add = NULL_TREE;
3310
3311       if (TREE_CODE (off) == SSA_NAME)
3312         {
3313           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3314
3315           if (expr_invariant_in_loop_p (loop, off))
3316             return false;
3317
3318           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3319             break;
3320
3321           op0 = gimple_assign_rhs1 (def_stmt);
3322           code = gimple_assign_rhs_code (def_stmt);
3323           op1 = gimple_assign_rhs2 (def_stmt);
3324         }
3325       else
3326         {
3327           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3328             return false;
3329           code = TREE_CODE (off);
3330           extract_ops_from_tree (off, &code, &op0, &op1);
3331         }
3332       switch (code)
3333         {
3334         case POINTER_PLUS_EXPR:
3335         case PLUS_EXPR:
3336           if (expr_invariant_in_loop_p (loop, op0))
3337             {
3338               add = op0;
3339               off = op1;
3340             do_add:
3341               add = fold_convert (sizetype, add);
3342               if (scale != 1)
3343                 add = size_binop (MULT_EXPR, add, size_int (scale));
3344               base = size_binop (PLUS_EXPR, base, add);
3345               continue;
3346             }
3347           if (expr_invariant_in_loop_p (loop, op1))
3348             {
3349               add = op1;
3350               off = op0;
3351               goto do_add;
3352             }
3353           break;
3354         case MINUS_EXPR:
3355           if (expr_invariant_in_loop_p (loop, op1))
3356             {
3357               add = fold_convert (sizetype, op1);
3358               add = size_binop (MINUS_EXPR, size_zero_node, add);
3359               off = op0;
3360               goto do_add;
3361             }
3362           break;
3363         case MULT_EXPR:
3364           if (scale == 1 && tree_fits_shwi_p (op1))
3365             {
3366               scale = tree_to_shwi (op1);
3367               off = op0;
3368               continue;
3369             }
3370           break;
3371         case SSA_NAME:
3372           off = op0;
3373           continue;
3374         CASE_CONVERT:
3375           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3376               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3377             break;
3378           if (TYPE_PRECISION (TREE_TYPE (op0))
3379               == TYPE_PRECISION (TREE_TYPE (off)))
3380             {
3381               off = op0;
3382               continue;
3383             }
3384           if (TYPE_PRECISION (TREE_TYPE (op0))
3385               < TYPE_PRECISION (TREE_TYPE (off)))
3386             {
3387               off = op0;
3388               offtype = TREE_TYPE (off);
3389               STRIP_NOPS (off);
3390               continue;
3391             }
3392           break;
3393         default:
3394           break;
3395         }
3396       break;
3397     }
3398
3399   /* If at the end OFF still isn't a SSA_NAME or isn't
3400      defined in the loop, punt.  */
3401   if (TREE_CODE (off) != SSA_NAME
3402       || expr_invariant_in_loop_p (loop, off))
3403     return false;
3404
3405   if (offtype == NULL_TREE)
3406     offtype = TREE_TYPE (off);
3407
3408   if (DR_IS_READ (dr))
3409     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3410                                              offtype, scale);
3411   else
3412     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3413                                               offtype, scale);
3414
3415   if (decl == NULL_TREE)
3416     return false;
3417
3418   info->decl = decl;
3419   info->base = base;
3420   info->offset = off;
3421   info->offset_dt = vect_unknown_def_type;
3422   info->offset_vectype = NULL_TREE;
3423   info->scale = scale;
3424   return true;
3425 }
3426
3427 /* Function vect_analyze_data_refs.
3428
3429   Find all the data references in the loop or basic block.
3430
3431    The general structure of the analysis of data refs in the vectorizer is as
3432    follows:
3433    1- vect_analyze_data_refs(loop/bb): call
3434       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3435       in the loop/bb and their dependences.
3436    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3437    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3438    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3439
3440 */
3441
3442 bool
3443 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3444 {
3445   struct loop *loop = NULL;
3446   unsigned int i;
3447   struct data_reference *dr;
3448   tree scalar_type;
3449
3450   if (dump_enabled_p ())
3451     dump_printf_loc (MSG_NOTE, vect_location,
3452                      "=== vect_analyze_data_refs ===\n");
3453
3454   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3455     loop = LOOP_VINFO_LOOP (loop_vinfo);
3456
3457   /* Go through the data-refs, check that the analysis succeeded.  Update
3458      pointer from stmt_vec_info struct to DR and vectype.  */
3459
3460   vec<data_reference_p> datarefs = vinfo->datarefs;
3461   FOR_EACH_VEC_ELT (datarefs, i, dr)
3462     {
3463       gimple *stmt;
3464       stmt_vec_info stmt_info;
3465       tree base, offset, init;
3466       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3467       bool simd_lane_access = false;
3468       int vf;
3469
3470 again:
3471       if (!dr || !DR_REF (dr))
3472         {
3473           if (dump_enabled_p ())
3474             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3475                              "not vectorized: unhandled data-ref\n");
3476           return false;
3477         }
3478
3479       stmt = DR_STMT (dr);
3480       stmt_info = vinfo_for_stmt (stmt);
3481
3482       /* Discard clobbers from the dataref vector.  We will remove
3483          clobber stmts during vectorization.  */
3484       if (gimple_clobber_p (stmt))
3485         {
3486           free_data_ref (dr);
3487           if (i == datarefs.length () - 1)
3488             {
3489               datarefs.pop ();
3490               break;
3491             }
3492           datarefs.ordered_remove (i);
3493           dr = datarefs[i];
3494           goto again;
3495         }
3496
3497       /* Check that analysis of the data-ref succeeded.  */
3498       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3499           || !DR_STEP (dr))
3500         {
3501           bool maybe_gather
3502             = DR_IS_READ (dr)
3503               && !TREE_THIS_VOLATILE (DR_REF (dr))
3504               && targetm.vectorize.builtin_gather != NULL;
3505           bool maybe_scatter
3506             = DR_IS_WRITE (dr)
3507               && !TREE_THIS_VOLATILE (DR_REF (dr))
3508               && targetm.vectorize.builtin_scatter != NULL;
3509           bool maybe_simd_lane_access
3510             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3511
3512           /* If target supports vector gather loads or scatter stores, or if
3513              this might be a SIMD lane access, see if they can't be used.  */
3514           if (is_a <loop_vec_info> (vinfo)
3515               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3516               && !nested_in_vect_loop_p (loop, stmt))
3517             {
3518               struct data_reference *newdr
3519                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3520                                    DR_REF (dr), stmt, !maybe_scatter,
3521                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3522               gcc_assert (newdr != NULL && DR_REF (newdr));
3523               if (DR_BASE_ADDRESS (newdr)
3524                   && DR_OFFSET (newdr)
3525                   && DR_INIT (newdr)
3526                   && DR_STEP (newdr)
3527                   && integer_zerop (DR_STEP (newdr)))
3528                 {
3529                   if (maybe_simd_lane_access)
3530                     {
3531                       tree off = DR_OFFSET (newdr);
3532                       STRIP_NOPS (off);
3533                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3534                           && TREE_CODE (off) == MULT_EXPR
3535                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3536                         {
3537                           tree step = TREE_OPERAND (off, 1);
3538                           off = TREE_OPERAND (off, 0);
3539                           STRIP_NOPS (off);
3540                           if (CONVERT_EXPR_P (off)
3541                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3542                                                                           0)))
3543                                  < TYPE_PRECISION (TREE_TYPE (off)))
3544                             off = TREE_OPERAND (off, 0);
3545                           if (TREE_CODE (off) == SSA_NAME)
3546                             {
3547                               gimple *def = SSA_NAME_DEF_STMT (off);
3548                               tree reft = TREE_TYPE (DR_REF (newdr));
3549                               if (is_gimple_call (def)
3550                                   && gimple_call_internal_p (def)
3551                                   && (gimple_call_internal_fn (def)
3552                                       == IFN_GOMP_SIMD_LANE))
3553                                 {
3554                                   tree arg = gimple_call_arg (def, 0);
3555                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3556                                   arg = SSA_NAME_VAR (arg);
3557                                   if (arg == loop->simduid
3558                                       /* For now.  */
3559                                       && tree_int_cst_equal
3560                                            (TYPE_SIZE_UNIT (reft),
3561                                             step))
3562                                     {
3563                                       DR_OFFSET (newdr) = ssize_int (0);
3564                                       DR_STEP (newdr) = step;
3565                                       DR_OFFSET_ALIGNMENT (newdr)
3566                                         = BIGGEST_ALIGNMENT;
3567                                       DR_STEP_ALIGNMENT (newdr)
3568                                         = highest_pow2_factor (step);
3569                                       dr = newdr;
3570                                       simd_lane_access = true;
3571                                     }
3572                                 }
3573                             }
3574                         }
3575                     }
3576                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3577                     {
3578                       dr = newdr;
3579                       if (maybe_gather)
3580                         gatherscatter = GATHER;
3581                       else
3582                         gatherscatter = SCATTER;
3583                     }
3584                 }
3585               if (gatherscatter == SG_NONE && !simd_lane_access)
3586                 free_data_ref (newdr);
3587             }
3588
3589           if (gatherscatter == SG_NONE && !simd_lane_access)
3590             {
3591               if (dump_enabled_p ())
3592                 {
3593                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594                                    "not vectorized: data ref analysis "
3595                                    "failed ");
3596                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3597                 }
3598
3599               if (is_a <bb_vec_info> (vinfo))
3600                 break;
3601
3602               return false;
3603             }
3604         }
3605
3606       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3607         {
3608           if (dump_enabled_p ())
3609             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610                              "not vectorized: base addr of dr is a "
3611                              "constant\n");
3612
3613           if (is_a <bb_vec_info> (vinfo))
3614             break;
3615
3616           if (gatherscatter != SG_NONE || simd_lane_access)
3617             free_data_ref (dr);
3618           return false;
3619         }
3620
3621       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3622         {
3623           if (dump_enabled_p ())
3624             {
3625               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3626                                "not vectorized: volatile type ");
3627               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3628             }
3629
3630           if (is_a <bb_vec_info> (vinfo))
3631             break;
3632
3633           return false;
3634         }
3635
3636       if (stmt_can_throw_internal (stmt))
3637         {
3638           if (dump_enabled_p ())
3639             {
3640               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641                                "not vectorized: statement can throw an "
3642                                "exception ");
3643               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3644             }
3645
3646           if (is_a <bb_vec_info> (vinfo))
3647             break;
3648
3649           if (gatherscatter != SG_NONE || simd_lane_access)
3650             free_data_ref (dr);
3651           return false;
3652         }
3653
3654       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3655           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3656         {
3657           if (dump_enabled_p ())
3658             {
3659               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660                                "not vectorized: statement is bitfield "
3661                                "access ");
3662               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3663             }
3664
3665           if (is_a <bb_vec_info> (vinfo))
3666             break;
3667
3668           if (gatherscatter != SG_NONE || simd_lane_access)
3669             free_data_ref (dr);
3670           return false;
3671         }
3672
3673       base = unshare_expr (DR_BASE_ADDRESS (dr));
3674       offset = unshare_expr (DR_OFFSET (dr));
3675       init = unshare_expr (DR_INIT (dr));
3676
3677       if (is_gimple_call (stmt)
3678           && (!gimple_call_internal_p (stmt)
3679               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3680                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3681         {
3682           if (dump_enabled_p ())
3683             {
3684               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3685                                "not vectorized: dr in a call ");
3686               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3687             }
3688
3689           if (is_a <bb_vec_info> (vinfo))
3690             break;
3691
3692           if (gatherscatter != SG_NONE || simd_lane_access)
3693             free_data_ref (dr);
3694           return false;
3695         }
3696
3697       /* Update DR field in stmt_vec_info struct.  */
3698
3699       /* If the dataref is in an inner-loop of the loop that is considered for
3700          for vectorization, we also want to analyze the access relative to
3701          the outer-loop (DR contains information only relative to the
3702          inner-most enclosing loop).  We do that by building a reference to the
3703          first location accessed by the inner-loop, and analyze it relative to
3704          the outer-loop.  */
3705       if (loop && nested_in_vect_loop_p (loop, stmt))
3706         {
3707           /* Build a reference to the first location accessed by the
3708              inner loop: *(BASE + INIT + OFFSET).  By construction,
3709              this address must be invariant in the inner loop, so we
3710              can consider it as being used in the outer loop.  */
3711           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3712                                           init, offset);
3713           tree init_addr = fold_build_pointer_plus (base, init_offset);
3714           tree init_ref = build_fold_indirect_ref (init_addr);
3715
3716           if (dump_enabled_p ())
3717             {
3718               dump_printf_loc (MSG_NOTE, vect_location,
3719                                "analyze in outer loop: ");
3720               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3721               dump_printf (MSG_NOTE, "\n");
3722             }
3723
3724           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3725                                      init_ref, loop))
3726             /* dr_analyze_innermost already explained the failure.  */
3727             return false;
3728
3729           if (dump_enabled_p ())
3730             {
3731               dump_printf_loc (MSG_NOTE, vect_location,
3732                                "\touter base_address: ");
3733               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3735               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3736               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737                                  STMT_VINFO_DR_OFFSET (stmt_info));
3738               dump_printf (MSG_NOTE,
3739                            "\n\touter constant offset from base address: ");
3740               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3741                                  STMT_VINFO_DR_INIT (stmt_info));
3742               dump_printf (MSG_NOTE, "\n\touter step: ");
3743               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3744                                  STMT_VINFO_DR_STEP (stmt_info));
3745               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3746                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3747               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3748                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3749               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3750                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3751               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3752                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3753             }
3754         }
3755
3756       if (STMT_VINFO_DATA_REF (stmt_info))
3757         {
3758           if (dump_enabled_p ())
3759             {
3760               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761                                "not vectorized: more than one data ref "
3762                                "in stmt: ");
3763               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3764             }
3765
3766           if (is_a <bb_vec_info> (vinfo))
3767             break;
3768
3769           if (gatherscatter != SG_NONE || simd_lane_access)
3770             free_data_ref (dr);
3771           return false;
3772         }
3773
3774       STMT_VINFO_DATA_REF (stmt_info) = dr;
3775       if (simd_lane_access)
3776         {
3777           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3778           free_data_ref (datarefs[i]);
3779           datarefs[i] = dr;
3780         }
3781
3782       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3783           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3784           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3785         {
3786           if (dump_enabled_p ())
3787             {
3788               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789                                "not vectorized: base object not addressable "
3790                                "for stmt: ");
3791               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792             }
3793           if (is_a <bb_vec_info> (vinfo))
3794             {
3795               /* In BB vectorization the ref can still participate
3796                  in dependence analysis, we just can't vectorize it.  */
3797               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3798               continue;
3799             }
3800           return false;
3801         }
3802
3803       /* Set vectype for STMT.  */
3804       scalar_type = TREE_TYPE (DR_REF (dr));
3805       STMT_VINFO_VECTYPE (stmt_info)
3806         = get_vectype_for_scalar_type (scalar_type);
3807       if (!STMT_VINFO_VECTYPE (stmt_info))
3808         {
3809           if (dump_enabled_p ())
3810             {
3811               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3812                                "not vectorized: no vectype for stmt: ");
3813               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3814               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3815               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3816                                  scalar_type);
3817               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3818             }
3819
3820           if (is_a <bb_vec_info> (vinfo))
3821             {
3822               /* No vector type is fine, the ref can still participate
3823                  in dependence analysis, we just can't vectorize it.  */
3824               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825               continue;
3826             }
3827
3828           if (gatherscatter != SG_NONE || simd_lane_access)
3829             {
3830               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3831               if (gatherscatter != SG_NONE)
3832                 free_data_ref (dr);
3833             }
3834           return false;
3835         }
3836       else
3837         {
3838           if (dump_enabled_p ())
3839             {
3840               dump_printf_loc (MSG_NOTE, vect_location,
3841                                "got vectype for stmt: ");
3842               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3843               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3844                                  STMT_VINFO_VECTYPE (stmt_info));
3845               dump_printf (MSG_NOTE, "\n");
3846             }
3847         }
3848
3849       /* Adjust the minimal vectorization factor according to the
3850          vector type.  */
3851       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3852       if (vf > *min_vf)
3853         *min_vf = vf;
3854
3855       if (gatherscatter != SG_NONE)
3856         {
3857           gather_scatter_info gs_info;
3858           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3859                                           &gs_info)
3860               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3861             {
3862               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3863               free_data_ref (dr);
3864               if (dump_enabled_p ())
3865                 {
3866                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867                                    (gatherscatter == GATHER) ?
3868                                    "not vectorized: not suitable for gather "
3869                                    "load " :
3870                                    "not vectorized: not suitable for scatter "
3871                                    "store ");
3872                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3873                 }
3874               return false;
3875             }
3876
3877           free_data_ref (datarefs[i]);
3878           datarefs[i] = dr;
3879           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3880         }
3881
3882       else if (is_a <loop_vec_info> (vinfo)
3883                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3884         {
3885           if (nested_in_vect_loop_p (loop, stmt))
3886             {
3887               if (dump_enabled_p ())
3888                 {
3889                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3890                                    "not vectorized: not suitable for strided "
3891                                    "load ");
3892                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3893                 }
3894               return false;
3895             }
3896           STMT_VINFO_STRIDED_P (stmt_info) = true;
3897         }
3898     }
3899
3900   /* If we stopped analysis at the first dataref we could not analyze
3901      when trying to vectorize a basic-block mark the rest of the datarefs
3902      as not vectorizable and truncate the vector of datarefs.  That
3903      avoids spending useless time in analyzing their dependence.  */
3904   if (i != datarefs.length ())
3905     {
3906       gcc_assert (is_a <bb_vec_info> (vinfo));
3907       for (unsigned j = i; j < datarefs.length (); ++j)
3908         {
3909           data_reference_p dr = datarefs[j];
3910           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3911           free_data_ref (dr);
3912         }
3913       datarefs.truncate (i);
3914     }
3915
3916   return true;
3917 }
3918
3919
3920 /* Function vect_get_new_vect_var.
3921
3922    Returns a name for a new variable.  The current naming scheme appends the
3923    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3924    the name of vectorizer generated variables, and appends that to NAME if
3925    provided.  */
3926
3927 tree
3928 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3929 {
3930   const char *prefix;
3931   tree new_vect_var;
3932
3933   switch (var_kind)
3934   {
3935   case vect_simple_var:
3936     prefix = "vect";
3937     break;
3938   case vect_scalar_var:
3939     prefix = "stmp";
3940     break;
3941   case vect_mask_var:
3942     prefix = "mask";
3943     break;
3944   case vect_pointer_var:
3945     prefix = "vectp";
3946     break;
3947   default:
3948     gcc_unreachable ();
3949   }
3950
3951   if (name)
3952     {
3953       char* tmp = concat (prefix, "_", name, NULL);
3954       new_vect_var = create_tmp_reg (type, tmp);
3955       free (tmp);
3956     }
3957   else
3958     new_vect_var = create_tmp_reg (type, prefix);
3959
3960   return new_vect_var;
3961 }
3962
3963 /* Like vect_get_new_vect_var but return an SSA name.  */
3964
3965 tree
3966 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3967 {
3968   const char *prefix;
3969   tree new_vect_var;
3970
3971   switch (var_kind)
3972   {
3973   case vect_simple_var:
3974     prefix = "vect";
3975     break;
3976   case vect_scalar_var:
3977     prefix = "stmp";
3978     break;
3979   case vect_pointer_var:
3980     prefix = "vectp";
3981     break;
3982   default:
3983     gcc_unreachable ();
3984   }
3985
3986   if (name)
3987     {
3988       char* tmp = concat (prefix, "_", name, NULL);
3989       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3990       free (tmp);
3991     }
3992   else
3993     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3994
3995   return new_vect_var;
3996 }
3997
3998 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3999
4000 static void
4001 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4002                                   stmt_vec_info stmt_info)
4003 {
4004   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4005   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4006   int misalign = DR_MISALIGNMENT (dr);
4007   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4008     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4009   else
4010     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4011 }
4012
4013 /* Function vect_create_addr_base_for_vector_ref.
4014
4015    Create an expression that computes the address of the first memory location
4016    that will be accessed for a data reference.
4017
4018    Input:
4019    STMT: The statement containing the data reference.
4020    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4021    OFFSET: Optional. If supplied, it is be added to the initial address.
4022    LOOP:    Specify relative to which loop-nest should the address be computed.
4023             For example, when the dataref is in an inner-loop nested in an
4024             outer-loop that is now being vectorized, LOOP can be either the
4025             outer-loop, or the inner-loop.  The first memory location accessed
4026             by the following dataref ('in' points to short):
4027
4028                 for (i=0; i<N; i++)
4029                    for (j=0; j<M; j++)
4030                      s += in[i+j]
4031
4032             is as follows:
4033             if LOOP=i_loop:     &in             (relative to i_loop)
4034             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4035    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4036             initial address.  Unlike OFFSET, which is number of elements to
4037             be added, BYTE_OFFSET is measured in bytes.
4038
4039    Output:
4040    1. Return an SSA_NAME whose value is the address of the memory location of
4041       the first vector of the data reference.
4042    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4043       these statement(s) which define the returned SSA_NAME.
4044
4045    FORNOW: We are only handling array accesses with step 1.  */
4046
4047 tree
4048 vect_create_addr_base_for_vector_ref (gimple *stmt,
4049                                       gimple_seq *new_stmt_list,
4050                                       tree offset,
4051                                       tree byte_offset)
4052 {
4053   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4054   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4055   const char *base_name;
4056   tree addr_base;
4057   tree dest;
4058   gimple_seq seq = NULL;
4059   tree vect_ptr_type;
4060   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4061   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4062   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4063
4064   tree data_ref_base = unshare_expr (drb->base_address);
4065   tree base_offset = unshare_expr (drb->offset);
4066   tree init = unshare_expr (drb->init);
4067
4068   if (loop_vinfo)
4069     base_name = get_name (data_ref_base);
4070   else
4071     {
4072       base_offset = ssize_int (0);
4073       init = ssize_int (0);
4074       base_name = get_name (DR_REF (dr));
4075     }
4076
4077   /* Create base_offset */
4078   base_offset = size_binop (PLUS_EXPR,
4079                             fold_convert (sizetype, base_offset),
4080                             fold_convert (sizetype, init));
4081
4082   if (offset)
4083     {
4084       offset = fold_build2 (MULT_EXPR, sizetype,
4085                             fold_convert (sizetype, offset), step);
4086       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4087                                  base_offset, offset);
4088     }
4089   if (byte_offset)
4090     {
4091       byte_offset = fold_convert (sizetype, byte_offset);
4092       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4093                                  base_offset, byte_offset);
4094     }
4095
4096   /* base + base_offset */
4097   if (loop_vinfo)
4098     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4099   else
4100     {
4101       addr_base = build1 (ADDR_EXPR,
4102                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4103                           unshare_expr (DR_REF (dr)));
4104     }
4105
4106   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4107   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4108   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4109   gimple_seq_add_seq (new_stmt_list, seq);
4110
4111   if (DR_PTR_INFO (dr)
4112       && TREE_CODE (addr_base) == SSA_NAME
4113       && !SSA_NAME_PTR_INFO (addr_base))
4114     {
4115       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4116       if (offset || byte_offset)
4117         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4118     }
4119
4120   if (dump_enabled_p ())
4121     {
4122       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4123       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4124       dump_printf (MSG_NOTE, "\n");
4125     }
4126
4127   return addr_base;
4128 }
4129
4130
4131 /* Function vect_create_data_ref_ptr.
4132
4133    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4134    location accessed in the loop by STMT, along with the def-use update
4135    chain to appropriately advance the pointer through the loop iterations.
4136    Also set aliasing information for the pointer.  This pointer is used by
4137    the callers to this function to create a memory reference expression for
4138    vector load/store access.
4139
4140    Input:
4141    1. STMT: a stmt that references memory. Expected to be of the form
4142          GIMPLE_ASSIGN <name, data-ref> or
4143          GIMPLE_ASSIGN <data-ref, name>.
4144    2. AGGR_TYPE: the type of the reference, which should be either a vector
4145         or an array.
4146    3. AT_LOOP: the loop where the vector memref is to be created.
4147    4. OFFSET (optional): an offset to be added to the initial address accessed
4148         by the data-ref in STMT.
4149    5. BSI: location where the new stmts are to be placed if there is no loop
4150    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4151         pointing to the initial address.
4152    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4153         to the initial address accessed by the data-ref in STMT.  This is
4154         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4155         in bytes.
4156
4157    Output:
4158    1. Declare a new ptr to vector_type, and have it point to the base of the
4159       data reference (initial addressed accessed by the data reference).
4160       For example, for vector of type V8HI, the following code is generated:
4161
4162       v8hi *ap;
4163       ap = (v8hi *)initial_address;
4164
4165       if OFFSET is not supplied:
4166          initial_address = &a[init];
4167       if OFFSET is supplied:
4168          initial_address = &a[init + OFFSET];
4169       if BYTE_OFFSET is supplied:
4170          initial_address = &a[init] + BYTE_OFFSET;
4171
4172       Return the initial_address in INITIAL_ADDRESS.
4173
4174    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4175       update the pointer in each iteration of the loop.
4176
4177       Return the increment stmt that updates the pointer in PTR_INCR.
4178
4179    3. Set INV_P to true if the access pattern of the data reference in the
4180       vectorized loop is invariant.  Set it to false otherwise.
4181
4182    4. Return the pointer.  */
4183
4184 tree
4185 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4186                           tree offset, tree *initial_address,
4187                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4188                           bool only_init, bool *inv_p, tree byte_offset)
4189 {
4190   const char *base_name;
4191   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4192   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4193   struct loop *loop = NULL;
4194   bool nested_in_vect_loop = false;
4195   struct loop *containing_loop = NULL;
4196   tree aggr_ptr_type;
4197   tree aggr_ptr;
4198   tree new_temp;
4199   gimple_seq new_stmt_list = NULL;
4200   edge pe = NULL;
4201   basic_block new_bb;
4202   tree aggr_ptr_init;
4203   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4204   tree aptr;
4205   gimple_stmt_iterator incr_gsi;
4206   bool insert_after;
4207   tree indx_before_incr, indx_after_incr;
4208   gimple *incr;
4209   tree step;
4210   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4211
4212   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4213               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4214
4215   if (loop_vinfo)
4216     {
4217       loop = LOOP_VINFO_LOOP (loop_vinfo);
4218       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4219       containing_loop = (gimple_bb (stmt))->loop_father;
4220       pe = loop_preheader_edge (loop);
4221     }
4222   else
4223     {
4224       gcc_assert (bb_vinfo);
4225       only_init = true;
4226       *ptr_incr = NULL;
4227     }
4228
4229   /* Check the step (evolution) of the load in LOOP, and record
4230      whether it's invariant.  */
4231   step = vect_dr_behavior (dr)->step;
4232   if (integer_zerop (step))
4233     *inv_p = true;
4234   else
4235     *inv_p = false;
4236
4237   /* Create an expression for the first address accessed by this load
4238      in LOOP.  */
4239   base_name = get_name (DR_BASE_ADDRESS (dr));
4240
4241   if (dump_enabled_p ())
4242     {
4243       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4244       dump_printf_loc (MSG_NOTE, vect_location,
4245                        "create %s-pointer variable to type: ",
4246                        get_tree_code_name (TREE_CODE (aggr_type)));
4247       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4248       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4249         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4250       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4251         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4252       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4253         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4254       else
4255         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4256       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4257       dump_printf (MSG_NOTE, "\n");
4258     }
4259
4260   /* (1) Create the new aggregate-pointer variable.
4261      Vector and array types inherit the alias set of their component
4262      type by default so we need to use a ref-all pointer if the data
4263      reference does not conflict with the created aggregated data
4264      reference because it is not addressable.  */
4265   bool need_ref_all = false;
4266   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4267                               get_alias_set (DR_REF (dr))))
4268     need_ref_all = true;
4269   /* Likewise for any of the data references in the stmt group.  */
4270   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4271     {
4272       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4273       do
4274         {
4275           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4276           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4277           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4278                                       get_alias_set (DR_REF (sdr))))
4279             {
4280               need_ref_all = true;
4281               break;
4282             }
4283           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4284         }
4285       while (orig_stmt);
4286     }
4287   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4288                                                need_ref_all);
4289   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4290
4291
4292   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4293      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4294      def-use update cycles for the pointer: one relative to the outer-loop
4295      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4296      to the inner-loop (which is the inner-most loop containing the dataref),
4297      and this is done be step (5) below.
4298
4299      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4300      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4301      redundant.  Steps (3),(4) create the following:
4302
4303         vp0 = &base_addr;
4304         LOOP:   vp1 = phi(vp0,vp2)
4305                 ...
4306                 ...
4307                 vp2 = vp1 + step
4308                 goto LOOP
4309
4310      If there is an inner-loop nested in loop, then step (5) will also be
4311      applied, and an additional update in the inner-loop will be created:
4312
4313         vp0 = &base_addr;
4314         LOOP:   vp1 = phi(vp0,vp2)
4315                 ...
4316         inner:     vp3 = phi(vp1,vp4)
4317                    vp4 = vp3 + inner_step
4318                    if () goto inner
4319                 ...
4320                 vp2 = vp1 + step
4321                 if () goto LOOP   */
4322
4323   /* (2) Calculate the initial address of the aggregate-pointer, and set
4324      the aggregate-pointer to point to it before the loop.  */
4325
4326   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4327
4328   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4329                                                    offset, byte_offset);
4330   if (new_stmt_list)
4331     {
4332       if (pe)
4333         {
4334           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4335           gcc_assert (!new_bb);
4336         }
4337       else
4338         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4339     }
4340
4341   *initial_address = new_temp;
4342   aggr_ptr_init = new_temp;
4343
4344   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4345      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4346      inner-loop nested in LOOP (during outer-loop vectorization).  */
4347
4348   /* No update in loop is required.  */
4349   if (only_init && (!loop_vinfo || at_loop == loop))
4350     aptr = aggr_ptr_init;
4351   else
4352     {
4353       /* The step of the aggregate pointer is the type size.  */
4354       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4355       /* One exception to the above is when the scalar step of the load in
4356          LOOP is zero. In this case the step here is also zero.  */
4357       if (*inv_p)
4358         iv_step = size_zero_node;
4359       else if (tree_int_cst_sgn (step) == -1)
4360         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4361
4362       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4363
4364       create_iv (aggr_ptr_init,
4365                  fold_convert (aggr_ptr_type, iv_step),
4366                  aggr_ptr, loop, &incr_gsi, insert_after,
4367                  &indx_before_incr, &indx_after_incr);
4368       incr = gsi_stmt (incr_gsi);
4369       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4370
4371       /* Copy the points-to information if it exists. */
4372       if (DR_PTR_INFO (dr))
4373         {
4374           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4375           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4376         }
4377       if (ptr_incr)
4378         *ptr_incr = incr;
4379
4380       aptr = indx_before_incr;
4381     }
4382
4383   if (!nested_in_vect_loop || only_init)
4384     return aptr;
4385
4386
4387   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4388      nested in LOOP, if exists.  */
4389
4390   gcc_assert (nested_in_vect_loop);
4391   if (!only_init)
4392     {
4393       standard_iv_increment_position (containing_loop, &incr_gsi,
4394                                       &insert_after);
4395       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4396                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4397                  &indx_after_incr);
4398       incr = gsi_stmt (incr_gsi);
4399       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4400
4401       /* Copy the points-to information if it exists. */
4402       if (DR_PTR_INFO (dr))
4403         {
4404           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4405           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4406         }
4407       if (ptr_incr)
4408         *ptr_incr = incr;
4409
4410       return indx_before_incr;
4411     }
4412   else
4413     gcc_unreachable ();
4414 }
4415
4416
4417 /* Function bump_vector_ptr
4418
4419    Increment a pointer (to a vector type) by vector-size. If requested,
4420    i.e. if PTR-INCR is given, then also connect the new increment stmt
4421    to the existing def-use update-chain of the pointer, by modifying
4422    the PTR_INCR as illustrated below:
4423
4424    The pointer def-use update-chain before this function:
4425                         DATAREF_PTR = phi (p_0, p_2)
4426                         ....
4427         PTR_INCR:       p_2 = DATAREF_PTR + step
4428
4429    The pointer def-use update-chain after this function:
4430                         DATAREF_PTR = phi (p_0, p_2)
4431                         ....
4432                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4433                         ....
4434         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4435
4436    Input:
4437    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4438                  in the loop.
4439    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4440               the loop.  The increment amount across iterations is expected
4441               to be vector_size.
4442    BSI - location where the new update stmt is to be placed.
4443    STMT - the original scalar memory-access stmt that is being vectorized.
4444    BUMP - optional. The offset by which to bump the pointer. If not given,
4445           the offset is assumed to be vector_size.
4446
4447    Output: Return NEW_DATAREF_PTR as illustrated above.
4448
4449 */
4450
4451 tree
4452 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4453                  gimple *stmt, tree bump)
4454 {
4455   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4456   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4457   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4458   tree update = TYPE_SIZE_UNIT (vectype);
4459   gassign *incr_stmt;
4460   ssa_op_iter iter;
4461   use_operand_p use_p;
4462   tree new_dataref_ptr;
4463
4464   if (bump)
4465     update = bump;
4466
4467   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4468     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4469   else
4470     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4471   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4472                                    dataref_ptr, update);
4473   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4474
4475   /* Copy the points-to information if it exists. */
4476   if (DR_PTR_INFO (dr))
4477     {
4478       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4479       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4480     }
4481
4482   if (!ptr_incr)
4483     return new_dataref_ptr;
4484
4485   /* Update the vector-pointer's cross-iteration increment.  */
4486   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4487     {
4488       tree use = USE_FROM_PTR (use_p);
4489
4490       if (use == dataref_ptr)
4491         SET_USE (use_p, new_dataref_ptr);
4492       else
4493         gcc_assert (tree_int_cst_compare (use, update) == 0);
4494     }
4495
4496   return new_dataref_ptr;
4497 }
4498
4499
4500 /* Function vect_create_destination_var.
4501
4502    Create a new temporary of type VECTYPE.  */
4503
4504 tree
4505 vect_create_destination_var (tree scalar_dest, tree vectype)
4506 {
4507   tree vec_dest;
4508   const char *name;
4509   char *new_name;
4510   tree type;
4511   enum vect_var_kind kind;
4512
4513   kind = vectype
4514     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4515     ? vect_mask_var
4516     : vect_simple_var
4517     : vect_scalar_var;
4518   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4519
4520   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4521
4522   name = get_name (scalar_dest);
4523   if (name)
4524     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4525   else
4526     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4527   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4528   free (new_name);
4529
4530   return vec_dest;
4531 }
4532
4533 /* Function vect_grouped_store_supported.
4534
4535    Returns TRUE if interleave high and interleave low permutations
4536    are supported, and FALSE otherwise.  */
4537
4538 bool
4539 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4540 {
4541   machine_mode mode = TYPE_MODE (vectype);
4542
4543   /* vect_permute_store_chain requires the group size to be equal to 3 or
4544      be a power of two.  */
4545   if (count != 3 && exact_log2 (count) == -1)
4546     {
4547       if (dump_enabled_p ())
4548         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4549                          "the size of the group of accesses"
4550                          " is not a power of 2 or not eqaul to 3\n");
4551       return false;
4552     }
4553
4554   /* Check that the permutation is supported.  */
4555   if (VECTOR_MODE_P (mode))
4556     {
4557       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4558       auto_vec_perm_indices sel (nelt);
4559       sel.quick_grow (nelt);
4560
4561       if (count == 3)
4562         {
4563           unsigned int j0 = 0, j1 = 0, j2 = 0;
4564           unsigned int i, j;
4565
4566           for (j = 0; j < 3; j++)
4567             {
4568               int nelt0 = ((3 - j) * nelt) % 3;
4569               int nelt1 = ((3 - j) * nelt + 1) % 3;
4570               int nelt2 = ((3 - j) * nelt + 2) % 3;
4571               for (i = 0; i < nelt; i++)
4572                 {
4573                   if (3 * i + nelt0 < nelt)
4574                     sel[3 * i + nelt0] = j0++;
4575                   if (3 * i + nelt1 < nelt)
4576                     sel[3 * i + nelt1] = nelt + j1++;
4577                   if (3 * i + nelt2 < nelt)
4578                     sel[3 * i + nelt2] = 0;
4579                 }
4580               if (!can_vec_perm_p (mode, false, &sel))
4581                 {
4582                   if (dump_enabled_p ())
4583                     dump_printf (MSG_MISSED_OPTIMIZATION,
4584                                  "permutaion op not supported by target.\n");
4585                   return false;
4586                 }
4587
4588               for (i = 0; i < nelt; i++)
4589                 {
4590                   if (3 * i + nelt0 < nelt)
4591                     sel[3 * i + nelt0] = 3 * i + nelt0;
4592                   if (3 * i + nelt1 < nelt)
4593                     sel[3 * i + nelt1] = 3 * i + nelt1;
4594                   if (3 * i + nelt2 < nelt)
4595                     sel[3 * i + nelt2] = nelt + j2++;
4596                 }
4597               if (!can_vec_perm_p (mode, false, &sel))
4598                 {
4599                   if (dump_enabled_p ())
4600                     dump_printf (MSG_MISSED_OPTIMIZATION,
4601                                  "permutaion op not supported by target.\n");
4602                   return false;
4603                 }
4604             }
4605           return true;
4606         }
4607       else
4608         {
4609           /* If length is not equal to 3 then only power of 2 is supported.  */
4610           gcc_assert (pow2p_hwi (count));
4611
4612           for (i = 0; i < nelt / 2; i++)
4613             {
4614               sel[i * 2] = i;
4615               sel[i * 2 + 1] = i + nelt;
4616             }
4617           if (can_vec_perm_p (mode, false, &sel))
4618             {
4619               for (i = 0; i < nelt; i++)
4620                 sel[i] += nelt / 2;
4621               if (can_vec_perm_p (mode, false, &sel))
4622                 return true;
4623             }
4624         }
4625     }
4626
4627   if (dump_enabled_p ())
4628     dump_printf (MSG_MISSED_OPTIMIZATION,
4629                  "permutaion op not supported by target.\n");
4630   return false;
4631 }
4632
4633
4634 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4635    type VECTYPE.  */
4636
4637 bool
4638 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4639 {
4640   return vect_lanes_optab_supported_p ("vec_store_lanes",
4641                                        vec_store_lanes_optab,
4642                                        vectype, count);
4643 }
4644
4645
4646 /* Function vect_permute_store_chain.
4647
4648    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4649    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4650    the data correctly for the stores.  Return the final references for stores
4651    in RESULT_CHAIN.
4652
4653    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4654    The input is 4 vectors each containing 8 elements.  We assign a number to
4655    each element, the input sequence is:
4656
4657    1st vec:   0  1  2  3  4  5  6  7
4658    2nd vec:   8  9 10 11 12 13 14 15
4659    3rd vec:  16 17 18 19 20 21 22 23
4660    4th vec:  24 25 26 27 28 29 30 31
4661
4662    The output sequence should be:
4663
4664    1st vec:  0  8 16 24  1  9 17 25
4665    2nd vec:  2 10 18 26  3 11 19 27
4666    3rd vec:  4 12 20 28  5 13 21 30
4667    4th vec:  6 14 22 30  7 15 23 31
4668
4669    i.e., we interleave the contents of the four vectors in their order.
4670
4671    We use interleave_high/low instructions to create such output.  The input of
4672    each interleave_high/low operation is two vectors:
4673    1st vec    2nd vec
4674    0 1 2 3    4 5 6 7
4675    the even elements of the result vector are obtained left-to-right from the
4676    high/low elements of the first vector.  The odd elements of the result are
4677    obtained left-to-right from the high/low elements of the second vector.
4678    The output of interleave_high will be:   0 4 1 5
4679    and of interleave_low:                   2 6 3 7
4680
4681
4682    The permutation is done in log LENGTH stages.  In each stage interleave_high
4683    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4684    where the first argument is taken from the first half of DR_CHAIN and the
4685    second argument from it's second half.
4686    In our example,
4687
4688    I1: interleave_high (1st vec, 3rd vec)
4689    I2: interleave_low (1st vec, 3rd vec)
4690    I3: interleave_high (2nd vec, 4th vec)
4691    I4: interleave_low (2nd vec, 4th vec)
4692
4693    The output for the first stage is:
4694
4695    I1:  0 16  1 17  2 18  3 19
4696    I2:  4 20  5 21  6 22  7 23
4697    I3:  8 24  9 25 10 26 11 27
4698    I4: 12 28 13 29 14 30 15 31
4699
4700    The output of the second stage, i.e. the final result is:
4701
4702    I1:  0  8 16 24  1  9 17 25
4703    I2:  2 10 18 26  3 11 19 27
4704    I3:  4 12 20 28  5 13 21 30
4705    I4:  6 14 22 30  7 15 23 31.  */
4706
4707 void
4708 vect_permute_store_chain (vec<tree> dr_chain,
4709                           unsigned int length,
4710                           gimple *stmt,
4711                           gimple_stmt_iterator *gsi,
4712                           vec<tree> *result_chain)
4713 {
4714   tree vect1, vect2, high, low;
4715   gimple *perm_stmt;
4716   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4717   tree perm_mask_low, perm_mask_high;
4718   tree data_ref;
4719   tree perm3_mask_low, perm3_mask_high;
4720   unsigned int i, n, log_length = exact_log2 (length);
4721   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4722
4723   auto_vec_perm_indices sel (nelt);
4724   sel.quick_grow (nelt);
4725
4726   result_chain->quick_grow (length);
4727   memcpy (result_chain->address (), dr_chain.address (),
4728           length * sizeof (tree));
4729
4730   if (length == 3)
4731     {
4732       unsigned int j0 = 0, j1 = 0, j2 = 0;
4733
4734       for (j = 0; j < 3; j++)
4735         {
4736           int nelt0 = ((3 - j) * nelt) % 3;
4737           int nelt1 = ((3 - j) * nelt + 1) % 3;
4738           int nelt2 = ((3 - j) * nelt + 2) % 3;
4739
4740           for (i = 0; i < nelt; i++)
4741             {
4742               if (3 * i + nelt0 < nelt)
4743                 sel[3 * i + nelt0] = j0++;
4744               if (3 * i + nelt1 < nelt)
4745                 sel[3 * i + nelt1] = nelt + j1++;
4746               if (3 * i + nelt2 < nelt)
4747                 sel[3 * i + nelt2] = 0;
4748             }
4749           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4750
4751           for (i = 0; i < nelt; i++)
4752             {
4753               if (3 * i + nelt0 < nelt)
4754                 sel[3 * i + nelt0] = 3 * i + nelt0;
4755               if (3 * i + nelt1 < nelt)
4756                 sel[3 * i + nelt1] = 3 * i + nelt1;
4757               if (3 * i + nelt2 < nelt)
4758                 sel[3 * i + nelt2] = nelt + j2++;
4759             }
4760           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4761
4762           vect1 = dr_chain[0];
4763           vect2 = dr_chain[1];
4764
4765           /* Create interleaving stmt:
4766              low = VEC_PERM_EXPR <vect1, vect2,
4767                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4768                                    j + 2, nelt + j + 2, *, ...}>  */
4769           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4770           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4771                                            vect2, perm3_mask_low);
4772           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4773
4774           vect1 = data_ref;
4775           vect2 = dr_chain[2];
4776           /* Create interleaving stmt:
4777              low = VEC_PERM_EXPR <vect1, vect2,
4778                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4779                                    6, 7, nelt + j + 2, ...}>  */
4780           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4781           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4782                                            vect2, perm3_mask_high);
4783           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4784           (*result_chain)[j] = data_ref;
4785         }
4786     }
4787   else
4788     {
4789       /* If length is not equal to 3 then only power of 2 is supported.  */
4790       gcc_assert (pow2p_hwi (length));
4791
4792       for (i = 0, n = nelt / 2; i < n; i++)
4793         {
4794           sel[i * 2] = i;
4795           sel[i * 2 + 1] = i + nelt;
4796         }
4797         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4798
4799         for (i = 0; i < nelt; i++)
4800           sel[i] += nelt / 2;
4801         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4802
4803         for (i = 0, n = log_length; i < n; i++)
4804           {
4805             for (j = 0; j < length/2; j++)
4806               {
4807                 vect1 = dr_chain[j];
4808                 vect2 = dr_chain[j+length/2];
4809
4810                 /* Create interleaving stmt:
4811                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4812                                                         ...}>  */
4813                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4814                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4815                                                  vect2, perm_mask_high);
4816                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4817                 (*result_chain)[2*j] = high;
4818
4819                 /* Create interleaving stmt:
4820                    low = VEC_PERM_EXPR <vect1, vect2,
4821                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4822                                          ...}>  */
4823                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4824                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4825                                                  vect2, perm_mask_low);
4826                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4827                 (*result_chain)[2*j+1] = low;
4828               }
4829             memcpy (dr_chain.address (), result_chain->address (),
4830                     length * sizeof (tree));
4831           }
4832     }
4833 }
4834
4835 /* Function vect_setup_realignment
4836
4837    This function is called when vectorizing an unaligned load using
4838    the dr_explicit_realign[_optimized] scheme.
4839    This function generates the following code at the loop prolog:
4840
4841       p = initial_addr;
4842    x  msq_init = *(floor(p));   # prolog load
4843       realignment_token = call target_builtin;
4844     loop:
4845    x  msq = phi (msq_init, ---)
4846
4847    The stmts marked with x are generated only for the case of
4848    dr_explicit_realign_optimized.
4849
4850    The code above sets up a new (vector) pointer, pointing to the first
4851    location accessed by STMT, and a "floor-aligned" load using that pointer.
4852    It also generates code to compute the "realignment-token" (if the relevant
4853    target hook was defined), and creates a phi-node at the loop-header bb
4854    whose arguments are the result of the prolog-load (created by this
4855    function) and the result of a load that takes place in the loop (to be
4856    created by the caller to this function).
4857
4858    For the case of dr_explicit_realign_optimized:
4859    The caller to this function uses the phi-result (msq) to create the
4860    realignment code inside the loop, and sets up the missing phi argument,
4861    as follows:
4862     loop:
4863       msq = phi (msq_init, lsq)
4864       lsq = *(floor(p'));        # load in loop
4865       result = realign_load (msq, lsq, realignment_token);
4866
4867    For the case of dr_explicit_realign:
4868     loop:
4869       msq = *(floor(p));        # load in loop
4870       p' = p + (VS-1);
4871       lsq = *(floor(p'));       # load in loop
4872       result = realign_load (msq, lsq, realignment_token);
4873
4874    Input:
4875    STMT - (scalar) load stmt to be vectorized. This load accesses
4876           a memory location that may be unaligned.
4877    BSI - place where new code is to be inserted.
4878    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4879                               is used.
4880
4881    Output:
4882    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4883                        target hook, if defined.
4884    Return value - the result of the loop-header phi node.  */
4885
4886 tree
4887 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4888                         tree *realignment_token,
4889                         enum dr_alignment_support alignment_support_scheme,
4890                         tree init_addr,
4891                         struct loop **at_loop)
4892 {
4893   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4894   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4895   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4896   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4897   struct loop *loop = NULL;
4898   edge pe = NULL;
4899   tree scalar_dest = gimple_assign_lhs (stmt);
4900   tree vec_dest;
4901   gimple *inc;
4902   tree ptr;
4903   tree data_ref;
4904   basic_block new_bb;
4905   tree msq_init = NULL_TREE;
4906   tree new_temp;
4907   gphi *phi_stmt;
4908   tree msq = NULL_TREE;
4909   gimple_seq stmts = NULL;
4910   bool inv_p;
4911   bool compute_in_loop = false;
4912   bool nested_in_vect_loop = false;
4913   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4914   struct loop *loop_for_initial_load = NULL;
4915
4916   if (loop_vinfo)
4917     {
4918       loop = LOOP_VINFO_LOOP (loop_vinfo);
4919       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4920     }
4921
4922   gcc_assert (alignment_support_scheme == dr_explicit_realign
4923               || alignment_support_scheme == dr_explicit_realign_optimized);
4924
4925   /* We need to generate three things:
4926      1. the misalignment computation
4927      2. the extra vector load (for the optimized realignment scheme).
4928      3. the phi node for the two vectors from which the realignment is
4929       done (for the optimized realignment scheme).  */
4930
4931   /* 1. Determine where to generate the misalignment computation.
4932
4933      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4934      calculation will be generated by this function, outside the loop (in the
4935      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4936      caller, inside the loop.
4937
4938      Background: If the misalignment remains fixed throughout the iterations of
4939      the loop, then both realignment schemes are applicable, and also the
4940      misalignment computation can be done outside LOOP.  This is because we are
4941      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4942      are a multiple of VS (the Vector Size), and therefore the misalignment in
4943      different vectorized LOOP iterations is always the same.
4944      The problem arises only if the memory access is in an inner-loop nested
4945      inside LOOP, which is now being vectorized using outer-loop vectorization.
4946      This is the only case when the misalignment of the memory access may not
4947      remain fixed throughout the iterations of the inner-loop (as explained in
4948      detail in vect_supportable_dr_alignment).  In this case, not only is the
4949      optimized realignment scheme not applicable, but also the misalignment
4950      computation (and generation of the realignment token that is passed to
4951      REALIGN_LOAD) have to be done inside the loop.
4952
4953      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4954      or not, which in turn determines if the misalignment is computed inside
4955      the inner-loop, or outside LOOP.  */
4956
4957   if (init_addr != NULL_TREE || !loop_vinfo)
4958     {
4959       compute_in_loop = true;
4960       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4961     }
4962
4963
4964   /* 2. Determine where to generate the extra vector load.
4965
4966      For the optimized realignment scheme, instead of generating two vector
4967      loads in each iteration, we generate a single extra vector load in the
4968      preheader of the loop, and in each iteration reuse the result of the
4969      vector load from the previous iteration.  In case the memory access is in
4970      an inner-loop nested inside LOOP, which is now being vectorized using
4971      outer-loop vectorization, we need to determine whether this initial vector
4972      load should be generated at the preheader of the inner-loop, or can be
4973      generated at the preheader of LOOP.  If the memory access has no evolution
4974      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4975      to be generated inside LOOP (in the preheader of the inner-loop).  */
4976
4977   if (nested_in_vect_loop)
4978     {
4979       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4980       bool invariant_in_outerloop =
4981             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4982       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4983     }
4984   else
4985     loop_for_initial_load = loop;
4986   if (at_loop)
4987     *at_loop = loop_for_initial_load;
4988
4989   if (loop_for_initial_load)
4990     pe = loop_preheader_edge (loop_for_initial_load);
4991
4992   /* 3. For the case of the optimized realignment, create the first vector
4993       load at the loop preheader.  */
4994
4995   if (alignment_support_scheme == dr_explicit_realign_optimized)
4996     {
4997       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4998       gassign *new_stmt;
4999
5000       gcc_assert (!compute_in_loop);
5001       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5002       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5003                                       NULL_TREE, &init_addr, NULL, &inc,
5004                                       true, &inv_p);
5005       if (TREE_CODE (ptr) == SSA_NAME)
5006         new_temp = copy_ssa_name (ptr);
5007       else
5008         new_temp = make_ssa_name (TREE_TYPE (ptr));
5009       new_stmt = gimple_build_assign
5010                    (new_temp, BIT_AND_EXPR, ptr,
5011                     build_int_cst (TREE_TYPE (ptr),
5012                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5013       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5014       gcc_assert (!new_bb);
5015       data_ref
5016         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5017                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5018       new_stmt = gimple_build_assign (vec_dest, data_ref);
5019       new_temp = make_ssa_name (vec_dest, new_stmt);
5020       gimple_assign_set_lhs (new_stmt, new_temp);
5021       if (pe)
5022         {
5023           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5024           gcc_assert (!new_bb);
5025         }
5026       else
5027          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5028
5029       msq_init = gimple_assign_lhs (new_stmt);
5030     }
5031
5032   /* 4. Create realignment token using a target builtin, if available.
5033       It is done either inside the containing loop, or before LOOP (as
5034       determined above).  */
5035
5036   if (targetm.vectorize.builtin_mask_for_load)
5037     {
5038       gcall *new_stmt;
5039       tree builtin_decl;
5040
5041       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5042       if (!init_addr)
5043         {
5044           /* Generate the INIT_ADDR computation outside LOOP.  */
5045           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5046                                                             NULL_TREE);
5047           if (loop)
5048             {
5049               pe = loop_preheader_edge (loop);
5050               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5051               gcc_assert (!new_bb);
5052             }
5053           else
5054              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5055         }
5056
5057       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5058       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5059       vec_dest =
5060         vect_create_destination_var (scalar_dest,
5061                                      gimple_call_return_type (new_stmt));
5062       new_temp = make_ssa_name (vec_dest, new_stmt);
5063       gimple_call_set_lhs (new_stmt, new_temp);
5064
5065       if (compute_in_loop)
5066         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5067       else
5068         {
5069           /* Generate the misalignment computation outside LOOP.  */
5070           pe = loop_preheader_edge (loop);
5071           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5072           gcc_assert (!new_bb);
5073         }
5074
5075       *realignment_token = gimple_call_lhs (new_stmt);
5076
5077       /* The result of the CALL_EXPR to this builtin is determined from
5078          the value of the parameter and no global variables are touched
5079          which makes the builtin a "const" function.  Requiring the
5080          builtin to have the "const" attribute makes it unnecessary
5081          to call mark_call_clobbered.  */
5082       gcc_assert (TREE_READONLY (builtin_decl));
5083     }
5084
5085   if (alignment_support_scheme == dr_explicit_realign)
5086     return msq;
5087
5088   gcc_assert (!compute_in_loop);
5089   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5090
5091
5092   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5093
5094   pe = loop_preheader_edge (containing_loop);
5095   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5096   msq = make_ssa_name (vec_dest);
5097   phi_stmt = create_phi_node (msq, containing_loop->header);
5098   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5099
5100   return msq;
5101 }
5102
5103
5104 /* Function vect_grouped_load_supported.
5105
5106    COUNT is the size of the load group (the number of statements plus the
5107    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5108    only one statement, with a gap of COUNT - 1.
5109
5110    Returns true if a suitable permute exists.  */
5111
5112 bool
5113 vect_grouped_load_supported (tree vectype, bool single_element_p,
5114                              unsigned HOST_WIDE_INT count)
5115 {
5116   machine_mode mode = TYPE_MODE (vectype);
5117
5118   /* If this is single-element interleaving with an element distance
5119      that leaves unused vector loads around punt - we at least create
5120      very sub-optimal code in that case (and blow up memory,
5121      see PR65518).  */
5122   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5123     {
5124       if (dump_enabled_p ())
5125         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5126                          "single-element interleaving not supported "
5127                          "for not adjacent vector loads\n");
5128       return false;
5129     }
5130
5131   /* vect_permute_load_chain requires the group size to be equal to 3 or
5132      be a power of two.  */
5133   if (count != 3 && exact_log2 (count) == -1)
5134     {
5135       if (dump_enabled_p ())
5136         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137                          "the size of the group of accesses"
5138                          " is not a power of 2 or not equal to 3\n");
5139       return false;
5140     }
5141
5142   /* Check that the permutation is supported.  */
5143   if (VECTOR_MODE_P (mode))
5144     {
5145       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5146       auto_vec_perm_indices sel (nelt);
5147       sel.quick_grow (nelt);
5148
5149       if (count == 3)
5150         {
5151           unsigned int k;
5152           for (k = 0; k < 3; k++)
5153             {
5154               for (i = 0; i < nelt; i++)
5155                 if (3 * i + k < 2 * nelt)
5156                   sel[i] = 3 * i + k;
5157                 else
5158                   sel[i] = 0;
5159               if (!can_vec_perm_p (mode, false, &sel))
5160                 {
5161                   if (dump_enabled_p ())
5162                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5163                                      "shuffle of 3 loads is not supported by"
5164                                      " target\n");
5165                   return false;
5166                 }
5167               for (i = 0, j = 0; i < nelt; i++)
5168                 if (3 * i + k < 2 * nelt)
5169                   sel[i] = i;
5170                 else
5171                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5172               if (!can_vec_perm_p (mode, false, &sel))
5173                 {
5174                   if (dump_enabled_p ())
5175                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176                                      "shuffle of 3 loads is not supported by"
5177                                      " target\n");
5178                   return false;
5179                 }
5180             }
5181           return true;
5182         }
5183       else
5184         {
5185           /* If length is not equal to 3 then only power of 2 is supported.  */
5186           gcc_assert (pow2p_hwi (count));
5187           for (i = 0; i < nelt; i++)
5188             sel[i] = i * 2;
5189           if (can_vec_perm_p (mode, false, &sel))
5190             {
5191               for (i = 0; i < nelt; i++)
5192                 sel[i] = i * 2 + 1;
5193               if (can_vec_perm_p (mode, false, &sel))
5194                 return true;
5195             }
5196         }
5197     }
5198
5199   if (dump_enabled_p ())
5200     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5201                      "extract even/odd not supported by target\n");
5202   return false;
5203 }
5204
5205 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5206    type VECTYPE.  */
5207
5208 bool
5209 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5210 {
5211   return vect_lanes_optab_supported_p ("vec_load_lanes",
5212                                        vec_load_lanes_optab,
5213                                        vectype, count);
5214 }
5215
5216 /* Function vect_permute_load_chain.
5217
5218    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5219    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5220    the input data correctly.  Return the final references for loads in
5221    RESULT_CHAIN.
5222
5223    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5224    The input is 4 vectors each containing 8 elements. We assign a number to each
5225    element, the input sequence is:
5226
5227    1st vec:   0  1  2  3  4  5  6  7
5228    2nd vec:   8  9 10 11 12 13 14 15
5229    3rd vec:  16 17 18 19 20 21 22 23
5230    4th vec:  24 25 26 27 28 29 30 31
5231
5232    The output sequence should be:
5233
5234    1st vec:  0 4  8 12 16 20 24 28
5235    2nd vec:  1 5  9 13 17 21 25 29
5236    3rd vec:  2 6 10 14 18 22 26 30
5237    4th vec:  3 7 11 15 19 23 27 31
5238
5239    i.e., the first output vector should contain the first elements of each
5240    interleaving group, etc.
5241
5242    We use extract_even/odd instructions to create such output.  The input of
5243    each extract_even/odd operation is two vectors
5244    1st vec    2nd vec
5245    0 1 2 3    4 5 6 7
5246
5247    and the output is the vector of extracted even/odd elements.  The output of
5248    extract_even will be:   0 2 4 6
5249    and of extract_odd:     1 3 5 7
5250
5251
5252    The permutation is done in log LENGTH stages.  In each stage extract_even
5253    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5254    their order.  In our example,
5255
5256    E1: extract_even (1st vec, 2nd vec)
5257    E2: extract_odd (1st vec, 2nd vec)
5258    E3: extract_even (3rd vec, 4th vec)
5259    E4: extract_odd (3rd vec, 4th vec)
5260
5261    The output for the first stage will be:
5262
5263    E1:  0  2  4  6  8 10 12 14
5264    E2:  1  3  5  7  9 11 13 15
5265    E3: 16 18 20 22 24 26 28 30
5266    E4: 17 19 21 23 25 27 29 31
5267
5268    In order to proceed and create the correct sequence for the next stage (or
5269    for the correct output, if the second stage is the last one, as in our
5270    example), we first put the output of extract_even operation and then the
5271    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5272    The input for the second stage is:
5273
5274    1st vec (E1):  0  2  4  6  8 10 12 14
5275    2nd vec (E3): 16 18 20 22 24 26 28 30
5276    3rd vec (E2):  1  3  5  7  9 11 13 15
5277    4th vec (E4): 17 19 21 23 25 27 29 31
5278
5279    The output of the second stage:
5280
5281    E1: 0 4  8 12 16 20 24 28
5282    E2: 2 6 10 14 18 22 26 30
5283    E3: 1 5  9 13 17 21 25 29
5284    E4: 3 7 11 15 19 23 27 31
5285
5286    And RESULT_CHAIN after reordering:
5287
5288    1st vec (E1):  0 4  8 12 16 20 24 28
5289    2nd vec (E3):  1 5  9 13 17 21 25 29
5290    3rd vec (E2):  2 6 10 14 18 22 26 30
5291    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5292
5293 static void
5294 vect_permute_load_chain (vec<tree> dr_chain,
5295                          unsigned int length,
5296                          gimple *stmt,
5297                          gimple_stmt_iterator *gsi,
5298                          vec<tree> *result_chain)
5299 {
5300   tree data_ref, first_vect, second_vect;
5301   tree perm_mask_even, perm_mask_odd;
5302   tree perm3_mask_low, perm3_mask_high;
5303   gimple *perm_stmt;
5304   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5305   unsigned int i, j, log_length = exact_log2 (length);
5306   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5307
5308   auto_vec_perm_indices sel (nelt);
5309   sel.quick_grow (nelt);
5310
5311   result_chain->quick_grow (length);
5312   memcpy (result_chain->address (), dr_chain.address (),
5313           length * sizeof (tree));
5314
5315   if (length == 3)
5316     {
5317       unsigned int k;
5318
5319       for (k = 0; k < 3; k++)
5320         {
5321           for (i = 0; i < nelt; i++)
5322             if (3 * i + k < 2 * nelt)
5323               sel[i] = 3 * i + k;
5324             else
5325               sel[i] = 0;
5326           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5327
5328           for (i = 0, j = 0; i < nelt; i++)
5329             if (3 * i + k < 2 * nelt)
5330               sel[i] = i;
5331             else
5332               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5333
5334           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5335
5336           first_vect = dr_chain[0];
5337           second_vect = dr_chain[1];
5338
5339           /* Create interleaving stmt (low part of):
5340              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5341                                                              ...}>  */
5342           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5343           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5344                                            second_vect, perm3_mask_low);
5345           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5346
5347           /* Create interleaving stmt (high part of):
5348              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5349                                                               ...}>  */
5350           first_vect = data_ref;
5351           second_vect = dr_chain[2];
5352           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5353           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5354                                            second_vect, perm3_mask_high);
5355           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5356           (*result_chain)[k] = data_ref;
5357         }
5358     }
5359   else
5360     {
5361       /* If length is not equal to 3 then only power of 2 is supported.  */
5362       gcc_assert (pow2p_hwi (length));
5363
5364       for (i = 0; i < nelt; ++i)
5365         sel[i] = i * 2;
5366       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5367
5368       for (i = 0; i < nelt; ++i)
5369         sel[i] = i * 2 + 1;
5370       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5371
5372       for (i = 0; i < log_length; i++)
5373         {
5374           for (j = 0; j < length; j += 2)
5375             {
5376               first_vect = dr_chain[j];
5377               second_vect = dr_chain[j+1];
5378
5379               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5380               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5381               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5382                                                first_vect, second_vect,
5383                                                perm_mask_even);
5384               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5385               (*result_chain)[j/2] = data_ref;
5386
5387               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5388               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5389               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5390                                                first_vect, second_vect,
5391                                                perm_mask_odd);
5392               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5393               (*result_chain)[j/2+length/2] = data_ref;
5394             }
5395           memcpy (dr_chain.address (), result_chain->address (),
5396                   length * sizeof (tree));
5397         }
5398     }
5399 }
5400
5401 /* Function vect_shift_permute_load_chain.
5402
5403    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5404    sequence of stmts to reorder the input data accordingly.
5405    Return the final references for loads in RESULT_CHAIN.
5406    Return true if successed, false otherwise.
5407
5408    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5409    The input is 3 vectors each containing 8 elements.  We assign a
5410    number to each element, the input sequence is:
5411
5412    1st vec:   0  1  2  3  4  5  6  7
5413    2nd vec:   8  9 10 11 12 13 14 15
5414    3rd vec:  16 17 18 19 20 21 22 23
5415
5416    The output sequence should be:
5417
5418    1st vec:  0 3 6  9 12 15 18 21
5419    2nd vec:  1 4 7 10 13 16 19 22
5420    3rd vec:  2 5 8 11 14 17 20 23
5421
5422    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5423
5424    First we shuffle all 3 vectors to get correct elements order:
5425
5426    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5427    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5428    3rd vec:  (16 19 22) (17 20 23) (18 21)
5429
5430    Next we unite and shift vector 3 times:
5431
5432    1st step:
5433      shift right by 6 the concatenation of:
5434      "1st vec" and  "2nd vec"
5435        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5436      "2nd vec" and  "3rd vec"
5437        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5438      "3rd vec" and  "1st vec"
5439        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5440                              | New vectors                   |
5441
5442      So that now new vectors are:
5443
5444      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5445      2nd vec:  (10 13) (16 19 22) (17 20 23)
5446      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5447
5448    2nd step:
5449      shift right by 5 the concatenation of:
5450      "1st vec" and  "3rd vec"
5451        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5452      "2nd vec" and  "1st vec"
5453        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5454      "3rd vec" and  "2nd vec"
5455        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5456                           | New vectors                   |
5457
5458      So that now new vectors are:
5459
5460      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5461      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5462      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5463
5464    3rd step:
5465      shift right by 5 the concatenation of:
5466      "1st vec" and  "1st vec"
5467        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5468      shift right by 3 the concatenation of:
5469      "2nd vec" and  "2nd vec"
5470                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5471                           | New vectors                   |
5472
5473      So that now all vectors are READY:
5474      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5475      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5476      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5477
5478    This algorithm is faster than one in vect_permute_load_chain if:
5479      1.  "shift of a concatination" is faster than general permutation.
5480          This is usually so.
5481      2.  The TARGET machine can't execute vector instructions in parallel.
5482          This is because each step of the algorithm depends on previous.
5483          The algorithm in vect_permute_load_chain is much more parallel.
5484
5485    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5486 */
5487
5488 static bool
5489 vect_shift_permute_load_chain (vec<tree> dr_chain,
5490                                unsigned int length,
5491                                gimple *stmt,
5492                                gimple_stmt_iterator *gsi,
5493                                vec<tree> *result_chain)
5494 {
5495   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5496   tree perm2_mask1, perm2_mask2, perm3_mask;
5497   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5498   gimple *perm_stmt;
5499
5500   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5501   unsigned int i;
5502   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5503   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5504   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5505
5506   auto_vec_perm_indices sel (nelt);
5507   sel.quick_grow (nelt);
5508
5509   result_chain->quick_grow (length);
5510   memcpy (result_chain->address (), dr_chain.address (),
5511           length * sizeof (tree));
5512
5513   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5514     {
5515       unsigned int j, log_length = exact_log2 (length);
5516       for (i = 0; i < nelt / 2; ++i)
5517         sel[i] = i * 2;
5518       for (i = 0; i < nelt / 2; ++i)
5519         sel[nelt / 2 + i] = i * 2 + 1;
5520       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5521         {
5522           if (dump_enabled_p ())
5523             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524                              "shuffle of 2 fields structure is not \
5525                               supported by target\n");
5526           return false;
5527         }
5528       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5529
5530       for (i = 0; i < nelt / 2; ++i)
5531         sel[i] = i * 2 + 1;
5532       for (i = 0; i < nelt / 2; ++i)
5533         sel[nelt / 2 + i] = i * 2;
5534       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5535         {
5536           if (dump_enabled_p ())
5537             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5538                              "shuffle of 2 fields structure is not \
5539                               supported by target\n");
5540           return false;
5541         }
5542       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5543
5544       /* Generating permutation constant to shift all elements.
5545          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5546       for (i = 0; i < nelt; i++)
5547         sel[i] = nelt / 2 + i;
5548       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5549         {
5550           if (dump_enabled_p ())
5551             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552                              "shift permutation is not supported by target\n");
5553           return false;
5554         }
5555       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5556
5557       /* Generating permutation constant to select vector from 2.
5558          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5559       for (i = 0; i < nelt / 2; i++)
5560         sel[i] = i;
5561       for (i = nelt / 2; i < nelt; i++)
5562         sel[i] = nelt + i;
5563       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5564         {
5565           if (dump_enabled_p ())
5566             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5567                              "select is not supported by target\n");
5568           return false;
5569         }
5570       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5571
5572       for (i = 0; i < log_length; i++)
5573         {
5574           for (j = 0; j < length; j += 2)
5575             {
5576               first_vect = dr_chain[j];
5577               second_vect = dr_chain[j + 1];
5578
5579               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5580               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5581                                                first_vect, first_vect,
5582                                                perm2_mask1);
5583               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5584               vect[0] = data_ref;
5585
5586               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5587               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5588                                                second_vect, second_vect,
5589                                                perm2_mask2);
5590               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5591               vect[1] = data_ref;
5592
5593               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5594               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5595                                                vect[0], vect[1], shift1_mask);
5596               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5597               (*result_chain)[j/2 + length/2] = data_ref;
5598
5599               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5600               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5601                                                vect[0], vect[1], select_mask);
5602               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5603               (*result_chain)[j/2] = data_ref;
5604             }
5605           memcpy (dr_chain.address (), result_chain->address (),
5606                   length * sizeof (tree));
5607         }
5608       return true;
5609     }
5610   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5611     {
5612       unsigned int k = 0, l = 0;
5613
5614       /* Generating permutation constant to get all elements in rigth order.
5615          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5616       for (i = 0; i < nelt; i++)
5617         {
5618           if (3 * k + (l % 3) >= nelt)
5619             {
5620               k = 0;
5621               l += (3 - (nelt % 3));
5622             }
5623           sel[i] = 3 * k + (l % 3);
5624           k++;
5625         }
5626       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5627         {
5628           if (dump_enabled_p ())
5629             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630                              "shuffle of 3 fields structure is not \
5631                               supported by target\n");
5632           return false;
5633         }
5634       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5635
5636       /* Generating permutation constant to shift all elements.
5637          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5638       for (i = 0; i < nelt; i++)
5639         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5640       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5641         {
5642           if (dump_enabled_p ())
5643             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5644                              "shift permutation is not supported by target\n");
5645           return false;
5646         }
5647       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5648
5649       /* Generating permutation constant to shift all elements.
5650          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5651       for (i = 0; i < nelt; i++)
5652         sel[i] = 2 * (nelt / 3) + 1 + i;
5653       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5654         {
5655           if (dump_enabled_p ())
5656             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5657                              "shift permutation is not supported by target\n");
5658           return false;
5659         }
5660       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5661
5662       /* Generating permutation constant to shift all elements.
5663          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5664       for (i = 0; i < nelt; i++)
5665         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5666       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5667         {
5668           if (dump_enabled_p ())
5669             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5670                              "shift permutation is not supported by target\n");
5671           return false;
5672         }
5673       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5674
5675       /* Generating permutation constant to shift all elements.
5676          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5677       for (i = 0; i < nelt; i++)
5678         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5679       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5680         {
5681           if (dump_enabled_p ())
5682             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5683                              "shift permutation is not supported by target\n");
5684           return false;
5685         }
5686       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5687
5688       for (k = 0; k < 3; k++)
5689         {
5690           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5691           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5692                                            dr_chain[k], dr_chain[k],
5693                                            perm3_mask);
5694           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5695           vect[k] = data_ref;
5696         }
5697
5698       for (k = 0; k < 3; k++)
5699         {
5700           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5701           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5702                                            vect[k % 3], vect[(k + 1) % 3],
5703                                            shift1_mask);
5704           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5705           vect_shift[k] = data_ref;
5706         }
5707
5708       for (k = 0; k < 3; k++)
5709         {
5710           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5711           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5712                                            vect_shift[(4 - k) % 3],
5713                                            vect_shift[(3 - k) % 3],
5714                                            shift2_mask);
5715           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5716           vect[k] = data_ref;
5717         }
5718
5719       (*result_chain)[3 - (nelt % 3)] = vect[2];
5720
5721       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5722       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5723                                        vect[0], shift3_mask);
5724       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5725       (*result_chain)[nelt % 3] = data_ref;
5726
5727       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5728       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5729                                        vect[1], shift4_mask);
5730       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5731       (*result_chain)[0] = data_ref;
5732       return true;
5733     }
5734   return false;
5735 }
5736
5737 /* Function vect_transform_grouped_load.
5738
5739    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5740    to perform their permutation and ascribe the result vectorized statements to
5741    the scalar statements.
5742 */
5743
5744 void
5745 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5746                              gimple_stmt_iterator *gsi)
5747 {
5748   machine_mode mode;
5749   vec<tree> result_chain = vNULL;
5750
5751   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5752      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5753      vectors, that are ready for vector computation.  */
5754   result_chain.create (size);
5755
5756   /* If reassociation width for vector type is 2 or greater target machine can
5757      execute 2 or more vector instructions in parallel.  Otherwise try to
5758      get chain for loads group using vect_shift_permute_load_chain.  */
5759   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5760   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5761       || pow2p_hwi (size)
5762       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5763                                          gsi, &result_chain))
5764     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5765   vect_record_grouped_load_vectors (stmt, result_chain);
5766   result_chain.release ();
5767 }
5768
5769 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5770    generated as part of the vectorization of STMT.  Assign the statement
5771    for each vector to the associated scalar statement.  */
5772
5773 void
5774 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5775 {
5776   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5777   gimple *next_stmt, *new_stmt;
5778   unsigned int i, gap_count;
5779   tree tmp_data_ref;
5780
5781   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5782      Since we scan the chain starting from it's first node, their order
5783      corresponds the order of data-refs in RESULT_CHAIN.  */
5784   next_stmt = first_stmt;
5785   gap_count = 1;
5786   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5787     {
5788       if (!next_stmt)
5789         break;
5790
5791       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5792        code elimination pass later.  No need to check for the first stmt in
5793        the group, since it always exists.
5794        GROUP_GAP is the number of steps in elements from the previous
5795        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5796        correspond to the gaps.  */
5797       if (next_stmt != first_stmt
5798           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5799       {
5800         gap_count++;
5801         continue;
5802       }
5803
5804       while (next_stmt)
5805         {
5806           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5807           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5808              copies, and we put the new vector statement in the first available
5809              RELATED_STMT.  */
5810           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5811             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5812           else
5813             {
5814               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5815                 {
5816                   gimple *prev_stmt =
5817                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5818                   gimple *rel_stmt =
5819                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5820                   while (rel_stmt)
5821                     {
5822                       prev_stmt = rel_stmt;
5823                       rel_stmt =
5824                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5825                     }
5826
5827                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5828                     new_stmt;
5829                 }
5830             }
5831
5832           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5833           gap_count = 1;
5834           /* If NEXT_STMT accesses the same DR as the previous statement,
5835              put the same TMP_DATA_REF as its vectorized statement; otherwise
5836              get the next data-ref from RESULT_CHAIN.  */
5837           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5838             break;
5839         }
5840     }
5841 }
5842
5843 /* Function vect_force_dr_alignment_p.
5844
5845    Returns whether the alignment of a DECL can be forced to be aligned
5846    on ALIGNMENT bit boundary.  */
5847
5848 bool
5849 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5850 {
5851   if (!VAR_P (decl))
5852     return false;
5853
5854   if (decl_in_symtab_p (decl)
5855       && !symtab_node::get (decl)->can_increase_alignment_p ())
5856     return false;
5857
5858   if (TREE_STATIC (decl))
5859     return (alignment <= MAX_OFILE_ALIGNMENT);
5860   else
5861     return (alignment <= MAX_STACK_ALIGNMENT);
5862 }
5863
5864
5865 /* Return whether the data reference DR is supported with respect to its
5866    alignment.
5867    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5868    it is aligned, i.e., check if it is possible to vectorize it with different
5869    alignment.  */
5870
5871 enum dr_alignment_support
5872 vect_supportable_dr_alignment (struct data_reference *dr,
5873                                bool check_aligned_accesses)
5874 {
5875   gimple *stmt = DR_STMT (dr);
5876   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5877   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5878   machine_mode mode = TYPE_MODE (vectype);
5879   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5880   struct loop *vect_loop = NULL;
5881   bool nested_in_vect_loop = false;
5882
5883   if (aligned_access_p (dr) && !check_aligned_accesses)
5884     return dr_aligned;
5885
5886   /* For now assume all conditional loads/stores support unaligned
5887      access without any special code.  */
5888   if (is_gimple_call (stmt)
5889       && gimple_call_internal_p (stmt)
5890       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5891           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5892     return dr_unaligned_supported;
5893
5894   if (loop_vinfo)
5895     {
5896       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5897       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5898     }
5899
5900   /* Possibly unaligned access.  */
5901
5902   /* We can choose between using the implicit realignment scheme (generating
5903      a misaligned_move stmt) and the explicit realignment scheme (generating
5904      aligned loads with a REALIGN_LOAD).  There are two variants to the
5905      explicit realignment scheme: optimized, and unoptimized.
5906      We can optimize the realignment only if the step between consecutive
5907      vector loads is equal to the vector size.  Since the vector memory
5908      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5909      is guaranteed that the misalignment amount remains the same throughout the
5910      execution of the vectorized loop.  Therefore, we can create the
5911      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5912      at the loop preheader.
5913
5914      However, in the case of outer-loop vectorization, when vectorizing a
5915      memory access in the inner-loop nested within the LOOP that is now being
5916      vectorized, while it is guaranteed that the misalignment of the
5917      vectorized memory access will remain the same in different outer-loop
5918      iterations, it is *not* guaranteed that is will remain the same throughout
5919      the execution of the inner-loop.  This is because the inner-loop advances
5920      with the original scalar step (and not in steps of VS).  If the inner-loop
5921      step happens to be a multiple of VS, then the misalignment remains fixed
5922      and we can use the optimized realignment scheme.  For example:
5923
5924       for (i=0; i<N; i++)
5925         for (j=0; j<M; j++)
5926           s += a[i+j];
5927
5928      When vectorizing the i-loop in the above example, the step between
5929      consecutive vector loads is 1, and so the misalignment does not remain
5930      fixed across the execution of the inner-loop, and the realignment cannot
5931      be optimized (as illustrated in the following pseudo vectorized loop):
5932
5933       for (i=0; i<N; i+=4)
5934         for (j=0; j<M; j++){
5935           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5936                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5937                          // (assuming that we start from an aligned address).
5938           }
5939
5940      We therefore have to use the unoptimized realignment scheme:
5941
5942       for (i=0; i<N; i+=4)
5943           for (j=k; j<M; j+=4)
5944           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5945                            // that the misalignment of the initial address is
5946                            // 0).
5947
5948      The loop can then be vectorized as follows:
5949
5950       for (k=0; k<4; k++){
5951         rt = get_realignment_token (&vp[k]);
5952         for (i=0; i<N; i+=4){
5953           v1 = vp[i+k];
5954           for (j=k; j<M; j+=4){
5955             v2 = vp[i+j+VS-1];
5956             va = REALIGN_LOAD <v1,v2,rt>;
5957             vs += va;
5958             v1 = v2;
5959           }
5960         }
5961     } */
5962
5963   if (DR_IS_READ (dr))
5964     {
5965       bool is_packed = false;
5966       tree type = (TREE_TYPE (DR_REF (dr)));
5967
5968       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5969           && (!targetm.vectorize.builtin_mask_for_load
5970               || targetm.vectorize.builtin_mask_for_load ()))
5971         {
5972           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5973
5974           /* If we are doing SLP then the accesses need not have the
5975              same alignment, instead it depends on the SLP group size.  */
5976           if (loop_vinfo
5977               && STMT_SLP_TYPE (stmt_info)
5978               && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5979                   * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5980                   % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5981             ;
5982           else if (!loop_vinfo
5983                    || (nested_in_vect_loop
5984                        && (TREE_INT_CST_LOW (DR_STEP (dr))
5985                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5986             return dr_explicit_realign;
5987           else
5988             return dr_explicit_realign_optimized;
5989         }
5990       if (!known_alignment_for_access_p (dr))
5991         is_packed = not_size_aligned (DR_REF (dr));
5992
5993       if (targetm.vectorize.support_vector_misalignment
5994             (mode, type, DR_MISALIGNMENT (dr), is_packed))
5995         /* Can't software pipeline the loads, but can at least do them.  */
5996         return dr_unaligned_supported;
5997     }
5998   else
5999     {
6000       bool is_packed = false;
6001       tree type = (TREE_TYPE (DR_REF (dr)));
6002
6003       if (!known_alignment_for_access_p (dr))
6004         is_packed = not_size_aligned (DR_REF (dr));
6005
6006      if (targetm.vectorize.support_vector_misalignment
6007            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6008        return dr_unaligned_supported;
6009     }
6010
6011   /* Unsupported.  */
6012   return dr_unaligned_unsupported;
6013 }