/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode;
  scalar_int_mode array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
			  limit_p).exists (&array_mode))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
			 GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}
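
/* As an illustration of the check above: for COUNT == 3 vectors of a
   128-bit VECTYPE the required array mode is a 384-bit integer mode, and
   the target must provide a matching load/store-lanes pattern (e.g.
   AArch64's LD3/ST3) for the optab lookup to succeed.  */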
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
	  || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
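
/* As an illustration: for 'int_x = (int) short_y' the result type is int
   but the smallest scalar type involved is short, so *LHS_SIZE_UNIT is 4,
   *RHS_SIZE_UNIT is 2, and the returned type (and hence the VF) is based
   on the 2-byte type.  */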
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
			      optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a problem.  */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	    {
	      if (loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  */
	  return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
	}
    }
  return true;
}
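
/* As an illustration: with a conservative distance of 4 and a user
   assertion of safelen >= 8 the accesses are treated as independent and
   *MAX_VF is merely capped at the safelen value; without safelen a
   runtime alias check is attempted instead of limiting the VF.  */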
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    return false;

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "can't determine dependence between ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "bad dist vector for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "bad dist vector for ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "dependence distance == 0 between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
	    {
	      gimple *earlier_stmt;
	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
	      if (DR_IS_WRITE
		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "READ_WRITE dependence in interleaving."
				     "\n");
		  return true;
		}
	    }

	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "dependence distance negative.\n");
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      if (abs (dist) >= 2
	  && abs (dist) < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs (dist);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs (dist) >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized, possible dependence "
			   "between data-refs ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_NOTE, " and ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_NOTE, "\n");
	}

      return true;
    }

  return false;
}
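
/* As an illustration: a loop containing 'a[i+3] = a[i] + 1' has a known
   dependence distance of 3, so the code above reduces *MAX_VF to 3 when
   it was larger, whereas an unknown dependence instead results in a
   runtime alias check via vect_mark_for_runtime_alias_test.  */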
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
				&LOOP_VINFO_DDRS (loop_vinfo),
				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
	return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ??? Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
			    innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "recording new base alignment for ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
	  dump_printf (MSG_NOTE, "\n");
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  alignment:    %d\n", drb->base_alignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  misalignment: %d\n", drb->base_misalignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  based on:     ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }
}
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
	gimple *stmt = DR_STMT (dr);
	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

	/* If DR is nested in the loop that is being vectorized, we can also
	   record the alignment of the base wrt the outer loop.  */
	if (loop && nested_in_vect_loop_p (loop, stmt))
	  {
	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	    vect_record_base_alignment
	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	  }
      }
}
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      tree base = drb->base_address;
      if (TREE_CODE (base) == ADDR_EXPR)
	base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base,
					  vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      if (DECL_USER_ALIGN (base))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "not forcing alignment of user-aligned "
			       "variable:  ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
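
/* As an illustration of the computation above: with a 16-byte vector
   alignment, a base whose pooled base_alignment is 16 with
   base_misalignment 0 and a constant init offset of 20 yields
   misalignment 20, which known_misalignment reduces modulo 16 to a
   recorded DR_MISALIGNMENT of 4 bytes.  */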
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
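
/* As an illustration: peeling NPEEL == 3 iterations of a forward access
   with 4-byte elements and DR_MISALIGNMENT 4 against a 16-byte target
   alignment gives (4 + 3 * 4) & 15 == 0, i.e. the access becomes aligned
   in the vector loop.  */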
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
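
/* As an illustration: a 4-byte element with a known misalignment of
   2 bytes can never reach natural alignment by peeling whole scalar
   iterations (2 % 4 != 0), so the check above rejects peeling for it.  */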
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
			NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
			  unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
	continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
	return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

  These loops are later passed to loop_transform to be vectorized.  The
  vectorizer will use the alignment information to guide the transformation
  (whether to generate regular loads/stores, or with special handling for
  misalignment).  */
1596 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1598 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1599 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1600 enum dr_alignment_support supportable_dr_alignment
;
1601 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1602 struct data_reference
*dr
;
1604 bool do_peeling
= false;
1605 bool do_versioning
= false;
1608 stmt_vec_info stmt_info
;
1609 unsigned int npeel
= 0;
1610 bool one_misalignment_known
= false;
1611 bool one_misalignment_unknown
= false;
1612 bool one_dr_unsupportable
= false;
1613 struct data_reference
*unsupportable_dr
= NULL
;
1614 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1615 unsigned possible_npeel_number
= 1;
1617 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1618 hash_table
<peel_info_hasher
> peeling_htab (1);
1620 if (dump_enabled_p ())
1621 dump_printf_loc (MSG_NOTE
, vect_location
,
1622 "=== vect_enhance_data_refs_alignment ===\n");
1624 /* Reset data so we can safely be called multiple times. */
1625 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1626 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1628 /* While cost model enhancements are expected in the future, the high level
1629 view of the code at this time is as follows:
1631 A) If there is a misaligned access then see if peeling to align
1632 this access can make all data references satisfy
1633 vect_supportable_dr_alignment. If so, update data structures
1634 as needed and return true.
1636 B) If peeling wasn't possible and there is a data reference with an
1637 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1638 then see if loop versioning checks can be used to make all data
1639 references satisfy vect_supportable_dr_alignment. If so, update
1640 data structures as needed and return true.
1642 C) If neither peeling nor versioning were successful then return false if
1643 any data reference does not satisfy vect_supportable_dr_alignment.
1645 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1647 Note, Possibility 3 above (which is peeling and versioning together) is not
1648 being done at this time. */
1650 /* (1) Peeling to force alignment. */
1652 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1654 + How many accesses will become aligned due to the peeling
1655 - How many accesses will become unaligned due to the peeling,
1656 and the cost of misaligned accesses.
1657 - The cost of peeling (the extra runtime checks, the increase
1660 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1662 stmt
= DR_STMT (dr
);
1663 stmt_info
= vinfo_for_stmt (stmt
);
1665 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1668 /* For interleaving, only the alignment of the first access
1670 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1671 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1674 /* For invariant accesses there is nothing to enhance. */
1675 if (integer_zerop (DR_STEP (dr
)))
1678 /* Strided accesses perform only component accesses, alignment is
1679 irrelevant for them. */
1680 if (STMT_VINFO_STRIDED_P (stmt_info
)
1681 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1684 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1685 do_peeling
= vector_alignment_reachable_p (dr
);
1688 if (known_alignment_for_access_p (dr
))
1690 unsigned int npeel_tmp
= 0;
1691 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1692 size_zero_node
) < 0;
1694 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1695 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1696 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1697 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1698 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1699 if (DR_MISALIGNMENT (dr
) != 0)
1700 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1702 /* For multiple types, it is possible that the bigger type access
1703 will have more than one peeling option. E.g., a loop with two
1704 types: one of size (vector size / 4), and the other one of
1705 size (vector size / 8). Vectorization factor will 8. If both
1706 accesses are misaligned by 3, the first one needs one scalar
1707 iteration to be aligned, and the second one needs 5. But the
1708 first one will be aligned also by peeling 5 scalar
1709 iterations, and in that case both accesses will be aligned.
1710 Hence, except for the immediate peeling amount, we also want
1711 to try to add full vector size, while we don't exceed
1712 vectorization factor.
1713 We do this automatically for cost model, since we calculate
1714 cost for every peeling option. */
1715 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1717 if (STMT_SLP_TYPE (stmt_info
))
1718 possible_npeel_number
1719 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1721 possible_npeel_number
= vf
/ nelements
;
1723 /* NPEEL_TMP is 0 when there is no misalignment, but also
1724 allow peeling NELEMENTS. */
1725 if (DR_MISALIGNMENT (dr
) == 0)
1726 possible_npeel_number
++;
1729 /* Save info about DR in the hash table. Also include peeling
1730 amounts according to the explanation above. */
1731 for (j
= 0; j
< possible_npeel_number
; j
++)
1733 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1735 npeel_tmp
+= target_align
/ dr_size
;
1738 one_misalignment_known
= true;
1742 /* If we don't know any misalignment values, we prefer
1743 peeling for data-ref that has the maximum number of data-refs
1744 with the same alignment, unless the target prefers to align
1745 stores over load. */
1746 unsigned same_align_drs
1747 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1749 || same_align_drs_max
< same_align_drs
)
1751 same_align_drs_max
= same_align_drs
;
1754 /* For data-refs with the same number of related
1755 accesses prefer the one where the misalign
1756 computation will be invariant in the outermost loop. */
1757 else if (same_align_drs_max
== same_align_drs
)
1759 struct loop
*ivloop0
, *ivloop
;
1760 ivloop0
= outermost_invariant_loop_for_expr
1761 (loop
, DR_BASE_ADDRESS (dr0
));
1762 ivloop
= outermost_invariant_loop_for_expr
1763 (loop
, DR_BASE_ADDRESS (dr
));
1764 if ((ivloop
&& !ivloop0
)
1765 || (ivloop
&& ivloop0
1766 && flow_loop_nested_p (ivloop
, ivloop0
)))
1770 one_misalignment_unknown
= true;
1772 /* Check for data refs with unsupportable alignment that
1774 if (!supportable_dr_alignment
)
1776 one_dr_unsupportable
= true;
1777 unsupportable_dr
= dr
;
1780 if (!first_store
&& DR_IS_WRITE (dr
))
1786 if (!aligned_access_p (dr
))
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1790 "vector alignment may not be reachable\n");
1796 /* Check if we can possibly peel the loop. */
1797 if (!vect_can_advance_ivs_p (loop_vinfo
)
1798 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1802 struct _vect_peel_extended_info peel_for_known_alignment
;
1803 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1804 struct _vect_peel_extended_info best_peel
;
1806 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1807 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1808 peel_for_unknown_alignment
.peel_info
.count
= 0;
1811 && one_misalignment_unknown
)
1813 /* Check if the target requires to prefer stores over loads, i.e., if
1814 misaligned stores are more expensive than misaligned loads (taking
1815 drs with same alignment into account). */
1816 unsigned int load_inside_cost
= 0;
1817 unsigned int load_outside_cost
= 0;
1818 unsigned int store_inside_cost
= 0;
1819 unsigned int store_outside_cost
= 0;
1821 stmt_vector_for_cost dummy
;
1823 vect_get_peeling_costs_all_drs (datarefs
, dr0
,
1826 &dummy
, vf
/ 2, true);
1832 vect_get_peeling_costs_all_drs (datarefs
, first_store
,
1834 &store_outside_cost
,
1835 &dummy
, vf
/ 2, true);
1840 store_inside_cost
= INT_MAX
;
1841 store_outside_cost
= INT_MAX
;
1844 if (load_inside_cost
> store_inside_cost
1845 || (load_inside_cost
== store_inside_cost
1846 && load_outside_cost
> store_outside_cost
))
1849 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1850 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1854 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1855 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1858 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1859 prologue_cost_vec
.create (2);
1860 epilogue_cost_vec
.create (2);
1863 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1864 (loop_vinfo
, vf
/ 2, &dummy2
,
1865 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1866 &prologue_cost_vec
, &epilogue_cost_vec
);
1868 prologue_cost_vec
.release ();
1869 epilogue_cost_vec
.release ();
1871 peel_for_unknown_alignment
.peel_info
.count
= 1
1872 + STMT_VINFO_SAME_ALIGN_REFS
1873 (vinfo_for_stmt (DR_STMT (dr0
))).length ();
1876 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1877 peel_for_unknown_alignment
.peel_info
.dr
= dr0
;
1879 best_peel
= peel_for_unknown_alignment
;
  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr = NULL;
  if (do_peeling
      && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
	 unless aligned.  So we try to choose the best possible peeling from
	 the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
	(&peeling_htab, loop_vinfo);
    }
  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr != NULL
      && peel_for_unknown_alignment.inside_cost
	 >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
	 peeling at all except if there is an unsupportable dr that we can
	 align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
	do_peeling = false;
    }
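
  /* Illustrative example with hypothetical costs: if peeling for unknown
     alignment was estimated at inside_cost 12 while the best peeling for
     known alignment costs 10, the comparison above picks the
     known-alignment peeling.  */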
  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0 = unsupportable_dr;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
	 TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
				      &nopeel_outside_cost, &dummy, 0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
	 costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, 0, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0 = best_peel.peel_info.dr;

      /* If the cost without peeling is no higher than that of the best
	 peeling found so far, don't peel at all.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
	do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
	      npeel = ((mis & (target_align - 1))
		       / vect_get_scalar_dr_size (dr0));
	    }
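
	  /* Illustrative example (not from the original source): with a
	     16-byte target alignment, 4-byte elements and DR_MISALIGNMENT 8
	     on a positive-step access, mis is -8 and
	     npeel = (-8 & 15) / 4 = 2, i.e. two scalar iterations are
	     peeled to reach an aligned address.  */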
	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}
      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
	do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
	{
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  if (!stat)
	    do_peeling = false;
	  else
	    return stat;
	}
      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
		  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Disable peeling, max peels reached: %d\n",
				     max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned max_peel
	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
	    do_peeling = false;
	}
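
      /* Illustrative example: with VF 4, npeel 0 and 6 known iterations,
	 max_peel is 3 and 6 < 4 + 3 holds, so peeling is disabled above.  */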

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0)
	      {
		/* Strided accesses perform only component accesses, alignment
		   is irrelevant for them.  */
		stmt_info = vinfo_for_stmt (DR_STMT (dr));
		if (STMT_VINFO_STRIDED_P (stmt_info)
		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
		  continue;

		vect_update_misalignment_for_peel (dr, dr0, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	      = DR_MISALIGNMENT (dr0);
	  SET_DR_MISALIGNMENT (dr0, 0);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }

	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  gcc_assert (stat);
	  return stat;
	}
    }

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
	optimize_loop_nest_for_speed_p (loop)
	&& (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  stmt = DR_STMT (dr);
	  stmt_info = vinfo_for_stmt (stmt);

	  /* For interleaving, only the alignment of the first access
	     matters.  */
	  if (aligned_access_p (dr)
	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
	    continue;

	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      /* Strided loads perform only component accesses, alignment is
		 irrelevant for them.  */
	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
		continue;
	      do_versioning = false;
	      break;
	    }

	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

	  if (!supportable_dr_alignment)
	    {
	      if (known_alignment_for_access_p (dr)
		  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
		{
		  do_versioning = false;
		  break;
		}

	      stmt = DR_STMT (dr);
	      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	      gcc_assert (vectype);

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf.  */
	      mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  The vectorizer currently supports
		 a single vector size, see the reference to
		 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
		 vectorization factor is computed.  */
	      gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
			  || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
		      DR_STMT (dr));
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
	 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
	 of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	  dr = STMT_VINFO_DATA_REF (stmt_info);
	  SET_DR_MISALIGNMENT (dr, 0);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}

/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (dra == drb)
    return;

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  offset_int diff = (wi::to_offset (DR_INIT (dra))
		     - wi::to_offset (DR_INIT (drb)));
  if (diff != 0)
    {
      /* Get the wider of the two alignments.  */
      unsigned int align_a = (vect_calculate_target_alignment (dra)
			      / BITS_PER_UNIT);
      unsigned int align_b = (vect_calculate_target_alignment (drb)
			      / BITS_PER_UNIT);
      unsigned int max_align = MAX (align_a, align_b);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!wi::multiple_of_p (diff, max_align, SIGNED))
	return;
    }
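
  /* Illustrative example for the check above: DR_INITs 32 bytes apart are
     still classified as same-alignment when the larger target alignment is
     16 bytes, since 32 is a multiple of 16; a 24-byte gap would not be.  */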
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "accesses have the same alignment: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }
}

/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr);

  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info)
	  && !vect_compute_data_ref_alignment (dr))
	{
	  /* Strided accesses perform only component accesses, misalignment
	     information is irrelevant for them.  */
	  if (STMT_VINFO_STRIDED_P (stmt_info)
	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    continue;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: can't calculate alignment "
			     "for data ref.\n");

	  return false;
	}
    }

  return true;
}

/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (! vect_compute_data_ref_alignment (dr)
      /* For creating the data-ref pointer we need alignment of the
	 first element anyway.  */
      || (dr != first_dr
	  && ! vect_compute_data_ref_alignment (first_dr))
      || ! verify_data_ref_alignment (dr))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad data alignment in basic "
			 "block.\n");
      return false;
    }

  return true;
}

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_and_verify_instance_alignment ===\n");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
      && ! vect_slp_analyze_and_verify_node_alignment
	     (SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}

2367 /* Analyze groups of accesses: check that DR belongs to a group of
2368 accesses of legal size, step, etc. Detect gaps, single element
2369 interleaving, and other special cases. Set grouped access info.
2370 Collect groups of strided stores for further use in SLP analysis.
2371 Worker for vect_analyze_group_access. */
2374 vect_analyze_group_access_1 (struct data_reference
*dr
)
2376 tree step
= DR_STEP (dr
);
2377 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2378 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2379 gimple
*stmt
= DR_STMT (dr
);
2380 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2381 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2382 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2383 HOST_WIDE_INT dr_step
= -1;
2384 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2385 bool slp_impossible
= false;
2387 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2388 size of the interleaving group (including gaps). */
2389 if (tree_fits_shwi_p (step
))
2391 dr_step
= tree_to_shwi (step
);
2392 /* Check that STEP is a multiple of type size. Otherwise there is
2393 a non-element-sized gap at the end of the group which we
2394 cannot represent in GROUP_GAP or GROUP_SIZE.
2395 ??? As we can handle non-constant step fine here we should
2396 simply remove uses of GROUP_GAP between the last and first
2397 element and instead rely on DR_STEP. GROUP_SIZE then would
2398 simply not include that gap. */
2399 if ((dr_step
% type_size
) != 0)
2401 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_NOTE
, vect_location
,
2405 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2406 dump_printf (MSG_NOTE
,
2407 " is not a multiple of the element size for ");
2408 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2409 dump_printf (MSG_NOTE
, "\n");
      groupsize = absu_hwi (dr_step) / type_size;
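      /* Illustrative example: a 32-byte step over 4-byte elements gives
	 groupsize 8, i.e. each iteration spans a slot of eight elements
	 (including any gaps).  */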
  /* A non-consecutive access is possible only as part of interleaving.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check whether this DR is part of an interleaving group of which it
	 is the single element accessed in the loop.  */
2424 /* Gaps are supported only for loads. STEP must be a multiple of the type
2425 size. The size of the group must be a power of 2. */
2427 && (dr_step
% type_size
) == 0
2429 && pow2p_hwi (groupsize
))
2431 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2432 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2433 GROUP_GAP (stmt_info
) = groupsize
- 1;
2434 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE
, vect_location
,
2437 "Detected single element interleaving ");
2438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2439 dump_printf (MSG_NOTE
, " step ");
2440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2441 dump_printf (MSG_NOTE
, "\n");
2447 if (dump_enabled_p ())
2449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2450 "not consecutive access ");
2451 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2456 /* Mark the statement as unvectorizable. */
2457 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2461 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2462 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2466 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2468 /* First stmt in the interleaving chain. Check the chain. */
2469 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2470 struct data_reference
*data_ref
= dr
;
2471 unsigned int count
= 1;
2472 tree prev_init
= DR_INIT (data_ref
);
2473 gimple
*prev
= stmt
;
2474 HOST_WIDE_INT diff
, gaps
= 0;
	  /* Skip same data-refs.  If two or more stmts share a data-ref
	     (supported only for loads), we vectorize only the first stmt,
	     and the rest get their vectorized loads from the first
	     one.  */
2482 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2483 DR_INIT (STMT_VINFO_DATA_REF (
2484 vinfo_for_stmt (next
)))))
2486 if (DR_IS_WRITE (data_ref
))
2488 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2490 "Two store stmts share the same dr.\n");
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2496 "Two or more load stmts share the same dr.\n");
2498 /* For load use the same data-ref load. */
2499 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2502 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2507 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2509 /* All group members have the same STEP by construction. */
2510 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2512 /* Check that the distance between two accesses is equal to the type
2513 size. Otherwise, we have gaps. */
2514 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2515 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2518 /* FORNOW: SLP of accesses with gaps is not supported. */
2519 slp_impossible
= true;
2520 if (DR_IS_WRITE (data_ref
))
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2524 "interleaved store with gaps\n");
2531 last_accessed_element
+= diff
;
2533 /* Store the gap from the previous member of the group. If there is no
2534 gap in the access, GROUP_GAP is always 1. */
2535 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2537 prev_init
= DR_INIT (data_ref
);
2538 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2539 /* Count the number of data-refs in the chain. */
2544 groupsize
= count
+ gaps
;
2546 /* This could be UINT_MAX but as we are generating code in a very
2547 inefficient way we have to cap earlier. See PR78699 for example. */
2548 if (groupsize
> 4096)
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2552 "group is too large\n");
2556 /* Check that the size of the interleaving is equal to count for stores,
2557 i.e., that there are no gaps. */
2558 if (groupsize
!= count
2559 && !DR_IS_READ (dr
))
2561 if (dump_enabled_p ())
2562 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2563 "interleaved store with gaps\n");
      /* If there is a gap after the last load in the group it is the
	 difference between the groupsize and the last accessed
	 element.
	 When there is no gap, this difference should be 0.  */
      GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
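
      /* Illustrative example: with groupsize 4 and loads of only elements 0
	 and 2, last_accessed_element ends up as 3 and GROUP_GAP records a
	 trailing gap of one element.  */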
2574 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_NOTE
, vect_location
,
2577 "Detected interleaving ");
2578 if (DR_IS_READ (dr
))
2579 dump_printf (MSG_NOTE
, "load ");
2581 dump_printf (MSG_NOTE
, "store ");
2582 dump_printf (MSG_NOTE
, "of size %u starting with ",
2583 (unsigned)groupsize
);
2584 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2585 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2586 dump_printf_loc (MSG_NOTE
, vect_location
,
2587 "There is a gap of %u elements after the group\n",
2588 GROUP_GAP (vinfo_for_stmt (stmt
)));
  /* SLP: create an SLP data structure for every interleaving group of
     stores for further analysis in vect_analyze_slp.  */
2593 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2596 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2598 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2605 /* Analyze groups of accesses: check that DR belongs to a group of
2606 accesses of legal size, step, etc. Detect gaps, single element
2607 interleaving, and other special cases. Set grouped access info.
2608 Collect groups of strided stores for further use in SLP analysis. */
2611 vect_analyze_group_access (struct data_reference
*dr
)
2613 if (!vect_analyze_group_access_1 (dr
))
2615 /* Dissolve the group if present. */
2617 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2620 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2621 next
= GROUP_NEXT_ELEMENT (vinfo
);
2622 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2623 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2631 /* Analyze the access pattern of the data-reference DR.
2632 In case of non-consecutive accesses call vect_analyze_group_access() to
2633 analyze groups of accesses. */
2636 vect_analyze_data_ref_access (struct data_reference
*dr
)
2638 tree step
= DR_STEP (dr
);
2639 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2640 gimple
*stmt
= DR_STMT (dr
);
2641 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2642 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2643 struct loop
*loop
= NULL
;
2646 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2648 if (loop_vinfo
&& !step
)
2650 if (dump_enabled_p ())
2651 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2652 "bad data-ref access in loop\n");
2656 /* Allow loads with zero step in inner-loop vectorization. */
2657 if (loop_vinfo
&& integer_zerop (step
))
2659 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2660 if (!nested_in_vect_loop_p (loop
, stmt
))
2661 return DR_IS_READ (dr
);
2662 /* Allow references with zero step for outer loops marked
2663 with pragma omp simd only - it guarantees absence of
2664 loop-carried dependencies between inner loop iterations. */
2665 if (!loop
->force_vectorize
)
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE
, vect_location
,
2669 "zero step in inner loop of nest\n");
2674 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2676 /* Interleaved accesses are not yet supported within outer-loop
2677 vectorization for references in the inner-loop. */
2678 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2680 /* For the rest of the analysis we use the outer-loop step. */
2681 step
= STMT_VINFO_DR_STEP (stmt_info
);
2682 if (integer_zerop (step
))
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_NOTE
, vect_location
,
2686 "zero step in outer loop.\n");
2687 return DR_IS_READ (dr
);
2692 if (TREE_CODE (step
) == INTEGER_CST
)
2694 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2695 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2697 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2699 /* Mark that it is not interleaving. */
2700 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2705 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE
, vect_location
,
2709 "grouped access in outer loop.\n");
2714 /* Assume this is a DR handled by non-constant strided load case. */
2715 if (TREE_CODE (step
) != INTEGER_CST
)
2716 return (STMT_VINFO_STRIDED_P (stmt_info
)
2717 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2718 || vect_analyze_group_access (dr
)));
2720 /* Not consecutive access - check if it's a part of interleaving group. */
2721 return vect_analyze_group_access (dr
);
2724 /* Compare two data-references DRA and DRB to group them into chunks
2725 suitable for grouping. */
2728 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2730 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2731 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2734 /* Stabilize sort. */
2738 /* DRs in different loops never belong to the same group. */
2739 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2740 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2742 return loopa
->num
< loopb
->num
? -1 : 1;
2744 /* Ordering of DRs according to base. */
2745 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2746 DR_BASE_ADDRESS (drb
));
2750 /* And according to DR_OFFSET. */
2751 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2755 /* Put reads before writes. */
2756 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2757 return DR_IS_READ (dra
) ? -1 : 1;
2759 /* Then sort after access size. */
2760 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2761 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2765 /* And after step. */
2766 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2770 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2771 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2773 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2777 /* Function vect_analyze_data_ref_accesses.
2779 Analyze the access pattern of all the data references in the loop.
2781 FORNOW: the only access pattern that is considered vectorizable is a
2782 simple step 1 (consecutive) access.
2784 FORNOW: handle only arrays and pointer accesses. */
2787 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2790 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2791 struct data_reference
*dr
;
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE
, vect_location
,
2795 "=== vect_analyze_data_ref_accesses ===\n");
2797 if (datarefs
.is_empty ())
2800 /* Sort the array of datarefs to make building the interleaving chains
2801 linear. Don't modify the original vector's order, it is needed for
2802 determining what dependencies are reversed. */
2803 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2804 datarefs_copy
.qsort (dr_group_sort_cmp
);
2806 /* Build the interleaving chains. */
2807 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2809 data_reference_p dra
= datarefs_copy
[i
];
2810 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2811 stmt_vec_info lastinfo
= NULL
;
2812 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2817 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2819 data_reference_p drb
= datarefs_copy
[i
];
2820 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2821 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group being artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */
2830 /* DRs in a different loop should not be put into the same
2831 interleaving group. */
2832 if (gimple_bb (DR_STMT (dra
))->loop_father
2833 != gimple_bb (DR_STMT (drb
))->loop_father
)
2836 /* Check that the data-refs have same first location (except init)
2837 and they are both either store or load (not load and store,
2838 not masked loads or stores). */
2839 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2840 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2841 DR_BASE_ADDRESS (drb
)) != 0
2842 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2843 || !gimple_assign_single_p (DR_STMT (dra
))
2844 || !gimple_assign_single_p (DR_STMT (drb
)))
2847 /* Check that the data-refs have the same constant size. */
2848 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2849 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2850 if (!tree_fits_uhwi_p (sza
)
2851 || !tree_fits_uhwi_p (szb
)
2852 || !tree_int_cst_equal (sza
, szb
))
2855 /* Check that the data-refs have the same step. */
2856 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
2859 /* Check the types are compatible.
2860 ??? We don't distinguish this during sorting. */
2861 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2862 TREE_TYPE (DR_REF (drb
))))
2865 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2866 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2867 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2868 HOST_WIDE_INT init_prev
2869 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
2870 gcc_assert (init_a
<= init_b
2871 && init_a
<= init_prev
2872 && init_prev
<= init_b
);
2874 /* Do not place the same access in the interleaving chain twice. */
2875 if (init_b
== init_prev
)
2877 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
2878 < gimple_uid (DR_STMT (drb
)));
2879 /* ??? For now we simply "drop" the later reference which is
2880 otherwise the same rather than finishing off this group.
2881 In the end we'd want to re-process duplicates forming
2882 multiple groups from the refs, likely by just collecting
2883 all candidates (including duplicates and split points
2884 below) in a vector and then process them together. */
2888 /* If init_b == init_a + the size of the type * k, we have an
2889 interleaving, and DRA is accessed before DRB. */
2890 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2891 if (type_size_a
== 0
2892 || (init_b
- init_a
) % type_size_a
!= 0)
2895 /* If we have a store, the accesses are adjacent. This splits
2896 groups into chunks we support (we don't support vectorization
2897 of stores with gaps). */
2898 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
2901 /* If the step (if not zero or non-constant) is greater than the
2902 difference between data-refs' inits this splits groups into
2904 if (tree_fits_shwi_p (DR_STEP (dra
)))
2906 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2907 if (step
!= 0 && step
<= (init_b
- init_a
))
2911 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_NOTE
, vect_location
,
2914 "Detected interleaving ");
2915 if (DR_IS_READ (dra
))
2916 dump_printf (MSG_NOTE
, "load ");
2918 dump_printf (MSG_NOTE
, "store ");
2919 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2920 dump_printf (MSG_NOTE
, " and ");
2921 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2922 dump_printf (MSG_NOTE
, "\n");
2925 /* Link the found element into the group list. */
2926 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2928 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2929 lastinfo
= stmtinfo_a
;
2931 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2932 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2933 lastinfo
= stmtinfo_b
;
2937 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2938 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2939 && !vect_analyze_data_ref_access (dr
))
2941 if (dump_enabled_p ())
2942 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2943 "not vectorized: complicated access pattern.\n");
2945 if (is_a
<bb_vec_info
> (vinfo
))
2947 /* Mark the statement as not vectorizable. */
2948 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2953 datarefs_copy
.release ();
2958 datarefs_copy
.release ();
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return an expression whose value is the size of segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  tree segment_length;

  if (integer_zerop (DR_STEP (dr)))
    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  else
    segment_length = size_binop (MULT_EXPR,
				 fold_convert (sizetype, DR_STEP (dr)),
				 fold_convert (sizetype, length_factor));
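
  /* Illustrative example: for a unit-stride int access, DR_STEP is 4 and
     with LENGTH_FACTOR equal to a vectorization factor of 8 the segment
     computed above is 32 bytes.  */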
  if (vect_supportable_dr_alignment (dr, false)
	== dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
	(STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
    }
  return segment_length;
}

/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, whose alias
   relation can be decided at compilation time, return TRUE if they do
   not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
   and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
   respectively.  */

static bool
vect_no_alias_p (struct data_reference *a, struct data_reference *b,
		 tree segment_length_a, tree segment_length_b)
{
  gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
	      && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
  if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
    return false;

  tree seg_a_min = DR_INIT (a);
  tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
				seg_a_min, segment_length_a);
  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
      seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
			       seg_a_max, unit_size);
      seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
			       DR_INIT (a), unit_size);
    }
  tree seg_b_min = DR_INIT (b);
  tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
				seg_b_min, segment_length_b);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
      seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
			       seg_b_max, unit_size);
      seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
			       DR_INIT (b), unit_size);
    }
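
  /* Illustrative example: with DR_INITs 0 and 48 and 32-byte segments on
     positive steps, the ranges are [0, 32) and [48, 80); seg_a_max <=
     seg_b_min holds, so the test below proves independence and the runtime
     alias check for this pair can be dropped.  */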
  if (tree_int_cst_le (seg_a_max, seg_b_min)
      || tree_int_cst_le (seg_b_max, seg_a_min))
    return true;

  return false;
}

3048 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3052 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3053 unsigned int loop_depth
, unsigned HOST_WIDE_INT vf
)
3055 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3056 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3059 /* If the dependence is exact, we should have limited the VF instead. */
3060 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3063 lambda_vector dist_v
;
3064 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3066 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3068 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3069 && (unsigned HOST_WIDE_INT
) abs_hwi (dist
) < vf
)
3073 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_NOTE
, vect_location
,
3076 "dependence distance between ");
3077 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
3078 dump_printf (MSG_NOTE
, " and ");
3079 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
3080 dump_printf (MSG_NOTE
, " is >= VF\n");
3086 /* Function vect_prune_runtime_alias_test_list.
3088 Prune a list of ddrs to be tested at run-time by versioning for alias.
3089 Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3094 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3096 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3097 hash_set
<tree_pair_hash
> compared_objects
;
3099 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3100 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3101 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3102 vec
<vec_object_pair
> &check_unequal_addrs
3103 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3104 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3105 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE
, vect_location
,
3113 "=== vect_prune_runtime_alias_test_list ===\n");
3115 if (may_alias_ddrs
.is_empty ())
3118 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3120 unsigned int loop_depth
3121 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3122 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3124 /* First, we collect all data ref pairs for aliasing checks. */
3125 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3128 struct data_reference
*dr_a
, *dr_b
;
3129 gimple
*dr_group_first_a
, *dr_group_first_b
;
3130 tree segment_length_a
, segment_length_b
;
3131 gimple
*stmt_a
, *stmt_b
;
3133 /* Ignore the alias if the VF we chose ended up being no greater
3134 than the dependence distance. */
3135 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3138 if (DDR_OBJECT_A (ddr
))
3140 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3141 if (!compared_objects
.add (new_pair
))
3143 if (dump_enabled_p ())
3145 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3146 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3147 dump_printf (MSG_NOTE
, " and ");
3148 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3149 dump_printf (MSG_NOTE
, " have different addresses\n");
3151 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3157 stmt_a
= DR_STMT (DDR_A (ddr
));
3158 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3159 if (dr_group_first_a
)
3161 stmt_a
= dr_group_first_a
;
3162 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3166 stmt_b
= DR_STMT (DDR_B (ddr
));
3167 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3168 if (dr_group_first_b
)
3170 stmt_b
= dr_group_first_b
;
3171 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3174 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3175 length_factor
= scalar_loop_iters
;
3177 length_factor
= size_int (vect_factor
);
3178 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3179 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3181 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
3182 DR_BASE_ADDRESS (dr_b
));
3184 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
3187 /* Alias is known at compilation time. */
3189 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3190 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3191 && TREE_CODE (segment_length_a
) == INTEGER_CST
3192 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3194 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE
, vect_location
,
3199 "not vectorized: compilation time alias.\n");
3204 dr_with_seg_len_pair_t dr_with_seg_len_pair
3205 (dr_with_seg_len (dr_a
, segment_length_a
),
3206 dr_with_seg_len (dr_b
, segment_length_b
));
3208 /* Canonicalize pairs by sorting the two DR members. */
3210 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3212 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3215 prune_runtime_alias_test_list (&comp_alias_ddrs
,
3216 (unsigned HOST_WIDE_INT
) vect_factor
);
3218 unsigned int count
= (comp_alias_ddrs
.length ()
3219 + check_unequal_addrs
.length ());
3220 dump_printf_loc (MSG_NOTE
, vect_location
,
3221 "improved number of alias checks from %d to %d\n",
3222 may_alias_ddrs
.length (), count
);
3223 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3225 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3227 "number of versioning for alias "
3228 "run-time tests exceeds %d "
3229 "(--param vect-max-version-for-alias-checks)\n",
3230 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3237 /* Return true if a non-affine read or write in STMT is suitable for a
3238 gather load or scatter store. Describe the operation in *INFO if so. */
3241 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3242 gather_scatter_info
*info
)
3244 HOST_WIDE_INT scale
= 1;
3245 poly_int64 pbitpos
, pbitsize
;
3246 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3247 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3248 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3249 tree offtype
= NULL_TREE
;
3250 tree decl
, base
, off
;
3252 int punsignedp
, reversep
, pvolatilep
= 0;
3255 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3256 see if we can use the def stmt of the address. */
3257 if (is_gimple_call (stmt
)
3258 && gimple_call_internal_p (stmt
)
3259 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3260 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3261 && TREE_CODE (base
) == MEM_REF
3262 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3263 && integer_zerop (TREE_OPERAND (base
, 1))
3264 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3266 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3267 if (is_gimple_assign (def_stmt
)
3268 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3269 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3272 /* The gather and scatter builtins need address of the form
3273 loop_invariant + vector * {1, 2, 4, 8}
3275 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3276 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3277 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3278 multiplications and additions in it. To get a vector, we need
3279 a single SSA_NAME that will be defined in the loop and will
3280 contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
3282 SSA_NAME OFF and put the loop invariants into a tree BASE
3283 that can be gimplified before the loop. */
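
  /* Illustrative (hypothetical) example: for a load from p->a[i] with
     loop-invariant p, the decomposition below typically leaves BASE holding
     the loop-invariant address of p->a plus any constant offset, OFF as the
     SSA_NAME carrying i, and the element size as the separately recorded
     SCALE factor.  */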
3284 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3285 &punsignedp
, &reversep
, &pvolatilep
);
3286 gcc_assert (base
&& !reversep
);
3287 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3289 if (TREE_CODE (base
) == MEM_REF
)
3291 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3293 if (off
== NULL_TREE
)
3294 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3296 off
= size_binop (PLUS_EXPR
, off
,
3297 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3299 base
= TREE_OPERAND (base
, 0);
3302 base
= build_fold_addr_expr (base
);
3304 if (off
== NULL_TREE
)
3305 off
= size_zero_node
;
  /* If BASE is not loop invariant, then either OFF is 0, in which case we
     start with just the constant offset in the loop-invariant BASE and
     continue with base as OFF, or we give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
3312 if (!expr_invariant_in_loop_p (loop
, base
))
3314 if (!integer_zerop (off
))
3317 base
= size_int (pbytepos
);
3319 /* Otherwise put base + constant offset into the loop invariant BASE
3320 and continue with OFF. */
3323 base
= fold_convert (sizetype
, base
);
3324 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3327 /* OFF at this point may be either a SSA_NAME or some tree expression
3328 from get_inner_reference. Try to peel off loop invariants from it
3329 into BASE as long as possible. */
3331 while (offtype
== NULL_TREE
)
3333 enum tree_code code
;
3334 tree op0
, op1
, add
= NULL_TREE
;
3336 if (TREE_CODE (off
) == SSA_NAME
)
3338 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3340 if (expr_invariant_in_loop_p (loop
, off
))
3343 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3346 op0
= gimple_assign_rhs1 (def_stmt
);
3347 code
= gimple_assign_rhs_code (def_stmt
);
3348 op1
= gimple_assign_rhs2 (def_stmt
);
3352 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3354 code
= TREE_CODE (off
);
3355 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3359 case POINTER_PLUS_EXPR
:
3361 if (expr_invariant_in_loop_p (loop
, op0
))
3366 add
= fold_convert (sizetype
, add
);
3368 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3369 base
= size_binop (PLUS_EXPR
, base
, add
);
3372 if (expr_invariant_in_loop_p (loop
, op1
))
3380 if (expr_invariant_in_loop_p (loop
, op1
))
3382 add
= fold_convert (sizetype
, op1
);
3383 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3389 if (scale
== 1 && tree_fits_shwi_p (op1
))
3391 scale
= tree_to_shwi (op1
);
3400 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3401 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3403 if (TYPE_PRECISION (TREE_TYPE (op0
))
3404 == TYPE_PRECISION (TREE_TYPE (off
)))
3409 if (TYPE_PRECISION (TREE_TYPE (op0
))
3410 < TYPE_PRECISION (TREE_TYPE (off
)))
3413 offtype
= TREE_TYPE (off
);
3424 /* If at the end OFF still isn't a SSA_NAME or isn't
3425 defined in the loop, punt. */
3426 if (TREE_CODE (off
) != SSA_NAME
3427 || expr_invariant_in_loop_p (loop
, off
))
3430 if (offtype
== NULL_TREE
)
3431 offtype
= TREE_TYPE (off
);
3433 if (DR_IS_READ (dr
))
3434 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3437 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3440 if (decl
== NULL_TREE
)
3446 info
->offset_dt
= vect_unknown_def_type
;
3447 info
->offset_vectype
= NULL_TREE
;
3448 info
->scale
= scale
;
3452 /* Function vect_analyze_data_refs.
3454 Find all the data references in the loop or basic block.
3456 The general structure of the analysis of data refs in the vectorizer is as
3458 1- vect_analyze_data_refs(loop/bb): call
3459 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3460 in the loop/bb and their dependences.
3461 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3462 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3463 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3468 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3470 struct loop
*loop
= NULL
;
3472 struct data_reference
*dr
;
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_NOTE
, vect_location
,
3477 "=== vect_analyze_data_refs ===\n");
3479 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3480 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3482 /* Go through the data-refs, check that the analysis succeeded. Update
3483 pointer from stmt_vec_info struct to DR and vectype. */
3485 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3486 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3489 stmt_vec_info stmt_info
;
3490 tree base
, offset
, init
;
3491 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3492 bool simd_lane_access
= false;
3496 if (!dr
|| !DR_REF (dr
))
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3500 "not vectorized: unhandled data-ref\n");
3504 stmt
= DR_STMT (dr
);
3505 stmt_info
= vinfo_for_stmt (stmt
);
3507 /* Discard clobbers from the dataref vector. We will remove
3508 clobber stmts during vectorization. */
3509 if (gimple_clobber_p (stmt
))
3512 if (i
== datarefs
.length () - 1)
3517 datarefs
.ordered_remove (i
);
3522 /* Check that analysis of the data-ref succeeded. */
3523 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3528 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3529 && targetm
.vectorize
.builtin_gather
!= NULL
;
3532 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3533 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3534 bool maybe_simd_lane_access
3535 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3537 /* If target supports vector gather loads or scatter stores, or if
3538 this might be a SIMD lane access, see if they can't be used. */
3539 if (is_a
<loop_vec_info
> (vinfo
)
3540 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3541 && !nested_in_vect_loop_p (loop
, stmt
))
3543 struct data_reference
*newdr
3544 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3545 DR_REF (dr
), stmt
, !maybe_scatter
,
3546 DR_IS_CONDITIONAL_IN_STMT (dr
));
3547 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3548 if (DR_BASE_ADDRESS (newdr
)
3549 && DR_OFFSET (newdr
)
3552 && integer_zerop (DR_STEP (newdr
)))
3554 if (maybe_simd_lane_access
)
3556 tree off
= DR_OFFSET (newdr
);
3558 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3559 && TREE_CODE (off
) == MULT_EXPR
3560 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3562 tree step
= TREE_OPERAND (off
, 1);
3563 off
= TREE_OPERAND (off
, 0);
3565 if (CONVERT_EXPR_P (off
)
3566 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3568 < TYPE_PRECISION (TREE_TYPE (off
)))
3569 off
= TREE_OPERAND (off
, 0);
3570 if (TREE_CODE (off
) == SSA_NAME
)
3572 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3573 tree reft
= TREE_TYPE (DR_REF (newdr
));
3574 if (is_gimple_call (def
)
3575 && gimple_call_internal_p (def
)
3576 && (gimple_call_internal_fn (def
)
3577 == IFN_GOMP_SIMD_LANE
))
3579 tree arg
= gimple_call_arg (def
, 0);
3580 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3581 arg
= SSA_NAME_VAR (arg
);
3582 if (arg
== loop
->simduid
3584 && tree_int_cst_equal
3585 (TYPE_SIZE_UNIT (reft
),
3588 DR_OFFSET (newdr
) = ssize_int (0);
3589 DR_STEP (newdr
) = step
;
3590 DR_OFFSET_ALIGNMENT (newdr
)
3591 = BIGGEST_ALIGNMENT
;
3592 DR_STEP_ALIGNMENT (newdr
)
3593 = highest_pow2_factor (step
);
3595 simd_lane_access
= true;
3601 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3605 gatherscatter
= GATHER
;
3607 gatherscatter
= SCATTER
;
3610 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3611 free_data_ref (newdr
);
3614 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3616 if (dump_enabled_p ())
3618 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3619 "not vectorized: data ref analysis "
3621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3624 if (is_a
<bb_vec_info
> (vinfo
))
3631 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3633 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3635 "not vectorized: base addr of dr is a "
3638 if (is_a
<bb_vec_info
> (vinfo
))
3641 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3646 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3651 "not vectorized: volatile type ");
3652 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3655 if (is_a
<bb_vec_info
> (vinfo
))
3661 if (stmt_can_throw_internal (stmt
))
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3666 "not vectorized: statement can throw an "
3668 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3671 if (is_a
<bb_vec_info
> (vinfo
))
3674 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3679 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3680 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3685 "not vectorized: statement is bitfield "
3687 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3690 if (is_a
<bb_vec_info
> (vinfo
))
3693 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3698 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3699 offset
= unshare_expr (DR_OFFSET (dr
));
3700 init
= unshare_expr (DR_INIT (dr
));
3702 if (is_gimple_call (stmt
)
3703 && (!gimple_call_internal_p (stmt
)
3704 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3705 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3707 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3710 "not vectorized: dr in a call ");
3711 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3714 if (is_a
<bb_vec_info
> (vinfo
))
3717 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3722 /* Update DR field in stmt_vec_info struct. */
3724 /* If the dataref is in an inner-loop of the loop that is considered for
     vectorization, we also want to analyze the access relative to
3726 the outer-loop (DR contains information only relative to the
3727 inner-most enclosing loop). We do that by building a reference to the
3728 first location accessed by the inner-loop, and analyze it relative to
3730 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3732 /* Build a reference to the first location accessed by the
3733 inner loop: *(BASE + INIT + OFFSET). By construction,
3734 this address must be invariant in the inner loop, so we
3735 can consider it as being used in the outer loop. */
3736 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
3738 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
3739 tree init_ref
= build_fold_indirect_ref (init_addr
);
3741 if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE
, vect_location
,
3744 "analyze in outer loop: ");
3745 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
3746 dump_printf (MSG_NOTE
, "\n");
3749 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
3751 /* dr_analyze_innermost already explained the failure. */
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_NOTE
, vect_location
,
3757 "\touter base_address: ");
3758 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3759 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3760 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3761 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3762 STMT_VINFO_DR_OFFSET (stmt_info
));
3763 dump_printf (MSG_NOTE
,
3764 "\n\touter constant offset from base address: ");
3765 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3766 STMT_VINFO_DR_INIT (stmt_info
));
3767 dump_printf (MSG_NOTE
, "\n\touter step: ");
3768 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3769 STMT_VINFO_DR_STEP (stmt_info
));
3770 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
3771 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
3772 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
3773 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
3774 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
3775 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
3776 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
3777 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
3781 if (STMT_VINFO_DATA_REF (stmt_info
))
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3786 "not vectorized: more than one data ref "
3788 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3791 if (is_a
<bb_vec_info
> (vinfo
))
3794 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3799 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3800 if (simd_lane_access
)
3802 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3803 free_data_ref (datarefs
[i
]);
3807 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
3808 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
3809 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
3811 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3814 "not vectorized: base object not addressable "
3816 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3818 if (is_a
<bb_vec_info
> (vinfo
))
3820 /* In BB vectorization the ref can still participate
3821 in dependence analysis, we just can't vectorize it. */
3822 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3828 /* Set vectype for STMT. */
3829 scalar_type
= TREE_TYPE (DR_REF (dr
));
3830 STMT_VINFO_VECTYPE (stmt_info
)
3831 = get_vectype_for_scalar_type (scalar_type
);
3832 if (!STMT_VINFO_VECTYPE (stmt_info
))
3834 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3837 "not vectorized: no vectype for stmt: ");
3838 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3839 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3840 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3842 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3845 if (is_a
<bb_vec_info
> (vinfo
))
3847 /* No vector type is fine, the ref can still participate
3848 in dependence analysis, we just can't vectorize it. */
3849 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3853 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3855 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3856 if (gatherscatter
!= SG_NONE
)
3863 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE
, vect_location
,
3866 "got vectype for stmt: ");
3867 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3868 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3869 STMT_VINFO_VECTYPE (stmt_info
));
3870 dump_printf (MSG_NOTE
, "\n");
3874 /* Adjust the minimal vectorization factor according to the
3876 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3880 if (gatherscatter
!= SG_NONE
)
3882 gather_scatter_info gs_info
;
3883 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3885 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3887 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3889 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3892 (gatherscatter
== GATHER
) ?
3893 "not vectorized: not suitable for gather "
3895 "not vectorized: not suitable for scatter "
3897 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3902 free_data_ref (datarefs
[i
]);
3904 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3907 else if (is_a
<loop_vec_info
> (vinfo
)
3908 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3910 if (nested_in_vect_loop_p (loop
, stmt
))
3912 if (dump_enabled_p ())
3914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3915 "not vectorized: not suitable for strided "
3917 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3921 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3925 /* If we stopped analysis at the first dataref we could not analyze
3926 when trying to vectorize a basic-block mark the rest of the datarefs
3927 as not vectorizable and truncate the vector of datarefs. That
3928 avoids spending useless time in analyzing their dependence. */
3929 if (i
!= datarefs
.length ())
3931 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3932 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3934 data_reference_p dr
= datarefs
[j
];
3935 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3938 datarefs
.truncate (i
);
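
/* Illustration (editor's sketch, not part of the vectorizer itself): the
   VF adjustment above simply raises *MIN_VF to the widest number of vector
   subparts seen so far.  For example, on a target with 128-bit vectors a
   loop such as

     unsigned char *a;  int *b;
     for (i = 0; i < n; i++)
       b[i] = a[i];

   records a v16qi vectype for the load of a[i] (16 subparts) and a v4si
   vectype for the store to b[i] (4 subparts), so after analyzing both
   datarefs *MIN_VF is at least 16.  The exact mode names depend on the
   target; v16qi/v4si are just the usual 128-bit examples.  */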
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_mask_var:
      prefix = "mask";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
}
/* Like vect_get_new_vect_var but return an SSA name.  */

tree
vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
      free (tmp);
    }
  else
    new_vect_var = make_temp_ssa_name (type, NULL, prefix);

  return new_vect_var;
}
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
			    DR_TARGET_ALIGNMENT (dr), misalign);
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i = 0; i < N; i++)
		  for (j = 0; j < M; j++)
		    s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop:	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (gimple *stmt,
				      gimple_seq *new_stmt_list,
				      tree offset,
				      tree byte_offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  innermost_loop_behavior *drb = vect_dr_behavior (dr);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (drb->offset);
  tree init = unshare_expr (drb->init);

  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  if (offset)
    {
      offset = fold_build2 (MULT_EXPR, sizetype,
			    fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }
  if (byte_offset)
    {
      byte_offset = fold_convert (sizetype, byte_offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, byte_offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    addr_base = build1 (ADDR_EXPR,
			build_pointer_type (TREE_TYPE (DR_REF (dr))),
			unshare_expr (DR_REF (dr)));

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
    {
      vect_duplicate_ssa_name_ptr_info (addr_base, dr);
      if (offset || byte_offset)
	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "created ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
      dump_printf (MSG_NOTE, "\n");
    }

  return addr_base;
}
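
/* Editor's sketch of the address arithmetic performed above, assuming a
   simple dataref 'a[i]' with 4-byte elements, drb->init == 0 and no other
   loop offset:

     base_offset = init = 0
     OFFSET = 4       =>  addr_base = &a + 4 * 4	(= &a[4]; OFFSET counts
							 elements, scaled by step)
     BYTE_OFFSET = 4  =>  addr_base = &a + 4		(= &a[1]; added verbatim)

   The result is forced into an SSA name, and any statements needed to
   compute it are appended to NEW_STMT_LIST for the caller to insert,
   typically on the loop preheader edge.  */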
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT: a stmt that references memory. Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

tree
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
			  tree offset, tree *initial_address,
			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
			  bool only_init, bool *inv_p, tree byte_offset)
{
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  tree step;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  step = vect_dr_behavior (dr)->step;
  if (integer_zerop (step))
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: ",
		       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
	dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
    {
      gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (orig_stmt);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
	inner:	   vp3 = phi(vp1,vp4)
		   vp4 = vp3 + inner_step
		   if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
						   offset, byte_offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
	 LOOP is zero. In this case the step here is also zero.  */
      if (*inv_p)
	iv_step = size_zero_node;
      else if (tree_int_cst_sgn (step) == -1)
	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
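
/* Editor's sketch of how a caller typically drives the function above
   (simplified from the load/store vectorization paths; names such as
   DATAREF_PTR and PTR_INCR follow the comments in this file, the rest are
   made up for the example):

     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
				   &dummy_init_addr, gsi, &ptr_incr,
				   false, &inv_p);
     for (j = 0; j < ncopies; j++)
       {
	 if (j != 0)
	   dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
					  NULL_TREE);
	 data_ref = build2 (MEM_REF, vectype, dataref_ptr,
			    build_int_cst (ref_type, 0));
	 ... emit the vector load/store using DATA_REF ...
       }

   This only shows the intended pairing of the pointer-creation and
   pointer-bump helpers, not the exact code in tree-vect-stmts.c.  */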
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size. If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional. The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional. The offset by which to bump the pointer. If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.

*/

tree
bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 gimple *stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
  gassign *incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  if (TREE_CODE (dataref_ptr) == SSA_NAME)
    new_dataref_ptr = copy_ssa_name (dataref_ptr);
  else
    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
				   dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists. */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
    }

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
      else
	gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
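
/* Editor's illustration of the rewiring done above, with concrete (made-up)
   SSA names and a 16-byte vector size:

     before:	vectp.5_2 = PHI <vectp.5_1, vectp.5_3>
		...
		vectp.5_3 = vectp.5_2 + 16;	<-- PTR_INCR

     after:	vectp.5_2 = PHI <vectp.5_1, vectp.5_3>
		...
		vectp.5_7 = vectp.5_2 + 16;	<-- returned NEW_DATAREF_PTR
		...
		vectp.5_3 = vectp.5_7 + 16;	<-- PTR_INCR, use rewritten

   i.e. PTR_INCR now computes its result from the bumped pointer, so each
   mid-iteration bump also shifts where the next iteration starts.  */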
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype
    ? VECTOR_BOOLEAN_TYPE_P (vectype)
    ? vect_mask_var
    : vect_simple_var
    : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  free (new_name);

  return vec_dest;
}
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);

      if (count == 3)
	{
	  unsigned int j0 = 0, j1 = 0, j2 = 0;
	  unsigned int i, j;

	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  for (j = 0; j < 3; j++)
	    {
	      int nelt0 = ((3 - j) * nelt) % 3;
	      int nelt1 = ((3 - j) * nelt + 1) % 3;
	      int nelt2 = ((3 - j) * nelt + 2) % 3;
	      for (i = 0; i < nelt; i++)
		{
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = j0++;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = nelt + j1++;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = 0;
		}
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  return false;
		}

	      for (i = 0; i < nelt; i++)
		{
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = 3 * i + nelt0;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = 3 * i + nelt1;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = nelt + j2++;
		}
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nelt, 2, 3);
	  sel.quick_grow (6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nelt;
	    }
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      for (i = 0; i < 6; i++)
		sel[i] += nelt / 2;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
		 "permutation op not supported by target.\n");
  return false;
}
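
/* Editor's example: for a V4SI vector (nelt == 4) the two masks checked
   above in the power-of-two case are

     interleave high:  { 0, 4, 1, 5 }
     interleave low:   { 2, 6, 3, 7 }

   so VEC_PERM_EXPR <{a0,a1,a2,a3}, {b0,b1,b2,b3}, mask> yields
   {a0,b0,a1,b1} resp. {a2,b2,a3,b3}, which is exactly the element order an
   interleaved store of two arrays needs.  */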
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
				       vec_store_lanes_optab,
				       vectype, count);
}
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.  In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */

void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, n, log_length = exact_log2 (length);
  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 6; i++)
	sel[i] += nelt / 2;
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
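
/* Editor's sketch: a small self-contained C program (not part of GCC;
   compile it separately) that prints the interleave-high/low masks built
   in the power-of-two branch above, so the element routing of each stage
   can be checked by hand for a given element count NELT.

     #include <stdio.h>
     #include <stdlib.h>

     int
     main (int argc, char **argv)
     {
       // Number of elements per vector; defaults to 8 as in the comment.
       unsigned nelt = argc > 1 ? (unsigned) atoi (argv[1]) : 8;
       unsigned i;

       printf ("interleave high:");
       for (i = 0; i < nelt / 2; i++)
	 printf (" %u %u", i, nelt + i);
       printf ("\ninterleave low: ");
       for (i = 0; i < nelt / 2; i++)
	 printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i);
       printf ("\n");
       return 0;
     }

   For NELT == 8 it prints {0 8 1 9 2 10 3 11} and {4 12 5 13 6 14 7 15};
   applying the first mask to (1st vec, 3rd vec) of the example above gives
   exactly I1 = 0 16 1 17 2 18 3 19.  */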
/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
   This function generates the following code at the loop prolog:

      p = initial_addr;
   x  msq_init = *(floor(p));   # prolog load
      realignment_token = call target_builtin;
    loop:
   x  msq = phi (msq_init, ---)

   The stmts marked with x are generated only for the case of
   dr_explicit_realign_optimized.

   The code above sets up a new (vector) pointer, pointing to the first
   location accessed by STMT, and a "floor-aligned" load using that pointer.
   It also generates code to compute the "realignment-token" (if the relevant
   target hook was defined), and creates a phi-node at the loop-header bb
   whose arguments are the result of the prolog-load (created by this
   function) and the result of a load that takes place in the loop (to be
   created by the caller to this function).

   For the case of dr_explicit_realign_optimized:
   The caller to this function uses the phi-result (msq) to create the
   realignment code inside the loop, and sets up the missing phi argument,
   as follows:
    loop:
      msq = phi (msq_init, lsq)
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   For the case of dr_explicit_realign:
    loop:
      msq = *(floor(p));	# load in loop
      p' = p + (VS-1);
      lsq = *(floor(p'));	# load in loop
      result = realign_load (msq, lsq, realignment_token);

   Input:
   STMT - (scalar) load stmt to be vectorized. This load accesses
          a memory location that may be unaligned.
   BSI - place where new code is to be inserted.
   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
			      is used.

   Output:
   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
                       target hook, if defined.
   Return value - the result of the loop-header phi node.  */

tree
vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      true, &inv_p);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      unsigned int align = DR_TARGET_ALIGNMENT (dr);
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */

bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory).  */
  if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);

      if (count == 3)
	{
	  unsigned int k;
	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  for (k = 0; k < 3; k++)
	    {
	      for (i = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = 3 * i + k;
		else
		  sel[i] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	      for (i = 0, j = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = i;
		else
		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));

	  /* The encoding has a single stepped pattern.  */
	  vec_perm_builder sel (nelt, 1, 3);
	  sel.quick_grow (3);
	  for (i = 0; i < 3; i++)
	    sel[i] = i * 2;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      for (i = 0; i < 3; i++)
		sel[i] = i * 2 + 1;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
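
/* Editor's example: for a V4SI vector (nelt == 4) the power-of-two case
   above checks the two masks

     extract even:  { 0, 2, 4, 6 }
     extract odd:   { 1, 3, 5, 7 }

   i.e. VEC_PERM_EXPR <{a0,a1,a2,a3}, {b0,b1,b2,b3}, mask> produces
   {a0,a2,b0,b2} resp. {a1,a3,b1,b3}, the building block used by
   vect_permute_load_chain below.  */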
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}
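
/* Editor's note: vec_load_lanes (like vec_store_lanes above) models
   instructions that load COUNT consecutive vectors and de-interleave them
   in a single operation, e.g. AArch64/ARM NEON LD2/LD3/LD4 and their
   ST2/ST3/ST4 counterparts.  When such an optab is available the
   vectorizer can use it for a grouped access instead of emitting the
   explicit VEC_PERM_EXPR chains built by the vect_permute_{load,store}_chain
   routines in this file.  */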
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements. We assign a number to each
   element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7


   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							     ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							      ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5454 /* Function vect_shift_permute_load_chain.
5456 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5457 sequence of stmts to reorder the input data accordingly.
5458 Return the final references for loads in RESULT_CHAIN.
5459 Return true if successed, false otherwise.
5461 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5462 The input is 3 vectors each containing 8 elements. We assign a
5463 number to each element, the input sequence is:
5465 1st vec: 0 1 2 3 4 5 6 7
5466 2nd vec: 8 9 10 11 12 13 14 15
5467 3rd vec: 16 17 18 19 20 21 22 23
5469 The output sequence should be:
5471 1st vec: 0 3 6 9 12 15 18 21
5472 2nd vec: 1 4 7 10 13 16 19 22
5473 3rd vec: 2 5 8 11 14 17 20 23
5475 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5477 First we shuffle all 3 vectors to get correct elements order:
5479 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5480 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5481 3rd vec: (16 19 22) (17 20 23) (18 21)
5483 Next we unite and shift vector 3 times:
5486 shift right by 6 the concatenation of:
5487 "1st vec" and "2nd vec"
5488 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5489 "2nd vec" and "3rd vec"
5490 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5491 "3rd vec" and "1st vec"
5492 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5495 So that now new vectors are:
5497 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5498 2nd vec: (10 13) (16 19 22) (17 20 23)
5499 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5502 shift right by 5 the concatenation of:
5503 "1st vec" and "3rd vec"
5504 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5505 "2nd vec" and "1st vec"
5506 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5507 "3rd vec" and "2nd vec"
5508 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5511 So that now new vectors are:
5513 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5514 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5515 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5518 shift right by 5 the concatenation of:
5519 "1st vec" and "1st vec"
5520 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5521 shift right by 3 the concatenation of:
5522 "2nd vec" and "2nd vec"
5523 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5526 So that now all vectors are READY:
5527 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5528 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5529 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5531 This algorithm is faster than one in vect_permute_load_chain if:
5532 1. "shift of a concatination" is faster than general permutation.
5534 2. The TARGET machine can't execute vector instructions in parallel.
5535 This is because each step of the algorithm depends on previous.
5536 The algorithm in vect_permute_load_chain is much more parallel.
5538 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5542 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5543 unsigned int length
,
5545 gimple_stmt_iterator
*gsi
,
5546 vec
<tree
> *result_chain
)
5548 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5549 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5550 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5553 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5555 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5556 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5557 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5559 vec_perm_builder
sel (nelt
, nelt
, 1);
5560 sel
.quick_grow (nelt
);
5562 result_chain
->quick_grow (length
);
5563 memcpy (result_chain
->address (), dr_chain
.address (),
5564 length
* sizeof (tree
));
5566 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5568 unsigned int j
, log_length
= exact_log2 (length
);
5569 for (i
= 0; i
< nelt
/ 2; ++i
)
5571 for (i
= 0; i
< nelt
/ 2; ++i
)
5572 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5573 vec_perm_indices
indices (sel
, 2, nelt
);
5574 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5578 "shuffle of 2 fields structure is not \
5579 supported by target\n");
5582 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
5584 for (i
= 0; i
< nelt
/ 2; ++i
)
5586 for (i
= 0; i
< nelt
/ 2; ++i
)
5587 sel
[nelt
/ 2 + i
] = i
* 2;
5588 indices
.new_vector (sel
, 2, nelt
);
5589 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5591 if (dump_enabled_p ())
5592 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5593 "shuffle of 2 fields structure is not \
5594 supported by target\n");
5597 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
5599 /* Generating permutation constant to shift all elements.
5600 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5601 for (i
= 0; i
< nelt
; i
++)
5602 sel
[i
] = nelt
/ 2 + i
;
5603 indices
.new_vector (sel
, 2, nelt
);
5604 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5608 "shift permutation is not supported by target\n");
5611 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5613 /* Generating permutation constant to select vector from 2.
5614 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5615 for (i
= 0; i
< nelt
/ 2; i
++)
5617 for (i
= nelt
/ 2; i
< nelt
; i
++)
5619 indices
.new_vector (sel
, 2, nelt
);
5620 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5622 if (dump_enabled_p ())
5623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5624 "select is not supported by target\n");
5627 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5629 for (i
= 0; i
< log_length
; i
++)
5631 for (j
= 0; j
< length
; j
+= 2)
5633 first_vect
= dr_chain
[j
];
5634 second_vect
= dr_chain
[j
+ 1];
5636 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5637 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5638 first_vect
, first_vect
,
5640 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5643 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5644 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5645 second_vect
, second_vect
,
5647 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5650 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5651 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5652 vect
[0], vect
[1], shift1_mask
);
5653 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5654 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5656 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5657 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5658 vect
[0], vect
[1], select_mask
);
5659 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5660 (*result_chain
)[j
/2] = data_ref
;
5662 memcpy (dr_chain
.address (), result_chain
->address (),
5663 length
* sizeof (tree
));
5667 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5669 unsigned int k
= 0, l
= 0;
5671 /* Generating permutation constant to get all elements in rigth order.
5672 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5673 for (i
= 0; i
< nelt
; i
++)
5675 if (3 * k
+ (l
% 3) >= nelt
)
5678 l
+= (3 - (nelt
% 3));
5680 sel
[i
] = 3 * k
+ (l
% 3);
5683 vec_perm_indices
indices (sel
, 2, nelt
);
5684 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5686 if (dump_enabled_p ())
5687 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5688 "shuffle of 3 fields structure is not \
5689 supported by target\n");
5692 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5694 /* Generating permutation constant to shift all elements.
5695 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5696 for (i
= 0; i
< nelt
; i
++)
5697 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5698 indices
.new_vector (sel
, 2, nelt
);
5699 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5701 if (dump_enabled_p ())
5702 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5703 "shift permutation is not supported by target\n");
5706 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5708 /* Generating permutation constant to shift all elements.
5709 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5710 for (i
= 0; i
< nelt
; i
++)
5711 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5712 indices
.new_vector (sel
, 2, nelt
);
5713 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5717 "shift permutation is not supported by target\n");
5720 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }
      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
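/* Worked example of the LENGTH == 3 branch above (illustrative, assuming
   NELT == 8, i.e. 24 scalar elements a0..a23 spread over the three input
   vectors of DR_CHAIN, with the mask values quoted in the comments above):

     vect[k]      = VEC_PERM <dr_chain[k], dr_chain[k], perm3_mask>
                    // {a0 a3 a6 a1 a4 a7 a2 a5}, {a8 a11 a14 a9 ...}, ...
     vect_shift[] = cross-vector shifts of neighbouring vect[] vectors
                    // using shift1_mask
     vect[]       = second round of shifts pairing the vect_shift[] vectors
                    // using shift2_mask

   After the two final single-vector shifts (shift3_mask, shift4_mask) the
   outputs are

     (*result_chain)[0] = {a0 a3 a6 a9 a12 a15 a18 a21}   // field 0
     (*result_chain)[1] = {a1 a4 a7 a10 a13 a16 a19 a22}  // field 1
     (*result_chain)[2] = {a2 a5 a8 a11 a14 a17 a20 a23}  // field 2

   i.e. each output vector gathers one field of the 3-element structure.  */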
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;
  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
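/* Illustrative sketch of the decision above (the numbers are assumptions,
   not taken from any particular target): on a target whose
   targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) is 1, and for a
   group of SIZE == 3 loads, the shift-based scheme is attempted first:

     vect_shift_permute_load_chain (dr_chain, 3, stmt, gsi, &result_chain)

   and only if it cannot be generated (e.g. the target lacks the required
   constant permutes) do we fall back to

     vect_permute_load_chain (dr_chain, 3, stmt, gsi, &result_chain);

   With a reassociation width of 2 or more (or a power-of-two SIZE, per the
   condition above) the generic extraction of vect_permute_load_chain is used
   directly, since the target can overlap its permutes.  */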
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;
      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  gimple *prev_stmt =
                    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple *rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt =
                        STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }

                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                    new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
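/* Illustrative example of the gap handling above (a hypothetical group, not
   taken from a testcase): suppose the scalar code loads only a[4*i] and
   a[4*i+3] of a 4-element interleaving, so the group has two statements and
   GROUP_GAP of the second one is 3.  RESULT_CHAIN then holds four permuted
   vectors {v0, v1, v2, v3}, one per interleaving slot.  The walk above
   assigns v0 to the first scalar load, skips v1 and v2 (gap_count values 1
   and 2 are below GROUP_GAP == 3), and assigns v3 to the second load; the
   unused v1 and v2 definitions are left for DCE to delete.  */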
/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
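/* Illustrative uses (the DECLs described are hypothetical):

     - a function-local array accessed in a vectorized loop: TREE_STATIC is
       false, so the request succeeds as long as ALIGNMENT does not exceed
       MAX_STACK_ALIGNMENT;
     - a file-scope static array: the limit is MAX_OFILE_ALIGNMENT instead;
     - a symbol whose symtab node reports !can_increase_alignment_p ()
       (e.g. one defined in another translation unit): the function returns
       false and the vectorizer must peel or version for alignment instead.  */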
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with a
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }
  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */
  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
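/* Illustrative sketch of how a caller typically branches on the result
   (the surrounding context and variable names are assumptions, not code
   from this file):

     switch (vect_supportable_dr_alignment (dr, false))
       {
       case dr_aligned:
       case dr_unaligned_supported:
         // Emit a plain (possibly misaligned) vector load/store.
         break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         // Emit aligned loads plus REALIGN_LOAD; the optimized variant
         // computes the realignment token once in the loop preheader.
         break;
       case dr_unaligned_unsupported:
         // Peel for alignment, version the loop, or give up.
         break;
       }  */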