gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
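/* As an illustration: assume VECTYPE is a 128-bit V4SI vector and COUNT
   is 2.  The query then looks for a 256-bit integer mode to serve as the
   array mode and checks that OPTAB has a handler for that (array mode,
   vector mode) pair; if no suitable integer mode exists the lookup yields
   BLKmode and the function reports the failure in the dump file.  */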
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode, array_mode;
64 bool limit_p;
66 mode = TYPE_MODE (vectype);
67 limit_p = !targetm.array_mode_supported_p (mode, count);
68 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
69 MODE_INT, limit_p);
71 if (array_mode == BLKmode)
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
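/* For example, for the promotion statement 'int_x = (int) short_inv',
   assuming a target where int is 4 bytes and short int is 2 bytes, the
   lhs size is 4 and the rhs size is 2; the function then returns the
   short type and sets *LHS_SIZE_UNIT to 4 and *RHS_SIZE_UNIT to 2, so
   that the smaller datatype participates in determining the VF.  */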
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
124 if (is_gimple_assign (stmt)
125 && (gimple_assign_cast_p (stmt)
126 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
127 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
128 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
130 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
132 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
133 if (rhs < lhs)
134 scalar_type = rhs_type;
137 *lhs_size_unit = lhs;
138 *rhs_size_unit = rhs;
139 return scalar_type;
143 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
144 tested at run-time. Return TRUE if DDR was successfully inserted.
145 Return false if versioning is not supported. */
147 static bool
148 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
150 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
152 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
153 return false;
155 if (!runtime_alias_check_p (ddr, loop,
156 optimize_loop_nest_for_speed_p (loop)))
157 return false;
159 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
160 return true;
164 /* A subroutine of vect_analyze_data_ref_dependence. Handle
165 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
166 distances. These distances are conservatively correct but they don't
167 reflect a guaranteed dependence.
169 Return true if this function does all the work necessary to avoid
170 an alias or false if the caller should use the dependence distances
171 to limit the vectorization factor in the usual way. LOOP_DEPTH is
172 the depth of the loop described by LOOP_VINFO and the other arguments
173 are as for vect_analyze_data_ref_dependence. */
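/* For instance, if the recorded dependence distance is 4 but the user
   annotated the loop with a safelen of 16 (e.g. via an OpenMP 'simd
   safelen(16)' clause), the accesses are treated as independent and
   *MAX_VF is capped at 16; without such an annotation the distance is
   instead handed to the run-time alias check machinery.  */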
175 static bool
176 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
177 loop_vec_info loop_vinfo,
178 int loop_depth, int *max_vf)
180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
181 lambda_vector dist_v;
182 unsigned int i;
183 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
185 int dist = dist_v[loop_depth];
186 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
188 /* If the user asserted safelen >= DIST consecutive iterations
189 can be executed concurrently, assume independence.
191 ??? An alternative would be to add the alias check even
192 in this case, and vectorize the fallback loop with the
193 maximum VF set to safelen. However, if the user has
194 explicitly given a length, it's less likely that that
195 would be a win. */
196 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
198 if (loop->safelen < *max_vf)
199 *max_vf = loop->safelen;
200 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
201 continue;
204 /* For dependence distances of 2 or more, we have the option
205 of limiting VF or checking for an alias at runtime.
206 Prefer to check at runtime if we can, to avoid limiting
207 the VF unnecessarily when the bases are in fact independent.
209 Note that the alias checks will be removed if the VF ends up
210 being small enough. */
211 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
214 return true;
218 /* Function vect_analyze_data_ref_dependence.
220 Return TRUE if there (might) exist a dependence between a memory-reference
221 DRA and a memory-reference DRB. When the dependence can instead be checked
222 at run time by versioning for alias, return FALSE. Adjust *MAX_VF according
223 to the data dependence. */
225 static bool
226 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
227 loop_vec_info loop_vinfo, int *max_vf)
229 unsigned int i;
230 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
231 struct data_reference *dra = DDR_A (ddr);
232 struct data_reference *drb = DDR_B (ddr);
233 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
234 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
235 lambda_vector dist_v;
236 unsigned int loop_depth;
238 /* In loop analysis all data references should be vectorizable. */
239 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
240 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
241 gcc_unreachable ();
243 /* Independent data accesses. */
244 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
245 return false;
247 if (dra == drb
248 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
249 return false;
251 /* We do not have to consider dependences between accesses that belong
252 to the same group. */
253 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
254 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
255 return false;
257 /* Even if we have an anti-dependence then, as the vectorized loop covers at
258 least two scalar iterations, there is always also a true dependence.
259 As the vectorizer does not re-order loads and stores we can ignore
260 the anti-dependence if TBAA can disambiguate both DRs similar to the
261 case with known negative distance anti-dependences (positive
262 distance anti-dependences would violate TBAA constraints). */
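/* For instance, if DRA reads an object through a 'float *' and DRB writes
   through an 'int *', strict-aliasing (TBAA) rules imply the accesses
   cannot overlap; alias_sets_conflict_p then returns false and the
   anti-dependence can safely be ignored.  */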
263 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
264 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
265 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
266 get_alias_set (DR_REF (drb))))
267 return false;
269 /* Unknown data dependence. */
270 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
272 /* If user asserted safelen consecutive iterations can be
273 executed concurrently, assume independence. */
274 if (loop->safelen >= 2)
276 if (loop->safelen < *max_vf)
277 *max_vf = loop->safelen;
278 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
279 return false;
282 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
283 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
288 "versioning for alias not supported for: "
289 "can't determine dependence between ");
290 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
291 DR_REF (dra));
292 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (drb));
295 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
297 return true;
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias required: "
304 "can't determine dependence between ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
313 /* Add to list of ddrs that need to be tested at run-time. */
314 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
317 /* Known data dependence. */
318 if (DDR_NUM_DIST_VECTS (ddr) == 0)
320 /* If user asserted safelen consecutive iterations can be
321 executed concurrently, assume independence. */
322 if (loop->safelen >= 2)
324 if (loop->safelen < *max_vf)
325 *max_vf = loop->safelen;
326 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
327 return false;
330 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
331 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
336 "versioning for alias not supported for: "
337 "bad dist vector for ");
338 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
339 DR_REF (dra));
340 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 DR_REF (drb));
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 return true;
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
351 "versioning for alias required: "
352 "bad dist vector for ");
353 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
354 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
356 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 /* Add to list of ddrs that need to be tested at run-time. */
359 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
362 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
364 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
365 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
366 loop_depth, max_vf))
367 return false;
369 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
371 int dist = dist_v[loop_depth];
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_NOTE, vect_location,
375 "dependence distance = %d.\n", dist);
377 if (dist == 0)
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location,
382 "dependence distance == 0 between ");
383 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
384 dump_printf (MSG_NOTE, " and ");
385 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
386 dump_printf (MSG_NOTE, "\n");
389 /* When we perform grouped accesses and perform implicit CSE
390 by detecting equal accesses and doing disambiguation with
391 runtime alias tests like for
392 .. = a[i];
393 .. = a[i+1];
394 a[i] = ..;
395 a[i+1] = ..;
396 *p = ..;
397 .. = a[i];
398 .. = a[i+1];
399 where we will end up loading { a[i], a[i+1] } once, make
400 sure that inserting group loads before the first load and
401 stores after the last store will do the right thing.
402 Similar for groups like
403 a[i] = ...;
404 ... = a[i];
405 a[i+1] = ...;
406 where loads from the group interleave with the store. */
407 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
408 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
410 gimple *earlier_stmt;
411 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
412 if (DR_IS_WRITE
413 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
417 "READ_WRITE dependence in interleaving."
418 "\n");
419 return true;
423 continue;
426 if (dist > 0 && DDR_REVERSED_P (ddr))
428 /* If DDR_REVERSED_P the order of the data-refs in DDR was
429 reversed (to make distance vector positive), and the actual
430 distance is negative. */
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
433 "dependence distance negative.\n");
434 /* Record a negative dependence distance to later limit the
435 amount of stmt copying / unrolling we can perform.
436 Only need to handle read-after-write dependence. */
437 if (DR_IS_READ (drb)
438 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
439 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
440 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
441 continue;
444 if (abs (dist) >= 2
445 && abs (dist) < *max_vf)
447 /* The dependence distance requires reduction of the maximal
448 vectorization factor. */
449 *max_vf = abs (dist);
450 if (dump_enabled_p ())
451 dump_printf_loc (MSG_NOTE, vect_location,
452 "adjusting maximal vectorization factor to %i\n",
453 *max_vf);
456 if (abs (dist) >= *max_vf)
458 /* Dependence distance does not create dependence, as far as
459 vectorization is concerned, in this case. */
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "dependence distance >= VF.\n");
463 continue;
466 if (dump_enabled_p ())
468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
469 "not vectorized, possible dependence "
470 "between data-refs ");
471 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
472 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
473 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
474 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
477 return true;
480 return false;
483 /* Function vect_analyze_data_ref_dependences.
485 Examine all the data references in the loop, and make sure there do not
486 exist any data dependences between them. Set *MAX_VF according to
487 the maximum vectorization factor the data dependences allow. */
489 bool
490 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
492 unsigned int i;
493 struct data_dependence_relation *ddr;
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_NOTE, vect_location,
497 "=== vect_analyze_data_ref_dependences ===\n");
499 LOOP_VINFO_DDRS (loop_vinfo)
500 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
501 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
502 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
503 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
504 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
505 &LOOP_VINFO_DDRS (loop_vinfo),
506 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
507 return false;
509 /* For epilogues we either have no aliases or alias versioning
510 was applied to the original loop. Therefore we may just get max_vf
511 using the VF of the original loop. */
512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
513 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
514 else
515 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
516 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
517 return false;
519 return true;
523 /* Function vect_slp_analyze_data_ref_dependence.
525 Return TRUE if there (might) exist a dependence between a memory-reference
526 DRA and a memory-reference DRB of DDR. Unlike the loop variant, no run-time
527 alias check is created here, so a possible dependence simply makes the
528 SLP instance unvectorizable. */
530 static bool
531 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
533 struct data_reference *dra = DDR_A (ddr);
534 struct data_reference *drb = DDR_B (ddr);
536 /* We need to check dependences of statements marked as unvectorizable
537 as well, they still can prohibit vectorization. */
539 /* Independent data accesses. */
540 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
541 return false;
543 if (dra == drb)
544 return false;
546 /* Read-read is OK. */
547 if (DR_IS_READ (dra) && DR_IS_READ (drb))
548 return false;
550 /* If dra and drb are part of the same interleaving chain consider
551 them independent. */
552 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
553 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
554 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
555 return false;
557 /* Unknown data dependence. */
558 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
560 if (dump_enabled_p ())
562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
563 "can't determine dependence between ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
565 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
566 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
567 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
570 else if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "determined dependence between ");
574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
575 dump_printf (MSG_NOTE, " and ");
576 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
577 dump_printf (MSG_NOTE, "\n");
580 return true;
584 /* Analyze dependences involved in the transform of SLP NODE. STORES
585 contain the vector of scalar stores of this instance if we are
586 disambiguating the loads. */
588 static bool
589 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
590 vec<gimple *> stores, gimple *last_store)
592 /* This walks over all stmts involved in the SLP load/store done
593 in NODE verifying we can sink them up to the last stmt in the
594 group. */
595 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
596 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
598 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
599 if (access == last_access)
600 continue;
601 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
602 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
603 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
605 gimple *stmt = gsi_stmt (gsi);
606 if (! gimple_vuse (stmt)
607 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
608 continue;
610 /* If we couldn't record a (single) data reference for this
611 stmt we have to give up. */
612 /* ??? Here and below if dependence analysis fails we can resort
613 to the alias oracle which can handle more kinds of stmts. */
614 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
615 if (!dr_b)
616 return false;
618 bool dependent = false;
619 /* If we run into a store of this same instance (we've just
620 marked those) then delay dependence checking until we run
621 into the last store because this is where it will have
622 been sunk to (and we verify if we can do that as well). */
623 if (gimple_visited_p (stmt))
625 if (stmt != last_store)
626 continue;
627 unsigned i;
628 gimple *store;
629 FOR_EACH_VEC_ELT (stores, i, store)
631 data_reference *store_dr
632 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
633 ddr_p ddr = initialize_data_dependence_relation
634 (dr_a, store_dr, vNULL);
635 dependent = vect_slp_analyze_data_ref_dependence (ddr);
636 free_dependence_relation (ddr);
637 if (dependent)
638 break;
641 else
643 ddr_p ddr = initialize_data_dependence_relation (dr_a,
644 dr_b, vNULL);
645 dependent = vect_slp_analyze_data_ref_dependence (ddr);
646 free_dependence_relation (ddr);
648 if (dependent)
649 return false;
652 return true;
656 /* Function vect_slp_analyze_instance_dependence.
658 Examine all the data references in the basic-block SLP instance, and make
659 sure there do not exist any data dependences between them that would
660 prevent the instance from being vectorized. */
662 bool
663 vect_slp_analyze_instance_dependence (slp_instance instance)
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "=== vect_slp_analyze_instance_dependence ===\n");
669 /* The stores of this instance are at the root of the SLP tree. */
670 slp_tree store = SLP_INSTANCE_TREE (instance);
671 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
672 store = NULL;
674 /* Verify we can sink stores to the vectorized stmt insert location. */
675 gimple *last_store = NULL;
676 if (store)
678 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
679 return false;
681 /* Mark stores in this instance and remember the last one. */
682 last_store = vect_find_last_scalar_stmt_in_slp (store);
683 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
684 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
687 bool res = true;
689 /* Verify we can sink loads to the vectorized stmt insert location,
690 special-casing stores of this instance. */
691 slp_tree load;
692 unsigned int i;
693 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
694 if (! vect_slp_analyze_node_dependences (instance, load,
695 store
696 ? SLP_TREE_SCALAR_STMTS (store)
697 : vNULL, last_store))
699 res = false;
700 break;
703 /* Unset the visited flag. */
704 if (store)
705 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
706 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
708 return res;
711 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
712 the statement that contains DRB, which is useful for recording in the
713 dump file. */
715 static void
716 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
717 innermost_loop_behavior *drb)
719 bool existed;
720 innermost_loop_behavior *&entry
721 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
722 if (!existed || entry->base_alignment < drb->base_alignment)
724 entry = drb;
725 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "recording new base alignment for ");
729 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
730 dump_printf (MSG_NOTE, "\n");
731 dump_printf_loc (MSG_NOTE, vect_location,
732 " alignment: %d\n", drb->base_alignment);
733 dump_printf_loc (MSG_NOTE, vect_location,
734 " misalignment: %d\n", drb->base_misalignment);
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " based on: ");
737 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
742 /* If the region we're going to vectorize is reached, all unconditional
743 data references occur at least once. We can therefore pool the base
744 alignment guarantees from each unconditional reference. Do this by
745 going through all the data references in VINFO and checking whether
746 the containing statement makes the reference unconditionally. If so,
747 record the alignment of the base address in VINFO so that it can be
748 used for all other references with the same base. */
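/* As an example, if the loop body contains an unconditional load from
   p[i] and a conditional store to p[i+1], the two references typically
   share the same base address; the alignment proved for the unconditional
   load is recorded for that base and can later be reused when computing
   the misalignment of the conditional store.  */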
750 void
751 vect_record_base_alignments (vec_info *vinfo)
753 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
754 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
755 data_reference *dr;
756 unsigned int i;
757 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
758 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
760 gimple *stmt = DR_STMT (dr);
761 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
763 /* If DR is nested in the loop that is being vectorized, we can also
764 record the alignment of the base wrt the outer loop. */
765 if (loop && nested_in_vect_loop_p (loop, stmt))
767 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
768 vect_record_base_alignment
769 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
774 /* Function vect_compute_data_ref_alignment
776 Compute the misalignment of the data reference DR.
778 Output:
779 1. If during the misalignment computation it is found that the data reference
780 cannot be vectorized then false is returned.
781 2. DR_MISALIGNMENT (DR) is defined.
783 FOR NOW: No analysis is actually performed. Misalignment is calculated
784 only for trivial cases. TODO. */
786 bool
787 vect_compute_data_ref_alignment (struct data_reference *dr)
789 gimple *stmt = DR_STMT (dr);
790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
791 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
792 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
793 struct loop *loop = NULL;
794 tree ref = DR_REF (dr);
795 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "vect_compute_data_ref_alignment:\n");
801 if (loop_vinfo)
802 loop = LOOP_VINFO_LOOP (loop_vinfo);
804 /* Initialize misalignment to unknown. */
805 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
807 innermost_loop_behavior *drb = vect_dr_behavior (dr);
808 bool step_preserves_misalignment_p;
810 /* No step for BB vectorization. */
811 if (!loop)
813 gcc_assert (integer_zerop (drb->step));
814 step_preserves_misalignment_p = true;
817 /* In case the dataref is in an inner-loop of the loop that is being
818 vectorized (LOOP), we use the base and misalignment information
819 relative to the outer-loop (LOOP). This is ok only if the misalignment
820 stays the same throughout the execution of the inner-loop, which is why
821 we have to check that the stride of the dataref in the inner-loop is
822 divisible by the vector size.
823 else if (nested_in_vect_loop_p (loop, stmt))
825 step_preserves_misalignment_p
826 = (DR_STEP_ALIGNMENT (dr)
827 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
829 if (dump_enabled_p ())
831 if (step_preserves_misalignment_p)
832 dump_printf_loc (MSG_NOTE, vect_location,
833 "inner step divides the vector-size.\n");
834 else
835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
836 "inner step doesn't divide the vector-size.\n");
840 /* Similarly we can only use base and misalignment information relative to
841 an innermost loop if the misalignment stays the same throughout the
842 execution of the loop. As above, this is the case if the stride of
843 the dataref is divisible by the vector size.
844 else
846 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
847 step_preserves_misalignment_p
848 = ((DR_STEP_ALIGNMENT (dr) * vf)
849 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
851 if (!step_preserves_misalignment_p && dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
853 "step doesn't divide the vector-size.\n");
856 unsigned int base_alignment = drb->base_alignment;
857 unsigned int base_misalignment = drb->base_misalignment;
858 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
860 /* Calculate the maximum of the pooled base address alignment and the
861 alignment that we can compute for DR itself. */
862 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
863 if (entry && base_alignment < (*entry)->base_alignment)
865 base_alignment = (*entry)->base_alignment;
866 base_misalignment = (*entry)->base_misalignment;
869 if (drb->offset_alignment < vector_alignment
870 || !step_preserves_misalignment_p
871 /* We need to know whether the step wrt the vectorized loop is
872 negative when computing the starting misalignment below. */
873 || TREE_CODE (drb->step) != INTEGER_CST)
875 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Unknown alignment for access: ");
879 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
880 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
882 return true;
885 if (base_alignment < vector_alignment)
887 tree base = drb->base_address;
888 if (TREE_CODE (base) == ADDR_EXPR)
889 base = TREE_OPERAND (base, 0);
890 if (!vect_can_force_dr_alignment_p (base,
891 vector_alignment * BITS_PER_UNIT))
893 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "can't force alignment of ref: ");
897 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
898 dump_printf (MSG_NOTE, "\n");
900 return true;
903 if (DECL_USER_ALIGN (base))
905 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location,
908 "not forcing alignment of user-aligned "
909 "variable: ");
910 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
911 dump_printf (MSG_NOTE, "\n");
913 return true;
916 /* Force the alignment of the decl.
917 NOTE: This is the only change to the code we make during
918 the analysis phase, before deciding to vectorize the loop. */
919 if (dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
922 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
923 dump_printf (MSG_NOTE, "\n");
926 DR_VECT_AUX (dr)->base_decl = base;
927 DR_VECT_AUX (dr)->base_misaligned = true;
928 base_misalignment = 0;
930 unsigned int misalignment = (base_misalignment
931 + TREE_INT_CST_LOW (drb->init));
933 /* If this is a backward running DR then first access in the larger
934 vectype actually is N-1 elements before the address in the DR.
935 Adjust misalign accordingly. */
936 if (tree_int_cst_sgn (drb->step) < 0)
937 /* PLUS because STEP is negative. */
938 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
939 * TREE_INT_CST_LOW (drb->step));
941 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
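/* Worked example: on a target where VECTYPE is V4SI with a 16-byte
   alignment requirement, BASE_MISALIGNMENT 4 and INIT 8 give a
   misalignment of 12 bytes (12 & 15).  For a backward access with step
   -4 the first vector element read is 3 elements earlier, so
   12 + 3 * -4 = 0 and the access turns out to be aligned.  */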
943 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
947 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
948 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
951 return true;
955 /* Function vect_update_misalignment_for_peel.
956 Sets DR's misalignment
957 - to 0 if it has the same alignment as DR_PEEL,
958 - to the misalignment computed using NPEEL if DR's misalignment is known,
959 - to -1 (unknown) otherwise.
961 DR - the data reference whose misalignment is to be adjusted.
962 DR_PEEL - the data reference whose misalignment is being made
963 zero in the vector loop by the peel.
964 NPEEL - the number of iterations in the peel loop if the misalignment
965 of DR_PEEL is known at compile time. */
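/* Worked example: assume DR is an int access whose misalignment against
   a 16-byte vector alignment is known to be 4, DR_PEEL's misalignment is
   also known, DR is not in DR_PEEL's same-alignment list, and NPEEL is 3.
   Peeling three scalar iterations advances DR by 3 * 4 = 12 bytes, so its
   new misalignment becomes (4 + 12) & 15 = 0.  */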
967 static void
968 vect_update_misalignment_for_peel (struct data_reference *dr,
969 struct data_reference *dr_peel, int npeel)
971 unsigned int i;
972 vec<dr_p> same_aligned_drs;
973 struct data_reference *current_dr;
974 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
975 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
976 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
977 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
979 /* For interleaved data accesses the step in the loop must be multiplied by
980 the size of the interleaving group. */
981 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
982 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
983 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
984 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
986 /* It can be assumed that the data refs with the same alignment as dr_peel
987 are aligned in the vector loop. */
988 same_aligned_drs
989 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
990 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
992 if (current_dr != dr)
993 continue;
994 gcc_assert (!known_alignment_for_access_p (dr)
995 || !known_alignment_for_access_p (dr_peel)
996 || (DR_MISALIGNMENT (dr) / dr_size
997 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
998 SET_DR_MISALIGNMENT (dr, 0);
999 return;
1002 if (known_alignment_for_access_p (dr)
1003 && known_alignment_for_access_p (dr_peel))
1005 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1006 int misal = DR_MISALIGNMENT (dr);
1007 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1008 misal += negative ? -npeel * dr_size : npeel * dr_size;
1009 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1010 SET_DR_MISALIGNMENT (dr, misal);
1011 return;
1014 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1016 "to unknown (-1).\n");
1017 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1021 /* Function verify_data_ref_alignment
1023 Return TRUE if DR can be handled with respect to alignment. */
1025 static bool
1026 verify_data_ref_alignment (data_reference_p dr)
1028 enum dr_alignment_support supportable_dr_alignment
1029 = vect_supportable_dr_alignment (dr, false);
1030 if (!supportable_dr_alignment)
1032 if (dump_enabled_p ())
1034 if (DR_IS_READ (dr))
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "not vectorized: unsupported unaligned load.");
1037 else
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "not vectorized: unsupported unaligned "
1040 "store.");
1042 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1043 DR_REF (dr));
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1046 return false;
1049 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1050 dump_printf_loc (MSG_NOTE, vect_location,
1051 "Vectorizing an unaligned access.\n");
1053 return true;
1056 /* Function vect_verify_datarefs_alignment
1058 Return TRUE if all data references in the loop can be
1059 handled with respect to alignment. */
1061 bool
1062 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1064 vec<data_reference_p> datarefs = vinfo->datarefs;
1065 struct data_reference *dr;
1066 unsigned int i;
1068 FOR_EACH_VEC_ELT (datarefs, i, dr)
1070 gimple *stmt = DR_STMT (dr);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1073 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1074 continue;
1076 /* For interleaving, only the alignment of the first access matters. */
1077 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1078 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1079 continue;
1081 /* Strided accesses perform only component accesses, alignment is
1082 irrelevant for them. */
1083 if (STMT_VINFO_STRIDED_P (stmt_info)
1084 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1085 continue;
1087 if (! verify_data_ref_alignment (dr))
1088 return false;
1091 return true;
1094 /* Given a memory reference EXP return whether its alignment is less
1095 than its size. */
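/* E.g. a 32-bit int member of a packed structure may only be guaranteed
   1-byte (8-bit) alignment; since its size (32 bits) exceeds that
   alignment, the reference is considered packed.  */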
1097 static bool
1098 not_size_aligned (tree exp)
1100 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1101 return true;
1103 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1104 > get_object_alignment (exp));
1107 /* Function vector_alignment_reachable_p
1109 Return true if vector alignment for DR is reachable by peeling
1110 a few loop iterations. Return false otherwise. */
1112 static bool
1113 vector_alignment_reachable_p (struct data_reference *dr)
1115 gimple *stmt = DR_STMT (dr);
1116 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1117 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1119 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1121 /* For interleaved access we peel only if the number of iterations in
1122 the prolog loop ({VF - misalignment}) is a multiple of the
1123 number of the interleaved accesses. */
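/* For instance, with 8 elements per vector, a known misalignment of
   3 elements and a group size of 2, the prolog would need 8 - 3 = 5
   scalar iterations, which is not a multiple of 2, so peeling is not
   attempted for this access.  */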
1124 int elem_size, mis_in_elements;
1125 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1127 /* FORNOW: handle only known alignment. */
1128 if (!known_alignment_for_access_p (dr))
1129 return false;
1131 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1132 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1134 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1135 return false;
1138 /* If misalignment is known at the compile time then allow peeling
1139 only if natural alignment is reachable through peeling. */
1140 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1142 HOST_WIDE_INT elmsize =
1143 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1144 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1148 dump_printf (MSG_NOTE,
1149 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1151 if (DR_MISALIGNMENT (dr) % elmsize)
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "data size does not divide the misalignment.\n");
1156 return false;
1160 if (!known_alignment_for_access_p (dr))
1162 tree type = TREE_TYPE (DR_REF (dr));
1163 bool is_packed = not_size_aligned (DR_REF (dr));
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Unknown misalignment, %snaturally aligned\n",
1167 is_packed ? "not " : "");
1168 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1171 return true;
1175 /* Calculate the cost of the memory access represented by DR. */
1177 static void
1178 vect_get_data_access_cost (struct data_reference *dr,
1179 unsigned int *inside_cost,
1180 unsigned int *outside_cost,
1181 stmt_vector_for_cost *body_cost_vec)
1183 gimple *stmt = DR_STMT (dr);
1184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1185 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1186 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1187 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1188 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1190 if (DR_IS_READ (dr))
1191 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1192 NULL, body_cost_vec, false);
1193 else
1194 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_NOTE, vect_location,
1198 "vect_get_data_access_cost: inside_cost = %d, "
1199 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1203 typedef struct _vect_peel_info
1205 struct data_reference *dr;
1206 int npeel;
1207 unsigned int count;
1208 } *vect_peel_info;
1210 typedef struct _vect_peel_extended_info
1212 struct _vect_peel_info peel_info;
1213 unsigned int inside_cost;
1214 unsigned int outside_cost;
1215 } *vect_peel_extended_info;
1218 /* Peeling hashtable helpers. */
1220 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1222 static inline hashval_t hash (const _vect_peel_info *);
1223 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1226 inline hashval_t
1227 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1229 return (hashval_t) peel_info->npeel;
1232 inline bool
1233 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1235 return (a->npeel == b->npeel);
1239 /* Insert DR into peeling hash table with NPEEL as key. */
1241 static void
1242 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1243 loop_vec_info loop_vinfo, struct data_reference *dr,
1244 int npeel)
1246 struct _vect_peel_info elem, *slot;
1247 _vect_peel_info **new_slot;
1248 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1250 elem.npeel = npeel;
1251 slot = peeling_htab->find (&elem);
1252 if (slot)
1253 slot->count++;
1254 else
1256 slot = XNEW (struct _vect_peel_info);
1257 slot->npeel = npeel;
1258 slot->dr = dr;
1259 slot->count = 1;
1260 new_slot = peeling_htab->find_slot (slot, INSERT);
1261 *new_slot = slot;
1264 if (!supportable_dr_alignment
1265 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1266 slot->count += VECT_MAX_COST;
1270 /* Traverse peeling hash table to find peeling option that aligns maximum
1271 number of data accesses. */
1274 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1275 _vect_peel_extended_info *max)
1277 vect_peel_info elem = *slot;
1279 if (elem->count > max->peel_info.count
1280 || (elem->count == max->peel_info.count
1281 && max->peel_info.npeel > elem->npeel))
1283 max->peel_info.npeel = elem->npeel;
1284 max->peel_info.count = elem->count;
1285 max->peel_info.dr = elem->dr;
1288 return 1;
1291 /* Get the costs of peeling NPEEL iterations checking data access costs
1292 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1293 misalignment will be zero after peeling. */
1295 static void
1296 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1297 struct data_reference *dr0,
1298 unsigned int *inside_cost,
1299 unsigned int *outside_cost,
1300 stmt_vector_for_cost *body_cost_vec,
1301 unsigned int npeel,
1302 bool unknown_misalignment)
1304 unsigned i;
1305 data_reference *dr;
1307 FOR_EACH_VEC_ELT (datarefs, i, dr)
1309 gimple *stmt = DR_STMT (dr);
1310 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1311 /* For interleaving, only the alignment of the first access
1312 matters. */
1313 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1314 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1315 continue;
1317 /* Strided accesses perform only component accesses, alignment is
1318 irrelevant for them. */
1319 if (STMT_VINFO_STRIDED_P (stmt_info)
1320 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1321 continue;
1323 int save_misalignment;
1324 save_misalignment = DR_MISALIGNMENT (dr);
1325 if (npeel == 0)
1327 else if (unknown_misalignment && dr == dr0)
1328 SET_DR_MISALIGNMENT (dr, 0);
1329 else
1330 vect_update_misalignment_for_peel (dr, dr0, npeel);
1331 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1332 body_cost_vec);
1333 SET_DR_MISALIGNMENT (dr, save_misalignment);
1337 /* Traverse peeling hash table and calculate cost for each peeling option.
1338 Find the one with the lowest cost. */
1341 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1342 _vect_peel_extended_info *min)
1344 vect_peel_info elem = *slot;
1345 int dummy;
1346 unsigned int inside_cost = 0, outside_cost = 0;
1347 gimple *stmt = DR_STMT (elem->dr);
1348 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1349 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1350 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1351 epilogue_cost_vec;
1353 prologue_cost_vec.create (2);
1354 body_cost_vec.create (2);
1355 epilogue_cost_vec.create (2);
1357 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1358 elem->dr, &inside_cost, &outside_cost,
1359 &body_cost_vec, elem->npeel, false);
1361 body_cost_vec.release ();
1363 outside_cost += vect_get_known_peeling_cost
1364 (loop_vinfo, elem->npeel, &dummy,
1365 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1366 &prologue_cost_vec, &epilogue_cost_vec);
1368 /* Prologue and epilogue costs are added to the target model later.
1369 These costs depend only on the scalar iteration cost, the
1370 number of peeling iterations finally chosen, and the number of
1371 misaligned statements. So discard the information found here. */
1372 prologue_cost_vec.release ();
1373 epilogue_cost_vec.release ();
1375 if (inside_cost < min->inside_cost
1376 || (inside_cost == min->inside_cost
1377 && outside_cost < min->outside_cost))
1379 min->inside_cost = inside_cost;
1380 min->outside_cost = outside_cost;
1381 min->peel_info.dr = elem->dr;
1382 min->peel_info.npeel = elem->npeel;
1383 min->peel_info.count = elem->count;
1386 return 1;
1390 /* Choose best peeling option by traversing peeling hash table and either
1391 choosing an option with the lowest cost (if cost model is enabled) or the
1392 option that aligns as many accesses as possible. */
1394 static struct _vect_peel_extended_info
1395 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1396 loop_vec_info loop_vinfo)
1398 struct _vect_peel_extended_info res;
1400 res.peel_info.dr = NULL;
1402 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1404 res.inside_cost = INT_MAX;
1405 res.outside_cost = INT_MAX;
1406 peeling_htab->traverse <_vect_peel_extended_info *,
1407 vect_peeling_hash_get_lowest_cost> (&res);
1409 else
1411 res.peel_info.count = 0;
1412 peeling_htab->traverse <_vect_peel_extended_info *,
1413 vect_peeling_hash_get_most_frequent> (&res);
1414 res.inside_cost = 0;
1415 res.outside_cost = 0;
1418 return res;
1421 /* Return true if the new peeling NPEEL is supported. */
1423 static bool
1424 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1425 unsigned npeel)
1427 unsigned i;
1428 struct data_reference *dr = NULL;
1429 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1430 gimple *stmt;
1431 stmt_vec_info stmt_info;
1432 enum dr_alignment_support supportable_dr_alignment;
1434 /* Ensure that all data refs can be vectorized after the peel. */
1435 FOR_EACH_VEC_ELT (datarefs, i, dr)
1437 int save_misalignment;
1439 if (dr == dr0)
1440 continue;
1442 stmt = DR_STMT (dr);
1443 stmt_info = vinfo_for_stmt (stmt);
1444 /* For interleaving, only the alignment of the first access
1445 matters. */
1446 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1447 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1448 continue;
1450 /* Strided accesses perform only component accesses, alignment is
1451 irrelevant for them. */
1452 if (STMT_VINFO_STRIDED_P (stmt_info)
1453 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1454 continue;
1456 save_misalignment = DR_MISALIGNMENT (dr);
1457 vect_update_misalignment_for_peel (dr, dr0, npeel);
1458 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1459 SET_DR_MISALIGNMENT (dr, save_misalignment);
1461 if (!supportable_dr_alignment)
1462 return false;
1465 return true;
1468 /* Function vect_enhance_data_refs_alignment
1470 This pass will use loop versioning and loop peeling in order to enhance
1471 the alignment of data references in the loop.
1473 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1474 original loop is to be vectorized. Any other loops that are created by
1475 the transformations performed in this pass - are not supposed to be
1476 vectorized. This restriction will be relaxed.
1478 This pass will require a cost model to guide it whether to apply peeling
1479 or versioning or a combination of the two. For example, the scheme that
1480 Intel uses when given a loop with several memory accesses is as follows:
1481 choose one memory access ('p') whose alignment you want to force by doing
1482 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1483 other accesses are not necessarily aligned, or (2) use loop versioning to
1484 generate one loop in which all accesses are aligned, and another loop in
1485 which only 'p' is necessarily aligned.
1487 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1488 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1489 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1491 Devising a cost model is the most critical aspect of this work. It will
1492 guide us on which access to peel for, whether to use loop versioning, how
1493 many versions to create, etc. The cost model will probably consist of
1494 generic considerations as well as target specific considerations (on
1495 powerpc for example, misaligned stores are more painful than misaligned
1496 loads).
1498 Here are the general steps involved in alignment enhancements:
1500 -- original loop, before alignment analysis:
1501 for (i=0; i<N; i++){
1502 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1503 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1506 -- After vect_compute_data_refs_alignment:
1507 for (i=0; i<N; i++){
1508 x = q[i]; # DR_MISALIGNMENT(q) = 3
1509 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1512 -- Possibility 1: we do loop versioning:
1513 if (p is aligned) {
1514 for (i=0; i<N; i++){ # loop 1A
1515 x = q[i]; # DR_MISALIGNMENT(q) = 3
1516 p[i] = y; # DR_MISALIGNMENT(p) = 0
1519 else {
1520 for (i=0; i<N; i++){ # loop 1B
1521 x = q[i]; # DR_MISALIGNMENT(q) = 3
1522 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1526 -- Possibility 2: we do loop peeling:
1527 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1528 x = q[i];
1529 p[i] = y;
1531 for (i = 3; i < N; i++){ # loop 2A
1532 x = q[i]; # DR_MISALIGNMENT(q) = 0
1533 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1536 -- Possibility 3: combination of loop peeling and versioning:
1537 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1538 x = q[i];
1539 p[i] = y;
1541 if (p is aligned) {
1542 for (i = 3; i<N; i++){ # loop 3A
1543 x = q[i]; # DR_MISALIGNMENT(q) = 0
1544 p[i] = y; # DR_MISALIGNMENT(p) = 0
1547 else {
1548 for (i = 3; i<N; i++){ # loop 3B
1549 x = q[i]; # DR_MISALIGNMENT(q) = 0
1550 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1554 These loops are later passed to loop_transform to be vectorized. The
1555 vectorizer will use the alignment information to guide the transformation
1556 (whether to generate regular loads/stores, or with special handling for
1557 misalignment). */
1559 bool
1560 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1562 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1563 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1564 enum dr_alignment_support supportable_dr_alignment;
1565 struct data_reference *dr0 = NULL, *first_store = NULL;
1566 struct data_reference *dr;
1567 unsigned int i, j;
1568 bool do_peeling = false;
1569 bool do_versioning = false;
1570 bool stat;
1571 gimple *stmt;
1572 stmt_vec_info stmt_info;
1573 unsigned int npeel = 0;
1574 bool one_misalignment_known = false;
1575 bool one_misalignment_unknown = false;
1576 bool one_dr_unsupportable = false;
1577 struct data_reference *unsupportable_dr = NULL;
1578 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1579 unsigned possible_npeel_number = 1;
1580 tree vectype;
1581 unsigned int nelements, mis, same_align_drs_max = 0;
1582 hash_table<peel_info_hasher> peeling_htab (1);
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "=== vect_enhance_data_refs_alignment ===\n");
1588 /* Reset data so we can safely be called multiple times. */
1589 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1590 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1592 /* While cost model enhancements are expected in the future, the high level
1593 view of the code at this time is as follows:
1595 A) If there is a misaligned access then see if peeling to align
1596 this access can make all data references satisfy
1597 vect_supportable_dr_alignment. If so, update data structures
1598 as needed and return true.
1600 B) If peeling wasn't possible and there is a data reference with an
1601 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1602 then see if loop versioning checks can be used to make all data
1603 references satisfy vect_supportable_dr_alignment. If so, update
1604 data structures as needed and return true.
1606 C) If neither peeling nor versioning were successful then return false if
1607 any data reference does not satisfy vect_supportable_dr_alignment.
1609 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1611 Note, Possibility 3 above (which is peeling and versioning together) is not
1612 being done at this time. */
1614 /* (1) Peeling to force alignment. */
1616 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1617 Considerations:
1618 + How many accesses will become aligned due to the peeling
1619 - How many accesses will become unaligned due to the peeling,
1620 and the cost of misaligned accesses.
1621 - The cost of peeling (the extra runtime checks, the increase
1622 in code size). */
1624 FOR_EACH_VEC_ELT (datarefs, i, dr)
1626 stmt = DR_STMT (dr);
1627 stmt_info = vinfo_for_stmt (stmt);
1629 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1630 continue;
1632 /* For interleaving, only the alignment of the first access
1633 matters. */
1634 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1635 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1636 continue;
1638 /* For invariant accesses there is nothing to enhance. */
1639 if (integer_zerop (DR_STEP (dr)))
1640 continue;
1642 /* Strided accesses perform only component accesses, alignment is
1643 irrelevant for them. */
1644 if (STMT_VINFO_STRIDED_P (stmt_info)
1645 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1646 continue;
1648 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1649 do_peeling = vector_alignment_reachable_p (dr);
1650 if (do_peeling)
1652 if (known_alignment_for_access_p (dr))
1654 unsigned int npeel_tmp = 0;
1655 bool negative = tree_int_cst_compare (DR_STEP (dr),
1656 size_zero_node) < 0;
1658 vectype = STMT_VINFO_VECTYPE (stmt_info);
1659 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1660 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1661 TREE_TYPE (DR_REF (dr))));
1662 if (DR_MISALIGNMENT (dr) != 0)
1663 npeel_tmp = (negative ? (mis - nelements)
1664 : (nelements - mis)) & (nelements - 1);
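/* For example, with 4-byte elements, 4 elements per vector and a
   misalignment of 8 bytes, MIS is 2 elements and a forward access needs
   npeel_tmp = (4 - 2) & 3 = 2 peeled scalar iterations to become
   aligned.  */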
1666 /* For multiple types, it is possible that the bigger type access
1667 will have more than one peeling option. E.g., a loop with two
1668 types: one of size (vector size / 4), and the other one of
669 size (vector size / 8). Vectorization factor will be 8. If both
1670 accesses are misaligned by 3, the first one needs one scalar
1671 iteration to be aligned, and the second one needs 5. But the
1672 first one will be aligned also by peeling 5 scalar
1673 iterations, and in that case both accesses will be aligned.
1674 Hence, except for the immediate peeling amount, we also want
1675 to try to add full vector size, while we don't exceed
1676 vectorization factor.
1677 We do this automatically for cost model, since we calculate
1678 cost for every peeling option. */
1679 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1681 if (STMT_SLP_TYPE (stmt_info))
1682 possible_npeel_number
1683 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1684 else
1685 possible_npeel_number = vf / nelements;
1687 /* NPEEL_TMP is 0 when there is no misalignment, but also
1688 allow peeling NELEMENTS. */
1689 if (DR_MISALIGNMENT (dr) == 0)
1690 possible_npeel_number++;
1693 /* Save info about DR in the hash table. Also include peeling
1694 amounts according to the explanation above. */
1695 for (j = 0; j < possible_npeel_number; j++)
1697 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1698 dr, npeel_tmp);
1699 npeel_tmp += nelements;
1702 one_misalignment_known = true;
1704 else
1706 /* If we don't know any misalignment values, we prefer
1707 peeling for the data-ref that has the maximum number of data-refs
1708 with the same alignment, unless the target prefers to align
1709 stores over loads. */
1710 unsigned same_align_drs
1711 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1712 if (!dr0
1713 || same_align_drs_max < same_align_drs)
1715 same_align_drs_max = same_align_drs;
1716 dr0 = dr;
1718 /* For data-refs with the same number of related
1719 accesses prefer the one where the misalign
1720 computation will be invariant in the outermost loop. */
1721 else if (same_align_drs_max == same_align_drs)
1723 struct loop *ivloop0, *ivloop;
1724 ivloop0 = outermost_invariant_loop_for_expr
1725 (loop, DR_BASE_ADDRESS (dr0));
1726 ivloop = outermost_invariant_loop_for_expr
1727 (loop, DR_BASE_ADDRESS (dr));
1728 if ((ivloop && !ivloop0)
1729 || (ivloop && ivloop0
1730 && flow_loop_nested_p (ivloop, ivloop0)))
1731 dr0 = dr;
1734 one_misalignment_unknown = true;
1736 /* Check for data refs with unsupportable alignment that
1737 can be peeled. */
1738 if (!supportable_dr_alignment)
1740 one_dr_unsupportable = true;
1741 unsupportable_dr = dr;
1744 if (!first_store && DR_IS_WRITE (dr))
1745 first_store = dr;
1748 else
1750 if (!aligned_access_p (dr))
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754 "vector alignment may not be reachable\n");
1755 break;
1760 /* Check if we can possibly peel the loop. */
1761 if (!vect_can_advance_ivs_p (loop_vinfo)
1762 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1763 || loop->inner)
1764 do_peeling = false;
1766 struct _vect_peel_extended_info peel_for_known_alignment;
1767 struct _vect_peel_extended_info peel_for_unknown_alignment;
1768 struct _vect_peel_extended_info best_peel;
1770 peel_for_unknown_alignment.inside_cost = INT_MAX;
1771 peel_for_unknown_alignment.outside_cost = INT_MAX;
1772 peel_for_unknown_alignment.peel_info.count = 0;
1774 if (do_peeling
1775 && one_misalignment_unknown)
1777 /* Check whether the target prefers aligning stores over loads, i.e., if
1778 misaligned stores are more expensive than misaligned loads (taking
1779 drs with the same alignment into account). */
1780 unsigned int load_inside_cost = 0;
1781 unsigned int load_outside_cost = 0;
1782 unsigned int store_inside_cost = 0;
1783 unsigned int store_outside_cost = 0;
1785 stmt_vector_for_cost dummy;
1786 dummy.create (2);
1787 vect_get_peeling_costs_all_drs (datarefs, dr0,
1788 &load_inside_cost,
1789 &load_outside_cost,
1790 &dummy, vf / 2, true);
1791 dummy.release ();
1793 if (first_store)
1795 dummy.create (2);
1796 vect_get_peeling_costs_all_drs (datarefs, first_store,
1797 &store_inside_cost,
1798 &store_outside_cost,
1799 &dummy, vf / 2, true);
1800 dummy.release ();
1802 else
1804 store_inside_cost = INT_MAX;
1805 store_outside_cost = INT_MAX;
1808 if (load_inside_cost > store_inside_cost
1809 || (load_inside_cost == store_inside_cost
1810 && load_outside_cost > store_outside_cost))
1812 dr0 = first_store;
1813 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1814 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1816 else
1818 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1819 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1822 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1823 prologue_cost_vec.create (2);
1824 epilogue_cost_vec.create (2);
1826 int dummy2;
1827 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1828 (loop_vinfo, vf / 2, &dummy2,
1829 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1830 &prologue_cost_vec, &epilogue_cost_vec);
1832 prologue_cost_vec.release ();
1833 epilogue_cost_vec.release ();
1835 peel_for_unknown_alignment.peel_info.count = 1
1836 + STMT_VINFO_SAME_ALIGN_REFS
1837 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1840 peel_for_unknown_alignment.peel_info.npeel = 0;
1841 peel_for_unknown_alignment.peel_info.dr = dr0;
1843 best_peel = peel_for_unknown_alignment;
1845 peel_for_known_alignment.inside_cost = INT_MAX;
1846 peel_for_known_alignment.outside_cost = INT_MAX;
1847 peel_for_known_alignment.peel_info.count = 0;
1848 peel_for_known_alignment.peel_info.dr = NULL;
1850 if (do_peeling && one_misalignment_known)
1852 /* Peeling is possible, but every data access is supported even when it
1853 is misaligned. So we try to choose the best possible peeling from
1854 the hash table. */
1855 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1856 (&peeling_htab, loop_vinfo);
1859 /* Compare costs of peeling for known and unknown alignment. */
1860 if (peel_for_known_alignment.peel_info.dr != NULL
1861 && peel_for_unknown_alignment.inside_cost
1862 >= peel_for_known_alignment.inside_cost)
1864 best_peel = peel_for_known_alignment;
1866 /* If the best peeling for known alignment has NPEEL == 0, perform no
1867 peeling at all unless there is an unsupportable dr that we can
1868 align. */
1869 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1870 do_peeling = false;
1873 /* If there is an unsupportable data ref, prefer this over all choices so far
1874 since we'd have to discard a chosen peeling except when it accidentally
1875 aligned the unsupportable data ref. */
1876 if (one_dr_unsupportable)
1877 dr0 = unsupportable_dr;
1878 else if (do_peeling)
1880 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1881 TODO: Use nopeel_outside_cost or get rid of it? */
1882 unsigned nopeel_inside_cost = 0;
1883 unsigned nopeel_outside_cost = 0;
1885 stmt_vector_for_cost dummy;
1886 dummy.create (2);
1887 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1888 &nopeel_outside_cost, &dummy, 0, false);
1889 dummy.release ();
1891 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1892 costs will be recorded. */
1893 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1894 prologue_cost_vec.create (2);
1895 epilogue_cost_vec.create (2);
1897 int dummy2;
1898 nopeel_outside_cost += vect_get_known_peeling_cost
1899 (loop_vinfo, 0, &dummy2,
1900 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1901 &prologue_cost_vec, &epilogue_cost_vec);
1903 prologue_cost_vec.release ();
1904 epilogue_cost_vec.release ();
1906 npeel = best_peel.peel_info.npeel;
1907 dr0 = best_peel.peel_info.dr;
1909 /* If not peeling is no more expensive than the best peeling we
1910 have found so far, don't perform any peeling. */
1911 if (nopeel_inside_cost <= best_peel.inside_cost)
1912 do_peeling = false;
1915 if (do_peeling)
1917 stmt = DR_STMT (dr0);
1918 stmt_info = vinfo_for_stmt (stmt);
1919 vectype = STMT_VINFO_VECTYPE (stmt_info);
1920 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1922 if (known_alignment_for_access_p (dr0))
1924 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1925 size_zero_node) < 0;
1926 if (!npeel)
1928 /* Since it's known at compile time, compute the number of
1929 iterations in the peeled loop (the peeling factor) for use in
1930 updating DR_MISALIGNMENT values. The peeling factor is the
1931 vectorization factor minus the misalignment as an element
1932 count. */
1933 mis = DR_MISALIGNMENT (dr0);
1934 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1935 npeel = ((negative ? mis - nelements : nelements - mis)
1936 & (nelements - 1));
1939 /* For interleaved data access every iteration accesses all the
1940 members of the group; therefore we divide the number of iterations
1941 by the group size. */
1942 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1943 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1944 npeel /= GROUP_SIZE (stmt_info);
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE, vect_location,
1948 "Try peeling by %d\n", npeel);
1951 /* Ensure that all datarefs can be vectorized after the peel. */
1952 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1953 do_peeling = false;
1955 /* Check if all datarefs are supportable and log. */
1956 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1958 stat = vect_verify_datarefs_alignment (loop_vinfo);
1959 if (!stat)
1960 do_peeling = false;
1961 else
1962 return stat;
1965 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1966 if (do_peeling)
1968 unsigned max_allowed_peel
1969 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1970 if (max_allowed_peel != (unsigned)-1)
1972 unsigned max_peel = npeel;
1973 if (max_peel == 0)
1975 gimple *dr_stmt = DR_STMT (dr0);
1976 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1977 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1978 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1980 if (max_peel > max_allowed_peel)
1982 do_peeling = false;
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "Disable peeling, max peels reached: %d\n", max_peel);
1990 /* Cost model #2 - if peeling may result in a remaining loop not
1991 iterating enough to be vectorized then do not peel. */
1992 if (do_peeling
1993 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1995 unsigned max_peel
1996 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1997 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1998 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1999 do_peeling = false;
2002 if (do_peeling)
2004 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2005 If the misalignment of DR_i is identical to that of dr0 then set
2006 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2007 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2008 by the peeling factor times the element size of DR_i (MOD the
2009 vectorization factor times the size). Otherwise, the
2010 misalignment of DR_i must be set to unknown. */
2011 FOR_EACH_VEC_ELT (datarefs, i, dr)
2012 if (dr != dr0)
2014 /* Strided accesses perform only component accesses, alignment
2015 is irrelevant for them. */
2016 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2017 if (STMT_VINFO_STRIDED_P (stmt_info)
2018 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2019 continue;
2021 vect_update_misalignment_for_peel (dr, dr0, npeel);
2024 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2025 if (npeel)
2026 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2027 else
2028 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2029 = DR_MISALIGNMENT (dr0);
2030 SET_DR_MISALIGNMENT (dr0, 0);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "Alignment of access forced using peeling.\n");
2035 dump_printf_loc (MSG_NOTE, vect_location,
2036 "Peeling for alignment will be applied.\n");
2039 /* The inside-loop cost will be accounted for in vectorizable_load
2040 and vectorizable_store correctly with adjusted alignments.
2041 Drop the body_cst_vec on the floor here. */
2042 stat = vect_verify_datarefs_alignment (loop_vinfo);
2043 gcc_assert (stat);
2044 return stat;
2048 /* (2) Versioning to force alignment. */
2050 /* Try versioning if:
2051 1) the loop is optimized for speed,
2052 2) there is at least one unsupported misaligned data ref with an unknown
2053 misalignment, and
2054 3) all misaligned data refs with a known misalignment are supported, and
2055 4) the number of runtime alignment checks is within reason. */
2057 do_versioning =
2058 optimize_loop_nest_for_speed_p (loop)
2059 && (!loop->inner); /* FORNOW */
2061 if (do_versioning)
2063 FOR_EACH_VEC_ELT (datarefs, i, dr)
2065 stmt = DR_STMT (dr);
2066 stmt_info = vinfo_for_stmt (stmt);
2068 /* For interleaving, only the alignment of the first access
2069 matters. */
2070 if (aligned_access_p (dr)
2071 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2072 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2073 continue;
2075 if (STMT_VINFO_STRIDED_P (stmt_info))
2077 /* Strided loads perform only component accesses, alignment is
2078 irrelevant for them. */
2079 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2080 continue;
2081 do_versioning = false;
2082 break;
2085 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2087 if (!supportable_dr_alignment)
2089 gimple *stmt;
2090 int mask;
2091 tree vectype;
2093 if (known_alignment_for_access_p (dr)
2094 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2095 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2097 do_versioning = false;
2098 break;
2101 stmt = DR_STMT (dr);
2102 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2103 gcc_assert (vectype);
2105 /* The rightmost bits of an aligned address must be zeros.
2106 Construct the mask needed for this test. For example,
2107 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2108 mask must be 15 = 0xf. */
2109 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2111 /* FORNOW: use the same mask to test all potentially unaligned
2112 references in the loop. The vectorizer currently supports
2113 a single vector size, see the reference to
2114 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2115 vectorization factor is computed. */
2116 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2117 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2118 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2119 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2120 DR_STMT (dr));
2124 /* Versioning requires at least one misaligned data reference. */
2125 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2126 do_versioning = false;
2127 else if (!do_versioning)
2128 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2131 if (do_versioning)
2133 vec<gimple *> may_misalign_stmts
2134 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2135 gimple *stmt;
2137 /* It can now be assumed that the data references in the statements
2138 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2139 of the loop being vectorized. */
2140 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2142 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2143 dr = STMT_VINFO_DATA_REF (stmt_info);
2144 SET_DR_MISALIGNMENT (dr, 0);
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "Alignment of access forced using versioning.\n");
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "Versioning for alignment will be applied.\n");
2154 /* Peeling and versioning can't be done together at this time. */
2155 gcc_assert (! (do_peeling && do_versioning));
2157 stat = vect_verify_datarefs_alignment (loop_vinfo);
2158 gcc_assert (stat);
2159 return stat;
2162 /* This point is reached if neither peeling nor versioning is being done. */
2163 gcc_assert (! (do_peeling || do_versioning));
2165 stat = vect_verify_datarefs_alignment (loop_vinfo);
2166 return stat;
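/* Editorial sketch, not part of GCC: a self-contained model of the
   peel-count arithmetic used by the peeling code above.  The function
   name and its parameters are hypothetical; NELEMENTS is assumed to be
   a power of two, as it is for the vector types handled here.  */

static int
example_npeel_for_alignment (int mis_bytes, int elem_size,
                             int nelements, int negative_step)
{
  /* Misalignment counted in elements rather than bytes.  */
  int mis = mis_bytes / elem_size;
  if (mis == 0)
    return 0;
  /* Peeling NELEMENTS - MIS scalar iterations aligns a forward access;
     with a negative step the access walks backwards, so the adjustment
     flips sign.  Masking with NELEMENTS - 1 keeps the result in
     [0, NELEMENTS).  E.g. elem_size 4, mis_bytes 12, nelements 8 gives
     a peel count of 5, matching the comment above about a type with
     vector-size/8 elements misaligned by 3.  */
  return (negative_step ? mis - nelements : nelements - mis)
         & (nelements - 1);
}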
2170 /* Function vect_find_same_alignment_drs.
2172 Update group and alignment relations according to the chosen
2173 vectorization factor. */
2175 static void
2176 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2178 struct data_reference *dra = DDR_A (ddr);
2179 struct data_reference *drb = DDR_B (ddr);
2180 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2181 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2183 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2184 return;
2186 if (dra == drb)
2187 return;
2189 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2190 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2191 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2192 return;
2194 /* Two references with distance zero have the same alignment. */
2195 offset_int diff = (wi::to_offset (DR_INIT (dra))
2196 - wi::to_offset (DR_INIT (drb)));
2197 if (diff != 0)
2199 /* Get the wider of the two alignments. */
2200 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2201 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2202 unsigned int max_align = MAX (align_a, align_b);
2204 /* Require the gap to be a multiple of the larger vector alignment. */
2205 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2206 return;
2209 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2210 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2211 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "accesses have the same alignment: ");
2215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2216 dump_printf (MSG_NOTE, " and ");
2217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2218 dump_printf (MSG_NOTE, "\n");
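/* Editorial sketch, not part of GCC: the gist of the same-alignment test
   above, on plain integers.  INIT_A/INIT_B stand in for DR_INIT of two
   data-refs that already share base, offset and step; ALIGN_A/ALIGN_B
   stand in for the vector alignments in bytes.  All names are
   hypothetical.  */

static int
example_same_vector_alignment_p (long long init_a, long long init_b,
                                 long long align_a, long long align_b)
{
  long long diff = init_a - init_b;
  if (diff == 0)
    return 1;                   /* distance zero: trivially same alignment */
  long long max_align = align_a > align_b ? align_a : align_b;
  /* Require the gap to be a multiple of the larger vector alignment.  */
  return diff % max_align == 0;
}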
2223 /* Function vect_analyze_data_refs_alignment
2225 Analyze the alignment of the data-references in the loop.
2226 Return FALSE if a data reference is found that cannot be vectorized. */
2228 bool
2229 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_NOTE, vect_location,
2233 "=== vect_analyze_data_refs_alignment ===\n");
2235 /* Mark groups of data references with the same alignment using
2236 data dependence information. */
2237 vec<ddr_p> ddrs = vinfo->ddrs;
2238 struct data_dependence_relation *ddr;
2239 unsigned int i;
2241 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2242 vect_find_same_alignment_drs (ddr);
2244 vec<data_reference_p> datarefs = vinfo->datarefs;
2245 struct data_reference *dr;
2247 vect_record_base_alignments (vinfo);
2248 FOR_EACH_VEC_ELT (datarefs, i, dr)
2250 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2251 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2252 && !vect_compute_data_ref_alignment (dr))
2254 /* Strided accesses perform only component accesses, misalignment
2255 information is irrelevant for them. */
2256 if (STMT_VINFO_STRIDED_P (stmt_info)
2257 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2258 continue;
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "not vectorized: can't calculate alignment "
2263 "for data ref.\n");
2265 return false;
2269 return true;
2273 /* Analyze alignment of DRs of stmts in NODE. */
2275 static bool
2276 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2278 /* We vectorize from the first scalar stmt in the node unless
2279 the node is permuted, in which case we start from the first
2280 element in the group. */
2281 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2282 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2283 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2284 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2286 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2287 if (! vect_compute_data_ref_alignment (dr)
2288 /* For creating the data-ref pointer we need alignment of the
2289 first element anyway. */
2290 || (dr != first_dr
2291 && ! vect_compute_data_ref_alignment (first_dr))
2292 || ! verify_data_ref_alignment (dr))
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "not vectorized: bad data alignment in basic "
2297 "block.\n");
2298 return false;
2301 return true;
2304 /* Function vect_slp_analyze_instance_alignment
2306 Analyze the alignment of the data-references in the SLP instance.
2307 Return FALSE if a data reference is found that cannot be vectorized. */
2309 bool
2310 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2316 slp_tree node;
2317 unsigned i;
2318 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2319 if (! vect_slp_analyze_and_verify_node_alignment (node))
2320 return false;
2322 node = SLP_INSTANCE_TREE (instance);
2323 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2324 && ! vect_slp_analyze_and_verify_node_alignment
2325 (SLP_INSTANCE_TREE (instance)))
2326 return false;
2328 return true;
2332 /* Analyze groups of accesses: check that DR belongs to a group of
2333 accesses of legal size, step, etc. Detect gaps, single element
2334 interleaving, and other special cases. Set grouped access info.
2335 Collect groups of strided stores for further use in SLP analysis.
2336 Worker for vect_analyze_group_access. */
2338 static bool
2339 vect_analyze_group_access_1 (struct data_reference *dr)
2341 tree step = DR_STEP (dr);
2342 tree scalar_type = TREE_TYPE (DR_REF (dr));
2343 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2344 gimple *stmt = DR_STMT (dr);
2345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2347 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2348 HOST_WIDE_INT dr_step = -1;
2349 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2350 bool slp_impossible = false;
2352 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2353 size of the interleaving group (including gaps). */
2354 if (tree_fits_shwi_p (step))
2356 dr_step = tree_to_shwi (step);
2357 /* Check that STEP is a multiple of type size. Otherwise there is
2358 a non-element-sized gap at the end of the group which we
2359 cannot represent in GROUP_GAP or GROUP_SIZE.
2360 ??? As we can handle non-constant step fine here we should
2361 simply remove uses of GROUP_GAP between the last and first
2362 element and instead rely on DR_STEP. GROUP_SIZE then would
2363 simply not include that gap. */
2364 if ((dr_step % type_size) != 0)
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Step ");
2370 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2371 dump_printf (MSG_NOTE,
2372 " is not a multiple of the element size for ");
2373 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2374 dump_printf (MSG_NOTE, "\n");
2376 return false;
2378 groupsize = absu_hwi (dr_step) / type_size;
2380 else
2381 groupsize = 0;
2383 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2384 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2386 /* Check if this DR is part of an interleaving group, and is a single
2387 element of the group that is accessed in the loop. */
2389 /* Gaps are supported only for loads. STEP must be a multiple of the type
2390 size. The size of the group must be a power of 2. */
2391 if (DR_IS_READ (dr)
2392 && (dr_step % type_size) == 0
2393 && groupsize > 0
2394 && pow2p_hwi (groupsize))
2396 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2397 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2398 GROUP_GAP (stmt_info) = groupsize - 1;
2399 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "Detected single element interleaving ");
2403 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2404 dump_printf (MSG_NOTE, " step ");
2405 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2406 dump_printf (MSG_NOTE, "\n");
2409 return true;
2412 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "not consecutive access ");
2416 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2419 if (bb_vinfo)
2421 /* Mark the statement as unvectorizable. */
2422 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2423 return true;
2426 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2427 STMT_VINFO_STRIDED_P (stmt_info) = true;
2428 return true;
2431 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2433 /* First stmt in the interleaving chain. Check the chain. */
2434 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2435 struct data_reference *data_ref = dr;
2436 unsigned int count = 1;
2437 tree prev_init = DR_INIT (data_ref);
2438 gimple *prev = stmt;
2439 HOST_WIDE_INT diff, gaps = 0;
2441 while (next)
2443 /* Skip identical data-refs. In case two or more stmts share a
2444 data-ref (supported only for loads), we vectorize only the first
2445 stmt, and the rest get their vectorized loads from the first
2446 one. */
2447 if (!tree_int_cst_compare (DR_INIT (data_ref),
2448 DR_INIT (STMT_VINFO_DATA_REF (
2449 vinfo_for_stmt (next)))))
2451 if (DR_IS_WRITE (data_ref))
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455 "Two store stmts share the same dr.\n");
2456 return false;
2459 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2461 "Two or more load stmts share the same dr.\n");
2463 /* For loads, use the same data-ref load. */
2464 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2466 prev = next;
2467 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2468 continue;
2471 prev = next;
2472 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2474 /* All group members have the same STEP by construction. */
2475 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2477 /* Check that the distance between two accesses is equal to the type
2478 size. Otherwise, we have gaps. */
2479 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2480 - TREE_INT_CST_LOW (prev_init)) / type_size;
2481 if (diff != 1)
2483 /* FORNOW: SLP of accesses with gaps is not supported. */
2484 slp_impossible = true;
2485 if (DR_IS_WRITE (data_ref))
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2489 "interleaved store with gaps\n");
2490 return false;
2493 gaps += diff - 1;
2496 last_accessed_element += diff;
2498 /* Store the gap from the previous member of the group. If there is no
2499 gap in the access, GROUP_GAP is always 1. */
2500 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2502 prev_init = DR_INIT (data_ref);
2503 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2504 /* Count the number of data-refs in the chain. */
2505 count++;
2508 if (groupsize == 0)
2509 groupsize = count + gaps;
2511 /* This could be UINT_MAX, but as we are generating code in a very
2512 inefficient way we have to cap earlier. See PR78699 for example. */
2513 if (groupsize > 4096)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "group is too large\n");
2518 return false;
2521 /* Check that the size of the interleaving is equal to count for stores,
2522 i.e., that there are no gaps. */
2523 if (groupsize != count
2524 && !DR_IS_READ (dr))
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "interleaved store with gaps\n");
2529 return false;
2532 /* If there is a gap after the last load in the group it is the
2533 difference between the groupsize and the last accessed
2534 element.
2535 When there is no gap, this difference should be 0. */
2536 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2538 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2539 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_NOTE, vect_location,
2542 "Detected interleaving ");
2543 if (DR_IS_READ (dr))
2544 dump_printf (MSG_NOTE, "load ");
2545 else
2546 dump_printf (MSG_NOTE, "store ");
2547 dump_printf (MSG_NOTE, "of size %u starting with ",
2548 (unsigned)groupsize);
2549 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2550 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "There is a gap of %u elements after the group\n",
2553 GROUP_GAP (vinfo_for_stmt (stmt)));
2556 /* SLP: create an SLP data structure for every interleaving group of
2557 stores for further analysis in vect_analyze_slp. */
2558 if (DR_IS_WRITE (dr) && !slp_impossible)
2560 if (loop_vinfo)
2561 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2562 if (bb_vinfo)
2563 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2567 return true;
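/* Editorial sketch, not part of GCC: how the per-member gaps and the
   trailing gap fall out of the walk above, modelled on plain integers.
   INITS[] holds the byte offsets of the group members in ascending
   order, STEP and TYPE_SIZE are the (assumed constant) DR_STEP and
   element size.  All names are hypothetical; duplicate members are
   assumed to have been skipped already, and MEMBER_GAP[0] is left to
   the caller (in GCC the first member's GROUP_GAP holds the trailing
   gap).  */

static int
example_group_gaps (const long long *inits, unsigned int count,
                    long long step, long long type_size,
                    long long *member_gap, long long *trailing_gap)
{
  if (step % type_size != 0)
    return 0;                   /* non-element-sized gap at the end */
  long long groupsize = step / type_size;
  long long last_accessed_element = 1;
  for (unsigned int k = 1; k < count; k++)
    {
      /* Distance to the previous member, in elements; 1 means no gap.  */
      long long diff = (inits[k] - inits[k - 1]) / type_size;
      member_gap[k] = diff;
      last_accessed_element += diff;
    }
  /* Whatever of the group is left after the last member is the gap
     following the group.  */
  *trailing_gap = groupsize - last_accessed_element;
  return 1;
}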
2570 /* Analyze groups of accesses: check that DR belongs to a group of
2571 accesses of legal size, step, etc. Detect gaps, single element
2572 interleaving, and other special cases. Set grouped access info.
2573 Collect groups of strided stores for further use in SLP analysis. */
2575 static bool
2576 vect_analyze_group_access (struct data_reference *dr)
2578 if (!vect_analyze_group_access_1 (dr))
2580 /* Dissolve the group if present. */
2581 gimple *next;
2582 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2583 while (stmt)
2585 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2586 next = GROUP_NEXT_ELEMENT (vinfo);
2587 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2588 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2589 stmt = next;
2591 return false;
2593 return true;
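/* Editorial sketch, not part of GCC: dissolving a group simply walks the
   chain from its first element and clears both links, as above.  The
   struct and its fields are hypothetical stand-ins for
   GROUP_FIRST_ELEMENT and GROUP_NEXT_ELEMENT.  */

struct example_group_member
{
  struct example_group_member *first;
  struct example_group_member *next;
};

static void
example_dissolve_group (struct example_group_member *stmt)
{
  while (stmt)
    {
      struct example_group_member *next = stmt->next;
      stmt->first = 0;
      stmt->next = 0;
      stmt = next;
    }
}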
2596 /* Analyze the access pattern of the data-reference DR.
2597 In case of non-consecutive accesses call vect_analyze_group_access() to
2598 analyze groups of accesses. */
2600 static bool
2601 vect_analyze_data_ref_access (struct data_reference *dr)
2603 tree step = DR_STEP (dr);
2604 tree scalar_type = TREE_TYPE (DR_REF (dr));
2605 gimple *stmt = DR_STMT (dr);
2606 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2607 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2608 struct loop *loop = NULL;
2610 if (loop_vinfo)
2611 loop = LOOP_VINFO_LOOP (loop_vinfo);
2613 if (loop_vinfo && !step)
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "bad data-ref access in loop\n");
2618 return false;
2621 /* Allow loads with zero step in inner-loop vectorization. */
2622 if (loop_vinfo && integer_zerop (step))
2624 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2625 if (!nested_in_vect_loop_p (loop, stmt))
2626 return DR_IS_READ (dr);
2627 /* Allow references with zero step for outer loops marked
2628 with pragma omp simd only - it guarantees absence of
2629 loop-carried dependencies between inner loop iterations. */
2630 if (!loop->force_vectorize)
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "zero step in inner loop of nest\n");
2635 return false;
2639 if (loop && nested_in_vect_loop_p (loop, stmt))
2641 /* Interleaved accesses are not yet supported within outer-loop
2642 vectorization for references in the inner-loop. */
2643 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2645 /* For the rest of the analysis we use the outer-loop step. */
2646 step = STMT_VINFO_DR_STEP (stmt_info);
2647 if (integer_zerop (step))
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "zero step in outer loop.\n");
2652 return DR_IS_READ (dr);
2656 /* Consecutive? */
2657 if (TREE_CODE (step) == INTEGER_CST)
2659 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2660 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2661 || (dr_step < 0
2662 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2664 /* Mark that it is not interleaving. */
2665 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2666 return true;
2670 if (loop && nested_in_vect_loop_p (loop, stmt))
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "grouped access in outer loop.\n");
2675 return false;
2679 /* Assume this is a DR handled by the non-constant strided load case. */
2680 if (TREE_CODE (step) != INTEGER_CST)
2681 return (STMT_VINFO_STRIDED_P (stmt_info)
2682 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2683 || vect_analyze_group_access (dr)));
2685 /* Not a consecutive access - check if it's part of an interleaving group. */
2686 return vect_analyze_group_access (dr);
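/* Editorial sketch, not part of GCC: the consecutive-access test above on
   plain integers.  STEP is a constant DR_STEP in bytes and TYPE_SIZE the
   scalar element size; both names are hypothetical.  */

static int
example_consecutive_access_p (long long step, long long type_size)
{
  /* Forward consecutive (step equals the element size) or backward
     consecutive (negative step of the same magnitude).  */
  return step == type_size || (step < 0 && -step == type_size);
}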
2689 /* Compare two data-references DRA and DRB so that they can be sorted
2690 into chunks suitable for grouping. */
2692 static int
2693 dr_group_sort_cmp (const void *dra_, const void *drb_)
2695 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2696 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2697 int cmp;
2699 /* Stabilize sort. */
2700 if (dra == drb)
2701 return 0;
2703 /* DRs in different loops never belong to the same group. */
2704 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2705 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2706 if (loopa != loopb)
2707 return loopa->num < loopb->num ? -1 : 1;
2709 /* Ordering of DRs according to base. */
2710 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2712 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2713 DR_BASE_ADDRESS (drb));
2714 if (cmp != 0)
2715 return cmp;
2718 /* And according to DR_OFFSET. */
2719 if (!dr_equal_offsets_p (dra, drb))
2721 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2722 if (cmp != 0)
2723 return cmp;
2726 /* Put reads before writes. */
2727 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2728 return DR_IS_READ (dra) ? -1 : 1;
2730 /* Then sort by access size. */
2731 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2732 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2734 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2735 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2736 if (cmp != 0)
2737 return cmp;
2740 /* And by step. */
2741 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2743 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2744 if (cmp != 0)
2745 return cmp;
2748 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2749 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2750 if (cmp == 0)
2751 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2752 return cmp;
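/* Editorial sketch, not part of GCC: a qsort comparator over a simplified
   record, mirroring the ordering above (base, offset, reads before
   writes, access size, step, init).  The struct, its field names and the
   use of strcmp for the base are hypothetical simplifications; the loop
   number and stmt UID tie-breaks are omitted.  */

#include <string.h>

struct example_dr
{
  const char *base;             /* stands in for DR_BASE_ADDRESS */
  long long offset;             /* stands in for DR_OFFSET */
  int is_read;
  long long size;               /* access size in bytes */
  long long step;
  long long init;
};

static int
example_dr_group_sort_cmp (const void *a_, const void *b_)
{
  const struct example_dr *a = (const struct example_dr *) a_;
  const struct example_dr *b = (const struct example_dr *) b_;
  int cmp = strcmp (a->base, b->base);
  if (cmp != 0)
    return cmp;
  if (a->offset != b->offset)
    return a->offset < b->offset ? -1 : 1;
  if (a->is_read != b->is_read)
    return a->is_read ? -1 : 1;
  if (a->size != b->size)
    return a->size < b->size ? -1 : 1;
  if (a->step != b->step)
    return a->step < b->step ? -1 : 1;
  if (a->init != b->init)
    return a->init < b->init ? -1 : 1;
  return 0;
}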
2755 /* Function vect_analyze_data_ref_accesses.
2757 Analyze the access pattern of all the data references in the loop.
2759 FORNOW: the only access pattern that is considered vectorizable is a
2760 simple step 1 (consecutive) access.
2762 FORNOW: handle only arrays and pointer accesses. */
2764 bool
2765 vect_analyze_data_ref_accesses (vec_info *vinfo)
2767 unsigned int i;
2768 vec<data_reference_p> datarefs = vinfo->datarefs;
2769 struct data_reference *dr;
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_NOTE, vect_location,
2773 "=== vect_analyze_data_ref_accesses ===\n");
2775 if (datarefs.is_empty ())
2776 return true;
2778 /* Sort the array of datarefs to make building the interleaving chains
2779 linear. Don't modify the original vector's order; it is needed for
2780 determining which dependencies are reversed. */
2781 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2782 datarefs_copy.qsort (dr_group_sort_cmp);
2784 /* Build the interleaving chains. */
2785 for (i = 0; i < datarefs_copy.length () - 1;)
2787 data_reference_p dra = datarefs_copy[i];
2788 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2789 stmt_vec_info lastinfo = NULL;
2790 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2792 ++i;
2793 continue;
2795 for (i = i + 1; i < datarefs_copy.length (); ++i)
2797 data_reference_p drb = datarefs_copy[i];
2798 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2799 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2800 break;
2802 /* ??? Imperfect sorting (non-compatible types, non-modulo
2803 accesses, same accesses) can lead to a group being artificially
2804 split here, as we don't just skip over those. If it really
2805 matters we can push those to a worklist and re-iterate
2806 over them. Then we can just skip ahead to the next DR here. */
2808 /* DRs in a different loop should not be put into the same
2809 interleaving group. */
2810 if (gimple_bb (DR_STMT (dra))->loop_father
2811 != gimple_bb (DR_STMT (drb))->loop_father)
2812 break;
2814 /* Check that the data-refs have the same first location (except init)
2815 and that they are both either stores or loads (not one load and one
2816 store, and not masked loads or stores). */
2817 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2818 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2819 DR_BASE_ADDRESS (drb), 0)
2820 || !dr_equal_offsets_p (dra, drb)
2821 || !gimple_assign_single_p (DR_STMT (dra))
2822 || !gimple_assign_single_p (DR_STMT (drb)))
2823 break;
2825 /* Check that the data-refs have the same constant size. */
2826 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2827 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2828 if (!tree_fits_uhwi_p (sza)
2829 || !tree_fits_uhwi_p (szb)
2830 || !tree_int_cst_equal (sza, szb))
2831 break;
2833 /* Check that the data-refs have the same step. */
2834 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2835 break;
2837 /* Do not place the same access in the interleaving chain twice. */
2838 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2839 break;
2841 /* Check the types are compatible.
2842 ??? We don't distinguish this during sorting. */
2843 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2844 TREE_TYPE (DR_REF (drb))))
2845 break;
2847 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2848 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2849 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2850 gcc_assert (init_a <= init_b);
2852 /* If init_b == init_a + the size of the type * k, we have an
2853 interleaving, and DRA is accessed before DRB. */
2854 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2855 if (type_size_a == 0
2856 || (init_b - init_a) % type_size_a != 0)
2857 break;
2859 /* If we have a store, the accesses must be adjacent. This splits
2860 groups into chunks we support (we don't support vectorization
2861 of stores with gaps). */
2862 if (!DR_IS_READ (dra)
2863 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2864 (DR_INIT (datarefs_copy[i-1]))
2865 != type_size_a))
2866 break;
2868 /* Requiring that the step (when it is a nonzero constant) be greater
2869 than the difference between the data-refs' inits splits groups into
2870 suitable sizes. */
2871 if (tree_fits_shwi_p (DR_STEP (dra)))
2873 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2874 if (step != 0 && step <= (init_b - init_a))
2875 break;
2878 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE, vect_location,
2881 "Detected interleaving ");
2882 if (DR_IS_READ (dra))
2883 dump_printf (MSG_NOTE, "load ");
2884 else
2885 dump_printf (MSG_NOTE, "store ");
2886 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2887 dump_printf (MSG_NOTE, " and ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2889 dump_printf (MSG_NOTE, "\n");
2892 /* Link the found element into the group list. */
2893 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2895 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2896 lastinfo = stmtinfo_a;
2898 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2899 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2900 lastinfo = stmtinfo_b;
2904 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2905 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2906 && !vect_analyze_data_ref_access (dr))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "not vectorized: complicated access pattern.\n");
2912 if (is_a <bb_vec_info> (vinfo))
2914 /* Mark the statement as not vectorizable. */
2915 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2916 continue;
2918 else
2920 datarefs_copy.release ();
2921 return false;
2925 datarefs_copy.release ();
2926 return true;
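/* Editorial sketch, not part of GCC: the per-pair checks that extend an
   interleaving chain in the loop above, reduced to the init/step
   arithmetic.  INIT_A <= INIT_B is guaranteed by the sort; equal base,
   offset, step and compatible types are assumed to have been checked
   already, and the store-adjacency rule is omitted.  All names are
   hypothetical.  */

static int
example_can_extend_group_p (long long init_a, long long init_b,
                            long long type_size, long long step)
{
  long long delta = init_b - init_a;
  if (delta == 0)
    return 0;                   /* same access twice */
  if (type_size == 0 || delta % type_size != 0)
    return 0;                   /* not a whole number of elements apart */
  if (step != 0 && step <= delta)
    return 0;                   /* B already belongs to a later iteration */
  return 1;
}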
2929 /* Function vect_vfa_segment_size.
2931 Create an expression that computes the size of the segment
2932 that will be accessed for a data reference. The function takes into
2933 account that realignment loads may access one more vector.
2935 Input:
2936 DR: The data reference.
2937 LENGTH_FACTOR: segment length to consider.
2939 Return an expression whose value is the size of segment which will be
2940 accessed by DR. */
2942 static tree
2943 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2945 tree segment_length;
2947 if (integer_zerop (DR_STEP (dr)))
2948 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2949 else
2950 segment_length = size_binop (MULT_EXPR,
2951 fold_convert (sizetype, DR_STEP (dr)),
2952 fold_convert (sizetype, length_factor));
2954 if (vect_supportable_dr_alignment (dr, false)
2955 == dr_explicit_realign_optimized)
2957 tree vector_size = TYPE_SIZE_UNIT
2958 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2960 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2962 return segment_length;
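/* Editorial sketch, not part of GCC: the segment-size formula above on
   plain integers.  STEP, TYPE_SIZE, LENGTH_FACTOR and VECTOR_SIZE play
   the roles of DR_STEP, the scalar type size, the chosen length factor
   and the vector size; REALIGN_OPTIMIZED says whether the access uses
   dr_explicit_realign_optimized.  All names are hypothetical.  */

static long long
example_vfa_segment_size (long long step, long long type_size,
                          long long length_factor, long long vector_size,
                          int realign_optimized)
{
  long long segment = step == 0 ? type_size : step * length_factor;
  if (realign_optimized)
    segment += vector_size;     /* realignment may touch one extra vector */
  return segment;
}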
2965 /* Function vect_no_alias_p.
2967 Given data references A and B with equal base and offset, whose alias
2968 relation can be decided at compilation time, return TRUE if they do
2969 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2970 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2971 respectively. */
2973 static bool
2974 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2975 tree segment_length_a, tree segment_length_b)
2977 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2978 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2979 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2980 return false;
2982 tree seg_a_min = DR_INIT (a);
2983 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2984 seg_a_min, segment_length_a);
2985 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2986 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2987 [a, a+12) */
2988 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2990 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2991 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2992 seg_a_max, unit_size);
2993 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2994 DR_INIT (a), unit_size);
2996 tree seg_b_min = DR_INIT (b);
2997 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2998 seg_b_min, segment_length_b);
2999 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3001 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3002 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3003 seg_b_max, unit_size);
3004 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3005 DR_INIT (b), unit_size);
3008 if (tree_int_cst_le (seg_a_max, seg_b_min)
3009 || tree_int_cst_le (seg_b_max, seg_a_min))
3010 return true;
3012 return false;
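/* Editorial sketch, not part of GCC: the interval test above on plain
   integers.  INIT_*, LEN_* (negative for a negative step), STEP_* and
   UNIT_* mirror DR_INIT, the segment length, DR_STEP and TYPE_SIZE_UNIT;
   all names are hypothetical.  */

static int
example_no_alias_p (long long init_a, long long len_a, long long step_a,
                    long long unit_a,
                    long long init_b, long long len_b, long long step_b,
                    long long unit_b)
{
  if (init_a == init_b)
    return 0;                   /* identical start addresses alias */

  long long a_min = init_a, a_max = init_a + len_a;
  if (step_a < 0)
    {
      /* Negative step: LEN_A is negative, so the bytes touched are
         [init + len + unit, init + unit) rather than [init, init + len).  */
      a_min = init_a + len_a + unit_a;
      a_max = init_a + unit_a;
    }
  long long b_min = init_b, b_max = init_b + len_b;
  if (step_b < 0)
    {
      b_min = init_b + len_b + unit_b;
      b_max = init_b + unit_b;
    }
  /* Disjoint half-open intervals cannot alias.  */
  return a_max <= b_min || b_max <= a_min;
}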
3015 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3016 in DDR is >= VF. */
3018 static bool
3019 dependence_distance_ge_vf (data_dependence_relation *ddr,
3020 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3022 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3023 || DDR_NUM_DIST_VECTS (ddr) == 0)
3024 return false;
3026 /* If the dependence is exact, we should have limited the VF instead. */
3027 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3029 unsigned int i;
3030 lambda_vector dist_v;
3031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3033 HOST_WIDE_INT dist = dist_v[loop_depth];
3034 if (dist != 0
3035 && !(dist > 0 && DDR_REVERSED_P (ddr))
3036 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3037 return false;
3040 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "dependence distance between ");
3044 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3045 dump_printf (MSG_NOTE, " and ");
3046 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3047 dump_printf (MSG_NOTE, " is >= VF\n");
3050 return true;
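/* Editorial sketch, not part of GCC: the distance test above over a plain
   array of distances for one loop depth.  DIST[] and REVERSED stand in
   for the distance vectors and DDR_REVERSED_P; all names are
   hypothetical.  */

static int
example_distance_ge_vf_p (const long long *dist, unsigned int n,
                          unsigned long long vf, int reversed)
{
  for (unsigned int k = 0; k < n; k++)
    {
      long long d = dist[k];
      if (d != 0
          && !(d > 0 && reversed)
          && (unsigned long long) (d < 0 ? -d : d) < vf)
        return 0;               /* some dependence is closer than VF */
    }
  return 1;
}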
3053 /* Function vect_prune_runtime_alias_test_list.
3055 Prune a list of ddrs to be tested at run-time by versioning for alias.
3056 Merge several alias checks into one if possible.
3057 Return FALSE if the resulting list of ddrs is longer than allowed by
3058 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3060 bool
3061 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3063 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3064 hash_set <tree_pair_hash> compared_objects;
3066 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3067 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3068 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3069 vec<vec_object_pair> &check_unequal_addrs
3070 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3071 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3072 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3074 ddr_p ddr;
3075 unsigned int i;
3076 tree length_factor;
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "=== vect_prune_runtime_alias_test_list ===\n");
3082 if (may_alias_ddrs.is_empty ())
3083 return true;
3085 comp_alias_ddrs.create (may_alias_ddrs.length ());
3087 unsigned int loop_depth
3088 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3089 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3091 /* First, we collect all data ref pairs for aliasing checks. */
3092 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3094 int comp_res;
3095 struct data_reference *dr_a, *dr_b;
3096 gimple *dr_group_first_a, *dr_group_first_b;
3097 tree segment_length_a, segment_length_b;
3098 gimple *stmt_a, *stmt_b;
3100 /* Ignore the alias if the VF we chose ended up being no greater
3101 than the dependence distance. */
3102 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3103 continue;
3105 if (DDR_OBJECT_A (ddr))
3107 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3108 if (!compared_objects.add (new_pair))
3110 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3113 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3114 dump_printf (MSG_NOTE, " and ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3116 dump_printf (MSG_NOTE, " have different addresses\n");
3118 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3120 continue;
3123 dr_a = DDR_A (ddr);
3124 stmt_a = DR_STMT (DDR_A (ddr));
3125 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3126 if (dr_group_first_a)
3128 stmt_a = dr_group_first_a;
3129 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3132 dr_b = DDR_B (ddr);
3133 stmt_b = DR_STMT (DDR_B (ddr));
3134 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3135 if (dr_group_first_b)
3137 stmt_b = dr_group_first_b;
3138 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3141 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3142 length_factor = scalar_loop_iters;
3143 else
3144 length_factor = size_int (vect_factor);
3145 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3146 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3148 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3149 DR_BASE_ADDRESS (dr_b));
3150 if (comp_res == 0)
3151 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3152 DR_OFFSET (dr_b));
3154 /* Alias is known at compilation time. */
3155 if (comp_res == 0
3156 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3157 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3158 && TREE_CODE (segment_length_a) == INTEGER_CST
3159 && TREE_CODE (segment_length_b) == INTEGER_CST)
3161 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3162 continue;
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_NOTE, vect_location,
3166 "not vectorized: compilation time alias.\n");
3168 return false;
3171 dr_with_seg_len_pair_t dr_with_seg_len_pair
3172 (dr_with_seg_len (dr_a, segment_length_a),
3173 dr_with_seg_len (dr_b, segment_length_b));
3175 /* Canonicalize pairs by sorting the two DR members. */
3176 if (comp_res > 0)
3177 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3179 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3182 prune_runtime_alias_test_list (&comp_alias_ddrs,
3183 (unsigned HOST_WIDE_INT) vect_factor);
3185 unsigned int count = (comp_alias_ddrs.length ()
3186 + check_unequal_addrs.length ());
3187 dump_printf_loc (MSG_NOTE, vect_location,
3188 "improved number of alias checks from %d to %d\n",
3189 may_alias_ddrs.length (), count);
3190 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3192 if (dump_enabled_p ())
3193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3194 "number of versioning for alias "
3195 "run-time tests exceeds %d "
3196 "(--param vect-max-version-for-alias-checks)\n",
3197 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3198 return false;
3201 return true;
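/* Editorial sketch, not part of GCC: the length-factor choice made above,
   i.e. how many iterations' worth of accesses each runtime alias check
   has to cover.  All names are hypothetical, and the rationale in the
   comment is my reading of the code rather than a documented rule.  */

static long long
example_length_factor (long long step_a, long long step_b,
                       long long scalar_niters, long long vf)
{
  /* With different steps the two references presumably drift apart over
     the whole loop, so the check covers the scalar iteration count;
     otherwise VF iterations are enough.  */
  return step_a != step_b ? scalar_niters : vf;
}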
3204 /* Return true if a non-affine read or write in STMT is suitable for a
3205 gather load or scatter store. Describe the operation in *INFO if so. */
3207 bool
3208 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3209 gather_scatter_info *info)
3211 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3214 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3215 tree offtype = NULL_TREE;
3216 tree decl, base, off;
3217 machine_mode pmode;
3218 int punsignedp, reversep, pvolatilep = 0;
3220 base = DR_REF (dr);
3221 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3222 see if we can use the def stmt of the address. */
3223 if (is_gimple_call (stmt)
3224 && gimple_call_internal_p (stmt)
3225 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3226 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3227 && TREE_CODE (base) == MEM_REF
3228 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3229 && integer_zerop (TREE_OPERAND (base, 1))
3230 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3232 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3233 if (is_gimple_assign (def_stmt)
3234 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3235 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3238 /* The gather and scatter builtins need an address of the form
3239 loop_invariant + vector * {1, 2, 4, 8}
3240 or
3241 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3242 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3243 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3244 multiplications and additions in it. To get a vector, we need
3245 a single SSA_NAME that will be defined in the loop and will
3246 contain everything that is not loop invariant and that can be
3247 vectorized. The following code attempts to find such a preexisting
3248 SSA_NAME OFF and put the loop invariants into a tree BASE
3249 that can be gimplified before the loop. */
3250 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3251 &punsignedp, &reversep, &pvolatilep);
3252 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3254 if (TREE_CODE (base) == MEM_REF)
3256 if (!integer_zerop (TREE_OPERAND (base, 1)))
3258 if (off == NULL_TREE)
3260 offset_int moff = mem_ref_offset (base);
3261 off = wide_int_to_tree (sizetype, moff);
3263 else
3264 off = size_binop (PLUS_EXPR, off,
3265 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3267 base = TREE_OPERAND (base, 0);
3269 else
3270 base = build_fold_addr_expr (base);
3272 if (off == NULL_TREE)
3273 off = size_zero_node;
3275 /* If base is not loop invariant, then either off is 0 and we start with
3276 just the constant offset in the loop-invariant BASE, continuing with
3277 base as OFF, or we give up.
3278 We could handle that case by gimplifying the addition of base + off
3279 into some SSA_NAME and using that as off, but for now punt. */
3280 if (!expr_invariant_in_loop_p (loop, base))
3282 if (!integer_zerop (off))
3283 return false;
3284 off = base;
3285 base = size_int (pbitpos / BITS_PER_UNIT);
3287 /* Otherwise put base + constant offset into the loop invariant BASE
3288 and continue with OFF. */
3289 else
3291 base = fold_convert (sizetype, base);
3292 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3295 /* OFF at this point may be either a SSA_NAME or some tree expression
3296 from get_inner_reference. Try to peel off loop invariants from it
3297 into BASE as long as possible. */
3298 STRIP_NOPS (off);
3299 while (offtype == NULL_TREE)
3301 enum tree_code code;
3302 tree op0, op1, add = NULL_TREE;
3304 if (TREE_CODE (off) == SSA_NAME)
3306 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3308 if (expr_invariant_in_loop_p (loop, off))
3309 return false;
3311 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3312 break;
3314 op0 = gimple_assign_rhs1 (def_stmt);
3315 code = gimple_assign_rhs_code (def_stmt);
3316 op1 = gimple_assign_rhs2 (def_stmt);
3318 else
3320 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3321 return false;
3322 code = TREE_CODE (off);
3323 extract_ops_from_tree (off, &code, &op0, &op1);
3325 switch (code)
3327 case POINTER_PLUS_EXPR:
3328 case PLUS_EXPR:
3329 if (expr_invariant_in_loop_p (loop, op0))
3331 add = op0;
3332 off = op1;
3333 do_add:
3334 add = fold_convert (sizetype, add);
3335 if (scale != 1)
3336 add = size_binop (MULT_EXPR, add, size_int (scale));
3337 base = size_binop (PLUS_EXPR, base, add);
3338 continue;
3340 if (expr_invariant_in_loop_p (loop, op1))
3342 add = op1;
3343 off = op0;
3344 goto do_add;
3346 break;
3347 case MINUS_EXPR:
3348 if (expr_invariant_in_loop_p (loop, op1))
3350 add = fold_convert (sizetype, op1);
3351 add = size_binop (MINUS_EXPR, size_zero_node, add);
3352 off = op0;
3353 goto do_add;
3355 break;
3356 case MULT_EXPR:
3357 if (scale == 1 && tree_fits_shwi_p (op1))
3359 scale = tree_to_shwi (op1);
3360 off = op0;
3361 continue;
3363 break;
3364 case SSA_NAME:
3365 off = op0;
3366 continue;
3367 CASE_CONVERT:
3368 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3369 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3370 break;
3371 if (TYPE_PRECISION (TREE_TYPE (op0))
3372 == TYPE_PRECISION (TREE_TYPE (off)))
3374 off = op0;
3375 continue;
3377 if (TYPE_PRECISION (TREE_TYPE (op0))
3378 < TYPE_PRECISION (TREE_TYPE (off)))
3380 off = op0;
3381 offtype = TREE_TYPE (off);
3382 STRIP_NOPS (off);
3383 continue;
3385 break;
3386 default:
3387 break;
3389 break;
3392 /* If at the end OFF still isn't a SSA_NAME or isn't
3393 defined in the loop, punt. */
3394 if (TREE_CODE (off) != SSA_NAME
3395 || expr_invariant_in_loop_p (loop, off))
3396 return false;
3398 if (offtype == NULL_TREE)
3399 offtype = TREE_TYPE (off);
3401 if (DR_IS_READ (dr))
3402 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3403 offtype, scale);
3404 else
3405 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3406 offtype, scale);
3408 if (decl == NULL_TREE)
3409 return false;
3411 info->decl = decl;
3412 info->base = base;
3413 info->offset = off;
3414 info->offset_dt = vect_unknown_def_type;
3415 info->offset_vectype = NULL_TREE;
3416 info->scale = scale;
3417 return true;
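/* Editorial sketch, not part of GCC: the peeling loop above maintains the
   invariant that BASE + OFF * SCALE keeps denoting the original address
   while invariants are folded out of OFF.  The numbers below are
   arbitrary illustrative values and all names are hypothetical.  */

#include <assert.h>

static void
example_gather_decomposition (void)
{
  long long base = 100, scale = 1;
  long long inv = 24, idx = 7;          /* off = inv + idx * 4 */
  long long addr = base + (inv + idx * 4) * scale;
  long long off;

  /* PLUS_EXPR with an invariant operand: fold INV (times SCALE) into
     BASE and continue with idx * 4 as OFF.  */
  base += inv * scale;
  off = idx * 4;
  assert (addr == base + off * scale);

  /* MULT_EXPR by a constant while SCALE is 1: fold the constant into
     SCALE and continue with idx as OFF.  */
  scale *= 4;
  off = idx;
  assert (addr == base + off * scale);
}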
3420 /* Function vect_analyze_data_refs.
3422 Find all the data references in the loop or basic block.
3424 The general structure of the analysis of data refs in the vectorizer is as
3425 follows:
3426 1- vect_analyze_data_refs(loop/bb): call
3427 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3428 in the loop/bb and their dependences.
3429 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3430 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3431 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3435 bool
3436 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3438 struct loop *loop = NULL;
3439 unsigned int i;
3440 struct data_reference *dr;
3441 tree scalar_type;
3443 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location,
3445 "=== vect_analyze_data_refs ===\n");
3447 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3448 loop = LOOP_VINFO_LOOP (loop_vinfo);
3450 /* Go through the data-refs, check that the analysis succeeded. Update
3451 pointer from stmt_vec_info struct to DR and vectype. */
3453 vec<data_reference_p> datarefs = vinfo->datarefs;
3454 FOR_EACH_VEC_ELT (datarefs, i, dr)
3456 gimple *stmt;
3457 stmt_vec_info stmt_info;
3458 tree base, offset, init;
3459 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3460 bool simd_lane_access = false;
3461 int vf;
3463 again:
3464 if (!dr || !DR_REF (dr))
3466 if (dump_enabled_p ())
3467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3468 "not vectorized: unhandled data-ref\n");
3469 return false;
3472 stmt = DR_STMT (dr);
3473 stmt_info = vinfo_for_stmt (stmt);
3475 /* Discard clobbers from the dataref vector. We will remove
3476 clobber stmts during vectorization. */
3477 if (gimple_clobber_p (stmt))
3479 free_data_ref (dr);
3480 if (i == datarefs.length () - 1)
3482 datarefs.pop ();
3483 break;
3485 datarefs.ordered_remove (i);
3486 dr = datarefs[i];
3487 goto again;
3490 /* Check that analysis of the data-ref succeeded. */
3491 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3492 || !DR_STEP (dr))
3494 bool maybe_gather
3495 = DR_IS_READ (dr)
3496 && !TREE_THIS_VOLATILE (DR_REF (dr))
3497 && targetm.vectorize.builtin_gather != NULL;
3498 bool maybe_scatter
3499 = DR_IS_WRITE (dr)
3500 && !TREE_THIS_VOLATILE (DR_REF (dr))
3501 && targetm.vectorize.builtin_scatter != NULL;
3502 bool maybe_simd_lane_access
3503 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3505 /* If the target supports vector gather loads or scatter stores, or if
3506 this might be a SIMD lane access, see if they can be used. */
3507 if (is_a <loop_vec_info> (vinfo)
3508 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3509 && !nested_in_vect_loop_p (loop, stmt))
3511 struct data_reference *newdr
3512 = create_data_ref (NULL, loop_containing_stmt (stmt),
3513 DR_REF (dr), stmt, !maybe_scatter,
3514 DR_IS_CONDITIONAL_IN_STMT (dr));
3515 gcc_assert (newdr != NULL && DR_REF (newdr));
3516 if (DR_BASE_ADDRESS (newdr)
3517 && DR_OFFSET (newdr)
3518 && DR_INIT (newdr)
3519 && DR_STEP (newdr)
3520 && integer_zerop (DR_STEP (newdr)))
3522 if (maybe_simd_lane_access)
3524 tree off = DR_OFFSET (newdr);
3525 STRIP_NOPS (off);
3526 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3527 && TREE_CODE (off) == MULT_EXPR
3528 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3530 tree step = TREE_OPERAND (off, 1);
3531 off = TREE_OPERAND (off, 0);
3532 STRIP_NOPS (off);
3533 if (CONVERT_EXPR_P (off)
3534 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3535 0)))
3536 < TYPE_PRECISION (TREE_TYPE (off)))
3537 off = TREE_OPERAND (off, 0);
3538 if (TREE_CODE (off) == SSA_NAME)
3540 gimple *def = SSA_NAME_DEF_STMT (off);
3541 tree reft = TREE_TYPE (DR_REF (newdr));
3542 if (is_gimple_call (def)
3543 && gimple_call_internal_p (def)
3544 && (gimple_call_internal_fn (def)
3545 == IFN_GOMP_SIMD_LANE))
3547 tree arg = gimple_call_arg (def, 0);
3548 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3549 arg = SSA_NAME_VAR (arg);
3550 if (arg == loop->simduid
3551 /* For now. */
3552 && tree_int_cst_equal
3553 (TYPE_SIZE_UNIT (reft),
3554 step))
3556 DR_OFFSET (newdr) = ssize_int (0);
3557 DR_STEP (newdr) = step;
3558 DR_OFFSET_ALIGNMENT (newdr)
3559 = BIGGEST_ALIGNMENT;
3560 DR_STEP_ALIGNMENT (newdr)
3561 = highest_pow2_factor (step);
3562 dr = newdr;
3563 simd_lane_access = true;
3569 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3571 dr = newdr;
3572 if (maybe_gather)
3573 gatherscatter = GATHER;
3574 else
3575 gatherscatter = SCATTER;
3578 if (gatherscatter == SG_NONE && !simd_lane_access)
3579 free_data_ref (newdr);
3582 if (gatherscatter == SG_NONE && !simd_lane_access)
3584 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "not vectorized: data ref analysis "
3588 "failed ");
3589 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 if (is_a <bb_vec_info> (vinfo))
3593 break;
3595 return false;
3599 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "not vectorized: base addr of dr is a "
3604 "constant\n");
3606 if (is_a <bb_vec_info> (vinfo))
3607 break;
3609 if (gatherscatter != SG_NONE || simd_lane_access)
3610 free_data_ref (dr);
3611 return false;
3614 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3616 if (dump_enabled_p ())
3618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3619 "not vectorized: volatile type ");
3620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623 if (is_a <bb_vec_info> (vinfo))
3624 break;
3626 return false;
3629 if (stmt_can_throw_internal (stmt))
3631 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3634 "not vectorized: statement can throw an "
3635 "exception ");
3636 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3639 if (is_a <bb_vec_info> (vinfo))
3640 break;
3642 if (gatherscatter != SG_NONE || simd_lane_access)
3643 free_data_ref (dr);
3644 return false;
3647 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3648 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3650 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653 "not vectorized: statement is bitfield "
3654 "access ");
3655 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 if (is_a <bb_vec_info> (vinfo))
3659 break;
3661 if (gatherscatter != SG_NONE || simd_lane_access)
3662 free_data_ref (dr);
3663 return false;
3666 base = unshare_expr (DR_BASE_ADDRESS (dr));
3667 offset = unshare_expr (DR_OFFSET (dr));
3668 init = unshare_expr (DR_INIT (dr));
3670 if (is_gimple_call (stmt)
3671 && (!gimple_call_internal_p (stmt)
3672 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3673 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3675 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3678 "not vectorized: dr in a call ");
3679 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682 if (is_a <bb_vec_info> (vinfo))
3683 break;
3685 if (gatherscatter != SG_NONE || simd_lane_access)
3686 free_data_ref (dr);
3687 return false;
3690 /* Update DR field in stmt_vec_info struct. */
3692 /* If the dataref is in an inner-loop of the loop that is considered
3693 for vectorization, we also want to analyze the access relative to
3694 the outer-loop (DR contains information only relative to the
3695 inner-most enclosing loop). We do that by building a reference to the
3696 first location accessed by the inner-loop, and analyze it relative to
3697 the outer-loop. */
3698 if (loop && nested_in_vect_loop_p (loop, stmt))
3700 /* Build a reference to the first location accessed by the
3701 inner loop: *(BASE + INIT + OFFSET). By construction,
3702 this address must be invariant in the inner loop, so we
3703 can consider it as being used in the outer loop. */
3704 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3705 init, offset);
3706 tree init_addr = fold_build_pointer_plus (base, init_offset);
3707 tree init_ref = build_fold_indirect_ref (init_addr);
3709 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_NOTE, vect_location,
3712 "analyze in outer loop: ");
3713 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3714 dump_printf (MSG_NOTE, "\n");
3717 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3718 init_ref, loop))
3719 /* dr_analyze_innermost already explained the failure. */
3720 return false;
3722 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE, vect_location,
3725 "\touter base_address: ");
3726 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3727 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3728 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3729 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3730 STMT_VINFO_DR_OFFSET (stmt_info));
3731 dump_printf (MSG_NOTE,
3732 "\n\touter constant offset from base address: ");
3733 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734 STMT_VINFO_DR_INIT (stmt_info));
3735 dump_printf (MSG_NOTE, "\n\touter step: ");
3736 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737 STMT_VINFO_DR_STEP (stmt_info));
3738 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3739 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3740 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3741 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3742 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3743 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3744 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3745 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3749 if (STMT_VINFO_DATA_REF (stmt_info))
3751 if (dump_enabled_p ())
3753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3754 "not vectorized: more than one data ref "
3755 "in stmt: ");
3756 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3759 if (is_a <bb_vec_info> (vinfo))
3760 break;
3762 if (gatherscatter != SG_NONE || simd_lane_access)
3763 free_data_ref (dr);
3764 return false;
3767 STMT_VINFO_DATA_REF (stmt_info) = dr;
3768 if (simd_lane_access)
3770 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3771 free_data_ref (datarefs[i]);
3772 datarefs[i] = dr;
3775 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3776 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3777 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3779 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3782 "not vectorized: base object not addressable "
3783 "for stmt: ");
3784 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3786 if (is_a <bb_vec_info> (vinfo))
3788 /* In BB vectorization the ref can still participate
3789 in dependence analysis, we just can't vectorize it. */
3790 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3791 continue;
3793 return false;
3796 /* Set vectype for STMT. */
3797 scalar_type = TREE_TYPE (DR_REF (dr));
3798 STMT_VINFO_VECTYPE (stmt_info)
3799 = get_vectype_for_scalar_type (scalar_type);
3800 if (!STMT_VINFO_VECTYPE (stmt_info))
3802 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 "not vectorized: no vectype for stmt: ");
3806 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3807 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3808 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3809 scalar_type);
3810 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3813 if (is_a <bb_vec_info> (vinfo))
3815 /* No vector type is fine, the ref can still participate
3816 in dependence analysis, we just can't vectorize it. */
3817 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3818 continue;
3821 if (gatherscatter != SG_NONE || simd_lane_access)
3823 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3824 if (gatherscatter != SG_NONE)
3825 free_data_ref (dr);
3827 return false;
3829 else
3831 if (dump_enabled_p ())
3833 dump_printf_loc (MSG_NOTE, vect_location,
3834 "got vectype for stmt: ");
3835 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3836 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3837 STMT_VINFO_VECTYPE (stmt_info));
3838 dump_printf (MSG_NOTE, "\n");
3842 /* Adjust the minimal vectorization factor according to the
3843 vector type. */
3844 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3845 if (vf > *min_vf)
3846 *min_vf = vf;
3848 if (gatherscatter != SG_NONE)
3850 gather_scatter_info gs_info;
3851 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3852 &gs_info)
3853 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3855 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3856 free_data_ref (dr);
3857 if (dump_enabled_p ())
3859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3860 (gatherscatter == GATHER) ?
3861 "not vectorized: not suitable for gather "
3862 "load " :
3863 "not vectorized: not suitable for scatter "
3864 "store ");
3865 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3867 return false;
3870 free_data_ref (datarefs[i]);
3871 datarefs[i] = dr;
3872 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3875 else if (is_a <loop_vec_info> (vinfo)
3876 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3878 if (nested_in_vect_loop_p (loop, stmt))
3880 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3883 "not vectorized: not suitable for strided "
3884 "load ");
3885 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3887 return false;
3889 STMT_VINFO_STRIDED_P (stmt_info) = true;
3893 /* If, when trying to vectorize a basic-block, we stopped the analysis at
3894 the first dataref we could not analyze, mark the rest of the datarefs
3895 as not vectorizable and truncate the vector of datarefs. That
3896 avoids spending useless time in analyzing their dependence. */
3897 if (i != datarefs.length ())
3899 gcc_assert (is_a <bb_vec_info> (vinfo));
3900 for (unsigned j = i; j < datarefs.length (); ++j)
3902 data_reference_p dr = datarefs[j];
3903 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3904 free_data_ref (dr);
3906 datarefs.truncate (i);
3909 return true;
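
/* To make the classification above concrete, here is a standalone,
   illustrative sketch (not compiled as part of this file) of scalar loops
   whose data references roughly fall into the categories handled above.
   The array and parameter names are hypothetical.  */
#if 0
void
example_data_refs (int *a, int *b, int *idx, int stride, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      a[i] = b[i] + 1;        /* Constant DR_STEP: contiguous access.  */
      s += b[0];              /* Zero DR_STEP: loop-invariant load.  */
      a[i * stride] = i;      /* DR_STEP not an INTEGER_CST: STRIDED_P.  */
      s += b[idx[i]];         /* Address not affine in i: gather candidate,
                                 if the target provides builtin_gather.  */
    }
  a[0] = s;
}
#endif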
3913 /* Function vect_get_new_vect_var.
3915 Returns a name for a new variable. The current naming scheme prepends a
3916 prefix that depends on VAR_KIND ("vect", "vectp", "stmp" or "mask"),
3917 followed by an underscore, to NAME if NAME is provided; otherwise the
3918 prefix alone is used. */
3920 tree
3921 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3923 const char *prefix;
3924 tree new_vect_var;
3926 switch (var_kind)
3928 case vect_simple_var:
3929 prefix = "vect";
3930 break;
3931 case vect_scalar_var:
3932 prefix = "stmp";
3933 break;
3934 case vect_mask_var:
3935 prefix = "mask";
3936 break;
3937 case vect_pointer_var:
3938 prefix = "vectp";
3939 break;
3940 default:
3941 gcc_unreachable ();
3944 if (name)
3946 char* tmp = concat (prefix, "_", name, NULL);
3947 new_vect_var = create_tmp_reg (type, tmp);
3948 free (tmp);
3950 else
3951 new_vect_var = create_tmp_reg (type, prefix);
3953 return new_vect_var;
3956 /* Like vect_get_new_vect_var but return an SSA name. */
3958 tree
3959 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3961 const char *prefix;
3962 tree new_vect_var;
3964 switch (var_kind)
3966 case vect_simple_var:
3967 prefix = "vect";
3968 break;
3969 case vect_scalar_var:
3970 prefix = "stmp";
3971 break;
3972 case vect_pointer_var:
3973 prefix = "vectp";
3974 break;
3975 default:
3976 gcc_unreachable ();
3979 if (name)
3981 char* tmp = concat (prefix, "_", name, NULL);
3982 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3983 free (tmp);
3985 else
3986 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3988 return new_vect_var;
3991 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3993 static void
3994 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3995 stmt_vec_info stmt_info)
3997 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3998 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3999 int misalign = DR_MISALIGNMENT (dr);
4000 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4001 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4002 else
4003 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
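
/* A minimal standalone sketch of the invariant being recorded above,
   assuming DR_MISALIGNMENT is the byte offset of the access from an
   ALIGN-byte boundary (ALIGN a power of two) and a negative value means
   "unknown".  Not compiled as part of this file.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
check_recorded_misalignment (const void *p, unsigned int align, int misalign)
{
  if (misalign < 0)
    return;			/* Unknown: nothing can be asserted.  */
  assert ((uintptr_t) p % align == (unsigned int) misalign);
}
#endif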
4006 /* Function vect_create_addr_base_for_vector_ref.
4008 Create an expression that computes the address of the first memory location
4009 that will be accessed for a data reference.
4011 Input:
4012 STMT: The statement containing the data reference.
4013 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4014 OFFSET: Optional. If supplied, it is added to the initial address.
4015 LOOP: Specify relative to which loop-nest the address should be computed.
4016 For example, when the dataref is in an inner-loop nested in an
4017 outer-loop that is now being vectorized, LOOP can be either the
4018 outer-loop, or the inner-loop. The first memory location accessed
4019 by the following dataref ('in' points to short):
4021 for (i=0; i<N; i++)
4022 for (j=0; j<M; j++)
4023 s += in[i+j]
4025 is as follows:
4026 if LOOP=i_loop: &in (relative to i_loop)
4027 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4028 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4029 initial address. Unlike OFFSET, which is the number of elements to
4030 be added, BYTE_OFFSET is measured in bytes.
4032 Output:
4033 1. Return an SSA_NAME whose value is the address of the memory location of
4034 the first vector of the data reference.
4035 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4036 these statement(s) which define the returned SSA_NAME.
4038 FORNOW: We are only handling array accesses with step 1. */
4040 tree
4041 vect_create_addr_base_for_vector_ref (gimple *stmt,
4042 gimple_seq *new_stmt_list,
4043 tree offset,
4044 tree byte_offset)
4046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4047 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4048 const char *base_name;
4049 tree addr_base;
4050 tree dest;
4051 gimple_seq seq = NULL;
4052 tree vect_ptr_type;
4053 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4055 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4057 tree data_ref_base = unshare_expr (drb->base_address);
4058 tree base_offset = unshare_expr (drb->offset);
4059 tree init = unshare_expr (drb->init);
4061 if (loop_vinfo)
4062 base_name = get_name (data_ref_base);
4063 else
4065 base_offset = ssize_int (0);
4066 init = ssize_int (0);
4067 base_name = get_name (DR_REF (dr));
4070 /* Create base_offset */
4071 base_offset = size_binop (PLUS_EXPR,
4072 fold_convert (sizetype, base_offset),
4073 fold_convert (sizetype, init));
4075 if (offset)
4077 offset = fold_build2 (MULT_EXPR, sizetype,
4078 fold_convert (sizetype, offset), step);
4079 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4080 base_offset, offset);
4082 if (byte_offset)
4084 byte_offset = fold_convert (sizetype, byte_offset);
4085 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4086 base_offset, byte_offset);
4089 /* base + base_offset */
4090 if (loop_vinfo)
4091 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4092 else
4094 addr_base = build1 (ADDR_EXPR,
4095 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4096 unshare_expr (DR_REF (dr)));
4099 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4100 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4101 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4102 gimple_seq_add_seq (new_stmt_list, seq);
4104 if (DR_PTR_INFO (dr)
4105 && TREE_CODE (addr_base) == SSA_NAME
4106 && !SSA_NAME_PTR_INFO (addr_base))
4108 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4109 if (offset || byte_offset)
4110 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4113 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4116 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4117 dump_printf (MSG_NOTE, "\n");
4120 return addr_base;
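
/* A standalone sketch of the address arithmetic built above, assuming a
   short element type; OFFSET is scaled by the element size while
   BYTE_OFFSET is not.  The function and parameter names are hypothetical
   and the sketch is not compiled as part of this file.  */
#if 0
#include <stddef.h>

static char *
addr_base_sketch (char *base_address, ptrdiff_t var_offset, ptrdiff_t init,
		  ptrdiff_t offset_in_elts, ptrdiff_t byte_offset)
{
  ptrdiff_t step = sizeof (short);	/* TYPE_SIZE_UNIT of the ref.  */
  ptrdiff_t base_offset = var_offset + init;
  base_offset += offset_in_elts * step;
  base_offset += byte_offset;
  return base_address + base_offset;	/* base + base_offset.  */
}
#endif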
4124 /* Function vect_create_data_ref_ptr.
4126 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4127 location accessed in the loop by STMT, along with the def-use update
4128 chain to appropriately advance the pointer through the loop iterations.
4129 Also set aliasing information for the pointer. This pointer is used by
4130 the callers to this function to create a memory reference expression for
4131 vector load/store access.
4133 Input:
4134 1. STMT: a stmt that references memory. Expected to be of the form
4135 GIMPLE_ASSIGN <name, data-ref> or
4136 GIMPLE_ASSIGN <data-ref, name>.
4137 2. AGGR_TYPE: the type of the reference, which should be either a vector
4138 or an array.
4139 3. AT_LOOP: the loop where the vector memref is to be created.
4140 4. OFFSET (optional): an offset to be added to the initial address accessed
4141 by the data-ref in STMT.
4142 5. BSI: location where the new stmts are to be placed if there is no loop
4143 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4144 pointing to the initial address.
4145 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4146 to the initial address accessed by the data-ref in STMT. This is
4147 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4148 in bytes.
4150 Output:
4151 1. Declare a new ptr to vector_type, and have it point to the base of the
4152 data reference (the initial address accessed by the data reference).
4153 For example, for vector of type V8HI, the following code is generated:
4155 v8hi *ap;
4156 ap = (v8hi *)initial_address;
4158 if OFFSET is not supplied:
4159 initial_address = &a[init];
4160 if OFFSET is supplied:
4161 initial_address = &a[init + OFFSET];
4162 if BYTE_OFFSET is supplied:
4163 initial_address = &a[init] + BYTE_OFFSET;
4165 Return the initial_address in INITIAL_ADDRESS.
4167 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4168 update the pointer in each iteration of the loop.
4170 Return the increment stmt that updates the pointer in PTR_INCR.
4172 3. Set INV_P to true if the access pattern of the data reference in the
4173 vectorized loop is invariant. Set it to false otherwise.
4175 4. Return the pointer. */
4177 tree
4178 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4179 tree offset, tree *initial_address,
4180 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4181 bool only_init, bool *inv_p, tree byte_offset)
4183 const char *base_name;
4184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4186 struct loop *loop = NULL;
4187 bool nested_in_vect_loop = false;
4188 struct loop *containing_loop = NULL;
4189 tree aggr_ptr_type;
4190 tree aggr_ptr;
4191 tree new_temp;
4192 gimple_seq new_stmt_list = NULL;
4193 edge pe = NULL;
4194 basic_block new_bb;
4195 tree aggr_ptr_init;
4196 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4197 tree aptr;
4198 gimple_stmt_iterator incr_gsi;
4199 bool insert_after;
4200 tree indx_before_incr, indx_after_incr;
4201 gimple *incr;
4202 tree step;
4203 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4205 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4206 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4208 if (loop_vinfo)
4210 loop = LOOP_VINFO_LOOP (loop_vinfo);
4211 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4212 containing_loop = (gimple_bb (stmt))->loop_father;
4213 pe = loop_preheader_edge (loop);
4215 else
4217 gcc_assert (bb_vinfo);
4218 only_init = true;
4219 *ptr_incr = NULL;
4222 /* Check the step (evolution) of the load in LOOP, and record
4223 whether it's invariant. */
4224 step = vect_dr_behavior (dr)->step;
4225 if (integer_zerop (step))
4226 *inv_p = true;
4227 else
4228 *inv_p = false;
4230 /* Create an expression for the first address accessed by this load
4231 in LOOP. */
4232 base_name = get_name (DR_BASE_ADDRESS (dr));
4234 if (dump_enabled_p ())
4236 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4237 dump_printf_loc (MSG_NOTE, vect_location,
4238 "create %s-pointer variable to type: ",
4239 get_tree_code_name (TREE_CODE (aggr_type)));
4240 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4241 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4242 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4243 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4244 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4245 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4246 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4247 else
4248 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4250 dump_printf (MSG_NOTE, "\n");
4253 /* (1) Create the new aggregate-pointer variable.
4254 Vector and array types inherit the alias set of their component
4255 type by default so we need to use a ref-all pointer if the data
4256 reference does not conflict with the created aggregated data
4257 reference because it is not addressable. */
4258 bool need_ref_all = false;
4259 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4260 get_alias_set (DR_REF (dr))))
4261 need_ref_all = true;
4262 /* Likewise for any of the data references in the stmt group. */
4263 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4265 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4268 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4269 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4270 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4271 get_alias_set (DR_REF (sdr))))
4273 need_ref_all = true;
4274 break;
4276 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4278 while (orig_stmt);
4280 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4281 need_ref_all);
4282 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4285 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4286 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4287 def-use update cycles for the pointer: one relative to the outer-loop
4288 (LOOP), which is what steps (3) and (4) below do. The other is relative
4289 to the inner-loop (which is the inner-most loop containing the dataref),
4290 and this is done by step (5) below.
4292 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4293 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4294 redundant. Steps (3),(4) create the following:
4296 vp0 = &base_addr;
4297 LOOP: vp1 = phi(vp0,vp2)
4300 vp2 = vp1 + step
4301 goto LOOP
4303 If there is an inner-loop nested in loop, then step (5) will also be
4304 applied, and an additional update in the inner-loop will be created:
4306 vp0 = &base_addr;
4307 LOOP: vp1 = phi(vp0,vp2)
4309 inner: vp3 = phi(vp1,vp4)
4310 vp4 = vp3 + inner_step
4311 if () goto inner
4313 vp2 = vp1 + step
4314 if () goto LOOP */
4316 /* (2) Calculate the initial address of the aggregate-pointer, and set
4317 the aggregate-pointer to point to it before the loop. */
4319 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4321 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4322 offset, byte_offset);
4323 if (new_stmt_list)
4325 if (pe)
4327 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4328 gcc_assert (!new_bb);
4330 else
4331 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4334 *initial_address = new_temp;
4335 aggr_ptr_init = new_temp;
4337 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4338 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4339 inner-loop nested in LOOP (during outer-loop vectorization). */
4341 /* No update in loop is required. */
4342 if (only_init && (!loop_vinfo || at_loop == loop))
4343 aptr = aggr_ptr_init;
4344 else
4346 /* The step of the aggregate pointer is the type size. */
4347 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4348 /* One exception to the above is when the scalar step of the load in
4349 LOOP is zero. In this case the step here is also zero. */
4350 if (*inv_p)
4351 iv_step = size_zero_node;
4352 else if (tree_int_cst_sgn (step) == -1)
4353 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4355 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4357 create_iv (aggr_ptr_init,
4358 fold_convert (aggr_ptr_type, iv_step),
4359 aggr_ptr, loop, &incr_gsi, insert_after,
4360 &indx_before_incr, &indx_after_incr);
4361 incr = gsi_stmt (incr_gsi);
4362 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4364 /* Copy the points-to information if it exists. */
4365 if (DR_PTR_INFO (dr))
4367 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4368 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4370 if (ptr_incr)
4371 *ptr_incr = incr;
4373 aptr = indx_before_incr;
4376 if (!nested_in_vect_loop || only_init)
4377 return aptr;
4380 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4381 nested in LOOP, if it exists. */
4383 gcc_assert (nested_in_vect_loop);
4384 if (!only_init)
4386 standard_iv_increment_position (containing_loop, &incr_gsi,
4387 &insert_after);
4388 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4389 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4390 &indx_after_incr);
4391 incr = gsi_stmt (incr_gsi);
4392 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4394 /* Copy the points-to information if it exists. */
4395 if (DR_PTR_INFO (dr))
4397 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4398 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4400 if (ptr_incr)
4401 *ptr_incr = incr;
4403 return indx_before_incr;
4405 else
4406 gcc_unreachable ();
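
/* A standalone sketch of the pointer def-use cycle set up above for the
   simple (non-outer-loop) case: the pointer starts at the initial address
   computed in the preheader and advances by TYPE_SIZE_UNIT (aggr_type)
   per iteration.  The vector type and loop are hypothetical; not compiled
   as part of this file.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static void
copy_vectors (v4si *dst, const v4si *src, int nvecs)
{
  const v4si *vp = src;		 /* vp0 = &base_addr (preheader).  */
  for (int i = 0; i < nvecs; i++)
    {
      dst[i] = *vp;		 /* Use of vp1 = phi (vp0, vp2).  */
      vp = vp + 1;		 /* vp2 = vp1 + step (one aggregate).  */
    }
}
#endif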
4410 /* Function bump_vector_ptr
4412 Increment a pointer (to a vector type) by vector-size. If requested,
4413 i.e. if PTR_INCR is given, then also connect the new increment stmt
4414 to the existing def-use update-chain of the pointer, by modifying
4415 the PTR_INCR as illustrated below:
4417 The pointer def-use update-chain before this function:
4418 DATAREF_PTR = phi (p_0, p_2)
4419 ....
4420 PTR_INCR: p_2 = DATAREF_PTR + step
4422 The pointer def-use update-chain after this function:
4423 DATAREF_PTR = phi (p_0, p_2)
4424 ....
4425 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4426 ....
4427 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4429 Input:
4430 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4431 in the loop.
4432 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4433 the loop. The increment amount across iterations is expected
4434 to be vector_size.
4435 BSI - location where the new update stmt is to be placed.
4436 STMT - the original scalar memory-access stmt that is being vectorized.
4437 BUMP - optional. The offset by which to bump the pointer. If not given,
4438 the offset is assumed to be vector_size.
4440 Output: Return NEW_DATAREF_PTR as illustrated above.
4444 tree
4445 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4446 gimple *stmt, tree bump)
4448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4449 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4450 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4451 tree update = TYPE_SIZE_UNIT (vectype);
4452 gassign *incr_stmt;
4453 ssa_op_iter iter;
4454 use_operand_p use_p;
4455 tree new_dataref_ptr;
4457 if (bump)
4458 update = bump;
4460 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4461 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4462 else
4463 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4464 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4465 dataref_ptr, update);
4466 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4468 /* Copy the points-to information if it exists. */
4469 if (DR_PTR_INFO (dr))
4471 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4472 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4475 if (!ptr_incr)
4476 return new_dataref_ptr;
4478 /* Update the vector-pointer's cross-iteration increment. */
4479 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4481 tree use = USE_FROM_PTR (use_p);
4483 if (use == dataref_ptr)
4484 SET_USE (use_p, new_dataref_ptr);
4485 else
4486 gcc_assert (tree_int_cst_compare (use, update) == 0);
4489 return new_dataref_ptr;
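
/* A standalone sketch of the def-use rewiring above when two vectors are
   accessed per iteration: the bump produces the address of the second
   vector, and the cross-iteration increment is then applied to the bumped
   pointer, so the pointer advances past both vectors.  Hypothetical types
   and names; not compiled as part of this file.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static void
load_two_per_iter (v4si *dst, const v4si *src, int npairs)
{
  const v4si *dataref_ptr = src;
  for (int i = 0; i < npairs; i++)
    {
      dst[2 * i] = dataref_ptr[0];
      const v4si *new_dataref_ptr = dataref_ptr + 1;	/* The bump.  */
      dst[2 * i + 1] = new_dataref_ptr[0];
      dataref_ptr = new_dataref_ptr + 1;    /* PTR_INCR now uses the bump.  */
    }
}
#endif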
4493 /* Function vect_create_destination_var.
4495 Create a new temporary of type VECTYPE. */
4497 tree
4498 vect_create_destination_var (tree scalar_dest, tree vectype)
4500 tree vec_dest;
4501 const char *name;
4502 char *new_name;
4503 tree type;
4504 enum vect_var_kind kind;
4506 kind = vectype
4507 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4508 ? vect_mask_var
4509 : vect_simple_var
4510 : vect_scalar_var;
4511 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4513 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4515 name = get_name (scalar_dest);
4516 if (name)
4517 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4518 else
4519 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4520 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4521 free (new_name);
4523 return vec_dest;
4526 /* Function vect_grouped_store_supported.
4528 Returns TRUE if interleave high and interleave low permutations
4529 are supported, and FALSE otherwise. */
4531 bool
4532 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4534 machine_mode mode = TYPE_MODE (vectype);
4536 /* vect_permute_store_chain requires the group size to be equal to 3 or
4537 be a power of two. */
4538 if (count != 3 && exact_log2 (count) == -1)
4540 if (dump_enabled_p ())
4541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4542 "the size of the group of accesses"
4543 " is not a power of 2 or not eqaul to 3\n");
4544 return false;
4547 /* Check that the permutation is supported. */
4548 if (VECTOR_MODE_P (mode))
4550 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4551 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4553 if (count == 3)
4555 unsigned int j0 = 0, j1 = 0, j2 = 0;
4556 unsigned int i, j;
4558 for (j = 0; j < 3; j++)
4560 int nelt0 = ((3 - j) * nelt) % 3;
4561 int nelt1 = ((3 - j) * nelt + 1) % 3;
4562 int nelt2 = ((3 - j) * nelt + 2) % 3;
4563 for (i = 0; i < nelt; i++)
4565 if (3 * i + nelt0 < nelt)
4566 sel[3 * i + nelt0] = j0++;
4567 if (3 * i + nelt1 < nelt)
4568 sel[3 * i + nelt1] = nelt + j1++;
4569 if (3 * i + nelt2 < nelt)
4570 sel[3 * i + nelt2] = 0;
4572 if (!can_vec_perm_p (mode, false, sel))
4574 if (dump_enabled_p ())
4575 dump_printf (MSG_MISSED_OPTIMIZATION,
4576 "permutaion op not supported by target.\n");
4577 return false;
4580 for (i = 0; i < nelt; i++)
4582 if (3 * i + nelt0 < nelt)
4583 sel[3 * i + nelt0] = 3 * i + nelt0;
4584 if (3 * i + nelt1 < nelt)
4585 sel[3 * i + nelt1] = 3 * i + nelt1;
4586 if (3 * i + nelt2 < nelt)
4587 sel[3 * i + nelt2] = nelt + j2++;
4589 if (!can_vec_perm_p (mode, false, sel))
4591 if (dump_enabled_p ())
4592 dump_printf (MSG_MISSED_OPTIMIZATION,
4593 "permutaion op not supported by target.\n");
4594 return false;
4597 return true;
4599 else
4601 /* If length is not equal to 3 then only power of 2 is supported. */
4602 gcc_assert (pow2p_hwi (count));
4604 for (i = 0; i < nelt / 2; i++)
4606 sel[i * 2] = i;
4607 sel[i * 2 + 1] = i + nelt;
4609 if (can_vec_perm_p (mode, false, sel))
4611 for (i = 0; i < nelt; i++)
4612 sel[i] += nelt / 2;
4613 if (can_vec_perm_p (mode, false, sel))
4614 return true;
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_MISSED_OPTIMIZATION,
4621 "permutaion op not supported by target.\n");
4622 return false;
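
/* A standalone sketch (not compiled here) of the two power-of-two selectors
   queried above, printed for an 8-element mode: the "high" interleave mixes
   the low halves of the two input vectors, the "low" interleave the high
   halves.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 8, sel[8];
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;		   /* Element i of the first vector.  */
      sel[i * 2 + 1] = i + nelt;   /* Element i of the second vector.  */
    }
  for (unsigned int i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);		   /* 0 8 1 9 2 10 3 11 */
  printf ("\n");
  for (unsigned int i = 0; i < nelt; i++)
    printf ("%u ", sel[i] + nelt / 2);	   /* 4 12 5 13 6 14 7 15 */
  printf ("\n");
  return 0;
}
#endif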
4626 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4627 type VECTYPE. */
4629 bool
4630 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4632 return vect_lanes_optab_supported_p ("vec_store_lanes",
4633 vec_store_lanes_optab,
4634 vectype, count);
4638 /* Function vect_permute_store_chain.
4640 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4641 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4642 the data correctly for the stores. Return the final references for stores
4643 in RESULT_CHAIN.
4645 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4646 The input is 4 vectors each containing 8 elements. We assign a number to
4647 each element, the input sequence is:
4649 1st vec: 0 1 2 3 4 5 6 7
4650 2nd vec: 8 9 10 11 12 13 14 15
4651 3rd vec: 16 17 18 19 20 21 22 23
4652 4th vec: 24 25 26 27 28 29 30 31
4654 The output sequence should be:
4656 1st vec: 0 8 16 24 1 9 17 25
4657 2nd vec: 2 10 18 26 3 11 19 27
4658 3rd vec: 4 12 20 28 5 13 21 29
4659 4th vec: 6 14 22 30 7 15 23 31
4661 i.e., we interleave the contents of the four vectors in their order.
4663 We use interleave_high/low instructions to create such output. The input of
4664 each interleave_high/low operation is two vectors:
4665 1st vec 2nd vec
4666 0 1 2 3 4 5 6 7
4667 the even elements of the result vector are obtained left-to-right from the
4668 high/low elements of the first vector. The odd elements of the result are
4669 obtained left-to-right from the high/low elements of the second vector.
4670 The output of interleave_high will be: 0 4 1 5
4671 and of interleave_low: 2 6 3 7
4674 The permutation is done in log LENGTH stages. In each stage interleave_high
4675 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4676 where the first argument is taken from the first half of DR_CHAIN and the
4677 second argument from its second half.
4678 In our example,
4680 I1: interleave_high (1st vec, 3rd vec)
4681 I2: interleave_low (1st vec, 3rd vec)
4682 I3: interleave_high (2nd vec, 4th vec)
4683 I4: interleave_low (2nd vec, 4th vec)
4685 The output for the first stage is:
4687 I1: 0 16 1 17 2 18 3 19
4688 I2: 4 20 5 21 6 22 7 23
4689 I3: 8 24 9 25 10 26 11 27
4690 I4: 12 28 13 29 14 30 15 31
4692 The output of the second stage, i.e. the final result is:
4694 I1: 0 8 16 24 1 9 17 25
4695 I2: 2 10 18 26 3 11 19 27
4696 I3: 4 12 20 28 5 13 21 29
4697 I4: 6 14 22 30 7 15 23 31. */
4699 void
4700 vect_permute_store_chain (vec<tree> dr_chain,
4701 unsigned int length,
4702 gimple *stmt,
4703 gimple_stmt_iterator *gsi,
4704 vec<tree> *result_chain)
4706 tree vect1, vect2, high, low;
4707 gimple *perm_stmt;
4708 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4709 tree perm_mask_low, perm_mask_high;
4710 tree data_ref;
4711 tree perm3_mask_low, perm3_mask_high;
4712 unsigned int i, n, log_length = exact_log2 (length);
4713 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4714 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4716 result_chain->quick_grow (length);
4717 memcpy (result_chain->address (), dr_chain.address (),
4718 length * sizeof (tree));
4720 if (length == 3)
4722 unsigned int j0 = 0, j1 = 0, j2 = 0;
4724 for (j = 0; j < 3; j++)
4726 int nelt0 = ((3 - j) * nelt) % 3;
4727 int nelt1 = ((3 - j) * nelt + 1) % 3;
4728 int nelt2 = ((3 - j) * nelt + 2) % 3;
4730 for (i = 0; i < nelt; i++)
4732 if (3 * i + nelt0 < nelt)
4733 sel[3 * i + nelt0] = j0++;
4734 if (3 * i + nelt1 < nelt)
4735 sel[3 * i + nelt1] = nelt + j1++;
4736 if (3 * i + nelt2 < nelt)
4737 sel[3 * i + nelt2] = 0;
4739 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4741 for (i = 0; i < nelt; i++)
4743 if (3 * i + nelt0 < nelt)
4744 sel[3 * i + nelt0] = 3 * i + nelt0;
4745 if (3 * i + nelt1 < nelt)
4746 sel[3 * i + nelt1] = 3 * i + nelt1;
4747 if (3 * i + nelt2 < nelt)
4748 sel[3 * i + nelt2] = nelt + j2++;
4750 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4752 vect1 = dr_chain[0];
4753 vect2 = dr_chain[1];
4755 /* Create interleaving stmt:
4756 low = VEC_PERM_EXPR <vect1, vect2,
4757 {j, nelt, *, j + 1, nelt + j + 1, *,
4758 j + 2, nelt + j + 2, *, ...}> */
4759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4761 vect2, perm3_mask_low);
4762 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4764 vect1 = data_ref;
4765 vect2 = dr_chain[2];
4766 /* Create interleaving stmt:
4767 low = VEC_PERM_EXPR <vect1, vect2,
4768 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4769 6, 7, nelt + j + 2, ...}> */
4770 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4771 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4772 vect2, perm3_mask_high);
4773 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4774 (*result_chain)[j] = data_ref;
4777 else
4779 /* If length is not equal to 3 then only power of 2 is supported. */
4780 gcc_assert (pow2p_hwi (length));
4782 for (i = 0, n = nelt / 2; i < n; i++)
4784 sel[i * 2] = i;
4785 sel[i * 2 + 1] = i + nelt;
4787 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4789 for (i = 0; i < nelt; i++)
4790 sel[i] += nelt / 2;
4791 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4793 for (i = 0, n = log_length; i < n; i++)
4795 for (j = 0; j < length/2; j++)
4797 vect1 = dr_chain[j];
4798 vect2 = dr_chain[j+length/2];
4800 /* Create interleaving stmt:
4801 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4802 ...}> */
4803 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4804 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4805 vect2, perm_mask_high);
4806 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4807 (*result_chain)[2*j] = high;
4809 /* Create interleaving stmt:
4810 low = VEC_PERM_EXPR <vect1, vect2,
4811 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4812 ...}> */
4813 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4814 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4815 vect2, perm_mask_low);
4816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4817 (*result_chain)[2*j+1] = low;
4819 memcpy (dr_chain.address (), result_chain->address (),
4820 length * sizeof (tree));
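
/* A standalone sketch (not compiled here) that simulates the two interleave
   stages documented before vect_permute_store_chain on LENGTH == 4 chains
   of 8 short elements, using plain arrays in place of vector operands.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN 4

/* Interleave the halves of V1 and V2 starting at element BASE:
   { v1[base], v2[base], v1[base + 1], v2[base + 1], ... }.  */
static void
interleave (const short *v1, const short *v2, unsigned int base, short *out)
{
  for (unsigned int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = v1[base + i];
      out[2 * i + 1] = v2[base + i];
    }
}

int
main (void)
{
  short chain[LEN][NELT], result[LEN][NELT];
  for (int j = 0; j < LEN; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int stage = 0; stage < 2; stage++)	/* log2 (LEN) stages.  */
    {
      for (int j = 0; j < LEN / 2; j++)
	{
	  interleave (chain[j], chain[j + LEN / 2], 0, result[2 * j]);
	  interleave (chain[j], chain[j + LEN / 2], NELT / 2,
		      result[2 * j + 1]);
	}
      memcpy (chain, result, sizeof (chain));
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", chain[j][i]);
      printf ("\n");	/* 0 8 16 24 1 9 17 25, then 2 10 18 26 ..., etc.  */
    }
  return 0;
}
#endif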
4825 /* Function vect_setup_realignment
4827 This function is called when vectorizing an unaligned load using
4828 the dr_explicit_realign[_optimized] scheme.
4829 This function generates the following code at the loop prolog:
4831 p = initial_addr;
4832 x msq_init = *(floor(p)); # prolog load
4833 realignment_token = call target_builtin;
4834 loop:
4835 x msq = phi (msq_init, ---)
4837 The stmts marked with x are generated only for the case of
4838 dr_explicit_realign_optimized.
4840 The code above sets up a new (vector) pointer, pointing to the first
4841 location accessed by STMT, and a "floor-aligned" load using that pointer.
4842 It also generates code to compute the "realignment-token" (if the relevant
4843 target hook was defined), and creates a phi-node at the loop-header bb
4844 whose arguments are the result of the prolog-load (created by this
4845 function) and the result of a load that takes place in the loop (to be
4846 created by the caller to this function).
4848 For the case of dr_explicit_realign_optimized:
4849 The caller to this function uses the phi-result (msq) to create the
4850 realignment code inside the loop, and sets up the missing phi argument,
4851 as follows:
4852 loop:
4853 msq = phi (msq_init, lsq)
4854 lsq = *(floor(p')); # load in loop
4855 result = realign_load (msq, lsq, realignment_token);
4857 For the case of dr_explicit_realign:
4858 loop:
4859 msq = *(floor(p)); # load in loop
4860 p' = p + (VS-1);
4861 lsq = *(floor(p')); # load in loop
4862 result = realign_load (msq, lsq, realignment_token);
4864 Input:
4865 STMT - (scalar) load stmt to be vectorized. This load accesses
4866 a memory location that may be unaligned.
4867 BSI - place where new code is to be inserted.
4868 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4869 is used.
4871 Output:
4872 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4873 target hook, if defined.
4874 Return value - the result of the loop-header phi node. */
4876 tree
4877 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4878 tree *realignment_token,
4879 enum dr_alignment_support alignment_support_scheme,
4880 tree init_addr,
4881 struct loop **at_loop)
4883 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4885 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4886 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4887 struct loop *loop = NULL;
4888 edge pe = NULL;
4889 tree scalar_dest = gimple_assign_lhs (stmt);
4890 tree vec_dest;
4891 gimple *inc;
4892 tree ptr;
4893 tree data_ref;
4894 basic_block new_bb;
4895 tree msq_init = NULL_TREE;
4896 tree new_temp;
4897 gphi *phi_stmt;
4898 tree msq = NULL_TREE;
4899 gimple_seq stmts = NULL;
4900 bool inv_p;
4901 bool compute_in_loop = false;
4902 bool nested_in_vect_loop = false;
4903 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4904 struct loop *loop_for_initial_load = NULL;
4906 if (loop_vinfo)
4908 loop = LOOP_VINFO_LOOP (loop_vinfo);
4909 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4912 gcc_assert (alignment_support_scheme == dr_explicit_realign
4913 || alignment_support_scheme == dr_explicit_realign_optimized);
4915 /* We need to generate three things:
4916 1. the misalignment computation
4917 2. the extra vector load (for the optimized realignment scheme).
4918 3. the phi node for the two vectors from which the realignment is
4919 done (for the optimized realignment scheme). */
4921 /* 1. Determine where to generate the misalignment computation.
4923 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4924 calculation will be generated by this function, outside the loop (in the
4925 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4926 caller, inside the loop.
4928 Background: If the misalignment remains fixed throughout the iterations of
4929 the loop, then both realignment schemes are applicable, and also the
4930 misalignment computation can be done outside LOOP. This is because we are
4931 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4932 are a multiple of VS (the Vector Size), and therefore the misalignment in
4933 different vectorized LOOP iterations is always the same.
4934 The problem arises only if the memory access is in an inner-loop nested
4935 inside LOOP, which is now being vectorized using outer-loop vectorization.
4936 This is the only case when the misalignment of the memory access may not
4937 remain fixed throughout the iterations of the inner-loop (as explained in
4938 detail in vect_supportable_dr_alignment). In this case, not only is the
4939 optimized realignment scheme not applicable, but also the misalignment
4940 computation (and generation of the realignment token that is passed to
4941 REALIGN_LOAD) have to be done inside the loop.
4943 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4944 or not, which in turn determines if the misalignment is computed inside
4945 the inner-loop, or outside LOOP. */
4947 if (init_addr != NULL_TREE || !loop_vinfo)
4949 compute_in_loop = true;
4950 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4954 /* 2. Determine where to generate the extra vector load.
4956 For the optimized realignment scheme, instead of generating two vector
4957 loads in each iteration, we generate a single extra vector load in the
4958 preheader of the loop, and in each iteration reuse the result of the
4959 vector load from the previous iteration. In case the memory access is in
4960 an inner-loop nested inside LOOP, which is now being vectorized using
4961 outer-loop vectorization, we need to determine whether this initial vector
4962 load should be generated at the preheader of the inner-loop, or can be
4963 generated at the preheader of LOOP. If the memory access has no evolution
4964 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4965 to be generated inside LOOP (in the preheader of the inner-loop). */
4967 if (nested_in_vect_loop)
4969 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4970 bool invariant_in_outerloop =
4971 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4972 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4974 else
4975 loop_for_initial_load = loop;
4976 if (at_loop)
4977 *at_loop = loop_for_initial_load;
4979 if (loop_for_initial_load)
4980 pe = loop_preheader_edge (loop_for_initial_load);
4982 /* 3. For the case of the optimized realignment, create the first vector
4983 load at the loop preheader. */
4985 if (alignment_support_scheme == dr_explicit_realign_optimized)
4987 /* Create msq_init = *(floor(p1)) in the loop preheader */
4988 gassign *new_stmt;
4990 gcc_assert (!compute_in_loop);
4991 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4992 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4993 NULL_TREE, &init_addr, NULL, &inc,
4994 true, &inv_p);
4995 if (TREE_CODE (ptr) == SSA_NAME)
4996 new_temp = copy_ssa_name (ptr);
4997 else
4998 new_temp = make_ssa_name (TREE_TYPE (ptr));
4999 new_stmt = gimple_build_assign
5000 (new_temp, BIT_AND_EXPR, ptr,
5001 build_int_cst (TREE_TYPE (ptr),
5002 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5003 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5004 gcc_assert (!new_bb);
5005 data_ref
5006 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5007 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5008 new_stmt = gimple_build_assign (vec_dest, data_ref);
5009 new_temp = make_ssa_name (vec_dest, new_stmt);
5010 gimple_assign_set_lhs (new_stmt, new_temp);
5011 if (pe)
5013 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5014 gcc_assert (!new_bb);
5016 else
5017 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5019 msq_init = gimple_assign_lhs (new_stmt);
5022 /* 4. Create realignment token using a target builtin, if available.
5023 It is done either inside the containing loop, or before LOOP (as
5024 determined above). */
5026 if (targetm.vectorize.builtin_mask_for_load)
5028 gcall *new_stmt;
5029 tree builtin_decl;
5031 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5032 if (!init_addr)
5034 /* Generate the INIT_ADDR computation outside LOOP. */
5035 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5036 NULL_TREE);
5037 if (loop)
5039 pe = loop_preheader_edge (loop);
5040 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5041 gcc_assert (!new_bb);
5043 else
5044 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5047 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5048 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5049 vec_dest =
5050 vect_create_destination_var (scalar_dest,
5051 gimple_call_return_type (new_stmt));
5052 new_temp = make_ssa_name (vec_dest, new_stmt);
5053 gimple_call_set_lhs (new_stmt, new_temp);
5055 if (compute_in_loop)
5056 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5057 else
5059 /* Generate the misalignment computation outside LOOP. */
5060 pe = loop_preheader_edge (loop);
5061 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5062 gcc_assert (!new_bb);
5065 *realignment_token = gimple_call_lhs (new_stmt);
5067 /* The result of the CALL_EXPR to this builtin is determined from
5068 the value of the parameter and no global variables are touched
5069 which makes the builtin a "const" function. Requiring the
5070 builtin to have the "const" attribute makes it unnecessary
5071 to call mark_call_clobbered. */
5072 gcc_assert (TREE_READONLY (builtin_decl));
5075 if (alignment_support_scheme == dr_explicit_realign)
5076 return msq;
5078 gcc_assert (!compute_in_loop);
5079 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5082 /* 5. Create msq = phi <msq_init, lsq> in loop */
5084 pe = loop_preheader_edge (containing_loop);
5085 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5086 msq = make_ssa_name (vec_dest);
5087 phi_stmt = create_phi_node (msq, containing_loop->header);
5088 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5090 return msq;
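
/* A standalone sketch of the effect of the realignment scheme set up above,
   using scalar code on 4-element "vectors" of ints: the two aligned loads
   bracketing the misaligned address are combined according to the
   misalignment, which is what the target's realign_load does with the mask
   produced by builtin_mask_for_load.  Hypothetical and not compiled as part
   of this file.  */
#if 0
#include <stdint.h>

#define VS 4	/* Elements per vector.  */

static void
realign_load_sketch (const int *p /* possibly misaligned */, int *out)
{
  uintptr_t addr = (uintptr_t) p;
  const int *msq = (const int *) (addr & -(uintptr_t) (VS * sizeof (int)));
  const int *lsq = msq + VS;		     /* floor (p + VS - 1) when p is
						actually misaligned.  */
  unsigned int mis = (addr / sizeof (int)) % VS;   /* Realignment token.  */
  for (unsigned int i = 0; i < VS; i++)
    out[i] = i + mis < VS ? msq[i + mis] : lsq[i + mis - VS];
}
#endif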
5094 /* Function vect_grouped_load_supported.
5096 COUNT is the size of the load group (the number of statements plus the
5097 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5098 only one statement, with a gap of COUNT - 1.
5100 Returns true if a suitable permute exists. */
5102 bool
5103 vect_grouped_load_supported (tree vectype, bool single_element_p,
5104 unsigned HOST_WIDE_INT count)
5106 machine_mode mode = TYPE_MODE (vectype);
5108 /* If this is single-element interleaving with an element distance
5109 that leaves unused vector loads around, punt - we would at least create
5110 very sub-optimal code in that case (and blow up memory,
5111 see PR65518). */
5112 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 "single-element interleaving not supported "
5117 "for not adjacent vector loads\n");
5118 return false;
5121 /* vect_permute_load_chain requires the group size to be equal to 3 or
5122 be a power of two. */
5123 if (count != 3 && exact_log2 (count) == -1)
5125 if (dump_enabled_p ())
5126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5127 "the size of the group of accesses"
5128 " is not a power of 2 or not equal to 3\n");
5129 return false;
5132 /* Check that the permutation is supported. */
5133 if (VECTOR_MODE_P (mode))
5135 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5136 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5138 if (count == 3)
5140 unsigned int k;
5141 for (k = 0; k < 3; k++)
5143 for (i = 0; i < nelt; i++)
5144 if (3 * i + k < 2 * nelt)
5145 sel[i] = 3 * i + k;
5146 else
5147 sel[i] = 0;
5148 if (!can_vec_perm_p (mode, false, sel))
5150 if (dump_enabled_p ())
5151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5152 "shuffle of 3 loads is not supported by"
5153 " target\n");
5154 return false;
5156 for (i = 0, j = 0; i < nelt; i++)
5157 if (3 * i + k < 2 * nelt)
5158 sel[i] = i;
5159 else
5160 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5161 if (!can_vec_perm_p (mode, false, sel))
5163 if (dump_enabled_p ())
5164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165 "shuffle of 3 loads is not supported by"
5166 " target\n");
5167 return false;
5170 return true;
5172 else
5174 /* If length is not equal to 3 then only power of 2 is supported. */
5175 gcc_assert (pow2p_hwi (count));
5176 for (i = 0; i < nelt; i++)
5177 sel[i] = i * 2;
5178 if (can_vec_perm_p (mode, false, sel))
5180 for (i = 0; i < nelt; i++)
5181 sel[i] = i * 2 + 1;
5182 if (can_vec_perm_p (mode, false, sel))
5183 return true;
5188 if (dump_enabled_p ())
5189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5190 "extract even/odd not supported by target\n");
5191 return false;
5194 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5195 type VECTYPE. */
5197 bool
5198 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5200 return vect_lanes_optab_supported_p ("vec_load_lanes",
5201 vec_load_lanes_optab,
5202 vectype, count);
5205 /* Function vect_permute_load_chain.
5207 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5208 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5209 the input data correctly. Return the final references for loads in
5210 RESULT_CHAIN.
5212 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5213 The input is 4 vectors each containing 8 elements. We assign a number to each
5214 element, the input sequence is:
5216 1st vec: 0 1 2 3 4 5 6 7
5217 2nd vec: 8 9 10 11 12 13 14 15
5218 3rd vec: 16 17 18 19 20 21 22 23
5219 4th vec: 24 25 26 27 28 29 30 31
5221 The output sequence should be:
5223 1st vec: 0 4 8 12 16 20 24 28
5224 2nd vec: 1 5 9 13 17 21 25 29
5225 3rd vec: 2 6 10 14 18 22 26 30
5226 4th vec: 3 7 11 15 19 23 27 31
5228 i.e., the first output vector should contain the first elements of each
5229 interleaving group, etc.
5231 We use extract_even/odd instructions to create such output. The input of
5232 each extract_even/odd operation is two vectors
5233 1st vec 2nd vec
5234 0 1 2 3 4 5 6 7
5236 and the output is the vector of extracted even/odd elements. The output of
5237 extract_even will be: 0 2 4 6
5238 and of extract_odd: 1 3 5 7
5241 The permutation is done in log LENGTH stages. In each stage extract_even
5242 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5243 their order. In our example,
5245 E1: extract_even (1st vec, 2nd vec)
5246 E2: extract_odd (1st vec, 2nd vec)
5247 E3: extract_even (3rd vec, 4th vec)
5248 E4: extract_odd (3rd vec, 4th vec)
5250 The output for the first stage will be:
5252 E1: 0 2 4 6 8 10 12 14
5253 E2: 1 3 5 7 9 11 13 15
5254 E3: 16 18 20 22 24 26 28 30
5255 E4: 17 19 21 23 25 27 29 31
5257 In order to proceed and create the correct sequence for the next stage (or
5258 for the correct output, if the second stage is the last one, as in our
5259 example), we first put the output of extract_even operation and then the
5260 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5261 The input for the second stage is:
5263 1st vec (E1): 0 2 4 6 8 10 12 14
5264 2nd vec (E3): 16 18 20 22 24 26 28 30
5265 3rd vec (E2): 1 3 5 7 9 11 13 15
5266 4th vec (E4): 17 19 21 23 25 27 29 31
5268 The output of the second stage:
5270 E1: 0 4 8 12 16 20 24 28
5271 E2: 2 6 10 14 18 22 26 30
5272 E3: 1 5 9 13 17 21 25 29
5273 E4: 3 7 11 15 19 23 27 31
5275 And RESULT_CHAIN after reordering:
5277 1st vec (E1): 0 4 8 12 16 20 24 28
5278 2nd vec (E3): 1 5 9 13 17 21 25 29
5279 3rd vec (E2): 2 6 10 14 18 22 26 30
5280 4th vec (E4): 3 7 11 15 19 23 27 31. */
5282 static void
5283 vect_permute_load_chain (vec<tree> dr_chain,
5284 unsigned int length,
5285 gimple *stmt,
5286 gimple_stmt_iterator *gsi,
5287 vec<tree> *result_chain)
5289 tree data_ref, first_vect, second_vect;
5290 tree perm_mask_even, perm_mask_odd;
5291 tree perm3_mask_low, perm3_mask_high;
5292 gimple *perm_stmt;
5293 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5294 unsigned int i, j, log_length = exact_log2 (length);
5295 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5296 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5298 result_chain->quick_grow (length);
5299 memcpy (result_chain->address (), dr_chain.address (),
5300 length * sizeof (tree));
5302 if (length == 3)
5304 unsigned int k;
5306 for (k = 0; k < 3; k++)
5308 for (i = 0; i < nelt; i++)
5309 if (3 * i + k < 2 * nelt)
5310 sel[i] = 3 * i + k;
5311 else
5312 sel[i] = 0;
5313 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5315 for (i = 0, j = 0; i < nelt; i++)
5316 if (3 * i + k < 2 * nelt)
5317 sel[i] = i;
5318 else
5319 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5321 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5323 first_vect = dr_chain[0];
5324 second_vect = dr_chain[1];
5326 /* Create interleaving stmt (low part of):
5327 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5328 ...}> */
5329 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5330 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5331 second_vect, perm3_mask_low);
5332 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5334 /* Create interleaving stmt (high part of):
5335 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5336 ...}> */
5337 first_vect = data_ref;
5338 second_vect = dr_chain[2];
5339 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5340 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5341 second_vect, perm3_mask_high);
5342 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5343 (*result_chain)[k] = data_ref;
5346 else
5348 /* If length is not equal to 3 then only power of 2 is supported. */
5349 gcc_assert (pow2p_hwi (length));
5351 for (i = 0; i < nelt; ++i)
5352 sel[i] = i * 2;
5353 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5355 for (i = 0; i < nelt; ++i)
5356 sel[i] = i * 2 + 1;
5357 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5359 for (i = 0; i < log_length; i++)
5361 for (j = 0; j < length; j += 2)
5363 first_vect = dr_chain[j];
5364 second_vect = dr_chain[j+1];
5366 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5367 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5368 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5369 first_vect, second_vect,
5370 perm_mask_even);
5371 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5372 (*result_chain)[j/2] = data_ref;
5374 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5375 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5376 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5377 first_vect, second_vect,
5378 perm_mask_odd);
5379 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380 (*result_chain)[j/2+length/2] = data_ref;
5382 memcpy (dr_chain.address (), result_chain->address (),
5383 length * sizeof (tree));
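/* A standalone sketch (not part of GCC) of the extract-even/extract-odd
   network documented above, modelling each vector as an array of NELT ints.
   NELT, LENGTH, extract_even_odd and the driver below are illustrative names
   only; like the non-3 case of the function itself, the sketch assumes
   LENGTH is a power of two.  */

#include <stdio.h>
#include <string.h>

#define NELT 8		/* elements per "vector" */
#define LENGTH 4	/* vectors in the load chain */

static void
extract_even_odd (const int *a, const int *b, int *even, int *odd)
{
  for (int i = 0; i < NELT; i++)
    {
      even[i] = 2 * i < NELT ? a[2 * i] : b[2 * i - NELT];
      odd[i] = 2 * i + 1 < NELT ? a[2 * i + 1] : b[2 * i + 1 - NELT];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  for (int v = 0; v < LENGTH; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;	/* elements 0, 1, 2, ... as above */

  /* log2 (LENGTH) stages; evens land in the first half of the result chain,
     odds in the second half, as in vect_permute_load_chain.  */
  for (int len = LENGTH; len > 1; len /= 2)
    {
      for (int j = 0; j < LENGTH; j += 2)
	extract_even_odd (chain[j], chain[j + 1],
			  result[j / 2], result[j / 2 + LENGTH / 2]);
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LENGTH; v++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", result[v][i]);
      printf ("\n");	/* 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...  */
    }
  return 0;
}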
5388 /* Function vect_shift_permute_load_chain.
5390 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5391 sequence of stmts to reorder the input data accordingly.
5392 Return the final references for loads in RESULT_CHAIN.
5393 Return true if successful, false otherwise.
5395 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5396 The input is 3 vectors each containing 8 elements. We assign a
5397 number to each element, the input sequence is:
5399 1st vec: 0 1 2 3 4 5 6 7
5400 2nd vec: 8 9 10 11 12 13 14 15
5401 3rd vec: 16 17 18 19 20 21 22 23
5403 The output sequence should be:
5405 1st vec: 0 3 6 9 12 15 18 21
5406 2nd vec: 1 4 7 10 13 16 19 22
5407 3rd vec: 2 5 8 11 14 17 20 23
5409 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5411 First we shuffle all 3 vectors to get the correct element order:
5413 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5414 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5415 3rd vec: (16 19 22) (17 20 23) (18 21)
5417 Next we unite and shift the vectors 3 times:
5419 1st step:
5420 shift right by 6 the concatenation of:
5421 "1st vec" and "2nd vec"
5422 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5423 "2nd vec" and "3rd vec"
5424 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5425 "3rd vec" and "1st vec"
5426 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5427 | New vectors |
5429 So the new vectors are now:
5431 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5432 2nd vec: (10 13) (16 19 22) (17 20 23)
5433 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5435 2nd step:
5436 shift right by 5 the concatenation of:
5437 "1st vec" and "3rd vec"
5438 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5439 "2nd vec" and "1st vec"
5440 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5441 "3rd vec" and "2nd vec"
5442 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5443 | New vectors |
5445 So the new vectors are now:
5447 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5448 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5449 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5451 3rd step:
5452 shift right by 5 the concatenation of:
5453 "1st vec" and "1st vec"
5454 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5455 shift right by 3 the concatenation of:
5456 "2nd vec" and "2nd vec"
5457 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5458 | New vectors |
5460 So now all vectors are READY:
5461 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5462 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5463 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5465 This algorithm is faster than the one in vect_permute_load_chain if:
5466 1. "shift of a concatenation" is faster than general permutation.
5467 This is usually so.
5468 2. The TARGET machine can't execute vector instructions in parallel.
5469 This is because each step of the algorithm depends on the previous one.
5470 The algorithm in vect_permute_load_chain is much more parallel.
5472 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5475 static bool
5476 vect_shift_permute_load_chain (vec<tree> dr_chain,
5477 unsigned int length,
5478 gimple *stmt,
5479 gimple_stmt_iterator *gsi,
5480 vec<tree> *result_chain)
5482 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5483 tree perm2_mask1, perm2_mask2, perm3_mask;
5484 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5485 gimple *perm_stmt;
5487 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5488 unsigned int i;
5489 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5490 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5491 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5492 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5494 result_chain->quick_grow (length);
5495 memcpy (result_chain->address (), dr_chain.address (),
5496 length * sizeof (tree));
5498 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5500 unsigned int j, log_length = exact_log2 (length);
5501 for (i = 0; i < nelt / 2; ++i)
5502 sel[i] = i * 2;
5503 for (i = 0; i < nelt / 2; ++i)
5504 sel[nelt / 2 + i] = i * 2 + 1;
5505 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5509 "shuffle of 2 fields structure is not \
5510 supported by target\n");
5511 return false;
5513 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5515 for (i = 0; i < nelt / 2; ++i)
5516 sel[i] = i * 2 + 1;
5517 for (i = 0; i < nelt / 2; ++i)
5518 sel[nelt / 2 + i] = i * 2;
5519 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shuffle of 2 fields structure is not \
5524 supported by target\n");
5525 return false;
5527 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = nelt / 2 + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5540 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5542 /* Generating permutation constant to select vector from 2.
5543 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5544 for (i = 0; i < nelt / 2; i++)
5545 sel[i] = i;
5546 for (i = nelt / 2; i < nelt; i++)
5547 sel[i] = nelt + i;
5548 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "select is not supported by target\n");
5553 return false;
5555 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5557 for (i = 0; i < log_length; i++)
5559 for (j = 0; j < length; j += 2)
5561 first_vect = dr_chain[j];
5562 second_vect = dr_chain[j + 1];
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 first_vect, first_vect,
5567 perm2_mask1);
5568 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5569 vect[0] = data_ref;
5571 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5572 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5573 second_vect, second_vect,
5574 perm2_mask2);
5575 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5576 vect[1] = data_ref;
5578 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5579 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5580 vect[0], vect[1], shift1_mask);
5581 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5582 (*result_chain)[j/2 + length/2] = data_ref;
5584 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5585 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5586 vect[0], vect[1], select_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 (*result_chain)[j/2] = data_ref;
5590 memcpy (dr_chain.address (), result_chain->address (),
5591 length * sizeof (tree));
5593 return true;
5595 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5597 unsigned int k = 0, l = 0;
5599 /* Generating permutation constant to get all elements in the right order.
5600 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5601 for (i = 0; i < nelt; i++)
5603 if (3 * k + (l % 3) >= nelt)
5605 k = 0;
5606 l += (3 - (nelt % 3));
5608 sel[i] = 3 * k + (l % 3);
5609 k++;
5611 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5613 if (dump_enabled_p ())
5614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615 "shuffle of 3 fields structure is not \
5616 supported by target\n");
5617 return false;
5619 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5621 /* Generating permutation constant to shift all elements.
5622 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5623 for (i = 0; i < nelt; i++)
5624 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5625 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5627 if (dump_enabled_p ())
5628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5629 "shift permutation is not supported by target\n");
5630 return false;
5632 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5634 /* Generating permutation constant to shift all elements.
5635 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5636 for (i = 0; i < nelt; i++)
5637 sel[i] = 2 * (nelt / 3) + 1 + i;
5638 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "shift permutation is not supported by target\n");
5643 return false;
5645 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5647 /* Generating permutation constant to shift all elements.
5648 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5649 for (i = 0; i < nelt; i++)
5650 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5651 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5653 if (dump_enabled_p ())
5654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5655 "shift permutation is not supported by target\n");
5656 return false;
5658 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5660 /* Generating permutation constant to shift all elements.
5661 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5662 for (i = 0; i < nelt; i++)
5663 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5664 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "shift permutation is not supported by target\n");
5669 return false;
5671 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5673 for (k = 0; k < 3; k++)
5675 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5676 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5677 dr_chain[k], dr_chain[k],
5678 perm3_mask);
5679 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5680 vect[k] = data_ref;
5683 for (k = 0; k < 3; k++)
5685 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5686 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5687 vect[k % 3], vect[(k + 1) % 3],
5688 shift1_mask);
5689 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690 vect_shift[k] = data_ref;
5693 for (k = 0; k < 3; k++)
5695 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5696 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5697 vect_shift[(4 - k) % 3],
5698 vect_shift[(3 - k) % 3],
5699 shift2_mask);
5700 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5701 vect[k] = data_ref;
5704 (*result_chain)[3 - (nelt % 3)] = vect[2];
5706 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5707 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5708 vect[0], shift3_mask);
5709 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5710 (*result_chain)[nelt % 3] = data_ref;
5712 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5713 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5714 vect[1], shift4_mask);
5715 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5716 (*result_chain)[0] = data_ref;
5717 return true;
5719 return false;
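/* A standalone scalar model (not GCC code) of the length-3 shift permutation
   above for eight-element vectors: a plain two-input permute stands in for
   VEC_PERM_EXPR, and the masks are the ones vect_shift_permute_load_chain
   computes for nelt == 8.  All names below are illustrative.  */

#include <stdio.h>

#define NELT 8

/* Two-input permute: out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT].  */
static void
vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT], result[3][NELT];
  int perm3[NELT] = { 0, 3, 6, 1, 4, 7, 2, 5 };
  int shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];

  for (int i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;	/* {6 .. 13} */
      shift2[i] = 2 * (NELT / 3) + 1 + i;		/* {5 .. 12} */
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;	/* {3 .. 10} */
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;	/* {5 .. 12} */
    }

  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      dr_chain[k][i] = k * NELT + i;	/* elements 0 .. 23 as above */

  for (int k = 0; k < 3; k++)		/* in-vector shuffle */
    vec_perm (dr_chain[k], dr_chain[k], perm3, vect[k]);
  for (int k = 0; k < 3; k++)		/* 1st shift step */
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, vect_shift[k]);
  for (int k = 0; k < 3; k++)		/* 2nd shift step */
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2,
	      vect[k]);

  /* Final placement; vect[2] is already in the right order.  */
  for (int i = 0; i < NELT; i++)
    result[3 - NELT % 3][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", result[k][i]);
      printf ("\n");	/* 0 3 6 .. 21 / 1 4 7 .. 22 / 2 5 8 .. 23 */
    }
  return 0;
}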
5722 /* Function vect_transform_grouped_load.
5724 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5725 to perform their permutation and assign the resulting vectorized statements
5726 to the scalar statements.
5729 void
5730 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5731 gimple_stmt_iterator *gsi)
5733 machine_mode mode;
5734 vec<tree> result_chain = vNULL;
5736 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5737 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5738 vectors that are ready for vector computation. */
5739 result_chain.create (size);
5741 /* If the reassociation width for the vector type is 2 or greater, the target
5742 machine can execute 2 or more vector instructions in parallel.  Otherwise try
5743 to get the chain for the load group using vect_shift_permute_load_chain. */
5744 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5745 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5746 || pow2p_hwi (size)
5747 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5748 gsi, &result_chain))
5749 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5750 vect_record_grouped_load_vectors (stmt, result_chain);
5751 result_chain.release ();
5754 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5755 generated as part of the vectorization of STMT. Assign the statement
5756 for each vector to the associated scalar statement. */
5758 void
5759 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5761 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5762 gimple *next_stmt, *new_stmt;
5763 unsigned int i, gap_count;
5764 tree tmp_data_ref;
5766 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5767 Since we scan the chain starting from its first node, their order
5768 corresponds to the order of data-refs in RESULT_CHAIN. */
5769 next_stmt = first_stmt;
5770 gap_count = 1;
5771 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5773 if (!next_stmt)
5774 break;
5776 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5777 code elimination pass later.  No need to check for the first stmt in
5778 the group, since it always exists.
5779 GROUP_GAP is the number of steps in elements from the previous
5780 access (if there is no gap GROUP_GAP is 1). We skip loads that
5781 correspond to the gaps. */
5782 if (next_stmt != first_stmt
5783 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5785 gap_count++;
5786 continue;
5789 while (next_stmt)
5791 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5792 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5793 copies, and we put the new vector statement in the first available
5794 RELATED_STMT. */
5795 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5796 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5797 else
5799 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5801 gimple *prev_stmt =
5802 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5803 gimple *rel_stmt =
5804 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5805 while (rel_stmt)
5807 prev_stmt = rel_stmt;
5808 rel_stmt =
5809 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5812 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5813 new_stmt;
5817 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5818 gap_count = 1;
5819 /* If NEXT_STMT accesses the same DR as the previous statement,
5820 put the same TMP_DATA_REF as its vectorized statement; otherwise
5821 get the next data-ref from RESULT_CHAIN. */
5822 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5823 break;
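/* A minimal standalone model (not GCC code) of just the gap handling in
   vect_record_grouped_load_vectors above: group_gap[m] plays the role of
   GROUP_GAP for the m-th group member, and vectors whose position falls
   inside a gap are simply left unassigned (they correspond to the loads
   later removed by dead code elimination).  The group layout is made up
   for illustration.  */

#include <stdio.h>

int
main (void)
{
  /* Three group members; the second is three elements after the first,
     so two permuted vectors in between belong to the gap.  */
  int group_gap[] = { 1, 3, 1 };
  int n_members = 3, n_vectors = 5;

  int member = 0, gap_count = 1;
  for (int v = 0; v < n_vectors && member < n_members; v++)
    {
      if (member != 0 && gap_count < group_gap[member])
	{
	  gap_count++;		/* vector V lies in a gap: skip it */
	  continue;
	}
      printf ("vector %d -> group member %d\n", v, member);
      member++;
      gap_count = 1;
    }
  return 0;
}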
5828 /* Function vect_can_force_dr_alignment_p.
5830 Returns whether the alignment of a DECL can be forced to be aligned
5831 on an ALIGNMENT-bit boundary. */
5833 bool
5834 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5836 if (!VAR_P (decl))
5837 return false;
5839 if (decl_in_symtab_p (decl)
5840 && !symtab_node::get (decl)->can_increase_alignment_p ())
5841 return false;
5843 if (TREE_STATIC (decl))
5844 return (alignment <= MAX_OFILE_ALIGNMENT);
5845 else
5846 return (alignment <= MAX_STACK_ALIGNMENT);
5850 /* Return whether the data reference DR is supported with respect to its
5851 alignment.
5852 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5853 if it is aligned, i.e., check if it is possible to vectorize it with different
5854 alignment. */
5856 enum dr_alignment_support
5857 vect_supportable_dr_alignment (struct data_reference *dr,
5858 bool check_aligned_accesses)
5860 gimple *stmt = DR_STMT (dr);
5861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5863 machine_mode mode = TYPE_MODE (vectype);
5864 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5865 struct loop *vect_loop = NULL;
5866 bool nested_in_vect_loop = false;
5868 if (aligned_access_p (dr) && !check_aligned_accesses)
5869 return dr_aligned;
5871 /* For now assume all conditional loads/stores support unaligned
5872 access without any special code. */
5873 if (is_gimple_call (stmt)
5874 && gimple_call_internal_p (stmt)
5875 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5876 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5877 return dr_unaligned_supported;
5879 if (loop_vinfo)
5881 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5882 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5885 /* Possibly unaligned access. */
5887 /* We can choose between using the implicit realignment scheme (generating
5888 a misaligned_move stmt) and the explicit realignment scheme (generating
5889 aligned loads with a REALIGN_LOAD). There are two variants to the
5890 explicit realignment scheme: optimized, and unoptimized.
5891 We can optimize the realignment only if the step between consecutive
5892 vector loads is equal to the vector size. Since the vector memory
5893 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5894 is guaranteed that the misalignment amount remains the same throughout the
5895 execution of the vectorized loop. Therefore, we can create the
5896 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5897 at the loop preheader.
5899 However, in the case of outer-loop vectorization, when vectorizing a
5900 memory access in the inner-loop nested within the LOOP that is now being
5901 vectorized, while it is guaranteed that the misalignment of the
5902 vectorized memory access will remain the same in different outer-loop
5903 iterations, it is *not* guaranteed that it will remain the same throughout
5904 the execution of the inner-loop. This is because the inner-loop advances
5905 with the original scalar step (and not in steps of VS). If the inner-loop
5906 step happens to be a multiple of VS, then the misalignment remains fixed
5907 and we can use the optimized realignment scheme. For example:
5909 for (i=0; i<N; i++)
5910 for (j=0; j<M; j++)
5911 s += a[i+j];
5913 When vectorizing the i-loop in the above example, the step between
5914 consecutive vector loads is 1, and so the misalignment does not remain
5915 fixed across the execution of the inner-loop, and the realignment cannot
5916 be optimized (as illustrated in the following pseudo vectorized loop):
5918 for (i=0; i<N; i+=4)
5919 for (j=0; j<M; j++){
5920 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5921 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5922 // (assuming that we start from an aligned address).
5925 We therefore have to use the unoptimized realignment scheme:
5927 for (i=0; i<N; i+=4)
5928 for (j=k; j<M; j+=4)
5929 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5930 // that the misalignment of the initial address is
5931 // 0).
5933 The loop can then be vectorized as follows:
5935 for (k=0; k<4; k++){
5936 rt = get_realignment_token (&vp[k]);
5937 for (i=0; i<N; i+=4){
5938 v1 = vp[i+k];
5939 for (j=k; j<M; j+=4){
5940 v2 = vp[i+j+VS-1];
5941 va = REALIGN_LOAD <v1,v2,rt>;
5942 vs += va;
5943 v1 = v2;
5946 } */
5948 if (DR_IS_READ (dr))
5950 bool is_packed = false;
5951 tree type = (TREE_TYPE (DR_REF (dr)));
5953 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5954 && (!targetm.vectorize.builtin_mask_for_load
5955 || targetm.vectorize.builtin_mask_for_load ()))
5957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5959 /* If we are doing SLP then the accesses need not have the
5960 same alignment, instead it depends on the SLP group size. */
5961 if (loop_vinfo
5962 && STMT_SLP_TYPE (stmt_info)
5963 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5964 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5965 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5967 else if (!loop_vinfo
5968 || (nested_in_vect_loop
5969 && (TREE_INT_CST_LOW (DR_STEP (dr))
5970 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5971 return dr_explicit_realign;
5972 else
5973 return dr_explicit_realign_optimized;
5975 if (!known_alignment_for_access_p (dr))
5976 is_packed = not_size_aligned (DR_REF (dr));
5978 if (targetm.vectorize.support_vector_misalignment
5979 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5980 /* Can't software pipeline the loads, but can at least do them. */
5981 return dr_unaligned_supported;
5983 else
5985 bool is_packed = false;
5986 tree type = (TREE_TYPE (DR_REF (dr)));
5988 if (!known_alignment_for_access_p (dr))
5989 is_packed = not_size_aligned (DR_REF (dr));
5991 if (targetm.vectorize.support_vector_misalignment
5992 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5993 return dr_unaligned_supported;
5996 /* Unsupported. */
5997 return dr_unaligned_unsupported;
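/* A standalone scalar model (not GCC code) of the explicit realignment
   scheme discussed in vect_supportable_dr_alignment above: the misaligned
   vector starting at OFS is rebuilt from two aligned loads and a
   realignment token, which here is simply the element misalignment
   OFS % VS rather than a target permutation mask.  Names and sizes are
   illustrative.  */

#include <stdio.h>

#define VS 4	/* elements per vector */

/* va = REALIGN_LOAD <v1, v2, rt>: the last VS - rt elements of v1 followed
   by the first rt elements of v2.  */
static void
realign_load (const int *v1, const int *v2, int rt, int *va)
{
  for (int i = 0; i < VS; i++)
    va[i] = i + rt < VS ? v1[i + rt] : v2[i + rt - VS];
}

int
main (void)
{
  int a[16];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  int ofs = 3;			/* misaligned start */
  int rt = ofs % VS;		/* realignment token */
  const int *p = a + ofs - rt;	/* aligned address at or below a + ofs */

  int va[VS];
  realign_load (p, p + VS, rt, va);
  for (int i = 0; i < VS; i++)
    printf (" %d", va[i]);	/* prints 3 4 5 6 */
  printf ("\n");
  return 0;
}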