gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode;
  65   scalar_int_mode array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   limit_p = !targetm.array_mode_supported_p (mode, count);
  70   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  71                           limit_p).exists (&array_mode))
  72     {
  73       if (dump_enabled_p ())
  74         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  75                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  76                          GET_MODE_NAME (mode), count);
  77       return false;
  78     }
  79
  80   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  81     {
  82       if (dump_enabled_p ())
  83         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  84                          "cannot use %s<%s><%s>\n", name,
  85                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  86       return false;
  87     }
  88
  89   if (dump_enabled_p ())
  90     dump_printf_loc (MSG_NOTE, vect_location,
  91                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  92                      GET_MODE_NAME (mode));
  93
  94   return true;
  95 }
  96
  97
  98 /* Return the smallest scalar part of STMT.
  99    This is used to determine the vectype of the stmt.  We generally set the
 100    vectype according to the type of the result (lhs).  For stmts whose
 101    result-type is different than the type of the arguments (e.g., demotion,
 102    promotion), vectype will be reset appropriately (later).  Note that we have
 103    to visit the smallest datatype in this function, because that determines the
 104    VF.  If the smallest datatype in the loop is present only as the rhs of a
 105    promotion operation - we'd miss it.
 106    Such a case, where a variable of this datatype does not appear in the lhs
 107    anywhere in the loop, can only occur if it's an invariant: e.g.:
 108    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 109    invariant motion.  However, we cannot rely on invariant motion to always
 110    take invariants out of the loop, and so in the case of promotion we also
 111    have to check the rhs.
 112    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 113    types.  */
 114
 115 tree
 116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 117                                HOST_WIDE_INT *rhs_size_unit)
 118 {
 119   tree scalar_type = gimple_expr_type (stmt);
 120   HOST_WIDE_INT lhs, rhs;
 121
 122   /* During the analysis phase, this function is called on arbitrary
 123      statements that might not have scalar results.  */
 124   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 125     return scalar_type;
 126
 127   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 128
 129   if (is_gimple_assign (stmt)
 130       && (gimple_assign_cast_p (stmt)
 131           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 132           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 133           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 134     {
 135       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 136
 137       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 138       if (rhs < lhs)
 139         scalar_type = rhs_type;
 140     }
 141
 142   *lhs_size_unit = lhs;
 143   *rhs_size_unit = rhs;
 144   return scalar_type;
 145 }
 146
 147
 148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 149    tested at run-time.  Return TRUE if DDR was successfully inserted.
 150    Return false if versioning is not supported.  */
 151
 152 static bool
 153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 154 {
 155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 156
 157   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 158     return false;
 159
 160   if (!runtime_alias_check_p (ddr, loop,
 161                               optimize_loop_nest_for_speed_p (loop)))
 162     return false;
 163
 164   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 165   return true;
 166 }
 167
 168
 169 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 170    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 171    distances.  These distances are conservatively correct but they don't
 172    reflect a guaranteed dependence.
 173
 174    Return true if this function does all the work necessary to avoid
 175    an alias or false if the caller should use the dependence distances
 176    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 177    the depth of the loop described by LOOP_VINFO and the other arguments
 178    are as for vect_analyze_data_ref_dependence.  */
 179
 180 static bool
 181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 182                                        loop_vec_info loop_vinfo,
 183                                        int loop_depth, unsigned int *max_vf)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   lambda_vector dist_v;
 187   unsigned int i;
 188   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 189     {
 190       int dist = dist_v[loop_depth];
 191       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 192         {
 193           /* If the user asserted safelen >= DIST consecutive iterations
 194              can be executed concurrently, assume independence.
 195
 196              ??? An alternative would be to add the alias check even
 197              in this case, and vectorize the fallback loop with the
 198              maximum VF set to safelen.  However, if the user has
 199              explicitly given a length, it's less likely that that
 200              would be a win.  */
 201           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 202             {
 203               if ((unsigned int) loop->safelen < *max_vf)
 204                 *max_vf = loop->safelen;
 205               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 206               continue;
 207             }
 208
 209           /* For dependence distances of 2 or more, we have the option
 210              of limiting VF or checking for an alias at runtime.
 211              Prefer to check at runtime if we can, to avoid limiting
 212              the VF unnecessarily when the bases are in fact independent.
 213
 214              Note that the alias checks will be removed if the VF ends up
 215              being small enough.  */
 216           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 217         }
 218     }
 219   return true;
 220 }
 221
 222
 223 /* Function vect_analyze_data_ref_dependence.
 224
 225    Return TRUE if there (might) exist a dependence between a memory-reference
 226    DRA and a memory-reference DRB.  When versioning for alias may check a
 227    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 228    the data dependence.  */
 229
 230 static bool
 231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 232                                   loop_vec_info loop_vinfo,
 233                                   unsigned int *max_vf)
 234 {
 235   unsigned int i;
 236   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 237   struct data_reference *dra = DDR_A (ddr);
 238   struct data_reference *drb = DDR_B (ddr);
 239   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 240   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 241   lambda_vector dist_v;
 242   unsigned int loop_depth;
 243
 244   /* In loop analysis all data references should be vectorizable.  */
 245   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 246       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 247     gcc_unreachable ();
 248
 249   /* Independent data accesses.  */
 250   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 251     return false;
 252
 253   if (dra == drb
 254       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 255     return false;
 256
 257   /* We do not have to consider dependences between accesses that belong
 258      to the same group.  */
 259   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 260       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 261     return false;
 262
 263   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 264      least two scalar iterations, there is always also a true dependence.
 265      As the vectorizer does not re-order loads and stores we can ignore
 266      the anti-dependence if TBAA can disambiguate both DRs similar to the
 267      case with known negative distance anti-dependences (positive
 268      distance anti-dependences would violate TBAA constraints).  */
 269   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 270        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 271       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 272                                  get_alias_set (DR_REF (drb))))
 273     return false;
 274
 275   /* Unknown data dependence.  */
 276   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 277     {
 278       /* If user asserted safelen consecutive iterations can be
 279          executed concurrently, assume independence.  */
 280       if (loop->safelen >= 2)
 281         {
 282           if ((unsigned int) loop->safelen < *max_vf)
 283             *max_vf = loop->safelen;
 284           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 285           return false;
 286         }
 287
 288       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 289           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 290         {
 291           if (dump_enabled_p ())
 292             {
 293               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 294                                "versioning for alias not supported for: "
 295                                "can't determine dependence between ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (dra));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 299               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 300                                  DR_REF (drb));
 301               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302             }
 303           return true;
 304         }
 305
 306       if (dump_enabled_p ())
 307         {
 308           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 309                            "versioning for alias required: "
 310                            "can't determine dependence between ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (dra));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 314           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 315                              DR_REF (drb));
 316           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 317         }
 318
 319       /* Add to list of ddrs that need to be tested at run-time.  */
 320       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 321     }
 322
 323   /* Known data dependence.  */
 324   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 325     {
 326       /* If user asserted safelen consecutive iterations can be
 327          executed concurrently, assume independence.  */
 328       if (loop->safelen >= 2)
 329         {
 330           if ((unsigned int) loop->safelen < *max_vf)
 331             *max_vf = loop->safelen;
 332           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 333           return false;
 334         }
 335
 336       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 337           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 338         {
 339           if (dump_enabled_p ())
 340             {
 341               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 342                                "versioning for alias not supported for: "
 343                                "bad dist vector for ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (dra));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 347               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 348                                  DR_REF (drb));
 349               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 350             }
 351           return true;
 352         }
 353
 354       if (dump_enabled_p ())
 355         {
 356           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 357                            "versioning for alias required: "
 358                            "bad dist vector for ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 360           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 361           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 362           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 363         }
 364       /* Add to list of ddrs that need to be tested at run-time.  */
 365       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 366     }
 367
 368   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 369
 370   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 371       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 372                                                 loop_depth, max_vf))
 373     return false;
 374
 375   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 376     {
 377       int dist = dist_v[loop_depth];
 378
 379       if (dump_enabled_p ())
 380         dump_printf_loc (MSG_NOTE, vect_location,
 381                          "dependence distance  = %d.\n", dist);
 382
 383       if (dist == 0)
 384         {
 385           if (dump_enabled_p ())
 386             {
 387               dump_printf_loc (MSG_NOTE, vect_location,
 388                                "dependence distance == 0 between ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 390               dump_printf (MSG_NOTE, " and ");
 391               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 392               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 393             }
 394
 395           /* When we perform grouped accesses and perform implicit CSE
 396              by detecting equal accesses and doing disambiguation with
 397              runtime alias tests like for
 398                 .. = a[i];
 399                 .. = a[i+1];
 400                 a[i] = ..;
 401                 a[i+1] = ..;
 402                 *p = ..;
 403                 .. = a[i];
 404                 .. = a[i+1];
 405              where we will end up loading { a[i], a[i+1] } once, make
 406              sure that inserting group loads before the first load and
 407              stores after the last store will do the right thing.
 408              Similar for groups like
 409                 a[i] = ...;
 410                 ... = a[i];
 411                 a[i+1] = ...;
 412              where loads from the group interleave with the store.  */
 413           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 414               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 415             {
 416               gimple *earlier_stmt;
 417               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 418               if (DR_IS_WRITE
 419                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 420                 {
 421                   if (dump_enabled_p ())
 422                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                      "READ_WRITE dependence in interleaving."
 424                                      "\n");
 425                   return true;
 426                 }
 427             }
 428
 429           continue;
 430         }
 431
 432       if (dist > 0 && DDR_REVERSED_P (ddr))
 433         {
 434           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 435              reversed (to make distance vector positive), and the actual
 436              distance is negative.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 439                              "dependence distance negative.\n");
 440           /* Record a negative dependence distance to later limit the
 441              amount of stmt copying / unrolling we can perform.
 442              Only need to handle read-after-write dependence.  */
 443           if (DR_IS_READ (drb)
 444               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 445                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 446             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 447           continue;
 448         }
 449
 450       unsigned int abs_dist = abs (dist);
 451       if (abs_dist >= 2 && abs_dist < *max_vf)
 452         {
 453           /* The dependence distance requires reduction of the maximal
 454              vectorization factor.  */
 455           *max_vf = abs (dist);
 456           if (dump_enabled_p ())
 457             dump_printf_loc (MSG_NOTE, vect_location,
 458                              "adjusting maximal vectorization factor to %i\n",
 459                              *max_vf);
 460         }
 461
 462       if (abs_dist >= *max_vf)
 463         {
 464           /* Dependence distance does not create dependence, as far as
 465              vectorization is concerned, in this case.  */
 466           if (dump_enabled_p ())
 467             dump_printf_loc (MSG_NOTE, vect_location,
 468                              "dependence distance >= VF.\n");
 469           continue;
 470         }
 471
 472       if (dump_enabled_p ())
 473         {
 474           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                        "not vectorized, possible dependence "
 476                        "between data-refs ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 478           dump_printf (MSG_NOTE,  " and ");
 479           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 480           dump_printf (MSG_NOTE,  "\n");
 481         }
 482
 483       return true;
 484     }
 485
 486   return false;
 487 }
 488
 489 /* Function vect_analyze_data_ref_dependences.
 490
 491    Examine all the data references in the loop, and make sure there do not
 492    exist any data dependences between them.  Set *MAX_VF according to
 493    the maximum vectorization factor the data dependences allow.  */
 494
 495 bool
 496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 497                                    unsigned int *max_vf)
 498 {
 499   unsigned int i;
 500   struct data_dependence_relation *ddr;
 501
 502   if (dump_enabled_p ())
 503     dump_printf_loc (MSG_NOTE, vect_location,
 504                      "=== vect_analyze_data_ref_dependences ===\n");
 505
 506   LOOP_VINFO_DDRS (loop_vinfo)
 507     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 508              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 509   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 510   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 511   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 512                                 &LOOP_VINFO_DDRS (loop_vinfo),
 513                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 514     return false;
 515
 516   /* For epilogues we either have no aliases or alias versioning
 517      was applied to original loop.  Therefore we may just get max_vf
 518      using VF of original loop.  */
 519   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 520     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 521   else
 522     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 523       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 524         return false;
 525
 526   return true;
 527 }
 528
 529
 530 /* Function vect_slp_analyze_data_ref_dependence.
 531
 532    Return TRUE if there (might) exist a dependence between a memory-reference
 533    DRA and a memory-reference DRB.  When versioning for alias may check a
 534    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 535    the data dependence.  */
 536
 537 static bool
 538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 539 {
 540   struct data_reference *dra = DDR_A (ddr);
 541   struct data_reference *drb = DDR_B (ddr);
 542
 543   /* We need to check dependences of statements marked as unvectorizable
 544      as well, they still can prohibit vectorization.  */
 545
 546   /* Independent data accesses.  */
 547   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 548     return false;
 549
 550   if (dra == drb)
 551     return false;
 552
 553   /* Read-read is OK.  */
 554   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 555     return false;
 556
 557   /* If dra and drb are part of the same interleaving chain consider
 558      them independent.  */
 559   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 560       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 561           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 562     return false;
 563
 564   /* Unknown data dependence.  */
 565   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 566     {
 567       if  (dump_enabled_p ())
 568         {
 569           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 570                            "can't determine dependence between ");
 571           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 572           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 573           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 574           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 575         }
 576     }
 577   else if (dump_enabled_p ())
 578     {
 579       dump_printf_loc (MSG_NOTE, vect_location,
 580                        "determined dependence between ");
 581       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 582       dump_printf (MSG_NOTE, " and ");
 583       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 584       dump_printf (MSG_NOTE,  "\n");
 585     }
 586
 587   return true;
 588 }
 589
 590
 591 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 592    contain the vector of scalar stores of this instance if we are
 593    disambiguating the loads.  */
 594
 595 static bool
 596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 597                                    vec<gimple *> stores, gimple *last_store)
 598 {
 599   /* This walks over all stmts involved in the SLP load/store done
 600      in NODE verifying we can sink them up to the last stmt in the
 601      group.  */
 602   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 603   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 604     {
 605       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 606       if (access == last_access)
 607         continue;
 608       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 609       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 610            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 611         {
 612           gimple *stmt = gsi_stmt (gsi);
 613           if (! gimple_vuse (stmt)
 614               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 615             continue;
 616
 617           /* If we couldn't record a (single) data reference for this
 618              stmt we have to give up.  */
 619           /* ???  Here and below if dependence analysis fails we can resort
 620              to the alias oracle which can handle more kinds of stmts.  */
 621           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 622           if (!dr_b)
 623             return false;
 624
 625           bool dependent = false;
 626           /* If we run into a store of this same instance (we've just
 627              marked those) then delay dependence checking until we run
 628              into the last store because this is where it will have
 629              been sunk to (and we verify if we can do that as well).  */
 630           if (gimple_visited_p (stmt))
 631             {
 632               if (stmt != last_store)
 633                 continue;
 634               unsigned i;
 635               gimple *store;
 636               FOR_EACH_VEC_ELT (stores, i, store)
 637                 {
 638                   data_reference *store_dr
 639                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 640                   ddr_p ddr = initialize_data_dependence_relation
 641                                 (dr_a, store_dr, vNULL);
 642                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 643                   free_dependence_relation (ddr);
 644                   if (dependent)
 645                     break;
 646                 }
 647             }
 648           else
 649             {
 650               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 651                                                                dr_b, vNULL);
 652               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 653               free_dependence_relation (ddr);
 654             }
 655           if (dependent)
 656             return false;
 657         }
 658     }
 659   return true;
 660 }
 661
 662
 663 /* Function vect_analyze_data_ref_dependences.
 664
 665    Examine all the data references in the basic-block, and make sure there
 666    do not exist any data dependences between them.  Set *MAX_VF according to
 667    the maximum vectorization factor the data dependences allow.  */
 668
 669 bool
 670 vect_slp_analyze_instance_dependence (slp_instance instance)
 671 {
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "=== vect_slp_analyze_instance_dependence ===\n");
 675
 676   /* The stores of this instance are at the root of the SLP tree.  */
 677   slp_tree store = SLP_INSTANCE_TREE (instance);
 678   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 679     store = NULL;
 680
 681   /* Verify we can sink stores to the vectorized stmt insert location.  */
 682   gimple *last_store = NULL;
 683   if (store)
 684     {
 685       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 686         return false;
 687
 688       /* Mark stores in this instance and remember the last one.  */
 689       last_store = vect_find_last_scalar_stmt_in_slp (store);
 690       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 691         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 692     }
 693
 694   bool res = true;
 695
 696   /* Verify we can sink loads to the vectorized stmt insert location,
 697      special-casing stores of this instance.  */
 698   slp_tree load;
 699   unsigned int i;
 700   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 701     if (! vect_slp_analyze_node_dependences (instance, load,
 702                                              store
 703                                              ? SLP_TREE_SCALAR_STMTS (store)
 704                                              : vNULL, last_store))
 705       {
 706         res = false;
 707         break;
 708       }
 709
 710   /* Unset the visited flag.  */
 711   if (store)
 712     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 713       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 714
 715   return res;
 716 }
 717
 718 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 719    the statement that contains DRB, which is useful for recording in the
 720    dump file.  */
 721
 722 static void
 723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 724                             innermost_loop_behavior *drb)
 725 {
 726   bool existed;
 727   innermost_loop_behavior *&entry
 728     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 729   if (!existed || entry->base_alignment < drb->base_alignment)
 730     {
 731       entry = drb;
 732       if (dump_enabled_p ())
 733         {
 734           dump_printf_loc (MSG_NOTE, vect_location,
 735                            "recording new base alignment for ");
 736           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 737           dump_printf (MSG_NOTE, "\n");
 738           dump_printf_loc (MSG_NOTE, vect_location,
 739                            "  alignment:    %d\n", drb->base_alignment);
 740           dump_printf_loc (MSG_NOTE, vect_location,
 741                            "  misalignment: %d\n", drb->base_misalignment);
 742           dump_printf_loc (MSG_NOTE, vect_location,
 743                            "  based on:     ");
 744           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 745         }
 746     }
 747 }
 748
 749 /* If the region we're going to vectorize is reached, all unconditional
 750    data references occur at least once.  We can therefore pool the base
 751    alignment guarantees from each unconditional reference.  Do this by
 752    going through all the data references in VINFO and checking whether
 753    the containing statement makes the reference unconditionally.  If so,
 754    record the alignment of the base address in VINFO so that it can be
 755    used for all other references with the same base.  */
 756
 757 void
 758 vect_record_base_alignments (vec_info *vinfo)
 759 {
 760   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 761   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 762   data_reference *dr;
 763   unsigned int i;
 764   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 765     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 766       {
 767         gimple *stmt = DR_STMT (dr);
 768         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 769
 770         /* If DR is nested in the loop that is being vectorized, we can also
 771            record the alignment of the base wrt the outer loop.  */
 772         if (loop && nested_in_vect_loop_p (loop, stmt))
 773           {
 774             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 775             vect_record_base_alignment
 776               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 777           }
 778       }
 779 }
 780
 781 /* Return the target alignment for the vectorized form of DR.  */
 782
 783 static unsigned int
 784 vect_calculate_target_alignment (struct data_reference *dr)
 785 {
 786   gimple *stmt = DR_STMT (dr);
 787   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 788   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 789   return targetm.vectorize.preferred_vector_alignment (vectype);
 790 }
 791
 792 /* Function vect_compute_data_ref_alignment
 793
 794    Compute the misalignment of the data reference DR.
 795
 796    Output:
 797    1. If during the misalignment computation it is found that the data reference
 798       cannot be vectorized then false is returned.
 799    2. DR_MISALIGNMENT (DR) is defined.
 800
 801    FOR NOW: No analysis is actually performed. Misalignment is calculated
 802    only for trivial cases. TODO.  */
 803
 804 bool
 805 vect_compute_data_ref_alignment (struct data_reference *dr)
 806 {
 807   gimple *stmt = DR_STMT (dr);
 808   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 809   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 810   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 811   struct loop *loop = NULL;
 812   tree ref = DR_REF (dr);
 813   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 814
 815   if (dump_enabled_p ())
 816     dump_printf_loc (MSG_NOTE, vect_location,
 817                      "vect_compute_data_ref_alignment:\n");
 818
 819   if (loop_vinfo)
 820     loop = LOOP_VINFO_LOOP (loop_vinfo);
 821
 822   /* Initialize misalignment to unknown.  */
 823   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 824
 825   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 826   bool step_preserves_misalignment_p;
 827
 828   unsigned HOST_WIDE_INT vector_alignment
 829     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 830   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 831
 832   /* No step for BB vectorization.  */
 833   if (!loop)
 834     {
 835       gcc_assert (integer_zerop (drb->step));
 836       step_preserves_misalignment_p = true;
 837     }
 838
 839   /* In case the dataref is in an inner-loop of the loop that is being
 840      vectorized (LOOP), we use the base and misalignment information
 841      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 842      stays the same throughout the execution of the inner-loop, which is why
 843      we have to check that the stride of the dataref in the inner-loop evenly
 844      divides by the vector alignment.  */
 845   else if (nested_in_vect_loop_p (loop, stmt))
 846     {
 847       step_preserves_misalignment_p
 848         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 849
 850       if (dump_enabled_p ())
 851         {
 852           if (step_preserves_misalignment_p)
 853             dump_printf_loc (MSG_NOTE, vect_location,
 854                              "inner step divides the vector alignment.\n");
 855           else
 856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 857                              "inner step doesn't divide the vector"
 858                              " alignment.\n");
 859         }
 860     }
 861
 862   /* Similarly we can only use base and misalignment information relative to
 863      an innermost loop if the misalignment stays the same throughout the
 864      execution of the loop.  As above, this is the case if the stride of
 865      the dataref evenly divides by the alignment.  */
 866   else
 867     {
 868       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 869       step_preserves_misalignment_p
 870         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 871
 872       if (!step_preserves_misalignment_p && dump_enabled_p ())
 873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 874                          "step doesn't divide the vector alignment.\n");
 875     }
 876
 877   unsigned int base_alignment = drb->base_alignment;
 878   unsigned int base_misalignment = drb->base_misalignment;
 879
 880   /* Calculate the maximum of the pooled base address alignment and the
 881      alignment that we can compute for DR itself.  */
 882   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 883   if (entry && base_alignment < (*entry)->base_alignment)
 884     {
 885       base_alignment = (*entry)->base_alignment;
 886       base_misalignment = (*entry)->base_misalignment;
 887     }
 888
 889   if (drb->offset_alignment < vector_alignment
 890       || !step_preserves_misalignment_p
 891       /* We need to know whether the step wrt the vectorized loop is
 892          negative when computing the starting misalignment below.  */
 893       || TREE_CODE (drb->step) != INTEGER_CST)
 894     {
 895       if (dump_enabled_p ())
 896         {
 897           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 898                            "Unknown alignment for access: ");
 899           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 900           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 901         }
 902       return true;
 903     }
 904
 905   if (base_alignment < vector_alignment)
 906     {
 907       tree base = drb->base_address;
 908       if (TREE_CODE (base) == ADDR_EXPR)
 909         base = TREE_OPERAND (base, 0);
 910       if (!vect_can_force_dr_alignment_p (base,
 911                                           vector_alignment * BITS_PER_UNIT))
 912         {
 913           if (dump_enabled_p ())
 914             {
 915               dump_printf_loc (MSG_NOTE, vect_location,
 916                                "can't force alignment of ref: ");
 917               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 918               dump_printf (MSG_NOTE, "\n");
 919             }
 920           return true;
 921         }
 922
 923       if (DECL_USER_ALIGN (base))
 924         {
 925           if (dump_enabled_p ())
 926             {
 927               dump_printf_loc (MSG_NOTE, vect_location,
 928                                "not forcing alignment of user-aligned "
 929                                "variable: ");
 930               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 931               dump_printf (MSG_NOTE, "\n");
 932             }
 933           return true;
 934         }
 935
 936       /* Force the alignment of the decl.
 937          NOTE: This is the only change to the code we make during
 938          the analysis phase, before deciding to vectorize the loop.  */
 939       if (dump_enabled_p ())
 940         {
 941           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 942           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 943           dump_printf (MSG_NOTE, "\n");
 944         }
 945
 946       DR_VECT_AUX (dr)->base_decl = base;
 947       DR_VECT_AUX (dr)->base_misaligned = true;
 948       base_misalignment = 0;
 949     }
 950   poly_int64 misalignment
 951     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 952
 953   /* If this is a backward running DR then first access in the larger
 954      vectype actually is N-1 elements before the address in the DR.
 955      Adjust misalign accordingly.  */
 956   if (tree_int_cst_sgn (drb->step) < 0)
 957     /* PLUS because STEP is negative.  */
 958     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 959                      * TREE_INT_CST_LOW (drb->step));
 960
 961   unsigned int const_misalignment;
 962   if (!known_misalignment (misalignment, vector_alignment,
 963                            &const_misalignment))
 964     {
 965       if (dump_enabled_p ())
 966         {
 967           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 968                            "Non-constant misalignment for access: ");
 969           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 970           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 971         }
 972       return true;
 973     }
 974
 975   SET_DR_MISALIGNMENT (dr, const_misalignment);
 976
 977   if (dump_enabled_p ())
 978     {
 979       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 980                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 981       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 982       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 983     }
 984
 985   return true;
 986 }
 987
 988 /* Function vect_update_misalignment_for_peel.
 989    Sets DR's misalignment
 990    - to 0 if it has the same alignment as DR_PEEL,
 991    - to the misalignment computed using NPEEL if DR's salignment is known,
 992    - to -1 (unknown) otherwise.
 993
 994    DR - the data reference whose misalignment is to be adjusted.
 995    DR_PEEL - the data reference whose misalignment is being made
 996              zero in the vector loop by the peel.
 997    NPEEL - the number of iterations in the peel loop if the misalignment
 998            of DR_PEEL is known at compile time.  */
 999
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002                                    struct data_reference *dr_peel, int npeel)
1003 {
1004   unsigned int i;
1005   vec<dr_p> same_aligned_drs;
1006   struct data_reference *current_dr;
1007   int dr_size = vect_get_scalar_dr_size (dr);
1008   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1011
1012  /* For interleaved data accesses the step in the loop must be multiplied by
1013      the size of the interleaving group.  */
1014   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1018
1019   /* It can be assumed that the data refs with the same alignment as dr_peel
1020      are aligned in the vector loop.  */
1021   same_aligned_drs
1022     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1024     {
1025       if (current_dr != dr)
1026         continue;
1027       gcc_assert (!known_alignment_for_access_p (dr)
1028                   || !known_alignment_for_access_p (dr_peel)
1029                   || (DR_MISALIGNMENT (dr) / dr_size
1030                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031       SET_DR_MISALIGNMENT (dr, 0);
1032       return;
1033     }
1034
1035   if (known_alignment_for_access_p (dr)
1036       && known_alignment_for_access_p (dr_peel))
1037     {
1038       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039       int misal = DR_MISALIGNMENT (dr);
1040       misal += negative ? -npeel * dr_size : npeel * dr_size;
1041       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042       SET_DR_MISALIGNMENT (dr, misal);
1043       return;
1044     }
1045
1046   if (dump_enabled_p ())
1047     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048                      "to unknown (-1).\n");
1049   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1050 }
1051
1052
1053 /* Function verify_data_ref_alignment
1054
1055    Return TRUE if DR can be handled with respect to alignment.  */
1056
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1059 {
1060   enum dr_alignment_support supportable_dr_alignment
1061     = vect_supportable_dr_alignment (dr, false);
1062   if (!supportable_dr_alignment)
1063     {
1064       if (dump_enabled_p ())
1065         {
1066           if (DR_IS_READ (dr))
1067             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068                              "not vectorized: unsupported unaligned load.");
1069           else
1070             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071                              "not vectorized: unsupported unaligned "
1072                              "store.");
1073
1074           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075                              DR_REF (dr));
1076           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1077         }
1078       return false;
1079     }
1080
1081   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082     dump_printf_loc (MSG_NOTE, vect_location,
1083                      "Vectorizing an unaligned access.\n");
1084
1085   return true;
1086 }
1087
1088 /* Function vect_verify_datarefs_alignment
1089
1090    Return TRUE if all data references in the loop can be
1091    handled with respect to alignment.  */
1092
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1095 {
1096   vec<data_reference_p> datarefs = vinfo->datarefs;
1097   struct data_reference *dr;
1098   unsigned int i;
1099
1100   FOR_EACH_VEC_ELT (datarefs, i, dr)
1101     {
1102       gimple *stmt = DR_STMT (dr);
1103       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1104
1105       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106         continue;
1107
1108       /* For interleaving, only the alignment of the first access matters.   */
1109       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111         continue;
1112
1113       /* Strided accesses perform only component accesses, alignment is
1114          irrelevant for them.  */
1115       if (STMT_VINFO_STRIDED_P (stmt_info)
1116           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117         continue;
1118
1119       if (! verify_data_ref_alignment (dr))
1120         return false;
1121     }
1122
1123   return true;
1124 }
1125
1126 /* Given an memory reference EXP return whether its alignment is less
1127    than its size.  */
1128
1129 static bool
1130 not_size_aligned (tree exp)
1131 {
1132   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133     return true;
1134
1135   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136           > get_object_alignment (exp));
1137 }
1138
1139 /* Function vector_alignment_reachable_p
1140
1141    Return true if vector alignment for DR is reachable by peeling
1142    a few loop iterations.  Return false otherwise.  */
1143
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1146 {
1147   gimple *stmt = DR_STMT (dr);
1148   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1150
1151   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1152     {
1153       /* For interleaved access we peel only if number of iterations in
1154          the prolog loop ({VF - misalignment}), is a multiple of the
1155          number of the interleaved accesses.  */
1156       int elem_size, mis_in_elements;
1157
1158       /* FORNOW: handle only known alignment.  */
1159       if (!known_alignment_for_access_p (dr))
1160         return false;
1161
1162       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1163       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1164       elem_size = vector_element_size (vector_size, nelements);
1165       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1166
1167       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1168         return false;
1169     }
1170
1171   /* If misalignment is known at the compile time then allow peeling
1172      only if natural alignment is reachable through peeling.  */
1173   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1174     {
1175       HOST_WIDE_INT elmsize =
1176                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1177       if (dump_enabled_p ())
1178         {
1179           dump_printf_loc (MSG_NOTE, vect_location,
1180                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1181           dump_printf (MSG_NOTE,
1182                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1183         }
1184       if (DR_MISALIGNMENT (dr) % elmsize)
1185         {
1186           if (dump_enabled_p ())
1187             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188                              "data size does not divide the misalignment.\n");
1189           return false;
1190         }
1191     }
1192
1193   if (!known_alignment_for_access_p (dr))
1194     {
1195       tree type = TREE_TYPE (DR_REF (dr));
1196       bool is_packed = not_size_aligned (DR_REF (dr));
1197       if (dump_enabled_p ())
1198         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                          "Unknown misalignment, %snaturally aligned\n",
1200                          is_packed ? "not " : "");
1201       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1202     }
1203
1204   return true;
1205 }
1206
1207
1208 /* Calculate the cost of the memory access represented by DR.  */
1209
1210 static void
1211 vect_get_data_access_cost (struct data_reference *dr,
1212                            unsigned int *inside_cost,
1213                            unsigned int *outside_cost,
1214                            stmt_vector_for_cost *body_cost_vec)
1215 {
1216   gimple *stmt = DR_STMT (dr);
1217   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219   int ncopies;
1220
1221   if (PURE_SLP_STMT (stmt_info))
1222     ncopies = 1;
1223   else
1224     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1225
1226   if (DR_IS_READ (dr))
1227     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1228                         NULL, body_cost_vec, false);
1229   else
1230     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1231
1232   if (dump_enabled_p ())
1233     dump_printf_loc (MSG_NOTE, vect_location,
1234                      "vect_get_data_access_cost: inside_cost = %d, "
1235                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1236 }
1237
1238
1239 typedef struct _vect_peel_info
1240 {
1241   struct data_reference *dr;
1242   int npeel;
1243   unsigned int count;
1244 } *vect_peel_info;
1245
1246 typedef struct _vect_peel_extended_info
1247 {
1248   struct _vect_peel_info peel_info;
1249   unsigned int inside_cost;
1250   unsigned int outside_cost;
1251 } *vect_peel_extended_info;
1252
1253
1254 /* Peeling hashtable helpers.  */
1255
1256 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1257 {
1258   static inline hashval_t hash (const _vect_peel_info *);
1259   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1260 };
1261
1262 inline hashval_t
1263 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1264 {
1265   return (hashval_t) peel_info->npeel;
1266 }
1267
1268 inline bool
1269 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1270 {
1271   return (a->npeel == b->npeel);
1272 }
1273
1274
1275 /* Insert DR into peeling hash table with NPEEL as key.  */
1276
1277 static void
1278 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1279                           loop_vec_info loop_vinfo, struct data_reference *dr,
1280                           int npeel)
1281 {
1282   struct _vect_peel_info elem, *slot;
1283   _vect_peel_info **new_slot;
1284   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1285
1286   elem.npeel = npeel;
1287   slot = peeling_htab->find (&elem);
1288   if (slot)
1289     slot->count++;
1290   else
1291     {
1292       slot = XNEW (struct _vect_peel_info);
1293       slot->npeel = npeel;
1294       slot->dr = dr;
1295       slot->count = 1;
1296       new_slot = peeling_htab->find_slot (slot, INSERT);
1297       *new_slot = slot;
1298     }
1299
1300   if (!supportable_dr_alignment
1301       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1302     slot->count += VECT_MAX_COST;
1303 }
1304
1305
1306 /* Traverse peeling hash table to find peeling option that aligns maximum
1307    number of data accesses.  */
1308
1309 int
1310 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1311                                      _vect_peel_extended_info *max)
1312 {
1313   vect_peel_info elem = *slot;
1314
1315   if (elem->count > max->peel_info.count
1316       || (elem->count == max->peel_info.count
1317           && max->peel_info.npeel > elem->npeel))
1318     {
1319       max->peel_info.npeel = elem->npeel;
1320       max->peel_info.count = elem->count;
1321       max->peel_info.dr = elem->dr;
1322     }
1323
1324   return 1;
1325 }
1326
1327 /* Get the costs of peeling NPEEL iterations checking data access costs
1328    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1329    misalignment will be zero after peeling.  */
1330
1331 static void
1332 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1333                                 struct data_reference *dr0,
1334                                 unsigned int *inside_cost,
1335                                 unsigned int *outside_cost,
1336                                 stmt_vector_for_cost *body_cost_vec,
1337                                 unsigned int npeel,
1338                                 bool unknown_misalignment)
1339 {
1340   unsigned i;
1341   data_reference *dr;
1342
1343   FOR_EACH_VEC_ELT (datarefs, i, dr)
1344     {
1345       gimple *stmt = DR_STMT (dr);
1346       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1348         continue;
1349
1350       /* For interleaving, only the alignment of the first access
1351          matters.  */
1352       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1353           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1354         continue;
1355
1356       /* Strided accesses perform only component accesses, alignment is
1357          irrelevant for them.  */
1358       if (STMT_VINFO_STRIDED_P (stmt_info)
1359           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1360         continue;
1361
1362       int save_misalignment;
1363       save_misalignment = DR_MISALIGNMENT (dr);
1364       if (npeel == 0)
1365         ;
1366       else if (unknown_misalignment && dr == dr0)
1367         SET_DR_MISALIGNMENT (dr, 0);
1368       else
1369         vect_update_misalignment_for_peel (dr, dr0, npeel);
1370       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1371                                  body_cost_vec);
1372       SET_DR_MISALIGNMENT (dr, save_misalignment);
1373     }
1374 }
1375
1376 /* Traverse peeling hash table and calculate cost for each peeling option.
1377    Find the one with the lowest cost.  */
1378
1379 int
1380 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1381                                    _vect_peel_extended_info *min)
1382 {
1383   vect_peel_info elem = *slot;
1384   int dummy;
1385   unsigned int inside_cost = 0, outside_cost = 0;
1386   gimple *stmt = DR_STMT (elem->dr);
1387   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1389   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1390                        epilogue_cost_vec;
1391
1392   prologue_cost_vec.create (2);
1393   body_cost_vec.create (2);
1394   epilogue_cost_vec.create (2);
1395
1396   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1397                                   elem->dr, &inside_cost, &outside_cost,
1398                                   &body_cost_vec, elem->npeel, false);
1399
1400   body_cost_vec.release ();
1401
1402   outside_cost += vect_get_known_peeling_cost
1403     (loop_vinfo, elem->npeel, &dummy,
1404      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405      &prologue_cost_vec, &epilogue_cost_vec);
1406
1407   /* Prologue and epilogue costs are added to the target model later.
1408      These costs depend only on the scalar iteration cost, the
1409      number of peeling iterations finally chosen, and the number of
1410      misaligned statements.  So discard the information found here.  */
1411   prologue_cost_vec.release ();
1412   epilogue_cost_vec.release ();
1413
1414   if (inside_cost < min->inside_cost
1415       || (inside_cost == min->inside_cost
1416           && outside_cost < min->outside_cost))
1417     {
1418       min->inside_cost = inside_cost;
1419       min->outside_cost = outside_cost;
1420       min->peel_info.dr = elem->dr;
1421       min->peel_info.npeel = elem->npeel;
1422       min->peel_info.count = elem->count;
1423     }
1424
1425   return 1;
1426 }
1427
1428
1429 /* Choose best peeling option by traversing peeling hash table and either
1430    choosing an option with the lowest cost (if cost model is enabled) or the
1431    option that aligns as many accesses as possible.  */
1432
1433 static struct _vect_peel_extended_info
1434 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1435                                        loop_vec_info loop_vinfo)
1436 {
1437    struct _vect_peel_extended_info res;
1438
1439    res.peel_info.dr = NULL;
1440
1441    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1442      {
1443        res.inside_cost = INT_MAX;
1444        res.outside_cost = INT_MAX;
1445        peeling_htab->traverse <_vect_peel_extended_info *,
1446                                vect_peeling_hash_get_lowest_cost> (&res);
1447      }
1448    else
1449      {
1450        res.peel_info.count = 0;
1451        peeling_htab->traverse <_vect_peel_extended_info *,
1452                                vect_peeling_hash_get_most_frequent> (&res);
1453        res.inside_cost = 0;
1454        res.outside_cost = 0;
1455      }
1456
1457    return res;
1458 }
1459
1460 /* Return true if the new peeling NPEEL is supported.  */
1461
1462 static bool
1463 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1464                           unsigned npeel)
1465 {
1466   unsigned i;
1467   struct data_reference *dr = NULL;
1468   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1469   gimple *stmt;
1470   stmt_vec_info stmt_info;
1471   enum dr_alignment_support supportable_dr_alignment;
1472
1473   /* Ensure that all data refs can be vectorized after the peel.  */
1474   FOR_EACH_VEC_ELT (datarefs, i, dr)
1475     {
1476       int save_misalignment;
1477
1478       if (dr == dr0)
1479         continue;
1480
1481       stmt = DR_STMT (dr);
1482       stmt_info = vinfo_for_stmt (stmt);
1483       /* For interleaving, only the alignment of the first access
1484          matters.  */
1485       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1486           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1487         continue;
1488
1489       /* Strided accesses perform only component accesses, alignment is
1490          irrelevant for them.  */
1491       if (STMT_VINFO_STRIDED_P (stmt_info)
1492           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1493         continue;
1494
1495       save_misalignment = DR_MISALIGNMENT (dr);
1496       vect_update_misalignment_for_peel (dr, dr0, npeel);
1497       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1498       SET_DR_MISALIGNMENT (dr, save_misalignment);
1499
1500       if (!supportable_dr_alignment)
1501         return false;
1502     }
1503
1504   return true;
1505 }
1506
1507 /* Function vect_enhance_data_refs_alignment
1508
1509    This pass will use loop versioning and loop peeling in order to enhance
1510    the alignment of data references in the loop.
1511
1512    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1513    original loop is to be vectorized.  Any other loops that are created by
1514    the transformations performed in this pass - are not supposed to be
1515    vectorized.  This restriction will be relaxed.
1516
1517    This pass will require a cost model to guide it whether to apply peeling
1518    or versioning or a combination of the two.  For example, the scheme that
1519    intel uses when given a loop with several memory accesses, is as follows:
1520    choose one memory access ('p') which alignment you want to force by doing
1521    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1522    other accesses are not necessarily aligned, or (2) use loop versioning to
1523    generate one loop in which all accesses are aligned, and another loop in
1524    which only 'p' is necessarily aligned.
1525
1526    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1527    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1528    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1529
1530    Devising a cost model is the most critical aspect of this work.  It will
1531    guide us on which access to peel for, whether to use loop versioning, how
1532    many versions to create, etc.  The cost model will probably consist of
1533    generic considerations as well as target specific considerations (on
1534    powerpc for example, misaligned stores are more painful than misaligned
1535    loads).
1536
1537    Here are the general steps involved in alignment enhancements:
1538
1539      -- original loop, before alignment analysis:
1540         for (i=0; i<N; i++){
1541           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1542           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1543         }
1544
1545      -- After vect_compute_data_refs_alignment:
1546         for (i=0; i<N; i++){
1547           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1548           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1549         }
1550
1551      -- Possibility 1: we do loop versioning:
1552      if (p is aligned) {
1553         for (i=0; i<N; i++){    # loop 1A
1554           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1555           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1556         }
1557      }
1558      else {
1559         for (i=0; i<N; i++){    # loop 1B
1560           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1561           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1562         }
1563      }
1564
1565      -- Possibility 2: we do loop peeling:
1566      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1567         x = q[i];
1568         p[i] = y;
1569      }
1570      for (i = 3; i < N; i++){   # loop 2A
1571         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1572         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1573      }
1574
1575      -- Possibility 3: combination of loop peeling and versioning:
1576      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1577         x = q[i];
1578         p[i] = y;
1579      }
1580      if (p is aligned) {
1581         for (i = 3; i<N; i++){  # loop 3A
1582           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1583           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1584         }
1585      }
1586      else {
1587         for (i = 3; i<N; i++){  # loop 3B
1588           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1589           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1590         }
1591      }
1592
1593      These loops are later passed to loop_transform to be vectorized.  The
1594      vectorizer will use the alignment information to guide the transformation
1595      (whether to generate regular loads/stores, or with special handling for
1596      misalignment).  */
1597
1598 bool
1599 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1600 {
1601   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1602   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1603   enum dr_alignment_support supportable_dr_alignment;
1604   struct data_reference *dr0 = NULL, *first_store = NULL;
1605   struct data_reference *dr;
1606   unsigned int i, j;
1607   bool do_peeling = false;
1608   bool do_versioning = false;
1609   bool stat;
1610   gimple *stmt;
1611   stmt_vec_info stmt_info;
1612   unsigned int npeel = 0;
1613   bool one_misalignment_known = false;
1614   bool one_misalignment_unknown = false;
1615   bool one_dr_unsupportable = false;
1616   struct data_reference *unsupportable_dr = NULL;
1617   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1618   unsigned possible_npeel_number = 1;
1619   tree vectype;
1620   unsigned int mis, same_align_drs_max = 0;
1621   hash_table<peel_info_hasher> peeling_htab (1);
1622
1623   if (dump_enabled_p ())
1624     dump_printf_loc (MSG_NOTE, vect_location,
1625                      "=== vect_enhance_data_refs_alignment ===\n");
1626
1627   /* Reset data so we can safely be called multiple times.  */
1628   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1629   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1630
1631   /* While cost model enhancements are expected in the future, the high level
1632      view of the code at this time is as follows:
1633
1634      A) If there is a misaligned access then see if peeling to align
1635         this access can make all data references satisfy
1636         vect_supportable_dr_alignment.  If so, update data structures
1637         as needed and return true.
1638
1639      B) If peeling wasn't possible and there is a data reference with an
1640         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1641         then see if loop versioning checks can be used to make all data
1642         references satisfy vect_supportable_dr_alignment.  If so, update
1643         data structures as needed and return true.
1644
1645      C) If neither peeling nor versioning were successful then return false if
1646         any data reference does not satisfy vect_supportable_dr_alignment.
1647
1648      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1649
1650      Note, Possibility 3 above (which is peeling and versioning together) is not
1651      being done at this time.  */
1652
1653   /* (1) Peeling to force alignment.  */
1654
1655   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656      Considerations:
1657      + How many accesses will become aligned due to the peeling
1658      - How many accesses will become unaligned due to the peeling,
1659        and the cost of misaligned accesses.
1660      - The cost of peeling (the extra runtime checks, the increase
1661        in code size).  */
1662
1663   FOR_EACH_VEC_ELT (datarefs, i, dr)
1664     {
1665       stmt = DR_STMT (dr);
1666       stmt_info = vinfo_for_stmt (stmt);
1667
1668       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1669         continue;
1670
1671       /* For interleaving, only the alignment of the first access
1672          matters.  */
1673       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1675         continue;
1676
1677       /* For invariant accesses there is nothing to enhance.  */
1678       if (integer_zerop (DR_STEP (dr)))
1679         continue;
1680
1681       /* Strided accesses perform only component accesses, alignment is
1682          irrelevant for them.  */
1683       if (STMT_VINFO_STRIDED_P (stmt_info)
1684           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1685         continue;
1686
1687       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1688       do_peeling = vector_alignment_reachable_p (dr);
1689       if (do_peeling)
1690         {
1691           if (known_alignment_for_access_p (dr))
1692             {
1693               unsigned int npeel_tmp = 0;
1694               bool negative = tree_int_cst_compare (DR_STEP (dr),
1695                                                     size_zero_node) < 0;
1696
1697               vectype = STMT_VINFO_VECTYPE (stmt_info);
1698               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1699               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1700               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1701               if (DR_MISALIGNMENT (dr) != 0)
1702                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1703
1704               /* For multiple types, it is possible that the bigger type access
1705                  will have more than one peeling option.  E.g., a loop with two
1706                  types: one of size (vector size / 4), and the other one of
1707                  size (vector size / 8).  Vectorization factor will 8.  If both
1708                  accesses are misaligned by 3, the first one needs one scalar
1709                  iteration to be aligned, and the second one needs 5.  But the
1710                  first one will be aligned also by peeling 5 scalar
1711                  iterations, and in that case both accesses will be aligned.
1712                  Hence, except for the immediate peeling amount, we also want
1713                  to try to add full vector size, while we don't exceed
1714                  vectorization factor.
1715                  We do this automatically for cost model, since we calculate
1716                  cost for every peeling option.  */
1717               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1718                 {
1719                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1720                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1721                   possible_npeel_number
1722                     = vect_get_num_vectors (nscalars, vectype);
1723
1724                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1725                      allow peeling NELEMENTS.  */
1726                   if (DR_MISALIGNMENT (dr) == 0)
1727                     possible_npeel_number++;
1728                 }
1729
1730               /* Save info about DR in the hash table.  Also include peeling
1731                  amounts according to the explanation above.  */
1732               for (j = 0; j < possible_npeel_number; j++)
1733                 {
1734                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1735                                             dr, npeel_tmp);
1736                   npeel_tmp += target_align / dr_size;
1737                 }
1738
1739               one_misalignment_known = true;
1740             }
1741           else
1742             {
1743               /* If we don't know any misalignment values, we prefer
1744                  peeling for data-ref that has the maximum number of data-refs
1745                  with the same alignment, unless the target prefers to align
1746                  stores over load.  */
1747               unsigned same_align_drs
1748                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1749               if (!dr0
1750                   || same_align_drs_max < same_align_drs)
1751                 {
1752                   same_align_drs_max = same_align_drs;
1753                   dr0 = dr;
1754                 }
1755               /* For data-refs with the same number of related
1756                  accesses prefer the one where the misalign
1757                  computation will be invariant in the outermost loop.  */
1758               else if (same_align_drs_max == same_align_drs)
1759                 {
1760                   struct loop *ivloop0, *ivloop;
1761                   ivloop0 = outermost_invariant_loop_for_expr
1762                     (loop, DR_BASE_ADDRESS (dr0));
1763                   ivloop = outermost_invariant_loop_for_expr
1764                     (loop, DR_BASE_ADDRESS (dr));
1765                   if ((ivloop && !ivloop0)
1766                       || (ivloop && ivloop0
1767                           && flow_loop_nested_p (ivloop, ivloop0)))
1768                     dr0 = dr;
1769                 }
1770
1771               one_misalignment_unknown = true;
1772
1773               /* Check for data refs with unsupportable alignment that
1774                  can be peeled.  */
1775               if (!supportable_dr_alignment)
1776               {
1777                 one_dr_unsupportable = true;
1778                 unsupportable_dr = dr;
1779               }
1780
1781               if (!first_store && DR_IS_WRITE (dr))
1782                 first_store = dr;
1783             }
1784         }
1785       else
1786         {
1787           if (!aligned_access_p (dr))
1788             {
1789               if (dump_enabled_p ())
1790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1791                                  "vector alignment may not be reachable\n");
1792               break;
1793             }
1794         }
1795     }
1796
1797   /* Check if we can possibly peel the loop.  */
1798   if (!vect_can_advance_ivs_p (loop_vinfo)
1799       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1800       || loop->inner)
1801     do_peeling = false;
1802
1803   struct _vect_peel_extended_info peel_for_known_alignment;
1804   struct _vect_peel_extended_info peel_for_unknown_alignment;
1805   struct _vect_peel_extended_info best_peel;
1806
1807   peel_for_unknown_alignment.inside_cost = INT_MAX;
1808   peel_for_unknown_alignment.outside_cost = INT_MAX;
1809   peel_for_unknown_alignment.peel_info.count = 0;
1810
1811   if (do_peeling
1812       && one_misalignment_unknown)
1813     {
1814       /* Check if the target requires to prefer stores over loads, i.e., if
1815          misaligned stores are more expensive than misaligned loads (taking
1816          drs with same alignment into account).  */
1817       unsigned int load_inside_cost = 0;
1818       unsigned int load_outside_cost = 0;
1819       unsigned int store_inside_cost = 0;
1820       unsigned int store_outside_cost = 0;
1821       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1822
1823       stmt_vector_for_cost dummy;
1824       dummy.create (2);
1825       vect_get_peeling_costs_all_drs (datarefs, dr0,
1826                                       &load_inside_cost,
1827                                       &load_outside_cost,
1828                                       &dummy, estimated_npeels, true);
1829       dummy.release ();
1830
1831       if (first_store)
1832         {
1833           dummy.create (2);
1834           vect_get_peeling_costs_all_drs (datarefs, first_store,
1835                                           &store_inside_cost,
1836                                           &store_outside_cost,
1837                                           &dummy, estimated_npeels, true);
1838           dummy.release ();
1839         }
1840       else
1841         {
1842           store_inside_cost = INT_MAX;
1843           store_outside_cost = INT_MAX;
1844         }
1845
1846       if (load_inside_cost > store_inside_cost
1847           || (load_inside_cost == store_inside_cost
1848               && load_outside_cost > store_outside_cost))
1849         {
1850           dr0 = first_store;
1851           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1852           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1853         }
1854       else
1855         {
1856           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1857           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1858         }
1859
1860       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1861       prologue_cost_vec.create (2);
1862       epilogue_cost_vec.create (2);
1863
1864       int dummy2;
1865       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1866         (loop_vinfo, estimated_npeels, &dummy2,
1867          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1868          &prologue_cost_vec, &epilogue_cost_vec);
1869
1870       prologue_cost_vec.release ();
1871       epilogue_cost_vec.release ();
1872
1873       peel_for_unknown_alignment.peel_info.count = 1
1874         + STMT_VINFO_SAME_ALIGN_REFS
1875         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1876     }
1877
1878   peel_for_unknown_alignment.peel_info.npeel = 0;
1879   peel_for_unknown_alignment.peel_info.dr = dr0;
1880
1881   best_peel = peel_for_unknown_alignment;
1882
1883   peel_for_known_alignment.inside_cost = INT_MAX;
1884   peel_for_known_alignment.outside_cost = INT_MAX;
1885   peel_for_known_alignment.peel_info.count = 0;
1886   peel_for_known_alignment.peel_info.dr = NULL;
1887
1888   if (do_peeling && one_misalignment_known)
1889     {
1890       /* Peeling is possible, but there is no data access that is not supported
1891          unless aligned.  So we try to choose the best possible peeling from
1892          the hash table.  */
1893       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1894         (&peeling_htab, loop_vinfo);
1895     }
1896
1897   /* Compare costs of peeling for known and unknown alignment. */
1898   if (peel_for_known_alignment.peel_info.dr != NULL
1899       && peel_for_unknown_alignment.inside_cost
1900       >= peel_for_known_alignment.inside_cost)
1901     {
1902       best_peel = peel_for_known_alignment;
1903
1904       /* If the best peeling for known alignment has NPEEL == 0, perform no
1905          peeling at all except if there is an unsupportable dr that we can
1906          align.  */
1907       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1908         do_peeling = false;
1909     }
1910
1911   /* If there is an unsupportable data ref, prefer this over all choices so far
1912      since we'd have to discard a chosen peeling except when it accidentally
1913      aligned the unsupportable data ref.  */
1914   if (one_dr_unsupportable)
1915     dr0 = unsupportable_dr;
1916   else if (do_peeling)
1917     {
1918       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1919          TODO: Use nopeel_outside_cost or get rid of it?  */
1920       unsigned nopeel_inside_cost = 0;
1921       unsigned nopeel_outside_cost = 0;
1922
1923       stmt_vector_for_cost dummy;
1924       dummy.create (2);
1925       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1926                                       &nopeel_outside_cost, &dummy, 0, false);
1927       dummy.release ();
1928
1929       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1930          costs will be recorded.  */
1931       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932       prologue_cost_vec.create (2);
1933       epilogue_cost_vec.create (2);
1934
1935       int dummy2;
1936       nopeel_outside_cost += vect_get_known_peeling_cost
1937         (loop_vinfo, 0, &dummy2,
1938          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939          &prologue_cost_vec, &epilogue_cost_vec);
1940
1941       prologue_cost_vec.release ();
1942       epilogue_cost_vec.release ();
1943
1944       npeel = best_peel.peel_info.npeel;
1945       dr0 = best_peel.peel_info.dr;
1946
1947       /* If no peeling is not more expensive than the best peeling we
1948          have so far, don't perform any peeling.  */
1949       if (nopeel_inside_cost <= best_peel.inside_cost)
1950         do_peeling = false;
1951     }
1952
1953   if (do_peeling)
1954     {
1955       stmt = DR_STMT (dr0);
1956       stmt_info = vinfo_for_stmt (stmt);
1957       vectype = STMT_VINFO_VECTYPE (stmt_info);
1958
1959       if (known_alignment_for_access_p (dr0))
1960         {
1961           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1962                                                 size_zero_node) < 0;
1963           if (!npeel)
1964             {
1965               /* Since it's known at compile time, compute the number of
1966                  iterations in the peeled loop (the peeling factor) for use in
1967                  updating DR_MISALIGNMENT values.  The peeling factor is the
1968                  vectorization factor minus the misalignment as an element
1969                  count.  */
1970               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1971               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1972               npeel = ((mis & (target_align - 1))
1973                        / vect_get_scalar_dr_size (dr0));
1974             }
1975
1976           /* For interleaved data access every iteration accesses all the
1977              members of the group, therefore we divide the number of iterations
1978              by the group size.  */
1979           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1980           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1981             npeel /= GROUP_SIZE (stmt_info);
1982
1983           if (dump_enabled_p ())
1984             dump_printf_loc (MSG_NOTE, vect_location,
1985                              "Try peeling by %d\n", npeel);
1986         }
1987
1988       /* Ensure that all datarefs can be vectorized after the peel.  */
1989       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1990         do_peeling = false;
1991
1992       /* Check if all datarefs are supportable and log.  */
1993       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1994         {
1995           stat = vect_verify_datarefs_alignment (loop_vinfo);
1996           if (!stat)
1997             do_peeling = false;
1998           else
1999             return stat;
2000         }
2001
2002       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2003       if (do_peeling)
2004         {
2005           unsigned max_allowed_peel
2006             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2007           if (max_allowed_peel != (unsigned)-1)
2008             {
2009               unsigned max_peel = npeel;
2010               if (max_peel == 0)
2011                 {
2012                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2013                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2014                 }
2015               if (max_peel > max_allowed_peel)
2016                 {
2017                   do_peeling = false;
2018                   if (dump_enabled_p ())
2019                     dump_printf_loc (MSG_NOTE, vect_location,
2020                         "Disable peeling, max peels reached: %d\n", max_peel);
2021                 }
2022             }
2023         }
2024
2025       /* Cost model #2 - if peeling may result in a remaining loop not
2026          iterating enough to be vectorized then do not peel.  Since this
2027          is a cost heuristic rather than a correctness decision, use the
2028          most likely runtime value for variable vectorization factors.  */
2029       if (do_peeling
2030           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2031         {
2032           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2033           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2034           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2035               < assumed_vf + max_peel)
2036             do_peeling = false;
2037         }
2038
2039       if (do_peeling)
2040         {
2041           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2042              If the misalignment of DR_i is identical to that of dr0 then set
2043              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2044              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2045              by the peeling factor times the element size of DR_i (MOD the
2046              vectorization factor times the size).  Otherwise, the
2047              misalignment of DR_i must be set to unknown.  */
2048           FOR_EACH_VEC_ELT (datarefs, i, dr)
2049             if (dr != dr0)
2050               {
2051                 /* Strided accesses perform only component accesses, alignment
2052                    is irrelevant for them.  */
2053                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2054                 if (STMT_VINFO_STRIDED_P (stmt_info)
2055                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056                   continue;
2057
2058                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2059               }
2060
2061           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2062           if (npeel)
2063             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2064           else
2065             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2066               = DR_MISALIGNMENT (dr0);
2067           SET_DR_MISALIGNMENT (dr0, 0);
2068           if (dump_enabled_p ())
2069             {
2070               dump_printf_loc (MSG_NOTE, vect_location,
2071                                "Alignment of access forced using peeling.\n");
2072               dump_printf_loc (MSG_NOTE, vect_location,
2073                                "Peeling for alignment will be applied.\n");
2074             }
2075
2076           /* The inside-loop cost will be accounted for in vectorizable_load
2077              and vectorizable_store correctly with adjusted alignments.
2078              Drop the body_cst_vec on the floor here.  */
2079           stat = vect_verify_datarefs_alignment (loop_vinfo);
2080           gcc_assert (stat);
2081           return stat;
2082         }
2083     }
2084
2085   /* (2) Versioning to force alignment.  */
2086
2087   /* Try versioning if:
2088      1) optimize loop for speed
2089      2) there is at least one unsupported misaligned data ref with an unknown
2090         misalignment, and
2091      3) all misaligned data refs with a known misalignment are supported, and
2092      4) the number of runtime alignment checks is within reason.  */
2093
2094   do_versioning =
2095         optimize_loop_nest_for_speed_p (loop)
2096         && (!loop->inner); /* FORNOW */
2097
2098   if (do_versioning)
2099     {
2100       FOR_EACH_VEC_ELT (datarefs, i, dr)
2101         {
2102           stmt = DR_STMT (dr);
2103           stmt_info = vinfo_for_stmt (stmt);
2104
2105           /* For interleaving, only the alignment of the first access
2106              matters.  */
2107           if (aligned_access_p (dr)
2108               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2109                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2110             continue;
2111
2112           if (STMT_VINFO_STRIDED_P (stmt_info))
2113             {
2114               /* Strided loads perform only component accesses, alignment is
2115                  irrelevant for them.  */
2116               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117                 continue;
2118               do_versioning = false;
2119               break;
2120             }
2121
2122           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2123
2124           if (!supportable_dr_alignment)
2125             {
2126               gimple *stmt;
2127               int mask;
2128               tree vectype;
2129
2130               if (known_alignment_for_access_p (dr)
2131                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2132                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2133                 {
2134                   do_versioning = false;
2135                   break;
2136                 }
2137
2138               stmt = DR_STMT (dr);
2139               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2140               gcc_assert (vectype);
2141
2142               /* The rightmost bits of an aligned address must be zeros.
2143                  Construct the mask needed for this test.  For example,
2144                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2145                  mask must be 15 = 0xf. */
2146               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2147
2148               /* FORNOW: use the same mask to test all potentially unaligned
2149                  references in the loop.  The vectorizer currently supports
2150                  a single vector size, see the reference to
2151                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2152                  vectorization factor is computed.  */
2153               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2154                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2155               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2156               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2157                       DR_STMT (dr));
2158             }
2159         }
2160
2161       /* Versioning requires at least one misaligned data reference.  */
2162       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2163         do_versioning = false;
2164       else if (!do_versioning)
2165         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2166     }
2167
2168   if (do_versioning)
2169     {
2170       vec<gimple *> may_misalign_stmts
2171         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2172       gimple *stmt;
2173
2174       /* It can now be assumed that the data references in the statements
2175          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2176          of the loop being vectorized.  */
2177       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2178         {
2179           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180           dr = STMT_VINFO_DATA_REF (stmt_info);
2181           SET_DR_MISALIGNMENT (dr, 0);
2182           if (dump_enabled_p ())
2183             dump_printf_loc (MSG_NOTE, vect_location,
2184                              "Alignment of access forced using versioning.\n");
2185         }
2186
2187       if (dump_enabled_p ())
2188         dump_printf_loc (MSG_NOTE, vect_location,
2189                          "Versioning for alignment will be applied.\n");
2190
2191       /* Peeling and versioning can't be done together at this time.  */
2192       gcc_assert (! (do_peeling && do_versioning));
2193
2194       stat = vect_verify_datarefs_alignment (loop_vinfo);
2195       gcc_assert (stat);
2196       return stat;
2197     }
2198
2199   /* This point is reached if neither peeling nor versioning is being done.  */
2200   gcc_assert (! (do_peeling || do_versioning));
2201
2202   stat = vect_verify_datarefs_alignment (loop_vinfo);
2203   return stat;
2204 }
2205
2206
2207 /* Function vect_find_same_alignment_drs.
2208
2209    Update group and alignment relations according to the chosen
2210    vectorization factor.  */
2211
2212 static void
2213 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2214 {
2215   struct data_reference *dra = DDR_A (ddr);
2216   struct data_reference *drb = DDR_B (ddr);
2217   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2218   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2219
2220   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2221     return;
2222
2223   if (dra == drb)
2224     return;
2225
2226   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229     return;
2230
2231   /* Two references with distance zero have the same alignment.  */
2232   offset_int diff = (wi::to_offset (DR_INIT (dra))
2233                      - wi::to_offset (DR_INIT (drb)));
2234   if (diff != 0)
2235     {
2236       /* Get the wider of the two alignments.  */
2237       unsigned int align_a = (vect_calculate_target_alignment (dra)
2238                               / BITS_PER_UNIT);
2239       unsigned int align_b = (vect_calculate_target_alignment (drb)
2240                               / BITS_PER_UNIT);
2241       unsigned int max_align = MAX (align_a, align_b);
2242
2243       /* Require the gap to be a multiple of the larger vector alignment.  */
2244       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2245         return;
2246     }
2247
2248   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250   if (dump_enabled_p ())
2251     {
2252       dump_printf_loc (MSG_NOTE, vect_location,
2253                        "accesses have the same alignment: ");
2254       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2255       dump_printf (MSG_NOTE,  " and ");
2256       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2257       dump_printf (MSG_NOTE, "\n");
2258     }
2259 }
2260
2261
2262 /* Function vect_analyze_data_refs_alignment
2263
2264    Analyze the alignment of the data-references in the loop.
2265    Return FALSE if a data reference is found that cannot be vectorized.  */
2266
2267 bool
2268 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2269 {
2270   if (dump_enabled_p ())
2271     dump_printf_loc (MSG_NOTE, vect_location,
2272                      "=== vect_analyze_data_refs_alignment ===\n");
2273
2274   /* Mark groups of data references with same alignment using
2275      data dependence information.  */
2276   vec<ddr_p> ddrs = vinfo->ddrs;
2277   struct data_dependence_relation *ddr;
2278   unsigned int i;
2279
2280   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2281     vect_find_same_alignment_drs (ddr);
2282
2283   vec<data_reference_p> datarefs = vinfo->datarefs;
2284   struct data_reference *dr;
2285
2286   vect_record_base_alignments (vinfo);
2287   FOR_EACH_VEC_ELT (datarefs, i, dr)
2288     {
2289       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2290       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2291           && !vect_compute_data_ref_alignment (dr))
2292         {
2293           /* Strided accesses perform only component accesses, misalignment
2294              information is irrelevant for them.  */
2295           if (STMT_VINFO_STRIDED_P (stmt_info)
2296               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2297             continue;
2298
2299           if (dump_enabled_p ())
2300             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301                              "not vectorized: can't calculate alignment "
2302                              "for data ref.\n");
2303
2304           return false;
2305         }
2306     }
2307
2308   return true;
2309 }
2310
2311
2312 /* Analyze alignment of DRs of stmts in NODE.  */
2313
2314 static bool
2315 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2316 {
2317   /* We vectorize from the first scalar stmt in the node unless
2318      the node is permuted in which case we start from the first
2319      element in the group.  */
2320   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2321   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2322   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2323     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2324
2325   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2326   if (! vect_compute_data_ref_alignment (dr)
2327       /* For creating the data-ref pointer we need alignment of the
2328          first element anyway.  */
2329       || (dr != first_dr
2330           && ! vect_compute_data_ref_alignment (first_dr))
2331       || ! verify_data_ref_alignment (dr))
2332     {
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335                          "not vectorized: bad data alignment in basic "
2336                          "block.\n");
2337       return false;
2338     }
2339
2340   return true;
2341 }
2342
2343 /* Function vect_slp_analyze_instance_alignment
2344
2345    Analyze the alignment of the data-references in the SLP instance.
2346    Return FALSE if a data reference is found that cannot be vectorized.  */
2347
2348 bool
2349 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2350 {
2351   if (dump_enabled_p ())
2352     dump_printf_loc (MSG_NOTE, vect_location,
2353                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2354
2355   slp_tree node;
2356   unsigned i;
2357   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2358     if (! vect_slp_analyze_and_verify_node_alignment (node))
2359       return false;
2360
2361   node = SLP_INSTANCE_TREE (instance);
2362   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2363       && ! vect_slp_analyze_and_verify_node_alignment
2364              (SLP_INSTANCE_TREE (instance)))
2365     return false;
2366
2367   return true;
2368 }
2369
2370
2371 /* Analyze groups of accesses: check that DR belongs to a group of
2372    accesses of legal size, step, etc.  Detect gaps, single element
2373    interleaving, and other special cases. Set grouped access info.
2374    Collect groups of strided stores for further use in SLP analysis.
2375    Worker for vect_analyze_group_access.  */
2376
2377 static bool
2378 vect_analyze_group_access_1 (struct data_reference *dr)
2379 {
2380   tree step = DR_STEP (dr);
2381   tree scalar_type = TREE_TYPE (DR_REF (dr));
2382   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2383   gimple *stmt = DR_STMT (dr);
2384   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2386   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2387   HOST_WIDE_INT dr_step = -1;
2388   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2389   bool slp_impossible = false;
2390
2391   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2392      size of the interleaving group (including gaps).  */
2393   if (tree_fits_shwi_p (step))
2394     {
2395       dr_step = tree_to_shwi (step);
2396       /* Check that STEP is a multiple of type size.  Otherwise there is
2397          a non-element-sized gap at the end of the group which we
2398          cannot represent in GROUP_GAP or GROUP_SIZE.
2399          ???  As we can handle non-constant step fine here we should
2400          simply remove uses of GROUP_GAP between the last and first
2401          element and instead rely on DR_STEP.  GROUP_SIZE then would
2402          simply not include that gap.  */
2403       if ((dr_step % type_size) != 0)
2404         {
2405           if (dump_enabled_p ())
2406             {
2407               dump_printf_loc (MSG_NOTE, vect_location,
2408                                "Step ");
2409               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2410               dump_printf (MSG_NOTE,
2411                            " is not a multiple of the element size for ");
2412               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2413               dump_printf (MSG_NOTE, "\n");
2414             }
2415           return false;
2416         }
2417       groupsize = absu_hwi (dr_step) / type_size;
2418     }
2419   else
2420     groupsize = 0;
2421
2422   /* Not consecutive access is possible only if it is a part of interleaving.  */
2423   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2424     {
2425       /* Check if it this DR is a part of interleaving, and is a single
2426          element of the group that is accessed in the loop.  */
2427
2428       /* Gaps are supported only for loads. STEP must be a multiple of the type
2429          size.  The size of the group must be a power of 2.  */
2430       if (DR_IS_READ (dr)
2431           && (dr_step % type_size) == 0
2432           && groupsize > 0
2433           && pow2p_hwi (groupsize))
2434         {
2435           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2436           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2437           GROUP_GAP (stmt_info) = groupsize - 1;
2438           if (dump_enabled_p ())
2439             {
2440               dump_printf_loc (MSG_NOTE, vect_location,
2441                                "Detected single element interleaving ");
2442               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2443               dump_printf (MSG_NOTE, " step ");
2444               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2445               dump_printf (MSG_NOTE, "\n");
2446             }
2447
2448           return true;
2449         }
2450
2451       if (dump_enabled_p ())
2452         {
2453           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454                            "not consecutive access ");
2455           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2456         }
2457
2458       if (bb_vinfo)
2459         {
2460           /* Mark the statement as unvectorizable.  */
2461           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2462           return true;
2463         }
2464
2465       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2466       STMT_VINFO_STRIDED_P (stmt_info) = true;
2467       return true;
2468     }
2469
2470   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2471     {
2472       /* First stmt in the interleaving chain. Check the chain.  */
2473       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2474       struct data_reference *data_ref = dr;
2475       unsigned int count = 1;
2476       tree prev_init = DR_INIT (data_ref);
2477       gimple *prev = stmt;
2478       HOST_WIDE_INT diff, gaps = 0;
2479
2480       while (next)
2481         {
2482           /* Skip same data-refs.  In case that two or more stmts share
2483              data-ref (supported only for loads), we vectorize only the first
2484              stmt, and the rest get their vectorized loads from the first
2485              one.  */
2486           if (!tree_int_cst_compare (DR_INIT (data_ref),
2487                                      DR_INIT (STMT_VINFO_DATA_REF (
2488                                                    vinfo_for_stmt (next)))))
2489             {
2490               if (DR_IS_WRITE (data_ref))
2491                 {
2492                   if (dump_enabled_p ())
2493                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494                                      "Two store stmts share the same dr.\n");
2495                   return false;
2496                 }
2497
2498               if (dump_enabled_p ())
2499                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500                                  "Two or more load stmts share the same dr.\n");
2501
2502               /* For load use the same data-ref load.  */
2503               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2504
2505               prev = next;
2506               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2507               continue;
2508             }
2509
2510           prev = next;
2511           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2512
2513           /* All group members have the same STEP by construction.  */
2514           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2515
2516           /* Check that the distance between two accesses is equal to the type
2517              size. Otherwise, we have gaps.  */
2518           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2519                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2520           if (diff != 1)
2521             {
2522               /* FORNOW: SLP of accesses with gaps is not supported.  */
2523               slp_impossible = true;
2524               if (DR_IS_WRITE (data_ref))
2525                 {
2526                   if (dump_enabled_p ())
2527                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528                                      "interleaved store with gaps\n");
2529                   return false;
2530                 }
2531
2532               gaps += diff - 1;
2533             }
2534
2535           last_accessed_element += diff;
2536
2537           /* Store the gap from the previous member of the group. If there is no
2538              gap in the access, GROUP_GAP is always 1.  */
2539           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2540
2541           prev_init = DR_INIT (data_ref);
2542           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2543           /* Count the number of data-refs in the chain.  */
2544           count++;
2545         }
2546
2547       if (groupsize == 0)
2548         groupsize = count + gaps;
2549
2550       /* This could be UINT_MAX but as we are generating code in a very
2551          inefficient way we have to cap earlier.  See PR78699 for example.  */
2552       if (groupsize > 4096)
2553         {
2554           if (dump_enabled_p ())
2555             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2556                              "group is too large\n");
2557           return false;
2558         }
2559
2560       /* Check that the size of the interleaving is equal to count for stores,
2561          i.e., that there are no gaps.  */
2562       if (groupsize != count
2563           && !DR_IS_READ (dr))
2564         {
2565           if (dump_enabled_p ())
2566             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567                              "interleaved store with gaps\n");
2568           return false;
2569         }
2570
2571       /* If there is a gap after the last load in the group it is the
2572          difference between the groupsize and the last accessed
2573          element.
2574          When there is no gap, this difference should be 0.  */
2575       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2576
2577       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2578       if (dump_enabled_p ())
2579         {
2580           dump_printf_loc (MSG_NOTE, vect_location,
2581                            "Detected interleaving ");
2582           if (DR_IS_READ (dr))
2583             dump_printf (MSG_NOTE, "load ");
2584           else
2585             dump_printf (MSG_NOTE, "store ");
2586           dump_printf (MSG_NOTE, "of size %u starting with ",
2587                        (unsigned)groupsize);
2588           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2589           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2590             dump_printf_loc (MSG_NOTE, vect_location,
2591                              "There is a gap of %u elements after the group\n",
2592                              GROUP_GAP (vinfo_for_stmt (stmt)));
2593         }
2594
2595       /* SLP: create an SLP data structure for every interleaving group of
2596          stores for further analysis in vect_analyse_slp.  */
2597       if (DR_IS_WRITE (dr) && !slp_impossible)
2598         {
2599           if (loop_vinfo)
2600             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2601           if (bb_vinfo)
2602             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2603         }
2604     }
2605
2606   return true;
2607 }
2608
2609 /* Analyze groups of accesses: check that DR belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.  */
2613
2614 static bool
2615 vect_analyze_group_access (struct data_reference *dr)
2616 {
2617   if (!vect_analyze_group_access_1 (dr))
2618     {
2619       /* Dissolve the group if present.  */
2620       gimple *next;
2621       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2622       while (stmt)
2623         {
2624           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2625           next = GROUP_NEXT_ELEMENT (vinfo);
2626           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2627           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2628           stmt = next;
2629         }
2630       return false;
2631     }
2632   return true;
2633 }
2634
2635 /* Analyze the access pattern of the data-reference DR.
2636    In case of non-consecutive accesses call vect_analyze_group_access() to
2637    analyze groups of accesses.  */
2638
2639 static bool
2640 vect_analyze_data_ref_access (struct data_reference *dr)
2641 {
2642   tree step = DR_STEP (dr);
2643   tree scalar_type = TREE_TYPE (DR_REF (dr));
2644   gimple *stmt = DR_STMT (dr);
2645   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2646   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2647   struct loop *loop = NULL;
2648
2649   if (loop_vinfo)
2650     loop = LOOP_VINFO_LOOP (loop_vinfo);
2651
2652   if (loop_vinfo && !step)
2653     {
2654       if (dump_enabled_p ())
2655         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2656                          "bad data-ref access in loop\n");
2657       return false;
2658     }
2659
2660   /* Allow loads with zero step in inner-loop vectorization.  */
2661   if (loop_vinfo && integer_zerop (step))
2662     {
2663       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664       if (!nested_in_vect_loop_p (loop, stmt))
2665         return DR_IS_READ (dr);
2666       /* Allow references with zero step for outer loops marked
2667          with pragma omp simd only - it guarantees absence of
2668          loop-carried dependencies between inner loop iterations.  */
2669       if (!loop->force_vectorize)
2670         {
2671           if (dump_enabled_p ())
2672             dump_printf_loc (MSG_NOTE, vect_location,
2673                              "zero step in inner loop of nest\n");
2674           return false;
2675         }
2676     }
2677
2678   if (loop && nested_in_vect_loop_p (loop, stmt))
2679     {
2680       /* Interleaved accesses are not yet supported within outer-loop
2681         vectorization for references in the inner-loop.  */
2682       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2683
2684       /* For the rest of the analysis we use the outer-loop step.  */
2685       step = STMT_VINFO_DR_STEP (stmt_info);
2686       if (integer_zerop (step))
2687         {
2688           if (dump_enabled_p ())
2689             dump_printf_loc (MSG_NOTE, vect_location,
2690                              "zero step in outer loop.\n");
2691           return DR_IS_READ (dr);
2692         }
2693     }
2694
2695   /* Consecutive?  */
2696   if (TREE_CODE (step) == INTEGER_CST)
2697     {
2698       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2699       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2700           || (dr_step < 0
2701               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2702         {
2703           /* Mark that it is not interleaving.  */
2704           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2705           return true;
2706         }
2707     }
2708
2709   if (loop && nested_in_vect_loop_p (loop, stmt))
2710     {
2711       if (dump_enabled_p ())
2712         dump_printf_loc (MSG_NOTE, vect_location,
2713                          "grouped access in outer loop.\n");
2714       return false;
2715     }
2716
2717
2718   /* Assume this is a DR handled by non-constant strided load case.  */
2719   if (TREE_CODE (step) != INTEGER_CST)
2720     return (STMT_VINFO_STRIDED_P (stmt_info)
2721             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2722                 || vect_analyze_group_access (dr)));
2723
2724   /* Not consecutive access - check if it's a part of interleaving group.  */
2725   return vect_analyze_group_access (dr);
2726 }
2727
2728 /* Compare two data-references DRA and DRB to group them into chunks
2729    suitable for grouping.  */
2730
2731 static int
2732 dr_group_sort_cmp (const void *dra_, const void *drb_)
2733 {
2734   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2735   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2736   int cmp;
2737
2738   /* Stabilize sort.  */
2739   if (dra == drb)
2740     return 0;
2741
2742   /* DRs in different loops never belong to the same group.  */
2743   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2744   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2745   if (loopa != loopb)
2746     return loopa->num < loopb->num ? -1 : 1;
2747
2748   /* Ordering of DRs according to base.  */
2749   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2750                                DR_BASE_ADDRESS (drb));
2751   if (cmp != 0)
2752     return cmp;
2753
2754   /* And according to DR_OFFSET.  */
2755   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2756   if (cmp != 0)
2757     return cmp;
2758
2759   /* Put reads before writes.  */
2760   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2761     return DR_IS_READ (dra) ? -1 : 1;
2762
2763   /* Then sort after access size.  */
2764   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2765                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2766   if (cmp != 0)
2767     return cmp;
2768
2769   /* And after step.  */
2770   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2771   if (cmp != 0)
2772     return cmp;
2773
2774   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2775   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2776   if (cmp == 0)
2777     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2778   return cmp;
2779 }
2780
2781 /* Function vect_analyze_data_ref_accesses.
2782
2783    Analyze the access pattern of all the data references in the loop.
2784
2785    FORNOW: the only access pattern that is considered vectorizable is a
2786            simple step 1 (consecutive) access.
2787
2788    FORNOW: handle only arrays and pointer accesses.  */
2789
2790 bool
2791 vect_analyze_data_ref_accesses (vec_info *vinfo)
2792 {
2793   unsigned int i;
2794   vec<data_reference_p> datarefs = vinfo->datarefs;
2795   struct data_reference *dr;
2796
2797   if (dump_enabled_p ())
2798     dump_printf_loc (MSG_NOTE, vect_location,
2799                      "=== vect_analyze_data_ref_accesses ===\n");
2800
2801   if (datarefs.is_empty ())
2802     return true;
2803
2804   /* Sort the array of datarefs to make building the interleaving chains
2805      linear.  Don't modify the original vector's order, it is needed for
2806      determining what dependencies are reversed.  */
2807   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2808   datarefs_copy.qsort (dr_group_sort_cmp);
2809
2810   /* Build the interleaving chains.  */
2811   for (i = 0; i < datarefs_copy.length () - 1;)
2812     {
2813       data_reference_p dra = datarefs_copy[i];
2814       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2815       stmt_vec_info lastinfo = NULL;
2816       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2817         {
2818           ++i;
2819           continue;
2820         }
2821       for (i = i + 1; i < datarefs_copy.length (); ++i)
2822         {
2823           data_reference_p drb = datarefs_copy[i];
2824           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2825           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2826             break;
2827
2828           /* ???  Imperfect sorting (non-compatible types, non-modulo
2829              accesses, same accesses) can lead to a group to be artificially
2830              split here as we don't just skip over those.  If it really
2831              matters we can push those to a worklist and re-iterate
2832              over them.  The we can just skip ahead to the next DR here.  */
2833
2834           /* DRs in a different loop should not be put into the same
2835              interleaving group.  */
2836           if (gimple_bb (DR_STMT (dra))->loop_father
2837               != gimple_bb (DR_STMT (drb))->loop_father)
2838             break;
2839
2840           /* Check that the data-refs have same first location (except init)
2841              and they are both either store or load (not load and store,
2842              not masked loads or stores).  */
2843           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2844               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2845                                         DR_BASE_ADDRESS (drb)) != 0
2846               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2847               || !gimple_assign_single_p (DR_STMT (dra))
2848               || !gimple_assign_single_p (DR_STMT (drb)))
2849             break;
2850
2851           /* Check that the data-refs have the same constant size.  */
2852           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2853           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2854           if (!tree_fits_uhwi_p (sza)
2855               || !tree_fits_uhwi_p (szb)
2856               || !tree_int_cst_equal (sza, szb))
2857             break;
2858
2859           /* Check that the data-refs have the same step.  */
2860           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2861             break;
2862
2863           /* Check the types are compatible.
2864              ???  We don't distinguish this during sorting.  */
2865           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2866                                    TREE_TYPE (DR_REF (drb))))
2867             break;
2868
2869           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2870           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2871           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2872           HOST_WIDE_INT init_prev
2873             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2874           gcc_assert (init_a <= init_b
2875                       && init_a <= init_prev
2876                       && init_prev <= init_b);
2877
2878           /* Do not place the same access in the interleaving chain twice.  */
2879           if (init_b == init_prev)
2880             {
2881               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2882                           < gimple_uid (DR_STMT (drb)));
2883               /* ???  For now we simply "drop" the later reference which is
2884                  otherwise the same rather than finishing off this group.
2885                  In the end we'd want to re-process duplicates forming
2886                  multiple groups from the refs, likely by just collecting
2887                  all candidates (including duplicates and split points
2888                  below) in a vector and then process them together.  */
2889               continue;
2890             }
2891
2892           /* If init_b == init_a + the size of the type * k, we have an
2893              interleaving, and DRA is accessed before DRB.  */
2894           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2895           if (type_size_a == 0
2896               || (init_b - init_a) % type_size_a != 0)
2897             break;
2898
2899           /* If we have a store, the accesses are adjacent.  This splits
2900              groups into chunks we support (we don't support vectorization
2901              of stores with gaps).  */
2902           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2903             break;
2904
2905           /* If the step (if not zero or non-constant) is greater than the
2906              difference between data-refs' inits this splits groups into
2907              suitable sizes.  */
2908           if (tree_fits_shwi_p (DR_STEP (dra)))
2909             {
2910               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2911               if (step != 0 && step <= (init_b - init_a))
2912                 break;
2913             }
2914
2915           if (dump_enabled_p ())
2916             {
2917               dump_printf_loc (MSG_NOTE, vect_location,
2918                                "Detected interleaving ");
2919               if (DR_IS_READ (dra))
2920                 dump_printf (MSG_NOTE, "load ");
2921               else
2922                 dump_printf (MSG_NOTE, "store ");
2923               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2924               dump_printf (MSG_NOTE,  " and ");
2925               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2926               dump_printf (MSG_NOTE, "\n");
2927             }
2928
2929           /* Link the found element into the group list.  */
2930           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2931             {
2932               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2933               lastinfo = stmtinfo_a;
2934             }
2935           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2936           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2937           lastinfo = stmtinfo_b;
2938         }
2939     }
2940
2941   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2942     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2943         && !vect_analyze_data_ref_access (dr))
2944       {
2945         if (dump_enabled_p ())
2946           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2947                            "not vectorized: complicated access pattern.\n");
2948
2949         if (is_a <bb_vec_info> (vinfo))
2950           {
2951             /* Mark the statement as not vectorizable.  */
2952             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2953             continue;
2954           }
2955         else
2956           {
2957             datarefs_copy.release ();
2958             return false;
2959           }
2960       }
2961
2962   datarefs_copy.release ();
2963   return true;
2964 }
2965
2966 /* Function vect_vfa_segment_size.
2967
2968    Create an expression that computes the size of segment
2969    that will be accessed for a data reference.  The functions takes into
2970    account that realignment loads may access one more vector.
2971
2972    Input:
2973      DR: The data reference.
2974      LENGTH_FACTOR: segment length to consider.
2975
2976    Return an expression whose value is the size of segment which will be
2977    accessed by DR.  */
2978
2979 static tree
2980 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2981 {
2982   tree segment_length;
2983
2984   if (integer_zerop (DR_STEP (dr)))
2985     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2986   else
2987     segment_length = size_binop (MULT_EXPR,
2988                                  fold_convert (sizetype, DR_STEP (dr)),
2989                                  fold_convert (sizetype, length_factor));
2990
2991   if (vect_supportable_dr_alignment (dr, false)
2992         == dr_explicit_realign_optimized)
2993     {
2994       tree vector_size = TYPE_SIZE_UNIT
2995                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2996
2997       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2998     }
2999   return segment_length;
3000 }
3001
3002 /* Function vect_no_alias_p.
3003
3004    Given data references A and B with equal base and offset, the alias
3005    relation can be decided at compilation time, return TRUE if they do
3006    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
3007    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3008    respectively.  */
3009
3010 static bool
3011 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3012                  tree segment_length_a, tree segment_length_b)
3013 {
3014   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3015               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3016   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3017     return false;
3018
3019   tree seg_a_min = DR_INIT (a);
3020   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3021                                 seg_a_min, segment_length_a);
3022   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3023      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3024      [a, a+12) */
3025   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3026     {
3027       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3028       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3029                                seg_a_max, unit_size);
3030       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3031                                DR_INIT (a), unit_size);
3032     }
3033   tree seg_b_min = DR_INIT (b);
3034   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3035                                 seg_b_min, segment_length_b);
3036   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3037     {
3038       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3039       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3040                                seg_b_max, unit_size);
3041       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3042                                DR_INIT (b), unit_size);
3043     }
3044
3045   if (tree_int_cst_le (seg_a_max, seg_b_min)
3046       || tree_int_cst_le (seg_b_max, seg_a_min))
3047     return true;
3048
3049   return false;
3050 }
3051
3052 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3053    in DDR is >= VF.  */
3054
3055 static bool
3056 dependence_distance_ge_vf (data_dependence_relation *ddr,
3057                            unsigned int loop_depth, poly_uint64 vf)
3058 {
3059   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3060       || DDR_NUM_DIST_VECTS (ddr) == 0)
3061     return false;
3062
3063   /* If the dependence is exact, we should have limited the VF instead.  */
3064   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3065
3066   unsigned int i;
3067   lambda_vector dist_v;
3068   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3069     {
3070       HOST_WIDE_INT dist = dist_v[loop_depth];
3071       if (dist != 0
3072           && !(dist > 0 && DDR_REVERSED_P (ddr))
3073           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3074         return false;
3075     }
3076
3077   if (dump_enabled_p ())
3078     {
3079       dump_printf_loc (MSG_NOTE, vect_location,
3080                        "dependence distance between ");
3081       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3082       dump_printf (MSG_NOTE,  " and ");
3083       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3084       dump_printf (MSG_NOTE,  " is >= VF\n");
3085     }
3086
3087   return true;
3088 }
3089
3090 /* Function vect_prune_runtime_alias_test_list.
3091
3092    Prune a list of ddrs to be tested at run-time by versioning for alias.
3093    Merge several alias checks into one if possible.
3094    Return FALSE if resulting list of ddrs is longer then allowed by
3095    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3096
3097 bool
3098 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3099 {
3100   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3101   hash_set <tree_pair_hash> compared_objects;
3102
3103   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3104   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3105     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3106   vec<vec_object_pair> &check_unequal_addrs
3107     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3108   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3109   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3110
3111   ddr_p ddr;
3112   unsigned int i;
3113   tree length_factor;
3114
3115   if (dump_enabled_p ())
3116     dump_printf_loc (MSG_NOTE, vect_location,
3117                      "=== vect_prune_runtime_alias_test_list ===\n");
3118
3119   if (may_alias_ddrs.is_empty ())
3120     return true;
3121
3122   comp_alias_ddrs.create (may_alias_ddrs.length ());
3123
3124   unsigned int loop_depth
3125     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3126                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3127
3128   /* First, we collect all data ref pairs for aliasing checks.  */
3129   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3130     {
3131       int comp_res;
3132       struct data_reference *dr_a, *dr_b;
3133       gimple *dr_group_first_a, *dr_group_first_b;
3134       tree segment_length_a, segment_length_b;
3135       gimple *stmt_a, *stmt_b;
3136
3137       /* Ignore the alias if the VF we chose ended up being no greater
3138          than the dependence distance.  */
3139       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3140         continue;
3141
3142       if (DDR_OBJECT_A (ddr))
3143         {
3144           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3145           if (!compared_objects.add (new_pair))
3146             {
3147               if (dump_enabled_p ())
3148                 {
3149                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3150                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3151                   dump_printf (MSG_NOTE, " and ");
3152                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3153                   dump_printf (MSG_NOTE, " have different addresses\n");
3154                 }
3155               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3156             }
3157           continue;
3158         }
3159
3160       dr_a = DDR_A (ddr);
3161       stmt_a = DR_STMT (DDR_A (ddr));
3162       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3163       if (dr_group_first_a)
3164         {
3165           stmt_a = dr_group_first_a;
3166           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3167         }
3168
3169       dr_b = DDR_B (ddr);
3170       stmt_b = DR_STMT (DDR_B (ddr));
3171       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3172       if (dr_group_first_b)
3173         {
3174           stmt_b = dr_group_first_b;
3175           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3176         }
3177
3178       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3179         length_factor = scalar_loop_iters;
3180       else
3181         length_factor = size_int (vect_factor);
3182       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3183       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3184
3185       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3186                                         DR_BASE_ADDRESS (dr_b));
3187       if (comp_res == 0)
3188         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3189                                           DR_OFFSET (dr_b));
3190
3191       /* Alias is known at compilation time.  */
3192       if (comp_res == 0
3193           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3194           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3195           && TREE_CODE (segment_length_a) == INTEGER_CST
3196           && TREE_CODE (segment_length_b) == INTEGER_CST)
3197         {
3198           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3199             continue;
3200
3201           if (dump_enabled_p ())
3202             dump_printf_loc (MSG_NOTE, vect_location,
3203                              "not vectorized: compilation time alias.\n");
3204
3205           return false;
3206         }
3207
3208       dr_with_seg_len_pair_t dr_with_seg_len_pair
3209           (dr_with_seg_len (dr_a, segment_length_a),
3210            dr_with_seg_len (dr_b, segment_length_b));
3211
3212       /* Canonicalize pairs by sorting the two DR members.  */
3213       if (comp_res > 0)
3214         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3215
3216       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3217     }
3218
3219   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3220
3221   unsigned int count = (comp_alias_ddrs.length ()
3222                         + check_unequal_addrs.length ());
3223   dump_printf_loc (MSG_NOTE, vect_location,
3224                    "improved number of alias checks from %d to %d\n",
3225                    may_alias_ddrs.length (), count);
3226   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3227     {
3228       if (dump_enabled_p ())
3229         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230                          "number of versioning for alias "
3231                          "run-time tests exceeds %d "
3232                          "(--param vect-max-version-for-alias-checks)\n",
3233                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3234       return false;
3235     }
3236
3237   return true;
3238 }
3239
3240 /* Return true if a non-affine read or write in STMT is suitable for a
3241    gather load or scatter store.  Describe the operation in *INFO if so.  */
3242
3243 bool
3244 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3245                            gather_scatter_info *info)
3246 {
3247   HOST_WIDE_INT scale = 1;
3248   poly_int64 pbitpos, pbitsize;
3249   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3250   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3251   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3252   tree offtype = NULL_TREE;
3253   tree decl, base, off;
3254   machine_mode pmode;
3255   int punsignedp, reversep, pvolatilep = 0;
3256
3257   base = DR_REF (dr);
3258   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3259      see if we can use the def stmt of the address.  */
3260   if (is_gimple_call (stmt)
3261       && gimple_call_internal_p (stmt)
3262       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3263           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3264       && TREE_CODE (base) == MEM_REF
3265       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3266       && integer_zerop (TREE_OPERAND (base, 1))
3267       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3268     {
3269       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3270       if (is_gimple_assign (def_stmt)
3271           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3272         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3273     }
3274
3275   /* The gather and scatter builtins need address of the form
3276      loop_invariant + vector * {1, 2, 4, 8}
3277      or
3278      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3279      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3280      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3281      multiplications and additions in it.  To get a vector, we need
3282      a single SSA_NAME that will be defined in the loop and will
3283      contain everything that is not loop invariant and that can be
3284      vectorized.  The following code attempts to find such a preexistng
3285      SSA_NAME OFF and put the loop invariants into a tree BASE
3286      that can be gimplified before the loop.  */
3287   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3288                               &punsignedp, &reversep, &pvolatilep);
3289   gcc_assert (base && !reversep);
3290   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3291
3292   if (TREE_CODE (base) == MEM_REF)
3293     {
3294       if (!integer_zerop (TREE_OPERAND (base, 1)))
3295         {
3296           if (off == NULL_TREE)
3297             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3298           else
3299             off = size_binop (PLUS_EXPR, off,
3300                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3301         }
3302       base = TREE_OPERAND (base, 0);
3303     }
3304   else
3305     base = build_fold_addr_expr (base);
3306
3307   if (off == NULL_TREE)
3308     off = size_zero_node;
3309
3310   /* If base is not loop invariant, either off is 0, then we start with just
3311      the constant offset in the loop invariant BASE and continue with base
3312      as OFF, otherwise give up.
3313      We could handle that case by gimplifying the addition of base + off
3314      into some SSA_NAME and use that as off, but for now punt.  */
3315   if (!expr_invariant_in_loop_p (loop, base))
3316     {
3317       if (!integer_zerop (off))
3318         return false;
3319       off = base;
3320       base = size_int (pbytepos);
3321     }
3322   /* Otherwise put base + constant offset into the loop invariant BASE
3323      and continue with OFF.  */
3324   else
3325     {
3326       base = fold_convert (sizetype, base);
3327       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3328     }
3329
3330   /* OFF at this point may be either a SSA_NAME or some tree expression
3331      from get_inner_reference.  Try to peel off loop invariants from it
3332      into BASE as long as possible.  */
3333   STRIP_NOPS (off);
3334   while (offtype == NULL_TREE)
3335     {
3336       enum tree_code code;
3337       tree op0, op1, add = NULL_TREE;
3338
3339       if (TREE_CODE (off) == SSA_NAME)
3340         {
3341           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3342
3343           if (expr_invariant_in_loop_p (loop, off))
3344             return false;
3345
3346           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3347             break;
3348
3349           op0 = gimple_assign_rhs1 (def_stmt);
3350           code = gimple_assign_rhs_code (def_stmt);
3351           op1 = gimple_assign_rhs2 (def_stmt);
3352         }
3353       else
3354         {
3355           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3356             return false;
3357           code = TREE_CODE (off);
3358           extract_ops_from_tree (off, &code, &op0, &op1);
3359         }
3360       switch (code)
3361         {
3362         case POINTER_PLUS_EXPR:
3363         case PLUS_EXPR:
3364           if (expr_invariant_in_loop_p (loop, op0))
3365             {
3366               add = op0;
3367               off = op1;
3368             do_add:
3369               add = fold_convert (sizetype, add);
3370               if (scale != 1)
3371                 add = size_binop (MULT_EXPR, add, size_int (scale));
3372               base = size_binop (PLUS_EXPR, base, add);
3373               continue;
3374             }
3375           if (expr_invariant_in_loop_p (loop, op1))
3376             {
3377               add = op1;
3378               off = op0;
3379               goto do_add;
3380             }
3381           break;
3382         case MINUS_EXPR:
3383           if (expr_invariant_in_loop_p (loop, op1))
3384             {
3385               add = fold_convert (sizetype, op1);
3386               add = size_binop (MINUS_EXPR, size_zero_node, add);
3387               off = op0;
3388               goto do_add;
3389             }
3390           break;
3391         case MULT_EXPR:
3392           if (scale == 1 && tree_fits_shwi_p (op1))
3393             {
3394               scale = tree_to_shwi (op1);
3395               off = op0;
3396               continue;
3397             }
3398           break;
3399         case SSA_NAME:
3400           off = op0;
3401           continue;
3402         CASE_CONVERT:
3403           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3404               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3405             break;
3406           if (TYPE_PRECISION (TREE_TYPE (op0))
3407               == TYPE_PRECISION (TREE_TYPE (off)))
3408             {
3409               off = op0;
3410               continue;
3411             }
3412           if (TYPE_PRECISION (TREE_TYPE (op0))
3413               < TYPE_PRECISION (TREE_TYPE (off)))
3414             {
3415               off = op0;
3416               offtype = TREE_TYPE (off);
3417               STRIP_NOPS (off);
3418               continue;
3419             }
3420           break;
3421         default:
3422           break;
3423         }
3424       break;
3425     }
3426
3427   /* If at the end OFF still isn't a SSA_NAME or isn't
3428      defined in the loop, punt.  */
3429   if (TREE_CODE (off) != SSA_NAME
3430       || expr_invariant_in_loop_p (loop, off))
3431     return false;
3432
3433   if (offtype == NULL_TREE)
3434     offtype = TREE_TYPE (off);
3435
3436   if (DR_IS_READ (dr))
3437     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3438                                              offtype, scale);
3439   else
3440     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3441                                               offtype, scale);
3442
3443   if (decl == NULL_TREE)
3444     return false;
3445
3446   info->decl = decl;
3447   info->base = base;
3448   info->offset = off;
3449   info->offset_dt = vect_unknown_def_type;
3450   info->offset_vectype = NULL_TREE;
3451   info->scale = scale;
3452   return true;
3453 }
3454
3455 /* Function vect_analyze_data_refs.
3456
3457   Find all the data references in the loop or basic block.
3458
3459    The general structure of the analysis of data refs in the vectorizer is as
3460    follows:
3461    1- vect_analyze_data_refs(loop/bb): call
3462       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3463       in the loop/bb and their dependences.
3464    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3465    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3466    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3467
3468 */
3469
3470 bool
3471 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3472 {
3473   struct loop *loop = NULL;
3474   unsigned int i;
3475   struct data_reference *dr;
3476   tree scalar_type;
3477
3478   if (dump_enabled_p ())
3479     dump_printf_loc (MSG_NOTE, vect_location,
3480                      "=== vect_analyze_data_refs ===\n");
3481
3482   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3483     loop = LOOP_VINFO_LOOP (loop_vinfo);
3484
3485   /* Go through the data-refs, check that the analysis succeeded.  Update
3486      pointer from stmt_vec_info struct to DR and vectype.  */
3487
3488   vec<data_reference_p> datarefs = vinfo->datarefs;
3489   FOR_EACH_VEC_ELT (datarefs, i, dr)
3490     {
3491       gimple *stmt;
3492       stmt_vec_info stmt_info;
3493       tree base, offset, init;
3494       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3495       bool simd_lane_access = false;
3496       poly_uint64 vf;
3497
3498 again:
3499       if (!dr || !DR_REF (dr))
3500         {
3501           if (dump_enabled_p ())
3502             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3503                              "not vectorized: unhandled data-ref\n");
3504           return false;
3505         }
3506
3507       stmt = DR_STMT (dr);
3508       stmt_info = vinfo_for_stmt (stmt);
3509
3510       /* Discard clobbers from the dataref vector.  We will remove
3511          clobber stmts during vectorization.  */
3512       if (gimple_clobber_p (stmt))
3513         {
3514           free_data_ref (dr);
3515           if (i == datarefs.length () - 1)
3516             {
3517               datarefs.pop ();
3518               break;
3519             }
3520           datarefs.ordered_remove (i);
3521           dr = datarefs[i];
3522           goto again;
3523         }
3524
3525       /* Check that analysis of the data-ref succeeded.  */
3526       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3527           || !DR_STEP (dr))
3528         {
3529           bool maybe_gather
3530             = DR_IS_READ (dr)
3531               && !TREE_THIS_VOLATILE (DR_REF (dr))
3532               && targetm.vectorize.builtin_gather != NULL;
3533           bool maybe_scatter
3534             = DR_IS_WRITE (dr)
3535               && !TREE_THIS_VOLATILE (DR_REF (dr))
3536               && targetm.vectorize.builtin_scatter != NULL;
3537           bool maybe_simd_lane_access
3538             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3539
3540           /* If target supports vector gather loads or scatter stores, or if
3541              this might be a SIMD lane access, see if they can't be used.  */
3542           if (is_a <loop_vec_info> (vinfo)
3543               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3544               && !nested_in_vect_loop_p (loop, stmt))
3545             {
3546               struct data_reference *newdr
3547                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3548                                    DR_REF (dr), stmt, !maybe_scatter,
3549                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3550               gcc_assert (newdr != NULL && DR_REF (newdr));
3551               if (DR_BASE_ADDRESS (newdr)
3552                   && DR_OFFSET (newdr)
3553                   && DR_INIT (newdr)
3554                   && DR_STEP (newdr)
3555                   && integer_zerop (DR_STEP (newdr)))
3556                 {
3557                   if (maybe_simd_lane_access)
3558                     {
3559                       tree off = DR_OFFSET (newdr);
3560                       STRIP_NOPS (off);
3561                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3562                           && TREE_CODE (off) == MULT_EXPR
3563                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3564                         {
3565                           tree step = TREE_OPERAND (off, 1);
3566                           off = TREE_OPERAND (off, 0);
3567                           STRIP_NOPS (off);
3568                           if (CONVERT_EXPR_P (off)
3569                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3570                                                                           0)))
3571                                  < TYPE_PRECISION (TREE_TYPE (off)))
3572                             off = TREE_OPERAND (off, 0);
3573                           if (TREE_CODE (off) == SSA_NAME)
3574                             {
3575                               gimple *def = SSA_NAME_DEF_STMT (off);
3576                               tree reft = TREE_TYPE (DR_REF (newdr));
3577                               if (is_gimple_call (def)
3578                                   && gimple_call_internal_p (def)
3579                                   && (gimple_call_internal_fn (def)
3580                                       == IFN_GOMP_SIMD_LANE))
3581                                 {
3582                                   tree arg = gimple_call_arg (def, 0);
3583                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3584                                   arg = SSA_NAME_VAR (arg);
3585                                   if (arg == loop->simduid
3586                                       /* For now.  */
3587                                       && tree_int_cst_equal
3588                                            (TYPE_SIZE_UNIT (reft),
3589                                             step))
3590                                     {
3591                                       DR_OFFSET (newdr) = ssize_int (0);
3592                                       DR_STEP (newdr) = step;
3593                                       DR_OFFSET_ALIGNMENT (newdr)
3594                                         = BIGGEST_ALIGNMENT;
3595                                       DR_STEP_ALIGNMENT (newdr)
3596                                         = highest_pow2_factor (step);
3597                                       dr = newdr;
3598                                       simd_lane_access = true;
3599                                     }
3600                                 }
3601                             }
3602                         }
3603                     }
3604                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3605                     {
3606                       dr = newdr;
3607                       if (maybe_gather)
3608                         gatherscatter = GATHER;
3609                       else
3610                         gatherscatter = SCATTER;
3611                     }
3612                 }
3613               if (gatherscatter == SG_NONE && !simd_lane_access)
3614                 free_data_ref (newdr);
3615             }
3616
3617           if (gatherscatter == SG_NONE && !simd_lane_access)
3618             {
3619               if (dump_enabled_p ())
3620                 {
3621                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622                                    "not vectorized: data ref analysis "
3623                                    "failed ");
3624                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3625                 }
3626
3627               if (is_a <bb_vec_info> (vinfo))
3628                 break;
3629
3630               return false;
3631             }
3632         }
3633
3634       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3635         {
3636           if (dump_enabled_p ())
3637             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3638                              "not vectorized: base addr of dr is a "
3639                              "constant\n");
3640
3641           if (is_a <bb_vec_info> (vinfo))
3642             break;
3643
3644           if (gatherscatter != SG_NONE || simd_lane_access)
3645             free_data_ref (dr);
3646           return false;
3647         }
3648
3649       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3650         {
3651           if (dump_enabled_p ())
3652             {
3653               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3654                                "not vectorized: volatile type ");
3655               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3656             }
3657
3658           if (is_a <bb_vec_info> (vinfo))
3659             break;
3660
3661           return false;
3662         }
3663
3664       if (stmt_can_throw_internal (stmt))
3665         {
3666           if (dump_enabled_p ())
3667             {
3668               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3669                                "not vectorized: statement can throw an "
3670                                "exception ");
3671               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3672             }
3673
3674           if (is_a <bb_vec_info> (vinfo))
3675             break;
3676
3677           if (gatherscatter != SG_NONE || simd_lane_access)
3678             free_data_ref (dr);
3679           return false;
3680         }
3681
3682       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3683           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3684         {
3685           if (dump_enabled_p ())
3686             {
3687               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688                                "not vectorized: statement is bitfield "
3689                                "access ");
3690               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3691             }
3692
3693           if (is_a <bb_vec_info> (vinfo))
3694             break;
3695
3696           if (gatherscatter != SG_NONE || simd_lane_access)
3697             free_data_ref (dr);
3698           return false;
3699         }
3700
3701       base = unshare_expr (DR_BASE_ADDRESS (dr));
3702       offset = unshare_expr (DR_OFFSET (dr));
3703       init = unshare_expr (DR_INIT (dr));
3704
3705       if (is_gimple_call (stmt)
3706           && (!gimple_call_internal_p (stmt)
3707               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3708                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3709         {
3710           if (dump_enabled_p ())
3711             {
3712               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3713                                "not vectorized: dr in a call ");
3714               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3715             }
3716
3717           if (is_a <bb_vec_info> (vinfo))
3718             break;
3719
3720           if (gatherscatter != SG_NONE || simd_lane_access)
3721             free_data_ref (dr);
3722           return false;
3723         }
3724
3725       /* Update DR field in stmt_vec_info struct.  */
3726
3727       /* If the dataref is in an inner-loop of the loop that is considered for
3728          for vectorization, we also want to analyze the access relative to
3729          the outer-loop (DR contains information only relative to the
3730          inner-most enclosing loop).  We do that by building a reference to the
3731          first location accessed by the inner-loop, and analyze it relative to
3732          the outer-loop.  */
3733       if (loop && nested_in_vect_loop_p (loop, stmt))
3734         {
3735           /* Build a reference to the first location accessed by the
3736              inner loop: *(BASE + INIT + OFFSET).  By construction,
3737              this address must be invariant in the inner loop, so we
3738              can consider it as being used in the outer loop.  */
3739           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3740                                           init, offset);
3741           tree init_addr = fold_build_pointer_plus (base, init_offset);
3742           tree init_ref = build_fold_indirect_ref (init_addr);
3743
3744           if (dump_enabled_p ())
3745             {
3746               dump_printf_loc (MSG_NOTE, vect_location,
3747                                "analyze in outer loop: ");
3748               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3749               dump_printf (MSG_NOTE, "\n");
3750             }
3751
3752           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3753                                      init_ref, loop))
3754             /* dr_analyze_innermost already explained the failure.  */
3755             return false;
3756
3757           if (dump_enabled_p ())
3758             {
3759               dump_printf_loc (MSG_NOTE, vect_location,
3760                                "\touter base_address: ");
3761               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3762                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3763               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3764               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3765                                  STMT_VINFO_DR_OFFSET (stmt_info));
3766               dump_printf (MSG_NOTE,
3767                            "\n\touter constant offset from base address: ");
3768               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3769                                  STMT_VINFO_DR_INIT (stmt_info));
3770               dump_printf (MSG_NOTE, "\n\touter step: ");
3771               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3772                                  STMT_VINFO_DR_STEP (stmt_info));
3773               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3774                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3775               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3776                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3777               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3778                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3779               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3780                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3781             }
3782         }
3783
3784       if (STMT_VINFO_DATA_REF (stmt_info))
3785         {
3786           if (dump_enabled_p ())
3787             {
3788               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789                                "not vectorized: more than one data ref "
3790                                "in stmt: ");
3791               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792             }
3793
3794           if (is_a <bb_vec_info> (vinfo))
3795             break;
3796
3797           if (gatherscatter != SG_NONE || simd_lane_access)
3798             free_data_ref (dr);
3799           return false;
3800         }
3801
3802       STMT_VINFO_DATA_REF (stmt_info) = dr;
3803       if (simd_lane_access)
3804         {
3805           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3806           free_data_ref (datarefs[i]);
3807           datarefs[i] = dr;
3808         }
3809
3810       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3811           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3812           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3813         {
3814           if (dump_enabled_p ())
3815             {
3816               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3817                                "not vectorized: base object not addressable "
3818                                "for stmt: ");
3819               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3820             }
3821           if (is_a <bb_vec_info> (vinfo))
3822             {
3823               /* In BB vectorization the ref can still participate
3824                  in dependence analysis, we just can't vectorize it.  */
3825               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3826               continue;
3827             }
3828           return false;
3829         }
3830
3831       /* Set vectype for STMT.  */
3832       scalar_type = TREE_TYPE (DR_REF (dr));
3833       STMT_VINFO_VECTYPE (stmt_info)
3834         = get_vectype_for_scalar_type (scalar_type);
3835       if (!STMT_VINFO_VECTYPE (stmt_info))
3836         {
3837           if (dump_enabled_p ())
3838             {
3839               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3840                                "not vectorized: no vectype for stmt: ");
3841               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3842               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3843               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3844                                  scalar_type);
3845               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3846             }
3847
3848           if (is_a <bb_vec_info> (vinfo))
3849             {
3850               /* No vector type is fine, the ref can still participate
3851                  in dependence analysis, we just can't vectorize it.  */
3852               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3853               continue;
3854             }
3855
3856           if (gatherscatter != SG_NONE || simd_lane_access)
3857             {
3858               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3859               if (gatherscatter != SG_NONE)
3860                 free_data_ref (dr);
3861             }
3862           return false;
3863         }
3864       else
3865         {
3866           if (dump_enabled_p ())
3867             {
3868               dump_printf_loc (MSG_NOTE, vect_location,
3869                                "got vectype for stmt: ");
3870               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3871               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3872                                  STMT_VINFO_VECTYPE (stmt_info));
3873               dump_printf (MSG_NOTE, "\n");
3874             }
3875         }
3876
3877       /* Adjust the minimal vectorization factor according to the
3878          vector type.  */
3879       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3880       *min_vf = upper_bound (*min_vf, vf);
3881
3882       if (gatherscatter != SG_NONE)
3883         {
3884           gather_scatter_info gs_info;
3885           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3886                                           &gs_info)
3887               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3888             {
3889               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3890               free_data_ref (dr);
3891               if (dump_enabled_p ())
3892                 {
3893                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3894                                    (gatherscatter == GATHER) ?
3895                                    "not vectorized: not suitable for gather "
3896                                    "load " :
3897                                    "not vectorized: not suitable for scatter "
3898                                    "store ");
3899                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3900                 }
3901               return false;
3902             }
3903
3904           free_data_ref (datarefs[i]);
3905           datarefs[i] = dr;
3906           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3907         }
3908
3909       else if (is_a <loop_vec_info> (vinfo)
3910                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3911         {
3912           if (nested_in_vect_loop_p (loop, stmt))
3913             {
3914               if (dump_enabled_p ())
3915                 {
3916                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3917                                    "not vectorized: not suitable for strided "
3918                                    "load ");
3919                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3920                 }
3921               return false;
3922             }
3923           STMT_VINFO_STRIDED_P (stmt_info) = true;
3924         }
3925     }
3926
3927   /* If we stopped analysis at the first dataref we could not analyze
3928      when trying to vectorize a basic-block mark the rest of the datarefs
3929      as not vectorizable and truncate the vector of datarefs.  That
3930      avoids spending useless time in analyzing their dependence.  */
3931   if (i != datarefs.length ())
3932     {
3933       gcc_assert (is_a <bb_vec_info> (vinfo));
3934       for (unsigned j = i; j < datarefs.length (); ++j)
3935         {
3936           data_reference_p dr = datarefs[j];
3937           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3938           free_data_ref (dr);
3939         }
3940       datarefs.truncate (i);
3941     }
3942
3943   return true;
3944 }
3945
3946
3947 /* Function vect_get_new_vect_var.
3948
3949    Returns a name for a new variable.  The current naming scheme appends the
3950    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3951    the name of vectorizer generated variables, and appends that to NAME if
3952    provided.  */
3953
3954 tree
3955 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3956 {
3957   const char *prefix;
3958   tree new_vect_var;
3959
3960   switch (var_kind)
3961   {
3962   case vect_simple_var:
3963     prefix = "vect";
3964     break;
3965   case vect_scalar_var:
3966     prefix = "stmp";
3967     break;
3968   case vect_mask_var:
3969     prefix = "mask";
3970     break;
3971   case vect_pointer_var:
3972     prefix = "vectp";
3973     break;
3974   default:
3975     gcc_unreachable ();
3976   }
3977
3978   if (name)
3979     {
3980       char* tmp = concat (prefix, "_", name, NULL);
3981       new_vect_var = create_tmp_reg (type, tmp);
3982       free (tmp);
3983     }
3984   else
3985     new_vect_var = create_tmp_reg (type, prefix);
3986
3987   return new_vect_var;
3988 }
3989
3990 /* Like vect_get_new_vect_var but return an SSA name.  */
3991
3992 tree
3993 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3994 {
3995   const char *prefix;
3996   tree new_vect_var;
3997
3998   switch (var_kind)
3999   {
4000   case vect_simple_var:
4001     prefix = "vect";
4002     break;
4003   case vect_scalar_var:
4004     prefix = "stmp";
4005     break;
4006   case vect_pointer_var:
4007     prefix = "vectp";
4008     break;
4009   default:
4010     gcc_unreachable ();
4011   }
4012
4013   if (name)
4014     {
4015       char* tmp = concat (prefix, "_", name, NULL);
4016       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4017       free (tmp);
4018     }
4019   else
4020     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4021
4022   return new_vect_var;
4023 }
4024
4025 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4026
4027 static void
4028 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4029 {
4030   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4031   int misalign = DR_MISALIGNMENT (dr);
4032   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4033     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4034   else
4035     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4036                             DR_TARGET_ALIGNMENT (dr), misalign);
4037 }
4038
4039 /* Function vect_create_addr_base_for_vector_ref.
4040
4041    Create an expression that computes the address of the first memory location
4042    that will be accessed for a data reference.
4043
4044    Input:
4045    STMT: The statement containing the data reference.
4046    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4047    OFFSET: Optional. If supplied, it is be added to the initial address.
4048    LOOP:    Specify relative to which loop-nest should the address be computed.
4049             For example, when the dataref is in an inner-loop nested in an
4050             outer-loop that is now being vectorized, LOOP can be either the
4051             outer-loop, or the inner-loop.  The first memory location accessed
4052             by the following dataref ('in' points to short):
4053
4054                 for (i=0; i<N; i++)
4055                    for (j=0; j<M; j++)
4056                      s += in[i+j]
4057
4058             is as follows:
4059             if LOOP=i_loop:     &in             (relative to i_loop)
4060             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4061    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4062             initial address.  Unlike OFFSET, which is number of elements to
4063             be added, BYTE_OFFSET is measured in bytes.
4064
4065    Output:
4066    1. Return an SSA_NAME whose value is the address of the memory location of
4067       the first vector of the data reference.
4068    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4069       these statement(s) which define the returned SSA_NAME.
4070
4071    FORNOW: We are only handling array accesses with step 1.  */
4072
4073 tree
4074 vect_create_addr_base_for_vector_ref (gimple *stmt,
4075                                       gimple_seq *new_stmt_list,
4076                                       tree offset,
4077                                       tree byte_offset)
4078 {
4079   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4080   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4081   const char *base_name;
4082   tree addr_base;
4083   tree dest;
4084   gimple_seq seq = NULL;
4085   tree vect_ptr_type;
4086   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4087   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4088   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4089
4090   tree data_ref_base = unshare_expr (drb->base_address);
4091   tree base_offset = unshare_expr (drb->offset);
4092   tree init = unshare_expr (drb->init);
4093
4094   if (loop_vinfo)
4095     base_name = get_name (data_ref_base);
4096   else
4097     {
4098       base_offset = ssize_int (0);
4099       init = ssize_int (0);
4100       base_name = get_name (DR_REF (dr));
4101     }
4102
4103   /* Create base_offset */
4104   base_offset = size_binop (PLUS_EXPR,
4105                             fold_convert (sizetype, base_offset),
4106                             fold_convert (sizetype, init));
4107
4108   if (offset)
4109     {
4110       offset = fold_build2 (MULT_EXPR, sizetype,
4111                             fold_convert (sizetype, offset), step);
4112       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4113                                  base_offset, offset);
4114     }
4115   if (byte_offset)
4116     {
4117       byte_offset = fold_convert (sizetype, byte_offset);
4118       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4119                                  base_offset, byte_offset);
4120     }
4121
4122   /* base + base_offset */
4123   if (loop_vinfo)
4124     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4125   else
4126     {
4127       addr_base = build1 (ADDR_EXPR,
4128                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4129                           unshare_expr (DR_REF (dr)));
4130     }
4131
4132   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4133   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4134   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4135   gimple_seq_add_seq (new_stmt_list, seq);
4136
4137   if (DR_PTR_INFO (dr)
4138       && TREE_CODE (addr_base) == SSA_NAME
4139       && !SSA_NAME_PTR_INFO (addr_base))
4140     {
4141       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4142       if (offset || byte_offset)
4143         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4144     }
4145
4146   if (dump_enabled_p ())
4147     {
4148       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4149       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4150       dump_printf (MSG_NOTE, "\n");
4151     }
4152
4153   return addr_base;
4154 }
4155
4156
4157 /* Function vect_create_data_ref_ptr.
4158
4159    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4160    location accessed in the loop by STMT, along with the def-use update
4161    chain to appropriately advance the pointer through the loop iterations.
4162    Also set aliasing information for the pointer.  This pointer is used by
4163    the callers to this function to create a memory reference expression for
4164    vector load/store access.
4165
4166    Input:
4167    1. STMT: a stmt that references memory. Expected to be of the form
4168          GIMPLE_ASSIGN <name, data-ref> or
4169          GIMPLE_ASSIGN <data-ref, name>.
4170    2. AGGR_TYPE: the type of the reference, which should be either a vector
4171         or an array.
4172    3. AT_LOOP: the loop where the vector memref is to be created.
4173    4. OFFSET (optional): an offset to be added to the initial address accessed
4174         by the data-ref in STMT.
4175    5. BSI: location where the new stmts are to be placed if there is no loop
4176    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4177         pointing to the initial address.
4178    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4179         to the initial address accessed by the data-ref in STMT.  This is
4180         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4181         in bytes.
4182
4183    Output:
4184    1. Declare a new ptr to vector_type, and have it point to the base of the
4185       data reference (initial addressed accessed by the data reference).
4186       For example, for vector of type V8HI, the following code is generated:
4187
4188       v8hi *ap;
4189       ap = (v8hi *)initial_address;
4190
4191       if OFFSET is not supplied:
4192          initial_address = &a[init];
4193       if OFFSET is supplied:
4194          initial_address = &a[init + OFFSET];
4195       if BYTE_OFFSET is supplied:
4196          initial_address = &a[init] + BYTE_OFFSET;
4197
4198       Return the initial_address in INITIAL_ADDRESS.
4199
4200    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4201       update the pointer in each iteration of the loop.
4202
4203       Return the increment stmt that updates the pointer in PTR_INCR.
4204
4205    3. Set INV_P to true if the access pattern of the data reference in the
4206       vectorized loop is invariant.  Set it to false otherwise.
4207
4208    4. Return the pointer.  */
4209
4210 tree
4211 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4212                           tree offset, tree *initial_address,
4213                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4214                           bool only_init, bool *inv_p, tree byte_offset)
4215 {
4216   const char *base_name;
4217   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4218   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4219   struct loop *loop = NULL;
4220   bool nested_in_vect_loop = false;
4221   struct loop *containing_loop = NULL;
4222   tree aggr_ptr_type;
4223   tree aggr_ptr;
4224   tree new_temp;
4225   gimple_seq new_stmt_list = NULL;
4226   edge pe = NULL;
4227   basic_block new_bb;
4228   tree aggr_ptr_init;
4229   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4230   tree aptr;
4231   gimple_stmt_iterator incr_gsi;
4232   bool insert_after;
4233   tree indx_before_incr, indx_after_incr;
4234   gimple *incr;
4235   tree step;
4236   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4237
4238   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4239               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4240
4241   if (loop_vinfo)
4242     {
4243       loop = LOOP_VINFO_LOOP (loop_vinfo);
4244       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4245       containing_loop = (gimple_bb (stmt))->loop_father;
4246       pe = loop_preheader_edge (loop);
4247     }
4248   else
4249     {
4250       gcc_assert (bb_vinfo);
4251       only_init = true;
4252       *ptr_incr = NULL;
4253     }
4254
4255   /* Check the step (evolution) of the load in LOOP, and record
4256      whether it's invariant.  */
4257   step = vect_dr_behavior (dr)->step;
4258   if (integer_zerop (step))
4259     *inv_p = true;
4260   else
4261     *inv_p = false;
4262
4263   /* Create an expression for the first address accessed by this load
4264      in LOOP.  */
4265   base_name = get_name (DR_BASE_ADDRESS (dr));
4266
4267   if (dump_enabled_p ())
4268     {
4269       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4270       dump_printf_loc (MSG_NOTE, vect_location,
4271                        "create %s-pointer variable to type: ",
4272                        get_tree_code_name (TREE_CODE (aggr_type)));
4273       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4274       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4275         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4276       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4277         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4278       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4279         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4280       else
4281         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4282       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4283       dump_printf (MSG_NOTE, "\n");
4284     }
4285
4286   /* (1) Create the new aggregate-pointer variable.
4287      Vector and array types inherit the alias set of their component
4288      type by default so we need to use a ref-all pointer if the data
4289      reference does not conflict with the created aggregated data
4290      reference because it is not addressable.  */
4291   bool need_ref_all = false;
4292   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4293                               get_alias_set (DR_REF (dr))))
4294     need_ref_all = true;
4295   /* Likewise for any of the data references in the stmt group.  */
4296   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4297     {
4298       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4299       do
4300         {
4301           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4302           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4303           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4304                                       get_alias_set (DR_REF (sdr))))
4305             {
4306               need_ref_all = true;
4307               break;
4308             }
4309           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4310         }
4311       while (orig_stmt);
4312     }
4313   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4314                                                need_ref_all);
4315   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4316
4317
4318   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4319      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4320      def-use update cycles for the pointer: one relative to the outer-loop
4321      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4322      to the inner-loop (which is the inner-most loop containing the dataref),
4323      and this is done be step (5) below.
4324
4325      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4326      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4327      redundant.  Steps (3),(4) create the following:
4328
4329         vp0 = &base_addr;
4330         LOOP:   vp1 = phi(vp0,vp2)
4331                 ...
4332                 ...
4333                 vp2 = vp1 + step
4334                 goto LOOP
4335
4336      If there is an inner-loop nested in loop, then step (5) will also be
4337      applied, and an additional update in the inner-loop will be created:
4338
4339         vp0 = &base_addr;
4340         LOOP:   vp1 = phi(vp0,vp2)
4341                 ...
4342         inner:     vp3 = phi(vp1,vp4)
4343                    vp4 = vp3 + inner_step
4344                    if () goto inner
4345                 ...
4346                 vp2 = vp1 + step
4347                 if () goto LOOP   */
4348
4349   /* (2) Calculate the initial address of the aggregate-pointer, and set
4350      the aggregate-pointer to point to it before the loop.  */
4351
4352   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4353
4354   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4355                                                    offset, byte_offset);
4356   if (new_stmt_list)
4357     {
4358       if (pe)
4359         {
4360           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4361           gcc_assert (!new_bb);
4362         }
4363       else
4364         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4365     }
4366
4367   *initial_address = new_temp;
4368   aggr_ptr_init = new_temp;
4369
4370   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4371      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4372      inner-loop nested in LOOP (during outer-loop vectorization).  */
4373
4374   /* No update in loop is required.  */
4375   if (only_init && (!loop_vinfo || at_loop == loop))
4376     aptr = aggr_ptr_init;
4377   else
4378     {
4379       /* The step of the aggregate pointer is the type size.  */
4380       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4381       /* One exception to the above is when the scalar step of the load in
4382          LOOP is zero. In this case the step here is also zero.  */
4383       if (*inv_p)
4384         iv_step = size_zero_node;
4385       else if (tree_int_cst_sgn (step) == -1)
4386         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4387
4388       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4389
4390       create_iv (aggr_ptr_init,
4391                  fold_convert (aggr_ptr_type, iv_step),
4392                  aggr_ptr, loop, &incr_gsi, insert_after,
4393                  &indx_before_incr, &indx_after_incr);
4394       incr = gsi_stmt (incr_gsi);
4395       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4396
4397       /* Copy the points-to information if it exists. */
4398       if (DR_PTR_INFO (dr))
4399         {
4400           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4401           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4402         }
4403       if (ptr_incr)
4404         *ptr_incr = incr;
4405
4406       aptr = indx_before_incr;
4407     }
4408
4409   if (!nested_in_vect_loop || only_init)
4410     return aptr;
4411
4412
4413   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4414      nested in LOOP, if exists.  */
4415
4416   gcc_assert (nested_in_vect_loop);
4417   if (!only_init)
4418     {
4419       standard_iv_increment_position (containing_loop, &incr_gsi,
4420                                       &insert_after);
4421       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4422                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4423                  &indx_after_incr);
4424       incr = gsi_stmt (incr_gsi);
4425       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4426
4427       /* Copy the points-to information if it exists. */
4428       if (DR_PTR_INFO (dr))
4429         {
4430           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4431           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4432         }
4433       if (ptr_incr)
4434         *ptr_incr = incr;
4435
4436       return indx_before_incr;
4437     }
4438   else
4439     gcc_unreachable ();
4440 }
4441
4442
4443 /* Function bump_vector_ptr
4444
4445    Increment a pointer (to a vector type) by vector-size. If requested,
4446    i.e. if PTR-INCR is given, then also connect the new increment stmt
4447    to the existing def-use update-chain of the pointer, by modifying
4448    the PTR_INCR as illustrated below:
4449
4450    The pointer def-use update-chain before this function:
4451                         DATAREF_PTR = phi (p_0, p_2)
4452                         ....
4453         PTR_INCR:       p_2 = DATAREF_PTR + step
4454
4455    The pointer def-use update-chain after this function:
4456                         DATAREF_PTR = phi (p_0, p_2)
4457                         ....
4458                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4459                         ....
4460         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4461
4462    Input:
4463    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4464                  in the loop.
4465    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4466               the loop.  The increment amount across iterations is expected
4467               to be vector_size.
4468    BSI - location where the new update stmt is to be placed.
4469    STMT - the original scalar memory-access stmt that is being vectorized.
4470    BUMP - optional. The offset by which to bump the pointer. If not given,
4471           the offset is assumed to be vector_size.
4472
4473    Output: Return NEW_DATAREF_PTR as illustrated above.
4474
4475 */
4476
4477 tree
4478 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4479                  gimple *stmt, tree bump)
4480 {
4481   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4482   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4483   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4484   tree update = TYPE_SIZE_UNIT (vectype);
4485   gassign *incr_stmt;
4486   ssa_op_iter iter;
4487   use_operand_p use_p;
4488   tree new_dataref_ptr;
4489
4490   if (bump)
4491     update = bump;
4492
4493   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4494     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4495   else
4496     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4497   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4498                                    dataref_ptr, update);
4499   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4500
4501   /* Copy the points-to information if it exists. */
4502   if (DR_PTR_INFO (dr))
4503     {
4504       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4505       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4506     }
4507
4508   if (!ptr_incr)
4509     return new_dataref_ptr;
4510
4511   /* Update the vector-pointer's cross-iteration increment.  */
4512   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4513     {
4514       tree use = USE_FROM_PTR (use_p);
4515
4516       if (use == dataref_ptr)
4517         SET_USE (use_p, new_dataref_ptr);
4518       else
4519         gcc_assert (tree_int_cst_compare (use, update) == 0);
4520     }
4521
4522   return new_dataref_ptr;
4523 }
4524
4525
4526 /* Function vect_create_destination_var.
4527
4528    Create a new temporary of type VECTYPE.  */
4529
4530 tree
4531 vect_create_destination_var (tree scalar_dest, tree vectype)
4532 {
4533   tree vec_dest;
4534   const char *name;
4535   char *new_name;
4536   tree type;
4537   enum vect_var_kind kind;
4538
4539   kind = vectype
4540     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4541     ? vect_mask_var
4542     : vect_simple_var
4543     : vect_scalar_var;
4544   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4545
4546   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4547
4548   name = get_name (scalar_dest);
4549   if (name)
4550     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4551   else
4552     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4553   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4554   free (new_name);
4555
4556   return vec_dest;
4557 }
4558
4559 /* Function vect_grouped_store_supported.
4560
4561    Returns TRUE if interleave high and interleave low permutations
4562    are supported, and FALSE otherwise.  */
4563
4564 bool
4565 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4566 {
4567   machine_mode mode = TYPE_MODE (vectype);
4568
4569   /* vect_permute_store_chain requires the group size to be equal to 3 or
4570      be a power of two.  */
4571   if (count != 3 && exact_log2 (count) == -1)
4572     {
4573       if (dump_enabled_p ())
4574         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4575                          "the size of the group of accesses"
4576                          " is not a power of 2 or not eqaul to 3\n");
4577       return false;
4578     }
4579
4580   /* Check that the permutation is supported.  */
4581   if (VECTOR_MODE_P (mode))
4582     {
4583       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4584       if (count == 3)
4585         {
4586           unsigned int j0 = 0, j1 = 0, j2 = 0;
4587           unsigned int i, j;
4588
4589           vec_perm_builder sel (nelt, nelt, 1);
4590           sel.quick_grow (nelt);
4591           vec_perm_indices indices;
4592           for (j = 0; j < 3; j++)
4593             {
4594               int nelt0 = ((3 - j) * nelt) % 3;
4595               int nelt1 = ((3 - j) * nelt + 1) % 3;
4596               int nelt2 = ((3 - j) * nelt + 2) % 3;
4597               for (i = 0; i < nelt; i++)
4598                 {
4599                   if (3 * i + nelt0 < nelt)
4600                     sel[3 * i + nelt0] = j0++;
4601                   if (3 * i + nelt1 < nelt)
4602                     sel[3 * i + nelt1] = nelt + j1++;
4603                   if (3 * i + nelt2 < nelt)
4604                     sel[3 * i + nelt2] = 0;
4605                 }
4606               indices.new_vector (sel, 2, nelt);
4607               if (!can_vec_perm_const_p (mode, indices))
4608                 {
4609                   if (dump_enabled_p ())
4610                     dump_printf (MSG_MISSED_OPTIMIZATION,
4611                                  "permutation op not supported by target.\n");
4612                   return false;
4613                 }
4614
4615               for (i = 0; i < nelt; i++)
4616                 {
4617                   if (3 * i + nelt0 < nelt)
4618                     sel[3 * i + nelt0] = 3 * i + nelt0;
4619                   if (3 * i + nelt1 < nelt)
4620                     sel[3 * i + nelt1] = 3 * i + nelt1;
4621                   if (3 * i + nelt2 < nelt)
4622                     sel[3 * i + nelt2] = nelt + j2++;
4623                 }
4624               indices.new_vector (sel, 2, nelt);
4625               if (!can_vec_perm_const_p (mode, indices))
4626                 {
4627                   if (dump_enabled_p ())
4628                     dump_printf (MSG_MISSED_OPTIMIZATION,
4629                                  "permutation op not supported by target.\n");
4630                   return false;
4631                 }
4632             }
4633           return true;
4634         }
4635       else
4636         {
4637           /* If length is not equal to 3 then only power of 2 is supported.  */
4638           gcc_assert (pow2p_hwi (count));
4639
4640           /* The encoding has 2 interleaved stepped patterns.  */
4641           vec_perm_builder sel (nelt, 2, 3);
4642           sel.quick_grow (6);
4643           for (i = 0; i < 3; i++)
4644             {
4645               sel[i * 2] = i;
4646               sel[i * 2 + 1] = i + nelt;
4647             }
4648           vec_perm_indices indices (sel, 2, nelt);
4649           if (can_vec_perm_const_p (mode, indices))
4650             {
4651               for (i = 0; i < 6; i++)
4652                 sel[i] += nelt / 2;
4653               indices.new_vector (sel, 2, nelt);
4654               if (can_vec_perm_const_p (mode, indices))
4655                 return true;
4656             }
4657         }
4658     }
4659
4660   if (dump_enabled_p ())
4661     dump_printf (MSG_MISSED_OPTIMIZATION,
4662                  "permutaion op not supported by target.\n");
4663   return false;
4664 }
4665
4666
4667 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4668    type VECTYPE.  */
4669
4670 bool
4671 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4672 {
4673   return vect_lanes_optab_supported_p ("vec_store_lanes",
4674                                        vec_store_lanes_optab,
4675                                        vectype, count);
4676 }
4677
4678
4679 /* Function vect_permute_store_chain.
4680
4681    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4682    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4683    the data correctly for the stores.  Return the final references for stores
4684    in RESULT_CHAIN.
4685
4686    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4687    The input is 4 vectors each containing 8 elements.  We assign a number to
4688    each element, the input sequence is:
4689
4690    1st vec:   0  1  2  3  4  5  6  7
4691    2nd vec:   8  9 10 11 12 13 14 15
4692    3rd vec:  16 17 18 19 20 21 22 23
4693    4th vec:  24 25 26 27 28 29 30 31
4694
4695    The output sequence should be:
4696
4697    1st vec:  0  8 16 24  1  9 17 25
4698    2nd vec:  2 10 18 26  3 11 19 27
4699    3rd vec:  4 12 20 28  5 13 21 30
4700    4th vec:  6 14 22 30  7 15 23 31
4701
4702    i.e., we interleave the contents of the four vectors in their order.
4703
4704    We use interleave_high/low instructions to create such output.  The input of
4705    each interleave_high/low operation is two vectors:
4706    1st vec    2nd vec
4707    0 1 2 3    4 5 6 7
4708    the even elements of the result vector are obtained left-to-right from the
4709    high/low elements of the first vector.  The odd elements of the result are
4710    obtained left-to-right from the high/low elements of the second vector.
4711    The output of interleave_high will be:   0 4 1 5
4712    and of interleave_low:                   2 6 3 7
4713
4714
4715    The permutation is done in log LENGTH stages.  In each stage interleave_high
4716    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4717    where the first argument is taken from the first half of DR_CHAIN and the
4718    second argument from it's second half.
4719    In our example,
4720
4721    I1: interleave_high (1st vec, 3rd vec)
4722    I2: interleave_low (1st vec, 3rd vec)
4723    I3: interleave_high (2nd vec, 4th vec)
4724    I4: interleave_low (2nd vec, 4th vec)
4725
4726    The output for the first stage is:
4727
4728    I1:  0 16  1 17  2 18  3 19
4729    I2:  4 20  5 21  6 22  7 23
4730    I3:  8 24  9 25 10 26 11 27
4731    I4: 12 28 13 29 14 30 15 31
4732
4733    The output of the second stage, i.e. the final result is:
4734
4735    I1:  0  8 16 24  1  9 17 25
4736    I2:  2 10 18 26  3 11 19 27
4737    I3:  4 12 20 28  5 13 21 30
4738    I4:  6 14 22 30  7 15 23 31.  */
4739
4740 void
4741 vect_permute_store_chain (vec<tree> dr_chain,
4742                           unsigned int length,
4743                           gimple *stmt,
4744                           gimple_stmt_iterator *gsi,
4745                           vec<tree> *result_chain)
4746 {
4747   tree vect1, vect2, high, low;
4748   gimple *perm_stmt;
4749   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4750   tree perm_mask_low, perm_mask_high;
4751   tree data_ref;
4752   tree perm3_mask_low, perm3_mask_high;
4753   unsigned int i, n, log_length = exact_log2 (length);
4754   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4755
4756   result_chain->quick_grow (length);
4757   memcpy (result_chain->address (), dr_chain.address (),
4758           length * sizeof (tree));
4759
4760   if (length == 3)
4761     {
4762       unsigned int j0 = 0, j1 = 0, j2 = 0;
4763
4764       vec_perm_builder sel (nelt, nelt, 1);
4765       sel.quick_grow (nelt);
4766       vec_perm_indices indices;
4767       for (j = 0; j < 3; j++)
4768         {
4769           int nelt0 = ((3 - j) * nelt) % 3;
4770           int nelt1 = ((3 - j) * nelt + 1) % 3;
4771           int nelt2 = ((3 - j) * nelt + 2) % 3;
4772
4773           for (i = 0; i < nelt; i++)
4774             {
4775               if (3 * i + nelt0 < nelt)
4776                 sel[3 * i + nelt0] = j0++;
4777               if (3 * i + nelt1 < nelt)
4778                 sel[3 * i + nelt1] = nelt + j1++;
4779               if (3 * i + nelt2 < nelt)
4780                 sel[3 * i + nelt2] = 0;
4781             }
4782           indices.new_vector (sel, 2, nelt);
4783           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4784
4785           for (i = 0; i < nelt; i++)
4786             {
4787               if (3 * i + nelt0 < nelt)
4788                 sel[3 * i + nelt0] = 3 * i + nelt0;
4789               if (3 * i + nelt1 < nelt)
4790                 sel[3 * i + nelt1] = 3 * i + nelt1;
4791               if (3 * i + nelt2 < nelt)
4792                 sel[3 * i + nelt2] = nelt + j2++;
4793             }
4794           indices.new_vector (sel, 2, nelt);
4795           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4796
4797           vect1 = dr_chain[0];
4798           vect2 = dr_chain[1];
4799
4800           /* Create interleaving stmt:
4801              low = VEC_PERM_EXPR <vect1, vect2,
4802                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4803                                    j + 2, nelt + j + 2, *, ...}>  */
4804           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4805           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4806                                            vect2, perm3_mask_low);
4807           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4808
4809           vect1 = data_ref;
4810           vect2 = dr_chain[2];
4811           /* Create interleaving stmt:
4812              low = VEC_PERM_EXPR <vect1, vect2,
4813                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4814                                    6, 7, nelt + j + 2, ...}>  */
4815           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4816           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4817                                            vect2, perm3_mask_high);
4818           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4819           (*result_chain)[j] = data_ref;
4820         }
4821     }
4822   else
4823     {
4824       /* If length is not equal to 3 then only power of 2 is supported.  */
4825       gcc_assert (pow2p_hwi (length));
4826
4827       /* The encoding has 2 interleaved stepped patterns.  */
4828       vec_perm_builder sel (nelt, 2, 3);
4829       sel.quick_grow (6);
4830       for (i = 0; i < 3; i++)
4831         {
4832           sel[i * 2] = i;
4833           sel[i * 2 + 1] = i + nelt;
4834         }
4835         vec_perm_indices indices (sel, 2, nelt);
4836         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4837
4838         for (i = 0; i < 6; i++)
4839           sel[i] += nelt / 2;
4840         indices.new_vector (sel, 2, nelt);
4841         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4842
4843         for (i = 0, n = log_length; i < n; i++)
4844           {
4845             for (j = 0; j < length/2; j++)
4846               {
4847                 vect1 = dr_chain[j];
4848                 vect2 = dr_chain[j+length/2];
4849
4850                 /* Create interleaving stmt:
4851                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4852                                                         ...}>  */
4853                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4854                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4855                                                  vect2, perm_mask_high);
4856                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4857                 (*result_chain)[2*j] = high;
4858
4859                 /* Create interleaving stmt:
4860                    low = VEC_PERM_EXPR <vect1, vect2,
4861                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4862                                          ...}>  */
4863                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4864                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4865                                                  vect2, perm_mask_low);
4866                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4867                 (*result_chain)[2*j+1] = low;
4868               }
4869             memcpy (dr_chain.address (), result_chain->address (),
4870                     length * sizeof (tree));
4871           }
4872     }
4873 }
4874
4875 /* Function vect_setup_realignment
4876
4877    This function is called when vectorizing an unaligned load using
4878    the dr_explicit_realign[_optimized] scheme.
4879    This function generates the following code at the loop prolog:
4880
4881       p = initial_addr;
4882    x  msq_init = *(floor(p));   # prolog load
4883       realignment_token = call target_builtin;
4884     loop:
4885    x  msq = phi (msq_init, ---)
4886
4887    The stmts marked with x are generated only for the case of
4888    dr_explicit_realign_optimized.
4889
4890    The code above sets up a new (vector) pointer, pointing to the first
4891    location accessed by STMT, and a "floor-aligned" load using that pointer.
4892    It also generates code to compute the "realignment-token" (if the relevant
4893    target hook was defined), and creates a phi-node at the loop-header bb
4894    whose arguments are the result of the prolog-load (created by this
4895    function) and the result of a load that takes place in the loop (to be
4896    created by the caller to this function).
4897
4898    For the case of dr_explicit_realign_optimized:
4899    The caller to this function uses the phi-result (msq) to create the
4900    realignment code inside the loop, and sets up the missing phi argument,
4901    as follows:
4902     loop:
4903       msq = phi (msq_init, lsq)
4904       lsq = *(floor(p'));        # load in loop
4905       result = realign_load (msq, lsq, realignment_token);
4906
4907    For the case of dr_explicit_realign:
4908     loop:
4909       msq = *(floor(p));        # load in loop
4910       p' = p + (VS-1);
4911       lsq = *(floor(p'));       # load in loop
4912       result = realign_load (msq, lsq, realignment_token);
4913
4914    Input:
4915    STMT - (scalar) load stmt to be vectorized. This load accesses
4916           a memory location that may be unaligned.
4917    BSI - place where new code is to be inserted.
4918    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4919                               is used.
4920
4921    Output:
4922    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4923                        target hook, if defined.
4924    Return value - the result of the loop-header phi node.  */
4925
4926 tree
4927 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4928                         tree *realignment_token,
4929                         enum dr_alignment_support alignment_support_scheme,
4930                         tree init_addr,
4931                         struct loop **at_loop)
4932 {
4933   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4934   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4935   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4936   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4937   struct loop *loop = NULL;
4938   edge pe = NULL;
4939   tree scalar_dest = gimple_assign_lhs (stmt);
4940   tree vec_dest;
4941   gimple *inc;
4942   tree ptr;
4943   tree data_ref;
4944   basic_block new_bb;
4945   tree msq_init = NULL_TREE;
4946   tree new_temp;
4947   gphi *phi_stmt;
4948   tree msq = NULL_TREE;
4949   gimple_seq stmts = NULL;
4950   bool inv_p;
4951   bool compute_in_loop = false;
4952   bool nested_in_vect_loop = false;
4953   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4954   struct loop *loop_for_initial_load = NULL;
4955
4956   if (loop_vinfo)
4957     {
4958       loop = LOOP_VINFO_LOOP (loop_vinfo);
4959       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4960     }
4961
4962   gcc_assert (alignment_support_scheme == dr_explicit_realign
4963               || alignment_support_scheme == dr_explicit_realign_optimized);
4964
4965   /* We need to generate three things:
4966      1. the misalignment computation
4967      2. the extra vector load (for the optimized realignment scheme).
4968      3. the phi node for the two vectors from which the realignment is
4969       done (for the optimized realignment scheme).  */
4970
4971   /* 1. Determine where to generate the misalignment computation.
4972
4973      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4974      calculation will be generated by this function, outside the loop (in the
4975      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4976      caller, inside the loop.
4977
4978      Background: If the misalignment remains fixed throughout the iterations of
4979      the loop, then both realignment schemes are applicable, and also the
4980      misalignment computation can be done outside LOOP.  This is because we are
4981      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4982      are a multiple of VS (the Vector Size), and therefore the misalignment in
4983      different vectorized LOOP iterations is always the same.
4984      The problem arises only if the memory access is in an inner-loop nested
4985      inside LOOP, which is now being vectorized using outer-loop vectorization.
4986      This is the only case when the misalignment of the memory access may not
4987      remain fixed throughout the iterations of the inner-loop (as explained in
4988      detail in vect_supportable_dr_alignment).  In this case, not only is the
4989      optimized realignment scheme not applicable, but also the misalignment
4990      computation (and generation of the realignment token that is passed to
4991      REALIGN_LOAD) have to be done inside the loop.
4992
4993      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4994      or not, which in turn determines if the misalignment is computed inside
4995      the inner-loop, or outside LOOP.  */
4996
4997   if (init_addr != NULL_TREE || !loop_vinfo)
4998     {
4999       compute_in_loop = true;
5000       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5001     }
5002
5003
5004   /* 2. Determine where to generate the extra vector load.
5005
5006      For the optimized realignment scheme, instead of generating two vector
5007      loads in each iteration, we generate a single extra vector load in the
5008      preheader of the loop, and in each iteration reuse the result of the
5009      vector load from the previous iteration.  In case the memory access is in
5010      an inner-loop nested inside LOOP, which is now being vectorized using
5011      outer-loop vectorization, we need to determine whether this initial vector
5012      load should be generated at the preheader of the inner-loop, or can be
5013      generated at the preheader of LOOP.  If the memory access has no evolution
5014      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5015      to be generated inside LOOP (in the preheader of the inner-loop).  */
5016
5017   if (nested_in_vect_loop)
5018     {
5019       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5020       bool invariant_in_outerloop =
5021             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5022       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5023     }
5024   else
5025     loop_for_initial_load = loop;
5026   if (at_loop)
5027     *at_loop = loop_for_initial_load;
5028
5029   if (loop_for_initial_load)
5030     pe = loop_preheader_edge (loop_for_initial_load);
5031
5032   /* 3. For the case of the optimized realignment, create the first vector
5033       load at the loop preheader.  */
5034
5035   if (alignment_support_scheme == dr_explicit_realign_optimized)
5036     {
5037       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5038       gassign *new_stmt;
5039
5040       gcc_assert (!compute_in_loop);
5041       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5042       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5043                                       NULL_TREE, &init_addr, NULL, &inc,
5044                                       true, &inv_p);
5045       if (TREE_CODE (ptr) == SSA_NAME)
5046         new_temp = copy_ssa_name (ptr);
5047       else
5048         new_temp = make_ssa_name (TREE_TYPE (ptr));
5049       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5050       new_stmt = gimple_build_assign
5051                    (new_temp, BIT_AND_EXPR, ptr,
5052                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5053       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5054       gcc_assert (!new_bb);
5055       data_ref
5056         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5057                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5058       new_stmt = gimple_build_assign (vec_dest, data_ref);
5059       new_temp = make_ssa_name (vec_dest, new_stmt);
5060       gimple_assign_set_lhs (new_stmt, new_temp);
5061       if (pe)
5062         {
5063           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5064           gcc_assert (!new_bb);
5065         }
5066       else
5067          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5068
5069       msq_init = gimple_assign_lhs (new_stmt);
5070     }
5071
5072   /* 4. Create realignment token using a target builtin, if available.
5073       It is done either inside the containing loop, or before LOOP (as
5074       determined above).  */
5075
5076   if (targetm.vectorize.builtin_mask_for_load)
5077     {
5078       gcall *new_stmt;
5079       tree builtin_decl;
5080
5081       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5082       if (!init_addr)
5083         {
5084           /* Generate the INIT_ADDR computation outside LOOP.  */
5085           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5086                                                             NULL_TREE);
5087           if (loop)
5088             {
5089               pe = loop_preheader_edge (loop);
5090               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5091               gcc_assert (!new_bb);
5092             }
5093           else
5094              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5095         }
5096
5097       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5098       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5099       vec_dest =
5100         vect_create_destination_var (scalar_dest,
5101                                      gimple_call_return_type (new_stmt));
5102       new_temp = make_ssa_name (vec_dest, new_stmt);
5103       gimple_call_set_lhs (new_stmt, new_temp);
5104
5105       if (compute_in_loop)
5106         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5107       else
5108         {
5109           /* Generate the misalignment computation outside LOOP.  */
5110           pe = loop_preheader_edge (loop);
5111           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5112           gcc_assert (!new_bb);
5113         }
5114
5115       *realignment_token = gimple_call_lhs (new_stmt);
5116
5117       /* The result of the CALL_EXPR to this builtin is determined from
5118          the value of the parameter and no global variables are touched
5119          which makes the builtin a "const" function.  Requiring the
5120          builtin to have the "const" attribute makes it unnecessary
5121          to call mark_call_clobbered.  */
5122       gcc_assert (TREE_READONLY (builtin_decl));
5123     }
5124
5125   if (alignment_support_scheme == dr_explicit_realign)
5126     return msq;
5127
5128   gcc_assert (!compute_in_loop);
5129   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5130
5131
5132   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5133
5134   pe = loop_preheader_edge (containing_loop);
5135   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5136   msq = make_ssa_name (vec_dest);
5137   phi_stmt = create_phi_node (msq, containing_loop->header);
5138   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5139
5140   return msq;
5141 }
5142
5143
5144 /* Function vect_grouped_load_supported.
5145
5146    COUNT is the size of the load group (the number of statements plus the
5147    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5148    only one statement, with a gap of COUNT - 1.
5149
5150    Returns true if a suitable permute exists.  */
5151
5152 bool
5153 vect_grouped_load_supported (tree vectype, bool single_element_p,
5154                              unsigned HOST_WIDE_INT count)
5155 {
5156   machine_mode mode = TYPE_MODE (vectype);
5157
5158   /* If this is single-element interleaving with an element distance
5159      that leaves unused vector loads around punt - we at least create
5160      very sub-optimal code in that case (and blow up memory,
5161      see PR65518).  */
5162   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5163     {
5164       if (dump_enabled_p ())
5165         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5166                          "single-element interleaving not supported "
5167                          "for not adjacent vector loads\n");
5168       return false;
5169     }
5170
5171   /* vect_permute_load_chain requires the group size to be equal to 3 or
5172      be a power of two.  */
5173   if (count != 3 && exact_log2 (count) == -1)
5174     {
5175       if (dump_enabled_p ())
5176         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5177                          "the size of the group of accesses"
5178                          " is not a power of 2 or not equal to 3\n");
5179       return false;
5180     }
5181
5182   /* Check that the permutation is supported.  */
5183   if (VECTOR_MODE_P (mode))
5184     {
5185       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5186
5187       if (count == 3)
5188         {
5189           vec_perm_builder sel (nelt, nelt, 1);
5190           sel.quick_grow (nelt);
5191           vec_perm_indices indices;
5192           unsigned int k;
5193           for (k = 0; k < 3; k++)
5194             {
5195               for (i = 0; i < nelt; i++)
5196                 if (3 * i + k < 2 * nelt)
5197                   sel[i] = 3 * i + k;
5198                 else
5199                   sel[i] = 0;
5200               indices.new_vector (sel, 2, nelt);
5201               if (!can_vec_perm_const_p (mode, indices))
5202                 {
5203                   if (dump_enabled_p ())
5204                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5205                                      "shuffle of 3 loads is not supported by"
5206                                      " target\n");
5207                   return false;
5208                 }
5209               for (i = 0, j = 0; i < nelt; i++)
5210                 if (3 * i + k < 2 * nelt)
5211                   sel[i] = i;
5212                 else
5213                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5214               indices.new_vector (sel, 2, nelt);
5215               if (!can_vec_perm_const_p (mode, indices))
5216                 {
5217                   if (dump_enabled_p ())
5218                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5219                                      "shuffle of 3 loads is not supported by"
5220                                      " target\n");
5221                   return false;
5222                 }
5223             }
5224           return true;
5225         }
5226       else
5227         {
5228           /* If length is not equal to 3 then only power of 2 is supported.  */
5229           gcc_assert (pow2p_hwi (count));
5230
5231           /* The encoding has a single stepped pattern.  */
5232           vec_perm_builder sel (nelt, 1, 3);
5233           sel.quick_grow (3);
5234           for (i = 0; i < 3; i++)
5235             sel[i] = i * 2;
5236           vec_perm_indices indices (sel, 2, nelt);
5237           if (can_vec_perm_const_p (mode, indices))
5238             {
5239               for (i = 0; i < 3; i++)
5240                 sel[i] = i * 2 + 1;
5241               indices.new_vector (sel, 2, nelt);
5242               if (can_vec_perm_const_p (mode, indices))
5243                 return true;
5244             }
5245         }
5246     }
5247
5248   if (dump_enabled_p ())
5249     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250                      "extract even/odd not supported by target\n");
5251   return false;
5252 }
5253
5254 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5255    type VECTYPE.  */
5256
5257 bool
5258 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5259 {
5260   return vect_lanes_optab_supported_p ("vec_load_lanes",
5261                                        vec_load_lanes_optab,
5262                                        vectype, count);
5263 }
5264
5265 /* Function vect_permute_load_chain.
5266
5267    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5268    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5269    the input data correctly.  Return the final references for loads in
5270    RESULT_CHAIN.
5271
5272    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5273    The input is 4 vectors each containing 8 elements. We assign a number to each
5274    element, the input sequence is:
5275
5276    1st vec:   0  1  2  3  4  5  6  7
5277    2nd vec:   8  9 10 11 12 13 14 15
5278    3rd vec:  16 17 18 19 20 21 22 23
5279    4th vec:  24 25 26 27 28 29 30 31
5280
5281    The output sequence should be:
5282
5283    1st vec:  0 4  8 12 16 20 24 28
5284    2nd vec:  1 5  9 13 17 21 25 29
5285    3rd vec:  2 6 10 14 18 22 26 30
5286    4th vec:  3 7 11 15 19 23 27 31
5287
5288    i.e., the first output vector should contain the first elements of each
5289    interleaving group, etc.
5290
5291    We use extract_even/odd instructions to create such output.  The input of
5292    each extract_even/odd operation is two vectors
5293    1st vec    2nd vec
5294    0 1 2 3    4 5 6 7
5295
5296    and the output is the vector of extracted even/odd elements.  The output of
5297    extract_even will be:   0 2 4 6
5298    and of extract_odd:     1 3 5 7
5299
5300
5301    The permutation is done in log LENGTH stages.  In each stage extract_even
5302    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5303    their order.  In our example,
5304
5305    E1: extract_even (1st vec, 2nd vec)
5306    E2: extract_odd (1st vec, 2nd vec)
5307    E3: extract_even (3rd vec, 4th vec)
5308    E4: extract_odd (3rd vec, 4th vec)
5309
5310    The output for the first stage will be:
5311
5312    E1:  0  2  4  6  8 10 12 14
5313    E2:  1  3  5  7  9 11 13 15
5314    E3: 16 18 20 22 24 26 28 30
5315    E4: 17 19 21 23 25 27 29 31
5316
5317    In order to proceed and create the correct sequence for the next stage (or
5318    for the correct output, if the second stage is the last one, as in our
5319    example), we first put the output of extract_even operation and then the
5320    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5321    The input for the second stage is:
5322
5323    1st vec (E1):  0  2  4  6  8 10 12 14
5324    2nd vec (E3): 16 18 20 22 24 26 28 30
5325    3rd vec (E2):  1  3  5  7  9 11 13 15
5326    4th vec (E4): 17 19 21 23 25 27 29 31
5327
5328    The output of the second stage:
5329
5330    E1: 0 4  8 12 16 20 24 28
5331    E2: 2 6 10 14 18 22 26 30
5332    E3: 1 5  9 13 17 21 25 29
5333    E4: 3 7 11 15 19 23 27 31
5334
5335    And RESULT_CHAIN after reordering:
5336
5337    1st vec (E1):  0 4  8 12 16 20 24 28
5338    2nd vec (E3):  1 5  9 13 17 21 25 29
5339    3rd vec (E2):  2 6 10 14 18 22 26 30
5340    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5341
5342 static void
5343 vect_permute_load_chain (vec<tree> dr_chain,
5344                          unsigned int length,
5345                          gimple *stmt,
5346                          gimple_stmt_iterator *gsi,
5347                          vec<tree> *result_chain)
5348 {
5349   tree data_ref, first_vect, second_vect;
5350   tree perm_mask_even, perm_mask_odd;
5351   tree perm3_mask_low, perm3_mask_high;
5352   gimple *perm_stmt;
5353   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5354   unsigned int i, j, log_length = exact_log2 (length);
5355   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5356
5357   result_chain->quick_grow (length);
5358   memcpy (result_chain->address (), dr_chain.address (),
5359           length * sizeof (tree));
5360
5361   if (length == 3)
5362     {
5363       unsigned int k;
5364
5365       vec_perm_builder sel (nelt, nelt, 1);
5366       sel.quick_grow (nelt);
5367       vec_perm_indices indices;
5368       for (k = 0; k < 3; k++)
5369         {
5370           for (i = 0; i < nelt; i++)
5371             if (3 * i + k < 2 * nelt)
5372               sel[i] = 3 * i + k;
5373             else
5374               sel[i] = 0;
5375           indices.new_vector (sel, 2, nelt);
5376           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5377
5378           for (i = 0, j = 0; i < nelt; i++)
5379             if (3 * i + k < 2 * nelt)
5380               sel[i] = i;
5381             else
5382               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5383           indices.new_vector (sel, 2, nelt);
5384           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5385
5386           first_vect = dr_chain[0];
5387           second_vect = dr_chain[1];
5388
5389           /* Create interleaving stmt (low part of):
5390              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5391                                                              ...}>  */
5392           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5393           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5394                                            second_vect, perm3_mask_low);
5395           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5396
5397           /* Create interleaving stmt (high part of):
5398              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5399                                                               ...}>  */
5400           first_vect = data_ref;
5401           second_vect = dr_chain[2];
5402           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5403           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5404                                            second_vect, perm3_mask_high);
5405           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5406           (*result_chain)[k] = data_ref;
5407         }
5408     }
5409   else
5410     {
5411       /* If length is not equal to 3 then only power of 2 is supported.  */
5412       gcc_assert (pow2p_hwi (length));
5413
5414       /* The encoding has a single stepped pattern.  */
5415       vec_perm_builder sel (nelt, 1, 3);
5416       sel.quick_grow (3);
5417       for (i = 0; i < 3; ++i)
5418         sel[i] = i * 2;
5419       vec_perm_indices indices (sel, 2, nelt);
5420       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5421
5422       for (i = 0; i < 3; ++i)
5423         sel[i] = i * 2 + 1;
5424       indices.new_vector (sel, 2, nelt);
5425       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5426
5427       for (i = 0; i < log_length; i++)
5428         {
5429           for (j = 0; j < length; j += 2)
5430             {
5431               first_vect = dr_chain[j];
5432               second_vect = dr_chain[j+1];
5433
5434               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5435               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5436               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5437                                                first_vect, second_vect,
5438                                                perm_mask_even);
5439               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5440               (*result_chain)[j/2] = data_ref;
5441
5442               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5443               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5444               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5445                                                first_vect, second_vect,
5446                                                perm_mask_odd);
5447               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448               (*result_chain)[j/2+length/2] = data_ref;
5449             }
5450           memcpy (dr_chain.address (), result_chain->address (),
5451                   length * sizeof (tree));
5452         }
5453     }
5454 }
5455
5456 /* Function vect_shift_permute_load_chain.
5457
5458    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5459    sequence of stmts to reorder the input data accordingly.
5460    Return the final references for loads in RESULT_CHAIN.
5461    Return true if successed, false otherwise.
5462
5463    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5464    The input is 3 vectors each containing 8 elements.  We assign a
5465    number to each element, the input sequence is:
5466
5467    1st vec:   0  1  2  3  4  5  6  7
5468    2nd vec:   8  9 10 11 12 13 14 15
5469    3rd vec:  16 17 18 19 20 21 22 23
5470
5471    The output sequence should be:
5472
5473    1st vec:  0 3 6  9 12 15 18 21
5474    2nd vec:  1 4 7 10 13 16 19 22
5475    3rd vec:  2 5 8 11 14 17 20 23
5476
5477    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5478
5479    First we shuffle all 3 vectors to get correct elements order:
5480
5481    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5482    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5483    3rd vec:  (16 19 22) (17 20 23) (18 21)
5484
5485    Next we unite and shift vector 3 times:
5486
5487    1st step:
5488      shift right by 6 the concatenation of:
5489      "1st vec" and  "2nd vec"
5490        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5491      "2nd vec" and  "3rd vec"
5492        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5493      "3rd vec" and  "1st vec"
5494        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5495                              | New vectors                   |
5496
5497      So that now new vectors are:
5498
5499      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5500      2nd vec:  (10 13) (16 19 22) (17 20 23)
5501      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5502
5503    2nd step:
5504      shift right by 5 the concatenation of:
5505      "1st vec" and  "3rd vec"
5506        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5507      "2nd vec" and  "1st vec"
5508        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5509      "3rd vec" and  "2nd vec"
5510        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5511                           | New vectors                   |
5512
5513      So that now new vectors are:
5514
5515      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5516      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5517      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5518
5519    3rd step:
5520      shift right by 5 the concatenation of:
5521      "1st vec" and  "1st vec"
5522        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5523      shift right by 3 the concatenation of:
5524      "2nd vec" and  "2nd vec"
5525                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5526                           | New vectors                   |
5527
5528      So that now all vectors are READY:
5529      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5530      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5531      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5532
5533    This algorithm is faster than one in vect_permute_load_chain if:
5534      1.  "shift of a concatination" is faster than general permutation.
5535          This is usually so.
5536      2.  The TARGET machine can't execute vector instructions in parallel.
5537          This is because each step of the algorithm depends on previous.
5538          The algorithm in vect_permute_load_chain is much more parallel.
5539
5540    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5541 */
5542
5543 static bool
5544 vect_shift_permute_load_chain (vec<tree> dr_chain,
5545                                unsigned int length,
5546                                gimple *stmt,
5547                                gimple_stmt_iterator *gsi,
5548                                vec<tree> *result_chain)
5549 {
5550   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5551   tree perm2_mask1, perm2_mask2, perm3_mask;
5552   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5553   gimple *perm_stmt;
5554
5555   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5556   unsigned int i;
5557   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5558   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5559   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5560
5561   unsigned HOST_WIDE_INT vf;
5562   if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5563     /* Not supported for variable-length vectors.  */
5564     return false;
5565
5566   vec_perm_builder sel (nelt, nelt, 1);
5567   sel.quick_grow (nelt);
5568
5569   result_chain->quick_grow (length);
5570   memcpy (result_chain->address (), dr_chain.address (),
5571           length * sizeof (tree));
5572
5573   if (pow2p_hwi (length) && vf > 4)
5574     {
5575       unsigned int j, log_length = exact_log2 (length);
5576       for (i = 0; i < nelt / 2; ++i)
5577         sel[i] = i * 2;
5578       for (i = 0; i < nelt / 2; ++i)
5579         sel[nelt / 2 + i] = i * 2 + 1;
5580       vec_perm_indices indices (sel, 2, nelt);
5581       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5582         {
5583           if (dump_enabled_p ())
5584             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5585                              "shuffle of 2 fields structure is not \
5586                               supported by target\n");
5587           return false;
5588         }
5589       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5590
5591       for (i = 0; i < nelt / 2; ++i)
5592         sel[i] = i * 2 + 1;
5593       for (i = 0; i < nelt / 2; ++i)
5594         sel[nelt / 2 + i] = i * 2;
5595       indices.new_vector (sel, 2, nelt);
5596       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5597         {
5598           if (dump_enabled_p ())
5599             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5600                              "shuffle of 2 fields structure is not \
5601                               supported by target\n");
5602           return false;
5603         }
5604       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5605
5606       /* Generating permutation constant to shift all elements.
5607          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5608       for (i = 0; i < nelt; i++)
5609         sel[i] = nelt / 2 + i;
5610       indices.new_vector (sel, 2, nelt);
5611       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5612         {
5613           if (dump_enabled_p ())
5614             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615                              "shift permutation is not supported by target\n");
5616           return false;
5617         }
5618       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5619
5620       /* Generating permutation constant to select vector from 2.
5621          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5622       for (i = 0; i < nelt / 2; i++)
5623         sel[i] = i;
5624       for (i = nelt / 2; i < nelt; i++)
5625         sel[i] = nelt + i;
5626       indices.new_vector (sel, 2, nelt);
5627       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5628         {
5629           if (dump_enabled_p ())
5630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631                              "select is not supported by target\n");
5632           return false;
5633         }
5634       select_mask = vect_gen_perm_mask_checked (vectype, indices);
5635
5636       for (i = 0; i < log_length; i++)
5637         {
5638           for (j = 0; j < length; j += 2)
5639             {
5640               first_vect = dr_chain[j];
5641               second_vect = dr_chain[j + 1];
5642
5643               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5644               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5645                                                first_vect, first_vect,
5646                                                perm2_mask1);
5647               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5648               vect[0] = data_ref;
5649
5650               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5651               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5652                                                second_vect, second_vect,
5653                                                perm2_mask2);
5654               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5655               vect[1] = data_ref;
5656
5657               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5658               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5659                                                vect[0], vect[1], shift1_mask);
5660               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5661               (*result_chain)[j/2 + length/2] = data_ref;
5662
5663               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5664               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5665                                                vect[0], vect[1], select_mask);
5666               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5667               (*result_chain)[j/2] = data_ref;
5668             }
5669           memcpy (dr_chain.address (), result_chain->address (),
5670                   length * sizeof (tree));
5671         }
5672       return true;
5673     }
5674   if (length == 3 && vf > 2)
5675     {
5676       unsigned int k = 0, l = 0;
5677
5678       /* Generating permutation constant to get all elements in rigth order.
5679          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5680       for (i = 0; i < nelt; i++)
5681         {
5682           if (3 * k + (l % 3) >= nelt)
5683             {
5684               k = 0;
5685               l += (3 - (nelt % 3));
5686             }
5687           sel[i] = 3 * k + (l % 3);
5688           k++;
5689         }
5690       vec_perm_indices indices (sel, 2, nelt);
5691       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5692         {
5693           if (dump_enabled_p ())
5694             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695                              "shuffle of 3 fields structure is not \
5696                               supported by target\n");
5697           return false;
5698         }
5699       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5700
5701       /* Generating permutation constant to shift all elements.
5702          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5703       for (i = 0; i < nelt; i++)
5704         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5705       indices.new_vector (sel, 2, nelt);
5706       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5707         {
5708           if (dump_enabled_p ())
5709             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710                              "shift permutation is not supported by target\n");
5711           return false;
5712         }
5713       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5714
5715       /* Generating permutation constant to shift all elements.
5716          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5717       for (i = 0; i < nelt; i++)
5718         sel[i] = 2 * (nelt / 3) + 1 + i;
5719       indices.new_vector (sel, 2, nelt);
5720       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5721         {
5722           if (dump_enabled_p ())
5723             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724                              "shift permutation is not supported by target\n");
5725           return false;
5726         }
5727       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5728
5729       /* Generating permutation constant to shift all elements.
5730          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5731       for (i = 0; i < nelt; i++)
5732         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5733       indices.new_vector (sel, 2, nelt);
5734       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5735         {
5736           if (dump_enabled_p ())
5737             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5738                              "shift permutation is not supported by target\n");
5739           return false;
5740         }
5741       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5742
5743       /* Generating permutation constant to shift all elements.
5744          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5745       for (i = 0; i < nelt; i++)
5746         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5747       indices.new_vector (sel, 2, nelt);
5748       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5749         {
5750           if (dump_enabled_p ())
5751             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5752                              "shift permutation is not supported by target\n");
5753           return false;
5754         }
5755       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5756
5757       for (k = 0; k < 3; k++)
5758         {
5759           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5760           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5761                                            dr_chain[k], dr_chain[k],
5762                                            perm3_mask);
5763           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5764           vect[k] = data_ref;
5765         }
5766
5767       for (k = 0; k < 3; k++)
5768         {
5769           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5770           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5771                                            vect[k % 3], vect[(k + 1) % 3],
5772                                            shift1_mask);
5773           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5774           vect_shift[k] = data_ref;
5775         }
5776
5777       for (k = 0; k < 3; k++)
5778         {
5779           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5780           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5781                                            vect_shift[(4 - k) % 3],
5782                                            vect_shift[(3 - k) % 3],
5783                                            shift2_mask);
5784           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5785           vect[k] = data_ref;
5786         }
5787
5788       (*result_chain)[3 - (nelt % 3)] = vect[2];
5789
5790       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5791       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5792                                        vect[0], shift3_mask);
5793       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5794       (*result_chain)[nelt % 3] = data_ref;
5795
5796       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5797       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5798                                        vect[1], shift4_mask);
5799       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5800       (*result_chain)[0] = data_ref;
5801       return true;
5802     }
5803   return false;
5804 }
5805
5806 /* Function vect_transform_grouped_load.
5807
5808    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5809    to perform their permutation and ascribe the result vectorized statements to
5810    the scalar statements.
5811 */
5812
5813 void
5814 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5815                              gimple_stmt_iterator *gsi)
5816 {
5817   machine_mode mode;
5818   vec<tree> result_chain = vNULL;
5819
5820   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5821      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5822      vectors, that are ready for vector computation.  */
5823   result_chain.create (size);
5824
5825   /* If reassociation width for vector type is 2 or greater target machine can
5826      execute 2 or more vector instructions in parallel.  Otherwise try to
5827      get chain for loads group using vect_shift_permute_load_chain.  */
5828   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5829   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5830       || pow2p_hwi (size)
5831       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5832                                          gsi, &result_chain))
5833     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5834   vect_record_grouped_load_vectors (stmt, result_chain);
5835   result_chain.release ();
5836 }
5837
5838 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5839    generated as part of the vectorization of STMT.  Assign the statement
5840    for each vector to the associated scalar statement.  */
5841
5842 void
5843 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5844 {
5845   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5846   gimple *next_stmt, *new_stmt;
5847   unsigned int i, gap_count;
5848   tree tmp_data_ref;
5849
5850   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5851      Since we scan the chain starting from it's first node, their order
5852      corresponds the order of data-refs in RESULT_CHAIN.  */
5853   next_stmt = first_stmt;
5854   gap_count = 1;
5855   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5856     {
5857       if (!next_stmt)
5858         break;
5859
5860       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5861        code elimination pass later.  No need to check for the first stmt in
5862        the group, since it always exists.
5863        GROUP_GAP is the number of steps in elements from the previous
5864        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5865        correspond to the gaps.  */
5866       if (next_stmt != first_stmt
5867           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5868       {
5869         gap_count++;
5870         continue;
5871       }
5872
5873       while (next_stmt)
5874         {
5875           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5876           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5877              copies, and we put the new vector statement in the first available
5878              RELATED_STMT.  */
5879           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5880             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5881           else
5882             {
5883               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5884                 {
5885                   gimple *prev_stmt =
5886                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5887                   gimple *rel_stmt =
5888                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5889                   while (rel_stmt)
5890                     {
5891                       prev_stmt = rel_stmt;
5892                       rel_stmt =
5893                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5894                     }
5895
5896                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5897                     new_stmt;
5898                 }
5899             }
5900
5901           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5902           gap_count = 1;
5903           /* If NEXT_STMT accesses the same DR as the previous statement,
5904              put the same TMP_DATA_REF as its vectorized statement; otherwise
5905              get the next data-ref from RESULT_CHAIN.  */
5906           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5907             break;
5908         }
5909     }
5910 }
5911
5912 /* Function vect_force_dr_alignment_p.
5913
5914    Returns whether the alignment of a DECL can be forced to be aligned
5915    on ALIGNMENT bit boundary.  */
5916
5917 bool
5918 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5919 {
5920   if (!VAR_P (decl))
5921     return false;
5922
5923   if (decl_in_symtab_p (decl)
5924       && !symtab_node::get (decl)->can_increase_alignment_p ())
5925     return false;
5926
5927   if (TREE_STATIC (decl))
5928     return (alignment <= MAX_OFILE_ALIGNMENT);
5929   else
5930     return (alignment <= MAX_STACK_ALIGNMENT);
5931 }
5932
5933
5934 /* Return whether the data reference DR is supported with respect to its
5935    alignment.
5936    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5937    it is aligned, i.e., check if it is possible to vectorize it with different
5938    alignment.  */
5939
5940 enum dr_alignment_support
5941 vect_supportable_dr_alignment (struct data_reference *dr,
5942                                bool check_aligned_accesses)
5943 {
5944   gimple *stmt = DR_STMT (dr);
5945   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5946   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5947   machine_mode mode = TYPE_MODE (vectype);
5948   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5949   struct loop *vect_loop = NULL;
5950   bool nested_in_vect_loop = false;
5951
5952   if (aligned_access_p (dr) && !check_aligned_accesses)
5953     return dr_aligned;
5954
5955   /* For now assume all conditional loads/stores support unaligned
5956      access without any special code.  */
5957   if (is_gimple_call (stmt)
5958       && gimple_call_internal_p (stmt)
5959       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5960           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5961     return dr_unaligned_supported;
5962
5963   if (loop_vinfo)
5964     {
5965       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5966       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5967     }
5968
5969   /* Possibly unaligned access.  */
5970
5971   /* We can choose between using the implicit realignment scheme (generating
5972      a misaligned_move stmt) and the explicit realignment scheme (generating
5973      aligned loads with a REALIGN_LOAD).  There are two variants to the
5974      explicit realignment scheme: optimized, and unoptimized.
5975      We can optimize the realignment only if the step between consecutive
5976      vector loads is equal to the vector size.  Since the vector memory
5977      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5978      is guaranteed that the misalignment amount remains the same throughout the
5979      execution of the vectorized loop.  Therefore, we can create the
5980      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5981      at the loop preheader.
5982
5983      However, in the case of outer-loop vectorization, when vectorizing a
5984      memory access in the inner-loop nested within the LOOP that is now being
5985      vectorized, while it is guaranteed that the misalignment of the
5986      vectorized memory access will remain the same in different outer-loop
5987      iterations, it is *not* guaranteed that is will remain the same throughout
5988      the execution of the inner-loop.  This is because the inner-loop advances
5989      with the original scalar step (and not in steps of VS).  If the inner-loop
5990      step happens to be a multiple of VS, then the misalignment remains fixed
5991      and we can use the optimized realignment scheme.  For example:
5992
5993       for (i=0; i<N; i++)
5994         for (j=0; j<M; j++)
5995           s += a[i+j];
5996
5997      When vectorizing the i-loop in the above example, the step between
5998      consecutive vector loads is 1, and so the misalignment does not remain
5999      fixed across the execution of the inner-loop, and the realignment cannot
6000      be optimized (as illustrated in the following pseudo vectorized loop):
6001
6002       for (i=0; i<N; i+=4)
6003         for (j=0; j<M; j++){
6004           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6005                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6006                          // (assuming that we start from an aligned address).
6007           }
6008
6009      We therefore have to use the unoptimized realignment scheme:
6010
6011       for (i=0; i<N; i+=4)
6012           for (j=k; j<M; j+=4)
6013           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6014                            // that the misalignment of the initial address is
6015                            // 0).
6016
6017      The loop can then be vectorized as follows:
6018
6019       for (k=0; k<4; k++){
6020         rt = get_realignment_token (&vp[k]);
6021         for (i=0; i<N; i+=4){
6022           v1 = vp[i+k];
6023           for (j=k; j<M; j+=4){
6024             v2 = vp[i+j+VS-1];
6025             va = REALIGN_LOAD <v1,v2,rt>;
6026             vs += va;
6027             v1 = v2;
6028           }
6029         }
6030     } */
6031
6032   if (DR_IS_READ (dr))
6033     {
6034       bool is_packed = false;
6035       tree type = (TREE_TYPE (DR_REF (dr)));
6036
6037       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6038           && (!targetm.vectorize.builtin_mask_for_load
6039               || targetm.vectorize.builtin_mask_for_load ()))
6040         {
6041           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6042
6043           /* If we are doing SLP then the accesses need not have the
6044              same alignment, instead it depends on the SLP group size.  */
6045           if (loop_vinfo
6046               && STMT_SLP_TYPE (stmt_info)
6047               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6048                               * GROUP_SIZE (vinfo_for_stmt
6049                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6050                               TYPE_VECTOR_SUBPARTS (vectype)))
6051             ;
6052           else if (!loop_vinfo
6053                    || (nested_in_vect_loop
6054                        && (TREE_INT_CST_LOW (DR_STEP (dr))
6055                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6056             return dr_explicit_realign;
6057           else
6058             return dr_explicit_realign_optimized;
6059         }
6060       if (!known_alignment_for_access_p (dr))
6061         is_packed = not_size_aligned (DR_REF (dr));
6062
6063       if (targetm.vectorize.support_vector_misalignment
6064             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6065         /* Can't software pipeline the loads, but can at least do them.  */
6066         return dr_unaligned_supported;
6067     }
6068   else
6069     {
6070       bool is_packed = false;
6071       tree type = (TREE_TYPE (DR_REF (dr)));
6072
6073       if (!known_alignment_for_access_p (dr))
6074         is_packed = not_size_aligned (DR_REF (dr));
6075
6076      if (targetm.vectorize.support_vector_misalignment
6077            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6078        return dr_unaligned_supported;
6079     }
6080
6081   /* Unsupported.  */
6082   return dr_unaligned_unsupported;
6083 }