gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode;
  65   scalar_int_mode array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   limit_p = !targetm.array_mode_supported_p (mode, count);
  70   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  71                           limit_p).exists (&array_mode))
  72     {
  73       if (dump_enabled_p ())
  74         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  75                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  76                          GET_MODE_NAME (mode), count);
  77       return false;
  78     }
  79
  80   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  81     {
  82       if (dump_enabled_p ())
  83         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  84                          "cannot use %s<%s><%s>\n", name,
  85                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  86       return false;
  87     }
  88
  89   if (dump_enabled_p ())
  90     dump_printf_loc (MSG_NOTE, vect_location,
  91                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  92                      GET_MODE_NAME (mode));
  93
  94   return true;
  95 }
  96
  97
  98 /* Return the smallest scalar part of STMT.
  99    This is used to determine the vectype of the stmt.  We generally set the
 100    vectype according to the type of the result (lhs).  For stmts whose
 101    result-type is different than the type of the arguments (e.g., demotion,
 102    promotion), vectype will be reset appropriately (later).  Note that we have
 103    to visit the smallest datatype in this function, because that determines the
 104    VF.  If the smallest datatype in the loop is present only as the rhs of a
 105    promotion operation - we'd miss it.
 106    Such a case, where a variable of this datatype does not appear in the lhs
 107    anywhere in the loop, can only occur if it's an invariant: e.g.:
 108    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 109    invariant motion.  However, we cannot rely on invariant motion to always
 110    take invariants out of the loop, and so in the case of promotion we also
 111    have to check the rhs.
 112    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 113    types.  */
 114
 115 tree
 116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 117                                HOST_WIDE_INT *rhs_size_unit)
 118 {
 119   tree scalar_type = gimple_expr_type (stmt);
 120   HOST_WIDE_INT lhs, rhs;
 121
 122   /* During the analysis phase, this function is called on arbitrary
 123      statements that might not have scalar results.  */
 124   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 125     return scalar_type;
 126
 127   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 128
 129   if (is_gimple_assign (stmt)
 130       && (gimple_assign_cast_p (stmt)
 131           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 132           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 133           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 134     {
 135       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 136
 137       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 138       if (rhs < lhs)
 139         scalar_type = rhs_type;
 140     }
 141
 142   *lhs_size_unit = lhs;
 143   *rhs_size_unit = rhs;
 144   return scalar_type;
 145 }
 146
 147
 148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 149    tested at run-time.  Return TRUE if DDR was successfully inserted.
 150    Return false if versioning is not supported.  */
 151
 152 static bool
 153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 154 {
 155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 156
 157   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 158     return false;
 159
 160   if (!runtime_alias_check_p (ddr, loop,
 161                               optimize_loop_nest_for_speed_p (loop)))
 162     return false;
 163
 164   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 165   return true;
 166 }
 167
 168
 169 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 170    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 171    distances.  These distances are conservatively correct but they don't
 172    reflect a guaranteed dependence.
 173
 174    Return true if this function does all the work necessary to avoid
 175    an alias or false if the caller should use the dependence distances
 176    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 177    the depth of the loop described by LOOP_VINFO and the other arguments
 178    are as for vect_analyze_data_ref_dependence.  */
 179
 180 static bool
 181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 182                                        loop_vec_info loop_vinfo,
 183                                        int loop_depth, unsigned int *max_vf)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   lambda_vector dist_v;
 187   unsigned int i;
 188   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 189     {
 190       int dist = dist_v[loop_depth];
 191       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 192         {
 193           /* If the user asserted safelen >= DIST consecutive iterations
 194              can be executed concurrently, assume independence.
 195
 196              ??? An alternative would be to add the alias check even
 197              in this case, and vectorize the fallback loop with the
 198              maximum VF set to safelen.  However, if the user has
 199              explicitly given a length, it's less likely that that
 200              would be a win.  */
 201           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 202             {
 203               if ((unsigned int) loop->safelen < *max_vf)
 204                 *max_vf = loop->safelen;
 205               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 206               continue;
 207             }
 208
 209           /* For dependence distances of 2 or more, we have the option
 210              of limiting VF or checking for an alias at runtime.
 211              Prefer to check at runtime if we can, to avoid limiting
 212              the VF unnecessarily when the bases are in fact independent.
 213
 214              Note that the alias checks will be removed if the VF ends up
 215              being small enough.  */
 216           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 217         }
 218     }
 219   return true;
 220 }
 221
 222
 223 /* Function vect_analyze_data_ref_dependence.
 224
 225    Return TRUE if there (might) exist a dependence between a memory-reference
 226    DRA and a memory-reference DRB.  When versioning for alias may check a
 227    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 228    the data dependence.  */
 229
 230 static bool
 231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 232                                   loop_vec_info loop_vinfo,
 233                                   unsigned int *max_vf)
 234 {
 235   unsigned int i;
 236   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 237   struct data_reference *dra = DDR_A (ddr);
 238   struct data_reference *drb = DDR_B (ddr);
 239   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 240   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 241   lambda_vector dist_v;
 242   unsigned int loop_depth;
 243
 244   /* In loop analysis all data references should be vectorizable.  */
 245   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 246       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 247     gcc_unreachable ();
 248
 249   /* Independent data accesses.  */
 250   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 251     return false;
 252
 253   if (dra == drb
 254       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 255     return false;
 256
 257   /* We do not have to consider dependences between accesses that belong
 258      to the same group.  */
 259   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 260       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 261     return false;
 262
 263   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 264      least two scalar iterations, there is always also a true dependence.
 265      As the vectorizer does not re-order loads and stores we can ignore
 266      the anti-dependence if TBAA can disambiguate both DRs similar to the
 267      case with known negative distance anti-dependences (positive
 268      distance anti-dependences would violate TBAA constraints).  */
 269   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 270        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 271       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 272                                  get_alias_set (DR_REF (drb))))
 273     return false;
 274
 275   /* Unknown data dependence.  */
 276   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 277     {
 278       /* If user asserted safelen consecutive iterations can be
 279          executed concurrently, assume independence.  */
 280       if (loop->safelen >= 2)
 281         {
 282           if ((unsigned int) loop->safelen < *max_vf)
 283             *max_vf = loop->safelen;
 284           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 285           return false;
 286         }
 287
 288       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 289           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 290         {
 291           if (dump_enabled_p ())
 292             {
 293               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 294                                "versioning for alias not supported for: "
 295                                "can't determine dependence between ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (dra));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 299               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 300                                  DR_REF (drb));
 301               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302             }
 303           return true;
 304         }
 305
 306       if (dump_enabled_p ())
 307         {
 308           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 309                            "versioning for alias required: "
 310                            "can't determine dependence between ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (dra));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 314           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 315                              DR_REF (drb));
 316           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 317         }
 318
 319       /* Add to list of ddrs that need to be tested at run-time.  */
 320       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 321     }
 322
 323   /* Known data dependence.  */
 324   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 325     {
 326       /* If user asserted safelen consecutive iterations can be
 327          executed concurrently, assume independence.  */
 328       if (loop->safelen >= 2)
 329         {
 330           if ((unsigned int) loop->safelen < *max_vf)
 331             *max_vf = loop->safelen;
 332           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 333           return false;
 334         }
 335
 336       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 337           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 338         {
 339           if (dump_enabled_p ())
 340             {
 341               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 342                                "versioning for alias not supported for: "
 343                                "bad dist vector for ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (dra));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 347               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 348                                  DR_REF (drb));
 349               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 350             }
 351           return true;
 352         }
 353
 354       if (dump_enabled_p ())
 355         {
 356           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 357                            "versioning for alias required: "
 358                            "bad dist vector for ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 360           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 361           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 362           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 363         }
 364       /* Add to list of ddrs that need to be tested at run-time.  */
 365       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 366     }
 367
 368   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 369
 370   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 371       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 372                                                 loop_depth, max_vf))
 373     return false;
 374
 375   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 376     {
 377       int dist = dist_v[loop_depth];
 378
 379       if (dump_enabled_p ())
 380         dump_printf_loc (MSG_NOTE, vect_location,
 381                          "dependence distance  = %d.\n", dist);
 382
 383       if (dist == 0)
 384         {
 385           if (dump_enabled_p ())
 386             {
 387               dump_printf_loc (MSG_NOTE, vect_location,
 388                                "dependence distance == 0 between ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 390               dump_printf (MSG_NOTE, " and ");
 391               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 392               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 393             }
 394
 395           /* When we perform grouped accesses and perform implicit CSE
 396              by detecting equal accesses and doing disambiguation with
 397              runtime alias tests like for
 398                 .. = a[i];
 399                 .. = a[i+1];
 400                 a[i] = ..;
 401                 a[i+1] = ..;
 402                 *p = ..;
 403                 .. = a[i];
 404                 .. = a[i+1];
 405              where we will end up loading { a[i], a[i+1] } once, make
 406              sure that inserting group loads before the first load and
 407              stores after the last store will do the right thing.
 408              Similar for groups like
 409                 a[i] = ...;
 410                 ... = a[i];
 411                 a[i+1] = ...;
 412              where loads from the group interleave with the store.  */
 413           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 414               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 415             {
 416               gimple *earlier_stmt;
 417               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 418               if (DR_IS_WRITE
 419                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 420                 {
 421                   if (dump_enabled_p ())
 422                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                      "READ_WRITE dependence in interleaving."
 424                                      "\n");
 425                   return true;
 426                 }
 427             }
 428
 429           continue;
 430         }
 431
 432       if (dist > 0 && DDR_REVERSED_P (ddr))
 433         {
 434           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 435              reversed (to make distance vector positive), and the actual
 436              distance is negative.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 439                              "dependence distance negative.\n");
 440           /* Record a negative dependence distance to later limit the
 441              amount of stmt copying / unrolling we can perform.
 442              Only need to handle read-after-write dependence.  */
 443           if (DR_IS_READ (drb)
 444               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 445                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 446             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 447           continue;
 448         }
 449
 450       unsigned int abs_dist = abs (dist);
 451       if (abs_dist >= 2 && abs_dist < *max_vf)
 452         {
 453           /* The dependence distance requires reduction of the maximal
 454              vectorization factor.  */
 455           *max_vf = abs (dist);
 456           if (dump_enabled_p ())
 457             dump_printf_loc (MSG_NOTE, vect_location,
 458                              "adjusting maximal vectorization factor to %i\n",
 459                              *max_vf);
 460         }
 461
 462       if (abs_dist >= *max_vf)
 463         {
 464           /* Dependence distance does not create dependence, as far as
 465              vectorization is concerned, in this case.  */
 466           if (dump_enabled_p ())
 467             dump_printf_loc (MSG_NOTE, vect_location,
 468                              "dependence distance >= VF.\n");
 469           continue;
 470         }
 471
 472       if (dump_enabled_p ())
 473         {
 474           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                        "not vectorized, possible dependence "
 476                        "between data-refs ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 478           dump_printf (MSG_NOTE,  " and ");
 479           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 480           dump_printf (MSG_NOTE,  "\n");
 481         }
 482
 483       return true;
 484     }
 485
 486   return false;
 487 }
 488
 489 /* Function vect_analyze_data_ref_dependences.
 490
 491    Examine all the data references in the loop, and make sure there do not
 492    exist any data dependences between them.  Set *MAX_VF according to
 493    the maximum vectorization factor the data dependences allow.  */
 494
 495 bool
 496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 497                                    unsigned int *max_vf)
 498 {
 499   unsigned int i;
 500   struct data_dependence_relation *ddr;
 501
 502   if (dump_enabled_p ())
 503     dump_printf_loc (MSG_NOTE, vect_location,
 504                      "=== vect_analyze_data_ref_dependences ===\n");
 505
 506   LOOP_VINFO_DDRS (loop_vinfo)
 507     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 508              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 509   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 510   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 511   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 512                                 &LOOP_VINFO_DDRS (loop_vinfo),
 513                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 514     return false;
 515
 516   /* For epilogues we either have no aliases or alias versioning
 517      was applied to original loop.  Therefore we may just get max_vf
 518      using VF of original loop.  */
 519   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 520     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 521   else
 522     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 523       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 524         return false;
 525
 526   return true;
 527 }
 528
 529
 530 /* Function vect_slp_analyze_data_ref_dependence.
 531
 532    Return TRUE if there (might) exist a dependence between a memory-reference
 533    DRA and a memory-reference DRB.  When versioning for alias may check a
 534    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 535    the data dependence.  */
 536
 537 static bool
 538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 539 {
 540   struct data_reference *dra = DDR_A (ddr);
 541   struct data_reference *drb = DDR_B (ddr);
 542
 543   /* We need to check dependences of statements marked as unvectorizable
 544      as well, they still can prohibit vectorization.  */
 545
 546   /* Independent data accesses.  */
 547   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 548     return false;
 549
 550   if (dra == drb)
 551     return false;
 552
 553   /* Read-read is OK.  */
 554   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 555     return false;
 556
 557   /* If dra and drb are part of the same interleaving chain consider
 558      them independent.  */
 559   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 560       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 561           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 562     return false;
 563
 564   /* Unknown data dependence.  */
 565   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 566     {
 567       if  (dump_enabled_p ())
 568         {
 569           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 570                            "can't determine dependence between ");
 571           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 572           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 573           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 574           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 575         }
 576     }
 577   else if (dump_enabled_p ())
 578     {
 579       dump_printf_loc (MSG_NOTE, vect_location,
 580                        "determined dependence between ");
 581       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 582       dump_printf (MSG_NOTE, " and ");
 583       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 584       dump_printf (MSG_NOTE,  "\n");
 585     }
 586
 587   return true;
 588 }
 589
 590
 591 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 592    contain the vector of scalar stores of this instance if we are
 593    disambiguating the loads.  */
 594
 595 static bool
 596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 597                                    vec<gimple *> stores, gimple *last_store)
 598 {
 599   /* This walks over all stmts involved in the SLP load/store done
 600      in NODE verifying we can sink them up to the last stmt in the
 601      group.  */
 602   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 603   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 604     {
 605       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 606       if (access == last_access)
 607         continue;
 608       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 609       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 610            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 611         {
 612           gimple *stmt = gsi_stmt (gsi);
 613           if (! gimple_vuse (stmt)
 614               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 615             continue;
 616
 617           /* If we couldn't record a (single) data reference for this
 618              stmt we have to give up.  */
 619           /* ???  Here and below if dependence analysis fails we can resort
 620              to the alias oracle which can handle more kinds of stmts.  */
 621           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 622           if (!dr_b)
 623             return false;
 624
 625           bool dependent = false;
 626           /* If we run into a store of this same instance (we've just
 627              marked those) then delay dependence checking until we run
 628              into the last store because this is where it will have
 629              been sunk to (and we verify if we can do that as well).  */
 630           if (gimple_visited_p (stmt))
 631             {
 632               if (stmt != last_store)
 633                 continue;
 634               unsigned i;
 635               gimple *store;
 636               FOR_EACH_VEC_ELT (stores, i, store)
 637                 {
 638                   data_reference *store_dr
 639                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 640                   ddr_p ddr = initialize_data_dependence_relation
 641                                 (dr_a, store_dr, vNULL);
 642                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 643                   free_dependence_relation (ddr);
 644                   if (dependent)
 645                     break;
 646                 }
 647             }
 648           else
 649             {
 650               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 651                                                                dr_b, vNULL);
 652               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 653               free_dependence_relation (ddr);
 654             }
 655           if (dependent)
 656             return false;
 657         }
 658     }
 659   return true;
 660 }
 661
 662
 663 /* Function vect_analyze_data_ref_dependences.
 664
 665    Examine all the data references in the basic-block, and make sure there
 666    do not exist any data dependences between them.  Set *MAX_VF according to
 667    the maximum vectorization factor the data dependences allow.  */
 668
 669 bool
 670 vect_slp_analyze_instance_dependence (slp_instance instance)
 671 {
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "=== vect_slp_analyze_instance_dependence ===\n");
 675
 676   /* The stores of this instance are at the root of the SLP tree.  */
 677   slp_tree store = SLP_INSTANCE_TREE (instance);
 678   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 679     store = NULL;
 680
 681   /* Verify we can sink stores to the vectorized stmt insert location.  */
 682   gimple *last_store = NULL;
 683   if (store)
 684     {
 685       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 686         return false;
 687
 688       /* Mark stores in this instance and remember the last one.  */
 689       last_store = vect_find_last_scalar_stmt_in_slp (store);
 690       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 691         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 692     }
 693
 694   bool res = true;
 695
 696   /* Verify we can sink loads to the vectorized stmt insert location,
 697      special-casing stores of this instance.  */
 698   slp_tree load;
 699   unsigned int i;
 700   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 701     if (! vect_slp_analyze_node_dependences (instance, load,
 702                                              store
 703                                              ? SLP_TREE_SCALAR_STMTS (store)
 704                                              : vNULL, last_store))
 705       {
 706         res = false;
 707         break;
 708       }
 709
 710   /* Unset the visited flag.  */
 711   if (store)
 712     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 713       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 714
 715   return res;
 716 }
 717
 718 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 719    the statement that contains DRB, which is useful for recording in the
 720    dump file.  */
 721
 722 static void
 723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 724                             innermost_loop_behavior *drb)
 725 {
 726   bool existed;
 727   innermost_loop_behavior *&entry
 728     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 729   if (!existed || entry->base_alignment < drb->base_alignment)
 730     {
 731       entry = drb;
 732       if (dump_enabled_p ())
 733         {
 734           dump_printf_loc (MSG_NOTE, vect_location,
 735                            "recording new base alignment for ");
 736           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 737           dump_printf (MSG_NOTE, "\n");
 738           dump_printf_loc (MSG_NOTE, vect_location,
 739                            "  alignment:    %d\n", drb->base_alignment);
 740           dump_printf_loc (MSG_NOTE, vect_location,
 741                            "  misalignment: %d\n", drb->base_misalignment);
 742           dump_printf_loc (MSG_NOTE, vect_location,
 743                            "  based on:     ");
 744           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 745         }
 746     }
 747 }
 748
 749 /* If the region we're going to vectorize is reached, all unconditional
 750    data references occur at least once.  We can therefore pool the base
 751    alignment guarantees from each unconditional reference.  Do this by
 752    going through all the data references in VINFO and checking whether
 753    the containing statement makes the reference unconditionally.  If so,
 754    record the alignment of the base address in VINFO so that it can be
 755    used for all other references with the same base.  */
 756
 757 void
 758 vect_record_base_alignments (vec_info *vinfo)
 759 {
 760   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 761   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 762   data_reference *dr;
 763   unsigned int i;
 764   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 765     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 766       {
 767         gimple *stmt = DR_STMT (dr);
 768         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 769
 770         /* If DR is nested in the loop that is being vectorized, we can also
 771            record the alignment of the base wrt the outer loop.  */
 772         if (loop && nested_in_vect_loop_p (loop, stmt))
 773           {
 774             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 775             vect_record_base_alignment
 776               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 777           }
 778       }
 779 }
 780
 781 /* Return the target alignment for the vectorized form of DR.  */
 782
 783 static unsigned int
 784 vect_calculate_target_alignment (struct data_reference *dr)
 785 {
 786   gimple *stmt = DR_STMT (dr);
 787   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 788   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 789   return targetm.vectorize.preferred_vector_alignment (vectype);
 790 }
 791
 792 /* Function vect_compute_data_ref_alignment
 793
 794    Compute the misalignment of the data reference DR.
 795
 796    Output:
 797    1. If during the misalignment computation it is found that the data reference
 798       cannot be vectorized then false is returned.
 799    2. DR_MISALIGNMENT (DR) is defined.
 800
 801    FOR NOW: No analysis is actually performed. Misalignment is calculated
 802    only for trivial cases. TODO.  */
 803
 804 bool
 805 vect_compute_data_ref_alignment (struct data_reference *dr)
 806 {
 807   gimple *stmt = DR_STMT (dr);
 808   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 809   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 810   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 811   struct loop *loop = NULL;
 812   tree ref = DR_REF (dr);
 813   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 814
 815   if (dump_enabled_p ())
 816     dump_printf_loc (MSG_NOTE, vect_location,
 817                      "vect_compute_data_ref_alignment:\n");
 818
 819   if (loop_vinfo)
 820     loop = LOOP_VINFO_LOOP (loop_vinfo);
 821
 822   /* Initialize misalignment to unknown.  */
 823   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 824
 825   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 826   bool step_preserves_misalignment_p;
 827
 828   unsigned HOST_WIDE_INT vector_alignment
 829     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 830   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 831
 832   /* No step for BB vectorization.  */
 833   if (!loop)
 834     {
 835       gcc_assert (integer_zerop (drb->step));
 836       step_preserves_misalignment_p = true;
 837     }
 838
 839   /* In case the dataref is in an inner-loop of the loop that is being
 840      vectorized (LOOP), we use the base and misalignment information
 841      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 842      stays the same throughout the execution of the inner-loop, which is why
 843      we have to check that the stride of the dataref in the inner-loop evenly
 844      divides by the vector alignment.  */
 845   else if (nested_in_vect_loop_p (loop, stmt))
 846     {
 847       step_preserves_misalignment_p
 848         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 849
 850       if (dump_enabled_p ())
 851         {
 852           if (step_preserves_misalignment_p)
 853             dump_printf_loc (MSG_NOTE, vect_location,
 854                              "inner step divides the vector alignment.\n");
 855           else
 856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 857                              "inner step doesn't divide the vector"
 858                              " alignment.\n");
 859         }
 860     }
 861
 862   /* Similarly we can only use base and misalignment information relative to
 863      an innermost loop if the misalignment stays the same throughout the
 864      execution of the loop.  As above, this is the case if the stride of
 865      the dataref evenly divides by the alignment.  */
 866   else
 867     {
 868       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 869       step_preserves_misalignment_p
 870         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 871
 872       if (!step_preserves_misalignment_p && dump_enabled_p ())
 873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 874                          "step doesn't divide the vector alignment.\n");
 875     }
 876
 877   unsigned int base_alignment = drb->base_alignment;
 878   unsigned int base_misalignment = drb->base_misalignment;
 879
 880   /* Calculate the maximum of the pooled base address alignment and the
 881      alignment that we can compute for DR itself.  */
 882   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 883   if (entry && base_alignment < (*entry)->base_alignment)
 884     {
 885       base_alignment = (*entry)->base_alignment;
 886       base_misalignment = (*entry)->base_misalignment;
 887     }
 888
 889   if (drb->offset_alignment < vector_alignment
 890       || !step_preserves_misalignment_p
 891       /* We need to know whether the step wrt the vectorized loop is
 892          negative when computing the starting misalignment below.  */
 893       || TREE_CODE (drb->step) != INTEGER_CST)
 894     {
 895       if (dump_enabled_p ())
 896         {
 897           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 898                            "Unknown alignment for access: ");
 899           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 900           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 901         }
 902       return true;
 903     }
 904
 905   if (base_alignment < vector_alignment)
 906     {
 907       tree base = drb->base_address;
 908       if (TREE_CODE (base) == ADDR_EXPR)
 909         base = TREE_OPERAND (base, 0);
 910       if (!vect_can_force_dr_alignment_p (base,
 911                                           vector_alignment * BITS_PER_UNIT))
 912         {
 913           if (dump_enabled_p ())
 914             {
 915               dump_printf_loc (MSG_NOTE, vect_location,
 916                                "can't force alignment of ref: ");
 917               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 918               dump_printf (MSG_NOTE, "\n");
 919             }
 920           return true;
 921         }
 922
 923       if (DECL_USER_ALIGN (base))
 924         {
 925           if (dump_enabled_p ())
 926             {
 927               dump_printf_loc (MSG_NOTE, vect_location,
 928                                "not forcing alignment of user-aligned "
 929                                "variable: ");
 930               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 931               dump_printf (MSG_NOTE, "\n");
 932             }
 933           return true;
 934         }
 935
 936       /* Force the alignment of the decl.
 937          NOTE: This is the only change to the code we make during
 938          the analysis phase, before deciding to vectorize the loop.  */
 939       if (dump_enabled_p ())
 940         {
 941           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 942           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 943           dump_printf (MSG_NOTE, "\n");
 944         }
 945
 946       DR_VECT_AUX (dr)->base_decl = base;
 947       DR_VECT_AUX (dr)->base_misaligned = true;
 948       base_misalignment = 0;
 949     }
 950   poly_int64 misalignment
 951     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 952
 953   /* If this is a backward running DR then first access in the larger
 954      vectype actually is N-1 elements before the address in the DR.
 955      Adjust misalign accordingly.  */
 956   if (tree_int_cst_sgn (drb->step) < 0)
 957     /* PLUS because STEP is negative.  */
 958     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 959                      * TREE_INT_CST_LOW (drb->step));
 960
 961   unsigned int const_misalignment;
 962   if (!known_misalignment (misalignment, vector_alignment,
 963                            &const_misalignment))
 964     {
 965       if (dump_enabled_p ())
 966         {
 967           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 968                            "Non-constant misalignment for access: ");
 969           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 970           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 971         }
 972       return true;
 973     }
 974
 975   SET_DR_MISALIGNMENT (dr, const_misalignment);
 976
 977   if (dump_enabled_p ())
 978     {
 979       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 980                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 981       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 982       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 983     }
 984
 985   return true;
 986 }
 987
 988 /* Function vect_update_misalignment_for_peel.
 989    Sets DR's misalignment
 990    - to 0 if it has the same alignment as DR_PEEL,
 991    - to the misalignment computed using NPEEL if DR's salignment is known,
 992    - to -1 (unknown) otherwise.
 993
 994    DR - the data reference whose misalignment is to be adjusted.
 995    DR_PEEL - the data reference whose misalignment is being made
 996              zero in the vector loop by the peel.
 997    NPEEL - the number of iterations in the peel loop if the misalignment
 998            of DR_PEEL is known at compile time.  */
 999
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002                                    struct data_reference *dr_peel, int npeel)
1003 {
1004   unsigned int i;
1005   vec<dr_p> same_aligned_drs;
1006   struct data_reference *current_dr;
1007   int dr_size = vect_get_scalar_dr_size (dr);
1008   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1011
1012  /* For interleaved data accesses the step in the loop must be multiplied by
1013      the size of the interleaving group.  */
1014   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1018
1019   /* It can be assumed that the data refs with the same alignment as dr_peel
1020      are aligned in the vector loop.  */
1021   same_aligned_drs
1022     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1024     {
1025       if (current_dr != dr)
1026         continue;
1027       gcc_assert (!known_alignment_for_access_p (dr)
1028                   || !known_alignment_for_access_p (dr_peel)
1029                   || (DR_MISALIGNMENT (dr) / dr_size
1030                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031       SET_DR_MISALIGNMENT (dr, 0);
1032       return;
1033     }
1034
1035   if (known_alignment_for_access_p (dr)
1036       && known_alignment_for_access_p (dr_peel))
1037     {
1038       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039       int misal = DR_MISALIGNMENT (dr);
1040       misal += negative ? -npeel * dr_size : npeel * dr_size;
1041       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042       SET_DR_MISALIGNMENT (dr, misal);
1043       return;
1044     }
1045
1046   if (dump_enabled_p ())
1047     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048                      "to unknown (-1).\n");
1049   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1050 }
1051
1052
1053 /* Function verify_data_ref_alignment
1054
1055    Return TRUE if DR can be handled with respect to alignment.  */
1056
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1059 {
1060   enum dr_alignment_support supportable_dr_alignment
1061     = vect_supportable_dr_alignment (dr, false);
1062   if (!supportable_dr_alignment)
1063     {
1064       if (dump_enabled_p ())
1065         {
1066           if (DR_IS_READ (dr))
1067             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068                              "not vectorized: unsupported unaligned load.");
1069           else
1070             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071                              "not vectorized: unsupported unaligned "
1072                              "store.");
1073
1074           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075                              DR_REF (dr));
1076           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1077         }
1078       return false;
1079     }
1080
1081   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082     dump_printf_loc (MSG_NOTE, vect_location,
1083                      "Vectorizing an unaligned access.\n");
1084
1085   return true;
1086 }
1087
1088 /* Function vect_verify_datarefs_alignment
1089
1090    Return TRUE if all data references in the loop can be
1091    handled with respect to alignment.  */
1092
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1095 {
1096   vec<data_reference_p> datarefs = vinfo->datarefs;
1097   struct data_reference *dr;
1098   unsigned int i;
1099
1100   FOR_EACH_VEC_ELT (datarefs, i, dr)
1101     {
1102       gimple *stmt = DR_STMT (dr);
1103       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1104
1105       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106         continue;
1107
1108       /* For interleaving, only the alignment of the first access matters.   */
1109       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111         continue;
1112
1113       /* Strided accesses perform only component accesses, alignment is
1114          irrelevant for them.  */
1115       if (STMT_VINFO_STRIDED_P (stmt_info)
1116           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117         continue;
1118
1119       if (! verify_data_ref_alignment (dr))
1120         return false;
1121     }
1122
1123   return true;
1124 }
1125
1126 /* Given an memory reference EXP return whether its alignment is less
1127    than its size.  */
1128
1129 static bool
1130 not_size_aligned (tree exp)
1131 {
1132   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133     return true;
1134
1135   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136           > get_object_alignment (exp));
1137 }
1138
1139 /* Function vector_alignment_reachable_p
1140
1141    Return true if vector alignment for DR is reachable by peeling
1142    a few loop iterations.  Return false otherwise.  */
1143
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1146 {
1147   gimple *stmt = DR_STMT (dr);
1148   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1150
1151   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1152     {
1153       /* For interleaved access we peel only if number of iterations in
1154          the prolog loop ({VF - misalignment}), is a multiple of the
1155          number of the interleaved accesses.  */
1156       int elem_size, mis_in_elements;
1157       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1158
1159       /* FORNOW: handle only known alignment.  */
1160       if (!known_alignment_for_access_p (dr))
1161         return false;
1162
1163       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1164       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1165
1166       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1167         return false;
1168     }
1169
1170   /* If misalignment is known at the compile time then allow peeling
1171      only if natural alignment is reachable through peeling.  */
1172   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1173     {
1174       HOST_WIDE_INT elmsize =
1175                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1176       if (dump_enabled_p ())
1177         {
1178           dump_printf_loc (MSG_NOTE, vect_location,
1179                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1180           dump_printf (MSG_NOTE,
1181                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1182         }
1183       if (DR_MISALIGNMENT (dr) % elmsize)
1184         {
1185           if (dump_enabled_p ())
1186             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187                              "data size does not divide the misalignment.\n");
1188           return false;
1189         }
1190     }
1191
1192   if (!known_alignment_for_access_p (dr))
1193     {
1194       tree type = TREE_TYPE (DR_REF (dr));
1195       bool is_packed = not_size_aligned (DR_REF (dr));
1196       if (dump_enabled_p ())
1197         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198                          "Unknown misalignment, %snaturally aligned\n",
1199                          is_packed ? "not " : "");
1200       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1201     }
1202
1203   return true;
1204 }
1205
1206
1207 /* Calculate the cost of the memory access represented by DR.  */
1208
1209 static void
1210 vect_get_data_access_cost (struct data_reference *dr,
1211                            unsigned int *inside_cost,
1212                            unsigned int *outside_cost,
1213                            stmt_vector_for_cost *body_cost_vec)
1214 {
1215   gimple *stmt = DR_STMT (dr);
1216   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1217   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1218   int ncopies;
1219
1220   if (PURE_SLP_STMT (stmt_info))
1221     ncopies = 1;
1222   else
1223     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1224
1225   if (DR_IS_READ (dr))
1226     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1227                         NULL, body_cost_vec, false);
1228   else
1229     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1230
1231   if (dump_enabled_p ())
1232     dump_printf_loc (MSG_NOTE, vect_location,
1233                      "vect_get_data_access_cost: inside_cost = %d, "
1234                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1235 }
1236
1237
1238 typedef struct _vect_peel_info
1239 {
1240   struct data_reference *dr;
1241   int npeel;
1242   unsigned int count;
1243 } *vect_peel_info;
1244
1245 typedef struct _vect_peel_extended_info
1246 {
1247   struct _vect_peel_info peel_info;
1248   unsigned int inside_cost;
1249   unsigned int outside_cost;
1250 } *vect_peel_extended_info;
1251
1252
1253 /* Peeling hashtable helpers.  */
1254
1255 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1256 {
1257   static inline hashval_t hash (const _vect_peel_info *);
1258   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1259 };
1260
1261 inline hashval_t
1262 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1263 {
1264   return (hashval_t) peel_info->npeel;
1265 }
1266
1267 inline bool
1268 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1269 {
1270   return (a->npeel == b->npeel);
1271 }
1272
1273
1274 /* Insert DR into peeling hash table with NPEEL as key.  */
1275
1276 static void
1277 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1278                           loop_vec_info loop_vinfo, struct data_reference *dr,
1279                           int npeel)
1280 {
1281   struct _vect_peel_info elem, *slot;
1282   _vect_peel_info **new_slot;
1283   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1284
1285   elem.npeel = npeel;
1286   slot = peeling_htab->find (&elem);
1287   if (slot)
1288     slot->count++;
1289   else
1290     {
1291       slot = XNEW (struct _vect_peel_info);
1292       slot->npeel = npeel;
1293       slot->dr = dr;
1294       slot->count = 1;
1295       new_slot = peeling_htab->find_slot (slot, INSERT);
1296       *new_slot = slot;
1297     }
1298
1299   if (!supportable_dr_alignment
1300       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1301     slot->count += VECT_MAX_COST;
1302 }
1303
1304
1305 /* Traverse peeling hash table to find peeling option that aligns maximum
1306    number of data accesses.  */
1307
1308 int
1309 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1310                                      _vect_peel_extended_info *max)
1311 {
1312   vect_peel_info elem = *slot;
1313
1314   if (elem->count > max->peel_info.count
1315       || (elem->count == max->peel_info.count
1316           && max->peel_info.npeel > elem->npeel))
1317     {
1318       max->peel_info.npeel = elem->npeel;
1319       max->peel_info.count = elem->count;
1320       max->peel_info.dr = elem->dr;
1321     }
1322
1323   return 1;
1324 }
1325
1326 /* Get the costs of peeling NPEEL iterations checking data access costs
1327    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1328    misalignment will be zero after peeling.  */
1329
1330 static void
1331 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1332                                 struct data_reference *dr0,
1333                                 unsigned int *inside_cost,
1334                                 unsigned int *outside_cost,
1335                                 stmt_vector_for_cost *body_cost_vec,
1336                                 unsigned int npeel,
1337                                 bool unknown_misalignment)
1338 {
1339   unsigned i;
1340   data_reference *dr;
1341
1342   FOR_EACH_VEC_ELT (datarefs, i, dr)
1343     {
1344       gimple *stmt = DR_STMT (dr);
1345       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1346       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1347         continue;
1348
1349       /* For interleaving, only the alignment of the first access
1350          matters.  */
1351       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1352           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1353         continue;
1354
1355       /* Strided accesses perform only component accesses, alignment is
1356          irrelevant for them.  */
1357       if (STMT_VINFO_STRIDED_P (stmt_info)
1358           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1359         continue;
1360
1361       int save_misalignment;
1362       save_misalignment = DR_MISALIGNMENT (dr);
1363       if (npeel == 0)
1364         ;
1365       else if (unknown_misalignment && dr == dr0)
1366         SET_DR_MISALIGNMENT (dr, 0);
1367       else
1368         vect_update_misalignment_for_peel (dr, dr0, npeel);
1369       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1370                                  body_cost_vec);
1371       SET_DR_MISALIGNMENT (dr, save_misalignment);
1372     }
1373 }
1374
1375 /* Traverse peeling hash table and calculate cost for each peeling option.
1376    Find the one with the lowest cost.  */
1377
1378 int
1379 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1380                                    _vect_peel_extended_info *min)
1381 {
1382   vect_peel_info elem = *slot;
1383   int dummy;
1384   unsigned int inside_cost = 0, outside_cost = 0;
1385   gimple *stmt = DR_STMT (elem->dr);
1386   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1387   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1388   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1389                        epilogue_cost_vec;
1390
1391   prologue_cost_vec.create (2);
1392   body_cost_vec.create (2);
1393   epilogue_cost_vec.create (2);
1394
1395   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1396                                   elem->dr, &inside_cost, &outside_cost,
1397                                   &body_cost_vec, elem->npeel, false);
1398
1399   body_cost_vec.release ();
1400
1401   outside_cost += vect_get_known_peeling_cost
1402     (loop_vinfo, elem->npeel, &dummy,
1403      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1404      &prologue_cost_vec, &epilogue_cost_vec);
1405
1406   /* Prologue and epilogue costs are added to the target model later.
1407      These costs depend only on the scalar iteration cost, the
1408      number of peeling iterations finally chosen, and the number of
1409      misaligned statements.  So discard the information found here.  */
1410   prologue_cost_vec.release ();
1411   epilogue_cost_vec.release ();
1412
1413   if (inside_cost < min->inside_cost
1414       || (inside_cost == min->inside_cost
1415           && outside_cost < min->outside_cost))
1416     {
1417       min->inside_cost = inside_cost;
1418       min->outside_cost = outside_cost;
1419       min->peel_info.dr = elem->dr;
1420       min->peel_info.npeel = elem->npeel;
1421       min->peel_info.count = elem->count;
1422     }
1423
1424   return 1;
1425 }
1426
1427
1428 /* Choose best peeling option by traversing peeling hash table and either
1429    choosing an option with the lowest cost (if cost model is enabled) or the
1430    option that aligns as many accesses as possible.  */
1431
1432 static struct _vect_peel_extended_info
1433 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1434                                        loop_vec_info loop_vinfo)
1435 {
1436    struct _vect_peel_extended_info res;
1437
1438    res.peel_info.dr = NULL;
1439
1440    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1441      {
1442        res.inside_cost = INT_MAX;
1443        res.outside_cost = INT_MAX;
1444        peeling_htab->traverse <_vect_peel_extended_info *,
1445                                vect_peeling_hash_get_lowest_cost> (&res);
1446      }
1447    else
1448      {
1449        res.peel_info.count = 0;
1450        peeling_htab->traverse <_vect_peel_extended_info *,
1451                                vect_peeling_hash_get_most_frequent> (&res);
1452        res.inside_cost = 0;
1453        res.outside_cost = 0;
1454      }
1455
1456    return res;
1457 }
1458
1459 /* Return true if the new peeling NPEEL is supported.  */
1460
1461 static bool
1462 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1463                           unsigned npeel)
1464 {
1465   unsigned i;
1466   struct data_reference *dr = NULL;
1467   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1468   gimple *stmt;
1469   stmt_vec_info stmt_info;
1470   enum dr_alignment_support supportable_dr_alignment;
1471
1472   /* Ensure that all data refs can be vectorized after the peel.  */
1473   FOR_EACH_VEC_ELT (datarefs, i, dr)
1474     {
1475       int save_misalignment;
1476
1477       if (dr == dr0)
1478         continue;
1479
1480       stmt = DR_STMT (dr);
1481       stmt_info = vinfo_for_stmt (stmt);
1482       /* For interleaving, only the alignment of the first access
1483          matters.  */
1484       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1485           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1486         continue;
1487
1488       /* Strided accesses perform only component accesses, alignment is
1489          irrelevant for them.  */
1490       if (STMT_VINFO_STRIDED_P (stmt_info)
1491           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1492         continue;
1493
1494       save_misalignment = DR_MISALIGNMENT (dr);
1495       vect_update_misalignment_for_peel (dr, dr0, npeel);
1496       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1497       SET_DR_MISALIGNMENT (dr, save_misalignment);
1498
1499       if (!supportable_dr_alignment)
1500         return false;
1501     }
1502
1503   return true;
1504 }
1505
1506 /* Function vect_enhance_data_refs_alignment
1507
1508    This pass will use loop versioning and loop peeling in order to enhance
1509    the alignment of data references in the loop.
1510
1511    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1512    original loop is to be vectorized.  Any other loops that are created by
1513    the transformations performed in this pass - are not supposed to be
1514    vectorized.  This restriction will be relaxed.
1515
1516    This pass will require a cost model to guide it whether to apply peeling
1517    or versioning or a combination of the two.  For example, the scheme that
1518    intel uses when given a loop with several memory accesses, is as follows:
1519    choose one memory access ('p') which alignment you want to force by doing
1520    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1521    other accesses are not necessarily aligned, or (2) use loop versioning to
1522    generate one loop in which all accesses are aligned, and another loop in
1523    which only 'p' is necessarily aligned.
1524
1525    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1526    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1527    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1528
1529    Devising a cost model is the most critical aspect of this work.  It will
1530    guide us on which access to peel for, whether to use loop versioning, how
1531    many versions to create, etc.  The cost model will probably consist of
1532    generic considerations as well as target specific considerations (on
1533    powerpc for example, misaligned stores are more painful than misaligned
1534    loads).
1535
1536    Here are the general steps involved in alignment enhancements:
1537
1538      -- original loop, before alignment analysis:
1539         for (i=0; i<N; i++){
1540           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1541           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1542         }
1543
1544      -- After vect_compute_data_refs_alignment:
1545         for (i=0; i<N; i++){
1546           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1547           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1548         }
1549
1550      -- Possibility 1: we do loop versioning:
1551      if (p is aligned) {
1552         for (i=0; i<N; i++){    # loop 1A
1553           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1554           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1555         }
1556      }
1557      else {
1558         for (i=0; i<N; i++){    # loop 1B
1559           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1560           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1561         }
1562      }
1563
1564      -- Possibility 2: we do loop peeling:
1565      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1566         x = q[i];
1567         p[i] = y;
1568      }
1569      for (i = 3; i < N; i++){   # loop 2A
1570         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1571         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1572      }
1573
1574      -- Possibility 3: combination of loop peeling and versioning:
1575      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1576         x = q[i];
1577         p[i] = y;
1578      }
1579      if (p is aligned) {
1580         for (i = 3; i<N; i++){  # loop 3A
1581           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1582           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1583         }
1584      }
1585      else {
1586         for (i = 3; i<N; i++){  # loop 3B
1587           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1588           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1589         }
1590      }
1591
1592      These loops are later passed to loop_transform to be vectorized.  The
1593      vectorizer will use the alignment information to guide the transformation
1594      (whether to generate regular loads/stores, or with special handling for
1595      misalignment).  */
1596
1597 bool
1598 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1599 {
1600   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1601   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1602   enum dr_alignment_support supportable_dr_alignment;
1603   struct data_reference *dr0 = NULL, *first_store = NULL;
1604   struct data_reference *dr;
1605   unsigned int i, j;
1606   bool do_peeling = false;
1607   bool do_versioning = false;
1608   bool stat;
1609   gimple *stmt;
1610   stmt_vec_info stmt_info;
1611   unsigned int npeel = 0;
1612   bool one_misalignment_known = false;
1613   bool one_misalignment_unknown = false;
1614   bool one_dr_unsupportable = false;
1615   struct data_reference *unsupportable_dr = NULL;
1616   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1617   unsigned possible_npeel_number = 1;
1618   tree vectype;
1619   unsigned int mis, same_align_drs_max = 0;
1620   hash_table<peel_info_hasher> peeling_htab (1);
1621
1622   if (dump_enabled_p ())
1623     dump_printf_loc (MSG_NOTE, vect_location,
1624                      "=== vect_enhance_data_refs_alignment ===\n");
1625
1626   /* Reset data so we can safely be called multiple times.  */
1627   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1628   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1629
1630   /* While cost model enhancements are expected in the future, the high level
1631      view of the code at this time is as follows:
1632
1633      A) If there is a misaligned access then see if peeling to align
1634         this access can make all data references satisfy
1635         vect_supportable_dr_alignment.  If so, update data structures
1636         as needed and return true.
1637
1638      B) If peeling wasn't possible and there is a data reference with an
1639         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1640         then see if loop versioning checks can be used to make all data
1641         references satisfy vect_supportable_dr_alignment.  If so, update
1642         data structures as needed and return true.
1643
1644      C) If neither peeling nor versioning were successful then return false if
1645         any data reference does not satisfy vect_supportable_dr_alignment.
1646
1647      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1648
1649      Note, Possibility 3 above (which is peeling and versioning together) is not
1650      being done at this time.  */
1651
1652   /* (1) Peeling to force alignment.  */
1653
1654   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1655      Considerations:
1656      + How many accesses will become aligned due to the peeling
1657      - How many accesses will become unaligned due to the peeling,
1658        and the cost of misaligned accesses.
1659      - The cost of peeling (the extra runtime checks, the increase
1660        in code size).  */
1661
1662   FOR_EACH_VEC_ELT (datarefs, i, dr)
1663     {
1664       stmt = DR_STMT (dr);
1665       stmt_info = vinfo_for_stmt (stmt);
1666
1667       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1668         continue;
1669
1670       /* For interleaving, only the alignment of the first access
1671          matters.  */
1672       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1673           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1674         continue;
1675
1676       /* For invariant accesses there is nothing to enhance.  */
1677       if (integer_zerop (DR_STEP (dr)))
1678         continue;
1679
1680       /* Strided accesses perform only component accesses, alignment is
1681          irrelevant for them.  */
1682       if (STMT_VINFO_STRIDED_P (stmt_info)
1683           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1684         continue;
1685
1686       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1687       do_peeling = vector_alignment_reachable_p (dr);
1688       if (do_peeling)
1689         {
1690           if (known_alignment_for_access_p (dr))
1691             {
1692               unsigned int npeel_tmp = 0;
1693               bool negative = tree_int_cst_compare (DR_STEP (dr),
1694                                                     size_zero_node) < 0;
1695
1696               vectype = STMT_VINFO_VECTYPE (stmt_info);
1697               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1698               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1699               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1700               if (DR_MISALIGNMENT (dr) != 0)
1701                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1702
1703               /* For multiple types, it is possible that the bigger type access
1704                  will have more than one peeling option.  E.g., a loop with two
1705                  types: one of size (vector size / 4), and the other one of
1706                  size (vector size / 8).  Vectorization factor will 8.  If both
1707                  accesses are misaligned by 3, the first one needs one scalar
1708                  iteration to be aligned, and the second one needs 5.  But the
1709                  first one will be aligned also by peeling 5 scalar
1710                  iterations, and in that case both accesses will be aligned.
1711                  Hence, except for the immediate peeling amount, we also want
1712                  to try to add full vector size, while we don't exceed
1713                  vectorization factor.
1714                  We do this automatically for cost model, since we calculate
1715                  cost for every peeling option.  */
1716               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1717                 {
1718                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1719                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1720                   possible_npeel_number
1721                     = vect_get_num_vectors (nscalars, vectype);
1722
1723                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1724                      allow peeling NELEMENTS.  */
1725                   if (DR_MISALIGNMENT (dr) == 0)
1726                     possible_npeel_number++;
1727                 }
1728
1729               /* Save info about DR in the hash table.  Also include peeling
1730                  amounts according to the explanation above.  */
1731               for (j = 0; j < possible_npeel_number; j++)
1732                 {
1733                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1734                                             dr, npeel_tmp);
1735                   npeel_tmp += target_align / dr_size;
1736                 }
1737
1738               one_misalignment_known = true;
1739             }
1740           else
1741             {
1742               /* If we don't know any misalignment values, we prefer
1743                  peeling for data-ref that has the maximum number of data-refs
1744                  with the same alignment, unless the target prefers to align
1745                  stores over load.  */
1746               unsigned same_align_drs
1747                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1748               if (!dr0
1749                   || same_align_drs_max < same_align_drs)
1750                 {
1751                   same_align_drs_max = same_align_drs;
1752                   dr0 = dr;
1753                 }
1754               /* For data-refs with the same number of related
1755                  accesses prefer the one where the misalign
1756                  computation will be invariant in the outermost loop.  */
1757               else if (same_align_drs_max == same_align_drs)
1758                 {
1759                   struct loop *ivloop0, *ivloop;
1760                   ivloop0 = outermost_invariant_loop_for_expr
1761                     (loop, DR_BASE_ADDRESS (dr0));
1762                   ivloop = outermost_invariant_loop_for_expr
1763                     (loop, DR_BASE_ADDRESS (dr));
1764                   if ((ivloop && !ivloop0)
1765                       || (ivloop && ivloop0
1766                           && flow_loop_nested_p (ivloop, ivloop0)))
1767                     dr0 = dr;
1768                 }
1769
1770               one_misalignment_unknown = true;
1771
1772               /* Check for data refs with unsupportable alignment that
1773                  can be peeled.  */
1774               if (!supportable_dr_alignment)
1775               {
1776                 one_dr_unsupportable = true;
1777                 unsupportable_dr = dr;
1778               }
1779
1780               if (!first_store && DR_IS_WRITE (dr))
1781                 first_store = dr;
1782             }
1783         }
1784       else
1785         {
1786           if (!aligned_access_p (dr))
1787             {
1788               if (dump_enabled_p ())
1789                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790                                  "vector alignment may not be reachable\n");
1791               break;
1792             }
1793         }
1794     }
1795
1796   /* Check if we can possibly peel the loop.  */
1797   if (!vect_can_advance_ivs_p (loop_vinfo)
1798       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1799       || loop->inner)
1800     do_peeling = false;
1801
1802   struct _vect_peel_extended_info peel_for_known_alignment;
1803   struct _vect_peel_extended_info peel_for_unknown_alignment;
1804   struct _vect_peel_extended_info best_peel;
1805
1806   peel_for_unknown_alignment.inside_cost = INT_MAX;
1807   peel_for_unknown_alignment.outside_cost = INT_MAX;
1808   peel_for_unknown_alignment.peel_info.count = 0;
1809
1810   if (do_peeling
1811       && one_misalignment_unknown)
1812     {
1813       /* Check if the target requires to prefer stores over loads, i.e., if
1814          misaligned stores are more expensive than misaligned loads (taking
1815          drs with same alignment into account).  */
1816       unsigned int load_inside_cost = 0;
1817       unsigned int load_outside_cost = 0;
1818       unsigned int store_inside_cost = 0;
1819       unsigned int store_outside_cost = 0;
1820       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1821
1822       stmt_vector_for_cost dummy;
1823       dummy.create (2);
1824       vect_get_peeling_costs_all_drs (datarefs, dr0,
1825                                       &load_inside_cost,
1826                                       &load_outside_cost,
1827                                       &dummy, estimated_npeels, true);
1828       dummy.release ();
1829
1830       if (first_store)
1831         {
1832           dummy.create (2);
1833           vect_get_peeling_costs_all_drs (datarefs, first_store,
1834                                           &store_inside_cost,
1835                                           &store_outside_cost,
1836                                           &dummy, estimated_npeels, true);
1837           dummy.release ();
1838         }
1839       else
1840         {
1841           store_inside_cost = INT_MAX;
1842           store_outside_cost = INT_MAX;
1843         }
1844
1845       if (load_inside_cost > store_inside_cost
1846           || (load_inside_cost == store_inside_cost
1847               && load_outside_cost > store_outside_cost))
1848         {
1849           dr0 = first_store;
1850           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1851           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1852         }
1853       else
1854         {
1855           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1856           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1857         }
1858
1859       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1860       prologue_cost_vec.create (2);
1861       epilogue_cost_vec.create (2);
1862
1863       int dummy2;
1864       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1865         (loop_vinfo, estimated_npeels, &dummy2,
1866          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1867          &prologue_cost_vec, &epilogue_cost_vec);
1868
1869       prologue_cost_vec.release ();
1870       epilogue_cost_vec.release ();
1871
1872       peel_for_unknown_alignment.peel_info.count = 1
1873         + STMT_VINFO_SAME_ALIGN_REFS
1874         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1875     }
1876
1877   peel_for_unknown_alignment.peel_info.npeel = 0;
1878   peel_for_unknown_alignment.peel_info.dr = dr0;
1879
1880   best_peel = peel_for_unknown_alignment;
1881
1882   peel_for_known_alignment.inside_cost = INT_MAX;
1883   peel_for_known_alignment.outside_cost = INT_MAX;
1884   peel_for_known_alignment.peel_info.count = 0;
1885   peel_for_known_alignment.peel_info.dr = NULL;
1886
1887   if (do_peeling && one_misalignment_known)
1888     {
1889       /* Peeling is possible, but there is no data access that is not supported
1890          unless aligned.  So we try to choose the best possible peeling from
1891          the hash table.  */
1892       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1893         (&peeling_htab, loop_vinfo);
1894     }
1895
1896   /* Compare costs of peeling for known and unknown alignment. */
1897   if (peel_for_known_alignment.peel_info.dr != NULL
1898       && peel_for_unknown_alignment.inside_cost
1899       >= peel_for_known_alignment.inside_cost)
1900     {
1901       best_peel = peel_for_known_alignment;
1902
1903       /* If the best peeling for known alignment has NPEEL == 0, perform no
1904          peeling at all except if there is an unsupportable dr that we can
1905          align.  */
1906       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1907         do_peeling = false;
1908     }
1909
1910   /* If there is an unsupportable data ref, prefer this over all choices so far
1911      since we'd have to discard a chosen peeling except when it accidentally
1912      aligned the unsupportable data ref.  */
1913   if (one_dr_unsupportable)
1914     dr0 = unsupportable_dr;
1915   else if (do_peeling)
1916     {
1917       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1918          TODO: Use nopeel_outside_cost or get rid of it?  */
1919       unsigned nopeel_inside_cost = 0;
1920       unsigned nopeel_outside_cost = 0;
1921
1922       stmt_vector_for_cost dummy;
1923       dummy.create (2);
1924       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1925                                       &nopeel_outside_cost, &dummy, 0, false);
1926       dummy.release ();
1927
1928       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1929          costs will be recorded.  */
1930       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1931       prologue_cost_vec.create (2);
1932       epilogue_cost_vec.create (2);
1933
1934       int dummy2;
1935       nopeel_outside_cost += vect_get_known_peeling_cost
1936         (loop_vinfo, 0, &dummy2,
1937          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1938          &prologue_cost_vec, &epilogue_cost_vec);
1939
1940       prologue_cost_vec.release ();
1941       epilogue_cost_vec.release ();
1942
1943       npeel = best_peel.peel_info.npeel;
1944       dr0 = best_peel.peel_info.dr;
1945
1946       /* If no peeling is not more expensive than the best peeling we
1947          have so far, don't perform any peeling.  */
1948       if (nopeel_inside_cost <= best_peel.inside_cost)
1949         do_peeling = false;
1950     }
1951
1952   if (do_peeling)
1953     {
1954       stmt = DR_STMT (dr0);
1955       stmt_info = vinfo_for_stmt (stmt);
1956       vectype = STMT_VINFO_VECTYPE (stmt_info);
1957
1958       if (known_alignment_for_access_p (dr0))
1959         {
1960           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1961                                                 size_zero_node) < 0;
1962           if (!npeel)
1963             {
1964               /* Since it's known at compile time, compute the number of
1965                  iterations in the peeled loop (the peeling factor) for use in
1966                  updating DR_MISALIGNMENT values.  The peeling factor is the
1967                  vectorization factor minus the misalignment as an element
1968                  count.  */
1969               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1970               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1971               npeel = ((mis & (target_align - 1))
1972                        / vect_get_scalar_dr_size (dr0));
1973             }
1974
1975           /* For interleaved data access every iteration accesses all the
1976              members of the group, therefore we divide the number of iterations
1977              by the group size.  */
1978           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1979           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1980             npeel /= GROUP_SIZE (stmt_info);
1981
1982           if (dump_enabled_p ())
1983             dump_printf_loc (MSG_NOTE, vect_location,
1984                              "Try peeling by %d\n", npeel);
1985         }
1986
1987       /* Ensure that all datarefs can be vectorized after the peel.  */
1988       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1989         do_peeling = false;
1990
1991       /* Check if all datarefs are supportable and log.  */
1992       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1993         {
1994           stat = vect_verify_datarefs_alignment (loop_vinfo);
1995           if (!stat)
1996             do_peeling = false;
1997           else
1998             return stat;
1999         }
2000
2001       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2002       if (do_peeling)
2003         {
2004           unsigned max_allowed_peel
2005             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2006           if (max_allowed_peel != (unsigned)-1)
2007             {
2008               unsigned max_peel = npeel;
2009               if (max_peel == 0)
2010                 {
2011                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2012                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2013                 }
2014               if (max_peel > max_allowed_peel)
2015                 {
2016                   do_peeling = false;
2017                   if (dump_enabled_p ())
2018                     dump_printf_loc (MSG_NOTE, vect_location,
2019                         "Disable peeling, max peels reached: %d\n", max_peel);
2020                 }
2021             }
2022         }
2023
2024       /* Cost model #2 - if peeling may result in a remaining loop not
2025          iterating enough to be vectorized then do not peel.  Since this
2026          is a cost heuristic rather than a correctness decision, use the
2027          most likely runtime value for variable vectorization factors.  */
2028       if (do_peeling
2029           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2030         {
2031           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2032           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2033           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2034               < assumed_vf + max_peel)
2035             do_peeling = false;
2036         }
2037
2038       if (do_peeling)
2039         {
2040           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2041              If the misalignment of DR_i is identical to that of dr0 then set
2042              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2043              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2044              by the peeling factor times the element size of DR_i (MOD the
2045              vectorization factor times the size).  Otherwise, the
2046              misalignment of DR_i must be set to unknown.  */
2047           FOR_EACH_VEC_ELT (datarefs, i, dr)
2048             if (dr != dr0)
2049               {
2050                 /* Strided accesses perform only component accesses, alignment
2051                    is irrelevant for them.  */
2052                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2053                 if (STMT_VINFO_STRIDED_P (stmt_info)
2054                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2055                   continue;
2056
2057                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2058               }
2059
2060           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2061           if (npeel)
2062             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2063           else
2064             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2065               = DR_MISALIGNMENT (dr0);
2066           SET_DR_MISALIGNMENT (dr0, 0);
2067           if (dump_enabled_p ())
2068             {
2069               dump_printf_loc (MSG_NOTE, vect_location,
2070                                "Alignment of access forced using peeling.\n");
2071               dump_printf_loc (MSG_NOTE, vect_location,
2072                                "Peeling for alignment will be applied.\n");
2073             }
2074
2075           /* The inside-loop cost will be accounted for in vectorizable_load
2076              and vectorizable_store correctly with adjusted alignments.
2077              Drop the body_cst_vec on the floor here.  */
2078           stat = vect_verify_datarefs_alignment (loop_vinfo);
2079           gcc_assert (stat);
2080           return stat;
2081         }
2082     }
2083
2084   /* (2) Versioning to force alignment.  */
2085
2086   /* Try versioning if:
2087      1) optimize loop for speed
2088      2) there is at least one unsupported misaligned data ref with an unknown
2089         misalignment, and
2090      3) all misaligned data refs with a known misalignment are supported, and
2091      4) the number of runtime alignment checks is within reason.  */
2092
2093   do_versioning =
2094         optimize_loop_nest_for_speed_p (loop)
2095         && (!loop->inner); /* FORNOW */
2096
2097   if (do_versioning)
2098     {
2099       FOR_EACH_VEC_ELT (datarefs, i, dr)
2100         {
2101           stmt = DR_STMT (dr);
2102           stmt_info = vinfo_for_stmt (stmt);
2103
2104           /* For interleaving, only the alignment of the first access
2105              matters.  */
2106           if (aligned_access_p (dr)
2107               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2108                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2109             continue;
2110
2111           if (STMT_VINFO_STRIDED_P (stmt_info))
2112             {
2113               /* Strided loads perform only component accesses, alignment is
2114                  irrelevant for them.  */
2115               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2116                 continue;
2117               do_versioning = false;
2118               break;
2119             }
2120
2121           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2122
2123           if (!supportable_dr_alignment)
2124             {
2125               gimple *stmt;
2126               int mask;
2127               tree vectype;
2128
2129               if (known_alignment_for_access_p (dr)
2130                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2131                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2132                 {
2133                   do_versioning = false;
2134                   break;
2135                 }
2136
2137               stmt = DR_STMT (dr);
2138               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2139               gcc_assert (vectype);
2140
2141               /* The rightmost bits of an aligned address must be zeros.
2142                  Construct the mask needed for this test.  For example,
2143                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2144                  mask must be 15 = 0xf. */
2145               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2146
2147               /* FORNOW: use the same mask to test all potentially unaligned
2148                  references in the loop.  The vectorizer currently supports
2149                  a single vector size, see the reference to
2150                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2151                  vectorization factor is computed.  */
2152               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2153                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2154               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2155               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2156                       DR_STMT (dr));
2157             }
2158         }
2159
2160       /* Versioning requires at least one misaligned data reference.  */
2161       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2162         do_versioning = false;
2163       else if (!do_versioning)
2164         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2165     }
2166
2167   if (do_versioning)
2168     {
2169       vec<gimple *> may_misalign_stmts
2170         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2171       gimple *stmt;
2172
2173       /* It can now be assumed that the data references in the statements
2174          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2175          of the loop being vectorized.  */
2176       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2177         {
2178           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2179           dr = STMT_VINFO_DATA_REF (stmt_info);
2180           SET_DR_MISALIGNMENT (dr, 0);
2181           if (dump_enabled_p ())
2182             dump_printf_loc (MSG_NOTE, vect_location,
2183                              "Alignment of access forced using versioning.\n");
2184         }
2185
2186       if (dump_enabled_p ())
2187         dump_printf_loc (MSG_NOTE, vect_location,
2188                          "Versioning for alignment will be applied.\n");
2189
2190       /* Peeling and versioning can't be done together at this time.  */
2191       gcc_assert (! (do_peeling && do_versioning));
2192
2193       stat = vect_verify_datarefs_alignment (loop_vinfo);
2194       gcc_assert (stat);
2195       return stat;
2196     }
2197
2198   /* This point is reached if neither peeling nor versioning is being done.  */
2199   gcc_assert (! (do_peeling || do_versioning));
2200
2201   stat = vect_verify_datarefs_alignment (loop_vinfo);
2202   return stat;
2203 }
2204
2205
2206 /* Function vect_find_same_alignment_drs.
2207
2208    Update group and alignment relations according to the chosen
2209    vectorization factor.  */
2210
2211 static void
2212 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2213 {
2214   struct data_reference *dra = DDR_A (ddr);
2215   struct data_reference *drb = DDR_B (ddr);
2216   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2217   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2218
2219   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2220     return;
2221
2222   if (dra == drb)
2223     return;
2224
2225   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2226       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2227       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2228     return;
2229
2230   /* Two references with distance zero have the same alignment.  */
2231   offset_int diff = (wi::to_offset (DR_INIT (dra))
2232                      - wi::to_offset (DR_INIT (drb)));
2233   if (diff != 0)
2234     {
2235       /* Get the wider of the two alignments.  */
2236       unsigned int align_a = (vect_calculate_target_alignment (dra)
2237                               / BITS_PER_UNIT);
2238       unsigned int align_b = (vect_calculate_target_alignment (drb)
2239                               / BITS_PER_UNIT);
2240       unsigned int max_align = MAX (align_a, align_b);
2241
2242       /* Require the gap to be a multiple of the larger vector alignment.  */
2243       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2244         return;
2245     }
2246
2247   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2248   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2249   if (dump_enabled_p ())
2250     {
2251       dump_printf_loc (MSG_NOTE, vect_location,
2252                        "accesses have the same alignment: ");
2253       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2254       dump_printf (MSG_NOTE,  " and ");
2255       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2256       dump_printf (MSG_NOTE, "\n");
2257     }
2258 }
2259
2260
2261 /* Function vect_analyze_data_refs_alignment
2262
2263    Analyze the alignment of the data-references in the loop.
2264    Return FALSE if a data reference is found that cannot be vectorized.  */
2265
2266 bool
2267 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2268 {
2269   if (dump_enabled_p ())
2270     dump_printf_loc (MSG_NOTE, vect_location,
2271                      "=== vect_analyze_data_refs_alignment ===\n");
2272
2273   /* Mark groups of data references with same alignment using
2274      data dependence information.  */
2275   vec<ddr_p> ddrs = vinfo->ddrs;
2276   struct data_dependence_relation *ddr;
2277   unsigned int i;
2278
2279   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2280     vect_find_same_alignment_drs (ddr);
2281
2282   vec<data_reference_p> datarefs = vinfo->datarefs;
2283   struct data_reference *dr;
2284
2285   vect_record_base_alignments (vinfo);
2286   FOR_EACH_VEC_ELT (datarefs, i, dr)
2287     {
2288       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2289       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2290           && !vect_compute_data_ref_alignment (dr))
2291         {
2292           /* Strided accesses perform only component accesses, misalignment
2293              information is irrelevant for them.  */
2294           if (STMT_VINFO_STRIDED_P (stmt_info)
2295               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2296             continue;
2297
2298           if (dump_enabled_p ())
2299             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300                              "not vectorized: can't calculate alignment "
2301                              "for data ref.\n");
2302
2303           return false;
2304         }
2305     }
2306
2307   return true;
2308 }
2309
2310
2311 /* Analyze alignment of DRs of stmts in NODE.  */
2312
2313 static bool
2314 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2315 {
2316   /* We vectorize from the first scalar stmt in the node unless
2317      the node is permuted in which case we start from the first
2318      element in the group.  */
2319   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2320   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2321   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2322     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2323
2324   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2325   if (! vect_compute_data_ref_alignment (dr)
2326       /* For creating the data-ref pointer we need alignment of the
2327          first element anyway.  */
2328       || (dr != first_dr
2329           && ! vect_compute_data_ref_alignment (first_dr))
2330       || ! verify_data_ref_alignment (dr))
2331     {
2332       if (dump_enabled_p ())
2333         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334                          "not vectorized: bad data alignment in basic "
2335                          "block.\n");
2336       return false;
2337     }
2338
2339   return true;
2340 }
2341
2342 /* Function vect_slp_analyze_instance_alignment
2343
2344    Analyze the alignment of the data-references in the SLP instance.
2345    Return FALSE if a data reference is found that cannot be vectorized.  */
2346
2347 bool
2348 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2349 {
2350   if (dump_enabled_p ())
2351     dump_printf_loc (MSG_NOTE, vect_location,
2352                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2353
2354   slp_tree node;
2355   unsigned i;
2356   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2357     if (! vect_slp_analyze_and_verify_node_alignment (node))
2358       return false;
2359
2360   node = SLP_INSTANCE_TREE (instance);
2361   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2362       && ! vect_slp_analyze_and_verify_node_alignment
2363              (SLP_INSTANCE_TREE (instance)))
2364     return false;
2365
2366   return true;
2367 }
2368
2369
2370 /* Analyze groups of accesses: check that DR belongs to a group of
2371    accesses of legal size, step, etc.  Detect gaps, single element
2372    interleaving, and other special cases. Set grouped access info.
2373    Collect groups of strided stores for further use in SLP analysis.
2374    Worker for vect_analyze_group_access.  */
2375
2376 static bool
2377 vect_analyze_group_access_1 (struct data_reference *dr)
2378 {
2379   tree step = DR_STEP (dr);
2380   tree scalar_type = TREE_TYPE (DR_REF (dr));
2381   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2382   gimple *stmt = DR_STMT (dr);
2383   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2384   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2385   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2386   HOST_WIDE_INT dr_step = -1;
2387   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2388   bool slp_impossible = false;
2389
2390   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2391      size of the interleaving group (including gaps).  */
2392   if (tree_fits_shwi_p (step))
2393     {
2394       dr_step = tree_to_shwi (step);
2395       /* Check that STEP is a multiple of type size.  Otherwise there is
2396          a non-element-sized gap at the end of the group which we
2397          cannot represent in GROUP_GAP or GROUP_SIZE.
2398          ???  As we can handle non-constant step fine here we should
2399          simply remove uses of GROUP_GAP between the last and first
2400          element and instead rely on DR_STEP.  GROUP_SIZE then would
2401          simply not include that gap.  */
2402       if ((dr_step % type_size) != 0)
2403         {
2404           if (dump_enabled_p ())
2405             {
2406               dump_printf_loc (MSG_NOTE, vect_location,
2407                                "Step ");
2408               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2409               dump_printf (MSG_NOTE,
2410                            " is not a multiple of the element size for ");
2411               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2412               dump_printf (MSG_NOTE, "\n");
2413             }
2414           return false;
2415         }
2416       groupsize = absu_hwi (dr_step) / type_size;
2417     }
2418   else
2419     groupsize = 0;
2420
2421   /* Not consecutive access is possible only if it is a part of interleaving.  */
2422   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2423     {
2424       /* Check if it this DR is a part of interleaving, and is a single
2425          element of the group that is accessed in the loop.  */
2426
2427       /* Gaps are supported only for loads. STEP must be a multiple of the type
2428          size.  The size of the group must be a power of 2.  */
2429       if (DR_IS_READ (dr)
2430           && (dr_step % type_size) == 0
2431           && groupsize > 0
2432           && pow2p_hwi (groupsize))
2433         {
2434           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2435           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2436           GROUP_GAP (stmt_info) = groupsize - 1;
2437           if (dump_enabled_p ())
2438             {
2439               dump_printf_loc (MSG_NOTE, vect_location,
2440                                "Detected single element interleaving ");
2441               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2442               dump_printf (MSG_NOTE, " step ");
2443               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2444               dump_printf (MSG_NOTE, "\n");
2445             }
2446
2447           return true;
2448         }
2449
2450       if (dump_enabled_p ())
2451         {
2452           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453                            "not consecutive access ");
2454           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2455         }
2456
2457       if (bb_vinfo)
2458         {
2459           /* Mark the statement as unvectorizable.  */
2460           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2461           return true;
2462         }
2463
2464       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2465       STMT_VINFO_STRIDED_P (stmt_info) = true;
2466       return true;
2467     }
2468
2469   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2470     {
2471       /* First stmt in the interleaving chain. Check the chain.  */
2472       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2473       struct data_reference *data_ref = dr;
2474       unsigned int count = 1;
2475       tree prev_init = DR_INIT (data_ref);
2476       gimple *prev = stmt;
2477       HOST_WIDE_INT diff, gaps = 0;
2478
2479       while (next)
2480         {
2481           /* Skip same data-refs.  In case that two or more stmts share
2482              data-ref (supported only for loads), we vectorize only the first
2483              stmt, and the rest get their vectorized loads from the first
2484              one.  */
2485           if (!tree_int_cst_compare (DR_INIT (data_ref),
2486                                      DR_INIT (STMT_VINFO_DATA_REF (
2487                                                    vinfo_for_stmt (next)))))
2488             {
2489               if (DR_IS_WRITE (data_ref))
2490                 {
2491                   if (dump_enabled_p ())
2492                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493                                      "Two store stmts share the same dr.\n");
2494                   return false;
2495                 }
2496
2497               if (dump_enabled_p ())
2498                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499                                  "Two or more load stmts share the same dr.\n");
2500
2501               /* For load use the same data-ref load.  */
2502               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2503
2504               prev = next;
2505               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506               continue;
2507             }
2508
2509           prev = next;
2510           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2511
2512           /* All group members have the same STEP by construction.  */
2513           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2514
2515           /* Check that the distance between two accesses is equal to the type
2516              size. Otherwise, we have gaps.  */
2517           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2518                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2519           if (diff != 1)
2520             {
2521               /* FORNOW: SLP of accesses with gaps is not supported.  */
2522               slp_impossible = true;
2523               if (DR_IS_WRITE (data_ref))
2524                 {
2525                   if (dump_enabled_p ())
2526                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527                                      "interleaved store with gaps\n");
2528                   return false;
2529                 }
2530
2531               gaps += diff - 1;
2532             }
2533
2534           last_accessed_element += diff;
2535
2536           /* Store the gap from the previous member of the group. If there is no
2537              gap in the access, GROUP_GAP is always 1.  */
2538           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2539
2540           prev_init = DR_INIT (data_ref);
2541           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2542           /* Count the number of data-refs in the chain.  */
2543           count++;
2544         }
2545
2546       if (groupsize == 0)
2547         groupsize = count + gaps;
2548
2549       /* This could be UINT_MAX but as we are generating code in a very
2550          inefficient way we have to cap earlier.  See PR78699 for example.  */
2551       if (groupsize > 4096)
2552         {
2553           if (dump_enabled_p ())
2554             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555                              "group is too large\n");
2556           return false;
2557         }
2558
2559       /* Check that the size of the interleaving is equal to count for stores,
2560          i.e., that there are no gaps.  */
2561       if (groupsize != count
2562           && !DR_IS_READ (dr))
2563         {
2564           if (dump_enabled_p ())
2565             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2566                              "interleaved store with gaps\n");
2567           return false;
2568         }
2569
2570       /* If there is a gap after the last load in the group it is the
2571          difference between the groupsize and the last accessed
2572          element.
2573          When there is no gap, this difference should be 0.  */
2574       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2575
2576       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2577       if (dump_enabled_p ())
2578         {
2579           dump_printf_loc (MSG_NOTE, vect_location,
2580                            "Detected interleaving ");
2581           if (DR_IS_READ (dr))
2582             dump_printf (MSG_NOTE, "load ");
2583           else
2584             dump_printf (MSG_NOTE, "store ");
2585           dump_printf (MSG_NOTE, "of size %u starting with ",
2586                        (unsigned)groupsize);
2587           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2588           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2589             dump_printf_loc (MSG_NOTE, vect_location,
2590                              "There is a gap of %u elements after the group\n",
2591                              GROUP_GAP (vinfo_for_stmt (stmt)));
2592         }
2593
2594       /* SLP: create an SLP data structure for every interleaving group of
2595          stores for further analysis in vect_analyse_slp.  */
2596       if (DR_IS_WRITE (dr) && !slp_impossible)
2597         {
2598           if (loop_vinfo)
2599             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2600           if (bb_vinfo)
2601             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2602         }
2603     }
2604
2605   return true;
2606 }
2607
2608 /* Analyze groups of accesses: check that DR belongs to a group of
2609    accesses of legal size, step, etc.  Detect gaps, single element
2610    interleaving, and other special cases. Set grouped access info.
2611    Collect groups of strided stores for further use in SLP analysis.  */
2612
2613 static bool
2614 vect_analyze_group_access (struct data_reference *dr)
2615 {
2616   if (!vect_analyze_group_access_1 (dr))
2617     {
2618       /* Dissolve the group if present.  */
2619       gimple *next;
2620       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2621       while (stmt)
2622         {
2623           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2624           next = GROUP_NEXT_ELEMENT (vinfo);
2625           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2626           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2627           stmt = next;
2628         }
2629       return false;
2630     }
2631   return true;
2632 }
2633
2634 /* Analyze the access pattern of the data-reference DR.
2635    In case of non-consecutive accesses call vect_analyze_group_access() to
2636    analyze groups of accesses.  */
2637
2638 static bool
2639 vect_analyze_data_ref_access (struct data_reference *dr)
2640 {
2641   tree step = DR_STEP (dr);
2642   tree scalar_type = TREE_TYPE (DR_REF (dr));
2643   gimple *stmt = DR_STMT (dr);
2644   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2645   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2646   struct loop *loop = NULL;
2647
2648   if (loop_vinfo)
2649     loop = LOOP_VINFO_LOOP (loop_vinfo);
2650
2651   if (loop_vinfo && !step)
2652     {
2653       if (dump_enabled_p ())
2654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2655                          "bad data-ref access in loop\n");
2656       return false;
2657     }
2658
2659   /* Allow loads with zero step in inner-loop vectorization.  */
2660   if (loop_vinfo && integer_zerop (step))
2661     {
2662       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2663       if (!nested_in_vect_loop_p (loop, stmt))
2664         return DR_IS_READ (dr);
2665       /* Allow references with zero step for outer loops marked
2666          with pragma omp simd only - it guarantees absence of
2667          loop-carried dependencies between inner loop iterations.  */
2668       if (!loop->force_vectorize)
2669         {
2670           if (dump_enabled_p ())
2671             dump_printf_loc (MSG_NOTE, vect_location,
2672                              "zero step in inner loop of nest\n");
2673           return false;
2674         }
2675     }
2676
2677   if (loop && nested_in_vect_loop_p (loop, stmt))
2678     {
2679       /* Interleaved accesses are not yet supported within outer-loop
2680         vectorization for references in the inner-loop.  */
2681       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2682
2683       /* For the rest of the analysis we use the outer-loop step.  */
2684       step = STMT_VINFO_DR_STEP (stmt_info);
2685       if (integer_zerop (step))
2686         {
2687           if (dump_enabled_p ())
2688             dump_printf_loc (MSG_NOTE, vect_location,
2689                              "zero step in outer loop.\n");
2690           return DR_IS_READ (dr);
2691         }
2692     }
2693
2694   /* Consecutive?  */
2695   if (TREE_CODE (step) == INTEGER_CST)
2696     {
2697       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2698       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2699           || (dr_step < 0
2700               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2701         {
2702           /* Mark that it is not interleaving.  */
2703           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2704           return true;
2705         }
2706     }
2707
2708   if (loop && nested_in_vect_loop_p (loop, stmt))
2709     {
2710       if (dump_enabled_p ())
2711         dump_printf_loc (MSG_NOTE, vect_location,
2712                          "grouped access in outer loop.\n");
2713       return false;
2714     }
2715
2716
2717   /* Assume this is a DR handled by non-constant strided load case.  */
2718   if (TREE_CODE (step) != INTEGER_CST)
2719     return (STMT_VINFO_STRIDED_P (stmt_info)
2720             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2721                 || vect_analyze_group_access (dr)));
2722
2723   /* Not consecutive access - check if it's a part of interleaving group.  */
2724   return vect_analyze_group_access (dr);
2725 }
2726
2727 /* Compare two data-references DRA and DRB to group them into chunks
2728    suitable for grouping.  */
2729
2730 static int
2731 dr_group_sort_cmp (const void *dra_, const void *drb_)
2732 {
2733   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2734   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2735   int cmp;
2736
2737   /* Stabilize sort.  */
2738   if (dra == drb)
2739     return 0;
2740
2741   /* DRs in different loops never belong to the same group.  */
2742   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2743   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2744   if (loopa != loopb)
2745     return loopa->num < loopb->num ? -1 : 1;
2746
2747   /* Ordering of DRs according to base.  */
2748   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2749                                DR_BASE_ADDRESS (drb));
2750   if (cmp != 0)
2751     return cmp;
2752
2753   /* And according to DR_OFFSET.  */
2754   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2755   if (cmp != 0)
2756     return cmp;
2757
2758   /* Put reads before writes.  */
2759   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2760     return DR_IS_READ (dra) ? -1 : 1;
2761
2762   /* Then sort after access size.  */
2763   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2764                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2765   if (cmp != 0)
2766     return cmp;
2767
2768   /* And after step.  */
2769   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2770   if (cmp != 0)
2771     return cmp;
2772
2773   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2774   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2775   if (cmp == 0)
2776     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2777   return cmp;
2778 }
2779
2780 /* Function vect_analyze_data_ref_accesses.
2781
2782    Analyze the access pattern of all the data references in the loop.
2783
2784    FORNOW: the only access pattern that is considered vectorizable is a
2785            simple step 1 (consecutive) access.
2786
2787    FORNOW: handle only arrays and pointer accesses.  */
2788
2789 bool
2790 vect_analyze_data_ref_accesses (vec_info *vinfo)
2791 {
2792   unsigned int i;
2793   vec<data_reference_p> datarefs = vinfo->datarefs;
2794   struct data_reference *dr;
2795
2796   if (dump_enabled_p ())
2797     dump_printf_loc (MSG_NOTE, vect_location,
2798                      "=== vect_analyze_data_ref_accesses ===\n");
2799
2800   if (datarefs.is_empty ())
2801     return true;
2802
2803   /* Sort the array of datarefs to make building the interleaving chains
2804      linear.  Don't modify the original vector's order, it is needed for
2805      determining what dependencies are reversed.  */
2806   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2807   datarefs_copy.qsort (dr_group_sort_cmp);
2808
2809   /* Build the interleaving chains.  */
2810   for (i = 0; i < datarefs_copy.length () - 1;)
2811     {
2812       data_reference_p dra = datarefs_copy[i];
2813       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2814       stmt_vec_info lastinfo = NULL;
2815       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2816         {
2817           ++i;
2818           continue;
2819         }
2820       for (i = i + 1; i < datarefs_copy.length (); ++i)
2821         {
2822           data_reference_p drb = datarefs_copy[i];
2823           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2824           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2825             break;
2826
2827           /* ???  Imperfect sorting (non-compatible types, non-modulo
2828              accesses, same accesses) can lead to a group to be artificially
2829              split here as we don't just skip over those.  If it really
2830              matters we can push those to a worklist and re-iterate
2831              over them.  The we can just skip ahead to the next DR here.  */
2832
2833           /* DRs in a different loop should not be put into the same
2834              interleaving group.  */
2835           if (gimple_bb (DR_STMT (dra))->loop_father
2836               != gimple_bb (DR_STMT (drb))->loop_father)
2837             break;
2838
2839           /* Check that the data-refs have same first location (except init)
2840              and they are both either store or load (not load and store,
2841              not masked loads or stores).  */
2842           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2843               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844                                         DR_BASE_ADDRESS (drb)) != 0
2845               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2846               || !gimple_assign_single_p (DR_STMT (dra))
2847               || !gimple_assign_single_p (DR_STMT (drb)))
2848             break;
2849
2850           /* Check that the data-refs have the same constant size.  */
2851           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2852           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2853           if (!tree_fits_uhwi_p (sza)
2854               || !tree_fits_uhwi_p (szb)
2855               || !tree_int_cst_equal (sza, szb))
2856             break;
2857
2858           /* Check that the data-refs have the same step.  */
2859           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2860             break;
2861
2862           /* Check the types are compatible.
2863              ???  We don't distinguish this during sorting.  */
2864           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2865                                    TREE_TYPE (DR_REF (drb))))
2866             break;
2867
2868           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2869           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2870           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2871           HOST_WIDE_INT init_prev
2872             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2873           gcc_assert (init_a <= init_b
2874                       && init_a <= init_prev
2875                       && init_prev <= init_b);
2876
2877           /* Do not place the same access in the interleaving chain twice.  */
2878           if (init_b == init_prev)
2879             {
2880               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2881                           < gimple_uid (DR_STMT (drb)));
2882               /* ???  For now we simply "drop" the later reference which is
2883                  otherwise the same rather than finishing off this group.
2884                  In the end we'd want to re-process duplicates forming
2885                  multiple groups from the refs, likely by just collecting
2886                  all candidates (including duplicates and split points
2887                  below) in a vector and then process them together.  */
2888               continue;
2889             }
2890
2891           /* If init_b == init_a + the size of the type * k, we have an
2892              interleaving, and DRA is accessed before DRB.  */
2893           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2894           if (type_size_a == 0
2895               || (init_b - init_a) % type_size_a != 0)
2896             break;
2897
2898           /* If we have a store, the accesses are adjacent.  This splits
2899              groups into chunks we support (we don't support vectorization
2900              of stores with gaps).  */
2901           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2902             break;
2903
2904           /* If the step (if not zero or non-constant) is greater than the
2905              difference between data-refs' inits this splits groups into
2906              suitable sizes.  */
2907           if (tree_fits_shwi_p (DR_STEP (dra)))
2908             {
2909               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2910               if (step != 0 && step <= (init_b - init_a))
2911                 break;
2912             }
2913
2914           if (dump_enabled_p ())
2915             {
2916               dump_printf_loc (MSG_NOTE, vect_location,
2917                                "Detected interleaving ");
2918               if (DR_IS_READ (dra))
2919                 dump_printf (MSG_NOTE, "load ");
2920               else
2921                 dump_printf (MSG_NOTE, "store ");
2922               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2923               dump_printf (MSG_NOTE,  " and ");
2924               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2925               dump_printf (MSG_NOTE, "\n");
2926             }
2927
2928           /* Link the found element into the group list.  */
2929           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2930             {
2931               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2932               lastinfo = stmtinfo_a;
2933             }
2934           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2935           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2936           lastinfo = stmtinfo_b;
2937         }
2938     }
2939
2940   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2941     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2942         && !vect_analyze_data_ref_access (dr))
2943       {
2944         if (dump_enabled_p ())
2945           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946                            "not vectorized: complicated access pattern.\n");
2947
2948         if (is_a <bb_vec_info> (vinfo))
2949           {
2950             /* Mark the statement as not vectorizable.  */
2951             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2952             continue;
2953           }
2954         else
2955           {
2956             datarefs_copy.release ();
2957             return false;
2958           }
2959       }
2960
2961   datarefs_copy.release ();
2962   return true;
2963 }
2964
2965 /* Function vect_vfa_segment_size.
2966
2967    Create an expression that computes the size of segment
2968    that will be accessed for a data reference.  The functions takes into
2969    account that realignment loads may access one more vector.
2970
2971    Input:
2972      DR: The data reference.
2973      LENGTH_FACTOR: segment length to consider.
2974
2975    Return an expression whose value is the size of segment which will be
2976    accessed by DR.  */
2977
2978 static tree
2979 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2980 {
2981   tree segment_length;
2982
2983   if (integer_zerop (DR_STEP (dr)))
2984     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2985   else
2986     segment_length = size_binop (MULT_EXPR,
2987                                  fold_convert (sizetype, DR_STEP (dr)),
2988                                  fold_convert (sizetype, length_factor));
2989
2990   if (vect_supportable_dr_alignment (dr, false)
2991         == dr_explicit_realign_optimized)
2992     {
2993       tree vector_size = TYPE_SIZE_UNIT
2994                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2995
2996       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2997     }
2998   return segment_length;
2999 }
3000
3001 /* Function vect_no_alias_p.
3002
3003    Given data references A and B with equal base and offset, the alias
3004    relation can be decided at compilation time, return TRUE if they do
3005    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
3006    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3007    respectively.  */
3008
3009 static bool
3010 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3011                  tree segment_length_a, tree segment_length_b)
3012 {
3013   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3014               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3015   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3016     return false;
3017
3018   tree seg_a_min = DR_INIT (a);
3019   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3020                                 seg_a_min, segment_length_a);
3021   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3022      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3023      [a, a+12) */
3024   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3025     {
3026       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3027       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3028                                seg_a_max, unit_size);
3029       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3030                                DR_INIT (a), unit_size);
3031     }
3032   tree seg_b_min = DR_INIT (b);
3033   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3034                                 seg_b_min, segment_length_b);
3035   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3036     {
3037       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3038       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3039                                seg_b_max, unit_size);
3040       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3041                                DR_INIT (b), unit_size);
3042     }
3043
3044   if (tree_int_cst_le (seg_a_max, seg_b_min)
3045       || tree_int_cst_le (seg_b_max, seg_a_min))
3046     return true;
3047
3048   return false;
3049 }
3050
3051 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3052    in DDR is >= VF.  */
3053
3054 static bool
3055 dependence_distance_ge_vf (data_dependence_relation *ddr,
3056                            unsigned int loop_depth, poly_uint64 vf)
3057 {
3058   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3059       || DDR_NUM_DIST_VECTS (ddr) == 0)
3060     return false;
3061
3062   /* If the dependence is exact, we should have limited the VF instead.  */
3063   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3064
3065   unsigned int i;
3066   lambda_vector dist_v;
3067   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3068     {
3069       HOST_WIDE_INT dist = dist_v[loop_depth];
3070       if (dist != 0
3071           && !(dist > 0 && DDR_REVERSED_P (ddr))
3072           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3073         return false;
3074     }
3075
3076   if (dump_enabled_p ())
3077     {
3078       dump_printf_loc (MSG_NOTE, vect_location,
3079                        "dependence distance between ");
3080       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3081       dump_printf (MSG_NOTE,  " and ");
3082       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3083       dump_printf (MSG_NOTE,  " is >= VF\n");
3084     }
3085
3086   return true;
3087 }
3088
3089 /* Function vect_prune_runtime_alias_test_list.
3090
3091    Prune a list of ddrs to be tested at run-time by versioning for alias.
3092    Merge several alias checks into one if possible.
3093    Return FALSE if resulting list of ddrs is longer then allowed by
3094    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3095
3096 bool
3097 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3098 {
3099   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3100   hash_set <tree_pair_hash> compared_objects;
3101
3102   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3103   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3104     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3105   vec<vec_object_pair> &check_unequal_addrs
3106     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3107   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3108   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3109
3110   ddr_p ddr;
3111   unsigned int i;
3112   tree length_factor;
3113
3114   if (dump_enabled_p ())
3115     dump_printf_loc (MSG_NOTE, vect_location,
3116                      "=== vect_prune_runtime_alias_test_list ===\n");
3117
3118   if (may_alias_ddrs.is_empty ())
3119     return true;
3120
3121   comp_alias_ddrs.create (may_alias_ddrs.length ());
3122
3123   unsigned int loop_depth
3124     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3125                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3126
3127   /* First, we collect all data ref pairs for aliasing checks.  */
3128   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3129     {
3130       int comp_res;
3131       struct data_reference *dr_a, *dr_b;
3132       gimple *dr_group_first_a, *dr_group_first_b;
3133       tree segment_length_a, segment_length_b;
3134       gimple *stmt_a, *stmt_b;
3135
3136       /* Ignore the alias if the VF we chose ended up being no greater
3137          than the dependence distance.  */
3138       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3139         continue;
3140
3141       if (DDR_OBJECT_A (ddr))
3142         {
3143           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3144           if (!compared_objects.add (new_pair))
3145             {
3146               if (dump_enabled_p ())
3147                 {
3148                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3149                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3150                   dump_printf (MSG_NOTE, " and ");
3151                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3152                   dump_printf (MSG_NOTE, " have different addresses\n");
3153                 }
3154               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3155             }
3156           continue;
3157         }
3158
3159       dr_a = DDR_A (ddr);
3160       stmt_a = DR_STMT (DDR_A (ddr));
3161       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3162       if (dr_group_first_a)
3163         {
3164           stmt_a = dr_group_first_a;
3165           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3166         }
3167
3168       dr_b = DDR_B (ddr);
3169       stmt_b = DR_STMT (DDR_B (ddr));
3170       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3171       if (dr_group_first_b)
3172         {
3173           stmt_b = dr_group_first_b;
3174           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3175         }
3176
3177       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3178         length_factor = scalar_loop_iters;
3179       else
3180         length_factor = size_int (vect_factor);
3181       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3182       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3183
3184       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3185                                         DR_BASE_ADDRESS (dr_b));
3186       if (comp_res == 0)
3187         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3188                                           DR_OFFSET (dr_b));
3189
3190       /* Alias is known at compilation time.  */
3191       if (comp_res == 0
3192           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3193           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3194           && TREE_CODE (segment_length_a) == INTEGER_CST
3195           && TREE_CODE (segment_length_b) == INTEGER_CST)
3196         {
3197           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3198             continue;
3199
3200           if (dump_enabled_p ())
3201             dump_printf_loc (MSG_NOTE, vect_location,
3202                              "not vectorized: compilation time alias.\n");
3203
3204           return false;
3205         }
3206
3207       dr_with_seg_len_pair_t dr_with_seg_len_pair
3208           (dr_with_seg_len (dr_a, segment_length_a),
3209            dr_with_seg_len (dr_b, segment_length_b));
3210
3211       /* Canonicalize pairs by sorting the two DR members.  */
3212       if (comp_res > 0)
3213         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3214
3215       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3216     }
3217
3218   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3219
3220   unsigned int count = (comp_alias_ddrs.length ()
3221                         + check_unequal_addrs.length ());
3222   dump_printf_loc (MSG_NOTE, vect_location,
3223                    "improved number of alias checks from %d to %d\n",
3224                    may_alias_ddrs.length (), count);
3225   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3226     {
3227       if (dump_enabled_p ())
3228         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229                          "number of versioning for alias "
3230                          "run-time tests exceeds %d "
3231                          "(--param vect-max-version-for-alias-checks)\n",
3232                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3233       return false;
3234     }
3235
3236   return true;
3237 }
3238
3239 /* Return true if a non-affine read or write in STMT is suitable for a
3240    gather load or scatter store.  Describe the operation in *INFO if so.  */
3241
3242 bool
3243 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3244                            gather_scatter_info *info)
3245 {
3246   HOST_WIDE_INT scale = 1;
3247   poly_int64 pbitpos, pbitsize;
3248   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3249   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3250   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3251   tree offtype = NULL_TREE;
3252   tree decl, base, off;
3253   machine_mode pmode;
3254   int punsignedp, reversep, pvolatilep = 0;
3255
3256   base = DR_REF (dr);
3257   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3258      see if we can use the def stmt of the address.  */
3259   if (is_gimple_call (stmt)
3260       && gimple_call_internal_p (stmt)
3261       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3262           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3263       && TREE_CODE (base) == MEM_REF
3264       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3265       && integer_zerop (TREE_OPERAND (base, 1))
3266       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3267     {
3268       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3269       if (is_gimple_assign (def_stmt)
3270           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3271         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3272     }
3273
3274   /* The gather and scatter builtins need address of the form
3275      loop_invariant + vector * {1, 2, 4, 8}
3276      or
3277      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3278      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3279      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3280      multiplications and additions in it.  To get a vector, we need
3281      a single SSA_NAME that will be defined in the loop and will
3282      contain everything that is not loop invariant and that can be
3283      vectorized.  The following code attempts to find such a preexistng
3284      SSA_NAME OFF and put the loop invariants into a tree BASE
3285      that can be gimplified before the loop.  */
3286   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3287                               &punsignedp, &reversep, &pvolatilep);
3288   gcc_assert (base && !reversep);
3289   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3290
3291   if (TREE_CODE (base) == MEM_REF)
3292     {
3293       if (!integer_zerop (TREE_OPERAND (base, 1)))
3294         {
3295           if (off == NULL_TREE)
3296             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3297           else
3298             off = size_binop (PLUS_EXPR, off,
3299                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3300         }
3301       base = TREE_OPERAND (base, 0);
3302     }
3303   else
3304     base = build_fold_addr_expr (base);
3305
3306   if (off == NULL_TREE)
3307     off = size_zero_node;
3308
3309   /* If base is not loop invariant, either off is 0, then we start with just
3310      the constant offset in the loop invariant BASE and continue with base
3311      as OFF, otherwise give up.
3312      We could handle that case by gimplifying the addition of base + off
3313      into some SSA_NAME and use that as off, but for now punt.  */
3314   if (!expr_invariant_in_loop_p (loop, base))
3315     {
3316       if (!integer_zerop (off))
3317         return false;
3318       off = base;
3319       base = size_int (pbytepos);
3320     }
3321   /* Otherwise put base + constant offset into the loop invariant BASE
3322      and continue with OFF.  */
3323   else
3324     {
3325       base = fold_convert (sizetype, base);
3326       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3327     }
3328
3329   /* OFF at this point may be either a SSA_NAME or some tree expression
3330      from get_inner_reference.  Try to peel off loop invariants from it
3331      into BASE as long as possible.  */
3332   STRIP_NOPS (off);
3333   while (offtype == NULL_TREE)
3334     {
3335       enum tree_code code;
3336       tree op0, op1, add = NULL_TREE;
3337
3338       if (TREE_CODE (off) == SSA_NAME)
3339         {
3340           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3341
3342           if (expr_invariant_in_loop_p (loop, off))
3343             return false;
3344
3345           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3346             break;
3347
3348           op0 = gimple_assign_rhs1 (def_stmt);
3349           code = gimple_assign_rhs_code (def_stmt);
3350           op1 = gimple_assign_rhs2 (def_stmt);
3351         }
3352       else
3353         {
3354           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3355             return false;
3356           code = TREE_CODE (off);
3357           extract_ops_from_tree (off, &code, &op0, &op1);
3358         }
3359       switch (code)
3360         {
3361         case POINTER_PLUS_EXPR:
3362         case PLUS_EXPR:
3363           if (expr_invariant_in_loop_p (loop, op0))
3364             {
3365               add = op0;
3366               off = op1;
3367             do_add:
3368               add = fold_convert (sizetype, add);
3369               if (scale != 1)
3370                 add = size_binop (MULT_EXPR, add, size_int (scale));
3371               base = size_binop (PLUS_EXPR, base, add);
3372               continue;
3373             }
3374           if (expr_invariant_in_loop_p (loop, op1))
3375             {
3376               add = op1;
3377               off = op0;
3378               goto do_add;
3379             }
3380           break;
3381         case MINUS_EXPR:
3382           if (expr_invariant_in_loop_p (loop, op1))
3383             {
3384               add = fold_convert (sizetype, op1);
3385               add = size_binop (MINUS_EXPR, size_zero_node, add);
3386               off = op0;
3387               goto do_add;
3388             }
3389           break;
3390         case MULT_EXPR:
3391           if (scale == 1 && tree_fits_shwi_p (op1))
3392             {
3393               scale = tree_to_shwi (op1);
3394               off = op0;
3395               continue;
3396             }
3397           break;
3398         case SSA_NAME:
3399           off = op0;
3400           continue;
3401         CASE_CONVERT:
3402           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3403               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3404             break;
3405           if (TYPE_PRECISION (TREE_TYPE (op0))
3406               == TYPE_PRECISION (TREE_TYPE (off)))
3407             {
3408               off = op0;
3409               continue;
3410             }
3411           if (TYPE_PRECISION (TREE_TYPE (op0))
3412               < TYPE_PRECISION (TREE_TYPE (off)))
3413             {
3414               off = op0;
3415               offtype = TREE_TYPE (off);
3416               STRIP_NOPS (off);
3417               continue;
3418             }
3419           break;
3420         default:
3421           break;
3422         }
3423       break;
3424     }
3425
3426   /* If at the end OFF still isn't a SSA_NAME or isn't
3427      defined in the loop, punt.  */
3428   if (TREE_CODE (off) != SSA_NAME
3429       || expr_invariant_in_loop_p (loop, off))
3430     return false;
3431
3432   if (offtype == NULL_TREE)
3433     offtype = TREE_TYPE (off);
3434
3435   if (DR_IS_READ (dr))
3436     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3437                                              offtype, scale);
3438   else
3439     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3440                                               offtype, scale);
3441
3442   if (decl == NULL_TREE)
3443     return false;
3444
3445   info->decl = decl;
3446   info->base = base;
3447   info->offset = off;
3448   info->offset_dt = vect_unknown_def_type;
3449   info->offset_vectype = NULL_TREE;
3450   info->scale = scale;
3451   return true;
3452 }
3453
3454 /* Function vect_analyze_data_refs.
3455
3456   Find all the data references in the loop or basic block.
3457
3458    The general structure of the analysis of data refs in the vectorizer is as
3459    follows:
3460    1- vect_analyze_data_refs(loop/bb): call
3461       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3462       in the loop/bb and their dependences.
3463    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3464    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3465    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3466
3467 */
3468
3469 bool
3470 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3471 {
3472   struct loop *loop = NULL;
3473   unsigned int i;
3474   struct data_reference *dr;
3475   tree scalar_type;
3476
3477   if (dump_enabled_p ())
3478     dump_printf_loc (MSG_NOTE, vect_location,
3479                      "=== vect_analyze_data_refs ===\n");
3480
3481   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3482     loop = LOOP_VINFO_LOOP (loop_vinfo);
3483
3484   /* Go through the data-refs, check that the analysis succeeded.  Update
3485      pointer from stmt_vec_info struct to DR and vectype.  */
3486
3487   vec<data_reference_p> datarefs = vinfo->datarefs;
3488   FOR_EACH_VEC_ELT (datarefs, i, dr)
3489     {
3490       gimple *stmt;
3491       stmt_vec_info stmt_info;
3492       tree base, offset, init;
3493       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3494       bool simd_lane_access = false;
3495       poly_uint64 vf;
3496
3497 again:
3498       if (!dr || !DR_REF (dr))
3499         {
3500           if (dump_enabled_p ())
3501             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3502                              "not vectorized: unhandled data-ref\n");
3503           return false;
3504         }
3505
3506       stmt = DR_STMT (dr);
3507       stmt_info = vinfo_for_stmt (stmt);
3508
3509       /* Discard clobbers from the dataref vector.  We will remove
3510          clobber stmts during vectorization.  */
3511       if (gimple_clobber_p (stmt))
3512         {
3513           free_data_ref (dr);
3514           if (i == datarefs.length () - 1)
3515             {
3516               datarefs.pop ();
3517               break;
3518             }
3519           datarefs.ordered_remove (i);
3520           dr = datarefs[i];
3521           goto again;
3522         }
3523
3524       /* Check that analysis of the data-ref succeeded.  */
3525       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3526           || !DR_STEP (dr))
3527         {
3528           bool maybe_gather
3529             = DR_IS_READ (dr)
3530               && !TREE_THIS_VOLATILE (DR_REF (dr))
3531               && targetm.vectorize.builtin_gather != NULL;
3532           bool maybe_scatter
3533             = DR_IS_WRITE (dr)
3534               && !TREE_THIS_VOLATILE (DR_REF (dr))
3535               && targetm.vectorize.builtin_scatter != NULL;
3536           bool maybe_simd_lane_access
3537             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3538
3539           /* If target supports vector gather loads or scatter stores, or if
3540              this might be a SIMD lane access, see if they can't be used.  */
3541           if (is_a <loop_vec_info> (vinfo)
3542               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3543               && !nested_in_vect_loop_p (loop, stmt))
3544             {
3545               struct data_reference *newdr
3546                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3547                                    DR_REF (dr), stmt, !maybe_scatter,
3548                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3549               gcc_assert (newdr != NULL && DR_REF (newdr));
3550               if (DR_BASE_ADDRESS (newdr)
3551                   && DR_OFFSET (newdr)
3552                   && DR_INIT (newdr)
3553                   && DR_STEP (newdr)
3554                   && integer_zerop (DR_STEP (newdr)))
3555                 {
3556                   if (maybe_simd_lane_access)
3557                     {
3558                       tree off = DR_OFFSET (newdr);
3559                       STRIP_NOPS (off);
3560                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3561                           && TREE_CODE (off) == MULT_EXPR
3562                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3563                         {
3564                           tree step = TREE_OPERAND (off, 1);
3565                           off = TREE_OPERAND (off, 0);
3566                           STRIP_NOPS (off);
3567                           if (CONVERT_EXPR_P (off)
3568                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3569                                                                           0)))
3570                                  < TYPE_PRECISION (TREE_TYPE (off)))
3571                             off = TREE_OPERAND (off, 0);
3572                           if (TREE_CODE (off) == SSA_NAME)
3573                             {
3574                               gimple *def = SSA_NAME_DEF_STMT (off);
3575                               tree reft = TREE_TYPE (DR_REF (newdr));
3576                               if (is_gimple_call (def)
3577                                   && gimple_call_internal_p (def)
3578                                   && (gimple_call_internal_fn (def)
3579                                       == IFN_GOMP_SIMD_LANE))
3580                                 {
3581                                   tree arg = gimple_call_arg (def, 0);
3582                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3583                                   arg = SSA_NAME_VAR (arg);
3584                                   if (arg == loop->simduid
3585                                       /* For now.  */
3586                                       && tree_int_cst_equal
3587                                            (TYPE_SIZE_UNIT (reft),
3588                                             step))
3589                                     {
3590                                       DR_OFFSET (newdr) = ssize_int (0);
3591                                       DR_STEP (newdr) = step;
3592                                       DR_OFFSET_ALIGNMENT (newdr)
3593                                         = BIGGEST_ALIGNMENT;
3594                                       DR_STEP_ALIGNMENT (newdr)
3595                                         = highest_pow2_factor (step);
3596                                       dr = newdr;
3597                                       simd_lane_access = true;
3598                                     }
3599                                 }
3600                             }
3601                         }
3602                     }
3603                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3604                     {
3605                       dr = newdr;
3606                       if (maybe_gather)
3607                         gatherscatter = GATHER;
3608                       else
3609                         gatherscatter = SCATTER;
3610                     }
3611                 }
3612               if (gatherscatter == SG_NONE && !simd_lane_access)
3613                 free_data_ref (newdr);
3614             }
3615
3616           if (gatherscatter == SG_NONE && !simd_lane_access)
3617             {
3618               if (dump_enabled_p ())
3619                 {
3620                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621                                    "not vectorized: data ref analysis "
3622                                    "failed ");
3623                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3624                 }
3625
3626               if (is_a <bb_vec_info> (vinfo))
3627                 break;
3628
3629               return false;
3630             }
3631         }
3632
3633       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3634         {
3635           if (dump_enabled_p ())
3636             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637                              "not vectorized: base addr of dr is a "
3638                              "constant\n");
3639
3640           if (is_a <bb_vec_info> (vinfo))
3641             break;
3642
3643           if (gatherscatter != SG_NONE || simd_lane_access)
3644             free_data_ref (dr);
3645           return false;
3646         }
3647
3648       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3649         {
3650           if (dump_enabled_p ())
3651             {
3652               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653                                "not vectorized: volatile type ");
3654               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3655             }
3656
3657           if (is_a <bb_vec_info> (vinfo))
3658             break;
3659
3660           return false;
3661         }
3662
3663       if (stmt_can_throw_internal (stmt))
3664         {
3665           if (dump_enabled_p ())
3666             {
3667               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3668                                "not vectorized: statement can throw an "
3669                                "exception ");
3670               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3671             }
3672
3673           if (is_a <bb_vec_info> (vinfo))
3674             break;
3675
3676           if (gatherscatter != SG_NONE || simd_lane_access)
3677             free_data_ref (dr);
3678           return false;
3679         }
3680
3681       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3682           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3683         {
3684           if (dump_enabled_p ())
3685             {
3686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3687                                "not vectorized: statement is bitfield "
3688                                "access ");
3689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3690             }
3691
3692           if (is_a <bb_vec_info> (vinfo))
3693             break;
3694
3695           if (gatherscatter != SG_NONE || simd_lane_access)
3696             free_data_ref (dr);
3697           return false;
3698         }
3699
3700       base = unshare_expr (DR_BASE_ADDRESS (dr));
3701       offset = unshare_expr (DR_OFFSET (dr));
3702       init = unshare_expr (DR_INIT (dr));
3703
3704       if (is_gimple_call (stmt)
3705           && (!gimple_call_internal_p (stmt)
3706               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3707                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3708         {
3709           if (dump_enabled_p ())
3710             {
3711               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3712                                "not vectorized: dr in a call ");
3713               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3714             }
3715
3716           if (is_a <bb_vec_info> (vinfo))
3717             break;
3718
3719           if (gatherscatter != SG_NONE || simd_lane_access)
3720             free_data_ref (dr);
3721           return false;
3722         }
3723
3724       /* Update DR field in stmt_vec_info struct.  */
3725
3726       /* If the dataref is in an inner-loop of the loop that is considered for
3727          for vectorization, we also want to analyze the access relative to
3728          the outer-loop (DR contains information only relative to the
3729          inner-most enclosing loop).  We do that by building a reference to the
3730          first location accessed by the inner-loop, and analyze it relative to
3731          the outer-loop.  */
3732       if (loop && nested_in_vect_loop_p (loop, stmt))
3733         {
3734           /* Build a reference to the first location accessed by the
3735              inner loop: *(BASE + INIT + OFFSET).  By construction,
3736              this address must be invariant in the inner loop, so we
3737              can consider it as being used in the outer loop.  */
3738           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3739                                           init, offset);
3740           tree init_addr = fold_build_pointer_plus (base, init_offset);
3741           tree init_ref = build_fold_indirect_ref (init_addr);
3742
3743           if (dump_enabled_p ())
3744             {
3745               dump_printf_loc (MSG_NOTE, vect_location,
3746                                "analyze in outer loop: ");
3747               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3748               dump_printf (MSG_NOTE, "\n");
3749             }
3750
3751           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3752                                      init_ref, loop))
3753             /* dr_analyze_innermost already explained the failure.  */
3754             return false;
3755
3756           if (dump_enabled_p ())
3757             {
3758               dump_printf_loc (MSG_NOTE, vect_location,
3759                                "\touter base_address: ");
3760               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3761                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3762               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3763               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764                                  STMT_VINFO_DR_OFFSET (stmt_info));
3765               dump_printf (MSG_NOTE,
3766                            "\n\touter constant offset from base address: ");
3767               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768                                  STMT_VINFO_DR_INIT (stmt_info));
3769               dump_printf (MSG_NOTE, "\n\touter step: ");
3770               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771                                  STMT_VINFO_DR_STEP (stmt_info));
3772               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3773                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3774               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3775                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3776               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3777                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3778               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3779                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3780             }
3781         }
3782
3783       if (STMT_VINFO_DATA_REF (stmt_info))
3784         {
3785           if (dump_enabled_p ())
3786             {
3787               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3788                                "not vectorized: more than one data ref "
3789                                "in stmt: ");
3790               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3791             }
3792
3793           if (is_a <bb_vec_info> (vinfo))
3794             break;
3795
3796           if (gatherscatter != SG_NONE || simd_lane_access)
3797             free_data_ref (dr);
3798           return false;
3799         }
3800
3801       STMT_VINFO_DATA_REF (stmt_info) = dr;
3802       if (simd_lane_access)
3803         {
3804           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3805           free_data_ref (datarefs[i]);
3806           datarefs[i] = dr;
3807         }
3808
3809       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3810           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3811           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3812         {
3813           if (dump_enabled_p ())
3814             {
3815               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3816                                "not vectorized: base object not addressable "
3817                                "for stmt: ");
3818               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3819             }
3820           if (is_a <bb_vec_info> (vinfo))
3821             {
3822               /* In BB vectorization the ref can still participate
3823                  in dependence analysis, we just can't vectorize it.  */
3824               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825               continue;
3826             }
3827           return false;
3828         }
3829
3830       /* Set vectype for STMT.  */
3831       scalar_type = TREE_TYPE (DR_REF (dr));
3832       STMT_VINFO_VECTYPE (stmt_info)
3833         = get_vectype_for_scalar_type (scalar_type);
3834       if (!STMT_VINFO_VECTYPE (stmt_info))
3835         {
3836           if (dump_enabled_p ())
3837             {
3838               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3839                                "not vectorized: no vectype for stmt: ");
3840               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3841               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3842               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3843                                  scalar_type);
3844               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3845             }
3846
3847           if (is_a <bb_vec_info> (vinfo))
3848             {
3849               /* No vector type is fine, the ref can still participate
3850                  in dependence analysis, we just can't vectorize it.  */
3851               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3852               continue;
3853             }
3854
3855           if (gatherscatter != SG_NONE || simd_lane_access)
3856             {
3857               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3858               if (gatherscatter != SG_NONE)
3859                 free_data_ref (dr);
3860             }
3861           return false;
3862         }
3863       else
3864         {
3865           if (dump_enabled_p ())
3866             {
3867               dump_printf_loc (MSG_NOTE, vect_location,
3868                                "got vectype for stmt: ");
3869               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3870               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3871                                  STMT_VINFO_VECTYPE (stmt_info));
3872               dump_printf (MSG_NOTE, "\n");
3873             }
3874         }
3875
3876       /* Adjust the minimal vectorization factor according to the
3877          vector type.  */
3878       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3879       *min_vf = upper_bound (*min_vf, vf);
3880
3881       if (gatherscatter != SG_NONE)
3882         {
3883           gather_scatter_info gs_info;
3884           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3885                                           &gs_info)
3886               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3887             {
3888               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3889               free_data_ref (dr);
3890               if (dump_enabled_p ())
3891                 {
3892                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3893                                    (gatherscatter == GATHER) ?
3894                                    "not vectorized: not suitable for gather "
3895                                    "load " :
3896                                    "not vectorized: not suitable for scatter "
3897                                    "store ");
3898                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3899                 }
3900               return false;
3901             }
3902
3903           free_data_ref (datarefs[i]);
3904           datarefs[i] = dr;
3905           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3906         }
3907
3908       else if (is_a <loop_vec_info> (vinfo)
3909                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3910         {
3911           if (nested_in_vect_loop_p (loop, stmt))
3912             {
3913               if (dump_enabled_p ())
3914                 {
3915                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3916                                    "not vectorized: not suitable for strided "
3917                                    "load ");
3918                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3919                 }
3920               return false;
3921             }
3922           STMT_VINFO_STRIDED_P (stmt_info) = true;
3923         }
3924     }
3925
3926   /* If we stopped analysis at the first dataref we could not analyze
3927      when trying to vectorize a basic-block mark the rest of the datarefs
3928      as not vectorizable and truncate the vector of datarefs.  That
3929      avoids spending useless time in analyzing their dependence.  */
3930   if (i != datarefs.length ())
3931     {
3932       gcc_assert (is_a <bb_vec_info> (vinfo));
3933       for (unsigned j = i; j < datarefs.length (); ++j)
3934         {
3935           data_reference_p dr = datarefs[j];
3936           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3937           free_data_ref (dr);
3938         }
3939       datarefs.truncate (i);
3940     }
3941
3942   return true;
3943 }
3944
3945
3946 /* Function vect_get_new_vect_var.
3947
3948    Returns a name for a new variable.  The current naming scheme appends the
3949    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3950    the name of vectorizer generated variables, and appends that to NAME if
3951    provided.  */
3952
3953 tree
3954 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3955 {
3956   const char *prefix;
3957   tree new_vect_var;
3958
3959   switch (var_kind)
3960   {
3961   case vect_simple_var:
3962     prefix = "vect";
3963     break;
3964   case vect_scalar_var:
3965     prefix = "stmp";
3966     break;
3967   case vect_mask_var:
3968     prefix = "mask";
3969     break;
3970   case vect_pointer_var:
3971     prefix = "vectp";
3972     break;
3973   default:
3974     gcc_unreachable ();
3975   }
3976
3977   if (name)
3978     {
3979       char* tmp = concat (prefix, "_", name, NULL);
3980       new_vect_var = create_tmp_reg (type, tmp);
3981       free (tmp);
3982     }
3983   else
3984     new_vect_var = create_tmp_reg (type, prefix);
3985
3986   return new_vect_var;
3987 }
3988
3989 /* Like vect_get_new_vect_var but return an SSA name.  */
3990
3991 tree
3992 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3993 {
3994   const char *prefix;
3995   tree new_vect_var;
3996
3997   switch (var_kind)
3998   {
3999   case vect_simple_var:
4000     prefix = "vect";
4001     break;
4002   case vect_scalar_var:
4003     prefix = "stmp";
4004     break;
4005   case vect_pointer_var:
4006     prefix = "vectp";
4007     break;
4008   default:
4009     gcc_unreachable ();
4010   }
4011
4012   if (name)
4013     {
4014       char* tmp = concat (prefix, "_", name, NULL);
4015       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4016       free (tmp);
4017     }
4018   else
4019     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4020
4021   return new_vect_var;
4022 }
4023
4024 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4025
4026 static void
4027 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4028 {
4029   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4030   int misalign = DR_MISALIGNMENT (dr);
4031   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4032     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4033   else
4034     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4035                             DR_TARGET_ALIGNMENT (dr), misalign);
4036 }
4037
4038 /* Function vect_create_addr_base_for_vector_ref.
4039
4040    Create an expression that computes the address of the first memory location
4041    that will be accessed for a data reference.
4042
4043    Input:
4044    STMT: The statement containing the data reference.
4045    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4046    OFFSET: Optional. If supplied, it is be added to the initial address.
4047    LOOP:    Specify relative to which loop-nest should the address be computed.
4048             For example, when the dataref is in an inner-loop nested in an
4049             outer-loop that is now being vectorized, LOOP can be either the
4050             outer-loop, or the inner-loop.  The first memory location accessed
4051             by the following dataref ('in' points to short):
4052
4053                 for (i=0; i<N; i++)
4054                    for (j=0; j<M; j++)
4055                      s += in[i+j]
4056
4057             is as follows:
4058             if LOOP=i_loop:     &in             (relative to i_loop)
4059             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4060    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4061             initial address.  Unlike OFFSET, which is number of elements to
4062             be added, BYTE_OFFSET is measured in bytes.
4063
4064    Output:
4065    1. Return an SSA_NAME whose value is the address of the memory location of
4066       the first vector of the data reference.
4067    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4068       these statement(s) which define the returned SSA_NAME.
4069
4070    FORNOW: We are only handling array accesses with step 1.  */
4071
4072 tree
4073 vect_create_addr_base_for_vector_ref (gimple *stmt,
4074                                       gimple_seq *new_stmt_list,
4075                                       tree offset,
4076                                       tree byte_offset)
4077 {
4078   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4079   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4080   const char *base_name;
4081   tree addr_base;
4082   tree dest;
4083   gimple_seq seq = NULL;
4084   tree vect_ptr_type;
4085   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4086   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4087   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4088
4089   tree data_ref_base = unshare_expr (drb->base_address);
4090   tree base_offset = unshare_expr (drb->offset);
4091   tree init = unshare_expr (drb->init);
4092
4093   if (loop_vinfo)
4094     base_name = get_name (data_ref_base);
4095   else
4096     {
4097       base_offset = ssize_int (0);
4098       init = ssize_int (0);
4099       base_name = get_name (DR_REF (dr));
4100     }
4101
4102   /* Create base_offset */
4103   base_offset = size_binop (PLUS_EXPR,
4104                             fold_convert (sizetype, base_offset),
4105                             fold_convert (sizetype, init));
4106
4107   if (offset)
4108     {
4109       offset = fold_build2 (MULT_EXPR, sizetype,
4110                             fold_convert (sizetype, offset), step);
4111       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4112                                  base_offset, offset);
4113     }
4114   if (byte_offset)
4115     {
4116       byte_offset = fold_convert (sizetype, byte_offset);
4117       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4118                                  base_offset, byte_offset);
4119     }
4120
4121   /* base + base_offset */
4122   if (loop_vinfo)
4123     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4124   else
4125     {
4126       addr_base = build1 (ADDR_EXPR,
4127                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4128                           unshare_expr (DR_REF (dr)));
4129     }
4130
4131   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4132   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4133   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4134   gimple_seq_add_seq (new_stmt_list, seq);
4135
4136   if (DR_PTR_INFO (dr)
4137       && TREE_CODE (addr_base) == SSA_NAME
4138       && !SSA_NAME_PTR_INFO (addr_base))
4139     {
4140       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4141       if (offset || byte_offset)
4142         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4143     }
4144
4145   if (dump_enabled_p ())
4146     {
4147       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4148       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4149       dump_printf (MSG_NOTE, "\n");
4150     }
4151
4152   return addr_base;
4153 }
4154
4155
4156 /* Function vect_create_data_ref_ptr.
4157
4158    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4159    location accessed in the loop by STMT, along with the def-use update
4160    chain to appropriately advance the pointer through the loop iterations.
4161    Also set aliasing information for the pointer.  This pointer is used by
4162    the callers to this function to create a memory reference expression for
4163    vector load/store access.
4164
4165    Input:
4166    1. STMT: a stmt that references memory. Expected to be of the form
4167          GIMPLE_ASSIGN <name, data-ref> or
4168          GIMPLE_ASSIGN <data-ref, name>.
4169    2. AGGR_TYPE: the type of the reference, which should be either a vector
4170         or an array.
4171    3. AT_LOOP: the loop where the vector memref is to be created.
4172    4. OFFSET (optional): an offset to be added to the initial address accessed
4173         by the data-ref in STMT.
4174    5. BSI: location where the new stmts are to be placed if there is no loop
4175    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4176         pointing to the initial address.
4177    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4178         to the initial address accessed by the data-ref in STMT.  This is
4179         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4180         in bytes.
4181
4182    Output:
4183    1. Declare a new ptr to vector_type, and have it point to the base of the
4184       data reference (initial addressed accessed by the data reference).
4185       For example, for vector of type V8HI, the following code is generated:
4186
4187       v8hi *ap;
4188       ap = (v8hi *)initial_address;
4189
4190       if OFFSET is not supplied:
4191          initial_address = &a[init];
4192       if OFFSET is supplied:
4193          initial_address = &a[init + OFFSET];
4194       if BYTE_OFFSET is supplied:
4195          initial_address = &a[init] + BYTE_OFFSET;
4196
4197       Return the initial_address in INITIAL_ADDRESS.
4198
4199    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4200       update the pointer in each iteration of the loop.
4201
4202       Return the increment stmt that updates the pointer in PTR_INCR.
4203
4204    3. Set INV_P to true if the access pattern of the data reference in the
4205       vectorized loop is invariant.  Set it to false otherwise.
4206
4207    4. Return the pointer.  */
4208
4209 tree
4210 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4211                           tree offset, tree *initial_address,
4212                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4213                           bool only_init, bool *inv_p, tree byte_offset)
4214 {
4215   const char *base_name;
4216   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4217   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4218   struct loop *loop = NULL;
4219   bool nested_in_vect_loop = false;
4220   struct loop *containing_loop = NULL;
4221   tree aggr_ptr_type;
4222   tree aggr_ptr;
4223   tree new_temp;
4224   gimple_seq new_stmt_list = NULL;
4225   edge pe = NULL;
4226   basic_block new_bb;
4227   tree aggr_ptr_init;
4228   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4229   tree aptr;
4230   gimple_stmt_iterator incr_gsi;
4231   bool insert_after;
4232   tree indx_before_incr, indx_after_incr;
4233   gimple *incr;
4234   tree step;
4235   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4236
4237   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4238               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4239
4240   if (loop_vinfo)
4241     {
4242       loop = LOOP_VINFO_LOOP (loop_vinfo);
4243       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4244       containing_loop = (gimple_bb (stmt))->loop_father;
4245       pe = loop_preheader_edge (loop);
4246     }
4247   else
4248     {
4249       gcc_assert (bb_vinfo);
4250       only_init = true;
4251       *ptr_incr = NULL;
4252     }
4253
4254   /* Check the step (evolution) of the load in LOOP, and record
4255      whether it's invariant.  */
4256   step = vect_dr_behavior (dr)->step;
4257   if (integer_zerop (step))
4258     *inv_p = true;
4259   else
4260     *inv_p = false;
4261
4262   /* Create an expression for the first address accessed by this load
4263      in LOOP.  */
4264   base_name = get_name (DR_BASE_ADDRESS (dr));
4265
4266   if (dump_enabled_p ())
4267     {
4268       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4269       dump_printf_loc (MSG_NOTE, vect_location,
4270                        "create %s-pointer variable to type: ",
4271                        get_tree_code_name (TREE_CODE (aggr_type)));
4272       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4273       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4274         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4275       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4276         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4277       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4278         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4279       else
4280         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4281       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4282       dump_printf (MSG_NOTE, "\n");
4283     }
4284
4285   /* (1) Create the new aggregate-pointer variable.
4286      Vector and array types inherit the alias set of their component
4287      type by default so we need to use a ref-all pointer if the data
4288      reference does not conflict with the created aggregated data
4289      reference because it is not addressable.  */
4290   bool need_ref_all = false;
4291   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4292                               get_alias_set (DR_REF (dr))))
4293     need_ref_all = true;
4294   /* Likewise for any of the data references in the stmt group.  */
4295   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4296     {
4297       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4298       do
4299         {
4300           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4301           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4302           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4303                                       get_alias_set (DR_REF (sdr))))
4304             {
4305               need_ref_all = true;
4306               break;
4307             }
4308           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4309         }
4310       while (orig_stmt);
4311     }
4312   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4313                                                need_ref_all);
4314   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4315
4316
4317   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4318      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4319      def-use update cycles for the pointer: one relative to the outer-loop
4320      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4321      to the inner-loop (which is the inner-most loop containing the dataref),
4322      and this is done be step (5) below.
4323
4324      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4325      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4326      redundant.  Steps (3),(4) create the following:
4327
4328         vp0 = &base_addr;
4329         LOOP:   vp1 = phi(vp0,vp2)
4330                 ...
4331                 ...
4332                 vp2 = vp1 + step
4333                 goto LOOP
4334
4335      If there is an inner-loop nested in loop, then step (5) will also be
4336      applied, and an additional update in the inner-loop will be created:
4337
4338         vp0 = &base_addr;
4339         LOOP:   vp1 = phi(vp0,vp2)
4340                 ...
4341         inner:     vp3 = phi(vp1,vp4)
4342                    vp4 = vp3 + inner_step
4343                    if () goto inner
4344                 ...
4345                 vp2 = vp1 + step
4346                 if () goto LOOP   */
4347
4348   /* (2) Calculate the initial address of the aggregate-pointer, and set
4349      the aggregate-pointer to point to it before the loop.  */
4350
4351   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4352
4353   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4354                                                    offset, byte_offset);
4355   if (new_stmt_list)
4356     {
4357       if (pe)
4358         {
4359           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4360           gcc_assert (!new_bb);
4361         }
4362       else
4363         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4364     }
4365
4366   *initial_address = new_temp;
4367   aggr_ptr_init = new_temp;
4368
4369   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4370      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4371      inner-loop nested in LOOP (during outer-loop vectorization).  */
4372
4373   /* No update in loop is required.  */
4374   if (only_init && (!loop_vinfo || at_loop == loop))
4375     aptr = aggr_ptr_init;
4376   else
4377     {
4378       /* The step of the aggregate pointer is the type size.  */
4379       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4380       /* One exception to the above is when the scalar step of the load in
4381          LOOP is zero. In this case the step here is also zero.  */
4382       if (*inv_p)
4383         iv_step = size_zero_node;
4384       else if (tree_int_cst_sgn (step) == -1)
4385         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4386
4387       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4388
4389       create_iv (aggr_ptr_init,
4390                  fold_convert (aggr_ptr_type, iv_step),
4391                  aggr_ptr, loop, &incr_gsi, insert_after,
4392                  &indx_before_incr, &indx_after_incr);
4393       incr = gsi_stmt (incr_gsi);
4394       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4395
4396       /* Copy the points-to information if it exists. */
4397       if (DR_PTR_INFO (dr))
4398         {
4399           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4400           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4401         }
4402       if (ptr_incr)
4403         *ptr_incr = incr;
4404
4405       aptr = indx_before_incr;
4406     }
4407
4408   if (!nested_in_vect_loop || only_init)
4409     return aptr;
4410
4411
4412   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4413      nested in LOOP, if exists.  */
4414
4415   gcc_assert (nested_in_vect_loop);
4416   if (!only_init)
4417     {
4418       standard_iv_increment_position (containing_loop, &incr_gsi,
4419                                       &insert_after);
4420       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4421                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4422                  &indx_after_incr);
4423       incr = gsi_stmt (incr_gsi);
4424       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4425
4426       /* Copy the points-to information if it exists. */
4427       if (DR_PTR_INFO (dr))
4428         {
4429           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4430           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4431         }
4432       if (ptr_incr)
4433         *ptr_incr = incr;
4434
4435       return indx_before_incr;
4436     }
4437   else
4438     gcc_unreachable ();
4439 }
4440
4441
4442 /* Function bump_vector_ptr
4443
4444    Increment a pointer (to a vector type) by vector-size. If requested,
4445    i.e. if PTR-INCR is given, then also connect the new increment stmt
4446    to the existing def-use update-chain of the pointer, by modifying
4447    the PTR_INCR as illustrated below:
4448
4449    The pointer def-use update-chain before this function:
4450                         DATAREF_PTR = phi (p_0, p_2)
4451                         ....
4452         PTR_INCR:       p_2 = DATAREF_PTR + step
4453
4454    The pointer def-use update-chain after this function:
4455                         DATAREF_PTR = phi (p_0, p_2)
4456                         ....
4457                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4458                         ....
4459         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4460
4461    Input:
4462    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4463                  in the loop.
4464    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4465               the loop.  The increment amount across iterations is expected
4466               to be vector_size.
4467    BSI - location where the new update stmt is to be placed.
4468    STMT - the original scalar memory-access stmt that is being vectorized.
4469    BUMP - optional. The offset by which to bump the pointer. If not given,
4470           the offset is assumed to be vector_size.
4471
4472    Output: Return NEW_DATAREF_PTR as illustrated above.
4473
4474 */
4475
4476 tree
4477 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4478                  gimple *stmt, tree bump)
4479 {
4480   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4481   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4482   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4483   tree update = TYPE_SIZE_UNIT (vectype);
4484   gassign *incr_stmt;
4485   ssa_op_iter iter;
4486   use_operand_p use_p;
4487   tree new_dataref_ptr;
4488
4489   if (bump)
4490     update = bump;
4491
4492   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4493     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4494   else
4495     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4496   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4497                                    dataref_ptr, update);
4498   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4499
4500   /* Copy the points-to information if it exists. */
4501   if (DR_PTR_INFO (dr))
4502     {
4503       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4504       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4505     }
4506
4507   if (!ptr_incr)
4508     return new_dataref_ptr;
4509
4510   /* Update the vector-pointer's cross-iteration increment.  */
4511   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4512     {
4513       tree use = USE_FROM_PTR (use_p);
4514
4515       if (use == dataref_ptr)
4516         SET_USE (use_p, new_dataref_ptr);
4517       else
4518         gcc_assert (tree_int_cst_compare (use, update) == 0);
4519     }
4520
4521   return new_dataref_ptr;
4522 }
4523
4524
4525 /* Function vect_create_destination_var.
4526
4527    Create a new temporary of type VECTYPE.  */
4528
4529 tree
4530 vect_create_destination_var (tree scalar_dest, tree vectype)
4531 {
4532   tree vec_dest;
4533   const char *name;
4534   char *new_name;
4535   tree type;
4536   enum vect_var_kind kind;
4537
4538   kind = vectype
4539     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4540     ? vect_mask_var
4541     : vect_simple_var
4542     : vect_scalar_var;
4543   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4544
4545   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4546
4547   name = get_name (scalar_dest);
4548   if (name)
4549     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4550   else
4551     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4552   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4553   free (new_name);
4554
4555   return vec_dest;
4556 }
4557
4558 /* Function vect_grouped_store_supported.
4559
4560    Returns TRUE if interleave high and interleave low permutations
4561    are supported, and FALSE otherwise.  */
4562
4563 bool
4564 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4565 {
4566   machine_mode mode = TYPE_MODE (vectype);
4567
4568   /* vect_permute_store_chain requires the group size to be equal to 3 or
4569      be a power of two.  */
4570   if (count != 3 && exact_log2 (count) == -1)
4571     {
4572       if (dump_enabled_p ())
4573         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4574                          "the size of the group of accesses"
4575                          " is not a power of 2 or not eqaul to 3\n");
4576       return false;
4577     }
4578
4579   /* Check that the permutation is supported.  */
4580   if (VECTOR_MODE_P (mode))
4581     {
4582       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4583       if (count == 3)
4584         {
4585           unsigned int j0 = 0, j1 = 0, j2 = 0;
4586           unsigned int i, j;
4587
4588           vec_perm_builder sel (nelt, nelt, 1);
4589           sel.quick_grow (nelt);
4590           vec_perm_indices indices;
4591           for (j = 0; j < 3; j++)
4592             {
4593               int nelt0 = ((3 - j) * nelt) % 3;
4594               int nelt1 = ((3 - j) * nelt + 1) % 3;
4595               int nelt2 = ((3 - j) * nelt + 2) % 3;
4596               for (i = 0; i < nelt; i++)
4597                 {
4598                   if (3 * i + nelt0 < nelt)
4599                     sel[3 * i + nelt0] = j0++;
4600                   if (3 * i + nelt1 < nelt)
4601                     sel[3 * i + nelt1] = nelt + j1++;
4602                   if (3 * i + nelt2 < nelt)
4603                     sel[3 * i + nelt2] = 0;
4604                 }
4605               indices.new_vector (sel, 2, nelt);
4606               if (!can_vec_perm_const_p (mode, indices))
4607                 {
4608                   if (dump_enabled_p ())
4609                     dump_printf (MSG_MISSED_OPTIMIZATION,
4610                                  "permutation op not supported by target.\n");
4611                   return false;
4612                 }
4613
4614               for (i = 0; i < nelt; i++)
4615                 {
4616                   if (3 * i + nelt0 < nelt)
4617                     sel[3 * i + nelt0] = 3 * i + nelt0;
4618                   if (3 * i + nelt1 < nelt)
4619                     sel[3 * i + nelt1] = 3 * i + nelt1;
4620                   if (3 * i + nelt2 < nelt)
4621                     sel[3 * i + nelt2] = nelt + j2++;
4622                 }
4623               indices.new_vector (sel, 2, nelt);
4624               if (!can_vec_perm_const_p (mode, indices))
4625                 {
4626                   if (dump_enabled_p ())
4627                     dump_printf (MSG_MISSED_OPTIMIZATION,
4628                                  "permutation op not supported by target.\n");
4629                   return false;
4630                 }
4631             }
4632           return true;
4633         }
4634       else
4635         {
4636           /* If length is not equal to 3 then only power of 2 is supported.  */
4637           gcc_assert (pow2p_hwi (count));
4638
4639           /* The encoding has 2 interleaved stepped patterns.  */
4640           vec_perm_builder sel (nelt, 2, 3);
4641           sel.quick_grow (6);
4642           for (i = 0; i < 3; i++)
4643             {
4644               sel[i * 2] = i;
4645               sel[i * 2 + 1] = i + nelt;
4646             }
4647           vec_perm_indices indices (sel, 2, nelt);
4648           if (can_vec_perm_const_p (mode, indices))
4649             {
4650               for (i = 0; i < 6; i++)
4651                 sel[i] += nelt / 2;
4652               indices.new_vector (sel, 2, nelt);
4653               if (can_vec_perm_const_p (mode, indices))
4654                 return true;
4655             }
4656         }
4657     }
4658
4659   if (dump_enabled_p ())
4660     dump_printf (MSG_MISSED_OPTIMIZATION,
4661                  "permutaion op not supported by target.\n");
4662   return false;
4663 }
4664
4665
4666 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4667    type VECTYPE.  */
4668
4669 bool
4670 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4671 {
4672   return vect_lanes_optab_supported_p ("vec_store_lanes",
4673                                        vec_store_lanes_optab,
4674                                        vectype, count);
4675 }
4676
4677
4678 /* Function vect_permute_store_chain.
4679
4680    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4681    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4682    the data correctly for the stores.  Return the final references for stores
4683    in RESULT_CHAIN.
4684
4685    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4686    The input is 4 vectors each containing 8 elements.  We assign a number to
4687    each element, the input sequence is:
4688
4689    1st vec:   0  1  2  3  4  5  6  7
4690    2nd vec:   8  9 10 11 12 13 14 15
4691    3rd vec:  16 17 18 19 20 21 22 23
4692    4th vec:  24 25 26 27 28 29 30 31
4693
4694    The output sequence should be:
4695
4696    1st vec:  0  8 16 24  1  9 17 25
4697    2nd vec:  2 10 18 26  3 11 19 27
4698    3rd vec:  4 12 20 28  5 13 21 30
4699    4th vec:  6 14 22 30  7 15 23 31
4700
4701    i.e., we interleave the contents of the four vectors in their order.
4702
4703    We use interleave_high/low instructions to create such output.  The input of
4704    each interleave_high/low operation is two vectors:
4705    1st vec    2nd vec
4706    0 1 2 3    4 5 6 7
4707    the even elements of the result vector are obtained left-to-right from the
4708    high/low elements of the first vector.  The odd elements of the result are
4709    obtained left-to-right from the high/low elements of the second vector.
4710    The output of interleave_high will be:   0 4 1 5
4711    and of interleave_low:                   2 6 3 7
4712
4713
4714    The permutation is done in log LENGTH stages.  In each stage interleave_high
4715    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4716    where the first argument is taken from the first half of DR_CHAIN and the
4717    second argument from it's second half.
4718    In our example,
4719
4720    I1: interleave_high (1st vec, 3rd vec)
4721    I2: interleave_low (1st vec, 3rd vec)
4722    I3: interleave_high (2nd vec, 4th vec)
4723    I4: interleave_low (2nd vec, 4th vec)
4724
4725    The output for the first stage is:
4726
4727    I1:  0 16  1 17  2 18  3 19
4728    I2:  4 20  5 21  6 22  7 23
4729    I3:  8 24  9 25 10 26 11 27
4730    I4: 12 28 13 29 14 30 15 31
4731
4732    The output of the second stage, i.e. the final result is:
4733
4734    I1:  0  8 16 24  1  9 17 25
4735    I2:  2 10 18 26  3 11 19 27
4736    I3:  4 12 20 28  5 13 21 30
4737    I4:  6 14 22 30  7 15 23 31.  */
4738
4739 void
4740 vect_permute_store_chain (vec<tree> dr_chain,
4741                           unsigned int length,
4742                           gimple *stmt,
4743                           gimple_stmt_iterator *gsi,
4744                           vec<tree> *result_chain)
4745 {
4746   tree vect1, vect2, high, low;
4747   gimple *perm_stmt;
4748   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4749   tree perm_mask_low, perm_mask_high;
4750   tree data_ref;
4751   tree perm3_mask_low, perm3_mask_high;
4752   unsigned int i, n, log_length = exact_log2 (length);
4753   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4754
4755   result_chain->quick_grow (length);
4756   memcpy (result_chain->address (), dr_chain.address (),
4757           length * sizeof (tree));
4758
4759   if (length == 3)
4760     {
4761       unsigned int j0 = 0, j1 = 0, j2 = 0;
4762
4763       vec_perm_builder sel (nelt, nelt, 1);
4764       sel.quick_grow (nelt);
4765       vec_perm_indices indices;
4766       for (j = 0; j < 3; j++)
4767         {
4768           int nelt0 = ((3 - j) * nelt) % 3;
4769           int nelt1 = ((3 - j) * nelt + 1) % 3;
4770           int nelt2 = ((3 - j) * nelt + 2) % 3;
4771
4772           for (i = 0; i < nelt; i++)
4773             {
4774               if (3 * i + nelt0 < nelt)
4775                 sel[3 * i + nelt0] = j0++;
4776               if (3 * i + nelt1 < nelt)
4777                 sel[3 * i + nelt1] = nelt + j1++;
4778               if (3 * i + nelt2 < nelt)
4779                 sel[3 * i + nelt2] = 0;
4780             }
4781           indices.new_vector (sel, 2, nelt);
4782           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4783
4784           for (i = 0; i < nelt; i++)
4785             {
4786               if (3 * i + nelt0 < nelt)
4787                 sel[3 * i + nelt0] = 3 * i + nelt0;
4788               if (3 * i + nelt1 < nelt)
4789                 sel[3 * i + nelt1] = 3 * i + nelt1;
4790               if (3 * i + nelt2 < nelt)
4791                 sel[3 * i + nelt2] = nelt + j2++;
4792             }
4793           indices.new_vector (sel, 2, nelt);
4794           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4795
4796           vect1 = dr_chain[0];
4797           vect2 = dr_chain[1];
4798
4799           /* Create interleaving stmt:
4800              low = VEC_PERM_EXPR <vect1, vect2,
4801                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4802                                    j + 2, nelt + j + 2, *, ...}>  */
4803           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4804           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4805                                            vect2, perm3_mask_low);
4806           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4807
4808           vect1 = data_ref;
4809           vect2 = dr_chain[2];
4810           /* Create interleaving stmt:
4811              low = VEC_PERM_EXPR <vect1, vect2,
4812                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4813                                    6, 7, nelt + j + 2, ...}>  */
4814           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4815           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4816                                            vect2, perm3_mask_high);
4817           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4818           (*result_chain)[j] = data_ref;
4819         }
4820     }
4821   else
4822     {
4823       /* If length is not equal to 3 then only power of 2 is supported.  */
4824       gcc_assert (pow2p_hwi (length));
4825
4826       /* The encoding has 2 interleaved stepped patterns.  */
4827       vec_perm_builder sel (nelt, 2, 3);
4828       sel.quick_grow (6);
4829       for (i = 0; i < 3; i++)
4830         {
4831           sel[i * 2] = i;
4832           sel[i * 2 + 1] = i + nelt;
4833         }
4834         vec_perm_indices indices (sel, 2, nelt);
4835         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4836
4837         for (i = 0; i < 6; i++)
4838           sel[i] += nelt / 2;
4839         indices.new_vector (sel, 2, nelt);
4840         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4841
4842         for (i = 0, n = log_length; i < n; i++)
4843           {
4844             for (j = 0; j < length/2; j++)
4845               {
4846                 vect1 = dr_chain[j];
4847                 vect2 = dr_chain[j+length/2];
4848
4849                 /* Create interleaving stmt:
4850                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4851                                                         ...}>  */
4852                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4853                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4854                                                  vect2, perm_mask_high);
4855                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4856                 (*result_chain)[2*j] = high;
4857
4858                 /* Create interleaving stmt:
4859                    low = VEC_PERM_EXPR <vect1, vect2,
4860                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4861                                          ...}>  */
4862                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4863                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4864                                                  vect2, perm_mask_low);
4865                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4866                 (*result_chain)[2*j+1] = low;
4867               }
4868             memcpy (dr_chain.address (), result_chain->address (),
4869                     length * sizeof (tree));
4870           }
4871     }
4872 }
4873
4874 /* Function vect_setup_realignment
4875
4876    This function is called when vectorizing an unaligned load using
4877    the dr_explicit_realign[_optimized] scheme.
4878    This function generates the following code at the loop prolog:
4879
4880       p = initial_addr;
4881    x  msq_init = *(floor(p));   # prolog load
4882       realignment_token = call target_builtin;
4883     loop:
4884    x  msq = phi (msq_init, ---)
4885
4886    The stmts marked with x are generated only for the case of
4887    dr_explicit_realign_optimized.
4888
4889    The code above sets up a new (vector) pointer, pointing to the first
4890    location accessed by STMT, and a "floor-aligned" load using that pointer.
4891    It also generates code to compute the "realignment-token" (if the relevant
4892    target hook was defined), and creates a phi-node at the loop-header bb
4893    whose arguments are the result of the prolog-load (created by this
4894    function) and the result of a load that takes place in the loop (to be
4895    created by the caller to this function).
4896
4897    For the case of dr_explicit_realign_optimized:
4898    The caller to this function uses the phi-result (msq) to create the
4899    realignment code inside the loop, and sets up the missing phi argument,
4900    as follows:
4901     loop:
4902       msq = phi (msq_init, lsq)
4903       lsq = *(floor(p'));        # load in loop
4904       result = realign_load (msq, lsq, realignment_token);
4905
4906    For the case of dr_explicit_realign:
4907     loop:
4908       msq = *(floor(p));        # load in loop
4909       p' = p + (VS-1);
4910       lsq = *(floor(p'));       # load in loop
4911       result = realign_load (msq, lsq, realignment_token);
4912
4913    Input:
4914    STMT - (scalar) load stmt to be vectorized. This load accesses
4915           a memory location that may be unaligned.
4916    BSI - place where new code is to be inserted.
4917    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4918                               is used.
4919
4920    Output:
4921    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4922                        target hook, if defined.
4923    Return value - the result of the loop-header phi node.  */
4924
4925 tree
4926 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4927                         tree *realignment_token,
4928                         enum dr_alignment_support alignment_support_scheme,
4929                         tree init_addr,
4930                         struct loop **at_loop)
4931 {
4932   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4933   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4934   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4935   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4936   struct loop *loop = NULL;
4937   edge pe = NULL;
4938   tree scalar_dest = gimple_assign_lhs (stmt);
4939   tree vec_dest;
4940   gimple *inc;
4941   tree ptr;
4942   tree data_ref;
4943   basic_block new_bb;
4944   tree msq_init = NULL_TREE;
4945   tree new_temp;
4946   gphi *phi_stmt;
4947   tree msq = NULL_TREE;
4948   gimple_seq stmts = NULL;
4949   bool inv_p;
4950   bool compute_in_loop = false;
4951   bool nested_in_vect_loop = false;
4952   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4953   struct loop *loop_for_initial_load = NULL;
4954
4955   if (loop_vinfo)
4956     {
4957       loop = LOOP_VINFO_LOOP (loop_vinfo);
4958       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4959     }
4960
4961   gcc_assert (alignment_support_scheme == dr_explicit_realign
4962               || alignment_support_scheme == dr_explicit_realign_optimized);
4963
4964   /* We need to generate three things:
4965      1. the misalignment computation
4966      2. the extra vector load (for the optimized realignment scheme).
4967      3. the phi node for the two vectors from which the realignment is
4968       done (for the optimized realignment scheme).  */
4969
4970   /* 1. Determine where to generate the misalignment computation.
4971
4972      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4973      calculation will be generated by this function, outside the loop (in the
4974      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4975      caller, inside the loop.
4976
4977      Background: If the misalignment remains fixed throughout the iterations of
4978      the loop, then both realignment schemes are applicable, and also the
4979      misalignment computation can be done outside LOOP.  This is because we are
4980      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4981      are a multiple of VS (the Vector Size), and therefore the misalignment in
4982      different vectorized LOOP iterations is always the same.
4983      The problem arises only if the memory access is in an inner-loop nested
4984      inside LOOP, which is now being vectorized using outer-loop vectorization.
4985      This is the only case when the misalignment of the memory access may not
4986      remain fixed throughout the iterations of the inner-loop (as explained in
4987      detail in vect_supportable_dr_alignment).  In this case, not only is the
4988      optimized realignment scheme not applicable, but also the misalignment
4989      computation (and generation of the realignment token that is passed to
4990      REALIGN_LOAD) have to be done inside the loop.
4991
4992      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4993      or not, which in turn determines if the misalignment is computed inside
4994      the inner-loop, or outside LOOP.  */
4995
4996   if (init_addr != NULL_TREE || !loop_vinfo)
4997     {
4998       compute_in_loop = true;
4999       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5000     }
5001
5002
5003   /* 2. Determine where to generate the extra vector load.
5004
5005      For the optimized realignment scheme, instead of generating two vector
5006      loads in each iteration, we generate a single extra vector load in the
5007      preheader of the loop, and in each iteration reuse the result of the
5008      vector load from the previous iteration.  In case the memory access is in
5009      an inner-loop nested inside LOOP, which is now being vectorized using
5010      outer-loop vectorization, we need to determine whether this initial vector
5011      load should be generated at the preheader of the inner-loop, or can be
5012      generated at the preheader of LOOP.  If the memory access has no evolution
5013      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5014      to be generated inside LOOP (in the preheader of the inner-loop).  */
5015
5016   if (nested_in_vect_loop)
5017     {
5018       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5019       bool invariant_in_outerloop =
5020             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5021       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5022     }
5023   else
5024     loop_for_initial_load = loop;
5025   if (at_loop)
5026     *at_loop = loop_for_initial_load;
5027
5028   if (loop_for_initial_load)
5029     pe = loop_preheader_edge (loop_for_initial_load);
5030
5031   /* 3. For the case of the optimized realignment, create the first vector
5032       load at the loop preheader.  */
5033
5034   if (alignment_support_scheme == dr_explicit_realign_optimized)
5035     {
5036       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5037       gassign *new_stmt;
5038
5039       gcc_assert (!compute_in_loop);
5040       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5042                                       NULL_TREE, &init_addr, NULL, &inc,
5043                                       true, &inv_p);
5044       if (TREE_CODE (ptr) == SSA_NAME)
5045         new_temp = copy_ssa_name (ptr);
5046       else
5047         new_temp = make_ssa_name (TREE_TYPE (ptr));
5048       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5049       new_stmt = gimple_build_assign
5050                    (new_temp, BIT_AND_EXPR, ptr,
5051                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5052       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5053       gcc_assert (!new_bb);
5054       data_ref
5055         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5056                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5057       new_stmt = gimple_build_assign (vec_dest, data_ref);
5058       new_temp = make_ssa_name (vec_dest, new_stmt);
5059       gimple_assign_set_lhs (new_stmt, new_temp);
5060       if (pe)
5061         {
5062           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5063           gcc_assert (!new_bb);
5064         }
5065       else
5066          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5067
5068       msq_init = gimple_assign_lhs (new_stmt);
5069     }
5070
5071   /* 4. Create realignment token using a target builtin, if available.
5072       It is done either inside the containing loop, or before LOOP (as
5073       determined above).  */
5074
5075   if (targetm.vectorize.builtin_mask_for_load)
5076     {
5077       gcall *new_stmt;
5078       tree builtin_decl;
5079
5080       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5081       if (!init_addr)
5082         {
5083           /* Generate the INIT_ADDR computation outside LOOP.  */
5084           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5085                                                             NULL_TREE);
5086           if (loop)
5087             {
5088               pe = loop_preheader_edge (loop);
5089               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5090               gcc_assert (!new_bb);
5091             }
5092           else
5093              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5094         }
5095
5096       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5097       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5098       vec_dest =
5099         vect_create_destination_var (scalar_dest,
5100                                      gimple_call_return_type (new_stmt));
5101       new_temp = make_ssa_name (vec_dest, new_stmt);
5102       gimple_call_set_lhs (new_stmt, new_temp);
5103
5104       if (compute_in_loop)
5105         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5106       else
5107         {
5108           /* Generate the misalignment computation outside LOOP.  */
5109           pe = loop_preheader_edge (loop);
5110           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5111           gcc_assert (!new_bb);
5112         }
5113
5114       *realignment_token = gimple_call_lhs (new_stmt);
5115
5116       /* The result of the CALL_EXPR to this builtin is determined from
5117          the value of the parameter and no global variables are touched
5118          which makes the builtin a "const" function.  Requiring the
5119          builtin to have the "const" attribute makes it unnecessary
5120          to call mark_call_clobbered.  */
5121       gcc_assert (TREE_READONLY (builtin_decl));
5122     }
5123
5124   if (alignment_support_scheme == dr_explicit_realign)
5125     return msq;
5126
5127   gcc_assert (!compute_in_loop);
5128   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5129
5130
5131   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5132
5133   pe = loop_preheader_edge (containing_loop);
5134   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5135   msq = make_ssa_name (vec_dest);
5136   phi_stmt = create_phi_node (msq, containing_loop->header);
5137   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5138
5139   return msq;
5140 }
5141
5142
5143 /* Function vect_grouped_load_supported.
5144
5145    COUNT is the size of the load group (the number of statements plus the
5146    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5147    only one statement, with a gap of COUNT - 1.
5148
5149    Returns true if a suitable permute exists.  */
5150
5151 bool
5152 vect_grouped_load_supported (tree vectype, bool single_element_p,
5153                              unsigned HOST_WIDE_INT count)
5154 {
5155   machine_mode mode = TYPE_MODE (vectype);
5156
5157   /* If this is single-element interleaving with an element distance
5158      that leaves unused vector loads around punt - we at least create
5159      very sub-optimal code in that case (and blow up memory,
5160      see PR65518).  */
5161   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5162     {
5163       if (dump_enabled_p ())
5164         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165                          "single-element interleaving not supported "
5166                          "for not adjacent vector loads\n");
5167       return false;
5168     }
5169
5170   /* vect_permute_load_chain requires the group size to be equal to 3 or
5171      be a power of two.  */
5172   if (count != 3 && exact_log2 (count) == -1)
5173     {
5174       if (dump_enabled_p ())
5175         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176                          "the size of the group of accesses"
5177                          " is not a power of 2 or not equal to 3\n");
5178       return false;
5179     }
5180
5181   /* Check that the permutation is supported.  */
5182   if (VECTOR_MODE_P (mode))
5183     {
5184       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5185
5186       if (count == 3)
5187         {
5188           vec_perm_builder sel (nelt, nelt, 1);
5189           sel.quick_grow (nelt);
5190           vec_perm_indices indices;
5191           unsigned int k;
5192           for (k = 0; k < 3; k++)
5193             {
5194               for (i = 0; i < nelt; i++)
5195                 if (3 * i + k < 2 * nelt)
5196                   sel[i] = 3 * i + k;
5197                 else
5198                   sel[i] = 0;
5199               indices.new_vector (sel, 2, nelt);
5200               if (!can_vec_perm_const_p (mode, indices))
5201                 {
5202                   if (dump_enabled_p ())
5203                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5204                                      "shuffle of 3 loads is not supported by"
5205                                      " target\n");
5206                   return false;
5207                 }
5208               for (i = 0, j = 0; i < nelt; i++)
5209                 if (3 * i + k < 2 * nelt)
5210                   sel[i] = i;
5211                 else
5212                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5213               indices.new_vector (sel, 2, nelt);
5214               if (!can_vec_perm_const_p (mode, indices))
5215                 {
5216                   if (dump_enabled_p ())
5217                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218                                      "shuffle of 3 loads is not supported by"
5219                                      " target\n");
5220                   return false;
5221                 }
5222             }
5223           return true;
5224         }
5225       else
5226         {
5227           /* If length is not equal to 3 then only power of 2 is supported.  */
5228           gcc_assert (pow2p_hwi (count));
5229
5230           /* The encoding has a single stepped pattern.  */
5231           vec_perm_builder sel (nelt, 1, 3);
5232           sel.quick_grow (3);
5233           for (i = 0; i < 3; i++)
5234             sel[i] = i * 2;
5235           vec_perm_indices indices (sel, 2, nelt);
5236           if (can_vec_perm_const_p (mode, indices))
5237             {
5238               for (i = 0; i < 3; i++)
5239                 sel[i] = i * 2 + 1;
5240               indices.new_vector (sel, 2, nelt);
5241               if (can_vec_perm_const_p (mode, indices))
5242                 return true;
5243             }
5244         }
5245     }
5246
5247   if (dump_enabled_p ())
5248     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5249                      "extract even/odd not supported by target\n");
5250   return false;
5251 }
5252
5253 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5254    type VECTYPE.  */
5255
5256 bool
5257 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5258 {
5259   return vect_lanes_optab_supported_p ("vec_load_lanes",
5260                                        vec_load_lanes_optab,
5261                                        vectype, count);
5262 }
5263
5264 /* Function vect_permute_load_chain.
5265
5266    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5267    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5268    the input data correctly.  Return the final references for loads in
5269    RESULT_CHAIN.
5270
5271    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5272    The input is 4 vectors each containing 8 elements. We assign a number to each
5273    element, the input sequence is:
5274
5275    1st vec:   0  1  2  3  4  5  6  7
5276    2nd vec:   8  9 10 11 12 13 14 15
5277    3rd vec:  16 17 18 19 20 21 22 23
5278    4th vec:  24 25 26 27 28 29 30 31
5279
5280    The output sequence should be:
5281
5282    1st vec:  0 4  8 12 16 20 24 28
5283    2nd vec:  1 5  9 13 17 21 25 29
5284    3rd vec:  2 6 10 14 18 22 26 30
5285    4th vec:  3 7 11 15 19 23 27 31
5286
5287    i.e., the first output vector should contain the first elements of each
5288    interleaving group, etc.
5289
5290    We use extract_even/odd instructions to create such output.  The input of
5291    each extract_even/odd operation is two vectors
5292    1st vec    2nd vec
5293    0 1 2 3    4 5 6 7
5294
5295    and the output is the vector of extracted even/odd elements.  The output of
5296    extract_even will be:   0 2 4 6
5297    and of extract_odd:     1 3 5 7
5298
5299
5300    The permutation is done in log LENGTH stages.  In each stage extract_even
5301    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5302    their order.  In our example,
5303
5304    E1: extract_even (1st vec, 2nd vec)
5305    E2: extract_odd (1st vec, 2nd vec)
5306    E3: extract_even (3rd vec, 4th vec)
5307    E4: extract_odd (3rd vec, 4th vec)
5308
5309    The output for the first stage will be:
5310
5311    E1:  0  2  4  6  8 10 12 14
5312    E2:  1  3  5  7  9 11 13 15
5313    E3: 16 18 20 22 24 26 28 30
5314    E4: 17 19 21 23 25 27 29 31
5315
5316    In order to proceed and create the correct sequence for the next stage (or
5317    for the correct output, if the second stage is the last one, as in our
5318    example), we first put the output of extract_even operation and then the
5319    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5320    The input for the second stage is:
5321
5322    1st vec (E1):  0  2  4  6  8 10 12 14
5323    2nd vec (E3): 16 18 20 22 24 26 28 30
5324    3rd vec (E2):  1  3  5  7  9 11 13 15
5325    4th vec (E4): 17 19 21 23 25 27 29 31
5326
5327    The output of the second stage:
5328
5329    E1: 0 4  8 12 16 20 24 28
5330    E2: 2 6 10 14 18 22 26 30
5331    E3: 1 5  9 13 17 21 25 29
5332    E4: 3 7 11 15 19 23 27 31
5333
5334    And RESULT_CHAIN after reordering:
5335
5336    1st vec (E1):  0 4  8 12 16 20 24 28
5337    2nd vec (E3):  1 5  9 13 17 21 25 29
5338    3rd vec (E2):  2 6 10 14 18 22 26 30
5339    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5340
5341 static void
5342 vect_permute_load_chain (vec<tree> dr_chain,
5343                          unsigned int length,
5344                          gimple *stmt,
5345                          gimple_stmt_iterator *gsi,
5346                          vec<tree> *result_chain)
5347 {
5348   tree data_ref, first_vect, second_vect;
5349   tree perm_mask_even, perm_mask_odd;
5350   tree perm3_mask_low, perm3_mask_high;
5351   gimple *perm_stmt;
5352   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5353   unsigned int i, j, log_length = exact_log2 (length);
5354   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5355
5356   result_chain->quick_grow (length);
5357   memcpy (result_chain->address (), dr_chain.address (),
5358           length * sizeof (tree));
5359
5360   if (length == 3)
5361     {
5362       unsigned int k;
5363
5364       vec_perm_builder sel (nelt, nelt, 1);
5365       sel.quick_grow (nelt);
5366       vec_perm_indices indices;
5367       for (k = 0; k < 3; k++)
5368         {
5369           for (i = 0; i < nelt; i++)
5370             if (3 * i + k < 2 * nelt)
5371               sel[i] = 3 * i + k;
5372             else
5373               sel[i] = 0;
5374           indices.new_vector (sel, 2, nelt);
5375           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5376
5377           for (i = 0, j = 0; i < nelt; i++)
5378             if (3 * i + k < 2 * nelt)
5379               sel[i] = i;
5380             else
5381               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5382           indices.new_vector (sel, 2, nelt);
5383           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5384
5385           first_vect = dr_chain[0];
5386           second_vect = dr_chain[1];
5387
5388           /* Create interleaving stmt (low part of):
5389              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5390                                                              ...}>  */
5391           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5392           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5393                                            second_vect, perm3_mask_low);
5394           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5395
5396           /* Create interleaving stmt (high part of):
5397              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5398                                                               ...}>  */
5399           first_vect = data_ref;
5400           second_vect = dr_chain[2];
5401           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5402           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5403                                            second_vect, perm3_mask_high);
5404           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405           (*result_chain)[k] = data_ref;
5406         }
5407     }
5408   else
5409     {
5410       /* If length is not equal to 3 then only power of 2 is supported.  */
5411       gcc_assert (pow2p_hwi (length));
5412
5413       /* The encoding has a single stepped pattern.  */
5414       vec_perm_builder sel (nelt, 1, 3);
5415       sel.quick_grow (3);
5416       for (i = 0; i < 3; ++i)
5417         sel[i] = i * 2;
5418       vec_perm_indices indices (sel, 2, nelt);
5419       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5420
5421       for (i = 0; i < 3; ++i)
5422         sel[i] = i * 2 + 1;
5423       indices.new_vector (sel, 2, nelt);
5424       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5425
5426       for (i = 0; i < log_length; i++)
5427         {
5428           for (j = 0; j < length; j += 2)
5429             {
5430               first_vect = dr_chain[j];
5431               second_vect = dr_chain[j+1];
5432
5433               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5434               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5435               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5436                                                first_vect, second_vect,
5437                                                perm_mask_even);
5438               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5439               (*result_chain)[j/2] = data_ref;
5440
5441               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5442               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5443               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5444                                                first_vect, second_vect,
5445                                                perm_mask_odd);
5446               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5447               (*result_chain)[j/2+length/2] = data_ref;
5448             }
5449           memcpy (dr_chain.address (), result_chain->address (),
5450                   length * sizeof (tree));
5451         }
5452     }
5453 }
5454
5455 /* Function vect_shift_permute_load_chain.
5456
5457    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5458    sequence of stmts to reorder the input data accordingly.
5459    Return the final references for loads in RESULT_CHAIN.
5460    Return true if successed, false otherwise.
5461
5462    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5463    The input is 3 vectors each containing 8 elements.  We assign a
5464    number to each element, the input sequence is:
5465
5466    1st vec:   0  1  2  3  4  5  6  7
5467    2nd vec:   8  9 10 11 12 13 14 15
5468    3rd vec:  16 17 18 19 20 21 22 23
5469
5470    The output sequence should be:
5471
5472    1st vec:  0 3 6  9 12 15 18 21
5473    2nd vec:  1 4 7 10 13 16 19 22
5474    3rd vec:  2 5 8 11 14 17 20 23
5475
5476    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5477
5478    First we shuffle all 3 vectors to get correct elements order:
5479
5480    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5481    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5482    3rd vec:  (16 19 22) (17 20 23) (18 21)
5483
5484    Next we unite and shift vector 3 times:
5485
5486    1st step:
5487      shift right by 6 the concatenation of:
5488      "1st vec" and  "2nd vec"
5489        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5490      "2nd vec" and  "3rd vec"
5491        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5492      "3rd vec" and  "1st vec"
5493        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5494                              | New vectors                   |
5495
5496      So that now new vectors are:
5497
5498      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5499      2nd vec:  (10 13) (16 19 22) (17 20 23)
5500      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5501
5502    2nd step:
5503      shift right by 5 the concatenation of:
5504      "1st vec" and  "3rd vec"
5505        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5506      "2nd vec" and  "1st vec"
5507        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5508      "3rd vec" and  "2nd vec"
5509        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5510                           | New vectors                   |
5511
5512      So that now new vectors are:
5513
5514      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5515      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5516      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5517
5518    3rd step:
5519      shift right by 5 the concatenation of:
5520      "1st vec" and  "1st vec"
5521        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5522      shift right by 3 the concatenation of:
5523      "2nd vec" and  "2nd vec"
5524                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5525                           | New vectors                   |
5526
5527      So that now all vectors are READY:
5528      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5529      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5530      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5531
5532    This algorithm is faster than one in vect_permute_load_chain if:
5533      1.  "shift of a concatination" is faster than general permutation.
5534          This is usually so.
5535      2.  The TARGET machine can't execute vector instructions in parallel.
5536          This is because each step of the algorithm depends on previous.
5537          The algorithm in vect_permute_load_chain is much more parallel.
5538
5539    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5540 */
5541
5542 static bool
5543 vect_shift_permute_load_chain (vec<tree> dr_chain,
5544                                unsigned int length,
5545                                gimple *stmt,
5546                                gimple_stmt_iterator *gsi,
5547                                vec<tree> *result_chain)
5548 {
5549   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5550   tree perm2_mask1, perm2_mask2, perm3_mask;
5551   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5552   gimple *perm_stmt;
5553
5554   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5555   unsigned int i;
5556   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5557   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5558   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5559
5560   unsigned HOST_WIDE_INT vf;
5561   if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5562     /* Not supported for variable-length vectors.  */
5563     return false;
5564
5565   vec_perm_builder sel (nelt, nelt, 1);
5566   sel.quick_grow (nelt);
5567
5568   result_chain->quick_grow (length);
5569   memcpy (result_chain->address (), dr_chain.address (),
5570           length * sizeof (tree));
5571
5572   if (pow2p_hwi (length) && vf > 4)
5573     {
5574       unsigned int j, log_length = exact_log2 (length);
5575       for (i = 0; i < nelt / 2; ++i)
5576         sel[i] = i * 2;
5577       for (i = 0; i < nelt / 2; ++i)
5578         sel[nelt / 2 + i] = i * 2 + 1;
5579       vec_perm_indices indices (sel, 2, nelt);
5580       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5581         {
5582           if (dump_enabled_p ())
5583             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5584                              "shuffle of 2 fields structure is not \
5585                               supported by target\n");
5586           return false;
5587         }
5588       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5589
5590       for (i = 0; i < nelt / 2; ++i)
5591         sel[i] = i * 2 + 1;
5592       for (i = 0; i < nelt / 2; ++i)
5593         sel[nelt / 2 + i] = i * 2;
5594       indices.new_vector (sel, 2, nelt);
5595       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5596         {
5597           if (dump_enabled_p ())
5598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5599                              "shuffle of 2 fields structure is not \
5600                               supported by target\n");
5601           return false;
5602         }
5603       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5604
5605       /* Generating permutation constant to shift all elements.
5606          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5607       for (i = 0; i < nelt; i++)
5608         sel[i] = nelt / 2 + i;
5609       indices.new_vector (sel, 2, nelt);
5610       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5611         {
5612           if (dump_enabled_p ())
5613             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614                              "shift permutation is not supported by target\n");
5615           return false;
5616         }
5617       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5618
5619       /* Generating permutation constant to select vector from 2.
5620          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5621       for (i = 0; i < nelt / 2; i++)
5622         sel[i] = i;
5623       for (i = nelt / 2; i < nelt; i++)
5624         sel[i] = nelt + i;
5625       indices.new_vector (sel, 2, nelt);
5626       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5627         {
5628           if (dump_enabled_p ())
5629             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630                              "select is not supported by target\n");
5631           return false;
5632         }
5633       select_mask = vect_gen_perm_mask_checked (vectype, indices);
5634
5635       for (i = 0; i < log_length; i++)
5636         {
5637           for (j = 0; j < length; j += 2)
5638             {
5639               first_vect = dr_chain[j];
5640               second_vect = dr_chain[j + 1];
5641
5642               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5643               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5644                                                first_vect, first_vect,
5645                                                perm2_mask1);
5646               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5647               vect[0] = data_ref;
5648
5649               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5650               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5651                                                second_vect, second_vect,
5652                                                perm2_mask2);
5653               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5654               vect[1] = data_ref;
5655
5656               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5657               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5658                                                vect[0], vect[1], shift1_mask);
5659               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5660               (*result_chain)[j/2 + length/2] = data_ref;
5661
5662               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5663               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5664                                                vect[0], vect[1], select_mask);
5665               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5666               (*result_chain)[j/2] = data_ref;
5667             }
5668           memcpy (dr_chain.address (), result_chain->address (),
5669                   length * sizeof (tree));
5670         }
5671       return true;
5672     }
5673   if (length == 3 && vf > 2)
5674     {
5675       unsigned int k = 0, l = 0;
5676
5677       /* Generating permutation constant to get all elements in rigth order.
5678          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5679       for (i = 0; i < nelt; i++)
5680         {
5681           if (3 * k + (l % 3) >= nelt)
5682             {
5683               k = 0;
5684               l += (3 - (nelt % 3));
5685             }
5686           sel[i] = 3 * k + (l % 3);
5687           k++;
5688         }
5689       vec_perm_indices indices (sel, 2, nelt);
5690       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5691         {
5692           if (dump_enabled_p ())
5693             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694                              "shuffle of 3 fields structure is not \
5695                               supported by target\n");
5696           return false;
5697         }
5698       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5699
5700       /* Generating permutation constant to shift all elements.
5701          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5702       for (i = 0; i < nelt; i++)
5703         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5704       indices.new_vector (sel, 2, nelt);
5705       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5706         {
5707           if (dump_enabled_p ())
5708             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5709                              "shift permutation is not supported by target\n");
5710           return false;
5711         }
5712       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5713
5714       /* Generating permutation constant to shift all elements.
5715          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5716       for (i = 0; i < nelt; i++)
5717         sel[i] = 2 * (nelt / 3) + 1 + i;
5718       indices.new_vector (sel, 2, nelt);
5719       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5720         {
5721           if (dump_enabled_p ())
5722             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723                              "shift permutation is not supported by target\n");
5724           return false;
5725         }
5726       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5727
5728       /* Generating permutation constant to shift all elements.
5729          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5730       for (i = 0; i < nelt; i++)
5731         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5732       indices.new_vector (sel, 2, nelt);
5733       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5734         {
5735           if (dump_enabled_p ())
5736             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737                              "shift permutation is not supported by target\n");
5738           return false;
5739         }
5740       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5741
5742       /* Generating permutation constant to shift all elements.
5743          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5744       for (i = 0; i < nelt; i++)
5745         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5746       indices.new_vector (sel, 2, nelt);
5747       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5748         {
5749           if (dump_enabled_p ())
5750             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5751                              "shift permutation is not supported by target\n");
5752           return false;
5753         }
5754       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5755
5756       for (k = 0; k < 3; k++)
5757         {
5758           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5759           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5760                                            dr_chain[k], dr_chain[k],
5761                                            perm3_mask);
5762           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5763           vect[k] = data_ref;
5764         }
5765
5766       for (k = 0; k < 3; k++)
5767         {
5768           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5769           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5770                                            vect[k % 3], vect[(k + 1) % 3],
5771                                            shift1_mask);
5772           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5773           vect_shift[k] = data_ref;
5774         }
5775
5776       for (k = 0; k < 3; k++)
5777         {
5778           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5779           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5780                                            vect_shift[(4 - k) % 3],
5781                                            vect_shift[(3 - k) % 3],
5782                                            shift2_mask);
5783           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5784           vect[k] = data_ref;
5785         }
5786
5787       (*result_chain)[3 - (nelt % 3)] = vect[2];
5788
5789       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5790       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5791                                        vect[0], shift3_mask);
5792       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5793       (*result_chain)[nelt % 3] = data_ref;
5794
5795       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5796       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5797                                        vect[1], shift4_mask);
5798       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5799       (*result_chain)[0] = data_ref;
5800       return true;
5801     }
5802   return false;
5803 }
5804
5805 /* Function vect_transform_grouped_load.
5806
5807    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5808    to perform their permutation and ascribe the result vectorized statements to
5809    the scalar statements.
5810 */
5811
5812 void
5813 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5814                              gimple_stmt_iterator *gsi)
5815 {
5816   machine_mode mode;
5817   vec<tree> result_chain = vNULL;
5818
5819   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5820      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5821      vectors, that are ready for vector computation.  */
5822   result_chain.create (size);
5823
5824   /* If reassociation width for vector type is 2 or greater target machine can
5825      execute 2 or more vector instructions in parallel.  Otherwise try to
5826      get chain for loads group using vect_shift_permute_load_chain.  */
5827   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5828   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5829       || pow2p_hwi (size)
5830       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5831                                          gsi, &result_chain))
5832     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5833   vect_record_grouped_load_vectors (stmt, result_chain);
5834   result_chain.release ();
5835 }
5836
5837 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5838    generated as part of the vectorization of STMT.  Assign the statement
5839    for each vector to the associated scalar statement.  */
5840
5841 void
5842 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5843 {
5844   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5845   gimple *next_stmt, *new_stmt;
5846   unsigned int i, gap_count;
5847   tree tmp_data_ref;
5848
5849   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5850      Since we scan the chain starting from it's first node, their order
5851      corresponds the order of data-refs in RESULT_CHAIN.  */
5852   next_stmt = first_stmt;
5853   gap_count = 1;
5854   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5855     {
5856       if (!next_stmt)
5857         break;
5858
5859       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5860        code elimination pass later.  No need to check for the first stmt in
5861        the group, since it always exists.
5862        GROUP_GAP is the number of steps in elements from the previous
5863        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5864        correspond to the gaps.  */
5865       if (next_stmt != first_stmt
5866           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5867       {
5868         gap_count++;
5869         continue;
5870       }
5871
5872       while (next_stmt)
5873         {
5874           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5875           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5876              copies, and we put the new vector statement in the first available
5877              RELATED_STMT.  */
5878           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5879             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5880           else
5881             {
5882               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5883                 {
5884                   gimple *prev_stmt =
5885                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5886                   gimple *rel_stmt =
5887                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5888                   while (rel_stmt)
5889                     {
5890                       prev_stmt = rel_stmt;
5891                       rel_stmt =
5892                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5893                     }
5894
5895                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5896                     new_stmt;
5897                 }
5898             }
5899
5900           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5901           gap_count = 1;
5902           /* If NEXT_STMT accesses the same DR as the previous statement,
5903              put the same TMP_DATA_REF as its vectorized statement; otherwise
5904              get the next data-ref from RESULT_CHAIN.  */
5905           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5906             break;
5907         }
5908     }
5909 }
5910
5911 /* Function vect_force_dr_alignment_p.
5912
5913    Returns whether the alignment of a DECL can be forced to be aligned
5914    on ALIGNMENT bit boundary.  */
5915
5916 bool
5917 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5918 {
5919   if (!VAR_P (decl))
5920     return false;
5921
5922   if (decl_in_symtab_p (decl)
5923       && !symtab_node::get (decl)->can_increase_alignment_p ())
5924     return false;
5925
5926   if (TREE_STATIC (decl))
5927     return (alignment <= MAX_OFILE_ALIGNMENT);
5928   else
5929     return (alignment <= MAX_STACK_ALIGNMENT);
5930 }
5931
5932
5933 /* Return whether the data reference DR is supported with respect to its
5934    alignment.
5935    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5936    it is aligned, i.e., check if it is possible to vectorize it with different
5937    alignment.  */
5938
5939 enum dr_alignment_support
5940 vect_supportable_dr_alignment (struct data_reference *dr,
5941                                bool check_aligned_accesses)
5942 {
5943   gimple *stmt = DR_STMT (dr);
5944   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5945   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5946   machine_mode mode = TYPE_MODE (vectype);
5947   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5948   struct loop *vect_loop = NULL;
5949   bool nested_in_vect_loop = false;
5950
5951   if (aligned_access_p (dr) && !check_aligned_accesses)
5952     return dr_aligned;
5953
5954   /* For now assume all conditional loads/stores support unaligned
5955      access without any special code.  */
5956   if (is_gimple_call (stmt)
5957       && gimple_call_internal_p (stmt)
5958       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5959           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5960     return dr_unaligned_supported;
5961
5962   if (loop_vinfo)
5963     {
5964       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5965       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5966     }
5967
5968   /* Possibly unaligned access.  */
5969
5970   /* We can choose between using the implicit realignment scheme (generating
5971      a misaligned_move stmt) and the explicit realignment scheme (generating
5972      aligned loads with a REALIGN_LOAD).  There are two variants to the
5973      explicit realignment scheme: optimized, and unoptimized.
5974      We can optimize the realignment only if the step between consecutive
5975      vector loads is equal to the vector size.  Since the vector memory
5976      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5977      is guaranteed that the misalignment amount remains the same throughout the
5978      execution of the vectorized loop.  Therefore, we can create the
5979      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5980      at the loop preheader.
5981
5982      However, in the case of outer-loop vectorization, when vectorizing a
5983      memory access in the inner-loop nested within the LOOP that is now being
5984      vectorized, while it is guaranteed that the misalignment of the
5985      vectorized memory access will remain the same in different outer-loop
5986      iterations, it is *not* guaranteed that is will remain the same throughout
5987      the execution of the inner-loop.  This is because the inner-loop advances
5988      with the original scalar step (and not in steps of VS).  If the inner-loop
5989      step happens to be a multiple of VS, then the misalignment remains fixed
5990      and we can use the optimized realignment scheme.  For example:
5991
5992       for (i=0; i<N; i++)
5993         for (j=0; j<M; j++)
5994           s += a[i+j];
5995
5996      When vectorizing the i-loop in the above example, the step between
5997      consecutive vector loads is 1, and so the misalignment does not remain
5998      fixed across the execution of the inner-loop, and the realignment cannot
5999      be optimized (as illustrated in the following pseudo vectorized loop):
6000
6001       for (i=0; i<N; i+=4)
6002         for (j=0; j<M; j++){
6003           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6004                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6005                          // (assuming that we start from an aligned address).
6006           }
6007
6008      We therefore have to use the unoptimized realignment scheme:
6009
6010       for (i=0; i<N; i+=4)
6011           for (j=k; j<M; j+=4)
6012           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6013                            // that the misalignment of the initial address is
6014                            // 0).
6015
6016      The loop can then be vectorized as follows:
6017
6018       for (k=0; k<4; k++){
6019         rt = get_realignment_token (&vp[k]);
6020         for (i=0; i<N; i+=4){
6021           v1 = vp[i+k];
6022           for (j=k; j<M; j+=4){
6023             v2 = vp[i+j+VS-1];
6024             va = REALIGN_LOAD <v1,v2,rt>;
6025             vs += va;
6026             v1 = v2;
6027           }
6028         }
6029     } */
6030
6031   if (DR_IS_READ (dr))
6032     {
6033       bool is_packed = false;
6034       tree type = (TREE_TYPE (DR_REF (dr)));
6035
6036       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6037           && (!targetm.vectorize.builtin_mask_for_load
6038               || targetm.vectorize.builtin_mask_for_load ()))
6039         {
6040           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6041
6042           /* If we are doing SLP then the accesses need not have the
6043              same alignment, instead it depends on the SLP group size.  */
6044           if (loop_vinfo
6045               && STMT_SLP_TYPE (stmt_info)
6046               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6047                               * GROUP_SIZE (vinfo_for_stmt
6048                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6049                               TYPE_VECTOR_SUBPARTS (vectype)))
6050             ;
6051           else if (!loop_vinfo
6052                    || (nested_in_vect_loop
6053                        && (TREE_INT_CST_LOW (DR_STEP (dr))
6054                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6055             return dr_explicit_realign;
6056           else
6057             return dr_explicit_realign_optimized;
6058         }
6059       if (!known_alignment_for_access_p (dr))
6060         is_packed = not_size_aligned (DR_REF (dr));
6061
6062       if (targetm.vectorize.support_vector_misalignment
6063             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6064         /* Can't software pipeline the loads, but can at least do them.  */
6065         return dr_unaligned_supported;
6066     }
6067   else
6068     {
6069       bool is_packed = false;
6070       tree type = (TREE_TYPE (DR_REF (dr)));
6071
6072       if (!known_alignment_for_access_p (dr))
6073         is_packed = not_size_aligned (DR_REF (dr));
6074
6075      if (targetm.vectorize.support_vector_misalignment
6076            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6077        return dr_unaligned_supported;
6078     }
6079
6080   /* Unsupported.  */
6081   return dr_unaligned_unsupported;
6082 }