Give the target more control over ARRAY_TYPE modes
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob: 759c1e30edffaad705392d744ac0a926f0f95aa4
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s["
77 HOST_WIDE_INT_PRINT_DEC "]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
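/* As an illustration of the access pattern this check enables (example
   only, with made-up variable names): an interleaved group of three
   loads such as

     for (i = 0; i < n; i++)
       gray[i] = (rgb[3 * i] + rgb[3 * i + 1] + rgb[3 * i + 2]) / 3;

   can be vectorized with a single load-lanes operation (an LD3-style
   load on AArch64, for instance) provided the target supplies an array
   mode for three vectors and a handler for the corresponding
   vec_load_lanes optab, which is what the function above verifies.  */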
101 /* Return the smallest scalar part of STMT.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation, we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 if (is_gimple_assign (stmt)
133 && (gimple_assign_cast_p (stmt)
134 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
135 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
136 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
138 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
140 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
141 if (rhs < lhs)
142 scalar_type = rhs_type;
145 *lhs_size_unit = lhs;
146 *rhs_size_unit = rhs;
147 return scalar_type;
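/* Example (illustrative only): in a loop such as

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement's result type is int, but the smallest scalar type
   involved is short, and it is the short accesses that determine the
   vectorization factor.  The function above returns short here because
   the rhs type of the widening cast is smaller than the lhs type.  */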
151 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
152 tested at run-time. Return TRUE if DDR was successfully inserted.
153 Return false if versioning is not supported. */
155 static bool
156 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
158 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
160 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
161 return false;
163 if (!runtime_alias_check_p (ddr, loop,
164 optimize_loop_nest_for_speed_p (loop)))
165 return false;
167 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
168 return true;
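/* A sketch of the versioning condition such a DDR eventually leads to
   (illustrative only; the real check is built from segment lengths
   elsewhere): for

     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;

   the vectorized loop is guarded by a test of roughly the form

     if (p + n <= q || q + n <= p)
       ... vectorized loop ...
     else
       ... scalar fallback ...

   and PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS limits how many such
   checks may be accumulated.  */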
172 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
173 a DDR_COULD_BE_INDEPENDENT_P relation DDR that has a known set of dependence
174 distances.  These distances are conservatively correct but they don't
175 reflect a guaranteed dependence.
177 Return true if this function does all the work necessary to avoid
178 an alias or false if the caller should use the dependence distances
179 to limit the vectorization factor in the usual way. LOOP_DEPTH is
180 the depth of the loop described by LOOP_VINFO and the other arguments
181 are as for vect_analyze_data_ref_dependence. */
183 static bool
184 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
185 loop_vec_info loop_vinfo,
186 int loop_depth, unsigned int *max_vf)
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189 lambda_vector dist_v;
190 unsigned int i;
191 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
193 int dist = dist_v[loop_depth];
194 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
196 /* If the user asserted safelen >= DIST consecutive iterations
197 can be executed concurrently, assume independence.
199 ??? An alternative would be to add the alias check even
200 in this case, and vectorize the fallback loop with the
201 maximum VF set to safelen. However, if the user has
202 explicitly given a length, it's less likely that that
203 would be a win. */
204 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
206 if ((unsigned int) loop->safelen < *max_vf)
207 *max_vf = loop->safelen;
208 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
209 continue;
212 /* For dependence distances of 2 or more, we have the option
213 of limiting VF or checking for an alias at runtime.
214 Prefer to check at runtime if we can, to avoid limiting
215 the VF unnecessarily when the bases are in fact independent.
217 Note that the alias checks will be removed if the VF ends up
218 being small enough. */
219 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
222 return true;
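/* Illustration of the safelen handling above (example only): if the
   user wrote

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i - 4] * c;

   then the dependence distance of 4 is within the asserted safe
   length, so it is ignored for correctness purposes and *max_vf is
   merely capped at 8.  */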
226 /* Function vect_analyze_data_ref_dependence.
228 Return TRUE if there is (or might be) a dependence between memory-reference
229 DRA and memory-reference DRB.  Return FALSE if versioning for alias will
230 check the dependence at run-time.  Adjust *MAX_VF according to
231 the data dependence.
233 static bool
234 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
235 loop_vec_info loop_vinfo,
236 unsigned int *max_vf)
238 unsigned int i;
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 struct data_reference *dra = DDR_A (ddr);
241 struct data_reference *drb = DDR_B (ddr);
242 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
243 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
244 lambda_vector dist_v;
245 unsigned int loop_depth;
247 /* In loop analysis all data references should be vectorizable. */
248 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
249 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
250 gcc_unreachable ();
252 /* Independent data accesses. */
253 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
254 return false;
256 if (dra == drb
257 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
258 return false;
260 /* We do not have to consider dependences between accesses that belong
261 to the same group. */
262 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
263 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
264 return false;
266 /* Even if we have an anti-dependence then, as the vectorized loop covers at
267 least two scalar iterations, there is always also a true dependence.
268 As the vectorizer does not re-order loads and stores we can ignore
269 the anti-dependence if TBAA can disambiguate both DRs similar to the
270 case with known negative distance anti-dependences (positive
271 distance anti-dependences would violate TBAA constraints). */
272 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
273 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
274 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
275 get_alias_set (DR_REF (drb))))
276 return false;
278 /* Unknown data dependence. */
279 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
281 /* If user asserted safelen consecutive iterations can be
282 executed concurrently, assume independence. */
283 if (loop->safelen >= 2)
285 if ((unsigned int) loop->safelen < *max_vf)
286 *max_vf = loop->safelen;
287 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
288 return false;
291 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
292 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
294 if (dump_enabled_p ())
296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
297 "versioning for alias not supported for: "
298 "can't determine dependence between ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (dra));
301 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (drb));
304 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
306 return true;
309 if (dump_enabled_p ())
311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
312 "versioning for alias required: "
313 "can't determine dependence between ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (dra));
316 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (drb));
319 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
322 /* Add to list of ddrs that need to be tested at run-time. */
323 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
326 /* Known data dependence. */
327 if (DDR_NUM_DIST_VECTS (ddr) == 0)
329 /* If user asserted safelen consecutive iterations can be
330 executed concurrently, assume independence. */
331 if (loop->safelen >= 2)
333 if ((unsigned int) loop->safelen < *max_vf)
334 *max_vf = loop->safelen;
335 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
336 return false;
339 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
340 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
345 "versioning for alias not supported for: "
346 "bad dist vector for ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (dra));
349 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
350 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
351 DR_REF (drb));
352 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
354 return true;
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
360 "versioning for alias required: "
361 "bad dist vector for ");
362 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
365 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
367 /* Add to list of ddrs that need to be tested at run-time. */
368 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
371 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
373 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
374 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
375 loop_depth, max_vf))
376 return false;
378 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
380 int dist = dist_v[loop_depth];
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_NOTE, vect_location,
384 "dependence distance = %d.\n", dist);
386 if (dist == 0)
388 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE, vect_location,
391 "dependence distance == 0 between ");
392 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
393 dump_printf (MSG_NOTE, " and ");
394 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
395 dump_printf (MSG_NOTE, "\n");
398 /* When we perform grouped accesses and perform implicit CSE
399 by detecting equal accesses and doing disambiguation with
400 runtime alias tests like for
401 .. = a[i];
402 .. = a[i+1];
403 a[i] = ..;
404 a[i+1] = ..;
405 *p = ..;
406 .. = a[i];
407 .. = a[i+1];
408 where we will end up loading { a[i], a[i+1] } once, make
409 sure that inserting group loads before the first load and
410 stores after the last store will do the right thing.
411 Similar for groups like
412 a[i] = ...;
413 ... = a[i];
414 a[i+1] = ...;
415 where loads from the group interleave with the store. */
416 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
417 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
419 gimple *earlier_stmt;
420 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
421 if (DR_IS_WRITE
422 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
426 "READ_WRITE dependence in interleaving."
427 "\n");
428 return true;
432 continue;
435 if (dist > 0 && DDR_REVERSED_P (ddr))
437 /* If DDR_REVERSED_P the order of the data-refs in DDR was
438 reversed (to make distance vector positive), and the actual
439 distance is negative. */
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "dependence distance negative.\n");
443 /* Record a negative dependence distance to later limit the
444 amount of stmt copying / unrolling we can perform.
445 Only need to handle read-after-write dependence. */
446 if (DR_IS_READ (drb)
447 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
448 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
449 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
450 continue;
453 unsigned int abs_dist = abs (dist);
454 if (abs_dist >= 2 && abs_dist < *max_vf)
456 /* The dependence distance requires reduction of the maximal
457 vectorization factor. */
458 *max_vf = abs (dist);
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_NOTE, vect_location,
461 "adjusting maximal vectorization factor to %i\n",
462 *max_vf);
465 if (abs_dist >= *max_vf)
467 /* Dependence distance does not create dependence, as far as
468 vectorization is concerned, in this case. */
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance >= VF.\n");
472 continue;
475 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "not vectorized, possible dependence "
479 "between data-refs ");
480 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
481 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
482 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
486 return true;
489 return false;
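/* Worked example for the distance handling above (illustrative only):

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   has a dependence distance of 3, so *max_vf is clamped to 3; with a
   vectorization factor of at most 3, each vector iteration only reads
   values stored in earlier vector iterations.  A distance greater than
   or equal to the final VF imposes no constraint at all.  */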
492 /* Function vect_analyze_data_ref_dependences.
494 Examine all the data references in the loop, and make sure there do not
495 exist any data dependences between them. Set *MAX_VF according to
496 the maximum vectorization factor the data dependences allow. */
498 bool
499 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
500 unsigned int *max_vf)
502 unsigned int i;
503 struct data_dependence_relation *ddr;
505 if (dump_enabled_p ())
506 dump_printf_loc (MSG_NOTE, vect_location,
507 "=== vect_analyze_data_ref_dependences ===\n");
509 LOOP_VINFO_DDRS (loop_vinfo)
510 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
511 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
512 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
513 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
514 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
515 &LOOP_VINFO_DDRS (loop_vinfo),
516 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
517 return false;
519 /* For epilogues we either have no aliases or alias versioning
520 was applied to the original loop.  Therefore we may just get max_vf
521 using the VF of the original loop.
522 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
523 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
524 else
525 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
526 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
527 return false;
529 return true;
533 /* Function vect_slp_analyze_data_ref_dependence.
535 Return TRUE if there is (or might be) a dependence between memory-reference
536 DRA and memory-reference DRB in DDR.  Unlike the loop variant above, there
537 is no runtime alias versioning here and no *MAX_VF to adjust, so any
538 possible dependence results in TRUE.
540 static bool
541 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
543 struct data_reference *dra = DDR_A (ddr);
544 struct data_reference *drb = DDR_B (ddr);
546 /* We need to check dependences of statements marked as unvectorizable
547 as well, they still can prohibit vectorization. */
549 /* Independent data accesses. */
550 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
551 return false;
553 if (dra == drb)
554 return false;
556 /* Read-read is OK. */
557 if (DR_IS_READ (dra) && DR_IS_READ (drb))
558 return false;
560 /* If dra and drb are part of the same interleaving chain consider
561 them independent. */
562 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
563 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
564 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
565 return false;
567 /* Unknown data dependence. */
568 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
570 if (dump_enabled_p ())
572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
573 "can't determine dependence between ");
574 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
575 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
576 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
577 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
580 else if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "determined dependence between ");
584 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
585 dump_printf (MSG_NOTE, " and ");
586 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
587 dump_printf (MSG_NOTE, "\n");
590 return true;
594 /* Analyze dependences involved in the transform of SLP NODE. STORES
595 contains the vector of scalar stores of this instance if we are
596 disambiguating the loads. */
598 static bool
599 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
600 vec<gimple *> stores, gimple *last_store)
602 /* This walks over all stmts involved in the SLP load/store done
603 in NODE verifying we can sink them up to the last stmt in the
604 group. */
605 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
606 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
608 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
609 if (access == last_access)
610 continue;
611 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
612 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
613 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
615 gimple *stmt = gsi_stmt (gsi);
616 if (! gimple_vuse (stmt)
617 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
618 continue;
620 /* If we couldn't record a (single) data reference for this
621 stmt we have to give up. */
622 /* ??? Here and below if dependence analysis fails we can resort
623 to the alias oracle which can handle more kinds of stmts. */
624 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
625 if (!dr_b)
626 return false;
628 bool dependent = false;
629 /* If we run into a store of this same instance (we've just
630 marked those) then delay dependence checking until we run
631 into the last store because this is where it will have
632 been sunk to (and we verify if we can do that as well). */
633 if (gimple_visited_p (stmt))
635 if (stmt != last_store)
636 continue;
637 unsigned i;
638 gimple *store;
639 FOR_EACH_VEC_ELT (stores, i, store)
641 data_reference *store_dr
642 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
643 ddr_p ddr = initialize_data_dependence_relation
644 (dr_a, store_dr, vNULL);
645 dependent = vect_slp_analyze_data_ref_dependence (ddr);
646 free_dependence_relation (ddr);
647 if (dependent)
648 break;
651 else
653 ddr_p ddr = initialize_data_dependence_relation (dr_a,
654 dr_b, vNULL);
655 dependent = vect_slp_analyze_data_ref_dependence (ddr);
656 free_dependence_relation (ddr);
658 if (dependent)
659 return false;
662 return true;
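/* Illustration of the sinking check above (example only): for a
   basic-block SLP group such as

     a[0] = x0;
     t = *p;         <-- intervening load
     a[1] = x1;

   the two stores are emitted together at the position of the last one,
   so the walk above must show that *p cannot read from a[0]; if that
   dependence cannot be ruled out, the SLP instance is rejected.  */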
666 /* Function vect_slp_analyze_instance_dependence.
668 Examine all the data references in the SLP instance and make sure there
669 are no data dependences that would prevent its loads and stores from
670 being grouped at their vectorized positions.
672 bool
673 vect_slp_analyze_instance_dependence (slp_instance instance)
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "=== vect_slp_analyze_instance_dependence ===\n");
679 /* The stores of this instance are at the root of the SLP tree. */
680 slp_tree store = SLP_INSTANCE_TREE (instance);
681 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
682 store = NULL;
684 /* Verify we can sink stores to the vectorized stmt insert location. */
685 gimple *last_store = NULL;
686 if (store)
688 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
689 return false;
691 /* Mark stores in this instance and remember the last one. */
692 last_store = vect_find_last_scalar_stmt_in_slp (store);
693 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
694 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
697 bool res = true;
699 /* Verify we can sink loads to the vectorized stmt insert location,
700 special-casing stores of this instance. */
701 slp_tree load;
702 unsigned int i;
703 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
704 if (! vect_slp_analyze_node_dependences (instance, load,
705 store
706 ? SLP_TREE_SCALAR_STMTS (store)
707 : vNULL, last_store))
709 res = false;
710 break;
713 /* Unset the visited flag. */
714 if (store)
715 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
716 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
718 return res;
721 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
722 the statement that contains DRB, which is useful for recording in the
723 dump file. */
725 static void
726 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
727 innermost_loop_behavior *drb)
729 bool existed;
730 innermost_loop_behavior *&entry
731 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
732 if (!existed || entry->base_alignment < drb->base_alignment)
734 entry = drb;
735 if (dump_enabled_p ())
737 dump_printf_loc (MSG_NOTE, vect_location,
738 "recording new base alignment for ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
740 dump_printf (MSG_NOTE, "\n");
741 dump_printf_loc (MSG_NOTE, vect_location,
742 " alignment: %d\n", drb->base_alignment);
743 dump_printf_loc (MSG_NOTE, vect_location,
744 " misalignment: %d\n", drb->base_misalignment);
745 dump_printf_loc (MSG_NOTE, vect_location,
746 " based on: ");
747 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
752 /* If the region we're going to vectorize is reached, all unconditional
753 data references occur at least once. We can therefore pool the base
754 alignment guarantees from each unconditional reference. Do this by
755 going through all the data references in VINFO and checking whether
756 the containing statement makes the reference unconditionally. If so,
757 record the alignment of the base address in VINFO so that it can be
758 used for all other references with the same base. */
760 void
761 vect_record_base_alignments (vec_info *vinfo)
763 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
764 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
765 data_reference *dr;
766 unsigned int i;
767 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
768 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
770 gimple *stmt = DR_STMT (dr);
771 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
773 /* If DR is nested in the loop that is being vectorized, we can also
774 record the alignment of the base wrt the outer loop. */
775 if (loop && nested_in_vect_loop_p (loop, stmt))
777 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
778 vect_record_base_alignment
779 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
784 /* Return the target alignment for the vectorized form of DR. */
786 static unsigned int
787 vect_calculate_target_alignment (struct data_reference *dr)
789 gimple *stmt = DR_STMT (dr);
790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
791 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
792 return targetm.vectorize.preferred_vector_alignment (vectype);
795 /* Function vect_compute_data_ref_alignment
797 Compute the misalignment of the data reference DR.
799 Output:
800 1. If during the misalignment computation it is found that the data reference
801 cannot be vectorized then false is returned.
802 2. DR_MISALIGNMENT (DR) is defined.
804 FOR NOW: No analysis is actually performed. Misalignment is calculated
805 only for trivial cases. TODO. */
807 bool
808 vect_compute_data_ref_alignment (struct data_reference *dr)
810 gimple *stmt = DR_STMT (dr);
811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
812 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
814 struct loop *loop = NULL;
815 tree ref = DR_REF (dr);
816 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
818 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "vect_compute_data_ref_alignment:\n");
822 if (loop_vinfo)
823 loop = LOOP_VINFO_LOOP (loop_vinfo);
825 /* Initialize misalignment to unknown. */
826 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
828 innermost_loop_behavior *drb = vect_dr_behavior (dr);
829 bool step_preserves_misalignment_p;
831 unsigned HOST_WIDE_INT vector_alignment
832 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
833 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
835 /* No step for BB vectorization. */
836 if (!loop)
838 gcc_assert (integer_zerop (drb->step));
839 step_preserves_misalignment_p = true;
842 /* In case the dataref is in an inner-loop of the loop that is being
843 vectorized (LOOP), we use the base and misalignment information
844 relative to the outer-loop (LOOP). This is ok only if the misalignment
845 stays the same throughout the execution of the inner-loop, which is why
846 we have to check that the stride of the dataref in the inner-loop is a
847 multiple of the vector alignment.
848 else if (nested_in_vect_loop_p (loop, stmt))
850 step_preserves_misalignment_p
851 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
853 if (dump_enabled_p ())
855 if (step_preserves_misalignment_p)
856 dump_printf_loc (MSG_NOTE, vect_location,
857 "inner step divides the vector alignment.\n");
858 else
859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
860 "inner step doesn't divide the vector"
861 " alignment.\n");
865 /* Similarly we can only use base and misalignment information relative to
866 an innermost loop if the misalignment stays the same throughout the
867 execution of the loop.  As above, this is the case if the stride of the
868 dataref times the vectorization factor is a multiple of the alignment.
869 else
871 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
872 step_preserves_misalignment_p
873 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
875 if (!step_preserves_misalignment_p && dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "step doesn't divide the vector alignment.\n");
880 unsigned int base_alignment = drb->base_alignment;
881 unsigned int base_misalignment = drb->base_misalignment;
883 /* Calculate the maximum of the pooled base address alignment and the
884 alignment that we can compute for DR itself. */
885 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
886 if (entry && base_alignment < (*entry)->base_alignment)
888 base_alignment = (*entry)->base_alignment;
889 base_misalignment = (*entry)->base_misalignment;
892 if (drb->offset_alignment < vector_alignment
893 || !step_preserves_misalignment_p
894 /* We need to know whether the step wrt the vectorized loop is
895 negative when computing the starting misalignment below. */
896 || TREE_CODE (drb->step) != INTEGER_CST)
898 if (dump_enabled_p ())
900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
901 "Unknown alignment for access: ");
902 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
903 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
905 return true;
908 if (base_alignment < vector_alignment)
910 tree base = drb->base_address;
911 if (TREE_CODE (base) == ADDR_EXPR)
912 base = TREE_OPERAND (base, 0);
913 if (!vect_can_force_dr_alignment_p (base,
914 vector_alignment * BITS_PER_UNIT))
916 if (dump_enabled_p ())
918 dump_printf_loc (MSG_NOTE, vect_location,
919 "can't force alignment of ref: ");
920 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
921 dump_printf (MSG_NOTE, "\n");
923 return true;
926 /* Force the alignment of the decl.
927 NOTE: This is the only change to the code we make during
928 the analysis phase, before deciding to vectorize the loop. */
929 if (dump_enabled_p ())
931 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
932 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
933 dump_printf (MSG_NOTE, "\n");
936 DR_VECT_AUX (dr)->base_decl = base;
937 DR_VECT_AUX (dr)->base_misaligned = true;
938 base_misalignment = 0;
940 poly_int64 misalignment
941 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
943 /* If this is a backward running DR then first access in the larger
944 vectype actually is N-1 elements before the address in the DR.
945 Adjust misalign accordingly. */
946 if (tree_int_cst_sgn (drb->step) < 0)
947 /* PLUS because STEP is negative. */
948 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
949 * TREE_INT_CST_LOW (drb->step));
951 unsigned int const_misalignment;
952 if (!known_misalignment (misalignment, vector_alignment,
953 &const_misalignment))
955 if (dump_enabled_p ())
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "Non-constant misalignment for access: ");
959 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
960 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
962 return true;
965 SET_DR_MISALIGNMENT (dr, const_misalignment);
967 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
971 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
972 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
975 return true;
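/* Worked example (illustrative only): with a 16-byte target alignment,
   an access a[i + 1] whose base `int a[]' is known to start on a
   16-byte boundary has base_misalignment 0 and drb->init 4, so the
   misalignment above evaluates to 4 and DR_MISALIGNMENT is set to
   4 bytes.  If the base alignment were insufficient and the base is a
   suitable decl, the code above instead forces the decl's alignment
   and treats the base as aligned.  */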
978 /* Function vect_update_misalignment_for_peel.
979 Sets DR's misalignment
980 - to 0 if it has the same alignment as DR_PEEL,
981 - to the misalignment computed using NPEEL if DR's misalignment is known,
982 - to -1 (unknown) otherwise.
984 DR - the data reference whose misalignment is to be adjusted.
985 DR_PEEL - the data reference whose misalignment is being made
986 zero in the vector loop by the peel.
987 NPEEL - the number of iterations in the peel loop if the misalignment
988 of DR_PEEL is known at compile time. */
990 static void
991 vect_update_misalignment_for_peel (struct data_reference *dr,
992 struct data_reference *dr_peel, int npeel)
994 unsigned int i;
995 vec<dr_p> same_aligned_drs;
996 struct data_reference *current_dr;
997 int dr_size = vect_get_scalar_dr_size (dr);
998 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
999 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1000 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1002 /* For interleaved data accesses the step in the loop must be multiplied by
1003 the size of the interleaving group. */
1004 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1005 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1006 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1007 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1009 /* It can be assumed that the data refs with the same alignment as dr_peel
1010 are aligned in the vector loop. */
1011 same_aligned_drs
1012 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1013 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1015 if (current_dr != dr)
1016 continue;
1017 gcc_assert (!known_alignment_for_access_p (dr)
1018 || !known_alignment_for_access_p (dr_peel)
1019 || (DR_MISALIGNMENT (dr) / dr_size
1020 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1021 SET_DR_MISALIGNMENT (dr, 0);
1022 return;
1025 if (known_alignment_for_access_p (dr)
1026 && known_alignment_for_access_p (dr_peel))
1028 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1029 int misal = DR_MISALIGNMENT (dr);
1030 misal += negative ? -npeel * dr_size : npeel * dr_size;
1031 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1032 SET_DR_MISALIGNMENT (dr, misal);
1033 return;
1036 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1038 "to unknown (-1).\n");
1039 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
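/* Worked example of the known-alignment update above (illustrative
   only): with a positive step, element size 4, NPEEL 2 and a 16-byte
   DR_TARGET_ALIGNMENT, a data-ref whose misalignment was 12 becomes

     (12 + 2 * 4) & (16 - 1) == 4

   while a data-ref recorded as having the same alignment as DR_PEEL is
   simply set to 0.  */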
1043 /* Function verify_data_ref_alignment
1045 Return TRUE if DR can be handled with respect to alignment. */
1047 static bool
1048 verify_data_ref_alignment (data_reference_p dr)
1050 enum dr_alignment_support supportable_dr_alignment
1051 = vect_supportable_dr_alignment (dr, false);
1052 if (!supportable_dr_alignment)
1054 if (dump_enabled_p ())
1056 if (DR_IS_READ (dr))
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "not vectorized: unsupported unaligned load.");
1059 else
1060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 "not vectorized: unsupported unaligned "
1062 "store.");
1064 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1065 DR_REF (dr));
1066 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1068 return false;
1071 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE, vect_location,
1073 "Vectorizing an unaligned access.\n");
1075 return true;
1078 /* Function vect_verify_datarefs_alignment
1080 Return TRUE if all data references in the loop can be
1081 handled with respect to alignment. */
1083 bool
1084 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1086 vec<data_reference_p> datarefs = vinfo->datarefs;
1087 struct data_reference *dr;
1088 unsigned int i;
1090 FOR_EACH_VEC_ELT (datarefs, i, dr)
1092 gimple *stmt = DR_STMT (dr);
1093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1095 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1096 continue;
1098 /* For interleaving, only the alignment of the first access matters. */
1099 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1100 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1101 continue;
1103 /* Strided accesses perform only component accesses, alignment is
1104 irrelevant for them. */
1105 if (STMT_VINFO_STRIDED_P (stmt_info)
1106 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1107 continue;
1109 if (! verify_data_ref_alignment (dr))
1110 return false;
1113 return true;
1116 /* Given a memory reference EXP, return whether its alignment is less
1117 than its size. */
1119 static bool
1120 not_size_aligned (tree exp)
1122 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1123 return true;
1125 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1126 > get_object_alignment (exp));
1129 /* Function vector_alignment_reachable_p
1131 Return true if vector alignment for DR is reachable by peeling
1132 a few loop iterations. Return false otherwise. */
1134 static bool
1135 vector_alignment_reachable_p (struct data_reference *dr)
1137 gimple *stmt = DR_STMT (dr);
1138 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1139 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1141 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1143 /* For interleaved access we peel only if number of iterations in
1144 the prolog loop ({VF - misalignment}), is a multiple of the
1145 number of the interleaved accesses. */
1146 int elem_size, mis_in_elements;
1148 /* FORNOW: handle only known alignment. */
1149 if (!known_alignment_for_access_p (dr))
1150 return false;
1152 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1153 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1154 elem_size = vector_element_size (vector_size, nelements);
1155 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1157 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1158 return false;
1161 /* If misalignment is known at the compile time then allow peeling
1162 only if natural alignment is reachable through peeling. */
1163 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1165 HOST_WIDE_INT elmsize =
1166 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1167 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_NOTE, vect_location,
1170 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1171 dump_printf (MSG_NOTE,
1172 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1174 if (DR_MISALIGNMENT (dr) % elmsize)
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "data size does not divide the misalignment.\n");
1179 return false;
1183 if (!known_alignment_for_access_p (dr))
1185 tree type = TREE_TYPE (DR_REF (dr));
1186 bool is_packed = not_size_aligned (DR_REF (dr));
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "Unknown misalignment, %snaturally aligned\n",
1190 is_packed ? "not " : "");
1191 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1194 return true;
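/* Example of the divisibility check above (illustrative only): a
   4-byte element with a known misalignment of 6 bytes can never be
   brought to the alignment boundary by peeling whole scalar
   iterations, because each peeled iteration changes the misalignment
   by a multiple of 4.  A misalignment of 8 bytes, in contrast, is
   removed by peeling two iterations when the target alignment is
   16 bytes.  */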
1198 /* Calculate the cost of the memory access represented by DR. */
1200 static void
1201 vect_get_data_access_cost (struct data_reference *dr,
1202 unsigned int *inside_cost,
1203 unsigned int *outside_cost,
1204 stmt_vector_for_cost *body_cost_vec)
1206 gimple *stmt = DR_STMT (dr);
1207 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1208 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1209 int ncopies;
1211 if (PURE_SLP_STMT (stmt_info))
1212 ncopies = 1;
1213 else
1214 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1216 if (DR_IS_READ (dr))
1217 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1218 NULL, body_cost_vec, false);
1219 else
1220 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_NOTE, vect_location,
1224 "vect_get_data_access_cost: inside_cost = %d, "
1225 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1229 typedef struct _vect_peel_info
1231 struct data_reference *dr;
1232 int npeel;
1233 unsigned int count;
1234 } *vect_peel_info;
1236 typedef struct _vect_peel_extended_info
1238 struct _vect_peel_info peel_info;
1239 unsigned int inside_cost;
1240 unsigned int outside_cost;
1241 } *vect_peel_extended_info;
1244 /* Peeling hashtable helpers. */
1246 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1248 static inline hashval_t hash (const _vect_peel_info *);
1249 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1252 inline hashval_t
1253 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1255 return (hashval_t) peel_info->npeel;
1258 inline bool
1259 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1261 return (a->npeel == b->npeel);
1265 /* Insert DR into peeling hash table with NPEEL as key. */
1267 static void
1268 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1269 loop_vec_info loop_vinfo, struct data_reference *dr,
1270 int npeel)
1272 struct _vect_peel_info elem, *slot;
1273 _vect_peel_info **new_slot;
1274 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1276 elem.npeel = npeel;
1277 slot = peeling_htab->find (&elem);
1278 if (slot)
1279 slot->count++;
1280 else
1282 slot = XNEW (struct _vect_peel_info);
1283 slot->npeel = npeel;
1284 slot->dr = dr;
1285 slot->count = 1;
1286 new_slot = peeling_htab->find_slot (slot, INSERT);
1287 *new_slot = slot;
1290 if (!supportable_dr_alignment
1291 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1292 slot->count += VECT_MAX_COST;
1296 /* Traverse peeling hash table to find peeling option that aligns maximum
1297 number of data accesses. */
1300 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1301 _vect_peel_extended_info *max)
1303 vect_peel_info elem = *slot;
1305 if (elem->count > max->peel_info.count
1306 || (elem->count == max->peel_info.count
1307 && max->peel_info.npeel > elem->npeel))
1309 max->peel_info.npeel = elem->npeel;
1310 max->peel_info.count = elem->count;
1311 max->peel_info.dr = elem->dr;
1314 return 1;
1317 /* Get the costs of peeling NPEEL iterations checking data access costs
1318 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1319 misalignment will be zero after peeling. */
1321 static void
1322 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1323 struct data_reference *dr0,
1324 unsigned int *inside_cost,
1325 unsigned int *outside_cost,
1326 stmt_vector_for_cost *body_cost_vec,
1327 unsigned int npeel,
1328 bool unknown_misalignment)
1330 unsigned i;
1331 data_reference *dr;
1333 FOR_EACH_VEC_ELT (datarefs, i, dr)
1335 gimple *stmt = DR_STMT (dr);
1336 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1338 continue;
1340 /* For interleaving, only the alignment of the first access
1341 matters. */
1342 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1344 continue;
1346 /* Strided accesses perform only component accesses, alignment is
1347 irrelevant for them. */
1348 if (STMT_VINFO_STRIDED_P (stmt_info)
1349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350 continue;
1352 int save_misalignment;
1353 save_misalignment = DR_MISALIGNMENT (dr);
1354 if (npeel == 0)
1356 else if (unknown_misalignment && dr == dr0)
1357 SET_DR_MISALIGNMENT (dr, 0);
1358 else
1359 vect_update_misalignment_for_peel (dr, dr0, npeel);
1360 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1361 body_cost_vec);
1362 SET_DR_MISALIGNMENT (dr, save_misalignment);
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367 Find the one with the lowest cost. */
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1371 _vect_peel_extended_info *min)
1373 vect_peel_info elem = *slot;
1374 int dummy;
1375 unsigned int inside_cost = 0, outside_cost = 0;
1376 gimple *stmt = DR_STMT (elem->dr);
1377 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1378 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1379 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1380 epilogue_cost_vec;
1382 prologue_cost_vec.create (2);
1383 body_cost_vec.create (2);
1384 epilogue_cost_vec.create (2);
1386 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1387 elem->dr, &inside_cost, &outside_cost,
1388 &body_cost_vec, elem->npeel, false);
1390 body_cost_vec.release ();
1392 outside_cost += vect_get_known_peeling_cost
1393 (loop_vinfo, elem->npeel, &dummy,
1394 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1395 &prologue_cost_vec, &epilogue_cost_vec);
1397 /* Prologue and epilogue costs are added to the target model later.
1398 These costs depend only on the scalar iteration cost, the
1399 number of peeling iterations finally chosen, and the number of
1400 misaligned statements. So discard the information found here. */
1401 prologue_cost_vec.release ();
1402 epilogue_cost_vec.release ();
1404 if (inside_cost < min->inside_cost
1405 || (inside_cost == min->inside_cost
1406 && outside_cost < min->outside_cost))
1408 min->inside_cost = inside_cost;
1409 min->outside_cost = outside_cost;
1410 min->peel_info.dr = elem->dr;
1411 min->peel_info.npeel = elem->npeel;
1412 min->peel_info.count = elem->count;
1415 return 1;
1419 /* Choose best peeling option by traversing peeling hash table and either
1420 choosing an option with the lowest cost (if cost model is enabled) or the
1421 option that aligns as many accesses as possible. */
1423 static struct _vect_peel_extended_info
1424 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1425 loop_vec_info loop_vinfo)
1427 struct _vect_peel_extended_info res;
1429 res.peel_info.dr = NULL;
1431 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1433 res.inside_cost = INT_MAX;
1434 res.outside_cost = INT_MAX;
1435 peeling_htab->traverse <_vect_peel_extended_info *,
1436 vect_peeling_hash_get_lowest_cost> (&res);
1438 else
1440 res.peel_info.count = 0;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_most_frequent> (&res);
1443 res.inside_cost = 0;
1444 res.outside_cost = 0;
1447 return res;
1450 /* Return true if peeling by NPEEL iterations is supported.  */
1452 static bool
1453 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1454 unsigned npeel)
1456 unsigned i;
1457 struct data_reference *dr = NULL;
1458 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1459 gimple *stmt;
1460 stmt_vec_info stmt_info;
1461 enum dr_alignment_support supportable_dr_alignment;
1463 /* Ensure that all data refs can be vectorized after the peel. */
1464 FOR_EACH_VEC_ELT (datarefs, i, dr)
1466 int save_misalignment;
1468 if (dr == dr0)
1469 continue;
1471 stmt = DR_STMT (dr);
1472 stmt_info = vinfo_for_stmt (stmt);
1473 /* For interleaving, only the alignment of the first access
1474 matters. */
1475 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1476 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1477 continue;
1479 /* Strided accesses perform only component accesses, alignment is
1480 irrelevant for them. */
1481 if (STMT_VINFO_STRIDED_P (stmt_info)
1482 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1483 continue;
1485 save_misalignment = DR_MISALIGNMENT (dr);
1486 vect_update_misalignment_for_peel (dr, dr0, npeel);
1487 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1488 SET_DR_MISALIGNMENT (dr, save_misalignment);
1490 if (!supportable_dr_alignment)
1491 return false;
1494 return true;
1497 /* Function vect_enhance_data_refs_alignment
1499 This pass will use loop versioning and loop peeling in order to enhance
1500 the alignment of data references in the loop.
1502 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1503 original loop is to be vectorized. Any other loops that are created by
1504 the transformations performed in this pass are not supposed to be
1505 vectorized. This restriction will be relaxed.
1507 This pass will require a cost model to guide it whether to apply peeling
1508 or versioning or a combination of the two. For example, the scheme that
1509 Intel uses when given a loop with several memory accesses is as follows:
1510 choose one memory access ('p') whose alignment you want to force by doing
1511 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1512 other accesses are not necessarily aligned, or (2) use loop versioning to
1513 generate one loop in which all accesses are aligned, and another loop in
1514 which only 'p' is necessarily aligned.
1516 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1517 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1518 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1520 Devising a cost model is the most critical aspect of this work. It will
1521 guide us on which access to peel for, whether to use loop versioning, how
1522 many versions to create, etc. The cost model will probably consist of
1523 generic considerations as well as target specific considerations (on
1524 powerpc for example, misaligned stores are more painful than misaligned
1525 loads).
1527 Here are the general steps involved in alignment enhancements:
1529 -- original loop, before alignment analysis:
1530 for (i=0; i<N; i++){
1531 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1532 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1535 -- After vect_compute_data_refs_alignment:
1536 for (i=0; i<N; i++){
1537 x = q[i]; # DR_MISALIGNMENT(q) = 3
1538 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1541 -- Possibility 1: we do loop versioning:
1542 if (p is aligned) {
1543 for (i=0; i<N; i++){ # loop 1A
1544 x = q[i]; # DR_MISALIGNMENT(q) = 3
1545 p[i] = y; # DR_MISALIGNMENT(p) = 0
1548 else {
1549 for (i=0; i<N; i++){ # loop 1B
1550 x = q[i]; # DR_MISALIGNMENT(q) = 3
1551 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1555 -- Possibility 2: we do loop peeling:
1556 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1557 x = q[i];
1558 p[i] = y;
1560 for (i = 3; i < N; i++){ # loop 2A
1561 x = q[i]; # DR_MISALIGNMENT(q) = 0
1562 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1565 -- Possibility 3: combination of loop peeling and versioning:
1566 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1567 x = q[i];
1568 p[i] = y;
1570 if (p is aligned) {
1571 for (i = 3; i<N; i++){ # loop 3A
1572 x = q[i]; # DR_MISALIGNMENT(q) = 0
1573 p[i] = y; # DR_MISALIGNMENT(p) = 0
1576 else {
1577 for (i = 3; i<N; i++){ # loop 3B
1578 x = q[i]; # DR_MISALIGNMENT(q) = 0
1579 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1583 These loops are later passed to loop_transform to be vectorized. The
1584 vectorizer will use the alignment information to guide the transformation
1585 (whether to generate regular loads/stores, or with special handling for
1586 misalignment). */
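/* A worked instance of the peel count computed in the function below
   (illustrative only): for an access with DR_MISALIGNMENT 4, element
   size 4, a 16-byte DR_TARGET_ALIGNMENT and a positive step, the
   immediate peeling amount is

     npeel_tmp = (-4 & (16 - 1)) / 4 == 3

   i.e. three scalar iterations align the access, as in "Possibility 2"
   above; further candidates are generated by repeatedly adding
   target_align / dr_size.  */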
1588 bool
1589 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1591 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1592 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1593 enum dr_alignment_support supportable_dr_alignment;
1594 struct data_reference *dr0 = NULL, *first_store = NULL;
1595 struct data_reference *dr;
1596 unsigned int i, j;
1597 bool do_peeling = false;
1598 bool do_versioning = false;
1599 bool stat;
1600 gimple *stmt;
1601 stmt_vec_info stmt_info;
1602 unsigned int npeel = 0;
1603 bool one_misalignment_known = false;
1604 bool one_misalignment_unknown = false;
1605 bool one_dr_unsupportable = false;
1606 struct data_reference *unsupportable_dr = NULL;
1607 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1608 unsigned possible_npeel_number = 1;
1609 tree vectype;
1610 unsigned int mis, same_align_drs_max = 0;
1611 hash_table<peel_info_hasher> peeling_htab (1);
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "=== vect_enhance_data_refs_alignment ===\n");
1617 /* Reset data so we can safely be called multiple times. */
1618 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1619 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1621 /* While cost model enhancements are expected in the future, the high level
1622 view of the code at this time is as follows:
1624 A) If there is a misaligned access then see if peeling to align
1625 this access can make all data references satisfy
1626 vect_supportable_dr_alignment. If so, update data structures
1627 as needed and return true.
1629 B) If peeling wasn't possible and there is a data reference with an
1630 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1631 then see if loop versioning checks can be used to make all data
1632 references satisfy vect_supportable_dr_alignment. If so, update
1633 data structures as needed and return true.
1635 C) If neither peeling nor versioning were successful then return false if
1636 any data reference does not satisfy vect_supportable_dr_alignment.
1638 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1640 Note, Possibility 3 above (which is peeling and versioning together) is not
1641 being done at this time. */
1643 /* (1) Peeling to force alignment. */
1645 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1646 Considerations:
1647 + How many accesses will become aligned due to the peeling
1648 - How many accesses will become unaligned due to the peeling,
1649 and the cost of misaligned accesses.
1650 - The cost of peeling (the extra runtime checks, the increase
1651 in code size). */
1653 FOR_EACH_VEC_ELT (datarefs, i, dr)
1655 stmt = DR_STMT (dr);
1656 stmt_info = vinfo_for_stmt (stmt);
1658 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1659 continue;
1661 /* For interleaving, only the alignment of the first access
1662 matters. */
1663 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1664 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1665 continue;
1667 /* For invariant accesses there is nothing to enhance. */
1668 if (integer_zerop (DR_STEP (dr)))
1669 continue;
1671 /* Strided accesses perform only component accesses, alignment is
1672 irrelevant for them. */
1673 if (STMT_VINFO_STRIDED_P (stmt_info)
1674 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1675 continue;
1677 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1678 do_peeling = vector_alignment_reachable_p (dr);
1679 if (do_peeling)
1681 if (known_alignment_for_access_p (dr))
1683 unsigned int npeel_tmp = 0;
1684 bool negative = tree_int_cst_compare (DR_STEP (dr),
1685 size_zero_node) < 0;
1687 vectype = STMT_VINFO_VECTYPE (stmt_info);
1688 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1689 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1690 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1691 if (DR_MISALIGNMENT (dr) != 0)
1692 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1694 /* For multiple types, it is possible that the bigger type access
1695 will have more than one peeling option. E.g., a loop with two
1696 types: one of size (vector size / 4), and the other one of
1697 size (vector size / 8).  The vectorization factor will be 8.  If both
1698 accesses are misaligned by 3, the first one needs one scalar
1699 iteration to be aligned, and the second one needs 5. But the
1700 first one will be aligned also by peeling 5 scalar
1701 iterations, and in that case both accesses will be aligned.
1702 Hence, except for the immediate peeling amount, we also want
1703 to try to add full vector size, while we don't exceed
1704 vectorization factor.
1705 We do this automatically for cost model, since we calculate
1706 cost for every peeling option. */
1707 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1709 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1710 ? vf * GROUP_SIZE (stmt_info) : vf);
1711 possible_npeel_number
1712 = vect_get_num_vectors (nscalars, vectype);
1714 /* NPEEL_TMP is 0 when there is no misalignment, but also
1715 allow peeling NELEMENTS. */
1716 if (DR_MISALIGNMENT (dr) == 0)
1717 possible_npeel_number++;
1720 /* Save info about DR in the hash table. Also include peeling
1721 amounts according to the explanation above. */
1722 for (j = 0; j < possible_npeel_number; j++)
1724 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1725 dr, npeel_tmp);
1726 npeel_tmp += target_align / dr_size;
1729 one_misalignment_known = true;
1731 else
1733 /* If we don't know any misalignment values, we prefer
1734 peeling for the data-ref that has the maximum number of data-refs
1735 with the same alignment, unless the target prefers to align
1736 stores over loads.  */
1737 unsigned same_align_drs
1738 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1739 if (!dr0
1740 || same_align_drs_max < same_align_drs)
1742 same_align_drs_max = same_align_drs;
1743 dr0 = dr;
1745 /* For data-refs with the same number of related
1746 accesses prefer the one where the misalign
1747 computation will be invariant in the outermost loop. */
1748 else if (same_align_drs_max == same_align_drs)
1750 struct loop *ivloop0, *ivloop;
1751 ivloop0 = outermost_invariant_loop_for_expr
1752 (loop, DR_BASE_ADDRESS (dr0));
1753 ivloop = outermost_invariant_loop_for_expr
1754 (loop, DR_BASE_ADDRESS (dr));
1755 if ((ivloop && !ivloop0)
1756 || (ivloop && ivloop0
1757 && flow_loop_nested_p (ivloop, ivloop0)))
1758 dr0 = dr;
1761 one_misalignment_unknown = true;
1763 /* Check for data refs with unsupportable alignment that
1764 can be peeled. */
1765 if (!supportable_dr_alignment)
1767 one_dr_unsupportable = true;
1768 unsupportable_dr = dr;
1771 if (!first_store && DR_IS_WRITE (dr))
1772 first_store = dr;
1775 else
1777 if (!aligned_access_p (dr))
1779 if (dump_enabled_p ())
1780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1781 "vector alignment may not be reachable\n");
1782 break;
1787 /* Check if we can possibly peel the loop. */
1788 if (!vect_can_advance_ivs_p (loop_vinfo)
1789 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1790 || loop->inner)
1791 do_peeling = false;
1793 struct _vect_peel_extended_info peel_for_known_alignment;
1794 struct _vect_peel_extended_info peel_for_unknown_alignment;
1795 struct _vect_peel_extended_info best_peel;
1797 peel_for_unknown_alignment.inside_cost = INT_MAX;
1798 peel_for_unknown_alignment.outside_cost = INT_MAX;
1799 peel_for_unknown_alignment.peel_info.count = 0;
1801 if (do_peeling
1802 && one_misalignment_unknown)
1804 /* Check whether the target prefers stores over loads, i.e., whether
1805 misaligned stores are more expensive than misaligned loads (taking
1806 drs with the same alignment into account). */
1807 unsigned int load_inside_cost = 0;
1808 unsigned int load_outside_cost = 0;
1809 unsigned int store_inside_cost = 0;
1810 unsigned int store_outside_cost = 0;
1811 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1813 stmt_vector_for_cost dummy;
1814 dummy.create (2);
1815 vect_get_peeling_costs_all_drs (datarefs, dr0,
1816 &load_inside_cost,
1817 &load_outside_cost,
1818 &dummy, estimated_npeels, true);
1819 dummy.release ();
1821 if (first_store)
1823 dummy.create (2);
1824 vect_get_peeling_costs_all_drs (datarefs, first_store,
1825 &store_inside_cost,
1826 &store_outside_cost,
1827 &dummy, estimated_npeels, true);
1828 dummy.release ();
1830 else
1832 store_inside_cost = INT_MAX;
1833 store_outside_cost = INT_MAX;
1836 if (load_inside_cost > store_inside_cost
1837 || (load_inside_cost == store_inside_cost
1838 && load_outside_cost > store_outside_cost))
1840 dr0 = first_store;
1841 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1842 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1844 else
1846 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1847 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1850 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1851 prologue_cost_vec.create (2);
1852 epilogue_cost_vec.create (2);
1854 int dummy2;
1855 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1856 (loop_vinfo, estimated_npeels, &dummy2,
1857 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1858 &prologue_cost_vec, &epilogue_cost_vec);
1860 prologue_cost_vec.release ();
1861 epilogue_cost_vec.release ();
1863 peel_for_unknown_alignment.peel_info.count = 1
1864 + STMT_VINFO_SAME_ALIGN_REFS
1865 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1868 peel_for_unknown_alignment.peel_info.npeel = 0;
1869 peel_for_unknown_alignment.peel_info.dr = dr0;
1871 best_peel = peel_for_unknown_alignment;
1873 peel_for_known_alignment.inside_cost = INT_MAX;
1874 peel_for_known_alignment.outside_cost = INT_MAX;
1875 peel_for_known_alignment.peel_info.count = 0;
1876 peel_for_known_alignment.peel_info.dr = NULL;
1878 if (do_peeling && one_misalignment_known)
1880 /* Peeling is possible, but there is no data access that is supported
1881 only when aligned. So we try to choose the best possible peeling from
1882 the hash table. */
1883 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1884 (&peeling_htab, loop_vinfo);
1887 /* Compare costs of peeling for known and unknown alignment. */
1888 if (peel_for_known_alignment.peel_info.dr != NULL
1889 && peel_for_unknown_alignment.inside_cost
1890 >= peel_for_known_alignment.inside_cost)
1892 best_peel = peel_for_known_alignment;
1894 /* If the best peeling for known alignment has NPEEL == 0, perform no
1895 peeling at all except if there is an unsupportable dr that we can
1896 align. */
1897 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1898 do_peeling = false;
1901 /* If there is an unsupportable data ref, prefer this over all choices so far
1902 since we'd have to discard a chosen peeling except when it accidentally
1903 aligned the unsupportable data ref. */
1904 if (one_dr_unsupportable)
1905 dr0 = unsupportable_dr;
1906 else if (do_peeling)
1908 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1909 TODO: Use nopeel_outside_cost or get rid of it? */
1910 unsigned nopeel_inside_cost = 0;
1911 unsigned nopeel_outside_cost = 0;
1913 stmt_vector_for_cost dummy;
1914 dummy.create (2);
1915 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1916 &nopeel_outside_cost, &dummy, 0, false);
1917 dummy.release ();
1919 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1920 costs will be recorded. */
1921 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1922 prologue_cost_vec.create (2);
1923 epilogue_cost_vec.create (2);
1925 int dummy2;
1926 nopeel_outside_cost += vect_get_known_peeling_cost
1927 (loop_vinfo, 0, &dummy2,
1928 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1929 &prologue_cost_vec, &epilogue_cost_vec);
1931 prologue_cost_vec.release ();
1932 epilogue_cost_vec.release ();
1934 npeel = best_peel.peel_info.npeel;
1935 dr0 = best_peel.peel_info.dr;
1937 /* If not peeling at all is no more expensive than the best peeling
1938 we have found so far, don't perform any peeling. */
1939 if (nopeel_inside_cost <= best_peel.inside_cost)
1940 do_peeling = false;
1943 if (do_peeling)
1945 stmt = DR_STMT (dr0);
1946 stmt_info = vinfo_for_stmt (stmt);
1947 vectype = STMT_VINFO_VECTYPE (stmt_info);
1949 if (known_alignment_for_access_p (dr0))
1951 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1952 size_zero_node) < 0;
1953 if (!npeel)
1955 /* Since it's known at compile time, compute the number of
1956 iterations in the peeled loop (the peeling factor) for use in
1957 updating DR_MISALIGNMENT values. The peeling factor is the
1958 vectorization factor minus the misalignment as an element
1959 count. */
1960 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1961 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1962 npeel = ((mis & (target_align - 1))
1963 / vect_get_scalar_dr_size (dr0));
1966 /* For interleaved data access every iteration accesses all the
1967 members of the group, therefore we divide the number of iterations
1968 by the group size. */
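/* For instance (hypothetical numbers): npeel == 6 elements with
   GROUP_SIZE == 2 becomes 3 peeled scalar iterations, since each scalar
   iteration accesses both members of the group.  */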
1969 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1970 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1971 npeel /= GROUP_SIZE (stmt_info);
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "Try peeling by %d\n", npeel);
1978 /* Ensure that all datarefs can be vectorized after the peel. */
1979 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1980 do_peeling = false;
1982 /* Check if all datarefs are supportable and log. */
1983 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1985 stat = vect_verify_datarefs_alignment (loop_vinfo);
1986 if (!stat)
1987 do_peeling = false;
1988 else
1989 return stat;
1992 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1993 if (do_peeling)
1995 unsigned max_allowed_peel
1996 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1997 if (max_allowed_peel != (unsigned)-1)
1999 unsigned max_peel = npeel;
2000 if (max_peel == 0)
2002 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2003 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2005 if (max_peel > max_allowed_peel)
2007 do_peeling = false;
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Disable peeling, max peels reached: %d\n", max_peel);
2015 /* Cost model #2 - if peeling may result in a remaining loop not
2016 iterating enough to be vectorized then do not peel. Since this
2017 is a cost heuristic rather than a correctness decision, use the
2018 most likely runtime value for variable vectorization factors. */
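/* E.g. (hypothetical numbers): with an assumed VF of 4, npeel == 3 and
   only 6 known iterations, 6 < 4 + 3, so the loop remaining after peeling
   could not fill even one vector iteration and peeling is abandoned.  */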
2019 if (do_peeling
2020 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2022 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2023 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2024 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2025 < assumed_vf + max_peel)
2026 do_peeling = false;
2029 if (do_peeling)
2031 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2032 If the misalignment of DR_i is identical to that of dr0 then set
2033 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2034 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2035 by the peeling factor times the element size of DR_i (MOD the
2036 vectorization factor times the size). Otherwise, the
2037 misalignment of DR_i must be set to unknown. */
2038 FOR_EACH_VEC_ELT (datarefs, i, dr)
2039 if (dr != dr0)
2041 /* Strided accesses perform only component accesses, alignment
2042 is irrelevant for them. */
2043 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2044 if (STMT_VINFO_STRIDED_P (stmt_info)
2045 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2046 continue;
2048 vect_update_misalignment_for_peel (dr, dr0, npeel);
2051 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2052 if (npeel)
2053 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2054 else
2055 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2056 = DR_MISALIGNMENT (dr0);
2057 SET_DR_MISALIGNMENT (dr0, 0);
2058 if (dump_enabled_p ())
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "Alignment of access forced using peeling.\n");
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "Peeling for alignment will be applied.\n");
2066 /* The inside-loop cost will be accounted for in vectorizable_load
2067 and vectorizable_store correctly with adjusted alignments.
2068 Drop the body_cst_vec on the floor here. */
2069 stat = vect_verify_datarefs_alignment (loop_vinfo);
2070 gcc_assert (stat);
2071 return stat;
2075 /* (2) Versioning to force alignment. */
2077 /* Try versioning if:
2078 1) optimize loop for speed
2079 2) there is at least one unsupported misaligned data ref with an unknown
2080 misalignment, and
2081 3) all misaligned data refs with a known misalignment are supported, and
2082 4) the number of runtime alignment checks is within reason. */
2084 do_versioning =
2085 optimize_loop_nest_for_speed_p (loop)
2086 && (!loop->inner); /* FORNOW */
2088 if (do_versioning)
2090 FOR_EACH_VEC_ELT (datarefs, i, dr)
2092 stmt = DR_STMT (dr);
2093 stmt_info = vinfo_for_stmt (stmt);
2095 /* For interleaving, only the alignment of the first access
2096 matters. */
2097 if (aligned_access_p (dr)
2098 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2099 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2100 continue;
2102 if (STMT_VINFO_STRIDED_P (stmt_info))
2104 /* Strided loads perform only component accesses, alignment is
2105 irrelevant for them. */
2106 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2107 continue;
2108 do_versioning = false;
2109 break;
2112 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2114 if (!supportable_dr_alignment)
2116 gimple *stmt;
2117 int mask;
2118 tree vectype;
2120 if (known_alignment_for_access_p (dr)
2121 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2122 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2124 do_versioning = false;
2125 break;
2128 stmt = DR_STMT (dr);
2129 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2130 gcc_assert (vectype);
2132 /* At present we don't support versioning for alignment
2133 with variable VF, since there's no guarantee that the
2134 VF is a power of two. We could relax this if we added
2135 a way of enforcing a power-of-two size. */
2136 unsigned HOST_WIDE_INT size;
2137 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2139 do_versioning = false;
2140 break;
2143 /* The rightmost bits of an aligned address must be zeros.
2144 Construct the mask needed for this test. For example,
2145 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2146 mask must be 15 = 0xf. */
2147 mask = size - 1;
2149 /* FORNOW: use the same mask to test all potentially unaligned
2150 references in the loop. The vectorizer currently supports
2151 a single vector size, see the reference to
2152 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2153 vectorization factor is computed. */
2154 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2155 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2156 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2157 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2158 DR_STMT (dr));
2162 /* Versioning requires at least one misaligned data reference. */
2163 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2164 do_versioning = false;
2165 else if (!do_versioning)
2166 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2169 if (do_versioning)
2171 vec<gimple *> may_misalign_stmts
2172 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2173 gimple *stmt;
2175 /* It can now be assumed that the data references in the statements
2176 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2177 of the loop being vectorized. */
2178 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2180 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2181 dr = STMT_VINFO_DATA_REF (stmt_info);
2182 SET_DR_MISALIGNMENT (dr, 0);
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "Alignment of access forced using versioning.\n");
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Versioning for alignment will be applied.\n");
2192 /* Peeling and versioning can't be done together at this time. */
2193 gcc_assert (! (do_peeling && do_versioning));
2195 stat = vect_verify_datarefs_alignment (loop_vinfo);
2196 gcc_assert (stat);
2197 return stat;
2200 /* This point is reached if neither peeling nor versioning is being done. */
2201 gcc_assert (! (do_peeling || do_versioning));
2203 stat = vect_verify_datarefs_alignment (loop_vinfo);
2204 return stat;
2208 /* Function vect_find_same_alignment_drs.
2210 Update group and alignment relations according to the chosen
2211 vectorization factor. */
2213 static void
2214 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2216 struct data_reference *dra = DDR_A (ddr);
2217 struct data_reference *drb = DDR_B (ddr);
2218 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2219 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2221 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2222 return;
2224 if (dra == drb)
2225 return;
2227 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2228 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2229 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2230 return;
2232 /* Two references with distance zero have the same alignment. */
2233 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2234 - wi::to_poly_offset (DR_INIT (drb)));
2235 if (maybe_ne (diff, 0))
2237 /* Get the wider of the two alignments. */
2238 unsigned int align_a = (vect_calculate_target_alignment (dra)
2239 / BITS_PER_UNIT);
2240 unsigned int align_b = (vect_calculate_target_alignment (drb)
2241 / BITS_PER_UNIT);
2242 unsigned int max_align = MAX (align_a, align_b);
2244 /* Require the gap to be a multiple of the larger vector alignment. */
2245 if (!multiple_p (diff, max_align))
2246 return;
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2250 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2251 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "accesses have the same alignment: ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2256 dump_printf (MSG_NOTE, " and ");
2257 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2258 dump_printf (MSG_NOTE, "\n");
2263 /* Function vect_analyze_data_refs_alignment
2265 Analyze the alignment of the data-references in the loop.
2266 Return FALSE if a data reference is found that cannot be vectorized. */
2268 bool
2269 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "=== vect_analyze_data_refs_alignment ===\n");
2275 /* Mark groups of data references with the same alignment using
2276 data dependence information. */
2277 vec<ddr_p> ddrs = vinfo->ddrs;
2278 struct data_dependence_relation *ddr;
2279 unsigned int i;
2281 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2282 vect_find_same_alignment_drs (ddr);
2284 vec<data_reference_p> datarefs = vinfo->datarefs;
2285 struct data_reference *dr;
2287 vect_record_base_alignments (vinfo);
2288 FOR_EACH_VEC_ELT (datarefs, i, dr)
2290 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2291 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2292 && !vect_compute_data_ref_alignment (dr))
2294 /* Strided accesses perform only component accesses, misalignment
2295 information is irrelevant for them. */
2296 if (STMT_VINFO_STRIDED_P (stmt_info)
2297 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2298 continue;
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2302 "not vectorized: can't calculate alignment "
2303 "for data ref.\n");
2305 return false;
2309 return true;
2313 /* Analyze alignment of DRs of stmts in NODE. */
2315 static bool
2316 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2318 /* We vectorize from the first scalar stmt in the node unless
2319 the node is permuted, in which case we start from the first
2320 element in the group. */
2321 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2322 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2323 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2324 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2326 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2327 if (! vect_compute_data_ref_alignment (dr)
2328 /* For creating the data-ref pointer we need alignment of the
2329 first element anyway. */
2330 || (dr != first_dr
2331 && ! vect_compute_data_ref_alignment (first_dr))
2332 || ! verify_data_ref_alignment (dr))
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "not vectorized: bad data alignment in basic "
2337 "block.\n");
2338 return false;
2341 return true;
2344 /* Function vect_slp_analyze_and_verify_instance_alignment
2346 Analyze the alignment of the data-references in the SLP instance.
2347 Return FALSE if a data reference is found that cannot be vectorized. */
2349 bool
2350 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2356 slp_tree node;
2357 unsigned i;
2358 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2359 if (! vect_slp_analyze_and_verify_node_alignment (node))
2360 return false;
2362 node = SLP_INSTANCE_TREE (instance);
2363 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2364 && ! vect_slp_analyze_and_verify_node_alignment
2365 (SLP_INSTANCE_TREE (instance)))
2366 return false;
2368 return true;
2372 /* Analyze groups of accesses: check that DR belongs to a group of
2373 accesses of legal size, step, etc. Detect gaps, single element
2374 interleaving, and other special cases. Set grouped access info.
2375 Collect groups of strided stores for further use in SLP analysis.
2376 Worker for vect_analyze_group_access. */
2378 static bool
2379 vect_analyze_group_access_1 (struct data_reference *dr)
2381 tree step = DR_STEP (dr);
2382 tree scalar_type = TREE_TYPE (DR_REF (dr));
2383 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2384 gimple *stmt = DR_STMT (dr);
2385 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2386 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2387 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2388 HOST_WIDE_INT dr_step = -1;
2389 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2390 bool slp_impossible = false;
2392 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2393 size of the interleaving group (including gaps). */
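/* For example (hypothetical numbers): 4-byte elements accessed with
   DR_STEP == 16 give a groupsize of 4, i.e. each loop iteration spans
   four element slots, counting any gaps.  */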
2394 if (tree_fits_shwi_p (step))
2396 dr_step = tree_to_shwi (step);
2397 /* Check that STEP is a multiple of type size. Otherwise there is
2398 a non-element-sized gap at the end of the group which we
2399 cannot represent in GROUP_GAP or GROUP_SIZE.
2400 ??? As we can handle non-constant step fine here we should
2401 simply remove uses of GROUP_GAP between the last and first
2402 element and instead rely on DR_STEP. GROUP_SIZE then would
2403 simply not include that gap. */
2404 if ((dr_step % type_size) != 0)
2406 if (dump_enabled_p ())
2408 dump_printf_loc (MSG_NOTE, vect_location,
2409 "Step ");
2410 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2411 dump_printf (MSG_NOTE,
2412 " is not a multiple of the element size for ");
2413 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2414 dump_printf (MSG_NOTE, "\n");
2416 return false;
2418 groupsize = absu_hwi (dr_step) / type_size;
2420 else
2421 groupsize = 0;
2423 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2424 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2426 /* Check whether this DR is part of an interleaving group, and is a
2427 single element of the group that is accessed in the loop. */
2429 /* Gaps are supported only for loads. STEP must be a multiple of the type
2430 size. The size of the group must be a power of 2. */
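/* E.g. (hypothetical): a load of a[4*i] with 4-byte elements has
   DR_STEP == 16 and groupsize == 4; with no other members in the chain
   it is treated as single element interleaving with GROUP_GAP == 3.  */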
2431 if (DR_IS_READ (dr)
2432 && (dr_step % type_size) == 0
2433 && groupsize > 0
2434 && pow2p_hwi (groupsize))
2436 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2437 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2438 GROUP_GAP (stmt_info) = groupsize - 1;
2439 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_NOTE, vect_location,
2442 "Detected single element interleaving ");
2443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2444 dump_printf (MSG_NOTE, " step ");
2445 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2446 dump_printf (MSG_NOTE, "\n");
2449 return true;
2452 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455 "not consecutive access ");
2456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2459 if (bb_vinfo)
2461 /* Mark the statement as unvectorizable. */
2462 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2463 return true;
2466 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2467 STMT_VINFO_STRIDED_P (stmt_info) = true;
2468 return true;
2471 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2473 /* First stmt in the interleaving chain. Check the chain. */
2474 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2475 struct data_reference *data_ref = dr;
2476 unsigned int count = 1;
2477 tree prev_init = DR_INIT (data_ref);
2478 gimple *prev = stmt;
2479 HOST_WIDE_INT diff, gaps = 0;
2481 /* By construction, all group members have INTEGER_CST DR_INITs. */
2482 while (next)
2484 /* Skip identical data-refs. If two or more stmts share a
2485 data-ref (supported only for loads), we vectorize only the first
2486 stmt, and the rest get their vectorized loads from the first
2487 one. */
2488 if (!tree_int_cst_compare (DR_INIT (data_ref),
2489 DR_INIT (STMT_VINFO_DATA_REF (
2490 vinfo_for_stmt (next)))))
2492 if (DR_IS_WRITE (data_ref))
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "Two store stmts share the same dr.\n");
2497 return false;
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "Two or more load stmts share the same dr.\n");
2504 /* For load use the same data-ref load. */
2505 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2507 prev = next;
2508 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2509 continue;
2512 prev = next;
2513 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2515 /* All group members have the same STEP by construction. */
2516 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2518 /* Check that the distance between two accesses is equal to the type
2519 size. Otherwise, we have gaps. */
2520 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2521 - TREE_INT_CST_LOW (prev_init)) / type_size;
2522 if (diff != 1)
2524 /* FORNOW: SLP of accesses with gaps is not supported. */
2525 slp_impossible = true;
2526 if (DR_IS_WRITE (data_ref))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2530 "interleaved store with gaps\n");
2531 return false;
2534 gaps += diff - 1;
2537 last_accessed_element += diff;
2539 /* Store the gap from the previous member of the group. If there is no
2540 gap in the access, GROUP_GAP is always 1. */
2541 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2543 prev_init = DR_INIT (data_ref);
2544 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2545 /* Count the number of data-refs in the chain. */
2546 count++;
2549 if (groupsize == 0)
2550 groupsize = count + gaps;
2552 /* This could be UINT_MAX but as we are generating code in a very
2553 inefficient way we have to cap earlier. See PR78699 for example. */
2554 if (groupsize > 4096)
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "group is too large\n");
2559 return false;
2562 /* Check that the size of the interleaving is equal to count for stores,
2563 i.e., that there are no gaps. */
2564 if (groupsize != count
2565 && !DR_IS_READ (dr))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "interleaved store with gaps\n");
2570 return false;
2573 /* If there is a gap after the last load in the group it is the
2574 difference between the groupsize and the last accessed
2575 element.
2576 When there is no gap, this difference should be 0. */
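/* For instance (hypothetical numbers): groupsize == 4 with members at
   element offsets 0 and 1 leaves last_accessed_element == 2, so the
   trailing GROUP_GAP recorded below is 4 - 2 == 2.  */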
2577 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2579 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "Detected interleaving ");
2584 if (DR_IS_READ (dr))
2585 dump_printf (MSG_NOTE, "load ");
2586 else
2587 dump_printf (MSG_NOTE, "store ");
2588 dump_printf (MSG_NOTE, "of size %u starting with ",
2589 (unsigned)groupsize);
2590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2591 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "There is a gap of %u elements after the group\n",
2594 GROUP_GAP (vinfo_for_stmt (stmt)));
2597 /* SLP: create an SLP data structure for every interleaving group of
2598 stores for further analysis in vect_analyze_slp. */
2599 if (DR_IS_WRITE (dr) && !slp_impossible)
2601 if (loop_vinfo)
2602 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2603 if (bb_vinfo)
2604 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2608 return true;
2611 /* Analyze groups of accesses: check that DR belongs to a group of
2612 accesses of legal size, step, etc. Detect gaps, single element
2613 interleaving, and other special cases. Set grouped access info.
2614 Collect groups of strided stores for further use in SLP analysis. */
2616 static bool
2617 vect_analyze_group_access (struct data_reference *dr)
2619 if (!vect_analyze_group_access_1 (dr))
2621 /* Dissolve the group if present. */
2622 gimple *next;
2623 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2624 while (stmt)
2626 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2627 next = GROUP_NEXT_ELEMENT (vinfo);
2628 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2629 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2630 stmt = next;
2632 return false;
2634 return true;
2637 /* Analyze the access pattern of the data-reference DR.
2638 In case of non-consecutive accesses call vect_analyze_group_access() to
2639 analyze groups of accesses. */
2641 static bool
2642 vect_analyze_data_ref_access (struct data_reference *dr)
2644 tree step = DR_STEP (dr);
2645 tree scalar_type = TREE_TYPE (DR_REF (dr));
2646 gimple *stmt = DR_STMT (dr);
2647 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2648 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2649 struct loop *loop = NULL;
2651 if (loop_vinfo)
2652 loop = LOOP_VINFO_LOOP (loop_vinfo);
2654 if (loop_vinfo && !step)
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2658 "bad data-ref access in loop\n");
2659 return false;
2662 /* Allow loads with zero step in inner-loop vectorization. */
2663 if (loop_vinfo && integer_zerop (step))
2665 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2666 if (!nested_in_vect_loop_p (loop, stmt))
2667 return DR_IS_READ (dr);
2668 /* Allow references with zero step for outer loops marked
2669 with pragma omp simd only - it guarantees absence of
2670 loop-carried dependencies between inner loop iterations. */
2671 if (!loop->force_vectorize)
2673 if (dump_enabled_p ())
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "zero step in inner loop of nest\n");
2676 return false;
2680 if (loop && nested_in_vect_loop_p (loop, stmt))
2682 /* Interleaved accesses are not yet supported within outer-loop
2683 vectorization for references in the inner-loop. */
2684 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2686 /* For the rest of the analysis we use the outer-loop step. */
2687 step = STMT_VINFO_DR_STEP (stmt_info);
2688 if (integer_zerop (step))
2690 if (dump_enabled_p ())
2691 dump_printf_loc (MSG_NOTE, vect_location,
2692 "zero step in outer loop.\n");
2693 return DR_IS_READ (dr);
2697 /* Consecutive? */
2698 if (TREE_CODE (step) == INTEGER_CST)
2700 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2701 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2702 || (dr_step < 0
2703 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2705 /* Mark that it is not interleaving. */
2706 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2707 return true;
2711 if (loop && nested_in_vect_loop_p (loop, stmt))
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE, vect_location,
2715 "grouped access in outer loop.\n");
2716 return false;
2720 /* Assume this is a DR handled by non-constant strided load case. */
2721 if (TREE_CODE (step) != INTEGER_CST)
2722 return (STMT_VINFO_STRIDED_P (stmt_info)
2723 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2724 || vect_analyze_group_access (dr)));
2726 /* Not a consecutive access - check whether it is part of an interleaving group. */
2727 return vect_analyze_group_access (dr);
2730 /* Compare two data-references DRA and DRB so that they can be sorted
2731 into chunks suitable for grouping. */
2733 static int
2734 dr_group_sort_cmp (const void *dra_, const void *drb_)
2736 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2737 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2738 int cmp;
2740 /* Stabilize sort. */
2741 if (dra == drb)
2742 return 0;
2744 /* DRs in different loops never belong to the same group. */
2745 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2746 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2747 if (loopa != loopb)
2748 return loopa->num < loopb->num ? -1 : 1;
2750 /* Ordering of DRs according to base. */
2751 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2752 DR_BASE_ADDRESS (drb));
2753 if (cmp != 0)
2754 return cmp;
2756 /* And according to DR_OFFSET. */
2757 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2758 if (cmp != 0)
2759 return cmp;
2761 /* Put reads before writes. */
2762 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2763 return DR_IS_READ (dra) ? -1 : 1;
2765 /* Then sort by access size. */
2766 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2767 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2768 if (cmp != 0)
2769 return cmp;
2771 /* And then by step. */
2772 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2773 if (cmp != 0)
2774 return cmp;
2776 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2777 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2778 if (cmp == 0)
2779 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2780 return cmp;
2783 /* Function vect_analyze_data_ref_accesses.
2785 Analyze the access pattern of all the data references in the loop.
2787 FORNOW: the only access pattern that is considered vectorizable is a
2788 simple step 1 (consecutive) access.
2790 FORNOW: handle only arrays and pointer accesses. */
2792 bool
2793 vect_analyze_data_ref_accesses (vec_info *vinfo)
2795 unsigned int i;
2796 vec<data_reference_p> datarefs = vinfo->datarefs;
2797 struct data_reference *dr;
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE, vect_location,
2801 "=== vect_analyze_data_ref_accesses ===\n");
2803 if (datarefs.is_empty ())
2804 return true;
2806 /* Sort the array of datarefs to make building the interleaving chains
2807 linear. Don't modify the original vector's order; it is needed for
2808 determining which dependencies are reversed. */
2809 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2810 datarefs_copy.qsort (dr_group_sort_cmp);
2812 /* Build the interleaving chains. */
2813 for (i = 0; i < datarefs_copy.length () - 1;)
2815 data_reference_p dra = datarefs_copy[i];
2816 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2817 stmt_vec_info lastinfo = NULL;
2818 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2820 ++i;
2821 continue;
2823 for (i = i + 1; i < datarefs_copy.length (); ++i)
2825 data_reference_p drb = datarefs_copy[i];
2826 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2827 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2828 break;
2830 /* ??? Imperfect sorting (non-compatible types, non-modulo
2831 accesses, same accesses) can lead to a group being artificially
2832 split here, as we don't just skip over those. If it really
2833 matters we can push those to a worklist and re-iterate
2834 over them. Then we can just skip ahead to the next DR here. */
2836 /* DRs in a different loop should not be put into the same
2837 interleaving group. */
2838 if (gimple_bb (DR_STMT (dra))->loop_father
2839 != gimple_bb (DR_STMT (drb))->loop_father)
2840 break;
2842 /* Check that the data-refs have the same first location (except init)
2843 and that they are both either stores or loads (not a load and a store,
2844 and not masked loads or stores). */
2845 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2846 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2847 DR_BASE_ADDRESS (drb)) != 0
2848 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2849 || !gimple_assign_single_p (DR_STMT (dra))
2850 || !gimple_assign_single_p (DR_STMT (drb)))
2851 break;
2853 /* Check that the data-refs have the same constant size. */
2854 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2855 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2856 if (!tree_fits_uhwi_p (sza)
2857 || !tree_fits_uhwi_p (szb)
2858 || !tree_int_cst_equal (sza, szb))
2859 break;
2861 /* Check that the data-refs have the same step. */
2862 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2863 break;
2865 /* Check the types are compatible.
2866 ??? We don't distinguish this during sorting. */
2867 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2868 TREE_TYPE (DR_REF (drb))))
2869 break;
2871 /* Check that the DR_INITs are compile-time constants. */
2872 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2873 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2874 break;
2876 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2877 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2878 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2879 HOST_WIDE_INT init_prev
2880 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2881 gcc_assert (init_a <= init_b
2882 && init_a <= init_prev
2883 && init_prev <= init_b);
2885 /* Do not place the same access in the interleaving chain twice. */
2886 if (init_b == init_prev)
2888 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2889 < gimple_uid (DR_STMT (drb)));
2890 /* ??? For now we simply "drop" the later reference which is
2891 otherwise the same rather than finishing off this group.
2892 In the end we'd want to re-process duplicates forming
2893 multiple groups from the refs, likely by just collecting
2894 all candidates (including duplicates and split points
2895 below) in a vector and then processing them together. */
2896 continue;
2899 /* If init_b == init_a + the size of the type * k, we have an
2900 interleaving, and DRA is accessed before DRB. */
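/* E.g. (hypothetical numbers): 4-byte accesses with inits 0 and 8
   interleave (k == 2), whereas inits 0 and 6 cannot, because 6 is not
   a multiple of the element size.  */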
2901 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2902 if (type_size_a == 0
2903 || (init_b - init_a) % type_size_a != 0)
2904 break;
2906 /* If we have a store, the accesses must be adjacent. This splits
2907 groups into chunks we support (we don't support vectorization
2908 of stores with gaps). */
2909 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2910 break;
2912 /* If the step (when constant and nonzero) is not greater than the
2913 difference between the data-refs' inits, split the group here;
2914 this keeps groups at suitable sizes. */
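/* E.g. (hypothetical): with DR_STEP == 16, a candidate whose init is 16
   or more bytes beyond DRA's already belongs to the next iteration, so
   the chain is cut there.  */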
2915 if (tree_fits_shwi_p (DR_STEP (dra)))
2917 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2918 if (step != 0 && step <= (init_b - init_a))
2919 break;
2922 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE, vect_location,
2925 "Detected interleaving ");
2926 if (DR_IS_READ (dra))
2927 dump_printf (MSG_NOTE, "load ");
2928 else
2929 dump_printf (MSG_NOTE, "store ");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2931 dump_printf (MSG_NOTE, " and ");
2932 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2933 dump_printf (MSG_NOTE, "\n");
2936 /* Link the found element into the group list. */
2937 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2939 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2940 lastinfo = stmtinfo_a;
2942 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2943 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2944 lastinfo = stmtinfo_b;
2948 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2949 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2950 && !vect_analyze_data_ref_access (dr))
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 "not vectorized: complicated access pattern.\n");
2956 if (is_a <bb_vec_info> (vinfo))
2958 /* Mark the statement as not vectorizable. */
2959 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2960 continue;
2962 else
2964 datarefs_copy.release ();
2965 return false;
2969 datarefs_copy.release ();
2970 return true;
2973 /* Function vect_vfa_segment_size.
2975 Create an expression that computes the size of the segment
2976 that will be accessed for a data reference. The function takes into
2977 account that realignment loads may access one more vector.
2979 Input:
2980 DR: The data reference.
2981 LENGTH_FACTOR: segment length to consider.
2983 Return an expression whose value is the size of segment which will be
2984 accessed by DR. */
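/* For example (hypothetical numbers): a unit-stride 4-byte access with
   LENGTH_FACTOR == 8 yields a 32-byte segment; if the access uses the
   optimized explicit-realignment scheme, one extra vector size is
   added below.  */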
2986 static tree
2987 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2989 tree segment_length;
2991 if (integer_zerop (DR_STEP (dr)))
2992 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2993 else
2994 segment_length = size_binop (MULT_EXPR,
2995 fold_convert (sizetype, DR_STEP (dr)),
2996 fold_convert (sizetype, length_factor));
2998 if (vect_supportable_dr_alignment (dr, false)
2999 == dr_explicit_realign_optimized)
3001 tree vector_size = TYPE_SIZE_UNIT
3002 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3004 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3006 return segment_length;
3009 /* Function vect_compile_time_alias.
3011 Given data references A and B with equal base and offset, see whether
3012 the alias relation can be decided at compilation time. Return 1 if
3013 it can and the references alias, 0 if it can and the references do
3014 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3015 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3016 respectively. */
3018 static int
3019 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3020 tree segment_length_a, tree segment_length_b)
3022 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3023 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3024 poly_uint64 const_length_a;
3025 poly_uint64 const_length_b;
3027 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3028 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3029 [a, a+12) */
3030 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3032 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3033 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3035 else
3036 const_length_a = tree_to_poly_uint64 (segment_length_a);
3037 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3039 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3040 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3042 else
3043 const_length_b = tree_to_poly_uint64 (segment_length_b);
3045 if (ranges_known_overlap_p (offset_a, const_length_a,
3046 offset_b, const_length_b))
3047 return 1;
3049 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3050 offset_b, const_length_b))
3051 return 0;
3053 return -1;
3056 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3057 in DDR is >= VF. */
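/* E.g. (hypothetical numbers): a dependence distance of 8 iterations
   satisfies VF == 4, since each vector iteration covers only 4 scalar
   iterations; a distance of 2 would fail the test.  */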
3059 static bool
3060 dependence_distance_ge_vf (data_dependence_relation *ddr,
3061 unsigned int loop_depth, poly_uint64 vf)
3063 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3064 || DDR_NUM_DIST_VECTS (ddr) == 0)
3065 return false;
3067 /* If the dependence is exact, we should have limited the VF instead. */
3068 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3070 unsigned int i;
3071 lambda_vector dist_v;
3072 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3074 HOST_WIDE_INT dist = dist_v[loop_depth];
3075 if (dist != 0
3076 && !(dist > 0 && DDR_REVERSED_P (ddr))
3077 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3078 return false;
3081 if (dump_enabled_p ())
3083 dump_printf_loc (MSG_NOTE, vect_location,
3084 "dependence distance between ");
3085 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3086 dump_printf (MSG_NOTE, " and ");
3087 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3088 dump_printf (MSG_NOTE, " is >= VF\n");
3091 return true;
3094 /* Function vect_prune_runtime_alias_test_list.
3096 Prune a list of ddrs to be tested at run-time by versioning for alias.
3097 Merge several alias checks into one if possible.
3098 Return FALSE if the resulting list of ddrs is longer than allowed by
3099 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3101 bool
3102 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3104 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3105 hash_set <tree_pair_hash> compared_objects;
3107 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3108 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3109 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3110 vec<vec_object_pair> &check_unequal_addrs
3111 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3112 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3113 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3115 ddr_p ddr;
3116 unsigned int i;
3117 tree length_factor;
3119 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "=== vect_prune_runtime_alias_test_list ===\n");
3123 if (may_alias_ddrs.is_empty ())
3124 return true;
3126 comp_alias_ddrs.create (may_alias_ddrs.length ());
3128 unsigned int loop_depth
3129 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3130 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3132 /* First, we collect all data ref pairs for aliasing checks. */
3133 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3135 int comp_res;
3136 struct data_reference *dr_a, *dr_b;
3137 gimple *dr_group_first_a, *dr_group_first_b;
3138 tree segment_length_a, segment_length_b;
3139 gimple *stmt_a, *stmt_b;
3141 /* Ignore the alias if the VF we chose ended up being no greater
3142 than the dependence distance. */
3143 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3144 continue;
3146 if (DDR_OBJECT_A (ddr))
3148 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3149 if (!compared_objects.add (new_pair))
3151 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3154 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3155 dump_printf (MSG_NOTE, " and ");
3156 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3157 dump_printf (MSG_NOTE, " have different addresses\n");
3159 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3161 continue;
3164 dr_a = DDR_A (ddr);
3165 stmt_a = DR_STMT (DDR_A (ddr));
3166 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3167 if (dr_group_first_a)
3169 stmt_a = dr_group_first_a;
3170 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3173 dr_b = DDR_B (ddr);
3174 stmt_b = DR_STMT (DDR_B (ddr));
3175 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3176 if (dr_group_first_b)
3178 stmt_b = dr_group_first_b;
3179 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3182 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3183 length_factor = scalar_loop_iters;
3184 else
3185 length_factor = size_int (vect_factor);
3186 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3187 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3189 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3190 DR_BASE_ADDRESS (dr_b));
3191 if (comp_res == 0)
3192 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3193 DR_OFFSET (dr_b));
3195 /* See whether the alias is known at compilation time. */
3196 if (comp_res == 0
3197 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3198 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3199 && poly_int_tree_p (segment_length_a)
3200 && poly_int_tree_p (segment_length_b))
3202 int res = vect_compile_time_alias (dr_a, dr_b,
3203 segment_length_a,
3204 segment_length_b);
3205 if (res == 0)
3206 continue;
3208 if (res == 1)
3210 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_NOTE, vect_location,
3212 "not vectorized: compilation time alias.\n");
3213 return false;
3217 dr_with_seg_len_pair_t dr_with_seg_len_pair
3218 (dr_with_seg_len (dr_a, segment_length_a),
3219 dr_with_seg_len (dr_b, segment_length_b));
3221 /* Canonicalize pairs by sorting the two DR members. */
3222 if (comp_res > 0)
3223 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3225 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3228 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3230 unsigned int count = (comp_alias_ddrs.length ()
3231 + check_unequal_addrs.length ());
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 "improved number of alias checks from %d to %d\n",
3234 may_alias_ddrs.length (), count);
3235 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239 "number of versioning for alias "
3240 "run-time tests exceeds %d "
3241 "(--param vect-max-version-for-alias-checks)\n",
3242 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3243 return false;
3246 return true;
3249 /* Return true if a non-affine read or write in STMT is suitable for a
3250 gather load or scatter store. Describe the operation in *INFO if so. */
3252 bool
3253 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3254 gather_scatter_info *info)
3256 HOST_WIDE_INT scale = 1;
3257 poly_int64 pbitpos, pbitsize;
3258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3260 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3261 tree offtype = NULL_TREE;
3262 tree decl, base, off;
3263 machine_mode pmode;
3264 int punsignedp, reversep, pvolatilep = 0;
3266 base = DR_REF (dr);
3267 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3268 see if we can use the def stmt of the address. */
3269 if (is_gimple_call (stmt)
3270 && gimple_call_internal_p (stmt)
3271 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3272 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3273 && TREE_CODE (base) == MEM_REF
3274 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3275 && integer_zerop (TREE_OPERAND (base, 1))
3276 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3278 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3279 if (is_gimple_assign (def_stmt)
3280 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3281 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3284 /* The gather and scatter builtins need address of the form
3285 loop_invariant + vector * {1, 2, 4, 8}
3286 or
3287 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3288 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3289 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3290 multiplications and additions in it. To get a vector, we need
3291 a single SSA_NAME that will be defined in the loop and will
3292 contain everything that is not loop invariant and that can be
3293 vectorized. The following code attempts to find such a preexisting
3294 SSA_NAME OFF and put the loop invariants into a tree BASE
3295 that can be gimplified before the loop. */
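/* A sketch of the intended decomposition (hypothetical example): for a
   gather load such as ... = p[idx[i]] with 8-byte elements, BASE ends up
   holding p plus any constant byte offset, OFF the SSA_NAME computing
   idx[i] (possibly sign-extended), and SCALE == 8.  */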
3296 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3297 &punsignedp, &reversep, &pvolatilep);
3298 gcc_assert (base && !reversep);
3299 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3301 if (TREE_CODE (base) == MEM_REF)
3303 if (!integer_zerop (TREE_OPERAND (base, 1)))
3305 if (off == NULL_TREE)
3306 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3307 else
3308 off = size_binop (PLUS_EXPR, off,
3309 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3311 base = TREE_OPERAND (base, 0);
3313 else
3314 base = build_fold_addr_expr (base);
3316 if (off == NULL_TREE)
3317 off = size_zero_node;
3319 /* If base is not loop invariant, then if off is 0 we start with just
3320 the constant offset in the loop-invariant BASE and continue with base
3321 as OFF; otherwise give up.
3322 We could handle that case by gimplifying the addition of base + off
3323 into some SSA_NAME and using that as off, but for now punt. */
3324 if (!expr_invariant_in_loop_p (loop, base))
3326 if (!integer_zerop (off))
3327 return false;
3328 off = base;
3329 base = size_int (pbytepos);
3331 /* Otherwise put base + constant offset into the loop invariant BASE
3332 and continue with OFF. */
3333 else
3335 base = fold_convert (sizetype, base);
3336 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3339 /* OFF at this point may be either a SSA_NAME or some tree expression
3340 from get_inner_reference. Try to peel off loop invariants from it
3341 into BASE as long as possible. */
3342 STRIP_NOPS (off);
3343 while (offtype == NULL_TREE)
3345 enum tree_code code;
3346 tree op0, op1, add = NULL_TREE;
3348 if (TREE_CODE (off) == SSA_NAME)
3350 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3352 if (expr_invariant_in_loop_p (loop, off))
3353 return false;
3355 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3356 break;
3358 op0 = gimple_assign_rhs1 (def_stmt);
3359 code = gimple_assign_rhs_code (def_stmt);
3360 op1 = gimple_assign_rhs2 (def_stmt);
3362 else
3364 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3365 return false;
3366 code = TREE_CODE (off);
3367 extract_ops_from_tree (off, &code, &op0, &op1);
3369 switch (code)
3371 case POINTER_PLUS_EXPR:
3372 case PLUS_EXPR:
3373 if (expr_invariant_in_loop_p (loop, op0))
3375 add = op0;
3376 off = op1;
3377 do_add:
3378 add = fold_convert (sizetype, add);
3379 if (scale != 1)
3380 add = size_binop (MULT_EXPR, add, size_int (scale));
3381 base = size_binop (PLUS_EXPR, base, add);
3382 continue;
3384 if (expr_invariant_in_loop_p (loop, op1))
3386 add = op1;
3387 off = op0;
3388 goto do_add;
3390 break;
3391 case MINUS_EXPR:
3392 if (expr_invariant_in_loop_p (loop, op1))
3394 add = fold_convert (sizetype, op1);
3395 add = size_binop (MINUS_EXPR, size_zero_node, add);
3396 off = op0;
3397 goto do_add;
3399 break;
3400 case MULT_EXPR:
3401 if (scale == 1 && tree_fits_shwi_p (op1))
3403 scale = tree_to_shwi (op1);
3404 off = op0;
3405 continue;
3407 break;
3408 case SSA_NAME:
3409 off = op0;
3410 continue;
3411 CASE_CONVERT:
3412 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3413 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3414 break;
3415 if (TYPE_PRECISION (TREE_TYPE (op0))
3416 == TYPE_PRECISION (TREE_TYPE (off)))
3418 off = op0;
3419 continue;
3421 if (TYPE_PRECISION (TREE_TYPE (op0))
3422 < TYPE_PRECISION (TREE_TYPE (off)))
3424 off = op0;
3425 offtype = TREE_TYPE (off);
3426 STRIP_NOPS (off);
3427 continue;
3429 break;
3430 default:
3431 break;
3433 break;
3436 /* If at the end OFF still isn't a SSA_NAME or isn't
3437 defined in the loop, punt. */
3438 if (TREE_CODE (off) != SSA_NAME
3439 || expr_invariant_in_loop_p (loop, off))
3440 return false;
3442 if (offtype == NULL_TREE)
3443 offtype = TREE_TYPE (off);
3445 if (DR_IS_READ (dr))
3446 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3447 offtype, scale);
3448 else
3449 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3450 offtype, scale);
3452 if (decl == NULL_TREE)
3453 return false;
3455 info->decl = decl;
3456 info->base = base;
3457 info->offset = off;
3458 info->offset_dt = vect_unknown_def_type;
3459 info->offset_vectype = NULL_TREE;
3460 info->scale = scale;
3461 return true;
3464 /* Function vect_analyze_data_refs.
3466 Find all the data references in the loop or basic block.
3468 The general structure of the analysis of data refs in the vectorizer is as
3469 follows:
3470 1- vect_analyze_data_refs(loop/bb): call
3471 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3472 in the loop/bb and their dependences.
3473 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3474 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3475 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3479 bool
3480 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3482 struct loop *loop = NULL;
3483 unsigned int i;
3484 struct data_reference *dr;
3485 tree scalar_type;
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "=== vect_analyze_data_refs ===\n");
3491 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3492 loop = LOOP_VINFO_LOOP (loop_vinfo);
3494 /* Go through the data-refs and check that the analysis succeeded. Update
3495 the pointer from the stmt_vec_info struct to DR and vectype. */
3497 vec<data_reference_p> datarefs = vinfo->datarefs;
3498 FOR_EACH_VEC_ELT (datarefs, i, dr)
3500 gimple *stmt;
3501 stmt_vec_info stmt_info;
3502 tree base, offset, init;
3503 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3504 bool simd_lane_access = false;
3505 poly_uint64 vf;
3507 again:
3508 if (!dr || !DR_REF (dr))
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: unhandled data-ref\n");
3513 return false;
3516 stmt = DR_STMT (dr);
3517 stmt_info = vinfo_for_stmt (stmt);
3519 /* Discard clobbers from the dataref vector. We will remove
3520 clobber stmts during vectorization. */
3521 if (gimple_clobber_p (stmt))
3523 free_data_ref (dr);
3524 if (i == datarefs.length () - 1)
3526 datarefs.pop ();
3527 break;
3529 datarefs.ordered_remove (i);
3530 dr = datarefs[i];
3531 goto again;
3534 /* Check that analysis of the data-ref succeeded. */
3535 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3536 || !DR_STEP (dr))
3538 bool maybe_gather
3539 = DR_IS_READ (dr)
3540 && !TREE_THIS_VOLATILE (DR_REF (dr))
3541 && targetm.vectorize.builtin_gather != NULL;
3542 bool maybe_scatter
3543 = DR_IS_WRITE (dr)
3544 && !TREE_THIS_VOLATILE (DR_REF (dr))
3545 && targetm.vectorize.builtin_scatter != NULL;
3546 bool maybe_simd_lane_access
3547 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3549 /* If target supports vector gather loads or scatter stores, or if
3550 this might be a SIMD lane access, see whether they can be used. */
3551 if (is_a <loop_vec_info> (vinfo)
3552 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3553 && !nested_in_vect_loop_p (loop, stmt))
3555 struct data_reference *newdr
3556 = create_data_ref (NULL, loop_containing_stmt (stmt),
3557 DR_REF (dr), stmt, !maybe_scatter,
3558 DR_IS_CONDITIONAL_IN_STMT (dr));
3559 gcc_assert (newdr != NULL && DR_REF (newdr));
3560 if (DR_BASE_ADDRESS (newdr)
3561 && DR_OFFSET (newdr)
3562 && DR_INIT (newdr)
3563 && DR_STEP (newdr)
3564 && integer_zerop (DR_STEP (newdr)))
3566 if (maybe_simd_lane_access)
3568 tree off = DR_OFFSET (newdr);
3569 STRIP_NOPS (off);
3570 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3571 && TREE_CODE (off) == MULT_EXPR
3572 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3574 tree step = TREE_OPERAND (off, 1);
3575 off = TREE_OPERAND (off, 0);
3576 STRIP_NOPS (off);
3577 if (CONVERT_EXPR_P (off)
3578 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3579 0)))
3580 < TYPE_PRECISION (TREE_TYPE (off)))
3581 off = TREE_OPERAND (off, 0);
3582 if (TREE_CODE (off) == SSA_NAME)
3584 gimple *def = SSA_NAME_DEF_STMT (off);
3585 tree reft = TREE_TYPE (DR_REF (newdr));
3586 if (is_gimple_call (def)
3587 && gimple_call_internal_p (def)
3588 && (gimple_call_internal_fn (def)
3589 == IFN_GOMP_SIMD_LANE))
3591 tree arg = gimple_call_arg (def, 0);
3592 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3593 arg = SSA_NAME_VAR (arg);
3594 if (arg == loop->simduid
3595 /* For now. */
3596 && tree_int_cst_equal
3597 (TYPE_SIZE_UNIT (reft),
3598 step))
3600 DR_OFFSET (newdr) = ssize_int (0);
3601 DR_STEP (newdr) = step;
3602 DR_OFFSET_ALIGNMENT (newdr)
3603 = BIGGEST_ALIGNMENT;
3604 DR_STEP_ALIGNMENT (newdr)
3605 = highest_pow2_factor (step);
3606 dr = newdr;
3607 simd_lane_access = true;
3613 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3615 dr = newdr;
3616 if (maybe_gather)
3617 gatherscatter = GATHER;
3618 else
3619 gatherscatter = SCATTER;
3622 if (gatherscatter == SG_NONE && !simd_lane_access)
3623 free_data_ref (newdr);
3626 if (gatherscatter == SG_NONE && !simd_lane_access)
3628 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3631 "not vectorized: data ref analysis "
3632 "failed ");
3633 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3636 if (is_a <bb_vec_info> (vinfo))
3637 break;
3639 return false;
3643 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3647 "not vectorized: base addr of dr is a "
3648 "constant\n");
3650 if (is_a <bb_vec_info> (vinfo))
3651 break;
3653 if (gatherscatter != SG_NONE || simd_lane_access)
3654 free_data_ref (dr);
3655 return false;
3658 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3660 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3663 "not vectorized: volatile type ");
3664 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3667 if (is_a <bb_vec_info> (vinfo))
3668 break;
3670 return false;
3673 if (stmt_can_throw_internal (stmt))
3675 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3678 "not vectorized: statement can throw an "
3679 "exception ");
3680 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 if (is_a <bb_vec_info> (vinfo))
3684 break;
3686 if (gatherscatter != SG_NONE || simd_lane_access)
3687 free_data_ref (dr);
3688 return false;
3691 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3692 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: statement is bitfield "
3698 "access ");
3699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702 if (is_a <bb_vec_info> (vinfo))
3703 break;
3705 if (gatherscatter != SG_NONE || simd_lane_access)
3706 free_data_ref (dr);
3707 return false;
3710 base = unshare_expr (DR_BASE_ADDRESS (dr));
3711 offset = unshare_expr (DR_OFFSET (dr));
3712 init = unshare_expr (DR_INIT (dr));
3714 if (is_gimple_call (stmt)
3715 && (!gimple_call_internal_p (stmt)
3716 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3717 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3719 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "not vectorized: dr in a call ");
3723 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3726 if (is_a <bb_vec_info> (vinfo))
3727 break;
3729 if (gatherscatter != SG_NONE || simd_lane_access)
3730 free_data_ref (dr);
3731 return false;
3734 /* Update DR field in stmt_vec_info struct. */
3736 /* If the dataref is in an inner-loop of the loop that is considered
3737 for vectorization, we also want to analyze the access relative to
3738 the outer-loop (DR contains information only relative to the
3739 inner-most enclosing loop). We do that by building a reference to the
3740 first location accessed by the inner-loop, and analyze it relative to
3741 the outer-loop. */
3742 if (loop && nested_in_vect_loop_p (loop, stmt))
3744 /* Build a reference to the first location accessed by the
3745 inner loop: *(BASE + INIT + OFFSET). By construction,
3746 this address must be invariant in the inner loop, so we
3747 can consider it as being used in the outer loop. */
3748 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3749 init, offset);
3750 tree init_addr = fold_build_pointer_plus (base, init_offset);
3751 tree init_ref = build_fold_indirect_ref (init_addr);
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "analyze in outer loop: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3758 dump_printf (MSG_NOTE, "\n");
3761 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3762 init_ref, loop))
3763 /* dr_analyze_innermost already explained the failure. */
3764 return false;
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_NOTE, vect_location,
3769 "\touter base_address: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3772 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3773 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774 STMT_VINFO_DR_OFFSET (stmt_info));
3775 dump_printf (MSG_NOTE,
3776 "\n\touter constant offset from base address: ");
3777 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3778 STMT_VINFO_DR_INIT (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter step: ");
3780 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3781 STMT_VINFO_DR_STEP (stmt_info));
3782 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3783 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3784 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3785 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3786 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3787 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3788 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3789 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3793 if (STMT_VINFO_DATA_REF (stmt_info))
3795 if (dump_enabled_p ())
3797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3798 "not vectorized: more than one data ref "
3799 "in stmt: ");
3800 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3803 if (is_a <bb_vec_info> (vinfo))
3804 break;
3806 if (gatherscatter != SG_NONE || simd_lane_access)
3807 free_data_ref (dr);
3808 return false;
3811 STMT_VINFO_DATA_REF (stmt_info) = dr;
3812 if (simd_lane_access)
3814 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3815 free_data_ref (datarefs[i]);
3816 datarefs[i] = dr;
3819 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3820 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3821 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3823 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3826 "not vectorized: base object not addressable "
3827 "for stmt: ");
3828 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3830 if (is_a <bb_vec_info> (vinfo))
3832 /* In BB vectorization the ref can still participate
3833 in dependence analysis, we just can't vectorize it. */
3834 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3835 continue;
3837 return false;
3840 /* Set vectype for STMT. */
3841 scalar_type = TREE_TYPE (DR_REF (dr));
3842 STMT_VINFO_VECTYPE (stmt_info)
3843 = get_vectype_for_scalar_type (scalar_type);
3844 if (!STMT_VINFO_VECTYPE (stmt_info))
3846 if (dump_enabled_p ())
3848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3849 "not vectorized: no vectype for stmt: ");
3850 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3851 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3852 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3853 scalar_type);
3854 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3857 if (is_a <bb_vec_info> (vinfo))
3859 /* No vector type is fine, the ref can still participate
3860 in dependence analysis, we just can't vectorize it. */
3861 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3862 continue;
3865 if (gatherscatter != SG_NONE || simd_lane_access)
3867 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3868 if (gatherscatter != SG_NONE)
3869 free_data_ref (dr);
3871 return false;
3873 else
3875 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_NOTE, vect_location,
3878 "got vectype for stmt: ");
3879 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3881 STMT_VINFO_VECTYPE (stmt_info));
3882 dump_printf (MSG_NOTE, "\n");
3886 /* Adjust the minimal vectorization factor according to the
3887 vector type. */
3888 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3889 *min_vf = upper_bound (*min_vf, vf);
3891 if (gatherscatter != SG_NONE)
3893 gather_scatter_info gs_info;
3894 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3895 &gs_info)
3896 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3898 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3899 free_data_ref (dr);
3900 if (dump_enabled_p ())
3902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3903 (gatherscatter == GATHER) ?
3904 "not vectorized: not suitable for gather "
3905 "load " :
3906 "not vectorized: not suitable for scatter "
3907 "store ");
3908 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3910 return false;
3913 free_data_ref (datarefs[i]);
3914 datarefs[i] = dr;
3915 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3918 else if (is_a <loop_vec_info> (vinfo)
3919 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3921 if (nested_in_vect_loop_p (loop, stmt))
3923 if (dump_enabled_p ())
3925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3926 "not vectorized: not suitable for strided "
3927 "load ");
3928 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3930 return false;
3932 STMT_VINFO_STRIDED_P (stmt_info) = true;
3936 /* If we stopped the analysis at the first dataref we could not analyze
3937 when trying to vectorize a basic block, mark the rest of the datarefs
3938 as not vectorizable and truncate the vector of datarefs. That
3939 avoids spending useless time analyzing their dependences. */
3940 if (i != datarefs.length ())
3942 gcc_assert (is_a <bb_vec_info> (vinfo));
3943 for (unsigned j = i; j < datarefs.length (); ++j)
3945 data_reference_p dr = datarefs[j];
3946 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3947 free_data_ref (dr);
3949 datarefs.truncate (i);
3952 return true;
3956 /* Function vect_get_new_vect_var.
3958 Returns a name for a new variable. The current naming scheme prepends
3959 the prefix "vect", "stmp", "mask" or "vectp" (depending on the value
3960 of VAR_KIND) to vectorizer-generated variable names, with NAME appended
3961 if provided. */
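/* For illustration only (the caller context and PTR_TYPE below are
   hypothetical), a request such as

     tree p = vect_get_new_vect_var (ptr_type, vect_pointer_var, "in");

   creates a temporary whose base name is "vectp_in", while passing a
   NULL NAME yields just the prefix, e.g. "vect".  */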
3963 tree
3964 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3966 const char *prefix;
3967 tree new_vect_var;
3969 switch (var_kind)
3971 case vect_simple_var:
3972 prefix = "vect";
3973 break;
3974 case vect_scalar_var:
3975 prefix = "stmp";
3976 break;
3977 case vect_mask_var:
3978 prefix = "mask";
3979 break;
3980 case vect_pointer_var:
3981 prefix = "vectp";
3982 break;
3983 default:
3984 gcc_unreachable ();
3987 if (name)
3989 char* tmp = concat (prefix, "_", name, NULL);
3990 new_vect_var = create_tmp_reg (type, tmp);
3991 free (tmp);
3993 else
3994 new_vect_var = create_tmp_reg (type, prefix);
3996 return new_vect_var;
3999 /* Like vect_get_new_vect_var but return an SSA name. */
4001 tree
4002 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4004 const char *prefix;
4005 tree new_vect_var;
4007 switch (var_kind)
4009 case vect_simple_var:
4010 prefix = "vect";
4011 break;
4012 case vect_scalar_var:
4013 prefix = "stmp";
4014 break;
4015 case vect_pointer_var:
4016 prefix = "vectp";
4017 break;
4018 default:
4019 gcc_unreachable ();
4022 if (name)
4024 char* tmp = concat (prefix, "_", name, NULL);
4025 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4026 free (tmp);
4028 else
4029 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4031 return new_vect_var;
4034 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4036 static void
4037 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4039 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4040 int misalign = DR_MISALIGNMENT (dr);
4041 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4042 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4043 else
4044 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4045 DR_TARGET_ALIGNMENT (dr), misalign);
4048 /* Function vect_create_addr_base_for_vector_ref.
4050 Create an expression that computes the address of the first memory location
4051 that will be accessed for a data reference.
4053 Input:
4054 STMT: The statement containing the data reference.
4055 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4056 OFFSET: Optional. If supplied, it is added to the initial address.
4057 LOOP: Specify relative to which loop-nest should the address be computed.
4058 For example, when the dataref is in an inner-loop nested in an
4059 outer-loop that is now being vectorized, LOOP can be either the
4060 outer-loop, or the inner-loop. The first memory location accessed
4061 by the following dataref ('in' points to short):
4063 for (i=0; i<N; i++)
4064 for (j=0; j<M; j++)
4065 s += in[i+j]
4067 is as follows:
4068 if LOOP=i_loop: &in (relative to i_loop)
4069 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4070 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4071 initial address. Unlike OFFSET, which is the number of elements to
4072 be added, BYTE_OFFSET is measured in bytes.
4074 Output:
4075 1. Return an SSA_NAME whose value is the address of the memory location of
4076 the first vector of the data reference.
4077 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4078 these statement(s) which define the returned SSA_NAME.
4080 FORNOW: We are only handling array accesses with step 1. */
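/* A minimal caller sketch (hypothetical, for illustration only):

     gimple_seq new_stmts = NULL;
     tree addr_base
       = vect_create_addr_base_for_vector_ref (stmt, &new_stmts,
                                               NULL_TREE, NULL_TREE);
     if (new_stmts)
       gsi_insert_seq_before (gsi, new_stmts, GSI_SAME_STMT);

   Callers in this file, such as vect_create_data_ref_ptr below, instead
   insert NEW_STMT_LIST on the loop preheader edge when a loop is
   available.  */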
4082 tree
4083 vect_create_addr_base_for_vector_ref (gimple *stmt,
4084 gimple_seq *new_stmt_list,
4085 tree offset,
4086 tree byte_offset)
4088 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4089 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4090 const char *base_name;
4091 tree addr_base;
4092 tree dest;
4093 gimple_seq seq = NULL;
4094 tree vect_ptr_type;
4095 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4096 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4097 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4099 tree data_ref_base = unshare_expr (drb->base_address);
4100 tree base_offset = unshare_expr (drb->offset);
4101 tree init = unshare_expr (drb->init);
4103 if (loop_vinfo)
4104 base_name = get_name (data_ref_base);
4105 else
4107 base_offset = ssize_int (0);
4108 init = ssize_int (0);
4109 base_name = get_name (DR_REF (dr));
4112 /* Create base_offset */
4113 base_offset = size_binop (PLUS_EXPR,
4114 fold_convert (sizetype, base_offset),
4115 fold_convert (sizetype, init));
4117 if (offset)
4119 offset = fold_build2 (MULT_EXPR, sizetype,
4120 fold_convert (sizetype, offset), step);
4121 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4122 base_offset, offset);
4124 if (byte_offset)
4126 byte_offset = fold_convert (sizetype, byte_offset);
4127 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4128 base_offset, byte_offset);
4131 /* base + base_offset */
4132 if (loop_vinfo)
4133 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4134 else
4136 addr_base = build1 (ADDR_EXPR,
4137 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4138 unshare_expr (DR_REF (dr)));
4141 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4142 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4143 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4144 gimple_seq_add_seq (new_stmt_list, seq);
4146 if (DR_PTR_INFO (dr)
4147 && TREE_CODE (addr_base) == SSA_NAME
4148 && !SSA_NAME_PTR_INFO (addr_base))
4150 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4151 if (offset || byte_offset)
4152 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4155 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4158 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4159 dump_printf (MSG_NOTE, "\n");
4162 return addr_base;
4166 /* Function vect_create_data_ref_ptr.
4168 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4169 location accessed in the loop by STMT, along with the def-use update
4170 chain to appropriately advance the pointer through the loop iterations.
4171 Also set aliasing information for the pointer. This pointer is used by
4172 the callers to this function to create a memory reference expression for
4173 vector load/store access.
4175 Input:
4176 1. STMT: a stmt that references memory. Expected to be of the form
4177 GIMPLE_ASSIGN <name, data-ref> or
4178 GIMPLE_ASSIGN <data-ref, name>.
4179 2. AGGR_TYPE: the type of the reference, which should be either a vector
4180 or an array.
4181 3. AT_LOOP: the loop where the vector memref is to be created.
4182 4. OFFSET (optional): an offset to be added to the initial address accessed
4183 by the data-ref in STMT.
4184 5. BSI: location where the new stmts are to be placed if there is no loop
4185 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4186 pointing to the initial address.
4187 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4188 to the initial address accessed by the data-ref in STMT. This is
4189 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4190 in bytes.
4192 Output:
4193 1. Declare a new ptr to vector_type, and have it point to the base of the
4194 data reference (the initial address accessed by the data reference).
4195 For example, for vector of type V8HI, the following code is generated:
4197 v8hi *ap;
4198 ap = (v8hi *)initial_address;
4200 if OFFSET is not supplied:
4201 initial_address = &a[init];
4202 if OFFSET is supplied:
4203 initial_address = &a[init + OFFSET];
4204 if BYTE_OFFSET is supplied:
4205 initial_address = &a[init] + BYTE_OFFSET;
4207 Return the initial_address in INITIAL_ADDRESS.
4209 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4210 update the pointer in each iteration of the loop.
4212 Return the increment stmt that updates the pointer in PTR_INCR.
4214 3. Set INV_P to true if the access pattern of the data reference in the
4215 vectorized loop is invariant. Set it to false otherwise.
4217 4. Return the pointer. */
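/* A minimal caller sketch (hypothetical, for illustration only):

     tree init_addr;
     gimple *ptr_incr = NULL;
     bool inv_p;
     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                   &init_addr, gsi, &ptr_incr,
                                   false, &inv_p, NULL_TREE);

   The returned pointer is typically advanced between the vector accesses
   generated for STMT using bump_vector_ptr, defined later in this file.  */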
4219 tree
4220 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4221 tree offset, tree *initial_address,
4222 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4223 bool only_init, bool *inv_p, tree byte_offset)
4225 const char *base_name;
4226 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4227 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4228 struct loop *loop = NULL;
4229 bool nested_in_vect_loop = false;
4230 struct loop *containing_loop = NULL;
4231 tree aggr_ptr_type;
4232 tree aggr_ptr;
4233 tree new_temp;
4234 gimple_seq new_stmt_list = NULL;
4235 edge pe = NULL;
4236 basic_block new_bb;
4237 tree aggr_ptr_init;
4238 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4239 tree aptr;
4240 gimple_stmt_iterator incr_gsi;
4241 bool insert_after;
4242 tree indx_before_incr, indx_after_incr;
4243 gimple *incr;
4244 tree step;
4245 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4247 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4248 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4250 if (loop_vinfo)
4252 loop = LOOP_VINFO_LOOP (loop_vinfo);
4253 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4254 containing_loop = (gimple_bb (stmt))->loop_father;
4255 pe = loop_preheader_edge (loop);
4257 else
4259 gcc_assert (bb_vinfo);
4260 only_init = true;
4261 *ptr_incr = NULL;
4264 /* Check the step (evolution) of the load in LOOP, and record
4265 whether it's invariant. */
4266 step = vect_dr_behavior (dr)->step;
4267 if (integer_zerop (step))
4268 *inv_p = true;
4269 else
4270 *inv_p = false;
4272 /* Create an expression for the first address accessed by this load
4273 in LOOP. */
4274 base_name = get_name (DR_BASE_ADDRESS (dr));
4276 if (dump_enabled_p ())
4278 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4279 dump_printf_loc (MSG_NOTE, vect_location,
4280 "create %s-pointer variable to type: ",
4281 get_tree_code_name (TREE_CODE (aggr_type)));
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4283 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4284 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4285 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4286 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4287 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4288 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4289 else
4290 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4292 dump_printf (MSG_NOTE, "\n");
4295 /* (1) Create the new aggregate-pointer variable.
4296 Vector and array types inherit the alias set of their component
4297 type by default so we need to use a ref-all pointer if the data
4298 reference does not conflict with the created aggregated data
4299 reference because it is not addressable. */
4300 bool need_ref_all = false;
4301 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4302 get_alias_set (DR_REF (dr))))
4303 need_ref_all = true;
4304 /* Likewise for any of the data references in the stmt group. */
4305 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4307 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4310 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4311 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4312 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4313 get_alias_set (DR_REF (sdr))))
4315 need_ref_all = true;
4316 break;
4318 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4320 while (orig_stmt);
4322 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4323 need_ref_all);
4324 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4327 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4328 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4329 def-use update cycles for the pointer: one relative to the outer-loop
4330 (LOOP), which is what steps (3) and (4) below do. The other is relative
4331 to the inner-loop (which is the inner-most loop containing the dataref),
4332 and this is done by step (5) below.
4334 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4335 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4336 redundant. Steps (3),(4) create the following:
4338 vp0 = &base_addr;
4339 LOOP: vp1 = phi(vp0,vp2)
4342 vp2 = vp1 + step
4343 goto LOOP
4345 If there is an inner-loop nested in loop, then step (5) will also be
4346 applied, and an additional update in the inner-loop will be created:
4348 vp0 = &base_addr;
4349 LOOP: vp1 = phi(vp0,vp2)
4351 inner: vp3 = phi(vp1,vp4)
4352 vp4 = vp3 + inner_step
4353 if () goto inner
4355 vp2 = vp1 + step
4356 if () goto LOOP */
4358 /* (2) Calculate the initial address of the aggregate-pointer, and set
4359 the aggregate-pointer to point to it before the loop. */
4361 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4363 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4364 offset, byte_offset);
4365 if (new_stmt_list)
4367 if (pe)
4369 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4370 gcc_assert (!new_bb);
4372 else
4373 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4376 *initial_address = new_temp;
4377 aggr_ptr_init = new_temp;
4379 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4380 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4381 inner-loop nested in LOOP (during outer-loop vectorization). */
4383 /* No update in loop is required. */
4384 if (only_init && (!loop_vinfo || at_loop == loop))
4385 aptr = aggr_ptr_init;
4386 else
4388 /* The step of the aggregate pointer is the type size. */
4389 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4390 /* One exception to the above is when the scalar step of the load in
4391 LOOP is zero. In this case the step here is also zero. */
4392 if (*inv_p)
4393 iv_step = size_zero_node;
4394 else if (tree_int_cst_sgn (step) == -1)
4395 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4397 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4399 create_iv (aggr_ptr_init,
4400 fold_convert (aggr_ptr_type, iv_step),
4401 aggr_ptr, loop, &incr_gsi, insert_after,
4402 &indx_before_incr, &indx_after_incr);
4403 incr = gsi_stmt (incr_gsi);
4404 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4406 /* Copy the points-to information if it exists. */
4407 if (DR_PTR_INFO (dr))
4409 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4410 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4412 if (ptr_incr)
4413 *ptr_incr = incr;
4415 aptr = indx_before_incr;
4418 if (!nested_in_vect_loop || only_init)
4419 return aptr;
4422 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4423 nested in LOOP, if one exists. */
4425 gcc_assert (nested_in_vect_loop);
4426 if (!only_init)
4428 standard_iv_increment_position (containing_loop, &incr_gsi,
4429 &insert_after);
4430 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4431 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4432 &indx_after_incr);
4433 incr = gsi_stmt (incr_gsi);
4434 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4436 /* Copy the points-to information if it exists. */
4437 if (DR_PTR_INFO (dr))
4439 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4440 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4442 if (ptr_incr)
4443 *ptr_incr = incr;
4445 return indx_before_incr;
4447 else
4448 gcc_unreachable ();
4452 /* Function bump_vector_ptr
4454 Increment a pointer (to a vector type) by vector-size. If requested,
4455 i.e. if PTR-INCR is given, then also connect the new increment stmt
4456 to the existing def-use update-chain of the pointer, by modifying
4457 the PTR_INCR as illustrated below:
4459 The pointer def-use update-chain before this function:
4460 DATAREF_PTR = phi (p_0, p_2)
4461 ....
4462 PTR_INCR: p_2 = DATAREF_PTR + step
4464 The pointer def-use update-chain after this function:
4465 DATAREF_PTR = phi (p_0, p_2)
4466 ....
4467 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4468 ....
4469 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4471 Input:
4472 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4473 in the loop.
4474 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4475 the loop. The increment amount across iterations is expected
4476 to be vector_size.
4477 BSI - location where the new update stmt is to be placed.
4478 STMT - the original scalar memory-access stmt that is being vectorized.
4479 BUMP - optional. The offset by which to bump the pointer. If not given,
4480 the offset is assumed to be vector_size.
4482 Output: Return NEW_DATAREF_PTR as illustrated above.
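/* A minimal usage sketch (hypothetical, for illustration only):

     tree bump = TYPE_SIZE_UNIT (vectype);
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump);

   Passing NULL_TREE for BUMP falls back to the size of STMT's vector
   type, as described above.  */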
4486 tree
4487 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4488 gimple *stmt, tree bump)
4490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4491 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4493 tree update = TYPE_SIZE_UNIT (vectype);
4494 gassign *incr_stmt;
4495 ssa_op_iter iter;
4496 use_operand_p use_p;
4497 tree new_dataref_ptr;
4499 if (bump)
4500 update = bump;
4502 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4503 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4504 else
4505 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4506 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4507 dataref_ptr, update);
4508 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4510 /* Copy the points-to information if it exists. */
4511 if (DR_PTR_INFO (dr))
4513 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4514 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4517 if (!ptr_incr)
4518 return new_dataref_ptr;
4520 /* Update the vector-pointer's cross-iteration increment. */
4521 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4523 tree use = USE_FROM_PTR (use_p);
4525 if (use == dataref_ptr)
4526 SET_USE (use_p, new_dataref_ptr);
4527 else
4528 gcc_assert (tree_int_cst_compare (use, update) == 0);
4531 return new_dataref_ptr;
4535 /* Function vect_create_destination_var.
4537 Create a new temporary of type VECTYPE. */
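/* For illustration only (STMT and VECTYPE are hypothetical here):

     tree vec_dest = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                  vectype);

   With a NULL VECTYPE the function instead creates a scalar temporary of
   SCALAR_DEST's type, using the "stmp" prefix.  */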
4539 tree
4540 vect_create_destination_var (tree scalar_dest, tree vectype)
4542 tree vec_dest;
4543 const char *name;
4544 char *new_name;
4545 tree type;
4546 enum vect_var_kind kind;
4548 kind = vectype
4549 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4550 ? vect_mask_var
4551 : vect_simple_var
4552 : vect_scalar_var;
4553 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4555 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4557 name = get_name (scalar_dest);
4558 if (name)
4559 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4560 else
4561 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4562 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4563 free (new_name);
4565 return vec_dest;
4568 /* Function vect_grouped_store_supported.
4570 Returns TRUE if interleave high and interleave low permutations
4571 are supported, and FALSE otherwise. */
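/* For illustration only, callers typically pair this check with the
   actual interleaving; GROUP_SIZE and the other operands below are
   hypothetical:

     if (vect_grouped_store_supported (vectype, group_size))
       vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
                                 &result_chain);
*/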
4573 bool
4574 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4576 machine_mode mode = TYPE_MODE (vectype);
4578 /* vect_permute_store_chain requires the group size to be equal to 3 or
4579 be a power of two. */
4580 if (count != 3 && exact_log2 (count) == -1)
4582 if (dump_enabled_p ())
4583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4584 "the size of the group of accesses"
4585 " is not a power of 2 or not eqaul to 3\n");
4586 return false;
4589 /* Check that the permutation is supported. */
4590 if (VECTOR_MODE_P (mode))
4592 unsigned int i;
4593 if (count == 3)
4595 unsigned int j0 = 0, j1 = 0, j2 = 0;
4596 unsigned int i, j;
4598 unsigned int nelt;
4599 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4601 if (dump_enabled_p ())
4602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4603 "cannot handle groups of 3 stores for"
4604 " variable-length vectors\n");
4605 return false;
4608 vec_perm_builder sel (nelt, nelt, 1);
4609 sel.quick_grow (nelt);
4610 vec_perm_indices indices;
4611 for (j = 0; j < 3; j++)
4613 int nelt0 = ((3 - j) * nelt) % 3;
4614 int nelt1 = ((3 - j) * nelt + 1) % 3;
4615 int nelt2 = ((3 - j) * nelt + 2) % 3;
4616 for (i = 0; i < nelt; i++)
4618 if (3 * i + nelt0 < nelt)
4619 sel[3 * i + nelt0] = j0++;
4620 if (3 * i + nelt1 < nelt)
4621 sel[3 * i + nelt1] = nelt + j1++;
4622 if (3 * i + nelt2 < nelt)
4623 sel[3 * i + nelt2] = 0;
4625 indices.new_vector (sel, 2, nelt);
4626 if (!can_vec_perm_const_p (mode, indices))
4628 if (dump_enabled_p ())
4629 dump_printf (MSG_MISSED_OPTIMIZATION,
4630 "permutation op not supported by target.\n");
4631 return false;
4634 for (i = 0; i < nelt; i++)
4636 if (3 * i + nelt0 < nelt)
4637 sel[3 * i + nelt0] = 3 * i + nelt0;
4638 if (3 * i + nelt1 < nelt)
4639 sel[3 * i + nelt1] = 3 * i + nelt1;
4640 if (3 * i + nelt2 < nelt)
4641 sel[3 * i + nelt2] = nelt + j2++;
4643 indices.new_vector (sel, 2, nelt);
4644 if (!can_vec_perm_const_p (mode, indices))
4646 if (dump_enabled_p ())
4647 dump_printf (MSG_MISSED_OPTIMIZATION,
4648 "permutation op not supported by target.\n");
4649 return false;
4652 return true;
4654 else
4656 /* If length is not equal to 3 then only a power of 2 is supported. */
4657 gcc_assert (pow2p_hwi (count));
4658 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4660 /* The encoding has 2 interleaved stepped patterns. */
4661 vec_perm_builder sel (nelt, 2, 3);
4662 sel.quick_grow (6);
4663 for (i = 0; i < 3; i++)
4665 sel[i * 2] = i;
4666 sel[i * 2 + 1] = i + nelt;
4668 vec_perm_indices indices (sel, 2, nelt);
4669 if (can_vec_perm_const_p (mode, indices))
4671 for (i = 0; i < 6; i++)
4672 sel[i] += exact_div (nelt, 2);
4673 indices.new_vector (sel, 2, nelt);
4674 if (can_vec_perm_const_p (mode, indices))
4675 return true;
4680 if (dump_enabled_p ())
4681 dump_printf (MSG_MISSED_OPTIMIZATION,
4682 "permutaion op not supported by target.\n");
4683 return false;
4687 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4688 type VECTYPE. */
4690 bool
4691 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4693 return vect_lanes_optab_supported_p ("vec_store_lanes",
4694 vec_store_lanes_optab,
4695 vectype, count);
4699 /* Function vect_permute_store_chain.
4701 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4702 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4703 the data correctly for the stores. Return the final references for stores
4704 in RESULT_CHAIN.
4706 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4707 The input is 4 vectors each containing 8 elements. We assign a number to
4708 each element, the input sequence is:
4710 1st vec: 0 1 2 3 4 5 6 7
4711 2nd vec: 8 9 10 11 12 13 14 15
4712 3rd vec: 16 17 18 19 20 21 22 23
4713 4th vec: 24 25 26 27 28 29 30 31
4715 The output sequence should be:
4717 1st vec: 0 8 16 24 1 9 17 25
4718 2nd vec: 2 10 18 26 3 11 19 27
4719 3rd vec: 4 12 20 28 5 13 21 29
4720 4th vec: 6 14 22 30 7 15 23 31
4722 i.e., we interleave the contents of the four vectors in their order.
4724 We use interleave_high/low instructions to create such output. The input of
4725 each interleave_high/low operation is two vectors:
4726 1st vec 2nd vec
4727 0 1 2 3 4 5 6 7
4728 the even elements of the result vector are obtained left-to-right from the
4729 high/low elements of the first vector. The odd elements of the result are
4730 obtained left-to-right from the high/low elements of the second vector.
4731 The output of interleave_high will be: 0 4 1 5
4732 and of interleave_low: 2 6 3 7
4735 The permutation is done in log LENGTH stages. In each stage interleave_high
4736 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4737 where the first argument is taken from the first half of DR_CHAIN and the
4738 second argument from its second half.
4739 In our example,
4741 I1: interleave_high (1st vec, 3rd vec)
4742 I2: interleave_low (1st vec, 3rd vec)
4743 I3: interleave_high (2nd vec, 4th vec)
4744 I4: interleave_low (2nd vec, 4th vec)
4746 The output for the first stage is:
4748 I1: 0 16 1 17 2 18 3 19
4749 I2: 4 20 5 21 6 22 7 23
4750 I3: 8 24 9 25 10 26 11 27
4751 I4: 12 28 13 29 14 30 15 31
4753 The output of the second stage, i.e. the final result is:
4755 I1: 0 8 16 24 1 9 17 25
4756 I2: 2 10 18 26 3 11 19 27
4757 I3: 4 12 20 28 5 13 21 29
4758 I4: 6 14 22 30 7 15 23 31. */
4760 void
4761 vect_permute_store_chain (vec<tree> dr_chain,
4762 unsigned int length,
4763 gimple *stmt,
4764 gimple_stmt_iterator *gsi,
4765 vec<tree> *result_chain)
4767 tree vect1, vect2, high, low;
4768 gimple *perm_stmt;
4769 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4770 tree perm_mask_low, perm_mask_high;
4771 tree data_ref;
4772 tree perm3_mask_low, perm3_mask_high;
4773 unsigned int i, j, n, log_length = exact_log2 (length);
4775 result_chain->quick_grow (length);
4776 memcpy (result_chain->address (), dr_chain.address (),
4777 length * sizeof (tree));
4779 if (length == 3)
4781 /* vect_grouped_store_supported ensures that this is constant. */
4782 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4783 unsigned int j0 = 0, j1 = 0, j2 = 0;
4785 vec_perm_builder sel (nelt, nelt, 1);
4786 sel.quick_grow (nelt);
4787 vec_perm_indices indices;
4788 for (j = 0; j < 3; j++)
4790 int nelt0 = ((3 - j) * nelt) % 3;
4791 int nelt1 = ((3 - j) * nelt + 1) % 3;
4792 int nelt2 = ((3 - j) * nelt + 2) % 3;
4794 for (i = 0; i < nelt; i++)
4796 if (3 * i + nelt0 < nelt)
4797 sel[3 * i + nelt0] = j0++;
4798 if (3 * i + nelt1 < nelt)
4799 sel[3 * i + nelt1] = nelt + j1++;
4800 if (3 * i + nelt2 < nelt)
4801 sel[3 * i + nelt2] = 0;
4803 indices.new_vector (sel, 2, nelt);
4804 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4806 for (i = 0; i < nelt; i++)
4808 if (3 * i + nelt0 < nelt)
4809 sel[3 * i + nelt0] = 3 * i + nelt0;
4810 if (3 * i + nelt1 < nelt)
4811 sel[3 * i + nelt1] = 3 * i + nelt1;
4812 if (3 * i + nelt2 < nelt)
4813 sel[3 * i + nelt2] = nelt + j2++;
4815 indices.new_vector (sel, 2, nelt);
4816 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4818 vect1 = dr_chain[0];
4819 vect2 = dr_chain[1];
4821 /* Create interleaving stmt:
4822 low = VEC_PERM_EXPR <vect1, vect2,
4823 {j, nelt, *, j + 1, nelt + j + 1, *,
4824 j + 2, nelt + j + 2, *, ...}> */
4825 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4826 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4827 vect2, perm3_mask_low);
4828 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4830 vect1 = data_ref;
4831 vect2 = dr_chain[2];
4832 /* Create interleaving stmt:
4833 low = VEC_PERM_EXPR <vect1, vect2,
4834 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4835 6, 7, nelt + j + 2, ...}> */
4836 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4837 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4838 vect2, perm3_mask_high);
4839 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4840 (*result_chain)[j] = data_ref;
4843 else
4845 /* If length is not equal to 3 then only a power of 2 is supported. */
4846 gcc_assert (pow2p_hwi (length));
4848 /* The encoding has 2 interleaved stepped patterns. */
4849 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
4850 vec_perm_builder sel (nelt, 2, 3);
4851 sel.quick_grow (6);
4852 for (i = 0; i < 3; i++)
4854 sel[i * 2] = i;
4855 sel[i * 2 + 1] = i + nelt;
4857 vec_perm_indices indices (sel, 2, nelt);
4858 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4860 for (i = 0; i < 6; i++)
4861 sel[i] += exact_div (nelt, 2);
4862 indices.new_vector (sel, 2, nelt);
4863 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4865 for (i = 0, n = log_length; i < n; i++)
4867 for (j = 0; j < length/2; j++)
4869 vect1 = dr_chain[j];
4870 vect2 = dr_chain[j+length/2];
4872 /* Create interleaving stmt:
4873 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4874 ...}> */
4875 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4876 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4877 vect2, perm_mask_high);
4878 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4879 (*result_chain)[2*j] = high;
4881 /* Create interleaving stmt:
4882 low = VEC_PERM_EXPR <vect1, vect2,
4883 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4884 ...}> */
4885 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4886 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4887 vect2, perm_mask_low);
4888 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4889 (*result_chain)[2*j+1] = low;
4891 memcpy (dr_chain.address (), result_chain->address (),
4892 length * sizeof (tree));
4897 /* Function vect_setup_realignment
4899 This function is called when vectorizing an unaligned load using
4900 the dr_explicit_realign[_optimized] scheme.
4901 This function generates the following code at the loop prolog:
4903 p = initial_addr;
4904 x msq_init = *(floor(p)); # prolog load
4905 realignment_token = call target_builtin;
4906 loop:
4907 x msq = phi (msq_init, ---)
4909 The stmts marked with x are generated only for the case of
4910 dr_explicit_realign_optimized.
4912 The code above sets up a new (vector) pointer, pointing to the first
4913 location accessed by STMT, and a "floor-aligned" load using that pointer.
4914 It also generates code to compute the "realignment-token" (if the relevant
4915 target hook was defined), and creates a phi-node at the loop-header bb
4916 whose arguments are the result of the prolog-load (created by this
4917 function) and the result of a load that takes place in the loop (to be
4918 created by the caller to this function).
4920 For the case of dr_explicit_realign_optimized:
4921 The caller to this function uses the phi-result (msq) to create the
4922 realignment code inside the loop, and sets up the missing phi argument,
4923 as follows:
4924 loop:
4925 msq = phi (msq_init, lsq)
4926 lsq = *(floor(p')); # load in loop
4927 result = realign_load (msq, lsq, realignment_token);
4929 For the case of dr_explicit_realign:
4930 loop:
4931 msq = *(floor(p)); # load in loop
4932 p' = p + (VS-1);
4933 lsq = *(floor(p')); # load in loop
4934 result = realign_load (msq, lsq, realignment_token);
4936 Input:
4937 STMT - (scalar) load stmt to be vectorized. This load accesses
4938 a memory location that may be unaligned.
4939 BSI - place where new code is to be inserted.
4940 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4941 is used.
4943 Output:
4944 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4945 target hook, if defined.
4946 Return value - the result of the loop-header phi node. */
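/* A minimal caller sketch (hypothetical, for illustration only):

     tree realignment_token;
     struct loop *at_loop;
     tree msq = vect_setup_realignment (stmt, gsi, &realignment_token,
                                        dr_explicit_realign_optimized,
                                        NULL_TREE, &at_loop);

   The caller then emits the in-loop load (lsq) and combines it with msq
   and realignment_token via a realign_load, as in the pseudo code above.  */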
4948 tree
4949 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4950 tree *realignment_token,
4951 enum dr_alignment_support alignment_support_scheme,
4952 tree init_addr,
4953 struct loop **at_loop)
4955 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4958 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4959 struct loop *loop = NULL;
4960 edge pe = NULL;
4961 tree scalar_dest = gimple_assign_lhs (stmt);
4962 tree vec_dest;
4963 gimple *inc;
4964 tree ptr;
4965 tree data_ref;
4966 basic_block new_bb;
4967 tree msq_init = NULL_TREE;
4968 tree new_temp;
4969 gphi *phi_stmt;
4970 tree msq = NULL_TREE;
4971 gimple_seq stmts = NULL;
4972 bool inv_p;
4973 bool compute_in_loop = false;
4974 bool nested_in_vect_loop = false;
4975 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4976 struct loop *loop_for_initial_load = NULL;
4978 if (loop_vinfo)
4980 loop = LOOP_VINFO_LOOP (loop_vinfo);
4981 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4984 gcc_assert (alignment_support_scheme == dr_explicit_realign
4985 || alignment_support_scheme == dr_explicit_realign_optimized);
4987 /* We need to generate three things:
4988 1. the misalignment computation
4989 2. the extra vector load (for the optimized realignment scheme).
4990 3. the phi node for the two vectors from which the realignment is
4991 done (for the optimized realignment scheme). */
4993 /* 1. Determine where to generate the misalignment computation.
4995 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4996 calculation will be generated by this function, outside the loop (in the
4997 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4998 caller, inside the loop.
5000 Background: If the misalignment remains fixed throughout the iterations of
5001 the loop, then both realignment schemes are applicable, and also the
5002 misalignment computation can be done outside LOOP. This is because we are
5003 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5004 are a multiple of VS (the Vector Size), and therefore the misalignment in
5005 different vectorized LOOP iterations is always the same.
5006 The problem arises only if the memory access is in an inner-loop nested
5007 inside LOOP, which is now being vectorized using outer-loop vectorization.
5008 This is the only case when the misalignment of the memory access may not
5009 remain fixed throughout the iterations of the inner-loop (as explained in
5010 detail in vect_supportable_dr_alignment). In this case, not only is the
5011 optimized realignment scheme not applicable, but also the misalignment
5012 computation (and generation of the realignment token that is passed to
5013 REALIGN_LOAD) have to be done inside the loop.
5015 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5016 or not, which in turn determines if the misalignment is computed inside
5017 the inner-loop, or outside LOOP. */
5019 if (init_addr != NULL_TREE || !loop_vinfo)
5021 compute_in_loop = true;
5022 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5026 /* 2. Determine where to generate the extra vector load.
5028 For the optimized realignment scheme, instead of generating two vector
5029 loads in each iteration, we generate a single extra vector load in the
5030 preheader of the loop, and in each iteration reuse the result of the
5031 vector load from the previous iteration. In case the memory access is in
5032 an inner-loop nested inside LOOP, which is now being vectorized using
5033 outer-loop vectorization, we need to determine whether this initial vector
5034 load should be generated at the preheader of the inner-loop, or can be
5035 generated at the preheader of LOOP. If the memory access has no evolution
5036 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5037 to be generated inside LOOP (in the preheader of the inner-loop). */
5039 if (nested_in_vect_loop)
5041 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5042 bool invariant_in_outerloop =
5043 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5044 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5046 else
5047 loop_for_initial_load = loop;
5048 if (at_loop)
5049 *at_loop = loop_for_initial_load;
5051 if (loop_for_initial_load)
5052 pe = loop_preheader_edge (loop_for_initial_load);
5054 /* 3. For the case of the optimized realignment, create the first vector
5055 load at the loop preheader. */
5057 if (alignment_support_scheme == dr_explicit_realign_optimized)
5059 /* Create msq_init = *(floor(p1)) in the loop preheader */
5060 gassign *new_stmt;
5062 gcc_assert (!compute_in_loop);
5063 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5064 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5065 NULL_TREE, &init_addr, NULL, &inc,
5066 true, &inv_p);
5067 if (TREE_CODE (ptr) == SSA_NAME)
5068 new_temp = copy_ssa_name (ptr);
5069 else
5070 new_temp = make_ssa_name (TREE_TYPE (ptr));
5071 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5072 new_stmt = gimple_build_assign
5073 (new_temp, BIT_AND_EXPR, ptr,
5074 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5075 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5076 gcc_assert (!new_bb);
5077 data_ref
5078 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5079 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5080 new_stmt = gimple_build_assign (vec_dest, data_ref);
5081 new_temp = make_ssa_name (vec_dest, new_stmt);
5082 gimple_assign_set_lhs (new_stmt, new_temp);
5083 if (pe)
5085 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5086 gcc_assert (!new_bb);
5088 else
5089 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5091 msq_init = gimple_assign_lhs (new_stmt);
5094 /* 4. Create realignment token using a target builtin, if available.
5095 It is done either inside the containing loop, or before LOOP (as
5096 determined above). */
5098 if (targetm.vectorize.builtin_mask_for_load)
5100 gcall *new_stmt;
5101 tree builtin_decl;
5103 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5104 if (!init_addr)
5106 /* Generate the INIT_ADDR computation outside LOOP. */
5107 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5108 NULL_TREE);
5109 if (loop)
5111 pe = loop_preheader_edge (loop);
5112 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5113 gcc_assert (!new_bb);
5115 else
5116 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5119 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5120 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5121 vec_dest =
5122 vect_create_destination_var (scalar_dest,
5123 gimple_call_return_type (new_stmt));
5124 new_temp = make_ssa_name (vec_dest, new_stmt);
5125 gimple_call_set_lhs (new_stmt, new_temp);
5127 if (compute_in_loop)
5128 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5129 else
5131 /* Generate the misalignment computation outside LOOP. */
5132 pe = loop_preheader_edge (loop);
5133 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5134 gcc_assert (!new_bb);
5137 *realignment_token = gimple_call_lhs (new_stmt);
5139 /* The result of the CALL_EXPR to this builtin is determined from
5140 the value of the parameter and no global variables are touched
5141 which makes the builtin a "const" function. Requiring the
5142 builtin to have the "const" attribute makes it unnecessary
5143 to call mark_call_clobbered. */
5144 gcc_assert (TREE_READONLY (builtin_decl));
5147 if (alignment_support_scheme == dr_explicit_realign)
5148 return msq;
5150 gcc_assert (!compute_in_loop);
5151 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5154 /* 5. Create msq = phi <msq_init, lsq> in loop */
5156 pe = loop_preheader_edge (containing_loop);
5157 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5158 msq = make_ssa_name (vec_dest);
5159 phi_stmt = create_phi_node (msq, containing_loop->header);
5160 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5162 return msq;
5166 /* Function vect_grouped_load_supported.
5168 COUNT is the size of the load group (the number of statements plus the
5169 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5170 only one statement, with a gap of COUNT - 1.
5172 Returns true if a suitable permute exists. */
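/* For illustration only, callers typically pair this check with the
   actual permutation; COUNT and the other operands below are
   hypothetical:

     if (vect_grouped_load_supported (vectype, single_element_p, count))
       vect_permute_load_chain (dr_chain, count, stmt, gsi, &result_chain);
*/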
5174 bool
5175 vect_grouped_load_supported (tree vectype, bool single_element_p,
5176 unsigned HOST_WIDE_INT count)
5178 machine_mode mode = TYPE_MODE (vectype);
5180 /* If this is single-element interleaving with an element distance
5181 that leaves unused vector loads around punt - we at least create
5182 very sub-optimal code in that case (and blow up memory,
5183 see PR65518). */
5184 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5186 if (dump_enabled_p ())
5187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5188 "single-element interleaving not supported "
5189 "for not adjacent vector loads\n");
5190 return false;
5193 /* vect_permute_load_chain requires the group size to be equal to 3 or
5194 be a power of two. */
5195 if (count != 3 && exact_log2 (count) == -1)
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "the size of the group of accesses"
5200 " is not a power of 2 or not equal to 3\n");
5201 return false;
5204 /* Check that the permutation is supported. */
5205 if (VECTOR_MODE_P (mode))
5207 unsigned int i, j;
5208 if (count == 3)
5210 unsigned int nelt;
5211 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5213 if (dump_enabled_p ())
5214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5215 "cannot handle groups of 3 loads for"
5216 " variable-length vectors\n");
5217 return false;
5220 vec_perm_builder sel (nelt, nelt, 1);
5221 sel.quick_grow (nelt);
5222 vec_perm_indices indices;
5223 unsigned int k;
5224 for (k = 0; k < 3; k++)
5226 for (i = 0; i < nelt; i++)
5227 if (3 * i + k < 2 * nelt)
5228 sel[i] = 3 * i + k;
5229 else
5230 sel[i] = 0;
5231 indices.new_vector (sel, 2, nelt);
5232 if (!can_vec_perm_const_p (mode, indices))
5234 if (dump_enabled_p ())
5235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5236 "shuffle of 3 loads is not supported by"
5237 " target\n");
5238 return false;
5240 for (i = 0, j = 0; i < nelt; i++)
5241 if (3 * i + k < 2 * nelt)
5242 sel[i] = i;
5243 else
5244 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5245 indices.new_vector (sel, 2, nelt);
5246 if (!can_vec_perm_const_p (mode, indices))
5248 if (dump_enabled_p ())
5249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250 "shuffle of 3 loads is not supported by"
5251 " target\n");
5252 return false;
5255 return true;
5257 else
5259 /* If length is not equal to 3 then only a power of 2 is supported. */
5260 gcc_assert (pow2p_hwi (count));
5261 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5263 /* The encoding has a single stepped pattern. */
5264 vec_perm_builder sel (nelt, 1, 3);
5265 sel.quick_grow (3);
5266 for (i = 0; i < 3; i++)
5267 sel[i] = i * 2;
5268 vec_perm_indices indices (sel, 2, nelt);
5269 if (can_vec_perm_const_p (mode, indices))
5271 for (i = 0; i < 3; i++)
5272 sel[i] = i * 2 + 1;
5273 indices.new_vector (sel, 2, nelt);
5274 if (can_vec_perm_const_p (mode, indices))
5275 return true;
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5282 "extract even/odd not supported by target\n");
5283 return false;
5286 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5287 type VECTYPE. */
5289 bool
5290 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5292 return vect_lanes_optab_supported_p ("vec_load_lanes",
5293 vec_load_lanes_optab,
5294 vectype, count);
5297 /* Function vect_permute_load_chain.
5299 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5300 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5301 the input data correctly. Return the final references for loads in
5302 RESULT_CHAIN.
5304 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5305 The input is 4 vectors each containing 8 elements. We assign a number to each
5306 element, the input sequence is:
5308 1st vec: 0 1 2 3 4 5 6 7
5309 2nd vec: 8 9 10 11 12 13 14 15
5310 3rd vec: 16 17 18 19 20 21 22 23
5311 4th vec: 24 25 26 27 28 29 30 31
5313 The output sequence should be:
5315 1st vec: 0 4 8 12 16 20 24 28
5316 2nd vec: 1 5 9 13 17 21 25 29
5317 3rd vec: 2 6 10 14 18 22 26 30
5318 4th vec: 3 7 11 15 19 23 27 31
5320 i.e., the first output vector should contain the first elements of each
5321 interleaving group, etc.
5323 We use extract_even/odd instructions to create such output. The input of
5324 each extract_even/odd operation is two vectors
5325 1st vec 2nd vec
5326 0 1 2 3 4 5 6 7
5328 and the output is the vector of extracted even/odd elements. The output of
5329 extract_even will be: 0 2 4 6
5330 and of extract_odd: 1 3 5 7
5333 The permutation is done in log LENGTH stages. In each stage extract_even
5334 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5335 their order. In our example,
5337 E1: extract_even (1st vec, 2nd vec)
5338 E2: extract_odd (1st vec, 2nd vec)
5339 E3: extract_even (3rd vec, 4th vec)
5340 E4: extract_odd (3rd vec, 4th vec)
5342 The output for the first stage will be:
5344 E1: 0 2 4 6 8 10 12 14
5345 E2: 1 3 5 7 9 11 13 15
5346 E3: 16 18 20 22 24 26 28 30
5347 E4: 17 19 21 23 25 27 29 31
5349 In order to proceed and create the correct sequence for the next stage (or
5350 for the correct output, if the second stage is the last one, as in our
5351 example), we first put the output of extract_even operation and then the
5352 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5353 The input for the second stage is:
5355 1st vec (E1): 0 2 4 6 8 10 12 14
5356 2nd vec (E3): 16 18 20 22 24 26 28 30
5357 3rd vec (E2): 1 3 5 7 9 11 13 15
5358 4th vec (E4): 17 19 21 23 25 27 29 31
5360 The output of the second stage:
5362 E1: 0 4 8 12 16 20 24 28
5363 E2: 2 6 10 14 18 22 26 30
5364 E3: 1 5 9 13 17 21 25 29
5365 E4: 3 7 11 15 19 23 27 31
5367 And RESULT_CHAIN after reordering:
5369 1st vec (E1): 0 4 8 12 16 20 24 28
5370 2nd vec (E3): 1 5 9 13 17 21 25 29
5371 3rd vec (E2): 2 6 10 14 18 22 26 30
5372 4th vec (E4): 3 7 11 15 19 23 27 31. */
5374 static void
5375 vect_permute_load_chain (vec<tree> dr_chain,
5376 unsigned int length,
5377 gimple *stmt,
5378 gimple_stmt_iterator *gsi,
5379 vec<tree> *result_chain)
5381 tree data_ref, first_vect, second_vect;
5382 tree perm_mask_even, perm_mask_odd;
5383 tree perm3_mask_low, perm3_mask_high;
5384 gimple *perm_stmt;
5385 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5386 unsigned int i, j, log_length = exact_log2 (length);
5388 result_chain->quick_grow (length);
5389 memcpy (result_chain->address (), dr_chain.address (),
5390 length * sizeof (tree));
5392 if (length == 3)
5394 /* vect_grouped_load_supported ensures that this is constant. */
5395 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5396 unsigned int k;
5398 vec_perm_builder sel (nelt, nelt, 1);
5399 sel.quick_grow (nelt);
5400 vec_perm_indices indices;
5401 for (k = 0; k < 3; k++)
5403 for (i = 0; i < nelt; i++)
5404 if (3 * i + k < 2 * nelt)
5405 sel[i] = 3 * i + k;
5406 else
5407 sel[i] = 0;
5408 indices.new_vector (sel, 2, nelt);
5409 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5411 for (i = 0, j = 0; i < nelt; i++)
5412 if (3 * i + k < 2 * nelt)
5413 sel[i] = i;
5414 else
5415 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5416 indices.new_vector (sel, 2, nelt);
5417 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5419 first_vect = dr_chain[0];
5420 second_vect = dr_chain[1];
5422 /* Create interleaving stmt (low part of):
5423 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5424 ...}> */
5425 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5426 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5427 second_vect, perm3_mask_low);
5428 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5430 /* Create interleaving stmt (high part of):
5431 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5432 ...}> */
5433 first_vect = data_ref;
5434 second_vect = dr_chain[2];
5435 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5436 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5437 second_vect, perm3_mask_high);
5438 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5439 (*result_chain)[k] = data_ref;
5442 else
5444 /* If LENGTH is not equal to 3, then only a power of 2 is supported. */
5445 gcc_assert (pow2p_hwi (length));
5447 /* The encoding has a single stepped pattern. */
5448 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5449 vec_perm_builder sel (nelt, 1, 3);
5450 sel.quick_grow (3);
5451 for (i = 0; i < 3; ++i)
5452 sel[i] = i * 2;
5453 vec_perm_indices indices (sel, 2, nelt);
5454 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5456 for (i = 0; i < 3; ++i)
5457 sel[i] = i * 2 + 1;
5458 indices.new_vector (sel, 2, nelt);
5459 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5461 for (i = 0; i < log_length; i++)
5463 for (j = 0; j < length; j += 2)
5465 first_vect = dr_chain[j];
5466 second_vect = dr_chain[j+1];
5468 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5469 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5470 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5471 first_vect, second_vect,
5472 perm_mask_even);
5473 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5474 (*result_chain)[j/2] = data_ref;
5476 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5477 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5478 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5479 first_vect, second_vect,
5480 perm_mask_odd);
5481 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5482 (*result_chain)[j/2+length/2] = data_ref;
5484 memcpy (dr_chain.address (), result_chain->address (),
5485 length * sizeof (tree));
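/* A standalone sketch (plain C, not GCC internals) that simulates the
   even/odd permutation stages documented before vect_permute_load_chain,
   using arrays of integers in place of vectors.  With LENGTH = 4 and
   NELT = 8 it reproduces the final order shown in that comment:
   {0 4 8 ... 28}, {1 5 ... 29}, {2 6 ... 30}, {3 7 ... 31}.  */

#include <stdio.h>
#include <string.h>

#define LENGTH 4
#define NELT 8

int
main (void)
{
  unsigned int chain[LENGTH][NELT], result[LENGTH][NELT];
  unsigned int i, j, k, stage, log_length = 2;	/* log2 (LENGTH) */

  /* Number the elements 0..31 as in the comment's example.  */
  for (i = 0; i < LENGTH; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;

  for (stage = 0; stage < log_length; stage++)
    {
      for (j = 0; j < LENGTH; j += 2)
	{
	  /* extract_even / extract_odd of the concatenation of
	     chain[j] and chain[j + 1].  */
	  for (k = 0; k < NELT; k++)
	    {
	      unsigned int even = 2 * k, odd = 2 * k + 1;
	      result[j / 2][k]
		= even < NELT ? chain[j][even] : chain[j + 1][even - NELT];
	      result[j / 2 + LENGTH / 2][k]
		= odd < NELT ? chain[j][odd] : chain[j + 1][odd - NELT];
	    }
	}
      /* Like the copy of RESULT_CHAIN back into DR_CHAIN above.  */
      memcpy (chain, result, sizeof (chain));
    }

  for (i = 0; i < LENGTH; i++)
    {
      for (j = 0; j < NELT; j++)
	printf (" %2u", chain[i][j]);
      printf ("\n");
    }
  return 0;
}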
5490 /* Function vect_shift_permute_load_chain.
5492 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5493 sequence of stmts to reorder the input data accordingly.
5494 Return the final references for loads in RESULT_CHAIN.
5495 Return true if successful, false otherwise.
5497 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5498 The input is 3 vectors each containing 8 elements. We assign a
5499 number to each element; the input sequence is:
5501 1st vec: 0 1 2 3 4 5 6 7
5502 2nd vec: 8 9 10 11 12 13 14 15
5503 3rd vec: 16 17 18 19 20 21 22 23
5505 The output sequence should be:
5507 1st vec: 0 3 6 9 12 15 18 21
5508 2nd vec: 1 4 7 10 13 16 19 22
5509 3rd vec: 2 5 8 11 14 17 20 23
5511 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5513 First we shuffle all 3 vectors to get correct elements order:
5515 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5516 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5517 3rd vec: (16 19 22) (17 20 23) (18 21)
5519 Next we combine and shift the vectors 3 times:
5521 1st step:
5522 shift right by 6 the concatenation of:
5523 "1st vec" and "2nd vec"
5524 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5525 "2nd vec" and "3rd vec"
5526 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5527 "3rd vec" and "1st vec"
5528 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5529 | New vectors |
5531 So that now new vectors are:
5533 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5534 2nd vec: (10 13) (16 19 22) (17 20 23)
5535 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5537 2nd step:
5538 shift right by 5 the concatenation of:
5539 "1st vec" and "3rd vec"
5540 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5541 "2nd vec" and "1st vec"
5542 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5543 "3rd vec" and "2nd vec"
5544 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5545 | New vectors |
5547 So that now new vectors are:
5549 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5550 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5551 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5553 3rd step:
5554 shift right by 5 the concatenation of:
5555 "1st vec" and "1st vec"
5556 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5557 shift right by 3 the concatenation of:
5558 "2nd vec" and "2nd vec"
5559 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5560 | New vectors |
5562 So that now all vectors are READY:
5563 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5564 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5565 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5567 This algorithm is faster than the one in vect_permute_load_chain if:
5568 1. "shift of a concatenation" is faster than general permutation.
5569 This is usually so.
5570 2. The TARGET machine can't execute vector instructions in parallel.
5571 This matters because each step of this algorithm depends on the previous
5572 one, while the algorithm in vect_permute_load_chain is much more parallel.
5574 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5577 static bool
5578 vect_shift_permute_load_chain (vec<tree> dr_chain,
5579 unsigned int length,
5580 gimple *stmt,
5581 gimple_stmt_iterator *gsi,
5582 vec<tree> *result_chain)
5584 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5585 tree perm2_mask1, perm2_mask2, perm3_mask;
5586 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5587 gimple *perm_stmt;
5589 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5590 unsigned int i;
5591 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5594 unsigned HOST_WIDE_INT nelt, vf;
5595 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5596 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5597 /* Not supported for variable-length vectors. */
5598 return false;
5600 vec_perm_builder sel (nelt, nelt, 1);
5601 sel.quick_grow (nelt);
5603 result_chain->quick_grow (length);
5604 memcpy (result_chain->address (), dr_chain.address (),
5605 length * sizeof (tree));
5607 if (pow2p_hwi (length) && vf > 4)
5609 unsigned int j, log_length = exact_log2 (length);
5610 for (i = 0; i < nelt / 2; ++i)
5611 sel[i] = i * 2;
5612 for (i = 0; i < nelt / 2; ++i)
5613 sel[nelt / 2 + i] = i * 2 + 1;
5614 vec_perm_indices indices (sel, 2, nelt);
5615 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5619 "shuffle of 2 fields structure is not \
5620 supported by target\n");
5621 return false;
5623 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5625 for (i = 0; i < nelt / 2; ++i)
5626 sel[i] = i * 2 + 1;
5627 for (i = 0; i < nelt / 2; ++i)
5628 sel[nelt / 2 + i] = i * 2;
5629 indices.new_vector (sel, 2, nelt);
5630 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5632 if (dump_enabled_p ())
5633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5634 "shuffle of 2 fields structure is not \
5635 supported by target\n");
5636 return false;
5638 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5640 /* Generating permutation constant to shift all elements.
5641 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5642 for (i = 0; i < nelt; i++)
5643 sel[i] = nelt / 2 + i;
5644 indices.new_vector (sel, 2, nelt);
5645 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "shift permutation is not supported by target\n");
5650 return false;
5652 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5654 /* Generating permutation constant to select a vector from the two.
5655 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5656 for (i = 0; i < nelt / 2; i++)
5657 sel[i] = i;
5658 for (i = nelt / 2; i < nelt; i++)
5659 sel[i] = nelt + i;
5660 indices.new_vector (sel, 2, nelt);
5661 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "select is not supported by target\n");
5666 return false;
5668 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5670 for (i = 0; i < log_length; i++)
5672 for (j = 0; j < length; j += 2)
5674 first_vect = dr_chain[j];
5675 second_vect = dr_chain[j + 1];
5677 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5678 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5679 first_vect, first_vect,
5680 perm2_mask1);
5681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5682 vect[0] = data_ref;
5684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5686 second_vect, second_vect,
5687 perm2_mask2);
5688 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5689 vect[1] = data_ref;
5691 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5692 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5693 vect[0], vect[1], shift1_mask);
5694 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5695 (*result_chain)[j/2 + length/2] = data_ref;
5697 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5698 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5699 vect[0], vect[1], select_mask);
5700 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5701 (*result_chain)[j/2] = data_ref;
5703 memcpy (dr_chain.address (), result_chain->address (),
5704 length * sizeof (tree));
5706 return true;
5708 if (length == 3 && vf > 2)
5710 unsigned int k = 0, l = 0;
5712 /* Generating permutation constant to get all elements in the right order.
5713 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5714 for (i = 0; i < nelt; i++)
5716 if (3 * k + (l % 3) >= nelt)
5718 k = 0;
5719 l += (3 - (nelt % 3));
5721 sel[i] = 3 * k + (l % 3);
5722 k++;
5724 vec_perm_indices indices (sel, 2, nelt);
5725 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5727 if (dump_enabled_p ())
5728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729 "shuffle of 3 fields structure is not \
5730 supported by target\n");
5731 return false;
5733 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5735 /* Generating permutation constant to shift all elements.
5736 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5737 for (i = 0; i < nelt; i++)
5738 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5739 indices.new_vector (sel, 2, nelt);
5740 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5742 if (dump_enabled_p ())
5743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5744 "shift permutation is not supported by target\n");
5745 return false;
5747 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5749 /* Generating permutation constant to shift all elements.
5750 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5751 for (i = 0; i < nelt; i++)
5752 sel[i] = 2 * (nelt / 3) + 1 + i;
5753 indices.new_vector (sel, 2, nelt);
5754 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "shift permutation is not supported by target\n");
5759 return false;
5761 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5763 /* Generating permutation constant to shift all elements.
5764 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5765 for (i = 0; i < nelt; i++)
5766 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5767 indices.new_vector (sel, 2, nelt);
5768 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5770 if (dump_enabled_p ())
5771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5772 "shift permutation is not supported by target\n");
5773 return false;
5775 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5777 /* Generating permutation constant to shift all elements.
5778 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5779 for (i = 0; i < nelt; i++)
5780 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5781 indices.new_vector (sel, 2, nelt);
5782 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786 "shift permutation is not supported by target\n");
5787 return false;
5789 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5791 for (k = 0; k < 3; k++)
5793 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5794 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5795 dr_chain[k], dr_chain[k],
5796 perm3_mask);
5797 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5798 vect[k] = data_ref;
5801 for (k = 0; k < 3; k++)
5803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5805 vect[k % 3], vect[(k + 1) % 3],
5806 shift1_mask);
5807 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5808 vect_shift[k] = data_ref;
5811 for (k = 0; k < 3; k++)
5813 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5814 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5815 vect_shift[(4 - k) % 3],
5816 vect_shift[(3 - k) % 3],
5817 shift2_mask);
5818 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5819 vect[k] = data_ref;
5822 (*result_chain)[3 - (nelt % 3)] = vect[2];
5824 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5825 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5826 vect[0], shift3_mask);
5827 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5828 (*result_chain)[nelt % 3] = data_ref;
5830 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5831 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5832 vect[1], shift4_mask);
5833 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5834 (*result_chain)[0] = data_ref;
5835 return true;
5837 return false;
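/* A standalone sketch (plain C, illustrative only, not GCC code) of how the
   permutation constants used in the LENGTH == 3 branch above are derived
   for nelt = 8.  It prints the "shuffle of 3 fields" selector
   {0 3 6 1 4 7 2 5} and the four shift selectors quoted in the comments.  */

#include <stdio.h>

static void
print_sel (const char *name, unsigned int *sel, unsigned int nelt)
{
  printf ("%s:", name);
  for (unsigned int i = 0; i < nelt; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned int nelt = 8;
  unsigned int sel[8];
  unsigned int i, k = 0, l = 0;

  /* Selector that groups elements by their position modulo 3:
     {0 3 6 1 4 7 2 5} for nelt = 8.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3 ", sel, nelt);

  /* Shift selectors: each picks nelt consecutive elements starting at a
     given offset into the concatenation of two vectors.  */
  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;		/* {6 .. 13} */
  print_sel ("shift1", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + 1 + i;			/* {5 .. 12} */
  print_sel ("shift2", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;		/* {3 .. 10} */
  print_sel ("shift3", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;	/* {5 .. 12} */
  print_sel ("shift4", sel, nelt);

  return 0;
}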
5840 /* Function vect_transform_grouped_load.
5842 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5843 to perform their permutation and ascribe the resulting vectorized statements to
5844 the scalar statements.
5847 void
5848 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5849 gimple_stmt_iterator *gsi)
5851 machine_mode mode;
5852 vec<tree> result_chain = vNULL;
5854 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5855 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5856 vectors that are ready for vector computation. */
5857 result_chain.create (size);
5859 /* If the reassociation width for the vector type is 2 or greater, the target
5860 machine can execute 2 or more vector instructions in parallel. Otherwise
5861 try to get the chain for the load group using vect_shift_permute_load_chain. */
5862 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5863 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5864 || pow2p_hwi (size)
5865 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5866 gsi, &result_chain))
5867 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5868 vect_record_grouped_load_vectors (stmt, result_chain);
5869 result_chain.release ();
5872 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5873 generated as part of the vectorization of STMT. Assign the statement
5874 for each vector to the associated scalar statement. */
5876 void
5877 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5879 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5880 gimple *next_stmt, *new_stmt;
5881 unsigned int i, gap_count;
5882 tree tmp_data_ref;
5884 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5885 Since we scan the chain starting from its first node, their order
5886 corresponds to the order of data-refs in RESULT_CHAIN. */
5887 next_stmt = first_stmt;
5888 gap_count = 1;
5889 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5891 if (!next_stmt)
5892 break;
5894 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5895 code elimination pass later. No need to check for the first stmt in
5896 the group, since it always exists.
5897 GROUP_GAP is the number of steps in elements from the previous
5898 access (if there is no gap GROUP_GAP is 1). We skip loads that
5899 correspond to the gaps. */
5900 if (next_stmt != first_stmt
5901 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5903 gap_count++;
5904 continue;
5907 while (next_stmt)
5909 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5910 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5911 copies, and we put the new vector statement in the first available
5912 RELATED_STMT. */
5913 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5914 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5915 else
5917 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5919 gimple *prev_stmt =
5920 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5921 gimple *rel_stmt =
5922 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5923 while (rel_stmt)
5925 prev_stmt = rel_stmt;
5926 rel_stmt =
5927 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5930 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5931 new_stmt;
5935 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5936 gap_count = 1;
5937 /* If NEXT_STMT accesses the same DR as the previous statement,
5938 put the same TMP_DATA_REF as its vectorized statement; otherwise
5939 get the next data-ref from RESULT_CHAIN. */
5940 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5941 break;
5946 /* Function vect_can_force_dr_alignment_p.
5948 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5949 boundary. */
5951 bool
5952 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5954 if (!VAR_P (decl))
5955 return false;
5957 if (decl_in_symtab_p (decl)
5958 && !symtab_node::get (decl)->can_increase_alignment_p ())
5959 return false;
5961 if (TREE_STATIC (decl))
5962 return (alignment <= MAX_OFILE_ALIGNMENT);
5963 else
5964 return (alignment <= MAX_STACK_ALIGNMENT);
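/* For illustration only (user-level C, not part of this file): a minimal
   sketch of the distinction the two limits above refer to, namely raising
   the alignment of an object that lives in the object file versus one that
   lives on the stack.  The attribute syntax is standard GCC; the alignment
   value and the names below are just examples.  */

#define ALIGN_BYTES 32

/* File-scope (TREE_STATIC) variable: its alignment can be raised up to
   what the object file format supports (MAX_OFILE_ALIGNMENT).  */
static double file_scope_buf[256] __attribute__ ((aligned (ALIGN_BYTES)));

double
sum_example (void)
{
  /* Automatic variable: its alignment is limited by what the stack
     supports (MAX_STACK_ALIGNMENT).  */
  double stack_buf[256] __attribute__ ((aligned (ALIGN_BYTES)));
  double s = 0.0;
  for (int i = 0; i < 256; i++)
    {
      stack_buf[i] = file_scope_buf[i];
      s += stack_buf[i];
    }
  return s;
}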
5968 /* Return whether the data reference DR is supported with respect to its
5969 alignment.
5970 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5971 if it is aligned, i.e., check if it is possible to vectorize it with a
5972 different alignment. */
5974 enum dr_alignment_support
5975 vect_supportable_dr_alignment (struct data_reference *dr,
5976 bool check_aligned_accesses)
5978 gimple *stmt = DR_STMT (dr);
5979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5980 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5981 machine_mode mode = TYPE_MODE (vectype);
5982 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5983 struct loop *vect_loop = NULL;
5984 bool nested_in_vect_loop = false;
5986 if (aligned_access_p (dr) && !check_aligned_accesses)
5987 return dr_aligned;
5989 /* For now assume all conditional loads/stores support unaligned
5990 access without any special code. */
5991 if (is_gimple_call (stmt)
5992 && gimple_call_internal_p (stmt)
5993 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5994 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5995 return dr_unaligned_supported;
5997 if (loop_vinfo)
5999 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6000 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6003 /* Possibly unaligned access. */
6005 /* We can choose between using the implicit realignment scheme (generating
6006 a misaligned_move stmt) and the explicit realignment scheme (generating
6007 aligned loads with a REALIGN_LOAD). There are two variants to the
6008 explicit realignment scheme: optimized, and unoptimized.
6009 We can optimize the realignment only if the step between consecutive
6010 vector loads is equal to the vector size. Since the vector memory
6011 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6012 is guaranteed that the misalignment amount remains the same throughout the
6013 execution of the vectorized loop. Therefore, we can create the
6014 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6015 at the loop preheader.
6017 However, in the case of outer-loop vectorization, when vectorizing a
6018 memory access in the inner-loop nested within the LOOP that is now being
6019 vectorized, while it is guaranteed that the misalignment of the
6020 vectorized memory access will remain the same in different outer-loop
6021 iterations, it is *not* guaranteed that it will remain the same throughout
6022 the execution of the inner-loop. This is because the inner-loop advances
6023 with the original scalar step (and not in steps of VS). If the inner-loop
6024 step happens to be a multiple of VS, then the misalignment remains fixed
6025 and we can use the optimized realignment scheme. For example:
6027 for (i=0; i<N; i++)
6028 for (j=0; j<M; j++)
6029 s += a[i+j];
6031 When vectorizing the i-loop in the above example, the step between
6032 consecutive vector loads is 1, and so the misalignment does not remain
6033 fixed across the execution of the inner-loop, and the realignment cannot
6034 be optimized (as illustrated in the following pseudo vectorized loop):
6036 for (i=0; i<N; i+=4)
6037 for (j=0; j<M; j++){
6038 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6039 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6040 // (assuming that we start from an aligned address).
6043 We therefore have to use the unoptimized realignment scheme:
6045 for (i=0; i<N; i+=4)
6046 for (j=k; j<M; j+=4)
6047 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6048 // that the misalignment of the initial address is
6049 // 0).
6051 The loop can then be vectorized as follows:
6053 for (k=0; k<4; k++){
6054 rt = get_realignment_token (&vp[k]);
6055 for (i=0; i<N; i+=4){
6056 v1 = vp[i+k];
6057 for (j=k; j<M; j+=4){
6058 v2 = vp[i+j+VS-1];
6059 va = REALIGN_LOAD <v1,v2,rt>;
6060 vs += va;
6061 v1 = v2;
6064 } */
6066 if (DR_IS_READ (dr))
6068 bool is_packed = false;
6069 tree type = (TREE_TYPE (DR_REF (dr)));
6071 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6072 && (!targetm.vectorize.builtin_mask_for_load
6073 || targetm.vectorize.builtin_mask_for_load ()))
6075 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6077 /* If we are doing SLP then the accesses need not have the
6078 same alignment, instead it depends on the SLP group size. */
6079 if (loop_vinfo
6080 && STMT_SLP_TYPE (stmt_info)
6081 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6082 * GROUP_SIZE (vinfo_for_stmt
6083 (GROUP_FIRST_ELEMENT (stmt_info))),
6084 TYPE_VECTOR_SUBPARTS (vectype)))
6086 else if (!loop_vinfo
6087 || (nested_in_vect_loop
6088 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6089 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6090 return dr_explicit_realign;
6091 else
6092 return dr_explicit_realign_optimized;
6094 if (!known_alignment_for_access_p (dr))
6095 is_packed = not_size_aligned (DR_REF (dr));
6097 if (targetm.vectorize.support_vector_misalignment
6098 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6099 /* Can't software pipeline the loads, but can at least do them. */
6100 return dr_unaligned_supported;
6102 else
6104 bool is_packed = false;
6105 tree type = (TREE_TYPE (DR_REF (dr)));
6107 if (!known_alignment_for_access_p (dr))
6108 is_packed = not_size_aligned (DR_REF (dr));
6110 if (targetm.vectorize.support_vector_misalignment
6111 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6112 return dr_unaligned_supported;
6115 /* Unsupported. */
6116 return dr_unaligned_unsupported;
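/* A simplified, standalone sketch (not actual GCC caller code) of how the
   result of vect_supportable_dr_alignment is typically interpreted.  The
   enum below is a local mirror of the values returned above, used only so
   this sketch is self-contained; the description strings paraphrase the
   comment in the function.  */

#include <stdio.h>

enum dr_alignment_support_sketch {
  dr_unaligned_unsupported_s,
  dr_unaligned_supported_s,
  dr_explicit_realign_s,
  dr_explicit_realign_optimized_s,
  dr_aligned_s
};

static const char *
describe (enum dr_alignment_support_sketch support)
{
  switch (support)
    {
    case dr_aligned_s:
      return "access is aligned; use a normal vector load/store";
    case dr_unaligned_supported_s:
      return "hardware handles the misaligned access directly";
    case dr_explicit_realign_s:
      return "emit aligned loads plus a REALIGN_LOAD per access";
    case dr_explicit_realign_optimized_s:
      return "emit REALIGN_LOAD with the realignment token hoisted to the preheader";
    default:
      return "the misaligned access cannot be vectorized as-is";
    }
}

int
main (void)
{
  printf ("%s\n", describe (dr_explicit_realign_optimized_s));
  return 0;
}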