gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "tm_p.h"
  30 #include "target.h"
  31 #include "basic-block.h"
  32 #include "gimple-pretty-print.h"
  33 #include "tree-ssa-alias.h"
  34 #include "internal-fn.h"
  35 #include "tree-eh.h"
  36 #include "gimple-expr.h"
  37 #include "is-a.h"
  38 #include "gimple.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "gimple-ssa.h"
  43 #include "tree-phinodes.h"
  44 #include "ssa-iterators.h"
  45 #include "stringpool.h"
  46 #include "tree-ssanames.h"
  47 #include "tree-ssa-loop-ivopts.h"
  48 #include "tree-ssa-loop-manip.h"
  49 #include "tree-ssa-loop.h"
  50 #include "dumpfile.h"
  51 #include "cfgloop.h"
  52 #include "tree-chrec.h"
  53 #include "tree-scalar-evolution.h"
  54 #include "tree-vectorizer.h"
  55 #include "diagnostic-core.h"
  56 #include "cgraph.h"
  57 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  58 #include "expr.h"
  59 #include "optabs.h"
  60 #include "builtins.h"
  61 #include "varasm.h"
  62
  63 /* Return true if load- or store-lanes optab OPTAB is implemented for
  64    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  65
  66 static bool
  67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  68                               tree vectype, unsigned HOST_WIDE_INT count)
  69 {
  70   enum machine_mode mode, array_mode;
  71   bool limit_p;
  72
  73   mode = TYPE_MODE (vectype);
  74   limit_p = !targetm.array_mode_supported_p (mode, count);
  75   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  76                               MODE_INT, limit_p);
  77
  78   if (array_mode == BLKmode)
  79     {
  80       if (dump_enabled_p ())
  81         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  82                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  83                          GET_MODE_NAME (mode), count);
  84       return false;
  85     }
  86
  87   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  88     {
  89       if (dump_enabled_p ())
  90         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  91                          "cannot use %s<%s><%s>\n", name,
  92                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  93       return false;
  94     }
  95
  96   if (dump_enabled_p ())
  97     dump_printf_loc (MSG_NOTE, vect_location,
  98                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  99                      GET_MODE_NAME (mode));
 100
 101   return true;
 102 }
 103
 104
 105 /* Return the smallest scalar part of STMT.
 106    This is used to determine the vectype of the stmt.  We generally set the
 107    vectype according to the type of the result (lhs).  For stmts whose
 108    result-type is different than the type of the arguments (e.g., demotion,
 109    promotion), vectype will be reset appropriately (later).  Note that we have
 110    to visit the smallest datatype in this function, because that determines the
 111    VF.  If the smallest datatype in the loop is present only as the rhs of a
 112    promotion operation - we'd miss it.
 113    Such a case, where a variable of this datatype does not appear in the lhs
 114    anywhere in the loop, can only occur if it's an invariant: e.g.:
 115    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 116    invariant motion.  However, we cannot rely on invariant motion to always
 117    take invariants out of the loop, and so in the case of promotion we also
 118    have to check the rhs.
 119    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 120    types.  */
 121
 122 tree
 123 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 124                                HOST_WIDE_INT *rhs_size_unit)
 125 {
 126   tree scalar_type = gimple_expr_type (stmt);
 127   HOST_WIDE_INT lhs, rhs;
 128
 129   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 130
 131   if (is_gimple_assign (stmt)
 132       && (gimple_assign_cast_p (stmt)
 133           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 134           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 135           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 136     {
 137       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 138
 139       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 140       if (rhs < lhs)
 141         scalar_type = rhs_type;
 142     }
 143
 144   *lhs_size_unit = lhs;
 145   *rhs_size_unit = rhs;
 146   return scalar_type;
 147 }
 148
 149
 150 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 151    tested at run-time.  Return TRUE if DDR was successfully inserted.
 152    Return false if versioning is not supported.  */
 153
 154 static bool
 155 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 156 {
 157   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 158
 159   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 160     return false;
 161
 162   if (dump_enabled_p ())
 163     {
 164       dump_printf_loc (MSG_NOTE, vect_location,
 165                        "mark for run-time aliasing test between ");
 166       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 167       dump_printf (MSG_NOTE,  " and ");
 168       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 169       dump_printf (MSG_NOTE, "\n");
 170     }
 171
 172   if (optimize_loop_nest_for_size_p (loop))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 176                          "versioning not supported when optimizing"
 177                          " for size.\n");
 178       return false;
 179     }
 180
 181   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 182   if (loop->inner)
 183     {
 184       if (dump_enabled_p ())
 185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 186                          "versioning not yet supported for outer-loops.\n");
 187       return false;
 188     }
 189
 190   /* FORNOW: We don't support creating runtime alias tests for non-constant
 191      step.  */
 192   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 193       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 194     {
 195       if (dump_enabled_p ())
 196         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 197                          "versioning not yet supported for non-constant "
 198                          "step\n");
 199       return false;
 200     }
 201
 202   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 203   return true;
 204 }
 205
 206
 207 /* Function vect_analyze_data_ref_dependence.
 208
 209    Return TRUE if there (might) exist a dependence between a memory-reference
 210    DRA and a memory-reference DRB.  When versioning for alias may check a
 211    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 212    the data dependence.  */
 213
 214 static bool
 215 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 216                                   loop_vec_info loop_vinfo, int *max_vf)
 217 {
 218   unsigned int i;
 219   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 220   struct data_reference *dra = DDR_A (ddr);
 221   struct data_reference *drb = DDR_B (ddr);
 222   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 223   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 224   lambda_vector dist_v;
 225   unsigned int loop_depth;
 226
 227   /* In loop analysis all data references should be vectorizable.  */
 228   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 229       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 230     gcc_unreachable ();
 231
 232   /* Independent data accesses.  */
 233   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 234     return false;
 235
 236   if (dra == drb
 237       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 238     return false;
 239
 240   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 241      least two scalar iterations, there is always also a true dependence.
 242      As the vectorizer does not re-order loads and stores we can ignore
 243      the anti-dependence if TBAA can disambiguate both DRs similar to the
 244      case with known negative distance anti-dependences (positive
 245      distance anti-dependences would violate TBAA constraints).  */
 246   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 247        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 248       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 249                                  get_alias_set (DR_REF (drb))))
 250     return false;
 251
 252   /* Unknown data dependence.  */
 253   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 254     {
 255       /* If user asserted safelen consecutive iterations can be
 256          executed concurrently, assume independence.  */
 257       if (loop->safelen >= 2)
 258         {
 259           if (loop->safelen < *max_vf)
 260             *max_vf = loop->safelen;
 261           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 262           return false;
 263         }
 264
 265       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 266           || STMT_VINFO_GATHER_P (stmtinfo_b))
 267         {
 268           if (dump_enabled_p ())
 269             {
 270               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 271                                "versioning for alias not supported for: "
 272                                "can't determine dependence between ");
 273               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 274                                  DR_REF (dra));
 275               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 276               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 277                                  DR_REF (drb));
 278               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 279             }
 280           return true;
 281         }
 282
 283       if (dump_enabled_p ())
 284         {
 285           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 286                            "versioning for alias required: "
 287                            "can't determine dependence between ");
 288           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 289                              DR_REF (dra));
 290           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 291           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 292                              DR_REF (drb));
 293           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 294         }
 295
 296       /* Add to list of ddrs that need to be tested at run-time.  */
 297       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 298     }
 299
 300   /* Known data dependence.  */
 301   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 302     {
 303       /* If user asserted safelen consecutive iterations can be
 304          executed concurrently, assume independence.  */
 305       if (loop->safelen >= 2)
 306         {
 307           if (loop->safelen < *max_vf)
 308             *max_vf = loop->safelen;
 309           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 310           return false;
 311         }
 312
 313       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 314           || STMT_VINFO_GATHER_P (stmtinfo_b))
 315         {
 316           if (dump_enabled_p ())
 317             {
 318               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 319                                "versioning for alias not supported for: "
 320                                "bad dist vector for ");
 321               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 322                                  DR_REF (dra));
 323               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 324               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 325                                  DR_REF (drb));
 326               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 327             }
 328           return true;
 329         }
 330
 331       if (dump_enabled_p ())
 332         {
 333           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 334                            "versioning for alias required: "
 335                            "bad dist vector for ");
 336           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 337           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 338           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 339           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 340         }
 341       /* Add to list of ddrs that need to be tested at run-time.  */
 342       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 343     }
 344
 345   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 346   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 347     {
 348       int dist = dist_v[loop_depth];
 349
 350       if (dump_enabled_p ())
 351         dump_printf_loc (MSG_NOTE, vect_location,
 352                          "dependence distance  = %d.\n", dist);
 353
 354       if (dist == 0)
 355         {
 356           if (dump_enabled_p ())
 357             {
 358               dump_printf_loc (MSG_NOTE, vect_location,
 359                                "dependence distance == 0 between ");
 360               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 361               dump_printf (MSG_NOTE, " and ");
 362               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 363               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 364             }
 365
 366           /* When we perform grouped accesses and perform implicit CSE
 367              by detecting equal accesses and doing disambiguation with
 368              runtime alias tests like for
 369                 .. = a[i];
 370                 .. = a[i+1];
 371                 a[i] = ..;
 372                 a[i+1] = ..;
 373                 *p = ..;
 374                 .. = a[i];
 375                 .. = a[i+1];
 376              where we will end up loading { a[i], a[i+1] } once, make
 377              sure that inserting group loads before the first load and
 378              stores after the last store will do the right thing.
 379              Similar for groups like
 380                 a[i] = ...;
 381                 ... = a[i];
 382                 a[i+1] = ...;
 383              where loads from the group interleave with the store.  */
 384           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 385               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 386             {
 387               gimple earlier_stmt;
 388               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 389               if (DR_IS_WRITE
 390                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 391                 {
 392                   if (dump_enabled_p ())
 393                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                      "READ_WRITE dependence in interleaving."
 395                                      "\n");
 396                   return true;
 397                 }
 398             }
 399
 400           continue;
 401         }
 402
 403       if (dist > 0 && DDR_REVERSED_P (ddr))
 404         {
 405           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 406              reversed (to make distance vector positive), and the actual
 407              distance is negative.  */
 408           if (dump_enabled_p ())
 409             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                              "dependence distance negative.\n");
 411           /* Record a negative dependence distance to later limit the
 412              amount of stmt copying / unrolling we can perform.
 413              Only need to handle read-after-write dependence.  */
 414           if (DR_IS_READ (drb)
 415               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 416                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 417             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 418           continue;
 419         }
 420
 421       if (abs (dist) >= 2
 422           && abs (dist) < *max_vf)
 423         {
 424           /* The dependence distance requires reduction of the maximal
 425              vectorization factor.  */
 426           *max_vf = abs (dist);
 427           if (dump_enabled_p ())
 428             dump_printf_loc (MSG_NOTE, vect_location,
 429                              "adjusting maximal vectorization factor to %i\n",
 430                              *max_vf);
 431         }
 432
 433       if (abs (dist) >= *max_vf)
 434         {
 435           /* Dependence distance does not create dependence, as far as
 436              vectorization is concerned, in this case.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_NOTE, vect_location,
 439                              "dependence distance >= VF.\n");
 440           continue;
 441         }
 442
 443       if (dump_enabled_p ())
 444         {
 445           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 446                        "not vectorized, possible dependence "
 447                        "between data-refs ");
 448           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 449           dump_printf (MSG_NOTE,  " and ");
 450           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 451           dump_printf (MSG_NOTE,  "\n");
 452         }
 453
 454       return true;
 455     }
 456
 457   return false;
 458 }
 459
 460 /* Function vect_analyze_data_ref_dependences.
 461
 462    Examine all the data references in the loop, and make sure there do not
 463    exist any data dependences between them.  Set *MAX_VF according to
 464    the maximum vectorization factor the data dependences allow.  */
 465
 466 bool
 467 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 468 {
 469   unsigned int i;
 470   struct data_dependence_relation *ddr;
 471
 472   if (dump_enabled_p ())
 473     dump_printf_loc (MSG_NOTE, vect_location,
 474                      "=== vect_analyze_data_ref_dependences ===\n");
 475
 476   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 477   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 478                                 &LOOP_VINFO_DDRS (loop_vinfo),
 479                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 480     return false;
 481
 482   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 483     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 484       return false;
 485
 486   return true;
 487 }
 488
 489
 490 /* Function vect_slp_analyze_data_ref_dependence.
 491
 492    Return TRUE if there (might) exist a dependence between a memory-reference
 493    DRA and a memory-reference DRB.  When versioning for alias may check a
 494    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 495    the data dependence.  */
 496
 497 static bool
 498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 499 {
 500   struct data_reference *dra = DDR_A (ddr);
 501   struct data_reference *drb = DDR_B (ddr);
 502
 503   /* We need to check dependences of statements marked as unvectorizable
 504      as well, they still can prohibit vectorization.  */
 505
 506   /* Independent data accesses.  */
 507   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 508     return false;
 509
 510   if (dra == drb)
 511     return false;
 512
 513   /* Read-read is OK.  */
 514   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 515     return false;
 516
 517   /* If dra and drb are part of the same interleaving chain consider
 518      them independent.  */
 519   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 520       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 521           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 522     return false;
 523
 524   /* Unknown data dependence.  */
 525   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 526     {
 527       if  (dump_enabled_p ())
 528         {
 529           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 530                            "can't determine dependence between ");
 531           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 532           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 533           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 534           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 535         }
 536     }
 537   else if (dump_enabled_p ())
 538     {
 539       dump_printf_loc (MSG_NOTE, vect_location,
 540                        "determined dependence between ");
 541       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 542       dump_printf (MSG_NOTE, " and ");
 543       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 544       dump_printf (MSG_NOTE,  "\n");
 545     }
 546
 547   /* We do not vectorize basic blocks with write-write dependencies.  */
 548   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 549     return true;
 550
 551   /* If we have a read-write dependence check that the load is before the store.
 552      When we vectorize basic blocks, vector load can be only before
 553      corresponding scalar load, and vector store can be only after its
 554      corresponding scalar store.  So the order of the acceses is preserved in
 555      case the load is before the store.  */
 556   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 557   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 558     {
 559       /* That only holds for load-store pairs taking part in vectorization.  */
 560       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 561           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 562         return false;
 563     }
 564
 565   return true;
 566 }
 567
 568
 569 /* Function vect_analyze_data_ref_dependences.
 570
 571    Examine all the data references in the basic-block, and make sure there
 572    do not exist any data dependences between them.  Set *MAX_VF according to
 573    the maximum vectorization factor the data dependences allow.  */
 574
 575 bool
 576 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 577 {
 578   struct data_dependence_relation *ddr;
 579   unsigned int i;
 580
 581   if (dump_enabled_p ())
 582     dump_printf_loc (MSG_NOTE, vect_location,
 583                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 584
 585   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 586                                 &BB_VINFO_DDRS (bb_vinfo),
 587                                 vNULL, true))
 588     return false;
 589
 590   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 591     if (vect_slp_analyze_data_ref_dependence (ddr))
 592       return false;
 593
 594   return true;
 595 }
 596
 597
 598 /* Function vect_compute_data_ref_alignment
 599
 600    Compute the misalignment of the data reference DR.
 601
 602    Output:
 603    1. If during the misalignment computation it is found that the data reference
 604       cannot be vectorized then false is returned.
 605    2. DR_MISALIGNMENT (DR) is defined.
 606
 607    FOR NOW: No analysis is actually performed. Misalignment is calculated
 608    only for trivial cases. TODO.
     
        Details of the implementation below:
        - The misalignment is first set to -1 (unknown); every early
          "return true" leaves it at that value.
        - False is returned only when the computed misalignment does not
          fit an unsigned HOST_WIDE_INT.
        - When the base is not known to be sufficiently aligned but its
          alignment can be forced, the base is recorded in DR's aux data
          (base_decl, base_misaligned) and the misalignment is then
          computed like for an aligned base.  */
 609
 610 static bool
 611 vect_compute_data_ref_alignment (struct data_reference *dr)
 612 {
 613   gimple stmt = DR_STMT (dr);
 614   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 615   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 616   struct loop *loop = NULL;
 617   tree ref = DR_REF (dr);
 618   tree vectype;
 619   tree base, base_addr;
 620   bool base_aligned;
 621   tree misalign;
 622   tree aligned_to, alignment;
 623
 624   if (dump_enabled_p ())
 625     dump_printf_loc (MSG_NOTE, vect_location,
 626                      "vect_compute_data_ref_alignment:\n");
 627
       /* LOOP stays NULL when there is no loop_vec_info, which the code
          below treats as basic-block vectorization.  */
 628   if (loop_vinfo)
 629     loop = LOOP_VINFO_LOOP (loop_vinfo);
 630
 631   /* Initialize misalignment to unknown.  */
 632   SET_DR_MISALIGNMENT (dr, -1);
 633
 634   /* Strided loads perform only component accesses, misalignment information
 635      is irrelevant for them.  */
 636   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 637     return true;
 638
       /* Start from the information recorded on the data reference itself;
          the outer-loop case below may replace it.  */
 639   misalign = DR_INIT (dr);
 640   aligned_to = DR_ALIGNED_TO (dr);
 641   base_addr = DR_BASE_ADDRESS (dr);
 642   vectype = STMT_VINFO_VECTYPE (stmt_info);
 643
 644   /* In case the dataref is in an inner-loop of the loop that is being
 645      vectorized (LOOP), we use the base and misalignment information
 646      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 647      stays the same throughout the execution of the inner-loop, which is why
 648      we have to check that the stride of the dataref in the inner-loop evenly
 649      divides by the vector size.  */
 650   if (loop && nested_in_vect_loop_p (loop, stmt))
 651     {
 652       tree step = DR_STEP (dr);
 653       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 654
 655       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 656         {
 657           if (dump_enabled_p ())
 658             dump_printf_loc (MSG_NOTE, vect_location,
 659                              "inner step divides the vector-size.\n");
 660           misalign = STMT_VINFO_DR_INIT (stmt_info);
 661           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 662           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 663         }
 664       else
 665         {
 666           if (dump_enabled_p ())
 667             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 668                              "inner step doesn't divide the vector-size.\n");
               /* A NULL misalignment makes the check further down bail
                  out with the alignment left unknown.  */
 669           misalign = NULL_TREE;
 670         }
 671     }
 672
 673   /* Similarly, if we're doing basic-block vectorization, we can only use
 674      base and misalignment information relative to an innermost loop if the
 675      misalignment stays the same throughout the execution of the loop.
 676      As above, this is the case if the stride of the dataref evenly divides
 677      by the vector size.  */
 678   if (!loop)
 679     {
 680       tree step = DR_STEP (dr);
 681       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 682
 683       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 684         {
 685           if (dump_enabled_p ())
 686             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                              "SLP: step doesn't divide the vector-size.\n");
 688           misalign = NULL_TREE;
 689         }
 690     }
 691
       /* BASE is the object BASE_ADDR points to; ALIGNMENT is the alignment
          of VECTYPE expressed in bytes.  */
 692   base = build_fold_indirect_ref (base_addr);
 693   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 694
       /* Leave the misalignment unknown if ALIGNED_TO is smaller than the
          alignment the vector type needs, or if the checks above discarded
          the misalignment.  NOTE(review): ALIGNED_TO presumably is the
          alignment guaranteed for the variable part of the address --
          confirm against the data-reference documentation.  */
 695   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
 696       || !misalign)
 697     {
 698       if (dump_enabled_p ())
 699         {
 700           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 701                            "Unknown alignment for access: ");
 702           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
 703           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 704         }
 705       return true;
 706     }
 707
       /* The base counts as aligned if it is a decl whose DECL_ALIGN_UNIT is
          at least ALIGNMENT, or if BASE_ADDR is an SSA name whose pointed-to
          type has at least that alignment, or if get_pointer_alignment
          reports at least TYPE_ALIGN (vectype) for BASE_ADDR.  */
 708   if ((DECL_P (base)
 709        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
 710                                 alignment) >= 0)
 711       || (TREE_CODE (base_addr) == SSA_NAME
 712           && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
 713                                                       TREE_TYPE (base_addr)))),
 714                                    alignment) >= 0)
 715       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
 716     base_aligned = true;
 717   else
 718     base_aligned = false;
 719
 720   if (!base_aligned)
 721     {
 722       /* Do not change the alignment of global variables here if
 723          flag_section_anchors is enabled as we already generated
 724          RTL for other functions.  Most global variables should
 725          have been aligned during the IPA increase_alignment pass.  */
 726       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
 727           || (TREE_STATIC (base) && flag_section_anchors))
 728         {
 729           if (dump_enabled_p ())
 730             {
 731               dump_printf_loc (MSG_NOTE, vect_location,
 732                                "can't force alignment of ref: ");
 733               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 734               dump_printf (MSG_NOTE, "\n");
 735             }
 736           return true;
 737         }
 738
 739       /* Force the alignment of the decl.
 740          NOTE: This is the only change to the code we make during
 741          the analysis phase, before deciding to vectorize the loop.  */
 742       if (dump_enabled_p ())
 743         {
 744           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 745           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 746           dump_printf (MSG_NOTE, "\n");
 747         }
 748
           /* NOTE(review): only the request is recorded here, in DR's aux
              data; the code that actually raises the alignment of the decl
              is not part of this function.  */
 749       ((dataref_aux *)dr->aux)->base_decl = base;
 750       ((dataref_aux *)dr->aux)->base_misaligned = true;
 751     }
 752
 753   /* If this is a backward running DR then first access in the larger
 754      vectype actually is N-1 elements before the address in the DR.
 755      Adjust misalign accordingly.  */
 756   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
 757     {
 758       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 759       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 760          otherwise we wouldn't be here.  */
 761       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 762       /* PLUS because DR_STEP was negative.  */
 763       misalign = size_binop (PLUS_EXPR, misalign, offset);
 764     }
 765
 766   /* Modulo alignment.  */
 767   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
 768
 769   if (!tree_fits_uhwi_p (misalign))
 770     {
 771       /* Negative or overflowed misalignment value.  */
 772       if (dump_enabled_p ())
 773         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 774                          "unexpected misalign value\n");
 775       return false;
 776     }
 777
 778   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
 779
       /* NOTE(review): this informational message is emitted under
          MSG_MISSED_OPTIMIZATION -- confirm that this is intended.  */
 780   if (dump_enabled_p ())
 781     {
 782       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 783                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 784       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 785       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 786     }
 787
 788   return true;
 789 }
 790
 791
 792 /* Function vect_compute_data_refs_alignment
 793
 794    Compute the misalignment of data references in the loop.
 795    Return FALSE if a data reference is found that cannot be vectorized.  */
 796
 797 static bool
 798 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 799                                   bb_vec_info bb_vinfo)
 800 {
 801   vec<data_reference_p> datarefs;
 802   struct data_reference *dr;
 803   unsigned int i;
 804
 805   if (loop_vinfo)
 806     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 807   else
 808     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 809
 810   FOR_EACH_VEC_ELT (datarefs, i, dr)
 811     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 812         && !vect_compute_data_ref_alignment (dr))
 813       {
 814         if (bb_vinfo)
 815           {
 816             /* Mark unsupported statement as unvectorizable.  */
 817             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 818             continue;
 819           }
 820         else
 821           return false;
 822       }
 823
 824   return true;
 825 }
 826
 827
 828 /* Function vect_update_misalignment_for_peel
 829
 830    DR - the data reference whose misalignment is to be adjusted.
 831    DR_PEEL - the data reference whose misalignment is being made
 832              zero in the vector loop by the peel.
 833    NPEEL - the number of iterations in the peel loop if the misalignment
 834            of DR_PEEL is known at compile time.  */
 835
 836 static void
 837 vect_update_misalignment_for_peel (struct data_reference *dr,
 838                                    struct data_reference *dr_peel, int npeel)
 839 {
 840   unsigned int i;
 841   vec<dr_p> same_align_drs;
 842   struct data_reference *current_dr;
 843   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 844   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 845   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 846   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 847
 848  /* For interleaved data accesses the step in the loop must be multiplied by
 849      the size of the interleaving group.  */
 850   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 851     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 852   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 853     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 854
 855   /* It can be assumed that the data refs with the same alignment as dr_peel
 856      are aligned in the vector loop.  */
 857   same_align_drs
 858     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 859   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 860     {
 861       if (current_dr != dr)
 862         continue;
 863       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 864                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 865       SET_DR_MISALIGNMENT (dr, 0);
 866       return;
 867     }
 868
 869   if (known_alignment_for_access_p (dr)
 870       && known_alignment_for_access_p (dr_peel))
 871     {
 872       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 873       int misal = DR_MISALIGNMENT (dr);
 874       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 875       misal += negative ? -npeel * dr_size : npeel * dr_size;
 876       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 877       SET_DR_MISALIGNMENT (dr, misal);
 878       return;
 879     }
 880
 881   if (dump_enabled_p ())
 882     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 883   SET_DR_MISALIGNMENT (dr, -1);
 884 }
 885
 886
 887 /* Function vect_verify_datarefs_alignment
 888
 889    Return TRUE if all data references in the loop can be
 890    handled with respect to alignment.  */
 891
 892 bool
 893 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 894 {
 895   vec<data_reference_p> datarefs;
 896   struct data_reference *dr;
 897   enum dr_alignment_support supportable_dr_alignment;
 898   unsigned int i;
 899
 900   if (loop_vinfo)
 901     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 902   else
 903     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 904
 905   FOR_EACH_VEC_ELT (datarefs, i, dr)
 906     {
 907       gimple stmt = DR_STMT (dr);
 908       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 909
 910       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 911         continue;
 912
 913       /* For interleaving, only the alignment of the first access matters.
 914          Skip statements marked as not vectorizable.  */
 915       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 916            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 917           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 918         continue;
 919
 920       /* Strided loads perform only component accesses, alignment is
 921          irrelevant for them.  */
 922       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 923         continue;
 924
 925       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 926       if (!supportable_dr_alignment)
 927         {
 928           if (dump_enabled_p ())
 929             {
 930               if (DR_IS_READ (dr))
 931                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 932                                  "not vectorized: unsupported unaligned load.");
 933               else
 934                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 935                                  "not vectorized: unsupported unaligned "
 936                                  "store.");
 937
 938               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 939                                  DR_REF (dr));
 940               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 941             }
 942           return false;
 943         }
 944       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 945         dump_printf_loc (MSG_NOTE, vect_location,
 946                          "Vectorizing an unaligned access.\n");
 947     }
 948   return true;
 949 }
 950
 951 /* Given an memory reference EXP return whether its alignment is less
 952    than its size.  */
 953
 954 static bool
 955 not_size_aligned (tree exp)
 956 {
 957   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 958     return true;
 959
 960   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 961           > get_object_alignment (exp));
 962 }
 963
 964 /* Function vector_alignment_reachable_p
 965
 966    Return true if vector alignment for DR is reachable by peeling
 967    a few loop iterations.  Return false otherwise.  */
 968
 969 static bool
 970 vector_alignment_reachable_p (struct data_reference *dr)
 971 {
 972   gimple stmt = DR_STMT (dr);
 973   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 974   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 975
 976   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 977     {
 978       /* For interleaved access we peel only if number of iterations in
 979          the prolog loop ({VF - misalignment}), is a multiple of the
 980          number of the interleaved accesses.  */
 981       int elem_size, mis_in_elements;
 982       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 983
 984       /* FORNOW: handle only known alignment.  */
 985       if (!known_alignment_for_access_p (dr))
 986         return false;
 987
 988       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
 989       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
 990
 991       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
 992         return false;
 993     }
 994
 995   /* If misalignment is known at the compile time then allow peeling
 996      only if natural alignment is reachable through peeling.  */
 997   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
 998     {
 999       HOST_WIDE_INT elmsize =
1000                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001       if (dump_enabled_p ())
1002         {
1003           dump_printf_loc (MSG_NOTE, vect_location,
1004                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005           dump_printf (MSG_NOTE,
1006                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1007         }
1008       if (DR_MISALIGNMENT (dr) % elmsize)
1009         {
1010           if (dump_enabled_p ())
1011             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012                              "data size does not divide the misalignment.\n");
1013           return false;
1014         }
1015     }
1016
1017   if (!known_alignment_for_access_p (dr))
1018     {
1019       tree type = TREE_TYPE (DR_REF (dr));
1020       bool is_packed = not_size_aligned (DR_REF (dr));
1021       if (dump_enabled_p ())
1022         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023                          "Unknown misalignment, is_packed = %d\n",is_packed);
1024       if ((TYPE_USER_ALIGN (type) && !is_packed)
1025           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026         return true;
1027       else
1028         return false;
1029     }
1030
1031   return true;
1032 }
1033
1034
1035 /* Calculate the cost of the memory access represented by DR.  */
1036
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039                            unsigned int *inside_cost,
1040                            unsigned int *outside_cost,
1041                            stmt_vector_for_cost *body_cost_vec)
1042 {
1043   gimple stmt = DR_STMT (dr);
1044   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048   int ncopies = vf / nunits;
1049
1050   if (DR_IS_READ (dr))
1051     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052                         NULL, body_cost_vec, false);
1053   else
1054     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1055
1056   if (dump_enabled_p ())
1057     dump_printf_loc (MSG_NOTE, vect_location,
1058                      "vect_get_data_access_cost: inside_cost = %d, "
1059                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1060 }
1061
1062
1063 /* Insert DR into peeling hash table with NPEEL as key.  */
1064
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067                           int npeel)
1068 {
1069   struct _vect_peel_info elem, *slot;
1070   _vect_peel_info **new_slot;
1071   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1072
1073   elem.npeel = npeel;
1074   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075   if (slot)
1076     slot->count++;
1077   else
1078     {
1079       slot = XNEW (struct _vect_peel_info);
1080       slot->npeel = npeel;
1081       slot->dr = dr;
1082       slot->count = 1;
1083       new_slot
1084         = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085       *new_slot = slot;
1086     }
1087
1088   if (!supportable_dr_alignment
1089       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090     slot->count += VECT_MAX_COST;
1091 }
1092
1093
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095    number of data accesses.  */
1096
1097 int
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099                                      _vect_peel_extended_info *max)
1100 {
1101   vect_peel_info elem = *slot;
1102
1103   if (elem->count > max->peel_info.count
1104       || (elem->count == max->peel_info.count
1105           && max->peel_info.npeel > elem->npeel))
1106     {
1107       max->peel_info.npeel = elem->npeel;
1108       max->peel_info.count = elem->count;
1109       max->peel_info.dr = elem->dr;
1110     }
1111
1112   return 1;
1113 }
1114
1115
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117    Find the one with the lowest cost.  */
1118
1119 int
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121                                    _vect_peel_extended_info *min)
1122 {
1123   vect_peel_info elem = *slot;
1124   int save_misalignment, dummy;
1125   unsigned int inside_cost = 0, outside_cost = 0, i;
1126   gimple stmt = DR_STMT (elem->dr);
1127   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130   struct data_reference *dr;
1131   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1132   int single_iter_cost;
1133
1134   prologue_cost_vec.create (2);
1135   body_cost_vec.create (2);
1136   epilogue_cost_vec.create (2);
1137
1138   FOR_EACH_VEC_ELT (datarefs, i, dr)
1139     {
1140       stmt = DR_STMT (dr);
1141       stmt_info = vinfo_for_stmt (stmt);
1142       /* For interleaving, only the alignment of the first access
1143          matters.  */
1144       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1146         continue;
1147
1148       save_misalignment = DR_MISALIGNMENT (dr);
1149       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1150       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1151                                  &body_cost_vec);
1152       SET_DR_MISALIGNMENT (dr, save_misalignment);
1153     }
1154
1155   single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1156   outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1157                                                &dummy, single_iter_cost,
1158                                                &prologue_cost_vec,
1159                                                &epilogue_cost_vec);
1160
1161   /* Prologue and epilogue costs are added to the target model later.
1162      These costs depend only on the scalar iteration cost, the
1163      number of peeling iterations finally chosen, and the number of
1164      misaligned statements.  So discard the information found here.  */
1165   prologue_cost_vec.release ();
1166   epilogue_cost_vec.release ();
1167
1168   if (inside_cost < min->inside_cost
1169       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1170     {
1171       min->inside_cost = inside_cost;
1172       min->outside_cost = outside_cost;
1173       min->body_cost_vec.release ();
1174       min->body_cost_vec = body_cost_vec;
1175       min->peel_info.dr = elem->dr;
1176       min->peel_info.npeel = elem->npeel;
1177     }
1178   else
1179     body_cost_vec.release ();
1180
1181   return 1;
1182 }
1183
1184
1185 /* Choose best peeling option by traversing peeling hash table and either
1186    choosing an option with the lowest cost (if cost model is enabled) or the
1187    option that aligns as many accesses as possible.  */
1188
1189 static struct data_reference *
1190 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1191                                        unsigned int *npeel,
1192                                        stmt_vector_for_cost *body_cost_vec)
1193 {
1194    struct _vect_peel_extended_info res;
1195
1196    res.peel_info.dr = NULL;
1197    res.body_cost_vec = stmt_vector_for_cost ();
1198
1199    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1200      {
1201        res.inside_cost = INT_MAX;
1202        res.outside_cost = INT_MAX;
1203        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1204            ->traverse <_vect_peel_extended_info *,
1205                        vect_peeling_hash_get_lowest_cost> (&res);
1206      }
1207    else
1208      {
1209        res.peel_info.count = 0;
1210        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1211            ->traverse <_vect_peel_extended_info *,
1212                        vect_peeling_hash_get_most_frequent> (&res);
1213      }
1214
1215    *npeel = res.peel_info.npeel;
1216    *body_cost_vec = res.body_cost_vec;
1217    return res.peel_info.dr;
1218 }
1219
1220
1221 /* Function vect_enhance_data_refs_alignment
1222
1223    This pass will use loop versioning and loop peeling in order to enhance
1224    the alignment of data references in the loop.
1225
1226    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1227    original loop is to be vectorized.  Any other loops that are created by
1228    the transformations performed in this pass - are not supposed to be
1229    vectorized.  This restriction will be relaxed.
1230
1231    This pass will require a cost model to guide it whether to apply peeling
1232    or versioning or a combination of the two.  For example, the scheme that
1233    Intel uses when given a loop with several memory accesses, is as follows:
1234    choose one memory access ('p') whose alignment you want to force by doing
1235    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1236    other accesses are not necessarily aligned, or (2) use loop versioning to
1237    generate one loop in which all accesses are aligned, and another loop in
1238    which only 'p' is necessarily aligned.
1239
1240    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1241    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1242    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1243
1244    Devising a cost model is the most critical aspect of this work.  It will
1245    guide us on which access to peel for, whether to use loop versioning, how
1246    many versions to create, etc.  The cost model will probably consist of
1247    generic considerations as well as target specific considerations (on
1248    powerpc for example, misaligned stores are more painful than misaligned
1249    loads).
1250
1251    Here are the general steps involved in alignment enhancements:
1252
1253      -- original loop, before alignment analysis:
1254         for (i=0; i<N; i++){
1255           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1256           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1257         }
1258
1259      -- After vect_compute_data_refs_alignment:
1260         for (i=0; i<N; i++){
1261           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1262           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1263         }
1264
1265      -- Possibility 1: we do loop versioning:
1266      if (p is aligned) {
1267         for (i=0; i<N; i++){    # loop 1A
1268           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1269           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1270         }
1271      }
1272      else {
1273         for (i=0; i<N; i++){    # loop 1B
1274           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1275           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1276         }
1277      }
1278
1279      -- Possibility 2: we do loop peeling:
1280      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1281         x = q[i];
1282         p[i] = y;
1283      }
1284      for (i = 3; i < N; i++){   # loop 2A
1285         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1286         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1287      }
1288
1289      -- Possibility 3: combination of loop peeling and versioning:
1290      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1291         x = q[i];
1292         p[i] = y;
1293      }
1294      if (p is aligned) {
1295         for (i = 3; i<N; i++){  # loop 3A
1296           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1297           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1298         }
1299      }
1300      else {
1301         for (i = 3; i<N; i++){  # loop 3B
1302           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1303           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1304         }
1305      }
1306
1307      These loops are later passed to loop_transform to be vectorized.  The
1308      vectorizer will use the alignment information to guide the transformation
1309      (whether to generate regular loads/stores, or with special handling for
1310      misalignment).  */
1311
1312 bool
1313 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1314 {
1315   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1316   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317   enum dr_alignment_support supportable_dr_alignment;
1318   struct data_reference *dr0 = NULL, *first_store = NULL;
1319   struct data_reference *dr;
1320   unsigned int i, j;
1321   bool do_peeling = false;
1322   bool do_versioning = false;
1323   bool stat;
1324   gimple stmt;
1325   stmt_vec_info stmt_info;
1326   unsigned int npeel = 0;
1327   bool all_misalignments_unknown = true;
1328   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1329   unsigned possible_npeel_number = 1;
1330   tree vectype;
1331   unsigned int nelements, mis, same_align_drs_max = 0;
1332   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1333
1334   if (dump_enabled_p ())
1335     dump_printf_loc (MSG_NOTE, vect_location,
1336                      "=== vect_enhance_data_refs_alignment ===\n");
1337
1338   /* While cost model enhancements are expected in the future, the high level
1339      view of the code at this time is as follows:
1340
1341      A) If there is a misaligned access then see if peeling to align
1342         this access can make all data references satisfy
1343         vect_supportable_dr_alignment.  If so, update data structures
1344         as needed and return true.
1345
1346      B) If peeling wasn't possible and there is a data reference with an
1347         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1348         then see if loop versioning checks can be used to make all data
1349         references satisfy vect_supportable_dr_alignment.  If so, update
1350         data structures as needed and return true.
1351
1352      C) If neither peeling nor versioning were successful then return false if
1353         any data reference does not satisfy vect_supportable_dr_alignment.
1354
1355      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1356
1357      Note, Possibility 3 above (which is peeling and versioning together) is not
1358      being done at this time.  */
1359
1360   /* (1) Peeling to force alignment.  */
1361
1362   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1363      Considerations:
1364      + How many accesses will become aligned due to the peeling
1365      - How many accesses will become unaligned due to the peeling,
1366        and the cost of misaligned accesses.
1367      - The cost of peeling (the extra runtime checks, the increase
1368        in code size).  */
1369
1370   FOR_EACH_VEC_ELT (datarefs, i, dr)
1371     {
1372       stmt = DR_STMT (dr);
1373       stmt_info = vinfo_for_stmt (stmt);
1374
1375       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1376         continue;
1377
1378       /* For interleaving, only the alignment of the first access
1379          matters.  */
1380       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1381           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1382         continue;
1383
1384       /* For invariant accesses there is nothing to enhance.  */
1385       if (integer_zerop (DR_STEP (dr)))
1386         continue;
1387
1388       /* Strided loads perform only component accesses, alignment is
1389          irrelevant for them.  */
1390       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1391         continue;
1392
1393       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1394       do_peeling = vector_alignment_reachable_p (dr);
1395       if (do_peeling)
1396         {
1397           if (known_alignment_for_access_p (dr))
1398             {
1399               unsigned int npeel_tmp;
1400               bool negative = tree_int_cst_compare (DR_STEP (dr),
1401                                                     size_zero_node) < 0;
1402
1403               /* Save info about DR in the hash table.  */
1404               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1405                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1406                   = new hash_table<peel_info_hasher> (1);
1407
1408               vectype = STMT_VINFO_VECTYPE (stmt_info);
1409               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411                                                 TREE_TYPE (DR_REF (dr))));
1412               npeel_tmp = (negative
1413                            ? (mis - nelements) : (nelements - mis))
1414                   & (nelements - 1);
1415
1416               /* For multiple types, it is possible that the bigger type access
1417                  will have more than one peeling option.  E.g., a loop with two
1418                  types: one of size (vector size / 4), and the other one of
1419                  size (vector size / 8).  Vectorization factor will be 8.  If both
1420                  accesses are misaligned by 3, the first one needs one scalar
1421                  iteration to be aligned, and the second one needs 5.  But the
1422                  first one will be aligned also by peeling 5 scalar
1423                  iterations, and in that case both accesses will be aligned.
1424                  Hence, except for the immediate peeling amount, we also want
1425                  to try to add full vector size, while we don't exceed
1426                  vectorization factor.
1427                  We do this automatically for cost model, since we calculate cost
1428                  for every peeling option.  */
1429               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430                 possible_npeel_number = vf /nelements;
1431
1432               /* Handle the aligned case. We may decide to align some other
1433                  access, making DR unaligned.  */
1434               if (DR_MISALIGNMENT (dr) == 0)
1435                 {
1436                   npeel_tmp = 0;
1437                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1438                     possible_npeel_number++;
1439                 }
1440
1441               for (j = 0; j < possible_npeel_number; j++)
1442                 {
1443                   gcc_assert (npeel_tmp <= vf);
1444                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445                   npeel_tmp += nelements;
1446                 }
1447
1448               all_misalignments_unknown = false;
1449               /* Data-ref that was chosen for the case that all the
1450                  misalignments are unknown is not relevant anymore, since we
1451                  have a data-ref with known alignment.  */
1452               dr0 = NULL;
1453             }
1454           else
1455             {
1456               /* If we don't know any misalignment values, we prefer
1457                  peeling for data-ref that has the maximum number of data-refs
1458                  with the same alignment, unless the target prefers to align
1459                  stores over load.  */
1460               if (all_misalignments_unknown)
1461                 {
1462                   unsigned same_align_drs
1463                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464                   if (!dr0
1465                       || same_align_drs_max < same_align_drs)
1466                     {
1467                       same_align_drs_max = same_align_drs;
1468                       dr0 = dr;
1469                     }
1470                   /* For data-refs with the same number of related
1471                      accesses prefer the one where the misalign
1472                      computation will be invariant in the outermost loop.  */
1473                   else if (same_align_drs_max == same_align_drs)
1474                     {
1475                       struct loop *ivloop0, *ivloop;
1476                       ivloop0 = outermost_invariant_loop_for_expr
1477                           (loop, DR_BASE_ADDRESS (dr0));
1478                       ivloop = outermost_invariant_loop_for_expr
1479                           (loop, DR_BASE_ADDRESS (dr));
1480                       if ((ivloop && !ivloop0)
1481                           || (ivloop && ivloop0
1482                               && flow_loop_nested_p (ivloop, ivloop0)))
1483                         dr0 = dr;
1484                     }
1485
1486                   if (!first_store && DR_IS_WRITE (dr))
1487                     first_store = dr;
1488                 }
1489
1490               /* If there are both known and unknown misaligned accesses in the
1491                  loop, we choose peeling amount according to the known
1492                  accesses.  */
1493               if (!supportable_dr_alignment)
1494                 {
1495                   dr0 = dr;
1496                   if (!first_store && DR_IS_WRITE (dr))
1497                     first_store = dr;
1498                 }
1499             }
1500         }
1501       else
1502         {
1503           if (!aligned_access_p (dr))
1504             {
1505               if (dump_enabled_p ())
1506                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                                  "vector alignment may not be reachable\n");
1508               break;
1509             }
1510         }
1511     }
1512
1513   /* Check if we can possibly peel the loop.  */
1514   if (!vect_can_advance_ivs_p (loop_vinfo)
1515       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516     do_peeling = false;
1517
1518   /* If we don't know how many times the peeling loop will run
1519      assume it will run VF-1 times and disable peeling if the remaining
1520      iters are less than the vectorization factor.  */
1521   if (do_peeling
1522       && all_misalignments_unknown
1523       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1524       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1525           < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1526     do_peeling = false;
1527
1528   if (do_peeling
1529       && all_misalignments_unknown
1530       && vect_supportable_dr_alignment (dr0, false))
1531     {
1532       /* Check if the target requires to prefer stores over loads, i.e., if
1533          misaligned stores are more expensive than misaligned loads (taking
1534          drs with same alignment into account).  */
1535       if (first_store && DR_IS_READ (dr0))
1536         {
1537           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1538           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1539           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1540           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1541           stmt_vector_for_cost dummy;
1542           dummy.create (2);
1543
1544           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1545                                      &dummy);
1546           vect_get_data_access_cost (first_store, &store_inside_cost,
1547                                      &store_outside_cost, &dummy);
1548
1549           dummy.release ();
1550
1551           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1552              aligning the load DR0).  */
1553           load_inside_penalty = store_inside_cost;
1554           load_outside_penalty = store_outside_cost;
1555           for (i = 0;
1556                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1557                           DR_STMT (first_store))).iterate (i, &dr);
1558                i++)
1559             if (DR_IS_READ (dr))
1560               {
1561                 load_inside_penalty += load_inside_cost;
1562                 load_outside_penalty += load_outside_cost;
1563               }
1564             else
1565               {
1566                 load_inside_penalty += store_inside_cost;
1567                 load_outside_penalty += store_outside_cost;
1568               }
1569
1570           /* Calculate the penalty for leaving DR0 unaligned (by
1571              aligning the FIRST_STORE).  */
1572           store_inside_penalty = load_inside_cost;
1573           store_outside_penalty = load_outside_cost;
1574           for (i = 0;
1575                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1576                       DR_STMT (dr0))).iterate (i, &dr);
1577                i++)
1578             if (DR_IS_READ (dr))
1579               {
1580                 store_inside_penalty += load_inside_cost;
1581                 store_outside_penalty += load_outside_cost;
1582               }
1583             else
1584               {
1585                 store_inside_penalty += store_inside_cost;
1586                 store_outside_penalty += store_outside_cost;
1587               }
1588
1589           if (load_inside_penalty > store_inside_penalty
1590               || (load_inside_penalty == store_inside_penalty
1591                   && load_outside_penalty > store_outside_penalty))
1592             dr0 = first_store;
1593         }
1594
1595       /* In case there are only loads with different unknown misalignments, use
1596          peeling only if it may help to align other accesses in the loop.  */
1597       if (!first_store
1598           && !STMT_VINFO_SAME_ALIGN_REFS (
1599                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1600           && vect_supportable_dr_alignment (dr0, false)
1601               != dr_unaligned_supported)
1602         do_peeling = false;
1603     }
1604
1605   if (do_peeling && !dr0)
1606     {
1607       /* Peeling is possible, but there is no data access that is not supported
1608          unless aligned. So we try to choose the best possible peeling.  */
1609
1610       /* We should get here only if there are drs with known misalignment.  */
1611       gcc_assert (!all_misalignments_unknown);
1612
1613       /* Choose the best peeling from the hash table.  */
1614       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1615                                                    &body_cost_vec);
1616       if (!dr0 || !npeel)
1617         do_peeling = false;
1618
1619       /* If peeling by npeel will result in a remaining loop not iterating
1620          enough to be vectorized then do not peel.  */
1621       if (do_peeling
1622           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1623           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1624               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1625         do_peeling = false;
1626     }
1627
1628   if (do_peeling)
1629     {
1630       stmt = DR_STMT (dr0);
1631       stmt_info = vinfo_for_stmt (stmt);
1632       vectype = STMT_VINFO_VECTYPE (stmt_info);
1633       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1634
1635       if (known_alignment_for_access_p (dr0))
1636         {
1637           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1638                                                 size_zero_node) < 0;
1639           if (!npeel)
1640             {
1641               /* Since it's known at compile time, compute the number of
1642                  iterations in the peeled loop (the peeling factor) for use in
1643                  updating DR_MISALIGNMENT values.  The peeling factor is the
1644                  vectorization factor minus the misalignment as an element
1645                  count.  */
1646               mis = DR_MISALIGNMENT (dr0);
1647               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1648               npeel = ((negative ? mis - nelements : nelements - mis)
1649                        & (nelements - 1));
1650             }
1651
1652           /* For interleaved data access every iteration accesses all the
1653              members of the group, therefore we divide the number of iterations
1654              by the group size.  */
1655           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1656           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1657             npeel /= GROUP_SIZE (stmt_info);
1658
1659           if (dump_enabled_p ())
1660             dump_printf_loc (MSG_NOTE, vect_location,
1661                              "Try peeling by %d\n", npeel);
1662         }
1663
1664       /* Ensure that all data refs can be vectorized after the peel.  */
1665       FOR_EACH_VEC_ELT (datarefs, i, dr)
1666         {
1667           int save_misalignment;
1668
1669           if (dr == dr0)
1670             continue;
1671
1672           stmt = DR_STMT (dr);
1673           stmt_info = vinfo_for_stmt (stmt);
1674           /* For interleaving, only the alignment of the first access
1675             matters.  */
1676           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1677               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1678             continue;
1679
1680           /* Strided loads perform only component accesses, alignment is
1681              irrelevant for them.  */
1682           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1683             continue;
1684
1685           save_misalignment = DR_MISALIGNMENT (dr);
1686           vect_update_misalignment_for_peel (dr, dr0, npeel);
1687           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1688           SET_DR_MISALIGNMENT (dr, save_misalignment);
1689
1690           if (!supportable_dr_alignment)
1691             {
1692               do_peeling = false;
1693               break;
1694             }
1695         }
1696
1697       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1698         {
1699           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1700           if (!stat)
1701             do_peeling = false;
1702           else
1703             {
1704               body_cost_vec.release ();
1705               return stat;
1706             }
1707         }
1708
1709       if (do_peeling)
1710         {
1711           unsigned max_allowed_peel
1712             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1713           if (max_allowed_peel != (unsigned)-1)
1714             {
1715               unsigned max_peel = npeel;
1716               if (max_peel == 0)
1717                 {
1718                   gimple dr_stmt = DR_STMT (dr0);
1719                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1720                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1721                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1722                 }
1723               if (max_peel > max_allowed_peel)
1724                 {
1725                   do_peeling = false;
1726                   if (dump_enabled_p ())
1727                     dump_printf_loc (MSG_NOTE, vect_location,
1728                         "Disable peeling, max peels reached: %d\n", max_peel);
1729                 }
1730             }
1731         }
1732
1733       if (do_peeling)
1734         {
1735           stmt_info_for_cost *si;
1736           void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1737
1738           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1739              If the misalignment of DR_i is identical to that of dr0 then set
1740              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1741              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1742              by the peeling factor times the element size of DR_i (MOD the
1743              vectorization factor times the size).  Otherwise, the
1744              misalignment of DR_i must be set to unknown.  */
1745           FOR_EACH_VEC_ELT (datarefs, i, dr)
1746             if (dr != dr0)
1747               vect_update_misalignment_for_peel (dr, dr0, npeel);
1748
1749           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1750           if (npeel)
1751             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1752           else
1753             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1754               = DR_MISALIGNMENT (dr0);
1755           SET_DR_MISALIGNMENT (dr0, 0);
1756           if (dump_enabled_p ())
1757             {
1758               dump_printf_loc (MSG_NOTE, vect_location,
1759                                "Alignment of access forced using peeling.\n");
1760               dump_printf_loc (MSG_NOTE, vect_location,
1761                                "Peeling for alignment will be applied.\n");
1762             }
1763           /* We've delayed passing the inside-loop peeling costs to the
1764              target cost model until we were sure peeling would happen.
1765              Do so now.  */
1766           if (body_cost_vec.exists ())
1767             {
1768               FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1769                 {
1770                   struct _stmt_vec_info *stmt_info
1771                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1772                   (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1773                                         si->misalign, vect_body);
1774                 }
1775               body_cost_vec.release ();
1776             }
1777
1778           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1779           gcc_assert (stat);
1780           return stat;
1781         }
1782     }
1783
1784   body_cost_vec.release ();
1785
1786   /* (2) Versioning to force alignment.  */
1787
1788   /* Try versioning if:
1789      1) optimize loop for speed
1790      2) there is at least one unsupported misaligned data ref with an unknown
1791         misalignment, and
1792      3) all misaligned data refs with a known misalignment are supported, and
1793      4) the number of runtime alignment checks is within reason.  */
1794
1795   do_versioning =
1796         optimize_loop_nest_for_speed_p (loop)
1797         && (!loop->inner); /* FORNOW */
1798
1799   if (do_versioning)
1800     {
1801       FOR_EACH_VEC_ELT (datarefs, i, dr)
1802         {
1803           stmt = DR_STMT (dr);
1804           stmt_info = vinfo_for_stmt (stmt);
1805
1806           /* For interleaving, only the alignment of the first access
1807              matters.  */
1808           if (aligned_access_p (dr)
1809               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1810                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1811             continue;
1812
1813           /* Strided loads perform only component accesses, alignment is
1814              irrelevant for them.  */
1815           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1816             continue;
1817
1818           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1819
1820           if (!supportable_dr_alignment)
1821             {
1822               gimple stmt;
1823               int mask;
1824               tree vectype;
1825
1826               if (known_alignment_for_access_p (dr)
1827                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1828                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1829                 {
1830                   do_versioning = false;
1831                   break;
1832                 }
1833
1834               stmt = DR_STMT (dr);
1835               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1836               gcc_assert (vectype);
1837
1838               /* The rightmost bits of an aligned address must be zeros.
1839                  Construct the mask needed for this test.  For example,
1840                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1841                  mask must be 15 = 0xf. */
1842               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1843
1844               /* FORNOW: use the same mask to test all potentially unaligned
1845                  references in the loop.  The vectorizer currently supports
1846                  a single vector size, see the reference to
1847                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1848                  vectorization factor is computed.  */
1849               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1850                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1851               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1852               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1853                       DR_STMT (dr));
1854             }
1855         }
1856
1857       /* Versioning requires at least one misaligned data reference.  */
1858       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1859         do_versioning = false;
1860       else if (!do_versioning)
1861         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862     }
1863
1864   if (do_versioning)
1865     {
1866       vec<gimple> may_misalign_stmts
1867         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1868       gimple stmt;
1869
1870       /* It can now be assumed that the data references in the statements
1871          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1872          of the loop being vectorized.  */
1873       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1874         {
1875           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1876           dr = STMT_VINFO_DATA_REF (stmt_info);
1877           SET_DR_MISALIGNMENT (dr, 0);
1878           if (dump_enabled_p ())
1879             dump_printf_loc (MSG_NOTE, vect_location,
1880                              "Alignment of access forced using versioning.\n");
1881         }
1882
1883       if (dump_enabled_p ())
1884         dump_printf_loc (MSG_NOTE, vect_location,
1885                          "Versioning for alignment will be applied.\n");
1886
1887       /* Peeling and versioning can't be done together at this time.  */
1888       gcc_assert (! (do_peeling && do_versioning));
1889
1890       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1891       gcc_assert (stat);
1892       return stat;
1893     }
1894
1895   /* This point is reached if neither peeling nor versioning is being done.  */
1896   gcc_assert (! (do_peeling || do_versioning));
1897
1898   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1899   return stat;
1900 }
1901
1902
1903 /* Function vect_find_same_alignment_drs.
1904
1905    Update group and alignment relations according to the chosen
1906    vectorization factor.  */
1907
1908 static void
1909 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1910                               loop_vec_info loop_vinfo)
1911 {
1912   unsigned int i;
1913   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1914   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1915   struct data_reference *dra = DDR_A (ddr);
1916   struct data_reference *drb = DDR_B (ddr);
1917   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1918   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1919   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1920   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1921   lambda_vector dist_v;
1922   unsigned int loop_depth;
1923
1924   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1925     return;
1926
1927   if (dra == drb)
1928     return;
1929
1930   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1931     return;
1932
1933   /* Loop-based vectorization and known data dependence.  */
1934   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1935     return;
1936
1937   /* Data-dependence analysis reports a distance vector of zero
1938      for data-references that overlap only in the first iteration
1939      but have different sign step (see PR45764).
1940      So as a sanity check require equal DR_STEP.  */
1941   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1942     return;
1943
1944   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1945   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1946     {
1947       int dist = dist_v[loop_depth];
1948
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_NOTE, vect_location,
1951                          "dependence distance  = %d.\n", dist);
1952
1953       /* Same loop iteration.  */
1954       if (dist == 0
1955           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1956         {
1957           /* Two references with distance zero have the same alignment.  */
1958           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1959           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1960           if (dump_enabled_p ())
1961             {
1962               dump_printf_loc (MSG_NOTE, vect_location,
1963                                "accesses have the same alignment.\n");
1964               dump_printf (MSG_NOTE,
1965                            "dependence distance modulo vf == 0 between ");
1966               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1967               dump_printf (MSG_NOTE,  " and ");
1968               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1969               dump_printf (MSG_NOTE, "\n");
1970             }
1971         }
1972     }
1973 }
1974
1975
1976 /* Function vect_analyze_data_refs_alignment
1977
1978    Analyze the alignment of the data-references in the loop.
1979    Return FALSE if a data reference is found that cannot be vectorized.  */
1980
1981 bool
1982 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1983                                   bb_vec_info bb_vinfo)
1984 {
1985   if (dump_enabled_p ())
1986     dump_printf_loc (MSG_NOTE, vect_location,
1987                      "=== vect_analyze_data_refs_alignment ===\n");
1988
1989   /* Mark groups of data references with same alignment using
1990      data dependence information.  */
1991   if (loop_vinfo)
1992     {
1993       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1994       struct data_dependence_relation *ddr;
1995       unsigned int i;
1996
1997       FOR_EACH_VEC_ELT (ddrs, i, ddr)
1998         vect_find_same_alignment_drs (ddr, loop_vinfo);
1999     }
2000
2001   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: can't calculate alignment "
2006                          "for data ref.\n");
2007       return false;
2008     }
2009
2010   return true;
2011 }
2012
2013
2014 /* Analyze groups of accesses: check that DR belongs to a group of
2015    accesses of legal size, step, etc.  Detect gaps, single element
2016    interleaving, and other special cases. Set grouped access info.
2017    Collect groups of strided stores for further use in SLP analysis.  */
2018
2019 static bool
2020 vect_analyze_group_access (struct data_reference *dr)
2021 {
2022   tree step = DR_STEP (dr);
2023   tree scalar_type = TREE_TYPE (DR_REF (dr));
2024   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2025   gimple stmt = DR_STMT (dr);
2026   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2027   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2028   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2029   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2030   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2031   bool slp_impossible = false;
2032   struct loop *loop = NULL;
2033
2034   if (loop_vinfo)
2035     loop = LOOP_VINFO_LOOP (loop_vinfo);
2036
2037   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2038      size of the interleaving group (including gaps).  */
2039   groupsize = absu_hwi (dr_step) / type_size;
2040
2041   /* Not consecutive access is possible only if it is a part of interleaving.  */
2042   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2043     {
2044       /* Check if it this DR is a part of interleaving, and is a single
2045          element of the group that is accessed in the loop.  */
2046
2047       /* Gaps are supported only for loads. STEP must be a multiple of the type
2048          size.  The size of the group must be a power of 2.  */
2049       if (DR_IS_READ (dr)
2050           && (dr_step % type_size) == 0
2051           && groupsize > 0
2052           && exact_log2 (groupsize) != -1)
2053         {
2054           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2055           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2056           if (dump_enabled_p ())
2057             {
2058               dump_printf_loc (MSG_NOTE, vect_location,
2059                                "Detected single element interleaving ");
2060               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2061               dump_printf (MSG_NOTE, " step ");
2062               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2063               dump_printf (MSG_NOTE, "\n");
2064             }
2065
2066           if (loop_vinfo)
2067             {
2068               if (dump_enabled_p ())
2069                 dump_printf_loc (MSG_NOTE, vect_location,
2070                                  "Data access with gaps requires scalar "
2071                                  "epilogue loop\n");
2072               if (loop->inner)
2073                 {
2074                   if (dump_enabled_p ())
2075                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076                                      "Peeling for outer loop is not"
2077                                      " supported\n");
2078                   return false;
2079                 }
2080
2081               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2082             }
2083
2084           return true;
2085         }
2086
2087       if (dump_enabled_p ())
2088         {
2089           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090                            "not consecutive access ");
2091           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2092           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2093         }
2094
2095       if (bb_vinfo)
2096         {
2097           /* Mark the statement as unvectorizable.  */
2098           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2099           return true;
2100         }
2101
2102       return false;
2103     }
2104
2105   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2106     {
2107       /* First stmt in the interleaving chain. Check the chain.  */
2108       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2109       struct data_reference *data_ref = dr;
2110       unsigned int count = 1;
2111       tree prev_init = DR_INIT (data_ref);
2112       gimple prev = stmt;
2113       HOST_WIDE_INT diff, gaps = 0;
2114       unsigned HOST_WIDE_INT count_in_bytes;
2115
2116       while (next)
2117         {
2118           /* Skip same data-refs.  In case that two or more stmts share
2119              data-ref (supported only for loads), we vectorize only the first
2120              stmt, and the rest get their vectorized loads from the first
2121              one.  */
2122           if (!tree_int_cst_compare (DR_INIT (data_ref),
2123                                      DR_INIT (STMT_VINFO_DATA_REF (
2124                                                    vinfo_for_stmt (next)))))
2125             {
2126               if (DR_IS_WRITE (data_ref))
2127                 {
2128                   if (dump_enabled_p ())
2129                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2130                                      "Two store stmts share the same dr.\n");
2131                   return false;
2132                 }
2133
2134               /* For load use the same data-ref load.  */
2135               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2136
2137               prev = next;
2138               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2139               continue;
2140             }
2141
2142           prev = next;
2143           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2144
2145           /* All group members have the same STEP by construction.  */
2146           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2147
2148           /* Check that the distance between two accesses is equal to the type
2149              size. Otherwise, we have gaps.  */
2150           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2151                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2152           if (diff != 1)
2153             {
2154               /* FORNOW: SLP of accesses with gaps is not supported.  */
2155               slp_impossible = true;
2156               if (DR_IS_WRITE (data_ref))
2157                 {
2158                   if (dump_enabled_p ())
2159                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2160                                      "interleaved store with gaps\n");
2161                   return false;
2162                 }
2163
2164               gaps += diff - 1;
2165             }
2166
2167           last_accessed_element += diff;
2168
2169           /* Store the gap from the previous member of the group. If there is no
2170              gap in the access, GROUP_GAP is always 1.  */
2171           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2172
2173           prev_init = DR_INIT (data_ref);
2174           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2175           /* Count the number of data-refs in the chain.  */
2176           count++;
2177         }
2178
2179       /* COUNT is the number of accesses found, we multiply it by the size of
2180          the type to get COUNT_IN_BYTES.  */
2181       count_in_bytes = type_size * count;
2182
2183       /* Check that the size of the interleaving (including gaps) is not
2184          greater than STEP.  */
2185       if (dr_step != 0
2186           && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2187         {
2188           if (dump_enabled_p ())
2189             {
2190               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191                                "interleaving size is greater than step for ");
2192               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2193                                  DR_REF (dr));
2194               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2195             }
2196           return false;
2197         }
2198
2199       /* Check that the size of the interleaving is equal to STEP for stores,
2200          i.e., that there are no gaps.  */
2201       if (dr_step != 0
2202           && absu_hwi (dr_step) != count_in_bytes)
2203         {
2204           if (DR_IS_READ (dr))
2205             {
2206               slp_impossible = true;
2207               /* There is a gap after the last load in the group. This gap is a
2208                  difference between the groupsize and the number of elements.
2209                  When there is no gap, this difference should be 0.  */
2210               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2211             }
2212           else
2213             {
2214               if (dump_enabled_p ())
2215                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216                                  "interleaved store with gaps\n");
2217               return false;
2218             }
2219         }
2220
2221       /* Check that STEP is a multiple of type size.  */
2222       if (dr_step != 0
2223           && (dr_step % type_size) != 0)
2224         {
2225           if (dump_enabled_p ())
2226             {
2227               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228                                "step is not a multiple of type size: step ");
2229               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2230               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2231               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2232                                  TYPE_SIZE_UNIT (scalar_type));
2233               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2234             }
2235           return false;
2236         }
2237
2238       if (groupsize == 0)
2239         groupsize = count;
2240
2241       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2242       if (dump_enabled_p ())
2243         dump_printf_loc (MSG_NOTE, vect_location,
2244                          "Detected interleaving of size %d\n", (int)groupsize);
2245
2246       /* SLP: create an SLP data structure for every interleaving group of
2247          stores for further analysis in vect_analyse_slp.  */
2248       if (DR_IS_WRITE (dr) && !slp_impossible)
2249         {
2250           if (loop_vinfo)
2251             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2252           if (bb_vinfo)
2253             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2254         }
2255
2256       /* There is a gap in the end of the group.  */
2257       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2258         {
2259           if (dump_enabled_p ())
2260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261                              "Data access with gaps requires scalar "
2262                              "epilogue loop\n");
2263           if (loop->inner)
2264             {
2265               if (dump_enabled_p ())
2266                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267                                  "Peeling for outer loop is not supported\n");
2268               return false;
2269             }
2270
2271           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2272         }
2273     }
2274
2275   return true;
2276 }
2277
2278
2279 /* Analyze the access pattern of the data-reference DR.
2280    In case of non-consecutive accesses call vect_analyze_group_access() to
2281    analyze groups of accesses.  */
2282
2283 static bool
2284 vect_analyze_data_ref_access (struct data_reference *dr)
2285 {
2286   tree step = DR_STEP (dr);
2287   tree scalar_type = TREE_TYPE (DR_REF (dr));
2288   gimple stmt = DR_STMT (dr);
2289   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2290   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2291   struct loop *loop = NULL;
2292
2293   if (loop_vinfo)
2294     loop = LOOP_VINFO_LOOP (loop_vinfo);
2295
2296   if (loop_vinfo && !step)
2297     {
2298       if (dump_enabled_p ())
2299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300                          "bad data-ref access in loop\n");
2301       return false;
2302     }
2303
2304   /* Allow invariant loads in not nested loops.  */
2305   if (loop_vinfo && integer_zerop (step))
2306     {
2307       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2308       if (nested_in_vect_loop_p (loop, stmt))
2309         {
2310           if (dump_enabled_p ())
2311             dump_printf_loc (MSG_NOTE, vect_location,
2312                              "zero step in inner loop of nest\n");
2313           return false;
2314         }
2315       return DR_IS_READ (dr);
2316     }
2317
2318   if (loop && nested_in_vect_loop_p (loop, stmt))
2319     {
2320       /* Interleaved accesses are not yet supported within outer-loop
2321         vectorization for references in the inner-loop.  */
2322       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2323
2324       /* For the rest of the analysis we use the outer-loop step.  */
2325       step = STMT_VINFO_DR_STEP (stmt_info);
2326       if (integer_zerop (step))
2327         {
2328           if (dump_enabled_p ())
2329             dump_printf_loc (MSG_NOTE, vect_location,
2330                              "zero step in outer loop.\n");
2331           if (DR_IS_READ (dr))
2332             return true;
2333           else
2334             return false;
2335         }
2336     }
2337
2338   /* Consecutive?  */
2339   if (TREE_CODE (step) == INTEGER_CST)
2340     {
2341       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2342       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2343           || (dr_step < 0
2344               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2345         {
2346           /* Mark that it is not interleaving.  */
2347           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2348           return true;
2349         }
2350     }
2351
2352   if (loop && nested_in_vect_loop_p (loop, stmt))
2353     {
2354       if (dump_enabled_p ())
2355         dump_printf_loc (MSG_NOTE, vect_location,
2356                          "grouped access in outer loop.\n");
2357       return false;
2358     }
2359
2360   /* Assume this is a DR handled by non-constant strided load case.  */
2361   if (TREE_CODE (step) != INTEGER_CST)
2362     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2363
2364   /* Not consecutive access - check if it's a part of interleaving group.  */
2365   return vect_analyze_group_access (dr);
2366 }
2367
2368
2369
2370 /*  A helper function used in the comparator function to sort data
2371     references.  T1 and T2 are two data references to be compared.
2372     The function returns -1, 0, or 1.  */
2373
2374 static int
2375 compare_tree (tree t1, tree t2)
2376 {
2377   int i, cmp;
2378   enum tree_code code;
2379   char tclass;
2380
2381   if (t1 == t2)
2382     return 0;
2383   if (t1 == NULL)
2384     return -1;
2385   if (t2 == NULL)
2386     return 1;
2387
2388
2389   if (TREE_CODE (t1) != TREE_CODE (t2))
2390     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2391
2392   code = TREE_CODE (t1);
2393   switch (code)
2394     {
2395     /* For const values, we can just use hash values for comparisons.  */
2396     case INTEGER_CST:
2397     case REAL_CST:
2398     case FIXED_CST:
2399     case STRING_CST:
2400     case COMPLEX_CST:
2401     case VECTOR_CST:
2402       {
2403         hashval_t h1 = iterative_hash_expr (t1, 0);
2404         hashval_t h2 = iterative_hash_expr (t2, 0);
2405         if (h1 != h2)
2406           return h1 < h2 ? -1 : 1;
2407         break;
2408       }
2409
2410     case SSA_NAME:
2411       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2412       if (cmp != 0)
2413         return cmp;
2414
2415       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2416         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2417       break;
2418
2419     default:
2420       tclass = TREE_CODE_CLASS (code);
2421
2422       /* For var-decl, we could compare their UIDs.  */
2423       if (tclass == tcc_declaration)
2424         {
2425           if (DECL_UID (t1) != DECL_UID (t2))
2426             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2427           break;
2428         }
2429
2430       /* For expressions with operands, compare their operands recursively.  */
2431       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2432         {
2433           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2434           if (cmp != 0)
2435             return cmp;
2436         }
2437     }
2438
2439   return 0;
2440 }
2441
2442
2443 /* Compare two data-references DRA and DRB to group them into chunks
2444    suitable for grouping.  */
2445
2446 static int
2447 dr_group_sort_cmp (const void *dra_, const void *drb_)
2448 {
2449   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2450   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2451   int cmp;
2452
2453   /* Stabilize sort.  */
2454   if (dra == drb)
2455     return 0;
2456
2457   /* Ordering of DRs according to base.  */
2458   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2459     {
2460       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2461       if (cmp != 0)
2462         return cmp;
2463     }
2464
2465   /* And according to DR_OFFSET.  */
2466   if (!dr_equal_offsets_p (dra, drb))
2467     {
2468       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2469       if (cmp != 0)
2470         return cmp;
2471     }
2472
2473   /* Put reads before writes.  */
2474   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2475     return DR_IS_READ (dra) ? -1 : 1;
2476
2477   /* Then sort after access size.  */
2478   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2479                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2480     {
2481       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2482                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2483       if (cmp != 0)
2484         return cmp;
2485     }
2486
2487   /* And after step.  */
2488   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2489     {
2490       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2491       if (cmp != 0)
2492         return cmp;
2493     }
2494
2495   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2496   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2497   if (cmp == 0)
2498     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2499   return cmp;
2500 }
2501
2502 /* Function vect_analyze_data_ref_accesses.
2503
2504    Analyze the access pattern of all the data references in the loop.
2505
2506    FORNOW: the only access pattern that is considered vectorizable is a
2507            simple step 1 (consecutive) access.
2508
2509    FORNOW: handle only arrays and pointer accesses.  */
2510
2511 bool
2512 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2513 {
2514   unsigned int i;
2515   vec<data_reference_p> datarefs;
2516   struct data_reference *dr;
2517
2518   if (dump_enabled_p ())
2519     dump_printf_loc (MSG_NOTE, vect_location,
2520                      "=== vect_analyze_data_ref_accesses ===\n");
2521
2522   if (loop_vinfo)
2523     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2524   else
2525     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2526
2527   if (datarefs.is_empty ())
2528     return true;
2529
2530   /* Sort the array of datarefs to make building the interleaving chains
2531      linear.  Don't modify the original vector's order, it is needed for
2532      determining what dependencies are reversed.  */
2533   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2534   datarefs_copy.qsort (dr_group_sort_cmp);
2535
2536   /* Build the interleaving chains.  */
2537   for (i = 0; i < datarefs_copy.length () - 1;)
2538     {
2539       data_reference_p dra = datarefs_copy[i];
2540       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2541       stmt_vec_info lastinfo = NULL;
2542       for (i = i + 1; i < datarefs_copy.length (); ++i)
2543         {
2544           data_reference_p drb = datarefs_copy[i];
2545           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2546
2547           /* ???  Imperfect sorting (non-compatible types, non-modulo
2548              accesses, same accesses) can lead to a group to be artificially
2549              split here as we don't just skip over those.  If it really
2550              matters we can push those to a worklist and re-iterate
2551              over them.  The we can just skip ahead to the next DR here.  */
2552
2553           /* Check that the data-refs have same first location (except init)
2554              and they are both either store or load (not load and store).  */
2555           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2556               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2557                                    DR_BASE_ADDRESS (drb), 0)
2558               || !dr_equal_offsets_p (dra, drb))
2559             break;
2560
2561           /* Check that the data-refs have the same constant size and step.  */
2562           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2563           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2564           if (!tree_fits_uhwi_p (sza)
2565               || !tree_fits_uhwi_p (szb)
2566               || !tree_int_cst_equal (sza, szb)
2567               || !tree_fits_shwi_p (DR_STEP (dra))
2568               || !tree_fits_shwi_p (DR_STEP (drb))
2569               || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2570             break;
2571
2572           /* Do not place the same access in the interleaving chain twice.  */
2573           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2574             break;
2575
2576           /* Check the types are compatible.
2577              ???  We don't distinguish this during sorting.  */
2578           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2579                                    TREE_TYPE (DR_REF (drb))))
2580             break;
2581
2582           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2583           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2584           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2585           gcc_assert (init_a < init_b);
2586
2587           /* If init_b == init_a + the size of the type * k, we have an
2588              interleaving, and DRA is accessed before DRB.  */
2589           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2590           if ((init_b - init_a) % type_size_a != 0)
2591             break;
2592
2593           /* The step (if not zero) is greater than the difference between
2594              data-refs' inits.  This splits groups into suitable sizes.  */
2595           HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2596           if (step != 0 && step <= (init_b - init_a))
2597             break;
2598
2599           if (dump_enabled_p ())
2600             {
2601               dump_printf_loc (MSG_NOTE, vect_location,
2602                                "Detected interleaving ");
2603               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2604               dump_printf (MSG_NOTE,  " and ");
2605               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2606               dump_printf (MSG_NOTE, "\n");
2607             }
2608
2609           /* Link the found element into the group list.  */
2610           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2611             {
2612               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2613               lastinfo = stmtinfo_a;
2614             }
2615           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2616           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2617           lastinfo = stmtinfo_b;
2618         }
2619     }
2620
2621   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2622     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2623         && !vect_analyze_data_ref_access (dr))
2624       {
2625         if (dump_enabled_p ())
2626           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627                            "not vectorized: complicated access pattern.\n");
2628
2629         if (bb_vinfo)
2630           {
2631             /* Mark the statement as not vectorizable.  */
2632             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2633             continue;
2634           }
2635         else
2636           {
2637             datarefs_copy.release ();
2638             return false;
2639           }
2640       }
2641
2642   datarefs_copy.release ();
2643   return true;
2644 }
2645
2646
2647 /* Operator == between two dr_with_seg_len objects.
2648
2649    This equality operator is used to make sure two data refs
2650    are the same one so that we will consider to combine the
2651    aliasing checks of those two pairs of data dependent data
2652    refs.  */
2653
2654 static bool
2655 operator == (const dr_with_seg_len& d1,
2656              const dr_with_seg_len& d2)
2657 {
2658   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2659                           DR_BASE_ADDRESS (d2.dr), 0)
2660            && compare_tree (d1.offset, d2.offset) == 0
2661            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2662 }
2663
2664 /* Function comp_dr_with_seg_len_pair.
2665
2666    Comparison function for sorting objects of dr_with_seg_len_pair_t
2667    so that we can combine aliasing checks in one scan.  */
2668
2669 static int
2670 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2671 {
2672   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2673   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2674
2675   const dr_with_seg_len &p11 = p1->first,
2676                         &p12 = p1->second,
2677                         &p21 = p2->first,
2678                         &p22 = p2->second;
2679
2680   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2681      if a and c have the same basic address snd step, and b and d have the same
2682      address and step.  Therefore, if any a&c or b&d don't have the same address
2683      and step, we don't care the order of those two pairs after sorting.  */
2684   int comp_res;
2685
2686   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2687                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2688     return comp_res;
2689   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2690                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2691     return comp_res;
2692   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2693     return comp_res;
2694   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2695     return comp_res;
2696   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2697     return comp_res;
2698   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2699     return comp_res;
2700
2701   return 0;
2702 }
2703
/* Exchange the values of A and B by copying through a temporary.  */

template <class T> static void
swap (T& a, T& b)
{
  T old_b (b);
  b = a;
  a = old_b;
}
2711
2712 /* Function vect_vfa_segment_size.
2713
2714    Create an expression that computes the size of segment
2715    that will be accessed for a data reference.  The functions takes into
2716    account that realignment loads may access one more vector.
2717
2718    Input:
2719      DR: The data reference.
2720      LENGTH_FACTOR: segment length to consider.
2721
2722    Return an expression whose value is the size of segment which will be
2723    accessed by DR.  */
2724
2725 static tree
2726 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2727 {
2728   tree segment_length;
2729
2730   if (integer_zerop (DR_STEP (dr)))
2731     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2732   else
2733     segment_length = size_binop (MULT_EXPR,
2734                                  fold_convert (sizetype, DR_STEP (dr)),
2735                                  fold_convert (sizetype, length_factor));
2736
2737   if (vect_supportable_dr_alignment (dr, false)
2738         == dr_explicit_realign_optimized)
2739     {
2740       tree vector_size = TYPE_SIZE_UNIT
2741                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2742
2743       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2744     }
2745   return segment_length;
2746 }
2747
2748 /* Function vect_prune_runtime_alias_test_list.
2749
2750    Prune a list of ddrs to be tested at run-time by versioning for alias.
2751    Merge several alias checks into one if possible.
2752    Return FALSE if resulting list of ddrs is longer then allowed by
2753    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2754
2755 bool
2756 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2757 {
2758   vec<ddr_p> may_alias_ddrs =
2759     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2760   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2761     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2762   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2763   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2764
2765   ddr_p ddr;
2766   unsigned int i;
2767   tree length_factor;
2768
2769   if (dump_enabled_p ())
2770     dump_printf_loc (MSG_NOTE, vect_location,
2771                      "=== vect_prune_runtime_alias_test_list ===\n");
2772
2773   if (may_alias_ddrs.is_empty ())
2774     return true;
2775
2776   /* Basically, for each pair of dependent data refs store_ptr_0
2777      and load_ptr_0, we create an expression:
2778
2779      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2780      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2781
2782      for aliasing checks.  However, in some cases we can decrease
2783      the number of checks by combining two checks into one.  For
2784      example, suppose we have another pair of data refs store_ptr_0
2785      and load_ptr_1, and if the following condition is satisfied:
2786
2787      load_ptr_0 < load_ptr_1  &&
2788      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2789
2790      (this condition means, in each iteration of vectorized loop,
2791      the accessed memory of store_ptr_0 cannot be between the memory
2792      of load_ptr_0 and load_ptr_1.)
2793
2794      we then can use only the following expression to finish the
2795      alising checks between store_ptr_0 & load_ptr_0 and
2796      store_ptr_0 & load_ptr_1:
2797
2798      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2799      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2800
2801      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2802      same basic address.  */
2803
2804   comp_alias_ddrs.create (may_alias_ddrs.length ());
2805
2806   /* First, we collect all data ref pairs for aliasing checks.  */
2807   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2808     {
2809       struct data_reference *dr_a, *dr_b;
2810       gimple dr_group_first_a, dr_group_first_b;
2811       tree segment_length_a, segment_length_b;
2812       gimple stmt_a, stmt_b;
2813
2814       dr_a = DDR_A (ddr);
2815       stmt_a = DR_STMT (DDR_A (ddr));
2816       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2817       if (dr_group_first_a)
2818         {
2819           stmt_a = dr_group_first_a;
2820           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2821         }
2822
2823       dr_b = DDR_B (ddr);
2824       stmt_b = DR_STMT (DDR_B (ddr));
2825       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2826       if (dr_group_first_b)
2827         {
2828           stmt_b = dr_group_first_b;
2829           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2830         }
2831
2832       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2833         length_factor = scalar_loop_iters;
2834       else
2835         length_factor = size_int (vect_factor);
2836       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2837       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2838
2839       dr_with_seg_len_pair_t dr_with_seg_len_pair
2840           (dr_with_seg_len (dr_a, segment_length_a),
2841            dr_with_seg_len (dr_b, segment_length_b));
2842
2843       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2844         swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2845
2846       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2847     }
2848
2849   /* Second, we sort the collected data ref pairs so that we can scan
2850      them once to combine all possible aliasing checks.  */
2851   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2852
2853   /* Third, we scan the sorted dr pairs and check if we can combine
2854      alias checks of two neighbouring dr pairs.  */
2855   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2856     {
2857       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2858       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2859                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2860                       *dr_a2 = &comp_alias_ddrs[i].first,
2861                       *dr_b2 = &comp_alias_ddrs[i].second;
2862
2863       /* Remove duplicate data ref pairs.  */
2864       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2865         {
2866           if (dump_enabled_p ())
2867             {
2868               dump_printf_loc (MSG_NOTE, vect_location,
2869                                "found equal ranges ");
2870               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2871                                  DR_REF (dr_a1->dr));
2872               dump_printf (MSG_NOTE,  ", ");
2873               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874                                  DR_REF (dr_b1->dr));
2875               dump_printf (MSG_NOTE,  " and ");
2876               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2877                                  DR_REF (dr_a2->dr));
2878               dump_printf (MSG_NOTE,  ", ");
2879               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880                                  DR_REF (dr_b2->dr));
2881               dump_printf (MSG_NOTE, "\n");
2882             }
2883
2884           comp_alias_ddrs.ordered_remove (i--);
2885           continue;
2886         }
2887
2888       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2889         {
2890           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2891              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2892           if (*dr_a1 == *dr_a2)
2893             {
2894               swap (dr_a1, dr_b1);
2895               swap (dr_a2, dr_b2);
2896             }
2897
2898           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2899                                 DR_BASE_ADDRESS (dr_a2->dr),
2900                                 0)
2901               || !tree_fits_shwi_p (dr_a1->offset)
2902               || !tree_fits_shwi_p (dr_a2->offset))
2903             continue;
2904
2905           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2906                                 - tree_to_shwi (dr_a1->offset));
2907
2908
2909           /* Now we check if the following condition is satisfied:
2910
2911              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2912
2913              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2914              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2915              have to make a best estimation.  We can get the minimum value
2916              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2917              then either of the following two conditions can guarantee the
2918              one above:
2919
2920              1: DIFF <= MIN_SEG_LEN_B
2921              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2922
2923              */
2924
2925           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2926                                           ? tree_to_shwi (dr_b1->seg_len)
2927                                           : vect_factor);
2928
2929           if (diff <= min_seg_len_b
2930               || (tree_fits_shwi_p (dr_a1->seg_len)
2931                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2932             {
2933               if (dump_enabled_p ())
2934                 {
2935                   dump_printf_loc (MSG_NOTE, vect_location,
2936                                    "merging ranges for ");
2937                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938                                      DR_REF (dr_a1->dr));
2939                   dump_printf (MSG_NOTE,  ", ");
2940                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2941                                      DR_REF (dr_b1->dr));
2942                   dump_printf (MSG_NOTE,  " and ");
2943                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2944                                      DR_REF (dr_a2->dr));
2945                   dump_printf (MSG_NOTE,  ", ");
2946                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2947                                      DR_REF (dr_b2->dr));
2948                   dump_printf (MSG_NOTE, "\n");
2949                 }
2950
2951               dr_a1->seg_len = size_binop (PLUS_EXPR,
2952                                            dr_a2->seg_len, size_int (diff));
2953               comp_alias_ddrs.ordered_remove (i--);
2954             }
2955         }
2956     }
2957
2958   dump_printf_loc (MSG_NOTE, vect_location,
2959                    "improved number of alias checks from %d to %d\n",
2960                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2961   if ((int) comp_alias_ddrs.length () >
2962       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2963     return false;
2964
2965   return true;
2966 }
2967
2968 /* Check whether a non-affine read in stmt is suitable for gather load
2969    and if so, return a builtin decl for that operation.  */
2970
2971 tree
2972 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2973                    tree *offp, int *scalep)
2974 {
2975   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2976   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2977   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2978   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2979   tree offtype = NULL_TREE;
2980   tree decl, base, off;
2981   enum machine_mode pmode;
2982   int punsignedp, pvolatilep;
2983
2984   base = DR_REF (dr);
2985   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2986      see if we can use the def stmt of the address.  */
2987   if (is_gimple_call (stmt)
2988       && gimple_call_internal_p (stmt)
2989       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2990           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2991       && TREE_CODE (base) == MEM_REF
2992       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2993       && integer_zerop (TREE_OPERAND (base, 1))
2994       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2995     {
2996       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2997       if (is_gimple_assign (def_stmt)
2998           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2999         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3000     }
3001
3002   /* The gather builtins need address of the form
3003      loop_invariant + vector * {1, 2, 4, 8}
3004      or
3005      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3006      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3007      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3008      multiplications and additions in it.  To get a vector, we need
3009      a single SSA_NAME that will be defined in the loop and will
3010      contain everything that is not loop invariant and that can be
3011      vectorized.  The following code attempts to find such a preexistng
3012      SSA_NAME OFF and put the loop invariants into a tree BASE
3013      that can be gimplified before the loop.  */
3014   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3015                               &pmode, &punsignedp, &pvolatilep, false);
3016   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3017
3018   if (TREE_CODE (base) == MEM_REF)
3019     {
3020       if (!integer_zerop (TREE_OPERAND (base, 1)))
3021         {
3022           if (off == NULL_TREE)
3023             {
3024               offset_int moff = mem_ref_offset (base);
3025               off = wide_int_to_tree (sizetype, moff);
3026             }
3027           else
3028             off = size_binop (PLUS_EXPR, off,
3029                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3030         }
3031       base = TREE_OPERAND (base, 0);
3032     }
3033   else
3034     base = build_fold_addr_expr (base);
3035
3036   if (off == NULL_TREE)
3037     off = size_zero_node;
3038
3039   /* If base is not loop invariant, either off is 0, then we start with just
3040      the constant offset in the loop invariant BASE and continue with base
3041      as OFF, otherwise give up.
3042      We could handle that case by gimplifying the addition of base + off
3043      into some SSA_NAME and use that as off, but for now punt.  */
3044   if (!expr_invariant_in_loop_p (loop, base))
3045     {
3046       if (!integer_zerop (off))
3047         return NULL_TREE;
3048       off = base;
3049       base = size_int (pbitpos / BITS_PER_UNIT);
3050     }
3051   /* Otherwise put base + constant offset into the loop invariant BASE
3052      and continue with OFF.  */
3053   else
3054     {
3055       base = fold_convert (sizetype, base);
3056       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3057     }
3058
3059   /* OFF at this point may be either a SSA_NAME or some tree expression
3060      from get_inner_reference.  Try to peel off loop invariants from it
3061      into BASE as long as possible.  */
3062   STRIP_NOPS (off);
3063   while (offtype == NULL_TREE)
3064     {
3065       enum tree_code code;
3066       tree op0, op1, add = NULL_TREE;
3067
3068       if (TREE_CODE (off) == SSA_NAME)
3069         {
3070           gimple def_stmt = SSA_NAME_DEF_STMT (off);
3071
3072           if (expr_invariant_in_loop_p (loop, off))
3073             return NULL_TREE;
3074
3075           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3076             break;
3077
3078           op0 = gimple_assign_rhs1 (def_stmt);
3079           code = gimple_assign_rhs_code (def_stmt);
3080           op1 = gimple_assign_rhs2 (def_stmt);
3081         }
3082       else
3083         {
3084           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3085             return NULL_TREE;
3086           code = TREE_CODE (off);
3087           extract_ops_from_tree (off, &code, &op0, &op1);
3088         }
3089       switch (code)
3090         {
3091         case POINTER_PLUS_EXPR:
3092         case PLUS_EXPR:
3093           if (expr_invariant_in_loop_p (loop, op0))
3094             {
3095               add = op0;
3096               off = op1;
3097             do_add:
3098               add = fold_convert (sizetype, add);
3099               if (scale != 1)
3100                 add = size_binop (MULT_EXPR, add, size_int (scale));
3101               base = size_binop (PLUS_EXPR, base, add);
3102               continue;
3103             }
3104           if (expr_invariant_in_loop_p (loop, op1))
3105             {
3106               add = op1;
3107               off = op0;
3108               goto do_add;
3109             }
3110           break;
3111         case MINUS_EXPR:
3112           if (expr_invariant_in_loop_p (loop, op1))
3113             {
3114               add = fold_convert (sizetype, op1);
3115               add = size_binop (MINUS_EXPR, size_zero_node, add);
3116               off = op0;
3117               goto do_add;
3118             }
3119           break;
3120         case MULT_EXPR:
3121           if (scale == 1 && tree_fits_shwi_p (op1))
3122             {
3123               scale = tree_to_shwi (op1);
3124               off = op0;
3125               continue;
3126             }
3127           break;
3128         case SSA_NAME:
3129           off = op0;
3130           continue;
3131         CASE_CONVERT:
3132           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3133               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3134             break;
3135           if (TYPE_PRECISION (TREE_TYPE (op0))
3136               == TYPE_PRECISION (TREE_TYPE (off)))
3137             {
3138               off = op0;
3139               continue;
3140             }
3141           if (TYPE_PRECISION (TREE_TYPE (op0))
3142               < TYPE_PRECISION (TREE_TYPE (off)))
3143             {
3144               off = op0;
3145               offtype = TREE_TYPE (off);
3146               STRIP_NOPS (off);
3147               continue;
3148             }
3149           break;
3150         default:
3151           break;
3152         }
3153       break;
3154     }
3155
3156   /* If at the end OFF still isn't a SSA_NAME or isn't
3157      defined in the loop, punt.  */
3158   if (TREE_CODE (off) != SSA_NAME
3159       || expr_invariant_in_loop_p (loop, off))
3160     return NULL_TREE;
3161
3162   if (offtype == NULL_TREE)
3163     offtype = TREE_TYPE (off);
3164
3165   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3166                                            offtype, scale);
3167   if (decl == NULL_TREE)
3168     return NULL_TREE;
3169
3170   if (basep)
3171     *basep = base;
3172   if (offp)
3173     *offp = off;
3174   if (scalep)
3175     *scalep = scale;
3176   return decl;
3177 }
3178
3179 /* Function vect_analyze_data_refs.
3180
3181   Find all the data references in the loop or basic block.
3182
3183    The general structure of the analysis of data refs in the vectorizer is as
3184    follows:
3185    1- vect_analyze_data_refs(loop/bb): call
3186       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3187       in the loop/bb and their dependences.
3188    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3189    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3190    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3191
3192 */
3193
3194 bool
3195 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3196                         bb_vec_info bb_vinfo,
3197                         int *min_vf, unsigned *n_stmts)
3198 {
3199   struct loop *loop = NULL;
3200   basic_block bb = NULL;
3201   unsigned int i;
3202   vec<data_reference_p> datarefs;
3203   struct data_reference *dr;
3204   tree scalar_type;
3205
3206   if (dump_enabled_p ())
3207     dump_printf_loc (MSG_NOTE, vect_location,
3208                      "=== vect_analyze_data_refs ===\n");
3209
3210   if (loop_vinfo)
3211     {
3212       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3213
3214       loop = LOOP_VINFO_LOOP (loop_vinfo);
3215       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3216       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3217         {
3218           if (dump_enabled_p ())
3219             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3220                              "not vectorized: loop contains function calls"
3221                              " or data references that cannot be analyzed\n");
3222           return false;
3223         }
3224
3225       for (i = 0; i < loop->num_nodes; i++)
3226         {
3227           gimple_stmt_iterator gsi;
3228
3229           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3230             {
3231               gimple stmt = gsi_stmt (gsi);
3232               if (is_gimple_debug (stmt))
3233                 continue;
3234               ++*n_stmts;
3235               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3236                 {
3237                   if (is_gimple_call (stmt) && loop->safelen)
3238                     {
3239                       tree fndecl = gimple_call_fndecl (stmt), op;
3240                       if (fndecl != NULL_TREE)
3241                         {
3242                           struct cgraph_node *node = cgraph_node::get (fndecl);
3243                           if (node != NULL && node->simd_clones != NULL)
3244                             {
3245                               unsigned int j, n = gimple_call_num_args (stmt);
3246                               for (j = 0; j < n; j++)
3247                                 {
3248                                   op = gimple_call_arg (stmt, j);
3249                                   if (DECL_P (op)
3250                                       || (REFERENCE_CLASS_P (op)
3251                                           && get_base_address (op)))
3252                                     break;
3253                                 }
3254                               op = gimple_call_lhs (stmt);
3255                               /* Ignore #pragma omp declare simd functions
3256                                  if they don't have data references in the
3257                                  call stmt itself.  */
3258                               if (j == n
3259                                   && !(op
3260                                        && (DECL_P (op)
3261                                            || (REFERENCE_CLASS_P (op)
3262                                                && get_base_address (op)))))
3263                                 continue;
3264                             }
3265                         }
3266                     }
3267                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3268                   if (dump_enabled_p ())
3269                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3270                                      "not vectorized: loop contains function "
3271                                      "calls or data references that cannot "
3272                                      "be analyzed\n");
3273                   return false;
3274                 }
3275             }
3276         }
3277
3278       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3279     }
3280   else
3281     {
3282       gimple_stmt_iterator gsi;
3283
3284       bb = BB_VINFO_BB (bb_vinfo);
3285       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3286         {
3287           gimple stmt = gsi_stmt (gsi);
3288           if (is_gimple_debug (stmt))
3289             continue;
3290           ++*n_stmts;
3291           if (!find_data_references_in_stmt (NULL, stmt,
3292                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3293             {
3294               /* Mark the rest of the basic-block as unvectorizable.  */
3295               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3296                 {
3297                   stmt = gsi_stmt (gsi);
3298                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3299                 }
3300               break;
3301             }
3302         }
3303
3304       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3305     }
3306
3307   /* Go through the data-refs, check that the analysis succeeded.  Update
3308      pointer from stmt_vec_info struct to DR and vectype.  */
3309
3310   FOR_EACH_VEC_ELT (datarefs, i, dr)
3311     {
3312       gimple stmt;
3313       stmt_vec_info stmt_info;
3314       tree base, offset, init;
3315       bool gather = false;
3316       bool simd_lane_access = false;
3317       int vf;
3318
3319 again:
3320       if (!dr || !DR_REF (dr))
3321         {
3322           if (dump_enabled_p ())
3323             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3324                              "not vectorized: unhandled data-ref\n");
3325           return false;
3326         }
3327
3328       stmt = DR_STMT (dr);
3329       stmt_info = vinfo_for_stmt (stmt);
3330
3331       /* Discard clobbers from the dataref vector.  We will remove
3332          clobber stmts during vectorization.  */
3333       if (gimple_clobber_p (stmt))
3334         {
3335           free_data_ref (dr);
3336           if (i == datarefs.length () - 1)
3337             {
3338               datarefs.pop ();
3339               break;
3340             }
3341           datarefs.ordered_remove (i);
3342           dr = datarefs[i];
3343           goto again;
3344         }
3345
3346       /* Check that analysis of the data-ref succeeded.  */
3347       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3348           || !DR_STEP (dr))
3349         {
3350           bool maybe_gather
3351             = DR_IS_READ (dr)
3352               && !TREE_THIS_VOLATILE (DR_REF (dr))
3353               && targetm.vectorize.builtin_gather != NULL;
3354           bool maybe_simd_lane_access
3355             = loop_vinfo && loop->simduid;
3356
3357           /* If target supports vector gather loads, or if this might be
3358              a SIMD lane access, see if they can't be used.  */
3359           if (loop_vinfo
3360               && (maybe_gather || maybe_simd_lane_access)
3361               && !nested_in_vect_loop_p (loop, stmt))
3362             {
3363               struct data_reference *newdr
3364                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3365                                    DR_REF (dr), stmt, true);
3366               gcc_assert (newdr != NULL && DR_REF (newdr));
3367               if (DR_BASE_ADDRESS (newdr)
3368                   && DR_OFFSET (newdr)
3369                   && DR_INIT (newdr)
3370                   && DR_STEP (newdr)
3371                   && integer_zerop (DR_STEP (newdr)))
3372                 {
3373                   if (maybe_simd_lane_access)
3374                     {
3375                       tree off = DR_OFFSET (newdr);
3376                       STRIP_NOPS (off);
3377                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3378                           && TREE_CODE (off) == MULT_EXPR
3379                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3380                         {
3381                           tree step = TREE_OPERAND (off, 1);
3382                           off = TREE_OPERAND (off, 0);
3383                           STRIP_NOPS (off);
3384                           if (CONVERT_EXPR_P (off)
3385                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3386                                                                           0)))
3387                                  < TYPE_PRECISION (TREE_TYPE (off)))
3388                             off = TREE_OPERAND (off, 0);
3389                           if (TREE_CODE (off) == SSA_NAME)
3390                             {
3391                               gimple def = SSA_NAME_DEF_STMT (off);
3392                               tree reft = TREE_TYPE (DR_REF (newdr));
3393                               if (is_gimple_call (def)
3394                                   && gimple_call_internal_p (def)
3395                                   && (gimple_call_internal_fn (def)
3396                                       == IFN_GOMP_SIMD_LANE))
3397                                 {
3398                                   tree arg = gimple_call_arg (def, 0);
3399                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3400                                   arg = SSA_NAME_VAR (arg);
3401                                   if (arg == loop->simduid
3402                                       /* For now.  */
3403                                       && tree_int_cst_equal
3404                                            (TYPE_SIZE_UNIT (reft),
3405                                             step))
3406                                     {
3407                                       DR_OFFSET (newdr) = ssize_int (0);
3408                                       DR_STEP (newdr) = step;
3409                                       DR_ALIGNED_TO (newdr)
3410                                         = size_int (BIGGEST_ALIGNMENT);
3411                                       dr = newdr;
3412                                       simd_lane_access = true;
3413                                     }
3414                                 }
3415                             }
3416                         }
3417                     }
3418                   if (!simd_lane_access && maybe_gather)
3419                     {
3420                       dr = newdr;
3421                       gather = true;
3422                     }
3423                 }
3424               if (!gather && !simd_lane_access)
3425                 free_data_ref (newdr);
3426             }
3427
3428           if (!gather && !simd_lane_access)
3429             {
3430               if (dump_enabled_p ())
3431                 {
3432                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3433                                    "not vectorized: data ref analysis "
3434                                    "failed ");
3435                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3436                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3437                 }
3438
3439               if (bb_vinfo)
3440                 break;
3441
3442               return false;
3443             }
3444         }
3445
3446       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3447         {
3448           if (dump_enabled_p ())
3449             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3450                              "not vectorized: base addr of dr is a "
3451                              "constant\n");
3452
3453           if (bb_vinfo)
3454             break;
3455
3456           if (gather || simd_lane_access)
3457             free_data_ref (dr);
3458           return false;
3459         }
3460
3461       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3462         {
3463           if (dump_enabled_p ())
3464             {
3465               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466                                "not vectorized: volatile type ");
3467               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3468               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3469             }
3470
3471           if (bb_vinfo)
3472             break;
3473
3474           return false;
3475         }
3476
3477       if (stmt_can_throw_internal (stmt))
3478         {
3479           if (dump_enabled_p ())
3480             {
3481               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3482                                "not vectorized: statement can throw an "
3483                                "exception ");
3484               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3485               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3486             }
3487
3488           if (bb_vinfo)
3489             break;
3490
3491           if (gather || simd_lane_access)
3492             free_data_ref (dr);
3493           return false;
3494         }
3495
3496       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3497           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3498         {
3499           if (dump_enabled_p ())
3500             {
3501               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3502                                "not vectorized: statement is bitfield "
3503                                "access ");
3504               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3505               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3506             }
3507
3508           if (bb_vinfo)
3509             break;
3510
3511           if (gather || simd_lane_access)
3512             free_data_ref (dr);
3513           return false;
3514         }
3515
3516       base = unshare_expr (DR_BASE_ADDRESS (dr));
3517       offset = unshare_expr (DR_OFFSET (dr));
3518       init = unshare_expr (DR_INIT (dr));
3519
3520       if (is_gimple_call (stmt)
3521           && (!gimple_call_internal_p (stmt)
3522               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3523                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3524         {
3525           if (dump_enabled_p ())
3526             {
3527               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3528                                "not vectorized: dr in a call ");
3529               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3530               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3531             }
3532
3533           if (bb_vinfo)
3534             break;
3535
3536           if (gather || simd_lane_access)
3537             free_data_ref (dr);
3538           return false;
3539         }
3540
3541       /* Update DR field in stmt_vec_info struct.  */
3542
3543       /* If the dataref is in an inner-loop of the loop that is considered for
3544          for vectorization, we also want to analyze the access relative to
3545          the outer-loop (DR contains information only relative to the
3546          inner-most enclosing loop).  We do that by building a reference to the
3547          first location accessed by the inner-loop, and analyze it relative to
3548          the outer-loop.  */
3549       if (loop && nested_in_vect_loop_p (loop, stmt))
3550         {
3551           tree outer_step, outer_base, outer_init;
3552           HOST_WIDE_INT pbitsize, pbitpos;
3553           tree poffset;
3554           enum machine_mode pmode;
3555           int punsignedp, pvolatilep;
3556           affine_iv base_iv, offset_iv;
3557           tree dinit;
3558
3559           /* Build a reference to the first location accessed by the
3560              inner-loop: *(BASE+INIT).  (The first location is actually
3561              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3562           tree inner_base = build_fold_indirect_ref
3563                                 (fold_build_pointer_plus (base, init));
3564
3565           if (dump_enabled_p ())
3566             {
3567               dump_printf_loc (MSG_NOTE, vect_location,
3568                                "analyze in outer-loop: ");
3569               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3570               dump_printf (MSG_NOTE, "\n");
3571             }
3572
3573           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3574                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3575           gcc_assert (outer_base != NULL_TREE);
3576
3577           if (pbitpos % BITS_PER_UNIT != 0)
3578             {
3579               if (dump_enabled_p ())
3580                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581                                  "failed: bit offset alignment.\n");
3582               return false;
3583             }
3584
3585           outer_base = build_fold_addr_expr (outer_base);
3586           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3587                           &base_iv, false))
3588             {
3589               if (dump_enabled_p ())
3590                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3591                                  "failed: evolution of base is not affine.\n");
3592               return false;
3593             }
3594
3595           if (offset)
3596             {
3597               if (poffset)
3598                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3599                                        poffset);
3600               else
3601                 poffset = offset;
3602             }
3603
3604           if (!poffset)
3605             {
3606               offset_iv.base = ssize_int (0);
3607               offset_iv.step = ssize_int (0);
3608             }
3609           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3610                                &offset_iv, false))
3611             {
3612               if (dump_enabled_p ())
3613                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3614                                  "evolution of offset is not affine.\n");
3615               return false;
3616             }
3617
3618           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3619           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3620           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3621           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3622           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3623
3624           outer_step = size_binop (PLUS_EXPR,
3625                                 fold_convert (ssizetype, base_iv.step),
3626                                 fold_convert (ssizetype, offset_iv.step));
3627
3628           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3629           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3630           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3631           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3632           STMT_VINFO_DR_OFFSET (stmt_info) =
3633                                 fold_convert (ssizetype, offset_iv.base);
3634           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3635                                 size_int (highest_pow2_factor (offset_iv.base));
3636
3637           if (dump_enabled_p ())
3638             {
3639               dump_printf_loc (MSG_NOTE, vect_location,
3640                                "\touter base_address: ");
3641               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3642                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3643               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3644               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3645                                  STMT_VINFO_DR_OFFSET (stmt_info));
3646               dump_printf (MSG_NOTE,
3647                            "\n\touter constant offset from base address: ");
3648               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3649                                  STMT_VINFO_DR_INIT (stmt_info));
3650               dump_printf (MSG_NOTE, "\n\touter step: ");
3651               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3652                                  STMT_VINFO_DR_STEP (stmt_info));
3653               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3654               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3655                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3656               dump_printf (MSG_NOTE, "\n");
3657             }
3658         }
3659
3660       if (STMT_VINFO_DATA_REF (stmt_info))
3661         {
3662           if (dump_enabled_p ())
3663             {
3664               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665                                "not vectorized: more than one data ref "
3666                                "in stmt: ");
3667               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3668               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3669             }
3670
3671           if (bb_vinfo)
3672             break;
3673
3674           if (gather || simd_lane_access)
3675             free_data_ref (dr);
3676           return false;
3677         }
3678
3679       STMT_VINFO_DATA_REF (stmt_info) = dr;
3680       if (simd_lane_access)
3681         {
3682           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3683           free_data_ref (datarefs[i]);
3684           datarefs[i] = dr;
3685         }
3686
3687       /* Set vectype for STMT.  */
3688       scalar_type = TREE_TYPE (DR_REF (dr));
3689       STMT_VINFO_VECTYPE (stmt_info)
3690         = get_vectype_for_scalar_type (scalar_type);
3691       if (!STMT_VINFO_VECTYPE (stmt_info))
3692         {
3693           if (dump_enabled_p ())
3694             {
3695               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3696                                "not vectorized: no vectype for stmt: ");
3697               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3698               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3699               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3700                                  scalar_type);
3701               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3702             }
3703
3704           if (bb_vinfo)
3705             break;
3706
3707           if (gather || simd_lane_access)
3708             {
3709               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3710               if (gather)
3711                 free_data_ref (dr);
3712             }
3713           return false;
3714         }
3715       else
3716         {
3717           if (dump_enabled_p ())
3718             {
3719               dump_printf_loc (MSG_NOTE, vect_location,
3720                                "got vectype for stmt: ");
3721               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3722               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3723                                  STMT_VINFO_VECTYPE (stmt_info));
3724               dump_printf (MSG_NOTE, "\n");
3725             }
3726         }
3727
3728       /* Adjust the minimal vectorization factor according to the
3729          vector type.  */
3730       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3731       if (vf > *min_vf)
3732         *min_vf = vf;
3733
3734       if (gather)
3735         {
3736           tree off;
3737
3738           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3739           if (gather
3740               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3741             gather = false;
3742           if (!gather)
3743             {
3744               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3745               free_data_ref (dr);
3746               if (dump_enabled_p ())
3747                 {
3748                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3749                                    "not vectorized: not suitable for gather "
3750                                    "load ");
3751                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3752                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3753                 }
3754               return false;
3755             }
3756
3757           datarefs[i] = dr;
3758           STMT_VINFO_GATHER_P (stmt_info) = true;
3759         }
3760       else if (loop_vinfo
3761                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3762         {
3763           if (nested_in_vect_loop_p (loop, stmt)
3764               || !DR_IS_READ (dr))
3765             {
3766               if (dump_enabled_p ())
3767                 {
3768                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769                                    "not vectorized: not suitable for strided "
3770                                    "load ");
3771                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3772                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3773                 }
3774               return false;
3775             }
3776           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3777         }
3778     }
3779
3780   /* If we stopped analysis at the first dataref we could not analyze
3781      when trying to vectorize a basic-block mark the rest of the datarefs
3782      as not vectorizable and truncate the vector of datarefs.  That
3783      avoids spending useless time in analyzing their dependence.  */
3784   if (i != datarefs.length ())
3785     {
3786       gcc_assert (bb_vinfo != NULL);
3787       for (unsigned j = i; j < datarefs.length (); ++j)
3788         {
3789           data_reference_p dr = datarefs[j];
3790           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3791           free_data_ref (dr);
3792         }
3793       datarefs.truncate (i);
3794     }
3795
3796   return true;
3797 }
3798
3799
3800 /* Function vect_get_new_vect_var.
3801
3802    Returns a name for a new variable.  The current naming scheme appends the
3803    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3804    the name of vectorizer generated variables, and appends that to NAME if
3805    provided.  */
3806
3807 tree
3808 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3809 {
3810   const char *prefix;
3811   tree new_vect_var;
3812
3813   switch (var_kind)
3814   {
3815   case vect_simple_var:
3816     prefix = "vect";
3817     break;
3818   case vect_scalar_var:
3819     prefix = "stmp";
3820     break;
3821   case vect_pointer_var:
3822     prefix = "vectp";
3823     break;
3824   default:
3825     gcc_unreachable ();
3826   }
3827
3828   if (name)
3829     {
3830       char* tmp = concat (prefix, "_", name, NULL);
3831       new_vect_var = create_tmp_reg (type, tmp);
3832       free (tmp);
3833     }
3834   else
3835     new_vect_var = create_tmp_reg (type, prefix);
3836
3837   return new_vect_var;
3838 }
3839
3840
3841 /* Function vect_create_addr_base_for_vector_ref.
3842
3843    Create an expression that computes the address of the first memory location
3844    that will be accessed for a data reference.
3845
3846    Input:
3847    STMT: The statement containing the data reference.
3848    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3849    OFFSET: Optional. If supplied, it is added to the initial address.
3850    LOOP:    Specify relative to which loop-nest should the address be computed.
3851             For example, when the dataref is in an inner-loop nested in an
3852             outer-loop that is now being vectorized, LOOP can be either the
3853             outer-loop, or the inner-loop.  The first memory location accessed
3854             by the following dataref ('in' points to short):
3855
3856                 for (i=0; i<N; i++)
3857                    for (j=0; j<M; j++)
3858                      s += in[i+j]
3859
3860             is as follows:
3861             if LOOP=i_loop:     &in             (relative to i_loop)
3862             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3863    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3864             initial address.  Unlike OFFSET, which is number of elements to
3865             be added, BYTE_OFFSET is measured in bytes.
3866
3867    Output:
3868    1. Return an SSA_NAME whose value is the address of the memory location of
3869       the first vector of the data reference.
3870    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3871       these statement(s) which define the returned SSA_NAME.
3872
3873    FORNOW: We are only handling array accesses with step 1.  */
3874
3875 tree
3876 vect_create_addr_base_for_vector_ref (gimple stmt,
3877                                       gimple_seq *new_stmt_list,
3878                                       tree offset,
3879                                       struct loop *loop,
3880                                       tree byte_offset)
3881 {
3882   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3883   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3884   tree data_ref_base;
3885   const char *base_name;
3886   tree addr_base;
3887   tree dest;
3888   gimple_seq seq = NULL;
3889   tree base_offset;
3890   tree init;
3891   tree vect_ptr_type;
3892   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3893   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3894   /* If LOOP is not the loop containing STMT, use the STMT_INFO copies.  */
3895   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3896     {
3897       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3898
3899       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3900
3901       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3902       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3903       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3904     }
3905   else
3906     {
3907       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3908       base_offset = unshare_expr (DR_OFFSET (dr));
3909       init = unshare_expr (DR_INIT (dr));
3910     }
3911   /* Without a LOOP_VINFO the offsets are zeroed and DR_REF names the base.  */
3912   if (loop_vinfo)
3913     base_name = get_name (data_ref_base);
3914   else
3915     {
3916       base_offset = ssize_int (0);
3917       init = ssize_int (0);
3918       base_name = get_name (DR_REF (dr));
3919     }
3920
3921   /* Create base_offset = offset + init of the data-ref, as a sizetype.  */
3922   base_offset = size_binop (PLUS_EXPR,
3923                             fold_convert (sizetype, base_offset),
3924                             fold_convert (sizetype, init));
3925   /* OFFSET counts elements, so scale it by the element size (STEP).  */
3926   if (offset)
3927     {
3928       offset = fold_build2 (MULT_EXPR, sizetype,
3929                             fold_convert (sizetype, offset), step);
3930       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3931                                  base_offset, offset);
3932     }
3933   if (byte_offset)
3934     {
3935       byte_offset = fold_convert (sizetype, byte_offset);
3936       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3937                                  base_offset, byte_offset);
3938     }
3939
3940   /* base + base_offset; without a LOOP_VINFO it is &DR_REF, no offset.  */
3941   if (loop_vinfo)
3942     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3943   else
3944     {
3945       addr_base = build1 (ADDR_EXPR,
3946                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3947                           unshare_expr (DR_REF (dr)));
3948     }
3949
3950   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3951   addr_base = fold_convert (vect_ptr_type, addr_base);
3952   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3953   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3954   gimple_seq_add_seq (new_stmt_list, seq);
3955   /* Either kind of offset invalidates the alignment copied from DR.  */
3956   if (DR_PTR_INFO (dr)
3957       && TREE_CODE (addr_base) == SSA_NAME)
3958     {
3959       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3960       if (offset || byte_offset)
3961         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3962     }
3963
3964   if (dump_enabled_p ())
3965     {
3966       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3967       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3968       dump_printf (MSG_NOTE, "\n");
3969     }
3970
3971   return addr_base;
3972 }
3973
3974
3975 /* Function vect_create_data_ref_ptr.
3976
3977    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3978    location accessed in the loop by STMT, along with the def-use update
3979    chain to appropriately advance the pointer through the loop iterations.
3980    Also set aliasing information for the pointer.  This pointer is used by
3981    the callers to this function to create a memory reference expression for
3982    vector load/store access.
3983
3984    Input:
3985    1. STMT: a stmt that references memory. Expected to be of the form
3986          GIMPLE_ASSIGN <name, data-ref> or
3987          GIMPLE_ASSIGN <data-ref, name>.
3988    2. AGGR_TYPE: the type of the reference, which should be either a vector
3989         or an array.
3990    3. AT_LOOP: the loop where the vector memref is to be created.
3991    4. OFFSET (optional): an offset to be added to the initial address accessed
3992         by the data-ref in STMT.
3993    5. GSI: location where the new stmts are to be placed if there is no loop
3994    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3995         pointing to the initial address.
3996    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3997         to the initial address accessed by the data-ref in STMT.  This is
3998         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3999         in bytes.
4000
4001    Output:
4002    1. Declare a new ptr to vector_type, and have it point to the base of the
4003       data reference (initial address accessed by the data reference).
4004       For example, for vector of type V8HI, the following code is generated:
4005
4006       v8hi *ap;
4007       ap = (v8hi *)initial_address;
4008
4009       if OFFSET is not supplied:
4010          initial_address = &a[init];
4011       if OFFSET is supplied:
4012          initial_address = &a[init + OFFSET];
4013       if BYTE_OFFSET is supplied:
4014          initial_address = &a[init] + BYTE_OFFSET;
4015
4016       Return the initial_address in INITIAL_ADDRESS.
4017
4018    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4019       update the pointer in each iteration of the loop.
4020
4021       Return the increment stmt that updates the pointer in PTR_INCR.
4022
4023    3. Set INV_P to true if the access pattern of the data reference in the
4024       vectorized loop is invariant.  Set it to false otherwise.
4025
4026    4. Return the pointer.  */
4027
4028 tree
4029 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4030                           tree offset, tree *initial_address,
4031                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4032                           bool only_init, bool *inv_p, tree byte_offset)
4033 {
4034   const char *base_name;
4035   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4036   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4037   struct loop *loop = NULL;
4038   bool nested_in_vect_loop = false;
4039   struct loop *containing_loop = NULL;
4040   tree aggr_ptr_type;
4041   tree aggr_ptr;
4042   tree new_temp;
4043   gimple vec_stmt;
4044   gimple_seq new_stmt_list = NULL;
4045   edge pe = NULL;
4046   basic_block new_bb;
4047   tree aggr_ptr_init;
4048   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4049   tree aptr;
4050   gimple_stmt_iterator incr_gsi;
4051   bool insert_after;
4052   tree indx_before_incr, indx_after_incr;
4053   gimple incr;
4054   tree step;
4055   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4056
4057   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4058               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4059
4060   if (loop_vinfo)
4061     {
4062       loop = LOOP_VINFO_LOOP (loop_vinfo);
4063       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4064       containing_loop = (gimple_bb (stmt))->loop_father;
4065       pe = loop_preheader_edge (loop);
4066     }
4067   else
4068     {
4069       gcc_assert (bb_vinfo);
4070       only_init = true;
4071       if (ptr_incr)  /* PTR_INCR is optional here, as it is below.  */
4072         *ptr_incr = NULL;
4073     }
4074   /* Check the step (evolution) of the load in LOOP, and record
4075      whether it's invariant.  */
4076   if (nested_in_vect_loop)
4077     step = STMT_VINFO_DR_STEP (stmt_info);
4078   else
4079     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4080
4081   if (integer_zerop (step))
4082     *inv_p = true;
4083   else
4084     *inv_p = false;
4085
4086   /* Create an expression for the first address accessed by this load
4087      in LOOP.  */
4088   base_name = get_name (DR_BASE_ADDRESS (dr));
4089
4090   if (dump_enabled_p ())
4091     {
4092       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4093       dump_printf_loc (MSG_NOTE, vect_location,
4094                        "create %s-pointer variable to type: ",
4095                        get_tree_code_name (TREE_CODE (aggr_type)));
4096       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4097       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4098         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4099       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4100         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4101       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4102         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4103       else
4104         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4105       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4106       dump_printf (MSG_NOTE, "\n");
4107     }
4108
4109   /* (1) Create the new aggregate-pointer variable.
4110      Vector and array types inherit the alias set of their component
4111      type by default so we need to use a ref-all pointer if the data
4112      reference does not conflict with the created aggregated data
4113      reference because it is not addressable.  */
4114   bool need_ref_all = false;
4115   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4116                               get_alias_set (DR_REF (dr))))
4117     need_ref_all = true;
4118   /* Likewise for any of the data references in the stmt group.  */
4119   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4120     {
4121       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4122       do
4123         {
4124           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4125           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4126           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4127                                       get_alias_set (DR_REF (sdr))))
4128             {
4129               need_ref_all = true;
4130               break;
4131             }
4132           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4133         }
4134       while (orig_stmt);
4135     }
4136   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4137                                                need_ref_all);
4138   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4139
4140
4141   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4142      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4143      def-use update cycles for the pointer: one relative to the outer-loop
4144      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4145      to the inner-loop (which is the inner-most loop containing the dataref),
4146      and this is done by step (4) below.
4147
4148      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4149      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4150      redundant.  Steps (2),(3) create the following:
4151
4152         vp0 = &base_addr;
4153         LOOP:   vp1 = phi(vp0,vp2)
4154                 ...
4155                 ...
4156                 vp2 = vp1 + step
4157                 goto LOOP
4158
4159      If there is an inner-loop nested in loop, then step (4) will also be
4160      applied, and an additional update in the inner-loop will be created:
4161
4162         vp0 = &base_addr;
4163         LOOP:   vp1 = phi(vp0,vp2)
4164                 ...
4165         inner:     vp3 = phi(vp1,vp4)
4166                    vp4 = vp3 + inner_step
4167                    if () goto inner
4168                 ...
4169                 vp2 = vp1 + step
4170                 if () goto LOOP   */
4171
4172   /* (2) Calculate the initial address of the aggregate-pointer, and set
4173      the aggregate-pointer to point to it before the loop.  */
4174
4175   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4176
4177   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4178                                                    offset, loop, byte_offset);
4179   if (new_stmt_list)
4180     {
4181       if (pe)
4182         {
4183           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4184           gcc_assert (!new_bb);
4185         }
4186       else
4187         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4188     }
4189
4190   *initial_address = new_temp;
4191
4192   /* Create: p = (aggr_type *) initial_base  */
4193   if (TREE_CODE (new_temp) != SSA_NAME
4194       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4195     {
4196       vec_stmt = gimple_build_assign (aggr_ptr,
4197                                       fold_convert (aggr_ptr_type, new_temp));
4198       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4199       /* Copy the points-to information if it exists. */
4200       if (DR_PTR_INFO (dr))
4201         duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4202       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4203       if (pe)
4204         {
4205           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4206           gcc_assert (!new_bb);
4207         }
4208       else
4209         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4210     }
4211   else
4212     aggr_ptr_init = new_temp;
4213
4214   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4215      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4216      inner-loop nested in LOOP (during outer-loop vectorization).  */
4217
4218   /* No update in loop is required.  */
4219   if (only_init && (!loop_vinfo || at_loop == loop))
4220     aptr = aggr_ptr_init;
4221   else
4222     {
4223       /* The step of the aggregate pointer is the type size.  */
4224       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4225       /* One exception to the above is when the scalar step of the load in
4226          LOOP is zero. In this case the step here is also zero.  */
4227       if (*inv_p)
4228         iv_step = size_zero_node;
4229       else if (tree_int_cst_sgn (step) == -1)
4230         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4231
4232       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4233
4234       create_iv (aggr_ptr_init,
4235                  fold_convert (aggr_ptr_type, iv_step),
4236                  aggr_ptr, loop, &incr_gsi, insert_after,
4237                  &indx_before_incr, &indx_after_incr);
4238       incr = gsi_stmt (incr_gsi);
4239       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4240
4241       /* Copy the points-to information if it exists. */
4242       if (DR_PTR_INFO (dr))
4243         {
4244           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4245           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4246         }
4247       if (ptr_incr)
4248         *ptr_incr = incr;
4249
4250       aptr = indx_before_incr;
4251     }
4252
4253   if (!nested_in_vect_loop || only_init)
4254     return aptr;
4255
4256
4257   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4258      nested in LOOP, if exists.  */
4259
4260   gcc_assert (nested_in_vect_loop);
4261   if (!only_init)
4262     {
4263       standard_iv_increment_position (containing_loop, &incr_gsi,
4264                                       &insert_after);
4265       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4266                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4267                  &indx_after_incr);
4268       incr = gsi_stmt (incr_gsi);
4269       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4270
4271       /* Copy the points-to information if it exists. */
4272       if (DR_PTR_INFO (dr))
4273         {
4274           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4275           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4276         }
4277       if (ptr_incr)
4278         *ptr_incr = incr;
4279
4280       return indx_before_incr;
4281     }
4282   else
4283     gcc_unreachable ();
4284 }
4285
4286
4287 /* Function bump_vector_ptr
4288
4289    Increment a pointer (to a vector type) by vector-size. If requested,
4290    i.e. if PTR-INCR is given, then also connect the new increment stmt
4291    to the existing def-use update-chain of the pointer, by modifying
4292    the PTR_INCR as illustrated below:
4293
4294    The pointer def-use update-chain before this function:
4295                         DATAREF_PTR = phi (p_0, p_2)
4296                         ....
4297         PTR_INCR:       p_2 = DATAREF_PTR + step
4298
4299    The pointer def-use update-chain after this function:
4300                         DATAREF_PTR = phi (p_0, p_2)
4301                         ....
4302                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4303                         ....
4304         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4305
4306    Input:
4307    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4308                  in the loop.
4309    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4310               the loop.  The increment amount across iterations is expected
4311               to be vector_size.
4312    GSI - location where the new update stmt is to be placed.
4313    STMT - the original scalar memory-access stmt that is being vectorized.
4314    BUMP - optional. The offset by which to bump the pointer. If not given,
4315           the offset is assumed to be vector_size.
4316
4317    Output: Return NEW_DATAREF_PTR as illustrated above.
4318
4319 */
4320
4321 tree
4322 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4323                  gimple stmt, tree bump)
4324 {
4325   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4326   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4327   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4328   tree update = TYPE_SIZE_UNIT (vectype);
4329   gimple_assign incr_stmt;
4330   ssa_op_iter iter;
4331   use_operand_p use_p;
4332   tree new_dataref_ptr;
4333   /* The default bump is the vector size; a non-NULL BUMP overrides it.  */
4334   if (bump)
4335     update = bump;
4336   /* Build NEW_DATAREF_PTR = DATAREF_PTR + UPDATE and emit it for STMT.  */
4337   new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4338   incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4339                                             dataref_ptr, update);
4340   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4341
4342   /* Copy the points-to information if it exists, minus its alignment.  */
4343   if (DR_PTR_INFO (dr))
4344     {
4345       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4346       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4347     }
4348   /* Without PTR_INCR there is no update chain to patch.  */
4349   if (!ptr_incr)
4350     return new_dataref_ptr;
4351
4352   /* Make PTR_INCR advance from NEW_DATAREF_PTR instead of DATAREF_PTR.  */
4353   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4354     {
4355       tree use = USE_FROM_PTR (use_p);
4356
4357       if (use == dataref_ptr)
4358         SET_USE (use_p, new_dataref_ptr);
4359       else  /* Any other SSA use must compare equal to UPDATE.  */
4360         gcc_assert (tree_int_cst_compare (use, update) == 0);
4361     }
4362
4363   return new_dataref_ptr;
4364 }
4365
4366
4367 /* Function vect_create_destination_var.
4368    Create a new variable named after SSA name SCALAR_DEST: a vector
4369    temporary of type VECTYPE, or a scalar one if VECTYPE is NULL.  */
4370
4371 tree
4372 vect_create_destination_var (tree scalar_dest, tree vectype)
4373 {
4374   /* With a VECTYPE build a vector variable, otherwise a scalar one of
4375      SCALAR_DEST's own type.  */
4376   enum vect_var_kind var_kind = vectype ? vect_simple_var : vect_scalar_var;
4377   tree var_type = vectype ? vectype : TREE_TYPE (scalar_dest);
4378   char *var_name;
4379
4380   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4381
4382   /* Name the variable after SCALAR_DEST's name (if any) and SSA version.  */
4383   unsigned int version = SSA_NAME_VERSION (scalar_dest);
4384   const char *base = get_name (scalar_dest);
4385   if (base != NULL)
4386     asprintf (&var_name, "%s_%u", base, version);
4387   else
4388     asprintf (&var_name, "_%u", version);
4389   /* The formatted name is released once the variable has been created.  */
4390   tree vec_dest = vect_get_new_vect_var (var_type, var_kind, var_name);
4391   free (var_name);
4392
4393   return vec_dest;
4394 }
4395
4396 /* Function vect_grouped_store_supported.
4397    Returns TRUE if the target supports the permutations that
4398    vect_permute_store_chain generates for a group of COUNT vectors of
4399    type VECTYPE (COUNT is 3 or a power of 2), and FALSE otherwise.  */
4400
4401 bool
4402 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4403 {
4404   enum machine_mode mode = TYPE_MODE (vectype);
4405
4406   /* vect_permute_store_chain requires the group size to be equal to 3 or
4407      be a power of two.  */
4408   if (count != 3 && exact_log2 (count) == -1)
4409     {
4410       if (dump_enabled_p ())
4411         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4412                          "the size of the group of accesses"
4413                          " is not a power of 2 or not equal to 3\n");
4414       return false;
4415     }
4416
4417   /* Check that the permutation is supported.  */
4418   if (VECTOR_MODE_P (mode))
4419     {
4420       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4421       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4422
4423       if (count == 3)
4424         {
4425           unsigned int j0 = 0, j1 = 0, j2 = 0;
4426           unsigned int j;
4427           /* Build the same selectors as vect_permute_store_chain does.  */
4428           for (j = 0; j < 3; j++)
4429             {
4430               int nelt0 = ((3 - j) * nelt) % 3;
4431               int nelt1 = ((3 - j) * nelt + 1) % 3;
4432               int nelt2 = ((3 - j) * nelt + 2) % 3;
4433               for (i = 0; i < nelt; i++)
4434                 {
4435                   if (3 * i + nelt0 < nelt)
4436                     sel[3 * i + nelt0] = j0++;
4437                   if (3 * i + nelt1 < nelt)
4438                     sel[3 * i + nelt1] = nelt + j1++;
4439                   if (3 * i + nelt2 < nelt)
4440                     sel[3 * i + nelt2] = 0;
4441                 }
4442               if (!can_vec_perm_p (mode, false, sel))
4443                 {
4444                   if (dump_enabled_p ())
4445                     dump_printf (MSG_MISSED_OPTIMIZATION,
4446                                  "permutation op not supported by target.\n");
4447                   return false;
4448                 }
4449
4450               for (i = 0; i < nelt; i++)
4451                 {
4452                   if (3 * i + nelt0 < nelt)
4453                     sel[3 * i + nelt0] = 3 * i + nelt0;
4454                   if (3 * i + nelt1 < nelt)
4455                     sel[3 * i + nelt1] = 3 * i + nelt1;
4456                   if (3 * i + nelt2 < nelt)
4457                     sel[3 * i + nelt2] = nelt + j2++;
4458                 }
4459               if (!can_vec_perm_p (mode, false, sel))
4460                 {
4461                   if (dump_enabled_p ())
4462                     dump_printf (MSG_MISSED_OPTIMIZATION,
4463                                  "permutation op not supported by target.\n");
4464                   return false;
4465                 }
4466             }
4467           return true;
4468         }
4469       else
4470         {
4471           /* If length is not equal to 3 then only power of 2 is supported.  */
4472           gcc_assert (exact_log2 (count) != -1);
4473           /* Interleave-high selector first, then the low one (+ nelt / 2).  */
4474           for (i = 0; i < nelt / 2; i++)
4475             {
4476               sel[i * 2] = i;
4477               sel[i * 2 + 1] = i + nelt;
4478             }
4479           if (can_vec_perm_p (mode, false, sel))
4480             {
4481               for (i = 0; i < nelt; i++)
4482                 sel[i] += nelt / 2;
4483               if (can_vec_perm_p (mode, false, sel))
4484                 return true;
4485             }
4486         }
4487     }
4488
4489   if (dump_enabled_p ())
4490     dump_printf (MSG_MISSED_OPTIMIZATION,
4491                  "permutation op not supported by target.\n");
4492   return false;
4493 }
4494
4495
4496 /* Tell whether the vec_store_lanes optab is available for COUNT vectors
4497    of type VECTYPE.  */
4498
4499 bool
4500 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4501 {
4502   const char *optab_name = "vec_store_lanes";
4503   return vect_lanes_optab_supported_p (optab_name, vec_store_lanes_optab,
4504                                        vectype, count);
4505 }
4506
4507
4508 /* Function vect_permute_store_chain.
4509
4510    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4511    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4512    the data correctly for the stores.  Return the final references for stores
4513    in RESULT_CHAIN.
4514
4515    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4516    The input is 4 vectors each containing 8 elements.  We assign a number to
4517    each element, the input sequence is:
4518
4519    1st vec:   0  1  2  3  4  5  6  7
4520    2nd vec:   8  9 10 11 12 13 14 15
4521    3rd vec:  16 17 18 19 20 21 22 23
4522    4th vec:  24 25 26 27 28 29 30 31
4523
4524    The output sequence should be:
4525
4526    1st vec:  0  8 16 24  1  9 17 25
4527    2nd vec:  2 10 18 26  3 11 19 27
4528    3rd vec:  4 12 20 28  5 13 21 29
4529    4th vec:  6 14 22 30  7 15 23 31
4530
4531    i.e., we interleave the contents of the four vectors in their order.
4532
4533    We use interleave_high/low instructions to create such output.  The input of
4534    each interleave_high/low operation is two vectors:
4535    1st vec    2nd vec
4536    0 1 2 3    4 5 6 7
4537    the even elements of the result vector are obtained left-to-right from the
4538    high/low elements of the first vector.  The odd elements of the result are
4539    obtained left-to-right from the high/low elements of the second vector.
4540    The output of interleave_high will be:   0 4 1 5
4541    and of interleave_low:                   2 6 3 7
4542
4543
4544    The permutation is done in log LENGTH stages.  In each stage interleave_high
4545    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4546    where the first argument is taken from the first half of DR_CHAIN and the
4547    second argument from its second half.
4548    In our example,
4549
4550    I1: interleave_high (1st vec, 3rd vec)
4551    I2: interleave_low (1st vec, 3rd vec)
4552    I3: interleave_high (2nd vec, 4th vec)
4553    I4: interleave_low (2nd vec, 4th vec)
4554
4555    The output for the first stage is:
4556
4557    I1:  0 16  1 17  2 18  3 19
4558    I2:  4 20  5 21  6 22  7 23
4559    I3:  8 24  9 25 10 26 11 27
4560    I4: 12 28 13 29 14 30 15 31
4561
4562    The output of the second stage, i.e. the final result is:
4563
4564    I1:  0  8 16 24  1  9 17 25
4565    I2:  2 10 18 26  3 11 19 27
4566    I3:  4 12 20 28  5 13 21 29
4567    I4:  6 14 22 30  7 15 23 31.  */
4568
4569 void
4570 vect_permute_store_chain (vec<tree> dr_chain,
4571                           unsigned int length,
4572                           gimple stmt,
4573                           gimple_stmt_iterator *gsi,
4574                           vec<tree> *result_chain)
4575 {
4576   tree vect1, vect2, high, low;
4577   gimple perm_stmt;
4578   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4579   tree perm_mask_low, perm_mask_high;
4580   tree data_ref;
4581   tree perm3_mask_low, perm3_mask_high;
4582   unsigned int i, n, log_length = exact_log2 (length);
4583   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4584   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4585   /* Start RESULT_CHAIN as a copy of the first LENGTH entries of DR_CHAIN.  */
4586   result_chain->quick_grow (length);
4587   memcpy (result_chain->address (), dr_chain.address (),
4588           length * sizeof (tree));
4589
4590   if (length == 3)
4591     {
4592       unsigned int j0 = 0, j1 = 0, j2 = 0;
4593       /* J0, J1 and J2 keep counting across the three iterations below.  */
4594       for (j = 0; j < 3; j++)
4595         {
4596           int nelt0 = ((3 - j) * nelt) % 3;
4597           int nelt1 = ((3 - j) * nelt + 1) % 3;
4598           int nelt2 = ((3 - j) * nelt + 2) % 3;
4599
4600           for (i = 0; i < nelt; i++)
4601             {
4602               if (3 * i + nelt0 < nelt)
4603                 sel[3 * i + nelt0] = j0++;
4604               if (3 * i + nelt1 < nelt)
4605                 sel[3 * i + nelt1] = nelt + j1++;
4606               if (3 * i + nelt2 < nelt)
4607                 sel[3 * i + nelt2] = 0;
4608             }
4609           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
4610           gcc_assert (perm3_mask_low != NULL);
4611
4612           for (i = 0; i < nelt; i++)
4613             {
4614               if (3 * i + nelt0 < nelt)
4615                 sel[3 * i + nelt0] = 3 * i + nelt0;
4616               if (3 * i + nelt1 < nelt)
4617                 sel[3 * i + nelt1] = 3 * i + nelt1;
4618               if (3 * i + nelt2 < nelt)
4619                 sel[3 * i + nelt2] = nelt + j2++;
4620             }
4621           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
4622           gcc_assert (perm3_mask_high != NULL);
4623
4624           vect1 = dr_chain[0];
4625           vect2 = dr_chain[1];
4626
4627           /* Create interleaving stmt:
4628              low = VEC_PERM_EXPR <vect1, vect2,
4629                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4630                                    j + 2, nelt + j + 2, *, ...}>  */
4631           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4632           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4633                                                     vect1, vect2,
4634                                                     perm3_mask_low);
4635           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4636
4637           vect1 = data_ref;
4638           vect2 = dr_chain[2];
4639           /* Create interleaving stmt:
4640              high = VEC_PERM_EXPR <vect1, vect2,
4641                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4642                                    6, 7, nelt + j + 2, ...}>  */
4643           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4644           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4645                                                     vect1, vect2,
4646                                                     perm3_mask_high);
4647           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4648           (*result_chain)[j] = data_ref;
4649         }
4650     }
4651   else
4652     {
4653       /* If length is not equal to 3 then only power of 2 is supported.  */
4654       gcc_assert (exact_log2 (length) != -1);
4655       /* Selector for interleave_high: {0, nelt, 1, nelt + 1, ...}.  */
4656       for (i = 0, n = nelt / 2; i < n; i++)
4657         {
4658           sel[i * 2] = i;
4659           sel[i * 2 + 1] = i + nelt;
4660         }
4661         perm_mask_high = vect_gen_perm_mask (vectype, sel);
4662         gcc_assert (perm_mask_high != NULL);
4663         /* Offset every index by nelt / 2 for the interleave_low selector.  */
4664         for (i = 0; i < nelt; i++)
4665           sel[i] += nelt / 2;
4666         perm_mask_low = vect_gen_perm_mask (vectype, sel);
4667         gcc_assert (perm_mask_low != NULL);
4668         /* One pass per stage: read DR_CHAIN, write RESULT_CHAIN.  */
4669         for (i = 0, n = log_length; i < n; i++)
4670           {
4671             for (j = 0; j < length/2; j++)
4672               {
4673                 vect1 = dr_chain[j];
4674                 vect2 = dr_chain[j+length/2];
4675
4676                 /* Create interleaving stmt:
4677                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4678                                                         ...}>  */
4679                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4680                 perm_stmt
4681                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4682                                                   vect1, vect2, perm_mask_high);
4683                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4684                 (*result_chain)[2*j] = high;
4685
4686                 /* Create interleaving stmt:
4687                    low = VEC_PERM_EXPR <vect1, vect2,
4688                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4689                                          ...}>  */
4690                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4691                 perm_stmt
4692                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4693                                                   vect1, vect2, perm_mask_low);
4694                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4695                 (*result_chain)[2*j+1] = low;
4696               }
4697             memcpy (dr_chain.address (), result_chain->address (),
4698                     length * sizeof (tree));  /* Input of the next stage.  */
4699           }
4700     }
4701 }
4702
4703 /* Function vect_setup_realignment
4704
4705    This function is called when vectorizing an unaligned load using
4706    the dr_explicit_realign[_optimized] scheme.
4707    This function generates the following code at the loop prolog:
4708
4709       p = initial_addr;
4710    x  msq_init = *(floor(p));   # prolog load
4711       realignment_token = call target_builtin;
4712     loop:
4713    x  msq = phi (msq_init, ---)
4714
4715    The stmts marked with x are generated only for the case of
4716    dr_explicit_realign_optimized.
4717
4718    The code above sets up a new (vector) pointer, pointing to the first
4719    location accessed by STMT, and a "floor-aligned" load using that pointer.
4720    It also generates code to compute the "realignment-token" (if the relevant
4721    target hook was defined), and creates a phi-node at the loop-header bb
4722    whose arguments are the result of the prolog-load (created by this
4723    function) and the result of a load that takes place in the loop (to be
4724    created by the caller to this function).
4725
4726    For the case of dr_explicit_realign_optimized:
4727    The caller to this function uses the phi-result (msq) to create the
4728    realignment code inside the loop, and sets up the missing phi argument,
4729    as follows:
4730     loop:
4731       msq = phi (msq_init, lsq)
4732       lsq = *(floor(p'));        # load in loop
4733       result = realign_load (msq, lsq, realignment_token);
4734
4735    For the case of dr_explicit_realign:
4736     loop:
4737       msq = *(floor(p));        # load in loop
4738       p' = p + (VS-1);
4739       lsq = *(floor(p'));       # load in loop
4740       result = realign_load (msq, lsq, realignment_token);
4741
4742    Input:
4743    STMT - (scalar) load stmt to be vectorized. This load accesses
4744           a memory location that may be unaligned.
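4745    GSI - place where new code is to be inserted.
4746    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4747                               is used.
        INIT_ADDR - if not NULL_TREE, the initial address that was already
                    computed by the caller inside the loop; if NULL_TREE,
                    the address is computed by this function.
4748
4749    Output:
4750    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4751                        target hook, if defined.
        AT_LOOP - if not NULL, set to the loop at whose preheader the initial
                  vector load is generated.
4752    Return value - the result of the loop-header phi node, or NULL_TREE for
                       dr_explicit_realign (no phi node is created then).  */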
4753
4754 tree
4755 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4756                         tree *realignment_token,
4757                         enum dr_alignment_support alignment_support_scheme,
4758                         tree init_addr,
4759                         struct loop **at_loop)
4760 {
4761   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4762   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4763   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4764   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4765   struct loop *loop = NULL;
4766   edge pe = NULL;
4767   tree scalar_dest = gimple_assign_lhs (stmt);
4768   tree vec_dest;
4769   gimple inc;
4770   tree ptr;
4771   tree data_ref;
4772   basic_block new_bb;
4773   tree msq_init = NULL_TREE;
4774   tree new_temp;
4775   gimple_phi phi_stmt;
4776   tree msq = NULL_TREE;
4777   gimple_seq stmts = NULL;
4778   bool inv_p;
4779   bool compute_in_loop = false;
4780   bool nested_in_vect_loop = false;
       /* CONTAINING_LOOP is the loop that directly holds STMT; the phi node of
          step 5 is created in its header.  */
4781   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4782   struct loop *loop_for_initial_load = NULL;
4783
       /* LOOP stays NULL when STMT has no loop_vec_info.  */
4784   if (loop_vinfo)
4785     {
4786       loop = LOOP_VINFO_LOOP (loop_vinfo);
4787       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4788     }
4789
4790   gcc_assert (alignment_support_scheme == dr_explicit_realign
4791               || alignment_support_scheme == dr_explicit_realign_optimized);
4792
4793   /* We need to generate three things:
4794      1. the misalignment computation
4795      2. the extra vector load (for the optimized realignment scheme).
4796      3. the phi node for the two vectors from which the realignment is
4797       done (for the optimized realignment scheme).  */
4798
4799   /* 1. Determine where to generate the misalignment computation.
4800
4801      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4802      calculation will be generated by this function, outside the loop (in the
4803      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4804      caller, inside the loop.
4805
4806      Background: If the misalignment remains fixed throughout the iterations of
4807      the loop, then both realignment schemes are applicable, and also the
4808      misalignment computation can be done outside LOOP.  This is because we are
4809      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4810      are a multiple of VS (the Vector Size), and therefore the misalignment in
4811      different vectorized LOOP iterations is always the same.
4812      The problem arises only if the memory access is in an inner-loop nested
4813      inside LOOP, which is now being vectorized using outer-loop vectorization.
4814      This is the only case when the misalignment of the memory access may not
4815      remain fixed throughout the iterations of the inner-loop (as explained in
4816      detail in vect_supportable_dr_alignment).  In this case, not only is the
4817      optimized realignment scheme not applicable, but also the misalignment
4818      computation (and generation of the realignment token that is passed to
4819      REALIGN_LOAD) have to be done inside the loop.
4820
4821      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4822      or not, which in turn determines if the misalignment is computed inside
4823      the inner-loop, or outside LOOP.  */
4824
       /* A missing LOOP_VINFO is treated like a caller-supplied INIT_ADDR:
          the token computation is emitted at GSI, and only the non-optimized
          scheme is allowed.  */
4825   if (init_addr != NULL_TREE || !loop_vinfo)
4826     {
4827       compute_in_loop = true;
4828       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4829     }
4830
4831
4832   /* 2. Determine where to generate the extra vector load.
4833
4834      For the optimized realignment scheme, instead of generating two vector
4835      loads in each iteration, we generate a single extra vector load in the
4836      preheader of the loop, and in each iteration reuse the result of the
4837      vector load from the previous iteration.  In case the memory access is in
4838      an inner-loop nested inside LOOP, which is now being vectorized using
4839      outer-loop vectorization, we need to determine whether this initial vector
4840      load should be generated at the preheader of the inner-loop, or can be
4841      generated at the preheader of LOOP.  If the memory access has no evolution
4842      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4843      to be generated inside LOOP (in the preheader of the inner-loop).  */
4844
4845   if (nested_in_vect_loop)
4846     {
           /* A zero DR step means the access does not evolve in the outer
              loop.  */
4847       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4848       bool invariant_in_outerloop =
4849             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4850       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4851     }
4852   else
4853     loop_for_initial_load = loop;
       /* Report the chosen loop to the caller, if it asked for it.  */
4854   if (at_loop)
4855     *at_loop = loop_for_initial_load;
4856
       /* PE remains NULL when there is no loop for the initial load.  */
4857   if (loop_for_initial_load)
4858     pe = loop_preheader_edge (loop_for_initial_load);
4859
4860   /* 3. For the case of the optimized realignment, create the first vector
4861       load at the loop preheader.  */
4862
4863   if (alignment_support_scheme == dr_explicit_realign_optimized)
4864     {
4865       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4866       gimple_assign new_stmt;
4867
4868       gcc_assert (!compute_in_loop);
4869       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4870       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4871                                       NULL_TREE, &init_addr, NULL, &inc,
4872                                       true, &inv_p);
           /* floor (p): clear the low address bits by masking PTR with the
              negated alignment unit of VECTYPE.
              NOTE(review): PE is used unconditionally here but tested for NULL
              a few lines below; since !COMPUTE_IN_LOOP implies LOOP_VINFO is
              set, the NULL case looks unreachable in this branch -- confirm.  */
4873       new_temp = copy_ssa_name (ptr, NULL);
4874       new_stmt = gimple_build_assign_with_ops
4875                    (BIT_AND_EXPR, new_temp, ptr,
4876                     build_int_cst (TREE_TYPE (ptr),
4877                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4878       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4879       gcc_assert (!new_bb);
           /* Load the vector from the floor-aligned address.  */
4880       data_ref
4881         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4882                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4883       new_stmt = gimple_build_assign (vec_dest, data_ref);
4884       new_temp = make_ssa_name (vec_dest, new_stmt);
4885       gimple_assign_set_lhs (new_stmt, new_temp);
4886       if (pe)
4887         {
4888           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4889           gcc_assert (!new_bb);
4890         }
4891       else
4892          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4893
4894       msq_init = gimple_assign_lhs (new_stmt);
4895     }
4896
4897   /* 4. Create realignment token using a target builtin, if available.
4898       It is done either inside the containing loop, or before LOOP (as
4899       determined above).  */
4900
4901   if (targetm.vectorize.builtin_mask_for_load)
4902     {
4903       gimple_call new_stmt;
4904       tree builtin_decl;
4905
4906       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4907       if (!init_addr)
4908         {
4909           /* Generate the INIT_ADDR computation outside LOOP.  */
4910           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4911                                                         NULL_TREE, loop);
               /* Without a LOOP the address computation goes right before
                  GSI instead of on a preheader edge.  */
4912           if (loop)
4913             {
4914               pe = loop_preheader_edge (loop);
4915               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4916               gcc_assert (!new_bb);
4917             }
4918           else
4919              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4920         }
4921
           /* Build "token = builtin_mask_for_load (INIT_ADDR)"; the result
              variable takes the return type of the builtin call.  */
4922       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4923       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4924       vec_dest =
4925         vect_create_destination_var (scalar_dest,
4926                                      gimple_call_return_type (new_stmt));
4927       new_temp = make_ssa_name (vec_dest, new_stmt);
4928       gimple_call_set_lhs (new_stmt, new_temp);
4929
4930       if (compute_in_loop)
4931         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4932       else
4933         {
4934           /* Generate the misalignment computation outside LOOP.  */
4935           pe = loop_preheader_edge (loop);
4936           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4937           gcc_assert (!new_bb);
4938         }
4939
4940       *realignment_token = gimple_call_lhs (new_stmt);
4941
4942       /* The result of the CALL_EXPR to this builtin is determined from
4943          the value of the parameter and no global variables are touched
4944          which makes the builtin a "const" function.  Requiring the
4945          builtin to have the "const" attribute makes it unnecessary
4946          to call mark_call_clobbered.  */
4947       gcc_assert (TREE_READONLY (builtin_decl));
4948     }
4949
       /* The non-optimized scheme needs no phi node; MSQ is still NULL_TREE
          at this point.  */
4950   if (alignment_support_scheme == dr_explicit_realign)
4951     return msq;
4952
4953   gcc_assert (!compute_in_loop);
4954   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4955
4956
4957   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4958
       /* Only the preheader argument (MSQ_INIT) is added here; the other phi
          argument is left for the caller to set up.  */
4959   pe = loop_preheader_edge (containing_loop);
4960   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4961   msq = make_ssa_name (vec_dest, NULL);
4962   phi_stmt = create_phi_node (msq, containing_loop->header);
4963   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4964
4965   return msq;
4966 }
4967
4968
4969 /* Function vect_grouped_load_supported.
4970
4971    Returns TRUE if the permutations used to de-interleave a group of COUNT
4972    loads of vectors of type VECTYPE are supported, and FALSE otherwise.
        For COUNT == 3 these are the two shuffle masks built for each of the
        three output vectors; for a power-of-two COUNT they are the
        extract-even and extract-odd masks.  Any other COUNT, or a VECTYPE
        whose mode is not a vector mode, yields FALSE.  */
4973
4974 bool
4975 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4976 {
4977   enum machine_mode mode = TYPE_MODE (vectype);
4978
4979   /* vect_permute_load_chain requires the group size to be equal to 3 or
4980      be a power of two.  */
4981   if (count != 3 && exact_log2 (count) == -1)
4982     {
4983       if (dump_enabled_p ())
4984         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4985                          "the size of the group of accesses"
4986                          " is not a power of 2 or not equal to 3\n");
4987       return false;
4988     }
4989
4990   /* Check that the permutation is supported.  */
4991   if (VECTOR_MODE_P (mode))
4992     {
4993       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4994       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4995
4996       if (count == 3)
4997         {
               /* The masks below are built exactly as in
                  vect_permute_load_chain, once per output vector K.  */
4998           unsigned int k;
4999           for (k = 0; k < 3; k++)
5000             {
                   /* First shuffle: every third element starting at K, as
                      long as the index stays below 2 * NELT; the remaining
                      slots get a placeholder 0.  */
5001               for (i = 0; i < nelt; i++)
5002                 if (3 * i + k < 2 * nelt)
5003                   sel[i] = 3 * i + k;
5004                 else
5005                   sel[i] = 0;
5006               if (!can_vec_perm_p (mode, false, sel))
5007                 {
5008                   if (dump_enabled_p ())
5009                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010                                      "shuffle of 3 loads is not supported by"
5011                                      " target\n");
5012                     return false;
5013                 }
                   /* Second shuffle: keep the slots filled by the first
                      shuffle and take the remaining ones from the second
                      operand (indices >= NELT) with stride 3.  */
5014               for (i = 0, j = 0; i < nelt; i++)
5015                 if (3 * i + k < 2 * nelt)
5016                   sel[i] = i;
5017                 else
5018                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5019               if (!can_vec_perm_p (mode, false, sel))
5020                 {
5021                   if (dump_enabled_p ())
5022                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5023                                      "shuffle of 3 loads is not supported by"
5024                                      " target\n");
5025                   return false;
5026                 }
5027             }
5028           return true;
5029         }
5030       else
5031         {
5032           /* If length is not equal to 3 then only power of 2 is supported.  */
5033           gcc_assert (exact_log2 (count) != -1);
               /* Extract-even mask {0, 2, 4, ...}.  */
5034           for (i = 0; i < nelt; i++)
5035             sel[i] = i * 2;
5036           if (can_vec_perm_p (mode, false, sel))
5037             {
                   /* Extract-odd mask {1, 3, 5, ...}.  */
5038               for (i = 0; i < nelt; i++)
5039                 sel[i] = i * 2 + 1;
5040               if (can_vec_perm_p (mode, false, sel))
5041                 return true;
5042             }
5043         }
5044     }
5045
       /* Reached when MODE is not a vector mode or when one of the even/odd
          masks is rejected.  */
5046   if (dump_enabled_p ())
5047     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5048                      "extract even/odd not supported by target\n");
5049   return false;
5050 }
5051
5052 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5053    type VECTYPE.  This is a thin wrapper that forwards the query to
        vect_lanes_optab_supported_p for vec_load_lanes_optab.  */
5054
5055 bool
5056 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5057 {
       /* The string names the optab being queried (presumably used for
          diagnostics by the helper -- confirm against its definition).  */
5058   return vect_lanes_optab_supported_p ("vec_load_lanes",
5059                                        vec_load_lanes_optab,
5060                                        vectype, count);
5061 }
5062
5063 /* Function vect_permute_load_chain.
5064
5065    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5066    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5067    the input data correctly.  Return the final references for loads in
5068    RESULT_CHAIN.
5069
5070    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5071    The input is 4 vectors each containing 8 elements. We assign a number to each
5072    element, the input sequence is:
5073
5074    1st vec:   0  1  2  3  4  5  6  7
5075    2nd vec:   8  9 10 11 12 13 14 15
5076    3rd vec:  16 17 18 19 20 21 22 23
5077    4th vec:  24 25 26 27 28 29 30 31
5078
5079    The output sequence should be:
5080
5081    1st vec:  0 4  8 12 16 20 24 28
5082    2nd vec:  1 5  9 13 17 21 25 29
5083    3rd vec:  2 6 10 14 18 22 26 30
5084    4th vec:  3 7 11 15 19 23 27 31
5085
5086    i.e., the first output vector should contain the first elements of each
5087    interleaving group, etc.
5088
5089    We use extract_even/odd instructions to create such output.  The input of
5090    each extract_even/odd operation is two vectors
5091    1st vec    2nd vec
5092    0 1 2 3    4 5 6 7
5093
5094    and the output is the vector of extracted even/odd elements.  The output of
5095    extract_even will be:   0 2 4 6
5096    and of extract_odd:     1 3 5 7
5097
5098
5099    The permutation is done in log LENGTH stages.  In each stage extract_even
5100    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5101    their order.  In our example,
5102
5103    E1: extract_even (1st vec, 2nd vec)
5104    E2: extract_odd (1st vec, 2nd vec)
5105    E3: extract_even (3rd vec, 4th vec)
5106    E4: extract_odd (3rd vec, 4th vec)
5107
5108    The output for the first stage will be:
5109
5110    E1:  0  2  4  6  8 10 12 14
5111    E2:  1  3  5  7  9 11 13 15
5112    E3: 16 18 20 22 24 26 28 30
5113    E4: 17 19 21 23 25 27 29 31
5114
5115    In order to proceed and create the correct sequence for the next stage (or
5116    for the correct output, if the second stage is the last one, as in our
5117    example), we first put the output of extract_even operation and then the
5118    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5119    The input for the second stage is:
5120
5121    1st vec (E1):  0  2  4  6  8 10 12 14
5122    2nd vec (E3): 16 18 20 22 24 26 28 30
5123    3rd vec (E2):  1  3  5  7  9 11 13 15
5124    4th vec (E4): 17 19 21 23 25 27 29 31
5125
5126    The output of the second stage:
5127
5128    E1: 0 4  8 12 16 20 24 28
5129    E2: 2 6 10 14 18 22 26 30
5130    E3: 1 5  9 13 17 21 25 29
5131    E4: 3 7 11 15 19 23 27 31
5132
5133    And RESULT_CHAIN after reordering:
5134
5135    1st vec (E1):  0 4  8 12 16 20 24 28
5136    2nd vec (E3):  1 5  9 13 17 21 25 29
5137    3rd vec (E2):  2 6 10 14 18 22 26 30
5138    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5139
5140 static void
5141 vect_permute_load_chain (vec<tree> dr_chain,
5142                          unsigned int length,
5143                          gimple stmt,
5144                          gimple_stmt_iterator *gsi,
5145                          vec<tree> *result_chain)
5146 {
5147   tree data_ref, first_vect, second_vect;
5148   tree perm_mask_even, perm_mask_odd;
5149   tree perm3_mask_low, perm3_mask_high;
5150   gimple perm_stmt;
5151   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
       /* LOG_LENGTH is only meaningful (and only read) in the power-of-two
          branch below; for LENGTH == 3 exact_log2 does not yield a valid
          logarithm.  */
5152   unsigned int i, j, log_length = exact_log2 (length);
5153   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5154   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5155
       /* RESULT_CHAIN starts out as a copy of the first LENGTH entries of
          DR_CHAIN.  (quick_grow presumably relies on the caller having
          reserved the space -- confirm.)  */
5156   result_chain->quick_grow (length);
5157   memcpy (result_chain->address (), dr_chain.address (),
5158           length * sizeof (tree));
5159
5160   if (length == 3)
5161     {
5162       unsigned int k;
5163
           /* Output vector K is produced by two chained shuffles: the first
              combines dr_chain[0] and dr_chain[1], the second combines that
              result with dr_chain[2].  */
5164       for (k = 0; k < 3; k++)
5165         {
               /* First mask: every third element starting at K, while the
                  index stays below 2 * NELT; the remaining slots get a
                  placeholder 0.  */
5166           for (i = 0; i < nelt; i++)
5167             if (3 * i + k < 2 * nelt)
5168               sel[i] = 3 * i + k;
5169             else
5170               sel[i] = 0;
5171           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5172           gcc_assert (perm3_mask_low != NULL);
5173
               /* Second mask: keep the slots filled by the first shuffle and
                  take the remaining ones from the second operand (indices
                  >= NELT) with stride 3.  */
5174           for (i = 0, j = 0; i < nelt; i++)
5175             if (3 * i + k < 2 * nelt)
5176               sel[i] = i;
5177             else
5178               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5179
5180           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5181           gcc_assert (perm3_mask_high != NULL);
5182
5183           first_vect = dr_chain[0];
5184           second_vect = dr_chain[1];
5185
5186           /* Create interleaving stmt (low part of):
5187              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5188                                                             ...}>  */
5189           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5190           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5191                                                     first_vect, second_vect,
5192                                                     perm3_mask_low);
5193           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5194
5195           /* Create interleaving stmt (high part of):
5196              high = VEC_PERM_EXPR <low, dr_chain[2], {0, 1, ...,
5197                                    nelt + (nelt + k) % 3, ...}>  */
5198           first_vect = data_ref;
5199           second_vect = dr_chain[2];
5200           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5201           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5202                                                     first_vect, second_vect,
5203                                                     perm3_mask_high);
5204           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5205           (*result_chain)[k] = data_ref;
5206         }
5207     }
5208   else
5209     {
5210       /* If length is not equal to 3 then only power of 2 is supported.  */
5211       gcc_assert (exact_log2 (length) != -1);
5212
           /* Extract-even mask {0, 2, 4, ...}.  */
5213       for (i = 0; i < nelt; ++i)
5214         sel[i] = i * 2;
5215       perm_mask_even = vect_gen_perm_mask (vectype, sel);
5216       gcc_assert (perm_mask_even != NULL);
5217
           /* Extract-odd mask {1, 3, 5, ...}.  */
5218       for (i = 0; i < nelt; ++i)
5219         sel[i] = i * 2 + 1;
5220       perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5221       gcc_assert (perm_mask_odd != NULL);
5222
           /* One stage per bit of LENGTH; each stage pairs up adjacent
              vectors of DR_CHAIN.  */
5223       for (i = 0; i < log_length; i++)
5224         {
5225           for (j = 0; j < length; j += 2)
5226             {
5227               first_vect = dr_chain[j];
5228               second_vect = dr_chain[j+1];
5229
5230               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5231               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5232               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5233                                                         first_vect, second_vect,
5234                                                         perm_mask_even);
5235               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
                   /* Even results fill the first half of RESULT_CHAIN.  */
5236               (*result_chain)[j/2] = data_ref;
5237
5238               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5239               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5240               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5241                                                         first_vect, second_vect,
5242                                                         perm_mask_odd);
5243               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
                   /* Odd results fill the second half of RESULT_CHAIN.  */
5244               (*result_chain)[j/2+length/2] = data_ref;
5245             }
               /* The output of this stage becomes the input of the next one;
                  this overwrites the elements of DR_CHAIN.  */
5246           memcpy (dr_chain.address (), result_chain->address (),
5247                   length * sizeof (tree));
5248         }
5249     }
5250 }
5251
5252 /* Function vect_shift_permute_load_chain.
5253
5254    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5255    sequence of stmts to reorder the input data accordingly.
5256    Return the final references for loads in RESULT_CHAIN.
5257    Return true if successful, false otherwise.
5258
5259    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5260    The input is 3 vectors each containing 8 elements.  We assign a
5261    number to each element, the input sequence is:
5262
5263    1st vec:   0  1  2  3  4  5  6  7
5264    2nd vec:   8  9 10 11 12 13 14 15
5265    3rd vec:  16 17 18 19 20 21 22 23
5266
5267    The output sequence should be:
5268
5269    1st vec:  0 3 6  9 12 15 18 21
5270    2nd vec:  1 4 7 10 13 16 19 22
5271    3rd vec:  2 5 8 11 14 17 20 23
5272
5273    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5274
5275    First we shuffle all 3 vectors to get correct elements order:
5276
5277    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5278    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5279    3rd vec:  (16 19 22) (17 20 23) (18 21)
5280
5281    Next we unite and shift vector 3 times:
5282
5283    1st step:
5284      shift right by 6 the concatenation of:
5285      "1st vec" and  "2nd vec"
5286        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5287      "2nd vec" and  "3rd vec"
5288        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5289      "3rd vec" and  "1st vec"
5290        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5291                              | New vectors                   |
5292
5293      So that now new vectors are:
5294
5295      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5296      2nd vec:  (10 13) (16 19 22) (17 20 23)
5297      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5298
5299    2nd step:
5300      shift right by 5 the concatenation of:
5301      "1st vec" and  "3rd vec"
5302        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5303      "2nd vec" and  "1st vec"
5304        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5305      "3rd vec" and  "2nd vec"
5306        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5307                           | New vectors                   |
5308
5309      So that now new vectors are:
5310
5311      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5312      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5313      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5314
5315    3rd step:
5316      shift right by 5 the concatenation of:
5317      "1st vec" and  "1st vec"
5318        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5319      shift right by 3 the concatenation of:
5320      "2nd vec" and  "2nd vec"
5321                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5322                           | New vectors                   |
5323
5324      So that now all vectors are READY:
5325      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5326      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5327      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5328
5329    This algorithm is faster than the one in vect_permute_load_chain if:
5330      1.  "shift of a concatenation" is faster than general permutation.
5331          This is usually so.
5332      2.  The TARGET machine can't execute vector instructions in parallel.
5333          This is because each step of the algorithm depends on the previous
              one.
5334          The algorithm in vect_permute_load_chain is much more parallel.
5335
5336    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5337 */
5338
5339 static bool
5340 vect_shift_permute_load_chain (vec<tree> dr_chain,
5341                                unsigned int length,
5342                                gimple stmt,
5343                                gimple_stmt_iterator *gsi,
5344                                vec<tree> *result_chain)
5345 {
5346   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5347   tree perm2_mask1, perm2_mask2, perm3_mask;
5348   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5349   gimple perm_stmt;
5350
5351   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5352   unsigned int i;
5353   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5354   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5355   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5356   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5357
5358   result_chain->quick_grow (length);
5359   memcpy (result_chain->address (), dr_chain.address (),
5360           length * sizeof (tree));
5361
5362   if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5363     {
5364       for (i = 0; i < nelt / 2; ++i)
5365         sel[i] = i * 2;
5366       for (i = 0; i < nelt / 2; ++i)
5367         sel[nelt / 2 + i] = i * 2 + 1;
5368       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5369         {
5370           if (dump_enabled_p ())
5371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5372                              "shuffle of 2 fields structure is not \
5373                               supported by target\n");
5374           return false;
5375         }
5376       perm2_mask1 = vect_gen_perm_mask (vectype, sel);
5377       gcc_assert (perm2_mask1 != NULL);
5378
5379       for (i = 0; i < nelt / 2; ++i)
5380         sel[i] = i * 2 + 1;
5381       for (i = 0; i < nelt / 2; ++i)
5382         sel[nelt / 2 + i] = i * 2;
5383       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5384         {
5385           if (dump_enabled_p ())
5386             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5387                              "shuffle of 2 fields structure is not \
5388                               supported by target\n");
5389           return false;
5390         }
5391       perm2_mask2 = vect_gen_perm_mask (vectype, sel);
5392       gcc_assert (perm2_mask2 != NULL);
5393
5394       /* Generating permutation constant to shift all elements.
5395          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5396       for (i = 0; i < nelt; i++)
5397         sel[i] = nelt / 2 + i;
5398       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5399         {
5400           if (dump_enabled_p ())
5401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5402                              "shift permutation is not supported by target\n");
5403           return false;
5404         }
5405       shift1_mask = vect_gen_perm_mask (vectype, sel);
5406       gcc_assert (shift1_mask != NULL);
5407
5408       /* Generating permutation constant to select vector from 2.
5409          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5410       for (i = 0; i < nelt / 2; i++)
5411         sel[i] = i;
5412       for (i = nelt / 2; i < nelt; i++)
5413         sel[i] = nelt + i;
5414       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5415         {
5416           if (dump_enabled_p ())
5417             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5418                              "select is not supported by target\n");
5419           return false;
5420         }
5421       select_mask = vect_gen_perm_mask (vectype, sel);
5422       gcc_assert (select_mask != NULL);
5423
5424       first_vect = dr_chain[0];
5425       second_vect = dr_chain[1];
5426
5427       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5428       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5429                                                 first_vect, first_vect,
5430                                                 perm2_mask1);
5431       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5432       vect[0] = data_ref;
5433
5434       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5435       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5436                                                 second_vect, second_vect,
5437                                                 perm2_mask2);
5438       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5439       vect[1] = data_ref;
5440
5441       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5442       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5443                                                 vect[0], vect[1],
5444                                                 shift1_mask);
5445       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5446       (*result_chain)[1] = data_ref;
5447
5448       data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5449       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5450                                                 vect[0], vect[1],
5451                                                 select_mask);
5452       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5453       (*result_chain)[0] = data_ref;
5454
5455       return true;
5456     }
5457   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5458     {
5459       unsigned int k = 0, l = 0;
5460
5461       /* Generating permutation constant to get all elements in rigth order.
5462          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5463       for (i = 0; i < nelt; i++)
5464         {
5465           if (3 * k + (l % 3) >= nelt)
5466             {
5467               k = 0;
5468               l += (3 - (nelt % 3));
5469             }
5470           sel[i] = 3 * k + (l % 3);
5471           k++;
5472         }
5473       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5474         {
5475           if (dump_enabled_p ())
5476             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5477                              "shuffle of 3 fields structure is not \
5478                               supported by target\n");
5479           return false;
5480         }
5481       perm3_mask = vect_gen_perm_mask (vectype, sel);
5482       gcc_assert (perm3_mask != NULL);
5483
5484       /* Generating permutation constant to shift all elements.
5485          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5486       for (i = 0; i < nelt; i++)
5487         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5488       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5489         {
5490           if (dump_enabled_p ())
5491             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5492                              "shift permutation is not supported by target\n");
5493           return false;
5494         }
5495       shift1_mask = vect_gen_perm_mask (vectype, sel);
5496       gcc_assert (shift1_mask != NULL);
5497
5498       /* Generating permutation constant to shift all elements.
5499          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5500       for (i = 0; i < nelt; i++)
5501         sel[i] = 2 * (nelt / 3) + 1 + i;
5502       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5503         {
5504           if (dump_enabled_p ())
5505             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5506                              "shift permutation is not supported by target\n");
5507           return false;
5508         }
5509       shift2_mask = vect_gen_perm_mask (vectype, sel);
5510       gcc_assert (shift2_mask != NULL);
5511
5512       /* Generating permutation constant to shift all elements.
5513          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5514       for (i = 0; i < nelt; i++)
5515         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5516       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5517         {
5518           if (dump_enabled_p ())
5519             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5520                              "shift permutation is not supported by target\n");
5521           return false;
5522         }
5523       shift3_mask = vect_gen_perm_mask (vectype, sel);
5524       gcc_assert (shift3_mask != NULL);
5525
5526       /* Generating permutation constant to shift all elements.
5527          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5528       for (i = 0; i < nelt; i++)
5529         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5530       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5531         {
5532           if (dump_enabled_p ())
5533             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5534                              "shift permutation is not supported by target\n");
5535           return false;
5536         }
5537       shift4_mask = vect_gen_perm_mask (vectype, sel);
5538       gcc_assert (shift4_mask != NULL);
5539
5540       for (k = 0; k < 3; k++)
5541         {
5542           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5543           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5544                                                     dr_chain[k], dr_chain[k],
5545                                                     perm3_mask);
5546           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5547           vect[k] = data_ref;
5548         }
5549
5550       for (k = 0; k < 3; k++)
5551         {
5552           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5553           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5554                                                     vect[k % 3],
5555                                                     vect[(k + 1) % 3],
5556                                                     shift1_mask);
5557           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5558           vect_shift[k] = data_ref;
5559         }
5560
5561       for (k = 0; k < 3; k++)
5562         {
5563           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5564           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5565                                                     vect_shift[(4 - k) % 3],
5566                                                     vect_shift[(3 - k) % 3],
5567                                                     shift2_mask);
5568           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5569           vect[k] = data_ref;
5570         }
5571
5572       (*result_chain)[3 - (nelt % 3)] = vect[2];
5573
5574       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5575       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5576                                                 vect[0], vect[0],
5577                                                 shift3_mask);
5578       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579       (*result_chain)[nelt % 3] = data_ref;
5580
5581       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5582       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5583                                                 vect[1], vect[1],
5584                                                 shift4_mask);
5585       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5586       (*result_chain)[0] = data_ref;
5587       return true;
5588     }
5589   return false;
5590 }
5591
5592 /* Function vect_transform_grouped_load.
5593
5594    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5595    to perform their permutation and ascribe the result vectorized statements to
5596    the scalar statements.
5597 */
5598
5599 void
5600 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5601                              gimple_stmt_iterator *gsi)
5602 {
5603   enum machine_mode mode;
5604   vec<tree> result_chain = vNULL;
5605
5606   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5607      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5608      vectors, that are ready for vector computation.  */
5609   result_chain.create (size);
5610
5611   /* If reassociation width for vector type is 2 or greater target machine can
5612      execute 2 or more vector instructions in parallel.  Otherwise try to
5613      get chain for loads group using vect_shift_permute_load_chain.  */
5614   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5615   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5616       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5617                                          gsi, &result_chain))
5618     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5619   vect_record_grouped_load_vectors (stmt, result_chain);
5620   result_chain.release ();
5621 }
5622
5623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5624    generated as part of the vectorization of STMT.  Assign the statement
5625    for each vector to the associated scalar statement.  */
5626
5627 void
5628 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5629 {
5630   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5631   gimple next_stmt, new_stmt;
5632   unsigned int i, gap_count;
5633   tree tmp_data_ref;
5634
5635   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5636      Since we scan the chain starting from it's first node, their order
5637      corresponds the order of data-refs in RESULT_CHAIN.  */
5638   next_stmt = first_stmt;
5639   gap_count = 1;
5640   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5641     {
5642       if (!next_stmt)
5643         break;
5644
5645       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5646        code elimination pass later.  No need to check for the first stmt in
5647        the group, since it always exists.
5648        GROUP_GAP is the number of steps in elements from the previous
5649        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5650        correspond to the gaps.  */
5651       if (next_stmt != first_stmt
5652           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5653       {
5654         gap_count++;
5655         continue;
5656       }
5657
5658       while (next_stmt)
5659         {
5660           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5661           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5662              copies, and we put the new vector statement in the first available
5663              RELATED_STMT.  */
5664           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5665             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5666           else
5667             {
5668               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5669                 {
5670                   gimple prev_stmt =
5671                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5672                   gimple rel_stmt =
5673                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5674                   while (rel_stmt)
5675                     {
5676                       prev_stmt = rel_stmt;
5677                       rel_stmt =
5678                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5679                     }
5680
5681                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5682                     new_stmt;
5683                 }
5684             }
5685
5686           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5687           gap_count = 1;
5688           /* If NEXT_STMT accesses the same DR as the previous statement,
5689              put the same TMP_DATA_REF as its vectorized statement; otherwise
5690              get the next data-ref from RESULT_CHAIN.  */
5691           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5692             break;
5693         }
5694     }
5695 }
5696
5697 /* Function vect_force_dr_alignment_p.
5698
5699    Returns whether the alignment of a DECL can be forced to be aligned
5700    on ALIGNMENT bit boundary.  */
5701
5702 bool
5703 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5704 {
5705   if (TREE_CODE (decl) != VAR_DECL)
5706     return false;
5707
5708   /* With -fno-toplevel-reorder we may have already output the constant.  */
5709   if (TREE_ASM_WRITTEN (decl))
5710     return false;
5711
5712   /* Constant pool entries may be shared and not properly merged by LTO.  */
5713   if (DECL_IN_CONSTANT_POOL (decl))
5714     return false;
5715
5716   if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5717     {
5718       symtab_node *snode;
5719
5720       /* We cannot change alignment of symbols that may bind to symbols
5721          in other translation unit that may contain a definition with lower
5722          alignment.  */
5723       if (!decl_binds_to_current_def_p (decl))
5724         return false;
5725
5726       /* When compiling partition, be sure the symbol is not output by other
5727          partition.  */
5728       snode = symtab_node::get (decl);
5729       if (flag_ltrans
5730           && (snode->in_other_partition
5731               || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
5732         return false;
5733     }
5734
5735   /* Do not override the alignment as specified by the ABI when the used
5736      attribute is set.  */
5737   if (DECL_PRESERVE_P (decl))
5738     return false;
5739
5740   /* Do not override explicit alignment set by the user when an explicit
5741      section name is also used.  This is a common idiom used by many
5742      software projects.  */
5743   if (TREE_STATIC (decl)
5744       && DECL_SECTION_NAME (decl) != NULL
5745       && !symtab_node::get (decl)->implicit_section)
5746     return false;
5747
5748   /* If symbol is an alias, we need to check that target is OK.  */
5749   if (TREE_STATIC (decl))
5750     {
5751       tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5752       if (target != decl)
5753         {
5754           if (DECL_PRESERVE_P (target))
5755             return false;
5756           decl = target;
5757         }
5758     }
5759
5760   if (TREE_STATIC (decl))
5761     return (alignment <= MAX_OFILE_ALIGNMENT);
5762   else
5763     return (alignment <= MAX_STACK_ALIGNMENT);
5764 }
5765
5766
5767 /* Return whether the data reference DR is supported with respect to its
5768    alignment.
5769    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5770    it is aligned, i.e., check if it is possible to vectorize it with different
5771    alignment.  */
5772
5773 enum dr_alignment_support
5774 vect_supportable_dr_alignment (struct data_reference *dr,
5775                                bool check_aligned_accesses)
5776 {
5777   gimple stmt = DR_STMT (dr);
5778   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5779   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5780   enum machine_mode mode = TYPE_MODE (vectype);
5781   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5782   struct loop *vect_loop = NULL;
5783   bool nested_in_vect_loop = false;
5784
5785   if (aligned_access_p (dr) && !check_aligned_accesses)
5786     return dr_aligned;
5787
5788   /* For now assume all conditional loads/stores support unaligned
5789      access without any special code.  */
5790   if (is_gimple_call (stmt)
5791       && gimple_call_internal_p (stmt)
5792       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5793           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5794     return dr_unaligned_supported;
5795
5796   if (loop_vinfo)
5797     {
5798       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5799       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5800     }
5801
5802   /* Possibly unaligned access.  */
5803
5804   /* We can choose between using the implicit realignment scheme (generating
5805      a misaligned_move stmt) and the explicit realignment scheme (generating
5806      aligned loads with a REALIGN_LOAD).  There are two variants to the
5807      explicit realignment scheme: optimized, and unoptimized.
5808      We can optimize the realignment only if the step between consecutive
5809      vector loads is equal to the vector size.  Since the vector memory
5810      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5811      is guaranteed that the misalignment amount remains the same throughout the
5812      execution of the vectorized loop.  Therefore, we can create the
5813      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5814      at the loop preheader.
5815
5816      However, in the case of outer-loop vectorization, when vectorizing a
5817      memory access in the inner-loop nested within the LOOP that is now being
5818      vectorized, while it is guaranteed that the misalignment of the
5819      vectorized memory access will remain the same in different outer-loop
5820      iterations, it is *not* guaranteed that is will remain the same throughout
5821      the execution of the inner-loop.  This is because the inner-loop advances
5822      with the original scalar step (and not in steps of VS).  If the inner-loop
5823      step happens to be a multiple of VS, then the misalignment remains fixed
5824      and we can use the optimized realignment scheme.  For example:
5825
5826       for (i=0; i<N; i++)
5827         for (j=0; j<M; j++)
5828           s += a[i+j];
5829
5830      When vectorizing the i-loop in the above example, the step between
5831      consecutive vector loads is 1, and so the misalignment does not remain
5832      fixed across the execution of the inner-loop, and the realignment cannot
5833      be optimized (as illustrated in the following pseudo vectorized loop):
5834
5835       for (i=0; i<N; i+=4)
5836         for (j=0; j<M; j++){
5837           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5838                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5839                          // (assuming that we start from an aligned address).
5840           }
5841
5842      We therefore have to use the unoptimized realignment scheme:
5843
5844       for (i=0; i<N; i+=4)
5845           for (j=k; j<M; j+=4)
5846           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5847                            // that the misalignment of the initial address is
5848                            // 0).
5849
5850      The loop can then be vectorized as follows:
5851
5852       for (k=0; k<4; k++){
5853         rt = get_realignment_token (&vp[k]);
5854         for (i=0; i<N; i+=4){
5855           v1 = vp[i+k];
5856           for (j=k; j<M; j+=4){
5857             v2 = vp[i+j+VS-1];
5858             va = REALIGN_LOAD <v1,v2,rt>;
5859             vs += va;
5860             v1 = v2;
5861           }
5862         }
5863     } */
5864
5865   if (DR_IS_READ (dr))
5866     {
5867       bool is_packed = false;
5868       tree type = (TREE_TYPE (DR_REF (dr)));
5869
5870       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5871           && (!targetm.vectorize.builtin_mask_for_load
5872               || targetm.vectorize.builtin_mask_for_load ()))
5873         {
5874           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875           if ((nested_in_vect_loop
5876                && (TREE_INT_CST_LOW (DR_STEP (dr))
5877                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5878               || !loop_vinfo)
5879             return dr_explicit_realign;
5880           else
5881             return dr_explicit_realign_optimized;
5882         }
5883       if (!known_alignment_for_access_p (dr))
5884         is_packed = not_size_aligned (DR_REF (dr));
5885
5886       if ((TYPE_USER_ALIGN (type) && !is_packed)
5887           || targetm.vectorize.
5888                support_vector_misalignment (mode, type,
5889                                             DR_MISALIGNMENT (dr), is_packed))
5890         /* Can't software pipeline the loads, but can at least do them.  */
5891         return dr_unaligned_supported;
5892     }
5893   else
5894     {
5895       bool is_packed = false;
5896       tree type = (TREE_TYPE (DR_REF (dr)));
5897
5898       if (!known_alignment_for_access_p (dr))
5899         is_packed = not_size_aligned (DR_REF (dr));
5900
5901      if ((TYPE_USER_ALIGN (type) && !is_packed)
5902          || targetm.vectorize.
5903               support_vector_misalignment (mode, type,
5904                                            DR_MISALIGNMENT (dr), is_packed))
5905        return dr_unaligned_supported;
5906     }
5907
5908   /* Unsupported.  */
5909   return dr_unaligned_unsupported;
5910 }