gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "backend.h"
  91 #include "predict.h"
  92 #include "tree.h"
  93 #include "gimple.h"
  94 #include "rtl.h"
  95 #include "ssa.h"
  96 #include "flags.h"
  97 #include "alias.h"
  98 #include "fold-const.h"
  99 #include "internal-fn.h"
 100 #include "gimple-fold.h"
 101 #include "gimple-iterator.h"
 102 #include "gimplify.h"
 103 #include "gimplify-me.h"
 104 #include "stor-layout.h"
 105 #include "tree-cfg.h"
 106 #include "tree-dfa.h"
 107 #include "tree-ssa.h"
 108 #include "tree-pass.h"
 109 #include "alloc-pool.h"
 110 #include "target.h"
 111 #include "gimple-pretty-print.h"
 112 #include "builtins.h"
 113 #include "params.h"
 114 #include "insn-codes.h"
 115 #include "optabs-tree.h"
 116
 117 /* This structure represents one basic block that either computes a
 118    division, or is a common dominator for basic blocks that compute a
 119    division.  */
 120 struct occurrence {
 121   /* The basic block represented by this structure.  */
 122   basic_block bb;
 123
 124   /* If non-NULL, the definition of the reciprocal that can be used in
 125      BB: one inserted in BB or one inherited from a dominator.  */
 126   tree recip_def;
 127
 128   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 129      was inserted in BB.  */
 130   gimple *recip_def_stmt;
 131
 132   /* Pointer to a list of "struct occurrence"s for blocks dominated
 133      by BB.  */
 134   struct occurrence *children;
 135
 136   /* Pointer to the next "struct occurrence" in the list of blocks
 137      sharing a common dominator.  */
 138   struct occurrence *next;
 139
 140   /* The number of divisions that are in BB before compute_merit.  The
 141      number of divisions that are in BB or post-dominate it after
 142      compute_merit.  */
 143   int num_divisions;
 144
 145   /* True if the basic block has a division, false if it is a common
 146      dominator for basic blocks that do.  If it is false and trapping
 147      math is active, BB is not a candidate for inserting a reciprocal.  */
 148   bool bb_has_division;
 149 };
 150
 151 static struct
 152 {
 153   /* Number of 1.0/X statements inserted by insert_reciprocals.  */
 154   int rdivs_inserted;
 155
 156   /* Number of calls replaced by a reciprocal builtin (1.0/FUNC).  */
 157   int rfuncs_inserted;
 158 } reciprocal_stats;
 159
 160 static struct
 161 {
 162   /* Number of cexpi calls inserted by execute_cse_sincos_1.  */
 163   int inserted;
 164 } sincos_stats;
 165
 166 static struct
 167 {
 168   /* Number of hand-written 16-bit nop / bswaps found.  */
 169   int found_16bit;
 170
 171   /* Number of hand-written 32-bit nop / bswaps found.  */
 172   int found_32bit;
 173
 174   /* Number of hand-written 64-bit nop / bswaps found.  */
 175   int found_64bit;
 176 } nop_stats, bswap_stats;
 177
 178 static struct
 179 {
 180   /* Number of widening multiplication ops inserted.  */
 181   int widen_mults_inserted;
 182
 183   /* Number of integer multiply-and-accumulate ops inserted.  */
 184   int maccs_inserted;
 185
 186   /* Number of fp fused multiply-add ops inserted.  */
 187   int fmas_inserted;
 188 } widen_mul_stats;
 189
 190 /* Head of the list of "struct occurrence"s for the highest interesting
 191    blocks in the dominator tree; NULL while no divisor is processed.  */
 192 static struct occurrence *occ_head;
 193
 194 /* Allocation pool for "struct occurrence"s, live during the recip pass.  */
 195 static object_allocator<occurrence> *occ_pool;
 196
 197
 198
 199 /* Allocate and return a zeroed struct occurrence for basic block BB whose
 200    children list is headed by CHILDREN; it is also recorded in BB->aux.  */
 201 static struct occurrence *
 202 occ_new (basic_block bb, struct occurrence *children)
 203 {
 204   struct occurrence *occ;
 205   /* BB->aux lets later lookups map the block back to its occurrence.  */
 206   bb->aux = occ = occ_pool->allocate ();
 207   memset (occ, 0, sizeof (struct occurrence));
 208
 209   occ->bb = bb;
 210   occ->children = children;
 211   return occ;
 212 }
 213
 214
 215 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 216    list of "struct occurrence"s, one per basic block, having IDOM as
 217    their common dominator.
 218
 219    We try to insert NEW_OCC as deep as possible in the tree, and we also
 220    insert any other block that is a common dominator for NEW_OCC's block
 221    and one block already in the tree.  */
 222
 223 static void
 224 insert_bb (struct occurrence *new_occ, basic_block idom,
 225            struct occurrence **p_head)
 226 {
 227   struct occurrence *occ, **p_occ;
 228
 229   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 230     {
 231       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 232       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 233       if (dom == bb)
 234         {
 235           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 236              from its list.  */
 237           *p_occ = occ->next;
 238           occ->next = new_occ->children;
 239           new_occ->children = occ;
 240
 241           /* Try the next block (it may as well be dominated by BB).  */
 242         }
 243
 244       else if (dom == occ_bb)
 245         {
 246           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 247           insert_bb (new_occ, dom, &occ->children);
 248           return;
 249         }
 250
 251       else if (dom != idom)
 252         {
 253           gcc_assert (!dom->aux);
 254           /* DOM gets a new occurrence below, so it must not have one yet.  */
 255           /* There is a dominator between IDOM and BB, add it and make
 256              two children out of NEW_OCC and OCC.  First, remove OCC from
 257              its list.  */
 258           *p_occ = occ->next;
 259           new_occ->next = occ;
 260           occ->next = NULL;
 261
 262           /* None of the previous blocks has DOM as a dominator: if we tail
 263              recursed, we would reexamine them uselessly. Just switch BB with
 264              DOM, and go on looking for blocks dominated by DOM.  */
 265           new_occ = occ_new (dom, new_occ);
 266         }
 267
 268       else
 269         {
 270           /* Nothing special, go on with the next element.  */
 271           p_occ = &occ->next;
 272         }
 273     }
 274   /* No deeper place was found.  Link NEW_OCC (which may by now be a newly
 275      created common dominator) at the head of the list at P_HEAD.  */
 276   new_occ->next = *p_head;
 277   *p_head = new_occ;
 278 }
 279
 280 /* Register that we found a division in BB, creating BB's occurrence and
 281    inserting it into the dominator tree subset if it has none yet.  */
 282 static inline void
 283 register_division_in (basic_block bb)
 284 {
 285   struct occurrence *occ;
 286   /* BB->aux holds the occurrence already created for BB, if any.  */
 287   occ = (struct occurrence *) bb->aux;
 288   if (!occ)
 289     {
 290       occ = occ_new (bb, NULL);
 291       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 292     }
 293   /* An occurrence created only as a common dominator may gain a division.  */
 294   occ->bb_has_division = true;
 295   occ->num_divisions++;
 296 }
 297
 298
 299 /* Compute the number of divisions that postdominate each block in OCC and
 300    its children.  Children are processed first; a child's count is added
 301    to OCC's only if the post-dominance test below succeeds.  */
 302 static void
 303 compute_merit (struct occurrence *occ)
 304 {
 305   struct occurrence *occ_child;
 306   basic_block dom = occ->bb;
 307
 308   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 309     {
 310       basic_block bb;
 311       if (occ_child->children)
 312         compute_merit (occ_child);
 313       /* NOTE(review): presumably skips EH edges leaving DOM - confirm.  */
 314       if (flag_exceptions)
 315         bb = single_noncomplex_succ (dom);
 316       else
 317         bb = dom;
 318       /* Only divisions in post-dominating children count for OCC.  */
 319       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 320         occ->num_divisions += occ_child->num_divisions;
 321     }
 322 }
 323
 324
 325 /* Return whether USE_STMT is a floating-point division X / DEF, X != DEF.  */
 326 static inline bool
 327 is_division_by (gimple *use_stmt, tree def)
 328 {
 329   return is_gimple_assign (use_stmt)
 330          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 331          && gimple_assign_rhs2 (use_stmt) == def
 332          /* Do not recognize x / x as a valid division, as we would get
 333             confused later when replacing all immediate uses of x in such
 334             a stmt.  */
 335          && gimple_assign_rhs1 (use_stmt) != def;
 336 }
 337
 338 /* Walk the subset of the dominator tree rooted at OCC, setting the
 339    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 340    the given basic block.  The field may be left NULL, of course,
 341    if it is not possible or profitable to do the optimization; a new
 342    reciprocal is inserted only if NUM_DIVISIONS reaches THRESHOLD.
 343    DEF_GSI is an iterator pointing at the statement defining DEF, or
 344    NULL when the caller has none.  If RECIP_DEF is set, a dominator
 345    already has a computation that can be used.  */
 346
 347 static void
 348 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 349                     tree def, tree recip_def, int threshold)
 350 {
 351   tree type;
 352   gassign *new_stmt;
 353   gimple_stmt_iterator gsi;
 354   struct occurrence *occ_child;
 355
 356   if (!recip_def
 357       && (occ->bb_has_division || !flag_trapping_math)
 358       && occ->num_divisions >= threshold)
 359     {
 360       /* Make a variable with the replacement and substitute it.  */
 361       type = TREE_TYPE (def);
 362       recip_def = create_tmp_reg (type, "reciptmp");
 363       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 364                                       build_one_cst (type), def);
 365
 366       if (occ->bb_has_division)
 367         {
 368           /* Case 1: insert before an existing division.  */
 369           gsi = gsi_after_labels (occ->bb);
 370           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 371             gsi_next (&gsi);
 372
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375       else if (def_gsi && occ->bb == def_gsi->bb)
 376         {
 377           /* Case 2: insert right after the definition.  Note that this will
 378              never happen if the definition statement can throw, because in
 379              that case the sole successor of the statement's basic block will
 380              dominate all the uses as well.  */
 381           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 382         }
 383       else
 384         {
 385           /* Case 3: insert in a basic block not containing defs/uses.  */
 386           gsi = gsi_after_labels (occ->bb);
 387           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 388         }
 389
 390       reciprocal_stats.rdivs_inserted++;
 391       /* Remember the new statement so that replace_reciprocal skips it.  */
 392       occ->recip_def_stmt = new_stmt;
 393     }
 394   /* Children inherit this block's reciprocal (new or from a dominator).  */
 395   occ->recip_def = recip_def;
 396   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 397     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 398 }
 399
 400
 401 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 402    possible, i.e. if the block is optimized for speed and a reciprocal is
 403    available in it.  The reciprocal computation itself is left alone.  */
 404 static inline void
 405 replace_reciprocal (use_operand_p use_p)
 406 {
 407   gimple *use_stmt = USE_STMT (use_p);
 408   basic_block bb = gimple_bb (use_stmt);
 409   struct occurrence *occ = (struct occurrence *) bb->aux;
 410   /* The statement computing the reciprocal also divides by DEF; skip it.  */
 411   if (optimize_bb_for_speed_p (bb)
 412       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 413     {
 414       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 415       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 416       SET_USE (use_p, occ->recip_def);
 417       fold_stmt_inplace (&gsi);
 418       update_stmt (use_stmt);
 419     }
 420 }
 421
 422
 423 /* Free OCC, clearing the AUX field of its block, and return one more
 424    "struct occurrence" to be freed, or NULL if nothing is left.  */
 425 static struct occurrence *
 426 free_bb (struct occurrence *occ)
 427 {
 428   struct occurrence *child, *next;
 429
 430   /* First get the two pointers hanging off OCC.  */
 431   next = occ->next;
 432   child = occ->children;
 433   occ->bb->aux = NULL;
 434   occ_pool->remove (occ);
 435
 436   /* Now ensure that we don't recurse unless it is necessary.  */
 437   if (!child)
 438     return next;
 439   else
 440     {
 441       while (next)
 442         next = free_bb (next);
 443       /* The siblings are gone; the caller continues with the children.  */
 444       return child;
 445     }
 446 }
 447
 448
 449 /* Look for floating-point divisions among DEF's uses, and try to
 450    replace them by multiplications with the reciprocal.  Add
 451    as many statements computing the reciprocal as needed.
 452    DEF_GSI, if non-NULL, points at the statement defining DEF.
 453    DEF must be a GIMPLE register of a floating-point type.  */
 454
 455 static void
 456 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 457 {
 458   use_operand_p use_p;
 459   imm_use_iterator use_iter;
 460   struct occurrence *occ;
 461   int count = 0, threshold;
 462
 463   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 464   /* Build the tree of occurrences for the blocks dividing by DEF.  */
 465   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 466     {
 467       gimple *use_stmt = USE_STMT (use_p);
 468       if (is_division_by (use_stmt, def))
 469         {
 470           register_division_in (gimple_bb (use_stmt));
 471           count++;
 472         }
 473     }
 474
 475   /* Do the expensive part only if we can hope to optimize something.  */
 476   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 477   if (count >= threshold)
 478     {
 479       gimple *use_stmt;
 480       for (occ = occ_head; occ; occ = occ->next)
 481         {
 482           compute_merit (occ);
 483           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 484         }
 485       /* Rewrite the divisions for which a reciprocal is available.  */
 486       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 487         {
 488           if (is_division_by (use_stmt, def))
 489             {
 490               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 491                 replace_reciprocal (use_p);
 492             }
 493         }
 494     }
 495   /* Release the occurrences and clear the AUX fields of their blocks.  */
 496   for (occ = occ_head; occ; )
 497     occ = free_bb (occ);
 498
 499   occ_head = NULL;
 500 }
 501
 502 /* Go through all the floating-point SSA_NAMEs, and call
 503    execute_cse_reciprocals_1 on each of them.  */
 504 namespace {
 505 /* Pass descriptor for the reciprocal CSE pass ("recip").  */
 506 const pass_data pass_data_cse_reciprocals =
 507 {
 508   GIMPLE_PASS, /* type */
 509   "recip", /* name */
 510   OPTGROUP_NONE, /* optinfo_flags */
 511   TV_NONE, /* tv_id */
 512   PROP_ssa, /* properties_required */
 513   0, /* properties_provided */
 514   0, /* properties_destroyed */
 515   0, /* todo_flags_start */
 516   TODO_update_ssa, /* todo_flags_finish */
 517 };
 518
 519 class pass_cse_reciprocals : public gimple_opt_pass
 520 {
 521 public:
 522   pass_cse_reciprocals (gcc::context *ctxt)
 523     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 524   {}
 525   /* The pass runs only when optimizing with flag_reciprocal_math set.  */
 526   /* opt_pass methods: */
 527   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 528   virtual unsigned int execute (function *);
 529
 530 }; // class pass_cse_reciprocals
 531 /* Run both reciprocal transformations on FUN; always returns 0.  */
 532 unsigned int
 533 pass_cse_reciprocals::execute (function *fun)
 534 {
 535   basic_block bb;
 536   tree arg;
 537   /* The pool backs the occurrences built for each divisor.  */
 538   occ_pool = new object_allocator<occurrence> ("dominators for recip");
 539
 540   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 541   calculate_dominance_info (CDI_DOMINATORS);
 542   calculate_dominance_info (CDI_POST_DOMINATORS);
 543   /* AUX fields map blocks to occurrences, so they must start out clear.  */
 544 #ifdef ENABLE_CHECKING
 545   FOR_EACH_BB_FN (bb, fun)
 546     gcc_assert (!bb->aux);
 547 #endif
 548   /* Floating-point arguments: process their default definitions.  */
 549   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 550     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 551         && is_gimple_reg (arg))
 552       {
 553         tree name = ssa_default_def (fun, arg);
 554         if (name)
 555           execute_cse_reciprocals_1 (NULL, name);
 556       }
 557
 558   FOR_EACH_BB_FN (bb, fun)
 559     {
 560       tree def;
 561       /* Results of non-virtual floating-point PHIs.  */
 562       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 563            gsi_next (&gsi))
 564         {
 565           gphi *phi = gsi.phi ();
 566           def = PHI_RESULT (phi);
 567           if (! virtual_operand_p (def)
 568               && FLOAT_TYPE_P (TREE_TYPE (def)))
 569             execute_cse_reciprocals_1 (NULL, def);
 570         }
 571       /* Floating-point SSA names defined by ordinary statements.  */
 572       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 573            gsi_next (&gsi))
 574         {
 575           gimple *stmt = gsi_stmt (gsi);
 576
 577           if (gimple_has_lhs (stmt)
 578               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 579               && FLOAT_TYPE_P (TREE_TYPE (def))
 580               && TREE_CODE (def) == SSA_NAME)
 581             execute_cse_reciprocals_1 (&gsi, def);
 582         }
 583       /* Skip the a/func(b) rewrite in blocks optimized for size.  */
 584       if (optimize_bb_for_size_p (bb))
 585         continue;
 586
 587       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 588       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 589            gsi_next (&gsi))
 590         {
 591           gimple *stmt = gsi_stmt (gsi);
 592           tree fndecl;
 593
 594           if (is_gimple_assign (stmt)
 595               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 596             {
 597               tree arg1 = gimple_assign_rhs2 (stmt);
 598               gimple *stmt1;
 599
 600               if (TREE_CODE (arg1) != SSA_NAME)
 601                 continue;
 602               /* The divisor must be the result of a builtin call.  */
 603               stmt1 = SSA_NAME_DEF_STMT (arg1);
 604
 605               if (is_gimple_call (stmt1)
 606                   && gimple_call_lhs (stmt1)
 607                   && (fndecl = gimple_call_fndecl (stmt1))
 608                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 609                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 610                 {
 611                   enum built_in_function code;
 612                   bool md_code, fail;
 613                   imm_use_iterator ui;
 614                   use_operand_p use_p;
 615
 616                   code = DECL_FUNCTION_CODE (fndecl);
 617                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 618                   /* Ask the target for a reciprocal variant of the builtin.  */
 619                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 620                   if (!fndecl)
 621                     continue;
 622
 623                   /* Check that all non-debug uses of the SSA name are
 624                      divisions by it, otherwise replacing the defining
 625                      statement will do the wrong thing.  */
 626                   fail = false;
 627                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 628                     {
 629                       gimple *stmt2 = USE_STMT (use_p);
 630                       if (is_gimple_debug (stmt2))
 631                         continue;
 632                       if (!is_gimple_assign (stmt2)
 633                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 634                           || gimple_assign_rhs1 (stmt2) == arg1
 635                           || gimple_assign_rhs2 (stmt2) != arg1)
 636                         {
 637                           fail = true;
 638                           break;
 639                         }
 640                     }
 641                   if (fail)
 642                     continue;
 643                   /* Make the call compute the reciprocal into ARG1.  */
 644                   gimple_replace_ssa_lhs (stmt1, arg1);
 645                   gimple_call_set_fndecl (stmt1, fndecl);
 646                   update_stmt (stmt1);
 647                   reciprocal_stats.rfuncs_inserted++;
 648                   /* NOTE(review): this walk also visits debug uses - confirm.  */
 649                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 650                     {
 651                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 652                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 653                       fold_stmt_inplace (&gsi);
 654                       update_stmt (stmt);
 655                     }
 656                 }
 657             }
 658         }
 659     }
 660   /* Report the statistics and release the pass-local data.  */
 661   statistics_counter_event (fun, "reciprocal divs inserted",
 662                             reciprocal_stats.rdivs_inserted);
 663   statistics_counter_event (fun, "reciprocal functions inserted",
 664                             reciprocal_stats.rfuncs_inserted);
 665
 666   free_dominance_info (CDI_DOMINATORS);
 667   free_dominance_info (CDI_POST_DOMINATORS);
 668   delete occ_pool;
 669   return 0;
 670 }
 671
 672 } // anon namespace
 673 /* Create a new instance of the reciprocal CSE pass for context CTXT.  */
 674 gimple_opt_pass *
 675 make_pass_cse_reciprocals (gcc::context *ctxt)
 676 {
 677   return new pass_cse_reciprocals (ctxt);
 678 }
 679
 680 /* Records an occurrence at statement USE_STMT in the vector of statements
 681    STMTS if its block is dominated by *TOP_BB or dominates it, or if
 682    *TOP_BB is still NULL.  Returns true if the occurrence was pushed on
 683    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 684    statements in the vector.  */
 685
 686 static bool
 687 maybe_record_sincos (vec<gimple *> *stmts,
 688                      basic_block *top_bb, gimple *use_stmt)
 689 {
 690   basic_block use_bb = gimple_bb (use_stmt);
 691   if (*top_bb
 692       && (*top_bb == use_bb
 693           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 694     stmts->safe_push (use_stmt);
 695   else if (!*top_bb
 696            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 697     {
 698       stmts->safe_push (use_stmt);
 699       *top_bb = use_bb;
 700     }
 701   else
 702     return false;
 703
 704   return true;
 705 }
 706
 707 /* Look for sin, cos and cexpi calls with the same argument NAME and
 708    create a single call to cexpi CSEing the result in this case.
 709    We first walk over all immediate uses of the argument collecting
 710    statements that we can CSE in a vector and in a second pass replace
 711    the statement rhs with a REALPART or IMAGPART expression on the
 712    result of the cexpi call we insert in the block dominating all
 713    candidates.  Returns true if EH edges were purged, i.e. the CFG
 714    changed.  */
 715 static bool
 716 execute_cse_sincos_1 (tree name)
 717 {
 718   gimple_stmt_iterator gsi;
 719   imm_use_iterator use_iter;
 720   tree fndecl, res, type;
 721   gimple *def_stmt, *use_stmt, *stmt;
 722   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 723   auto_vec<gimple *> stmts;
 724   basic_block top_bb = NULL;
 725   int i;
 726   bool cfg_changed = false;
 727
 728   type = TREE_TYPE (name);
 729   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 730     {
 731       if (gimple_code (use_stmt) != GIMPLE_CALL
 732           || !gimple_call_lhs (use_stmt)
 733           || !(fndecl = gimple_call_fndecl (use_stmt))
 734           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 735         continue;
 736
 737       switch (DECL_FUNCTION_CODE (fndecl))
 738         {
 739         CASE_FLT_FN (BUILT_IN_COS):
 740           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 741           break;
 742
 743         CASE_FLT_FN (BUILT_IN_SIN):
 744           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 745           break;
 746
 747         CASE_FLT_FN (BUILT_IN_CEXPI):
 748           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 749           break;
 750
 751         default:;
 752         }
 753     }
 754   /* Nothing to CSE unless two different kinds of calls were recorded.  */
 755   if (seen_cos + seen_sin + seen_cexpi <= 1)
 756     return false;
 757
 758   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 759      the name def statement.  */
 760   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 761   if (!fndecl)
 762     return false;
 763   stmt = gimple_build_call (fndecl, 1, name);
 764   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 765   gimple_call_set_lhs (stmt, res);
 766   /* The call goes right after NAME's definition if that is in TOP_BB.  */
 767   def_stmt = SSA_NAME_DEF_STMT (name);
 768   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 769       && gimple_code (def_stmt) != GIMPLE_PHI
 770       && gimple_bb (def_stmt) == top_bb)
 771     {
 772       gsi = gsi_for_stmt (def_stmt);
 773       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 774     }
 775   else
 776     {
 777       gsi = gsi_after_labels (top_bb);
 778       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 779     }
 780   sincos_stats.inserted++;
 781
 782   /* And adjust the recorded old call sites.  */
 783   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 784     {
 785       tree rhs = NULL;
 786       fndecl = gimple_call_fndecl (use_stmt);
 787
 788       switch (DECL_FUNCTION_CODE (fndecl))
 789         {
 790         CASE_FLT_FN (BUILT_IN_COS):
 791           rhs = fold_build1 (REALPART_EXPR, type, res);
 792           break;
 793
 794         CASE_FLT_FN (BUILT_IN_SIN):
 795           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 796           break;
 797
 798         CASE_FLT_FN (BUILT_IN_CEXPI):
 799           rhs = res;
 800           break;
 801
 802         default:;
 803           gcc_unreachable ();
 804         }
 805
 806         /* Replace call with a copy.  */
 807         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 808
 809         gsi = gsi_for_stmt (use_stmt);
 810         gsi_replace (&gsi, stmt, true);
 811         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 812           cfg_changed = true;
 813     }
 814
 815   return cfg_changed;
 816 }
 817
 818 /* To evaluate powi(x,n), the floating point value x raised to the
 819    constant integer exponent n, we use a hybrid algorithm that
 820    combines the "window method" with look-up tables.  For an
 821    introduction to exponentiation algorithms and "addition chains",
 822    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 823    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 824    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 825    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 826
 827 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 828    multiplications to inline before calling the system library's pow
 829    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 830    so this default never requires calling pow, powf or powl.  */
 831
 832 #ifndef POWI_MAX_MULTS
 833 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 834 #endif
 835
 836 /* The size of the "optimal power tree" lookup table.  All
 837    exponents less than this value are simply looked up in the
 838    powi_table below.  This threshold is also used to size the
 839    cache of pseudo registers that hold intermediate results.  */
 840 #define POWI_TABLE_SIZE 256
 841
 842 /* The size, in bits of the window, used in the "window method"
 843    exponentiation algorithm.  This is equivalent to a radix of
 844    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 845 #define POWI_WINDOW_SIZE 3
 846
 847 /* The following table is an efficient representation of an
 848    "optimal power tree".  For each value, i, the corresponding
 849    value, j, in the table states that an optimal evaluation
 850    sequence for calculating pow(x,i) can be found by evaluating
 851    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 852    100 integers is given in Knuth's "Seminumerical algorithms".  */
 853
/* powi_table[I] is the split point J for exponent I.  The trailing
   comment on each row gives the range of I covered by that row.  The
   table is consulted by powi_lookup_cost and powi_as_mults_1.  */
static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
 889
 890
 891 /* Return the number of multiplications required to calculate
 892    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 893    subroutine of powi_cost.  CACHE is an array indicating
 894    which exponents have already been calculated.  */
 895
 896 static int
 897 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 898 {
 899   /* If we've already calculated this exponent, then this evaluation
 900      doesn't require any additional multiplications.  */
 901   if (cache[n])
 902     return 0;
 903
 904   cache[n] = true;
 905   return powi_lookup_cost (n - powi_table[n], cache)
 906          + powi_lookup_cost (powi_table[n], cache) + 1;
 907 }
 908
 909 /* Return the number of multiplications required to calculate
 910    powi(x,n) for an arbitrary x, given the exponent N.  This
 911    function needs to be kept in sync with powi_as_mults below.  */
 912
 913 static int
 914 powi_cost (HOST_WIDE_INT n)
 915 {
 916   bool cache[POWI_TABLE_SIZE];
 917   unsigned HOST_WIDE_INT digit;
 918   unsigned HOST_WIDE_INT val;
 919   int result;
 920
 921   if (n == 0)
 922     return 0;
 923
 924   /* Ignore the reciprocal when calculating the cost.  */
 925   val = (n < 0) ? -n : n;
 926
 927   /* Initialize the exponent cache.  */
 928   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 929   cache[1] = true;
 930
 931   result = 0;
 932
 933   while (val >= POWI_TABLE_SIZE)
 934     {
 935       if (val & 1)
 936         {
 937           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 938           result += powi_lookup_cost (digit, cache)
 939                     + POWI_WINDOW_SIZE + 1;
 940           val >>= POWI_WINDOW_SIZE;
 941         }
 942       else
 943         {
 944           val >>= 1;
 945           result++;
 946         }
 947     }
 948
 949   return result + powi_lookup_cost (val, cache);
 950 }
 951
 952 /* Recursive subroutine of powi_as_mults.  This function takes the
 953    array, CACHE, of already calculated exponents and an exponent N and
 954    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 955
 956 static tree
 957 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 958                  HOST_WIDE_INT n, tree *cache)
 959 {
 960   tree op0, op1, ssa_target;
 961   unsigned HOST_WIDE_INT digit;
 962   gassign *mult_stmt;
 963
 964   if (n < POWI_TABLE_SIZE && cache[n])
 965     return cache[n];
 966
 967   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 968
 969   if (n < POWI_TABLE_SIZE)
 970     {
 971       cache[n] = ssa_target;
 972       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 973       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 974     }
 975   else if (n & 1)
 976     {
 977       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 978       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 979       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 980     }
 981   else
 982     {
 983       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 984       op1 = op0;
 985     }
 986
 987   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
 988   gimple_set_location (mult_stmt, loc);
 989   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 990
 991   return ssa_target;
 992 }
 993
 994 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 995    This function needs to be kept in sync with powi_cost above.  */
 996
 997 static tree
 998 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 999                tree arg0, HOST_WIDE_INT n)
1000 {
1001   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1002   gassign *div_stmt;
1003   tree target;
1004
1005   if (n == 0)
1006     return build_real (type, dconst1);
1007
1008   memset (cache, 0,  sizeof (cache));
1009   cache[1] = arg0;
1010
1011   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1012   if (n >= 0)
1013     return result;
1014
1015   /* If the original exponent was negative, reciprocate the result.  */
1016   target = make_temp_ssa_name (type, NULL, "powmult");
1017   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1018                                   build_real (type, dconst1), result);
1019   gimple_set_location (div_stmt, loc);
1020   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1021
1022   return target;
1023 }
1024
1025 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1026    location info LOC.  If the arguments are appropriate, create an
1027    equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
1029    result.  */
1030
1031 static tree
1032 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1033                             tree arg0, HOST_WIDE_INT n)
1034 {
1035   /* Avoid largest negative number.  */
1036   if (n != -n
1037       && ((n >= -1 && n <= 2)
1038           || (optimize_function_for_speed_p (cfun)
1039               && powi_cost (n) <= POWI_MAX_MULTS)))
1040     return powi_as_mults (gsi, loc, arg0, n);
1041
1042   return NULL_TREE;
1043 }
1044
1045 /* Build a gimple call statement that calls FN with argument ARG.
1046    Set the lhs of the call statement to a fresh SSA name.  Insert the
1047    statement prior to GSI's current position, and return the fresh
1048    SSA name.  */
1049
1050 static tree
1051 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1052                        tree fn, tree arg)
1053 {
1054   gcall *call_stmt;
1055   tree ssa_target;
1056
1057   call_stmt = gimple_build_call (fn, 1, arg);
1058   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1059   gimple_set_lhs (call_stmt, ssa_target);
1060   gimple_set_location (call_stmt, loc);
1061   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1062
1063   return ssa_target;
1064 }
1065
/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name of ARG0's type
   whose name is based on NAME.  Insert the statement prior to GSI's
   current position, and return the fresh SSA name.  */
1070
1071 static tree
1072 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1073                         const char *name, enum tree_code code,
1074                         tree arg0, tree arg1)
1075 {
1076   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1077   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1078   gimple_set_location (stmt, loc);
1079   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1080   return result;
1081 }
1082
1083 /* Build a gimple reference operation with the given CODE and argument
1084    ARG, assigning the result to a new SSA name of TYPE with NAME.
1085    Insert the statement prior to GSI's current position, and return
1086    the fresh SSA name.  */
1087
1088 static inline tree
1089 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1090                       const char *name, enum tree_code code, tree arg0)
1091 {
1092   tree result = make_temp_ssa_name (type, NULL, name);
1093   gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1094   gimple_set_location (stmt, loc);
1095   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1096   return result;
1097 }
1098
1099 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1100    prior to GSI's current position, and return the fresh SSA name.  */
1101
1102 static tree
1103 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1104                        tree type, tree val)
1105 {
1106   tree result = make_ssa_name (type);
1107   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1108   gimple_set_location (stmt, loc);
1109   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1110   return result;
1111 }
1112
/* Describes how a fractional exponent decomposes into a sum of powers
   of 0.5.  Filled in by representable_as_half_series_p.  */
struct pow_synth_sqrt_info
{
  /* FACTORS[i] is true when the term 0.5**(i+1) takes part in the sum.
     The storage is supplied by whoever sets up the structure.  */
  bool *factors;
  /* One more than the index of the last term that is set; this is the
     depth of the longest sqrt chain needed.  Stays 0 when no
     representation was found.  */
  unsigned int deepest;
  /* Number of terms set ahead of the last one, i.e. how many
     multiplications are needed to combine all the terms.  */
  unsigned int num_mults;
};
1119
1120 /* Return true iff the real value C can be represented as a
1121    sum of powers of 0.5 up to N.  That is:
1122    C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1123    Record in INFO the various parameters of the synthesis algorithm such
1124    as the factors a[i], the maximum 0.5 power and the number of
1125    multiplications that will be required.  */
1126
1127 bool
1128 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1129                                  struct pow_synth_sqrt_info *info)
1130 {
1131   REAL_VALUE_TYPE factor = dconsthalf;
1132   REAL_VALUE_TYPE remainder = c;
1133
1134   info->deepest = 0;
1135   info->num_mults = 0;
1136   memset (info->factors, 0, n * sizeof (bool));
1137
1138   for (unsigned i = 0; i < n; i++)
1139     {
1140       REAL_VALUE_TYPE res;
1141
1142       /* If something inexact happened bail out now.  */
1143       if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1144         return false;
1145
1146       /* We have hit zero.  The number is representable as a sum
1147          of powers of 0.5.  */
1148       if (real_equal (&res, &dconst0))
1149         {
1150           info->factors[i] = true;
1151           info->deepest = i + 1;
1152           return true;
1153         }
1154       else if (!REAL_VALUE_NEGATIVE (res))
1155         {
1156           remainder = res;
1157           info->factors[i] = true;
1158           info->num_mults++;
1159         }
1160       else
1161         info->factors[i] = false;
1162
1163       real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1164     }
1165   return false;
1166 }
1167
1168 /* Return the tree corresponding to FN being applied
1169    to ARG N times at GSI and LOC.
1170    Look up previous results from CACHE if need be.
1171    cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1172
1173 static tree
1174 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1175               tree fn, location_t loc, tree *cache)
1176 {
1177   tree res = cache[n];
1178   if (!res)
1179     {
1180       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1181       res = build_and_insert_call (gsi, loc, fn, prev);
1182       cache[n] = res;
1183     }
1184
1185   return res;
1186 }
1187
1188 /* Print to STREAM the repeated application of function FNAME to ARG
1189    N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1190    "foo (foo (x))".  */
1191
static void
print_nested_fn (FILE* stream, const char *fname, const char* arg,
                 unsigned int n)
{
  unsigned int depth;

  /* Open N calls, outermost first.  */
  for (depth = 0; depth < n; depth++)
    fprintf (stream, "%s (", fname);

  /* The innermost operand.  */
  fprintf (stream, "%s", arg);

  /* Close each of the N calls again.  */
  for (depth = 0; depth < n; depth++)
    fprintf (stream, ")");
}
1205
1206 /* Print to STREAM the fractional sequence of sqrt chains
1207    applied to ARG, described by INFO.  Used for the dump file.  */
1208
1209 static void
1210 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1211                                 struct pow_synth_sqrt_info *info)
1212 {
1213   for (unsigned int i = 0; i < info->deepest; i++)
1214     {
1215       bool is_set = info->factors[i];
1216       if (is_set)
1217         {
1218           print_nested_fn (stream, "sqrt", arg, i + 1);
1219           if (i != info->deepest - 1)
1220             fprintf (stream, " * ");
1221         }
1222     }
1223 }
1224
1225 /* Print to STREAM a representation of raising ARG to an integer
1226    power N.  Used for the dump file.  */
1227
1228 static void
1229 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1230 {
1231   if (n > 1)
1232     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1233   else if (n == 1)
1234     fprintf (stream, "%s", arg);
1235 }
1236
1237 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1238    square roots.  Place at GSI and LOC.  Limit the maximum depth
1239    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1240    result of the expanded sequence or NULL_TREE if the expansion failed.
1241
1242    This routine assumes that ARG1 is a real number with a fractional part
1243    (the integer exponent case will have been handled earlier in
1244    gimple_expand_builtin_pow).
1245
1246    For ARG1 > 0.0:
1247    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1248      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1249                     FRAC_PART == ARG1 - WHOLE_PART:
1250      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1251      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1252      if it can be expressed as such, that is if FRAC_PART satisfies:
1253      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1254      where integer a[i] is either 0 or 1.
1255
1256      Example:
1257      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1258        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1259
1260    For ARG1 < 0.0 there are two approaches:
1261    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1262          is calculated as above.
1263
1264      Example:
1265      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1266        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1267
1268    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1269            FRAC_PART  := ARG1 - WHOLE_PART
1270      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1271      Example:
1272      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1273        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1274
1275    For ARG1 < 0.0 we choose between (A) and (B) depending on
1276    how many multiplications we'd have to do.
1277    So, for the example in (B): POW (x, -5.875), if we were to
1278    follow algorithm (A) we would produce:
1279    1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1280    which contains more multiplications than approach (B).
1281
1282    Hopefully, this approach will eliminate potentially expensive POW library
1283    calls when unsafe floating point math is enabled and allow the compiler to
1284    further optimise the multiplies, square roots and divides produced by this
1285    function.  */
1286
static tree
expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
                     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
{
  tree type = TREE_TYPE (arg0);
  machine_mode mode = TYPE_MODE (type);
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  /* Only meaningful for a negative exponent: true selects the
     1.0 / (...) form, false the sqrt-product / powi form.  */
  bool one_over = true;

  if (!sqrtfn)
    return NULL_TREE;

  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);

  gcc_assert (max_depth > 0);
  /* CACHE[k] holds sqrt applied K times to ARG0 once it has been
     built; slot 0 is ARG0 itself.  */
  tree *cache = XALLOCAVEC (tree, max_depth + 1);

  struct pow_synth_sqrt_info synth_info;
  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
  synth_info.deepest = 0;
  synth_info.num_mults = 0;

  /* Work on the magnitude of the exponent and remember its sign.  */
  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);

  /* The whole and fractional parts of exp.  */
  REAL_VALUE_TYPE whole_part;
  REAL_VALUE_TYPE frac_part;

  real_floor (&whole_part, mode, &exp);
  real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);


  /* Alternative split used only for negative exponents: round the
     magnitude up and take the shortfall as the fractional part.  */
  REAL_VALUE_TYPE ceil_whole = dconst0;
  REAL_VALUE_TYPE ceil_fract = dconst0;

  if (neg_exp)
    {
      real_ceil (&ceil_whole, mode, &exp);
      real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
    }

  /* The floor-based split must be expressible in any case.  */
  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
    return NULL_TREE;

  /* Check whether it's more profitable to not use 1.0 / ...  */
  if (neg_exp)
    {
      struct pow_synth_sqrt_info alt_synth_info;
      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
      alt_synth_info.deepest = 0;
      alt_synth_info.num_mults = 0;

      /* Switch to the ceil-based split only when it needs no deeper
         sqrt chain and strictly fewer multiplications.  */
      if (representable_as_half_series_p (ceil_fract, max_depth,
                                           &alt_synth_info)
          && alt_synth_info.deepest <= synth_info.deepest
          && alt_synth_info.num_mults < synth_info.num_mults)
        {
          whole_part = ceil_whole;
          frac_part = ceil_fract;
          synth_info.deepest = alt_synth_info.deepest;
          synth_info.num_mults = alt_synth_info.num_mults;
          memcpy (synth_info.factors, alt_synth_info.factors,
                  (max_depth + 1) * sizeof (bool));
          one_over = false;
        }
    }

  /* Give up if the whole part does not survive a round trip through
     HOST_WIDE_INT.  */
  HOST_WIDE_INT n = real_to_integer (&whole_part);
  REAL_VALUE_TYPE cint;
  real_from_integer (&cint, VOIDmode, n, SIGNED);

  if (!real_identical (&whole_part, &cint))
    return NULL_TREE;

  /* Integer-power and sqrt-product multiplications share one budget.  */
  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
    return NULL_TREE;

  memset (cache, 0, (max_depth + 1) * sizeof (tree));

  /* N == 0 gives the constant one and N == 1 gives ARG0 itself; larger
     N is expanded just below.  */
  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;

  /* Calculate the integer part of the exponent.  */
  if (n > 1)
    {
      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
      if (!integer_res)
        return NULL_TREE;
    }

  /* Describe the chosen expansion in the dump file, in the same shape
     as the statements built further down.  */
  if (dump_file)
    {
      char string[64];

      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);

      if (neg_exp)
        {
          if (one_over)
            {
              fprintf (dump_file, "1.0 / (");
              dump_integer_part (dump_file, "x", n);
              if (n > 0)
                fprintf (dump_file, " * ");
              dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
              fprintf (dump_file, ")");
            }
          else
            {
              dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
              fprintf (dump_file, " / (");
              dump_integer_part (dump_file, "x", n);
              fprintf (dump_file, ")");
            }
        }
      else
        {
          dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
          if (n > 0)
            fprintf (dump_file, " * ");
          dump_integer_part (dump_file, "x", n);
        }

      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
    }


  tree fract_res = NULL_TREE;
  cache[0] = arg0;

  /* Calculate the fractional part of the exponent.  */
  for (unsigned i = 0; i < synth_info.deepest; i++)
    {
      if (synth_info.factors[i])
        {
          /* Term I is sqrt applied I + 1 times; shorter chains already
             built are reused through CACHE.  */
          tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);

          /* The first term starts the product; later ones are
             multiplied in.  */
          if (!fract_res)
              fract_res = sqrt_chain;

          else
            fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                           fract_res, sqrt_chain);
        }
    }

  tree res = NULL_TREE;

  /* Combine the integer and fractional results.  */
  if (neg_exp)
    {
      if (one_over)
        {
          /* 1.0 / (integer part * fractional part); the integer part
             is left out when N is zero.  */
          if (n > 0)
            res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                           fract_res, integer_res);
          else
            res = fract_res;

          res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
                                          build_real (type, dconst1), res);
        }
      else
        {
          /* fractional part / integer part, using the ceil-based
             split.  */
          res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         fract_res, integer_res);
        }
    }
  else
    /* Positive exponent: fractional part * integer part.  */
    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                   fract_res, integer_res);
  return res;
}
1463
1464 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1465    with location info LOC.  If possible, create an equivalent and
1466    less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  */
1468
static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
  machine_mode mode;
  bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* Build the constant 0.25 by lowering the binary exponent of 1.0
     by two.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c_is_int = real_identical (&c, &cint);

  /* Exponents from -1 to 2 are handed over unconditionally; other
     integers only with unsafe math, when optimizing this block for
     speed, and within the multiplication budget.  */
  if (c_is_int
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && speed_p
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && real_equal (&c, &dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_equal (&c, &dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 1/6 is formed
     by lowering the binary exponent of 1/3 by one.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && speed_p
      && hw_sqrt_exists
      && real_equal (&c, &dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }


  /* Attempt to expand the POW as a product of square root chains.
     Expand the 0.25 case even when optimising for size.  */
  if (flag_unsafe_math_optimizations
      && sqrtfn
      && hw_sqrt_exists
      && (speed_p || real_equal (&c, &dconst1_4))
      && !HONOR_SIGNED_ZEROS (mode))
    {
      /* When not optimizing for speed only the 0.25 exponent reaches
         this point, and a chain depth of 2 is enough for it.  */
      unsigned int max_depth = speed_p
                                ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
                                : 2;

      tree expand_with_sqrts
        = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);

      if (expand_with_sqrts)
        return expand_with_sqrts;
    }

  /* Record whether 2*C is an integer; the cbrt-based transformation
     below is not applied to such exponents.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c2_is_int = real_identical (&c2, &cint);

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.  */
  /* Round 3*C to an integer N, then rebuild N/3 in MODE so the test
     below can check that it reproduces C exactly.  */
  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
1645
1646 /* ARG is the argument to a cabs builtin call in GSI with location info
1647    LOC.  Create a sequence of statements prior to GSI that calculates
1648    sqrt(R*R + I*I), where R and I are the real and imaginary components
1649    of ARG, respectively.  Return an expression holding the result.  */
1650
1651 static tree
1652 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1653 {
1654   tree real_part, imag_part, addend1, addend2, sum, result;
1655   tree type = TREE_TYPE (TREE_TYPE (arg));
1656   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1657   machine_mode mode = TYPE_MODE (type);
1658
1659   if (!flag_unsafe_math_optimizations
1660       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1661       || !sqrtfn
1662       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1663     return NULL_TREE;
1664
1665   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1666                                     REALPART_EXPR, arg);
1667   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1668                                     real_part, real_part);
1669   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1670                                     IMAGPART_EXPR, arg);
1671   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1672                                     imag_part, imag_part);
1673   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1674   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1675
1676   return result;
1677 }
1678
1679 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1680    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1681    an optimal number of multiplies, when n is a constant.  */
1682
1683 namespace {
1684
/* Descriptor for the "sincos" GIMPLE pass.  The trailing comment on
   each initializer names the pass_data field it fills.  */
const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
1697
/* Pass object for the "sincos" pass, constructed from
   pass_data_cse_sincos.  */
class pass_cse_sincos : public gimple_opt_pass
{
public:
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  /* The pass is enabled whenever OPTIMIZE is nonzero.  */
  virtual bool gate (function *)
    {
      /* We no longer require either sincos or cexp, since powi expansion
         piggybacks on this pass.  */
      return optimize;
    }

  /* The body is defined out of line, after the class.  */
  virtual unsigned int execute (function *);

}; // class pass_cse_sincos
1716
1717 unsigned int
1718 pass_cse_sincos::execute (function *fun)
1719 {
1720   basic_block bb;
1721   bool cfg_changed = false;
1722
1723   calculate_dominance_info (CDI_DOMINATORS);
1724   memset (&sincos_stats, 0, sizeof (sincos_stats));
1725
1726   FOR_EACH_BB_FN (bb, fun)
1727     {
1728       gimple_stmt_iterator gsi;
1729       bool cleanup_eh = false;
1730
1731       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1732         {
1733           gimple *stmt = gsi_stmt (gsi);
1734           tree fndecl;
1735
1736           /* Only the last stmt in a bb could throw, no need to call
1737              gimple_purge_dead_eh_edges if we change something in the middle
1738              of a basic block.  */
1739           cleanup_eh = false;
1740
1741           if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)
1742               && gimple_call_lhs (stmt))
1743             {
1744               tree arg, arg0, arg1, result;
1745               HOST_WIDE_INT n;
1746               location_t loc;
1747
1748               fndecl = gimple_call_fndecl (stmt);
1749               switch (DECL_FUNCTION_CODE (fndecl))
1750                 {
1751                 CASE_FLT_FN (BUILT_IN_COS):
1752                 CASE_FLT_FN (BUILT_IN_SIN):
1753                 CASE_FLT_FN (BUILT_IN_CEXPI):
1754                   /* Make sure we have either sincos or cexp.  */
1755                   if (!targetm.libc_has_function (function_c99_math_complex)
1756                       && !targetm.libc_has_function (function_sincos))
1757                     break;
1758
1759                   arg = gimple_call_arg (stmt, 0);
1760                   if (TREE_CODE (arg) == SSA_NAME)
1761                     cfg_changed |= execute_cse_sincos_1 (arg);
1762                   break;
1763
1764                 CASE_FLT_FN (BUILT_IN_POW):
1765                   arg0 = gimple_call_arg (stmt, 0);
1766                   arg1 = gimple_call_arg (stmt, 1);
1767
1768                   loc = gimple_location (stmt);
1769                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1770
1771                   if (result)
1772                     {
1773                       tree lhs = gimple_get_lhs (stmt);
1774                       gassign *new_stmt = gimple_build_assign (lhs, result);
1775                       gimple_set_location (new_stmt, loc);
1776                       unlink_stmt_vdef (stmt);
1777                       gsi_replace (&gsi, new_stmt, true);
1778                       cleanup_eh = true;
1779                       if (gimple_vdef (stmt))
1780                         release_ssa_name (gimple_vdef (stmt));
1781                     }
1782                   break;
1783
1784                 CASE_FLT_FN (BUILT_IN_POWI):
1785                   arg0 = gimple_call_arg (stmt, 0);
1786                   arg1 = gimple_call_arg (stmt, 1);
1787                   loc = gimple_location (stmt);
1788
1789                   if (real_minus_onep (arg0))
1790                     {
1791                       tree t0, t1, cond, one, minus_one;
1792                       gassign *stmt;
1793
1794                       t0 = TREE_TYPE (arg0);
1795                       t1 = TREE_TYPE (arg1);
1796                       one = build_real (t0, dconst1);
1797                       minus_one = build_real (t0, dconstm1);
1798
1799                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1800                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1801                                                   arg1, build_int_cst (t1, 1));
1802                       gimple_set_location (stmt, loc);
1803                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1804
1805                       result = make_temp_ssa_name (t0, NULL, "powi");
1806                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1807                                                   minus_one, one);
1808                       gimple_set_location (stmt, loc);
1809                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1810                     }
1811                   else
1812                     {
1813                       if (!tree_fits_shwi_p (arg1))
1814                         break;
1815
1816                       n = tree_to_shwi (arg1);
1817                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1818                     }
1819
1820                   if (result)
1821                     {
1822                       tree lhs = gimple_get_lhs (stmt);
1823                       gassign *new_stmt = gimple_build_assign (lhs, result);
1824                       gimple_set_location (new_stmt, loc);
1825                       unlink_stmt_vdef (stmt);
1826                       gsi_replace (&gsi, new_stmt, true);
1827                       cleanup_eh = true;
1828                       if (gimple_vdef (stmt))
1829                         release_ssa_name (gimple_vdef (stmt));
1830                     }
1831                   break;
1832
1833                 CASE_FLT_FN (BUILT_IN_CABS):
1834                   arg0 = gimple_call_arg (stmt, 0);
1835                   loc = gimple_location (stmt);
1836                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1837
1838                   if (result)
1839                     {
1840                       tree lhs = gimple_get_lhs (stmt);
1841                       gassign *new_stmt = gimple_build_assign (lhs, result);
1842                       gimple_set_location (new_stmt, loc);
1843                       unlink_stmt_vdef (stmt);
1844                       gsi_replace (&gsi, new_stmt, true);
1845                       cleanup_eh = true;
1846                       if (gimple_vdef (stmt))
1847                         release_ssa_name (gimple_vdef (stmt));
1848                     }
1849                   break;
1850
1851                 default:;
1852                 }
1853             }
1854         }
1855       if (cleanup_eh)
1856         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1857     }
1858
1859   statistics_counter_event (fun, "sincos statements inserted",
1860                             sincos_stats.inserted);
1861
1862   free_dominance_info (CDI_DOMINATORS);
1863   return cfg_changed ? TODO_cleanup_cfg : 0;
1864 }
1865
1866 } // anon namespace
1867
1868 gimple_opt_pass *
1869 make_pass_cse_sincos (gcc::context *ctxt)
1870 {
1871   return new pass_cse_sincos (ctxt);
1872 }
1873
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of octet sized markers:

   0    - target byte has the value 0
   FF   - target byte has an unknown value (eg. due to sign extension)
   1..size - target byte is a copy of a source byte; the marker value is
             the index of that source byte plus one (1 designates the
             lowest order byte).

   To detect permutations on memory sources (arrays and structures), a symbolic
   number is also associated with a base address (the array or structure the
   load is made from), an offset from the base address and a range which gives
   the difference between the highest and lowest accessed memory location to
   make such a symbolic number.  The range is thus different from size which
   reflects the size of the type of current expression.  Note that for non
   memory source, range holds the same value as size.

   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
   still have a size of 2 but this time a range of 1.  */

struct symbolic_number {
  /* The markers, one per byte, lowest order byte in the lowest bits.  */
  uint64_t n;
  /* Type of the expression the symbolic number currently stands for.  */
  tree type;
  /* Base of the memory access, or NULL_TREE for a non memory source.  */
  tree base_addr;
  /* Variable offset from BASE_ADDR, or NULL_TREE if there is none.  */
  tree offset;
  /* Constant byte position of the access.  */
  HOST_WIDE_INT bytepos;
  /* Alias pointer type to use for a load replacing the expression.  */
  tree alias_set;
  /* Virtual use of the load(s) the number is built from.  */
  tree vuse;
  /* Range as described above.  */
  unsigned HOST_WIDE_INT range;
};
1904
/* Number of bits used by one marker of a symbolic number.  */
#define BITS_PER_MARKER 8
/* Mask selecting the marker of the lowest order byte.  */
#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
/* Marker of a byte whose value is unknown: all marker bits set.  */
#define MARKER_BYTE_UNKNOWN MARKER_MASK
/* Marker of the highest order byte of the SIZE bytes symbolic number N,
   left in place (i.e. not shifted down to the low bits).  The result is a
   uint64_t and is non zero iff that marker is non zero.  */
#define HEAD_MARKER(n, size) \
  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a nop.  The number is masked according to the size of
   the symbolic number before using it.  */
#define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
  (uint64_t)0x08070605 << 32 | 0x04030201)

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a byte swap.  The number is masked according to the
   size of the symbolic number before using it.  */
#define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
  (uint64_t)0x01020304 << 32 | 0x05060708)
1922
1923 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1924    number N.  Return false if the requested operation is not permitted
1925    on a symbolic number.  */
1926
1927 static inline bool
1928 do_shift_rotate (enum tree_code code,
1929                  struct symbolic_number *n,
1930                  int count)
1931 {
1932   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1933   unsigned head_marker;
1934
1935   if (count % BITS_PER_UNIT != 0)
1936     return false;
1937   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1938
1939   /* Zero out the extra bits of N in order to avoid them being shifted
1940      into the significant bits.  */
1941   if (size < 64 / BITS_PER_MARKER)
1942     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1943
1944   switch (code)
1945     {
1946     case LSHIFT_EXPR:
1947       n->n <<= count;
1948       break;
1949     case RSHIFT_EXPR:
1950       head_marker = HEAD_MARKER (n->n, size);
1951       n->n >>= count;
1952       /* Arithmetic shift of signed type: result is dependent on the value.  */
1953       if (!TYPE_UNSIGNED (n->type) && head_marker)
1954         for (i = 0; i < count / BITS_PER_MARKER; i++)
1955           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1956                   << ((size - 1 - i) * BITS_PER_MARKER);
1957       break;
1958     case LROTATE_EXPR:
1959       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1960       break;
1961     case RROTATE_EXPR:
1962       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1963       break;
1964     default:
1965       return false;
1966     }
1967   /* Zero unused bits for size.  */
1968   if (size < 64 / BITS_PER_MARKER)
1969     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1970   return true;
1971 }
1972
1973 /* Perform sanity checking for the symbolic number N and the gimple
1974    statement STMT.  */
1975
1976 static inline bool
1977 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1978 {
1979   tree lhs_type;
1980
1981   lhs_type = gimple_expr_type (stmt);
1982
1983   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1984     return false;
1985
1986   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1987     return false;
1988
1989   return true;
1990 }
1991
1992 /* Initialize the symbolic number N for the bswap pass from the base element
1993    SRC manipulated by the bitwise OR expression.  */
1994
1995 static bool
1996 init_symbolic_number (struct symbolic_number *n, tree src)
1997 {
1998   int size;
1999
2000   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2001
2002   /* Set up the symbolic number N by setting each byte to a value between 1 and
2003      the byte size of rhs1.  The highest order byte is set to n->size and the
2004      lowest order byte to 1.  */
2005   n->type = TREE_TYPE (src);
2006   size = TYPE_PRECISION (n->type);
2007   if (size % BITS_PER_UNIT != 0)
2008     return false;
2009   size /= BITS_PER_UNIT;
2010   if (size > 64 / BITS_PER_MARKER)
2011     return false;
2012   n->range = size;
2013   n->n = CMPNOP;
2014
2015   if (size < 64 / BITS_PER_MARKER)
2016     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2017
2018   return true;
2019 }
2020
2021 /* Check if STMT might be a byte swap or a nop from a memory source and returns
2022    the answer. If so, REF is that memory source and the base of the memory area
2023    accessed and the offset of the access from that base are recorded in N.  */
2024
2025 bool
2026 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2027 {
2028   /* Leaf node is an array or component ref. Memorize its base and
2029      offset from base to compare to other such leaf node.  */
2030   HOST_WIDE_INT bitsize, bitpos;
2031   machine_mode mode;
2032   int unsignedp, volatilep;
2033   tree offset, base_addr;
2034
2035   /* Not prepared to handle PDP endian.  */
2036   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2037     return false;
2038
2039   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2040     return false;
2041
2042   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2043                                    &unsignedp, &volatilep, false);
2044
2045   if (TREE_CODE (base_addr) == MEM_REF)
2046     {
2047       offset_int bit_offset = 0;
2048       tree off = TREE_OPERAND (base_addr, 1);
2049
2050       if (!integer_zerop (off))
2051         {
2052           offset_int boff, coff = mem_ref_offset (base_addr);
2053           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2054           bit_offset += boff;
2055         }
2056
2057       base_addr = TREE_OPERAND (base_addr, 0);
2058
2059       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
2060       if (wi::neg_p (bit_offset))
2061         {
2062           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2063           offset_int tem = bit_offset.and_not (mask);
2064           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2065              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
2066           bit_offset -= tem;
2067           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2068           if (offset)
2069             offset = size_binop (PLUS_EXPR, offset,
2070                                     wide_int_to_tree (sizetype, tem));
2071           else
2072             offset = wide_int_to_tree (sizetype, tem);
2073         }
2074
2075       bitpos += bit_offset.to_shwi ();
2076     }
2077
2078   if (bitpos % BITS_PER_UNIT)
2079     return false;
2080   if (bitsize % BITS_PER_UNIT)
2081     return false;
2082
2083   if (!init_symbolic_number (n, ref))
2084     return false;
2085   n->base_addr = base_addr;
2086   n->offset = offset;
2087   n->bytepos = bitpos / BITS_PER_UNIT;
2088   n->alias_set = reference_alias_ptr_type (ref);
2089   n->vuse = gimple_vuse (stmt);
2090   return true;
2091 }
2092
/* Compute the symbolic number N representing the result of a bitwise OR on 2
   symbolic number N1 and N2 whose source statements are respectively
   SOURCE_STMT1 and SOURCE_STMT2.

   Return NULL if the two numbers cannot be merged.  Otherwise return the
   source statement to use for the merged number: SOURCE_STMT1 when both
   source statements have the same rhs1, else the source statement of the
   number with the lowest byte position.

   Note that when the sources differ, the markers of one of N1 and N2 are
   adjusted in place.  */

static gimple *
perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
                        gimple *source_stmt2, struct symbolic_number *n2,
                        struct symbolic_number *n)
{
  int i, size;
  uint64_t mask;
  gimple *source_stmt;
  struct symbolic_number *n_start;

  /* Sources are different, cancel bswap if they are not memory location with
     the same base (array, structure, ...).  */
  if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
    {
      uint64_t inc;
      HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
      struct symbolic_number *toinc_n_ptr, *n_end;

      if (!n1->base_addr || !n2->base_addr
          || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
        return NULL;

      /* Either both or none must have a variable offset, and it must be
         the same one.  */
      if (!n1->offset != !n2->offset
          || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
        return NULL;

      /* Find the lowest address at which a load is performed; START_SUB is
         the distance in bytes between the two starting positions.  */
      if (n1->bytepos < n2->bytepos)
        {
          n_start = n1;
          start_sub = n2->bytepos - n1->bytepos;
          source_stmt = source_stmt1;
        }
      else
        {
          n_start = n2;
          start_sub = n1->bytepos - n2->bytepos;
          source_stmt = source_stmt2;
        }

      /* Find the highest address at which a load is performed and
         compute related info.  */
      end1 = n1->bytepos + (n1->range - 1);
      end2 = n2->bytepos + (n2->range - 1);
      if (end1 < end2)
        {
          end = end2;
          end_sub = end2 - end1;
        }
      else
        {
          end = end1;
          end_sub = end1 - end2;
        }
      n_end = (end2 > end1) ? n2 : n1;

      /* Find symbolic number whose lsb is the most significant.  */
      if (BYTES_BIG_ENDIAN)
        toinc_n_ptr = (n_end == n1) ? n2 : n1;
      else
        toinc_n_ptr = (n_start == n1) ? n2 : n1;

      n->range = end - n_start->bytepos + 1;

      /* Check that the range of memory covered can be represented by
         a symbolic number.  */
      if (n->range > 64 / BITS_PER_MARKER)
        return NULL;

      /* Reinterpret byte marks in symbolic number holding the value of
         bigger weight according to target endianness.  */
      inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
      size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
      for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
        {
          unsigned marker
            = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
          /* Zero and unknown markers do not designate a source byte and
             are thus left untouched.  */
          if (marker && marker != MARKER_BYTE_UNKNOWN)
            toinc_n_ptr->n += inc;
        }
    }
  else
    {
      n->range = n1->range;
      n_start = n1;
      source_stmt = source_stmt1;
    }

  /* Keep the alias pointer type of N1 if N1 has none or if it is compatible
     with the one of N2, fall back to ptr_type_node otherwise.  */
  if (!n1->alias_set
      || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
    n->alias_set = n1->alias_set;
  else
    n->alias_set = ptr_type_node;
  n->vuse = n_start->vuse;
  n->base_addr = n_start->base_addr;
  n->offset = n_start->offset;
  n->bytepos = n_start->bytepos;
  n->type = n_start->type;
  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;

  /* The merge is only possible if, for each byte, at most one of the two
     numbers has a non zero marker or both have the same marker.  */
  for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
    {
      uint64_t masked1, masked2;

      masked1 = n1->n & mask;
      masked2 = n2->n & mask;
      if (masked1 && masked2 && masked1 != masked2)
        return NULL;
    }
  n->n = n1->n | n2->n;

  return source_stmt;
}
2209
/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
   the operation given by the rhs of STMT on the result.  If the operation
   could successfully be executed the function returns a gimple stmt whose
   rhs's first tree is the expression of the source operand and NULL
   otherwise.

   LIMIT bounds the depth of the recursion: it is decremented on each
   recursive call and NULL is returned once it reaches zero.  */

static gimple *
find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
{
  enum tree_code code;
  tree rhs1, rhs2 = NULL;
  gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
  enum gimple_rhs_class rhs_class;

  if (!limit || !is_gimple_assign (stmt))
    return NULL;

  rhs1 = gimple_assign_rhs1 (stmt);

  /* A load is a leaf: N has been set up from the memory reference.  */
  if (find_bswap_or_nop_load (stmt, rhs1, n))
    return stmt;

  if (TREE_CODE (rhs1) != SSA_NAME)
    return NULL;

  code = gimple_assign_rhs_code (stmt);
  rhs_class = gimple_assign_rhs_class (stmt);
  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);

  if (rhs_class == GIMPLE_BINARY_RHS)
    rhs2 = gimple_assign_rhs2 (stmt);

  /* Handle unary rhs and binary rhs with integer constants as second
     operand.  */

  if (rhs_class == GIMPLE_UNARY_RHS
      || (rhs_class == GIMPLE_BINARY_RHS
          && TREE_CODE (rhs2) == INTEGER_CST))
    {
      if (code != BIT_AND_EXPR
          && code != LSHIFT_EXPR
          && code != RSHIFT_EXPR
          && code != LROTATE_EXPR
          && code != RROTATE_EXPR
          && !CONVERT_EXPR_CODE_P (code))
        return NULL;

      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);

      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
         we have to initialize the symbolic number.  */
      if (!source_stmt1)
        {
          if (gimple_assign_load_p (stmt)
              || !init_symbolic_number (n, rhs1))
            return NULL;
          source_stmt1 = stmt;
        }

      /* Apply the operation of STMT to the symbolic number.  */
      switch (code)
        {
        case BIT_AND_EXPR:
          {
            int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
            uint64_t val = int_cst_value (rhs2), mask = 0;
            uint64_t tmp = (1 << BITS_PER_UNIT) - 1;

            /* Only constants masking full bytes are allowed.  */
            for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
              if ((val & tmp) != 0 && (val & tmp) != tmp)
                return NULL;
              else if (val & tmp)
                mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);

            /* MASK keeps the markers of the bytes kept by the constant.  */
            n->n &= mask;
          }
          break;
        case LSHIFT_EXPR:
        case RSHIFT_EXPR:
        case LROTATE_EXPR:
        case RROTATE_EXPR:
          if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
            return NULL;
          break;
        CASE_CONVERT:
          {
            int i, type_size, old_type_size;
            tree type;

            type = gimple_expr_type (stmt);
            type_size = TYPE_PRECISION (type);
            if (type_size % BITS_PER_UNIT != 0)
              return NULL;
            type_size /= BITS_PER_UNIT;
            if (type_size > 64 / BITS_PER_MARKER)
              return NULL;

            /* Sign extension: result is dependent on the value.  */
            old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
            if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
                && HEAD_MARKER (n->n, old_type_size))
              for (i = 0; i < type_size - old_type_size; i++)
                n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
                        << ((type_size - 1 - i) * BITS_PER_MARKER);

            if (type_size < 64 / BITS_PER_MARKER)
              {
                /* If STMT casts to a smaller type mask out the bits not
                   belonging to the target type.  */
                n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
              }
            n->type = type;
            /* For a memory source the range describes the memory accessed
               and is thus not affected by the conversion.  */
            if (!n->base_addr)
              n->range = type_size;
          }
          break;
        default:
          return NULL;
        };
      return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
    }

  /* Handle binary rhs.  */

  if (rhs_class == GIMPLE_BINARY_RHS)
    {
      struct symbolic_number n1, n2;
      gimple *source_stmt, *source_stmt2;

      if (code != BIT_IOR_EXPR)
        return NULL;

      if (TREE_CODE (rhs2) != SSA_NAME)
        return NULL;

      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);

      switch (code)
        {
        case BIT_IOR_EXPR:
          /* Compute the symbolic number of each operand separately, then
             merge them into N.  */
          source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);

          if (!source_stmt1)
            return NULL;

          source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);

          if (!source_stmt2)
            return NULL;

          if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
            return NULL;

          /* Either both or none of the operands must have a virtual use,
             and it must be the same one.  */
          if (!n1.vuse != !n2.vuse
              || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
            return NULL;

          source_stmt
            = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);

          if (!source_stmt)
            return NULL;

          if (!verify_symbolic_number_p (n, stmt))
            return NULL;

          break;
        default:
          return NULL;
        }
      return source_stmt;
    }
  return NULL;
}
2384
2385 /* Check if STMT completes a bswap implementation or a read in a given
2386    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2387    accordingly.  It also sets N to represent the kind of operations
2388    performed: size of the resulting expression and whether it works on
2389    a memory source, and if so alias-set and vuse.  At last, the
2390    function returns a stmt whose rhs's first tree is the source
2391    expression.  */
2392
2393 static gimple *
2394 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2395 {
2396 /* The number which the find_bswap_or_nop_1 result should match in order
2397    to have a full byte swap.  The number is shifted to the right
2398    according to the size of the symbolic number before using it.  */
2399   uint64_t cmpxchg = CMPXCHG;
2400   uint64_t cmpnop = CMPNOP;
2401
2402   gimple *source_stmt;
2403   int limit;
2404
2405   /* The last parameter determines the depth search limit.  It usually
2406      correlates directly to the number n of bytes to be touched.  We
2407      increase that number by log2(n) + 1 here in order to also
2408      cover signed -> unsigned conversions of the src operand as can be seen
2409      in libgcc, and for initial shift/and operation of the src operand.  */
2410   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2411   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2412   source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2413
2414   if (!source_stmt)
2415     return NULL;
2416
2417   /* Find real size of result (highest non-zero byte).  */
2418   if (n->base_addr)
2419     {
2420       int rsize;
2421       uint64_t tmpn;
2422
2423       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2424       n->range = rsize;
2425     }
2426
2427   /* Zero out the extra bits of N and CMP*.  */
2428   if (n->range < (int) sizeof (int64_t))
2429     {
2430       uint64_t mask;
2431
2432       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2433       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2434       cmpnop &= mask;
2435     }
2436
2437   /* A complete byte swap should make the symbolic number to start with
2438      the largest digit in the highest order byte. Unchanged symbolic
2439      number indicates a read with same endianness as target architecture.  */
2440   if (n->n == cmpnop)
2441     *bswap = false;
2442   else if (n->n == cmpxchg)
2443     *bswap = true;
2444   else
2445     return NULL;
2446
2447   /* Useless bit manipulation performed by code.  */
2448   if (!n->base_addr && n->n == cmpnop)
2449     return NULL;
2450
2451   n->range *= BITS_PER_UNIT;
2452   return source_stmt;
2453 }
2454
2455 namespace {
2456
/* Static description of the "bswap" GIMPLE pass.  The pass requires the
   function to be in SSA form (PROP_ssa) and requests no TODO.  */
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  "bswap", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
2469
/* Pass object for the "bswap" pass, built from pass_data_optimize_bswap.  */
class pass_optimize_bswap : public gimple_opt_pass
{
public:
  pass_optimize_bswap (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
  {}

  /* opt_pass methods: */

  /* The pass only runs when optimizing and flag_expensive_optimizations
     is set.  */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  /* Defined out of line, after the class.  */
  virtual unsigned int execute (function *);

}; // class pass_optimize_bswap
2486
/* Perform the bswap optimization: replace the expression computed in the rhs
   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr (non
   null if a load is needed) and BSWAP.  The type, VUSE and alias set of the
   load to perform are also given in N while the builtin bswap to invoke is
   given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one of
   the load statements involved to construct the rhs in CUR_STMT and N->range
   gives the size of the rhs expression for maintaining some statistics.

   Note that if the replacement involves a load, CUR_STMT is moved just before
   SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
   changing basic block.  */
2499
2500 static bool
2501 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2502                tree bswap_type, tree load_type, struct symbolic_number *n,
2503                bool bswap)
2504 {
2505   gimple_stmt_iterator gsi;
2506   tree src, tmp, tgt;
2507   gimple *bswap_stmt;
2508
2509   gsi = gsi_for_stmt (cur_stmt);
2510   src = gimple_assign_rhs1 (src_stmt);
2511   tgt = gimple_assign_lhs (cur_stmt);
2512
2513   /* Need to load the value from memory first.  */
2514   if (n->base_addr)
2515     {
2516       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2517       tree addr_expr, addr_tmp, val_expr, val_tmp;
2518       tree load_offset_ptr, aligned_load_type;
2519       gimple *addr_stmt, *load_stmt;
2520       unsigned align;
2521       HOST_WIDE_INT load_offset = 0;
2522
2523       align = get_object_alignment (src);
2524       /* If the new access is smaller than the original one, we need
2525          to perform big endian adjustment.  */
2526       if (BYTES_BIG_ENDIAN)
2527         {
2528           HOST_WIDE_INT bitsize, bitpos;
2529           machine_mode mode;
2530           int unsignedp, volatilep;
2531           tree offset;
2532
2533           get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2534                                &unsignedp, &volatilep, false);
2535           if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2536             {
2537               load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2538               unsigned HOST_WIDE_INT l
2539                 = (load_offset * BITS_PER_UNIT) & (align - 1);
2540               if (l)
2541                 align = l & -l;
2542             }
2543         }
2544
2545       if (bswap
2546           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2547           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2548         return false;
2549
2550       /* Move cur_stmt just before  one of the load of the original
2551          to ensure it has the same VUSE.  See PR61517 for what could
2552          go wrong.  */
2553       gsi_move_before (&gsi, &gsi_ins);
2554       gsi = gsi_for_stmt (cur_stmt);
2555
2556       /* Compute address to load from and cast according to the size
2557          of the load.  */
2558       addr_expr = build_fold_addr_expr (unshare_expr (src));
2559       if (is_gimple_mem_ref_addr (addr_expr))
2560         addr_tmp = addr_expr;
2561       else
2562         {
2563           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2564                                          "load_src");
2565           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2566           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2567         }
2568
2569       /* Perform the load.  */
2570       aligned_load_type = load_type;
2571       if (align < TYPE_ALIGN (load_type))
2572         aligned_load_type = build_aligned_type (load_type, align);
2573       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2574       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2575                               load_offset_ptr);
2576
2577       if (!bswap)
2578         {
2579           if (n->range == 16)
2580             nop_stats.found_16bit++;
2581           else if (n->range == 32)
2582             nop_stats.found_32bit++;
2583           else
2584             {
2585               gcc_assert (n->range == 64);
2586               nop_stats.found_64bit++;
2587             }
2588
2589           /* Convert the result of load if necessary.  */
2590           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2591             {
2592               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2593                                             "load_dst");
2594               load_stmt = gimple_build_assign (val_tmp, val_expr);
2595               gimple_set_vuse (load_stmt, n->vuse);
2596               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2597               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2598             }
2599           else
2600             {
2601               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2602               gimple_set_vuse (cur_stmt, n->vuse);
2603             }
2604           update_stmt (cur_stmt);
2605
2606           if (dump_file)
2607             {
2608               fprintf (dump_file,
2609                        "%d bit load in target endianness found at: ",
2610                        (int) n->range);
2611               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2612             }
2613           return true;
2614         }
2615       else
2616         {
2617           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2618           load_stmt = gimple_build_assign (val_tmp, val_expr);
2619           gimple_set_vuse (load_stmt, n->vuse);
2620           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2621         }
2622       src = val_tmp;
2623     }
2624
2625   if (n->range == 16)
2626     bswap_stats.found_16bit++;
2627   else if (n->range == 32)
2628     bswap_stats.found_32bit++;
2629   else
2630     {
2631       gcc_assert (n->range == 64);
2632       bswap_stats.found_64bit++;
2633     }
2634
2635   tmp = src;
2636
2637   /* Convert the src expression if necessary.  */
2638   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2639     {
2640       gimple *convert_stmt;
2641
2642       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2643       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2644       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2645     }
2646
2647   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2648      are considered as rotation of 2N bit values by N bits is generally not
2649      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2650      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2651   if (bswap && n->range == 16)
2652     {
2653       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2654       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2655       bswap_stmt = gimple_build_assign (NULL, src);
2656     }
2657   else
2658     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2659
2660   tmp = tgt;
2661
2662   /* Convert the result if necessary.  */
2663   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2664     {
2665       gimple *convert_stmt;
2666
2667       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2668       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2669       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2670     }
2671
2672   gimple_set_lhs (bswap_stmt, tmp);
2673
2674   if (dump_file)
2675     {
2676       fprintf (dump_file, "%d bit bswap implementation found at: ",
2677                (int) n->range);
2678       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2679     }
2680
2681   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2682   gsi_remove (&gsi, true);
2683   return true;
2684 }
2685
2686 /* Find manual byte swap implementations as well as load in a given
2687    endianness. Byte swaps are turned into a bswap builtin invokation
2688    while endian loads are converted to bswap builtin invokation or
2689    simple load according to the target endianness.  */
2690
2691 unsigned int
2692 pass_optimize_bswap::execute (function *fun)
2693 {
2694   basic_block bb;
2695   bool bswap32_p, bswap64_p;
2696   bool changed = false;
2697   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2698
2699   if (BITS_PER_UNIT != 8)
2700     return 0;
2701
2702   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2703                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2704   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2705                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2706                    || (bswap32_p && word_mode == SImode)));
2707
2708   /* Determine the argument type of the builtins.  The code later on
2709      assumes that the return and argument type are the same.  */
2710   if (bswap32_p)
2711     {
2712       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2713       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2714     }
2715
2716   if (bswap64_p)
2717     {
2718       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2719       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2720     }
2721
2722   memset (&nop_stats, 0, sizeof (nop_stats));
2723   memset (&bswap_stats, 0, sizeof (bswap_stats));
2724
2725   FOR_EACH_BB_FN (bb, fun)
2726     {
2727       gimple_stmt_iterator gsi;
2728
2729       /* We do a reverse scan for bswap patterns to make sure we get the
2730          widest match. As bswap pattern matching doesn't handle previously
2731          inserted smaller bswap replacements as sub-patterns, the wider
2732          variant wouldn't be detected.  */
2733       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2734         {
2735           gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2736           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2737           enum tree_code code;
2738           struct symbolic_number n;
2739           bool bswap;
2740
2741           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2742              might be moved to a different basic block by bswap_replace and gsi
2743              must not points to it if that's the case.  Moving the gsi_prev
2744              there make sure that gsi points to the statement previous to
2745              cur_stmt while still making sure that all statements are
2746              considered in this basic block.  */
2747           gsi_prev (&gsi);
2748
2749           if (!is_gimple_assign (cur_stmt))
2750             continue;
2751
2752           code = gimple_assign_rhs_code (cur_stmt);
2753           switch (code)
2754             {
2755             case LROTATE_EXPR:
2756             case RROTATE_EXPR:
2757               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2758                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2759                      % BITS_PER_UNIT)
2760                 continue;
2761               /* Fall through.  */
2762             case BIT_IOR_EXPR:
2763               break;
2764             default:
2765               continue;
2766             }
2767
2768           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2769
2770           if (!src_stmt)
2771             continue;
2772
2773           switch (n.range)
2774             {
2775             case 16:
2776               /* Already in canonical form, nothing to do.  */
2777               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2778                 continue;
2779               load_type = bswap_type = uint16_type_node;
2780               break;
2781             case 32:
2782               load_type = uint32_type_node;
2783               if (bswap32_p)
2784                 {
2785                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2786                   bswap_type = bswap32_type;
2787                 }
2788               break;
2789             case 64:
2790               load_type = uint64_type_node;
2791               if (bswap64_p)
2792                 {
2793                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2794                   bswap_type = bswap64_type;
2795                 }
2796               break;
2797             default:
2798               continue;
2799             }
2800
2801           if (bswap && !fndecl && n.range != 16)
2802             continue;
2803
2804           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2805                              &n, bswap))
2806             changed = true;
2807         }
2808     }
2809
2810   statistics_counter_event (fun, "16-bit nop implementations found",
2811                             nop_stats.found_16bit);
2812   statistics_counter_event (fun, "32-bit nop implementations found",
2813                             nop_stats.found_32bit);
2814   statistics_counter_event (fun, "64-bit nop implementations found",
2815                             nop_stats.found_64bit);
2816   statistics_counter_event (fun, "16-bit bswap implementations found",
2817                             bswap_stats.found_16bit);
2818   statistics_counter_event (fun, "32-bit bswap implementations found",
2819                             bswap_stats.found_32bit);
2820   statistics_counter_event (fun, "64-bit bswap implementations found",
2821                             bswap_stats.found_64bit);
2822
2823   return (changed ? TODO_update_ssa : 0);
2824 }
2825
2826 } // anon namespace
2827
2828 gimple_opt_pass *
2829 make_pass_optimize_bswap (gcc::context *ctxt)
2830 {
2831   return new pass_optimize_bswap (ctxt);
2832 }
2833
2834 /* Return true if stmt is a type conversion operation that can be stripped
2835    when used in a widening multiply operation.  */
2836 static bool
2837 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2838 {
2839   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2840
2841   if (TREE_CODE (result_type) == INTEGER_TYPE)
2842     {
2843       tree op_type;
2844       tree inner_op_type;
2845
2846       if (!CONVERT_EXPR_CODE_P (rhs_code))
2847         return false;
2848
2849       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2850
2851       /* If the type of OP has the same precision as the result, then
2852          we can strip this conversion.  The multiply operation will be
2853          selected to create the correct extension as a by-product.  */
2854       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2855         return true;
2856
2857       /* We can also strip a conversion if it preserves the signed-ness of
2858          the operation and doesn't narrow the range.  */
2859       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2860
2861       /* If the inner-most type is unsigned, then we can strip any
2862          intermediate widening operation.  If it's signed, then the
2863          intermediate widening operation must also be signed.  */
2864       if ((TYPE_UNSIGNED (inner_op_type)
2865            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2866           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2867         return true;
2868
2869       return false;
2870     }
2871
2872   return rhs_code == FIXED_CONVERT_EXPR;
2873 }
2874
2875 /* Return true if RHS is a suitable operand for a widening multiplication,
2876    assuming a target type of TYPE.
2877    There are two cases:
2878
2879      - RHS makes some value at least twice as wide.  Store that value
2880        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2881
2882      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2883        but leave *TYPE_OUT untouched.  */
2884
2885 static bool
2886 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2887                         tree *new_rhs_out)
2888 {
2889   gimple *stmt;
2890   tree type1, rhs1;
2891
2892   if (TREE_CODE (rhs) == SSA_NAME)
2893     {
2894       stmt = SSA_NAME_DEF_STMT (rhs);
2895       if (is_gimple_assign (stmt))
2896         {
2897           if (! widening_mult_conversion_strippable_p (type, stmt))
2898             rhs1 = rhs;
2899           else
2900             {
2901               rhs1 = gimple_assign_rhs1 (stmt);
2902
2903               if (TREE_CODE (rhs1) == INTEGER_CST)
2904                 {
2905                   *new_rhs_out = rhs1;
2906                   *type_out = NULL;
2907                   return true;
2908                 }
2909             }
2910         }
2911       else
2912         rhs1 = rhs;
2913
2914       type1 = TREE_TYPE (rhs1);
2915
2916       if (TREE_CODE (type1) != TREE_CODE (type)
2917           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2918         return false;
2919
2920       *new_rhs_out = rhs1;
2921       *type_out = type1;
2922       return true;
2923     }
2924
2925   if (TREE_CODE (rhs) == INTEGER_CST)
2926     {
2927       *new_rhs_out = rhs;
2928       *type_out = NULL;
2929       return true;
2930     }
2931
2932   return false;
2933 }
2934
2935 /* Return true if STMT performs a widening multiplication, assuming the
2936    output type is TYPE.  If so, store the unwidened types of the operands
2937    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2938    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2939    and *TYPE2_OUT would give the operands of the multiplication.  */
2940
2941 static bool
2942 is_widening_mult_p (gimple *stmt,
2943                     tree *type1_out, tree *rhs1_out,
2944                     tree *type2_out, tree *rhs2_out)
2945 {
2946   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2947
2948   if (TREE_CODE (type) != INTEGER_TYPE
2949       && TREE_CODE (type) != FIXED_POINT_TYPE)
2950     return false;
2951
2952   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2953                                rhs1_out))
2954     return false;
2955
2956   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2957                                rhs2_out))
2958     return false;
2959
2960   if (*type1_out == NULL)
2961     {
2962       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2963         return false;
2964       *type1_out = *type2_out;
2965     }
2966
2967   if (*type2_out == NULL)
2968     {
2969       if (!int_fits_type_p (*rhs2_out, *type1_out))
2970         return false;
2971       *type2_out = *type1_out;
2972     }
2973
2974   /* Ensure that the larger of the two operands comes first. */
2975   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2976     {
2977       std::swap (*type1_out, *type2_out);
2978       std::swap (*rhs1_out, *rhs2_out);
2979     }
2980
2981   return true;
2982 }
2983
2984 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2985    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2986    value is true iff we converted the statement.  */
2987
2988 static bool
2989 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2990 {
2991   tree lhs, rhs1, rhs2, type, type1, type2;
2992   enum insn_code handler;
2993   machine_mode to_mode, from_mode, actual_mode;
2994   optab op;
2995   int actual_precision;
2996   location_t loc = gimple_location (stmt);
2997   bool from_unsigned1, from_unsigned2;
2998
2999   lhs = gimple_assign_lhs (stmt);
3000   type = TREE_TYPE (lhs);
3001   if (TREE_CODE (type) != INTEGER_TYPE)
3002     return false;
3003
3004   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3005     return false;
3006
3007   to_mode = TYPE_MODE (type);
3008   from_mode = TYPE_MODE (type1);
3009   from_unsigned1 = TYPE_UNSIGNED (type1);
3010   from_unsigned2 = TYPE_UNSIGNED (type2);
3011
3012   if (from_unsigned1 && from_unsigned2)
3013     op = umul_widen_optab;
3014   else if (!from_unsigned1 && !from_unsigned2)
3015     op = smul_widen_optab;
3016   else
3017     op = usmul_widen_optab;
3018
3019   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3020                                                   0, &actual_mode);
3021
3022   if (handler == CODE_FOR_nothing)
3023     {
3024       if (op != smul_widen_optab)
3025         {
3026           /* We can use a signed multiply with unsigned types as long as
3027              there is a wider mode to use, or it is the smaller of the two
3028              types that is unsigned.  Note that type1 >= type2, always.  */
3029           if ((TYPE_UNSIGNED (type1)
3030                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3031               || (TYPE_UNSIGNED (type2)
3032                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3033             {
3034               from_mode = GET_MODE_WIDER_MODE (from_mode);
3035               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3036                 return false;
3037             }
3038
3039           op = smul_widen_optab;
3040           handler = find_widening_optab_handler_and_mode (op, to_mode,
3041                                                           from_mode, 0,
3042                                                           &actual_mode);
3043
3044           if (handler == CODE_FOR_nothing)
3045             return false;
3046
3047           from_unsigned1 = from_unsigned2 = false;
3048         }
3049       else
3050         return false;
3051     }
3052
3053   /* Ensure that the inputs to the handler are in the correct precison
3054      for the opcode.  This will be the full mode size.  */
3055   actual_precision = GET_MODE_PRECISION (actual_mode);
3056   if (2 * actual_precision > TYPE_PRECISION (type))
3057     return false;
3058   if (actual_precision != TYPE_PRECISION (type1)
3059       || from_unsigned1 != TYPE_UNSIGNED (type1))
3060     rhs1 = build_and_insert_cast (gsi, loc,
3061                                   build_nonstandard_integer_type
3062                                     (actual_precision, from_unsigned1), rhs1);
3063   if (actual_precision != TYPE_PRECISION (type2)
3064       || from_unsigned2 != TYPE_UNSIGNED (type2))
3065     rhs2 = build_and_insert_cast (gsi, loc,
3066                                   build_nonstandard_integer_type
3067                                     (actual_precision, from_unsigned2), rhs2);
3068
3069   /* Handle constants.  */
3070   if (TREE_CODE (rhs1) == INTEGER_CST)
3071     rhs1 = fold_convert (type1, rhs1);
3072   if (TREE_CODE (rhs2) == INTEGER_CST)
3073     rhs2 = fold_convert (type2, rhs2);
3074
3075   gimple_assign_set_rhs1 (stmt, rhs1);
3076   gimple_assign_set_rhs2 (stmt, rhs2);
3077   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3078   update_stmt (stmt);
3079   widen_mul_stats.widen_mults_inserted++;
3080   return true;
3081 }
3082
3083 /* Process a single gimple statement STMT, which is found at the
3084    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
3085    rhs (given by CODE), and try to convert it into a
3086    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
3087    is true iff we converted the statement.  */
3088
3089 static bool
3090 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3091                             enum tree_code code)
3092 {
3093   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3094   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3095   tree type, type1, type2, optype;
3096   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3097   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3098   optab this_optab;
3099   enum tree_code wmult_code;
3100   enum insn_code handler;
3101   machine_mode to_mode, from_mode, actual_mode;
3102   location_t loc = gimple_location (stmt);
3103   int actual_precision;
3104   bool from_unsigned1, from_unsigned2;
3105
3106   lhs = gimple_assign_lhs (stmt);
3107   type = TREE_TYPE (lhs);
3108   if (TREE_CODE (type) != INTEGER_TYPE
3109       && TREE_CODE (type) != FIXED_POINT_TYPE)
3110     return false;
3111
3112   if (code == MINUS_EXPR)
3113     wmult_code = WIDEN_MULT_MINUS_EXPR;
3114   else
3115     wmult_code = WIDEN_MULT_PLUS_EXPR;
3116
3117   rhs1 = gimple_assign_rhs1 (stmt);
3118   rhs2 = gimple_assign_rhs2 (stmt);
3119
3120   if (TREE_CODE (rhs1) == SSA_NAME)
3121     {
3122       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3123       if (is_gimple_assign (rhs1_stmt))
3124         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3125     }
3126
3127   if (TREE_CODE (rhs2) == SSA_NAME)
3128     {
3129       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3130       if (is_gimple_assign (rhs2_stmt))
3131         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3132     }
3133
3134   /* Allow for one conversion statement between the multiply
3135      and addition/subtraction statement.  If there are more than
3136      one conversions then we assume they would invalidate this
3137      transformation.  If that's not the case then they should have
3138      been folded before now.  */
3139   if (CONVERT_EXPR_CODE_P (rhs1_code))
3140     {
3141       conv1_stmt = rhs1_stmt;
3142       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3143       if (TREE_CODE (rhs1) == SSA_NAME)
3144         {
3145           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3146           if (is_gimple_assign (rhs1_stmt))
3147             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3148         }
3149       else
3150         return false;
3151     }
3152   if (CONVERT_EXPR_CODE_P (rhs2_code))
3153     {
3154       conv2_stmt = rhs2_stmt;
3155       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3156       if (TREE_CODE (rhs2) == SSA_NAME)
3157         {
3158           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3159           if (is_gimple_assign (rhs2_stmt))
3160             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3161         }
3162       else
3163         return false;
3164     }
3165
3166   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3167      is_widening_mult_p, but we still need the rhs returns.
3168
3169      It might also appear that it would be sufficient to use the existing
3170      operands of the widening multiply, but that would limit the choice of
3171      multiply-and-accumulate instructions.
3172
3173      If the widened-multiplication result has more than one uses, it is
3174      probably wiser not to do the conversion.  */
3175   if (code == PLUS_EXPR
3176       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3177     {
3178       if (!has_single_use (rhs1)
3179           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3180                                   &type2, &mult_rhs2))
3181         return false;
3182       add_rhs = rhs2;
3183       conv_stmt = conv1_stmt;
3184     }
3185   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3186     {
3187       if (!has_single_use (rhs2)
3188           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3189                                   &type2, &mult_rhs2))
3190         return false;
3191       add_rhs = rhs1;
3192       conv_stmt = conv2_stmt;
3193     }
3194   else
3195     return false;
3196
3197   to_mode = TYPE_MODE (type);
3198   from_mode = TYPE_MODE (type1);
3199   from_unsigned1 = TYPE_UNSIGNED (type1);
3200   from_unsigned2 = TYPE_UNSIGNED (type2);
3201   optype = type1;
3202
3203   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
3204   if (from_unsigned1 != from_unsigned2)
3205     {
3206       if (!INTEGRAL_TYPE_P (type))
3207         return false;
3208       /* We can use a signed multiply with unsigned types as long as
3209          there is a wider mode to use, or it is the smaller of the two
3210          types that is unsigned.  Note that type1 >= type2, always.  */
3211       if ((from_unsigned1
3212            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3213           || (from_unsigned2
3214               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3215         {
3216           from_mode = GET_MODE_WIDER_MODE (from_mode);
3217           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3218             return false;
3219         }
3220
3221       from_unsigned1 = from_unsigned2 = false;
3222       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3223                                                false);
3224     }
3225
3226   /* If there was a conversion between the multiply and addition
3227      then we need to make sure it fits a multiply-and-accumulate.
3228      The should be a single mode change which does not change the
3229      value.  */
3230   if (conv_stmt)
3231     {
3232       /* We use the original, unmodified data types for this.  */
3233       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3234       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3235       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3236       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3237
3238       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3239         {
3240           /* Conversion is a truncate.  */
3241           if (TYPE_PRECISION (to_type) < data_size)
3242             return false;
3243         }
3244       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3245         {
3246           /* Conversion is an extend.  Check it's the right sort.  */
3247           if (TYPE_UNSIGNED (from_type) != is_unsigned
3248               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3249             return false;
3250         }
3251       /* else convert is a no-op for our purposes.  */
3252     }
3253
3254   /* Verify that the machine can perform a widening multiply
3255      accumulate in this mode/signedness combination, otherwise
3256      this transformation is likely to pessimize code.  */
3257   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3258   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3259                                                   from_mode, 0, &actual_mode);
3260
3261   if (handler == CODE_FOR_nothing)
3262     return false;
3263
3264   /* Ensure that the inputs to the handler are in the correct precison
3265      for the opcode.  This will be the full mode size.  */
3266   actual_precision = GET_MODE_PRECISION (actual_mode);
3267   if (actual_precision != TYPE_PRECISION (type1)
3268       || from_unsigned1 != TYPE_UNSIGNED (type1))
3269     mult_rhs1 = build_and_insert_cast (gsi, loc,
3270                                        build_nonstandard_integer_type
3271                                          (actual_precision, from_unsigned1),
3272                                        mult_rhs1);
3273   if (actual_precision != TYPE_PRECISION (type2)
3274       || from_unsigned2 != TYPE_UNSIGNED (type2))
3275     mult_rhs2 = build_and_insert_cast (gsi, loc,
3276                                        build_nonstandard_integer_type
3277                                          (actual_precision, from_unsigned2),
3278                                        mult_rhs2);
3279
3280   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3281     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3282
3283   /* Handle constants.  */
3284   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3285     mult_rhs1 = fold_convert (type1, mult_rhs1);
3286   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3287     mult_rhs2 = fold_convert (type2, mult_rhs2);
3288
3289   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3290                                   add_rhs);
3291   update_stmt (gsi_stmt (*gsi));
3292   widen_mul_stats.maccs_inserted++;
3293   return true;
3294 }
3295
3296 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3297    with uses in additions and subtractions to form fused multiply-add
3298    operations.  Returns true if successful and MUL_STMT should be removed.  */
3299
3300 static bool
3301 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3302 {
3303   tree mul_result = gimple_get_lhs (mul_stmt);
3304   tree type = TREE_TYPE (mul_result);
3305   gimple *use_stmt, *neguse_stmt;
3306   gassign *fma_stmt;
3307   use_operand_p use_p;
3308   imm_use_iterator imm_iter;
3309
3310   if (FLOAT_TYPE_P (type)
3311       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3312     return false;
3313
3314   /* We don't want to do bitfield reduction ops.  */
3315   if (INTEGRAL_TYPE_P (type)
3316       && (TYPE_PRECISION (type)
3317           != GET_MODE_PRECISION (TYPE_MODE (type))))
3318     return false;
3319
3320   /* If the target doesn't support it, don't generate it.  We assume that
3321      if fma isn't available then fms, fnma or fnms are not either.  */
3322   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3323     return false;
3324
3325   /* If the multiplication has zero uses, it is kept around probably because
3326      of -fnon-call-exceptions.  Don't optimize it away in that case,
3327      it is DCE job.  */
3328   if (has_zero_uses (mul_result))
3329     return false;
3330
3331   /* Make sure that the multiplication statement becomes dead after
3332      the transformation, thus that all uses are transformed to FMAs.
3333      This means we assume that an FMA operation has the same cost
3334      as an addition.  */
3335   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3336     {
3337       enum tree_code use_code;
3338       tree result = mul_result;
3339       bool negate_p = false;
3340
3341       use_stmt = USE_STMT (use_p);
3342
3343       if (is_gimple_debug (use_stmt))
3344         continue;
3345
3346       /* For now restrict this operations to single basic blocks.  In theory
3347          we would want to support sinking the multiplication in
3348          m = a*b;
3349          if ()
3350            ma = m + c;
3351          else
3352            d = m;
3353          to form a fma in the then block and sink the multiplication to the
3354          else block.  */
3355       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3356         return false;
3357
3358       if (!is_gimple_assign (use_stmt))
3359         return false;
3360
3361       use_code = gimple_assign_rhs_code (use_stmt);
3362
3363       /* A negate on the multiplication leads to FNMA.  */
3364       if (use_code == NEGATE_EXPR)
3365         {
3366           ssa_op_iter iter;
3367           use_operand_p usep;
3368
3369           result = gimple_assign_lhs (use_stmt);
3370
3371           /* Make sure the negate statement becomes dead with this
3372              single transformation.  */
3373           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3374                                &use_p, &neguse_stmt))
3375             return false;
3376
3377           /* Make sure the multiplication isn't also used on that stmt.  */
3378           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3379             if (USE_FROM_PTR (usep) == mul_result)
3380               return false;
3381
3382           /* Re-validate.  */
3383           use_stmt = neguse_stmt;
3384           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3385             return false;
3386           if (!is_gimple_assign (use_stmt))
3387             return false;
3388
3389           use_code = gimple_assign_rhs_code (use_stmt);
3390           negate_p = true;
3391         }
3392
3393       switch (use_code)
3394         {
3395         case MINUS_EXPR:
3396           if (gimple_assign_rhs2 (use_stmt) == result)
3397             negate_p = !negate_p;
3398           break;
3399         case PLUS_EXPR:
3400           break;
3401         default:
3402           /* FMA can only be formed from PLUS and MINUS.  */
3403           return false;
3404         }
3405
3406       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3407          by a MULT_EXPR that we'll visit later, we might be able to
3408          get a more profitable match with fnma.
3409          OTOH, if we don't, a negate / fma pair has likely lower latency
3410          than a mult / subtract pair.  */
3411       if (use_code == MINUS_EXPR && !negate_p
3412           && gimple_assign_rhs1 (use_stmt) == result
3413           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3414           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3415         {
3416           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3417
3418           if (TREE_CODE (rhs2) == SSA_NAME)
3419             {
3420               gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3421               if (has_single_use (rhs2)
3422                   && is_gimple_assign (stmt2)
3423                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3424               return false;
3425             }
3426         }
3427
3428       /* We can't handle a * b + a * b.  */
3429       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3430         return false;
3431
3432       /* While it is possible to validate whether or not the exact form
3433          that we've recognized is available in the backend, the assumption
3434          is that the transformation is never a loss.  For instance, suppose
3435          the target only has the plain FMA pattern available.  Consider
3436          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3437          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3438          still have 3 operations, but in the FMA form the two NEGs are
3439          independent and could be run in parallel.  */
3440     }
3441
3442   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3443     {
3444       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3445       enum tree_code use_code;
3446       tree addop, mulop1 = op1, result = mul_result;
3447       bool negate_p = false;
3448
3449       if (is_gimple_debug (use_stmt))
3450         continue;
3451
3452       use_code = gimple_assign_rhs_code (use_stmt);
3453       if (use_code == NEGATE_EXPR)
3454         {
3455           result = gimple_assign_lhs (use_stmt);
3456           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3457           gsi_remove (&gsi, true);
3458           release_defs (use_stmt);
3459
3460           use_stmt = neguse_stmt;
3461           gsi = gsi_for_stmt (use_stmt);
3462           use_code = gimple_assign_rhs_code (use_stmt);
3463           negate_p = true;
3464         }
3465
3466       if (gimple_assign_rhs1 (use_stmt) == result)
3467         {
3468           addop = gimple_assign_rhs2 (use_stmt);
3469           /* a * b - c -> a * b + (-c)  */
3470           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3471             addop = force_gimple_operand_gsi (&gsi,
3472                                               build1 (NEGATE_EXPR,
3473                                                       type, addop),
3474                                               true, NULL_TREE, true,
3475                                               GSI_SAME_STMT);
3476         }
3477       else
3478         {
3479           addop = gimple_assign_rhs1 (use_stmt);
3480           /* a - b * c -> (-b) * c + a */
3481           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3482             negate_p = !negate_p;
3483         }
3484
3485       if (negate_p)
3486         mulop1 = force_gimple_operand_gsi (&gsi,
3487                                            build1 (NEGATE_EXPR,
3488                                                    type, mulop1),
3489                                            true, NULL_TREE, true,
3490                                            GSI_SAME_STMT);
3491
3492       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3493                                       FMA_EXPR, mulop1, op2, addop);
3494       gsi_replace (&gsi, fma_stmt, true);
3495       widen_mul_stats.fmas_inserted++;
3496     }
3497
3498   return true;
3499 }
3500
3501 /* Find integer multiplications where the operands are extended from
3502    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3503    where appropriate.  Also try plus/minus widening and forming FMAs.  */
3504
3505 namespace {
3506
3507 const pass_data pass_data_optimize_widening_mul =
3508 { /* Descriptor handed to the gimple_opt_pass constructor below.  */
3509   GIMPLE_PASS, /* type */
3510   "widening_mul", /* name */
3511   OPTGROUP_NONE, /* optinfo_flags: none */
3512   TV_NONE, /* tv_id: none */
3513   PROP_ssa, /* properties_required */
3514   0, /* properties_provided: none */
3515   0, /* properties_destroyed: none */
3516   0, /* todo_flags_start: none */
3517   TODO_update_ssa, /* todo_flags_finish */
3518 };
3519
3520 class pass_optimize_widening_mul : public gimple_opt_pass
3521 { /* Pass object built from pass_data_optimize_widening_mul.  */
3522 public:
3523   pass_optimize_widening_mul (gcc::context *ctxt)
3524     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3525   {} /* Nothing to set up beyond the base-class constructor.  */
3526
3527   /* opt_pass methods: gate is defined inline, execute out of line.  */
3528   virtual bool gate (function *)
3529     {
3530       return flag_expensive_optimizations && optimize; /* Both must be nonzero.  */
3531     }
3532
3533   virtual unsigned int execute (function *); /* Defined after the class.  */
3534
3535 }; // class pass_optimize_widening_mul
3536
3537 unsigned int /* TODO_cleanup_cfg if cfg_changed got set, otherwise 0.  */
3538 pass_optimize_widening_mul::execute (function *fun) /* Scans every block of FUN.  */
3539 {
3540   basic_block bb;
3541   bool cfg_changed = false; /* Set only on the pow path below.  */
3542
3543   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats)); /* Start from zeroed counters.  */
3544
3545   FOR_EACH_BB_FN (bb, fun)
3546     {
3547       gimple_stmt_iterator gsi;
3548
3549       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) /* No increment here; see gsi_next at the bottom.  */
3550         {
3551           gimple *stmt = gsi_stmt (gsi);
3552           enum tree_code code;
3553
3554           if (is_gimple_assign (stmt))
3555             {
3556               code = gimple_assign_rhs_code (stmt);
3557               switch (code)
3558                 {
3559                 case MULT_EXPR:
3560                   if (!convert_mult_to_widen (stmt, &gsi) /* FMA is tried only if widening returned false.  */
3561                       && convert_mult_to_fma (stmt,
3562                                               gimple_assign_rhs1 (stmt),
3563                                               gimple_assign_rhs2 (stmt)))
3564                     {
3565                       gsi_remove (&gsi, true); /* All uses became FMAs, so drop the multiplication.  */
3566                       release_defs (stmt);
3567                       continue; /* Skips gsi_next: the statement at gsi was just removed.  */
3568                     }
3569                   break;
3570
3571                 case PLUS_EXPR:
3572                 case MINUS_EXPR:
3573                   convert_plusminus_to_widen (&gsi, stmt, code); /* Return value is not used here.  */
3574                   break;
3575
3576                 default:;
3577                 }
3578             }
3579           else if (is_gimple_call (stmt) /* Only calls that have an lhs are considered.  */
3580                    && gimple_call_lhs (stmt))
3581             {
3582               tree fndecl = gimple_call_fndecl (stmt);
3583               if (fndecl
3584                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3585                 {
3586                   switch (DECL_FUNCTION_CODE (fndecl))
3587                     {
3588                       case BUILT_IN_POWF:
3589                       case BUILT_IN_POW:
3590                       case BUILT_IN_POWL:
3591                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST /* Exponent must be a constant equal to dconst2.  */
3592                             && real_equal
3593                                  (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3594                                   &dconst2)
3595                             && convert_mult_to_fma (stmt, /* The call is then handled as arg0 * arg0.  */
3596                                                     gimple_call_arg (stmt, 0),
3597                                                     gimple_call_arg (stmt, 0)))
3598                           {
3599                             unlink_stmt_vdef (stmt);
3600                             if (gsi_remove (&gsi, true) /* Both calls must return true to flag a CFG change.  */
3601                                 && gimple_purge_dead_eh_edges (bb))
3602                               cfg_changed = true;
3603                             release_defs (stmt);
3604                             continue; /* Skips gsi_next: the call at gsi was just removed.  */
3605                           }
3606                           break;
3607
3608                       default:;
3609                     }
3610                 }
3611             }
3612           gsi_next (&gsi); /* Reached only when stmt was not removed above.  */
3613         }
3614     }
3615
3616   statistics_counter_event (fun, "widening multiplications inserted", /* Report the three counters for FUN.  */
3617                             widen_mul_stats.widen_mults_inserted);
3618   statistics_counter_event (fun, "widening maccs inserted",
3619                             widen_mul_stats.maccs_inserted);
3620   statistics_counter_event (fun, "fused multiply-adds inserted",
3621                             widen_mul_stats.fmas_inserted);
3622
3623   return cfg_changed ? TODO_cleanup_cfg : 0;
3624 }
3625
3626 } // anon namespace
3627
3628 gimple_opt_pass * /* Newly allocated pass object; not freed here.  */
3629 make_pass_optimize_widening_mul (gcc::context *ctxt) /* CTXT is forwarded to the constructor.  */
3630 {
3631   return new pass_optimize_widening_mul (ctxt);
3632 }