* tree-ssa-phiopt.c, config/arm/arm.c, config/fr30/fr30.md,
[official-gcc.git] / gcc / modulo-sched.c
blob4ea4ed803e92b77cb590bfe32769a8adcc7a4f33
1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
21 02111-1307, USA. */
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
51 #ifdef INSN_SCHEDULING
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime--sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
77 up).
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
88 /* This page defines partial-schedule structures and functions for
89 modulo scheduling. */
/* Forward typedefs for the partial-schedule data structures defined below.  */
91 typedef struct partial_schedule *partial_schedule_ptr;
92 typedef struct ps_insn *ps_insn_ptr;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
/* NOTE(review): assumes y > 0 (y is always the positive II here).  */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
/* I.e. ceil ((max_cycle - min_cycle + 1) / ii).  */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
108 /* A single instruction in the partial schedule. */
109 struct ps_insn
111 /* The corresponding DDG_NODE. */
112 ddg_node_ptr node;
114 /* The (absolute) cycle in which the PS instruction is scheduled.
115 Same as SCHED_TIME (node). */
116 int cycle;
118 /* The next/prev PS_INSN in the same row. */
119 ps_insn_ptr next_in_row,
120 prev_in_row;
122 /* The number of nodes in the same row that come after this node. */
123 int row_rest_count;
126 /* Holds the partial schedule as an array of II rows. Each entry of the
127 array points to a linked list of PS_INSNs, which represents the
128 instructions that are scheduled for that row. */
129 struct partial_schedule
131 int ii; /* Number of rows in the partial schedule. */
132 int history; /* Threshold for conflict checking using DFA. */
134 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
135 ps_insn_ptr *rows;
137 /* The earliest absolute cycle of an insn in the partial schedule. */
138 int min_cycle;
140 /* The latest absolute cycle of an insn in the partial schedule. */
141 int max_cycle;
143 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
146 /* We use this to record all the register replacements we do in
147 the kernel so we can undo SMS if it is not profitable. */
/* Singly-linked list node: one entry per register replacement performed
   by generate_reg_moves, consumed by undo_generate_reg_moves.  */
148 struct undo_replace_buff_elem
150 rtx insn;
151 rtx orig_reg;
152 rtx new_reg;
153 struct undo_replace_buff_elem *next;
/* Prototypes for the partial-schedule manipulation routines.  */
158 partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
159 void free_partial_schedule (partial_schedule_ptr);
160 void reset_partial_schedule (partial_schedule_ptr, int new_ii);
161 void print_partial_schedule (partial_schedule_ptr, FILE *);
162 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
163 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
164 ddg_node_ptr node, int cycle,
165 sbitmap must_precede,
166 sbitmap must_follow);
167 static void rotate_partial_schedule (partial_schedule_ptr, int);
168 void set_row_column_for_ps (partial_schedule_ptr);
169 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172 /* This page defines constants and structures for the modulo scheduling
173 driver. */
175 /* As in haifa-sched.c: */
176 /* issue_rate is the number of insns that can be scheduled in the same
177 machine cycle. It can be defined in the config/mach/mach.h file,
178 otherwise we set it to 1. */
180 static int issue_rate;
182 /* For printing statistics. */
/* NOTE(review): aliases dump_file once sms_schedule runs; NULL before that.  */
183 static FILE *stats_file;
/* Prototypes for the SMS driver routines defined later in this file.  */
185 static int sms_order_nodes (ddg_ptr, int, int * result);
186 static void set_node_sched_params (ddg_ptr);
187 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
188 int *, FILE*);
189 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
190 static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
191 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
192 int from_stage, int to_stage,
193 int is_prolog);
/* Accessors for the node_sched_params entry hung off each DDG node's
   aux.info field (set up by set_node_sched_params).  */
195 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
196 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
197 #define SCHED_FIRST_REG_MOVE(x) \
198 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
199 #define SCHED_NREG_MOVES(x) \
200 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
201 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
202 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
203 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
205 /* The scheduling parameters held for each node. */
206 typedef struct node_sched_params
208 int asap; /* A lower-bound on the absolute scheduling cycle. */
209 int time; /* The absolute scheduling cycle (time >= asap). */
211 /* The following field (first_reg_move) is a pointer to the first
212 register-move instruction added to handle the modulo-variable-expansion
213 of the register defined by this node. This register-move copies the
214 original register defined by the node. */
215 rtx first_reg_move;
217 /* The number of register-move instructions added, immediately preceding
218 first_reg_move. */
219 int nreg_moves;
221 int row; /* Holds time % ii. */
222 int stage; /* Holds time / ii. */
224 /* The column of a node inside the ps. If nodes u, v are on the same row,
225 u will precede v if column (u) < column (v). */
226 int column;
227 } *node_sched_params_ptr;
230 /* The following three functions are copied from the current scheduler
231 code in order to use sched_analyze() for computing the dependencies.
232 They are used when initializing the sched_info structure. */
233 static const char *
234 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
236 static char tmp[80];
238 sprintf (tmp, "i%4d", INSN_UID (insn));
239 return tmp;
242 static int
243 contributes_to_priority (rtx next, rtx insn)
245 return BLOCK_NUM (next) == BLOCK_NUM (insn);
248 static void
249 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
250 regset cond_exec ATTRIBUTE_UNUSED,
251 regset used ATTRIBUTE_UNUSED,
252 regset set ATTRIBUTE_UNUSED)
/* Scheduler hook table handed to sched_init by sms_schedule.  Only the
   print/priority/jump-dep hooks are provided; the rest are NULL.
   NOTE(review): the meaning of each positional NULL depends on the field
   order of struct sched_info in sched-int.h — confirm there before
   adding hooks.  */
256 static struct sched_info sms_sched_info =
258 NULL,
259 NULL,
260 NULL,
261 NULL,
262 NULL,
263 sms_print_insn,
264 contributes_to_priority,
265 compute_jump_reg_dependencies,
266 NULL, NULL,
267 NULL, NULL,
268 0, 0, 0
272 /* Return the register decremented and tested or zero if it is not a decrement
273 and branch jump insn (similar to doloop_condition_get). */
274 static rtx
275 doloop_register_get (rtx insn, rtx *comp)
277 rtx pattern, cmp, inc, reg, condition;
279 if (!JUMP_P (insn))
280 return NULL_RTX;
281 pattern = PATTERN (insn);
283 /* The canonical doloop pattern we expect is:
285 (parallel [(set (pc) (if_then_else (condition)
286 (label_ref (label))
287 (pc)))
288 (set (reg) (plus (reg) (const_int -1)))
289 (additional clobbers and uses)])
291 where condition is further restricted to be
292 (ne (reg) (const_int 1)). */
294 if (GET_CODE (pattern) != PARALLEL)
295 return NULL_RTX;
297 cmp = XVECEXP (pattern, 0, 0);
298 inc = XVECEXP (pattern, 0, 1);
299 /* Return the compare rtx. */
300 *comp = cmp;
302 /* Check for (set (reg) (something)). */
303 if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
304 return NULL_RTX;
306 /* Extract loop counter register. */
307 reg = SET_DEST (inc);
309 /* Check if something = (plus (reg) (const_int -1)). */
310 if (GET_CODE (SET_SRC (inc)) != PLUS
311 || XEXP (SET_SRC (inc), 0) != reg
312 || XEXP (SET_SRC (inc), 1) != constm1_rtx)
313 return NULL_RTX;
315 /* Check for (set (pc) (if_then_else (condition)
316 (label_ref (label))
317 (pc))). */
318 if (GET_CODE (cmp) != SET
319 || SET_DEST (cmp) != pc_rtx
320 || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
321 || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
322 || XEXP (SET_SRC (cmp), 2) != pc_rtx)
323 return NULL_RTX;
325 /* Extract loop termination condition. */
326 condition = XEXP (SET_SRC (cmp), 0);
328 /* Check if condition = (ne (reg) (const_int 1)), which is more
329 restrictive than the check in doloop_condition_get:
330 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
331 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
332 if (GET_CODE (condition) != NE
333 || XEXP (condition, 1) != const1_rtx)
334 return NULL_RTX;
336 if (XEXP (condition, 0) == reg)
337 return reg;
339 return NULL_RTX;
342 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
343 that the number of iterations is a compile-time constant. If so,
344 return the rtx that sets COUNT_REG to a constant, and set COUNT to
345 this constant. Otherwise return 0. */
346 static rtx
347 const_iteration_count (rtx count_reg, basic_block pre_header,
348 HOST_WIDEST_INT * count)
350 rtx insn;
351 rtx head, tail;
353 if (! pre_header)
354 return NULL_RTX;
356 get_block_head_tail (pre_header->index, &head, &tail);
358 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
359 if (INSN_P (insn) && single_set (insn) &&
360 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
362 rtx pat = single_set (insn);
364 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
366 *count = INTVAL (SET_SRC (pat));
367 return insn;
370 return NULL_RTX;
373 return NULL_RTX;
376 /* A very simple resource-based lower bound on the initiation interval.
377 ??? Improve the accuracy of this bound by considering the
378 utilization of various units. */
379 static int
380 res_MII (ddg_ptr g)
382 return (g->num_nodes / issue_rate);
386 /* Points to the array that contains the sched data for each node. */
/* One entry per DDG node; allocated in set_node_sched_params.  */
387 static node_sched_params_ptr node_sched_params;
389 /* Allocate sched_params for each node and initialize it. Assumes that
390 the aux field of each node contain the asap bound (computed earlier),
391 and copies it into the sched_params field. */
392 static void
393 set_node_sched_params (ddg_ptr g)
395 int i;
397 /* Allocate for each node in the DDG a place to hold the "sched_data". */
398 /* Initialize ASAP/ALAP/HIGHT to zero. */
399 node_sched_params = (node_sched_params_ptr)
400 xcalloc (g->num_nodes,
401 sizeof (struct node_sched_params));
403 /* Set the pointer of the general data of the node to point to the
404 appropriate sched_params structure. */
405 for (i = 0; i < g->num_nodes; i++)
407 /* Watch out for aliasing problems? */
408 node_sched_params[i].asap = g->nodes[i].aux.count;
409 g->nodes[i].aux.info = &node_sched_params[i];
413 static void
414 print_node_sched_params (FILE * dump_file, int num_nodes)
416 int i;
418 if (! dump_file)
419 return;
420 for (i = 0; i < num_nodes; i++)
422 node_sched_params_ptr nsp = &node_sched_params[i];
423 rtx reg_move = nsp->first_reg_move;
424 int j;
426 fprintf (dump_file, "Node %d:\n", i);
427 fprintf (dump_file, " asap = %d:\n", nsp->asap);
428 fprintf (dump_file, " time = %d:\n", nsp->time);
429 fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
430 for (j = 0; j < nsp->nreg_moves; j++)
432 fprintf (dump_file, " reg_move = ");
433 print_rtl_single (dump_file, reg_move);
434 reg_move = PREV_INSN (reg_move);
439 /* Calculate an upper bound for II. SMS should not schedule the loop if it
440 requires more cycles than this bound. Currently set to the sum of the
441 longest latency edge for each node. Reset based on experiments. */
442 static int
443 calculate_maxii (ddg_ptr g)
445 int i;
446 int maxii = 0;
448 for (i = 0; i < g->num_nodes; i++)
450 ddg_node_ptr u = &g->nodes[i];
451 ddg_edge_ptr e;
452 int max_edge_latency = 0;
454 for (e = u->out; e; e = e->next_out)
455 max_edge_latency = MAX (max_edge_latency, e->latency);
457 maxii += max_edge_latency;
459 return maxii;
463 Breaking intra-loop register anti-dependences:
464 Each intra-loop register anti-dependence implies a cross-iteration true
465 dependence of distance 1. Therefore, we can remove such false dependencies
466 and figure out if the partial schedule broke them by checking if (for a
467 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
468 if so generate a register move. The number of such moves is equal to:
469 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
470 nreg_moves = ----------------------------------- + 1 - { dependence.
471 ii { 1 if not.
473 static struct undo_replace_buff_elem *
474 generate_reg_moves (partial_schedule_ptr ps)
476 ddg_ptr g = ps->g;
477 int ii = ps->ii;
478 int i;
479 struct undo_replace_buff_elem *reg_move_replaces = NULL;
481 for (i = 0; i < g->num_nodes; i++)
483 ddg_node_ptr u = &g->nodes[i];
484 ddg_edge_ptr e;
485 int nreg_moves = 0, i_reg_move;
486 sbitmap *uses_of_defs;
487 rtx last_reg_move;
488 rtx prev_reg, old_reg;
490 /* Compute the number of reg_moves needed for u, by looking at life
491 ranges started at u (excluding self-loops). */
492 for (e = u->out; e; e = e->next_out)
493 if (e->type == TRUE_DEP && e->dest != e->src)
495 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
497 if (e->distance == 1)
498 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
500 /* If dest precedes src in the schedule of the kernel, then dest
501 will read before src writes and we can save one reg_copy. */
502 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
503 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
504 nreg_moves4e--;
506 nreg_moves = MAX (nreg_moves, nreg_moves4e);
509 if (nreg_moves == 0)
510 continue;
512 /* Every use of the register defined by node may require a different
513 copy of this register, depending on the time the use is scheduled.
514 Set a bitmap vector, telling which nodes use each copy of this
515 register. */
516 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
517 sbitmap_vector_zero (uses_of_defs, nreg_moves);
518 for (e = u->out; e; e = e->next_out)
519 if (e->type == TRUE_DEP && e->dest != e->src)
521 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
523 if (e->distance == 1)
524 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
526 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
527 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
528 dest_copy--;
530 if (dest_copy)
531 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
534 /* Now generate the reg_moves, attaching relevant uses to them. */
535 SCHED_NREG_MOVES (u) = nreg_moves;
536 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
537 last_reg_move = u->insn;
539 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
541 int i_use;
542 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
543 rtx reg_move = gen_move_insn (new_reg, prev_reg);
545 add_insn_before (reg_move, last_reg_move);
546 last_reg_move = reg_move;
548 if (!SCHED_FIRST_REG_MOVE (u))
549 SCHED_FIRST_REG_MOVE (u) = reg_move;
551 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
553 struct undo_replace_buff_elem *rep;
555 rep = (struct undo_replace_buff_elem *)
556 xcalloc (1, sizeof (struct undo_replace_buff_elem));
557 rep->insn = g->nodes[i_use].insn;
558 rep->orig_reg = old_reg;
559 rep->new_reg = new_reg;
561 if (! reg_move_replaces)
562 reg_move_replaces = rep;
563 else
565 rep->next = reg_move_replaces;
566 reg_move_replaces = rep;
569 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
572 prev_reg = new_reg;
575 return reg_move_replaces;
578 /* We call this when we want to undo the SMS schedule for a given loop.
579 One of the things that we do is to delete the register moves generated
580 for the sake of SMS; this function deletes the register move instructions
581 recorded in the undo buffer. */
582 static void
583 undo_generate_reg_moves (partial_schedule_ptr ps,
584 struct undo_replace_buff_elem *reg_move_replaces)
586 int i,j;
588 for (i = 0; i < ps->g->num_nodes; i++)
590 ddg_node_ptr u = &ps->g->nodes[i];
591 rtx prev;
592 rtx crr = SCHED_FIRST_REG_MOVE (u);
594 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
596 prev = PREV_INSN (crr);
597 delete_insn (crr);
598 crr = prev;
602 while (reg_move_replaces)
604 struct undo_replace_buff_elem *rep = reg_move_replaces;
606 reg_move_replaces = reg_move_replaces->next;
607 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
611 /* Free memory allocated for the undo buffer. */
612 static void
613 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
616 while (reg_move_replaces)
618 struct undo_replace_buff_elem *rep = reg_move_replaces;
620 reg_move_replaces = reg_move_replaces->next;
621 free (rep);
625 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
626 of SCHED_ROW and SCHED_STAGE. */
627 static void
628 normalize_sched_times (partial_schedule_ptr ps)
630 int i;
631 ddg_ptr g = ps->g;
632 int amount = PS_MIN_CYCLE (ps);
633 int ii = ps->ii;
635 /* Don't include the closing branch assuming that it is the last node. */
636 for (i = 0; i < g->num_nodes - 1; i++)
638 ddg_node_ptr u = &g->nodes[i];
639 int normalized_time = SCHED_TIME (u) - amount;
641 if (normalized_time < 0)
642 abort ();
644 SCHED_TIME (u) = normalized_time;
645 SCHED_ROW (u) = normalized_time % ii;
646 SCHED_STAGE (u) = normalized_time / ii;
650 /* Set SCHED_COLUMN of each node according to its position in PS. */
651 static void
652 set_columns_for_ps (partial_schedule_ptr ps)
654 int row;
656 for (row = 0; row < ps->ii; row++)
658 ps_insn_ptr cur_insn = ps->rows[row];
659 int column = 0;
661 for (; cur_insn; cur_insn = cur_insn->next_in_row)
662 SCHED_COLUMN (cur_insn->node) = column++;
666 /* Permute the insns according to their order in PS, from row 0 to
667 row ii-1, and position them right before LAST. This schedules
668 the insns of the loop kernel. */
669 static void
670 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
672 int ii = ps->ii;
673 int row;
674 ps_insn_ptr ps_ij;
676 for (row = 0; row < ii ; row++)
677 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
678 if (PREV_INSN (last) != ps_ij->node->insn)
679 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
680 PREV_INSN (last));
683 /* As part of undoing SMS we return to the original ordering of the
684 instructions inside the loop kernel. Given the partial schedule PS, this
685 function returns the ordering of the instruction according to their CUID
686 in the DDG (PS->G), which is the original order of the instruction before
687 performing SMS. */
688 static void
689 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
691 int i;
693 for (i = 0 ; i < ps->g->num_nodes; i++)
694 if (last == ps->g->nodes[i].insn
695 || last == ps->g->nodes[i].first_note)
696 break;
697 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
698 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
699 PREV_INSN (last));
702 /* Used to generate the prologue & epilogue. Duplicate the subset of
703 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
704 of both), together with a prefix/suffix of their reg_moves. */
705 static void
706 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
707 int to_stage, int for_prolog)
709 int row;
710 ps_insn_ptr ps_ij;
712 for (row = 0; row < ps->ii; row++)
713 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
715 ddg_node_ptr u_node = ps_ij->node;
716 int j, i_reg_moves;
717 rtx reg_move = NULL_RTX;
719 if (for_prolog)
721 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
722 number of reg_moves starting with the second occurrence of
723 u_node, which is generated if its SCHED_STAGE <= to_stage. */
724 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
725 i_reg_moves = MAX (i_reg_moves, 0);
726 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
728 /* The reg_moves start from the *first* reg_move backwards. */
729 if (i_reg_moves)
731 reg_move = SCHED_FIRST_REG_MOVE (u_node);
732 for (j = 1; j < i_reg_moves; j++)
733 reg_move = PREV_INSN (reg_move);
736 else /* It's for the epilog. */
738 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
739 starting to decrease one stage after u_node no longer occurs;
740 that is, generate all reg_moves until
741 SCHED_STAGE (u_node) == from_stage - 1. */
742 i_reg_moves = SCHED_NREG_MOVES (u_node)
743 - (from_stage - SCHED_STAGE (u_node) - 1);
744 i_reg_moves = MAX (i_reg_moves, 0);
745 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
747 /* The reg_moves start from the *last* reg_move forwards. */
748 if (i_reg_moves)
750 reg_move = SCHED_FIRST_REG_MOVE (u_node);
751 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
752 reg_move = PREV_INSN (reg_move);
756 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
757 emit_insn (copy_rtx (PATTERN (reg_move)));
758 if (SCHED_STAGE (u_node) >= from_stage
759 && SCHED_STAGE (u_node) <= to_stage)
760 duplicate_insn_chain (u_node->first_note, u_node->insn);
765 /* Generate the instructions (including reg_moves) for prolog & epilog. */
766 static void
767 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
769 int i;
770 int last_stage = PS_STAGE_COUNT (ps) - 1;
771 edge e;
773 /* Generate the prolog, inserting its insns on the loop-entry edge. */
774 start_sequence ();
776 if (count_reg)
777 /* Generate a subtract instruction at the beginning of the prolog to
778 adjust the loop count by STAGE_COUNT. */
779 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
781 for (i = 0; i < last_stage; i++)
782 duplicate_insns_of_cycles (ps, 0, i, 1);
784 /* Put the prolog , on the one and only entry edge. */
785 e = loop_preheader_edge (loop);
786 loop_split_edge_with(e , get_insns());
788 end_sequence ();
790 /* Generate the epilog, inserting its insns on the loop-exit edge. */
791 start_sequence ();
793 for (i = 0; i < last_stage; i++)
794 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
796 /* Put the epilogue on the one and only one exit edge. */
797 gcc_assert (loop->single_exit);
798 e = loop->single_exit;
799 loop_split_edge_with(e , get_insns());
800 end_sequence ();
803 /* Return the line note insn preceding INSN, for debugging. Taken from
804 emit-rtl.c. */
805 static rtx
806 find_line_note (rtx insn)
808 for (; insn; insn = PREV_INSN (insn))
809 if (NOTE_P (insn)
810 && NOTE_LINE_NUMBER (insn) >= 0)
811 break;
813 return insn;
816 /* Return true if all the BBs of the loop are empty except the
817 loop header. */
818 static bool
819 loop_single_full_bb_p (struct loop *loop)
821 unsigned i;
822 basic_block *bbs = get_loop_body (loop);
824 for (i = 0; i < loop->num_nodes ; i++)
826 rtx head, tail;
827 bool empty_bb = true;
829 if (bbs[i] == loop->header)
830 continue;
832 /* Make sure that basic blocks other than the header
833 have only notes labels or jumps. */
834 get_block_head_tail (bbs[i]->index, &head, &tail);
835 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
837 if (NOTE_P (head) || LABEL_P (head)
838 || (INSN_P (head) && JUMP_P (head)))
839 continue;
840 empty_bb = false;
841 break;
844 if (! empty_bb)
846 free (bbs);
847 return false;
850 free (bbs);
851 return true;
854 /* A simple loop from SMS point of view; it is a loop that is composed of
855 either a single basic block or two BBs - a header and a latch. */
/* The latch (if separate) must have exactly one predecessor and one
   successor.  */
856 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
857 && (EDGE_COUNT (loop->latch->preds) == 1) \
858 && (EDGE_COUNT (loop->latch->succs) == 1))
860 /* Return true if the loop is in its canonical form and false if not.
861 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
862 static bool
863 loop_canon_p (struct loop *loop, FILE *dump_file)
866 if (loop->inner || ! loop->outer)
867 return false;
869 if (!loop->single_exit)
871 if (dump_file)
873 rtx line_note = find_line_note (BB_END (loop->header));
875 fprintf (dump_file, "SMS loop many exits ");
876 if (line_note)
878 expanded_location xloc;
879 NOTE_EXPANDED_LOCATION (xloc, line_note);
880 fprintf (stats_file, " %s %d (file, line)\n",
881 xloc.file, xloc.line);
884 return false;
887 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
889 if (dump_file)
891 rtx line_note = find_line_note (BB_END (loop->header));
893 fprintf (dump_file, "SMS loop many BBs. ");
894 if (line_note)
896 expanded_location xloc;
897 NOTE_EXPANDED_LOCATION (xloc, line_note);
898 fprintf (stats_file, " %s %d (file, line)\n",
899 xloc.file, xloc.line);
902 return false;
905 return true;
908 /* If there are more than one entry for the loop,
909 make it one by splitting the first entry edge and
910 redirecting the others to the new BB. */
911 static void
912 canon_loop (struct loop *loop)
914 edge e;
915 edge_iterator i;
917 /* Avoid annoying special cases of edges going to exit
918 block. */
919 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
920 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
921 loop_split_edge_with (e, NULL_RTX);
923 if (loop->latch == loop->header
924 || EDGE_COUNT (loop->latch->succs) > 1)
926 FOR_EACH_EDGE (e, i, loop->header->preds)
927 if (e->src == loop->latch)
928 break;
929 loop_split_edge_with (e, NULL_RTX);
933 /* Build the loop information without loop
934 canonization, the loop canonization will
935 be performed if the loop is SMSable. */
936 static struct loops *
937 build_loops_structure (FILE *dumpfile)
939 struct loops *loops = xcalloc (1, sizeof (struct loops));
941 /* Find the loops. */
943 if (flow_loops_find (loops) <= 1)
945 /* No loops. */
946 flow_loops_free (loops);
947 free (loops);
949 return NULL;
952 /* Not going to update these. */
953 free (loops->cfg.rc_order);
954 loops->cfg.rc_order = NULL;
955 free (loops->cfg.dfs_order);
956 loops->cfg.dfs_order = NULL;
958 create_preheaders (loops, CP_SIMPLE_PREHEADERS);
959 mark_single_exit_loops (loops);
960 /* Dump loops. */
961 flow_loops_dump (loops, dumpfile, NULL, 1);
963 #ifdef ENABLE_CHECKING
964 verify_dominators (CDI_DOMINATORS);
965 verify_loop_structure (loops);
966 #endif
968 return loops;
971 /* Main entry point, perform SMS scheduling on the loops of the function
972 that consist of single basic blocks. */
973 void
974 sms_schedule (FILE *dump_file)
976 static int passes = 0;
977 rtx insn;
978 ddg_ptr *g_arr, g;
979 int * node_order;
980 int maxii;
981 unsigned i,num_loops;
982 partial_schedule_ptr ps;
983 struct df *df;
984 struct loops *loops;
985 basic_block bb = NULL;
986 /* vars to the versioning only if needed*/
987 struct loop * nloop;
988 basic_block condition_bb = NULL;
989 edge latch_edge;
990 gcov_type trip_count = 0;
992 if (! (loops = build_loops_structure (dump_file)))
993 return; /* There is no loops to schedule. */
996 stats_file = dump_file;
998 /* Initialize issue_rate. */
999 if (targetm.sched.issue_rate)
1001 int temp = reload_completed;
1003 reload_completed = 1;
1004 issue_rate = (*targetm.sched.issue_rate) ();
1005 reload_completed = temp;
1007 else
1008 issue_rate = 1;
1010 /* Initialize the scheduler. */
1011 current_sched_info = &sms_sched_info;
1012 sched_init (NULL);
1014 /* Init Data Flow analysis, to be used in interloop dep calculation. */
1015 df = df_init ();
1016 df_analyze (df, 0, DF_ALL);
1018 /* Allocate memory to hold the DDG array one entry for each loop.
1019 We use loop->num as index into this array. */
1020 g_arr = xcalloc (loops->num, sizeof (ddg_ptr));
1023 /* Build DDGs for all the relevant loops and hold them in G_ARR
1024 indexed by the loop index. */
1025 for (i = 0; i < loops->num; i++)
1027 rtx head, tail;
1028 rtx count_reg, comp;
1029 struct loop *loop = loops->parray[i];
1031 /* For debugging. */
1032 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
1034 if (dump_file)
1035 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
1037 break;
1040 if (! loop_canon_p (loop, dump_file))
1041 continue;
1043 if (! loop_single_full_bb_p (loop))
1044 continue;
1046 bb = loop->header;
1048 get_block_head_tail (bb->index, &head, &tail);
1049 latch_edge = loop_latch_edge (loop);
1050 gcc_assert (loop->single_exit);
1051 if (loop->single_exit->count)
1052 trip_count = latch_edge->count / loop->single_exit->count;
1054 /* Perfrom SMS only on loops that their average count is above threshold. */
1056 if ( latch_edge->count
1057 && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1059 if (stats_file)
1061 rtx line_note = find_line_note (tail);
1063 if (line_note)
1065 expanded_location xloc;
1066 NOTE_EXPANDED_LOCATION (xloc, line_note);
1067 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1068 xloc.file, xloc.line);
1070 fprintf (stats_file, "SMS single-bb-loop\n");
1071 if (profile_info && flag_branch_probabilities)
1073 fprintf (stats_file, "SMS loop-count ");
1074 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1075 (HOST_WIDEST_INT) bb->count);
1076 fprintf (stats_file, "\n");
1077 fprintf (stats_file, "SMS trip-count ");
1078 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1079 (HOST_WIDEST_INT) trip_count);
1080 fprintf (stats_file, "\n");
1081 fprintf (stats_file, "SMS profile-sum-max ");
1082 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1083 (HOST_WIDEST_INT) profile_info->sum_max);
1084 fprintf (stats_file, "\n");
1087 continue;
1090 /* Make sure this is a doloop. */
1091 if ( !(count_reg = doloop_register_get (tail, &comp)))
1092 continue;
1094 /* Don't handle BBs with calls or barriers, or !single_set insns. */
1095 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1096 if (CALL_P (insn)
1097 || BARRIER_P (insn)
1098 || (INSN_P (insn) && !JUMP_P (insn)
1099 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1100 break;
1102 if (insn != NEXT_INSN (tail))
1104 if (stats_file)
1106 if (CALL_P (insn))
1107 fprintf (stats_file, "SMS loop-with-call\n");
1108 else if (BARRIER_P (insn))
1109 fprintf (stats_file, "SMS loop-with-barrier\n");
1110 else
1111 fprintf (stats_file, "SMS loop-with-not-single-set\n");
1112 print_rtl_single (stats_file, insn);
1115 continue;
1118 if (! (g = create_ddg (bb, df, 0)))
1120 if (stats_file)
1121 fprintf (stats_file, "SMS doloop\n");
1122 continue;
1125 g_arr[i] = g;
1128 /* Release Data Flow analysis data structures. */
1129 df_finish (df);
1131 /* We don't want to perform SMS on new loops - created by versioning. */
1132 num_loops = loops->num;
1133 /* Go over the built DDGs and perfrom SMS for each one of them. */
1134 for (i = 0; i < num_loops; i++)
1136 rtx head, tail;
1137 rtx count_reg, count_init, comp;
1138 int mii, rec_mii;
1139 unsigned stage_count = 0;
1140 HOST_WIDEST_INT loop_count = 0;
1141 struct loop *loop = loops->parray[i];
1143 if (! (g = g_arr[i]))
1144 continue;
1146 if (dump_file)
1147 print_ddg (dump_file, g);
1149 get_block_head_tail (loop->header->index, &head, &tail);
1151 latch_edge = loop_latch_edge (loop);
1152 gcc_assert (loop->single_exit);
1153 if (loop->single_exit->count)
1154 trip_count = latch_edge->count / loop->single_exit->count;
1156 if (stats_file)
1158 rtx line_note = find_line_note (tail);
1160 if (line_note)
1162 expanded_location xloc;
1163 NOTE_EXPANDED_LOCATION (xloc, line_note);
1164 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1165 xloc.file, xloc.line);
1167 fprintf (stats_file, "SMS single-bb-loop\n");
1168 if (profile_info && flag_branch_probabilities)
1170 fprintf (stats_file, "SMS loop-count ");
1171 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1172 (HOST_WIDEST_INT) bb->count);
1173 fprintf (stats_file, "\n");
1174 fprintf (stats_file, "SMS profile-sum-max ");
1175 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1176 (HOST_WIDEST_INT) profile_info->sum_max);
1177 fprintf (stats_file, "\n");
1179 fprintf (stats_file, "SMS doloop\n");
1180 fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1181 fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1182 fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
      /* If the loop has a doloop register, it gets special
         handling.  */
1188 count_init = NULL_RTX;
1189 if ((count_reg = doloop_register_get (tail, &comp)))
1191 basic_block pre_header;
1193 pre_header = loop_preheader_edge (loop)->src;
1194 count_init = const_iteration_count (count_reg, pre_header,
1195 &loop_count);
1197 gcc_assert (count_reg);
1199 if (stats_file && count_init)
1201 fprintf (stats_file, "SMS const-doloop ");
1202 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1203 loop_count);
1204 fprintf (stats_file, "\n");
1207 node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1209 mii = 1; /* Need to pass some estimate of mii. */
1210 rec_mii = sms_order_nodes (g, mii, node_order);
1211 mii = MAX (res_MII (g), rec_mii);
1212 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1214 if (stats_file)
1215 fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1216 rec_mii, mii, maxii);
1218 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1219 ASAP. */
1220 set_node_sched_params (g);
1222 ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1224 if (ps)
1225 stage_count = PS_STAGE_COUNT (ps);
1227 /* Stage count of 1 means that there is no interleaving between
1228 iterations, let the scheduling passes do the job. */
1229 if (stage_count < 1
1230 || (count_init && (loop_count <= stage_count))
1231 || (flag_branch_probabilities && (trip_count <= stage_count)))
1233 if (dump_file)
1235 fprintf (dump_file, "SMS failed... \n");
1236 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1237 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1238 fprintf (dump_file, ", trip-count=");
1239 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1240 fprintf (dump_file, ")\n");
1242 continue;
1244 else
1246 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1247 int new_cycles;
1248 struct undo_replace_buff_elem *reg_move_replaces;
1250 if (stats_file)
1252 fprintf (stats_file,
1253 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1254 stage_count);
1255 print_partial_schedule (ps, stats_file);
1256 fprintf (stats_file,
1257 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1258 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1261 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1262 the closing_branch was scheduled and should appear in the last (ii-1)
1263 row. Otherwise, we are free to schedule the branch, and we let nodes
1264 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1265 row; this should reduce stage_count to minimum. */
1266 normalize_sched_times (ps);
1267 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1268 set_columns_for_ps (ps);
1270 /* Generate the kernel just to be able to measure its cycles. */
1271 permute_partial_schedule (ps, g->closing_branch->first_note);
1272 reg_move_replaces = generate_reg_moves (ps);
1274 /* Get the number of cycles the new kernel expect to execute in. */
1275 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1277 /* Get back to the original loop so we can do loop versioning. */
1278 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1279 if (reg_move_replaces)
1280 undo_generate_reg_moves (ps, reg_move_replaces);
1282 if ( new_cycles >= orig_cycles)
1284 /* SMS is not profitable so undo the permutation and reg move generation
1285 and return the kernel to its original state. */
1286 if (dump_file)
1287 fprintf (dump_file, "Undoing SMS becuase it is not profitable.\n");
1290 else
1292 canon_loop (loop);
              /* In case the BCT count is not known at compile time, do
                 loop versioning.  */
1295 if (count_reg && ! count_init)
1297 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1298 GEN_INT(stage_count));
1300 nloop = loop_version (loops, loop, comp_rtx, &condition_bb);
1303 /* Set new iteration count of loop kernel. */
1304 if (count_reg && count_init)
1305 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1306 - stage_count + 1);
1308 /* Now apply the scheduled kernel to the RTL of the loop. */
1309 permute_partial_schedule (ps, g->closing_branch->first_note);
1311 /* Mark this loop as software pipelined so the later
1312 scheduling passes doesn't touch it. */
1313 if (! flag_resched_modulo_sched)
1314 g->bb->flags |= BB_DISABLE_SCHEDULE;
1315 /* The life-info is not valid any more. */
1316 g->bb->flags |= BB_DIRTY;
1318 reg_move_replaces = generate_reg_moves (ps);
1319 if (dump_file)
1320 print_node_sched_params (dump_file, g->num_nodes);
1321 /* Generate prolog and epilog. */
1322 if (count_reg && !count_init)
1323 generate_prolog_epilog (ps, loop, count_reg);
1324 else
1325 generate_prolog_epilog (ps, loop, NULL_RTX);
1327 free_undo_replace_buff (reg_move_replaces);
1330 free_partial_schedule (ps);
1331 free (node_sched_params);
1332 free (node_order);
1333 free_ddg (g);
1336 /* Release scheduler data, needed until now because of DFA. */
1337 sched_finish ();
1338 loop_optimizer_finalize (loops, dump_file);
1341 /* The SMS scheduling algorithm itself
1342 -----------------------------------
1343 Input: 'O' an ordered list of insns of a loop.
1344 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1346 'Q' is the empty Set
1347 'PS' is the partial schedule; it holds the currently scheduled nodes with
1348 their cycle/slot.
1349 'PSP' previously scheduled predecessors.
1350 'PSS' previously scheduled successors.
1351 't(u)' the cycle where u is scheduled.
1352 'l(u)' is the latency of u.
1353 'd(v,u)' is the dependence distance from v to u.
1354 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1355 the node ordering phase.
1356 'check_hardware_resources_conflicts(u, PS, c)'
1357 run a trace around cycle/slot through DFA model
1358 to check resource conflicts involving instruction u
1359 at cycle c given the partial schedule PS.
1360 'add_to_partial_schedule_at_time(u, PS, c)'
1361 Add the node/instruction u to the partial schedule
1362 PS at time c.
1363 'calculate_register_pressure(PS)'
1364 Given a schedule of instructions, calculate the register
1365 pressure it implies. One implementation could be the
1366 maximum number of overlapping live ranges.
1367 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1368 registers available in the hardware.
1370 1. II = MII.
1371 2. PS = empty list
1372 3. for each node u in O in pre-computed order
1373 4. if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1375 6. start = Early_start; end = Early_start + II - 1; step = 1
1376 11. else if (PSP(u) == Q && PSS(u) != Q) then
1377 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1378 13. start = Late_start; end = Late_start - II + 1; step = -1
1379 14. else if (PSP(u) != Q && PSS(u) != Q) then
1380 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1381 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1382 17. start = Early_start;
1383 18. end = min(Early_start + II - 1 , Late_start);
1384 19. step = 1
1385 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1386 21. start = ASAP(u); end = start + II - 1; step = 1
1387 22. endif
1389 23. success = false
1390 24. for (c = start ; c != end ; c += step)
1391 25. if check_hardware_resources_conflicts(u, PS, c) then
1392 26. add_to_partial_schedule_at_time(u, PS, c)
1393 27. success = true
1394 28. break
1395 29. endif
1396 30. endfor
1397 31. if (success == false) then
1398 32. II = II + 1
1399 33. if (II > maxII) then
1400 34. finish - failed to schedule
1401 35. endif
1402 36. goto 2.
1403 37. endif
1404 38. endfor
1405 39. if (calculate_register_pressure(PS) > maxRP) then
1406 40. goto 32.
1407 41. endif
1408 42. compute epilogue & prologue
1409 43. finish - succeeded to schedule
1412 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1413 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1414 set to 0 to save compile time. */
1415 #define DFA_HISTORY SMS_DFA_HISTORY
1417 /* Given the partial schedule PS, this function calculates and returns the
1418 cycles in which we can schedule the node with the given index I.
1419 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1420 noticed that there are several cases in which we fail to SMS the loop
1421 because the sched window of a node is empty due to tight data-deps. In
1422 such cases we want to unschedule some of the predecessors/successors
1423 until we get non-empty scheduling window. It returns -1 if the
1424 scheduling window is empty and zero otherwise. */
1426 static int
1427 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1428 sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1430 int start, step, end;
1431 ddg_edge_ptr e;
1432 int u = nodes_order [i];
1433 ddg_node_ptr u_node = &ps->g->nodes[u];
1434 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1435 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1436 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1437 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1438 int psp_not_empty;
1439 int pss_not_empty;
1441 /* 1. compute sched window for u (start, end, step). */
1442 sbitmap_zero (psp);
1443 sbitmap_zero (pss);
1444 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1445 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1447 if (psp_not_empty && !pss_not_empty)
1449 int early_start = INT_MIN;
1451 end = INT_MAX;
1452 for (e = u_node->in; e != 0; e = e->next_in)
1454 ddg_node_ptr v_node = e->src;
1455 if (TEST_BIT (sched_nodes, v_node->cuid))
1457 int node_st = SCHED_TIME (v_node)
1458 + e->latency - (e->distance * ii);
1460 early_start = MAX (early_start, node_st);
1462 if (e->data_type == MEM_DEP)
1463 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1466 start = early_start;
1467 end = MIN (end, early_start + ii);
1468 step = 1;
1471 else if (!psp_not_empty && pss_not_empty)
1473 int late_start = INT_MAX;
1475 end = INT_MIN;
1476 for (e = u_node->out; e != 0; e = e->next_out)
1478 ddg_node_ptr v_node = e->dest;
1479 if (TEST_BIT (sched_nodes, v_node->cuid))
1481 late_start = MIN (late_start,
1482 SCHED_TIME (v_node) - e->latency
1483 + (e->distance * ii));
1484 if (e->data_type == MEM_DEP)
1485 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1488 start = late_start;
1489 end = MAX (end, late_start - ii);
1490 step = -1;
1493 else if (psp_not_empty && pss_not_empty)
1495 int early_start = INT_MIN;
1496 int late_start = INT_MAX;
1498 start = INT_MIN;
1499 end = INT_MAX;
1500 for (e = u_node->in; e != 0; e = e->next_in)
1502 ddg_node_ptr v_node = e->src;
1504 if (TEST_BIT (sched_nodes, v_node->cuid))
1506 early_start = MAX (early_start,
1507 SCHED_TIME (v_node) + e->latency
1508 - (e->distance * ii));
1509 if (e->data_type == MEM_DEP)
1510 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1513 for (e = u_node->out; e != 0; e = e->next_out)
1515 ddg_node_ptr v_node = e->dest;
1517 if (TEST_BIT (sched_nodes, v_node->cuid))
1519 late_start = MIN (late_start,
1520 SCHED_TIME (v_node) - e->latency
1521 + (e->distance * ii));
1522 if (e->data_type == MEM_DEP)
1523 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1526 start = MAX (start, early_start);
1527 end = MIN (end, MIN (early_start + ii, late_start + 1));
1528 step = 1;
1530 else /* psp is empty && pss is empty. */
1532 start = SCHED_ASAP (u_node);
1533 end = start + ii;
1534 step = 1;
1537 *start_p = start;
1538 *step_p = step;
1539 *end_p = end;
1540 sbitmap_free (psp);
1541 sbitmap_free (pss);
1543 if ((start >= end && step == 1) || (start <= end && step == -1))
1544 return -1;
1545 else
1546 return 0;
1549 /* This function implements the scheduling algorithm for SMS according to the
1550 above algorithm. */
1551 static partial_schedule_ptr
1552 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1554 int ii = mii;
1555 int i, c, success;
1556 int try_again_with_larger_ii = true;
1557 int num_nodes = g->num_nodes;
1558 ddg_edge_ptr e;
1559 int start, end, step; /* Place together into one struct? */
1560 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1561 sbitmap must_precede = sbitmap_alloc (num_nodes);
1562 sbitmap must_follow = sbitmap_alloc (num_nodes);
1563 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1565 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1567 sbitmap_ones (tobe_scheduled);
1568 sbitmap_zero (sched_nodes);
1570 while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1571 || try_again_with_larger_ii ) && ii < maxii)
1573 int j;
1574 bool unscheduled_nodes = false;
1576 if (dump_file)
1577 fprintf(dump_file, "Starting with ii=%d\n", ii);
1578 if (try_again_with_larger_ii)
1580 try_again_with_larger_ii = false;
1581 sbitmap_zero (sched_nodes);
1584 for (i = 0; i < num_nodes; i++)
1586 int u = nodes_order[i];
1587 ddg_node_ptr u_node = &ps->g->nodes[u];
1588 rtx insn = u_node->insn;
1590 if (!INSN_P (insn))
1592 RESET_BIT (tobe_scheduled, u);
1593 continue;
1596 if (JUMP_P (insn)) /* Closing branch handled later. */
1598 RESET_BIT (tobe_scheduled, u);
1599 continue;
1602 if (TEST_BIT (sched_nodes, u))
1603 continue;
1605 /* Try to get non-empty scheduling window. */
1606 j = i;
1607 while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1608 && j > 0)
1610 unscheduled_nodes = true;
1611 if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1612 || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1614 ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1615 RESET_BIT (sched_nodes, nodes_order [j - 1]);
1617 j--;
1619 if (j < 0)
1621 /* ??? Try backtracking instead of immediately ii++? */
1622 ii++;
1623 try_again_with_larger_ii = true;
1624 reset_partial_schedule (ps, ii);
1625 break;
1627 /* 2. Try scheduling u in window. */
1628 if (dump_file)
1629 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1630 u, start, end, step);
1632 /* use must_follow & must_precede bitmaps to determine order
1633 of nodes within the cycle. */
1634 sbitmap_zero (must_precede);
1635 sbitmap_zero (must_follow);
1636 for (e = u_node->in; e != 0; e = e->next_in)
1637 if (TEST_BIT (sched_nodes, e->src->cuid)
1638 && e->latency == (ii * e->distance)
1639 && start == SCHED_TIME (e->src))
1640 SET_BIT (must_precede, e->src->cuid);
1642 for (e = u_node->out; e != 0; e = e->next_out)
1643 if (TEST_BIT (sched_nodes, e->dest->cuid)
1644 && e->latency == (ii * e->distance)
1645 && end == SCHED_TIME (e->dest))
1646 SET_BIT (must_follow, e->dest->cuid);
1648 success = 0;
1649 if ((step > 0 && start < end) || (step < 0 && start > end))
1650 for (c = start; c != end; c += step)
1652 ps_insn_ptr psi;
1654 psi = ps_add_node_check_conflicts (ps, u_node, c,
1655 must_precede,
1656 must_follow);
1658 if (psi)
1660 SCHED_TIME (u_node) = c;
1661 SET_BIT (sched_nodes, u);
1662 success = 1;
1663 if (dump_file)
1664 fprintf(dump_file, "Schedule in %d\n", c);
1665 break;
1668 if (!success)
1670 /* ??? Try backtracking instead of immediately ii++? */
1671 ii++;
1672 try_again_with_larger_ii = true;
1673 reset_partial_schedule (ps, ii);
1674 break;
1676 if (unscheduled_nodes)
1677 break;
1679 /* ??? If (success), check register pressure estimates. */
1680 } /* Continue with next node. */
1681 } /* While try_again_with_larger_ii. */
1683 sbitmap_free (sched_nodes);
1685 if (ii >= maxii)
1687 free_partial_schedule (ps);
1688 ps = NULL;
1690 return ps;
1694 /* This page implements the algorithm for ordering the nodes of a DDG
1695 for modulo scheduling, activated through the
1696 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1698 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1699 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1700 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1701 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1702 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1703 #define DEPTH(x) (ASAP ((x)))
1705 typedef struct node_order_params * nopa;
1707 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1708 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1709 static nopa calculate_order_params (ddg_ptr, int mii);
1710 static int find_max_asap (ddg_ptr, sbitmap);
1711 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1712 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1714 enum sms_direction {BOTTOMUP, TOPDOWN};
1716 struct node_order_params
1718 int asap;
1719 int alap;
1720 int height;
1723 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1724 static void
1725 check_nodes_order (int *node_order, int num_nodes)
1727 int i;
1728 sbitmap tmp = sbitmap_alloc (num_nodes);
1730 sbitmap_zero (tmp);
1732 for (i = 0; i < num_nodes; i++)
1734 int u = node_order[i];
1736 if (u >= num_nodes || u < 0 || TEST_BIT (tmp, u))
1737 abort ();
1739 SET_BIT (tmp, u);
1742 sbitmap_free (tmp);
1745 /* Order the nodes of G for scheduling and pass the result in
1746 NODE_ORDER. Also set aux.count of each node to ASAP.
1747 Return the recMII for the given DDG. */
1748 static int
1749 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1751 int i;
1752 int rec_mii = 0;
1753 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1755 nopa nops = calculate_order_params (g, mii);
1757 order_nodes_of_sccs (sccs, node_order);
1759 if (sccs->num_sccs > 0)
1760 /* First SCC has the largest recurrence_length. */
1761 rec_mii = sccs->sccs[0]->recurrence_length;
1763 /* Save ASAP before destroying node_order_params. */
1764 for (i = 0; i < g->num_nodes; i++)
1766 ddg_node_ptr v = &g->nodes[i];
1767 v->aux.count = ASAP (v);
1770 free (nops);
1771 free_ddg_all_sccs (sccs);
1772 check_nodes_order (node_order, g->num_nodes);
1774 return rec_mii;
1777 static void
1778 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1780 int i, pos = 0;
1781 ddg_ptr g = all_sccs->ddg;
1782 int num_nodes = g->num_nodes;
1783 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1784 sbitmap on_path = sbitmap_alloc (num_nodes);
1785 sbitmap tmp = sbitmap_alloc (num_nodes);
1786 sbitmap ones = sbitmap_alloc (num_nodes);
1788 sbitmap_zero (prev_sccs);
1789 sbitmap_ones (ones);
1791 /* Perfrom the node ordering starting from the SCC with the highest recMII.
1792 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1793 for (i = 0; i < all_sccs->num_sccs; i++)
1795 ddg_scc_ptr scc = all_sccs->sccs[i];
1797 /* Add nodes on paths from previous SCCs to the current SCC. */
1798 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1799 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1801 /* Add nodes on paths from the current SCC to previous SCCs. */
1802 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1803 sbitmap_a_or_b (tmp, tmp, on_path);
1805 /* Remove nodes of previous SCCs from current extended SCC. */
1806 sbitmap_difference (tmp, tmp, prev_sccs);
1808 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1809 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1812 /* Handle the remaining nodes that do not belong to any scc. Each call
1813 to order_nodes_in_scc handles a single connected component. */
1814 while (pos < g->num_nodes)
1816 sbitmap_difference (tmp, ones, prev_sccs);
1817 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1819 sbitmap_free (prev_sccs);
1820 sbitmap_free (on_path);
1821 sbitmap_free (tmp);
1822 sbitmap_free (ones);
1825 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1826 static struct node_order_params *
1827 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1829 int u;
1830 int max_asap;
1831 int num_nodes = g->num_nodes;
1832 ddg_edge_ptr e;
1833 /* Allocate a place to hold ordering params for each node in the DDG. */
1834 nopa node_order_params_arr;
1836 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1837 node_order_params_arr = (nopa) xcalloc (num_nodes,
1838 sizeof (struct node_order_params));
1840 /* Set the aux pointer of each node to point to its order_params structure. */
1841 for (u = 0; u < num_nodes; u++)
1842 g->nodes[u].aux.info = &node_order_params_arr[u];
1844 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1845 calculate ASAP, ALAP, mobility, distance, and height for each node
1846 in the dependence (direct acyclic) graph. */
1848 /* We assume that the nodes in the array are in topological order. */
1850 max_asap = 0;
1851 for (u = 0; u < num_nodes; u++)
1853 ddg_node_ptr u_node = &g->nodes[u];
1855 ASAP (u_node) = 0;
1856 for (e = u_node->in; e; e = e->next_in)
1857 if (e->distance == 0)
1858 ASAP (u_node) = MAX (ASAP (u_node),
1859 ASAP (e->src) + e->latency);
1860 max_asap = MAX (max_asap, ASAP (u_node));
1863 for (u = num_nodes - 1; u > -1; u--)
1865 ddg_node_ptr u_node = &g->nodes[u];
1867 ALAP (u_node) = max_asap;
1868 HEIGHT (u_node) = 0;
1869 for (e = u_node->out; e; e = e->next_out)
1870 if (e->distance == 0)
1872 ALAP (u_node) = MIN (ALAP (u_node),
1873 ALAP (e->dest) - e->latency);
1874 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1875 HEIGHT (e->dest) + e->latency);
1879 return node_order_params_arr;
1882 static int
1883 find_max_asap (ddg_ptr g, sbitmap nodes)
1885 int u;
1886 int max_asap = -1;
1887 int result = -1;
1889 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1891 ddg_node_ptr u_node = &g->nodes[u];
1893 if (max_asap < ASAP (u_node))
1895 max_asap = ASAP (u_node);
1896 result = u;
1899 return result;
1902 static int
1903 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1905 int u;
1906 int max_hv = -1;
1907 int min_mob = INT_MAX;
1908 int result = -1;
1910 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1912 ddg_node_ptr u_node = &g->nodes[u];
1914 if (max_hv < HEIGHT (u_node))
1916 max_hv = HEIGHT (u_node);
1917 min_mob = MOB (u_node);
1918 result = u;
1920 else if ((max_hv == HEIGHT (u_node))
1921 && (min_mob > MOB (u_node)))
1923 min_mob = MOB (u_node);
1924 result = u;
1927 return result;
1930 static int
1931 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1933 int u;
1934 int max_dv = -1;
1935 int min_mob = INT_MAX;
1936 int result = -1;
1938 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1940 ddg_node_ptr u_node = &g->nodes[u];
1942 if (max_dv < DEPTH (u_node))
1944 max_dv = DEPTH (u_node);
1945 min_mob = MOB (u_node);
1946 result = u;
1948 else if ((max_dv == DEPTH (u_node))
1949 && (min_mob > MOB (u_node)))
1951 min_mob = MOB (u_node);
1952 result = u;
1955 return result;
1958 /* Places the nodes of SCC into the NODE_ORDER array starting
1959 at position POS, according to the SMS ordering algorithm.
1960 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1961 the NODE_ORDER array, starting from position zero. */
1962 static int
1963 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1964 int * node_order, int pos)
1966 enum sms_direction dir;
1967 int num_nodes = g->num_nodes;
1968 sbitmap workset = sbitmap_alloc (num_nodes);
1969 sbitmap tmp = sbitmap_alloc (num_nodes);
1970 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1971 sbitmap predecessors = sbitmap_alloc (num_nodes);
1972 sbitmap successors = sbitmap_alloc (num_nodes);
1974 sbitmap_zero (predecessors);
1975 find_predecessors (predecessors, g, nodes_ordered);
1977 sbitmap_zero (successors);
1978 find_successors (successors, g, nodes_ordered);
1980 sbitmap_zero (tmp);
1981 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1983 sbitmap_copy (workset, tmp);
1984 dir = BOTTOMUP;
1986 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1988 sbitmap_copy (workset, tmp);
1989 dir = TOPDOWN;
1991 else
1993 int u;
1995 sbitmap_zero (workset);
1996 if ((u = find_max_asap (g, scc)) >= 0)
1997 SET_BIT (workset, u);
1998 dir = BOTTOMUP;
2001 sbitmap_zero (zero_bitmap);
2002 while (!sbitmap_equal (workset, zero_bitmap))
2004 int v;
2005 ddg_node_ptr v_node;
2006 sbitmap v_node_preds;
2007 sbitmap v_node_succs;
2009 if (dir == TOPDOWN)
2011 while (!sbitmap_equal (workset, zero_bitmap))
2013 v = find_max_hv_min_mob (g, workset);
2014 v_node = &g->nodes[v];
2015 node_order[pos++] = v;
2016 v_node_succs = NODE_SUCCESSORS (v_node);
2017 sbitmap_a_and_b (tmp, v_node_succs, scc);
2019 /* Don't consider the already ordered successors again. */
2020 sbitmap_difference (tmp, tmp, nodes_ordered);
2021 sbitmap_a_or_b (workset, workset, tmp);
2022 RESET_BIT (workset, v);
2023 SET_BIT (nodes_ordered, v);
2025 dir = BOTTOMUP;
2026 sbitmap_zero (predecessors);
2027 find_predecessors (predecessors, g, nodes_ordered);
2028 sbitmap_a_and_b (workset, predecessors, scc);
2030 else
2032 while (!sbitmap_equal (workset, zero_bitmap))
2034 v = find_max_dv_min_mob (g, workset);
2035 v_node = &g->nodes[v];
2036 node_order[pos++] = v;
2037 v_node_preds = NODE_PREDECESSORS (v_node);
2038 sbitmap_a_and_b (tmp, v_node_preds, scc);
2040 /* Don't consider the already ordered predecessors again. */
2041 sbitmap_difference (tmp, tmp, nodes_ordered);
2042 sbitmap_a_or_b (workset, workset, tmp);
2043 RESET_BIT (workset, v);
2044 SET_BIT (nodes_ordered, v);
2046 dir = TOPDOWN;
2047 sbitmap_zero (successors);
2048 find_successors (successors, g, nodes_ordered);
2049 sbitmap_a_and_b (workset, successors, scc);
2052 sbitmap_free (tmp);
2053 sbitmap_free (workset);
2054 sbitmap_free (zero_bitmap);
2055 sbitmap_free (predecessors);
2056 sbitmap_free (successors);
2057 return pos;
2061 /* This page contains functions for manipulating partial-schedules during
2062 modulo scheduling. */
2064 /* Create a partial schedule and allocate a memory to hold II rows. */
2065 partial_schedule_ptr
2066 create_partial_schedule (int ii, ddg_ptr g, int history)
2068 partial_schedule_ptr ps = (partial_schedule_ptr)
2069 xmalloc (sizeof (struct partial_schedule));
2070 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2071 ps->ii = ii;
2072 ps->history = history;
2073 ps->min_cycle = INT_MAX;
2074 ps->max_cycle = INT_MIN;
2075 ps->g = g;
2077 return ps;
2080 /* Free the PS_INSNs in rows array of the given partial schedule.
2081 ??? Consider caching the PS_INSN's. */
2082 static void
2083 free_ps_insns (partial_schedule_ptr ps)
2085 int i;
2087 for (i = 0; i < ps->ii; i++)
2089 while (ps->rows[i])
2091 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2093 free (ps->rows[i]);
2094 ps->rows[i] = ps_insn;
2096 ps->rows[i] = NULL;
2100 /* Free all the memory allocated to the partial schedule. */
2101 void
2102 free_partial_schedule (partial_schedule_ptr ps)
2104 if (!ps)
2105 return;
2106 free_ps_insns (ps);
2107 free (ps->rows);
2108 free (ps);
2111 /* Clear the rows array with its PS_INSNs, and create a new one with
2112 NEW_II rows. */
2113 void
2114 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2116 if (!ps)
2117 return;
2118 free_ps_insns (ps);
2119 if (new_ii == ps->ii)
2120 return;
2121 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2122 * sizeof (ps_insn_ptr));
2123 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2124 ps->ii = new_ii;
2125 ps->min_cycle = INT_MAX;
2126 ps->max_cycle = INT_MIN;
2129 /* Prints the partial schedule as an ii rows array, for each rows
2130 print the ids of the insns in it. */
2131 void
2132 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2134 int i;
2136 for (i = 0; i < ps->ii; i++)
2138 ps_insn_ptr ps_i = ps->rows[i];
2140 fprintf (dump, "\n[CYCLE %d ]: ", i);
2141 while (ps_i)
2143 fprintf (dump, "%d, ",
2144 INSN_UID (ps_i->node->insn));
2145 ps_i = ps_i->next_in_row;
2150 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2151 static ps_insn_ptr
2152 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2154 ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
2156 ps_i->node = node;
2157 ps_i->next_in_row = NULL;
2158 ps_i->prev_in_row = NULL;
2159 ps_i->row_rest_count = rest_count;
2160 ps_i->cycle = cycle;
2162 return ps_i;
2166 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2167 node is not found in the partial schedule, else returns true. */
2168 static bool
2169 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2171 int row;
2173 if (!ps || !ps_i)
2174 return false;
2176 row = SMODULO (ps_i->cycle, ps->ii);
2177 if (! ps_i->prev_in_row)
2179 if (ps_i != ps->rows[row])
2180 return false;
2182 ps->rows[row] = ps_i->next_in_row;
2183 if (ps->rows[row])
2184 ps->rows[row]->prev_in_row = NULL;
2186 else
2188 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2189 if (ps_i->next_in_row)
2190 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2192 free (ps_i);
2193 return true;
2196 /* Unlike what literature describes for modulo scheduling (which focuses
2197 on VLIW machines) the order of the instructions inside a cycle is
2198 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2199 where the current instruction should go relative to the already
2200 scheduled instructions in the given cycle. Go over these
2201 instructions and find the first possible column to put it in. */
2202 static bool
2203 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2204 sbitmap must_precede, sbitmap must_follow)
2206 ps_insn_ptr next_ps_i;
2207 ps_insn_ptr first_must_follow = NULL;
2208 ps_insn_ptr last_must_precede = NULL;
2209 int row;
2211 if (! ps_i)
2212 return false;
2214 row = SMODULO (ps_i->cycle, ps->ii);
2216 /* Find the first must follow and the last must precede
2217 and insert the node immediately after the must precede
2218 but make sure that it there is no must follow after it. */
2219 for (next_ps_i = ps->rows[row];
2220 next_ps_i;
2221 next_ps_i = next_ps_i->next_in_row)
2223 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2224 && ! first_must_follow)
2225 first_must_follow = next_ps_i;
2226 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2228 /* If we have already met a node that must follow, then
2229 there is no possible column. */
2230 if (first_must_follow)
2231 return false;
2232 else
2233 last_must_precede = next_ps_i;
2237 /* Now insert the node after INSERT_AFTER_PSI. */
2239 if (! last_must_precede)
2241 ps_i->next_in_row = ps->rows[row];
2242 ps_i->prev_in_row = NULL;
2243 if (ps_i->next_in_row)
2244 ps_i->next_in_row->prev_in_row = ps_i;
2245 ps->rows[row] = ps_i;
2247 else
2249 ps_i->next_in_row = last_must_precede->next_in_row;
2250 last_must_precede->next_in_row = ps_i;
2251 ps_i->prev_in_row = last_must_precede;
2252 if (ps_i->next_in_row)
2253 ps_i->next_in_row->prev_in_row = ps_i;
2256 return true;
2259 /* Advances the PS_INSN one column in its current row; returns false
2260 in failure and true in success. Bit N is set in MUST_FOLLOW if
2261 the node with cuid N must be come after the node pointed to by
2262 PS_I when scheduled in the same cycle. */
2263 static int
2264 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2265 sbitmap must_follow)
2267 ps_insn_ptr prev, next;
2268 int row;
2269 ddg_node_ptr next_node;
2271 if (!ps || !ps_i)
2272 return false;
2274 row = SMODULO (ps_i->cycle, ps->ii);
2276 if (! ps_i->next_in_row)
2277 return false;
2279 next_node = ps_i->next_in_row->node;
2281 /* Check if next_in_row is dependent on ps_i, both having same sched
2282 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2283 if (TEST_BIT (must_follow, next_node->cuid))
2284 return false;
2286 /* Advance PS_I over its next_in_row in the doubly linked list. */
2287 prev = ps_i->prev_in_row;
2288 next = ps_i->next_in_row;
2290 if (ps_i == ps->rows[row])
2291 ps->rows[row] = next;
2293 ps_i->next_in_row = next->next_in_row;
2295 if (next->next_in_row)
2296 next->next_in_row->prev_in_row = ps_i;
2298 next->next_in_row = ps_i;
2299 ps_i->prev_in_row = next;
2301 next->prev_in_row = prev;
2302 if (prev)
2303 prev->next_in_row = next;
2305 return true;
2308 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2309 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2310 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2311 before/after (respectively) the node pointed to by PS_I when scheduled
2312 in the same cycle. */
2313 static ps_insn_ptr
2314 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2315 sbitmap must_precede, sbitmap must_follow)
2317 ps_insn_ptr ps_i;
2318 int rest_count = 1;
2319 int row = SMODULO (cycle, ps->ii);
2321 if (ps->rows[row]
2322 && ps->rows[row]->row_rest_count >= issue_rate)
2323 return NULL;
2325 if (ps->rows[row])
2326 rest_count += ps->rows[row]->row_rest_count;
2328 ps_i = create_ps_insn (node, rest_count, cycle);
2330 /* Finds and inserts PS_I according to MUST_FOLLOW and
2331 MUST_PRECEDE. */
2332 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2334 free (ps_i);
2335 return NULL;
2338 return ps_i;
2341 /* Advance time one cycle. Assumes DFA is being used. */
2342 static void
2343 advance_one_cycle (void)
2345 if (targetm.sched.dfa_pre_cycle_insn)
2346 state_transition (curr_state,
2347 (*targetm.sched.dfa_pre_cycle_insn) ());
2349 state_transition (curr_state, NULL);
2351 if (targetm.sched.dfa_post_cycle_insn)
2352 state_transition (curr_state,
2353 (*targetm.sched.dfa_post_cycle_insn) ());
2356 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2357 the number of cycles according to DFA that the kernel fits in,
2358 we use this to check if we done well with SMS after we add
2359 register moves. In some cases register moves overhead makes
2360 it even worse than the original loop. We want SMS to be performed
2361 when it gives less cycles after register moves are added. */
2362 static int
2363 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2365 int cycles = 0;
2366 rtx insn;
2367 int can_issue_more = issue_rate;
2369 state_reset (curr_state);
2371 for (insn = first_insn;
2372 insn != NULL_RTX && insn != last_insn;
2373 insn = NEXT_INSN (insn))
2375 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2376 continue;
2378 /* Check if there is room for the current insn. */
2379 if (!can_issue_more || state_dead_lock_p (curr_state))
2381 cycles ++;
2382 advance_one_cycle ();
2383 can_issue_more = issue_rate;
2386 /* Update the DFA state and return with failure if the DFA found
2387 recource conflicts. */
2388 if (state_transition (curr_state, insn) >= 0)
2390 cycles ++;
2391 advance_one_cycle ();
2392 can_issue_more = issue_rate;
2395 if (targetm.sched.variable_issue)
2396 can_issue_more =
2397 (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
2398 insn, can_issue_more);
2399 /* A naked CLOBBER or USE generates no instruction, so don't
2400 let them consume issue slots. */
2401 else if (GET_CODE (PATTERN (insn)) != USE
2402 && GET_CODE (PATTERN (insn)) != CLOBBER)
2403 can_issue_more--;
2405 return cycles;
2408 /* Checks if PS has resource conflicts according to DFA, starting from
2409 FROM cycle to TO cycle; returns true if there are conflicts and false
2410 if there are no conflicts. Assumes DFA is being used. */
2411 static int
2412 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2414 int cycle;
2416 state_reset (curr_state);
2418 for (cycle = from; cycle <= to; cycle++)
2420 ps_insn_ptr crr_insn;
2421 /* Holds the remaining issue slots in the current row. */
2422 int can_issue_more = issue_rate;
2424 /* Walk through the DFA for the current row. */
2425 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2426 crr_insn;
2427 crr_insn = crr_insn->next_in_row)
2429 rtx insn = crr_insn->node->insn;
2431 if (!INSN_P (insn))
2432 continue;
2434 /* Check if there is room for the current insn. */
2435 if (!can_issue_more || state_dead_lock_p (curr_state))
2436 return true;
2438 /* Update the DFA state and return with failure if the DFA found
2439 recource conflicts. */
2440 if (state_transition (curr_state, insn) >= 0)
2441 return true;
2443 if (targetm.sched.variable_issue)
2444 can_issue_more =
2445 (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
2446 insn, can_issue_more);
2447 /* A naked CLOBBER or USE generates no instruction, so don't
2448 let them consume issue slots. */
2449 else if (GET_CODE (PATTERN (insn)) != USE
2450 && GET_CODE (PATTERN (insn)) != CLOBBER)
2451 can_issue_more--;
2454 /* Advance the DFA to the next cycle. */
2455 advance_one_cycle ();
2457 return false;
2460 /* Checks if the given node causes resource conflicts when added to PS at
2461 cycle C. If not the node is added to PS and returned; otherwise zero
2462 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2463 cuid N must be come before/after (respectively) the node pointed to by
2464 PS_I when scheduled in the same cycle. */
2465 ps_insn_ptr
2466 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2467 int c, sbitmap must_precede,
2468 sbitmap must_follow)
2470 int has_conflicts = 0;
2471 ps_insn_ptr ps_i;
2473 /* First add the node to the PS, if this succeeds check for
2474 conflicts, trying different issue slots in the same row. */
2475 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2476 return NULL; /* Failed to insert the node at the given cycle. */
2478 has_conflicts = ps_has_conflicts (ps, c, c)
2479 || (ps->history > 0
2480 && ps_has_conflicts (ps,
2481 c - ps->history,
2482 c + ps->history));
2484 /* Try different issue slots to find one that the given node can be
2485 scheduled in without conflicts. */
2486 while (has_conflicts)
2488 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2489 break;
2490 has_conflicts = ps_has_conflicts (ps, c, c)
2491 || (ps->history > 0
2492 && ps_has_conflicts (ps,
2493 c - ps->history,
2494 c + ps->history));
2497 if (has_conflicts)
2499 remove_node_from_ps (ps, ps_i);
2500 return NULL;
2503 ps->min_cycle = MIN (ps->min_cycle, c);
2504 ps->max_cycle = MAX (ps->max_cycle, c);
2505 return ps_i;
2508 /* Rotate the rows of PS such that insns scheduled at time
2509 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2510 void
2511 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2513 int i, row, backward_rotates;
2514 int last_row = ps->ii - 1;
2516 if (start_cycle == 0)
2517 return;
2519 backward_rotates = SMODULO (start_cycle, ps->ii);
2521 /* Revisit later and optimize this into a single loop. */
2522 for (i = 0; i < backward_rotates; i++)
2524 ps_insn_ptr first_row = ps->rows[0];
2526 for (row = 0; row < last_row; row++)
2527 ps->rows[row] = ps->rows[row+1];
2529 ps->rows[last_row] = first_row;
2532 ps->max_cycle -= start_cycle;
2533 ps->min_cycle -= start_cycle;
2536 /* Remove the node N from the partial schedule PS; because we restart the DFA
2537 each time we want to check for resource conflicts; this is equivalent to
2538 unscheduling the node N. */
2539 static bool
2540 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2542 ps_insn_ptr ps_i;
2543 int row = SMODULO (SCHED_TIME (n), ps->ii);
2545 if (row < 0 || row > ps->ii)
2546 return false;
2548 for (ps_i = ps->rows[row];
2549 ps_i && ps_i->node != n;
2550 ps_i = ps_i->next_in_row);
2551 if (!ps_i)
2552 return false;
2554 return remove_node_from_ps (ps, ps_i);
2556 #endif /* INSN_SCHEDULING*/