1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
26 #include "coretypes.h"
31 #include "hard-reg-set.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
40 #include "sched-int.h"
42 #include "cfglayout.h"
51 #ifdef INSN_SCHEDULING
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime-sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
88 /* This page defines partial-schedule structures and functions for
91 typedef struct partial_schedule
*partial_schedule_ptr
;
92 typedef struct ps_insn
*ps_insn_ptr
;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
108 #define CFG_HOOKS cfg_layout_rtl_cfg_hooks
110 /* A single instruction in the partial schedule. */
113 /* The corresponding DDG_NODE. */
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row
,
124 /* The number of nodes in the same row that come after this node. */
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
133 int ii
; /* Number of rows in the partial schedule. */
134 int history
; /* Threshold for conflict checking using DFA. */
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
139 /* The earliest absolute cycle of an insn in the partial schedule. */
142 /* The latest absolute cycle of an insn in the partial schedule. */
145 ddg_ptr g
; /* The DDG of the insns in the partial schedule. */
149 static partial_schedule_ptr
create_partial_schedule (int ii
, ddg_ptr
, int history
);
150 static void free_partial_schedule (partial_schedule_ptr
);
151 static void reset_partial_schedule (partial_schedule_ptr
, int new_ii
);
152 void print_partial_schedule (partial_schedule_ptr
, FILE *);
153 static ps_insn_ptr
ps_add_node_check_conflicts (partial_schedule_ptr
,
154 ddg_node_ptr node
, int cycle
,
155 sbitmap must_precede
,
156 sbitmap must_follow
);
157 static void rotate_partial_schedule (partial_schedule_ptr
, int);
159 /* This page defines constants and structures for the modulo scheduling
162 /* As in haifa-sched.c: */
163 /* issue_rate is the number of insns that can be scheduled in the same
164 machine cycle. It can be defined in the config/mach/mach.h file,
165 otherwise we set it to 1. */
167 static int issue_rate
;
169 /* For printing statistics. */
170 static FILE *stats_file
;
172 static int sms_order_nodes (ddg_ptr
, int, int * result
);
173 static void set_node_sched_params (ddg_ptr
);
174 static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr
, int, int,
176 static void permute_partial_schedule (partial_schedule_ptr ps
, rtx last
);
177 static void generate_prolog_epilog (partial_schedule_ptr
, rtx
, rtx
, int);
178 static void duplicate_insns_of_cycles (partial_schedule_ptr ps
,
179 int from_stage
, int to_stage
,
183 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
184 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
185 #define SCHED_FIRST_REG_MOVE(x) \
186 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
187 #define SCHED_NREG_MOVES(x) \
188 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
189 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
190 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
191 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
193 /* The scheduling parameters held for each node. */
194 typedef struct node_sched_params
196 int asap
; /* A lower-bound on the absolute scheduling cycle. */
197 int time
; /* The absolute scheduling cycle (time >= asap). */
199 /* The following field (first_reg_move) is a pointer to the first
200 register-move instruction added to handle the modulo-variable-expansion
201 of the register defined by this node. This register-move copies the
202 original register defined by the node. */
205 /* The number of register-move instructions added, immediately preceding
209 int row
; /* Holds time % ii. */
210 int stage
; /* Holds time / ii. */
212 /* The column of a node inside the ps. If nodes u, v are on the same row,
213 u will precede v if column (u) < column (v). */
215 } *node_sched_params_ptr
;
218 /* The following three functions are copied from the current scheduler
219 code in order to use sched_analyze() for computing the dependencies.
220 They are used when initializing the sched_info structure. */
222 sms_print_insn (rtx insn
, int aligned ATTRIBUTE_UNUSED
)
226 sprintf (tmp
, "i%4d", INSN_UID (insn
));
231 contributes_to_priority (rtx next
, rtx insn
)
233 return BLOCK_NUM (next
) == BLOCK_NUM (insn
);
237 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED
,
238 regset cond_exec ATTRIBUTE_UNUSED
,
239 regset used ATTRIBUTE_UNUSED
,
240 regset set ATTRIBUTE_UNUSED
)
244 static struct sched_info sms_sched_info
=
252 contributes_to_priority
,
253 compute_jump_reg_dependencies
,
260 /* Return the register decremented and tested or zero if it is not a decrement
261 and branch jump insn (similar to doloop_condition_get). */
263 doloop_register_get (rtx insn
, rtx
*comp
)
265 rtx pattern
, cmp
, inc
, reg
, condition
;
269 pattern
= PATTERN (insn
);
271 /* The canonical doloop pattern we expect is:
273 (parallel [(set (pc) (if_then_else (condition)
276 (set (reg) (plus (reg) (const_int -1)))
277 (additional clobbers and uses)])
279 where condition is further restricted to be
280 (ne (reg) (const_int 1)). */
282 if (GET_CODE (pattern
) != PARALLEL
)
285 cmp
= XVECEXP (pattern
, 0, 0);
286 inc
= XVECEXP (pattern
, 0, 1);
287 /* Return the compare rtx. */
290 /* Check for (set (reg) (something)). */
291 if (GET_CODE (inc
) != SET
|| ! REG_P (SET_DEST (inc
)))
294 /* Extract loop counter register. */
295 reg
= SET_DEST (inc
);
297 /* Check if something = (plus (reg) (const_int -1)). */
298 if (GET_CODE (SET_SRC (inc
)) != PLUS
299 || XEXP (SET_SRC (inc
), 0) != reg
300 || XEXP (SET_SRC (inc
), 1) != constm1_rtx
)
303 /* Check for (set (pc) (if_then_else (condition)
306 if (GET_CODE (cmp
) != SET
307 || SET_DEST (cmp
) != pc_rtx
308 || GET_CODE (SET_SRC (cmp
)) != IF_THEN_ELSE
309 || GET_CODE (XEXP (SET_SRC (cmp
), 1)) != LABEL_REF
310 || XEXP (SET_SRC (cmp
), 2) != pc_rtx
)
313 /* Extract loop termination condition. */
314 condition
= XEXP (SET_SRC (cmp
), 0);
316 /* Check if condition = (ne (reg) (const_int 1)), which is more
317 restrictive than the check in doloop_condition_get:
318 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
319 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
320 if (GET_CODE (condition
) != NE
321 || XEXP (condition
, 1) != const1_rtx
)
324 if (XEXP (condition
, 0) == reg
)
330 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
331 that the number of iterations is a compile-time constant. If so,
332 return the rtx that sets COUNT_REG to a constant, and set COUNT to
333 this constant. Otherwise return 0. */
335 const_iteration_count (rtx count_reg
, basic_block pre_header
,
336 HOST_WIDEST_INT
* count
)
340 get_block_head_tail (pre_header
->index
, &head
, &tail
);
342 for (insn
= tail
; insn
!= PREV_INSN (head
); insn
= PREV_INSN (insn
))
343 if (INSN_P (insn
) && single_set (insn
) &&
344 rtx_equal_p (count_reg
, SET_DEST (single_set (insn
))))
346 rtx pat
= single_set (insn
);
348 if (GET_CODE (SET_SRC (pat
)) == CONST_INT
)
350 *count
= INTVAL (SET_SRC (pat
));
360 /* A very simple resource-based lower bound on the initiation interval.
361 ??? Improve the accuracy of this bound by considering the
362 utilization of various units. */
366 return (g
->num_nodes
/ issue_rate
);
370 /* Points to the array that contains the sched data for each node. */
371 static node_sched_params_ptr node_sched_params
;
373 /* Allocate sched_params for each node and initialize it. Assumes that
374 the aux field of each node contain the asap bound (computed earlier),
375 and copies it into the sched_params field. */
377 set_node_sched_params (ddg_ptr g
)
381 /* Allocate for each node in the DDG a place to hold the "sched_data". */
382 /* Initialize ASAP/ALAP/HEIGHT to zero. */
383 node_sched_params
= (node_sched_params_ptr
)
384 xcalloc (g
->num_nodes
,
385 sizeof (struct node_sched_params
));
387 /* Set the pointer of the general data of the node to point to the
388 appropriate sched_params structure. */
389 for (i
= 0; i
< g
->num_nodes
; i
++)
391 /* Watch out for aliasing problems? */
392 node_sched_params
[i
].asap
= g
->nodes
[i
].aux
.count
;
393 g
->nodes
[i
].aux
.info
= &node_sched_params
[i
];
398 print_node_sched_params (FILE * dump_file
, int num_nodes
)
402 for (i
= 0; i
< num_nodes
; i
++)
404 node_sched_params_ptr nsp
= &node_sched_params
[i
];
405 rtx reg_move
= nsp
->first_reg_move
;
408 fprintf (dump_file
, "Node %d:\n", i
);
409 fprintf (dump_file
, " asap = %d:\n", nsp
->asap
);
410 fprintf (dump_file
, " time = %d:\n", nsp
->time
);
411 fprintf (dump_file
, " nreg_moves = %d:\n", nsp
->nreg_moves
);
412 for (j
= 0; j
< nsp
->nreg_moves
; j
++)
414 fprintf (dump_file
, " reg_move = ");
415 print_rtl_single (dump_file
, reg_move
);
416 reg_move
= PREV_INSN (reg_move
);
421 /* Calculate an upper bound for II. SMS should not schedule the loop if it
422 requires more cycles than this bound. Currently set to the sum of the
423 longest latency edge for each node. Reset based on experiments. */
425 calculate_maxii (ddg_ptr g
)
430 for (i
= 0; i
< g
->num_nodes
; i
++)
432 ddg_node_ptr u
= &g
->nodes
[i
];
434 int max_edge_latency
= 0;
436 for (e
= u
->out
; e
; e
= e
->next_out
)
437 max_edge_latency
= MAX (max_edge_latency
, e
->latency
);
439 maxii
+= max_edge_latency
;
445 /* Given the partial schedule, generate register moves when the length
446 of the register live range is more than ii; the number of moves is
447 determined according to the following equation:
448 SCHED_TIME (use) - SCHED_TIME (def) { 1 broken loop-carried
449 nreg_moves = ----------------------------------- - { dependence.
451 This handles the modulo-variable-expansions (mve's) needed for the ps. */
453 generate_reg_moves (partial_schedule_ptr ps
)
459 for (i
= 0; i
< g
->num_nodes
; i
++)
461 ddg_node_ptr u
= &g
->nodes
[i
];
463 int nreg_moves
= 0, i_reg_move
;
464 sbitmap
*uses_of_defs
;
466 rtx prev_reg
, old_reg
;
468 /* Compute the number of reg_moves needed for u, by looking at life
469 ranges started at u (excluding self-loops). */
470 for (e
= u
->out
; e
; e
= e
->next_out
)
471 if (e
->type
== TRUE_DEP
&& e
->dest
!= e
->src
)
473 int nreg_moves4e
= (SCHED_TIME (e
->dest
) - SCHED_TIME (e
->src
)) / ii
;
475 /* If dest precedes src in the schedule of the kernel, then dest
476 will read before src writes and we can save one reg_copy. */
477 if (SCHED_ROW (e
->dest
) == SCHED_ROW (e
->src
)
478 && SCHED_COLUMN (e
->dest
) < SCHED_COLUMN (e
->src
))
481 nreg_moves
= MAX (nreg_moves
, nreg_moves4e
);
487 /* Every use of the register defined by node may require a different
488 copy of this register, depending on the time the use is scheduled.
489 Set a bitmap vector, telling which nodes use each copy of this
491 uses_of_defs
= sbitmap_vector_alloc (nreg_moves
, g
->num_nodes
);
492 sbitmap_vector_zero (uses_of_defs
, nreg_moves
);
493 for (e
= u
->out
; e
; e
= e
->next_out
)
494 if (e
->type
== TRUE_DEP
&& e
->dest
!= e
->src
)
496 int dest_copy
= (SCHED_TIME (e
->dest
) - SCHED_TIME (e
->src
)) / ii
;
498 if (SCHED_ROW (e
->dest
) == SCHED_ROW (e
->src
)
499 && SCHED_COLUMN (e
->dest
) < SCHED_COLUMN (e
->src
))
503 SET_BIT (uses_of_defs
[dest_copy
- 1], e
->dest
->cuid
);
506 /* Now generate the reg_moves, attaching relevant uses to them. */
507 SCHED_NREG_MOVES (u
) = nreg_moves
;
508 old_reg
= prev_reg
= copy_rtx (SET_DEST (single_set (u
->insn
)));
509 last_reg_move
= u
->insn
;
511 for (i_reg_move
= 0; i_reg_move
< nreg_moves
; i_reg_move
++)
514 rtx new_reg
= gen_reg_rtx (GET_MODE (prev_reg
));
515 rtx reg_move
= gen_move_insn (new_reg
, prev_reg
);
517 add_insn_before (reg_move
, last_reg_move
);
518 last_reg_move
= reg_move
;
520 if (!SCHED_FIRST_REG_MOVE (u
))
521 SCHED_FIRST_REG_MOVE (u
) = reg_move
;
523 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs
[i_reg_move
], 0, i_use
,
524 replace_rtx (g
->nodes
[i_use
].insn
, old_reg
, new_reg
));
531 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
532 of SCHED_ROW and SCHED_STAGE. */
534 normalize_sched_times (partial_schedule_ptr ps
)
538 int amount
= PS_MIN_CYCLE (ps
);
541 for (i
= 0; i
< g
->num_nodes
; i
++)
543 ddg_node_ptr u
= &g
->nodes
[i
];
544 int normalized_time
= SCHED_TIME (u
) - amount
;
546 if (normalized_time
< 0)
549 SCHED_TIME (u
) = normalized_time
;
550 SCHED_ROW (u
) = normalized_time
% ii
;
551 SCHED_STAGE (u
) = normalized_time
/ ii
;
555 /* Set SCHED_COLUMN of each node according to its position in PS. */
557 set_columns_for_ps (partial_schedule_ptr ps
)
561 for (row
= 0; row
< ps
->ii
; row
++)
563 ps_insn_ptr cur_insn
= ps
->rows
[row
];
566 for (; cur_insn
; cur_insn
= cur_insn
->next_in_row
)
567 SCHED_COLUMN (cur_insn
->node
) = column
++;
571 /* Permute the insns according to their order in PS, from row 0 to
572 row ii-1, and position them right before LAST. This schedules
573 the insns of the loop kernel. */
575 permute_partial_schedule (partial_schedule_ptr ps
, rtx last
)
581 for (row
= 0; row
< ii
; row
++)
582 for (ps_ij
= ps
->rows
[row
]; ps_ij
; ps_ij
= ps_ij
->next_in_row
)
583 if (PREV_INSN (last
) != ps_ij
->node
->insn
)
584 reorder_insns_nobb (ps_ij
->node
->first_note
, ps_ij
->node
->insn
,
588 /* Used to generate the prologue & epilogue. Duplicate the subset of
589 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
590 of both), together with a prefix/suffix of their reg_moves. */
592 duplicate_insns_of_cycles (partial_schedule_ptr ps
, int from_stage
,
593 int to_stage
, int for_prolog
)
598 for (row
= 0; row
< ps
->ii
; row
++)
599 for (ps_ij
= ps
->rows
[row
]; ps_ij
; ps_ij
= ps_ij
->next_in_row
)
601 ddg_node_ptr u_node
= ps_ij
->node
;
603 rtx reg_move
= NULL_RTX
;
607 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
608 number of reg_moves starting with the second occurrence of
609 u_node, which is generated if its SCHED_STAGE <= to_stage. */
610 i_reg_moves
= to_stage
- SCHED_STAGE (u_node
);
611 i_reg_moves
= MAX (i_reg_moves
, 0);
612 i_reg_moves
= MIN (i_reg_moves
, SCHED_NREG_MOVES (u_node
));
614 /* The reg_moves start from the *first* reg_move backwards. */
617 reg_move
= SCHED_FIRST_REG_MOVE (u_node
);
618 for (j
= 1; j
< i_reg_moves
; j
++)
619 reg_move
= PREV_INSN (reg_move
);
622 else /* It's for the epilog. */
624 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
625 starting to decrease one stage after u_node no longer occurs;
626 that is, generate all reg_moves until
627 SCHED_STAGE (u_node) == from_stage - 1. */
628 i_reg_moves
= SCHED_NREG_MOVES (u_node
)
629 - (from_stage
- SCHED_STAGE (u_node
) - 1);
630 i_reg_moves
= MAX (i_reg_moves
, 0);
631 i_reg_moves
= MIN (i_reg_moves
, SCHED_NREG_MOVES (u_node
));
633 /* The reg_moves start from the *last* reg_move forwards. */
636 reg_move
= SCHED_FIRST_REG_MOVE (u_node
);
637 for (j
= 1; j
< SCHED_NREG_MOVES (u_node
); j
++)
638 reg_move
= PREV_INSN (reg_move
);
642 for (j
= 0; j
< i_reg_moves
; j
++, reg_move
= NEXT_INSN (reg_move
))
643 emit_insn (copy_rtx (PATTERN (reg_move
)));
645 if (SCHED_STAGE (u_node
) >= from_stage
646 && SCHED_STAGE (u_node
) <= to_stage
)
647 duplicate_insn_chain (u_node
->first_note
, u_node
->insn
);
652 /* Generate the instructions (including reg_moves) for prolog & epilog. */
654 generate_prolog_epilog (partial_schedule_ptr ps
, rtx orig_loop_beg
,
655 rtx orig_loop_end
, int unknown_count
)
658 int last_stage
= PS_STAGE_COUNT (ps
) - 1;
660 rtx c_reg
= NULL_RTX
;
662 rtx precond_jump
= NULL_RTX
;
663 rtx precond_exit_label
= NULL_RTX
;
664 rtx precond_exit_label_insn
= NULL_RTX
;
665 rtx last_epilog_insn
= NULL_RTX
;
666 rtx loop_exit_label
= NULL_RTX
;
667 rtx loop_exit_label_insn
= NULL_RTX
;
668 rtx orig_loop_bct
= NULL_RTX
;
670 /* Loop header edge. */
671 e
= EDGE_PRED (ps
->g
->bb
, 0);
672 if (e
->src
== ps
->g
->bb
)
673 e
= EDGE_PRED (ps
->g
->bb
, 1);
675 /* Generate the prolog, inserting its insns on the loop-entry edge. */
678 /* This is the place where we want to insert the precondition. */
680 precond_jump
= emit_note (NOTE_INSN_DELETED
);
682 for (i
= 0; i
< last_stage
; i
++)
683 duplicate_insns_of_cycles (ps
, 0, i
, 1);
685 /* No need to call insert_insn_on_edge; we prepared the sequence. */
686 e
->insns
.r
= get_insns ();
689 /* Generate the epilog, inserting its insns on the loop-exit edge. */
692 for (i
= 0; i
< last_stage
; i
++)
693 duplicate_insns_of_cycles (ps
, i
+ 1, last_stage
, 0);
695 last_epilog_insn
= emit_note (NOTE_INSN_DELETED
);
697 /* Emit the label where to put the original loop code. */
702 precond_exit_label
= gen_label_rtx ();
703 precond_exit_label_insn
= emit_label (precond_exit_label
);
705 /* Put the original loop code. */
706 reorder_insns_nobb (orig_loop_beg
, orig_loop_end
, precond_exit_label_insn
);
708 /* Change the label of the BCT to be the PRECOND_EXIT_LABEL. */
709 orig_loop_bct
= get_last_insn ();
710 c_reg
= doloop_register_get (orig_loop_bct
, &cmp
);
711 label
= XEXP (SET_SRC (cmp
), 1);
712 cond
= XEXP (SET_SRC (cmp
), 0);
714 if (! c_reg
|| GET_CODE (cond
) != NE
)
717 XEXP (label
, 0) = precond_exit_label
;
718 JUMP_LABEL (orig_loop_bct
) = precond_exit_label_insn
;
719 LABEL_NUSES (precond_exit_label_insn
)++;
721 /* Generate the loop exit label. */
722 loop_exit_label
= gen_label_rtx ();
723 loop_exit_label_insn
= emit_label (loop_exit_label
);
726 e
= EDGE_SUCC (ps
->g
->bb
, 0);
727 if (e
->dest
== ps
->g
->bb
)
728 e
= EDGE_SUCC (ps
->g
->bb
, 1);
730 e
->insns
.r
= get_insns ();
733 commit_edge_insertions ();
737 rtx precond_insns
, epilog_jump
, insert_after_insn
;
738 basic_block loop_exit_bb
= BLOCK_FOR_INSN (loop_exit_label_insn
);
739 basic_block epilog_bb
= BLOCK_FOR_INSN (last_epilog_insn
);
740 basic_block precond_bb
= BLOCK_FOR_INSN (precond_jump
);
741 basic_block orig_loop_bb
= BLOCK_FOR_INSN (precond_exit_label_insn
);
742 edge epilog_exit_edge
= single_succ_edge (epilog_bb
);
744 /* Do loop preconditioning to take care of cases where the loop count is
745 less than the stage count. Update the CFG properly. */
746 insert_after_insn
= precond_jump
;
748 c_reg
= doloop_register_get (ps
->g
->closing_branch
->insn
, &cmp
);
749 emit_cmp_and_jump_insns (c_reg
, GEN_INT (PS_STAGE_COUNT (ps
)), LT
, NULL
,
750 GET_MODE (c_reg
), 1, precond_exit_label
);
751 precond_insns
= get_insns ();
752 precond_jump
= get_last_insn ();
754 reorder_insns (precond_insns
, precond_jump
, insert_after_insn
);
756 /* Generate a subtract instruction at the beginning of the prolog to
757 adjust the loop count by STAGE_COUNT. */
758 emit_insn_after (gen_sub2_insn (c_reg
, GEN_INT (PS_STAGE_COUNT (ps
) - 1)),
760 update_bb_for_insn (precond_bb
);
761 delete_insn (insert_after_insn
);
763 /* Update label info for the precondition jump. */
764 JUMP_LABEL (precond_jump
) = precond_exit_label_insn
;
765 LABEL_NUSES (precond_exit_label_insn
)++;
767 /* Update the CFG. */
768 split_block (precond_bb
, precond_jump
);
769 make_edge (precond_bb
, orig_loop_bb
, 0);
771 /* Add a jump at end of the epilog to the LOOP_EXIT_LABEL to jump over the
772 original loop copy and update the CFG. */
773 epilog_jump
= emit_jump_insn_after (gen_jump (loop_exit_label
),
775 delete_insn (last_epilog_insn
);
776 JUMP_LABEL (epilog_jump
) = loop_exit_label_insn
;
777 LABEL_NUSES (loop_exit_label_insn
)++;
779 redirect_edge_succ (epilog_exit_edge
, loop_exit_bb
);
780 epilog_exit_edge
->flags
&= ~EDGE_FALLTHRU
;
781 emit_barrier_after (BB_END (epilog_bb
));
785 /* Return the line note insn preceding INSN, for debugging. Taken from
788 find_line_note (rtx insn
)
790 for (; insn
; insn
= PREV_INSN (insn
))
792 && NOTE_LINE_NUMBER (insn
) >= 0)
798 /* Main entry point, perform SMS scheduling on the loops of the function
799 that consist of single basic blocks. */
801 sms_schedule (FILE *dump_file
)
803 static int passes
= 0;
806 basic_block bb
, pre_header
= NULL
;
810 partial_schedule_ptr ps
;
811 int max_bb_index
= last_basic_block
;
814 stats_file
= dump_file
;
816 /* Initialize issue_rate. */
817 if (targetm
.sched
.issue_rate
)
819 int temp
= reload_completed
;
821 reload_completed
= 1;
822 issue_rate
= (*targetm
.sched
.issue_rate
) ();
823 reload_completed
= temp
;
828 /* Initialize the scheduler. */
829 current_sched_info
= &sms_sched_info
;
832 /* Init Data Flow analysis, to be used in interloop dep calculation. */
834 df_analyze (df
, 0, DF_ALL
);
836 /* Allocate memory to hold the DDG array. */
837 g_arr
= xcalloc (max_bb_index
, sizeof (ddg_ptr
));
839 /* Build DDGs for all the relevant loops and hold them in G_ARR
840 indexed by the loop BB index. */
845 edge e
, pre_header_edge
;
850 /* Check if bb has two successors, one being itself. */
851 if (EDGE_COUNT (bb
->succs
) != 2)
854 if (EDGE_SUCC (bb
, 0)->dest
!= bb
&& EDGE_SUCC (bb
, 1)->dest
!= bb
)
857 if ((EDGE_SUCC (bb
, 0)->flags
& EDGE_COMPLEX
)
858 || (EDGE_SUCC (bb
, 1)->flags
& EDGE_COMPLEX
))
861 /* Check if bb has two predecessors, one being itself. */
862 if (EDGE_COUNT (bb
->preds
) != 2)
865 if (EDGE_PRED (bb
, 0)->src
!= bb
&& EDGE_PRED (bb
, 1)->src
!= bb
)
868 if ((EDGE_PRED (bb
, 0)->flags
& EDGE_COMPLEX
)
869 || (EDGE_PRED (bb
, 1)->flags
& EDGE_COMPLEX
))
873 if ((passes
++ > MAX_SMS_LOOP_NUMBER
) && (MAX_SMS_LOOP_NUMBER
!= -1))
876 fprintf (dump_file
, "SMS reached MAX_PASSES... \n");
880 get_block_head_tail (bb
->index
, &head
, &tail
);
881 pre_header_edge
= EDGE_PRED (bb
, 0);
882 if (EDGE_PRED (bb
, 0)->src
!= bb
)
883 pre_header_edge
= EDGE_PRED (bb
, 1);
885 /* Perform SMS only on loops whose average count is above the threshold. */
886 if (bb
->count
< pre_header_edge
->count
* SMS_LOOP_AVERAGE_COUNT_THRESHOLD
)
890 rtx line_note
= find_line_note (tail
);
894 expanded_location xloc
;
895 NOTE_EXPANDED_LOCATION (xloc
, line_note
);
896 fprintf (stats_file
, "SMS bb %s %d (file, line)\n",
897 xloc
.file
, xloc
.line
);
899 fprintf (stats_file
, "SMS single-bb-loop\n");
900 if (profile_info
&& flag_branch_probabilities
)
902 fprintf (stats_file
, "SMS loop-count ");
903 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
904 (HOST_WIDEST_INT
) bb
->count
);
905 fprintf (stats_file
, "\n");
906 fprintf (stats_file
, "SMS preheader-count ");
907 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
908 (HOST_WIDEST_INT
) pre_header_edge
->count
);
909 fprintf (stats_file
, "\n");
910 fprintf (stats_file
, "SMS profile-sum-max ");
911 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
912 (HOST_WIDEST_INT
) profile_info
->sum_max
);
913 fprintf (stats_file
, "\n");
919 /* Make sure this is a doloop. */
920 if ( !(count_reg
= doloop_register_get (tail
, &comp
)))
923 e
= EDGE_PRED (bb
, 0);
925 pre_header
= EDGE_PRED (bb
, 1)->src
;
929 /* Don't handle BBs with calls or barriers, or !single_set insns. */
930 for (insn
= head
; insn
!= NEXT_INSN (tail
); insn
= NEXT_INSN (insn
))
933 || (INSN_P (insn
) && !JUMP_P (insn
)
934 && !single_set (insn
) && GET_CODE (PATTERN (insn
)) != USE
))
937 if (insn
!= NEXT_INSN (tail
))
942 fprintf (stats_file
, "SMS loop-with-call\n");
943 else if (BARRIER_P (insn
))
944 fprintf (stats_file
, "SMS loop-with-barrier\n");
946 fprintf (stats_file
, "SMS loop-with-not-single-set\n");
947 print_rtl_single (stats_file
, insn
);
953 if (! (g
= create_ddg (bb
, df
, 0)))
956 fprintf (stats_file
, "SMS doloop\n");
960 g_arr
[bb
->index
] = g
;
963 /* Release Data Flow analysis data structures. */
966 /* Go over the built DDGs and perform SMS for each one of them. */
967 for (i
= 0; i
< max_bb_index
; i
++)
970 rtx count_reg
, count_init
, comp
;
971 edge pre_header_edge
;
974 HOST_WIDEST_INT loop_count
= 0;
976 if (! (g
= g_arr
[i
]))
980 print_ddg (dump_file
, g
);
982 get_block_head_tail (g
->bb
->index
, &head
, &tail
);
984 pre_header_edge
= EDGE_PRED (g
->bb
, 0);
985 if (EDGE_PRED (g
->bb
, 0)->src
!= g
->bb
)
986 pre_header_edge
= EDGE_PRED (g
->bb
, 1);
990 rtx line_note
= find_line_note (tail
);
994 expanded_location xloc
;
995 NOTE_EXPANDED_LOCATION (xloc
, line_note
);
996 fprintf (stats_file
, "SMS bb %s %d (file, line)\n",
997 xloc
.file
, xloc
.line
);
999 fprintf (stats_file
, "SMS single-bb-loop\n");
1000 if (profile_info
&& flag_branch_probabilities
)
1002 fprintf (stats_file
, "SMS loop-count ");
1003 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
1004 (HOST_WIDEST_INT
) bb
->count
);
1005 fprintf (stats_file
, "\n");
1006 fprintf (stats_file
, "SMS preheader-count ");
1007 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
1008 (HOST_WIDEST_INT
) pre_header_edge
->count
);
1009 fprintf (stats_file
, "\n");
1010 fprintf (stats_file
, "SMS profile-sum-max ");
1011 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
,
1012 (HOST_WIDEST_INT
) profile_info
->sum_max
);
1013 fprintf (stats_file
, "\n");
1015 fprintf (stats_file
, "SMS doloop\n");
1016 fprintf (stats_file
, "SMS built-ddg %d\n", g
->num_nodes
);
1017 fprintf (stats_file
, "SMS num-loads %d\n", g
->num_loads
);
1018 fprintf (stats_file
, "SMS num-stores %d\n", g
->num_stores
);
1021 /* Make sure this is a doloop. */
1022 if ( !(count_reg
= doloop_register_get (tail
, &comp
)))
1025 /* This should be NULL_RTX if the count is unknown at compile time. */
1026 count_init
= const_iteration_count (count_reg
, pre_header
, &loop_count
);
1028 if (stats_file
&& count_init
)
1030 fprintf (stats_file
, "SMS const-doloop ");
1031 fprintf (stats_file
, HOST_WIDEST_INT_PRINT_DEC
, loop_count
);
1032 fprintf (stats_file
, "\n");
1035 node_order
= (int *) xmalloc (sizeof (int) * g
->num_nodes
);
1037 mii
= 1; /* Need to pass some estimate of mii. */
1038 rec_mii
= sms_order_nodes (g
, mii
, node_order
);
1039 mii
= MAX (res_MII (g
), rec_mii
);
1040 maxii
= (calculate_maxii (g
) * SMS_MAX_II_FACTOR
) / 100;
1043 fprintf (stats_file
, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1044 rec_mii
, mii
, maxii
);
1046 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1048 set_node_sched_params (g
);
1050 ps
= sms_schedule_by_order (g
, mii
, maxii
, node_order
, dump_file
);
1053 stage_count
= PS_STAGE_COUNT (ps
);
1055 if (stage_count
== 0 || (count_init
&& (stage_count
> loop_count
)))
1058 fprintf (dump_file
, "SMS failed... \n");
1060 fprintf (stats_file
, "SMS sched-failed %d\n", stage_count
);
1064 rtx orig_loop_beg
= NULL_RTX
;
1065 rtx orig_loop_end
= NULL_RTX
;
1069 fprintf (stats_file
,
1070 "SMS succeeded %d %d (with ii, sc)\n", ps
->ii
,
1072 print_partial_schedule (ps
, dump_file
);
1074 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1075 g
->closing_branch
->cuid
, PS_MIN_CYCLE (ps
) - 1);
1078 /* Save the original loop if we want to do loop preconditioning in
1079 case the BCT count is not known. */
1085 /* Copy the original loop code before modifying it -
1086 so we can use it later. */
1087 for (i
= 0; i
< ps
->g
->num_nodes
; i
++)
1088 duplicate_insn_chain (ps
->g
->nodes
[i
].first_note
,
1089 ps
->g
->nodes
[i
].insn
);
1091 orig_loop_beg
= get_insns ();
1092 orig_loop_end
= get_last_insn ();
1095 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1096 the closing_branch was scheduled and should appear in the last (ii-1)
1097 row. Otherwise, we are free to schedule the branch, and we let nodes
1098 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1099 row; this should reduce stage_count to minimum. */
1100 normalize_sched_times (ps
);
1101 rotate_partial_schedule (ps
, PS_MIN_CYCLE (ps
));
1102 set_columns_for_ps (ps
);
1104 permute_partial_schedule (ps
, g
->closing_branch
->first_note
);
1106 /* Mark this loop as software pipelined so the later
1107 scheduling passes don't touch it. */
1108 if (! flag_resched_modulo_sched
)
1109 g
->bb
->flags
|= BB_DISABLE_SCHEDULE
;
1111 generate_reg_moves (ps
);
1113 print_node_sched_params (dump_file
, g
->num_nodes
);
1115 /* Set new iteration count of loop kernel. */
1117 SET_SRC (single_set (count_init
)) = GEN_INT (loop_count
1120 /* Generate prolog and epilog. */
1121 generate_prolog_epilog (ps
, orig_loop_beg
, orig_loop_end
,
1122 count_init
? 0 : 1);
1124 free_partial_schedule (ps
);
1125 free (node_sched_params
);
1130 /* Release scheduler data, needed until now because of DFA. */
1134 /* The SMS scheduling algorithm itself
1135 -----------------------------------
1136 Input: 'O' an ordered list of insns of a loop.
1137 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1139 'Q' is the empty Set
1140 'PS' is the partial schedule; it holds the currently scheduled nodes with
1142 'PSP' previously scheduled predecessors.
1143 'PSS' previously scheduled successors.
1144 't(u)' the cycle where u is scheduled.
1145 'l(u)' is the latency of u.
1146 'd(v,u)' is the dependence distance from v to u.
1147 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1148 the node ordering phase.
1149 'check_hardware_resources_conflicts(u, PS, c)'
1150 run a trace around cycle/slot through DFA model
1151 to check resource conflicts involving instruction u
1152 at cycle c given the partial schedule PS.
1153 'add_to_partial_schedule_at_time(u, PS, c)'
1154 Add the node/instruction u to the partial schedule
1156 'calculate_register_pressure(PS)'
1157 Given a schedule of instructions, calculate the register
1158 pressure it implies. One implementation could be the
1159 maximum number of overlapping live ranges.
1160 'maxRP' The maximum allowed register pressure, usually derived from the number
1161 of registers available in the hardware.
1165 3. for each node u in O in pre-computed order
1166 4. if (PSP(u) != Q && PSS(u) == Q) then
1167 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1168 6. start = Early_start; end = Early_start + II - 1; step = 1
1169 11. else if (PSP(u) == Q && PSS(u) != Q) then
1170 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1171 13. start = Late_start; end = Late_start - II + 1; step = -1
1172 14. else if (PSP(u) != Q && PSS(u) != Q) then
1173 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1174 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1175 17. start = Early_start;
1176 18. end = min(Early_start + II - 1 , Late_start);
1178 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1179 21. start = ASAP(u); end = start + II - 1; step = 1
1183 24. for (c = start ; c != end ; c += step)
1184 25. if check_hardware_resources_conflicts(u, PS, c) then
1185 26. add_to_partial_schedule_at_time(u, PS, c)
1190 31. if (success == false) then
1192 33. if (II > maxII) then
1193 34. finish - failed to schedule
1198 39. if (calculate_register_pressure(PS) > maxRP) then
1201 42. compute epilogue & prologue
1202 43. finish - succeeded to schedule
1205 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1206 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1207 set to 0 to save compile time. */
1208 #define DFA_HISTORY SMS_DFA_HISTORY
1210 static partial_schedule_ptr
1211 sms_schedule_by_order (ddg_ptr g
, int mii
, int maxii
, int *nodes_order
, FILE *dump_file
)
1215 int try_again_with_larger_ii
= true;
1216 int num_nodes
= g
->num_nodes
;
1218 int start
, end
, step
; /* Place together into one struct? */
1219 sbitmap sched_nodes
= sbitmap_alloc (num_nodes
);
1220 sbitmap must_precede
= sbitmap_alloc (num_nodes
);
1221 sbitmap must_follow
= sbitmap_alloc (num_nodes
);
1223 partial_schedule_ptr ps
= create_partial_schedule (ii
, g
, DFA_HISTORY
);
1225 while (try_again_with_larger_ii
&& ii
< maxii
)
1228 fprintf(dump_file
, "Starting with ii=%d\n", ii
);
1229 try_again_with_larger_ii
= false;
1230 sbitmap_zero (sched_nodes
);
1232 for (i
= 0; i
< num_nodes
; i
++)
1234 int u
= nodes_order
[i
];
1235 ddg_node_ptr u_node
= &g
->nodes
[u
];
1236 sbitmap u_node_preds
= NODE_PREDECESSORS (u_node
);
1237 sbitmap u_node_succs
= NODE_SUCCESSORS (u_node
);
1240 rtx insn
= u_node
->insn
;
1245 if (JUMP_P (insn
)) /* Closing branch handled later. */
1248 /* 1. compute sched window for u (start, end, step). */
1249 psp_not_empty
= sbitmap_any_common_bits (u_node_preds
, sched_nodes
);
1250 pss_not_empty
= sbitmap_any_common_bits (u_node_succs
, sched_nodes
);
1252 if (psp_not_empty
&& !pss_not_empty
)
1254 int early_start
= 0;
1257 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1259 ddg_node_ptr v_node
= e
->src
;
1260 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1262 int node_st
= SCHED_TIME (v_node
)
1263 + e
->latency
- (e
->distance
* ii
);
1265 early_start
= MAX (early_start
, node_st
);
1267 if (e
->data_type
== MEM_DEP
)
1268 end
= MIN (end
, SCHED_TIME (v_node
) + ii
- 1);
1271 start
= early_start
;
1272 end
= MIN (end
, early_start
+ ii
);
1276 else if (!psp_not_empty
&& pss_not_empty
)
1278 int late_start
= INT_MAX
;
1281 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1283 ddg_node_ptr v_node
= e
->dest
;
1284 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1286 late_start
= MIN (late_start
,
1287 SCHED_TIME (v_node
) - e
->latency
1288 + (e
->distance
* ii
));
1289 if (e
->data_type
== MEM_DEP
)
1290 end
= MAX (end
, SCHED_TIME (v_node
) - ii
+ 1);
1294 end
= MAX (end
, late_start
- ii
);
1298 else if (psp_not_empty
&& pss_not_empty
)
1300 int early_start
= 0;
1301 int late_start
= INT_MAX
;
1305 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1307 ddg_node_ptr v_node
= e
->src
;
1309 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1311 early_start
= MAX (early_start
,
1312 SCHED_TIME (v_node
) + e
->latency
1313 - (e
->distance
* ii
));
1314 if (e
->data_type
== MEM_DEP
)
1315 end
= MIN (end
, SCHED_TIME (v_node
) + ii
- 1);
1318 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1320 ddg_node_ptr v_node
= e
->dest
;
1322 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1324 late_start
= MIN (late_start
,
1325 SCHED_TIME (v_node
) - e
->latency
1326 + (e
->distance
* ii
));
1327 if (e
->data_type
== MEM_DEP
)
1328 start
= MAX (start
, SCHED_TIME (v_node
) - ii
+ 1);
1331 start
= MAX (start
, early_start
);
1332 end
= MIN (end
, MIN (early_start
+ ii
, late_start
+ 1));
1335 else /* psp is empty && pss is empty. */
1337 start
= SCHED_ASAP (u_node
);
1342 /* 2. Try scheduling u in window. */
1344 fprintf(dump_file
, "Trying to schedule node %d in (%d .. %d) step %d\n",
1345 u
, start
, end
, step
);
1347 /* use must_follow & must_precede bitmaps to determine order
1348 of nodes within the cycle. */
1349 sbitmap_zero (must_precede
);
1350 sbitmap_zero (must_follow
);
1351 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1352 if (TEST_BIT (sched_nodes
, e
->src
->cuid
)
1353 && e
->latency
== (ii
* e
->distance
)
1354 && start
== SCHED_TIME (e
->src
))
1355 SET_BIT (must_precede
, e
->src
->cuid
);
1357 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1358 if (TEST_BIT (sched_nodes
, e
->dest
->cuid
)
1359 && e
->latency
== (ii
* e
->distance
)
1360 && end
== SCHED_TIME (e
->dest
))
1361 SET_BIT (must_follow
, e
->dest
->cuid
);
1364 if ((step
> 0 && start
< end
) || (step
< 0 && start
> end
))
1365 for (c
= start
; c
!= end
; c
+= step
)
1369 psi
= ps_add_node_check_conflicts (ps
, u_node
, c
,
1375 SCHED_TIME (u_node
) = c
;
1376 SET_BIT (sched_nodes
, u
);
1379 fprintf(dump_file
, "Schedule in %d\n", c
);
1385 /* ??? Try backtracking instead of immediately ii++? */
1387 try_again_with_larger_ii
= true;
1388 reset_partial_schedule (ps
, ii
);
1391 /* ??? If (success), check register pressure estimates. */
1392 } /* Continue with next node. */
1393 } /* While try_again_with_larger_ii. */
1395 sbitmap_free (sched_nodes
);
1399 free_partial_schedule (ps
);
1406 /* This page implements the algorithm for ordering the nodes of a DDG
1407 for modulo scheduling, activated through the
1408 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1410 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1411 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1412 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1413 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1414 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1415 #define DEPTH(x) (ASAP ((x)))
1417 typedef struct node_order_params
* nopa
;
1419 static void order_nodes_of_sccs (ddg_all_sccs_ptr
, int * result
);
1420 static int order_nodes_in_scc (ddg_ptr
, sbitmap
, sbitmap
, int*, int);
1421 static nopa
calculate_order_params (ddg_ptr
, int mii
);
1422 static int find_max_asap (ddg_ptr
, sbitmap
);
1423 static int find_max_hv_min_mob (ddg_ptr
, sbitmap
);
1424 static int find_max_dv_min_mob (ddg_ptr
, sbitmap
);
1426 enum sms_direction
{BOTTOMUP
, TOPDOWN
};
1428 struct node_order_params
1435 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1437 check_nodes_order (int *node_order
, int num_nodes
)
1440 sbitmap tmp
= sbitmap_alloc (num_nodes
);
1444 for (i
= 0; i
< num_nodes
; i
++)
1446 int u
= node_order
[i
];
1448 if (u
>= num_nodes
|| u
< 0 || TEST_BIT (tmp
, u
))
1457 /* Order the nodes of G for scheduling and pass the result in
1458 NODE_ORDER. Also set aux.count of each node to ASAP.
1459 Return the recMII for the given DDG. */
1461 sms_order_nodes (ddg_ptr g
, int mii
, int * node_order
)
1465 ddg_all_sccs_ptr sccs
= create_ddg_all_sccs (g
);
1467 nopa nops
= calculate_order_params (g
, mii
);
1469 order_nodes_of_sccs (sccs
, node_order
);
1471 if (sccs
->num_sccs
> 0)
1472 /* First SCC has the largest recurrence_length. */
1473 rec_mii
= sccs
->sccs
[0]->recurrence_length
;
1475 /* Save ASAP before destroying node_order_params. */
1476 for (i
= 0; i
< g
->num_nodes
; i
++)
1478 ddg_node_ptr v
= &g
->nodes
[i
];
1479 v
->aux
.count
= ASAP (v
);
1483 free_ddg_all_sccs (sccs
);
1484 check_nodes_order (node_order
, g
->num_nodes
);
1490 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs
, int * node_order
)
1493 ddg_ptr g
= all_sccs
->ddg
;
1494 int num_nodes
= g
->num_nodes
;
1495 sbitmap prev_sccs
= sbitmap_alloc (num_nodes
);
1496 sbitmap on_path
= sbitmap_alloc (num_nodes
);
1497 sbitmap tmp
= sbitmap_alloc (num_nodes
);
1498 sbitmap ones
= sbitmap_alloc (num_nodes
);
1500 sbitmap_zero (prev_sccs
);
1501 sbitmap_ones (ones
);
1503 /* Perfrom the node ordering starting from the SCC with the highest recMII.
1504 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1505 for (i
= 0; i
< all_sccs
->num_sccs
; i
++)
1507 ddg_scc_ptr scc
= all_sccs
->sccs
[i
];
1509 /* Add nodes on paths from previous SCCs to the current SCC. */
1510 find_nodes_on_paths (on_path
, g
, prev_sccs
, scc
->nodes
);
1511 sbitmap_a_or_b (tmp
, scc
->nodes
, on_path
);
1513 /* Add nodes on paths from the current SCC to previous SCCs. */
1514 find_nodes_on_paths (on_path
, g
, scc
->nodes
, prev_sccs
);
1515 sbitmap_a_or_b (tmp
, tmp
, on_path
);
1517 /* Remove nodes of previous SCCs from current extended SCC. */
1518 sbitmap_difference (tmp
, tmp
, prev_sccs
);
1520 pos
= order_nodes_in_scc (g
, prev_sccs
, tmp
, node_order
, pos
);
1521 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1524 /* Handle the remaining nodes that do not belong to any scc. Each call
1525 to order_nodes_in_scc handles a single connected component. */
1526 while (pos
< g
->num_nodes
)
1528 sbitmap_difference (tmp
, ones
, prev_sccs
);
1529 pos
= order_nodes_in_scc (g
, prev_sccs
, tmp
, node_order
, pos
);
1531 sbitmap_free (prev_sccs
);
1532 sbitmap_free (on_path
);
1534 sbitmap_free (ones
);
1537 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1538 static struct node_order_params
*
1539 calculate_order_params (ddg_ptr g
, int mii ATTRIBUTE_UNUSED
)
1543 int num_nodes
= g
->num_nodes
;
1545 /* Allocate a place to hold ordering params for each node in the DDG. */
1546 nopa node_order_params_arr
;
1548 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1549 node_order_params_arr
= (nopa
) xcalloc (num_nodes
,
1550 sizeof (struct node_order_params
));
1552 /* Set the aux pointer of each node to point to its order_params structure. */
1553 for (u
= 0; u
< num_nodes
; u
++)
1554 g
->nodes
[u
].aux
.info
= &node_order_params_arr
[u
];
1556 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1557 calculate ASAP, ALAP, mobility, distance, and height for each node
1558 in the dependence (direct acyclic) graph. */
1560 /* We assume that the nodes in the array are in topological order. */
1563 for (u
= 0; u
< num_nodes
; u
++)
1565 ddg_node_ptr u_node
= &g
->nodes
[u
];
1568 for (e
= u_node
->in
; e
; e
= e
->next_in
)
1569 if (e
->distance
== 0)
1570 ASAP (u_node
) = MAX (ASAP (u_node
),
1571 ASAP (e
->src
) + e
->latency
);
1572 max_asap
= MAX (max_asap
, ASAP (u_node
));
1575 for (u
= num_nodes
- 1; u
> -1; u
--)
1577 ddg_node_ptr u_node
= &g
->nodes
[u
];
1579 ALAP (u_node
) = max_asap
;
1580 HEIGHT (u_node
) = 0;
1581 for (e
= u_node
->out
; e
; e
= e
->next_out
)
1582 if (e
->distance
== 0)
1584 ALAP (u_node
) = MIN (ALAP (u_node
),
1585 ALAP (e
->dest
) - e
->latency
);
1586 HEIGHT (u_node
) = MAX (HEIGHT (u_node
),
1587 HEIGHT (e
->dest
) + e
->latency
);
1591 return node_order_params_arr
;
1595 find_max_asap (ddg_ptr g
, sbitmap nodes
)
1601 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
,
1603 ddg_node_ptr u_node
= &g
->nodes
[u
];
1605 if (max_asap
< ASAP (u_node
))
1607 max_asap
= ASAP (u_node
);
1615 find_max_hv_min_mob (ddg_ptr g
, sbitmap nodes
)
1619 int min_mob
= INT_MAX
;
1622 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
,
1624 ddg_node_ptr u_node
= &g
->nodes
[u
];
1626 if (max_hv
< HEIGHT (u_node
))
1628 max_hv
= HEIGHT (u_node
);
1629 min_mob
= MOB (u_node
);
1632 else if ((max_hv
== HEIGHT (u_node
))
1633 && (min_mob
> MOB (u_node
)))
1635 min_mob
= MOB (u_node
);
1643 find_max_dv_min_mob (ddg_ptr g
, sbitmap nodes
)
1647 int min_mob
= INT_MAX
;
1650 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
,
1652 ddg_node_ptr u_node
= &g
->nodes
[u
];
1654 if (max_dv
< DEPTH (u_node
))
1656 max_dv
= DEPTH (u_node
);
1657 min_mob
= MOB (u_node
);
1660 else if ((max_dv
== DEPTH (u_node
))
1661 && (min_mob
> MOB (u_node
)))
1663 min_mob
= MOB (u_node
);
1670 /* Places the nodes of SCC into the NODE_ORDER array starting
1671 at position POS, according to the SMS ordering algorithm.
1672 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1673 the NODE_ORDER array, starting from position zero. */
1675 order_nodes_in_scc (ddg_ptr g
, sbitmap nodes_ordered
, sbitmap scc
,
1676 int * node_order
, int pos
)
1678 enum sms_direction dir
;
1679 int num_nodes
= g
->num_nodes
;
1680 sbitmap workset
= sbitmap_alloc (num_nodes
);
1681 sbitmap tmp
= sbitmap_alloc (num_nodes
);
1682 sbitmap zero_bitmap
= sbitmap_alloc (num_nodes
);
1683 sbitmap predecessors
= sbitmap_alloc (num_nodes
);
1684 sbitmap successors
= sbitmap_alloc (num_nodes
);
1686 sbitmap_zero (predecessors
);
1687 find_predecessors (predecessors
, g
, nodes_ordered
);
1689 sbitmap_zero (successors
);
1690 find_successors (successors
, g
, nodes_ordered
);
1693 if (sbitmap_a_and_b_cg (tmp
, predecessors
, scc
))
1695 sbitmap_copy (workset
, tmp
);
1698 else if (sbitmap_a_and_b_cg (tmp
, successors
, scc
))
1700 sbitmap_copy (workset
, tmp
);
1707 sbitmap_zero (workset
);
1708 if ((u
= find_max_asap (g
, scc
)) >= 0)
1709 SET_BIT (workset
, u
);
1713 sbitmap_zero (zero_bitmap
);
1714 while (!sbitmap_equal (workset
, zero_bitmap
))
1717 ddg_node_ptr v_node
;
1718 sbitmap v_node_preds
;
1719 sbitmap v_node_succs
;
1723 while (!sbitmap_equal (workset
, zero_bitmap
))
1725 v
= find_max_hv_min_mob (g
, workset
);
1726 v_node
= &g
->nodes
[v
];
1727 node_order
[pos
++] = v
;
1728 v_node_succs
= NODE_SUCCESSORS (v_node
);
1729 sbitmap_a_and_b (tmp
, v_node_succs
, scc
);
1731 /* Don't consider the already ordered successors again. */
1732 sbitmap_difference (tmp
, tmp
, nodes_ordered
);
1733 sbitmap_a_or_b (workset
, workset
, tmp
);
1734 RESET_BIT (workset
, v
);
1735 SET_BIT (nodes_ordered
, v
);
1738 sbitmap_zero (predecessors
);
1739 find_predecessors (predecessors
, g
, nodes_ordered
);
1740 sbitmap_a_and_b (workset
, predecessors
, scc
);
1744 while (!sbitmap_equal (workset
, zero_bitmap
))
1746 v
= find_max_dv_min_mob (g
, workset
);
1747 v_node
= &g
->nodes
[v
];
1748 node_order
[pos
++] = v
;
1749 v_node_preds
= NODE_PREDECESSORS (v_node
);
1750 sbitmap_a_and_b (tmp
, v_node_preds
, scc
);
1752 /* Don't consider the already ordered predecessors again. */
1753 sbitmap_difference (tmp
, tmp
, nodes_ordered
);
1754 sbitmap_a_or_b (workset
, workset
, tmp
);
1755 RESET_BIT (workset
, v
);
1756 SET_BIT (nodes_ordered
, v
);
1759 sbitmap_zero (successors
);
1760 find_successors (successors
, g
, nodes_ordered
);
1761 sbitmap_a_and_b (workset
, successors
, scc
);
1765 sbitmap_free (workset
);
1766 sbitmap_free (zero_bitmap
);
1767 sbitmap_free (predecessors
);
1768 sbitmap_free (successors
);
1773 /* This page contains functions for manipulating partial-schedules during
1774 modulo scheduling. */
1776 /* Create a partial schedule and allocate a memory to hold II rows. */
1777 static partial_schedule_ptr
1778 create_partial_schedule (int ii
, ddg_ptr g
, int history
)
1780 partial_schedule_ptr ps
= (partial_schedule_ptr
)
1781 xmalloc (sizeof (struct partial_schedule
));
1782 ps
->rows
= (ps_insn_ptr
*) xcalloc (ii
, sizeof (ps_insn_ptr
));
1784 ps
->history
= history
;
1785 ps
->min_cycle
= INT_MAX
;
1786 ps
->max_cycle
= INT_MIN
;
1792 /* Free the PS_INSNs in rows array of the given partial schedule.
1793 ??? Consider caching the PS_INSN's. */
1795 free_ps_insns (partial_schedule_ptr ps
)
1799 for (i
= 0; i
< ps
->ii
; i
++)
1803 ps_insn_ptr ps_insn
= ps
->rows
[i
]->next_in_row
;
1806 ps
->rows
[i
] = ps_insn
;
1812 /* Free all the memory allocated to the partial schedule. */
1814 free_partial_schedule (partial_schedule_ptr ps
)
1823 /* Clear the rows array with its PS_INSNs, and create a new one with
1826 reset_partial_schedule (partial_schedule_ptr ps
, int new_ii
)
1831 if (new_ii
== ps
->ii
)
1833 ps
->rows
= (ps_insn_ptr
*) xrealloc (ps
->rows
, new_ii
1834 * sizeof (ps_insn_ptr
));
1835 memset (ps
->rows
, 0, new_ii
* sizeof (ps_insn_ptr
));
1837 ps
->min_cycle
= INT_MAX
;
1838 ps
->max_cycle
= INT_MIN
;
1841 /* Prints the partial schedule as an ii rows array, for each rows
1842 print the ids of the insns in it. */
1844 print_partial_schedule (partial_schedule_ptr ps
, FILE *dump
)
1848 for (i
= 0; i
< ps
->ii
; i
++)
1850 ps_insn_ptr ps_i
= ps
->rows
[i
];
1852 fprintf (dump
, "\n[CYCLE %d ]: ", i
);
1855 fprintf (dump
, "%d, ",
1856 INSN_UID (ps_i
->node
->insn
));
1857 ps_i
= ps_i
->next_in_row
;
1862 /* Creates an object of PS_INSN and initializes it to the given parameters. */
1864 create_ps_insn (ddg_node_ptr node
, int rest_count
, int cycle
)
1866 ps_insn_ptr ps_i
= xmalloc (sizeof (struct ps_insn
));
1869 ps_i
->next_in_row
= NULL
;
1870 ps_i
->prev_in_row
= NULL
;
1871 ps_i
->row_rest_count
= rest_count
;
1872 ps_i
->cycle
= cycle
;
1878 /* Removes the given PS_INSN from the partial schedule. Returns false if the
1879 node is not found in the partial schedule, else returns true. */
1881 remove_node_from_ps (partial_schedule_ptr ps
, ps_insn_ptr ps_i
)
1888 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
1889 if (! ps_i
->prev_in_row
)
1891 if (ps_i
!= ps
->rows
[row
])
1894 ps
->rows
[row
] = ps_i
->next_in_row
;
1896 ps
->rows
[row
]->prev_in_row
= NULL
;
1900 ps_i
->prev_in_row
->next_in_row
= ps_i
->next_in_row
;
1901 if (ps_i
->next_in_row
)
1902 ps_i
->next_in_row
->prev_in_row
= ps_i
->prev_in_row
;
1908 /* Unlike what literature describes for modulo scheduling (which focuses
1909 on VLIW machines) the order of the instructions inside a cycle is
1910 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
1911 where the current instruction should go relative to the already
1912 scheduled instructions in the given cycle. Go over these
1913 instructions and find the first possible column to put it in. */
1915 ps_insn_find_column (partial_schedule_ptr ps
, ps_insn_ptr ps_i
,
1916 sbitmap must_precede
, sbitmap must_follow
)
1918 ps_insn_ptr next_ps_i
;
1919 ps_insn_ptr first_must_follow
= NULL
;
1920 ps_insn_ptr last_must_precede
= NULL
;
1926 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
1928 /* Find the first must follow and the last must precede
1929 and insert the node immediately after the must precede
1930 but make sure that it there is no must follow after it. */
1931 for (next_ps_i
= ps
->rows
[row
];
1933 next_ps_i
= next_ps_i
->next_in_row
)
1935 if (TEST_BIT (must_follow
, next_ps_i
->node
->cuid
)
1936 && ! first_must_follow
)
1937 first_must_follow
= next_ps_i
;
1938 if (TEST_BIT (must_precede
, next_ps_i
->node
->cuid
))
1940 /* If we have already met a node that must follow, then
1941 there is no possible column. */
1942 if (first_must_follow
)
1945 last_must_precede
= next_ps_i
;
1949 /* Now insert the node after INSERT_AFTER_PSI. */
1951 if (! last_must_precede
)
1953 ps_i
->next_in_row
= ps
->rows
[row
];
1954 ps_i
->prev_in_row
= NULL
;
1955 if (ps_i
->next_in_row
)
1956 ps_i
->next_in_row
->prev_in_row
= ps_i
;
1957 ps
->rows
[row
] = ps_i
;
1961 ps_i
->next_in_row
= last_must_precede
->next_in_row
;
1962 last_must_precede
->next_in_row
= ps_i
;
1963 ps_i
->prev_in_row
= last_must_precede
;
1964 if (ps_i
->next_in_row
)
1965 ps_i
->next_in_row
->prev_in_row
= ps_i
;
1971 /* Advances the PS_INSN one column in its current row; returns false
1972 in failure and true in success. Bit N is set in MUST_FOLLOW if
1973 the node with cuid N must be come after the node pointed to by
1974 PS_I when scheduled in the same cycle. */
1976 ps_insn_advance_column (partial_schedule_ptr ps
, ps_insn_ptr ps_i
,
1977 sbitmap must_follow
)
1979 ps_insn_ptr prev
, next
;
1981 ddg_node_ptr next_node
;
1986 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
1988 if (! ps_i
->next_in_row
)
1991 next_node
= ps_i
->next_in_row
->node
;
1993 /* Check if next_in_row is dependent on ps_i, both having same sched
1994 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
1995 if (TEST_BIT (must_follow
, next_node
->cuid
))
1998 /* Advance PS_I over its next_in_row in the doubly linked list. */
1999 prev
= ps_i
->prev_in_row
;
2000 next
= ps_i
->next_in_row
;
2002 if (ps_i
== ps
->rows
[row
])
2003 ps
->rows
[row
] = next
;
2005 ps_i
->next_in_row
= next
->next_in_row
;
2007 if (next
->next_in_row
)
2008 next
->next_in_row
->prev_in_row
= ps_i
;
2010 next
->next_in_row
= ps_i
;
2011 ps_i
->prev_in_row
= next
;
2013 next
->prev_in_row
= prev
;
2015 prev
->next_in_row
= next
;
2020 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2021 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2022 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2023 before/after (respectively) the node pointed to by PS_I when scheduled
2024 in the same cycle. */
2026 add_node_to_ps (partial_schedule_ptr ps
, ddg_node_ptr node
, int cycle
,
2027 sbitmap must_precede
, sbitmap must_follow
)
2031 int row
= SMODULO (cycle
, ps
->ii
);
2034 && ps
->rows
[row
]->row_rest_count
>= issue_rate
)
2038 rest_count
+= ps
->rows
[row
]->row_rest_count
;
2040 ps_i
= create_ps_insn (node
, rest_count
, cycle
);
2042 /* Finds and inserts PS_I according to MUST_FOLLOW and
2044 if (! ps_insn_find_column (ps
, ps_i
, must_precede
, must_follow
))
2053 /* Advance time one cycle. Assumes DFA is being used. */
2055 advance_one_cycle (void)
2057 if (targetm
.sched
.dfa_pre_cycle_insn
)
2058 state_transition (curr_state
,
2059 (*targetm
.sched
.dfa_pre_cycle_insn
) ());
2061 state_transition (curr_state
, NULL
);
2063 if (targetm
.sched
.dfa_post_cycle_insn
)
2064 state_transition (curr_state
,
2065 (*targetm
.sched
.dfa_post_cycle_insn
) ());
2068 /* Checks if PS has resource conflicts according to DFA, starting from
2069 FROM cycle to TO cycle; returns true if there are conflicts and false
2070 if there are no conflicts. Assumes DFA is being used. */
2072 ps_has_conflicts (partial_schedule_ptr ps
, int from
, int to
)
2076 state_reset (curr_state
);
2078 for (cycle
= from
; cycle
<= to
; cycle
++)
2080 ps_insn_ptr crr_insn
;
2081 /* Holds the remaining issue slots in the current row. */
2082 int can_issue_more
= issue_rate
;
2084 /* Walk through the DFA for the current row. */
2085 for (crr_insn
= ps
->rows
[SMODULO (cycle
, ps
->ii
)];
2087 crr_insn
= crr_insn
->next_in_row
)
2089 rtx insn
= crr_insn
->node
->insn
;
2094 /* Check if there is room for the current insn. */
2095 if (!can_issue_more
|| state_dead_lock_p (curr_state
))
2098 /* Update the DFA state and return with failure if the DFA found
2099 recource conflicts. */
2100 if (state_transition (curr_state
, insn
) >= 0)
2103 if (targetm
.sched
.variable_issue
)
2105 (*targetm
.sched
.variable_issue
) (sched_dump
, sched_verbose
,
2106 insn
, can_issue_more
);
2107 /* A naked CLOBBER or USE generates no instruction, so don't
2108 let them consume issue slots. */
2109 else if (GET_CODE (PATTERN (insn
)) != USE
2110 && GET_CODE (PATTERN (insn
)) != CLOBBER
)
2114 /* Advance the DFA to the next cycle. */
2115 advance_one_cycle ();
2120 /* Checks if the given node causes resource conflicts when added to PS at
2121 cycle C. If not the node is added to PS and returned; otherwise zero
2122 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2123 cuid N must be come before/after (respectively) the node pointed to by
2124 PS_I when scheduled in the same cycle. */
2126 ps_add_node_check_conflicts (partial_schedule_ptr ps
, ddg_node_ptr n
,
2127 int c
, sbitmap must_precede
,
2128 sbitmap must_follow
)
2130 int has_conflicts
= 0;
2133 /* First add the node to the PS, if this succeeds check for
2134 conflicts, trying different issue slots in the same row. */
2135 if (! (ps_i
= add_node_to_ps (ps
, n
, c
, must_precede
, must_follow
)))
2136 return NULL
; /* Failed to insert the node at the given cycle. */
2138 has_conflicts
= ps_has_conflicts (ps
, c
, c
)
2140 && ps_has_conflicts (ps
,
2144 /* Try different issue slots to find one that the given node can be
2145 scheduled in without conflicts. */
2146 while (has_conflicts
)
2148 if (! ps_insn_advance_column (ps
, ps_i
, must_follow
))
2150 has_conflicts
= ps_has_conflicts (ps
, c
, c
)
2152 && ps_has_conflicts (ps
,
2159 remove_node_from_ps (ps
, ps_i
);
2163 ps
->min_cycle
= MIN (ps
->min_cycle
, c
);
2164 ps
->max_cycle
= MAX (ps
->max_cycle
, c
);
2168 /* Rotate the rows of PS such that insns scheduled at time
2169 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2171 rotate_partial_schedule (partial_schedule_ptr ps
, int start_cycle
)
2173 int i
, row
, backward_rotates
;
2174 int last_row
= ps
->ii
- 1;
2176 if (start_cycle
== 0)
2179 backward_rotates
= SMODULO (start_cycle
, ps
->ii
);
2181 /* Revisit later and optimize this into a single loop. */
2182 for (i
= 0; i
< backward_rotates
; i
++)
2184 ps_insn_ptr first_row
= ps
->rows
[0];
2186 for (row
= 0; row
< last_row
; row
++)
2187 ps
->rows
[row
] = ps
->rows
[row
+1];
2189 ps
->rows
[last_row
] = first_row
;
2192 ps
->max_cycle
-= start_cycle
;
2193 ps
->min_cycle
-= start_cycle
;
2196 #endif /* INSN_SCHEDULING*/