1 (* Emission of the core of the Cortex-A8 NEON scheduling description.
2 Copyright (C) 2007 Free Software Foundation, Inc.
3 Contributed by CodeSourcery.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>.
22 (* This scheduling description generator works as follows.
23 - Each group of instructions has source and destination requirements
24 specified. The source requirements may be specified using
25 Source (the stage at which all source operands not otherwise
26 described are read), Source_m (the stage at which Rm operands are
27 read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
28 - For each group of instructions the earliest stage where a source
29 operand may be required is calculated.
30 - Each group of instructions is selected in turn as a producer.
31 The latencies between this group and every other group are then
32 calculated, yielding up to four values for each combination:
33 1. Producer -> consumer Rn latency
34 2. Producer -> consumer Rm latency
35 3. Producer -> consumer Rd (as a source) latency
36 4. Producer -> consumer worst-case latency.
37 Value 4 is calculated from the destination availability requirements
38 of the consumer and the earliest source availability requirements
40 - The largest Value 4 calculated for the current producer is the
41 worse-case latency, L, for that instruction group. This value is written
42 out in a define_insn_reservation for the producer group.
43 - For each producer and consumer pair, the latencies calculated above
44 are collated. The average (of up to four values) is calculated and
45 if this average is different from the worst-case latency, an
46 unguarded define_bypass construction is issued for that pair.
47 (For each pair only one define_bypass construction will be emitted,
48 and at present we do not emit specific guards.)
51 let find_with_result fn lst
=
52 let rec scan = function
60 let n1 = 1 and n2
= 2 and n3
= 3 and n4
= 4 and n5
= 5 and n6
= 6
61 and n7
= 7 and n8
= 8 and n9
= 9
63 type availability
= Source
of int
68 | Dest_n_after
of int * int
70 type guard
= Guard_none
| Guard_only_m
| Guard_only_n
| Guard_only_d
72 (* Reservation behaviors. All but the last row here correspond to one
73 pipeline each. Each constructor will correspond to one
74 define_reservation. *)
76 Mul
| Mul_2cycle
| Mul_4cycle
77 | Shift
| Shift_2cycle
84 | Fmul_then_fadd
| Fmul_then_fadd_2
86 (* This table must be kept as short as possible by conflating
87 entries with the same availability behavior.
89 First components: instruction group names
90 Second components: availability requirements, in the order in which
91 they should appear in the comments in the .md file.
92 Third components: reservation info
94 let availability_table = [
95 (* NEON integer ALU instructions. *)
96 (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
97 veor vbic vorn ddd qqq *)
98 "neon_int_1", [Source n2
; Dest n3
], ALU
;
99 (* vadd vsub qqd vsub ddd qqq *)
100 "neon_int_2", [Source_m
n1; Source_n n2
; Dest n3
], ALU
;
101 (* vsum vneg dd qq vadd vsub qdd *)
102 "neon_int_3", [Source
n1; Dest n3
], ALU
;
103 (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
104 (* vhadd vrhadd vqadd vtst ddd qqq *)
105 "neon_int_4", [Source n2
; Dest n4
], ALU
;
106 (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
107 "neon_int_5", [Source_m
n1; Source_n n2
; Dest n4
], ALU
;
108 (* vqneg vqabs dd qq *)
109 "neon_vqneg_vqabs", [Source
n1; Dest n4
], ALU
;
111 "neon_vmov", [Dest n3
], ALU
;
113 "neon_vaba", [Source_n n2
; Source_m
n1; Source_d n3
; Dest n6
], ALU
;
115 [Source_n n2
; Source_m
n1; Source_d n3
; Dest_n_after
(1, n6
)], ALU_2cycle
;
117 "neon_vsma", [Source_m
n1; Source_d n3
; Dest n6
], ALU
;
119 (* NEON integer multiply instructions. *)
120 (* vmul, vqdmlh, vqrdmlh *)
121 (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
122 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2
; Dest n6
], Mul
;
123 "neon_mul_qqq_8_16_32_ddd_32", [Source n2
; Dest_n_after
(1, n6
)], Mul_2cycle
;
124 (* vmul, vqdmul again *)
125 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
126 [Source_n n2
; Source_m
n1; Dest_n_after
(1, n6
)], Mul_2cycle
;
128 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
129 [Source_n n2
; Source_m n2
; Source_d n3
; Dest n6
], Mul
;
131 [Source_n n2
; Source_m n2
; Source_d n3
; Dest_n_after
(1, n6
)], Mul_2cycle
;
132 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
133 [Source_n n2
; Source_m
n1; Source_d n3
; Dest_n_after
(1, n6
)], Mul_2cycle
;
134 "neon_mla_qqq_32_qqd_32_scalar",
135 [Source_n n2
; Source_m
n1; Source_d n3
; Dest_n_after
(3, n6
)], Mul_4cycle
;
136 (* vmul, vqdmulh, vqrdmulh *)
138 "neon_mul_ddd_16_scalar_32_16_long_scalar",
139 [Source_n n2
; Source_m
n1; Dest n6
], Mul
;
140 "neon_mul_qqd_32_scalar",
141 [Source_n n2
; Source_m
n1; Dest_n_after
(3, n6
)], Mul_4cycle
;
143 (* vmla, vmla, vqdmla, vqdmls *)
144 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
145 [Source_n n2
; Source_m
n1; Source_d n3
; Dest n6
], Mul
;
147 (* NEON integer shift instructions. *)
148 (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
149 "neon_shift_1", [Source
n1; Dest n3
], Shift
;
150 (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
151 vqshl_vrshl_vqrshl_ddd *)
152 "neon_shift_2", [Source
n1; Dest n4
], Shift
;
153 (* vsli, vsri and vshl for qqq *)
154 "neon_shift_3", [Source
n1; Dest_n_after
(1, n3
)], Shift_2cycle
;
155 "neon_vshl_ddd", [Source
n1; Dest
n1], Shift
;
156 "neon_vqshl_vrshl_vqrshl_qqq", [Source
n1; Dest_n_after
(1, n4
)],
158 "neon_vsra_vrsra", [Source_m
n1; Source_d n3
; Dest n6
], Shift
;
160 (* NEON floating-point instructions. *)
161 (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
162 (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
163 "neon_fp_vadd_ddd_vabs_dd", [Source n2
; Dest n5
], Fadd
;
164 "neon_fp_vadd_qqq_vabs_qq", [Source n2
; Dest_n_after
(1, n5
)],
166 (* vsum, fvmx, vfmn *)
167 "neon_fp_vsum", [Source
n1; Dest n5
], Fadd
;
168 "neon_fp_vmul_ddd", [Source_n n2
; Source_m
n1; Dest n5
], Fmul
;
169 "neon_fp_vmul_qqd", [Source_n n2
; Source_m
n1; Dest_n_after
(1, n5
)],
173 [Source_n n2
; Source_m n2
; Source_d n3
; Dest n9
], Fmul_then_fadd
;
175 [Source_n n2
; Source_m n2
; Source_d n3
; Dest_n_after
(1, n9
)],
177 "neon_fp_vmla_ddd_scalar",
178 [Source_n n2
; Source_m
n1; Source_d n3
; Dest n9
], Fmul_then_fadd
;
179 "neon_fp_vmla_qqq_scalar",
180 [Source_n n2
; Source_m
n1; Source_d n3
; Dest_n_after
(1, n9
)],
182 "neon_fp_vrecps_vrsqrts_ddd", [Source n2
; Dest n9
], Fmul_then_fadd
;
183 "neon_fp_vrecps_vrsqrts_qqq", [Source n2
; Dest_n_after
(1, n9
)],
186 (* NEON byte permute instructions. *)
187 (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
188 "neon_bp_simple", [Source
n1; Dest n2
], Permute
1;
189 (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
190 similarly for vtbx *)
191 "neon_bp_2cycle", [Source
n1; Dest_n_after
(1, n2
)], Permute
2;
193 "neon_bp_3cycle", [Source
n1; Dest_n_after
(2, n2
)], Permute
3;
195 (* NEON load/store instructions. *)
196 "neon_ldr", [Dest
n1], Ls
1;
197 "neon_str", [Source
n1], Ls
1;
198 "neon_vld1_1_2_regs", [Dest_n_after
(1, n1)], Ls
2;
199 "neon_vld1_3_4_regs", [Dest_n_after
(2, n1)], Ls
3;
200 "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after
(1, n2
)], Ls
2;
201 "neon_vld2_4_regs", [Dest_n_after
(2, n2
)], Ls
3;
202 "neon_vld3_vld4", [Dest_n_after
(3, n2
)], Ls
4;
203 "neon_vst1_1_2_regs_vst2_2_regs", [Source
n1], Ls
2;
204 "neon_vst1_3_4_regs", [Source
n1], Ls
3;
205 "neon_vst2_4_regs_vst3_vst4", [Source
n1], Ls
4;
206 "neon_vst3_vst4", [Source
n1], Ls
4;
207 "neon_vld1_vld2_lane", [Source
n1; Dest_n_after
(2, n2
)], Ls
3;
208 "neon_vld3_vld4_lane", [Source
n1; Dest_n_after
(4, n2
)], Ls
5;
209 "neon_vst1_vst2_lane", [Source
n1], Ls
2;
210 "neon_vst3_vst4_lane", [Source
n1], Ls
3;
211 "neon_vld3_vld4_all_lanes", [Dest_n_after
(1, n2
)], Ls
3;
213 (* NEON register transfer instructions. *)
214 "neon_mcr", [Dest n2
], Permute
1;
215 "neon_mcr_2_mcrr", [Dest n2
], Permute
2;
216 (* MRC instructions are in the .tpl file. *)
219 (* Augment the tuples in the availability table with an extra component
220 that describes the earliest stage where a source operand may be
221 required. (It is also possible that an entry in the table has no
222 source requirements.) *)
223 let calculate_sources =
224 List.map
(fun (name
, avail
, res
) ->
227 (fun cur
-> fun info
->
235 | Some stage'
when stage
< stage'
-> Some stage
237 | _
-> cur
) None avail
239 (name
, avail
, res
, earliest_stage))
241 (* Find the stage, if any, at the end of which a group produces a result. *)
242 let find_dest (attr
, avail
, _
, _
) =
245 (fun av
-> match av
with
246 Dest st
-> Some
(Some st
)
247 | Dest_n_after
(after
, st
) -> Some
(Some
(after
+ st
))
249 with Not_found
-> None
251 (* Find the worst-case latency between a producer and a consumer. *)
252 let worst_case_latency producer
(_
, _
, _
, earliest_required
) =
253 let dest = find_dest producer
in
254 match earliest_required
, dest with
256 (* The consumer doesn't have any source requirements. *)
259 (* The producer doesn't produce any results (e.g. a store insn). *)
261 | Some consumed
, Some produced
-> Some
(produced
- consumed
+ 1)
263 (* Helper function for below. *)
264 let latency_calc f producer
(_
, avail
, _
, _
) =
266 let source_avail = find_with_result f avail
in
267 match find_dest producer
with
269 (* The producer does not produce a result. *)
272 let latency = produced
- source_avail + 1 in
273 (* Latencies below zero are raised to zero since we don't have
275 if latency < 0 then Some
0 else Some
latency
276 with Not_found
-> None
278 (* Find any Rm latency between a producer and a consumer. If no
279 Rm source requirement is explicitly specified for the consumer,
280 return "positive infinity". Also return "positive infinity" if
281 the latency matches the supplied worst-case latency for this
283 let get_m_latency producer consumer
=
284 match latency_calc (fun av
-> match av
with Source_m stage
-> Some stage
285 | _
-> None
) producer consumer
286 with None
-> [] | Some
latency -> [(Guard_only_m
, latency)]
288 (* Likewise for Rn. *)
289 let get_n_latency producer consumer
=
290 match latency_calc (fun av
-> match av
with Source_n stage
-> Some stage
291 | _
-> None
) producer consumer
292 with None
-> [] | Some
latency -> [(Guard_only_n
, latency)]
294 (* Likewise for Rd. *)
295 let get_d_latency producer consumer
=
297 latency_calc (fun av
-> match av
with Source_d stage
-> Some stage
298 | _
-> None
) producer consumer
299 with None
-> [] | Some
latency -> [(Guard_only_d
, latency)]
301 (* Given a producer and a consumer, work out the latency of the producer
302 to the consumer in each of the four cases (availability information
303 permitting) identified at the top of this file. Return the
304 consumer, the worst-case unguarded latency and any guarded latencies. *)
305 let calculate_latencies producer consumer
=
306 let worst = worst_case_latency producer consumer
in
307 let m_latency = get_m_latency producer consumer
in
308 let n_latency = get_n_latency producer consumer
in
309 let d_latency = get_d_latency producer consumer
in
310 (consumer
, worst, m_latency @ n_latency @ d_latency)
312 (* Helper function for below. *)
313 let pick_latency largest
worst guards
=
317 | Some
worst -> (Guard_none
, worst) :: guards
319 if List.length
guards = 0 then None
else
321 List.fold_left
(fun acc
-> fun (_
, latency) -> acc
+ latency) 0 guards
323 let average_latency = (float_of_int
total_latency) /.
324 (float_of_int
(List.length
guards)) in
325 let rounded_latency = int_of_float
(ceil
average_latency) in
326 if rounded_latency = largest
then None
327 else Some
(Guard_none
, rounded_latency)
329 (* Collate all bypasses for a particular producer as required in
330 worst_case_latencies_and_bypasses. (By this stage there is a maximum
331 of one bypass from this producer to any particular consumer listed
332 in LATENCIES.) Use a hash table to collate bypasses with the
333 same latency and guard. *)
334 let collate_bypasses (producer_name
, _
, _
, _
) largest latencies
=
335 let ht = Hashtbl.create
42 in
338 fun ((consumer
, _
, _
, _
), worst, guards) ->
339 (* Find out which latency to use. Ignoring latencies that match
340 the *overall* worst-case latency for this producer (which will
341 be in define_insn_reservation), we have to examine:
342 1. the latency with no guard between this producer and this
344 2. any guarded latency. *)
345 let guard_latency_opt = pick_latency largest
worst guards in
346 match guard_latency_opt with
348 | Some
(guard
, latency) ->
350 (if (try ignore
(Hashtbl.find
ht (guard
, latency)); false
351 with Not_found
-> true) then
352 keys := (guard
, latency) :: !keys);
353 Hashtbl.add
ht (guard
, latency) consumer
356 (* The hash table now has bypasses collated so that ones with the
357 same latency and guard have the same keys. Walk through all the
358 keys, extract the associated bypasses, and concatenate the names
359 of the consumers for each bypass. *)
361 fun ((guard
, latency) as key
) ->
362 let consumers = Hashtbl.find_all
ht key
in
364 String.concat
",\\\n " consumers,
369 (* For every producer, find the worst-case latency between it and
370 *any* consumer. Also determine (if such a thing exists) the
371 lowest-latency bypass from each producer to each consumer. Group
372 the output in such a way that all bypasses with the same producer
373 and latency are together, and so that bypasses with the worst-case
374 latency are ignored. *)
375 let worst_case_latencies_and_bypasses =
376 let rec f (worst_acc
, bypasses_acc
) prev xs
=
378 [] -> (worst_acc
, bypasses_acc
)
379 | ((producer_name
, producer_avail
, res_string
, _
) as producer
)::next
->
380 (* For this particular producer, work out the latencies between
381 it and every consumer. *)
383 List.fold_left
(fun acc
-> fun consumer
->
384 (calculate_latencies producer consumer
) :: acc
)
387 (* Now work out what the overall worst case latency was for this
388 particular producer. *)
392 let comp_fn (_
, l1
, _
) (_
, l2
, _
) =
393 if l1
> l2
then -1 else if l1
= l2
then 0 else 1
396 match List.hd
(List.sort
comp_fn latencies) with
397 (_
, None
, _
) -> 0 (* Producer has no consumers. *)
398 | (_
, Some
worst, _
) -> worst
400 (* Having got the largest latency, collect all bypasses for
401 this producer and filter out those with that larger
402 latency. Record the others for later emission. *)
403 let bypasses = collate_bypasses producer
largest latencies in
404 (* Go on to process remaining producers, having noted
405 the result for this one. *)
406 f ((producer_name
, producer_avail
, largest,
407 res_string
) :: worst_acc
,
408 bypasses @ bypasses_acc
)
409 (prev
@ [producer
]) next
413 (* Emit a helpful comment for a define_insn_reservation. *)
414 let write_comment producer avail
=
415 let seen_source = ref false in
417 let read = if !seen_source then "" else "read " in
421 Printf.printf
"%stheir source operands at N%d" read stage
424 Printf.printf
"%stheir (D|Q)n operands at N%d" read stage
427 Printf.printf
"%stheir (D|Q)m operands at N%d" read stage
429 Printf.printf
"%stheir (D|Q)d operands at N%d" read stage
431 Printf.printf
"produce a result at N%d" stage
432 | Dest_n_after
(after
, stage
) ->
433 Printf.printf
"produce a result at N%d on cycle %d" stage
(after
+ 1)
435 Printf.printf
";; Instructions using this reservation ";
437 let sep = if x
mod 2 = 1 then "" else "\n;;" in
440 | [info
] -> describe info
; Printf.printf
".\n"
441 | info
::(_
::[] as infos
) ->
442 describe info
; Printf.printf
", and%s " sep; f infos
(x
+1)
443 | info
::infos
-> describe info
; Printf.printf
",%s " sep; f infos
(x
+1)
447 (* Emit a define_insn_reservation for each producer. The latency
448 written in will be its worst-case latency. *)
449 let emit_insn_reservations =
451 fun (producer
, avail
, latency, reservation
) ->
452 write_comment producer avail
;
453 Printf.printf
"(define_insn_reservation \"%s\" %d\n" producer
latency;
454 Printf.printf
" (and (eq_attr \"tune\" \"cortexa8\")\n";
455 Printf.printf
" (eq_attr \"neon_type\" \"%s\"))\n" producer
;
457 match reservation
with
458 Mul
-> "dp" | Mul_2cycle
-> "dp_2" | Mul_4cycle
-> "dp_4"
459 | Shift
-> "dp" | Shift_2cycle
-> "dp_2"
460 | ALU
-> "dp" | ALU_2cycle
-> "dp_2"
461 | Fmul
-> "dp" | Fmul_2cycle
-> "dp_2"
462 | Fadd
-> "fadd" | Fadd_2cycle
-> "fadd_2"
464 | Ls n
-> "ls_" ^
(string_of_int n
)
465 | Permute
1 -> "perm"
466 | Permute n
-> "perm_" ^
(string_of_int n
)
467 | Fmul_then_fadd
-> "fmul_then_fadd"
468 | Fmul_then_fadd_2
-> "fmul_then_fadd_2"
470 Printf.printf
" \"cortex_a8_neon_%s\")\n\n" str
473 (* Given a guard description, return the name of the C function to
474 be used as the guard for define_bypass. *)
477 Guard_only_m
-> "arm_neon_only_m_dependency"
478 | Guard_only_n
-> "arm_neon_only_n_dependency"
479 | Guard_only_d
-> "arm_neon_only_d_dependency"
480 | Guard_none
-> assert false
482 (* Emit a define_bypass for each bypass. *)
485 fun (producer
, consumers, latency, guard
) ->
486 Printf.printf
"(define_bypass %d \"%s\"\n" latency producer
;
487 if guard
= Guard_none
then
488 Printf.printf
" \"%s\")\n\n" consumers
491 Printf.printf
" \"%s\"\n" consumers;
492 Printf.printf
" \"%s\")\n\n" (guard_fn guard
)
496 (* Program entry point. *)
498 let table = calculate_sources availability_table in
499 let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
500 emit_insn_reservations (List.rev
worst_cases);
501 Printf.printf
";; Exceptions to the default latencies.\n\n";
502 emit_bypasses bypasses