1 (* Emission of the core of the Cortex-A8 NEON scheduling description.
2 Copyright (C) 2007 Free Software Foundation, Inc.
3 Contributed by CodeSourcery.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
23 (* This scheduling description generator works as follows.
24 - Each group of instructions has source and destination requirements
25 specified. The source requirements may be specified using
26 Source (the stage at which all source operands not otherwise
27 described are read), Source_m (the stage at which Rm operands are
28 read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
29 - For each group of instructions the earliest stage where a source
30 operand may be required is calculated.
31 - Each group of instructions is selected in turn as a producer.
32 The latencies between this group and every other group are then
33 calculated, yielding up to four values for each combination:
34 1. Producer -> consumer Rn latency
35 2. Producer -> consumer Rm latency
36 3. Producer -> consumer Rd (as a source) latency
37 4. Producer -> consumer worst-case latency.
38 Value 4 is calculated from the destination availability requirements
39 of the consumer and the earliest source availability requirements
41 - The largest Value 4 calculated for the current producer is the
42 worse-case latency, L, for that instruction group. This value is written
43 out in a define_insn_reservation for the producer group.
44 - For each producer and consumer pair, the latencies calculated above
45 are collated. The average (of up to four values) is calculated and
46 if this average is different from the worst-case latency, an
47 unguarded define_bypass construction is issued for that pair.
48 (For each pair only one define_bypass construction will be emitted,
49 and at present we do not emit specific guards.)
54 let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
55 and n7 = 7 and n8 = 8 and n9 = 9
57 type availability = Source of int
62 | Dest_n_after of int * int
64 type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
66 (* Reservation behaviours. All but the last row here correspond to one
67 pipeline each. Each constructor will correspond to one
68 define_reservation. *)
70 Mul | Mul_2cycle | Mul_4cycle
71 | Shift | Shift_2cycle
78 | Fmul_then_fadd | Fmul_then_fadd_2
80 (* This table must be kept as short as possible by conflating
81 entries with the same availability behaviour.
83 First components: instruction group names
84 Second components: availability requirements, in the order in which
85 they should appear in the comments in the .md file.
86 Third components: reservation info
88 let availability_table = [
89 (* NEON integer ALU instructions. *)
90 (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
91 veor vbic vorn ddd qqq *)
92 "neon_int_1", [Source n2; Dest n3], ALU;
93 (* vadd vsub qqd vsub ddd qqq *)
94 "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
95 (* vsum vneg dd qq vadd vsub qdd *)
96 "neon_int_3", [Source n1; Dest n3], ALU;
97 (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
98 (* vhadd vrhadd vqadd vtst ddd qqq *)
99 "neon_int_4", [Source n2; Dest n4], ALU;
100 (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
101 "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
102 (* vqneg vqabs dd qq *)
103 "neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
105 "neon_vmov", [Dest n3], ALU;
107 "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
109 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
111 "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
113 (* NEON integer multiply instructions. *)
114 (* vmul, vqdmlh, vqrdmlh *)
115 (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
116 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
117 "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
118 (* vmul, vqdmul again *)
119 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
120 [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
122 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
123 [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
125 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
126 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
127 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
128 "neon_mla_qqq_32_qqd_32_scalar",
129 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
130 (* vmul, vqdmulh, vqrdmulh *)
132 "neon_mul_ddd_16_scalar_32_16_long_scalar",
133 [Source_n n2; Source_m n1; Dest n6], Mul;
134 "neon_mul_qqd_32_scalar",
135 [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
137 (* vmla, vmla, vqdmla, vqdmls *)
138 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
139 [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
141 (* NEON integer shift instructions. *)
142 (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
143 "neon_shift_1", [Source n1; Dest n3], Shift;
144 (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
145 vqshl_vrshl_vqrshl_ddd *)
146 "neon_shift_2", [Source n1; Dest n4], Shift;
147 (* vsli, vsri and vshl for qqq *)
148 "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
149 "neon_vshl_ddd", [Source n1; Dest n1], Shift;
150 "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
152 "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
154 (* NEON floating-point instructions. *)
155 (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
156 (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
157 "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
158 "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
160 (* vsum, fvmx, vfmn *)
161 "neon_fp_vsum", [Source n1; Dest n5], Fadd;
162 "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
163 "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
167 [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
169 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
171 "neon_fp_vmla_ddd_scalar",
172 [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
173 "neon_fp_vmla_qqq_scalar",
174 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
176 "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
177 "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
180 (* NEON byte permute instructions. *)
181 (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
182 "neon_bp_simple", [Source n1; Dest n2], Permute 1;
183 (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
184 similarly for vtbx *)
185 "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
187 "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
189 (* NEON load/store instructions. *)
190 "neon_ldr", [Dest n1], Ls 1;
191 "neon_str", [Source n1], Ls 1;
192 "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
193 "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
194 "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
195 "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
196 "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
197 "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
198 "neon_vst1_3_4_regs", [Source n1], Ls 3;
199 "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
200 "neon_vst3_vst4", [Source n1], Ls 4;
201 "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
202 "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
203 "neon_vst1_vst2_lane", [Source n1], Ls 2;
204 "neon_vst3_vst4_lane", [Source n1], Ls 3;
205 "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
207 (* NEON register transfer instructions. *)
208 "neon_mcr", [Dest n2], Permute 1;
209 "neon_mcr_2_mcrr", [Dest n2], Permute 2;
210 (* MRC instructions are in the .tpl file. *)
213 (* Augment the tuples in the availability table with an extra component
214 that describes the earliest stage where a source operand may be
215 required. (It is also possible that an entry in the table has no
216 source requirements.) *)
217 let calculate_sources =
218 List.map (fun (name, avail, res) ->
221 (fun cur -> fun info ->
229 | Some stage' when stage < stage' -> Some stage
231 | _ -> cur) None avail
233 (name, avail, res, earliest_stage))
235 (* Find the stage, if any, at the end of which a group produces a result. *)
236 let find_dest (attr, avail, _, _) =
239 (fun av -> match av with
240 Dest st -> Some (Some st)
241 | Dest_n_after (after, st) -> Some (Some (after + st))
243 with Not_found -> None
245 (* Find the worst-case latency between a producer and a consumer. *)
246 let worst_case_latency producer (_, _, _, earliest_required) =
247 let dest = find_dest producer in
248 match earliest_required, dest with
250 (* The consumer doesn't have any source requirements. *)
253 (* The producer doesn't produce any results (e.g. a store insn). *)
255 | Some consumed, Some produced -> Some (produced - consumed + 1)
257 (* Helper function for below. *)
258 let latency_calc f producer (_, avail, _, _) =
260 let source_avail = find_with_result f avail in
261 match find_dest producer with
263 (* The producer does not produce a result. *)
266 let latency = produced - source_avail + 1 in
267 (* Latencies below zero are raised to zero since we don't have
269 if latency < 0 then Some 0 else Some latency
270 with Not_found -> None
272 (* Find any Rm latency between a producer and a consumer. If no
273 Rm source requirement is explicitly specified for the consumer,
274 return "positive infinity". Also return "positive infinity" if
275 the latency matches the supplied worst-case latency for this
277 let get_m_latency producer consumer =
278 match latency_calc (fun av -> match av with Source_m stage -> Some stage
279 | _ -> None) producer consumer
280 with None -> [] | Some latency -> [(Guard_only_m, latency)]
282 (* Likewise for Rn. *)
283 let get_n_latency producer consumer =
284 match latency_calc (fun av -> match av with Source_n stage -> Some stage
285 | _ -> None) producer consumer
286 with None -> [] | Some latency -> [(Guard_only_n, latency)]
288 (* Likewise for Rd. *)
289 let get_d_latency producer consumer =
291 latency_calc (fun av -> match av with Source_d stage -> Some stage
292 | _ -> None) producer consumer
293 with None -> [] | Some latency -> [(Guard_only_d, latency)]
295 (* Given a producer and a consumer, work out the latency of the producer
296 to the consumer in each of the four cases (availability information
297 permitting) identified at the top of this file. Return the
298 consumer, the worst-case unguarded latency and any guarded latencies. *)
299 let calculate_latencies producer consumer =
300 let worst = worst_case_latency producer consumer in
301 let m_latency = get_m_latency producer consumer in
302 let n_latency = get_n_latency producer consumer in
303 let d_latency = get_d_latency producer consumer in
304 (consumer, worst, m_latency @ n_latency @ d_latency)
306 (* Helper function for below. *)
307 let pick_latency largest worst guards =
311 | Some worst -> (Guard_none, worst) :: guards
313 if List.length guards = 0 then None else
315 List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
317 let average_latency = (float_of_int total_latency) /.
318 (float_of_int (List.length guards)) in
319 let rounded_latency = int_of_float (ceil average_latency) in
320 if rounded_latency = largest then None
321 else Some (Guard_none, rounded_latency)
323 (* Collate all bypasses for a particular producer as required in
324 worst_case_latencies_and_bypasses. (By this stage there is a maximum
325 of one bypass from this producer to any particular consumer listed
326 in LATENCIES.) Use a hash table to collate bypasses with the
327 same latency and guard. *)
328 let collate_bypasses (producer_name, _, _, _) largest latencies =
329 let ht = Hashtbl.create 42 in
332 fun ((consumer, _, _, _), worst, guards) ->
333 (* Find out which latency to use. Ignoring latencies that match
334 the *overall* worst-case latency for this producer (which will
335 be in define_insn_reservation), we have to examine:
336 1. the latency with no guard between this producer and this
338 2. any guarded latency. *)
339 let guard_latency_opt = pick_latency largest worst guards in
340 match guard_latency_opt with
342 | Some (guard, latency) ->
344 (if (try ignore (Hashtbl.find ht (guard, latency)); false
345 with Not_found -> true) then
346 keys := (guard, latency) :: !keys);
347 Hashtbl.add ht (guard, latency) consumer
350 (* The hash table now has bypasses collated so that ones with the
351 same latency and guard have the same keys. Walk through all the
352 keys, extract the associated bypasses, and concatenate the names
353 of the consumers for each bypass. *)
355 fun ((guard, latency) as key) ->
356 let consumers = Hashtbl.find_all ht key in
358 String.concat ",\\\n " consumers,
363 (* For every producer, find the worst-case latency between it and
364 *any* consumer. Also determine (if such a thing exists) the
365 lowest-latency bypass from each producer to each consumer. Group
366 the output in such a way that all bypasses with the same producer
367 and latency are together, and so that bypasses with the worst-case
368 latency are ignored. *)
369 let worst_case_latencies_and_bypasses =
370 let rec f (worst_acc, bypasses_acc) prev xs =
372 [] -> (worst_acc, bypasses_acc)
373 | ((producer_name, producer_avail, res_string, _) as producer)::next ->
374 (* For this particular producer, work out the latencies between
375 it and every consumer. *)
377 List.fold_left (fun acc -> fun consumer ->
378 (calculate_latencies producer consumer) :: acc)
381 (* Now work out what the overall worst case latency was for this
382 particular producer. *)
386 let comp_fn (_, l1, _) (_, l2, _) =
387 if l1 > l2 then -1 else if l1 = l2 then 0 else 1
390 match List.hd (List.sort comp_fn latencies) with
391 (_, None, _) -> 0 (* Producer has no consumers. *)
392 | (_, Some worst, _) -> worst
394 (* Having got the largest latency, collect all bypasses for
395 this producer and filter out those with that larger
396 latency. Record the others for later emission. *)
397 let bypasses = collate_bypasses producer largest latencies in
398 (* Go on to process remaining producers, having noted
399 the result for this one. *)
400 f ((producer_name, producer_avail, largest,
401 res_string) :: worst_acc,
402 bypasses @ bypasses_acc)
403 (prev @ [producer]) next
407 (* Emit a helpful comment for a define_insn_reservation. *)
408 let write_comment producer avail =
409 let seen_source = ref false in
411 let read = if !seen_source then "" else "read " in
415 Printf.printf "%stheir source operands at N%d" read stage
418 Printf.printf "%stheir (D|Q)n operands at N%d" read stage
421 Printf.printf "%stheir (D|Q)m operands at N%d" read stage
423 Printf.printf "%stheir (D|Q)d operands at N%d" read stage
425 Printf.printf "produce a result at N%d" stage
426 | Dest_n_after (after, stage) ->
427 Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
429 Printf.printf ";; Instructions using this reservation ";
431 let sep = if x mod 2 = 1 then "" else "\n;;" in
434 | [info] -> describe info; Printf.printf ".\n"
435 | info::(_::[] as infos) ->
436 describe info; Printf.printf ", and%s " sep; f infos (x+1)
437 | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
441 (* Emit a define_insn_reservation for each producer. The latency
442 written in will be its worst-case latency. *)
443 let emit_insn_reservations =
445 fun (producer, avail, latency, reservation) ->
446 write_comment producer avail;
447 Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
448 Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
449 Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
451 match reservation with
452 Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
453 | Shift -> "dp" | Shift_2cycle -> "dp_2"
454 | ALU -> "dp" | ALU_2cycle -> "dp_2"
455 | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
456 | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
458 | Ls n -> "ls_" ^ (string_of_int n)
459 | Permute 1 -> "perm"
460 | Permute n -> "perm_" ^ (string_of_int n)
461 | Fmul_then_fadd -> "fmul_then_fadd"
462 | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
464 Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
467 (* Given a guard description, return the name of the C function to
468 be used as the guard for define_bypass. *)
471 Guard_only_m -> "arm_neon_only_m_dependency"
472 | Guard_only_n -> "arm_neon_only_n_dependency"
473 | Guard_only_d -> "arm_neon_only_d_dependency"
474 | Guard_none -> assert false
476 (* Emit a define_bypass for each bypass. *)
479 fun (producer, consumers, latency, guard) ->
480 Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
481 if guard = Guard_none then
482 Printf.printf " \"%s\")\n\n" consumers
485 Printf.printf " \"%s\"\n" consumers;
486 Printf.printf " \"%s\")\n\n" (guard_fn guard)
490 (* Program entry point. *)
492 let table = calculate_sources availability_table in
493 let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
494 emit_insn_reservations (List.rev worst_cases);
495 Printf.printf ";; Exceptions to the default latencies.\n\n";
496 emit_bypasses bypasses