1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
28 #include "tree-flow.h"
31 #include "tree-data-ref.h"
32 #include "diagnostic.h"
33 #include "tree-pass.h"
34 #include "tree-scalar-evolution.h"
36 #include "langhooks.h"
37 #include "tree-vectorizer.h"
39 /* This pass tries to distribute iterations of loops into several threads.
40 The implementation is straightforward -- for each loop we test whether its
41 iterations are independent, and if it is the case (and some additional
42 conditions regarding profitability and correctness are satisfied), we
43 add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
46 Most of the complexity is in bringing the code into the shape expected
48 -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
49 variable and that the exit test is at the start of the loop body
50 -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
51 variables by accesses through pointers, and breaking up ssa chains
52 by storing the values incoming to the parallelized loop to a structure
53 passed to the new function as an argument (something similar is done
54 in omp gimplification, unfortunately only a small part of the code
58 -- if there are several parallelizable loops in a function, it may be
59 possible to generate the threads just once (using synchronization to
60 ensure that cross-loop dependences are obeyed).
61 -- handling of common scalar dependence patterns (accumulation, ...)
62 -- handling of non-innermost loops */
66 currently we use vect_is_simple_reduction() to detect reduction patterns.
67 The code transformation will be introduced by an example.
74 for (i = 0; i < N; i++)
84 # sum_29 = PHI <sum_11(5), 1(3)>
85 # i_28 = PHI <i_12(5), 0(3)>
88 sum_11 = D.1795_8 + sum_29;
96 # sum_21 = PHI <sum_11(4)>
97 printf (&"%d"[0], sum_21);
100 after reduction transformation (only relevant parts):
108 # Storing the initial value given by the user. #
110 .paral_data_store.32.sum.27 = 1;
112 #pragma omp parallel num_threads(4)
114 #pragma omp for schedule(static)
116 # The neutral element corresponding to the particular
117 reduction's operation, e.g. 0 for PLUS_EXPR,
118 1 for MULT_EXPR, etc. replaces the user's initial value. #
120 # sum.27_29 = PHI <sum.27_11, 0>
122 sum.27_11 = D.1827_8 + sum.27_29;
126 # Adding this reduction phi is done at create_phi_for_local_result() #
127 # sum.27_56 = PHI <sum.27_11, 0>
130 # Creating the atomic operation is done at
131 create_call_for_reduction_1() #
133 #pragma omp atomic_load
134 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
135 D.1840_60 = sum.27_56 + D.1839_59;
136 #pragma omp atomic_store (D.1840_60);
140 # collecting the result after the join of the threads is done at
141 create_loads_for_reductions().
142 The value computed by the threads is loaded from the
146 .paral_data_load.33_52 = &.paral_data_store.32;
147 sum_37 = .paral_data_load.33_52->sum.27;
148 sum_43 = D.1795_41 + sum_37;
151 # sum_21 = PHI <sum_43, sum_26>
152 printf (&"%d"[0], sum_21);
160 /* Minimal number of iterations of a loop that should be executed in each
162 #define MIN_PER_THREAD 100
164 /* Element of the hashtable, representing a
165 reduction in the current loop. */
/* Describes one reduction (e.g. sum += ...) detected in the loop being
   parallelized; one entry per reduction phi, keyed by REDUC_PHI in the
   reduction hashtable.  NOTE(review): some lines of this struct are
   elided in this listing.  */
166 struct reduction_info
168 gimple reduc_stmt; /* reduction statement. */
169 gimple reduc_phi; /* The phi node defining the reduction. */
170 enum tree_code reduction_code;/* code for the reduction operation. */
171 gimple keep_res; /* The PHI_RESULT of this phi is the resulting value
172 of the reduction variable when exiting the loop. */
173 tree initial_value; /* The initial value of the reduction var before entering the loop. */
174 tree field; /* the name of the field in the parloop data structure intended for reduction. */
175 tree init; /* reduction initialization value. */
176 gimple new_phi; /* (helper field) Newly created phi node whose result
177 will be passed to the atomic operation. Represents
178 the local result each thread computed for the reduction
182 /* Equality and hash functions for hashtab code. */
/* htab equality callback: two reduction_info entries are equal iff they
   refer to the same reduction phi node (pointer identity), matching the
   pointer hash in reduction_info_hash.  */
185 reduction_info_eq (const void *aa, const void *bb)
187 const struct reduction_info *a = (const struct reduction_info *) aa;
188 const struct reduction_info *b = (const struct reduction_info *) bb;
190 return (a->reduc_phi == b->reduc_phi);
/* htab hash callback: hashes a reduction_info entry by the address of
   its reduction phi, consistent with reduction_info_eq.  */
194 reduction_info_hash (const void *aa)
196 const struct reduction_info *a = (const struct reduction_info *) aa;
198 return htab_hash_pointer (a->reduc_phi);
/* Looks up the reduction_info entry for phi node PHI in REDUCTION_LIST.
   Uses a stack-allocated key with only reduc_phi set, which is the sole
   field the table's hash/eq callbacks examine.  NOTE(review): the early
   return for an empty table and the final return are elided in this
   listing.  */
201 static struct reduction_info *
202 reduction_phi (htab_t reduction_list, gimple phi)
204 struct reduction_info tmpred, *red;
206 if (htab_elements (reduction_list) == 0)
209 tmpred.reduc_phi = phi;
210 red = (struct reduction_info *) htab_find (reduction_list, &tmpred);
215 /* Element of hashtable of names to copy. */
/* Maps an SSA name (by version number) to the copy used inside the
   outlined loop body, plus the structure field used to transfer its
   value between the original and the outlined function.  */
217 struct name_to_copy_elt
219 unsigned version; /* The version of the name to copy. */
220 tree new_name; /* The new name used in the copy. */
221 tree field; /* The field of the structure used to pass the
225 /* Equality and hash functions for hashtab code. */
/* htab equality callback: entries are equal iff they describe the same
   SSA name version.  */
228 name_to_copy_elt_eq (const void *aa, const void *bb)
230 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
231 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
233 return a->version == b->version;
/* htab hash callback: the SSA version number itself serves as the hash,
   consistent with name_to_copy_elt_eq.  */
237 name_to_copy_elt_hash (const void *aa)
239 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
241 return (hashval_t) a->version;
245 /* Data dependency analysis. Returns true if the iterations of LOOP
246 are independent on each other (that is, if we can execute them
/* Decides whether the iterations of LOOP are independent, by asking the
   lambda framework whether the loop-reversal transform (the 1x1 matrix
   [-1]) is legal w.r.t. the computed data dependences; reversal being
   legal implies there are no loop-carried dependences.  */
250 loop_parallel_p (struct loop *loop)
252 VEC (ddr_p, heap) * dependence_relations;
253 VEC (data_reference_p, heap) *datarefs;
254 lambda_trans_matrix trans;
257 if (dump_file && (dump_flags & TDF_DETAILS))
258 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
260 /* Check for problems with dependences. If the loop can be reversed,
261 the iterations are independent. */
262 datarefs = VEC_alloc (data_reference_p, heap, 10);
263 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
264 compute_data_dependences_for_loop (loop, true, &datarefs,
265 &dependence_relations);
266 if (dump_file && (dump_flags & TDF_DETAILS))
267 dump_data_dependence_relations (dump_file, dependence_relations);
/* Legality of the reversal transform is the parallelizability test.  */
269 trans = lambda_trans_matrix_new (1, 1);
270 LTM_MATRIX (trans)[0][0] = -1;
272 if (lambda_transform_legal_p (trans, 1, dependence_relations))
275 if (dump_file && (dump_flags & TDF_DETAILS))
276 fprintf (dump_file, " SUCCESS: may be parallelized\n");
278 else if (dump_file && (dump_flags & TDF_DETAILS))
280 " FAILED: data dependencies exist across iterations\n");
/* Release the dependence analysis data before returning the verdict.  */
282 free_dependence_relations (dependence_relations);
283 free_data_refs (datarefs);
288 /* Return true when LOOP contains basic blocks marked with the
289 BB_IRREDUCIBLE_LOOP flag. */
/* Scans every basic block of LOOP (in dominator order) for the
   BB_IRREDUCIBLE_LOOP flag; such loops cannot be parallelized.
   NOTE(review): the true/false returns and the free of BBS are elided
   in this listing.  */
292 loop_has_blocks_with_irreducible_flag (struct loop *loop)
295 basic_block *bbs = get_loop_body_in_dom_order (loop);
298 for (i = 0; i < loop->num_nodes; i++)
299 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
308 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
309 The assignment statement is placed on edge ENTRY. DECL_ADDRESS maps decls
310 to their addresses that can be reused. The address of OBJ is known to
311 be invariant in the whole function. */
/* Returns an SSA name holding the address of OBJ, converted to TYPE,
   with the defining assignment inserted on edge ENTRY.  DECL_ADDRESS
   caches per-decl address SSA names (keyed by DECL_UID) so each base
   decl's address is materialized only once.  */
314 take_address_of (tree obj, tree type, edge entry, htab_t decl_address)
318 struct int_tree_map ielt, *nielt;
319 tree *var_p, name, bvar, addr;
323 /* Since the address of OBJ is invariant, the trees may be shared.
324 Avoid rewriting unrelated parts of the code. */
325 obj = unshare_expr (obj);
/* Walk down the component references to find the base decl of OBJ.  */
327 handled_component_p (*var_p);
328 var_p = &TREE_OPERAND (*var_p, 0))
330 uid = DECL_UID (*var_p);
333 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
/* Cache miss: build &base, store it in a fresh temporary/SSA name on
   ENTRY, and record it in the cache.  */
336 addr = build_addr (*var_p, current_function_decl);
337 bvar = create_tmp_var (TREE_TYPE (addr), get_name (*var_p));
338 add_referenced_var (bvar);
339 stmt = gimple_build_assign (bvar, addr);
340 name = make_ssa_name (bvar, stmt);
341 gimple_assign_set_lhs (stmt, name);
342 gsi_insert_on_edge_immediate (entry, stmt);
344 nielt = XNEW (struct int_tree_map);
/* Cache hit: reuse the previously created address SSA name.  */
350 name = ((struct int_tree_map *) *dslot)->to;
/* Rewrite the base of OBJ as an indirection through the cached address,
   then gimplify &OBJ, emitting any needed statements on ENTRY.  */
354 *var_p = build1 (INDIRECT_REF, TREE_TYPE (*var_p), name);
355 name = force_gimple_operand (build_addr (obj, current_function_decl),
356 &stmts, true, NULL_TREE);
357 if (!gimple_seq_empty_p (stmts))
358 gsi_insert_seq_on_edge_immediate (entry, stmts);
/* Convert to the requested pointer TYPE if it differs.  */
361 if (TREE_TYPE (name) != type)
363 name = force_gimple_operand (fold_convert (type, name), &stmts, true,
365 if (!gimple_seq_empty_p (stmts))
366 gsi_insert_seq_on_edge_immediate (entry, stmts);
372 /* Callback for htab_traverse. Create the initialization statement
373 for reduction described in SLOT, and place it at the preheader of
374 the loop described in DATA. */
/* htab_traverse callback (over the reduction table): for the reduction
   in SLOT, replaces the preheader phi argument of LOOP (passed in DATA)
   with the operation's neutral element obtained from omp_reduction_init,
   and remembers the user's original initial value in
   reduc->initial_value for use after the parallel region.  */
377 initialize_reductions (void **slot, void *data)
380 tree bvar, type, arg;
383 struct reduction_info *const reduc = (struct reduction_info *) *slot;
384 struct loop *loop = (struct loop *) data;
386 /* Create initialization in preheader:
387 reduction_variable = initialization value of reduction. */
389 /* In the phi node at the header, replace the argument coming
390 from the preheader with the reduction initialization value. */
392 /* Create a new variable to initialize the reduction. */
393 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
394 bvar = create_tmp_var (type, "reduction");
395 add_referenced_var (bvar);
/* Build a temporary OMP_CLAUSE_REDUCTION just to query the neutral
   element for this reduction's operation via omp_reduction_init.  */
397 c = build_omp_clause (gimple_location (reduc->reduc_stmt),
398 OMP_CLAUSE_REDUCTION);
399 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
400 OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));
402 init = omp_reduction_init (c, TREE_TYPE (bvar));
405 /* Replace the argument representing the initialization value
406 with the initialization value for the reduction (neutral
407 element for the particular operation, e.g. 0 for PLUS_EXPR,
408 1 for MULT_EXPR, etc).
409 Keep the old value in a new variable "reduction_initial",
410 that will be taken in consideration after the parallel
411 computing is done. */
413 e = loop_preheader_edge (loop);
414 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
415 /* Create new variable to hold the initial value. */
417 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
418 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
419 reduc->initial_value = arg;
/* NOTE(review): field of a struct whose declaration lines are elided in
   this listing — presumably struct elv_data, judging by the dta.info /
   dta.decl_address uses in eliminate_local_variables_stmt; embeds the
   walk_stmt_info used by the walk_tree callback.  Confirm against the
   full source.  */
425 struct walk_stmt_info info;
431 /* Eliminates references to local variables in *TP out of the single
432 entry single exit region starting at DTA->ENTRY.
433 DECL_ADDRESS contains addresses of the references that had their
434 address taken already. If the expression is changed, CHANGED is
435 set to true. Callback for walk_tree. */
/* walk_tree callback: rewrites in *TP a direct use of a local variable
   as an indirection through its address (obtained via take_address_of,
   inserted on DATA->entry), and handles ADDR_EXPRs whose address must
   stay a gimple invariant.  DATA is the elv_data wrapper.  */
438 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
440 struct elv_data *const dta = (struct elv_data *) data;
441 tree t = *tp, var, addr, addr_type, type, obj;
/* DECL case: replace a non-external local variable T with *(&T).  */
447 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
450 type = TREE_TYPE (t);
451 addr_type = build_pointer_type (type);
452 addr = take_address_of (t, addr_type, dta->entry, dta->decl_address);
453 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
459 if (TREE_CODE (t) == ADDR_EXPR)
461 /* ADDR_EXPR may appear in two contexts:
462 -- as a gimple operand, when the address taken is a function invariant
463 -- as gimple rhs, when the resulting address in not a function
465 We do not need to do anything special in the latter case (the base of
466 the memory reference whose address is taken may be replaced in the
467 DECL_P case). The former case is more complicated, as we need to
468 ensure that the new address is still a gimple operand. Thus, it
469 is not sufficient to replace just the base of the memory reference --
470 we need to move the whole computation of the address out of the
472 if (!is_gimple_val (t))
/* Only rewrite addresses whose base is a non-external local.  */
476 obj = TREE_OPERAND (t, 0);
477 var = get_base_address (obj);
478 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
481 addr_type = TREE_TYPE (t);
482 addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address);
495 /* Moves the references to local variables in STMT out of the single
496 entry single exit region starting at ENTRY. DECL_ADDRESS contains
497 addresses of the references that had their address taken
/* Applies eliminate_local_variables_1 to every operand of STMT,
   rewriting local-variable references out of the SESE region starting
   at ENTRY.  Debug binds are walked with walk_tree on the bound value
   only; other statements go through walk_gimple_op.  */
501 eliminate_local_variables_stmt (edge entry, gimple stmt,
506 memset (&dta.info, '\0', sizeof (dta.info));
508 dta.decl_address = decl_address;
511 if (gimple_debug_bind_p (stmt))
512 walk_tree (gimple_debug_bind_get_value_ptr (stmt),
513 eliminate_local_variables_1, &dta.info, NULL);
515 walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
521 /* Eliminates the references to local variables from the single entry
522 single exit region between the ENTRY and EXIT edges.
525 1) Taking address of a local variable -- these are moved out of the
526 region (and temporary variable is created to hold the address if
529 2) Dereferencing a local variable -- these are replaced with indirect
/* Eliminates local-variable references from every statement of the
   single-entry single-exit region between ENTRY and EXIT, sharing one
   decl-to-address cache across the whole region.  */
533 eliminate_local_variables (edge entry, edge exit)
536 VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
538 gimple_stmt_iterator gsi;
539 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
541 basic_block entry_bb = entry->src;
542 basic_block exit_bb = exit->dest;
544 gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
/* The entry and exit blocks themselves are outside the region proper,
   so statements in them are left untouched.  */
546 for (i = 0; VEC_iterate (basic_block, body, i, bb); i++)
547 if (bb != entry_bb && bb != exit_bb)
548 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
549 eliminate_local_variables_stmt (entry, gsi_stmt (gsi),
552 htab_delete (decl_address);
553 VEC_free (basic_block, heap, body);
556 /* Returns true if expression EXPR is not defined between ENTRY and
557 EXIT, i.e. if all its operands are defined outside of the region. */
/* Returns true when EXPR is not defined inside the region between ENTRY
   and EXIT: gimple minimal invariants trivially qualify; an SSA name
   qualifies when its defining block is dominated by the entry block but
   not by the exit block (i.e. the definition lies outside the region).
   NOTE(review): the return statements are elided in this listing.  */
560 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
562 basic_block entry_bb = entry->src;
563 basic_block exit_bb = exit->dest;
566 if (is_gimple_min_invariant (expr))
569 if (TREE_CODE (expr) == SSA_NAME)
571 def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
573 && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
574 && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
583 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
584 The copies are stored to NAME_COPIES, if NAME was already duplicated,
585 its duplicate stored in NAME_COPIES is returned.
587 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
588 duplicated, storing the copies in DECL_COPIES. */
/* Duplicates SSA name NAME (when COPY_NAME_P) and/or its base decl,
   memoized through NAME_COPIES (keyed by SSA version) and DECL_COPIES
   (keyed by DECL_UID).  Returns the copy to use in the outlined body.  */
591 separate_decls_in_region_name (tree name,
592 htab_t name_copies, htab_t decl_copies,
595 tree copy, var, var_copy;
596 unsigned idx, uid, nuid;
597 struct int_tree_map ielt, *nielt;
598 struct name_to_copy_elt elt, *nelt;
599 void **slot, **dslot;
601 if (TREE_CODE (name) != SSA_NAME)
/* Reuse an existing copy of this SSA version if we made one before.  */
604 idx = SSA_NAME_VERSION (name);
606 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
607 copy_name_p ? INSERT : NO_INSERT);
609 return ((struct name_to_copy_elt *) *slot)->new_name;
/* Duplicate the base decl (once per DECL_UID).  */
611 var = SSA_NAME_VAR (name);
612 uid = DECL_UID (var);
614 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
617 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
618 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
619 add_referenced_var (var_copy);
620 nielt = XNEW (struct int_tree_map);
622 nielt->to = var_copy;
625 /* Ensure that when we meet this decl next time, we won't duplicate
/* Map the copy's own UID to itself so a second encounter is a no-op.  */
627 nuid = DECL_UID (var_copy);
629 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
630 gcc_assert (!*dslot);
631 nielt = XNEW (struct int_tree_map);
633 nielt->to = var_copy;
637 var_copy = ((struct int_tree_map *) *dslot)->to;
/* Create and record the SSA-name copy, rebasing it on VAR_COPY.  */
641 copy = duplicate_ssa_name (name, NULL);
642 nelt = XNEW (struct name_to_copy_elt);
644 nelt->new_name = copy;
645 nelt->field = NULL_TREE;
654 SSA_NAME_VAR (copy) = var_copy;
658 /* Finds the ssa names used in STMT that are defined outside the
659 region between ENTRY and EXIT and replaces such ssa names with
660 their duplicates. The duplicates are stored to NAME_COPIES. Base
661 decls of all ssa names used in STMT (including those defined in
662 LOOP) are replaced with the new temporary variables; the
663 replacement decls are stored in DECL_COPIES. */
/* For every def and use of STMT, runs separate_decls_in_region_name:
   defs must already be region-local (copy must equal the name, hence
   the assert); uses defined outside the ENTRY/EXIT region get actual
   copies (copy_name_p).  Also marks virtual operands for renaming.  */
666 separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt,
667 htab_t name_copies, htab_t decl_copies)
675 mark_virtual_ops_for_renaming (stmt);
677 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
679 name = DEF_FROM_PTR (def);
680 gcc_assert (TREE_CODE (name) == SSA_NAME);
681 copy = separate_decls_in_region_name (name, name_copies, decl_copies,
/* A def inside the region must not be replaced by a different name.  */
683 gcc_assert (copy == name);
686 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
688 name = USE_FROM_PTR (use);
689 if (TREE_CODE (name) != SSA_NAME)
/* Only names invariant in the region (defined outside) are copied.  */
692 copy_name_p = expr_invariant_in_region_p (entry, exit, name);
693 copy = separate_decls_in_region_name (name, name_copies, decl_copies,
699 /* Finds the ssa names used in STMT that are defined outside the
700 region between ENTRY and EXIT and replaces such ssa names with
701 their duplicates. The duplicates are stored to NAME_COPIES. Base
702 decls of all ssa names used in STMT (including those defined in
703 LOOP) are replaced with the new temporary variables; the
704 replacement decls are stored in DECL_COPIES. */
/* Rewrites a debug-bind STMT to use the decl copy (from DECL_COPIES)
   and SSA-name copies (from NAME_COPIES) created earlier.  Uses
   NO_INSERT lookups only — debug stmts must never create new copies;
   when a used name has no copy, the bind's value is reset instead.  */
707 separate_decls_in_region_debug_bind (gimple stmt,
708 htab_t name_copies, htab_t decl_copies)
713 struct int_tree_map ielt;
714 struct name_to_copy_elt elt;
715 void **slot, **dslot;
717 var = gimple_debug_bind_get_var (stmt);
718 gcc_assert (DECL_P (var) && SSA_VAR_P (var));
719 ielt.uid = DECL_UID (var);
720 dslot = htab_find_slot_with_hash (decl_copies, &ielt, ielt.uid, NO_INSERT);
723 gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
725 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
727 name = USE_FROM_PTR (use);
728 if (TREE_CODE (name) != SSA_NAME)
731 elt.version = SSA_NAME_VERSION (name);
732 slot = htab_find_slot_with_hash (name_copies, &elt, elt.version, NO_INSERT);
/* No copy exists for this name: drop the bind's value rather than
   referencing a name that won't exist in the outlined function.  */
735 gimple_debug_bind_reset_value (stmt);
740 SET_USE (use, ((struct name_to_copy_elt *) *slot)->new_name);
746 /* Callback for htab_traverse. Adds a field corresponding to the reduction
747 specified in SLOT. The type is passed in DATA. */
/* htab_traverse callback: appends to the record type in DATA a field
   named and typed after the reduction's base variable; the field later
   carries the reduction value between threads.  */
750 add_field_for_reduction (void **slot, void *data)
753 struct reduction_info *const red = (struct reduction_info *) *slot;
754 tree const type = (tree) data;
755 tree var = SSA_NAME_VAR (gimple_assign_lhs (red->reduc_stmt));
756 tree field = build_decl (gimple_location (red->reduc_stmt),
757 FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
759 insert_field_into_struct (type, field);
766 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
767 described in SLOT. The type is passed in DATA. */
/* htab_traverse callback: appends to the record type in DATA a field
   for the copied SSA name described in SLOT, named and typed after the
   name's base variable.  */
770 add_field_for_name (void **slot, void *data)
772 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
773 tree type = (tree) data;
774 tree name = ssa_name (elt->version);
775 tree var = SSA_NAME_VAR (name);
776 tree field = build_decl (DECL_SOURCE_LOCATION (var),
777 FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
779 insert_field_into_struct (type, field);
785 /* Callback for htab_traverse. A local result is the intermediate result
787 thread, or the initial value in case no iteration was executed.
788 This function creates a phi node reflecting these values.
789 The phi's result will be stored in NEW_PHI field of the
790 reduction's data structure. */
/* htab_traverse callback: builds the per-thread result phi for the
   reduction in SLOT, placed in the block after the loop exit.  Merges
   the neutral init value (edge from before the loop, when a thread ran
   no iterations) with the loop-computed value; stores the phi in
   reduc->new_phi for the later atomic update.  */
793 create_phi_for_local_result (void **slot, void *data)
795 struct reduction_info *const reduc = (struct reduction_info *) *slot;
796 const struct loop *const loop = (const struct loop *) data;
799 basic_block store_bb;
801 source_location locus;
803 /* STORE_BB is the block where the phi
804 should be stored. It is the destination of the loop exit.
805 (Find the fallthru edge from GIMPLE_OMP_CONTINUE). */
806 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
808 /* STORE_BB has two predecessors. One coming from the loop
809 (the reduction's result is computed at the loop),
810 and another coming from a block preceding the loop,
812 are executed (the initial value should be taken). */
813 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
814 e = EDGE_PRED (store_bb, 1);
816 e = EDGE_PRED (store_bb, 0);
/* The phi's result reuses the reduction variable's base decl.  */
818 = make_ssa_name (SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt)),
820 locus = gimple_location (reduc->reduc_stmt);
821 new_phi = create_phi_node (local_res, store_bb);
822 SSA_NAME_DEF_STMT (local_res) = new_phi;
823 add_phi_arg (new_phi, reduc->init, e, locus);
824 add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
825 FALLTHRU_EDGE (loop->latch), locus);
826 reduc->new_phi = new_phi;
/* NOTE(review): field of a struct whose surrounding declaration is
   elided in this listing — presumably struct clsn_data, whose store_bb /
   load_bb / store / load members are used by the load/store creation
   callbacks below.  Confirm against the full source.  */
836 basic_block store_bb;
840 /* Callback for htab_traverse. Create an atomic instruction for the
841 reduction described in SLOT.
842 DATA annotates the place in memory the atomic operation relates to,
843 and the basic block it needs to be generated in. */
/* htab_traverse callback: emits, in DATA->load_bb, the atomic update
   shared_field = shared_field OP local_result for the reduction in
   SLOT, as a GIMPLE_OMP_ATOMIC_LOAD / combine / GIMPLE_OMP_ATOMIC_STORE
   triple, splitting the block around the atomic section.  */
846 create_call_for_reduction_1 (void **slot, void *data)
848 struct reduction_info *const reduc = (struct reduction_info *) *slot;
849 struct clsn_data *const clsn_data = (struct clsn_data *) data;
850 gimple_stmt_iterator gsi;
851 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
852 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
857 tree t, addr, addr_type, ref, x;
/* Address of the reduction's field in the shared data structure.  */
861 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
862 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
863 addr_type = build_pointer_type (type);
865 addr = build_addr (t, current_function_decl);
867 /* Create phi node. */
868 bb = clsn_data->load_bb;
870 e = split_block (bb, t);
/* Atomic load of the shared field into a fresh SSA temporary.  */
873 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
874 add_referenced_var (tmp_load);
875 tmp_load = make_ssa_name (tmp_load, NULL);
876 load = gimple_build_omp_atomic_load (tmp_load, addr);
877 SSA_NAME_DEF_STMT (tmp_load) = load;
878 gsi = gsi_start_bb (new_bb);
879 gsi_insert_after (&gsi, load, GSI_NEW_STMT);
881 e = split_block (new_bb, load);
883 gsi = gsi_start_bb (new_bb);
/* Combine the loaded value with this thread's local result.  */
885 x = fold_build2 (reduc->reduction_code,
886 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
887 PHI_RESULT (reduc->new_phi));
889 name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
890 GSI_CONTINUE_LINKING);
892 gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
896 /* Create the atomic operation at the join point of the threads.
897 REDUCTION_LIST describes the reductions in the LOOP.
898 LD_ST_DATA describes the shared data structure where
899 shared data is stored in and loaded from. */
/* Drives reduction finalization at the threads' join point: first
   creates the per-thread result phis, then the atomic updates into the
   shared structure described by LD_ST_DATA.  */
901 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
902 struct clsn_data *ld_st_data)
904 htab_traverse (reduction_list, create_phi_for_local_result, loop);
905 /* Find the fallthru edge from GIMPLE_OMP_CONTINUE. */
906 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
907 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
910 /* Callback for htab_traverse. Loads the final reduction value at the
911 join point of all threads, and inserts it in the right place. */
/* htab_traverse callback: after the threads join, loads the final
   reduction value from its field in the shared structure into the SSA
   name of the original exit phi (red->keep_res), then removes that phi
   since the assignment now defines the name.  */
914 create_loads_for_reductions (void **slot, void *data)
916 struct reduction_info *const red = (struct reduction_info *) *slot;
917 struct clsn_data *const clsn_data = (struct clsn_data *) data;
919 gimple_stmt_iterator gsi;
920 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
921 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
926 gsi = gsi_after_labels (clsn_data->load_bb);
927 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
928 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
/* Re-point the phi's result at the load statement.  */
932 name = PHI_RESULT (red->keep_res);
933 stmt = gimple_build_assign (name, x);
934 SSA_NAME_DEF_STMT (name) = stmt;
936 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
/* The old exit phi is now dead; find and remove it.  */
938 for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
939 !gsi_end_p (gsi); gsi_next (&gsi))
940 if (gsi_stmt (gsi) == red->keep_res)
942 remove_phi_node (&gsi, false);
948 /* Load the reduction result that was stored in LD_ST_DATA.
949 REDUCTION_LIST describes the list of reductions that the
950 loads should be generated for. */
/* After the parallel region, re-establishes the load pointer
   (load = &store) at the head of LD_ST_DATA->load_bb, then emits the
   per-reduction final loads via create_loads_for_reductions.  */
952 create_final_loads_for_reduction (htab_t reduction_list,
953 struct clsn_data *ld_st_data)
955 gimple_stmt_iterator gsi;
959 gsi = gsi_after_labels (ld_st_data->load_bb);
960 t = build_fold_addr_expr (ld_st_data->store);
961 stmt = gimple_build_assign (ld_st_data->load, t);
963 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
964 SSA_NAME_DEF_STMT (ld_st_data->load) = stmt;
966 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
970 /* Callback for htab_traverse. Store the neutral value for the
971 particular reduction's operation, e.g. 0 for PLUS_EXPR,
972 1 for MULT_EXPR, etc. into the reduction field.
973 The reduction is specified in SLOT. The store information is
/* htab_traverse callback: stores the user's original initial value of
   the reduction in SLOT into its field of the shared structure, at the
   end of DATA->store_bb (before the parallel region starts).  */
977 create_stores_for_reduction (void **slot, void *data)
979 struct reduction_info *const red = (struct reduction_info *) *slot;
980 struct clsn_data *const clsn_data = (struct clsn_data *) data;
983 gimple_stmt_iterator gsi;
984 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
986 gsi = gsi_last_bb (clsn_data->store_bb);
987 t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
988 stmt = gimple_build_assign (t, red->initial_value);
989 mark_virtual_ops_for_renaming (stmt);
990 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
995 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
996 store to a field of STORE in STORE_BB for the ssa name and its duplicate
997 specified in SLOT. */
/* htab_traverse callback: for the copied SSA name in SLOT, emits a
   store of the original name into its structure field (end of
   DATA->store_bb) and a load of that field into the copy (end of
   DATA->load_bb), transferring the value into the outlined function.  */
1000 create_loads_and_stores_for_name (void **slot, void *data)
1002 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
1003 struct clsn_data *const clsn_data = (struct clsn_data *) data;
1006 gimple_stmt_iterator gsi;
1007 tree type = TREE_TYPE (elt->new_name);
1008 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
/* Store side: struct.field = original_name.  */
1011 gsi = gsi_last_bb (clsn_data->store_bb);
1012 t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1013 stmt = gimple_build_assign (t, ssa_name (elt->version));
1014 mark_virtual_ops_for_renaming (stmt);
1015 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
/* Load side: new_name = (*load_ptr).field.  */
1017 gsi = gsi_last_bb (clsn_data->load_bb);
1018 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1019 t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1020 stmt = gimple_build_assign (elt->new_name, t);
1021 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1022 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1027 /* Moves all the variables used in LOOP and defined outside of it (including
1028 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1029 name) to a structure created for this purpose. The code
1037 is transformed this way:
1052 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1053 pointer `new' is intentionally not initialized (the loop will be split to a
1054 separate function later, and `new' will be initialized from its arguments).
1055 LD_ST_DATA holds information about the shared data structure used to pass
1056 information among the threads. It is initialized here, and
1057 gen_parallel_loop will pass it to create_call_for_reduction that
1058 needs this information. REDUCTION_LIST describes the reductions
/* Main driver for decoupling the region between ENTRY and EXIT from the
   enclosing function: duplicates names/decls used in the region, builds
   the .paral_data record type holding them (plus reduction fields), and
   emits the store/load statements around the region.  Outputs the store
   variable in *ARG_STRUCT, the load pointer SSA name in *NEW_ARG_STRUCT,
   and fills LD_ST_DATA for the later reduction handling.  */
1062 separate_decls_in_region (edge entry, edge exit, htab_t reduction_list,
1063 tree *arg_struct, tree *new_arg_struct,
1064 struct clsn_data *ld_st_data)
/* Split ENTRY so stores go in BB0 (before) and loads in BB1 (after).  */
1067 basic_block bb1 = split_edge (entry);
1068 basic_block bb0 = single_pred (bb1);
1069 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1070 name_to_copy_elt_eq, free);
1071 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1074 tree type, type_name, nvar;
1075 gimple_stmt_iterator gsi;
1076 struct clsn_data clsn_data;
1077 VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
1079 basic_block entry_bb = bb1;
1080 basic_block exit_bb = exit->dest;
1081 bool has_debug_stmt = false;
1083 entry = single_succ_edge (entry_bb);
1084 gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
/* First pass: duplicate decls/names for all phis and real statements;
   debug stmts are only noted here and processed separately below.  */
1086 for (i = 0; VEC_iterate (basic_block, body, i, bb); i++)
1088 if (bb != entry_bb && bb != exit_bb)
1090 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1091 separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1092 name_copies, decl_copies)
1094 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1096 gimple stmt = gsi_stmt (gsi);
1098 if (is_gimple_debug (stmt))
1099 has_debug_stmt = true;
1101 separate_decls_in_region_stmt (entry, exit, stmt,
1102 name_copies, decl_copies);
1107 /* Now process debug bind stmts. We must not create decls while
1108 processing debug stmts, so we defer their processing so as to
1109 make sure we will have debug info for as many variables as
1110 possible (all of those that were dealt with in the loop above),
1111 and discard those for which we know there's nothing we can
1114 for (i = 0; VEC_iterate (basic_block, body, i, bb); i++)
1115 if (bb != entry_bb && bb != exit_bb)
1117 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1119 gimple stmt = gsi_stmt (gsi);
1121 if (gimple_debug_bind_p (stmt))
1123 if (separate_decls_in_region_debug_bind (stmt,
/* Bind references something with no copy: drop the debug stmt.  */
1127 gsi_remove (&gsi, true);
1136 VEC_free (basic_block, heap, body);
1138 if (htab_elements (name_copies) == 0 && htab_elements (reduction_list) == 0)
1140 /* It may happen that there is nothing to copy (if there are only
1141 loop carried and external variables in the loop). */
1143 *new_arg_struct = NULL;
1147 /* Create the type for the structure to store the ssa names to. */
1148 type = lang_hooks.types.make_type (RECORD_TYPE);
1149 type_name = build_decl (BUILTINS_LOCATION,
1150 TYPE_DECL, create_tmp_var_name (".paral_data"),
1152 TYPE_NAME (type) = type_name;
1154 htab_traverse (name_copies, add_field_for_name, type);
1155 if (reduction_list && htab_elements (reduction_list) > 0)
1157 /* Create the fields for reductions. */
1158 htab_traverse (reduction_list, add_field_for_reduction,
1163 /* Create the loads and stores. */
1164 *arg_struct = create_tmp_var (type, ".paral_data_store");
1165 add_referenced_var (*arg_struct);
1166 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1167 add_referenced_var (nvar);
1168 *new_arg_struct = make_ssa_name (nvar, NULL);
1170 ld_st_data->store = *arg_struct;
1171 ld_st_data->load = *new_arg_struct;
1172 ld_st_data->store_bb = bb0;
1173 ld_st_data->load_bb = bb1;
1175 htab_traverse (name_copies, create_loads_and_stores_for_name,
1178 /* Load the calculation from memory (after the join of the threads). */
1180 if (reduction_list && htab_elements (reduction_list) > 0)
1182 htab_traverse (reduction_list, create_stores_for_reduction,
1184 clsn_data.load = make_ssa_name (nvar, NULL);
1185 clsn_data.load_bb = exit->dest;
1186 clsn_data.store = ld_st_data->store;
1187 create_final_loads_for_reduction (reduction_list, &clsn_data);
1191 htab_delete (decl_copies);
1192 htab_delete (name_copies);
1195 /* Bitmap containing uids of functions created by parallelization. We cannot
1196 allocate it from the default obstack, as it must live across compilation
1197 of several functions; we make it gc allocated instead. */
/* Lazily allocated in create_loop_fn; queried by parallelized_function_p.  */
1199 static GTY(()) bitmap parallelized_functions;
1201 /* Returns true if FN was created by create_loop_fn. */
1204 parallelized_function_p (tree fn)
/* Functions produced by create_loop_fn are always DECL_ARTIFICIAL, so a
   non-artificial decl (or a query before any function was created and the
   bitmap exists) can never be one of ours.  */
1206 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1209 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1212 /* Creates and returns an empty function that will receive the body of
1213 a parallelized loop. */
1216 create_loop_fn (void)
1220 tree decl, type, name, t;
1221 struct function *act_cfun = cfun;
1222 static unsigned loopfn_num;
/* Derive a unique, assembler-safe private name from the function that
   contains the loop being parallelized.  */
1224 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1225 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1226 clean_symbol_name (tname);
1227 name = get_identifier (tname);
/* The outlined body takes a single pointer (the shared-data struct) and
   returns nothing: void loopfn (void *).  */
1228 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1230 decl = build_decl (BUILTINS_LOCATION,
1231 FUNCTION_DECL, name, type);
/* Record the new decl so parallelized_function_p can later recognize it
   and the pass will not try to parallelize loops inside it again.  */
1232 if (!parallelized_functions)
1233 parallelized_functions = BITMAP_GGC_ALLOC ();
1234 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
/* The function is artificial, local to this translation unit, and must
   never be inlined back into its caller.  */
1236 TREE_STATIC (decl) = 1;
1237 TREE_USED (decl) = 1;
1238 DECL_ARTIFICIAL (decl) = 1;
1239 DECL_IGNORED_P (decl) = 0;
1240 TREE_PUBLIC (decl) = 0;
1241 DECL_UNINLINABLE (decl) = 1;
1242 DECL_EXTERNAL (decl) = 0;
1243 DECL_CONTEXT (decl) = NULL_TREE;
1244 DECL_INITIAL (decl) = make_node (BLOCK);
/* Void result decl.  */
1246 t = build_decl (BUILTINS_LOCATION,
1247 RESULT_DECL, NULL_TREE, void_type_node);
1248 DECL_ARTIFICIAL (t) = 1;
1249 DECL_IGNORED_P (t) = 1;
1250 DECL_RESULT (decl) = t;
/* The single parameter carrying the address of the shared-data structure
   built by separate_decls_in_region.  */
1252 t = build_decl (BUILTINS_LOCATION,
1253 PARM_DECL, get_identifier (".paral_data_param"),
1255 DECL_ARTIFICIAL (t) = 1;
1256 DECL_ARG_TYPE (t) = ptr_type_node;
1257 DECL_CONTEXT (t) = decl;
1259 DECL_ARGUMENTS (decl) = t;
1261 allocate_struct_function (decl, false);
1263 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1265 set_cfun (act_cfun);
1270 /* Moves the exit condition of LOOP to the beginning of its header, and
1271 duplicates the part of the last iteration that gets disabled to the
1272 exit of the loop. NIT is the number of iterations of the loop
1273 (used to initialize the variables in the duplicated part).
1275 TODO: the common case is that latch of the loop is empty and immediately
1276 follows the loop exit. In this case, it would be better not to copy the
1277 body of the loop, but only move the entry of the loop directly before the
1278 exit check and increase the number of iterations of the loop by one.
1279 This may need some additional preconditioning in case NIT = ~0.
1280 REDUCTION_LIST describes the reductions in LOOP. */
1283 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1285 basic_block *bbs, *nbbs, ex_bb, orig_header;
1288 edge exit = single_dom_exit (loop), hpred;
1289 tree control, control_name, res, t;
1290 gimple phi, nphi, cond_stmt, stmt;
1291 gimple_stmt_iterator gsi;
/* Split the header after its labels; the old header contents move to
   ORIG_HEADER, reached through edge HPRED.  */
1293 split_block_after_labels (loop->header);
1294 orig_header = single_succ (loop->header);
1295 hpred = single_succ_edge (loop->header);
/* The exit condition compares the control variable against NIT
   (guaranteed by canonicalize_loop_ivs run by the caller).  */
1297 cond_stmt = last_stmt (exit->src);
1298 control = gimple_cond_lhs (cond_stmt);
1299 gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1301 /* Make sure that we have phi nodes on exit for all loop header phis
1302 (create_parallel_loop requires that). */
1303 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1305 phi = gsi_stmt (gsi);
1306 res = PHI_RESULT (phi);
/* Give the header phi a fresh result T and let a new phi in ORIG_HEADER
   take over the old result RES, fed by T along HPRED.  */
1307 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1308 SET_PHI_RESULT (phi, t);
1310 nphi = create_phi_node (res, orig_header);
1311 SSA_NAME_DEF_STMT (res) = nphi;
1312 add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1316 gimple_cond_set_lhs (cond_stmt, t);
1317 update_stmt (cond_stmt);
/* Count the blocks up to the exit test and duplicate the SESE tail of the
   loop past the exit, so that the exit test becomes the first thing
   executed in each iteration.  NOTE(review): some lines of this sequence
   are not visible here -- confirm against the full source.  */
1322 bbs = get_loop_body_in_dom_order (loop);
1323 for (n = 0; bbs[n] != exit->src; n++)
1325 nbbs = XNEWVEC (basic_block, n);
1326 ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1333 /* Other than reductions, the only gimple reg that should be copied
1334 out of the loop is the control variable. */
1336 control_name = NULL_TREE;
1337 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); )
1339 phi = gsi_stmt (gsi);
1340 res = PHI_RESULT (phi);
1341 if (!is_gimple_reg (res))
1347 /* Check if it is a part of reduction. If it is,
1348 keep the phi at the reduction's keep_res field. The
1349 PHI_RESULT of this phi is the resulting value of the reduction
1350 variable when exiting the loop. */
1352 exit = single_dom_exit (loop);
1354 if (htab_elements (reduction_list) > 0)
1356 struct reduction_info *red;
1358 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1360 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1363 red->keep_res = phi;
/* Anything that is neither virtual nor a reduction must be the loop
   control variable; remember and remove its phi.  */
1368 gcc_assert (control_name == NULL_TREE
1369 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1371 remove_phi_node (&gsi, false);
1373 gcc_assert (control_name != NULL_TREE);
1375 /* Initialize the control variable to NIT. */
1376 gsi = gsi_after_labels (ex_bb);
1377 nit = force_gimple_operand_gsi (&gsi,
1378 fold_convert (TREE_TYPE (control_name), nit),
1379 false, NULL_TREE, false, GSI_SAME_STMT);
1380 stmt = gimple_build_assign (control_name, nit);
1381 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1382 SSA_NAME_DEF_STMT (control_name) = stmt;
1385 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1386 LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
1387 NEW_DATA is the variable that should be initialized from the argument
1388 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1389 basic block containing GIMPLE_OMP_PARALLEL tree. */
1392 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1393 tree new_data, unsigned n_threads)
1395 gimple_stmt_iterator gsi;
1396 basic_block bb, paral_bb, for_bb, ex_bb;
1398 gimple stmt, for_stmt, phi, cond_stmt;
1399 tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
1400 edge exit, nexit, guard, end, e;
1402 /* Prepare the GIMPLE_OMP_PARALLEL statement. */
1403 bb = loop_preheader_edge (loop)->src;
1404 paral_bb = single_pred (bb);
1405 gsi = gsi_last_bb (paral_bb);
/* num_threads(N_THREADS) clause for the parallel region.  */
1407 t = build_omp_clause (BUILTINS_LOCATION, OMP_CLAUSE_NUM_THREADS);
1408 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1409 = build_int_cst (integer_type_node, n_threads);
1410 stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
1412 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1414 /* Initialize NEW_DATA. */
1417 gsi = gsi_after_labels (bb);
/* PARAM is LOOP_FN's incoming pointer argument; point it at DATA and
   convert it to NEW_DATA's type for the loads inside the region.  */
1419 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL);
1420 stmt = gimple_build_assign (param, build_fold_addr_expr (data));
1421 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1422 SSA_NAME_DEF_STMT (param) = stmt;
1424 stmt = gimple_build_assign (new_data,
1425 fold_convert (TREE_TYPE (new_data), param));
1426 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1427 SSA_NAME_DEF_STMT (new_data) = stmt;
1430 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
1431 bb = split_loop_exit_edge (single_dom_exit (loop));
1432 gsi = gsi_last_bb (bb);
1433 gsi_insert_after (&gsi, gimple_build_omp_return (false), GSI_NEW_STMT);
1435 /* Extract data for GIMPLE_OMP_FOR. */
/* transform_to_exit_first_loop guarantees the exit test is in the header;
   CVAR is the control variable compared there.  */
1436 gcc_assert (loop->header == single_dom_exit (loop)->src);
1437 cond_stmt = last_stmt (loop->header);
1439 cvar = gimple_cond_lhs (cond_stmt);
1440 cvar_base = SSA_NAME_VAR (cvar);
1441 phi = SSA_NAME_DEF_STMT (cvar);
1442 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
/* INITVAR becomes the OMP_FOR index; it replaces the preheader value of
   the control variable's phi.  */
1443 initvar = make_ssa_name (cvar_base, NULL);
1444 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1446 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
/* Drop the explicit increment; GIMPLE_OMP_CONTINUE will define CVAR_NEXT
   instead.  */
1448 gsi = gsi_last_bb (loop->latch);
1449 gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
1450 gsi_remove (&gsi, true);
/* Carve out FOR_BB (will hold GIMPLE_OMP_FOR) and EX_BB (will hold its
   GIMPLE_OMP_RETURN), plus a zero-probability guard edge skipping the
   body.  */
1453 for_bb = split_edge (loop_preheader_edge (loop));
1454 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1455 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1456 gcc_assert (exit == single_dom_exit (loop));
1458 guard = make_edge (for_bb, ex_bb, 0);
1459 single_succ_edge (loop->latch)->flags = 0;
1460 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
/* Rewire the phis in EX_BB: feed them the preheader value along GUARD and
   the latch value along END, taken from the corresponding header phi.  */
1461 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi))
1463 source_location locus;
1465 phi = gsi_stmt (gsi);
1466 res = PHI_RESULT (phi);
1467 stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1469 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
1470 locus = gimple_phi_arg_location_from_edge (stmt,
1471 loop_preheader_edge (loop));
1472 add_phi_arg (phi, def, guard, locus);
1474 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
1475 locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
1476 add_phi_arg (phi, def, end, locus);
1478 e = redirect_edge_and_branch (exit, nexit->dest);
1479 PENDING_STMT (e) = NULL;
1481 /* Emit GIMPLE_OMP_FOR. */
1482 gimple_cond_set_lhs (cond_stmt, cvar_base);
1483 type = TREE_TYPE (cvar);
/* schedule(static) so iterations are split into contiguous chunks.  */
1484 t = build_omp_clause (BUILTINS_LOCATION, OMP_CLAUSE_SCHEDULE);
1485 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1487 for_stmt = gimple_build_omp_for (NULL, t, 1, NULL);
1488 gimple_omp_for_set_index (for_stmt, 0, initvar);
1489 gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
1490 gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
1491 gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
1492 gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
1494 build_int_cst (type, 1)));
1496 gsi = gsi_last_bb (for_bb);
1497 gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
1498 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1500 /* Emit GIMPLE_OMP_CONTINUE. */
1501 gsi = gsi_last_bb (loop->latch);
1502 stmt = gimple_build_omp_continue (cvar_next, cvar);
1503 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1504 SSA_NAME_DEF_STMT (cvar_next) = stmt;
1506 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR. */
1507 gsi = gsi_last_bb (ex_bb);
1508 gsi_insert_after (&gsi, gimple_build_omp_return (true), GSI_NEW_STMT);
1513 /* Generates code to execute the iterations of LOOP in N_THREADS
1514 threads in parallel.
1516 NITER describes number of iterations of LOOP.
1517 REDUCTION_LIST describes the reductions existent in the LOOP. */
1520 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1521 unsigned n_threads, struct tree_niter_desc *niter)
1525 tree many_iterations_cond, type, nit;
1526 tree arg_struct, new_arg_struct;
1528 basic_block parallel_head;
1530 struct clsn_data clsn_data;
1535 ---------------------------------------------------------------------
1538 IV = phi (INIT, IV + STEP)
1544 ---------------------------------------------------------------------
1546 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1547 we generate the following code:
1549 ---------------------------------------------------------------------
1552 || NITER < MIN_PER_THREAD * N_THREADS)
1556 store all local loop-invariant variables used in body of the loop to DATA.
1557 GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1558 load the variables from DATA.
1559 GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1562 GIMPLE_OMP_CONTINUE;
1563 GIMPLE_OMP_RETURN -- GIMPLE_OMP_FOR
1564 GIMPLE_OMP_RETURN -- GIMPLE_OMP_PARALLEL
1570 IV = phi (INIT, IV + STEP)
1581 /* Create two versions of the loop -- in the old one, we know that the
1582 number of iterations is large enough, and we will transform it into the
1583 loop that will be split to loop_fn, the new one will be used for the
1584 remaining iterations. */
/* Force NITER into a gimple value NIT computable in the preheader.  */
1586 type = TREE_TYPE (niter->niter);
1587 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1590 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
/* Guard: parallelize only when the loop both certainly runs (may_be_zero
   is false) and has at least MIN_PER_THREAD iterations per thread.  */
1592 many_iterations_cond =
1593 fold_build2 (GE_EXPR, boolean_type_node,
1594 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1595 many_iterations_cond
1596 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1597 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1598 many_iterations_cond);
1599 many_iterations_cond
1600 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1602 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1603 if (!is_gimple_condexpr (many_iterations_cond))
1605 many_iterations_cond
1606 = force_gimple_operand (many_iterations_cond, &stmts,
1609 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1612 initialize_original_copy_tables ();
1614 /* We assume that the loop usually iterates a lot. */
1615 prob = 4 * REG_BR_PROB_BASE / 5;
1616 nloop = loop_version (loop, many_iterations_cond, NULL,
1617 prob, prob, REG_BR_PROB_BASE - prob, true);
1618 update_ssa (TODO_update_ssa);
1619 free_original_copy_tables ();
1621 /* Base all the induction variables in LOOP on a single control one. */
1622 canonicalize_loop_ivs (loop, &nit);
1624 /* Ensure that the exit condition is the first statement in the loop. */
1625 transform_to_exit_first_loop (loop, reduction_list, nit);
1627 /* Generate initializations for reductions. */
1628 if (htab_elements (reduction_list) > 0)
1629 htab_traverse (reduction_list, initialize_reductions, loop);
1631 /* Eliminate the references to local variables from the loop. */
1632 gcc_assert (single_exit (loop));
1633 entry = loop_preheader_edge (loop);
1634 exit = single_dom_exit (loop);
1636 eliminate_local_variables (entry, exit);
1637 /* In the old loop, move all variables non-local to the loop to a structure
1638 and back, and create separate decls for the variables used in loop. */
1639 separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
1640 &new_arg_struct, &clsn_data);
1642 /* Create the parallel constructs. */
1643 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1644 new_arg_struct, n_threads);
1645 if (htab_elements (reduction_list) > 0)
1646 create_call_for_reduction (loop, reduction_list, &clsn_data);
1650 /* Cancel the loop (it is simpler to do it here rather than to teach the
1651 expander to do it). */
1652 cancel_loop_tree (loop);
1654 /* Free loop bound estimations that could contain references to
1655 removed statements. */
1656 FOR_EACH_LOOP (li, loop, 0)
1657 free_numbers_of_iterations_estimates_loop (loop);
1659 /* Expand the parallel constructs. We do it directly here instead of running
1660 a separate expand_omp pass, since it is more efficient, and less likely to
1661 cause troubles with further analyses not being able to deal with the
1664 omp_expand_local (parallel_head);
1667 /* Returns true when LOOP contains vector phi nodes. */
1670 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
1673 basic_block *bbs = get_loop_body_in_dom_order (loop);
1674 gimple_stmt_iterator gsi;
/* Scan every phi node in every block of the loop body for a result of
   VECTOR_TYPE.  */
1677 for (i = 0; i < loop->num_nodes; i++)
1678 for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
1679 if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE)
1688 /* Create a reduction_info struct, initialize it with REDUC_STMT
1689 and PHI, insert it to the REDUCTION_LIST. */
1692 build_new_reduction (htab_t reduction_list, gimple reduc_stmt, gimple phi)
1695 struct reduction_info *new_reduction;
1697 gcc_assert (reduc_stmt);
1699 if (dump_file && (dump_flags & TDF_DETAILS))
1702 "Detected reduction. reduction stmt is: \n");
1703 print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
1704 fprintf (dump_file, "\n");
/* Fill in the record; the reduction operation (PLUS_EXPR, ...) is taken
   from the rhs code of the reduction statement.  */
1707 new_reduction = XCNEW (struct reduction_info);
1709 new_reduction->reduc_stmt = reduc_stmt;
1710 new_reduction->reduc_phi = phi;
1711 new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt);
/* Insert into the hash table keyed by the reduction phi (ownership of the
   XCNEW'd record passes to REDUCTION_LIST, which was created with free as
   its delete function).  */
1712 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
1713 *slot = new_reduction;
1716 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST. */
1719 gather_scalar_reductions (loop_p loop, htab_t reduction_list)
1721 gimple_stmt_iterator gsi;
1722 loop_vec_info simple_loop_info;
/* Reuse the vectorizer's loop-form analysis and reduction detection.  */
1725 simple_loop_info = vect_analyze_loop_form (loop);
1727 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1729 gimple phi = gsi_stmt (gsi);
1731 tree res = PHI_RESULT (phi);
/* Only scalar (gimple reg) phis can be reductions.  */
1734 if (!is_gimple_reg (res))
/* A header phi that is not a simple induction variable is a reduction
   candidate; let the vectorizer verify it.  */
1737 if (!simple_iv (loop, loop, res, &iv, true)
1738 && simple_loop_info)
1740 gimple reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, &double_reduc);
1742 build_new_reduction (reduction_list, reduc_stmt, phi);
1745 destroy_loop_vec_info (simple_loop_info, true);
1748 /* Try to initialize NITER for code generation part. */
1751 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
1753 edge exit = single_dom_exit (loop);
1757 /* We need to know # of iterations, and there should be no uses of values
1758 defined inside loop outside of it, unless the values are invariants of
/* Bail out (with a dump note) when the iteration count cannot be
   determined for the single dominating exit.  */
1760 if (!number_of_iterations_exit (loop, exit, niter, false))
1762 if (dump_file && (dump_flags & TDF_DETAILS))
1763 fprintf (dump_file, " FAILED: number of iterations not known\n");
1770 /* Try to initialize REDUCTION_LIST for code generation part.
1771 REDUCTION_LIST describes the reductions. */
1774 try_create_reduction_list (loop_p loop, htab_t reduction_list)
1776 edge exit = single_dom_exit (loop);
1777 gimple_stmt_iterator gsi;
1781 gather_scalar_reductions (loop, reduction_list);
/* Every scalar value live after the loop must be part of a detected
   reduction, otherwise the loop cannot be parallelized.  */
1784 for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
1786 gimple phi = gsi_stmt (gsi);
1787 struct reduction_info *red;
1788 imm_use_iterator imm_iter;
1789 use_operand_p use_p;
1791 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1793 if (is_gimple_reg (val))
1795 if (dump_file && (dump_flags & TDF_DETAILS))
1797 fprintf (dump_file, "phi is ");
1798 print_gimple_stmt (dump_file, phi, 0, 0);
1799 fprintf (dump_file, "arg of phi to exit: value ");
1800 print_generic_expr (dump_file, val, 0);
1801 fprintf (dump_file, " used outside loop\n");
1803 " checking if it a part of reduction pattern: \n");
/* No reductions were detected at all, so VAL cannot belong to one.  */
1805 if (htab_elements (reduction_list) == 0)
1807 if (dump_file && (dump_flags & TDF_DETAILS))
1809 " FAILED: it is not a part of reduction.\n");
/* Find the in-loop use of VAL; that statement should be the reduction's
   phi, which we then look up in REDUCTION_LIST.  */
1813 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
1815 if (flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
1817 reduc_phi = USE_STMT (use_p);
1821 red = reduction_phi (reduction_list, reduc_phi);
1824 if (dump_file && (dump_flags & TDF_DETAILS))
1826 " FAILED: it is not a part of reduction.\n");
1829 if (dump_file && (dump_flags & TDF_DETAILS))
1831 fprintf (dump_file, "reduction phi is ");
1832 print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
1833 fprintf (dump_file, "reduction stmt is ");
1834 print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
1839 /* The iterations of the loop may communicate only through bivs whose
1840 iteration space can be distributed efficiently. */
1841 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1843 gimple phi = gsi_stmt (gsi);
1844 tree def = PHI_RESULT (phi);
/* A scalar header phi that is neither a simple IV nor a reduction means a
   cross-iteration scalar dependency -- give up.  */
1847 if (is_gimple_reg (def) && !simple_iv (loop, loop, def, &iv, true))
1849 struct reduction_info *red;
1851 red = reduction_phi (reduction_list, phi);
1854 if (dump_file && (dump_flags & TDF_DETAILS))
1856 " FAILED: scalar dependency between iterations\n");
1866 /* Detect parallel loops and generate parallel code using libgomp
1867 primitives. Returns true if some loop was parallelized, false
1871 parallelize_loops (void)
1873 unsigned n_threads = flag_tree_parallelize_loops;
1874 bool changed = false;
1876 struct tree_niter_desc niter_desc;
1878 htab_t reduction_list;
1880 /* Do not parallelize loops in the functions created by parallelization. */
1881 if (parallelized_function_p (cfun->decl))
/* The table owns its reduction_info records (delete function is free);
   it is emptied per loop and destroyed at the end.  */
1884 reduction_list = htab_create (10, reduction_info_hash,
1885 reduction_info_eq, free);
1886 init_stmt_vec_info_vec ();
1888 FOR_EACH_LOOP (li, loop, 0)
1890 htab_empty (reduction_list);
1892 /* If we use autopar in graphite pass, we use its marked dependency
1893 checking results. */
1894 if (flag_loop_parallelize_all && !loop->can_be_parallel)
1897 /* FIXME: Only consider innermost loops with just one exit. */
1898 if (loop->inner || !single_dom_exit (loop))
1901 if (/* And of course, the loop must be parallelizable. */
1902 !can_duplicate_loop_p (loop)
1903 || loop_has_blocks_with_irreducible_flag (loop)
1904 /* FIXME: the check for vector phi nodes could be removed. */
1905 || loop_has_vector_phi_nodes (loop))
1908 /* FIXME: Bypass this check as graphite doesn't update the
1909 count and frequency correctly now. */
/* Profitability: skip loops with too few iterations per thread.  */
1910 if (!flag_loop_parallelize_all
1911 && ((estimated_loop_iterations_int (loop, false)
1912 <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
1913 /* Do not bother with loops in cold areas. */
1914 || optimize_loop_nest_for_size_p (loop)))
1917 if (!try_get_loop_niter (loop, &niter_desc))
1920 if (!try_create_reduction_list (loop, reduction_list))
1923 if (!flag_loop_parallelize_all && !loop_parallel_p (loop))
1927 gen_parallel_loop (loop, reduction_list,
1928 n_threads, &niter_desc);
/* Sanity-check CFG, dominators, loop structure and loop-closed SSA after
   each transformation.  */
1929 verify_flow_info ();
1930 verify_dominators (CDI_DOMINATORS);
1931 verify_loop_structure ();
1932 verify_loop_closed_ssa ();
1935 free_stmt_vec_info_vec ();
1936 htab_delete (reduction_list);
1938 /* Parallelization will cause new function calls to be inserted through
1939 which local variables will escape. Reset the points-to solutions
1940 for ESCAPED and CALLUSED. */
1943 pt_solution_reset (&cfun->gimple_df->escaped);
1944 pt_solution_reset (&cfun->gimple_df->callused);
1950 #include "gt-tree-parloops.h"