1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
25 #include "coretypes.h"
29 #include "tree-flow.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
47 Most of the complexity is in bringing the code into shape expected
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation will be introduced by an example.
75 for (i = 0; i < N; i++)
85 # sum_29 = PHI <sum_11(5), 1(3)>
86 # i_28 = PHI <i_12(5), 0(3)>
89 sum_11 = D.1795_8 + sum_29;
97 # sum_21 = PHI <sum_11(4)>
98 printf (&"%d"[0], sum_21);
101 after reduction transformation (only relevant parts):
109 # Storing the initial value given by the user. #
111 .paral_data_store.32.sum.27 = 1;
113 #pragma omp parallel num_threads(4)
115 #pragma omp for schedule(static)
117 # The neutral element corresponding to the particular
118 reduction's operation, e.g. 0 for PLUS_EXPR,
119 1 for MULT_EXPR, etc. replaces the user's initial value. #
121 # sum.27_29 = PHI <sum.27_11, 0>
123 sum.27_11 = D.1827_8 + sum.27_29;
127 # Adding this reduction phi is done at create_phi_for_local_result() #
128 # sum.27_56 = PHI <sum.27_11, 0>
131 # Creating the atomic operation is done at
132 create_call_for_reduction_1() #
134 #pragma omp atomic_load
135 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
136 D.1840_60 = sum.27_56 + D.1839_59;
137 #pragma omp atomic_store (D.1840_60);
141 # collecting the result after the join of the threads is done at
142 create_loads_for_reductions().
143 The value computed by the threads is loaded from the
147 .paral_data_load.33_52 = &.paral_data_store.32;
148 sum_37 = .paral_data_load.33_52->sum.27;
149 sum_43 = D.1795_41 + sum_37;
152 # sum_21 = PHI <sum_43, sum_26>
153 printf (&"%d"[0], sum_21);
161 /* Minimal number of iterations of a loop that should be executed in each
163 #define MIN_PER_THREAD 100
165 /* Element of the hashtable, representing a
166 reduction in the current loop. */
/* NOTE(review): the embedded numbering skips lines here (168, 180-182, ...),
   so the struct's braces and possibly further members are missing from this
   view of the file.  */
167 struct reduction_info
169 tree reduc_stmt; /* reduction statement. */
170 tree reduc_phi; /* The phi node defining the reduction. */
171 enum tree_code reduction_code; /* code for the reduction operation. */
172 tree keep_res; /* The PHI_RESULT of this phi is the resulting value
173 of the reduction variable when exiting the loop. */
174 tree initial_value; /* The initial value of the reduction var before entering the loop. */
175 tree field; /* the name of the field in the parloop data structure intended for reduction. */
176 tree init; /* reduction initialization value (the neutral element,
   e.g. 0 for PLUS_EXPR) -- see initialize_reductions. */
177 tree new_phi; /* (helper field) Newly created phi node whose result
178 will be passed to the atomic operation. Represents
179 the local result each thread computed for the reduction
183 /* Equality and hash functions for hashtab code. */
186 reduction_info_eq (const void *aa, const void *bb)
188 const struct reduction_info *a = (const struct reduction_info *) aa;
189 const struct reduction_info *b = (const struct reduction_info *) bb;
191 return (a->reduc_phi == b->reduc_phi);
195 reduction_info_hash (const void *aa)
197 const struct reduction_info *a = (const struct reduction_info *) aa;
199 return htab_hash_pointer (a->reduc_phi);
202 static struct reduction_info *
203 reduction_phi (htab_t reduction_list, tree phi)
205 struct reduction_info tmpred, *red;
207 if (htab_elements (reduction_list) == 0)
210 tmpred.reduc_phi = phi;
211 red = htab_find (reduction_list, &tmpred);
216 /* Element of hashtable of names to copy.  Maps an ssa name version in the
   original function to its duplicate used inside the split-out loop body. */
/* NOTE(review): numbering gap after the last field -- the comment's
   continuation and the closing brace are missing from this view.  */
218 struct name_to_copy_elt
220 unsigned version; /* The version of the name to copy. */
221 tree new_name; /* The new name used in the copy. */
222 tree field; /* The field of the structure used to pass the
226 /* Equality and hash functions for hashtab code. */
229 name_to_copy_elt_eq (const void *aa, const void *bb)
231 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
232 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
234 return a->version == b->version;
238 name_to_copy_elt_hash (const void *aa)
240 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
242 return (hashval_t) a->version;
245 /* Returns true if the iterations of LOOP are independent on each other (that
246 is, if we can execute them in parallel), and if LOOP satisfies other
247 conditions that we need to be able to parallelize it. Description of number
248 of iterations is stored to NITER. Reduction analysis is done, if
249 reductions are found, they are inserted to the REDUCTION_LIST. */
/* NOTE(review): the embedded numbering skips lines throughout this function
   (e.g. 250-251, 253, 258-259, ...), so the return type, several statements
   and closing braces are not visible here -- the comments below describe
   only what the visible lines demonstrate.  */
252 loop_parallel_p (struct loop *loop, htab_t reduction_list, struct tree_niter_desc *niter)
254 edge exit = single_dom_exit (loop);
255 VEC (ddr_p, heap) * dependence_relations;
256 VEC (data_reference_p, heap) * datarefs;
257 lambda_trans_matrix trans;
260 loop_vec_info simple_loop_info;
262 /* Only consider innermost loops with just one exit. The innermost-loop
263 restriction is not necessary, but it makes things simpler. */
264 if (loop->inner || !exit)
267 if (dump_file && (dump_flags & TDF_DETAILS))
268 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
270 /* We need to know # of iterations, and there should be no uses of values
271 defined inside loop outside of it, unless the values are invariants of
273 if (!number_of_iterations_exit (loop, exit, niter, false))
275 if (dump_file && (dump_flags & TDF_DETAILS))
276 fprintf (dump_file, " FAILED: number of iterations not known\n");
/* Reuse the vectorizer's loop-form analysis so that
   vect_is_simple_reduction below can recognize reduction cycles.  */
280 simple_loop_info = vect_analyze_loop_form (loop);
/* Scan the header phis for reductions and record each one found.  */
282 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
284 tree reduc_stmt = NULL, operation;
286 /* ??? TODO: Change this into a generic function that
287 recognizes reductions. */
288 if (!is_gimple_reg (PHI_RESULT (phi)))
290 if (simple_loop_info)
291 reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi);
293 /* Create a reduction_info struct, initialize it and insert it to
294 the reduction list. */
299 struct reduction_info *new_reduction;
301 if (dump_file && (dump_flags & TDF_DETAILS))
304 "Detected reduction. reduction stmt is: \n");
305 print_generic_stmt (dump_file, reduc_stmt, 0);
306 fprintf (dump_file, "\n");
309 new_reduction = XCNEW (struct reduction_info);
311 new_reduction->reduc_stmt = reduc_stmt;
312 new_reduction->reduc_phi = phi;
/* The reduction operation code is taken from the rhs of the
   reduction statement.  */
313 operation = GIMPLE_STMT_OPERAND (reduc_stmt, 1);
314 new_reduction->reduction_code = TREE_CODE (operation);
315 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
316 *slot = new_reduction;
/* Check that every value live after the loop exit is produced by a
   recognized reduction; any other use outside the loop blocks
   parallelization.  */
320 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
322 struct reduction_info *red;
323 imm_use_iterator imm_iter;
327 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
329 if (is_gimple_reg (val))
331 if (dump_file && (dump_flags & TDF_DETAILS))
333 fprintf (dump_file, "phi is ");
334 print_generic_expr (dump_file, phi, 0);
335 fprintf (dump_file, "arg of phi to exit: value ");
336 print_generic_expr (dump_file, val, 0);
337 fprintf (dump_file, " used outside loop\n");
339 " checking if it a part of reduction pattern: \n");
341 if (htab_elements (reduction_list) == 0)
343 if (dump_file && (dump_flags & TDF_DETAILS))
345 " FAILED: it is not a part of reduction.\n");
/* Find the phi inside the loop that VAL feeds -- that should be the
   reduction phi recorded above.  */
349 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
351 if (flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
353 reduc_phi = USE_STMT (use_p);
357 red = reduction_phi (reduction_list, reduc_phi);
360 if (dump_file && (dump_flags & TDF_DETAILS))
362 " FAILED: it is not a part of reduction.\n");
365 if (dump_file && (dump_flags & TDF_DETAILS))
367 fprintf (dump_file, "reduction phi is ");
368 print_generic_expr (dump_file, red->reduc_phi, 0);
369 fprintf (dump_file, "reduction stmt is ");
370 print_generic_expr (dump_file, red->reduc_stmt, 0);
376 /* The iterations of the loop may communicate only through bivs whose
377 iteration space can be distributed efficiently. */
378 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
380 tree def = PHI_RESULT (phi);
/* A header phi that is neither a simple induction variable nor a
   recorded reduction means a scalar dependence between iterations.  */
383 if (is_gimple_reg (def) && !simple_iv (loop, phi, def, &iv, true))
385 struct reduction_info *red;
387 red = reduction_phi (reduction_list, phi);
390 if (dump_file && (dump_flags & TDF_DETAILS))
392 " FAILED: scalar dependency between iterations\n");
398 /* We need to version the loop to verify assumptions in runtime. */
399 if (!can_duplicate_loop_p (loop))
401 if (dump_file && (dump_flags & TDF_DETAILS))
402 fprintf (dump_file, " FAILED: cannot be duplicated\n");
406 /* Check for problems with dependences. If the loop can be reversed,
407 the iterations are independent. */
408 datarefs = VEC_alloc (data_reference_p, heap, 10);
409 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
410 compute_data_dependences_for_loop (loop, true, &datarefs,
411 &dependence_relations);
412 if (dump_file && (dump_flags & TDF_DETAILS))
413 dump_data_dependence_relations (dump_file, dependence_relations);
/* A 1x1 lambda transform with entry -1 represents loop reversal; if the
   reversal is legal w.r.t. the dependences, iterations are independent.  */
415 trans = lambda_trans_matrix_new (1, 1);
416 LTM_MATRIX (trans)[0][0] = -1;
418 if (lambda_transform_legal_p (trans, 1, dependence_relations))
421 if (dump_file && (dump_flags & TDF_DETAILS))
422 fprintf (dump_file, " SUCCESS: may be parallelized\n");
424 else if (dump_file && (dump_flags & TDF_DETAILS))
426 " FAILED: data dependencies exist across iterations\n");
428 free_dependence_relations (dependence_relations);
429 free_data_refs (datarefs);
434 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
435 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
436 to their addresses that can be reused. The address of OBJ is known to
437 be invariant in the whole function. */
/* NOTE(review): numbering gaps (438-439, 441-443, 447, 451, ...) -- the
   return type, the opening of the component-ref walk loop, hash-slot tests
   and closing braces are not visible in this view.  */
440 take_address_of (tree obj, tree type, struct loop *loop, htab_t decl_address)
444 struct int_tree_map ielt, *nielt;
445 tree *var_p, name, bvar, stmt, addr;
446 edge entry = loop_preheader_edge (loop);
448 /* Since the address of OBJ is invariant, the trees may be shared.
449 Avoid rewriting unrelated parts of the code. */
450 obj = unshare_expr (obj);
/* Strip handled components to reach the base decl of OBJ.  */
452 handled_component_p (*var_p);
453 var_p = &TREE_OPERAND (*var_p, 0))
/* Look the base decl up in DECL_ADDRESS by its DECL_UID; on a miss,
   materialize the address in a new temporary on the preheader edge.  */
455 uid = DECL_UID (*var_p);
458 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
461 addr = build_addr (*var_p, current_function_decl);
462 bvar = create_tmp_var (TREE_TYPE (addr), get_name (*var_p));
463 add_referenced_var (bvar);
464 stmt = build_gimple_modify_stmt (bvar, addr);
465 name = make_ssa_name (bvar, stmt);
466 GIMPLE_STMT_OPERAND (stmt, 0) = name;
467 bsi_insert_on_edge_immediate (entry, stmt);
/* Cache the new address under the decl's uid for reuse.  */
469 nielt = XNEW (struct int_tree_map);
/* Cache hit: reuse the previously created address name.  */
475 name = ((struct int_tree_map *) *dslot)->to;
/* Rewrite the base as a dereference of the cached address, then
   regimplify the address of the whole component reference.  */
479 *var_p = build1 (INDIRECT_REF, TREE_TYPE (*var_p), name);
480 name = force_gimple_operand (build_addr (obj, current_function_decl),
481 &stmt, true, NULL_TREE);
483 bsi_insert_on_edge_immediate (entry, stmt);
/* Convert the result to TYPE if needed, again on the preheader edge.  */
486 if (TREE_TYPE (name) != type)
488 name = force_gimple_operand (fold_convert (type, name), &stmt, true,
491 bsi_insert_on_edge_immediate (entry, stmt);
497 /* Callback for htab_traverse. Create the initialization statement
498 for reduction described in SLOT, and place it at the preheader of
499 the loop described in DATA. */
/* NOTE(review): numbering gaps -- the return type (htab_traverse callbacks
   return int), several declarations and the closing brace are not visible
   in this view.  */
502 initialize_reductions (void **slot, void *data)
505 tree bvar, type, arg;
508 struct reduction_info *reduc = *slot;
509 struct loop *loop = (struct loop *) data;
511 /* Create initialization in preheader:
512 reduction_variable = initialization value of reduction. */
514 /* In the phi node at the header, replace the argument coming
515 from the preheader with the reduction initialization value. */
517 /* Create a new variable to initialize the reduction. */
518 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
519 bvar = create_tmp_var (type, "reduction");
520 add_referenced_var (bvar);
/* Build an OMP_CLAUSE_REDUCTION clause only to query the operation's
   neutral element via omp_reduction_init below.  */
522 c = build_omp_clause (OMP_CLAUSE_REDUCTION);
523 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
524 OMP_CLAUSE_DECL (c) =
525 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0));
527 init = omp_reduction_init (c, TREE_TYPE (bvar));
530 /* Replace the argument representing the initialization value
531 with the initialization value for the reduction (neutral
532 element for the particular operation, e.g. 0 for PLUS_EXPR,
533 1 for MULT_EXPR, etc).
534 Keep the old value in a new variable "reduction_initial",
535 that will be taken in consideration after the parallel
536 computing is done. */
538 e = loop_preheader_edge (loop);
539 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
540 /* Create new variable to hold the initial value. */
542 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
543 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
/* Remember the user's original initial value; it is folded back in
   after the threads join (see create_stores_for_reduction).  */
544 reduc->initial_value = arg;
555 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
556 contains addresses of the references that had their address taken already.
557 If the expression is changed, CHANGED is set to true. Callback for
/* NOTE(review): numbering gaps -- the return type (walk_tree callbacks
   return tree), the DECL_P test guarding the first branch and several
   returns/braces are not visible in this view.  */
561 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
563 struct elv_data *dta = data;
564 tree t = *tp, var, addr, addr_type, type, obj;
/* Plain decl reference: skip non-locals, then rewrite the reference as
   a dereference of the decl's (hoisted) address.  */
570 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
573 type = TREE_TYPE (t);
574 addr_type = build_pointer_type (type);
575 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
576 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
582 if (TREE_CODE (t) == ADDR_EXPR)
584 /* ADDR_EXPR may appear in two contexts:
585 -- as a gimple operand, when the address taken is a function invariant
586 -- as gimple rhs, when the resulting address in not a function
588 We do not need to do anything special in the latter case (the base of
589 the memory reference whose address is taken may be replaced in the
590 DECL_P case). The former case is more complicated, as we need to
591 ensure that the new address is still a gimple operand. Thus, it
592 is not sufficient to replace just the base of the memory reference --
593 we need to move the whole computation of the address out of the
595 if (!is_gimple_val (t))
/* Only addresses of non-external local bases need rewriting.  */
599 obj = TREE_OPERAND (t, 0);
600 var = get_base_address (obj);
601 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
604 addr_type = TREE_TYPE (t);
605 addr = take_address_of (obj, addr_type, dta->loop, dta->decl_address);
/* Do not walk into non-expression leaves.  */
612 if (!EXPR_P (t) && !GIMPLE_STMT_P (t))
618 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
619 contains addresses for the references for that we have already taken
/* NOTE(review): numbering gaps -- the return type, the elv_data declaration
   and its loop/changed initialization, and the post-walk update (presumably
   marking the stmt modified when dta.changed is set -- confirm against full
   source) are not visible in this view.  */
623 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
629 dta.decl_address = decl_address;
/* Rewrite every local-variable reference in STMT via the walk_tree
   callback above.  */
632 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
638 /* Eliminates the references to local variables from LOOP.
640 1) Taking address of a local variable -- these are moved out of the
641 loop (and temporary variable is created to hold the address if
643 2) Dereferencing a local variable -- these are replaced with indirect
/* NOTE(review): numbering gaps -- the return type, loop counter declaration
   and closing brace are not visible in this view.  */
647 eliminate_local_variables (struct loop *loop)
649 basic_block bb, *body = get_loop_body (loop);
651 block_stmt_iterator bsi;
/* Per-invocation cache of decl-uid -> hoisted address temporaries, shared
   by all statements of the loop via eliminate_local_variables_stmt.  */
652 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
655 /* Find and rename the ssa names defined outside of loop. */
656 for (i = 0; i < loop->num_nodes; i++)
660 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
661 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
664 htab_delete (decl_address);
667 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
668 The copies are stored to NAME_COPIES, if NAME was already duplicated,
669 its duplicate stored in NAME_COPIES is returned.
671 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
672 duplicated, storing the copies in DECL_COPIES. */
/* NOTE(review): numbering gaps -- the return type, the copy_name_p
   parameter line, slot tests and several returns/braces are not visible in
   this view.  */
675 separate_decls_in_loop_name (tree name,
676 htab_t name_copies, htab_t decl_copies,
679 tree copy, var, var_copy;
680 unsigned idx, uid, nuid;
681 struct int_tree_map ielt, *nielt;
682 struct name_to_copy_elt elt, *nelt;
683 void **slot, **dslot;
685 if (TREE_CODE (name) != SSA_NAME)
/* An already-recorded version: hand back its existing duplicate.  */
688 idx = SSA_NAME_VERSION (name);
690 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
691 copy_name_p ? INSERT : NO_INSERT);
693 return ((struct name_to_copy_elt *) *slot)->new_name;
/* Duplicate the base decl (once per decl uid).  */
695 var = SSA_NAME_VAR (name);
696 uid = DECL_UID (var);
698 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
701 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
702 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
703 add_referenced_var (var_copy);
704 nielt = XNEW (struct int_tree_map);
706 nielt->to = var_copy;
709 /* Ensure that when we meet this decl next time, we won't duplicate
/* Map the copy's own uid to itself so a later lookup of VAR_COPY is a
   no-op rather than a second duplication.  */
711 nuid = DECL_UID (var_copy);
713 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
714 gcc_assert (!*dslot);
715 nielt = XNEW (struct int_tree_map);
717 nielt->to = var_copy;
721 var_copy = ((struct int_tree_map *) *dslot)->to;
/* When requested, also duplicate the ssa name itself and record it.  */
725 copy = duplicate_ssa_name (name, NULL_TREE);
726 nelt = XNEW (struct name_to_copy_elt);
728 nelt->new_name = copy;
729 nelt->field = NULL_TREE;
/* Rebase the (copied or original) name onto the duplicated decl.  */
738 SSA_NAME_VAR (copy) = var_copy;
742 /* Finds the ssa names used in STMT that are defined outside of LOOP and
743 replaces such ssa names with their duplicates. The duplicates are stored to
744 NAME_COPIES. Base decls of all ssa names used in STMT
745 (including those defined in LOOP) are replaced with the new temporary
746 variables; the replacement decls are stored in DECL_COPIES. */
/* NOTE(review): numbering gaps -- the return type, operand-iterator
   declarations and closing braces are not visible in this view.  */
749 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
750 htab_t name_copies, htab_t decl_copies)
758 mark_virtual_ops_for_renaming (stmt);
/* Defs stay inside the loop: only their base decl is replaced, the name
   itself must not change (asserted below).  */
760 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
762 name = DEF_FROM_PTR (def);
763 gcc_assert (TREE_CODE (name) == SSA_NAME);
764 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
766 gcc_assert (copy == name);
/* Uses of loop-invariant names are rewritten to their duplicates.  */
769 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
771 name = USE_FROM_PTR (use);
772 if (TREE_CODE (name) != SSA_NAME)
775 copy_name_p = expr_invariant_in_loop_p (loop, name);
776 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
782 /* Callback for htab_traverse. Adds a field corresponding to the reduction
783 specified in SLOT. The type is passed in DATA. */
/* NOTE(review): numbering gaps -- return type, the TYPE local taken from
   DATA, the red->field assignment and return are not visible in this
   view.  */
786 add_field_for_reduction (void **slot, void *data)
789 struct reduction_info *red = *slot;
/* Field name/type mirror the variable underlying the reduction result.  */
791 tree var = SSA_NAME_VAR (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
792 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
794 insert_field_into_struct (type, field);
801 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
802 described in SLOT. The type is passed in DATA. */
/* NOTE(review): numbering gaps -- return type, the TYPE local, the
   elt->field assignment and return are not visible in this view.  */
805 add_field_for_name (void **slot, void *data)
807 struct name_to_copy_elt *elt = *slot;
/* Field name/type mirror the variable underlying the recorded version.  */
809 tree name = ssa_name (elt->version);
810 tree var = SSA_NAME_VAR (name);
811 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
813 insert_field_into_struct (type, field);
819 /* Callback for htab_traverse. A local result is the intermediate result
821 thread, or the initial value in case no iteration was executed.
822 This function creates a phi node reflecting these values.
823 The phi's result will be stored in NEW_PHI field of the
824 reduction's data structure. */
/* NOTE(review): numbering gaps -- return type and several declarations
   (edge e, new_phi, local_res) are not visible in this view.  */
827 create_phi_for_local_result (void **slot, void *data)
829 struct reduction_info *reduc = *slot;
830 struct loop *loop = data;
833 basic_block store_bb;
836 /* STORE_BB is the block where the phi
837 should be stored. It is the destination of the loop exit.
838 (Find the fallthru edge from OMP_CONTINUE). */
839 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
841 /* STORE_BB has two predecessors. One coming from the loop
842 (the reduction's result is computed at the loop),
843 and another coming from a block preceding the loop,
845 are executed (the initial value should be taken). */
/* E is whichever predecessor does NOT come from the loop latch.  */
846 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
847 e = EDGE_PRED (store_bb, 1);
849 e = EDGE_PRED (store_bb, 0);
850 local_res = make_ssa_name (SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0)), NULL_TREE);
851 new_phi = create_phi_node (local_res, store_bb);
852 SSA_NAME_DEF_STMT (local_res) = new_phi;
/* Merge the neutral init value (loop not entered) with the value the
   thread computed (loop exited through the latch).  */
853 add_phi_arg (new_phi, reduc->init, e);
854 add_phi_arg (new_phi, GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0),
855 FALLTHRU_EDGE (loop->latch));
856 reduc->new_phi = new_phi;
866 basic_block store_bb;
870 /* Callback for htab_traverse. Create an atomic instruction for the
871 reduction described in SLOT.
872 DATA annotates the place in memory the atomic operation relates to,
873 and the basic block it needs to be generated in. */
/* NOTE(review): numbering gaps -- return type, basic_block/edge locals,
   the ref assignment feeding the fold_build2 below and the return are not
   visible in this view.  */
876 create_call_for_reduction_1 (void **slot, void *data)
878 struct reduction_info *reduc = *slot;
879 struct clsn_data *clsn_data = data;
880 block_stmt_iterator bsi;
881 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
882 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
887 tree t, addr, addr_type, ref, x;
888 tree tmp_load, load, name;
/* Address of this reduction's field inside the shared data structure --
   the memory the atomic update targets.  */
890 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
891 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
892 addr_type = build_pointer_type (type);
894 addr = build_addr (t, current_function_decl);
896 /* Create phi node. */
897 bb = clsn_data->load_bb;
899 e = split_block (bb, t);
/* OMP_ATOMIC_LOAD / OMP_ATOMIC_STORE pair: load the shared field,
   combine it with this thread's local result, store it back.  */
902 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
903 add_referenced_var (tmp_load);
904 tmp_load = make_ssa_name (tmp_load, NULL);
905 load = build2 (OMP_ATOMIC_LOAD, void_type_node, tmp_load, addr);
906 SSA_NAME_DEF_STMT (tmp_load) = load;
907 bsi = bsi_start (new_bb);
908 bsi_insert_after (&bsi, load, BSI_NEW_STMT);
910 e = split_block (new_bb, load);
912 bsi = bsi_start (new_bb);
915 fold_build2 (reduc->reduction_code,
916 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
917 PHI_RESULT (reduc->new_phi));
920 force_gimple_operand_bsi (&bsi, x, true, NULL_TREE, true,
921 BSI_CONTINUE_LINKING);
923 x = build1 (OMP_ATOMIC_STORE, void_type_node, name);
925 bsi_insert_after (&bsi, x, BSI_NEW_STMT);
929 /* Create the atomic operation at the join point of the threads.
930 REDUCTION_LIST describes the reductions in the LOOP.
931 LD_ST_DATA describes the shared data structure where
932 shared data is stored in and loaded from. */
/* NOTE(review): numbering gaps -- the return type line and closing brace
   are not visible in this view.  */
934 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
935 struct clsn_data *ld_st_data)
/* First build the per-thread local-result phis, then emit one atomic
   update per reduction into the block after the loop exit.  */
937 htab_traverse (reduction_list, create_phi_for_local_result, loop);
938 /* Find the fallthru edge from OMP_CONTINUE. */
939 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
940 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
943 /* Callback for htab_traverse. Loads the final reduction value at the
944 join point of all threads, and inserts it in the right place. */
/* NOTE(review): numbering gaps -- return type, stmt/name/x/load_struct
   declarations, the x assignment and the return are not visible in this
   view.  */
947 create_loads_for_reductions (void **slot, void *data)
949 struct reduction_info *red = *slot;
950 struct clsn_data *clsn_data = data;
952 block_stmt_iterator bsi;
953 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
954 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
/* Read the reduction's field back out of the shared structure.  */
959 bsi = bsi_after_labels (clsn_data->load_bb);
960 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
961 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
/* Redefine the exit phi's result by the loaded value and retire the
   phi itself -- the threads' result replaces it.  */
965 name = PHI_RESULT (red->keep_res);
966 stmt = build_gimple_modify_stmt (name, x);
967 GIMPLE_STMT_OPERAND (stmt, 0) = name;
968 SSA_NAME_DEF_STMT (name) = stmt;
970 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
972 remove_phi_node (red->keep_res, NULL_TREE, false);
977 /* Load the reduction result that was stored in LD_ST_DATA.
978 REDUCTION_LIST describes the list of reductions that the
979 loads should be generated for. */
/* NOTE(review): numbering gaps -- the return type, the tree t declaration
   and closing brace are not visible in this view.  */
981 create_final_loads_for_reduction (htab_t reduction_list,
982 struct clsn_data *ld_st_data)
984 block_stmt_iterator bsi;
/* Re-point the load pointer at the store structure after the join, then
   emit one load per reduction via the traversal callback.  */
987 bsi = bsi_after_labels (ld_st_data->load_bb);
988 t = build_fold_addr_expr (ld_st_data->store);
990 build_gimple_modify_stmt (ld_st_data->load,
991 build_fold_addr_expr (ld_st_data->store));
993 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
994 SSA_NAME_DEF_STMT (ld_st_data->load) = t;
995 GIMPLE_STMT_OPERAND (t, 0) = ld_st_data->load;
997 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1001 /* Callback for htab_traverse. Store the neutral value for the
1002 particular reduction's operation, e.g. 0 for PLUS_EXPR,
1003 1 for MULT_EXPR, etc. into the reduction field.
1004 The reduction is specified in SLOT. The store information is
/* NOTE(review): numbering gaps -- return type, stmt declaration and the
   return are not visible in this view.  NOTE(review): the stored value is
   red->initial_value (the user's original init), despite the comment above
   mentioning the neutral value -- confirm against full source.  */
1008 create_stores_for_reduction (void **slot, void *data)
1010 struct reduction_info *red = *slot;
1011 struct clsn_data *clsn_data = data;
1013 block_stmt_iterator bsi;
1014 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
1016 bsi = bsi_last (clsn_data->store_bb);
1018 build_gimple_modify_stmt (build3
1019 (COMPONENT_REF, type, clsn_data->store,
1020 red->field, NULL_TREE),
1021 red->initial_value);
1022 mark_virtual_ops_for_renaming (stmt);
1023 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1028 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1029 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1030 specified in SLOT. */
/* NOTE(review): numbering gaps -- return type, stmt/load_struct
   declarations and the return are not visible in this view.  */
1033 create_loads_and_stores_for_name (void **slot, void *data)
1035 struct name_to_copy_elt *elt = *slot;
1036 struct clsn_data *clsn_data = data;
1038 block_stmt_iterator bsi;
1039 tree type = TREE_TYPE (elt->new_name);
1040 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
/* In STORE_BB (before the parallel region): structure.field = name.  */
1043 bsi = bsi_last (clsn_data->store_bb);
1045 build_gimple_modify_stmt (build3
1046 (COMPONENT_REF, type, clsn_data->store,
1047 elt->field, NULL_TREE),
1048 ssa_name (elt->version));
1049 mark_virtual_ops_for_renaming (stmt);
1050 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
/* In LOAD_BB (inside the new function): new_name = structure->field.  */
1052 bsi = bsi_last (clsn_data->load_bb);
1053 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1054 stmt = build_gimple_modify_stmt (elt->new_name,
1055 build3 (COMPONENT_REF, type, load_struct,
1056 elt->field, NULL_TREE));
1057 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1058 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1063 /* Moves all the variables used in LOOP and defined outside of it (including
1064 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1065 name) to a structure created for this purpose. The code
/* NOTE(review): numbering gaps -- the before/after code example
   (1066-1087) is missing from this view.  */
1073 is transformed this way:
1088 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1089 pointer `new' is intentionally not initialized (the loop will be split to a
1090 separate function later, and `new' will be initialized from its arguments).
1091 LD_ST_DATA holds information about the shared data structure used to pass
1092 information among the threads. It is initialized here, and
1093 gen_parallel_loop will pass it to create_call_for_reduction that
1094 needs this information. REDUCTION_LIST describes the reductions
/* NOTE(review): numbering gaps below -- return type, loop counter,
   several statements and the function's closing brace are not visible in
   this view.  */
1098 separate_decls_in_loop (struct loop *loop, htab_t reduction_list,
1099 tree * arg_struct, tree * new_arg_struct,
1100 struct clsn_data *ld_st_data)
/* BB0 (before the region) holds the stores; BB1 (the new preheader)
   holds the loads.  */
1103 basic_block bb1 = split_edge (loop_preheader_edge (loop));
1104 basic_block bb0 = single_pred (bb1);
1105 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1106 name_to_copy_elt_eq, free);
1107 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1109 basic_block bb, *body = get_loop_body (loop);
1111 tree phi, type, type_name, nvar;
1112 block_stmt_iterator bsi;
1113 struct clsn_data clsn_data;
1115 /* Find and rename the ssa names defined outside of loop. */
1116 for (i = 0; i < loop->num_nodes; i++)
1120 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1121 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
1123 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
1124 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
1129 if (htab_elements (name_copies) == 0)
1131 /* It may happen that there is nothing to copy (if there are only
1132 loop carried and external variables in the loop). */
1134 *new_arg_struct = NULL;
1138 /* Create the type for the structure to store the ssa names to. */
1139 type = lang_hooks.types.make_type (RECORD_TYPE);
1140 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
1142 TYPE_NAME (type) = type_name;
1144 htab_traverse (name_copies, add_field_for_name, type);
1145 if (htab_elements (reduction_list) > 0)
1147 /* Create the fields for reductions. */
1148 htab_traverse (reduction_list, add_field_for_reduction,
1153 /* Create the loads and stores. */
1154 *arg_struct = create_tmp_var (type, ".paral_data_store");
1155 add_referenced_var (*arg_struct);
1156 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1157 add_referenced_var (nvar);
1158 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
1160 ld_st_data->store = *arg_struct;
1161 ld_st_data->load = *new_arg_struct;
1162 ld_st_data->store_bb = bb0;
1163 ld_st_data->load_bb = bb1;
1165 htab_traverse (name_copies, create_loads_and_stores_for_name,
1168 /* Load the calculation from memory (after the join of the threads). */
1170 if (htab_elements (reduction_list) > 0)
/* Reductions: store initial values before the region, and reload the
   final results at the single dominating exit after the join.  */
1172 htab_traverse (reduction_list, create_stores_for_reduction,
1174 clsn_data.load = make_ssa_name (nvar, NULL_TREE);
1175 clsn_data.load_bb = single_dom_exit (loop)->dest;
1176 clsn_data.store = ld_st_data->store;
1177 create_final_loads_for_reduction (reduction_list, &clsn_data);
1181 htab_delete (decl_copies);
1182 htab_delete (name_copies);
1185 /* Bitmap containing uids of functions created by parallelization. We cannot
1186 allocate it from the default obstack, as it must live across compilation
1187 of several functions; we make it gc allocated instead. */
1189 static GTY(()) bitmap parallelized_functions;
1191 /* Returns true if FN was created by create_loop_fn. */
1194 parallelized_function_p (tree fn)
1196 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1199 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1202 /* Creates and returns an empty function that will receive the body of
1203 a parallelized loop. */
/* NOTE(review): numbering gaps -- the return type, buffer declarations for
   snprintf/ASM_FORMAT_PRIVATE_NAME, the TREE_TYPE assignment of the parm,
   and the final return of DECL are not visible in this view.  */
1206 create_loop_fn (void)
1210 tree decl, type, name, t;
1211 struct function *act_cfun = cfun;
/* Static counter makes each outlined function's private name unique.  */
1212 static unsigned loopfn_num;
/* Name: "<current function>.$loopfn" mangled into a private symbol.  */
1214 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1215 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1216 clean_symbol_name (tname);
1217 name = get_identifier (tname);
/* Signature: void fn (void *) -- the single pointer carries the
   .paral_data structure.  */
1218 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1220 decl = build_decl (FUNCTION_DECL, name, type);
/* Record the new function so parallelized_function_p can identify it.  */
1221 if (!parallelized_functions)
1222 parallelized_functions = BITMAP_GGC_ALLOC ();
1223 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1225 TREE_STATIC (decl) = 1;
1226 TREE_USED (decl) = 1;
1227 DECL_ARTIFICIAL (decl) = 1;
1228 DECL_IGNORED_P (decl) = 0;
1229 TREE_PUBLIC (decl) = 0;
1230 DECL_UNINLINABLE (decl) = 1;
1231 DECL_EXTERNAL (decl) = 0;
1232 DECL_CONTEXT (decl) = NULL_TREE;
1233 DECL_INITIAL (decl) = make_node (BLOCK);
/* void result.  */
1235 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
1236 DECL_ARTIFICIAL (t) = 1;
1237 DECL_IGNORED_P (t) = 1;
1238 DECL_RESULT (decl) = t;
/* Single parameter carrying the shared-data pointer.  */
1240 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
1242 DECL_ARTIFICIAL (t) = 1;
1243 DECL_ARG_TYPE (t) = ptr_type_node;
1244 DECL_CONTEXT (t) = decl;
1246 DECL_ARGUMENTS (decl) = t;
1248 allocate_struct_function (decl, false);
1250 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1252 set_cfun (act_cfun);
1257 /* Bases all the induction variables in LOOP on a single induction variable
1258 (unsigned with base 0 and step 1), whose final value is compared with
1259 NIT. The induction variable is incremented in the loop latch.
1260 REDUCTION_LIST describes the reductions in LOOP. */
1263 canonicalize_loop_ivs (struct loop *loop, htab_t reduction_list, tree nit)
1265 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
1266 tree phi, prev, res, type, var_before, val, atype, mtype, t, next;
1267 block_stmt_iterator bsi;
1270 edge exit = single_dom_exit (loop);
1271 struct reduction_info *red;
/* Find the widest precision among NIT and all gimple-register header phi
   results, so the canonical IV type can represent every original IV.  */
1273 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1275 res = PHI_RESULT (phi);
1277 if (is_gimple_reg (res) && TYPE_PRECISION (TREE_TYPE (res)) > precision)
1278 precision = TYPE_PRECISION (TREE_TYPE (res));
/* Unsigned type of the chosen precision.  */
1281 type = lang_hooks.types.type_for_size (precision, 1);
/* Create the canonical IV (base 0, step 1), incremented in the latch;
   VAR_BEFORE is its value at the start of each iteration.  */
1283 bsi = bsi_last (loop->latch);
1284 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
1285 loop, &bsi, true, &var_before, NULL);
1287 bsi = bsi_after_labels (loop->header);
/* Rewrite each remaining IV as an affine function of the canonical IV:
   res = base + step * var_before, inserted after the header labels.  */
1289 for (phi = phi_nodes (loop->header); phi; phi = next)
1291 next = PHI_CHAIN (phi);
1292 res = PHI_RESULT (phi);
/* Skip virtual operands and the canonical IV itself.  */
1294 if (!is_gimple_reg (res) || res == var_before)
1300 ok = simple_iv (loop, phi, res, &iv, true);
1301 red = reduction_phi (reduction_list, phi);
1302 /* We preserve the reduction phi nodes. */
1310 remove_phi_node (phi, prev, false);
1312 atype = TREE_TYPE (res);
/* Pointer IVs step in sizetype and are combined with POINTER_PLUS_EXPR.  */
1313 mtype = POINTER_TYPE_P (atype) ? sizetype : atype;
1314 val = fold_build2 (MULT_EXPR, mtype, unshare_expr (iv.step),
1315 fold_convert (mtype, var_before));
1316 val = fold_build2 (POINTER_TYPE_P (atype)
1317 ? POINTER_PLUS_EXPR : PLUS_EXPR,
1318 atype, unshare_expr (iv.base), val);
1319 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
1321 t = build_gimple_modify_stmt (res, val);
1322 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1323 SSA_NAME_DEF_STMT (res) = t;
1326 t = last_stmt (exit->src);
1327 /* Make the loop exit if the control condition is not satisfied. */
1328 if (exit->flags & EDGE_TRUE_VALUE)
/* The loop currently exits on the true branch; swap the edge flags so
   that the rewritten condition below continues the loop when true.  */
1332 extract_true_false_edges_from_block (exit->src, &te, &fe);
1333 te->flags = EDGE_FALSE_VALUE;
1334 fe->flags = EDGE_TRUE_VALUE;
/* The loop now iterates while VAR_BEFORE < NIT.  */
1336 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
1339 /* Moves the exit condition of LOOP to the beginning of its header, and
1340 duplicates the part of the last iteration that gets disabled to the
1341 exit of the loop. NIT is the number of iterations of the loop
1342 (used to initialize the variables in the duplicated part).
1344 TODO: the common case is that latch of the loop is empty and immediately
1345 follows the loop exit. In this case, it would be better not to copy the
1346 body of the loop, but only move the entry of the loop directly before the
1347 exit check and increase the number of iterations of the loop by one.
1348 This may need some additional preconditioning in case NIT = ~0.
1349 REDUCTION_LIST describes the reductions in LOOP. */
1352 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1354 basic_block *bbs, *nbbs, ex_bb, orig_header;
1357 edge exit = single_dom_exit (loop), hpred;
1358 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
1359 block_stmt_iterator bsi;
/* Split the header after its labels; ORIG_HEADER is the block that now
   holds the original header statements.  */
1361 split_block_after_labels (loop->header);
1362 orig_header = single_succ (loop->header);
1363 hpred = single_succ_edge (loop->header);
/* The exit condition is expected to compare the control IV against NIT
   (as set up by canonicalize_loop_ivs).  */
1365 cond_stmt = last_stmt (exit->src);
1366 cond = COND_EXPR_COND (cond_stmt);
1367 control = TREE_OPERAND (cond, 0);
1368 gcc_assert (TREE_OPERAND (cond, 1) == nit);
1370 /* Make sure that we have phi nodes on exit for all loop header phis
1371 (create_parallel_loop requires that). */
1372 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1374 res = PHI_RESULT (phi);
/* Give the header phi a fresh SSA name; the old result RES becomes the
   result of a new phi in ORIG_HEADER fed from the header.  */
1375 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1376 SET_PHI_RESULT (phi, t);
1378 nphi = create_phi_node (res, orig_header);
1379 SSA_NAME_DEF_STMT (res) = nphi;
1380 add_phi_arg (nphi, t, hpred);
1384 TREE_OPERAND (cond, 0) = t;
1385 update_stmt (cond_stmt);
/* Count the blocks dominated up to the exit source, then duplicate the
   tail of the last iteration past the exit.  */
1390 bbs = get_loop_body_in_dom_order (loop);
1391 for (n = 0; bbs[n] != exit->src; n++)
1393 nbbs = XNEWVEC (basic_block, n);
1394 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1401 /* Other than reductions, the only gimple reg that should be copied
1402 out of the loop is the control variable. */
1404 control_name = NULL_TREE;
1405 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1407 res = PHI_RESULT (phi);
/* Virtual phis are left alone.  */
1408 if (!is_gimple_reg (res))
1411 /* Check if it is a part of reduction. If it is,
1412 keep the phi at the reduction's keep_res field. The
1413 PHI_RESULT of this phi is the resulting value of the reduction
1414 variable when exiting the loop. */
1416 exit = single_dom_exit (loop);
1418 if (htab_elements (reduction_list) > 0)
1420 struct reduction_info *red;
1422 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1424 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1426 red->keep_res = phi;
/* Any other gimple-reg phi must belong to the control variable.  */
1429 gcc_assert (control_name == NULL_TREE
1430 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1433 gcc_assert (control_name != NULL_TREE);
/* The control variable gets an explicit initialization below, so its
   exit phi is no longer needed.  */
1434 phi = SSA_NAME_DEF_STMT (control_name);
1435 remove_phi_node (phi, NULL_TREE, false);
1437 /* Initialize the control variable to NIT. */
1438 bsi = bsi_after_labels (ex_bb);
1439 nit = force_gimple_operand_bsi (&bsi,
1440 fold_convert (TREE_TYPE (control_name), nit),
1441 false, NULL_TREE, false, BSI_SAME_STMT);
1442 t = build_gimple_modify_stmt (control_name, nit);
1443 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1444 SSA_NAME_DEF_STMT (control_name) = t;
1447 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1448 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1449 NEW_DATA is the variable that should be initialized from the argument
1450 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1451 basic block containing OMP_PARALLEL tree. */
1454 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1455 tree new_data, unsigned n_threads)
1457 block_stmt_iterator bsi;
1458 basic_block bb, paral_bb, for_bb, ex_bb;
1459 tree t, param, res, for_stmt;
1460 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
1461 edge exit, nexit, guard, end, e;
1463 /* Prepare the OMP_PARALLEL statement. */
1464 bb = loop_preheader_edge (loop)->src;
1465 paral_bb = single_pred (bb);
1466 bsi = bsi_last (paral_bb);
/* Attach the num_threads clause and emit the OMP_PARALLEL statement.  */
1468 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
1469 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1470 = build_int_cst (integer_type_node, n_threads);
1471 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t, loop_fn, data);
1473 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1475 /* Initialize NEW_DATA. */
1478 bsi = bsi_after_labels (bb);
/* param = &DATA; NEW_DATA = (type of NEW_DATA) param.  */
1480 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
1481 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
1482 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1483 SSA_NAME_DEF_STMT (param) = t;
1485 t = build_gimple_modify_stmt (new_data,
1486 fold_convert (TREE_TYPE (new_data),
1488 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1489 SSA_NAME_DEF_STMT (new_data) = t;
1492 /* Emit OMP_RETURN for OMP_PARALLEL. */
1493 bb = split_loop_exit_edge (single_dom_exit (loop));
1494 bsi = bsi_last (bb);
1495 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1497 /* Extract data for OMP_FOR. */
1498 gcc_assert (loop->header == single_dom_exit (loop)->src);
1499 cond = COND_EXPR_COND (last_stmt (loop->header));
/* CVAR is the control variable; its defining phi supplies the initial
   value (preheader edge) and the next-iteration value (latch edge).  */
1501 cvar = TREE_OPERAND (cond, 0);
1502 cvar_base = SSA_NAME_VAR (cvar);
1503 phi = SSA_NAME_DEF_STMT (cvar);
1504 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1505 initvar = make_ssa_name (cvar_base, NULL_TREE);
1506 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1508 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
/* Remove the statement in the latch that computes CVAR_NEXT; it is
   re-emitted as OMP_CONTINUE below.  */
1510 bsi = bsi_last (loop->latch);
1511 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
1512 bsi_remove (&bsi, true);
/* Split the preheader and the exit edge, creating the block for OMP_FOR
   and the exit block.  */
1515 for_bb = split_edge (loop_preheader_edge (loop));
1516 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1517 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1518 gcc_assert (exit == single_dom_exit (loop));
/* Add a guard edge around the loop and a fallthru edge from the latch to
   the exit block, completing the phi arguments on the exit block.  */
1520 guard = make_edge (for_bb, ex_bb, 0);
1521 single_succ_edge (loop->latch)->flags = 0;
1522 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1523 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1525 res = PHI_RESULT (phi);
1526 gcc_assert (!is_gimple_reg (phi));
1527 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1528 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
1530 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
1533 e = redirect_edge_and_branch (exit, nexit->dest);
1534 PENDING_STMT (e) = NULL;
/* Build the OMP_FOR statement with a static schedule clause; its body
   is the loop itself, so OMP_FOR_BODY stays empty.  */
1537 TREE_OPERAND (cond, 0) = cvar_base;
1538 type = TREE_TYPE (cvar);
1539 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
1540 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1542 for_stmt = make_node (OMP_FOR);
1543 TREE_TYPE (for_stmt) = void_type_node;
1544 OMP_FOR_CLAUSES (for_stmt) = t;
1545 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
1546 OMP_FOR_COND (for_stmt) = cond;
1547 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (cvar_base,
1548 build2 (PLUS_EXPR, type,
1552 OMP_FOR_BODY (for_stmt) = NULL_TREE;
1553 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
1555 bsi = bsi_last (for_bb);
1556 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
1557 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1559 /* Emit OMP_CONTINUE. */
1560 bsi = bsi_last (loop->latch);
1561 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
1562 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1563 SSA_NAME_DEF_STMT (cvar_next) = t;
1565 /* Emit OMP_RETURN for OMP_FOR. */
1566 bsi = bsi_last (ex_bb);
1567 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1572 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1573 parallel. NITER describes number of iterations of LOOP.
1574 REDUCTION_LIST describes the reductions existing in the LOOP. */
1577 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1578 unsigned n_threads, struct tree_niter_desc *niter)
1581 tree many_iterations_cond, type, nit;
1582 tree stmts, arg_struct, new_arg_struct;
1583 basic_block parallel_head;
1584 struct clsn_data clsn_data;
1589 ---------------------------------------------------------------------
1592 IV = phi (INIT, IV + STEP)
1598 ---------------------------------------------------------------------
1600 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1601 we generate the following code:
1603 ---------------------------------------------------------------------
1606 || NITER < MIN_PER_THREAD * N_THREADS)
1610 store all local loop-invariant variables used in body of the loop to DATA.
1611 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1612 load the variables from DATA.
1613 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1617 OMP_RETURN -- OMP_FOR
1618 OMP_RETURN -- OMP_PARALLEL
1624 IV = phi (INIT, IV + STEP)
1635 /* Create two versions of the loop -- in the old one, we know that the
1636 number of iterations is large enough, and we will transform it into the
1637 loop that will be split to loop_fn, the new one will be used for the
1638 remaining iterations. */
/* Materialize the iteration count NIT on the preheader edge.  */
1640 type = TREE_TYPE (niter->niter);
1641 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1644 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
/* Run the parallel version only when NIT >= MIN_PER_THREAD * N_THREADS
   and the MAY_BE_ZERO assumption does not hold.  */
1646 many_iterations_cond =
1647 fold_build2 (GE_EXPR, boolean_type_node,
1648 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1649 many_iterations_cond
1650 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1651 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1652 many_iterations_cond);
1653 many_iterations_cond
1654 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1656 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
/* loop_version needs a gimple condition; gimplify again if needed.  */
1657 if (!is_gimple_condexpr (many_iterations_cond))
1659 many_iterations_cond
1660 = force_gimple_operand (many_iterations_cond, &stmts,
1663 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1666 initialize_original_copy_tables ();
1668 /* We assume that the loop usually iterates a lot. */
1669 prob = 4 * REG_BR_PROB_BASE / 5;
1670 nloop = loop_version (loop, many_iterations_cond, NULL,
1671 prob, prob, REG_BR_PROB_BASE - prob, true);
1672 update_ssa (TODO_update_ssa);
1673 free_original_copy_tables ();
1675 /* Base all the induction variables in LOOP on a single control one. */
1676 canonicalize_loop_ivs (loop, reduction_list, nit)
1678 /* Ensure that the exit condition is the first statement in the loop. */
1679 transform_to_exit_first_loop (loop, reduction_list, nit);
1682 /* Generate initializations for reductions. */
1684 if (htab_elements (reduction_list) > 0)
1685 htab_traverse (reduction_list, initialize_reductions, loop);
1687 /* Eliminate the references to local variables from the loop. */
1688 eliminate_local_variables (loop);
1690 /* In the old loop, move all variables non-local to the loop to a structure
1691 and back, and create separate decls for the variables used in loop. */
1692 separate_decls_in_loop (loop, reduction_list, &arg_struct, &new_arg_struct, &clsn_data);
1694 /* Create the parallel constructs. */
1695 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1696 new_arg_struct, n_threads);
1697 if (htab_elements (reduction_list) > 0)
1698 create_call_for_reduction (loop, reduction_list, &clsn_data);
1702 /* Cancel the loop (it is simpler to do it here rather than to teach the
1703 expander to do it). */
1704 cancel_loop_tree (loop);
1706 /* Expand the parallel constructs. We do it directly here instead of running
1707 a separate expand_omp pass, since it is more efficient, and less likely to
1708 cause troubles with further analyses not being able to deal with the
1711 omp_expand_local (parallel_head);
1714 /* Detect parallel loops and generate parallel code using libgomp
1715 primitives. Returns true if some loop was parallelized, false
1719 parallelize_loops (void)
/* The requested thread count comes from -ftree-parallelize-loops=N.  */
1721 unsigned n_threads = flag_tree_parallelize_loops;
1722 bool changed = false;
1724 struct tree_niter_desc niter_desc;
1726 htab_t reduction_list;
1728 /* Do not parallelize loops in the functions created by parallelization. */
1729 if (parallelized_function_p (cfun->decl))
/* Reduction entries are heap-allocated; htab frees them via 'free'.  */
1732 reduction_list = htab_create (10, reduction_info_hash,
1733 reduction_info_eq, free);
1735 FOR_EACH_LOOP (li, loop, 0)
/* Start each candidate loop with a clean reduction table.  */
1737 htab_empty (reduction_list);
1738 if (/* Do not bother with loops in cold areas. */
1739 !maybe_hot_bb_p (loop->header)
1740 /* Or loops that roll too little. */
1741 || expected_loop_iterations (loop) <= n_threads
1742 /* And of course, the loop must be parallelizable. */
1743 || !can_duplicate_loop_p (loop)
1744 || !loop_parallel_p (loop, reduction_list, &niter_desc))
1748 gen_parallel_loop (loop, reduction_list, n_threads, &niter_desc);
/* Sanity-check the CFG, dominators, loop structure and loop-closed SSA
   after each transformed loop.  */
1749 verify_flow_info ();
1750 verify_dominators (CDI_DOMINATORS);
1751 verify_loop_structure ();
1752 verify_loop_closed_ssa ();
1755 htab_delete (reduction_list);
1759 #include "gt-tree-parloops.h"