static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
- basic_block dom_bb)
+ basic_block dom_bb)
{
block_stmt_iterator bsi;
edge new_e, enter_e;
tree cond_stmt;
+ tree gimplify_stmt_list;
enter_e = EDGE_SUCC (guard_bb, 0);
enter_e->flags &= ~EDGE_FALLTHRU;
enter_e->flags |= EDGE_FALSE_VALUE;
bsi = bsi_last (guard_bb);
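+ /* COND may not yet be a valid GIMPLE condition; gimplify it and emit
+ any statements this generates into GUARD_BB before the branch. */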
+ cond = force_gimple_operand (cond, &gimplify_stmt_list, true, NULL_TREE);
cond_stmt = build3 (COND_EXPR, void_type_node, cond,
NULL_TREE, NULL_TREE);
+ if (gimplify_stmt_list)
+ bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+ bsi = bsi_last (guard_bb);
bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+
/* Add new edge to connect guard block to the merge/loop-exit block. */
new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
*/
bool
-slpeel_can_duplicate_loop_p (struct loop *loop, edge e)
+slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
{
edge exit_e = single_exit (loop);
edge entry_e = loop_preheader_edge (loop);
}
#endif
+/* If the run time cost model check determines that vectorization is
+ not profitable and hence a scalar loop should be generated, then set
+ FIRST_NITERS to the full scalar iteration count. This allows all the
+ iterations to be executed in the prologue-peeled scalar loop. */
+
+void
+set_prologue_iterations (basic_block bb_before_first_loop,
+ tree *first_niters,
+ struct loop *loop,
+ unsigned int th)
+{
+ edge e;
+ basic_block cond_bb, then_bb;
+ tree var, prologue_after_cost_adjust_name, stmt;
+ block_stmt_iterator bsi;
+ tree newphi;
+ edge e_true, e_false, e_fallthru;
+ tree cond_stmt;
+ tree gimplify_stmt_list;
+ tree cost_pre_condition = NULL_TREE;
+ tree scalar_loop_iters =
+ unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
+
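+ /* Split the preheader edge twice: COND_BB will hold the run-time cost
+ check, and THEN_BB, entered only when the scalar loop is chosen,
+ computes the adjusted iteration count. */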
+ e = single_pred_edge (bb_before_first_loop);
+ cond_bb = split_edge (e);
+
+ e = single_pred_edge (bb_before_first_loop);
+ then_bb = split_edge (e);
+ set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
+
+ e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
+ EDGE_FALSE_VALUE);
+ set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
+
+ e_true = EDGE_PRED (then_bb, 0);
+ e_true->flags &= ~EDGE_FALLTHRU;
+ e_true->flags |= EDGE_TRUE_VALUE;
+
+ e_fallthru = EDGE_SUCC (then_bb, 0);
+
+ cost_pre_condition =
+ build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
+ build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+ cost_pre_condition =
+ force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
+ true, NULL_TREE);
+ cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
+ NULL_TREE, NULL_TREE);
+
+ bsi = bsi_last (cond_bb);
+ if (gimplify_stmt_list)
+ bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+ bsi = bsi_last (cond_bb);
+ bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+
+ var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
+ "prologue_after_cost_adjust");
+ add_referenced_var (var);
+ prologue_after_cost_adjust_name =
+ force_gimple_operand (scalar_loop_iters, &stmt, false, var);
+
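+ /* Emit the statements that compute the full scalar iteration count
+ into THEN_BB; this value is used only when the cost check is true. */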
+ bsi = bsi_last (then_bb);
+ if (stmt)
+ bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+
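+ /* Merge the two possible values of FIRST_NITERS with a phi: the full
+ scalar iteration count coming from THEN_BB, or the original prologue
+ count coming from COND_BB along the false edge. */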
+ newphi = create_phi_node (var, bb_before_first_loop);
+ add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
+ add_phi_arg (newphi, *first_niters, e_false);
+
+ *first_niters = PHI_RESULT (newphi);
+}
+
+
/* Function slpeel_tree_peel_loop_to_edge.
Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
that is placed on the entry (exit) edge E of LOOP. After this transformation
we have two loops one after the other - first-loop iterates FIRST_NITERS
times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
+ If the cost model indicates that it is profitable to emit a scalar
+ loop instead of the vector one, then the prolog (epilog) loop will iterate
+ for all of the unchanged scalar iterations of the loop.
Input:
- LOOP: the loop to be peeled.
for updating the loop bound of the first-loop to FIRST_NITERS. If it
is false, the caller of this function may want to take care of this
(this can be useful if we don't want new stmts added to first-loop).
+ - TH: cost model profitability threshold of iterations for vectorization.
+ - CHECK_PROFITABILITY: specify whether the cost model check did not occur
+ during versioning and hence needs to occur during
+ prologue generation, or whether it did not occur
+ during prologue generation and hence needs to occur
+ during epilogue generation.
+
Output:
The function returns a pointer to the new loop-copy, or NULL if it failed
slpeel_tree_peel_loop_to_edge (struct loop *loop,
edge e, tree first_niters,
tree niters, bool update_first_loop_count,
- unsigned int th)
+ unsigned int th, bool check_profitability)
{
struct loop *new_loop = NULL, *first_loop, *second_loop;
edge skip_e;
- tree pre_condition;
+ tree pre_condition = NULL_TREE;
bitmap definitions;
basic_block bb_before_second_loop, bb_after_second_loop;
basic_block bb_before_first_loop;
basic_block new_exit_bb;
edge exit_e = single_exit (loop);
LOC loop_loc;
+ tree cost_pre_condition = NULL_TREE;
if (!slpeel_can_duplicate_loop_p (loop, e))
return NULL;
rename_variables_in_loop (new_loop);
- /* 2. Add the guard that controls whether the first loop is executed.
- Resulting CFG would be:
+ /* 2. Add the guard code in one of the following ways:
- bb_before_first_loop:
- if (FIRST_NITERS == 0) GOTO bb_before_second_loop
- GOTO first-loop
+ 2.a Add the guard that controls whether the first loop is executed.
+ This occurs when this function is invoked for prologue or epilogue
+ generation and when the cost model check can be done at compile time.
- first_loop:
- do {
- } while ...
+ Resulting CFG would be:
- bb_before_second_loop:
+ bb_before_first_loop:
+ if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+ GOTO first-loop
- second_loop:
- do {
- } while ...
+ first_loop:
+ do {
+ } while ...
- orig_exit_bb:
- */
+ bb_before_second_loop:
+
+ second_loop:
+ do {
+ } while ...
+
+ orig_exit_bb:
+
+ 2.b Add the cost model check that allows the prologue
+ to iterate for all of the unchanged scalar
+ iterations of the loop in the event that the cost
+ model indicates that the scalar loop is more
+ profitable than the vector one. This occurs when
+ this function is invoked for prologue generation
+ and the cost model check needs to be done at run
+ time.
+
+ Resulting CFG after prologue peeling would be:
+
+ if (scalar_loop_iterations <= th)
+ FIRST_NITERS = scalar_loop_iterations
+
+ bb_before_first_loop:
+ if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+ GOTO first-loop
+
+ first_loop:
+ do {
+ } while ...
+
+ bb_before_second_loop:
+
+ second_loop:
+ do {
+ } while ...
+
+ orig_exit_bb:
+
+ 2.c Add the cost model check that allows the epilogue
+ to iterate for all of the unchanged scalar
+ iterations of the loop in the event that the cost
+ model indicates that the scalar loop is more
+ profitable than the vector one. This occurs when
+ this function is invoked for epilogue generation
+ and the cost model check needs to be done at run
+ time.
+
+ Resulting CFG after epilogue peeling would be:
+
+ bb_before_first_loop:
+ if ((scalar_loop_iterations <= th)
+ ||
+ FIRST_NITERS == 0) GOTO bb_before_second_loop
+ GOTO first-loop
+
+ first_loop:
+ do {
+ } while ...
+
+ bb_before_second_loop:
+
+ second_loop:
+ do {
+ } while ...
+
+ orig_exit_bb:
+ */
bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
bb_before_second_loop = split_edge (single_exit (first_loop));
- pre_condition =
- fold_build2 (LE_EXPR, boolean_type_node, first_niters,
- build_int_cst (TREE_TYPE (first_niters), th));
+ /* Epilogue peeling. */
+ if (!update_first_loop_count)
+ {
+ pre_condition =
+ fold_build2 (LE_EXPR, boolean_type_node, first_niters,
+ build_int_cst (TREE_TYPE (first_niters), 0));
+ if (check_profitability)
+ {
+ tree scalar_loop_iters
+ = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
+ (loop_vec_info_for_loop (loop)));
+ cost_pre_condition =
+ build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
+ build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+ pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ cost_pre_condition, pre_condition);
+ }
+ }
+
+ /* Prologue peeling. */
+ else
+ {
+ if (check_profitability)
+ set_prologue_iterations (bb_before_first_loop, &first_niters,
+ loop, th);
+
+ pre_condition =
+ fold_build2 (LE_EXPR, boolean_type_node, first_niters,
+ build_int_cst (TREE_TYPE (first_niters), 0));
+ }
skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
bb_before_second_loop, bb_before_first_loop);
STMT_VINFO_IN_PATTERN_P (res) = false;
STMT_VINFO_RELATED_STMT (res) = NULL;
STMT_VINFO_DATA_REF (res) = NULL;
- if (TREE_CODE (stmt) == PHI_NODE)
+
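+ /* Initialize the data-reference fields; they are filled in later,
+ during data-reference analysis. */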
+ STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
+ STMT_VINFO_DR_OFFSET (res) = NULL;
+ STMT_VINFO_DR_INIT (res) = NULL;
+ STMT_VINFO_DR_STEP (res) = NULL;
+ STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
+
+ if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
else
STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
+ STMT_SLP_TYPE (res) = 0;
DR_GROUP_FIRST_DR (res) = NULL_TREE;
DR_GROUP_NEXT_DR (res) = NULL_TREE;
DR_GROUP_SIZE (res) = 0;
}
+/* Function bb_in_loop_p
+
+ Used as a predicate for the dfs-order traversal of the loop bbs. */
+
+static bool
+bb_in_loop_p (const_basic_block bb, const void *data)
+{
+ const struct loop *const loop = (const struct loop *) data;
+ return flow_bb_inside_loop_p (loop, bb);
+}
+
+
/* Function new_loop_vec_info.
Create and initialize a new loop_vec_info struct for LOOP, as well as
loop_vec_info res;
basic_block *bbs;
block_stmt_iterator si;
- unsigned int i;
+ unsigned int i, nbbs;
res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
+ LOOP_VINFO_LOOP (res) = loop;
bbs = get_loop_body (loop);
- /* Create stmt_info for all stmts in the loop. */
+ /* Create/Update stmt_info for all stmts in the loop. */
for (i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
tree phi;
- for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
- {
- stmt_ann_t ann = get_stmt_ann (phi);
- set_stmt_info (ann, new_stmt_vec_info (phi, res));
- }
-
- for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ /* BBs in a nested inner-loop will have been already processed (because
+ we will have called vect_analyze_loop_form for any nested inner-loop).
+ Therefore, for stmts in an inner-loop we just want to update the
+ STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
+ loop_info of the outer-loop we are currently considering to vectorize
+ (instead of the loop_info of the inner-loop).
+ For stmts in other BBs we need to create a stmt_info from scratch. */
+ if (bb->loop_father != loop)
{
- tree stmt = bsi_stmt (si);
- stmt_ann_t ann;
+ /* Inner-loop bb. */
+ gcc_assert (loop->inner && bb->loop_father == loop->inner);
+ for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+ {
+ stmt_vec_info stmt_info = vinfo_for_stmt (phi);
+ loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
+ STMT_VINFO_LOOP_VINFO (stmt_info) = res;
+ }
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
+ STMT_VINFO_LOOP_VINFO (stmt_info) = res;
+ }
+ }
+ else
+ {
+ /* bb in current nest. */
+ for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+ {
+ stmt_ann_t ann = get_stmt_ann (phi);
+ set_stmt_info (ann, new_stmt_vec_info (phi, res));
+ }
- ann = stmt_ann (stmt);
- set_stmt_info (ann, new_stmt_vec_info (stmt, res));
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ tree stmt = bsi_stmt (si);
+ stmt_ann_t ann = stmt_ann (stmt);
+ set_stmt_info (ann, new_stmt_vec_info (stmt, res));
+ }
}
}
- LOOP_VINFO_LOOP (res) = loop;
+ /* CHECKME: We want to visit all BBs before their successors (except for
+ latch blocks, for which this assertion wouldn't hold). In the simple
+ case of the loop forms we allow, a dfs order of the BBs would be the same
+ as reversed postorder traversal, so we are safe. */
+
+ free (bbs);
+ bbs = XCNEWVEC (basic_block, loop->num_nodes);
+ nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
+ bbs, loop->num_nodes, loop);
+ gcc_assert (nbbs == loop->num_nodes);
+
LOOP_VINFO_BBS (res) = bbs;
- LOOP_VINFO_EXIT_COND (res) = NULL;
LOOP_VINFO_NITERS (res) = NULL;
+ LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
LOOP_VINFO_VECTORIZABLE_P (res) = 0;
LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
LOOP_VINFO_UNALIGNED_DR (res) = NULL;
- LOOP_VINFO_MAY_MISALIGN_STMTS (res)
- = VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_CHECKS));
+ LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
+ VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
+ LOOP_VINFO_MAY_ALIAS_DDRS (res) =
+ VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
+ LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
+ LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
+ LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
return res;
}
stmts in the loop. */
void
-destroy_loop_vec_info (loop_vec_info loop_vinfo)
+destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
struct loop *loop;
basic_block *bbs;
int nbbs;
block_stmt_iterator si;
int j;
+ VEC (slp_instance, heap) *slp_instances;
+ slp_instance instance;
if (!loop_vinfo)
return;
bbs = LOOP_VINFO_BBS (loop_vinfo);
nbbs = loop->num_nodes;
+ if (!clean_stmts)
+ {
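+ /* Free only the loop-level structures; leave the per-stmt
+ vec_info in place. */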
+ free (LOOP_VINFO_BBS (loop_vinfo));
+ free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+ free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
+ VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+
+ free (loop_vinfo);
+ loop->aux = NULL;
+ return;
+ }
+
for (j = 0; j < nbbs; j++)
{
basic_block bb = bbs[j];
free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+ VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+ slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+ for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
+ vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+ VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
free (loop_vinfo);
loop->aux = NULL;
on ALIGNMENT bit boundary. */
bool
-vect_can_force_dr_alignment_p (tree decl, unsigned int alignment)
+vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
if (TREE_CODE (decl) != VAR_DECL)
return false;
if (TREE_STATIC (decl))
return (alignment <= MAX_OFILE_ALIGNMENT);
else
- /* This is not 100% correct. The absolute correct stack alignment
- is STACK_BOUNDARY. We're supposed to hope, but not assume, that
- PREFERRED_STACK_BOUNDARY is honored by all translation units.
- However, until someone implements forced stack alignment, SSE
- isn't really usable without this. */
- return (alignment <= PREFERRED_STACK_BOUNDARY);
+ /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
+ correct until someone implements forced stack alignment. */
+ return (alignment <= STACK_BOUNDARY);
}
enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr)
{
- tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+ tree stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
enum machine_mode mode = (int) TYPE_MODE (vectype);
+ struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
+ bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+ bool invariant_in_outerloop = false;
if (aligned_access_p (dr))
return dr_aligned;
+ if (nested_in_vect_loop)
+ {
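+ /* A zero step with respect to the outer loop means the address
+ does not change across outer-loop iterations. */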
+ tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
+ invariant_in_outerloop =
+ (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
+ }
+
/* Possibly unaligned access. */
-
+
+ /* We can choose between using the implicit realignment scheme (generating
+ a misaligned_move stmt) and the explicit realignment scheme (generating
+ aligned loads with a REALIGN_LOAD). There are two variants to the explicit
+ realignment scheme: optimized, and unoptimized.
+ We can optimize the realignment only if the step between consecutive
+ vector loads is equal to the vector size. Since the vector memory
+ accesses advance in steps of VS (Vector Size) in the vectorized loop, it
+ is guaranteed that the misalignment amount remains the same throughout the
+ execution of the vectorized loop. Therefore, we can create the
+ "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
+ at the loop preheader.
+
+ However, in the case of outer-loop vectorization, when vectorizing a
+ memory access in the inner-loop nested within the LOOP that is now being
+ vectorized, while it is guaranteed that the misalignment of the
+ vectorized memory access will remain the same in different outer-loop
+ iterations, it is *not* guaranteed that is will remain the same throughout
+ the execution of the inner-loop. This is because the inner-loop advances
+ with the original scalar step (and not in steps of VS). If the inner-loop
+ step happens to be a multiple of VS, then the misalignment remains fixed
+ and we can use the optimized realignment scheme. For example:
+
+ for (i=0; i<N; i++)
+ for (j=0; j<M; j++)
+ s += a[i+j];
+
+ When vectorizing the i-loop in the above example, the step between
+ consecutive vector loads is 1, and so the misalignment does not remain
+ fixed across the execution of the inner-loop, and the realignment cannot
+ be optimized (as illustrated in the following pseudo vectorized loop):
+
+ for (i=0; i<N; i+=4)
+ for (j=0; j<M; j++){
+ vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
+ // when j is {0,1,2,3,4,5,6,7,...} respectively.
+ // (assuming that we start from an aligned address).
+ }
+
+ We therefore have to use the unoptimized realignment scheme:
+
+ for (i=0; i<N; i+=4)
+ for (j=k; j<M; j+=4)
+ vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
+ // that the misalignment of the initial address is
+ // 0).
+
+ The loop can then be vectorized as follows:
+
+ for (k=0; k<4; k++){
+ rt = get_realignment_token (&vp[k]);
+ for (i=0; i<N; i+=4){
+ v1 = vp[i+k];
+ for (j=k; j<M; j+=4){
+ v2 = vp[i+j+VS-1];
+ va = REALIGN_LOAD <v1,v2,rt>;
+ vs += va;
+ v1 = v2;
+ }
+ }
+ } */
+
if (DR_IS_READ (dr))
{
- if (optab_handler (vec_realign_load_optab, mode)->insn_code != CODE_FOR_nothing
+ if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
+ CODE_FOR_nothing
&& (!targetm.vectorize.builtin_mask_for_load
|| targetm.vectorize.builtin_mask_for_load ()))
- return dr_unaligned_software_pipeline;
+ {
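+ /* If the misalignment may vary between inner-loop iterations
+ (see the discussion above), the realignment token cannot be
+ hoisted, so use the unoptimized scheme. */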
+ if (nested_in_vect_loop
+ && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
+ return dr_explicit_realign;
+ else
+ return dr_explicit_realign_optimized;
+ }
- if (optab_handler (movmisalign_optab, mode)->insn_code != CODE_FOR_nothing)
+ if (optab_handler (movmisalign_optab, mode)->insn_code !=
+ CODE_FOR_nothing)
/* Can't software pipeline the loads, but can at least do them. */
return dr_unaligned_supported;
}
{
case PHI_NODE:
*def = PHI_RESULT (*def_stmt);
- gcc_assert (*dt == vect_induction_def || *dt == vect_reduction_def
- || *dt == vect_invariant_def);
break;
case GIMPLE_MODIFY_STMT:
enum tree_code *code1, enum tree_code *code2)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
+ struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
bool ordered_p;
enum machine_mode vec_mode;
enum insn_code icode1, icode2;
Some targets can take advantage of this and generate more efficient code.
For example, targets like Altivec, that support widen_mult using a sequence
of {mult_even,mult_odd} generate the following vectors:
- vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8]. */
+ vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
- if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction)
+ When vectorizing outer-loops, we execute the inner-loop sequentially
+ (each vectorized inner-loop iteration contributes to VF outer-loop
+ iterations in parallel). We therefore don't allow changing the order
+ of the computation in the inner-loop during outer-loop vectorization. */
+
+ if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
+ && !nested_in_vect_loop_p (vect_loop, stmt))
ordered_p = false;
else
ordered_p = true;
bool
supportable_narrowing_operation (enum tree_code code,
- tree stmt, tree vectype,
+ const_tree stmt, const_tree vectype,
enum tree_code *code1)
{
enum machine_mode vec_mode;
Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */
tree
-vect_is_simple_reduction (struct loop *loop, tree phi)
+vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
{
+ struct loop *loop = (bb_for_stmt (phi))->loop_father;
+ struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
edge latch_e = loop_latch_edge (loop);
tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
tree def_stmt, def1, def2;
imm_use_iterator imm_iter;
use_operand_p use_p;
+ gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
+
name = PHI_RESULT (phi);
nloop_uses = 0;
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
return NULL_TREE;
}
+ /* Generally, when vectorizing a reduction we change the order of the
+ computation. This may change the behavior of the program in some
+ cases, so we need to check that this is ok. One exception is when
+ vectorizing an outer-loop: the inner-loop is executed sequentially,
+ and therefore vectorizing reductions in the inner-loop during
+ outer-loop vectorization is safe. */
+
/* CHECKME: check for !flag_finite_math_only too? */
- if (SCALAR_FLOAT_TYPE_P (type) && !flag_unsafe_math_optimizations)
+ if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
+ && !nested_in_vect_loop_p (vect_loop, def_stmt))
{
/* Changing the order of operations changes the semantics. */
if (vect_print_dump_info (REPORT_DETAILS))
}
return NULL_TREE;
}
- else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type))
+ else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
+ && !nested_in_vect_loop_p (vect_loop, def_stmt))
{
/* Changing the order of operations changes the semantics. */
if (vect_print_dump_info (REPORT_DETAILS))
/* Check that one def is the reduction def, defined by PHI,
- the other def is either defined in the loop by a GIMPLE_MODIFY_STMT,
- or it's an induction (defined by some phi node). */
+ the other def is either defined in the loop ("vect_loop_def"),
+ or it's an induction (defined by a loop-header phi-node). */
if (def2 == phi
&& flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
&& (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
- || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def))
+ || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
+ || (TREE_CODE (def1) == PHI_NODE
+ && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
+ && !is_loop_header_bb_p (bb_for_stmt (def1)))))
{
if (vect_print_dump_info (REPORT_DETAILS))
{
else if (def1 == phi
&& flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
&& (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
- || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def))
+ || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
+ || (TREE_CODE (def2) == PHI_NODE
+ && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
+ && !is_loop_header_bb_p (bb_for_stmt (def2)))))
{
/* Swap operands (just for simplicity - so that the rest of the code
can assume that the reduction variable is always the last (second)
if (!loop)
continue;
loop_vinfo = loop->aux;
- destroy_loop_vec_info (loop_vinfo);
+ destroy_loop_vec_info (loop_vinfo, true);
loop->aux = NULL;
}