int vec_inside_cost = 0;
int vec_outside_cost = 0;
int scalar_single_iter_cost = 0;
+ int scalar_outside_cost = 0;
+ bool runtime_test = false;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
- int byte_misalign;
+ int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;
/* Cost model disabled. */
if (!flag_vect_cost_model)
{
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model disabled.");
return 0;
}
- /* Requires loop versioning tests to handle misalignment.
- FIXME: Make cost depend on number of stmts in may_misalign list. */
+ /* If the number of iterations is unknown, or the
+ peeling-for-misalignment amount is unknown, we will have to generate
+ a runtime test to test the loop count against the threshold. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || (byte_misalign < 0))
+ runtime_test = true;
+
+ /* Requires loop versioning tests to handle misalignment. */
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
{
- vec_outside_cost += TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ /* FIXME: Make cost depend on complexity of individual check. */
+ vec_outside_cost +=
+ VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "cost model: Adding cost of checks for loop "
+ "versioning to treat misalignment.\n");
+ }
+
+ if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ /* FIXME: Make cost depend on complexity of individual check. */
+ vec_outside_cost +=
+ VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
- "versioning.\n");
+ "versioning aliasing.\n");
+ }
+
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
}
/* Count statements in scalar loop. Using this as scalar cost for a single
factor = 1;
for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
- {
- tree stmt = bsi_stmt (si);
- stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- if (!STMT_VINFO_RELEVANT_P (stmt_info)
- && !STMT_VINFO_LIVE_P (stmt_info))
- continue;
- scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
- vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
+ {
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ /* Skip stmts that are not vectorized inside the loop. */
+ if (!STMT_VINFO_RELEVANT_P (stmt_info)
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
+ continue;
+ scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
+ vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
/* FIXME: for stmts in the inner-loop in outer-loop vectorization,
some of the "outside" costs are generated inside the outer-loop. */
- vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
- }
+ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+ }
}
/* Add additional cost for the peeled instructions in prologue and epilogue
loop.
FORNOW: If we dont know the value of peel_iters for prologue or epilogue
- at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
+ at compile-time - we assume it's vf/2 (the worst would be vf-1).
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
-
if (byte_misalign < 0)
{
- peel_iters_prologue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_prologue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "prologue peel iters set to (vf-1)/2.");
+ "prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
- peel_iters_epilogue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_epilogue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown .");
+
+ /* If peeled iterations are unknown, count a taken branch and a not taken
+ branch per peeled loop. Even if scalar loop iterations are known,
+ vector iterations are not known since peeled prologue iterations are
+ not known. Hence guards remain the same. */
+ peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST);
+
}
else
{
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- peel_iters_epilogue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_epilogue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"loop iterations are unknown .");
+
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
+
}
else
{
}
}
- /* Requires a prologue loop when peeling to handle misalignment. Add cost of
- two guards, one for the peeled loop and one for the vector loop. */
-
- if (peel_iters_prologue)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model: Adding cost of checks for "
- "prologue.\n");
- }
-
- /* Requires an epilogue loop to finish up remaining iterations after vector
- loop. Add cost of two guards, one for the peeled loop and one for the
- vector loop. */
-
- if (peel_iters_epilogue
- || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding cost of checks for "
- "epilogue.\n");
- }
-
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
- + (peel_iters_epilogue * scalar_single_iter_cost);
-
- /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
- information we provide for the target is whether testing against the
- threshold involves a runtime test. */
- if (targetm.vectorize.builtin_vectorization_cost)
- {
- bool runtime_test = false;
-
- /* If the number of iterations is unknown, or the
- peeling-for-misalignment amount is unknown, we eill have to generate
- a runtime test to test the loop count against the threshold. */
- if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || (byte_misalign < 0))
- runtime_test = true;
- vec_outside_cost +=
- targetm.vectorize.builtin_vectorization_cost (runtime_test);
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
- targetm.vectorize.builtin_vectorization_cost (runtime_test));
+ + (peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
+
+ /* FORNOW: The scalar outside cost is incremented in one of the
+ following ways:
+
+ 1. The vectorizer checks for alignment and aliasing and generates
+ a condition that allows dynamic vectorization. A cost model
+ check is ANDED with the versioning condition. Hence scalar code
+ path now has the added cost of the versioning check.
+
+ if (cost > th & versioning_check)
+ jmp to vector code
+
+ Hence run-time scalar is incremented by not-taken branch cost.
+
+ 2. The vectorizer then checks if a prologue is required. If the
+ cost model check was not done before during versioning, it has to
+ be done before the prologue check.
+
+ if (cost <= th)
+ prologue = scalar_iters
+ if (prologue == 0)
+ jmp to vector code
+ else
+ execute prologue
+ if (prologue == num_iters)
+ go to exit
+
+ Hence the run-time scalar cost is incremented by a taken branch,
+ plus a not-taken branch, plus a taken branch cost.
+
+ 3. The vectorizer then checks if an epilogue is required. If the
+ cost model check was not done before during prologue check, it
+ has to be done with the epilogue check.
+
+ if (prologue == 0)
+ jmp to vector code
+ else
+ execute prologue
+ if (prologue == num_iters)
+ go to exit
+ vector code:
+ if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
+ jmp to epilogue
+
+ Hence the run-time scalar cost should be incremented by 2 taken
+ branches.
+
+ TODO: The back end may reorder the BBs differently and reverse
+ conditions/branch directions. Change the estimates below to
+ something more reasonable. */
+
+ if (runtime_test)
+ {
+ /* Cost model check occurs at versioning. */
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
+ else
+ {
+ /* Cost model occurs at prologue generation. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST;
+ /* Cost model check occurs at epilogue generation. */
+ else
+ scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
+ }
}
/* Add SLP costs. */
/* Calculate number of iterations required to make the vector version
profitable, relative to the loop bodies only. The following condition
- must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
+ must hold true:
+ SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+ where
SIC = scalar iteration cost, VIC = vector iteration cost,
- VOC = vector outside cost and VF = vectorization factor. */
+ VOC = vector outside cost, VF = vectorization factor,
+ PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
+ SOC = scalar outside cost for run time cost model check. */
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
{
- if (vec_outside_cost == 0)
+ if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
{
- min_profitable_iters = (vec_outside_cost * vf)
+ min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
+ - vec_inside_cost * peel_iters_prologue
+ - vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);
if ((scalar_single_iter_cost * vf * min_profitable_iters)
<= ((vec_inside_cost * min_profitable_iters)
- + (vec_outside_cost * vf)))
+ + ((vec_outside_cost - scalar_outside_cost) * vf)))
min_profitable_iters++;
}
}
/* vector version will never be profitable. */
else
{
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: vector iteration cost = %d "
"is divisible by scalar iteration cost = %d by a factor "
"greater than or equal to the vectorization factor = %d .",
return -1;
}
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
{
fprintf (vect_dump, "Cost model analysis: \n");
fprintf (vect_dump, " Vector inside of loop cost: %d\n",
vec_inside_cost);
fprintf (vect_dump, " Vector outside of loop cost: %d\n",
vec_outside_cost);
- fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
+ fprintf (vect_dump, " Scalar iteration cost: %d\n",
+ scalar_single_iter_cost);
+ fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
fprintf (vect_dump, " prologue iterations: %d\n",
peel_iters_prologue);
fprintf (vect_dump, " epilogue iterations: %d\n",
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
- min_profitable_iters < vf ? vf : min_profitable_iters);
}
min_profitable_iters =
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, " Profitability threshold = %d\n",
+ min_profitable_iters);
+
return min_profitable_iters;
}
generated within the strip-mine loop, the initial definition before
the loop, and the epilogue code that must be generated. */
-static void
+static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
int ncopies)
{
reduction_op = TREE_OPERAND (operation, op_type-1);
vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_COST))
+ {
+ fprintf (vect_dump, "unsupported data-type ");
+ print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
+ }
+ return false;
+ }
+
mode = TYPE_MODE (vectype);
orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
"outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+
+ return true;
}
/* prologue cost for vec_init and vec_step. */
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
"outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
-/* Return addresses of the cost fields of SLP_NODE if it's not NULL, and of
- the stmt otherwise. */
-
-static inline void
-vect_get_cost_fields (stmt_vec_info stmt_info, slp_tree slp_node,
- int **inside_cost_field, int **outside_cost_field)
-{
- if (slp_node)
- {
- *inside_cost_field = &(SLP_TREE_INSIDE_OF_LOOP_COST (slp_node));
- *outside_cost_field = &(SLP_TREE_OUTSIDE_OF_LOOP_COST (slp_node));
- }
- else
- {
- *inside_cost_field = &(STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info));
- *outside_cost_field = &(STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
- }
-}
-
-
/* Function vect_model_simple_cost.
Models cost for simple operations, i.e. those that only emit ncopies of a
enum vect_def_type *dt, slp_tree slp_node)
{
int i;
- int *inside_cost_field, *outside_cost_field;
+ int inside_cost = 0, outside_cost = 0;
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
-
- *inside_cost_field = ncopies * TARG_VEC_STMT_COST;
+ inside_cost = ncopies * TARG_VEC_STMT_COST;
/* FORNOW: Assuming maximum 2 args per stmts. */
for (i = 0; i < 2; i++)
{
if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
- *outside_cost_field += TARG_SCALAR_TO_VEC_COST;
+ outside_cost += TARG_SCALAR_TO_VEC_COST;
}
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
enum vect_def_type dt, slp_tree slp_node)
{
- int cost = 0;
int group_size;
- int *inside_cost_field, *outside_cost_field;
-
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
+ int inside_cost = 0, outside_cost = 0;
if (dt == vect_constant_def || dt == vect_invariant_def)
- *outside_cost_field = TARG_SCALAR_TO_VEC_COST;
+ outside_cost = TARG_SCALAR_TO_VEC_COST;
/* Strided access? */
if (DR_GROUP_FIRST_DR (stmt_info))
if (group_size > 1)
{
/* Uses a high and low interleave operation for each needed permute. */
- cost = ncopies * exact_log2(group_size) * group_size
+ inside_cost = ncopies * exact_log2(group_size) * group_size
* TARG_VEC_STMT_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
group_size);
}
/* Costs of the stores. */
- cost += ncopies * TARG_VEC_STORE_COST;
+ inside_cost += ncopies * TARG_VEC_STORE_COST;
- *inside_cost_field = cost;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
- int inner_cost = 0;
int group_size;
int alignment_support_cheme;
tree first_stmt;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
- int *inside_cost_field, *outside_cost_field;
-
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
+ int inside_cost = 0, outside_cost = 0;
/* Strided accesses? */
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
if (group_size > 1)
{
/* Uses an even and odd extract operations for each needed permute. */
- inner_cost = ncopies * exact_log2(group_size) * group_size
- * TARG_VEC_STMT_COST;
+ inside_cost = ncopies * exact_log2(group_size) * group_size
+ * TARG_VEC_STMT_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
group_size);
{
case dr_aligned:
{
- inner_cost += ncopies * TARG_VEC_LOAD_COST;
+ inside_cost += ncopies * TARG_VEC_LOAD_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: aligned.");
break;
case dr_unaligned_supported:
{
/* Here, we assign an additional cost for the unaligned load. */
- inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
+ inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
"hardware.");
}
case dr_explicit_realign:
{
- inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
+ inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
/* FIXME: If the misalignment remains fixed across the iterations of
the containing loop, the following cost should be added to the
outside costs. */
if (targetm.vectorize.builtin_mask_for_load)
- inner_cost += TARG_VEC_STMT_COST;
+ inside_cost += TARG_VEC_STMT_COST;
break;
}
case dr_explicit_realign_optimized:
{
- int outer_cost = 0;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned software "
"pipelined.");
if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
{
- outer_cost = 2*TARG_VEC_STMT_COST;
+ outside_cost = 2*TARG_VEC_STMT_COST;
if (targetm.vectorize.builtin_mask_for_load)
- outer_cost += TARG_VEC_STMT_COST;
+ outside_cost += TARG_VEC_STMT_COST;
}
-
- *outside_cost_field = outer_cost;
- inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
+ inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
break;
}
default:
gcc_unreachable ();
}
-
- *inside_cost_field = inner_cost;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
bool is_store = false;
unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
+ bool constant_p;
if (STMT_VINFO_DATA_REF (stmt_vinfo))
is_store = true;
number_of_copies = least_common_multiple (nunits, group_size) / group_size;
number_of_places_left_in_vector = nunits;
+ constant_p = true;
for (j = 0; j < number_of_copies; j++)
{
for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
op = operation;
else
op = TREE_OPERAND (operation, op_num);
+ if (!CONSTANT_CLASS_P (op))
+ constant_p = false;
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
number_of_places_left_in_vector = nunits;
vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
- vec_cst = build_constructor_from_list (vector_type, t);
+ gcc_assert (vector_type);
+ if (constant_p)
+ vec_cst = build_vector (vector_type, t);
+ else
+ vec_cst = build_constructor_from_list (vector_type, t);
+ constant_p = true;
VEC_quick_push (tree, voprnds,
vect_init_vector (stmt, vec_cst, vector_type,
NULL));
}
-/* Get vectorized defintions from SLP_NODE that contains corresponding
+/* Get vectorized definitions from SLP_NODE that contains corresponding
vectorized def-stmts. */
static void
call vect_get_constant_vectors() to create vector stmts.
Otherwise, the def-stmts must be already vectorized and the vectorized stmts
must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
- vect_get_slp_vect_defs() to retrieve them. */
+ vect_get_slp_vect_defs() to retrieve them.
+ If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
+ the right node). This is used when the second operand must remain scalar. */
static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
return;
operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
- if (TREE_OPERAND_LENGTH (operation) == unary_op)
+ if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
return;
*vec_oprnds1 = VEC_alloc (tree, heap,
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
- tree vectype = get_vectype_for_scalar_type (scalar_type);
- int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ tree vectype;
+ int nunits;
edge pe = loop_preheader_edge (loop);
struct loop *iv_loop;
basic_block new_bb;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
int i;
bool ok;
- int ncopies = vf / nunits;
+ int ncopies;
tree expr;
stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
bool nested_in_vect_loop = false;
block_stmt_iterator si;
basic_block bb = bb_for_stmt (iv_phi);
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ gcc_assert (vectype);
+ nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ ncopies = vf / nunits;
+
gcc_assert (phi_info);
gcc_assert (ncopies >= 1);
t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
- vec = build_constructor_from_list (vectype, t);
+ gcc_assert (CONSTANT_CLASS_P (new_name));
+ vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
- vec = build_constructor_from_list (vectype, t);
+ gcc_assert (CONSTANT_CLASS_P (new_name));
+ vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
vec_def = induc_def;
t = tree_cons (NULL_TREE, op, t);
}
vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
+ gcc_assert (vector_type);
vec_cst = build_vector (vector_type, t);
return vect_init_vector (stmt, vec_cst, vector_type, NULL);
/* FIXME: use build_constructor directly. */
vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
+ gcc_assert (vector_type);
vec_inv = build_constructor_from_list (vector_type, t);
return vect_init_vector (stmt, vec_inv, vector_type, NULL);
}
vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
- if (vec_oprnds1)
+ if (vec_oprnds1 && *vec_oprnds1)
{
vec_oprnd = VEC_pop (tree, *vec_oprnds1);
vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
tree vector_type;
bool nested_in_vect_loop = false;
- gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
+ gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
if (nested_in_vect_loop_p (loop, stmt))
nested_in_vect_loop = true;
else
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
- if (nested_in_vect_loop)
- *adjustment_def = vecdef;
- else
- *adjustment_def = init_val;
- /* Create a vector of zeros for init_def. */
- if (INTEGRAL_TYPE_P (type))
- def_for_init = build_int_cst (type, 0);
+ if (nested_in_vect_loop)
+ *adjustment_def = vecdef;
else
+ *adjustment_def = init_val;
+ /* Create a vector of zeros for init_def. */
+ if (SCALAR_FLOAT_TYPE_P (type))
def_for_init = build_real (type, dconst0);
- for (i = nunits - 1; i >= 0; --i)
- t = tree_cons (NULL_TREE, def_for_init, t);
+ else
+ def_for_init = build_int_cst (type, 0);
+ for (i = nunits - 1; i >= 0; --i)
+ t = tree_cons (NULL_TREE, def_for_init, t);
vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
+ gcc_assert (vector_type);
init_def = build_vector (vector_type, t);
break;
op_type = TREE_OPERAND_LENGTH (operation);
reduction_op = TREE_OPERAND (operation, op_type-1);
vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ gcc_assert (vectype);
mode = TYPE_MODE (vectype);
/*** 1. Create the reduction def-use cycle ***/
return false;
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
scalar_type = TREE_TYPE (scalar_dest);
+ if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
+ && !SCALAR_FLOAT_TYPE_P (scalar_type))
+ return false;
/* All uses but the last are expected to be defined in the loop.
The last use is the reduction variable. */
reduction variable, and get the tree-code from orig_stmt. */
orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "unsupported data-type ");
+ print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
+ }
+ return false;
+ }
+
vec_mode = TYPE_MODE (vectype);
}
else
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
- vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
+ if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
+ return false;
return true;
}
if (STMT_SLP_TYPE (stmt_info))
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable call? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
return false;
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- /* FORNOW: not yet supported. */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
rhs_type = TREE_TYPE (op0);
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
lhs_type = TREE_TYPE (scalar_dest);
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable assignment? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- /* FORNOW: not yet supported. */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
if (TREE_CODE (phi) != PHI_NODE)
return false;
int j, i;
VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
tree vop0, vop1;
+ unsigned int k;
+ bool scalar_shift_arg = false;
/* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
this, so we can safely override NCOPIES with 1 here. */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable binary/unary operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_out != nunits_in)
return false;
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "op not supported by target.");
+ /* Check only during analysis. */
if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
- || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- < vect_min_worthwhile_factor (code))
+ || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ < vect_min_worthwhile_factor (code)
+ && !vec_stmt))
return false;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "proceeding using word mode.");
}
- /* Worthwhile without SIMD support? */
+ /* Worthwhile without SIMD support? Check only during analysis. */
if (!VECTOR_MODE_P (TYPE_MODE (vectype))
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- < vect_min_worthwhile_factor (code))
+ < vect_min_worthwhile_factor (code)
+ && !vec_stmt)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "not worthwhile without SIMD support.");
/* Invariant argument is needed for a vector shift
by a scalar shift operand. */
optab_op2_mode = insn_data[icode].operand[2].mode;
- if (! (VECTOR_MODE_P (optab_op2_mode)
- || dt[1] == vect_constant_def
- || dt[1] == vect_invariant_def))
+ if (!VECTOR_MODE_P (optab_op2_mode))
{
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "operand mode requires invariant argument.");
- return false;
- }
+ if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand mode requires invariant"
+ " argument.");
+ return false;
+ }
+
+ scalar_shift_arg = true;
+ }
}
if (!vec_stmt) /* transformation not required. */
/* Handle def. */
vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ /* Allocate VECs for vector operands. In case of SLP, vector operands are
+ created in the previous stages of the recursion, so no allocation is
+ needed, except for the case of shift with scalar shift argument. In that
+ case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
+ be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
+ In case of loop-based vectorization we allocate VECs of size 1. We
+ allocate VEC_OPRNDS1 only in case of binary operation. */
if (!slp_node)
{
vec_oprnds0 = VEC_alloc (tree, heap, 1);
if (op_type == binary_op)
- vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ vec_oprnds1 = VEC_alloc (tree, heap, 1);
}
+ else if (scalar_shift_arg)
+ vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
/* In case the vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
if (j == 0)
{
if (op_type == binary_op
- && (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
- && !slp_node)
+ && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
{
/* Vector shl and shr insn patterns can be defined with scalar
operand 2 (shift operand). In this case, use constant or loop
fprintf (vect_dump, "operand 1 using scalar mode.");
vec_oprnd1 = op1;
VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ if (slp_node)
+ {
+ /* Store vec_oprnd1 for every vector stmt to be created
+ for SLP_NODE. We check during the analysis that all the
+ shift arguments are the same.
+ TODO: Allow different constants for different vector
+ stmts generated for an SLP instance. */
+ for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
+ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ }
}
}
+ /* vec_oprnd1 is available if operand 1 should be of a scalar-type
+ (a special case for certain kind of vector shifts); otherwise,
+ operand 1 should be of a vector type (the usual case). */
if (op_type == binary_op && !vec_oprnd1)
vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
slp_node);
else
- vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, &vec_oprnds1,
+ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
slp_node);
}
else
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable type-demotion operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_in != nunits_out / 2) /* FORNOW */
return false;
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable type-promotion operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_out != nunits_in / 2) /* FORNOW */
return false;
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
int j;
- tree next_stmt, first_stmt;
+ tree next_stmt, first_stmt = NULL_TREE;
bool strided_store = false;
unsigned int group_size, i;
VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable store? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
strided_store = true;
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
if (!vect_strided_store_supported (vectype)
&& !PURE_SLP_STMT (stmt_info) && !slp)
- return false;
+ return false;
+
+ if (first_stmt == stmt)
+ {
+ /* STMT is the leader of the group. Check the operands of all the
+ stmts of the group. */
+ next_stmt = DR_GROUP_NEXT_DR (stmt_info);
+ while (next_stmt)
+ {
+ op = GIMPLE_STMT_OPERAND (next_stmt, 1);
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ }
+ }
}
if (!vec_stmt) /* transformation not required. */
if (strided_store)
{
- first_stmt = DR_GROUP_FIRST_DR (stmt_info);
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
OPRNDS are of size 1. */
for (i = 0; i < group_size; i++)
{
- vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
- VEC_index (tree, oprnds, i));
+ op = VEC_index (tree, oprnds, i);
+ vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
VEC_replace(tree, dr_chain, i, vec_oprnd);
VEC_replace(tree, oprnds, i, vec_oprnd);
}
new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ mark_symbols_for_renaming (new_stmt);
new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
gcc_assert (!new_bb);
msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable load? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
for (i = 0; i < op_type; i++)
{
op = TREE_OPERAND (operation, i);
- if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
}
}
+/* Return the more conservative threshold between the
+ min_profitable_iters returned by the cost model and the user
+ specified threshold, if provided. */
+
+static unsigned int
+conservative_cost_threshold (loop_vec_info loop_vinfo,
+ int min_profitable_iters)
+{
+ unsigned int th;
+ int min_scalar_loop_bound;
+
+ /* Scale the user-specified minimum bound by the vectorization factor
+ to express it in scalar iterations; the trailing -1 presumably
+ converts an iteration count into a "niters > th" guard value --
+ TODO confirm against the callers that build the GT_EXPR test. */
+ min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
+
+ /* Use the cost model only if it is more conservative than user specified
+ threshold. */
+ th = (unsigned) min_scalar_loop_bound;
+ if (min_profitable_iters
+ && (!min_scalar_loop_bound
+ || min_profitable_iters > min_scalar_loop_bound))
+ th = (unsigned) min_profitable_iters;
+
+ /* A nonzero threshold means a runtime iteration-count guard will be
+ emitted, i.e. vectorization is only conditionally profitable. */
+ if (th && vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "Vectorization may not be profitable.");
+
+ return th;
+}
/* Function vect_do_peeling_for_loop_bound
edge update_e;
basic_block preheader;
int loop_num;
- unsigned int th;
- int min_scalar_loop_bound;
+ bool check_profitability = false;
+ unsigned int th = 0;
int min_profitable_iters;
if (vect_print_dump_info (REPORT_DETAILS))
loop_num = loop->num;
- /* Analyze cost to set threshhold for vectorized loop. */
- min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
- * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-
- /* Use the cost model only if it is more conservative than user specified
- threshold. */
+ /* If cost model check not done during versioning and
+ peeling for alignment. */
+ if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
+ && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+ {
+ check_profitability = true;
- th = (unsigned) min_scalar_loop_bound;
- if (min_profitable_iters
- && (!min_scalar_loop_bound
- || min_profitable_iters > min_scalar_loop_bound))
- th = (unsigned) min_profitable_iters;
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- if (min_profitable_iters
- && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "vectorization may not be profitable.");
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+ }
new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
ratio_mult_vf_name, ni_name, false,
- th);
+ th, check_profitability);
gcc_assert (new_loop);
gcc_assert (loop_num == loop->num);
#ifdef ENABLE_CHECKING
tree niters_of_prolog_loop, ni_name;
tree n_iters;
struct loop *new_loop;
+ bool check_profitability = false;
+ unsigned int th = 0;
+ int min_profitable_iters;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
ni_name = vect_build_loop_niters (loop_vinfo);
niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
+
+ /* If cost model check not done during versioning. */
+ if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ check_profitability = true;
+
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+ }
+
/* Peel the prolog loop and iterate it niters_of_prolog_loop. */
- new_loop =
- slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
- niters_of_prolog_loop, ni_name, true, 0);
+ new_loop =
+ slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
+ niters_of_prolog_loop, ni_name, true,
+ th, check_profitability);
+
gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
slpeel_verify_cfg_after_peeling (new_loop, loop);
checked at runtime.
Input:
+ COND_EXPR - input conditional expression. New conditions will be chained
+ with logical AND operation.
LOOP_VINFO - two fields of the loop information are used.
LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
test can be done as a&(n-1) == 0. For example, for 16
byte vectors the test is a&0xf == 0. */
-static tree
+static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
+ tree *cond_expr,
tree *cond_expr_stmt_list)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree or_tmp_name = NULL_TREE;
tree and_tmp, and_tmp_name, and_stmt;
tree ptrsize_zero;
+ tree part_cond_expr;
/* Check that mask is one less than a power of 2, i.e., mask is
all zeros followed by all ones. */
/* Make and_tmp the left operand of the conditional test against zero.
if and_tmp has a nonzero bit then some address is unaligned. */
ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
- return build2 (EQ_EXPR, boolean_type_node,
- and_tmp_name, ptrsize_zero);
+ part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+ and_tmp_name, ptrsize_zero);
+ if (*cond_expr)
+ *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ *cond_expr, part_cond_expr);
+ else
+ *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
- tree segment_length;
+ tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
+ DR_STEP (dr), vect_factor);
if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
{
- tree vector_size =
- build_int_cst (integer_type_node,
- GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE
- (vinfo_for_stmt (DR_STMT (dr))))));
+ tree vector_size = TYPE_SIZE_UNIT
+ (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
- segment_length =
- fold_convert (sizetype,
- fold_build2 (PLUS_EXPR, integer_type_node,
- fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
- vect_factor),
- vector_size));
+ segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
+ segment_length, vector_size);
}
- else
- {
- segment_length =
- fold_convert (sizetype,
- fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
- vect_factor));
- }
-
- return segment_length;
+ return fold_convert (sizetype, segment_length);
}
/* Function vect_create_cond_for_alias_checks.
Input:
COND_EXPR - input conditional expression. New conditions will be chained
- with logical and operation.
+ with logical AND operation.
LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
to be checked.
COND_EXPR - conditional expression.
COND_EXPR_STMT_LIST - statements needed to construct the conditional
expression.
+
The returned value is the conditional expression to be used in the if
statement that controls which version of the loop gets executed at runtime.
*/
for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
{
- tree stmt_a = DR_STMT (DDR_A (ddr));
- tree stmt_b = DR_STMT (DDR_B (ddr));
+ struct data_reference *dr_a, *dr_b;
+ tree dr_group_first_a, dr_group_first_b;
+ tree addr_base_a, addr_base_b;
+ tree segment_length_a, segment_length_b;
+ tree stmt_a, stmt_b;
+
+ dr_a = DDR_A (ddr);
+ stmt_a = DR_STMT (DDR_A (ddr));
+ dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
+ if (dr_group_first_a)
+ {
+ stmt_a = dr_group_first_a;
+ dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
+ }
- tree addr_base_a =
+ dr_b = DDR_B (ddr);
+ stmt_b = DR_STMT (DDR_B (ddr));
+ dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
+ if (dr_group_first_b)
+ {
+ stmt_b = dr_group_first_b;
+ dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
+ }
+
+ addr_base_a =
vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
NULL_TREE, loop);
- tree addr_base_b =
+ addr_base_b =
vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
NULL_TREE, loop);
- tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor);
- tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor);
+ segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
+ segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
if (vect_print_dump_info (REPORT_DR_DETAILS))
{
fprintf (vect_dump,
"create runtime check for data references ");
- print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
+ print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
fprintf (vect_dump, " and ");
- print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
+ print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
}
}
+/* Function vect_loop_versioning.
+
+ If the loop has data references that may or may not be aligned or/and
+ has data reference relations whose independence was not proven then
+ two versions of the loop need to be generated, one which is vectorized
+ and one which isn't. A test is then generated to control which of the
+ loops is executed. The test checks for the alignment of all of the
+ data references that may or may not be aligned. An additional
+ sequence of runtime tests is generated for each pairs of DDRs whose
+ independence was not proven. The vectorized version of loop is
+ executed only if both alias and alignment tests are passed.
+
+ The test generated to check which version of loop is executed
+ is modified to also check for profitability as indicated by the
+ cost model initially. */
+
+static void
+vect_loop_versioning (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ struct loop *nloop;
+ tree cond_expr = NULL_TREE;
+ tree cond_expr_stmt_list = NULL_TREE;
+ basic_block condition_bb;
+ block_stmt_iterator cond_exp_bsi;
+ basic_block merge_bb;
+ basic_block new_exit_bb;
+ edge new_exit_e, e;
+ tree orig_phi, new_phi, arg;
+ unsigned prob = 4 * REG_BR_PROB_BASE / 5;
+ tree gimplify_stmt_list;
+ tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
+ int min_profitable_iters = 0;
+ unsigned int th;
+
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+
+ /* Seed the versioning condition with the profitability test: the
+ vectorized version runs only if the scalar iteration count
+ exceeds TH. */
+ cond_expr =
+ build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
+ build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+ cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
+ false, NULL_TREE);
+
+ /* Chain (logical AND) the alignment and alias runtime checks, if any,
+ onto the same condition; each helper appends its setup statements to
+ COND_EXPR_STMT_LIST. */
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
+ vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
+ &cond_expr_stmt_list);
+
+ if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
+ &cond_expr_stmt_list);
+
+ /* Normalize to a boolean NE test and gimplify the combined
+ condition. */
+ cond_expr =
+ fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
+ cond_expr =
+ force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
+ NULL_TREE);
+ append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
+
+ initialize_original_copy_tables ();
+ nloop = loop_version (loop, cond_expr, &condition_bb,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ free_original_copy_tables();
+
+ /* Loop versioning violates an assumption we try to maintain during
+ vectorization - that the loop exit block has a single predecessor.
+ After versioning, the exit block of both loop versions is the same
+ basic block (i.e. it has two predecessors). Just in order to simplify
+ following transformations in the vectorizer, we fix this situation
+ here by adding a new (empty) block on the exit-edge of the loop,
+ with the proper loop-exit phis to maintain loop-closed-form. */
+
+ merge_bb = single_exit (loop)->dest;
+ gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
+ new_exit_bb = split_edge (single_exit (loop));
+ new_exit_e = single_exit (loop);
+ e = EDGE_SUCC (new_exit_bb, 0);
+
+ /* Re-route each exit phi of MERGE_BB through a new phi in the fresh
+ exit block, preserving loop-closed SSA form. */
+ for (orig_phi = phi_nodes (merge_bb); orig_phi;
+ orig_phi = PHI_CHAIN (orig_phi))
+ {
+ new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
+ new_exit_bb);
+ arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
+ add_phi_arg (new_phi, arg, new_exit_e);
+ SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
+ }
+
+ /* End loop-exit-fixes after versioning. */
+
+ update_ssa (TODO_update_ssa);
+ /* Emit the statements that compute the versioning condition just
+ before the branch selecting between the two loop versions. */
+ if (cond_expr_stmt_list)
+ {
+ cond_exp_bsi = bsi_last (condition_bb);
+ bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
+ }
+}
+
/* Remove a group of stores (for SLP or interleaving), free their
stmt_vec_info. */
return is_store;
}
-
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vec_transform_loop ===");
- /* If the loop has data references that may or may not be aligned or/and
- has data reference relations whose independence was not proven then
- two versions of the loop need to be generated, one which is vectorized
- and one which isn't. A test is then generated to control which of the
- loops is executed. The test checks for the alignment of all of the
- data references that may or may not be aligned. An additional
- sequence of runtime tests is generated for each pairs of DDRs whose
- independence was not proven. The vectorized version of loop is
- executed only if both alias and alignment tests are passed. */
-
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
|| VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
- {
- struct loop *nloop;
- tree cond_expr = NULL_TREE;
- tree cond_expr_stmt_list = NULL_TREE;
- basic_block condition_bb;
- block_stmt_iterator cond_exp_bsi;
- basic_block merge_bb;
- basic_block new_exit_bb;
- edge new_exit_e, e;
- tree orig_phi, new_phi, arg;
- unsigned prob = 4 * REG_BR_PROB_BASE / 5;
- tree gimplify_stmt_list;
-
- if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
- cond_expr =
- vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
-
- if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
- vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
- &cond_expr_stmt_list);
-
- cond_expr =
- fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
- cond_expr =
- force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
- NULL_TREE);
- append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
-
- initialize_original_copy_tables ();
- nloop = loop_version (loop, cond_expr, &condition_bb,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- free_original_copy_tables();
-
- /** Loop versioning violates an assumption we try to maintain during
- vectorization - that the loop exit block has a single predecessor.
- After versioning, the exit block of both loop versions is the same
- basic block (i.e. it has two predecessors). Just in order to simplify
- following transformations in the vectorizer, we fix this situation
- here by adding a new (empty) block on the exit-edge of the loop,
- with the proper loop-exit phis to maintain loop-closed-form. **/
-
- merge_bb = single_exit (loop)->dest;
- gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
- new_exit_bb = split_edge (single_exit (loop));
- new_exit_e = single_exit (loop);
- e = EDGE_SUCC (new_exit_bb, 0);
-
- for (orig_phi = phi_nodes (merge_bb); orig_phi;
- orig_phi = PHI_CHAIN (orig_phi))
- {
- new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
- new_exit_bb);
- arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
- add_phi_arg (new_phi, arg, new_exit_e);
- SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
- }
-
- /** end loop-exit-fixes after versioning **/
-
- update_ssa (TODO_update_ssa);
- cond_exp_bsi = bsi_last (condition_bb);
- bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
- }
+ vect_loop_versioning (loop_vinfo);
/* CHECKME: we wouldn't need this if we called update_ssa once
for all loops. */