static int vect_min_worthwhile_factor (enum tree_code);
+static int
+cost_for_stmt (tree stmt)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+ switch (STMT_VINFO_TYPE (stmt_info))
+ {
+ case load_vec_info_type:
+ return TARG_SCALAR_LOAD_COST;
+ case store_vec_info_type:
+ return TARG_SCALAR_STORE_COST;
+ case op_vec_info_type:
+ case condition_vec_info_type:
+ case assignment_vec_info_type:
+ case reduc_vec_info_type:
+ case induc_vec_info_type:
+ case type_promotion_vec_info_type:
+ case type_demotion_vec_info_type:
+ case type_conversion_vec_info_type:
+ case call_vec_info_type:
+ return TARG_SCALAR_STMT_COST;
+ case undef_vec_info_type:
+ default:
+ gcc_unreachable ();
+ }
+}
+
+
+/* Function vect_estimate_min_profitable_iters
+
+ Return the number of iterations required for the vector version of the
+ loop to be profitable relative to the cost of the scalar version of the
+ loop.
+
+ TODO: Take profile info into account before making vectorization
+ decisions, if available. */
+
+int
+vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
+{
+ int i;
+ int min_profitable_iters;
+ int peel_iters_prologue;
+ int peel_iters_epilogue;
+ int vec_inside_cost = 0;
+ int vec_outside_cost = 0;
+ int scalar_single_iter_cost = 0;
+ int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+ int nbbs = loop->num_nodes;
+ int byte_misalign;
+
+ /* Cost model disabled. */
+ if (!flag_vect_cost_model)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model disabled.");
+ return 0;
+ }
+
+ /* Requires loop versioning tests to handle misalignment.
+ FIXME: Make cost depend on number of stmts in may_misalign list. */
+
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
+ {
+ vec_outside_cost += TARG_COND_BRANCH_COST;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: Adding cost of checks for loop "
+ "versioning.\n");
+ }
+
+ /* Count statements in scalar loop. Using this as scalar cost for a single
+ iteration for now.
+
+ TODO: Add outer loop support.
+
+ TODO: Consider assigning different costs to different scalar
+ statements. */
+
+ for (i = 0; i < nbbs; i++)
+ {
+ block_stmt_iterator si;
+ basic_block bb = bbs[i];
+
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ if (!STMT_VINFO_RELEVANT_P (stmt_info)
+ && !STMT_VINFO_LIVE_P (stmt_info))
+ continue;
+ scalar_single_iter_cost += cost_for_stmt (stmt);
+ vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info);
+ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+ }
+ }
+
+ /* Add additional cost for the peeled iterations executed in the prologue
+ and epilogue loops.
+
+ FORNOW: If we don't know the value of peel_iters for prologue or epilogue
+ at compile time, we assume it's (vf-1)/2 (the worst case would be vf-1).
+
+ TODO: Build an expression that represents peel_iters for prologue and
+ epilogue to be used in a run-time test. */
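+
+ /* For example, with vf = 4 the assumption above charges (4 - 1) / 2 = 1
+ scalar iteration each to the prologue and epilogue, instead of the
+ worst case of vf - 1 = 3 iterations each. */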
+
+ byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+
+ if (byte_misalign < 0)
+ {
+ peel_iters_prologue = (vf - 1)/2;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: "
+ "prologue peel iters set to (vf-1)/2.");
+
+ /* If peeling for alignment is unknown, loop bound of main loop becomes
+ unknown. */
+ peel_iters_epilogue = (vf - 1)/2;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: "
+ "epilogue peel iters set to (vf-1)/2 because "
+ "peeling for alignment is unknown .");
+ }
+ else
+ {
+ if (byte_misalign)
+ {
+ struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+ int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+ int nelements = TYPE_VECTOR_SUBPARTS (vectype);
+
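+ /* E.g. for a V4SI access with element_size = 4 and byte_misalign = 8,
+ two elements are misaligned, so the prologue must peel
+ 4 - (8 / 4) = 2 scalar iterations. */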
+ peel_iters_prologue = nelements - (byte_misalign / element_size);
+ }
+ else
+ peel_iters_prologue = 0;
+
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ peel_iters_epilogue = (vf - 1)/2;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: "
+ "epilogue peel iters set to (vf-1)/2 because "
+ "loop iterations are unknown .");
+ }
+ else
+ {
+ int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+ peel_iters_prologue = niters < peel_iters_prologue ?
+ niters : peel_iters_prologue;
+ peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
+ }
+ }
+
+ /* Requires a prologue loop when peeling to handle misalignment. Add cost of
+ two guards, one for the peeled loop and one for the vector loop. */
+
+ if (peel_iters_prologue)
+ {
+ vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: Adding cost of checks for "
+ "prologue.\n");
+ }
+
+ /* Requires an epilogue loop to finish up remaining iterations after vector
+ loop. Add cost of two guards, one for the peeled loop and one for the
+ vector loop. */
+
+ if (peel_iters_epilogue
+ || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
+ {
+ vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model : Adding cost of checks for "
+ "epilogue.\n");
+ }
+
+ vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
+ + (peel_iters_epilogue * scalar_single_iter_cost);
+
+ /* Allow targets to add additional (outside-of-loop) costs. FORNOW, the only
+ information we provide for the target is whether testing against the
+ threshold involves a runtime test. */
+ if (targetm.vectorize.builtin_vectorization_cost)
+ {
+ bool runtime_test = false;
+
+ /* If the number of iterations is unknown, or the
+ peeling-for-misalignment amount is unknown, we will have to generate
+ a runtime test to check the loop count against the threshold. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || (byte_misalign < 0))
+ runtime_test = true;
+ vec_outside_cost +=
+ targetm.vectorize.builtin_vectorization_cost (runtime_test);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
+ targetm.vectorize.builtin_vectorization_cost (runtime_test));
+ }
+
+ /* Calculate number of iterations required to make the vector version
+ profitable, relative to the loop bodies only. The following condition
+ must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
+ SIC = scalar iteration cost, VIC = vector iteration cost,
+ VOC = vector outside cost and VF = vectorization factor. */
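+
+ /* For example, with illustrative costs SIC = 4, VIC = 6, VOC = 14 and
+ VF = 4, the condition is (4*4 - 6) * niters > 14 * 4, i.e.
+ niters > 5.6, and the computation below yields
+ min_profitable_iters = 6. */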
+
+ if ((scalar_single_iter_cost * vf) > vec_inside_cost)
+ {
+ if (vec_outside_cost == 0)
+ min_profitable_iters = 1;
+ else
+ {
+ min_profitable_iters = (vec_outside_cost * vf)
+ / ((scalar_single_iter_cost * vf)
+ - vec_inside_cost);
+
+ if ((scalar_single_iter_cost * vf * min_profitable_iters)
+ <= ((vec_inside_cost * min_profitable_iters)
+ + (vec_outside_cost * vf)))
+ min_profitable_iters++;
+ }
+ }
+ /* Vector version will never be profitable. */
+ else
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "cost model: vector iteration cost = %d "
+ "is divisible by scalar iteration cost = %d by a factor "
+ "greater than or equal to the vectorization factor = %d .",
+ vec_inside_cost, scalar_single_iter_cost, vf);
+ return -1;
+ }
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "Cost model analysis: \n");
+ fprintf (vect_dump, " Vector inside of loop cost: %d\n",
+ vec_inside_cost);
+ fprintf (vect_dump, " Vector outside of loop cost: %d\n",
+ vec_outside_cost);
+ fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
+ fprintf (vect_dump, " prologue iterations: %d\n",
+ peel_iters_prologue);
+ fprintf (vect_dump, " epilogue iterations: %d\n",
+ peel_iters_epilogue);
+ fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
+ min_profitable_iters);
+ fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
+ min_profitable_iters < vf ? vf : min_profitable_iters);
+ }
+
+ min_profitable_iters =
+ min_profitable_iters < vf ? vf : min_profitable_iters;
+
+ /* Because the condition we create is:
+ if (niters <= min_profitable_iters)
+ then skip the vectorized loop. */
+ min_profitable_iters--;
+ return min_profitable_iters;
+}
+
+
+/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
+ functions. Design this better to avoid maintenance issues. */
+
+/* Function vect_model_reduction_cost.
+
+ Models cost for a reduction operation, including the vector ops
+ generated within the strip-mine loop, the initial definition before
+ the loop, and the epilogue code that must be generated. */
+
+static void
+vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
+ int ncopies)
+{
+ int outer_cost = 0;
+ enum tree_code code;
+ optab optab;
+ tree vectype;
+ tree orig_stmt;
+ tree reduction_op;
+ enum machine_mode mode;
+ tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
+ int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
+
+ /* Cost of reduction op inside loop. */
+ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
+
+ reduction_op = TREE_OPERAND (operation, op_type-1);
+ vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ mode = TYPE_MODE (vectype);
+ orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+
+ if (!orig_stmt)
+ orig_stmt = STMT_VINFO_STMT (stmt_info);
+
+ code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
+
+ /* Add in cost for initial definition. */
+ outer_cost += TARG_SCALAR_TO_VEC_COST;
+
+ /* Determine cost of epilogue code.
+
+ We have a reduction operator that will reduce the vector in one statement.
+ Also requires scalar extract. */
+
+ if (reduc_code < NUM_TREE_CODES)
+ outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
+ else
+ {
+ int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+ tree bitsize =
+ TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
+ int element_bitsize = tree_low_cst (bitsize, 1);
+ int nelements = vec_size_in_bits / element_bitsize;
+
+ optab = optab_for_tree_code (code, vectype);
+
+ /* We have a whole vector shift available. */
+ if (VECTOR_MODE_P (mode)
+ && optab->handlers[mode].insn_code != CODE_FOR_nothing
+ && vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
+ /* Final reduction via vector shifts and the reduction operator. Also
+ requires scalar extract. */
+ outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
+ + TARG_VEC_TO_SCALAR_COST);
+ else
+ /* Use extracts and reduction op for final reduction. For N elements,
+ we have N extracts and N-1 reduction ops. */
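+ /* E.g. for nelements = 4 this is 4 + 3 = 7 vector stmt costs,
+ versus 2 * log2 (4) = 4 stmts plus one extract for the
+ whole-vector-shift scheme above. */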
+ outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
+ }
+
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
+ "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+}
+
+
+/* Function vect_model_induction_cost.
+
+ Models cost for induction operations. */
+
+static void
+vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
+{
+ /* Loop cost for vec_loop. */
+ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
+ /* Prologue cost for vec_init and vec_step. */
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
+ "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+}
+
+
+/* Function vect_model_simple_cost.
+
+ Models cost for simple operations, i.e. those that only emit ncopies of a
+ single op. Right now, this does not account for multiple insns that could
+ be generated for the single vector op. We will handle that shortly. */
+
+static void
+vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
+ enum vect_def_type *dt)
+{
+ int i;
+
+ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
+
+ /* FORNOW: Assuming maximum 2 args per stmt. */
+ for (i = 0; i < 2; i++)
+ {
+ if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
+ }
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
+ "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+}
+
+
+/* Function vect_cost_strided_group_size
+
+ For strided load or store, return the group_size only if it is the first
+ load or store of a group, else return 1. This ensures that group size is
+ only returned once per group. */
+
+static int
+vect_cost_strided_group_size (stmt_vec_info stmt_info)
+{
+ tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
+
+ if (first_stmt == STMT_VINFO_STMT (stmt_info))
+ return DR_GROUP_SIZE (stmt_info);
+
+ return 1;
+}
+
+
+/* Function vect_model_store_cost
+
+ Models cost for stores. In the case of strided accesses, one access
+ has the overhead of the strided access attributed to it. */
+
+static void
+vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
+ enum vect_def_type dt)
+{
+ int cost = 0;
+ int group_size;
+
+ if (dt == vect_constant_def || dt == vect_invariant_def)
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;
+
+ /* Strided access? */
+ if (DR_GROUP_FIRST_DR (stmt_info))
+ group_size = vect_cost_strided_group_size (stmt_info);
+ /* Not a strided access. */
+ else
+ group_size = 1;
+
+ /* Is this an access in a group of stores, which provide strided access?
+ If so, add in the cost of the permutes. */
+ if (group_size > 1)
+ {
+ /* Uses a high and low interleave operation for each needed permute. */
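+ /* E.g. for group_size = 4 this charges log2 (4) * 4 = 8 interleave
+ stmts per copy. */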
+ cost = ncopies * exact_log2 (group_size) * group_size
+ * TARG_VEC_STMT_COST;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
+ group_size);
+
+ }
+
+ /* Costs of the stores. */
+ cost += ncopies * TARG_VEC_STORE_COST;
+
+ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
+ "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+}
+
+
+/* Function vect_model_load_cost
+
+ Models cost for loads. In the case of strided accesses, one access
+ has the overhead of the strided access attributed to it. Since unaligned
+ accesses are supported for loads, we also account for the costs of the
+ access scheme chosen. */
+
+static void
+vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
+
+{
+ int inner_cost = 0;
+ int group_size;
+ int alignment_support_scheme;
+ tree first_stmt;
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
+
+ /* Strided accesses? */
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
+ if (first_stmt)
+ {
+ group_size = vect_cost_strided_group_size (stmt_info);
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+ }
+ /* Not a strided access. */
+ else
+ {
+ group_size = 1;
+ first_dr = dr;
+ }
+
+ alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+
+ /* Is this an access in a group of loads providing strided access?
+ If so, add in the cost of the permutes. */
+ if (group_size > 1)
+ {
+ /* Uses even and odd extract operations for each needed permute. */
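+ /* E.g. for group_size = 4 this charges log2 (4) * 4 = 8 extract
+ stmts per copy. */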
+ inner_cost = ncopies * exact_log2 (group_size) * group_size
+ * TARG_VEC_STMT_COST;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
+ group_size);
+
+ }
+
+ /* The loads themselves. */
+ switch (alignment_support_scheme)
+ {
+ case dr_aligned:
+ {
+ inner_cost += ncopies * TARG_VEC_LOAD_COST;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_load_cost: aligned.");
+
+ break;
+ }
+ case dr_unaligned_supported:
+ {
+ /* Here, we assign an additional cost for the unaligned load. */
+ inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
+ "hardware.");
+
+ break;
+ }
+ case dr_unaligned_software_pipeline:
+ {
+ int outer_cost = 0;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_load_cost: unaligned software "
+ "pipelined.");
+
+ /* Unaligned software pipeline has a load of an address, an initial
+ load, and possibly a mask operation to "prime" the loop. However,
+ if this is an access in a group of loads, which provide strided
+ access, then the above cost should only be considered for one
+ access in the group. Inside the loop, there is a load op
+ and a realignment op. */
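+ /* Concretely: 2 vector stmts outside the loop (3 when a mask must
+ be built via targetm.vectorize.builtin_mask_for_load), attributed
+ to at most one access of a strided group, plus one vector load and
+ one realignment stmt per copy inside the loop. */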
+
+ if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
+ {
+ outer_cost = 2 * TARG_VEC_STMT_COST;
+ if (targetm.vectorize.builtin_mask_for_load)
+ outer_cost += TARG_VEC_STMT_COST;
+ }
+
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
+
+ inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
+
+ break;
+ }
+
+ default:
+ gcc_unreachable ();
+ }
+
+ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
+ "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
+ STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+}
+
+
/* Function vect_get_new_vect_var.
Returns a name for a new variable. The current naming scheme appends the
/* Create base_offset */
base_offset = size_binop (PLUS_EXPR, base_offset, init);
+ base_offset = fold_convert (sizetype, base_offset);
dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
add_referenced_var (dest);
base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
if (offset)
{
- tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
+ tree tmp = create_tmp_var (sizetype, "offset");
tree step;
/* For interleaved access step we divide STEP by the size of the
offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
base_offset, offset);
- base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
+ base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
append_to_statement_list_force (new_stmt, new_stmt_list);
}
/* base + base_offset */
- addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
+ addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
base_offset);
vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
accessed in the loop by STMT, along with the def-use update chain to
appropriately advance the pointer through the loop iterations. Also set
aliasing information for the pointer. This vector pointer is used by the
- callers to this function to create a memory reference expression for vector
+ callers to this function to create a memory reference expression for vector
load/store access.
Input:
/** (2) Add aliasing information to the new vector-pointer:
(The points-to info (DR_PTR_INFO) may be defined later.) **/
- tag = DR_MEMTAG (dr);
+ tag = DR_SYMBOL_TAG (dr);
gcc_assert (tag);
/* If tag is a variable (and NOT_A_TAG) than a new symbol memory
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
tree vptr_type = TREE_TYPE (dataref_ptr);
tree ptr_var = SSA_NAME_VAR (dataref_ptr);
- tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
+ tree update = TYPE_SIZE_UNIT (vectype);
tree incr_stmt;
ssa_op_iter iter;
use_operand_p use_p;
tree new_dataref_ptr;
incr_stmt = build_gimple_modify_stmt (ptr_var,
- build2 (PLUS_EXPR, vptr_type,
+ build2 (POINTER_PLUS_EXPR, vptr_type,
dataref_ptr, update));
new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
- tree scalar_type = TREE_TYPE (iv_phi);
+ tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
tree vectype = get_vectype_for_scalar_type (scalar_type);
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
edge pe = loop_preheader_edge (loop);
access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
gcc_assert (access_fn);
- ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
+ ok = vect_is_simple_iv_evolution (loop->num, access_fn,
+ &init_expr, &step_expr);
gcc_assert (ok);
/* Create the vector that holds the initial_value of the induction. */
/* FIXME: use build_constructor directly. */
vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
vec_inv = build_constructor_from_list (vector_type, t);
+
return vect_init_vector (stmt, vec_inv, vector_type);
}
vectorized stmt to be created (by the caller to this function) is a "copy"
created in case the vectorized result cannot fit in one vector, and several
copies of the vector-stmt are required. In this case the vector-def is
- retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
+ retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
of the stmt that defines VEC_OPRND.
DT is the type of the vector def VEC_OPRND.
}
-#define ADJUST_IN_EPILOG 1
-
/* Function get_initial_def_for_reduction
Input:
INIT_VAL - the initial value of the reduction variable
Output:
- SCALAR_DEF - a tree that holds a value to be added to the final result
- of the reduction (used for "ADJUST_IN_EPILOG" - see below).
+ ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
+ of the reduction (used for adjusting the epilog - see below).
Return a vector variable, initialized according to the operation that STMT
- performs. This vector will be used as the initial value of the
- vector of partial results.
+ performs. This vector will be used as the initial value of the
+ vector of partial results.
- Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
+ Option1 (adjust in epilog): Initialize the vector as follows:
add: [0,0,...,0,0]
mult: [1,1,...,1,1]
min/max: [init_val,init_val,..,init_val,init_val]
bit and/or: [init_val,init_val,..,init_val,init_val]
- and when necessary (e.g. add/mult case) let the caller know
+ and when necessary (e.g. add/mult case) let the caller know
that it needs to adjust the result by init_val.
Option2: Initialize the vector as follows:
or [0,0,0,0] and let the caller know that it needs to adjust
the result at the end by 'init_val'.
- FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
- TODO: Use some cost-model to estimate which scheme is more profitable.
-*/
+ FORNOW, we are using the 'adjust in epilog' scheme, because this way the
+ initialization vector is simpler (same element in all entries).
+ A cost model should help decide between these two schemes. */
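+
+ /* For example, following the code below: a PLUS reduction with
+ init_val = 5 and V4SI yields init_def = {0,0,0,0} with *ADJUSTMENT_DEF
+ set to 5, whereas MIN/MAX yields the vector def of init_val (a splat)
+ and *ADJUSTMENT_DEF = NULL_TREE. */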
static tree
-get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
+get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
{
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
- int nelements;
enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
tree type = TREE_TYPE (init_val);
- tree def;
- tree vec, t = NULL_TREE;
- bool need_epilog_adjust;
+ tree vecdef;
+ tree def_for_init;
+ tree init_def;
+ tree t = NULL_TREE;
int i;
tree vector_type;
gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
+ vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
switch (code)
{
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
+ *adjustment_def = init_val;
+ /* Create a vector of zeros for init_def. */
if (INTEGRAL_TYPE_P (type))
- def = build_int_cst (type, 0);
+ def_for_init = build_int_cst (type, 0);
else
- def = build_real (type, dconst0);
-
-#ifdef ADJUST_IN_EPILOG
- /* All the 'nunits' elements are set to 0. The final result will be
- adjusted by 'init_val' at the loop epilog. */
- nelements = nunits;
- need_epilog_adjust = true;
-#else
- /* 'nunits - 1' elements are set to 0; The last element is set to
- 'init_val'. No further adjustments at the epilog are needed. */
- nelements = nunits - 1;
- need_epilog_adjust = false;
-#endif
+ def_for_init = build_real (type, dconst0);
+ for (i = nunits - 1; i >= 0; --i)
+ t = tree_cons (NULL_TREE, def_for_init, t);
+ vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
+ init_def = build_vector (vector_type, t);
break;
case MIN_EXPR:
case MAX_EXPR:
- def = init_val;
- nelements = nunits;
- need_epilog_adjust = false;
+ *adjustment_def = NULL_TREE;
+ init_def = vecdef;
break;
default:
gcc_unreachable ();
}
- for (i = nelements - 1; i >= 0; --i)
- t = tree_cons (NULL_TREE, def, t);
-
- if (nelements == nunits - 1)
- {
- /* Set the last element of the vector. */
- t = tree_cons (NULL_TREE, init_val, t);
- nelements += 1;
- }
- gcc_assert (nelements == nunits);
-
- vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
- if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
- vec = build_vector (vector_type, t);
- else
- vec = build_constructor_from_list (vector_type, t);
-
- if (!need_epilog_adjust)
- *scalar_def = NULL_TREE;
- else
- *scalar_def = init_val;
-
- return vect_init_vector (stmt, vec, vector_type);
+ return init_def;
}
loop:
vec_def = phi <null, null> # REDUCTION_PHI
- VECT_DEF = vector_stmt # vectorized form of STMT
+ VECT_DEF = vector_stmt # vectorized form of STMT
s_loop = scalar_stmt # (scalar) STMT
loop_exit:
s_out0 = phi <s_loop> # (scalar) EXIT_PHI
exit_bb = single_exit (loop)->dest;
new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
- exit_bsi = bsi_start (exit_bb);
+ exit_bsi = bsi_after_labels (exit_bb);
/* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
- (i.e. when reduc_code is not available) and in the final adjustment code
- (if needed). Also get the original scalar reduction variable as
+ (i.e. when reduc_code is not available) and in the final adjustment
+ code (if needed). Also get the original scalar reduction variable as
defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
represents a reduction pattern), the tree-code and scalar-def are
taken from the original stmt that the pattern-stmt (STMT) replaces.
epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
extract_scalar_result = true;
}
epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
new_name = make_ssa_name (vec_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
tmp = build2 (code, vectype, new_name, new_temp);
epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
}
extract_scalar_result = true;
epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
bitpos);
BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
- epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
+ epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
tmp = build2 (code, scalar_type, new_name, new_temp);
epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
}
extract_scalar_result = false;
epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
}
/* 2.4 Adjust the final result by the initial value of the reduction
epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
- bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+ bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
}
/* 2.6 Replace uses of s_out0 with uses of s_out3 */
and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
sequence that had been detected and replaced by the pattern-stmt (STMT).
- In some cases of reduction patterns, the type of the reduction variable X is
+ In some cases of reduction patterns, the type of the reduction variable X is
different than the type of the other arguments of STMT.
In such cases, the vectype that is used when transforming STMT into a vector
- stmt is different than the vectype that is used to determine the
+ stmt is different than the vectype that is used to determine the
vectorization factor, because it consists of a different number of elements
than the actual number of elements that are being operated upon in parallel.
- For example, consider an accumulation of shorts into an int accumulator.
+ For example, consider an accumulation of shorts into an int accumulator.
On some targets it's possible to vectorize this pattern operating on 8
shorts at a time (hence, the vectype for purposes of determining the
vectorization factor should be V8HI); on the other hand, the vectype that
- is used to create the vector form is actually V4SI (the type of the result).
+ is used to create the vector form is actually V4SI (the type of the result).
- Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
- indicates what is the actual level of parallelism (V8HI in the example), so
- that the right vectorization factor would be derived. This vectype
- corresponds to the type of arguments to the reduction stmt, and should *NOT*
+ Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
+ indicates what is the actual level of parallelism (V8HI in the example), so
+ that the right vectorization factor would be derived. This vectype
+ corresponds to the type of arguments to the reduction stmt, and should *NOT*
be used to create the vectorized stmt. The right vectype for the vectorized
- stmt is obtained from the type of the result X:
+ stmt is obtained from the type of the result X:
get_vectype_for_scalar_type (TREE_TYPE (X))
- This means that, contrary to "regular" reductions (or "regular" stmts in
+ This means that, contrary to "regular" reductions (or "regular" stmts in
general), the following equation:
STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
does *NOT* necessarily hold for reduction patterns. */
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+ vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
return true;
}
op = TREE_OPERAND (operation, 1);
loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
}
-
+
/* Get the vector def for the reduction variable from the phi node */
reduc_def = PHI_RESULT (new_phi);
}
loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
if (op_type == ternary_op)
loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
-
+
/* Get the vector def for the reduction variable from the vectorized
reduction operation generated in the previous iteration (j-1) */
reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
}
-
+
+ /* Arguments are ready. Create the new vector stmt. */
-
if (op_type == binary_op)
expr = build2 (code, vectype, loop_vec_def0, reduc_def);
else
expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
- reduc_def);
+ reduc_def);
new_stmt = build_gimple_modify_stmt (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
-
+
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
-
+
/* Finalize the reduction-phi (set it's arguments) and create the
epilog reduction code. */
- vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
+ vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
return true;
}
tree scalar_dest;
tree operation;
tree op, type;
+ tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
tree vectype_out, vectype_in;
+ int nunits_in;
+ int nunits_out;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
- enum vect_def_type dt[2];
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+ tree new_stmt;
int ncopies, j, nargs;
call_expr_arg_iterator iter;
+ tree vargs;
+ enum { NARROW, NONE, WIDEN } modifier;
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
nargs = 0;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
{
- ++nargs;
-
/* Bail out if the function has more than two arguments, we
do not have interesting builtin functions to vectorize with
more than two arguments. */
- if (nargs > 2)
+ if (nargs >= 2)
return false;
/* We can only handle calls with arguments of the same type. */
}
rhs_type = TREE_TYPE (op);
- if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
return false;
}
+
+ ++nargs;
}
/* No arguments is also not good. */
return false;
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
- /* Only handle the case of vectors with the same number of elements.
- FIXME: We need a way to handle for example the SSE2 cvtpd2dq
- instruction which converts V2DFmode to V4SImode but only
- using the lower half of the V4SImode result. */
- if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
+ /* FORNOW */
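+ /* E.g. a conversion like V2DF -> V4SI, where the output vector has
+ twice as many elements as the input, is classified as NARROW. */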
+ if (nunits_in == nunits_out / 2)
+ modifier = NARROW;
+ else if (nunits_out == nunits_in)
+ modifier = NONE;
+ else if (nunits_out == nunits_in / 2)
+ modifier = WIDEN;
+ else
return false;
/* For now, we only vectorize functions if a target specific builtin
gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
+ if (modifier == NARROW)
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+ else
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+
+ /* Sanity check: make sure that at least one copy of the vectorized stmt
+ needs to be generated. */
+ gcc_assert (ncopies >= 1);
+
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_call ===");
+ vect_model_simple_cost (stmt_info, ncopies, dt);
return true;
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform operation.");
- ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- / TYPE_VECTOR_SUBPARTS (vectype_out));
- gcc_assert (ncopies >= 1);
-
/* Handle def. */
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
prev_stmt_info = NULL;
- for (j = 0; j < ncopies; ++j)
- {
- tree new_stmt, vargs;
- tree vec_oprnd[2];
- int n;
-
- /* Build argument list for the vectorized call. */
- /* FIXME: Rewrite this so that it doesn't construct a temporary
- list. */
- vargs = NULL_TREE;
- n = -1;
- FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ switch (modifier)
+ {
+ case NONE:
+ for (j = 0; j < ncopies; ++j)
{
- ++n;
+ /* Build argument list for the vectorized call. */
+ /* FIXME: Rewrite this so that it doesn't
+ construct a temporary list. */
+ vargs = NULL_TREE;
+ nargs = 0;
+ FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ {
+ if (j == 0)
+ vec_oprnd0
+ = vect_get_vec_def_for_operand (op, stmt, NULL);
+ else
+ vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+
+ vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+
+ ++nargs;
+ }
+ vargs = nreverse (vargs);
+
+ rhs = build_function_call_expr (fndecl, vargs);
+ new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
if (j == 0)
- vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
- vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
}
- vargs = nreverse (vargs);
- rhs = build_function_call_expr (fndecl, vargs);
- new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ break;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ case NARROW:
+ for (j = 0; j < ncopies; ++j)
+ {
+ /* Build argument list for the vectorized call. */
+ /* FIXME: Rewrite this so that it doesn't
+ construct a temporary list. */
+ vargs = NULL_TREE;
+ nargs = 0;
+ FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ {
+ if (j == 0)
+ {
+ vec_oprnd0
+ = vect_get_vec_def_for_operand (op, stmt, NULL);
+ vec_oprnd1
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+ }
+ else
+ {
+ vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
+ vec_oprnd1
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+ }
+
+ vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+ vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
+
+ ++nargs;
+ }
+ vargs = nreverse (vargs);
+
+ rhs = build_function_call_expr (fndecl, vargs);
+ new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+
+ break;
+
+ case WIDEN:
+ /* No current target implements this case. */
+ return false;
+ }
+
+ /* The call in STMT might prevent it from being removed in dce.
+ We however cannot remove it here, due to the way the ssa name
+ it defines is mapped to the new definition. So just replace
+ rhs of the statement with something harmless. */
+ type = TREE_TYPE (scalar_dest);
+ GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
+ update_stmt (stmt);
+
+ return true;
+}
+
+
+/* Function vect_gen_widened_results_half
+
+ Create a vector stmt whose code, type, number of arguments, and result
+ variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
+ VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
+ In the case that CODE is a CALL_EXPR, this means that a call to DECL
+ needs to be created (DECL is a function-decl of a target-builtin).
+ STMT is the original scalar stmt that we are vectorizing. */
+
+static tree
+vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
+ tree vec_oprnd0, tree vec_oprnd1, int op_type,
+ tree vec_dest, block_stmt_iterator *bsi,
+ tree stmt)
+{
+ tree expr;
+ tree new_stmt;
+ tree new_temp;
+ tree sym;
+ ssa_op_iter iter;
+
+ /* Generate half of the widened result: */
+ if (code == CALL_EXPR)
+ {
+ /* Target specific support */
+ if (op_type == binary_op)
+ expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
+ else
+ expr = build_call_expr (decl, 1, vec_oprnd0);
+ }
+ else
+ {
+ /* Generic support */
+ gcc_assert (op_type == TREE_CODE_LENGTH (code));
+ if (op_type == binary_op)
+ expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
+ else
+ expr = build1 (code, vectype, vec_oprnd0);
+ }
+ new_stmt = build_gimple_modify_stmt (vec_dest, expr);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
- else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- prev_stmt_info = vinfo_for_stmt (new_stmt);
+ if (code == CALL_EXPR)
+ {
+ FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
+ {
+ if (TREE_CODE (sym) == SSA_NAME)
+ sym = SSA_NAME_VAR (sym);
+ mark_sym_for_renaming (sym);
+ }
}
- /* The call in STMT might prevent it from being removed in dce. We however
- cannot remove it here, due to the way the ssa name it defines is mapped
- to the new definition. So just replace rhs of the statement with something
- harmless. */
- type = TREE_TYPE (scalar_dest);
- GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
-
- return true;
+ return new_stmt;
}
tree scalar_dest;
tree operation;
tree op0;
- tree vec_oprnd0 = NULL_TREE;
+ tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
- enum tree_code code;
+ enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
+ tree decl1 = NULL_TREE, decl2 = NULL_TREE;
tree new_temp;
tree def, def_stmt;
enum vect_def_type dt0;
tree new_stmt;
+ stmt_vec_info prev_stmt_info;
int nunits_in;
int nunits_out;
- int ncopies, j;
tree vectype_out, vectype_in;
+ int ncopies, j;
+ tree expr;
tree rhs_type, lhs_type;
tree builtin_decl;
- stmt_vec_info prev_stmt_info;
+ enum { NARROW, NONE, WIDEN } modifier;
/* Is STMT a vectorizable conversion? */
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
lhs_type = TREE_TYPE (scalar_dest);
vectype_out = get_vectype_for_scalar_type (lhs_type);
- gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
- /* FORNOW: need to extend to support short<->float conversions as well. */
- if (nunits_out != nunits_in)
+ /* FORNOW */
+ if (nunits_in == nunits_out / 2)
+ modifier = NARROW;
+ else if (nunits_out == nunits_in)
+ modifier = NONE;
+ else if (nunits_out == nunits_in / 2)
+ modifier = WIDEN;
+ else
return false;
+ if (modifier == NONE)
+ gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
+
/* Bail out if the types are both integral or non-integral */
if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
|| (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
return false;
+ if (modifier == NARROW)
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+ else
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+
/* Sanity check: make sure that at least one copy of the vectorized stmt
needs to be generated. */
- ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
gcc_assert (ncopies >= 1);
+ /* Check the operands of the operation. */
if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
{
if (vect_print_dump_info (REPORT_DETAILS))
}
/* Supportable by target? */
- if (!targetm.vectorize.builtin_conversion (code, vectype_in))
+ if ((modifier == NONE
+ && !targetm.vectorize.builtin_conversion (code, vectype_in))
+ || (modifier == WIDEN
+ && !supportable_widening_operation (code, stmt, vectype_in,
+ &decl1, &decl2,
+ &code1, &code2))
+ || (modifier == NARROW
+ && !supportable_narrowing_operation (code, stmt, vectype_in,
+ &code1)))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "op not supported by target.");
return false;
}
+ if (modifier != NONE)
+ STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
return true;
}
- /** Transform. **/
-
+ /** Transform. **/
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform conversion.");
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
prev_stmt_info = NULL;
- for (j = 0; j < ncopies; j++)
+ switch (modifier)
{
- tree sym;
- ssa_op_iter iter;
+ case NONE:
+ for (j = 0; j < ncopies; j++)
+ {
+ tree sym;
+ ssa_op_iter iter;
- if (j == 0)
- vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
- else
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ if (j == 0)
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ else
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
- builtin_decl =
- targetm.vectorize.builtin_conversion (code, vectype_in);
- new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
+ builtin_decl =
+ targetm.vectorize.builtin_conversion (code, vectype_in);
+ new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
- /* Arguments are ready. create the new vector stmt. */
- new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
- FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
- {
- if (TREE_CODE (sym) == SSA_NAME)
- sym = SSA_NAME_VAR (sym);
- mark_sym_for_renaming (sym);
- }
+ /* Arguments are ready. Create the new vector stmt. */
+ new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
+ {
+ if (TREE_CODE (sym) == SSA_NAME)
+ sym = SSA_NAME_VAR (sym);
+ mark_sym_for_renaming (sym);
+ }
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
- else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- prev_stmt_info = vinfo_for_stmt (new_stmt);
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+ break;
+
+ case WIDEN:
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to
+ generate more than one vector stmt - i.e., we need to "unroll"
+ the vector stmt by a factor VF/nunits. */
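+ /* E.g. with VF = 8 and nunits = 4 we have ncopies = 2, and each
+ copy below emits two vector stmts (the two widened halves). */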
+ for (j = 0; j < ncopies; j++)
+ {
+ if (j == 0)
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ else
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+
+ STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+
+ /* Generate first half of the widened result: */
+ new_stmt
+ = vect_gen_widened_results_half (code1, vectype_out, decl1,
+ vec_oprnd0, vec_oprnd1,
+ unary_op, vec_dest, bsi, stmt);
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+
+ /* Generate second half of the widened result: */
+ new_stmt
+ = vect_gen_widened_results_half (code2, vectype_out, decl2,
+ vec_oprnd0, vec_oprnd1,
+ unary_op, vec_dest, bsi, stmt);
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+ break;
+
+ case NARROW:
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to
+ generate more than one vector stmt - i.e., we need to "unroll"
+ the vector stmt by a factor VF/nunits. */
+ for (j = 0; j < ncopies; j++)
+ {
+ /* Handle uses. */
+ if (j == 0)
+ {
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ }
+ else
+ {
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ }
+
+ /* Arguments are ready. Create the new vector stmt. */
+ expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
+ new_stmt = build_gimple_modify_stmt (vec_dest, expr);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
}
return true;
}
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
tree new_temp;
tree def, def_stmt;
- enum vect_def_type dt;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
return false;
op = GIMPLE_STMT_OPERAND (stmt, 1);
- if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_assignment ===");
+ vect_model_simple_cost (stmt_info, ncopies, dt);
return true;
}
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_induction ===");
+ vect_model_induction_cost (stmt_info, ncopies);
return true;
}
int icode;
enum machine_mode optab_op2_mode;
tree def, def_stmt;
- enum vect_def_type dt0, dt1;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
tree new_stmt;
stmt_vec_info prev_stmt_info;
int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
operation = GIMPLE_STMT_OPERAND (stmt, 1);
code = TREE_CODE (operation);
+
+ /* For pointer addition, we should use the normal plus for
+ the vector addition. */
+ if (code == POINTER_PLUS_EXPR)
+ code = PLUS_EXPR;
+
optab = optab_for_tree_code (code, vectype);
/* Support only unary or binary operations. */
}
op0 = TREE_OPERAND (operation, 0);
- if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
return false;
}
-
+
if (op_type == binary_op)
{
op1 = TREE_OPERAND (operation, 1);
- if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
+ if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
by a scalar shift operand. */
optab_op2_mode = insn_data[icode].operand[2].mode;
if (! (VECTOR_MODE_P (optab_op2_mode)
- || dt1 == vect_constant_def
- || dt1 == vect_invariant_def))
+ || dt[1] == vect_constant_def
+ || dt[1] == vect_invariant_def))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "operand mode requires invariant argument.");
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_operation ===");
+ vect_model_simple_cost (stmt_info, ncopies, dt);
return true;
}
stmts that use the defs of the current stmt. The example below illustrates
the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4 vectorized stmts):
-
+
before vectorization:
RELATED_STMT VEC_STMT
S1: x = memref - -
S2: z = x + 1 - -
-
+
step 1: vectorize stmt S1 (done in vectorizable_load. See more details
there):
RELATED_STMT VEC_STMT
VS1_3: vx3 = memref3 - -
S1: x = load - VS1_0
S2: z = x + 1 - -
-
+
step2: vectorize stmt S2 (done here):
To vectorize stmt S2 we first need to find the relevant vector
def for the first operand 'x'. This is, as usual, obtained from
VS2_2: vz2 = vx2 + v1 VS2_3 -
VS2_3: vz3 = vx3 + v1 - -
S2: z = x + 1 - VS2_0 */
-
+
prev_stmt_info = NULL;
for (j = 0; j < ncopies; j++)
{
}
else
{
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
if (op_type == binary_op)
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
}
/* Arguments are ready. create the new vector stmt. */
-
+
if (op_type == binary_op)
new_stmt = build_gimple_modify_stmt (vec_dest,
build2 (code, vectype, vec_oprnd0, vec_oprnd1));
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
-
+
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
/* Function vectorizable_type_demotion
-
+
Check if STMT performs a binary or unary operation that involves
type demotion, and if it can be vectorized.
If VEC_STMT is also passed, vectorize the STMT: create a vectorized
stmt to replace it, put it in VEC_STMT, and insert it at BSI.
Return FALSE if not a vectorizable STMT, TRUE otherwise. */
-
+
bool
vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
- tree *vec_stmt)
+ tree *vec_stmt)
{
tree vec_dest;
tree scalar_dest;
tree vec_oprnd0=NULL, vec_oprnd1=NULL;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
- enum tree_code code;
+ enum tree_code code, code1 = ERROR_MARK;
tree new_temp;
tree def, def_stmt;
- enum vect_def_type dt0;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
tree new_stmt;
stmt_vec_info prev_stmt_info;
int nunits_in;
int j;
tree expr;
tree vectype_in;
- tree scalar_type;
- optab optab;
- enum machine_mode vec_mode;
-
+
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
fprintf (vect_dump, "value used after loop.");
return false;
}
-
+
/* Is STMT a vectorizable type-demotion operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
-
+
if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
return false;
-
+
operation = GIMPLE_STMT_OPERAND (stmt, 1);
code = TREE_CODE (operation);
if (code != NOP_EXPR && code != CONVERT_EXPR)
return false;
-
+
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
-
+
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
- scalar_type = TREE_TYPE (scalar_dest);
- vectype_out = get_vectype_for_scalar_type (scalar_type);
+ vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_in != nunits_out / 2) /* FORNOW */
return false;
-
+
ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
gcc_assert (ncopies >= 1);
- if (! INTEGRAL_TYPE_P (scalar_type)
- || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
+ && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
+ && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
+ && (code == NOP_EXPR || code == CONVERT_EXPR))))
return false;
-
+
/* Check the operands of the operation. */
- if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
return false;
}
-
+
/* Supportable by target? */
- code = VEC_PACK_MOD_EXPR;
- optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
- if (!optab)
- return false;
-
- vec_mode = TYPE_MODE (vectype_in);
- if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
+ if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
return false;
-
+
STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
-
+
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_type_demotion ===");
+ vect_model_simple_cost (stmt_info, ncopies, dt);
return true;
}
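For reference, a minimal sketch of the shape vect_model_simple_cost is assumed to take (TARG_VEC_STMT_COST and TARG_SCALAR_TO_VEC_COST are assumed names following the TARG_* cost-macro convention used by this patch; the real body is added elsewhere in the series):

static void
vect_model_simple_cost_sketch (stmt_vec_info stmt_info, int ncopies,
                               enum vect_def_type *dt)
{
  int i;

  /* One vector stmt per copy inside the loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;

  /* A constant or invariant operand is broadcast to a vector once,
     outside the loop; FORNOW assume at most two operands.  */
  for (i = 0; i < 2; i++)
    if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
      STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
}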
-
+
/** Transform. **/
-
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
- ncopies);
-
+ ncopies);
+
/* Handle def. */
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
if (j == 0)
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
}
else
{
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
}
-
+
/* Arguments are ready. Create the new vector stmt. */
- expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
+ expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
new_stmt = build_gimple_modify_stmt (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
-
+
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-
+
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
-
+
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
return true;
}
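The open-coded optab query deleted above is now centralized in supportable_narrowing_operation. A minimal sketch of the kind of check it is assumed to perform for this FORNOW case, reusing the VEC_PACK_MOD_EXPR logic from the deleted lines (not the full implementation):

static bool
narrowing_supported_p (tree vectype_in, enum tree_code *code1)
{
  optab optab;
  enum machine_mode vec_mode;

  /* Demote by packing two input vectors into one output vector.  */
  *code1 = VEC_PACK_MOD_EXPR;
  optab = optab_for_tree_code (*code1, vectype_in);
  if (!optab)
    return false;

  /* The target must provide an insn for the vector mode.  */
  vec_mode = TYPE_MODE (vectype_in);
  return optab->handlers[(int) vec_mode].insn_code != CODE_FOR_nothing;
}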
-/* Function vect_gen_widened_results_half
-
- Create a vector stmt whose code, type, number of arguments, and result
- variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
- VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
- In the case that CODE is a CALL_EXPR, this means that a call to DECL
- needs to be created (DECL is a function-decl of a target-builtin).
- STMT is the original scalar stmt that we are vectorizing. */
-
-static tree
-vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
- tree vec_oprnd0, tree vec_oprnd1, int op_type,
- tree vec_dest, block_stmt_iterator *bsi,
- tree stmt)
-{
- tree expr;
- tree new_stmt;
- tree new_temp;
- tree sym;
- ssa_op_iter iter;
-
- /* Generate half of the widened result: */
- if (code == CALL_EXPR)
- {
- /* Target specific support */
- if (op_type == binary_op)
- expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
- else
- expr = build_call_expr (decl, 1, vec_oprnd0);
- }
- else
- {
- /* Generic support */
- gcc_assert (op_type == TREE_CODE_LENGTH (code));
- if (op_type == binary_op)
- expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
- else
- expr = build1 (code, vectype, vec_oprnd0);
- }
- new_stmt = build_gimple_modify_stmt (vec_dest, expr);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
-
- if (code == CALL_EXPR)
- {
- FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
- {
- if (TREE_CODE (sym) == SSA_NAME)
- sym = SSA_NAME_VAR (sym);
- mark_sym_for_renaming (sym);
- }
- }
-
- return new_stmt;
-}
-
-
/* Function vectorizable_type_promotion
Check if STMT performs a binary or unary operation that involves
tree vec_oprnd0=NULL, vec_oprnd1=NULL;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
- enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
+ enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
tree decl1 = NULL_TREE, decl2 = NULL_TREE;
int op_type;
tree def, def_stmt;
- enum vect_def_type dt0, dt1;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
tree new_stmt;
stmt_vec_info prev_stmt_info;
int nunits_in;
operation = GIMPLE_STMT_OPERAND (stmt, 1);
code = TREE_CODE (operation);
- if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
+ if (code != NOP_EXPR && code != CONVERT_EXPR
+ && code != WIDEN_MULT_EXPR)
return false;
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
- ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
- gcc_assert (ncopies >= 1);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
if (nunits_out != nunits_in / 2) /* FORNOW */
return false;
- if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
- || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+ gcc_assert (ncopies >= 1);
+
+ if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
+ && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
+ && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
+ && (code == CONVERT_EXPR || code == NOP_EXPR))))
return false;
/* Check the operands of the operation. */
- if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
if (op_type == binary_op)
{
op1 = TREE_OPERAND (operation, 1);
- if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
+ if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vectorizable_type_promotion ===");
+ vect_model_simple_cost (stmt_info, 2 * ncopies, dt);
return true;
}
}
else
{
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
if (op_type == binary_op)
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
}
/* Arguments are ready. Create the new vector stmt. We are creating
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
+ vect_model_store_cost (stmt_info, ncopies, dt);
return true;
}
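By analogy with the other TARG_* cost hooks in this patch, vect_model_store_cost is assumed to charge along these lines (TARG_VEC_STORE_COST and TARG_SCALAR_TO_VEC_COST are assumed macro names; a sketch, not the actual body):

static void
vect_model_store_cost_sketch (stmt_vec_info stmt_info, int ncopies,
                              enum vect_def_type *dt)
{
  /* One vector store per copy inside the loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STORE_COST;

  /* A constant or invariant rhs is built into a vector once, outside.  */
  if (dt[0] == vect_constant_def || dt[0] == vect_invariant_def)
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;
}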
/** Transform. **/
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
-
if (strided_store)
{
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
group_size = 1;
}
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
+
dr_chain = VEC_alloc (tree, heap, group_size);
oprnds = VEC_alloc (tree, heap, group_size);
/* For interleaved stores we created vectorized defs for all the
defs stored in OPRNDS in the previous iteration (previous copy).
DR_CHAIN is then used as an input to vect_permute_store_chain(),
- and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
+ and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
next copy.
If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
OPRNDS are of size 1. */
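Concretely, the bookkeeping described above is assumed to take roughly this shape on each copy j > 0, refreshing DR_CHAIN and OPRNDS in lock-step (a sketch using the VEC_* accessors as elsewhere in this file):

  for (i = 0; i < group_size; i++)
    {
      op = VEC_index (tree, oprnds, i);
      vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], op);
      VEC_replace (tree, dr_chain, i, vec_oprnd);
      VEC_replace (tree, oprnds, i, vec_oprnd);
    }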
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
+ vect_model_load_cost (stmt_info, ncopies);
return true;
}
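vect_model_load_cost takes no dt argument since a load has no scalar operands to broadcast; its inside-loop charge is assumed to depend on the alignment of the data reference, roughly as follows (TARG_VEC_LOAD_COST and TARG_VEC_UNALIGNED_LOAD_COST assumed per the TARG_* convention):

static void
vect_model_load_cost_sketch (stmt_vec_info stmt_info, int ncopies)
{
  /* Charge the cheaper cost for accesses known to be aligned, the
     unaligned cost otherwise.  */
  if (aligned_access_p (STMT_VINFO_DATA_REF (stmt_info)))
    STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
      = ncopies * TARG_VEC_LOAD_COST;
  else
    STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
      = ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
}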
- /** Transform. **/
-
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform load.");
+ /** Transform. **/
+
if (strided_load)
{
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
-
+
case type_promotion_vec_info_type:
done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
gcc_assert (done);
tree phi;
bool updated = false;
- for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
+ for (phi = phi_nodes (header_bb); phi; phi = PHI_CHAIN (phi))
{
if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
{
tree evolution_part;
tree init_expr;
tree step_expr;
- tree var, stmt, ni, ni_name;
+ tree var, ni, ni_name;
block_stmt_iterator last_bsi;
if (vect_print_dump_info (REPORT_DETAILS))
init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
loop->num));
- ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
- fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
- fold_convert (TREE_TYPE (init_expr),
- niters),
- step_expr),
- init_expr);
+ if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
+ ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
+ init_expr,
+ fold_convert (sizetype,
+ fold_build2 (MULT_EXPR, TREE_TYPE (niters),
+ niters, step_expr)));
+ else
+ ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
+ fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
+ fold_convert (TREE_TYPE (init_expr),
+ niters),
+ step_expr),
+ init_expr);
+
var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
add_referenced_var (var);
- ni_name = force_gimple_operand (ni, &stmt, false, var);
-
- /* Insert stmt into exit_bb. */
last_bsi = bsi_last (exit_bb);
- if (stmt)
- bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
-
+ ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
+ true, BSI_SAME_STMT);
+
/* Fix phi expressions in the successor bb. */
SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
}
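The pointer/integer split above exists because POINTER_PLUS_EXPR requires its offset operand in sizetype, whereas the integer form can fold everything in the IV's own type. Pulled out into a hypothetical helper for clarity:

static tree
final_iv_value_sketch (tree init_expr, tree step_expr, tree niters)
{
  if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
    /* ni = init + (sizetype) (niters * step)  */
    return fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr), init_expr,
                        fold_convert (sizetype,
                                      fold_build2 (MULT_EXPR,
                                                   TREE_TYPE (niters),
                                                   niters, step_expr)));

  /* ni = init + niters * step, all in the IV type.  */
  return fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
                      fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
                                   fold_convert (TREE_TYPE (init_expr),
                                                 niters),
                                   step_expr),
                      init_expr);
}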
basic_block preheader;
int loop_num;
unsigned int th;
+ int min_scalar_loop_bound;
+ int min_profitable_iters;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
&ratio_mult_vf_name, ratio);
loop_num = loop->num;
- /* Threshold for vectorized loop. */
- th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
- LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+ /* Analyze the cost to set the threshold for the vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+ min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+ /* Use the cost model only if it is more conservative than the
+ user-specified threshold. */
+
+ th = (unsigned) min_scalar_loop_bound;
+ if (min_profitable_iters
+ && (!min_scalar_loop_bound
+ || min_profitable_iters > min_scalar_loop_bound))
+ th = (unsigned) min_profitable_iters;
+
+ if (min_profitable_iters
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vectorization may not be profitable.");
+
new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
- ratio_mult_vf_name, ni_name, false, th);
+ ratio_mult_vf_name, ni_name, false,
+ th);
gcc_assert (new_loop);
gcc_assert (loop_num == loop->num);
#ifdef ENABLE_CHECKING
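Worked example for the threshold selection above (numbers illustrative): with PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) == 2 and VF == 4, the user bound is 2 * 4 == 8 iterations; a cost-model estimate of 11 is more conservative and becomes the runtime threshold, while an estimate of 5 would be overridden by the user bound. As a hypothetical helper:

static unsigned
runtime_threshold_sketch (int min_profitable_iters, int min_scalar_loop_bound)
{
  unsigned th = (unsigned) min_scalar_loop_bound;

  /* Take the cost-model estimate only when it is the stricter bound.  */
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  return th;
}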
prolog_niters = min ( LOOP_NITERS ,
(VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
where group_size is the size of the interleaved group.
-*/
+
+ The above formulas assume that VF == number of elements in the vector. This
+ may not hold when there are multiple types in the loop: for some data
+ references the VF then does not represent the number of elements that fit
+ in the vector. Therefore, instead of VF we use TYPE_VECTOR_SUBPARTS. */
static tree
vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
{
struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
- int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree var, stmt;
tree iters, iters_name;
tree niters_type = TREE_TYPE (loop_niters);
int group_size = 1;
int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
+ int nelements = TYPE_VECTOR_SUBPARTS (vectype);
if (DR_GROUP_FIRST_DR (stmt_info))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "known alignment = %d.", byte_misalign);
iters = build_int_cst (niters_type,
- (vf - elem_misalign)&(vf/group_size-1));
+ (nelements - elem_misalign) & (nelements / group_size - 1));
}
else
{
tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
tree elem_size_log =
- build_int_cst (type, exact_log2 (vectype_align/vf));
- tree vf_minus_1 = build_int_cst (type, vf - 1);
- tree vf_tree = build_int_cst (type, vf);
+ build_int_cst (type, exact_log2 (vectype_align/nelements));
+ tree nelements_minus_1 = build_int_cst (type, nelements - 1);
+ tree nelements_tree = build_int_cst (type, nelements);
tree byte_misalign;
tree elem_misalign;
/* Create: byte_misalign = addr & (vectype_size - 1) */
byte_misalign =
- fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
+ fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
+ vectype_size_minus_1);
/* Create: elem_misalign = byte_misalign / element_size */
elem_misalign =
fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
- /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
- iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
- iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
+ /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
+ iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
+ iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
iters = fold_convert (niters_type, iters);
}
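Worked example for the computation above (illustrative, non-interleaved case so group_size == 1): with a 16-byte vector of four 4-byte elements and start_addr & 15 == 8, elem_misalign == 8 >> log2 (16/4) == 2, so the prologue peels (4 - 2) & (4 - 1) == 2 scalar iterations before the access becomes aligned.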
NITERS iterations were peeled from the loop represented by LOOP_VINFO.
This function updates the information recorded for the data references in
the loop to account for the fact that the first NITERS iterations had
- already been executed. Specifically, it updates the initial_condition of the
- access_function of all the data_references in the loop. */
+ already been executed. Specifically, it updates the initial_condition of
+ the access_function of all the data_references in the loop. */
static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
struct data_reference *dr;
- if (vect_dump && (dump_flags & TDF_DETAILS))
+ if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
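Per data reference, the update named in the comment above is assumed to amount to advancing the initial offset by NITERS steps, roughly:

static void
update_init_of_dr_sketch (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  /* Advance the initial condition by the NITERS peeled iterations.  */
  niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters,
                        DR_STEP (dr));
  offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
  DR_OFFSET (dr) = offset;
}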
continue;
}
+ gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
!= (unsigned HOST_WIDE_INT) vectorization_factor)
&& vect_print_dump_info (REPORT_DETAILS))