#include "optabs.h"
/* Return the smallest scalar part of STMT.
- This is used to determine the vectype of the stmt. We generally set the
- vectype according to the type of the result (lhs). For stmts whose
+ This is used to determine the vectype of the stmt. We generally set the
+ vectype according to the type of the result (lhs). For stmts whose
result-type is different than the type of the arguments (e.g., demotion,
promotion), vectype will be reset appropriately (later). Note that we have
to visit the smallest datatype in this function, because that determines the
- VF. If the smallest datatype in the loop is present only as the rhs of a
+ VF. If the smallest datatype in the loop is present only as the rhs of a
promotion operation - we'd miss it.
Such a case, where a variable of this datatype does not appear in the lhs
anywhere in the loop, can only occur if it's an invariant: e.g.:
'int_x = (int) short_inv', which we'd expect to have been optimized away by
- invariant motion. However, we cannot rely on invariant motion to always take
- invariants out of the loop, and so in the case of promotion we also have to
- check the rhs.
+ invariant motion. However, we cannot rely on invariant motion to always
+ take invariants out of the loop, and so in the case of promotion we also
+ have to check the rhs.
LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
types. */
/* Find the place of the data-ref in STMT in the interleaving chain that starts
- from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
+ from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
/* Function vect_update_interleaving_chain.
For two data-refs DRA and DRB that are a part of a chain interleaved data
- accesses, update the interleaving chain. DRB's INIT is smaller than DRA's.
+ accesses, update the interleaving chain. DRB's INIT is smaller than DRA's.
There are four possible cases:
1. New stmts - both DRA and DRB are not a part of any chain:
if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0)
{
/* DRB's init is smaller than the init of the stmt previously marked
- as the first stmt of the interleaving chain of DRA. Therefore, we
+ as the first stmt of the interleaving chain of DRA. Therefore, we
update FIRST_STMT and put DRB in the head of the list. */
DR_GROUP_FIRST_DR (stmtinfo_b) = DR_STMT (drb);
DR_GROUP_NEXT_DR (stmtinfo_b) = old_first_stmt;
}
-/* Check dependence between DRA and DRB for basic block vectorization. */
+/* Check dependence between DRA and DRB for basic block vectorization.
+ If the accesses share same bases and offsets, we can compare their initial
+ constant offsets to decide whether they differ or not. In case of a read-
+ write dependence we check that the load is before the store to ensure that
+ vectorization will not change the order of the accesses. */
static bool
vect_drs_dependent_in_basic_block (struct data_reference *dra,
return true;
}
- /* Check that the data-refs have same bases and offsets. If not, we can't
+ /* Check that the data-refs have same bases and offsets. If not, we can't
determine if they are dependent. */
if ((DR_BASE_ADDRESS (dra) != DR_BASE_ADDRESS (drb)
&& (TREE_CODE (DR_BASE_ADDRESS (dra)) != ADDR_EXPR
if (init_a != init_b)
return false;
- /* We have a read-write dependence. Check that the load is before the store.
+ /* We have a read-write dependence. Check that the load is before the store.
When we vectorize basic blocks, vector load can be only before
corresponding scalar load, and vector store can be only after its
- corresponding scalar store. So the order of the acceses is preserved in
+ corresponding scalar store. So the order of the acceses is preserved in
case the load is before the store. */
earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
/* Function vect_check_interleaving.
- Check if DRA and DRB are a part of interleaving. In case they are, insert
+ Check if DRA and DRB are a part of interleaving. In case they are, insert
DRA and DRB in an interleaving chain. */
static bool
/* In case the dataref is in an inner-loop of the loop that is being
vectorized (LOOP), we use the base and misalignment information
- relative to the outer-loop (LOOP). This is ok only if the misalignment
+ relative to the outer-loop (LOOP). This is ok only if the misalignment
stays the same throughout the execution of the inner-loop, which is why
we have to check that the stride of the dataref in the inner-loop evenly
divides by the vector size. */
|| (TREE_CODE (base) == VAR_DECL
&& DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
+ /* If this is a backward running DR then first access in the larger
+ vectype actually is N-1 elements before the address in the DR.
+ Adjust misalign accordingly. */
+ if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
+ {
+ tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
+ /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
+ otherwise we wouldn't be here. */
+ offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
+ /* PLUS because DR_STEP was negative. */
+ misalign = size_binop (PLUS_EXPR, misalign, offset);
+ }
+
/* Modulo alignment. */
misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
if (known_alignment_for_access_p (dr)
&& known_alignment_for_access_p (dr_peel))
{
+ bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
int misal = DR_MISALIGNMENT (dr);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- misal += npeel * dr_size;
- misal %= GET_MODE_SIZE (TYPE_MODE (vectype));
+ misal += negative ? -npeel * dr_size : npeel * dr_size;
+ misal &= GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
SET_DR_MISALIGNMENT (dr, misal);
return;
}
}
-/* Traverse peeling hash table and calculate cost for each peeling option. Find
- one with the lowest cost. */
+/* Traverse peeling hash table and calculate cost for each peeling option.
+ Find the one with the lowest cost. */
static int
vect_peeling_hash_get_lowest_cost (void **slot, void *data)
the alignment of data references in the loop.
FOR NOW: we assume that whatever versioning/peeling takes place, only the
- original loop is to be vectorized; Any other loops that are created by
+ original loop is to be vectorized. Any other loops that are created by
the transformations performed in this pass - are not supposed to be
- vectorized. This restriction will be relaxed.
+ vectorized. This restriction will be relaxed.
This pass will require a cost model to guide it whether to apply peeling
- or versioning or a combination of the two. For example, the scheme that
+ or versioning or a combination of the two. For example, the scheme that
intel uses when given a loop with several memory accesses, is as follows:
choose one memory access ('p') which alignment you want to force by doing
- peeling. Then, either (1) generate a loop in which 'p' is aligned and all
+ peeling. Then, either (1) generate a loop in which 'p' is aligned and all
other accesses are not necessarily aligned, or (2) use loop versioning to
generate one loop in which all accesses are aligned, and another loop in
which only 'p' is necessarily aligned.
Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
- Devising a cost model is the most critical aspect of this work. It will
+ Devising a cost model is the most critical aspect of this work. It will
guide us on which access to peel for, whether to use loop versioning, how
- many versions to create, etc. The cost model will probably consist of
+ many versions to create, etc. The cost model will probably consist of
generic considerations as well as target specific considerations (on
powerpc for example, misaligned stores are more painful than misaligned
loads).
}
}
- These loops are later passed to loop_transform to be vectorized. The
+ These loops are later passed to loop_transform to be vectorized. The
vectorizer will use the alignment information to guide the transformation
(whether to generate regular loads/stores, or with special handling for
misalignment). */
if (known_alignment_for_access_p (dr))
{
unsigned int npeel_tmp;
+ bool negative = tree_int_cst_compare (DR_STEP (dr),
+ size_zero_node) < 0;
/* Save info about DR in the hash table. */
if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
nelements = TYPE_VECTOR_SUBPARTS (vectype);
mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
TREE_TYPE (DR_REF (dr))));
- npeel_tmp = (nelements - mis) % vf;
+ npeel_tmp = (negative
+ ? (mis - nelements) : (nelements - mis))
+ & (nelements - 1);
/* For multiple types, it is possible that the bigger type access
- will have more than one peeling option. E.g., a loop with two
+ will have more than one peeling option. E.g., a loop with two
types: one of size (vector size / 4), and the other one of
- size (vector size / 8). Vectorization factor will 8. If both
+ size (vector size / 8). Vectorization factor will 8. If both
access are misaligned by 3, the first one needs one scalar
- iteration to be aligned, and the second one needs 5. But the
+ iteration to be aligned, and the second one needs 5. But the
the first one will be aligned also by peeling 5 scalar
iterations, and in that case both accesses will be aligned.
Hence, except for the immediate peeling amount, we also want
if (known_alignment_for_access_p (dr0))
{
+ bool negative = tree_int_cst_compare (DR_STEP (dr0),
+ size_zero_node) < 0;
if (!npeel)
{
/* Since it's known at compile time, compute the number of
count. */
mis = DR_MISALIGNMENT (dr0);
mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
- npeel = nelements - mis;
+ npeel = ((negative ? mis - nelements : nelements - mis)
+ & (nelements - 1));
}
/* For interleaved data access every iteration accesses all the
if (DDR_NUM_DIST_VECTS (ddr) == 0)
return;
+ /* Data-dependence analysis reports a distance vector of zero
+ for data-references that overlap only in the first iteration
+ but have different sign step (see PR45764).
+ So as a sanity check require equal DR_STEP. */
+ if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
+ return;
+
loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
FOR_EACH_VEC_ELT (lambda_vector, DDR_DIST_VECTS (ddr), i, dist_v)
{
/* Analyze groups of strided accesses: check that DR belongs to a group of
- strided accesses of legal size, step, etc. Detect gaps, single element
+ strided accesses of legal size, step, etc. Detect gaps, single element
interleaving, and other special cases. Set strided access info.
Collect groups of strided stores for further use in SLP analysis. */
while (next)
{
- /* Skip same data-refs. In case that two or more stmts share data-ref
- (supported only for loads), we vectorize only the first stmt, and
- the rest get their vectorized loads from the first one. */
+ /* Skip same data-refs. In case that two or more stmts share
+ data-ref (supported only for loads), we vectorize only the first
+ stmt, and the rest get their vectorized loads from the first
+ one. */
if (!tree_int_cst_compare (DR_INIT (data_ref),
DR_INIT (STMT_VINFO_DATA_REF (
vinfo_for_stmt (next)))))
/* FORNOW: we handle only interleaving that is a power of 2.
We don't fail here if it may be still possible to vectorize the
- group using SLP. If not, the size of the group will be checked in
+ group using SLP. If not, the size of the group will be checked in
vect_analyze_operations, and the vectorization will fail. */
if (exact_log2 (stride) == -1)
{
}
/* Consecutive? */
- if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
+ if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
+ || (dr_step < 0
+ && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
{
/* Mark that it is not interleaving. */
DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL;
datarefs = BB_VINFO_DATAREFS (bb_vinfo);
}
- /* Go through the data-refs, check that the analysis succeeded. Update pointer
- from stmt_vec_info struct to DR and vectype. */
+ /* Go through the data-refs, check that the analysis succeeded. Update
+ pointer from stmt_vec_info struct to DR and vectype. */
FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
{
tree dinit;
/* Build a reference to the first location accessed by the
- inner-loop: *(BASE+INIT). (The first location is actually
+ inner-loop: *(BASE+INIT). (The first location is actually
BASE+INIT+OFFSET, but we add OFFSET separately later). */
tree inner_base = build_fold_indirect_ref
(fold_build2 (POINTER_PLUS_EXPR,
/* Function vect_get_new_vect_var.
- Returns a name for a new variable. The current naming scheme appends the
+ Returns a name for a new variable. The current naming scheme appends the
prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
the name of vectorizer generated variables, and appends that to NAME if
provided. */
LOOP: Specify relative to which loop-nest should the address be computed.
For example, when the dataref is in an inner-loop nested in an
outer-loop that is now being vectorized, LOOP can be either the
- outer-loop, or the inner-loop. The first memory location accessed
+ outer-loop, or the inner-loop. The first memory location accessed
by the following dataref ('in' points to short):
for (i=0; i<N; i++)
Return the increment stmt that updates the pointer in PTR_INCR.
3. Set INV_P to true if the access pattern of the data reference in the
- vectorized loop is invariant. Set it to false otherwise.
+ vectorized loop is invariant. Set it to false otherwise.
4. Return the pointer. */
tree vptr;
gimple_stmt_iterator incr_gsi;
bool insert_after;
+ bool negative;
tree indx_before_incr, indx_after_incr;
gimple incr;
tree step;
*inv_p = true;
else
*inv_p = false;
+ negative = tree_int_cst_compare (step, size_zero_node) < 0;
/* Create an expression for the first address accessed by this load
in LOOP. */
print_generic_expr (vect_dump, base_name, TDF_SLIM);
}
- /** (1) Create the new vector-pointer variable: **/
+ /* (1) Create the new vector-pointer variable. */
vect_ptr_type = build_pointer_type (vectype);
base = get_base_address (DR_REF (dr));
if (base
add_referenced_var (vect_ptr);
- /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
- vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
- def-use update cycles for the pointer: One relative to the outer-loop
- (LOOP), which is what steps (3) and (4) below do. The other is relative
- to the inner-loop (which is the inner-most loop containing the dataref),
- and this is done be step (5) below.
+ /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
+ vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
+ def-use update cycles for the pointer: one relative to the outer-loop
+ (LOOP), which is what steps (3) and (4) below do. The other is relative
+ to the inner-loop (which is the inner-most loop containing the dataref),
+ and this is done be step (5) below.
- When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
- inner-most loop, and so steps (3),(4) work the same, and step (5) is
- redundant. Steps (3),(4) create the following:
+ When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
+ inner-most loop, and so steps (3),(4) work the same, and step (5) is
+ redundant. Steps (3),(4) create the following:
vp0 = &base_addr;
LOOP: vp1 = phi(vp0,vp2)
vp2 = vp1 + step
goto LOOP
- If there is an inner-loop nested in loop, then step (5) will also be
- applied, and an additional update in the inner-loop will be created:
+ If there is an inner-loop nested in loop, then step (5) will also be
+ applied, and an additional update in the inner-loop will be created:
vp0 = &base_addr;
LOOP: vp1 = phi(vp0,vp2)
vp2 = vp1 + step
if () goto LOOP */
- /** (3) Calculate the initial address the vector-pointer, and set
- the vector-pointer to point to it before the loop: **/
+ /* (2) Calculate the initial address the vector-pointer, and set
+ the vector-pointer to point to it before the loop. */
/* Create: (&(base[init_val+offset]) in the loop preheader. */
else
vect_ptr_init = new_temp;
- /** (4) Handle the updating of the vector-pointer inside the loop.
- This is needed when ONLY_INIT is false, and also when AT_LOOP
- is the inner-loop nested in LOOP (during outer-loop vectorization).
- **/
+ /* (3) Handle the updating of the vector-pointer inside the loop.
+ This is needed when ONLY_INIT is false, and also when AT_LOOP is the
+ inner-loop nested in LOOP (during outer-loop vectorization). */
/* No update in loop is required. */
if (only_init && (!loop_vinfo || at_loop == loop))
LOOP is zero. In this case the step here is also zero. */
if (*inv_p)
step = size_zero_node;
+ else if (negative)
+ step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
return vptr;
- /** (5) Handle the updating of the vector-pointer inside the inner-loop
- nested in LOOP, if exists: **/
+ /* (4) Handle the updating of the vector-pointer inside the inner-loop
+ nested in LOOP, if exists. */
gcc_assert (nested_in_vect_loop);
if (!only_init)
Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
a power of 2, generate interleave_high/low stmts to reorder the data
- correctly for the stores. Return the final references for stores in
+ correctly for the stores. Return the final references for stores in
RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
- The input is 4 vectors each containing 8 elements. We assign a number to each
- element, the input sequence is:
+ The input is 4 vectors each containing 8 elements. We assign a number to
+ each element, the input sequence is:
1st vec: 0 1 2 3 4 5 6 7
2nd vec: 8 9 10 11 12 13 14 15
i.e., we interleave the contents of the four vectors in their order.
- We use interleave_high/low instructions to create such output. The input of
+ We use interleave_high/low instructions to create such output. The input of
each interleave_high/low operation is two vectors:
1st vec 2nd vec
0 1 2 3 4 5 6 7
the even elements of the result vector are obtained left-to-right from the
- high/low elements of the first vector. The odd elements of the result are
+ high/low elements of the first vector. The odd elements of the result are
obtained left-to-right from the high/low elements of the second vector.
The output of interleave_high will be: 0 4 1 5
and of interleave_low: 2 6 3 7
- The permutation is done in log LENGTH stages. In each stage interleave_high
+ The permutation is done in log LENGTH stages. In each stage interleave_high
and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
where the first argument is taken from the first half of DR_CHAIN and the
second argument from it's second half.
1. the misalignment computation
2. the extra vector load (for the optimized realignment scheme).
3. the phi node for the two vectors from which the realignment is
- done (for the optimized realignment scheme).
- */
+ done (for the optimized realignment scheme). */
/* 1. Determine where to generate the misalignment computation.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
a power of 2, generate extract_even/odd stmts to reorder the input data
- correctly. Return the final references for loads in RESULT_CHAIN.
+ correctly. Return the final references for loads in RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each
i.e., the first output vector should contain the first elements of each
interleaving group, etc.
- We use extract_even/odd instructions to create such output. The input of each
- extract_even/odd operation is two vectors
+ We use extract_even/odd instructions to create such output. The input of
+ each extract_even/odd operation is two vectors
1st vec 2nd vec
0 1 2 3 4 5 6 7
- and the output is the vector of extracted even/odd elements. The output of
+ and the output is the vector of extracted even/odd elements. The output of
extract_even will be: 0 2 4 6
and of extract_odd: 1 3 5 7
- The permutation is done in log LENGTH stages. In each stage extract_even and
- extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
- order. In our example,
+ The permutation is done in log LENGTH stages. In each stage extract_even
+ and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
+ their order. In our example,
E1: extract_even (1st vec, 2nd vec)
E2: extract_odd (1st vec, 2nd vec)
if (!next_stmt)
break;
- /* Skip the gaps. Loads created for the gaps will be removed by dead
- code elimination pass later. No need to check for the first stmt in
+ /* Skip the gaps. Loads created for the gaps will be removed by dead
+ code elimination pass later. No need to check for the first stmt in
the group, since it always exists.
DR_GROUP_GAP is the number of steps in elements from the previous
- access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
- correspond to the gaps.
- */
+ access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
+ correspond to the gaps. */
if (next_stmt != first_stmt
&& gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
{
/* We can choose between using the implicit realignment scheme (generating
a misaligned_move stmt) and the explicit realignment scheme (generating
- aligned loads with a REALIGN_LOAD). There are two variants to the explicit
- realignment scheme: optimized, and unoptimized.
+ aligned loads with a REALIGN_LOAD). There are two variants to the
+ explicit realignment scheme: optimized, and unoptimized.
We can optimize the realignment only if the step between consecutive
vector loads is equal to the vector size. Since the vector memory
accesses advance in steps of VS (Vector Size) in the vectorized loop, it