+ /* Worthwhile without SIMD support? */
+ if (!VECTOR_MODE_P (TYPE_MODE (vectype))
+ && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ < vect_min_worthwhile_factor (code))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "not worthwhile without SIMD support.");
+ return false;
+ }
+
+ if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
+ {
+ /* FORNOW: not yet supported. */
+ if (!VECTOR_MODE_P (vec_mode))
+ return false;
+
+ /* Invariant argument is needed for a vector shift
+ by a scalar shift operand. */
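+ /* For example (an illustrative sketch, not part of this patch):
+ in a loop like
+ for (i = 0; i < n; i++)
+ a[i] = b[i] << k;
+ a loop-invariant K can feed the insn's scalar shift operand
+ directly; a shift amount that varies across iterations cannot. */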
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+ if (! (VECTOR_MODE_P (optab_op2_mode)
+ || dt1 == vect_constant_def
+ || dt1 == vect_invariant_def))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand mode requires invariant argument.");
+ return false;
+ }
+ }
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
+ return true;
+ }
+
+ /** Transform. **/
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "transform binary/unary operation.");
+
+ /* Handle def. */
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e., we need to "unroll" the
+ vector stmt by a factor VF/nunits. In doing so, we record a pointer
+ from one copy of the vector stmt to the next, in the field
+ STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
+ stages to find the correct vector defs to be used when vectorizing
+ stmts that use the defs of the current stmt. The example below illustrates
+ the vectorization process when VF=16 and nunits=4 (i.e., we need to create
+ 4 vectorized stmts):
+
+ before vectorization:
+ RELATED_STMT VEC_STMT
+ S1: x = memref - -
+ S2: z = x + 1 - -
+
+ step 1: vectorize stmt S1 (done in vectorizable_load. See more details
+ there):
+ RELATED_STMT VEC_STMT
+ VS1_0: vx0 = memref0 VS1_1 -
+ VS1_1: vx1 = memref1 VS1_2 -
+ VS1_2: vx2 = memref2 VS1_3 -
+ VS1_3: vx3 = memref3 - -
+ S1: x = load - VS1_0
+ S2: z = x + 1 - -
+
+ step2: vectorize stmt S2 (done here):
+ To vectorize stmt S2 we first need to find the relevant vector
+ def for the first operand 'x'. This is, as usual, obtained from
+ the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
+ that defines 'x' (S1). This way we find the stmt VS1_0, and the
+ relevant vector def 'vx0'. Having found 'vx0' we can generate
+ the vector stmt VS2_0, and as usual, record it in the
+ STMT_VINFO_VEC_STMT of stmt S2.
+ When creating the second copy (VS2_1), we obtain the relevant vector
+ def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
+ stmt VS1_0. This way we find the stmt VS1_1 and the relevant
+ vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
+ pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
+ Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
+ chain of stmts and pointers:
+ RELATED_STMT VEC_STMT
+ VS1_0: vx0 = memref0 VS1_1 -
+ VS1_1: vx1 = memref1 VS1_2 -
+ VS1_2: vx2 = memref2 VS1_3 -
+ VS1_3: vx3 = memref3 - -
+ S1: x = load - VS1_0
+ VS2_0: vz0 = vx0 + v1 VS2_1 -
+ VS2_1: vz1 = vx1 + v1 VS2_2 -
+ VS2_2: vz2 = vx2 + v1 VS2_3 -
+ VS2_3: vz3 = vx3 + v1 - -
+ S2: z = x + 1 - VS2_0 */
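+
+ /* For instance (illustrative only): with VF=16 and 4-element vectors,
+ the scalar add
+ z[i] = x[i] + 1;
+ is replicated into the four chained vector adds VS2_0..VS2_3 above,
+ each covering 4 of the 16 scalar iterations. */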
+
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; j++)
+ {
+ /* Handle uses. */
+ if (j == 0)
+ {
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ if (op_type == binary_op)
+ {
+ if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
+ {
+ /* Vector shl and shr insn patterns can be defined with
+ scalar operand 2 (shift operand). In this case, use
+ constant or loop invariant op1 directly, without
+ extending it to vector mode first. */
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+ if (!VECTOR_MODE_P (optab_op2_mode))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand 1 using scalar mode.");
+ vec_oprnd1 = op1;
+ }
+ }
+ if (!vec_oprnd1)
+ vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
+ }
+ }
+ else
+ {
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ if (op_type == binary_op)
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
+ }
+
+ /* Arguments are ready. Create the new vector stmt. */
+
+ if (op_type == binary_op)
+ new_stmt = build_gimple_modify_stmt (vec_dest,
+ build2 (code, vectype, vec_oprnd0, vec_oprnd1));
+ else
+ new_stmt = build_gimple_modify_stmt (vec_dest,
+ build1 (code, vectype, vec_oprnd0));
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+
+ return true;
+}
+
+
+/* Function vectorizable_type_demotion
+
+ Check if STMT performs a binary or unary operation that involves
+ type demotion, and if it can be vectorized.
+ If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+ stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+ Return FALSE if not a vectorizable STMT, TRUE otherwise. */
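+
+/* For example (an illustrative sketch, not from this patch):
+
+ short s[N]; char c[N];
+ for (i = 0; i < N; i++)
+ c[i] = (char) s[i];
+
+ demotes short to char; two V8HI input vectors are packed into a
+ single V16QI result using VEC_PACK_MOD_EXPR below. */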
+
+bool
+vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
+ tree *vec_stmt)
+{
+ tree vec_dest;
+ tree scalar_dest;
+ tree operation;
+ tree op0;
+ tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ enum tree_code code;
+ tree new_temp;
+ tree def, def_stmt;
+ enum vect_def_type dt0;
+ tree new_stmt;
+ stmt_vec_info prev_stmt_info;
+ int nunits_in;
+ int nunits_out;
+ tree vectype_out;
+ int ncopies;
+ int j;
+ tree expr;
+ tree vectype_in;
+ tree scalar_type;
+ optab optab;
+ enum machine_mode vec_mode;
+
+ /* Is STMT a vectorizable type-demotion operation? */
+
+ if (!STMT_VINFO_RELEVANT_P (stmt_info))
+ return false;
+
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
+
+ if (STMT_VINFO_LIVE_P (stmt_info))
+ {
+ /* FORNOW: not yet supported. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "value used after loop.");
+ return false;
+ }
+
+ if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
+ return false;
+
+ if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
+ return false;
+
+ operation = GIMPLE_STMT_OPERAND (stmt, 1);
+ code = TREE_CODE (operation);
+ if (code != NOP_EXPR && code != CONVERT_EXPR)
+ return false;
+
+ op0 = TREE_OPERAND (operation, 0);
+ vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
+
+ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
+ scalar_type = TREE_TYPE (scalar_dest);
+ vectype_out = get_vectype_for_scalar_type (scalar_type);
+ nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
+ if (nunits_in != nunits_out / 2) /* FORNOW */
+ return false;
+
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+ gcc_assert (ncopies >= 1);
+
+ if (! INTEGRAL_TYPE_P (scalar_type)
+ || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ return false;
+
+ /* Check the operands of the operation. */
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+
+ /* Supportable by target? */
+ code = VEC_PACK_MOD_EXPR;
+ optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
+ if (!optab)
+ return false;
+
+ vec_mode = TYPE_MODE (vectype_in);
+ if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
+ return false;
+
+ STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
+ return true;
+ }
+
+ /** Transform. **/
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
+ ncopies);
+
+ /* Handle def. */
+ vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
+
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e., we need to "unroll" the
+ vector stmt by a factor VF/nunits. */
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; j++)
+ {
+ /* Handle uses. */
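+ /* Each VEC_PACK_MOD_EXPR consumes two input vectors per output
+ vector, so every copy advances the def chain of OP0 by two defs
+ (VEC_OPRND0 and VEC_OPRND1). */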
+ if (j == 0)
+ {
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ }
+ else
+ {
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ }
+
+ /* Arguments are ready. Create the new vector stmt. */
+ expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
+ new_stmt = build_gimple_modify_stmt (vec_dest, expr);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+ return true;
+}
+
+
+/* Function vect_gen_widened_results_half
+
+ Create a vector stmt whose code, type, number of arguments, and result
+ variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
+ VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
+ In the case that CODE is a CALL_EXPR, this means that a call to DECL
+ needs to be created (DECL is a function-decl of a target-builtin).
+ STMT is the original scalar stmt that we are vectorizing. */
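+
+/* For instance (illustrative): for a widening multiply CODE may be
+ VEC_WIDEN_MULT_HI_EXPR or VEC_WIDEN_MULT_LO_EXPR, or a CALL_EXPR to a
+ target builtin DECL on ports that expose widening only through
+ builtins; vectorizable_type_promotion calls this function twice, once
+ per half of the widened result. */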
+
+static tree
+vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
+ tree vec_oprnd0, tree vec_oprnd1, int op_type,
+ tree vec_dest, block_stmt_iterator *bsi,
+ tree stmt)
+{
+ tree expr;
+ tree new_stmt;
+ tree new_temp;
+ tree sym;
+ ssa_op_iter iter;
+
+ /* Generate half of the widened result: */
+ if (code == CALL_EXPR)
+ {
+ /* Target specific support */
+ if (op_type == binary_op)
+ expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
+ else
+ expr = build_call_expr (decl, 1, vec_oprnd0);
+ }
+ else
+ {
+ /* Generic support */
+ gcc_assert (op_type == TREE_CODE_LENGTH (code));
+ if (op_type == binary_op)
+ expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
+ else
+ expr = build1 (code, vectype, vec_oprnd0);
+ }
+ new_stmt = build_gimple_modify_stmt (vec_dest, expr);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (code == CALL_EXPR)
+ {
+ FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
+ {
+ if (TREE_CODE (sym) == SSA_NAME)
+ sym = SSA_NAME_VAR (sym);
+ mark_sym_for_renaming (sym);
+ }
+ }
+
+ return new_stmt;
+}
+
+
+/* Function vectorizable_type_promotion
+
+ Check if STMT performs a binary or unary operation that involves
+ type promotion, and if it can be vectorized.
+ If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+ stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+ Return FALSE if not a vectorizable STMT, TRUE otherwise. */
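+
+/* For example (an illustrative sketch, not from this patch):
+
+ short a[N], b[N]; int d[N];
+ for (i = 0; i < N; i++)
+ d[i] = (int) a[i] * (int) b[i];
+
+ widens short to int; each pair of V8HI inputs yields two V4SI
+ results, one per half, via the tree-codes (or target builtin decls)
+ returned by supportable_widening_operation. */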
+
+bool
+vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
+ tree *vec_stmt)
+{
+ tree vec_dest;
+ tree scalar_dest;
+ tree operation;
+ tree op0, op1 = NULL;
+ tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
+ tree decl1 = NULL_TREE, decl2 = NULL_TREE;
+ int op_type;
+ tree def, def_stmt;
+ enum vect_def_type dt0, dt1;
+ tree new_stmt;
+ stmt_vec_info prev_stmt_info;
+ int nunits_in;
+ int nunits_out;
+ tree vectype_out;
+ int ncopies;
+ int j;
+ tree vectype_in;
+
+ /* Is STMT a vectorizable type-promotion operation? */
+
+ if (!STMT_VINFO_RELEVANT_P (stmt_info))
+ return false;
+
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
+
+ if (STMT_VINFO_LIVE_P (stmt_info))
+ {
+ /* FORNOW: not yet supported. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "value used after loop.");
+ return false;
+ }
+
+ if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
+ return false;
+
+ if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
+ return false;
+
+ operation = GIMPLE_STMT_OPERAND (stmt, 1);
+ code = TREE_CODE (operation);
+ if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
+ return false;
+
+ op0 = TREE_OPERAND (operation, 0);
+ vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+ gcc_assert (ncopies >= 1);
+
+ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
+ vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
+ if (nunits_out != nunits_in / 2) /* FORNOW */
+ return false;
+
+ if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
+ || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ return false;
+
+ /* Check the operands of the operation. */
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+
+ op_type = TREE_CODE_LENGTH (code);
+ if (op_type == binary_op)
+ {
+ op1 = TREE_OPERAND (operation, 1);
+ if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+ }
+
+ /* Supportable by target? */
+ if (!supportable_widening_operation (code, stmt, vectype_in,
+ &decl1, &decl2, &code1, &code2))
+ return false;
+
+ STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
+ return true;
+ }
+
+ /** Transform. **/
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
+ ncopies);
+
+ /* Handle def. */
+ vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
+
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e., we need to "unroll" the
+ vector stmt by a factor VF/nunits. */
+
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; j++)
+ {
+ /* Handle uses. */
+ if (j == 0)
+ {
+ vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ if (op_type == binary_op)
+ vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
+ }
+ else
+ {
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ if (op_type == binary_op)
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
+ }
+
+ /* Arguments are ready. Create the new vector stmt. We are creating
+ two vector defs because the widened result does not fit in one vector.
+ The vectorized stmt can be expressed as a call to a target builtin,
+ or by using a tree-code. */
+ /* Generate first half of the widened result: */
+ new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
+ vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+
+ /* Generate second half of the widened result: */
+ new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
+ vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+    }
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+ return true;
+}
+
+
+/* Function vect_strided_store_supported.
+
+ Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
+ and FALSE otherwise. */
+
+static bool
+vect_strided_store_supported (tree vectype)
+{
+ optab interleave_high_optab, interleave_low_optab;
+ int mode;
+
+ mode = (int) TYPE_MODE (vectype);
+
+ /* Check that the operation is supported. */
+ interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
+ vectype);
+ interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
+ vectype);
+ if (!interleave_high_optab || !interleave_low_optab)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "no optab for interleave.");
+ return false;
+ }
+
+ if (interleave_high_optab->handlers[(int) mode].insn_code
+ == CODE_FOR_nothing
+ || interleave_low_optab->handlers[(int) mode].insn_code
+ == CODE_FOR_nothing)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "interleave op not supported by target.");
+ return false;
+ }
+ return true;
+}
+
+
+/* Function vect_permute_store_chain.
+
+ Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
+ a power of 2, generate interleave_high/low stmts to reorder the data
+ correctly for the stores. Return the final references for stores in
+ RESULT_CHAIN.
+
+ E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
+ The input is 4 vectors each containing 8 elements. We assign a number to each
+ element, the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+ 4th vec: 24 25 26 27 28 29 30 31
+
+ The output sequence should be:
+
+ 1st vec: 0 8 16 24 1 9 17 25
+ 2nd vec: 2 10 18 26 3 11 19 27
+ 3rd vec: 4 12 20 28 5 13 21 29
+ 4th vec: 6 14 22 30 7 15 23 31
+
+ i.e., we interleave the contents of the four vectors in their order.
+
+ We use interleave_high/low instructions to create such output. The input of
+ each interleave_high/low operation is two vectors:
+ 1st vec 2nd vec
+ 0 1 2 3 4 5 6 7
+ the even elements of the result vector are obtained left-to-right from the
+ high/low elements of the first vector. The odd elements of the result are
+ obtained left-to-right from the high/low elements of the second vector.
+ The output of interleave_high will be: 0 4 1 5
+ and of interleave_low: 2 6 3 7
+
+
+ The permutation is done in log2 (LENGTH) stages. In each stage
+ interleave_high and interleave_low stmts are created for each pair of
+ vectors in DR_CHAIN, where the first argument is taken from the first
+ half of DR_CHAIN and the second argument from its second half.
+ In our example,
+
+ I1: interleave_high (1st vec, 3rd vec)
+ I2: interleave_low (1st vec, 3rd vec)
+ I3: interleave_high (2nd vec, 4th vec)
+ I4: interleave_low (2nd vec, 4th vec)
+
+ The output for the first stage is:
+
+ I1: 0 16 1 17 2 18 3 19
+ I2: 4 20 5 21 6 22 7 23
+ I3: 8 24 9 25 10 26 11 27
+ I4: 12 28 13 29 14 30 15 31
+
+ The output of the second stage, i.e. the final result is:
+
+ I1: 0 8 16 24 1 9 17 25
+ I2: 2 10 18 26 3 11 19 27
+ I3: 4 12 20 28 5 13 21 29
+ I4: 6 14 22 30 7 15 23 31. */
+
+static bool
+vect_permute_store_chain (VEC(tree,heap) *dr_chain,
+ unsigned int length,
+ tree stmt,
+ block_stmt_iterator *bsi,
+ VEC(tree,heap) **result_chain)
+{
+ tree perm_dest, perm_stmt, vect1, vect2, high, low;
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ tree scalar_dest, tmp;
+ int i;
+ unsigned int j;
+ VEC(tree,heap) *first, *second;
+
+ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
+ first = VEC_alloc (tree, heap, length/2);
+ second = VEC_alloc (tree, heap, length/2);
+
+ /* Check that the operation is supported. */
+ if (!vect_strided_store_supported (vectype))
+ return false;
+
+ *result_chain = VEC_copy (tree, heap, dr_chain);
+
+ for (i = 0; i < exact_log2 (length); i++)
+ {
+ for (j = 0; j < length/2; j++)
+ {
+ vect1 = VEC_index (tree, dr_chain, j);
+ vect2 = VEC_index (tree, dr_chain, j+length/2);
+
+ /* Create interleaving stmt:
+ in the case of big endian:
+ high = interleave_high (vect1, vect2)
+ and in the case of little endian:
+ high = interleave_low (vect1, vect2). */
+ perm_dest = create_tmp_var (vectype, "vect_inter_high");
+ DECL_GIMPLE_REG_P (perm_dest) = 1;
+ add_referenced_var (perm_dest);
+ if (BYTES_BIG_ENDIAN)
+ tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
+ else
+ tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
+ perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
+ high = make_ssa_name (perm_dest, perm_stmt);
+ GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
+ vect_finish_stmt_generation (stmt, perm_stmt, bsi);
+ VEC_replace (tree, *result_chain, 2*j, high);
+
+ /* Create interleaving stmt:
+ in the case of big endian:
+ low = interleave_low (vect1, vect2)
+ and in the case of little endian:
+ low = interleave_high (vect1, vect2). */
+ perm_dest = create_tmp_var (vectype, "vect_inter_low");
+ DECL_GIMPLE_REG_P (perm_dest) = 1;
+ add_referenced_var (perm_dest);
+ if (BYTES_BIG_ENDIAN)
+ tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
+ else
+ tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
+ perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
+ low = make_ssa_name (perm_dest, perm_stmt);
+ GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
+ vect_finish_stmt_generation (stmt, perm_stmt, bsi);
+ VEC_replace (tree, *result_chain, 2*j+1, low);
+ }
+ dr_chain = VEC_copy (tree, heap, *result_chain);
+ }
+ return true;
+}
+
+
+/* Function vectorizable_store.
+
+ Check if STMT defines a non scalar data-ref (array/pointer/structure) that
+ can be vectorized.
+ If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+ stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+ Return FALSE if not a vectorizable STMT, TRUE otherwise. */
+
+bool
+vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
+{
+ tree scalar_dest;
+ tree data_ref;
+ tree op;
+ tree vec_oprnd = NULL_TREE;
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ enum machine_mode vec_mode;
+ tree dummy;
+ enum dr_alignment_support alignment_support_scheme;
+ ssa_op_iter iter;
+ def_operand_p def_p;
+ tree def, def_stmt;
+ enum vect_def_type dt;
+ stmt_vec_info prev_stmt_info = NULL;
+ tree dataref_ptr = NULL_TREE;
+ int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+ int j;
+ tree next_stmt, first_stmt;
+ bool strided_store = false;
+ unsigned int group_size, i;
+ VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
+ gcc_assert (ncopies >= 1);
+
+ /* Is vectorizable store? */
+
+ if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
+ return false;
+
+ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
+ if (TREE_CODE (scalar_dest) != ARRAY_REF
+ && TREE_CODE (scalar_dest) != INDIRECT_REF
+ && !DR_GROUP_FIRST_DR (stmt_info))
+ return false;
+
+ op = GIMPLE_STMT_OPERAND (stmt, 1);
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+
+ vec_mode = TYPE_MODE (vectype);
+ /* FORNOW. In some cases we can vectorize even if the data-type is not
+ supported (e.g. array initialization with 0). */
+ if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
+ return false;
+
+ if (!STMT_VINFO_DATA_REF (stmt_info))
+ return false;
+
+ if (DR_GROUP_FIRST_DR (stmt_info))
+ {
+ strided_store = true;
+ if (!vect_strided_store_supported (vectype))
+ return false;
+ }
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
+ return true;
+ }
+
+ /** Transform. **/
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
+
+ if (strided_store)
+ {
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+ group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+ DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
+
+ /* We vectorize all the stmts of the interleaving group when we
+ reach the last stmt in the group. */
+ if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
+ < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
+ {
+ *vec_stmt = NULL_TREE;
+ return true;
+ }
+ }
+ else
+ {
+ first_stmt = stmt;
+ first_dr = dr;
+ group_size = 1;
+ }
+
+ dr_chain = VEC_alloc (tree, heap, group_size);
+ oprnds = VEC_alloc (tree, heap, group_size);
+
+ alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+ gcc_assert (alignment_support_scheme);
+ gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
+
+ /* In case the vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e., we need to "unroll" the
+ vector stmt by a factor VF/nunits. For more details see documentation in
+ vect_get_vec_def_for_copy_stmt. */
+
+ /* In case of interleaving (non-unit strided access):
+
+ S1: &base + 2 = x2
+ S2: &base = x0
+ S3: &base + 1 = x1
+ S4: &base + 3 = x3
+
+ We create vectorized stores starting from the base address (the access
+ of the first stmt in the chain, S2 in the above example) when the last
+ store stmt of the chain (S4) is reached:
+
+ VS1: &base = vx2
+ VS2: &base + vec_size*1 = vx0
+ VS3: &base + vec_size*2 = vx1
+ VS4: &base + vec_size*3 = vx3
+
+ Then permutation statements are generated:
+
+ VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
+ VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
+ ...
+
+ And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
+ (the order of the data-refs in the output of vect_permute_store_chain
+ corresponds to the order of scalar stmts in the interleaving chain - see
+ the documentation of vect_permute_store_chain()).
+
+ In case of both multiple types and interleaving, above vector stores and
+ permutation stmts are created for every copy. The result vector stmts are
+ put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
+ STMT_VINFO_RELATED_STMT for the next copies.
+ */
+
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; j++)
+ {
+ tree new_stmt;
+ tree ptr_incr;
+
+ if (j == 0)
+ {
+ /* For interleaved stores we collect vectorized defs for all the
+ stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
+ as an input to vect_permute_store_chain(), and OPRNDS as an input
+ to vect_get_vec_def_for_stmt_copy() for the next copy.
+ If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
+ OPRNDS are of size 1. */
+ next_stmt = first_stmt;
+ for (i = 0; i < group_size; i++)
+ {
+ /* Since gaps are not supported for interleaved stores, GROUP_SIZE
+ is the exact number of stmts in the chain. Therefore, NEXT_STMT
+ can't be NULL_TREE. In case that there is no interleaving,
+ GROUP_SIZE is 1, and only one iteration of the loop will be
+ executed. */
+ gcc_assert (next_stmt);
+ op = GIMPLE_STMT_OPERAND (next_stmt, 1);
+ vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
+ VEC_quick_push(tree, dr_chain, vec_oprnd);
+ VEC_quick_push(tree, oprnds, vec_oprnd);
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ }
+ dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
+ &dummy, &ptr_incr, false,
+ TREE_TYPE (vec_oprnd));
+ }
+ else
+ {
+ /* For interleaved stores we created vectorized defs for all the
+ defs stored in OPRNDS in the previous iteration (previous copy).
+ DR_CHAIN is then used as an input to vect_permute_store_chain(),
+ and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
+ next copy.
+ If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
+ OPRNDS are of size 1. */
+ for (i = 0; i < group_size; i++)
+ {
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
+ VEC_index (tree, oprnds, i));
+ VEC_replace(tree, dr_chain, i, vec_oprnd);
+ VEC_replace(tree, oprnds, i, vec_oprnd);
+ }
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
+ }
+
+ if (strided_store)
+ {
+ result_chain = VEC_alloc (tree, heap, group_size);
+ /* Permute. */
+ if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
+ &result_chain))
+ return false;
+ }
+
+ next_stmt = first_stmt;
+ for (i = 0; i < group_size; i++)
+ {
+ /* For strided stores vectorized defs are interleaved in
+ vect_permute_store_chain(). */
+ if (strided_store)
+ vec_oprnd = VEC_index(tree, result_chain, i);
+
+ data_ref = build_fold_indirect_ref (dataref_ptr);
+ /* Arguments are ready. Create the new vector stmt. */
+ new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ /* Set the VDEFs for the vector pointer. If this virtual def
+ has a use outside the loop and a loop peel is performed
+ then the def may be renamed by the peel. Mark it for
+ renaming so the later use will also be renamed. */
+ copy_virtual_operands (new_stmt, next_stmt);
+ if (j == 0)
+ {
+ /* The original store is deleted so the same SSA_NAMEs
+ can be used. */
+ FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
+ {
+ SSA_NAME_DEF_STMT (def) = new_stmt;
+ mark_sym_for_renaming (SSA_NAME_VAR (def));
+ }
+
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ }
+ else
+ {
+ /* Create new names for all the definitions created by COPY and
+ add replacement mappings for each new name. */
+ FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
+ {
+ create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
+ mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
+ }
+
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ }
+
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ if (!next_stmt)
+ break;
+ /* Bump the vector pointer. */
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
+ }
+ }
+
+ return true;
+}
+
+
+/* Function vect_setup_realignment
+
+ This function is called when vectorizing an unaligned load using
+ the dr_unaligned_software_pipeline scheme.
+ This function generates the following code at the loop prolog:
+
+ p = initial_addr;
+ msq_init = *(floor(p)); # prolog load
+ realignment_token = call target_builtin;
+ loop:
+ msq = phi (msq_init, ---)
+
+ The code above sets up a new (vector) pointer, pointing to the first
+ location accessed by STMT, and a "floor-aligned" load using that pointer.
+ It also generates code to compute the "realignment-token" (if the relevant
+ target hook was defined), and creates a phi-node at the loop-header bb
+ whose arguments are the result of the prolog-load (created by this
+ function) and the result of a load that takes place in the loop (to be
+ created by the caller to this function).
+ The caller to this function uses the phi-result (msq) to create the
+ realignment code inside the loop, and sets up the missing phi argument,
+ as follows:
+
+ loop:
+ msq = phi (msq_init, lsq)
+ lsq = *(floor(p')); # load in loop
+ result = realign_load (msq, lsq, realignment_token);
+
+ Input:
+ STMT - (scalar) load stmt to be vectorized. This load accesses
+ a memory location that may be unaligned.
+ BSI - place where new code is to be inserted.
+
+ Output:
+ REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
+ target hook, if defined.
+ Return value - the result of the loop-header phi node. */
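+
+/* Conceptually (illustrative): if the unaligned address is O elements
+ past its floor-aligned address, realign_load (msq, lsq,
+ realignment_token) produces
+ { msq[O], ..., msq[nelems-1], lsq[0], ..., lsq[O-1] },
+ i.e. the two aligned loads shifted into place by the misalignment
+ encoded in the token. */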
+
+static tree
+vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
+ tree *realignment_token)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ edge pe = loop_preheader_edge (loop);
+ tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
+ tree vec_dest;
+ tree init_addr;
+ tree inc;
+ tree ptr;
+ tree data_ref;
+ tree new_stmt;
+ basic_block new_bb;
+ tree msq_init;
+ tree new_temp;
+ tree phi_stmt;
+ tree msq;
+
+ /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
+ NULL_TREE);
+ data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
+ new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
+ gcc_assert (!new_bb);
+ msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
+ copy_virtual_operands (new_stmt, stmt);
+ update_vuses_to_preheader (new_stmt, loop);
+
+ /* 2. Create permutation mask, if required, in loop preheader. */
+ if (targetm.vectorize.builtin_mask_for_load)
+ {
+ tree builtin_decl;
+
+ builtin_decl = targetm.vectorize.builtin_mask_for_load ();
+ new_stmt = build_call_expr (builtin_decl, 1, init_addr);
+ vec_dest = vect_create_destination_var (scalar_dest,
+ TREE_TYPE (new_stmt));
+ new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
+ gcc_assert (!new_bb);
+ *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
+
+ /* The result of the CALL_EXPR to this builtin is determined from
+ the value of the parameter and no global variables are touched
+ which makes the builtin a "const" function. Requiring the
+ builtin to have the "const" attribute makes it unnecessary
+ to call mark_call_clobbered. */
+ gcc_assert (TREE_READONLY (builtin_decl));
+ }
+
+ /* 3. Create msq = phi <msq_init, lsq> in loop */
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ msq = make_ssa_name (vec_dest, NULL_TREE);
+ phi_stmt = create_phi_node (msq, loop->header);
+ SSA_NAME_DEF_STMT (msq) = phi_stmt;
+ add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
+
+ return msq;
+}
+
+
+/* Function vect_strided_load_supported.
+
+ Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
+ and FALSE otherwise. */
+
+static bool
+vect_strided_load_supported (tree vectype)
+{
+ optab perm_even_optab, perm_odd_optab;
+ int mode;
+
+ mode = (int) TYPE_MODE (vectype);
+
+ perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
+ if (!perm_even_optab)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "no optab for perm_even.");
+ return false;
+ }
+
+ if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)