From: irar Date: Mon, 19 Apr 2010 09:10:45 +0000 (+0000) Subject: PR tree-optimization/37027 X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=commitdiff_plain;h=f2848122032581aa68fb5ce5e52dc3ff60325e26 PR tree-optimization/37027 * tree-vectorizer.h (struct _loop_vec_info): Add new field reductions and macro to access it. (vectorizable_reduction): Add argument. (vect_get_slp_defs): Likewise. * tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction statements for possible use in SLP. (new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS. (destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS. (vect_create_epilog_for_reduction): Handle SLP. Modify documentation, add new argument. (vectorizable_reduction): Likewise. * tree-vect-stmts.c (vect_get_vec_defs): Update call to vect_get_slp_defs. (vectorizable_type_demotion, vectorizable_type_promotion, vectorizable_store): Likewise. (vect_analyze_stmt): Update call to vectorizable_reduction. (vect_transform_stmt): Likewise. * tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction. (vect_build_slp_tree): Fix indentation. Check that there are no loads from different interleaving chains in same node. (vect_slp_rearrange_stmts): New function. (vect_supported_load_permutation_p): Allow load permutations for reductions. Call vect_slp_rearrange_stmts() to rearrange statements inside SLP nodes if necessary. (vect_analyze_slp_instance): Handle reductions. (vect_analyze_slp): Try to build SLP instances originating from groups of reductions. (vect_detect_hybrid_slp_stmts): Skip reduction statements. (vect_get_constant_vectors): Create initial vectors for reductions according to reduction code. Add new argument. (vect_get_slp_defs): Add new argument, pass it to vect_get_constant_vectors. (vect_schedule_slp_instance): Remove SLP tree root statements. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@158506 138bc75d-0d04-0410-961f-82ee72b054a4 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2da24e12ccb..98c80045f99 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,40 @@ +2010-04-19 Ira Rosen + + PR tree-optimization/37027 + * tree-vectorizer.h (struct _loop_vec_info): Add new field reductions + and macro to access it. + (vectorizable_reduction): Add argument. + (vect_get_slp_defs): Likewise. + * tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction + statements for possible use in SLP. + (new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS. + (destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS. + (vect_create_epilog_for_reduction): Handle SLP. Modify documentation, + add new argument. + (vectorizable_reduction): Likewise. + * tree-vect-stmts.c (vect_get_vec_defs): Update call to + vect_get_slp_defs. + (vectorizable_type_demotion, vectorizable_type_promotion, + vectorizable_store): Likewise. + (vect_analyze_stmt): Update call to vectorizable_reduction. + (vect_transform_stmt): Likewise. + * tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction. + (vect_build_slp_tree): Fix indentation. Check that there are no loads + from different interleaving chains in same node. + (vect_slp_rearrange_stmts): New function. + (vect_supported_load_permutation_p): Allow load permutations for + reductions. Call vect_slp_rearrange_stmts() to rearrange statements + inside SLP nodes if necessary. + (vect_analyze_slp_instance): Handle reductions. + (vect_analyze_slp): Try to build SLP instances originating from groups + of reductions. + (vect_detect_hybrid_slp_stmts): Skip reduction statements. 
+ (vect_get_constant_vectors): Create initial vectors for reductions + according to reduction code. Add new argument. + (vect_get_slp_defs): Add new argument, pass it to + vect_get_constant_vectors. + (vect_schedule_slp_instance): Remove SLP tree root statements. + 2010-04-19 Jakub Jelinek * tree.h (ENUM_IS_SCOPED): Define. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8ffc442caeb..868ce20d31e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,14 @@ +2010-04-19 Ira Rosen + + PR tree-optimization/37027 + * lib/target-supports.exp + (check_effective_target_vect_widen_sum_hi_to_si_pattern): New. + * gcc.dg/vect/pr37027.c: New test. + * gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c, + gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c, + gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c, + gcc.dg/vect/vect-complex-6.c: Likewise. + 2010-04-19 Jakub Jelinek * g++.dg/debug/dwarf2/enum1.C: New test. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 809f3e15a02..e6e9008ea37 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -545,6 +545,11 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = vect_reduction_def; + /* Store the reduction cycles for possible vectorization in + loop-aware SLP. */ + VEC_safe_push (gimple, heap, + LOOP_VINFO_REDUCTIONS (loop_vinfo), + reduc_stmt); } } } @@ -745,6 +750,7 @@ new_loop_vec_info (struct loop *loop) VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); + LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; @@ -835,6 +841,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); + VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); free (loop_vinfo); loop->aux = NULL; @@ -1223,7 +1230,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) if ((STMT_VINFO_RELEVANT_P (stmt_info) || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) && !PURE_SLP_STMT (stmt_info)) - /* STMT needs both SLP and loop-based vectorization. */ only_slp_in_loop = false; } @@ -2860,28 +2866,33 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction - computation. - - VECT_DEF is a vector of partial results. - REDUC_CODE is the tree-code for the epilog reduction. + computation. + + VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector + reduction statements. + STMT is the scalar reduction stmt that is being vectorized. NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits). In this case we have to generate more than one vector stmt - i.e - we need to "unroll" the vector stmt by a factor VF/nunits. For more details see documentation in vectorizable_operation. - STMT is the scalar reduction stmt that is being vectorized. - REDUCTION_PHI is the phi-node that carries the reduction computation. 
- REDUC_INDEX is the index of the operand in the right hand side of the + REDUC_CODE is the tree-code for the epilog reduction. + REDUCTION_PHIS is a list of the phi-nodes that carry the reduction + computation. + REDUC_INDEX is the index of the operand in the right hand side of the statement that is defined by REDUCTION_PHI. DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. + SLP_NODE is an SLP node containing a group of reduction statements. The + first one in this group is STMT. This function: - 1. Creates the reduction def-use cycle: sets the arguments for - REDUCTION_PHI: + 1. Creates the reduction def-use cycles: sets the arguments for + REDUCTION_PHIS: The loop-entry argument is the vectorized initial-value of the reduction. - The loop-latch argument is VECT_DEF - the vector of partial sums. - 2. "Reduces" the vector of partial results VECT_DEF into a single result, - by applying the operation specified by REDUC_CODE if available, or by + The loop-latch argument is taken from VECT_DEFS - the vector of partial + sums. + 2. "Reduces" each vector of partial results VECT_DEFS into a single result, + by applying the operation specified by REDUC_CODE if available, or by other means (whole-vector shifts or a scalar loop). The function also creates a new phi node at the loop exit to preserve loop-closed form, as illustrated below. @@ -2914,12 +2925,11 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, */ static void -vect_create_epilog_for_reduction (tree vect_def, gimple stmt, - int ncopies, - enum tree_code reduc_code, - gimple reduction_phi, - int reduc_index, - bool double_reduc) +vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt, + int ncopies, enum tree_code reduc_code, + VEC (gimple, heap) *reduction_phis, + int reduc_index, bool double_reduc, + slp_tree slp_node) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); stmt_vec_info prev_phi_info; @@ -2933,32 +2943,37 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, gimple new_phi = NULL, phi; gimple_stmt_iterator exit_gsi; tree vec_dest; - tree new_temp = NULL_TREE; - tree new_name; + tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; gimple epilog_stmt = NULL; - tree new_scalar_dest, new_dest; + enum tree_code code = gimple_assign_rhs_code (stmt); gimple exit_phi; tree bitsize, bitpos; - enum tree_code code = gimple_assign_rhs_code (stmt); - tree adjustment_def; - tree vec_initial_def, def; - tree orig_name; + tree adjustment_def = NULL; + tree vec_initial_def = NULL; + tree reduction_op, expr, def; + tree orig_name, scalar_result; imm_use_iterator imm_iter; use_operand_p use_p; bool extract_scalar_result = false; - tree reduction_op, expr; - gimple orig_stmt; - gimple use_stmt; + gimple use_stmt, orig_stmt, reduction_phi = NULL; bool nested_in_vect_loop = false; - VEC(gimple,heap) *phis = NULL; + VEC (gimple, heap) *new_phis = NULL; enum vect_def_type dt = vect_unknown_def_type; int j, i; + VEC (tree, heap) *scalar_results = NULL; + int group_size = 1, k, ratio; + VEC (tree, heap) *vec_initial_defs = NULL; + VEC (gimple, heap) *phis; + + if (slp_node) + group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node)); if (nested_in_vect_loop_p (loop, stmt)) { outer_loop = loop; loop = loop->inner; nested_in_vect_loop = true; + gcc_assert (!slp_node); } switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) @@ -2983,47 +2998,80 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, gcc_assert (vectype); mode = TYPE_MODE (vectype); - 
/*** 1. Create the reduction def-use cycle ***/ + /* 1. Create the reduction def-use cycle: + Set the arguments of REDUCTION_PHIS, i.e., transform + + loop: + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + ... - /* For the case of reduction, vect_get_vec_def_for_operand returns - the scalar def before the loop, that defines the initial value - of the reduction variable. */ - vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, - &adjustment_def); + into: - phi = reduction_phi; - def = vect_def; - for (j = 0; j < ncopies; j++) + loop: + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + ... + + (in case of SLP, do it for all the phis). */ + + /* Get the loop-entry arguments. */ + if (slp_node) + vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index); + else { - /* 1.1 set the loop-entry arg of the reduction-phi: */ - add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop), - UNKNOWN_LOCATION); + vec_initial_defs = VEC_alloc (tree, heap, 1); + /* For the case of reduction, vect_get_vec_def_for_operand returns + the scalar def before the loop, that defines the initial value + of the reduction variable. */ + vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, + &adjustment_def); + VEC_quick_push (tree, vec_initial_defs, vec_initial_def); + } - /* 1.2 set the loop-latch arg for the reduction-phi: */ - if (j > 0) - def = vect_get_vec_def_for_stmt_copy (dt, def); - add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); + /* Set phi nodes arguments. */ + for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++) + { + tree vec_init_def = VEC_index (tree, vec_initial_defs, i); + tree def = VEC_index (tree, vect_defs, i); + for (j = 0; j < ncopies; j++) + { + /* Set the loop-entry arg of the reduction-phi. */ + add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), + UNKNOWN_LOCATION); - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "transform reduction: created def-use cycle: "); - print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); - fprintf (vect_dump, "\n"); - print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM); - } + /* Set the loop-latch arg for the reduction-phi. */ + if (j > 0) + def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def); - phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); + add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "transform reduction: created def-use" + " cycle: "); + print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); + fprintf (vect_dump, "\n"); + print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, + TDF_SLIM); + } + + phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); + } } - /*** 2. Create epilog code - The reduction epilog code operates across the elements of the vector - of partial results computed by the vectorized loop. - The reduction epilog code consists of: - step 1: compute the scalar result in a vector (v_out2) - step 2: extract the scalar result (s_out3) from the vector (v_out2) - step 3: adjust the scalar result (s_out3) if needed. + VEC_free (tree, heap, vec_initial_defs); + + /* 2. Create epilog code. + The reduction epilog code operates across the elements of the vector + of partial results computed by the vectorized loop. 
+ The reduction epilog code consists of: + + step 1: compute the scalar result in a vector (v_out2) + step 2: extract the scalar result (s_out3) from the vector (v_out2) + step 3: adjust the scalar result (s_out3) if needed. - Step 1 can be accomplished using one the following three schemes: + Step 1 can be accomplished using one the following three schemes: (scheme 1) using reduc_code, if available. (scheme 2) using whole-vector shifts, if available. (scheme 3) using a scalar loop. In this case steps 1+2 above are @@ -3038,29 +3086,33 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, s_out4 = adjust_result # step 3 (step 3 is optional, and steps 1 and 2 may be combined). - Lastly, the uses of s_out0 are replaced by s_out4. + Lastly, the uses of s_out0 are replaced by s_out4. */ - ***/ - /* 2.1 Create new loop-exit-phi to preserve loop-closed form: - v_out1 = phi */ + /* 2.1 Create new loop-exit-phis to preserve loop-closed form: + v_out1 = phi + Store them in NEW_PHIS. */ exit_bb = single_exit (loop)->dest; - def = vect_def; prev_phi_info = NULL; - for (j = 0; j < ncopies; j++) + new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); + for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++) { - phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); - set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); - if (j == 0) - new_phi = phi; - else - { - def = vect_get_vec_def_for_stmt_copy (dt, def); - STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; - } - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - prev_phi_info = vinfo_for_stmt (phi); + for (j = 0; j < ncopies; j++) + { + phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); + set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); + if (j == 0) + VEC_quick_push (gimple, new_phis, phi); + else + { + def = vect_get_vec_def_for_stmt_copy (dt, def); + STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; + } + + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + prev_phi_info = vinfo_for_stmt (phi); + } } exit_gsi = gsi_after_labels (exit_bb); @@ -3089,16 +3141,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, } code = gimple_assign_rhs_code (orig_stmt); + /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, + partial results are added and not subtracted. */ + if (code == MINUS_EXPR) + code = PLUS_EXPR; + scalar_dest = gimple_assign_lhs (orig_stmt); scalar_type = TREE_TYPE (scalar_dest); + scalar_results = VEC_alloc (tree, heap, group_size); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, - partial results are added and not subtracted. */ - if (code == MINUS_EXPR) - code = PLUS_EXPR; - /* In case this is a reduction in an inner-loop while vectorizing an outer loop - we don't need to extract a single scalar result at the end of the inner-loop (unless it is double reduction, i.e., the use of reduction is @@ -3108,28 +3161,21 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, if (nested_in_vect_loop && !double_reduc) goto vect_finalize_reduction; - /* The epilogue is created for the outer-loop, i.e., for the loop being - vectorized. */ - if (double_reduc) - loop = outer_loop; - - /* FORNOW */ - gcc_assert (ncopies == 1); - /* 2.3 Create the reduction code, using one of the three schemes described - above. */ - - if (reduc_code != ERROR_MARK) + above. 
In SLP we simply need to extract all the elements from the + vector (without reducing them), so we use scalar shifts. */ + if (reduc_code != ERROR_MARK && !slp_node) { tree tmp; /*** Case 1: Create: - v_out2 = reduc_expr */ + v_out2 = reduc_expr */ if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using direct vector reduction."); + fprintf (vect_dump, "Reduce using direct vector reduction."); vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_phi = VEC_index (gimple, new_phis, 0); tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); epilog_stmt = gimple_build_assign (vec_dest, tmp); new_temp = make_ssa_name (vec_dest, epilog_stmt); @@ -3148,142 +3194,182 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, tree vec_temp; if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) - shift_code = VEC_RSHIFT_EXPR; + shift_code = VEC_RSHIFT_EXPR; else - have_whole_vector_shift = false; + have_whole_vector_shift = false; /* Regardless of whether we have a whole vector shift, if we're - emulating the operation via tree-vect-generic, we don't want - to use it. Only the first round of the reduction is likely - to still be profitable via emulation. */ + emulating the operation via tree-vect-generic, we don't want + to use it. Only the first round of the reduction is likely + to still be profitable via emulation. */ /* ??? It might be better to emit a reduction tree code here, so that - tree-vect-generic can expand the first round via bit tricks. */ + tree-vect-generic can expand the first round via bit tricks. */ if (!VECTOR_MODE_P (mode)) - have_whole_vector_shift = false; + have_whole_vector_shift = false; else - { - optab optab = optab_for_tree_code (code, vectype, optab_default); - if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) - have_whole_vector_shift = false; - } - - if (have_whole_vector_shift) { - /*** Case 2: Create: - for (offset = VS/2; offset >= element_size; offset/=2) - { - Create: va' = vec_shift - Create: va = vop - } */ - - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using vector shifts"); + optab optab = optab_for_tree_code (code, vectype, optab_default); + if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) + have_whole_vector_shift = false; + } - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_temp = PHI_RESULT (new_phi); + if (have_whole_vector_shift && !slp_node) + { + /*** Case 2: Create: + for (offset = VS/2; offset >= element_size; offset/=2) + { + Create: va' = vec_shift + Create: va = vop + } */ - for (bit_offset = vec_size_in_bits/2; - bit_offset >= element_bitsize; - bit_offset /= 2) - { - tree bitpos = size_int (bit_offset); - - epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest, - new_temp, bitpos); - new_name = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_name); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, - new_name, new_temp); - new_temp = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using vector shifts"); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_phi = VEC_index (gimple, new_phis, 0); + new_temp = PHI_RESULT (new_phi); + for (bit_offset = vec_size_in_bits/2; + bit_offset >= 
element_bitsize; + bit_offset /= 2) + { + tree bitpos = size_int (bit_offset); + + epilog_stmt = gimple_build_assign_with_ops (shift_code, + vec_dest, new_temp, bitpos); + new_name = make_ssa_name (vec_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_name); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, + new_name, new_temp); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } - extract_scalar_result = true; - } + extract_scalar_result = true; + } else { - tree rhs; - - /*** Case 3: Create: - s = extract_field - for (offset = element_size; - offset < vector_size; - offset += element_size;) - { - Create: s' = extract_field - Create: s = op - } */ + tree rhs; + + /*** Case 3: Create: + s = extract_field + for (offset = element_size; + offset < vector_size; + offset += element_size;) + { + Create: s' = extract_field + Create: s = op // For non SLP cases + } */ - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using scalar code. "); - - vec_temp = PHI_RESULT (new_phi); - vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, - bitsize_zero_node); - epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - for (bit_offset = element_bitsize; - bit_offset < vec_size_in_bits; - bit_offset += element_bitsize) - { - tree bitpos = bitsize_int (bit_offset); - tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, - bitpos); - - epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); - new_name = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_name); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - epilog_stmt = gimple_build_assign_with_ops (code, - new_scalar_dest, - new_name, new_temp); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using scalar code. "); - extract_scalar_result = false; - } + vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++) + { + vec_temp = PHI_RESULT (new_phi); + rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, + bitsize_zero_node); + epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + /* In SLP we don't need to apply reduction operation, so we just + collect s' values in SCALAR_RESULTS. 
*/ + if (slp_node) + VEC_safe_push (tree, heap, scalar_results, new_temp); + + for (bit_offset = element_bitsize; + bit_offset < vec_size_in_bits; + bit_offset += element_bitsize) + { + tree bitpos = bitsize_int (bit_offset); + tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, + bitsize, bitpos); + + epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); + new_name = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_name); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + if (slp_node) + { + /* In SLP we don't need to apply reduction operation, so + we just collect s' values in SCALAR_RESULTS. */ + new_temp = new_name; + VEC_safe_push (tree, heap, scalar_results, new_name); + } + else + { + epilog_stmt = gimple_build_assign_with_ops (code, + new_scalar_dest, new_name, new_temp); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } + } + } + + /* The only case where we need to reduce scalar results in SLP, is + unrolling. If the size of SCALAR_RESULTS is greater than + GROUP_SIZE, we reduce them combining elements modulo + GROUP_SIZE. */ + if (slp_node) + { + tree res, first_res, new_res; + gimple new_stmt; + + /* Reduce multiple scalar results in case of SLP unrolling. */ + for (j = group_size; VEC_iterate (tree, scalar_results, j, res); + j++) + { + first_res = VEC_index (tree, scalar_results, j % group_size); + new_stmt = gimple_build_assign_with_ops (code, + new_scalar_dest, first_res, res); + new_res = make_ssa_name (new_scalar_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_res); + gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); + VEC_replace (tree, scalar_results, j % group_size, new_res); + } + } + else + /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ + VEC_safe_push (tree, heap, scalar_results, new_temp); + + extract_scalar_result = false; + } } /* 2.4 Extract the final scalar result. Create: - s_out3 = extract_field */ + s_out3 = extract_field */ if (extract_scalar_result) { tree rhs; - gcc_assert (!nested_in_vect_loop || double_reduc); if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "extract scalar result"); + fprintf (vect_dump, "extract scalar result"); if (BYTES_BIG_ENDIAN) - bitpos = size_binop (MULT_EXPR, - bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), - TYPE_SIZE (scalar_type)); + bitpos = size_binop (MULT_EXPR, + bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), + TYPE_SIZE (scalar_type)); else - bitpos = bitsize_zero_node; + bitpos = bitsize_zero_node; rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); gimple_assign_set_lhs (epilog_stmt, new_temp); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + VEC_safe_push (tree, heap, scalar_results, new_temp); } - + vect_finalize_reduction: - if (double_reduc) - loop = loop->inner; - /* 2.5 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then 'adjustment_def' is zero). 
For example, if code is PLUS we create: @@ -3291,14 +3377,17 @@ vect_finalize_reduction: if (adjustment_def) { + gcc_assert (!slp_node); if (nested_in_vect_loop) { + new_phi = VEC_index (gimple, new_phis, 0); gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); new_dest = vect_create_destination_var (scalar_dest, vectype); } else { + new_temp = VEC_index (tree, scalar_results, 0); gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); expr = build2 (code, scalar_type, new_temp, adjustment_def); new_dest = vect_create_destination_var (scalar_dest, scalar_type); @@ -3309,142 +3398,206 @@ vect_finalize_reduction: gimple_assign_set_lhs (epilog_stmt, new_temp); SSA_NAME_DEF_STMT (new_temp) = epilog_stmt; gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + if (nested_in_vect_loop) + { + set_vinfo_for_stmt (epilog_stmt, + new_stmt_vec_info (epilog_stmt, loop_vinfo, + NULL)); + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); + + if (!double_reduc) + VEC_quick_push (tree, scalar_results, new_temp); + else + VEC_replace (tree, scalar_results, 0, new_temp); + } + else + VEC_replace (tree, scalar_results, 0, new_temp); + + VEC_replace (gimple, new_phis, 0, epilog_stmt); } + /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit + phis with new adjusted scalar results, i.e., replace use + with use . - /* 2.6 Handle the loop-exit phi */ + Transform: + loop_exit: + s_out0 = phi # (scalar) EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce + s_out3 = extract_field + s_out4 = adjust_result + use + use + + into: - /* Replace uses of s_out0 with uses of s_out3: - Find the loop-closed-use at the loop exit of the original scalar result. - (The reduction result is expected to have two immediate uses - one at the - latch block, and one at the loop exit). */ - phis = VEC_alloc (gimple, heap, 10); - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) + loop_exit: + s_out0 = phi # (scalar) EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce + s_out3 = extract_field + s_out4 = adjust_result + use */ + + /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in + case that GROUP_SIZE is greater than vectorization factor). Therefore, we + need to match SCALAR_RESULTS with corresponding statements. The first + (GROUP_SIZE / number of new vector stmts) scalar results correspond to + the first vector stmt, etc. + (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ + ratio = group_size / VEC_length (gimple, new_phis); + gcc_assert (!(group_size % VEC_length (gimple, new_phis))); + + for (k = 0; k < group_size; k++) { - if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) - { - exit_phi = USE_STMT (use_p); - VEC_quick_push (gimple, phis, exit_phi); - } - } + if (k % ratio == 0) + { + epilog_stmt = VEC_index (gimple, new_phis, k / ratio); + reduction_phi = VEC_index (gimple, reduction_phis, k / ratio); + } - /* We expect to have found an exit_phi because of loop-closed-ssa form. */ - gcc_assert (!VEC_empty (gimple, phis)); + if (slp_node) + { + gimple current_stmt = VEC_index (gimple, + SLP_TREE_SCALAR_STMTS (slp_node), k); - for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) - { - if (nested_in_vect_loop) - { - stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); - gimple vect_phi; - - /* FORNOW. 
Currently not supporting the case that an inner-loop - reduction is not used in the outer-loop (but only outside the - outer-loop), unless it is double reduction. */ - gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) - && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc); - - epilog_stmt = adjustment_def ? epilog_stmt : new_phi; - STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; - set_vinfo_for_stmt (epilog_stmt, - new_stmt_vec_info (epilog_stmt, loop_vinfo, - NULL)); - if (adjustment_def) - STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = - STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); - - if (!double_reduc - || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def) - continue; - - /* Handle double reduction: - - stmt1: s1 = phi - double reduction phi (outer loop) - stmt2: s3 = phi - (regular) reduction phi (inner loop) - stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop) - stmt4: s2 = phi - double reduction stmt (outer loop) - - At that point the regular reduction (stmt2 and stmt3) is already - vectorized, as well as the exit phi node, stmt4. - Here we vectorize the phi node of double reduction, stmt1, and - update all relevant statements. */ - - /* Go through all the uses of s2 to find double reduction phi node, - i.e., stmt1 above. */ - orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) + orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt)); + /* SLP statements can't participate in patterns. */ + gcc_assert (!orig_stmt); + scalar_dest = gimple_assign_lhs (current_stmt); + } + + phis = VEC_alloc (gimple, heap, 3); + /* Find the loop-closed-use at the loop exit of the original scalar + result. (The reduction result is expected to have two immediate uses - + one at the latch block, and one at the loop exit). */ + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) + if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) + VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); + + /* We expect to have found an exit_phi because of loop-closed-ssa + form. */ + gcc_assert (!VEC_empty (gimple, phis)); + + for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) + { + if (outer_loop) { - stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); - stmt_vec_info new_phi_vinfo; - tree vect_phi_init, preheader_arg, vect_phi_res, init_def; - basic_block bb = gimple_bb (use_stmt); - gimple use; - - /* Check that USE_STMT is really double reduction phi node. */ - if (gimple_code (use_stmt) != GIMPLE_PHI - || gimple_phi_num_args (use_stmt) != 2 - || !use_stmt_vinfo - || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) - != vect_double_reduction_def - || bb->loop_father != outer_loop) + stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); + gimple vect_phi; + + /* FORNOW. Currently not supporting the case that an inner-loop + reduction is not used in the outer-loop (but only outside the + outer-loop), unless it is double reduction. */ + gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) + && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) + || double_reduc); + + STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt; + if (!double_reduc + || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) + != vect_double_reduction_def) continue; - /* Create vector phi node for double reduction: - vs1 = phi - vs1 was created previously in this function by a call to - vect_get_vec_def_for_operand and is stored in vec_initial_def; - vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; - vs0 is created here. 
*/ + /* Handle double reduction: - /* Create vector phi node. */ - vect_phi = create_phi_node (vec_initial_def, bb); - new_phi_vinfo = new_stmt_vec_info (vect_phi, - loop_vec_info_for_loop (outer_loop), NULL); - set_vinfo_for_stmt (vect_phi, new_phi_vinfo); + stmt1: s1 = phi - double reduction phi (outer loop) + stmt2: s3 = phi - (regular) reduc phi (inner loop) + stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) + stmt4: s2 = phi - double reduction stmt (outer loop) - /* Create vs0 - initial def of the double reduction phi. */ - preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, - loop_preheader_edge (outer_loop)); - init_def = get_initial_def_for_reduction (stmt, preheader_arg, - NULL); - vect_phi_init = vect_init_vector (use_stmt, init_def, vectype, - NULL); - - /* Update phi node arguments with vs0 and vs2. */ - add_phi_arg (vect_phi, vect_phi_init, - loop_preheader_edge (outer_loop), UNKNOWN_LOCATION); - add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), - loop_latch_edge (outer_loop), UNKNOWN_LOCATION); - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "created double reduction phi node: "); - print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); - } - - vect_phi_res = PHI_RESULT (vect_phi); + At that point the regular reduction (stmt2 and stmt3) is + already vectorized, as well as the exit phi node, stmt4. + Here we vectorize the phi node of double reduction, stmt1, and + update all relevant statements. */ - /* Replace the use, i.e., set the correct vs1 in the regular - reduction phi node. FORNOW, NCOPIES is always 1, so the loop - is redundant. */ - use = reduction_phi; - for (j = 0; j < ncopies; j++) + /* Go through all the uses of s2 to find double reduction phi + node, i.e., stmt1 above. */ + orig_name = PHI_RESULT (exit_phi); + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { - edge pr_edge = loop_preheader_edge (loop); - SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); - use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); + stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); + stmt_vec_info new_phi_vinfo; + tree vect_phi_init, preheader_arg, vect_phi_res, init_def; + basic_block bb = gimple_bb (use_stmt); + gimple use; + + /* Check that USE_STMT is really double reduction phi + node. */ + if (gimple_code (use_stmt) != GIMPLE_PHI + || gimple_phi_num_args (use_stmt) != 2 + || !use_stmt_vinfo + || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) + != vect_double_reduction_def + || bb->loop_father != outer_loop) + continue; + + /* Create vector phi node for double reduction: + vs1 = phi + vs1 was created previously in this function by a call to + vect_get_vec_def_for_operand and is stored in + vec_initial_def; + vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; + vs0 is created here. */ + + /* Create vector phi node. */ + vect_phi = create_phi_node (vec_initial_def, bb); + new_phi_vinfo = new_stmt_vec_info (vect_phi, + loop_vec_info_for_loop (outer_loop), NULL); + set_vinfo_for_stmt (vect_phi, new_phi_vinfo); + + /* Create vs0 - initial def of the double reduction phi. */ + preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, + loop_preheader_edge (outer_loop)); + init_def = get_initial_def_for_reduction (stmt, + preheader_arg, NULL); + vect_phi_init = vect_init_vector (use_stmt, init_def, + vectype, NULL); + + /* Update phi node arguments with vs0 and vs2. 
*/ + add_phi_arg (vect_phi, vect_phi_init, + loop_preheader_edge (outer_loop), + UNKNOWN_LOCATION); + add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), + loop_latch_edge (outer_loop), UNKNOWN_LOCATION); + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "created double reduction phi " + "node: "); + print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); + } + + vect_phi_res = PHI_RESULT (vect_phi); + + /* Replace the use, i.e., set the correct vs1 in the regular + reduction phi node. FORNOW, NCOPIES is always 1, so the + loop is redundant. */ + use = reduction_phi; + for (j = 0; j < ncopies; j++) + { + edge pr_edge = loop_preheader_edge (loop); + SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); + use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); + } } } - } - /* Replace the uses: */ - orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) - FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) - SET_USE (use_p, new_temp); + /* Replace the uses: */ + orig_name = PHI_RESULT (exit_phi); + scalar_result = VEC_index (tree, scalar_results, k); + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) + FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) + SET_USE (use_p, scalar_result); + } + + VEC_free (gimple, heap, phis); } - VEC_free (gimple, heap, phis); -} + VEC_free (tree, heap, scalar_results); + VEC_free (gimple, heap, new_phis); +} /* Function vectorizable_reduction. @@ -3489,7 +3642,7 @@ vect_finalize_reduction: bool vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, - gimple *vec_stmt) + gimple *vec_stmt, slp_tree slp_node) { tree vec_dest; tree scalar_dest; @@ -3517,7 +3670,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, int ncopies; int epilog_copies; stmt_vec_info prev_stmt_info, prev_phi_info; - gimple first_phi = NULL; bool single_defuse_cycle = false; tree reduc_def = NULL_TREE; gimple new_stmt = NULL; @@ -3532,6 +3684,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, struct loop * def_stmt_loop, *outer_loop = NULL; tree def_arg; gimple def_arg_stmt; + VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; + VEC (gimple, heap) *phis = NULL; + int vec_num; + tree def0, def1; if (nested_in_vect_loop_p (loop, stmt)) { @@ -3540,10 +3696,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, nested_cycle = true; } - /* FORNOW: SLP not supported. */ - if (STMT_SLP_TYPE (stmt_info)) - return false; - /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. 
*/ if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) @@ -3676,9 +3828,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) return false; + if (slp_node) + ncopies = 1; + else + ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + / TYPE_VECTOR_SUBPARTS (vectype_in)); - ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) - / TYPE_VECTOR_SUBPARTS (vectype_in)); gcc_assert (ncopies >= 1); vec_mode = TYPE_MODE (vectype_in); @@ -3897,23 +4052,48 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, prev_stmt_info = NULL; prev_phi_info = NULL; + if (slp_node) + { + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out) + == TYPE_VECTOR_SUBPARTS (vectype_in)); + } + else + { + vec_num = 1; + vec_oprnds0 = VEC_alloc (tree, heap, 1); + if (op_type == ternary_op) + vec_oprnds1 = VEC_alloc (tree, heap, 1); + } + + phis = VEC_alloc (gimple, heap, vec_num); + vect_defs = VEC_alloc (tree, heap, vec_num); + if (!slp_node) + VEC_quick_push (tree, vect_defs, NULL_TREE); + for (j = 0; j < ncopies; j++) { if (j == 0 || !single_defuse_cycle) { - /* Create the reduction-phi that defines the reduction-operand. */ - new_phi = create_phi_node (vec_dest, loop->header); - set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo, - NULL)); - /* Get the vector def for the reduction variable from the phi - node. */ - reduc_def = PHI_RESULT (new_phi); - } + for (i = 0; i < vec_num; i++) + { + /* Create the reduction-phi that defines the reduction + operand. */ + new_phi = create_phi_node (vec_dest, loop->header); + set_vinfo_for_stmt (new_phi, + new_stmt_vec_info (new_phi, loop_vinfo, + NULL)); + if (j == 0 || slp_node) + VEC_quick_push (gimple, phis, new_phi); + } + } if (code == COND_EXPR) { - first_phi = new_phi; - vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index); + gcc_assert (!slp_node); + vectorizable_condition (stmt, gsi, vec_stmt, + PHI_RESULT (VEC_index (gimple, phis, 0)), + reduc_index); /* Multiple types are not supported for condition. */ break; } @@ -3921,65 +4101,94 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, /* Handle uses. */ if (j == 0) { - loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], - stmt, NULL); - if (op_type == ternary_op) + if (slp_node) + vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); + else { - if (reduc_index == 0) - loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, - NULL); - else - loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, - NULL); + loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], + stmt, NULL); + VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); + if (op_type == ternary_op) + { + if (reduc_index == 0) + loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, + NULL); + else + loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, + NULL); + + VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); + } } - - /* Get the vector def for the reduction variable from the phi - node. 
*/ - first_phi = new_phi; } else { - enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ - loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); - if (op_type == ternary_op) - loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); + if (!slp_node) + { + enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ + loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); + VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0); + if (op_type == ternary_op) + { + loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, + loop_vec_def1); + VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1); + } + } - if (single_defuse_cycle) - reduc_def = gimple_assign_lhs (new_stmt); - else - reduc_def = PHI_RESULT (new_phi); + if (single_defuse_cycle) + reduc_def = gimple_assign_lhs (new_stmt); - STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; + STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; } - /* Arguments are ready. Create the new vector stmt. */ - if (op_type == binary_op) + for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++) { - if (reduc_index == 0) - expr = build2 (code, vectype_out, reduc_def, loop_vec_def0); + if (slp_node) + reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); else - expr = build2 (code, vectype_out, loop_vec_def0, reduc_def); - } - else - { - if (reduc_index == 0) - expr = build3 (code, vectype_out, reduc_def, loop_vec_def0, - loop_vec_def1); + { + if (!single_defuse_cycle || j == 0) + reduc_def = PHI_RESULT (new_phi); + } + + def1 = ((op_type == ternary_op) + ? VEC_index (tree, vec_oprnds1, i) : NULL); + if (op_type == binary_op) + { + if (reduc_index == 0) + expr = build2 (code, vectype_out, reduc_def, def0); + else + expr = build2 (code, vectype_out, def0, reduc_def); + } else { - if (reduc_index == 1) - expr = build3 (code, vectype_out, loop_vec_def0, reduc_def, - loop_vec_def1); + if (reduc_index == 0) + expr = build3 (code, vectype_out, reduc_def, def0, def1); else - expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1, - reduc_def); + { + if (reduc_index == 1) + expr = build3 (code, vectype_out, def0, reduc_def, def1); + else + expr = build3 (code, vectype_out, def0, def1, reduc_def); + } + } + + new_stmt = gimple_build_assign (vec_dest, expr); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + if (slp_node) + { + VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); + VEC_quick_push (tree, vect_defs, new_temp); } + else + VEC_replace (tree, vect_defs, 0, new_temp); } - new_stmt = gimple_build_assign (vec_dest, expr); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (stmt, new_stmt, gsi); + if (slp_node) + continue; if (j == 0) STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; @@ -3992,12 +4201,21 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, /* Finalize the reduction-phi (set its arguments) and create the epilog reduction code. 
*/ - if (!single_defuse_cycle || code == COND_EXPR) - new_temp = gimple_assign_lhs (*vec_stmt); + if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) + { + new_temp = gimple_assign_lhs (*vec_stmt); + VEC_replace (tree, vect_defs, 0, new_temp); + } + + vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies, + epilog_reduc_code, phis, reduc_index, + double_reduc, slp_node); + + VEC_free (gimple, heap, phis); + VEC_free (tree, heap, vec_oprnds0); + if (vec_oprnds1) + VEC_free (tree, heap, vec_oprnds1); - vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies, - epilog_reduc_code, first_phi, reduc_index, - double_reduc); return true; } diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index f1b6355025f..99a865fee20 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -29,8 +29,6 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "basic-block.h" #include "diagnostic.h" -#include "tree-pretty-print.h" -#include "gimple-pretty-print.h" #include "tree-flow.h" #include "tree-dump.h" #include "cfgloop.h" @@ -346,19 +344,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); } - /* Fail to vectorize statements marked as unvectorizable. */ - if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt))) - { - if (vect_print_dump_info (REPORT_SLP)) - { - fprintf (vect_dump, - "Build SLP failed: unvectorizable statement "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } - - return false; - } - lhs = gimple_get_lhs (stmt); if (lhs == NULL_TREE) { @@ -1228,7 +1213,7 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) } /* Find SLP sequences starting from groups of reductions. */ - if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)) > 1 + if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)) && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, VEC_index (gimple, reductions, 0))) ok = true; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 4868f73e684..988749b792f 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt, slp_tree slp_node) { if (slp_node) - vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1); + vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1); else { tree vec_oprnd; @@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi, { /* Handle uses. */ if (slp_node) - vect_get_slp_defs (slp_node, &vec_oprnds0, NULL); + vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1); else { VEC_free (tree, heap, vec_oprnds0); @@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi, if (j == 0) { if (slp_node) - vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1); + vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); else { vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); @@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, if (slp) { /* Get vectorized arguments for SLP_NODE. 
*/ - vect_get_slp_defs (slp_node, &vec_oprnds, NULL); + vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1); vec_oprnd = VEC_index (tree, vec_oprnds, 0); } @@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node) || vectorizable_load (stmt, NULL, NULL, NULL, NULL) || vectorizable_call (stmt, NULL, NULL) || vectorizable_store (stmt, NULL, NULL, NULL) - || vectorizable_reduction (stmt, NULL, NULL) + || vectorizable_reduction (stmt, NULL, NULL, NULL) || vectorizable_condition (stmt, NULL, NULL, NULL, 0)); else { @@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi, break; case reduc_vec_info_type: - gcc_assert (!slp_node); - done = vectorizable_reduction (stmt, gsi, &vec_stmt); + done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node); gcc_assert (done); break; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 52b2a7ec59f..bd43a4bc173 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -242,6 +242,9 @@ typedef struct _loop_vec_info { /* The unrolling factor needed to SLP the loop. In case of that pure SLP is applied to the loop, i.e., no unrolling is needed, this is 1. */ unsigned slp_unrolling_factor; + + /* Reduction cycles detected in the loop. Used in loop-aware SLP. */ + VEC (gimple, heap) *reductions; } *loop_vec_info; /* Access Functions. */ @@ -266,6 +269,7 @@ typedef struct _loop_vec_info { #define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor +#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ VEC_length (gimple, (L)->may_misalign_stmts) > 0 @@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info); extern loop_vec_info vect_analyze_loop_form (struct loop *); extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *, gimple *); -extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *); +extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *, + slp_tree); extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *); extern int vect_estimate_min_profitable_iters (loop_vec_info); extern tree get_initial_def_for_reduction (gimple, tree, tree *); @@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info); extern void vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **, - VEC (tree,heap) **); + VEC (tree,heap) **, int); extern LOC find_bb_location (basic_block); extern bb_vec_info vect_slp_analyze_bb (basic_block); extern void vect_slp_transform_bb (basic_block);
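
For illustration, the kind of loop this change targets is a group of
interleaved reductions, vectorized together as a single SLP instance.
The sketch below is in the spirit of the new gcc.dg/vect/pr37027.c
testcase, not its exact contents; the function, struct and constant
names here are made up:

/* Two interleaved reductions form one SLP group.  With this patch the
   vectorizer can keep both accumulators in a single vector register:
   the two += statements are collected in LOOP_VINFO_REDUCTIONS by
   vect_analyze_scalar_cycles_1 and grouped into an SLP instance by
   vect_analyze_slp_instance.  */

#define N 64

struct s
{
  int a;
  int b;
};

int
sum_fields (struct s *p)
{
  int sum_a = 0, sum_b = 0;
  int i;

  for (i = 0; i < N; i++)
    {
      sum_a += p[i].a;
      sum_b += p[i].b;
    }

  /* The loop epilogue extracts one scalar result per group member and
     replaces the scalar loop-exit phis of sum_a and sum_b with the
     extracted values (see vect_create_epilog_for_reduction above).  */
  return sum_a - sum_b;
}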
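
The epilogue for such a group can use neither the direct vector
reduction (reduc_code) nor the whole-vector-shift scheme, because each
group member needs its own scalar result; the patch therefore takes
scheme 3 (scalar extraction) whenever SLP_NODE is set.  A minimal C
model of that epilogue, assuming a single accumulator vector of NUNITS
lanes whose lanes cycle through GROUP_SIZE group members; NUNITS,
GROUP_SIZE and slp_reduction_epilogue are illustrative names, not GCC
APIs:

#define NUNITS 8
#define GROUP_SIZE 2

static void
slp_reduction_epilogue (const int vec_accum[NUNITS],
                        int scalar_results[GROUP_SIZE])
{
  int k;

  /* Step 2 of the epilogue: s' = extract_field for the first
     GROUP_SIZE lanes.  */
  for (k = 0; k < GROUP_SIZE; k++)
    scalar_results[k] = vec_accum[k];

  /* SLP unrolling left more lanes than group members: combine lane k
     into slot k % GROUP_SIZE, mirroring the "Reduce multiple scalar
     results in case of SLP unrolling" loop in
     vect_create_epilog_for_reduction.  */
  for (k = GROUP_SIZE; k < NUNITS; k++)
    scalar_results[k % GROUP_SIZE] += vec_accum[k];
}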