X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Ftree-vect-transform.c;h=1d23f14d1ff35e5b5456e8a08289305ece89ee28;hb=2934c6b590ded021f96a83dbb8b1100acedde68c;hp=bbac6fed52ed52ee52e7df0135ca6193c9492a18;hpb=b35070fead178a918238436d8f6a4d9fc11a0acf;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index bbac6fed52e..1d23f14d1ff 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -1,5 +1,5 @@ /* Transformation Utilities for Loop Vectorization. - Copyright (C) 2003,2004,2005 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. @@ -59,6 +59,7 @@ static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); +static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); /* Utility function dealing with loop peeling (not peeling itself). */ @@ -299,12 +300,12 @@ vect_create_data_ref_ptr (tree stmt, tag = DR_MEMTAG (dr); gcc_assert (tag); - /* If tag is a variable (and NOT_A_TAG) than a new type alias + /* If tag is a variable (and NOT_A_TAG) than a new symbol memory tag must be created with tag added to its may alias list. */ - if (var_ann (tag)->mem_tag_kind == NOT_A_TAG) + if (!MTAG_P (tag)) new_type_alias (vect_ptr, tag); else - var_ann (vect_ptr)->type_mem_tag = tag; + var_ann (vect_ptr)->symbol_mem_tag = tag; var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); @@ -656,6 +657,8 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) switch (code) { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: case PLUS_EXPR: if (INTEGRAL_TYPE_P (type)) def = build_int_cst (type, 0); @@ -711,66 +714,66 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) } -/* Function vect_create_epilog_for_reduction: +/* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction - computation. + computation. - LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it - into a single result, by applying the operation REDUC_CODE on the - partial-results-vector. For this, we need to create a new phi node at the - loop exit to preserve loop-closed form, as illustrated below. - - STMT is the original scalar reduction stmt that is being vectorized. - REDUCTION_OP is the scalar reduction-variable. + VECT_DEF is a vector of partial results. + REDUC_CODE is the tree-code for the epilog reduction. + STMT is the scalar reduction stmt that is being vectorized. REDUCTION_PHI is the phi-node that carries the reduction computation. - This function also sets the arguments for the REDUCTION_PHI: - The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP. - The loop-latch argument is VECT_DEF - the vector of partial sums. - This function transforms this: + This function: + 1. Creates the reduction def-use cycle: sets the the arguments for + REDUCTION_PHI: + The loop-entry argument is the vectorized initial-value of the reduction. + The loop-latch argument is VECT_DEF - the vector of partial sums. + 2. "Reduces" the vector of partial results VECT_DEF into a single result, + by applying the operation specified by REDUC_CODE if available, or by + other means (whole-vector shifts or a scalar loop). + The function also creates a new phi node at the loop exit to preserve + loop-closed form, as illustrated below. + + The flow at the entry to this function: loop: - vec_def = phi # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi # EXIT_PHI - + s_out0 = phi # (scalar) EXIT_PHI use use - Into: + The above is transformed by this function into: loop: - vec_def = phi # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi # EXIT_PHI - v_out1 = phi # NEW_EXIT_PHI - - v_out2 = reduc_expr + s_out0 = phi # (scalar) EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce s_out3 = extract_field - - use - use + s_out4 = adjust_result + use + use */ static void -vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, +vect_create_epilog_for_reduction (tree vect_def, tree stmt, enum tree_code reduc_code, tree reduction_phi) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - enum machine_mode mode = TYPE_MODE (vectype); + tree vectype; + enum machine_mode mode; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block exit_bb; - tree scalar_dest = TREE_OPERAND (stmt, 0); - tree scalar_type = TREE_TYPE (scalar_dest); + tree scalar_dest; + tree scalar_type; tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; @@ -786,7 +789,16 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, imm_use_iterator imm_iter; use_operand_p use_p; bool extract_scalar_result; + tree reduction_op; + tree orig_stmt; + tree operation = TREE_OPERAND (stmt, 1); + int op_type; + op_type = TREE_CODE_LENGTH (TREE_CODE (operation)); + reduction_op = TREE_OPERAND (operation, op_type-1); + vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); + mode = TYPE_MODE (vectype); + /*** 1. Create the reduction def-use cycle ***/ /* 1.1 set the loop-entry arg of the reduction-phi: */ @@ -797,7 +809,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, &scalar_initial_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); - /* 1.2 set the loop-latch arg for the reduction-phi: */ add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop)); @@ -810,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, } - /*** 2. Create epilog code ***/ + /*** 2. Create epilog code + The reduction epilog code operates across the elements of the vector + of partial results computed by the vectorized loop. + The reduction epilog code consists of: + step 1: compute the scalar result in a vector (v_out2) + step 2: extract the scalar result (s_out3) from the vector (v_out2) + step 3: adjust the scalar result (s_out3) if needed. + + Step 1 can be accomplished using one the following three schemes: + (scheme 1) using reduc_code, if available. + (scheme 2) using whole-vector shifts, if available. + (scheme 3) using a scalar loop. In this case steps 1+2 above are + combined. + + The overall epilog code looks like this: + + s_out0 = phi # original EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce # step 1 + s_out3 = extract_field # step 2 + s_out4 = adjust_result # step 3 + + (step 3 is optional, and step2 1 and 2 may be combined). + Lastly, the uses of s_out0 are replaced by s_out4. + + ***/ /* 2.1 Create new loop-exit-phi to preserve loop-closed form: v_out1 = phi */ @@ -818,15 +854,39 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, exit_bb = loop->single_exit->dest; new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def); - exit_bsi = bsi_start (exit_bb); - + /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 + (i.e. when reduc_code is not available) and in the final adjustment code + (if needed). Also get the original scalar reduction variable as + defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it + represents a reduction pattern), the tree-code and scalar-def are + taken from the original stmt that the pattern-stmt (STMT) replaces. + Otherwise (it is a regular reduction) - the tree-code and scalar-def + are taken from STMT. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (!orig_stmt) + { + /* Regular reduction */ + orig_stmt = stmt; + } + else + { + /* Reduction pattern */ + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + } + code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + scalar_dest = TREE_OPERAND (orig_stmt, 0); + scalar_type = TREE_TYPE (scalar_dest); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); bytesize = TYPE_SIZE_UNIT (scalar_type); - /* 2.2 Create the reduction code. */ + /* 2.3 Create the reduction code, using one of the three schemes described + above. */ if (reduc_code < NUM_TREE_CODES) { @@ -849,16 +909,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { enum tree_code shift_code = 0; bool have_whole_vector_shift = true; - enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */ int bit_offset; int element_bitsize = tree_low_cst (bitsize, 1); int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); tree vec_temp; - /* The result of the reduction is expected to be at the least - significant bits of the vector. This is merely convention, - as it's the extraction later that really matters, and that - is also under our control. */ if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) shift_code = VEC_RSHIFT_EXPR; else @@ -881,7 +936,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (have_whole_vector_shift) { - /*** Case 2: + /*** Case 2: Create: for (offset = VS/2; offset >= element_size; offset/=2) { Create: va' = vec_shift @@ -905,17 +960,12 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, build2 (code, vectype, new_name, new_temp)); new_temp = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = true; @@ -924,10 +974,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { tree rhs; - /*** Case 3: - Create: + /*** Case 3: Create: s = extract_field - for (offset=element_size; offset Create: s = op @@ -938,18 +989,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, vec_temp = PHI_RESULT (new_phi); vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitsize_zero_node); - BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - rhs); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); for (bit_offset = element_bitsize; bit_offset < vec_size_in_bits; @@ -965,25 +1011,19 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, build2 (code, scalar_type, new_name, new_temp)); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = false; } } - - /* 2.3 Extract the final scalar result. Create: + /* 2.4 Extract the final scalar result. Create: s_out3 = extract_field */ if (extract_scalar_result) @@ -993,8 +1033,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "extract scalar result"); - /* The result is in the low order bits. */ - if (BITS_BIG_ENDIAN) + if (BYTES_BIG_ENDIAN) bitpos = size_binop (MULT_EXPR, bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), TYPE_SIZE (scalar_type)); @@ -1007,17 +1046,14 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } - /* 2.4 Adjust the final result by the initial value of the reduction - variable. (when such adjustment is not needed, then + variable. (When such adjustment is not needed, then 'scalar_initial_def' is zero). Create: - s_out = scalar_expr */ + s_out4 = scalar_expr */ if (scalar_initial_def) { @@ -1026,18 +1062,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } + /* 2.6 Replace uses of s_out0 with uses of s_out3 */ - /* 2.5 Replace uses of s_out0 with uses of s_out3 */ - - /* Find the loop-closed-use at the loop exit of the original - scalar result. (The reduction result is expected to have - two immediate uses - one at the latch block, and one at the - loop exit). */ + /* Find the loop-closed-use at the loop exit of the original scalar result. + (The reduction result is expected to have two immediate uses - one at the + latch block, and one at the loop exit). */ exit_phi = NULL; FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) { @@ -1047,9 +1078,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, break; } } - + /* We expect to have found an exit_phi because of loop-closed-ssa form. */ + gcc_assert (exit_phi); + /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name) SET_USE (use_p, new_temp); } @@ -1060,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, Check if STMT performs a reduction operation that can be vectorized. If VEC_STMT is also passed, vectorize the STMT: create a vectorized stmt to replace it, put it in VEC_STMT, and insert it at BSI. - Return FALSE if not a vectorizable STMT, TRUE otherwise. */ + Return FALSE if not a vectorizable STMT, TRUE otherwise. + + This function also handles reduction idioms (patterns) that have been + recognized in advance during vect_pattern_recog. In this case, STMT may be + of this form: + X = pattern_expr (arg0, arg1, ..., X) + and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original + sequence that had been detected and replaced by the pattern-stmt (STMT). + + In some cases of reduction patterns, the type of the reduction variable X is + different than the type of the other arguments of STMT. + In such cases, the vectype that is used when transforming STMT into a vector + stmt is different than the vectype that is used to determine the + vectorization factor, because it consists of a different number of elements + than the actual number of elements that are being operated upon in parallel. + + For example, consider an accumulation of shorts into an int accumulator. + On some targets it's possible to vectorize this pattern operating on 8 + shorts at a time (hence, the vectype for purposes of determining the + vectorization factor should be V8HI); on the other hand, the vectype that + is used to create the vector form is actually V4SI (the type of the result). + + Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that + indicates what is the actual level of parallelism (V8HI in the example), so + that the right vectorization factor would be derived. This vectype + corresponds to the type of arguments to the reduction stmt, and should *NOT* + be used to create the vectorized stmt. The right vectype for the vectorized + stmt is obtained from the type of the result X: + get_vectype_for_scalar_type (TREE_TYPE (X)) + + This means that, contrary to "regular" reductions (or "regular" stmts in + general), the following equation: + STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) + does *NOT* necessarily hold for reduction patterns. */ bool vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) { tree vec_dest; tree scalar_dest; - tree op0, op1; - tree loop_vec_def; + tree op; + tree loop_vec_def0, loop_vec_def1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree operation; - enum tree_code code, reduc_code = 0; + enum tree_code code, orig_code, epilog_reduc_code = 0; enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; tree new_temp; - tree def0, def1, def_stmt0, def_stmt1; - enum vect_def_type dt0, dt1; + tree def, def_stmt; + enum vect_def_type dt; tree new_phi; tree scalar_type; - bool is_simple_use0; - bool is_simple_use1; + bool is_simple_use; + tree orig_stmt; + stmt_vec_info orig_stmt_info; + tree expr = NULL_TREE; + int i; - /* Is vectorizable reduction? */ + /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -1095,43 +1163,68 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_LIVE_P (stmt_info)) return false; - /* Make sure it was already recognized as a reduction pattern. */ + /* Make sure it was already recognized as a reduction computation. */ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) return false; + /* 2. Has this been recognized as a reduction pattern? + + Check if STMT represents a pattern that has been recognized + in earlier analysis stages. For stmts that represent a pattern, + the STMT_VINFO_RELATED_STMT field records the last stmt in + the original sequence that constitutes the pattern. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt) + { + orig_stmt_info = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); + gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); + } + + /* 3. Check the operands of the operation. The first operands are defined + inside the loop body. The last operand is the reduction variable, + which is defined by the loop-header-phi. */ + gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR); operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); op_type = TREE_CODE_LENGTH (code); - if (op_type != binary_op) + if (op_type != binary_op && op_type != ternary_op) return false; - - op0 = TREE_OPERAND (operation, 0); - op1 = TREE_OPERAND (operation, 1); scalar_dest = TREE_OPERAND (stmt, 0); scalar_type = TREE_TYPE (scalar_dest); - /* Check the first operand. It is expected to be defined inside the loop. */ - is_simple_use0 = - vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0); - is_simple_use1 = - vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1); - - gcc_assert (is_simple_use0); - gcc_assert (is_simple_use1); - gcc_assert (dt0 == vect_loop_def); - gcc_assert (dt1 == vect_reduction_def); - gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE); - gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1)); + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. */ + for (i = 0; i < op_type-1; i++) + { + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_loop_def || dt == vect_invariant_def || + dt == vect_constant_def); + } - if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1))) - return false; + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_reduction_def); + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + if (orig_stmt) + gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); + else + gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); + + if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) + return false; - /* Supportable by target? */ + /* 4. Supportable by target? */ - /* check support for the operation in the loop */ + /* 4.1. check support for the operation in the loop */ optab = optab_for_tree_code (code, vectype); if (!optab) { @@ -1162,21 +1255,69 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } - /* check support for the epilog operation */ - if (!reduction_code_for_scalar_code (code, &reduc_code)) + /* 4.2. Check support for the epilog operation. + + If STMT represents a reduction pattern, then the type of the + reduction variable may be different than the type of the rest + of the arguments. For example, consider the case of accumulation + of shorts into an int accumulator; The original code: + S1: int_a = (int) short_a; + orig_stmt-> S2: int_acc = plus ; + + was replaced with: + STMT: int_acc = widen_sum + + This means that: + 1. The tree-code that is used to create the vector operation in the + epilog code (that reduces the partial results) is not the + tree-code of STMT, but is rather the tree-code of the original + stmt from the pattern that STMT is replacing. I.e, in the example + above we want to use 'widen_sum' in the loop, but 'plus' in the + epilog. + 2. The type (mode) we use to check available target support + for the vector operation to be created in the *epilog*, is + determined by the type of the reduction variable (in the example + above we'd check this: plus_optab[vect_int_mode]). + However the type (mode) we use to check available target support + for the vector operation to be created *inside the loop*, is + determined by the type of the other arguments to STMT (in the + example we'd check this: widen_sum_optab[vect_short_mode]). + + This is contrary to "regular" reductions, in which the types of all + the arguments are the same as the type of the reduction variable. + For "regular" reductions we can therefore use the same vector type + (and also the same tree-code) when generating the epilog code and + when generating the code inside the loop. */ + + if (orig_stmt) + { + /* This is a reduction pattern: get the vectype from the type of the + reduction variable, and get the tree-code from orig_stmt. */ + orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); + vec_mode = TYPE_MODE (vectype); + } + else + { + /* Regular reduction: use the same vectype and tree-code as used for + the vector code inside the loop can be used for the epilog code. */ + orig_code = code; + } + + if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) return false; - reduc_optab = optab_for_tree_code (reduc_code, vectype); + reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); if (!reduc_optab) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab for reduction."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "reduc op not supported by target."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (!vec_stmt) /* transformation not required. */ @@ -1193,25 +1334,31 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Create the destination vector */ vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); - /* Prepare the operand that is defined inside the loop body */ - loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL); + op = TREE_OPERAND (operation, 0); + loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); + if (op_type == binary_op) + expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi)); + else if (op_type == ternary_op) + { + op = TREE_OPERAND (operation, 1); + loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); + expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, + PHI_RESULT (new_phi)); + } /* Create the vectorized operation that computes the partial results */ - *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi))); + *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); TREE_OPERAND (*vec_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, *vec_stmt, bsi); - /* Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. */ - vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi); + vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } @@ -1781,7 +1928,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) the value of the parameter and no global variables are touched which makes the builtin a "const" function. Requiring the builtin to have the "const" attribute makes it unnecessary - to call mark_call_clobbered_vars_to_rename. */ + to call mark_call_clobbered. */ gcc_assert (TREE_READONLY (builtin_decl)); } else @@ -2019,8 +2166,8 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Arguments are ready. create the new vector stmt. */ vec_compare = build2 (TREE_CODE (cond_expr), vectype, vec_cond_lhs, vec_cond_rhs); - vec_cond_expr = build (VEC_COND_EXPR, vectype, - vec_compare, vec_then_clause, vec_else_clause); + vec_cond_expr = build3 (VEC_COND_EXPR, vectype, + vec_compare, vec_then_clause, vec_else_clause); *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_cond_expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); @@ -2040,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) bool is_store = false; tree vec_stmt = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree orig_stmt_in_pattern; bool done; if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -2078,7 +2226,25 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) gcc_unreachable (); } + gcc_assert (vec_stmt); STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; + orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt_in_pattern) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + + /* STMT was inserted by the vectorizer to replace a computation + idiom. ORIG_STMT_IN_PATTERN is a stmt in the original + sequence that computed this idiom. We need to record a pointer + to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. See more + detail in the documentation of vect_pattern_recog. */ + + STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; + } + } } if (STMT_VINFO_LIVE_P (stmt_info)) @@ -2603,16 +2769,14 @@ static void vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) { unsigned int i; - varray_type datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); + VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); + struct data_reference *dr; if (vect_dump && (dump_flags & TDF_DETAILS)) fprintf (vect_dump, "=== vect_update_inits_of_dr ==="); - for (i = 0; i < VARRAY_ACTIVE_SIZE (datarefs); i++) - { - struct data_reference *dr = VARRAY_GENERIC_PTR (datarefs, i); - vect_update_init_of_dr (dr, niters); - } + for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) + vect_update_init_of_dr (dr, niters); } @@ -2779,7 +2943,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, append_to_statement_list_force (and_stmt, cond_expr_stmt_list); /* Make and_tmp the left operand of the conditional test against zero. - if and_tmp has a non-zero bit then some address is unaligned. */ + if and_tmp has a nonzero bit then some address is unaligned. */ ptrsize_zero = build_int_cst (int_ptrsize_type, 0); return build2 (EQ_EXPR, boolean_type_node, and_tmp_name, ptrsize_zero); @@ -2822,12 +2986,44 @@ vect_transform_loop (loop_vec_info loop_vinfo, tree cond_expr_stmt_list = NULL_TREE; basic_block condition_bb; block_stmt_iterator cond_exp_bsi; + basic_block merge_bb; + basic_block new_exit_bb; + edge new_exit_e, e; + tree orig_phi, new_phi, arg; cond_expr = vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list); initialize_original_copy_tables (); nloop = loop_version (loops, loop, cond_expr, &condition_bb, true); free_original_copy_tables(); + + /** Loop versioning violates an assumption we try to maintain during + vectorization - that the loop exit block has a single predecessor. + After versioning, the exit block of both loop versions is the same + basic block (i.e. it has two predecessors). Just in order to simplify + following transformations in the vectorizer, we fix this situation + here by adding a new (empty) block on the exit-edge of the loop, + with the proper loop-exit phis to maintain loop-closed-form. **/ + + merge_bb = loop->single_exit->dest; + gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); + new_exit_bb = split_edge (loop->single_exit); + add_bb_to_loop (new_exit_bb, loop->outer); + new_exit_e = loop->single_exit; + e = EDGE_SUCC (new_exit_bb, 0); + + for (orig_phi = phi_nodes (merge_bb); orig_phi; + orig_phi = PHI_CHAIN (orig_phi)) + { + new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)), + new_exit_bb); + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); + add_phi_arg (new_phi, arg, new_exit_e); + SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi)); + } + + /** end loop-exit-fixes after versioning **/ + update_ssa (TODO_update_ssa); cond_exp_bsi = bsi_last (condition_bb); bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT); @@ -2912,7 +3108,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, stmt_ann_t ann = stmt_ann (stmt); free (stmt_info); set_stmt_info ((tree_ann_t)ann, NULL); - bsi_remove (&si); + bsi_remove (&si, true); continue; }