X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Ftree-vect-transform.c;h=db0573ce6f5cf8e5f25f50b3a97348bd20e9a272;hb=57d5535b031f85ada1a22059d8215cb349aa0e97;hp=2b4d1d774af4f381f6b1023bfaeb08db6edfed08;hpb=ea8f3370b1692d96d1289a6ed7a757fd5b0685bd;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 2b4d1d774af..db0573ce6f5 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -1,5 +1,5 @@ /* Transformation Utilities for Loop Vectorization. - Copyright (C) 2003,2004,2005 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. @@ -16,8 +16,8 @@ for more details. You should have received a copy of the GNU General Public License along with GCC; see the file COPYING. If not, write to the Free -Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. */ +Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. */ #include "config.h" #include "system.h" @@ -35,6 +35,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include "cfgloop.h" #include "expr.h" #include "optabs.h" +#include "recog.h" #include "tree-data-ref.h" #include "tree-chrec.h" #include "tree-scalar-evolution.h" @@ -50,7 +51,6 @@ static void vect_align_data_ref (tree); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr (tree, block_stmt_iterator *, tree, tree *, bool); -static tree vect_create_index_for_vector_ref (loop_vec_info); static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); @@ -59,6 +59,7 @@ static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); +static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); /* Utility function dealing with loop peeling (not peeling itself). */ @@ -72,6 +73,7 @@ static void vect_update_inits_of_drs (loop_vec_info, tree); static void vect_do_peeling_for_alignment (loop_vec_info, struct loops *); static void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, struct loops *); +static int vect_min_worthwhile_factor (enum tree_code); /* Function vect_get_new_vect_var. @@ -111,58 +113,6 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) } -/* Function vect_create_index_for_vector_ref. - - Create (and return) an index variable, along with it's update chain in the - loop. This variable will be used to access a memory location in a vector - operation. - - Input: - LOOP: The loop being vectorized. - BSI: The block_stmt_iterator where STMT is. Any new stmts created by this - function can be added here, or in the loop pre-header. - - Output: - Return an index that will be used to index a vector array. It is expected - that a pointer to the first vector will be used as the base address for the - indexed reference. - - FORNOW: we are not trying to be efficient, just creating a new index each - time from scratch. At this time all vector references could use the same - index. - - TODO: create only one index to be used by all vector references. 
Record - the index in the LOOP_VINFO the first time this procedure is called and - return it on subsequent calls. The increment of this index must be placed - just before the conditional expression that ends the single block loop. */ - -static tree -vect_create_index_for_vector_ref (loop_vec_info loop_vinfo) -{ - tree init, step; - block_stmt_iterator incr_bsi; - bool insert_after; - tree indx_before_incr, indx_after_incr; - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - tree incr; - - /* It is assumed that the base pointer used for vectorized access contains - the address of the first vector. Therefore the index used for vectorized - access must be initialized to zero and incremented by 1. */ - - init = integer_zero_node; - step = integer_one_node; - - standard_iv_increment_position (loop, &incr_bsi, &insert_after); - create_iv (init, step, NULL_TREE, loop, &incr_bsi, insert_after, - &indx_before_incr, &indx_after_incr); - incr = bsi_stmt (incr_bsi); - set_stmt_info ((tree_ann_t)stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo)); - - return indx_before_incr; -} - - /* Function vect_create_addr_base_for_vector_ref. Create an expression that computes the address of the first memory location @@ -188,8 +138,7 @@ vect_create_addr_base_for_vector_ref (tree stmt, { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree data_ref_base = - unshare_expr (STMT_VINFO_VECT_DR_BASE_ADDRESS (stmt_info)); + tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); tree base_name = build_fold_indirect_ref (data_ref_base); tree ref = DR_REF (dr); tree scalar_type = TREE_TYPE (ref); @@ -198,9 +147,11 @@ vect_create_addr_base_for_vector_ref (tree stmt, tree new_temp; tree addr_base, addr_expr; tree dest, new_stmt; - tree base_offset = unshare_expr (STMT_VINFO_VECT_INIT_OFFSET (stmt_info)); + tree base_offset = unshare_expr (DR_OFFSET (dr)); + tree init = unshare_expr (DR_INIT (dr)); /* Create base_offset */ + base_offset = size_binop (PLUS_EXPR, base_offset, init); dest = create_tmp_var (TREE_TYPE (base_offset), "base_off"); add_referenced_tmp_var (dest); base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest); @@ -210,17 +161,17 @@ vect_create_addr_base_for_vector_ref (tree stmt, { tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset"); add_referenced_tmp_var (tmp); - offset = fold (build2 (MULT_EXPR, TREE_TYPE (offset), offset, - STMT_VINFO_VECT_STEP (stmt_info))); - base_offset = fold (build2 (PLUS_EXPR, TREE_TYPE (base_offset), - base_offset, offset)); + offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, + DR_STEP (dr)); + base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset), + base_offset, offset); base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp); append_to_statement_list_force (new_stmt, new_stmt_list); } /* base + base_offset */ - addr_base = fold (build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base, - base_offset)); + addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base, + base_offset); /* addr_expr = addr_base */ addr_expr = vect_get_new_vect_var (scalar_ptr_type, vect_pointer_var, @@ -231,7 +182,7 @@ vect_create_addr_base_for_vector_ref (tree stmt, TREE_OPERAND (vec_stmt, 0) = new_temp; append_to_statement_list_force (vec_stmt, new_stmt_list); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "created "); print_generic_expr (vect_dump, vec_stmt, TDF_SLIM); @@ 
-289,24 +240,17 @@ vect_align_data_ref (tree stmt) Return the initial_address in INITIAL_ADDRESS. - 2. Create a data-reference in the loop based on the new vector pointer vp, - and using a new index variable 'idx' as follows: - - vp' = vp + update - - where if ONLY_INIT is true: - update = zero - and otherwise - update = idx + vector_type_size - - Return the pointer vp'. - + 2. If ONLY_INIT is true, return the initial pointer. Otherwise, create + a data-reference in the loop based on the new vector pointer vp. This + new data reference will by some means be updated each iteration of + the loop. Return the pointer vp'. FORNOW: handle only aligned and consecutive accesses. */ static tree -vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi, tree offset, - tree *initial_address, bool only_init) +vect_create_data_ref_ptr (tree stmt, + block_stmt_iterator *bsi ATTRIBUTE_UNUSED, + tree offset, tree *initial_address, bool only_init) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); @@ -319,22 +263,17 @@ vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi, tree offset, tree new_temp; tree vec_stmt; tree new_stmt_list = NULL_TREE; - tree idx; edge pe = loop_preheader_edge (loop); basic_block new_bb; tree vect_ptr_init; - tree vectype_size; - tree ptr_update; - tree data_ref_ptr; - tree type, tmp, size; + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - base_name = build_fold_indirect_ref (unshare_expr ( - STMT_VINFO_VECT_DR_BASE_ADDRESS (stmt_info))); + base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr))); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { tree data_ref_base = base_name; - fprintf (vect_dump, "create array_ref of type: "); + fprintf (vect_dump, "create vector-pointer variable to type: "); print_generic_expr (vect_dump, vectype, TDF_SLIM); if (TREE_CODE (data_ref_base) == VAR_DECL) fprintf (vect_dump, " vectorizing a one dimensional array ref: "); @@ -356,19 +295,19 @@ vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi, tree offset, /** (2) Add aliasing information to the new vector-pointer: - (The points-to info (SSA_NAME_PTR_INFO) may be defined later.) **/ + (The points-to info (DR_PTR_INFO) may be defined later.) **/ - tag = STMT_VINFO_MEMTAG (stmt_info); + tag = DR_MEMTAG (dr); gcc_assert (tag); /* If tag is a variable (and NOT_A_TAG) than a new type alias tag must be created with tag added to its may alias list. 
*/ - if (var_ann (tag)->mem_tag_kind == NOT_A_TAG) + if (!MTAG_P (tag)) new_type_alias (vect_ptr, tag); else var_ann (vect_ptr)->type_mem_tag = tag; - var_ann (vect_ptr)->subvars = STMT_VINFO_SUBVARS (stmt_info); + var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); /** (3) Calculate the initial address the vector-pointer, and set the vector-pointer to point to it before the loop: **/ @@ -384,11 +323,10 @@ vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi, tree offset, /* Create: p = (vectype *) initial_base */ vec_stmt = fold_convert (vect_ptr_type, new_temp); vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt); - new_temp = make_ssa_name (vect_ptr, vec_stmt); - TREE_OPERAND (vec_stmt, 0) = new_temp; + vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt); + TREE_OPERAND (vec_stmt, 0) = vect_ptr_init; new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt); gcc_assert (!new_bb); - vect_ptr_init = TREE_OPERAND (vec_stmt, 0); /** (4) Handle the updating of the vector-pointer inside the loop: **/ @@ -396,45 +334,37 @@ vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi, tree offset, if (only_init) /* No update in loop is required. */ { /* Copy the points-to information if it exists. */ - if (STMT_VINFO_PTR_INFO (stmt_info)) - duplicate_ssa_name_ptr_info (vect_ptr_init, - STMT_VINFO_PTR_INFO (stmt_info)); + if (DR_PTR_INFO (dr)) + duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); return vect_ptr_init; } + else + { + block_stmt_iterator incr_bsi; + bool insert_after; + tree indx_before_incr, indx_after_incr; + tree incr; + + standard_iv_increment_position (loop, &incr_bsi, &insert_after); + create_iv (vect_ptr_init, + fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)), + NULL_TREE, loop, &incr_bsi, insert_after, + &indx_before_incr, &indx_after_incr); + incr = bsi_stmt (incr_bsi); + set_stmt_info ((tree_ann_t)stmt_ann (incr), + new_stmt_vec_info (incr, loop_vinfo)); - idx = vect_create_index_for_vector_ref (loop_vinfo); - - /* Create: update = idx * vectype_size */ - tmp = create_tmp_var (integer_type_node, "update"); - add_referenced_tmp_var (tmp); - size = TYPE_SIZE (vect_ptr_type); - type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); - ptr_update = create_tmp_var (type, "update"); - add_referenced_tmp_var (ptr_update); - vectype_size = TYPE_SIZE_UNIT (vectype); - vec_stmt = build2 (MULT_EXPR, integer_type_node, idx, vectype_size); - vec_stmt = build2 (MODIFY_EXPR, void_type_node, tmp, vec_stmt); - new_temp = make_ssa_name (tmp, vec_stmt); - TREE_OPERAND (vec_stmt, 0) = new_temp; - bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); - vec_stmt = fold_convert (type, new_temp); - vec_stmt = build2 (MODIFY_EXPR, void_type_node, ptr_update, vec_stmt); - new_temp = make_ssa_name (ptr_update, vec_stmt); - TREE_OPERAND (vec_stmt, 0) = new_temp; - bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); - - /* Create: data_ref_ptr = vect_ptr_init + update */ - vec_stmt = build2 (PLUS_EXPR, vect_ptr_type, vect_ptr_init, new_temp); - vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt); - new_temp = make_ssa_name (vect_ptr, vec_stmt); - TREE_OPERAND (vec_stmt, 0) = new_temp; - bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); - data_ref_ptr = TREE_OPERAND (vec_stmt, 0); + /* Copy the points-to information if it exists. 
*/ + if (DR_PTR_INFO (dr)) + { + duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); + duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); + } + merge_alias_info (vect_ptr_init, indx_before_incr); + merge_alias_info (vect_ptr_init, indx_after_incr); - /* Copy the points-to information if it exists. */ - if (STMT_VINFO_PTR_INFO (stmt_info)) - duplicate_ssa_name_ptr_info (data_ref_ptr, STMT_VINFO_PTR_INFO (stmt_info)); - return data_ref_ptr; + return indx_before_incr; + } } @@ -496,7 +426,7 @@ vect_init_vector (tree stmt, tree vector_var) new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); gcc_assert (!new_bb); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "created new init_stmt: "); print_generic_expr (vect_dump, init_stmt, TDF_SLIM); @@ -538,7 +468,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) enum vect_def_type dt; bool is_simple_use; - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "vect_get_vec_def_for_operand: "); print_generic_expr (vect_dump, op, TDF_SLIM); @@ -546,7 +476,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); gcc_assert (is_simple_use); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { if (def) { @@ -569,7 +499,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) *scalar_def = op; /* Create 'vect_cst_ = {cst,cst,...,cst}' */ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits); for (i = nunits - 1; i >= 0; --i) @@ -587,7 +517,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) *scalar_def = def; /* Create 'vec_inv = {inv,inv,..,inv}' */ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "Create vector_inv."); for (i = nunits - 1; i >= 0; --i) @@ -595,7 +525,8 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) t = tree_cons (NULL_TREE, def, t); } - vec_inv = build_constructor (vectype, t); + /* FIXME: use build_constructor directly. */ + vec_inv = build_constructor_from_list (vectype, t); return vect_init_vector (stmt, vec_inv); } @@ -626,7 +557,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) /* Case 5: operand is defined by loop-header phi - induction. */ case vect_induction_def: { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "induction - unsupported."); internal_error ("no support for induction"); /* FORNOW */ } @@ -646,7 +577,7 @@ vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi) { bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "add new stmt: "); print_generic_expr (vect_dump, vec_stmt, TDF_SLIM); @@ -726,9 +657,14 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) switch (code) { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: case PLUS_EXPR: - def = INTEGRAL_TYPE_P (type) ? 
integer_zero_node :
-                                     build_real (type, dconst0);
+      if (INTEGRAL_TYPE_P (type))
+        def = build_int_cst (type, 0);
+      else
+        def = build_real (type, dconst0);
+
 #ifdef ADJUST_IN_EPILOG
       /* All the 'nunits' elements are set to 0. The final result will be
          adjusted by 'init_val' at the loop epilog.  */
@@ -754,9 +690,7 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
     }
 
   for (i = nelements - 1; i >= 0; --i)
-    {
-      t = tree_cons (NULL_TREE, def, t);
-    }
+    t = tree_cons (NULL_TREE, def, t);
 
   if (nelements == nunits - 1)
     {
@@ -769,90 +703,102 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
   if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
     vec = build_vector (vectype, t);
   else
-    vec = build_constructor (vectype, t);
+    vec = build_constructor_from_list (vectype, t);
 
-  if (need_epilog_adjust)
-    *scalar_def = init_val;
+  if (!need_epilog_adjust)
+    *scalar_def = NULL_TREE;
   else
-    *scalar_def = INTEGRAL_TYPE_P (type) ? integer_zero_node
-                                         : build_real (type, dconst0);
+    *scalar_def = init_val;
+
   return vect_init_vector (stmt, vec);
 }
 
 
-/* Function vect_create_epilog_for_reduction:
+/* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
-   computation.
+   computation.
 
-   LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it
-   into a single result, by applying the operation REDUC_CODE on the
-   partial-results-vector. For this, we need to create a new phi node at the
-   loop exit to preserve loop-closed form, as illustrated below.
-
-   STMT is the original scalar reduction stmt that is being vectorized.
-   REDUCTION_OP is the scalar reduction-variable.
+   VECT_DEF is a vector of partial results.
+   REDUC_CODE is the tree-code for the epilog reduction.
+   STMT is the scalar reduction stmt that is being vectorized.
    REDUCTION_PHI is the phi-node that carries the reduction computation.
-   This function also sets the arguments for the REDUCTION_PHI:
-   The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP.
-   The loop-latch argument is VECT_DEF - the vector of partial sums.
 
-   This function transforms this:
+   This function:
+   1. Creates the reduction def-use cycle: sets the arguments for
+      REDUCTION_PHI:
+      The loop-entry argument is the vectorized initial-value of the reduction.
+      The loop-latch argument is VECT_DEF - the vector of partial sums.
+   2. "Reduces" the vector of partial results VECT_DEF into a single result,
+      by applying the operation specified by REDUC_CODE if available, or by
+      other means (whole-vector shifts or a scalar loop).
+      The function also creates a new phi node at the loop exit to preserve
+      loop-closed form, as illustrated below.
+
+   The flow at the entry to this function:
 
         loop:
-          vec_def = phi <null, VECT_DEF>    # REDUCTION_PHI
-          ....
-          VECT_DEF = ...
-
+          vec_def = phi <null, null>            # REDUCTION_PHI
+          VECT_DEF = vector_stmt                # vectorized form of STMT
+          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
-          s_out0 = phi <s_loop>             # EXIT_PHI
-
+          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
           use <s_out0>
           use <s_out0>
 
-   Into:
+   The above is transformed by this function into:
 
         loop:
-          vec_def = phi <null, VECT_DEF>    # REDUCTION_PHI
-          ....
-          VECT_DEF = ...
-
+          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
+          VECT_DEF = vector_stmt                # vectorized form of STMT
+          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
-          s_out0 = phi <s_loop>             # EXIT_PHI
-          v_out1 = phi <VECT_DEF>           # NEW_EXIT_PHI
-
-          v_out2 = reduc_expr <v_out1>
+          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
+          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
+          v_out2 = reduce <v_out1>
           s_out3 = extract_field <v_out2, 0>
-
-          use <s_out3>
-          use <s_out3>
+          s_out4 = adjust_result <s_out3>
+          use <s_out4>
+          use <s_out4>
 */
 
 static void
-vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
+vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                   enum tree_code reduc_code, tree reduction_phi)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype;
+  enum machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block exit_bb;
-  tree scalar_dest = TREE_OPERAND (stmt, 0);
-  tree scalar_type = TREE_TYPE (scalar_dest);
+  tree scalar_dest;
+  tree scalar_type;
   tree new_phi;
   block_stmt_iterator exit_bsi;
   tree vec_dest;
   tree new_temp;
+  tree new_name;
   tree epilog_stmt;
   tree new_scalar_dest, exit_phi;
-  tree bitsize, bitpos;
+  tree bitsize, bitpos, bytesize;
   enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
   tree scalar_initial_def;
   tree vec_initial_def;
   tree orig_name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
+  bool extract_scalar_result;
+  tree reduction_op;
+  tree orig_stmt;
+  tree operation = TREE_OPERAND (stmt, 1);
+  int op_type;
+
+  op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
+  reduction_op = TREE_OPERAND (operation, op_type-1);
+  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+  mode = TYPE_MODE (vectype);
+
   /*** 1. Create the reduction def-use cycle ***/

   /* 1.1 set the loop-entry arg of the reduction-phi: */
@@ -863,11 +809,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
                                           &scalar_initial_def);
   add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
-
   /* 1.2 set the loop-latch arg for the reduction-phi: */
   add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+  if (vect_print_dump_info (REPORT_DETAILS))
     {
       fprintf (vect_dump, "transform reduction: created def-use cycle:");
       print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
@@ -876,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
     }


-  /*** 2. Create epilog code ***/
+  /*** 2. Create epilog code
+          The reduction epilog code operates across the elements of the vector
+          of partial results computed by the vectorized loop.
+          The reduction epilog code consists of:
+          step 1: compute the scalar result in a vector (v_out2)
+          step 2: extract the scalar result (s_out3) from the vector (v_out2)
+          step 3: adjust the scalar result (s_out3) if needed.
+
+          Step 1 can be accomplished using one of the following three schemes:
+          (scheme 1) using reduc_code, if available.
+          (scheme 2) using whole-vector shifts, if available.
+          (scheme 3) using a scalar loop. In this case steps 1+2 above are
+                     combined.
+
+          The overall epilog code looks like this:
+
+          s_out0 = phi <s_loop>                 # original EXIT_PHI
+          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
+          v_out2 = reduce <v_out1>              # step 1
+          s_out3 = extract_field <v_out2, 0>    # step 2
+          s_out4 = adjust_result <s_out3>       # step 3
+
+          (step 3 is optional, and steps 1 and 2 may be combined).
+          Lastly, the uses of s_out0 are replaced by s_out4.
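+
+          As an illustration only (not part of the patch itself): for a V4SI
+          vector of partial sums {a,b,c,d}, scheme 2 behaves like the
+          following C sketch, where the type v4si and the function name
+          reduce_by_shifts are hypothetical:
+
+            typedef int v4si __attribute__ ((vector_size (16)));
+
+            int
+            reduce_by_shifts (v4si partial_sums)
+            {
+              int *p = (int *) &partial_sums;
+              p[0] += p[2];  p[1] += p[3];  /* shift by VS/2 elements, add  */
+              p[0] += p[1];                 /* shift by 1 element, add      */
+              return p[0];                  /* step 2: extract element 0    */
+            }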
+ + ***/ /* 2.1 Create new loop-exit-phi to preserve loop-closed form: v_out1 = phi */ @@ -884,84 +854,234 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, exit_bb = loop->single_exit->dest; new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def); - exit_bsi = bsi_start (exit_bb); - - /* 2.2 Create: - v_out2 = reduc_expr - s_out3 = extract_field */ - - vec_dest = vect_create_destination_var (scalar_dest, vectype); - epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build1 (reduc_code, vectype, PHI_RESULT (new_phi))); - new_temp = make_ssa_name (vec_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 + (i.e. when reduc_code is not available) and in the final adjusment code + (if needed). Also get the original scalar reduction variable as + defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it + represents a reduction pattern), the tree-code and scalar-def are + taken from the original stmt that the pattern-stmt (STMT) replaces. + Otherwise (it is a regular reduction) - the tree-code and scalar-def + are taken from STMT. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (!orig_stmt) { - fprintf (vect_dump, "transform reduction: created epilog code:"); - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + /* Regular reduction */ + orig_stmt = stmt; } - + else + { + /* Reduction pattern */ + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + } + code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + scalar_dest = TREE_OPERAND (orig_stmt, 0); + scalar_type = TREE_TYPE (scalar_dest); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); + bytesize = TYPE_SIZE_UNIT (scalar_type); + + /* 2.3 Create the reduction code, using one of the three schemes described + above. */ + + if (reduc_code < NUM_TREE_CODES) + { + /*** Case 1: Create: + v_out2 = reduc_expr */ - /* The result is in the low order bits. */ - if (BITS_BIG_ENDIAN) - bitpos = size_binop (MULT_EXPR, - bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), - TYPE_SIZE (scalar_type)); + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using direct vector reduction."); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build1 (reduc_code, vectype, PHI_RESULT (new_phi))); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + extract_scalar_result = true; + } else - bitpos = bitsize_zero_node; + { + enum tree_code shift_code = 0; + bool have_whole_vector_shift = true; + int bit_offset; + int element_bitsize = tree_low_cst (bitsize, 1); + int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + tree vec_temp; + + if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) + shift_code = VEC_RSHIFT_EXPR; + else + have_whole_vector_shift = false; + + /* Regardless of whether we have a whole vector shift, if we're + emulating the operation via tree-vect-generic, we don't want + to use it. 
Only the first round of the reduction is likely + to still be profitable via emulation. */ + /* ??? It might be better to emit a reduction tree code here, so that + tree-vect-generic can expand the first round via bit tricks. */ + if (!VECTOR_MODE_P (mode)) + have_whole_vector_shift = false; + else + { + optab optab = optab_for_tree_code (code, vectype); + if (optab->handlers[mode].insn_code == CODE_FOR_nothing) + have_whole_vector_shift = false; + } + + if (have_whole_vector_shift) + { + /*** Case 2: Create: + for (offset = VS/2; offset >= element_size; offset/=2) + { + Create: va' = vec_shift + Create: va = vop + } */ + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using vector shifts"); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_temp = PHI_RESULT (new_phi); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - build3 (BIT_FIELD_REF, scalar_type, - new_temp, bitsize, bitpos)); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + for (bit_offset = vec_size_in_bits/2; + bit_offset >= element_bitsize; + bit_offset /= 2) + { + tree bitpos = size_int (bit_offset); + + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build2 (shift_code, vectype, new_temp, bitpos)); + new_name = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_name; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build2 (code, vectype, new_name, new_temp)); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + } + + extract_scalar_result = true; + } + else + { + tree rhs; + + /*** Case 3: Create: + s = extract_field + for (offset = element_size; + offset < vector_size; + offset += element_size;) + { + Create: s' = extract_field + Create: s = op + } */ + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using scalar code. 
"); + + vec_temp = PHI_RESULT (new_phi); + vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, + bitsize_zero_node); + BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + for (bit_offset = element_bitsize; + bit_offset < vec_size_in_bits; + bit_offset += element_bitsize) + { + tree bitpos = bitsize_int (bit_offset); + tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, + bitpos); + + BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + rhs); + new_name = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_name; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build2 (code, scalar_type, new_name, new_temp)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + } - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + extract_scalar_result = false; + } + } + /* 2.4 Extract the final scalar result. Create: + s_out3 = extract_field */ - /* 2.3 Adjust the final result by the initial value of the reduction - variable. (when such adjustment is not needed, then - 'scalar_initial_def' is zero). + if (extract_scalar_result) + { + tree rhs; - Create: - s_out = scalar_expr */ + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "extract scalar result"); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - build2 (code, scalar_type, new_temp, scalar_initial_def)); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (BYTES_BIG_ENDIAN) + bitpos = size_binop (MULT_EXPR, + bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), + TYPE_SIZE (scalar_type)); + else + bitpos = bitsize_zero_node; + + rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); + BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + } - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + /* 2.4 Adjust the final result by the initial value of the reduction + variable. (When such adjustment is not needed, then + 'scalar_initial_def' is zero). - - /* 2.4 Replace uses of s_out0 with uses of s_out3 */ + Create: + s_out4 = scalar_expr */ + + if (scalar_initial_def) + { + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build2 (code, scalar_type, new_temp, scalar_initial_def)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + } - /* Find the loop-closed-use at the loop exit of the original - scalar result. 
(The reduction result is expected to have two immediate uses - one at the latch block, and one at the loop exit). */
   exit_phi = NULL;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
     {
       if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
-	{
-	  exit_phi = USE_STMT (use_p);
-	  break;
-	}
+        {
+          exit_phi = USE_STMT (use_p);
+          break;
+        }
     }
-
+  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
+  gcc_assert (exit_phi);
+
   /* Replace the uses:  */
   orig_name = PHI_RESULT (exit_phi);
-
   FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name)
     SET_USE (use_p, new_temp);
 }
 
 
@@ -972,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
    Check if STMT performs a reduction operation that can be vectorized.
    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise. */
+   Return FALSE if not a vectorizable STMT, TRUE otherwise.
+
+   This function also handles reduction idioms (patterns) that have been
+   recognized in advance during vect_pattern_recog. In this case, STMT may be
+   of this form:
+     X = pattern_expr (arg0, arg1, ..., X)
+   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
+   sequence that had been detected and replaced by the pattern-stmt (STMT).
+
+   In some cases of reduction patterns, the type of the reduction variable X is
+   different than the type of the other arguments of STMT.
+   In such cases, the vectype that is used when transforming STMT into a vector
+   stmt is different than the vectype that is used to determine the
+   vectorization factor, because it consists of a different number of elements
+   than the actual number of elements that are being operated upon in parallel.
+
+   For example, consider an accumulation of shorts into an int accumulator.
+   On some targets it's possible to vectorize this pattern operating on 8
+   shorts at a time (hence, the vectype for purposes of determining the
+   vectorization factor should be V8HI); on the other hand, the vectype that
+   is used to create the vector form is actually V4SI (the type of the result).
+
+   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
+   indicates what is the actual level of parallelism (V8HI in the example), so
+   that the right vectorization factor would be derived. This vectype
+   corresponds to the type of arguments to the reduction stmt, and should *NOT*
+   be used to create the vectorized stmt. The right vectype for the vectorized
+   stmt is obtained from the type of the result X:
+     get_vectype_for_scalar_type (TREE_TYPE (X))
+
+   This means that, contrary to "regular" reductions (or "regular" stmts in
+   general), the following equation:
+     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
+   does *NOT* necessarily hold for reduction patterns.
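+
+   As a concrete sketch of the example above (illustrative only, not part of
+   the patch), the scalar source loop being described is:
+
+     short s[N];
+     int acc = 0;
+     for (i = 0; i < N; i++)
+       acc += (int) s[i];   /* becomes: acc = widen_sum <s[i], acc> */
+
+   so the vectorization factor comes from V8HI (eight shorts per vector),
+   while the vectorized stmt itself produces a V4SI result.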
*/ bool vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) { tree vec_dest; tree scalar_dest; - tree op0, op1; - tree loop_vec_def; + tree op; + tree loop_vec_def0, loop_vec_def1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree operation; - enum tree_code code, reduc_code = 0; + enum tree_code code, orig_code, epilog_reduc_code = 0; enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; tree new_temp; - tree def0, def1, def_stmt0, def_stmt1; - enum vect_def_type dt0, dt1; + tree def, def_stmt; + enum vect_def_type dt; tree new_phi; tree scalar_type; - bool is_simple_use0; - bool is_simple_use1; + bool is_simple_use; + tree orig_stmt; + stmt_vec_info orig_stmt_info; + tree expr = NULL_TREE; + int i; - /* Is vectorizable reduction? */ + /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -1007,73 +1163,161 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_LIVE_P (stmt_info)) return false; - /* Make sure it was already recognized as a reduction pattern. */ + /* Make sure it was already recognized as a reduction computation. */ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) return false; + /* 2. Has this been recognized as a reduction pattern? + + Check if STMT represents a pattern that has been recognized + in earlier analysis stages. For stmts that represent a pattern, + the STMT_VINFO_RELATED_STMT field records the last stmt in + the original sequence that constitutes the pattern. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt) + { + orig_stmt_info = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); + gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); + } + + /* 3. Check the operands of the operation. The first operands are defined + inside the loop body. The last operand is the reduction variable, + which is defined by the loop-header-phi. */ + gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR); operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); op_type = TREE_CODE_LENGTH (code); - if (op_type != binary_op) + if (op_type != binary_op && op_type != ternary_op) return false; - - op0 = TREE_OPERAND (operation, 0); - op1 = TREE_OPERAND (operation, 1); scalar_dest = TREE_OPERAND (stmt, 0); scalar_type = TREE_TYPE (scalar_dest); - /* Check the first operand. It is expected to be defined inside the loop. */ - is_simple_use0 = - vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0); - is_simple_use1 = - vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1); - - gcc_assert (is_simple_use0); - gcc_assert (is_simple_use1); - gcc_assert (dt0 == vect_loop_def); - gcc_assert (dt1 == vect_reduction_def); - gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE); - gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1)); + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. 
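+     For example (illustrative), a dot-product reduction stmt has three
+     operands:
+       x_1 = DOT_PROD_EXPR <a_2, b_3, x_0>
+     where a_2 and b_3 are defined inside the loop, and the last operand
+     x_0 is the reduction variable, defined by the loop-header phi.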
*/ + for (i = 0; i < op_type-1; i++) + { + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_loop_def || dt == vect_invariant_def || + dt == vect_constant_def); + } - if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1))) - return false; + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_reduction_def); + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + if (orig_stmt) + gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); + else + gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); + + if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) + return false; - /* Supportable by target? */ + /* 4. Supportable by target? */ - /* check support for the operation in the loop */ + /* 4.1. check support for the operation in the loop */ optab = optab_for_tree_code (code, vectype); if (!optab) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab."); return false; } vec_mode = TYPE_MODE (vectype); if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "op not supported by target."); + if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD + || LOOP_VINFO_VECT_FACTOR (loop_vinfo) + < vect_min_worthwhile_factor (code)) + return false; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "proceeding using word mode."); + } + + /* Worthwhile without SIMD support? */ + if (!VECTOR_MODE_P (TYPE_MODE (vectype)) + && LOOP_VINFO_VECT_FACTOR (loop_vinfo) + < vect_min_worthwhile_factor (code)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "not worthwhile without SIMD support."); return false; } - /* check support for the epilog operation */ - if (!reduction_code_for_scalar_code (code, &reduc_code)) + /* 4.2. Check support for the epilog operation. + + If STMT represents a reduction pattern, then the type of the + reduction variable may be different than the type of the rest + of the arguments. For example, consider the case of accumulation + of shorts into an int accumulator; The original code: + S1: int_a = (int) short_a; + orig_stmt-> S2: int_acc = plus ; + + was replaced with: + STMT: int_acc = widen_sum + + This means that: + 1. The tree-code that is used to create the vector operation in the + epilog code (that reduces the partial results) is not the + tree-code of STMT, but is rather the tree-code of the original + stmt from the pattern that STMT is replacing. I.e, in the example + above we want to use 'widen_sum' in the loop, but 'plus' in the + epilog. + 2. The type (mode) we use to check available target support + for the vector operation to be created in the *epilog*, is + determined by the type of the reduction variable (in the example + above we'd check this: plus_optab[vect_int_mode]). + However the type (mode) we use to check available target support + for the vector operation to be created *inside the loop*, is + determined by the type of the other arguments to STMT (in the + example we'd check this: widen_sum_optab[vect_short_mode]). + + This is contrary to "regular" reductions, in which the types of all + the arguments are the same as the type of the reduction variable. 
+ For "regular" reductions we can therefore use the same vector type + (and also the same tree-code) when generating the epilog code and + when generating the code inside the loop. */ + + if (orig_stmt) + { + /* This is a reduction pattern: get the vectype from the type of the + reduction variable, and get the tree-code from orig_stmt. */ + orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); + vec_mode = TYPE_MODE (vectype); + } + else + { + /* Regular reduction: use the same vectype and tree-code as used for + the vector code inside the loop can be used for the epilog code. */ + orig_code = code; + } + + if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) return false; - reduc_optab = optab_for_tree_code (reduc_code, vectype); + reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); if (!reduc_optab) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab for reduction."); - return false; + epilog_reduc_code = NUM_TREE_CODES; } if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - fprintf (vect_dump, "op not supported by target."); - return false; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "reduc op not supported by target."); + epilog_reduc_code = NUM_TREE_CODES; } if (!vec_stmt) /* transformation not required. */ @@ -1084,33 +1328,37 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /** Transform. **/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform reduction."); /* Create the destination vector */ vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); - /* Prepare the operand that is defined inside the loop body */ - loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL); - gcc_assert (VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (loop_vec_def)))); - + op = TREE_OPERAND (operation, 0); + loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); + if (op_type == binary_op) + expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi)); + else if (op_type == ternary_op) + { + op = TREE_OPERAND (operation, 1); + loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); + expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, + PHI_RESULT (new_phi)); + } /* Create the vectorized operation that computes the partial results */ - *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi))); + *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); TREE_OPERAND (*vec_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, *vec_stmt, bsi); - /* Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. 
*/ - vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi); + vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } @@ -1152,7 +1400,7 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) op = TREE_OPERAND (stmt, 1); if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; } @@ -1164,7 +1412,7 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } /** Transform. **/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform assignment."); /* Handle def. */ @@ -1236,6 +1484,8 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) int op_type; tree op; optab optab; + int icode; + enum machine_mode optab_op2_mode; tree def, def_stmt; enum vect_def_type dt; @@ -1248,7 +1498,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (STMT_VINFO_LIVE_P (stmt_info)) { /* FORNOW: not yet supported. */ - if (vect_print_dump_info (REPORT_DETAILS, LOOP_LOC (loop_vinfo))) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "value used after loop."); return false; } @@ -1267,7 +1517,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) op_type = TREE_CODE_LENGTH (code); if (op_type != unary_op && op_type != binary_op) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type); return false; } @@ -1277,7 +1527,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) op = TREE_OPERAND (operation, i); if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; } @@ -1286,20 +1536,21 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Supportable by target? */ if (!optab) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab."); return false; } vec_mode = TYPE_MODE (vectype); - if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) + icode = (int) optab->handlers[(int) vec_mode].insn_code; + if (icode == CODE_FOR_nothing) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "op not supported by target."); if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD || LOOP_VINFO_VECT_FACTOR (loop_vinfo) < vect_min_worthwhile_factor (code)) return false; - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "proceeding using word mode."); } @@ -1308,11 +1559,30 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) < vect_min_worthwhile_factor (code)) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "not worthwhile without SIMD support."); return false; } + if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) + { + /* FORNOW: not yet supported. 
*/ + if (!VECTOR_MODE_P (vec_mode)) + return false; + + /* Invariant argument is needed for a vector shift + by a scalar shift operand. */ + optab_op2_mode = insn_data[icode].operand[2].mode; + if (! (VECTOR_MODE_P (optab_op2_mode) + || dt == vect_constant_def + || dt == vect_invariant_def)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "operand mode requires invariant argument."); + return false; + } + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; @@ -1321,7 +1591,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /** Transform. **/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform binary/unary operation."); /* Handle def. */ @@ -1335,7 +1605,25 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (op_type == binary_op) { op1 = TREE_OPERAND (operation, 1); - vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); + + if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) + { + /* Vector shl and shr insn patterns can be defined with + scalar operand 2 (shift operand). In this case, use + constant or loop invariant op1 directly, without + extending it to vector mode first. */ + + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "operand 1 using scalar mode."); + vec_oprnd1 = op1; + } + } + + if (!vec_oprnd1) + vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); } /* Arguments are ready. create the new vector stmt. */ @@ -1393,7 +1681,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) op = TREE_OPERAND (stmt, 1); if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; } @@ -1416,7 +1704,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /** Transform. **/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform store"); alignment_support_cheme = vect_supportable_dr_alignment (dr); @@ -1494,7 +1782,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (STMT_VINFO_LIVE_P (stmt_info)) { /* FORNOW: not yet supported. */ - if (vect_print_dump_info (REPORT_DETAILS, LOOP_LOC (loop_vinfo))) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "value used after loop."); return false; } @@ -1519,7 +1807,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) (e.g. - data copies). */ if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing) { - if (vect_print_dump_info (REPORT_DETAILS, LOOP_LOC (loop_vinfo))) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "Aligned load, but unsupported type."); return false; } @@ -1532,7 +1820,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /** Transform. 
**/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform load."); alignment_support_cheme = vect_supportable_dr_alignment (dr); @@ -1608,9 +1896,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* <2> Create lsq = *(floor(p2')) in the loop */ - offset = build_int_cst (integer_type_node, - TYPE_VECTOR_SUBPARTS (vectype)); - offset = int_const_binop (MINUS_EXPR, offset, integer_one_node, 1); + offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); vec_dest = vect_create_destination_var (scalar_dest, vectype); dataref_ptr = vect_create_data_ref_ptr (stmt, bsi, offset, &dummy, false); data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); @@ -1642,7 +1928,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) the value of the parameter and no global variables are touched which makes the builtin a "const" function. Requiring the builtin to have the "const" attribute makes it unnecessary - to call mark_call_clobbered_vars_to_rename. */ + to call mark_call_clobbered. */ gcc_assert (TREE_READONLY (builtin_decl)); } else @@ -1721,7 +2007,7 @@ vectorizable_live_operation (tree stmt, op = TREE_OPERAND (operation, i); if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; } @@ -1812,7 +2098,7 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (STMT_VINFO_LIVE_P (stmt_info)) { /* FORNOW: not yet supported. */ - if (vect_print_dump_info (REPORT_DETAILS, LOOP_LOC (loop_vinfo))) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "value used after loop."); return false; } @@ -1880,8 +2166,8 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Arguments are ready. create the new vector stmt. */ vec_compare = build2 (TREE_CODE (cond_expr), vectype, vec_cond_lhs, vec_cond_rhs); - vec_cond_expr = build (VEC_COND_EXPR, vectype, - vec_compare, vec_then_clause, vec_else_clause); + vec_cond_expr = build3 (VEC_COND_EXPR, vectype, + vec_compare, vec_then_clause, vec_else_clause); *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_cond_expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); @@ -1901,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) bool is_store = false; tree vec_stmt = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree orig_stmt_in_pattern; bool done; if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -1934,12 +2221,30 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) break; default: - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "stmt not supported."); gcc_unreachable (); } + gcc_assert (vec_stmt); STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; + orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt_in_pattern) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + + /* STMT was inserted by the vectorizer to replace a computation + idiom. ORIG_STMT_IN_PATTERN is a stmt in the original + sequence that computed this idiom. We need to record a pointer + to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. 
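+         As an illustration (hypothetical stmts): if the pattern-stmt
+         STMT: acc' = widen_sum <short_a, acc> replaced the original
+         S2: acc = plus <int_a, acc>, then the vector stmt VEC_STMT created
+         for STMT is recorded in the stmt_info of S2 as well, so that stmts
+         which use the result of S2 can find their vectorized operand.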
See more + detail in the documentation of vect_pattern_recog. */ + + STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; + } + } } if (STMT_VINFO_LIVE_P (stmt_info)) @@ -2017,7 +2322,7 @@ vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree ni = LOOP_VINFO_NITERS (loop_vinfo); int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - tree log_vf = build_int_cst (unsigned_type_node, exact_log2 (vf)); + tree log_vf; pe = loop_preheader_edge (loop); @@ -2025,6 +2330,7 @@ vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, number of iterations loop executes. */ ni_name = vect_build_loop_niters (loop_vinfo); + log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf)); /* Create: ratio = ni >> log2(vf) */ @@ -2202,7 +2508,7 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, tree var, stmt, ni, ni_name; block_stmt_iterator last_bsi; - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: "); print_generic_expr (vect_dump, phi, TDF_SLIM); @@ -2211,7 +2517,7 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, /* Skip virtual phi's. */ if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi)))) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "virtual phi. skip."); continue; } @@ -2219,7 +2525,7 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, /* Skip reduction phis. */ if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def) { - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "reduc phi. 
skip."); continue; } @@ -2279,7 +2585,7 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio, basic_block preheader; int loop_num; - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ==="); initialize_original_copy_tables (); @@ -2358,7 +2664,6 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT; - tree vf_minus_1 = build_int_cst (unsigned_type_node, vf - 1); tree niters_type = TREE_TYPE (loop_niters); pe = loop_preheader_edge (loop); @@ -2369,7 +2674,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) int element_size = vectype_align/vf; int elem_misalign = byte_misalign / element_size; - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "known alignment = %d.", byte_misalign); iters = build_int_cst (niters_type, (vf - elem_misalign)&(vf-1)); } @@ -2383,8 +2688,9 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1); tree elem_size_log = - build_int_cst (unsigned_type_node, exact_log2 (vectype_align/vf)); - tree vf_tree = build_int_cst (unsigned_type_node, vf); + build_int_cst (type, exact_log2 (vectype_align/vf)); + tree vf_minus_1 = build_int_cst (type, vf - 1); + tree vf_tree = build_int_cst (type, vf); tree byte_misalign; tree elem_misalign; @@ -2397,11 +2703,11 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) /* Create: elem_misalign = byte_misalign / element_size */ elem_misalign = - build2 (RSHIFT_EXPR, unsigned_type_node, byte_misalign, elem_size_log); + build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log); /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */ - iters = build2 (MINUS_EXPR, unsigned_type_node, vf_tree, elem_misalign); - iters = build2 (BIT_AND_EXPR, unsigned_type_node, iters, vf_minus_1); + iters = build2 (MINUS_EXPR, type, vf_tree, elem_misalign); + iters = build2 (BIT_AND_EXPR, type, iters, vf_minus_1); iters = fold_convert (niters_type, iters); } @@ -2412,7 +2718,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) if (TREE_CODE (loop_niters) != INTEGER_CST) iters = build2 (MIN_EXPR, niters_type, iters, loop_niters); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "niters for prolog loop: "); print_generic_expr (vect_dump, iters, TDF_SLIM); @@ -2438,18 +2744,16 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) NITERS iterations were peeled from LOOP. DR represents a data reference in LOOP. This function updates the information recorded in DR to account for the fact that the first NITERS iterations had already been - executed. Specifically, it updates the OFFSET field of stmt_info. */ + executed. Specifically, it updates the OFFSET field of DR. 
@@ -2438,18 +2744,16 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
    NITERS iterations were peeled from LOOP.  DR represents a data
    reference in LOOP.  This function updates the information recorded in DR
    to account for the fact that the first NITERS iterations had already been
-   executed.  Specifically, it updates the OFFSET field of stmt_info.  */
+   executed.  Specifically, it updates the OFFSET field of DR.  */
 
 static void
 vect_update_init_of_dr (struct data_reference *dr, tree niters)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
-  tree offset = STMT_VINFO_VECT_INIT_OFFSET (stmt_info);
+  tree offset = DR_OFFSET (dr);
 
-  niters = fold (build2 (MULT_EXPR, TREE_TYPE (niters), niters,
-                        STMT_VINFO_VECT_STEP (stmt_info)));
-  offset = fold (build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters));
-  STMT_VINFO_VECT_INIT_OFFSET (stmt_info) = offset;
+  niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
+  offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
+  DR_OFFSET (dr) = offset;
 }
 
 
@@ -2465,21 +2769,14 @@ static void
 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
 {
   unsigned int i;
-  varray_type loop_write_datarefs = LOOP_VINFO_DATAREF_WRITES (loop_vinfo);
-  varray_type loop_read_datarefs = LOOP_VINFO_DATAREF_READS (loop_vinfo);
+  varray_type datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 
   if (vect_dump && (dump_flags & TDF_DETAILS))
     fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
 
-  for (i = 0; i < VARRAY_ACTIVE_SIZE (loop_write_datarefs); i++)
+  for (i = 0; i < VARRAY_ACTIVE_SIZE (datarefs); i++)
     {
-      struct data_reference *dr = VARRAY_GENERIC_PTR (loop_write_datarefs, i);
-      vect_update_init_of_dr (dr, niters);
-    }
-
-  for (i = 0; i < VARRAY_ACTIVE_SIZE (loop_read_datarefs); i++)
-    {
-      struct data_reference *dr = VARRAY_GENERIC_PTR (loop_read_datarefs, i);
+      struct data_reference *dr = VARRAY_GENERIC_PTR (datarefs, i);
       vect_update_init_of_dr (dr, niters);
     }
 }
@@ -2501,7 +2798,7 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo, struct loops *loops)
   tree n_iters;
   struct loop *new_loop;
 
-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+  if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
 
   initialize_original_copy_tables ();
@@ -2520,8 +2817,8 @@
 
   /* Update number of times loop executes.  */
   n_iters = LOOP_VINFO_NITERS (loop_vinfo);
-  LOOP_VINFO_NITERS (loop_vinfo) = fold (build2 (MINUS_EXPR,
-    TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop));
+  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
+    TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
 
   /* Update the init conditions of the access functions of all data refs.  */
   vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
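The DR_OFFSET update above amounts to adding niters * step to the recorded starting offset of each data reference. A plain-integer sketch of the tree arithmetic that fold_build2 constructs (the helper name is illustrative): after peeling, say, 2 iterations over an int array with a 4-byte step, the offset advances by 8 bytes.

#include <assert.h>

/* Mirror of vect_update_init_of_dr's arithmetic on plain integers.  */
static long
update_init_of_dr (long offset, long niters, long step)
{
  return offset + niters * step;
}

int
main (void)
{
  /* 2 peeled iterations, 4-byte step: the data ref now starts
     8 bytes further along.  */
  assert (update_init_of_dr (0, 2, 4) == 8);
  return 0;
}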
@@ -2533,6 +2830,128 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo, struct loops *loops)
 }
 
 
+/* Function vect_create_cond_for_align_checks.
+
+   Create a conditional expression that represents the alignment checks for
+   all of the data references (array element references) whose alignment must
+   be checked at runtime.
+
+   Input:
+   LOOP_VINFO - two fields of the loop information are used.
+               LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
+               LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
+
+   Output:
+   COND_EXPR_STMT_LIST - statements needed to construct the conditional
+                         expression.
+   The returned value is the conditional expression to be used in the if
+   statement that controls which version of the loop gets executed at runtime.
+
+   The algorithm makes two assumptions:
+     1) The number of bytes "n" in a vector is a power of 2.
+     2) An address "a" is aligned if a%n is zero and this test can be done
+        as a&(n-1) == 0.  For example, for 16 byte vectors the test is
+        a&0xf == 0.  */
+
+static tree
+vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
+                                   tree *cond_expr_stmt_list)
+{
+  VEC(tree,heap) *may_misalign_stmts
+    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
+  tree ref_stmt;
+  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
+  tree mask_cst;
+  unsigned int i;
+  tree psize;
+  tree int_ptrsize_type;
+  char tmp_name[20];
+  tree or_tmp_name = NULL_TREE;
+  tree and_tmp, and_tmp_name, and_stmt;
+  tree ptrsize_zero;
+
+  /* Check that mask is one less than a power of 2, i.e., mask is
+     all zeros followed by all ones.  */
+  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
+
+  /* CHECKME: what is the best integer or unsigned type to use to hold a
+     cast from a pointer value?  */
+  psize = TYPE_SIZE (ptr_type_node);
+  int_ptrsize_type
+    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
+
+  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
+     of the first vector of the i'th data reference.  */
+
+  for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
+    {
+      tree new_stmt_list = NULL_TREE;
+      tree addr_base;
+      tree addr_tmp, addr_tmp_name, addr_stmt;
+      tree or_tmp, new_or_tmp_name, or_stmt;
+
+      /* create: addr_tmp = (int)(address_of_first_vector) */
+      addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
+                                                        &new_stmt_list,
+                                                        NULL_TREE);
+
+      if (new_stmt_list != NULL_TREE)
+        append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
+
+      sprintf (tmp_name, "%s%d", "addr2int", i);
+      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
+      add_referenced_tmp_var (addr_tmp);
+      addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
+      addr_stmt = fold_convert (int_ptrsize_type, addr_base);
+      addr_stmt = build2 (MODIFY_EXPR, void_type_node,
+                          addr_tmp_name, addr_stmt);
+      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
+      append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
+
+      /* The addresses are ORed together.  */
+
+      if (or_tmp_name != NULL_TREE)
+        {
+          /* create: or_tmp = or_tmp | addr_tmp */
+          sprintf (tmp_name, "%s%d", "orptrs", i);
+          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
+          add_referenced_tmp_var (or_tmp);
+          new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
+          or_stmt = build2 (MODIFY_EXPR, void_type_node, new_or_tmp_name,
+                            build2 (BIT_IOR_EXPR, int_ptrsize_type,
+                                    or_tmp_name,
+                                    addr_tmp_name));
+          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
+          append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
+          or_tmp_name = new_or_tmp_name;
+        }
+      else
+        or_tmp_name = addr_tmp_name;
+
+    } /* end for i */
+
+  mask_cst = build_int_cst (int_ptrsize_type, mask);
+
+  /* create: and_tmp = or_tmp & mask */
+  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
+  add_referenced_tmp_var (and_tmp);
+  and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
+
+  and_stmt = build2 (MODIFY_EXPR, void_type_node,
+                     and_tmp_name,
+                     build2 (BIT_AND_EXPR, int_ptrsize_type,
+                             or_tmp_name, mask_cst));
+  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
+  append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
+
+  /* Make and_tmp the left operand of the conditional test against zero.
+     If and_tmp has a nonzero bit then some address is unaligned.  */
+  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
+  return build2 (EQ_EXPR, boolean_type_node,
+                 and_tmp_name, ptrsize_zero);
+}
+
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
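At runtime the condition built above evaluates, in effect, the following scalar test. A sketch with plain pointers (the function name and the two-pointer signature are illustrative; the real code ORs one address per may-misalign data reference):

#include <stdint.h>

/* Returns nonzero iff both addresses are aligned to (mask + 1) bytes;
   mask is one less than the vector size, e.g. 0xf for 16-byte vectors.  */
static int
refs_aligned (const void *a, const void *b, uintptr_t mask)
{
  uintptr_t or_tmp = (uintptr_t) a | (uintptr_t) b;

  /* A nonzero bit in (or_tmp & mask) means some address is unaligned.  */
  return (or_tmp & mask) == 0;
}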
@@ -2550,10 +2969,72 @@ vect_transform_loop (loop_vec_info loop_vinfo,
   int i;
   tree ratio = NULL;
   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  bitmap_iterator bi;
+  unsigned int j;
 
-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+  if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vec_transform_loop ===");
 
+  /* If the loop has data references that may or may not be aligned then
+     two versions of the loop need to be generated, one which is vectorized
+     and one which isn't.  A test is then generated to control which of the
+     loops is executed.  The test checks for the alignment of all of the
+     data references that may or may not be aligned.  */
+
+  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
+    {
+      struct loop *nloop;
+      tree cond_expr;
+      tree cond_expr_stmt_list = NULL_TREE;
+      basic_block condition_bb;
+      block_stmt_iterator cond_exp_bsi;
+      basic_block merge_bb;
+      basic_block new_exit_bb;
+      edge new_exit_e, e;
+      tree orig_phi, new_phi, arg;
+
+      cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
+                                                     &cond_expr_stmt_list);
+      initialize_original_copy_tables ();
+      nloop = loop_version (loops, loop, cond_expr, &condition_bb, true);
+      free_original_copy_tables();
+
+      /** Loop versioning violates an assumption we try to maintain during
+         vectorization - that the loop exit block has a single predecessor.
+         After versioning, the exit block of both loop versions is the same
+         basic block (i.e. it has two predecessors).  Just to simplify the
+         following transformations in the vectorizer, we fix this situation
+         here by adding a new (empty) block on the exit-edge of the loop,
+         with the proper loop-exit phis to maintain loop-closed-form.  **/
+
+      merge_bb = loop->single_exit->dest;
+      gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
+      new_exit_bb = split_edge (loop->single_exit);
+      add_bb_to_loop (new_exit_bb, loop->outer);
+      new_exit_e = loop->single_exit;
+      e = EDGE_SUCC (new_exit_bb, 0);
+
+      for (orig_phi = phi_nodes (merge_bb); orig_phi;
+           orig_phi = PHI_CHAIN (orig_phi))
+        {
+          new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
+                                     new_exit_bb);
+          arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
+          add_phi_arg (new_phi, arg, new_exit_e);
+          SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
+        }
+
+      /** end loop-exit-fixes after versioning  **/
+
+      update_ssa (TODO_update_ssa);
+      cond_exp_bsi = bsi_last (condition_bb);
+      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
+    }
+
+  /* CHECKME: we wouldn't need this if we called update_ssa once
+     for all loops.  */
+  bitmap_zero (vect_vnames_to_rename);
+
   /* Peel the loop if there are data refs with unknown alignment.
      Only one data ref with unknown store is allowed.  */
 
@@ -2599,7 +3080,7 @@ vect_transform_loop (loop_vec_info loop_vinfo,
          stmt_vec_info stmt_info;
          bool is_store;
 
-         if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+         if (vect_print_dump_info (REPORT_DETAILS))
           {
             fprintf (vect_dump, "------>vectorizing statement: ");
             print_generic_expr (vect_dump, stmt, TDF_SLIM);
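The versioning block above leaves the function with two copies of the loop guarded by the alignment condition; schematically (illustrative C, not generated code):

#include <stdint.h>

static void
add_arrays (float *a, const float *b, int n)
{
  if ((((uintptr_t) a | (uintptr_t) b) & 0xf) == 0)
    {
      /* Version taken when all checked refs are 16-byte aligned:
         this copy is the one the vectorizer goes on to transform.  */
      for (int i = 0; i < n; i++)
        a[i] += b[i];
    }
  else
    {
      /* Fallback: the unmodified scalar loop.  */
      for (int i = 0; i < n; i++)
        a[i] += b[i];
    }
  /* Both versions then meet in the new (empty) exit block inserted
     above, keeping the vector loop's exit single-predecessor.  */
}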
@@ -2616,10 +3097,10 @@ vect_transform_loop (loop_vec_info loop_vinfo,
                     units and no inner unrolling is necessary.  */
          gcc_assert (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
-                     == vectorization_factor);
+                     == (unsigned HOST_WIDE_INT) vectorization_factor);
 
          /* -------- vectorize statement ------------ */
-         if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+         if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");
 
          is_store = vect_transform_stmt (stmt, &si);
@@ -2629,7 +3110,7 @@ vect_transform_loop (loop_vec_info loop_vinfo,
              stmt_ann_t ann = stmt_ann (stmt);
              free (stmt_info);
              set_stmt_info ((tree_ann_t)ann, NULL);
-             bsi_remove (&si);
+             bsi_remove (&si, true);
              continue;
            }
 
@@ -2639,11 +3120,14 @@ vect_transform_loop (loop_vec_info loop_vinfo,
 
   slpeel_make_loop_iterate_ntimes (loop, ratio);
 
+  EXECUTE_IF_SET_IN_BITMAP (vect_vnames_to_rename, 0, j, bi)
+    mark_sym_for_renaming (SSA_NAME_VAR (ssa_name (j)));
+
   /* The memory tags and pointers in vectorized statements need to have
      their SSA forms updated.  FIXME, why can't this be delayed until all
      the loops have been transformed?  */
   update_ssa (TODO_update_ssa);
 
-  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS, LOOP_LOC (loop_vinfo)))
+  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
 }
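Putting the pieces of this file together, the loop that finally executes has up to three parts: the prolog peeled by vect_do_peeling_for_alignment, the vector loop iterating ratio times, and the epilog produced by vect_do_peeling_for_loop_bound. A schematic of the result (illustrative C with vf = 4 assumed; each main-loop iteration stands in for one vector statement):

static void
vectorized_shape (float *a, int n, int prolog)
{
  int i = 0;

  /* Prolog: scalar iterations peeled until the access is aligned.  */
  for (; i < prolog; i++)
    a[i] *= 2.0f;

  /* Main loop: runs ratio = (n - prolog) / 4 times.  */
  int ratio = (n - prolog) / 4;
  for (int j = 0; j < ratio; j++, i += 4)
    {
      a[i] *= 2.0f;      /* placeholder for the single vector statement */
      a[i + 1] *= 2.0f;
      a[i + 2] *= 2.0f;
      a[i + 3] *= 2.0f;
    }

  /* Epilog: the remaining (n - prolog) % 4 iterations.  */
  for (; i < n; i++)
    a[i] *= 2.0f;
}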