X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Ftree-vect-transform.c;h=7c5b1b20b4d020d45e03ccd205125efd07531dc9;hb=26ab781bb4d68889c2489beda4a88a3cf0ae8459;hp=8d309583e7251e1e6742b72ba91db66224dfc239;hpb=8c4c00c181e6df4f0a9afc76e4c9edbbc1c2fd41;p=pf3gnuchains%2Fgcc-fork.git

diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c
index 8d309583e72..7c5b1b20b4d 100644
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -49,16 +49,15 @@ along with GCC; see the file COPYING3.  If not see
 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
 static tree vect_create_destination_var (tree, tree);
 static tree vect_create_data_ref_ptr
-  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
-static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
-static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
+  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
+static tree vect_create_addr_base_for_vector_ref
+  (tree, tree *, tree, struct loop *);
 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
-static tree vect_init_vector (tree, tree, tree);
+static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
 static void vect_finish_stmt_generation
-  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
+  (tree stmt, tree vec_stmt, block_stmt_iterator *);
 static bool vect_is_simple_cond (tree, loop_vec_info);
-static void update_vuses_to_preheader (tree, struct loop*);
 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
 static tree get_initial_def_for_reduction (tree, tree, tree *);

@@ -125,6 +124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   int byte_misalign;
+  int innerloop_iters, factor;

   /* Cost model disabled.  */
   if (!flag_vect_cost_model)
@@ -153,11 +153,20 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
      TODO: Consider assigning different costs to different scalar
      statements.  */

+  /* FORNOW.  */
+  if (loop->inner)
+    innerloop_iters = 50; /* FIXME */
+
   for (i = 0; i < nbbs; i++)
     {
       block_stmt_iterator si;
       basic_block bb = bbs[i];

+      if (bb->loop_father == loop->inner)
+        factor = innerloop_iters;
+      else
+        factor = 1;
+
       for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
@@ -165,8 +174,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;
-         scalar_single_iter_cost += cost_for_stmt (stmt);
-         vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info);
+         scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
+         vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
+         /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
+            some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
     }
@@ -265,7 +276,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)

   /* If the number of iterations is unknown, or the
      peeling-for-misalignment amount is unknown, we will have to generate
-     a runtime test to test the loop count agains the threshold. 
+     a runtime test to test the loop count against the threshold. 
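     For example, if the computed threshold is 12 and the iteration count is
     only known at run time, the guard takes the following form (a schematic
     sketch; the value 12 and the names are illustrative only):

         if (niters >= 12)
           vectorized_loop;
         else
           scalar_loop;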
*/ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || (byte_misalign < 0)) runtime_test = true; @@ -360,6 +371,8 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, enum machine_mode mode; tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1); int op_type = TREE_CODE_LENGTH (TREE_CODE (operation)); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); /* Cost of reduction op inside loop. */ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; @@ -382,30 +395,33 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, We have a reduction operator that will reduce the vector in one statement. Also requires scalar extract. */ - if (reduc_code < NUM_TREE_CODES) - outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; - else + if (!nested_in_vect_loop_p (loop, orig_stmt)) { - int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - tree bitsize = - TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0))); - int element_bitsize = tree_low_cst (bitsize, 1); - int nelements = vec_size_in_bits / element_bitsize; - - optab = optab_for_tree_code (code, vectype); - - /* We have a whole vector shift available. */ - if (VECTOR_MODE_P (mode) - && optab->handlers[mode].insn_code != CODE_FOR_nothing - && vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) - /* Final reduction via vector shifts and the reduction operator. Also - requires scalar extract. */ - outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST - + TARG_VEC_TO_SCALAR_COST); - else - /* Use extracts and reduction op for final reduction. For N elements, - we have N extracts and N-1 reduction ops. */ - outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); + if (reduc_code < NUM_TREE_CODES) + outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; + else + { + int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + tree bitsize = + TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0))); + int element_bitsize = tree_low_cst (bitsize, 1); + int nelements = vec_size_in_bits / element_bitsize; + + optab = optab_for_tree_code (code, vectype); + + /* We have a whole vector shift available. */ + if (VECTOR_MODE_P (mode) + && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing + && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) + /* Final reduction via vector shifts and the reduction operator. Also + requires scalar extract. */ + outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST + + TARG_VEC_TO_SCALAR_COST); + else + /* Use extracts and reduction op for final reduction. For N elements, + we have N extracts and N-1 reduction ops. */ + outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); + } } STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; @@ -598,7 +614,19 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) break; } - case dr_unaligned_software_pipeline: + case dr_explicit_realign: + { + inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST); + + /* FIXME: If the misalignment remains fixed across the iterations of + the containing loop, the following cost should be added to the + outside costs. 
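     With this scheme each copy issues two wide vector loads plus one
     realignment stmt, which is what the 2*TARG_VEC_LOAD_COST
     + TARG_VEC_STMT_COST term above accounts for. Schematically, per copy
     (a sketch; the exact sequence is given at vect_setup_realignment below):

         msq = *(floor (addr));            # first wide load
         lsq = *(floor (addr + VS - 1));   # second wide load
         res = realign_load (msq, lsq, realignment_token);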
*/
+      if (targetm.vectorize.builtin_mask_for_load)
+        inner_cost += TARG_VEC_STMT_COST;
+
+      break;
+    }
+    case dr_explicit_realign_optimized:
       {
         int outer_cost = 0;
@@ -695,6 +723,19 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
    STMT: The statement containing the data reference.
    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
    OFFSET: Optional. If supplied, it is added to the initial address.
+   LOOP:    Specify relative to which loop-nest should the address be computed.
+            For example, when the dataref is in an inner-loop nested in an
+            outer-loop that is now being vectorized, LOOP can be either the
+            outer-loop, or the inner-loop. The first memory location accessed
+            by the following dataref ('in' points to short):
+
+                for (i=0; i<N; i++)
+                   for (j=0; j<M; j++)
+                     s += in[i+j]
+
+            is as follows: if LOOP=i_loop:  &in          (relative to i_loop)
+                           if LOOP=j_loop:  &in+i*2B     (relative to j_loop)
+
    Output:
    1. Return an SSA_NAME whose value is the address of the memory location of
       the first vector of the data reference.
    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
       these statement(s) which define the returned SSA_NAME.

    FORNOW: We are only handling array accesses with step 1.  */

 static tree
 vect_create_addr_base_for_vector_ref (tree stmt,
                                       tree *new_stmt_list,
-                                      tree offset)
+                                      tree offset,
+                                      struct loop *loop)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
   tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
-  tree base_name = build_fold_indirect_ref (data_ref_base);
+  tree base_name;
+  tree data_ref_base_var;
+  tree new_base_stmt;
   tree vec_stmt;
   tree addr_base, addr_expr;
   tree dest, new_stmt;
   tree base_offset = unshare_expr (DR_OFFSET (dr));
   tree init = unshare_expr (DR_INIT (dr));
   tree vect_ptr_type, addr_expr2;
+  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
+
+  gcc_assert (loop);
+  if (loop != containing_loop)
+    {
+      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+      gcc_assert (nested_in_vect_loop_p (loop, stmt));
+
+      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
+      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
+      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
+    }
+
+  /* Create data_ref_base */
+  base_name = build_fold_indirect_ref (data_ref_base);
+  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
+  add_referenced_var (data_ref_base_var);
+  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
+                                        true, data_ref_base_var);
+  append_to_statement_list_force (new_base_stmt, new_stmt_list);

   /* Create base_offset */
   base_offset = size_binop (PLUS_EXPR, base_offset, init);
   base_offset = fold_convert (sizetype, base_offset);
   dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
   add_referenced_var (dest);
-  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
+  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
   append_to_statement_list_force (new_stmt, new_stmt_list);

   if (offset)
     {
       tree tmp = create_tmp_var (sizetype, "offset");
-      tree step;
-
-      /* For interleaved access step we divide STEP by the size of the
-         interleaving group.  */
-      if (DR_GROUP_SIZE (stmt_info))
-        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
-                            build_int_cst (TREE_TYPE (offset),
-                                           DR_GROUP_SIZE (stmt_info)));
-      else
-        step = DR_STEP (dr);

       add_referenced_var (tmp);
       offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
@@ -751,8 +808,8 @@ vect_create_addr_base_for_vector_ref (tree stmt,
     }

   /* base + base_offset */
-  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
-                           base_offset);
+  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
+                           data_ref_base, base_offset);

   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
@@ -789,7 +846,7 @@ vect_create_addr_base_for_vector_ref (tree stmt,
   1. STMT: a stmt that references memory. Expected to be of the form
        GIMPLE_MODIFY_STMT <name, data-ref> or GIMPLE_MODIFY_STMT <data-ref, name>.
-   2. BSI: block_stmt_iterator where new stmts can be added.
+   2. 
AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
@@ -816,18 +873,22 @@ vect_create_addr_base_for_vector_ref (tree stmt,
          Return the increment stmt that updates the pointer in PTR_INCR.

-   3. Return the pointer.  */
+   3. Set INV_P to true if the access pattern of the data reference in the
+      vectorized loop is invariant. Set it to false otherwise.
+
+   4. Return the pointer.  */

 static tree
-vect_create_data_ref_ptr (tree stmt,
-                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
+vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                           tree offset, tree *initial_address, tree *ptr_incr,
-                          bool only_init, tree type)
+                          bool only_init, tree type, bool *inv_p)
 {
   tree base_name;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
+  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   tree vect_ptr_type;
   tree vect_ptr;
@@ -835,11 +896,31 @@ vect_create_data_ref_ptr (tree stmt,
   tree new_temp;
   tree vec_stmt;
   tree new_stmt_list = NULL_TREE;
-  edge pe = loop_preheader_edge (loop);
+  edge pe;
   basic_block new_bb;
   tree vect_ptr_init;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  tree vptr;
+  block_stmt_iterator incr_bsi;
+  bool insert_after;
+  tree indx_before_incr, indx_after_incr;
+  tree incr;
+  tree step;
+
+  /* Check the step (evolution) of the load in LOOP, and record
+     whether it's invariant.  */
+  if (nested_in_vect_loop)
+    step = STMT_VINFO_DR_STEP (stmt_info);
+  else
+    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
+
+  if (tree_int_cst_compare (step, size_zero_node) == 0)
+    *inv_p = true;
+  else
+    *inv_p = false;
+
+  /* Create an expression for the first address accessed by this load
+     in LOOP.  */
   base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

   if (vect_print_dump_info (REPORT_DETAILS))
@@ -882,12 +963,44 @@ vect_create_data_ref_ptr (tree stmt,
       var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

+  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
+      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
+      def-use update cycles for the pointer: One relative to the outer-loop
+      (LOOP), which is what steps (3) and (4) below do. The other is relative
+      to the inner-loop (which is the inner-most loop containing the dataref),
+      and this is done by step (5) below.
+
+      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
+      inner-most loop, and so steps (3),(4) work the same, and step (5) is
+      redundant. Steps (3),(4) create the following:
+
+        vp0 = &base_addr;
+        LOOP:   vp1 = phi(vp0,vp2)
+                ...
+                ...
+                vp2 = vp1 + step
+                goto LOOP
+
+      If there is an inner-loop nested in loop, then step (5) will also be
+      applied, and an additional update in the inner-loop will be created:
+
+        vp0 = &base_addr;
+        LOOP:   vp1 = phi(vp0,vp2)
+                ...
+        inner:     vp3 = phi(vp1,vp4)
+                   vp4 = vp3 + inner_step
+                   if () goto inner
+                ...
+                vp2 = vp1 + step
+                if () goto LOOP   */
+
   /** (3) Calculate the initial address of the vector-pointer, and set
           the vector-pointer to point to it before the loop:  **/

   /* Create: &(base[init_val+offset]) in the loop preheader. 
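     For instance, for the nested dataref example given above in
     vect_create_addr_base_for_vector_ref ('in' points to short), the initial
     address is (schematically):

         vp0 = &in;           # when LOOP is the outer i-loop
         vp0 = &in + i*2B;    # when LOOP is the inner j-loop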
*/ + new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, - offset); + offset, loop); pe = loop_preheader_edge (loop); new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list); gcc_assert (!new_bb); @@ -902,25 +1015,31 @@ vect_create_data_ref_ptr (tree stmt, gcc_assert (!new_bb); - /** (4) Handle the updating of the vector-pointer inside the loop: **/ + /** (4) Handle the updating of the vector-pointer inside the loop. + This is needed when ONLY_INIT is false, and also when AT_LOOP + is the inner-loop nested in LOOP (during outer-loop vectorization). + **/ - if (only_init) /* No update in loop is required. */ + if (only_init && at_loop == loop) /* No update in loop is required. */ { /* Copy the points-to information if it exists. */ if (DR_PTR_INFO (dr)) duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); - return vect_ptr_init; + vptr = vect_ptr_init; } else { - block_stmt_iterator incr_bsi; - bool insert_after; - tree indx_before_incr, indx_after_incr; - tree incr; + /* The step of the vector pointer is the Vector Size. */ + tree step = TYPE_SIZE_UNIT (vectype); + /* One exception to the above is when the scalar step of the load in + LOOP is zero. In this case the step here is also zero. */ + if (*inv_p) + step = size_zero_node; standard_iv_increment_position (loop, &incr_bsi, &insert_after); + create_iv (vect_ptr_init, - fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)), + fold_convert (vect_ptr_type, step), NULL_TREE, loop, &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr); incr = bsi_stmt (incr_bsi); @@ -938,15 +1057,51 @@ vect_create_data_ref_ptr (tree stmt, if (ptr_incr) *ptr_incr = incr; - return indx_before_incr; + vptr = indx_before_incr; } + + if (!nested_in_vect_loop || only_init) + return vptr; + + + /** (5) Handle the updating of the vector-pointer inside the inner-loop + nested in LOOP, if exists: **/ + + gcc_assert (nested_in_vect_loop); + if (!only_init) + { + standard_iv_increment_position (containing_loop, &incr_bsi, + &insert_after); + create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE, + containing_loop, &incr_bsi, insert_after, &indx_before_incr, + &indx_after_incr); + incr = bsi_stmt (incr_bsi); + set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo)); + + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + { + duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); + duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); + } + merge_alias_info (vect_ptr_init, indx_before_incr); + merge_alias_info (vect_ptr_init, indx_after_incr); + if (ptr_incr) + *ptr_incr = incr; + + return indx_before_incr; + } + else + gcc_unreachable (); } /* Function bump_vector_ptr - Increment a pointer (to a vector type) by vector-size. Connect the new - increment stmt to the existing def-use update-chain of the pointer. + Increment a pointer (to a vector type) by vector-size. If requested, + i.e. if PTR-INCR is given, then also connect the new increment stmt + to the existing def-use update-chain of the pointer, by modifying + the PTR_INCR as illustrated below: The pointer def-use update-chain before this function: DATAREF_PTR = phi (p_0, p_2) @@ -956,18 +1111,20 @@ vect_create_data_ref_ptr (tree stmt, The pointer def-use update-chain after this function: DATAREF_PTR = phi (p_0, p_2) .... - NEW_DATAREF_PTR = DATAREF_PTR + vector_size + NEW_DATAREF_PTR = DATAREF_PTR + BUMP .... 
PTR_INCR: p_2 = NEW_DATAREF_PTR + step Input: DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated in the loop. - PTR_INCR - the stmt that updates the pointer in each iteration of the loop. - The increment amount across iterations is also expected to be - vector_size. + PTR_INCR - optional. The stmt that updates the pointer in each iteration of + the loop. The increment amount across iterations is expected + to be vector_size. BSI - location where the new update stmt is to be placed. STMT - the original scalar memory-access stmt that is being vectorized. + BUMP - optional. The offset by which to bump the pointer. If not given, + the offset is assumed to be vector_size. Output: Return NEW_DATAREF_PTR as illustrated above. @@ -975,7 +1132,7 @@ vect_create_data_ref_ptr (tree stmt, static tree bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, - tree stmt) + tree stmt, tree bump) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); @@ -988,6 +1145,9 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, use_operand_p use_p; tree new_dataref_ptr; + if (bump) + update = bump; + incr_stmt = build_gimple_modify_stmt (ptr_var, build2 (POINTER_PLUS_EXPR, vptr_type, dataref_ptr, update)); @@ -995,6 +1155,14 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr; vect_finish_stmt_generation (stmt, incr_stmt, bsi); + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); + merge_alias_info (new_dataref_ptr, dataref_ptr); + + if (!ptr_incr) + return new_dataref_ptr; + /* Update the vector-pointer's cross-iteration increment. */ FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) { @@ -1006,11 +1174,6 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, gcc_assert (tree_int_cst_compare (use, update) == 0); } - /* Copy the points-to information if it exists. */ - if (DR_PTR_INFO (dr)) - duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); - merge_alias_info (new_dataref_ptr, dataref_ptr); - return new_dataref_ptr; } @@ -1045,15 +1208,16 @@ vect_create_destination_var (tree scalar_dest, tree vectype) /* Function vect_init_vector. Insert a new stmt (INIT_STMT) that initializes a new vector variable with - the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be - used in the vectorization of STMT. */ + the vector elements of VECTOR_VAR. Place the initialization at BSI if it + is not NULL. Otherwise, place the initialization at the loop preheader. + Return the DEF of INIT_STMT. + It will be used in the vectorization of STMT. 
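   A typical use (as in vect_get_vec_def_for_operand below) is to initialize
   the vectorized form of a constant or invariant operand once, e.g.
   (schematic):

       vec_cst = {c, c, c, c}
       cst_1 = vect_init_vector (stmt, vec_cst, vector_type, NULL); # preheader

   Passing a non-NULL BSI instead places the initialization at that point.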
*/ static tree -vect_init_vector (tree stmt, tree vector_var, tree vector_type) +vect_init_vector (tree stmt, tree vector_var, tree vector_type, + block_stmt_iterator *bsi) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree new_var; tree init_stmt; tree vec_oprnd; @@ -1063,14 +1227,23 @@ vect_init_vector (tree stmt, tree vector_var, tree vector_type) new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_"); add_referenced_var (new_var); - init_stmt = build_gimple_modify_stmt (new_var, vector_var); new_temp = make_ssa_name (new_var, init_stmt); GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp; - pe = loop_preheader_edge (loop); - new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); - gcc_assert (!new_bb); + if (bsi) + vect_finish_stmt_generation (stmt, init_stmt, bsi); + else + { + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + + if (nested_in_vect_loop_p (loop, stmt)) + loop = loop->inner; + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); + gcc_assert (!new_bb); + } if (vect_print_dump_info (REPORT_DETAILS)) { @@ -1086,6 +1259,7 @@ vect_init_vector (tree stmt, tree vector_var, tree vector_type) /* Function get_initial_def_for_induction Input: + STMT - a stmt that performs an induction operation in the loop. IV_PHI - the initial value of the induction variable Output: @@ -1104,8 +1278,8 @@ get_initial_def_for_induction (tree iv_phi) tree vectype = get_vectype_for_scalar_type (scalar_type); int nunits = TYPE_VECTOR_SUBPARTS (vectype); edge pe = loop_preheader_edge (loop); + struct loop *iv_loop; basic_block new_bb; - block_stmt_iterator bsi; tree vec, vec_init, vec_step, t; tree access_fn; tree new_var; @@ -1119,8 +1293,13 @@ get_initial_def_for_induction (tree iv_phi) int ncopies = vf / nunits; tree expr; stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); + bool nested_in_vect_loop = false; tree stmts; - tree stmt = NULL_TREE; + imm_use_iterator imm_iter; + use_operand_p use_p; + tree exit_phi; + edge latch_e; + tree loop_arg; block_stmt_iterator si; basic_block bb = bb_for_stmt (iv_phi); @@ -1129,65 +1308,107 @@ get_initial_def_for_induction (tree iv_phi) /* Find the first insertion point in the BB. */ si = bsi_after_labels (bb); - stmt = bsi_stmt (si); - access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi)); + if (INTEGRAL_TYPE_P (scalar_type)) + step_expr = build_int_cst (scalar_type, 0); + else + step_expr = build_real (scalar_type, dconst0); + + /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ + if (nested_in_vect_loop_p (loop, iv_phi)) + { + nested_in_vect_loop = true; + iv_loop = loop->inner; + } + else + iv_loop = loop; + gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father); + + latch_e = loop_latch_edge (iv_loop); + loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); + + access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); gcc_assert (access_fn); - ok = vect_is_simple_iv_evolution (loop->num, access_fn, - &init_expr, &step_expr); + ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, + &init_expr, &step_expr); gcc_assert (ok); + pe = loop_preheader_edge (iv_loop); /* Create the vector that holds the initial_value of the induction. 
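   For example, for a scalar IV with initial value X = 0 and step S = 1,
   vectorized with VF = nunits = 4 (a schematic sketch):

       vec_init = {0, 1, 2, 3}    # X, X+S, X+2*S, X+3*S
       vec_step = {4, 4, 4, 4}    # VF*S in each element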
*/ - new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); - add_referenced_var (new_var); - - new_name = force_gimple_operand (init_expr, &stmts, false, new_var); - if (stmts) + if (nested_in_vect_loop) { - new_bb = bsi_insert_on_edge_immediate (pe, stmts); - gcc_assert (!new_bb); + /* iv_loop is nested in the loop to be vectorized. init_expr had already + been created during vectorization of previous stmts; We obtain it from + the STMT_VINFO_VEC_STMT of the defining stmt. */ + tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop)); + vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); } - - t = NULL_TREE; - t = tree_cons (NULL_TREE, new_name, t); - for (i = 1; i < nunits; i++) + else { - tree tmp; + /* iv_loop is the loop to be vectorized. Create: + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ + new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); + add_referenced_var (new_var); - /* Create: new_name = new_name + step_expr */ - tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr); - init_stmt = build_gimple_modify_stmt (new_var, tmp); - new_name = make_ssa_name (new_var, init_stmt); - GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name; + new_name = force_gimple_operand (init_expr, &stmts, false, new_var); + if (stmts) + { + new_bb = bsi_insert_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } - new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); - gcc_assert (!new_bb); + t = NULL_TREE; + t = tree_cons (NULL_TREE, init_expr, t); + for (i = 1; i < nunits; i++) + { + tree tmp; - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "created new init_stmt: "); - print_generic_expr (vect_dump, init_stmt, TDF_SLIM); - } - t = tree_cons (NULL_TREE, new_name, t); + /* Create: new_name_i = new_name + step_expr */ + tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr); + init_stmt = build_gimple_modify_stmt (new_var, tmp); + new_name = make_ssa_name (new_var, init_stmt); + GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name; + + new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); + gcc_assert (!new_bb); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "created new init_stmt: "); + print_generic_expr (vect_dump, init_stmt, TDF_SLIM); + } + t = tree_cons (NULL_TREE, new_name, t); + } + /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ + vec = build_constructor_from_list (vectype, nreverse (t)); + vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); } - vec = build_constructor_from_list (vectype, nreverse (t)); - vec_init = vect_init_vector (stmt, vec, vectype); /* Create the vector that holds the step of the induction. */ - expr = build_int_cst (scalar_type, vf); - new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); + if (nested_in_vect_loop) + /* iv_loop is nested in the loop to be vectorized. Generate: + vec_step = [S, S, S, S] */ + new_name = step_expr; + else + { + /* iv_loop is the loop to be vectorized. 
Generate:
+        vec_step = [VF*S, VF*S, VF*S, VF*S]  */
+      expr = build_int_cst (scalar_type, vf);
+      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
+    }
+
   t = NULL_TREE;
   for (i = 0; i < nunits; i++)
     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
   vec = build_constructor_from_list (vectype, t);
-  vec_step = vect_init_vector (stmt, vec, vectype);
+  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);


   /* Create the following def-use cycle:
      loop prolog:
-        vec_init = [X, X+S, X+2*S, X+3*S]
-        vec_step = [VF*S, VF*S, VF*S, VF*S]
+        vec_init = ...
+        vec_step = ...
      loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

   /* Create the induction-phi that defines the induction-operand.  */
   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
   add_referenced_var (vec_dest);
-  induction_phi = create_phi_node (vec_dest, loop->header);
+  induction_phi = create_phi_node (vec_dest, iv_loop->header);
   set_stmt_info (get_stmt_ann (induction_phi),
                  new_stmt_vec_info (induction_phi, loop_vinfo));
   induc_def = PHI_RESULT (induction_phi);
@@ -1209,15 +1430,16 @@ get_initial_def_for_induction (tree iv_phi)
                                            induc_def, vec_step));
   vec_def = make_ssa_name (vec_dest, new_stmt);
   GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
-  bsi = bsi_for_stmt (stmt);
-  vect_finish_stmt_generation (stmt, new_stmt, &bsi);
+  bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
+  set_stmt_info (get_stmt_ann (new_stmt),
+                 new_stmt_vec_info (new_stmt, loop_vinfo));

   /* Set the arguments of the phi node:  */
-  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
-  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));
+  add_phi_arg (induction_phi, vec_init, pe);
+  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));

   /* In case the vectorization factor (VF) is bigger than the number
      of elements that we can fit in a vectype (nunits), we have to generate
      more than one vector stmt - i.e - we need to "unroll" the vector
      stmt by a factor VF/nunits.  For more details see documentation
@@ -1226,6 +1448,8 @@ get_initial_def_for_induction (tree iv_phi)
   if (ncopies > 1)
     {
       stmt_vec_info prev_stmt_vinfo;
+      /* FORNOW. This restriction should be relaxed.  */
+      gcc_assert (!nested_in_vect_loop);

       /* Create the vector that holds the step of the induction. 
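        Between the ncopies copies the step is nunits*S rather than VF*S;
        e.g. with VF = 8, nunits = 4, S = 1 (schematically), the loop phi
        advances by {8,8,8,8} per vector iteration while the second copy is

            vec_1 = vec_iv + {4, 4, 4, 4}    # nunits*S in each element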
*/ expr = build_int_cst (scalar_type, nunits); @@ -1234,7 +1458,7 @@ get_initial_def_for_induction (tree iv_phi) for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); - vec_step = vect_init_vector (stmt, vec, vectype); + vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); vec_def = induc_def; prev_stmt_vinfo = vinfo_for_stmt (induction_phi); @@ -1242,19 +1466,50 @@ get_initial_def_for_induction (tree iv_phi) { tree tmp; - /* vec_i = vec_prev + vec_{step*nunits} */ + /* vec_i = vec_prev + vec_step */ tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step); new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp); vec_def = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; - bsi = bsi_for_stmt (stmt); - vect_finish_stmt_generation (stmt, new_stmt, &bsi); - + bsi_insert_before (&si, new_stmt, BSI_SAME_STMT); + set_stmt_info (get_stmt_ann (new_stmt), + new_stmt_vec_info (new_stmt, loop_vinfo)); STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; prev_stmt_vinfo = vinfo_for_stmt (new_stmt); } } + if (nested_in_vect_loop) + { + /* Find the loop-closed exit-phi of the induction, and record + the final vector of induction results: */ + exit_phi = NULL; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) + { + if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p)))) + { + exit_phi = USE_STMT (use_p); + break; + } + } + if (exit_phi) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); + /* FORNOW. Currently not supporting the case that an inner-loop induction + is not used in the outer-loop (i.e. only outside the outer-loop). */ + gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) + && !STMT_VINFO_LIVE_P (stmt_vinfo)); + + STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vector of inductions after inner-loop:"); + print_generic_expr (vect_dump, new_stmt, TDF_SLIM); + } + } + } + + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "transform induction: created def-use cycle:"); @@ -1290,7 +1545,6 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); int nunits = TYPE_VECTOR_SUBPARTS (vectype); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vec_inv; tree vec_cst; tree t = NULL_TREE; @@ -1341,7 +1595,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); vec_cst = build_vector (vector_type, t); - return vect_init_vector (stmt, vec_cst, vector_type); + return vect_init_vector (stmt, vec_cst, vector_type, NULL); } /* Case 2: operand is defined outside the loop - loop invariant. */ @@ -1362,8 +1616,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) /* FIXME: use build_constructor directly. */ vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); vec_inv = build_constructor_from_list (vector_type, t); - - return vect_init_vector (stmt, vec_inv, vector_type); + return vect_init_vector (stmt, vec_inv, vector_type, NULL); } /* Case 3: operand is defined inside the loop. 
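      That is, the operand is the result of a stmt that was already
      vectorized; e.g. for

          x = b + c;
          y = x * d;

      the vector def of x used while vectorizing the second stmt is taken
      from STMT_VINFO_VEC_STMT of the first (or from PHI_RESULT when the
      def stmt is a phi).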
*/ @@ -1376,14 +1629,20 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) def_stmt_info = vinfo_for_stmt (def_stmt); vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); gcc_assert (vec_stmt); - vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0); + if (TREE_CODE (vec_stmt) == PHI_NODE) + vec_oprnd = PHI_RESULT (vec_stmt); + else + vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0); return vec_oprnd; } /* Case 4: operand is defined by a loop header phi - reduction */ case vect_reduction_def: { + struct loop *loop; + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + loop = (bb_for_stmt (def_stmt))->loop_father; /* Get the def before the loop */ op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); @@ -1395,8 +1654,12 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) { gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); - /* Get the def before the loop */ - return get_initial_def_for_induction (def_stmt); + /* Get the def from the vectorized stmt. */ + def_stmt_info = vinfo_for_stmt (def_stmt); + vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); + gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE)); + vec_oprnd = PHI_RESULT (vec_stmt); + return vec_oprnd; } default: @@ -1477,7 +1740,6 @@ vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd) vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); gcc_assert (vec_stmt_for_operand); vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0); - return vec_oprnd; } @@ -1493,7 +1755,11 @@ vect_finish_stmt_generation (tree stmt, tree vec_stmt, stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + gcc_assert (stmt == bsi_stmt (*bsi)); + gcc_assert (TREE_CODE (stmt) != LABEL_EXPR); + bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); + set_stmt_info (get_stmt_ann (vec_stmt), new_stmt_vec_info (vec_stmt, loop_vinfo)); @@ -1561,6 +1827,8 @@ static tree get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); int nunits = TYPE_VECTOR_SUBPARTS (vectype); enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); @@ -1571,8 +1839,14 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def) tree t = NULL_TREE; int i; tree vector_type; + bool nested_in_vect_loop = false; gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)); + if (nested_in_vect_loop_p (loop, stmt)) + nested_in_vect_loop = true; + else + gcc_assert (loop == (bb_for_stmt (stmt))->loop_father); + vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL); switch (code) @@ -1580,7 +1854,10 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def) case WIDEN_SUM_EXPR: case DOT_PROD_EXPR: case PLUS_EXPR: - *adjustment_def = init_val; + if (nested_in_vect_loop) + *adjustment_def = vecdef; + else + *adjustment_def = init_val; /* Create a vector of zeros for init_def. 
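     E.g., for a (non-nested) sum reduction s += a[i] with initial value s0,
     schematically:

         init_def = {0, 0, 0, 0}    # feeds the loop-header phi
         adjustment_def = s0        # added back in the epilog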
*/ if (INTEGRAL_TYPE_P (type)) def_for_init = build_int_cst (type, 0); @@ -1669,24 +1946,31 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; - tree new_temp; + tree new_temp = NULL_TREE; tree new_name; - tree epilog_stmt; - tree new_scalar_dest, exit_phi; + tree epilog_stmt = NULL_TREE; + tree new_scalar_dest, exit_phi, new_dest; tree bitsize, bitpos, bytesize; enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); - tree scalar_initial_def; + tree adjustment_def; tree vec_initial_def; tree orig_name; imm_use_iterator imm_iter; use_operand_p use_p; - bool extract_scalar_result; - tree reduction_op; + bool extract_scalar_result = false; + tree reduction_op, expr; tree orig_stmt; tree use_stmt; tree operation = GIMPLE_STMT_OPERAND (stmt, 1); + bool nested_in_vect_loop = false; int op_type; + if (nested_in_vect_loop_p (loop, stmt)) + { + loop = loop->inner; + nested_in_vect_loop = true; + } + op_type = TREE_OPERAND_LENGTH (operation); reduction_op = TREE_OPERAND (operation, op_type-1); vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); @@ -1699,7 +1983,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, the scalar def before the loop, that defines the initial value of the reduction variable. */ vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, - &scalar_initial_def); + &adjustment_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); /* 1.2 set the loop-latch arg for the reduction-phi: */ @@ -1778,6 +2062,15 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, bitsize = TYPE_SIZE (scalar_type); bytesize = TYPE_SIZE_UNIT (scalar_type); + + /* In case this is a reduction in an inner-loop while vectorizing an outer + loop - we don't need to extract a single scalar result at the end of the + inner-loop. The final vector of partial results will be used in the + vectorized outer-loop, or reduced to a scalar result at the end of the + outer-loop. */ + if (nested_in_vect_loop) + goto vect_finalize_reduction; + /* 2.3 Create the reduction code, using one of the three schemes described above. */ @@ -1809,7 +2102,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); tree vec_temp; - if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) + if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) shift_code = VEC_RSHIFT_EXPR; else have_whole_vector_shift = false; @@ -1825,7 +2118,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, else { optab optab = optab_for_tree_code (code, vectype); - if (optab->handlers[mode].insn_code == CODE_FOR_nothing) + if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) have_whole_vector_shift = false; } @@ -1924,6 +2217,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, { tree rhs; + gcc_assert (!nested_in_vect_loop); if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "extract scalar result"); @@ -1942,25 +2236,42 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); } - /* 2.4 Adjust the final result by the initial value of the reduction +vect_finalize_reduction: + + /* 2.5 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then - 'scalar_initial_def' is zero). + 'adjustment_def' is zero). 
For example, if code is PLUS we create: + new_temp = loop_exit_def + adjustment_def */ - Create: - s_out4 = scalar_expr */ - - if (scalar_initial_def) + if (adjustment_def) { - tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def); - epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + if (nested_in_vect_loop) + { + gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); + expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); + new_dest = vect_create_destination_var (scalar_dest, vectype); + } + else + { + gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); + expr = build2 (code, scalar_type, new_temp, adjustment_def); + new_dest = vect_create_destination_var (scalar_dest, scalar_type); + } + epilog_stmt = build_gimple_modify_stmt (new_dest, expr); + new_temp = make_ssa_name (new_dest, epilog_stmt); GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; +#if 0 + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); +#else bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); +#endif } - /* 2.6 Replace uses of s_out0 with uses of s_out3 */ - /* Find the loop-closed-use at the loop exit of the original scalar result. + /* 2.6 Handle the loop-exit phi */ + + /* Replace uses of s_out0 with uses of s_out3: + Find the loop-closed-use at the loop exit of the original scalar result. (The reduction result is expected to have two immediate uses - one at the latch block, and one at the loop exit). */ exit_phi = NULL; @@ -1974,6 +2285,29 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, } /* We expect to have found an exit_phi because of loop-closed-ssa form. */ gcc_assert (exit_phi); + + if (nested_in_vect_loop) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); + + /* FORNOW. Currently not supporting the case that an inner-loop reduction + is not used in the outer-loop (but only outside the outer-loop). */ + gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) + && !STMT_VINFO_LIVE_P (stmt_vinfo)); + + epilog_stmt = adjustment_def ? epilog_stmt : new_phi; + STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; + set_stmt_info (get_stmt_ann (epilog_stmt), + new_stmt_vec_info (epilog_stmt, loop_vinfo)); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vector of partial results after inner-loop:"); + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } + return; + } + /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) @@ -2055,15 +2389,30 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) tree new_stmt = NULL_TREE; int j; + if (nested_in_vect_loop_p (loop, stmt)) + { + loop = loop->inner; + /* FORNOW. This restriction should be relaxed. */ + if (ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + } + gcc_assert (ncopies >= 1); /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ - if (STMT_VINFO_RELEVANT_P (stmt_info)) + if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) return false; - if (!STMT_VINFO_LIVE_P (stmt_info)) + /* Reductions that are not used even in an enclosing outer-loop, + are expected to be "live" (used out of the loop). 
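     E.g., a minimal scalar example of such a reduction:

         s = 0;
         for (i = 0; i < n; i++)
           s += a[i];
         use (s);    # s is 'live': used after the loop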
*/ + if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop + && !STMT_VINFO_LIVE_P (stmt_info)) return false; /* Make sure it was already recognized as a reduction computation. */ @@ -2120,9 +2469,9 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) gcc_assert (dt == vect_reduction_def); gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); if (orig_stmt) - gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); + gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); else - gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); + gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) return false; @@ -2138,7 +2487,7 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } vec_mode = TYPE_MODE (vectype); - if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) + if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "op not supported by target."); @@ -2218,7 +2567,7 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) fprintf (vect_dump, "no optab for reduction."); epilog_reduc_code = NUM_TREE_CODES; } - if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) + if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "reduc op not supported by target."); @@ -2347,6 +2696,7 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) int nunits_in; int nunits_out; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type; enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; tree new_stmt; @@ -2456,6 +2806,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) needs to be generated. */ gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; @@ -2470,6 +2828,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform operation."); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + /* Handle def. */ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); vec_dest = vect_create_destination_var (scalar_dest, vectype_out); @@ -2661,6 +3027,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; tree decl1 = NULL_TREE, decl2 = NULL_TREE; tree new_temp; @@ -2742,6 +3109,14 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, needs to be generated. 
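     E.g., with VF = 8 and a vector type that holds nunits = 4 elements,
     ncopies = VF/nunits = 2 vector stmts are created for each scalar stmt.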
*/ gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + /* Check the operands of the operation. */ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) { @@ -3083,6 +3458,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code; enum machine_mode vec_mode; tree new_temp; @@ -3101,6 +3477,13 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) int j; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; @@ -3175,7 +3558,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } vec_mode = TYPE_MODE (vectype); - icode = (int) optab->handlers[(int) vec_mode].insn_code; + icode = (int) optab_handler (optab, vec_mode)->insn_code; if (icode == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -3363,6 +3746,7 @@ vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, tree vec_oprnd0=NULL, vec_oprnd1=NULL; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK; tree new_temp; tree def, def_stmt; @@ -3415,6 +3799,13 @@ vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) && INTEGRAL_TYPE_P (TREE_TYPE (op0))) @@ -3512,6 +3903,7 @@ vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, tree vec_oprnd0=NULL, vec_oprnd1=NULL; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; tree decl1 = NULL_TREE, decl2 = NULL_TREE; int op_type; @@ -3565,6 +3957,13 @@ vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (! 
((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) && INTEGRAL_TYPE_P (TREE_TYPE (op0))) @@ -3691,9 +4090,9 @@ vect_strided_store_supported (tree vectype) return false; } - if (interleave_high_optab->handlers[(int) mode].insn_code + if (optab_handler (interleave_high_optab, mode)->insn_code == CODE_FOR_nothing - || interleave_low_optab->handlers[(int) mode].insn_code + || optab_handler (interleave_low_optab, mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -3857,11 +4256,10 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL; tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum machine_mode vec_mode; tree dummy; - enum dr_alignment_support alignment_support_cheme; - ssa_op_iter iter; - def_operand_p def_p; + enum dr_alignment_support alignment_support_scheme; tree def, def_stmt; enum vect_def_type dt; stmt_vec_info prev_stmt_info = NULL; @@ -3873,8 +4271,18 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) bool strided_store = false; unsigned int group_size, i; VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; + bool inv_p; + gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; @@ -3910,7 +4318,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) vec_mode = TYPE_MODE (vectype); /* FORNOW. In some cases can vectorize even if data-type not supported (e.g. - array initialization with 0). */ - if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing) + if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing) return false; if (!STMT_VINFO_DATA_REF (stmt_info)) @@ -3940,6 +4348,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; + /* FORNOW */ + gcc_assert (!nested_in_vect_loop_p (loop, stmt)); + /* We vectorize all the stmts of the interleaving group when we reach the last stmt in the group. 
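     E.g., for an interleaved store group of size 2 (a schematic example):

         for (i = 0; i < n; i++)
           {
             p[2*i] = x;       # first stmt in the group: only recorded
             p[2*i+1] = y;     # last stmt in the group: both are emitted
           }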
*/ if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) @@ -3962,9 +4373,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) dr_chain = VEC_alloc (tree, heap, group_size); oprnds = VEC_alloc (tree, heap, group_size); - alignment_support_cheme = vect_supportable_dr_alignment (first_dr); - gcc_assert (alignment_support_cheme); - gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */ + alignment_support_scheme = vect_supportable_dr_alignment (first_dr); + gcc_assert (alignment_support_scheme); + gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */ /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate @@ -4034,9 +4445,10 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) VEC_quick_push(tree, oprnds, vec_oprnd); next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); } - dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE, + dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, &dummy, &ptr_incr, false, - TREE_TYPE (vec_oprnd)); + TREE_TYPE (vec_oprnd), &inv_p); + gcc_assert (!inv_p); } else { @@ -4054,7 +4466,8 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) VEC_replace(tree, dr_chain, i, vec_oprnd); VEC_replace(tree, oprnds, i, vec_oprnd); } - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_store) @@ -4078,43 +4491,20 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Arguments are ready. Create the new vector stmt. */ new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd); vect_finish_stmt_generation (stmt, new_stmt, bsi); - - /* Set the VDEFs for the vector pointer. If this virtual def - has a use outside the loop and a loop peel is performed - then the def may be renamed by the peel. Mark it for - renaming so the later use will also be renamed. */ - copy_virtual_operands (new_stmt, next_stmt); - if (j == 0) - { - /* The original store is deleted so the same SSA_NAMEs - can be used. */ - FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF) - { - SSA_NAME_DEF_STMT (def) = new_stmt; - mark_sym_for_renaming (SSA_NAME_VAR (def)); - } - - STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; - } + mark_symbols_for_renaming (new_stmt); + + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; else - { - /* Create new names for all the definitions created by COPY and - add replacement mappings for each new name. */ - FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF) - { - create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p); - mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p))); - } - - STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; - } + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; prev_stmt_info = vinfo_for_stmt (new_stmt); next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); if (!next_stmt) break; /* Bump the vector pointer. */ - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } } @@ -4125,14 +4515,17 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using - the dr_unaligned_software_pipeline scheme. + the dr_explicit_realign[_optimized] scheme. 
This function generates the following code at the loop prolog: p = initial_addr; - msq_init = *(floor(p)); # prolog load + x msq_init = *(floor(p)); # prolog load realignment_token = call target_builtin; loop: - msq = phi (msq_init, ---) + x msq = phi (msq_init, ---) + + The stmts marked with x are generated only for the case of + dr_explicit_realign_optimized. The code above sets up a new (vector) pointer, pointing to the first location accessed by STMT, and a "floor-aligned" load using that pointer. @@ -4141,19 +4534,29 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) whose arguments are the result of the prolog-load (created by this function) and the result of a load that takes place in the loop (to be created by the caller to this function). + + For the case of dr_explicit_realign_optimized: The caller to this function uses the phi-result (msq) to create the realignment code inside the loop, and sets up the missing phi argument, as follows: - loop: msq = phi (msq_init, lsq) lsq = *(floor(p')); # load in loop result = realign_load (msq, lsq, realignment_token); + For the case of dr_explicit_realign: + loop: + msq = *(floor(p)); # load in loop + p' = p + (VS-1); + lsq = *(floor(p')); # load in loop + result = realign_load (msq, lsq, realignment_token); + Input: STMT - (scalar) load stmt to be vectorized. This load accesses a memory location that may be unaligned. BSI - place where new code is to be inserted. + ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes + is used. Output: REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load @@ -4162,45 +4565,144 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) static tree vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, - tree *realignment_token) + tree *realignment_token, + enum dr_alignment_support alignment_support_scheme, + tree init_addr, + struct loop **at_loop) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - edge pe = loop_preheader_edge (loop); + edge pe; tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); tree vec_dest; - tree init_addr; tree inc; tree ptr; tree data_ref; tree new_stmt; basic_block new_bb; - tree msq_init; + tree msq_init = NULL_TREE; tree new_temp; tree phi_stmt; - tree msq; + tree msq = NULL_TREE; + tree stmts = NULL_TREE; + bool inv_p; + bool compute_in_loop = false; + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + struct loop *loop_for_initial_load; + + gcc_assert (alignment_support_scheme == dr_explicit_realign + || alignment_support_scheme == dr_explicit_realign_optimized); + + /* We need to generate three things: + 1. the misalignment computation + 2. the extra vector load (for the optimized realignment scheme). + 3. the phi node for the two vectors from which the realignment is + done (for the optimized realignment scheme). + */ + + /* 1. Determine where to generate the misalignment computation. + + If INIT_ADDR is NULL_TREE, this indicates that the misalignment + calculation will be generated by this function, outside the loop (in the + preheader). Otherwise, INIT_ADDR had already been computed for us by the + caller, inside the loop. 
+ + Background: If the misalignment remains fixed throughout the iterations of + the loop, then both realignment schemes are applicable, and also the + misalignment computation can be done outside LOOP. This is because we are + vectorizing LOOP, and so the memory accesses in LOOP advance in steps that + are a multiple of VS (the Vector Size), and therefore the misalignment in + different vectorized LOOP iterations is always the same. + The problem arises only if the memory access is in an inner-loop nested + inside LOOP, which is now being vectorized using outer-loop vectorization. + This is the only case when the misalignment of the memory access may not + remain fixed throughout the iterations of the inner-loop (as explained in + detail in vect_supportable_dr_alignment). In this case, not only is the + optimized realignment scheme not applicable, but also the misalignment + computation (and generation of the realignment token that is passed to + REALIGN_LOAD) have to be done inside the loop. + + In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode + or not, which in turn determines if the misalignment is computed inside + the inner-loop, or outside LOOP. */ + + if (init_addr != NULL_TREE) + { + compute_in_loop = true; + gcc_assert (alignment_support_scheme == dr_explicit_realign); + } + + + /* 2. Determine where to generate the extra vector load. + + For the optimized realignment scheme, instead of generating two vector + loads in each iteration, we generate a single extra vector load in the + preheader of the loop, and in each iteration reuse the result of the + vector load from the previous iteration. In case the memory access is in + an inner-loop nested inside LOOP, which is now being vectorized using + outer-loop vectorization, we need to determine whether this initial vector + load should be generated at the preheader of the inner-loop, or can be + generated at the preheader of LOOP. If the memory access has no evolution + in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has + to be generated inside LOOP (in the preheader of the inner-loop). */ + + if (nested_in_vect_loop) + { + tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); + bool invariant_in_outerloop = + (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); + loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); + } + else + loop_for_initial_load = loop; + if (at_loop) + *at_loop = loop_for_initial_load; - /* 1. Create msq_init = *(floor(p1)) in the loop preheader */ - vec_dest = vect_create_destination_var (scalar_dest, vectype); - ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true, - NULL_TREE); - data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); - new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); - new_temp = make_ssa_name (vec_dest, new_stmt); - GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); - msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); - copy_virtual_operands (new_stmt, stmt); - update_vuses_to_preheader (new_stmt, loop); + /* 3. For the case of the optimized realignment, create the first vector + load at the loop preheader.
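A quick arithmetic illustration of the Background paragraph above (an editor's sketch, not part of the patch): when successive accesses advance by whole vectors, addr % VS is the same in every iteration, which is what lets the misalignment computation stay outside LOOP; a step that is not a multiple of VS, as can happen for an inner-loop access under outer-loop vectorization, changes it from iteration to iteration. VS is an assumed vector size here.

#include <stdint.h>
#include <stdio.h>

#define VS 16   // assumed vector size in bytes

int
main (void)
{
  uintptr_t addr = 0x1003;   // hypothetical misaligned start address
  int i;
  for (i = 0; i < 4; i++)
    {
      // step of one whole vector: misalignment stays 3 in every iteration
      printf ("VS step: %lu  ",
              (unsigned long) ((addr + (uintptr_t) (i * VS)) % VS));
      // step of 20 bytes (not a multiple of VS): misalignment drifts
      printf ("non-VS step: %lu\n",
              (unsigned long) ((addr + (uintptr_t) (i * 20)) % VS));
    }
  return 0;
}

The first column prints 3, 3, 3, 3; the second prints 3, 7, 11, 15, the drifting case that forces COMPUTE_IN_LOOP.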
*/ + + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + /* Create msq_init = *(floor(p1)) in the loop preheader */ + + gcc_assert (!compute_in_loop); + pe = loop_preheader_edge (loop_for_initial_load); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE, + &init_addr, &inc, true, NULL_TREE, &inv_p); + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); + new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); + } + + /* 4. Create realignment token using a target builtin, if available. + It is done either inside the containing loop, or before LOOP (as + determined above). */ - /* 2. Create permutation mask, if required, in loop preheader. */ if (targetm.vectorize.builtin_mask_for_load) { tree builtin_decl; + /* Compute INIT_ADDR - the initial address accessed by this memref. */ + if (compute_in_loop) + gcc_assert (init_addr); /* already computed by the caller. */ + else + { + /* Generate the INIT_ADDR computation outside LOOP. */ + init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, + NULL_TREE, loop); + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + builtin_decl = targetm.vectorize.builtin_mask_for_load (); new_stmt = build_call_expr (builtin_decl, 1, init_addr); vec_dest = vect_create_destination_var (scalar_dest, @@ -4208,8 +4710,17 @@ vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); + + if (compute_in_loop) + bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT); + else + { + /* Generate the misalignment computation outside LOOP. */ + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + } + *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0); /* The result of the CALL_EXPR to this builtin is determined from @@ -4220,12 +4731,21 @@ vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, gcc_assert (TREE_READONLY (builtin_decl)); } - /* 3. Create msq = phi in loop */ + if (alignment_support_scheme == dr_explicit_realign) + return msq; + + gcc_assert (!compute_in_loop); + gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); + + + /* 5.
Create msq = phi in loop */ + + pe = loop_preheader_edge (containing_loop); vec_dest = vect_create_destination_var (scalar_dest, vectype); msq = make_ssa_name (vec_dest, NULL_TREE); - phi_stmt = create_phi_node (msq, loop->header); + phi_stmt = create_phi_node (msq, containing_loop->header); SSA_NAME_DEF_STMT (msq) = phi_stmt; - add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop)); + add_phi_arg (phi_stmt, msq_init, pe); return msq; } @@ -4252,7 +4772,7 @@ vect_strided_load_supported (tree vectype) return false; } - if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing) + if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "perm_even op not supported by target."); @@ -4267,7 +4787,7 @@ vect_strided_load_supported (tree vectype) return false; } - if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing) + if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "perm_odd op not supported by target."); @@ -4515,13 +5035,15 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) stmt_vec_info prev_stmt_info; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; tree new_stmt = NULL_TREE; tree dummy; - enum dr_alignment_support alignment_support_cheme; + enum dr_alignment_support alignment_support_scheme; tree dataref_ptr = NULL_TREE; tree ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); @@ -4530,10 +5052,24 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; - tree phi_stmt = NULL_TREE; + tree phi = NULL_TREE; VEC(tree,heap) *dr_chain = NULL; bool strided_load = false; tree first_stmt; + tree scalar_type; + bool inv_p; + bool compute_in_loop = false; + struct loop *at_loop; + + gcc_assert (ncopies >= 1); + + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; @@ -4566,11 +5102,12 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_DATA_REF (stmt_info)) return false; + scalar_type = TREE_TYPE (DR_REF (dr)); mode = (int) TYPE_MODE (vectype); /* FORNOW. In some cases can vectorize even if data-type not supported (e.g. - data copies). */ - if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing) + if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "Aligned load, but unsupported type."); @@ -4581,6 +5118,8 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (DR_GROUP_FIRST_DR (stmt_info)) { strided_load = true; + /* FORNOW */ + gcc_assert (! nested_in_vect_loop); /* Check if interleaving is supported. 
*/ if (!vect_strided_load_supported (vectype)) @@ -4619,9 +5158,8 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) group_size = 1; } - alignment_support_cheme = vect_supportable_dr_alignment (first_dr); - gcc_assert (alignment_support_cheme); - + alignment_support_scheme = vect_supportable_dr_alignment (first_dr); + gcc_assert (alignment_support_scheme); /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate @@ -4703,7 +5241,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } Otherwise, the data reference is potentially unaligned on a target that - does not support unaligned accesses (dr_unaligned_software_pipeline) - + does not support unaligned accesses (dr_explicit_realign_optimized) - then generate the following code, in which the data in each iteration is obtained by two vector loads, one from the previous iteration, and one from the current iteration: @@ -4720,27 +5258,52 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) msq = lsq; } */ - if (alignment_support_cheme == dr_unaligned_software_pipeline) + /* If the misalignment remains the same throughout the execution of the + loop, we can create the init_addr and permutation mask at the loop + preheader. Otherwise, it needs to be created inside the loop. + This can only occur when vectorizing memory accesses in the inner-loop + nested within an outer-loop that is being vectorized. */ + + if (nested_in_vect_loop_p (loop, stmt) + && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0)) { - msq = vect_setup_realignment (first_stmt, bsi, &realignment_token); - phi_stmt = SSA_NAME_DEF_STMT (msq); - offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized); + compute_in_loop = true; } + if ((alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign) + && !compute_in_loop) + { + msq = vect_setup_realignment (first_stmt, bsi, &realignment_token, + alignment_support_scheme, NULL_TREE, + &at_loop); + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + phi = SSA_NAME_DEF_STMT (msq); + offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + } + } + else + at_loop = loop; + prev_stmt_info = NULL; for (j = 0; j < ncopies; j++) { /* 1. Create the vector pointer update chain. */ if (j == 0) - dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy, - &ptr_incr, false, NULL_TREE); + dataref_ptr = vect_create_data_ref_ptr (first_stmt, + at_loop, offset, + &dummy, &ptr_incr, false, + NULL_TREE, &inv_p); else - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); for (i = 0; i < group_size; i++) { /* 2. Create the vector-load in the loop. */ - switch (alignment_support_cheme) + switch (alignment_support_scheme) { case dr_aligned: gcc_assert (aligned_access_p (first_dr)); @@ -4751,14 +5314,39 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) int mis = DR_MISALIGNMENT (first_dr); tree tmis = (mis == -1 ? 
size_zero_node : size_int (mis)); - gcc_assert (!aligned_access_p (first_dr)); tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); break; } - case dr_unaligned_software_pipeline: - gcc_assert (!aligned_access_p (first_dr)); + case dr_explicit_realign: + { + tree ptr, bump; + tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + + if (compute_in_loop) + msq = vect_setup_realignment (first_stmt, bsi, + &realignment_token, + dr_explicit_realign, + dataref_ptr, NULL); + + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + copy_virtual_operands (new_stmt, stmt); + mark_symbols_for_renaming (new_stmt); + msq = new_temp; + + bump = size_binop (MULT_EXPR, vs_minus_1, + TYPE_SIZE_UNIT (scalar_type)); + ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump); + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); + break; + } + case dr_explicit_realign_optimized: data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); break; default: gcc_unreachable (); @@ -4769,32 +5357,72 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); - copy_virtual_operands (new_stmt, stmt); mark_symbols_for_renaming (new_stmt); - /* 3. Handle explicit realignment if necessary/supported. */ - if (alignment_support_cheme == dr_unaligned_software_pipeline) + /* 3. Handle explicit realignment if necessary/supported. Create in + loop: vec_dest = realign_load (msq, lsq, realignment_token) */ + if (alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign) { - /* Create in loop: - */ lsq = GIMPLE_STMT_OPERAND (new_stmt, 0); if (!realignment_token) realignment_token = dataref_ptr; vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_stmt = - build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token); + new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, + realignment_token); new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); - if (i == group_size - 1 && j == ncopies - 1) - add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop)); - msq = lsq; + + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + if (i == group_size - 1 && j == ncopies - 1) + add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); + msq = lsq; + } } + + /* 4. Handle invariant-load. */ + if (inv_p) + { + gcc_assert (!strided_load); + gcc_assert (nested_in_vect_loop_p (loop, stmt)); + if (j == 0) + { + int k; + tree t = NULL_TREE; + tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type); + + /* CHECKME: bitpos depends on endianness?
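As an aside, the invariant-load handling being generated here has a simple scalar model: one value is loaded once in the outer loop and then replicated into all NUNITS vector elements, which is what the CONSTRUCTOR built from the tree_cons chain below expresses. The following self-contained C sketch is illustrative only; NUNITS and splat_model are invented names.

#include <stdio.h>

#define NUNITS 4   // assumed elements per vector

// Replicate one scalar into every element, like the vector CONSTRUCTOR
// that vect_init_vector receives for an invariant load.
static void
splat_model (int scalar, int vec[NUNITS])
{
  int k;
  for (k = NUNITS - 1; k >= 0; --k)   // mirrors the k loop below
    vec[k] = scalar;
}

int
main (void)
{
  int v[NUNITS];
  splat_model (7, v);
  printf ("%d %d %d %d\n", v[0], v[1], v[2], v[3]);   // prints: 7 7 7 7
  return 0;
}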
*/ + bitpos = bitsize_zero_node; + vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, + bitsize, bitpos); + BIT_FIELD_REF_UNSIGNED (vec_inv) = + TYPE_UNSIGNED (scalar_type); + vec_dest = + vect_create_destination_var (scalar_dest, NULL_TREE); + new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + + for (k = nunits - 1; k >= 0; --k) + t = tree_cons (NULL_TREE, new_temp, t); + /* FIXME: use build_constructor directly. */ + vec_inv = build_constructor_from_list (vectype, t); + new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi); + new_stmt = SSA_NAME_DEF_STMT (new_temp); + } + else + gcc_unreachable (); /* FORNOW. */ + } + if (strided_load) VEC_quick_push (tree, dr_chain, new_temp); if (i < group_size - 1) - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_load) @@ -4831,6 +5459,7 @@ vectorizable_live_operation (tree stmt, tree operation; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int op_type; tree op; @@ -4848,6 +5477,10 @@ vectorizable_live_operation (tree stmt, if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) return false; + /* FORNOW. CHECKME. */ + if (nested_in_vect_loop_p (loop, stmt)) + return false; + operation = GIMPLE_STMT_OPERAND (stmt, 1); op_type = TREE_OPERAND_LENGTH (operation); @@ -4902,7 +5535,8 @@ vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo) if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt)) return false; } - else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST) + else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST + && TREE_CODE (lhs) != FIXED_CST) return false; if (TREE_CODE (rhs) == SSA_NAME) @@ -4911,7 +5545,8 @@ vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo) if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt)) return false; } - else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST) + else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST + && TREE_CODE (rhs) != FIXED_CST) return false; return true; @@ -4992,7 +5627,8 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } else if (TREE_CODE (then_clause) != INTEGER_CST - && TREE_CODE (then_clause) != REAL_CST) + && TREE_CODE (then_clause) != REAL_CST + && TREE_CODE (then_clause) != FIXED_CST) return false; if (TREE_CODE (else_clause) == SSA_NAME) @@ -5003,7 +5639,8 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } else if (TREE_CODE (else_clause) != INTEGER_CST - && TREE_CODE (else_clause) != REAL_CST) + && TREE_CODE (else_clause) != REAL_CST + && TREE_CODE (else_clause) != FIXED_CST) return false; @@ -5262,82 +5899,6 @@ vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, } -/* Function update_vuses_to_preheader. - - Input: - STMT - a statement with potential VUSEs. - LOOP - the loop whose preheader will contain STMT. - - It's possible to vectorize a loop even though an SSA_NAME from a VUSE - appears to be defined in a VDEF in another statement in a loop. 
- One such case is when the VUSE is at the dereference of a __restricted__ - pointer in a load and the VDEF is at the dereference of a different - __restricted__ pointer in a store. Vectorization may result in - copy_virtual_uses being called to copy the problematic VUSE to a new - statement that is being inserted in the loop preheader. This procedure - is called to change the SSA_NAME in the new statement's VUSE from the - SSA_NAME updated in the loop to the related SSA_NAME available on the - path entering the loop. - - When this function is called, we have the following situation: - - # vuse - S1: vload - do { - # name1 = phi < name0 , name2> - - # vuse - S2: vload - - # name2 = vdef - S3: vstore - - }while... - - Stmt S1 was created in the loop preheader block as part of misaligned-load - handling. This function fixes the name of the vuse of S1 from 'name1' to - 'name0'. */ - -static void -update_vuses_to_preheader (tree stmt, struct loop *loop) -{ - basic_block header_bb = loop->header; - edge preheader_e = loop_preheader_edge (loop); - ssa_op_iter iter; - use_operand_p use_p; - - FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE) - { - tree ssa_name = USE_FROM_PTR (use_p); - tree def_stmt = SSA_NAME_DEF_STMT (ssa_name); - tree name_var = SSA_NAME_VAR (ssa_name); - basic_block bb = bb_for_stmt (def_stmt); - - /* For a use before any definitions, def_stmt is a NOP_EXPR. */ - if (!IS_EMPTY_STMT (def_stmt) - && flow_bb_inside_loop_p (loop, bb)) - { - /* If the block containing the statement defining the SSA_NAME - is in the loop then it's necessary to find the definition - outside the loop using the PHI nodes of the header. */ - tree phi; - bool updated = false; - - for (phi = phi_nodes (header_bb); phi; phi = PHI_CHAIN (phi)) - { - if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var) - { - SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx)); - updated = true; - break; - } - } - gcc_assert (updated); - } - } -} - - /* Function vect_update_ivs_after_vectorizer. "Advance" the induction variables of LOOP to the value they should take after the execution of LOOP. @@ -5632,8 +6193,8 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) else { tree new_stmts = NULL_TREE; - tree start_addr = - vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE); + tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, + &new_stmts, NULL_TREE, loop); tree ptr_type = TREE_TYPE (start_addr); tree size = TYPE_SIZE (ptr_type); tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); @@ -5806,6 +6367,7 @@ static tree vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, tree *cond_expr_stmt_list) { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); VEC(tree,heap) *may_misalign_stmts = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); tree ref_stmt, tmp; @@ -5841,8 +6403,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, /* create: addr_tmp = (int)(address_of_first_vector) */ addr_base = vect_create_addr_base_for_vector_ref (ref_stmt, - &new_stmt_list, - NULL_TREE); + &new_stmt_list, NULL_TREE, loop); if (new_stmt_list != NULL_TREE) append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list); @@ -5896,6 +6457,145 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, and_tmp_name, ptrsize_zero); } +/* Function vect_vfa_segment_size. + + Create an expression that computes the size of the segment + that will be accessed for a data reference. The function takes into + account that realignment loads may access one more vector.
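Numerically, the value constructed below is step * vf, plus one vector size when the optimized realignment scheme is in use. A small stand-alone C model (an editor's sketch; segment_size_model is an invented name):

#include <stdio.h>

// Byte length of the segment a data reference touches across VF
// iterations; a realigned load may read one extra vector past the end.
static unsigned long
segment_size_model (unsigned long step, unsigned long vf,
                    unsigned long vector_size, int uses_realign_opt)
{
  unsigned long len = step * vf;
  return uses_realign_opt ? len + vector_size : len;
}

int
main (void)
{
  // e.g. contiguous 4-byte elements, VF = 4, 16-byte vectors
  printf ("%lu\n", segment_size_model (4, 4, 16, 0));   // prints 16
  printf ("%lu\n", segment_size_model (4, 4, 16, 1));   // prints 32
  return 0;
}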
+ + Input: + DR: The data reference. + VECT_FACTOR: vectorization factor. + + Return an expression whose value is the size of the segment which will be accessed by DR. */ + +static tree +vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) +{ + tree segment_length; + + if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized) + { + tree vector_size = + build_int_cst (integer_type_node, + GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE + (vinfo_for_stmt (DR_STMT (dr)))))); + + segment_length = + fold_convert (sizetype, + fold_build2 (PLUS_EXPR, integer_type_node, + fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr), + vect_factor), + vector_size)); + } + else + { + segment_length = + fold_convert (sizetype, + fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr), + vect_factor)); + } + + return segment_length; +} + +/* Function vect_create_cond_for_alias_checks. + + Create a conditional expression that represents the run-time checks for + overlapping of address ranges represented by a list of data reference + relations passed as input. + + Input: + COND_EXPR - input conditional expression. New conditions will be chained + with a logical AND operation. + LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs + to be checked. + + Output: + COND_EXPR - conditional expression. + COND_EXPR_STMT_LIST - statements needed to construct the conditional + expression. + The returned value is the conditional expression to be used in the if + statement that controls which version of the loop gets executed at runtime. +*/ + +static void +vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, + tree * cond_expr, + tree * cond_expr_stmt_list) +{ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + VEC (ddr_p, heap) * may_alias_ddrs = + LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); + tree vect_factor = + build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + + ddr_p ddr; + unsigned int i; + tree part_cond_expr; + + /* Create expression + ((store_ptr_0 + store_segment_length_0) < load_ptr_0) + || ((load_ptr_0 + load_segment_length_0) < store_ptr_0) + && + ...
+ && + ((store_ptr_n + store_segment_length_n) < load_ptr_n) + || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */ + + if (VEC_empty (ddr_p, may_alias_ddrs)) + return; + + for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++) + { + tree stmt_a = DR_STMT (DDR_A (ddr)); + tree stmt_b = DR_STMT (DDR_B (ddr)); + + tree addr_base_a = + vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list, + NULL_TREE, loop); + tree addr_base_b = + vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list, + NULL_TREE, loop); + + tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor); + tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor); + + if (vect_print_dump_info (REPORT_DR_DETAILS)) + { + fprintf (vect_dump, + "create runtime check for data references "); + print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM); + fprintf (vect_dump, " and "); + print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM); + } + + + part_cond_expr = + fold_build2 (TRUTH_OR_EXPR, boolean_type_node, + fold_build2 (LT_EXPR, boolean_type_node, + fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a), + addr_base_a, + segment_length_a), + addr_base_b), + fold_build2 (LT_EXPR, boolean_type_node, + fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b), + addr_base_b, + segment_length_b), + addr_base_a)); + + if (*cond_expr) + *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + *cond_expr, part_cond_expr); + else + *cond_expr = part_cond_expr; + } + if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) + fprintf (vect_dump, "created %u versioning for alias checks.\n", + VEC_length (ddr_p, may_alias_ddrs)); + +} /* Function vect_transform_loop. @@ -5918,16 +6618,21 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vec_transform_loop ==="); - /* If the loop has data references that may or may not be aligned then + /* If the loop has data references that may or may not be aligned or/and + has data reference relations whose independence was not proven then two versions of the loop need to be generated, one which is vectorized and one which isn't. A test is then generated to control which of the loops is executed. The test checks for the alignment of all of the - data references that may or may not be aligned. */ + data references that may or may not be aligned. An additional + sequence of runtime tests is generated for each pairs of DDRs whose + independence was not proven. The vectorized version of loop is + executed only if both alias and alignment tests are passed. 
*/ - if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))) + if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) + || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) { struct loop *nloop; - tree cond_expr; + tree cond_expr = NULL_TREE; tree cond_expr_stmt_list = NULL_TREE; basic_block condition_bb; block_stmt_iterator cond_exp_bsi; @@ -5936,9 +6641,23 @@ vect_transform_loop (loop_vec_info loop_vinfo) edge new_exit_e, e; tree orig_phi, new_phi, arg; unsigned prob = 4 * REG_BR_PROB_BASE / 5; + tree gimplify_stmt_list; + + if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))) + cond_expr = + vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list); - cond_expr = vect_create_cond_for_align_checks (loop_vinfo, + if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) + vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list); + + cond_expr = + fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node); + cond_expr = + force_gimple_operand (cond_expr, &gimplify_stmt_list, true, + NULL_TREE); + append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list); + initialize_original_copy_tables (); nloop = loop_version (loop, cond_expr, &condition_bb, prob, prob, REG_BR_PROB_BASE - prob, true); @@ -6056,8 +6775,18 @@ vect_transform_loop (loop_vec_info loop_vinfo) fprintf (vect_dump, "------>vectorizing statement: "); print_generic_expr (vect_dump, stmt, TDF_SLIM); } + stmt_info = vinfo_for_stmt (stmt); - gcc_assert (stmt_info); + + /* vector stmts created in the outer-loop during vectorization of + stmts in an inner-loop may not have a stmt_info, and do not + need to be vectorized. */ + if (!stmt_info) + { + bsi_next (&si); + continue; + } + if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) { @@ -6129,4 +6858,6 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) fprintf (vect_dump, "LOOP VECTORIZED."); + if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) + fprintf (vect_dump, "OUTER LOOP VECTORIZED."); }
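To close, the control-flow shape that the versioning above produces can be sketched in plain C (an editor's illustration only; axpy_versioned and the 4-wide unrolling standing in for vector statements are invented): one combined guard, built from the alignment and alias checks, selects between the vectorized and the scalar version of the loop.

#include <stddef.h>

void
axpy_versioned (float *a, float *b, size_t n, int align_ok, int no_alias)
{
  size_t i = 0;
  if (align_ok && no_alias)        // combined alignment + alias guard
    {
      for (; i + 4 <= n; i += 4)   // "vectorized" body, VF = 4
        {
          a[i] += b[i];
          a[i + 1] += b[i + 1];
          a[i + 2] += b[i + 2];
          a[i + 3] += b[i + 3];
        }
      for (; i < n; i++)           // scalar epilogue for the remainder
        a[i] += b[i];
    }
  else
    {
      for (; i < n; i++)           // unvectorized fallback version
        a[i] += b[i];
    }
}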