1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 and Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
31 #include "basic-block.h"
32 #include "tree-pretty-print.h"
33 #include "gimple-pretty-print.h"
34 #include "tree-flow.h"
35 #include "tree-dump.h"
37 #include "tree-chrec.h"
38 #include "tree-scalar-evolution.h"
39 #include "tree-vectorizer.h"
40 #include "diagnostic-core.h"
42 /* Need to include rtl.h, expr.h, etc. for optabs. */
46 /* Return true if load- or store-lanes optab OPTAB is implemented for
47 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
50 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
51 tree vectype, unsigned HOST_WIDE_INT count)
53 enum machine_mode mode, array_mode;
56 mode = TYPE_MODE (vectype);
57 limit_p = !targetm.array_mode_supported_p (mode, count);
58 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
61 if (array_mode == BLKmode)
63 if (vect_print_dump_info (REPORT_DETAILS))
64 fprintf (vect_dump, "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (vect_print_dump_info (REPORT_DETAILS))
72 fprintf (vect_dump, "cannot use %s<%s><%s>",
73 name, GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
77 if (vect_print_dump_info (REPORT_DETAILS))
78 fprintf (vect_dump, "can use %s<%s><%s>",
79 name, GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
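/* Illustrative example (editor's note, not part of the original source):
   for a V4SImode VECTYPE and COUNT == 2, the code above asks for a
   32-byte integer mode (OImode, where the target provides one) and then
   checks that OPTAB, e.g. the vec_load_lanes optab, has a handler for
   converting between that array mode and V4SImode.  */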
85 /* Return the smallest scalar part of STMT.
86 This is used to determine the vectype of the stmt. We generally set the
87 vectype according to the type of the result (lhs). For stmts whose
88 result-type is different from the type of the arguments (e.g., demotion,
89 promotion), vectype will be reset appropriately (later). Note that we have
90 to visit the smallest datatype in this function, because that determines the
91 VF. If the smallest datatype in the loop is present only as the rhs of a
92 promotion operation - we'd miss it.
93 Such a case, where a variable of this datatype does not appear in the lhs
94 anywhere in the loop, can only occur if it's an invariant: e.g.:
95 'int_x = (int) short_inv', which we'd expect to have been optimized away by
96 invariant motion. However, we cannot rely on invariant motion to always
97 take invariants out of the loop, and so in the case of promotion we also
98 have to check the rhs.
99 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding types.  */
103 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
104 HOST_WIDE_INT *rhs_size_unit)
106 tree scalar_type = gimple_expr_type (stmt);
107 HOST_WIDE_INT lhs, rhs;
109 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
111 if (is_gimple_assign (stmt)
112 && (gimple_assign_cast_p (stmt)
113 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
114 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
116 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
118 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
120 scalar_type = rhs_type;
123 *lhs_size_unit = lhs;
124 *rhs_size_unit = rhs;
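/* Worked example (editor's note): for the promotion statement
   'int_x = (int) short_y', the lhs size is 4 bytes and the rhs size
   is 2, so the short type is returned as the smallest scalar type.
   With 16-byte vectors this yields a VF of 8 rather than the 4 that
   the int lhs alone would suggest.  */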
129 /* Find the place of the data-ref in STMT in the interleaving chain that starts
130 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
133 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
135 gimple next_stmt = first_stmt;
138 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
141 while (next_stmt && next_stmt != stmt)
144 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
154 /* Function vect_insert_into_interleaving_chain.
156 Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
159 vect_insert_into_interleaving_chain (struct data_reference *dra,
160 struct data_reference *drb)
164 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
165 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
167 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
168 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
171 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
172 if (tree_int_cst_compare (next_init, DR_INIT (dra)) > 0)
175 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
176 GROUP_NEXT_ELEMENT (stmtinfo_a) = next;
180 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
183 /* We got to the end of the list. Insert here. */
184 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
185 GROUP_NEXT_ELEMENT (stmtinfo_a) = NULL;
189 /* Function vect_update_interleaving_chain.
191 For two data-refs DRA and DRB that are a part of a chain interleaved data
192 accesses, update the interleaving chain. DRB's INIT is smaller than DRA's.
194 There are four possible cases:
195 1. New stmts - both DRA and DRB are not a part of any chain:
198 2. DRB is a part of a chain and DRA is not:
199 no need to update FIRST_DR
200 no need to insert DRB
201 insert DRA according to init
202 3. DRA is a part of a chain and DRB is not:
203 if (init of FIRST_DR > init of DRB)
205 NEXT(FIRST_DR) = previous FIRST_DR
207 insert DRB according to its init
208 4. both DRA and DRB are in some interleaving chains:
209 choose the chain with the smallest init of FIRST_DR
210 insert the nodes of the second chain into the first one. */
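/* Illustrative example of case 4 (editor's note): if chain A is
   { init 0, init 8 } and chain B is { init 4, init 12 }, chain A's
   FIRST_DR has the smaller init, so the nodes of chain B are merged
   into chain A, giving { 0, 4, 8, 12 } with every statement's
   GROUP_FIRST_ELEMENT pointing at the init-0 statement.  */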
213 vect_update_interleaving_chain (struct data_reference *drb,
214 struct data_reference *dra)
216 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
217 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
218 tree next_init, init_dra_chain, init_drb_chain;
219 gimple first_a, first_b;
221 gimple node, prev, next, first_stmt;
223 /* 1. New stmts - both DRA and DRB are not a part of any chain. */
224 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
226 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (drb);
227 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
228 GROUP_NEXT_ELEMENT (stmtinfo_b) = DR_STMT (dra);
232 /* 2. DRB is a part of a chain and DRA is not. */
233 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && GROUP_FIRST_ELEMENT (stmtinfo_b))
235 GROUP_FIRST_ELEMENT (stmtinfo_a) = GROUP_FIRST_ELEMENT (stmtinfo_b);
236 /* Insert DRA into the chain of DRB. */
237 vect_insert_into_interleaving_chain (dra, drb);
241 /* 3. DRA is a part of a chain and DRB is not. */
242 if (GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
244 gimple old_first_stmt = GROUP_FIRST_ELEMENT (stmtinfo_a);
245 tree init_old = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (
249 if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0)
251 /* DRB's init is smaller than the init of the stmt previously marked
252 as the first stmt of the interleaving chain of DRA. Therefore, we
253 update FIRST_STMT and put DRB in the head of the list. */
254 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
255 GROUP_NEXT_ELEMENT (stmtinfo_b) = old_first_stmt;
257 /* Update all the stmts in the list to point to the new FIRST_STMT. */
258 tmp = old_first_stmt;
261 GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) = DR_STMT (drb);
262 tmp = GROUP_NEXT_ELEMENT (vinfo_for_stmt (tmp));
267 /* Insert DRB in the list of DRA. */
268 vect_insert_into_interleaving_chain (drb, dra);
269 GROUP_FIRST_ELEMENT (stmtinfo_b) = GROUP_FIRST_ELEMENT (stmtinfo_a);
274 /* 4. both DRA and DRB are in some interleaving chains. */
275 first_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
276 first_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
277 if (first_a == first_b)
279 init_dra_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_a)));
280 init_drb_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_b)));
282 if (tree_int_cst_compare (init_dra_chain, init_drb_chain) > 0)
284 /* Insert the nodes of DRA chain into the DRB chain.
285 After inserting a node, continue from this node of the DRB chain (don't
286 start from the beginning).  */
287 node = GROUP_FIRST_ELEMENT (stmtinfo_a);
288 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
289 first_stmt = first_b;
293 /* Insert the nodes of DRB chain into the DRA chain.
294 After inserting a node, continue from this node of the DRA chain (don't
295 start from the beginning).  */
296 node = GROUP_FIRST_ELEMENT (stmtinfo_b);
297 prev = GROUP_FIRST_ELEMENT (stmtinfo_a);
298 first_stmt = first_a;
303 node_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (node)));
304 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
307 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
308 if (tree_int_cst_compare (next_init, node_init) > 0)
311 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
312 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = next;
317 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
321 /* We got to the end of the list. Insert here. */
322 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
323 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = NULL;
326 GROUP_FIRST_ELEMENT (vinfo_for_stmt (node)) = first_stmt;
327 node = GROUP_NEXT_ELEMENT (vinfo_for_stmt (node));
331 /* Check dependence between DRA and DRB for basic block vectorization.
332 If the accesses share the same bases and offsets, we can compare their initial
333 constant offsets to decide whether they differ or not. In case of a read-
334 write dependence we check that the load is before the store to ensure that
335 vectorization will not change the order of the accesses. */
338 vect_drs_dependent_in_basic_block (struct data_reference *dra,
339 struct data_reference *drb)
341 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
344 /* We only call this function for pairs of loads and stores, but we verify it here.  */
346 if (DR_IS_READ (dra) == DR_IS_READ (drb))
348 if (DR_IS_READ (dra))
354 /* Check that the data-refs have the same bases and offsets.  If not, we can't
355 determine if they are dependent. */
356 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
357 || !dr_equal_offsets_p (dra, drb))
360 /* Check the types. */
361 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
362 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
364 if (type_size_a != type_size_b
365 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
366 TREE_TYPE (DR_REF (drb))))
369 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
370 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
372 /* Two different locations - no dependence. */
373 if (init_a != init_b)
376 /* We have a read-write dependence. Check that the load is before the store.
377 When we vectorize basic blocks, vector load can be only before
378 corresponding scalar load, and vector store can be only after its
379 corresponding scalar store.  So the order of the accesses is preserved in
380 case the load is before the store. */
381 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
382 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 /* Function vect_check_interleaving.
391 Check if DRA and DRB are a part of interleaving. In case they are, insert
392 DRA and DRB in an interleaving chain. */
395 vect_check_interleaving (struct data_reference *dra,
396 struct data_reference *drb)
398 HOST_WIDE_INT type_size_a, type_size_b, diff_mod_size, step, init_a, init_b;
400 /* Check that the data-refs have the same first location (except init) and they
401 are both either store or load (not load and store). */
402 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
403 || !dr_equal_offsets_p (dra, drb)
404 || !tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb))
405 || DR_IS_READ (dra) != DR_IS_READ (drb))
409 1. data-refs are of the same type
410 2. their steps are equal
411 3. the step (if greater than zero) is greater than the difference between data-refs' inits.  */
413 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
414 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
416 if (type_size_a != type_size_b
417 || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb))
418 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
419 TREE_TYPE (DR_REF (drb))))
422 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
423 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
424 step = TREE_INT_CST_LOW (DR_STEP (dra));
428 /* If init_a == init_b + the size of the type * k, we have an interleaving,
429 and DRB is accessed before DRA. */
430 diff_mod_size = (init_a - init_b) % type_size_a;
432 if (step && (init_a - init_b) > step)
435 if (diff_mod_size == 0)
437 vect_update_interleaving_chain (drb, dra);
438 if (vect_print_dump_info (REPORT_DR_DETAILS))
440 fprintf (vect_dump, "Detected interleaving ");
441 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
442 fprintf (vect_dump, " and ");
443 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
450 /* If init_b == init_a + the size of the type * k, we have an
451 interleaving, and DRA is accessed before DRB. */
452 diff_mod_size = (init_b - init_a) % type_size_a;
454 if (step && (init_b - init_a) > step)
457 if (diff_mod_size == 0)
459 vect_update_interleaving_chain (dra, drb);
460 if (vect_print_dump_info (REPORT_DR_DETAILS))
462 fprintf (vect_dump, "Detected interleaving ");
463 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
464 fprintf (vect_dump, " and ");
465 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
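/* Worked example (editor's note): for accesses a[2*i] and a[2*i+1]
   with 4-byte elements, DR_STEP is 8 and the inits are 0 and 4, so
   diff_mod_size is 0 and the difference (4) does not exceed the step:
   the two references form a single interleaving chain of size 2.  */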
474 /* Check if the data references pointed to by DR_I and DR_J are the same or
475 belong to the same interleaving group.  Return FALSE if the drs are
476 different, otherwise return TRUE. */
479 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
481 gimple stmt_i = DR_STMT (dr_i);
482 gimple stmt_j = DR_STMT (dr_j);
484 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
485 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
486 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
487 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
488 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
494 /* If address ranges represented by DDR_I and DDR_J are equal,
495 return TRUE, otherwise return FALSE. */
498 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
500 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
501 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
502 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
503 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
509 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
510 tested at run-time. Return TRUE if DDR was successfully inserted.
511 Return FALSE if versioning is not supported.
514 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
516 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
518 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
521 if (vect_print_dump_info (REPORT_DR_DETAILS))
523 fprintf (vect_dump, "mark for run-time aliasing test between ");
524 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
525 fprintf (vect_dump, " and ");
526 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
529 if (optimize_loop_nest_for_size_p (loop))
531 if (vect_print_dump_info (REPORT_DR_DETAILS))
532 fprintf (vect_dump, "versioning not supported when optimizing for size.");
536 /* FORNOW: We don't support versioning with outer-loop vectorization. */
539 if (vect_print_dump_info (REPORT_DR_DETAILS))
540 fprintf (vect_dump, "versioning not yet supported for outer-loops.");
544 VEC_safe_push (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), ddr);
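/* Editor's note: the DDRs collected here are expanded into run-time
   overlap tests when the loop is versioned for alias; see
   vect_create_cond_for_alias_checks in tree-vect-loop-manip.c.  */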
549 /* Function vect_analyze_data_ref_dependence.
551 Return TRUE if there (might) exist a dependence between a memory-reference
552 DRA and a memory-reference DRB.  When versioning for alias can handle the
553 dependence with a run-time check, return FALSE.  Adjust *MAX_VF according to
554 the data dependence. */
557 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
558 loop_vec_info loop_vinfo, int *max_vf)
561 struct loop *loop = NULL;
562 struct data_reference *dra = DDR_A (ddr);
563 struct data_reference *drb = DDR_B (ddr);
564 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
565 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
566 lambda_vector dist_v;
567 unsigned int loop_depth;
569 /* Don't bother to analyze statements marked as unvectorizable. */
570 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
571 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
574 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
576 /* Independent data accesses. */
577 vect_check_interleaving (dra, drb);
582 loop = LOOP_VINFO_LOOP (loop_vinfo);
584 if ((DR_IS_READ (dra) && DR_IS_READ (drb) && loop_vinfo) || dra == drb)
587 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
593 if (vect_print_dump_info (REPORT_DR_DETAILS))
595 fprintf (vect_dump, "versioning for alias required: "
596 "can't determine dependence between ");
597 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
598 fprintf (vect_dump, " and ");
599 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
602 /* Add to list of ddrs that need to be tested at run-time. */
603 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
606 /* When vectorizing a basic block, an unknown dependence can still mean interleaving.  */
608 if (vect_check_interleaving (dra, drb))
611 /* Read-read is OK (we need this check here, after checking for interleaving).  */
613 if (DR_IS_READ (dra) && DR_IS_READ (drb))
616 if (vect_print_dump_info (REPORT_DR_DETAILS))
618 fprintf (vect_dump, "can't determine dependence between ");
619 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
620 fprintf (vect_dump, " and ");
621 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
624 /* We do not vectorize basic blocks with write-write dependencies. */
625 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
628 /* Check that it's not a load-after-store dependence. */
629 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
630 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
636 /* Versioning for alias is not yet supported for basic block SLP, and
637 dependence distance is inapplicable; hence, in case of known data
638 dependence, basic block vectorization is impossible for now. */
641 if (dra != drb && vect_check_interleaving (dra, drb))
644 if (vect_print_dump_info (REPORT_DR_DETAILS))
646 fprintf (vect_dump, "determined dependence between ");
647 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
648 fprintf (vect_dump, " and ");
649 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
652 /* Do not vectorize basic blocks with write-write dependences.  */
653 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
656 /* Check if this dependence is allowed in basic block vectorization. */
657 return vect_drs_dependent_in_basic_block (dra, drb);
660 /* Loop-based vectorization and known data dependence. */
661 if (DDR_NUM_DIST_VECTS (ddr) == 0)
663 if (vect_print_dump_info (REPORT_DR_DETAILS))
665 fprintf (vect_dump, "versioning for alias required: bad dist vector for ");
666 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
667 fprintf (vect_dump, " and ");
668 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
670 /* Add to list of ddrs that need to be tested at run-time. */
671 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
674 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
675 FOR_EACH_VEC_ELT (lambda_vector, DDR_DIST_VECTS (ddr), i, dist_v)
677 int dist = dist_v[loop_depth];
679 if (vect_print_dump_info (REPORT_DR_DETAILS))
680 fprintf (vect_dump, "dependence distance = %d.", dist);
684 if (vect_print_dump_info (REPORT_DR_DETAILS))
686 fprintf (vect_dump, "dependence distance == 0 between ");
687 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
688 fprintf (vect_dump, " and ");
689 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
692 /* For interleaving, mark that there is a read-write dependency if
693 necessary.  We have already checked that one of the data-refs is a store.  */
694 if (DR_IS_READ (dra))
695 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
698 if (DR_IS_READ (drb))
699 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
705 if (dist > 0 && DDR_REVERSED_P (ddr))
707 /* If DDR_REVERSED_P the order of the data-refs in DDR was
708 reversed (to make distance vector positive), and the actual
709 distance is negative. */
710 if (vect_print_dump_info (REPORT_DR_DETAILS))
711 fprintf (vect_dump, "dependence distance negative.");
716 && abs (dist) < *max_vf)
718 /* The dependence distance requires reduction of the maximal
719 vectorization factor. */
720 *max_vf = abs (dist);
721 if (vect_print_dump_info (REPORT_DR_DETAILS))
722 fprintf (vect_dump, "adjusting maximal vectorization factor to %i",
726 if (abs (dist) >= *max_vf)
728 /* Dependence distance does not create dependence, as far as
729 vectorization is concerned, in this case. */
730 if (vect_print_dump_info (REPORT_DR_DETAILS))
731 fprintf (vect_dump, "dependence distance >= VF.");
735 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
737 fprintf (vect_dump, "not vectorized, possible dependence "
738 "between data-refs ");
739 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
740 fprintf (vect_dump, " and ");
741 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
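/* Worked example for the distance checks above (editor's note): a
   dependence distance of 4 permits any vectorization factor up to 4,
   since four consecutive iterations then touch distinct locations;
   *max_vf is lowered to abs (dist) when needed.  A distance of 1, by
   contrast, cannot be accommodated and the dependence is reported.  */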
750 /* Function vect_analyze_data_ref_dependences.
752 Examine all the data references in the loop, and make sure there do not
753 exist any data dependences between them. Set *MAX_VF according to
754 the maximum vectorization factor the data dependences allow. */
757 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
758 bb_vec_info bb_vinfo, int *max_vf)
761 VEC (ddr_p, heap) *ddrs = NULL;
762 struct data_dependence_relation *ddr;
764 if (vect_print_dump_info (REPORT_DETAILS))
765 fprintf (vect_dump, "=== vect_analyze_dependences ===");
768 ddrs = LOOP_VINFO_DDRS (loop_vinfo);
770 ddrs = BB_VINFO_DDRS (bb_vinfo);
772 FOR_EACH_VEC_ELT (ddr_p, ddrs, i, ddr)
773 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
780 /* Function vect_compute_data_ref_alignment
782 Compute the misalignment of the data reference DR.
785 1. If during the misalignment computation it is found that the data reference
786 cannot be vectorized then false is returned.
787 2. DR_MISALIGNMENT (DR) is defined.
789 FOR NOW: No analysis is actually performed. Misalignment is calculated
790 only for trivial cases. TODO. */
793 vect_compute_data_ref_alignment (struct data_reference *dr)
795 gimple stmt = DR_STMT (dr);
796 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
797 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
798 struct loop *loop = NULL;
799 tree ref = DR_REF (dr);
801 tree base, base_addr;
804 tree aligned_to, alignment;
806 if (vect_print_dump_info (REPORT_DETAILS))
807 fprintf (vect_dump, "vect_compute_data_ref_alignment:");
810 loop = LOOP_VINFO_LOOP (loop_vinfo);
812 /* Initialize misalignment to unknown. */
813 SET_DR_MISALIGNMENT (dr, -1);
815 misalign = DR_INIT (dr);
816 aligned_to = DR_ALIGNED_TO (dr);
817 base_addr = DR_BASE_ADDRESS (dr);
818 vectype = STMT_VINFO_VECTYPE (stmt_info);
820 /* In case the dataref is in an inner-loop of the loop that is being
821 vectorized (LOOP), we use the base and misalignment information
822 relative to the outer-loop (LOOP). This is ok only if the misalignment
823 stays the same throughout the execution of the inner-loop, which is why
824 we have to check that the stride of the dataref in the inner-loop is
825 evenly divisible by the vector size.  */
826 if (loop && nested_in_vect_loop_p (loop, stmt))
828 tree step = DR_STEP (dr);
829 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
831 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
833 if (vect_print_dump_info (REPORT_ALIGNMENT))
834 fprintf (vect_dump, "inner step divides the vector-size.");
835 misalign = STMT_VINFO_DR_INIT (stmt_info);
836 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
837 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
841 if (vect_print_dump_info (REPORT_ALIGNMENT))
842 fprintf (vect_dump, "inner step doesn't divide the vector-size.");
843 misalign = NULL_TREE;
847 base = build_fold_indirect_ref (base_addr);
848 alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);
850 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
853 if (vect_print_dump_info (REPORT_ALIGNMENT))
855 fprintf (vect_dump, "Unknown alignment for access: ");
856 print_generic_expr (vect_dump, base, TDF_SLIM);
862 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
864 || (TREE_CODE (base_addr) == SSA_NAME
865 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
866 TREE_TYPE (base_addr)))),
868 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
871 base_aligned = false;
875 /* Do not change the alignment of global variables if
876 flag_section_anchors is enabled. */
877 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
878 || (TREE_STATIC (base) && flag_section_anchors))
880 if (vect_print_dump_info (REPORT_DETAILS))
882 fprintf (vect_dump, "can't force alignment of ref: ");
883 print_generic_expr (vect_dump, ref, TDF_SLIM);
888 /* Force the alignment of the decl.
889 NOTE: This is the only change to the code we make during
890 the analysis phase, before deciding to vectorize the loop. */
891 if (vect_print_dump_info (REPORT_DETAILS))
893 fprintf (vect_dump, "force alignment of ");
894 print_generic_expr (vect_dump, ref, TDF_SLIM);
897 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
898 DECL_USER_ALIGN (base) = 1;
901 /* At this point we assume that the base is aligned. */
902 gcc_assert (base_aligned
903 || (TREE_CODE (base) == VAR_DECL
904 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
906 /* If this is a backward running DR then the first access in the larger
907 vectype actually is N-1 elements before the address in the DR.
908 Adjust misalign accordingly. */
909 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
911 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
912 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
913 otherwise we wouldn't be here. */
914 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
915 /* PLUS because DR_STEP was negative. */
916 misalign = size_binop (PLUS_EXPR, misalign, offset);
919 /* Modulo alignment. */
920 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
922 if (!host_integerp (misalign, 1))
924 /* Negative or overflowed misalignment value. */
925 if (vect_print_dump_info (REPORT_DETAILS))
926 fprintf (vect_dump, "unexpected misalign value");
930 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
932 if (vect_print_dump_info (REPORT_DETAILS))
934 fprintf (vect_dump, "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
935 print_generic_expr (vect_dump, ref, TDF_SLIM);
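/* Worked example (editor's note): with a base known to be 16-byte
   aligned, DR_INIT of 4, a forward step and a 16-byte vectype, the
   computation above yields misalign = 4 mod 16 = 4 bytes.  For a
   backward-running V4SI access, the first vector element lies 3
   elements (12 bytes) before DR_INIT, which the PLUS adjustment
   accounts for before the final modulo.  */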
942 /* Function vect_compute_data_refs_alignment
944 Compute the misalignment of data references in the loop.
945 Return FALSE if a data reference is found that cannot be vectorized. */
948 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
949 bb_vec_info bb_vinfo)
951 VEC (data_reference_p, heap) *datarefs;
952 struct data_reference *dr;
956 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
958 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
960 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
961 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
962 && !vect_compute_data_ref_alignment (dr))
966 /* Mark unsupported statement as unvectorizable. */
967 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
978 /* Function vect_update_misalignment_for_peel
980 DR - the data reference whose misalignment is to be adjusted.
981 DR_PEEL - the data reference whose misalignment is being made
982 zero in the vector loop by the peel.
983 NPEEL - the number of iterations in the peel loop if the misalignment
984 of DR_PEEL is known at compile time. */
987 vect_update_misalignment_for_peel (struct data_reference *dr,
988 struct data_reference *dr_peel, int npeel)
991 VEC(dr_p,heap) *same_align_drs;
992 struct data_reference *current_dr;
993 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
994 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
995 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
996 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
998 /* For interleaved data accesses the step in the loop must be multiplied by
999 the size of the interleaving group. */
1000 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
1001 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1002 if (STMT_VINFO_STRIDED_ACCESS (peel_stmt_info))
1003 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1005 /* It can be assumed that the data refs with the same alignment as dr_peel
1006 are aligned in the vector loop. */
1008 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1009 FOR_EACH_VEC_ELT (dr_p, same_align_drs, i, current_dr)
1011 if (current_dr != dr)
1013 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
1014 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
1015 SET_DR_MISALIGNMENT (dr, 0);
1019 if (known_alignment_for_access_p (dr)
1020 && known_alignment_for_access_p (dr_peel))
1022 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1023 int misal = DR_MISALIGNMENT (dr);
1024 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1025 misal += negative ? -npeel * dr_size : npeel * dr_size;
1026 misal &= GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1027 SET_DR_MISALIGNMENT (dr, misal);
1031 if (vect_print_dump_info (REPORT_DETAILS))
1032 fprintf (vect_dump, "Setting misalignment to -1.");
1033 SET_DR_MISALIGNMENT (dr, -1);
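/* Worked example (editor's note): peeling NPEEL = 3 iterations of a
   forward int access (dr_size 4) whose misalignment is 4 in a 16-byte
   vector loop gives misal = (4 + 3*4) & 15 = 0, i.e. the access
   becomes aligned; with a negative step, 3*4 is subtracted instead.  */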
1037 /* Function vect_verify_datarefs_alignment
1039 Return TRUE if all data references in the loop can be
1040 handled with respect to alignment. */
1043 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1045 VEC (data_reference_p, heap) *datarefs;
1046 struct data_reference *dr;
1047 enum dr_alignment_support supportable_dr_alignment;
1051 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1053 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
1055 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1057 gimple stmt = DR_STMT (dr);
1058 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1060 /* For interleaving, only the alignment of the first access matters.
1061 Skip statements marked as not vectorizable. */
1062 if ((STMT_VINFO_STRIDED_ACCESS (stmt_info)
1063 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1064 || !STMT_VINFO_VECTORIZABLE (stmt_info))
1067 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1068 if (!supportable_dr_alignment)
1070 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1072 if (DR_IS_READ (dr))
1074 "not vectorized: unsupported unaligned load.");
1077 "not vectorized: unsupported unaligned store.");
1079 print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
1083 if (supportable_dr_alignment != dr_aligned
1084 && vect_print_dump_info (REPORT_ALIGNMENT))
1085 fprintf (vect_dump, "Vectorizing an unaligned access.");
1091 /* Function vector_alignment_reachable_p
1093 Return true if vector alignment for DR is reachable by peeling
1094 a few loop iterations. Return false otherwise. */
1097 vector_alignment_reachable_p (struct data_reference *dr)
1099 gimple stmt = DR_STMT (dr);
1100 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1101 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1103 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
1105 /* For interleaved access we peel only if the number of iterations in
1106 the prolog loop ({VF - misalignment}) is a multiple of the
1107 number of interleaved accesses.  */
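/* Illustrative example (editor's note): for V4SI (nelements 4,
   elem_size 4) with an 8-byte misalignment, mis_in_elements is 2 and
   the prolog would run 4 - 2 = 2 scalar iterations; peeling is only
   allowed if 2 is a multiple of GROUP_SIZE.  */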
1108 int elem_size, mis_in_elements;
1109 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1111 /* FORNOW: handle only known alignment. */
1112 if (!known_alignment_for_access_p (dr))
1115 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1116 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1118 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1122 /* If misalignment is known at compile time then allow peeling
1123 only if natural alignment is reachable through peeling. */
1124 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1126 HOST_WIDE_INT elmsize =
1127 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1128 if (vect_print_dump_info (REPORT_DETAILS))
1130 fprintf (vect_dump, "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1131 fprintf (vect_dump, ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1133 if (DR_MISALIGNMENT (dr) % elmsize)
1135 if (vect_print_dump_info (REPORT_DETAILS))
1136 fprintf (vect_dump, "data size does not divide the misalignment.\n");
1141 if (!known_alignment_for_access_p (dr))
1143 tree type = (TREE_TYPE (DR_REF (dr)));
1144 bool is_packed = contains_packed_reference (DR_REF (dr));
1146 if (compare_tree_int (TYPE_SIZE (type), TYPE_ALIGN (type)) > 0)
1149 if (vect_print_dump_info (REPORT_DETAILS))
1150 fprintf (vect_dump, "Unknown misalignment, is_packed = %d", is_packed);
1151 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1161 /* Calculate the cost of the memory access represented by DR. */
1164 vect_get_data_access_cost (struct data_reference *dr,
1165 unsigned int *inside_cost,
1166 unsigned int *outside_cost)
1168 gimple stmt = DR_STMT (dr);
1169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1170 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1171 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1172 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1173 int ncopies = vf / nunits;
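/* Editor's note: e.g. with vf == 8 and a 4-unit vectype, ncopies is 2;
   each scalar access expands to two vector accesses, and the cost
   computed below accounts for all of them.  */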
1174 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1176 if (!supportable_dr_alignment)
1177 *inside_cost = VECT_MAX_COST;
1180 if (DR_IS_READ (dr))
1181 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost);
1183 vect_get_store_cost (dr, ncopies, inside_cost);
1186 if (vect_print_dump_info (REPORT_COST))
1187 fprintf (vect_dump, "vect_get_data_access_cost: inside_cost = %d, "
1188 "outside_cost = %d.", *inside_cost, *outside_cost);
1193 vect_peeling_hash (const void *elem)
1195 const struct _vect_peel_info *peel_info;
1197 peel_info = (const struct _vect_peel_info *) elem;
1198 return (hashval_t) peel_info->npeel;
1203 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1205 const struct _vect_peel_info *a, *b;
1207 a = (const struct _vect_peel_info *) elem1;
1208 b = (const struct _vect_peel_info *) elem2;
1209 return (a->npeel == b->npeel);
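/* Editor's note: the peeling hash table is keyed by the candidate
   number of peeled iterations (npeel); each entry accumulates, per
   candidate, a count of the data references it would align.  */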
1213 /* Insert DR into peeling hash table with NPEEL as key. */
1216 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1219 struct _vect_peel_info elem, *slot;
1221 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1224 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1230 slot = XNEW (struct _vect_peel_info);
1231 slot->npeel = npeel;
1234 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1239 if (!supportable_dr_alignment && !flag_vect_cost_model)
1240 slot->count += VECT_MAX_COST;
1244 /* Traverse the peeling hash table to find the peeling option that aligns
1245 the maximum number of data accesses.  */
1248 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1250 vect_peel_info elem = (vect_peel_info) *slot;
1251 vect_peel_extended_info max = (vect_peel_extended_info) data;
1253 if (elem->count > max->peel_info.count
1254 || (elem->count == max->peel_info.count
1255 && max->peel_info.npeel > elem->npeel))
1257 max->peel_info.npeel = elem->npeel;
1258 max->peel_info.count = elem->count;
1259 max->peel_info.dr = elem->dr;
1266 /* Traverse the peeling hash table and calculate the cost for each peeling option.
1267 Find the one with the lowest cost. */
1270 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1272 vect_peel_info elem = (vect_peel_info) *slot;
1273 vect_peel_extended_info min = (vect_peel_extended_info) data;
1274 int save_misalignment, dummy;
1275 unsigned int inside_cost = 0, outside_cost = 0, i;
1276 gimple stmt = DR_STMT (elem->dr);
1277 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1278 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1279 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1280 struct data_reference *dr;
1282 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1284 stmt = DR_STMT (dr);
1285 stmt_info = vinfo_for_stmt (stmt);
1286 /* For interleaving, only the alignment of the first access matters.  */
1288 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
1289 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1292 save_misalignment = DR_MISALIGNMENT (dr);
1293 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1294 vect_get_data_access_cost (dr, &inside_cost, &outside_cost);
1295 SET_DR_MISALIGNMENT (dr, save_misalignment);
1298 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel, &dummy,
1299 vect_get_single_scalar_iteraion_cost (loop_vinfo));
1301 if (inside_cost < min->inside_cost
1302 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1304 min->inside_cost = inside_cost;
1305 min->outside_cost = outside_cost;
1306 min->peel_info.dr = elem->dr;
1307 min->peel_info.npeel = elem->npeel;
1314 /* Choose best peeling option by traversing peeling hash table and either
1315 choosing an option with the lowest cost (if cost model is enabled) or the
1316 option that aligns as many accesses as possible. */
1318 static struct data_reference *
1319 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1320 unsigned int *npeel)
1322 struct _vect_peel_extended_info res;
1324 res.peel_info.dr = NULL;
1326 if (flag_vect_cost_model)
1328 res.inside_cost = INT_MAX;
1329 res.outside_cost = INT_MAX;
1330 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1331 vect_peeling_hash_get_lowest_cost, &res);
1335 res.peel_info.count = 0;
1336 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1337 vect_peeling_hash_get_most_frequent, &res);
1340 *npeel = res.peel_info.npeel;
1341 return res.peel_info.dr;
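/* Editor's note: with the cost model enabled the traversal keeps the
   entry whose estimated inside/outside cost is lowest; without it,
   the entry that aligns the most data references (breaking ties in
   favor of fewer peeled iterations) wins.  */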
1345 /* Function vect_enhance_data_refs_alignment
1347 This pass will use loop versioning and loop peeling in order to enhance
1348 the alignment of data references in the loop.
1350 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1351 original loop is to be vectorized. Any other loops that are created by
1352 the transformations performed in this pass - are not supposed to be
1353 vectorized. This restriction will be relaxed.
1355 This pass will require a cost model to guide it whether to apply peeling
1356 or versioning or a combination of the two. For example, the scheme that
1357 Intel uses when given a loop with several memory accesses, is as follows:
1358 choose one memory access ('p') whose alignment you want to force by doing
1359 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1360 other accesses are not necessarily aligned, or (2) use loop versioning to
1361 generate one loop in which all accesses are aligned, and another loop in
1362 which only 'p' is necessarily aligned.
1364 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1365 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1366 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1368 Devising a cost model is the most critical aspect of this work. It will
1369 guide us on which access to peel for, whether to use loop versioning, how
1370 many versions to create, etc. The cost model will probably consist of
1371 generic considerations as well as target specific considerations (on
1372 powerpc for example, misaligned stores are more painful than misaligned
1375 Here are the general steps involved in alignment enhancements:
1377 -- original loop, before alignment analysis:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- After vect_compute_data_refs_alignment:
1384 for (i=0; i<N; i++){
1385 x = q[i]; # DR_MISALIGNMENT(q) = 3
1386 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1389 -- Possibility 1: we do loop versioning:
1391 for (i=0; i<N; i++){ # loop 1A
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = 0
1397 for (i=0; i<N; i++){ # loop 1B
1398 x = q[i]; # DR_MISALIGNMENT(q) = 3
1399 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1403 -- Possibility 2: we do loop peeling:
1404 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1408 for (i = 3; i < N; i++){ # loop 2A
1409 x = q[i]; # DR_MISALIGNMENT(q) = 0
1410 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1413 -- Possibility 3: combination of loop peeling and versioning:
1414 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1419 for (i = 3; i<N; i++){ # loop 3A
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = 0
1425 for (i = 3; i<N; i++){ # loop 3B
1426 x = q[i]; # DR_MISALIGNMENT(q) = 0
1427 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1431 These loops are later passed to loop_transform to be vectorized. The
1432 vectorizer will use the alignment information to guide the transformation
1433 (whether to generate regular loads/stores, or with special handling for misalignment).  */
1437 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1439 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1440 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1441 enum dr_alignment_support supportable_dr_alignment;
1442 struct data_reference *dr0 = NULL, *first_store = NULL;
1443 struct data_reference *dr;
1445 bool do_peeling = false;
1446 bool do_versioning = false;
1449 stmt_vec_info stmt_info;
1450 int vect_versioning_for_alias_required;
1451 unsigned int npeel = 0;
1452 bool all_misalignments_unknown = true;
1453 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1454 unsigned possible_npeel_number = 1;
1456 unsigned int nelements, mis, same_align_drs_max = 0;
1458 if (vect_print_dump_info (REPORT_DETAILS))
1459 fprintf (vect_dump, "=== vect_enhance_data_refs_alignment ===");
1461 /* While cost model enhancements are expected in the future, the high level
1462 view of the code at this time is as follows:
1464 A) If there is a misaligned access then see if peeling to align
1465 this access can make all data references satisfy
1466 vect_supportable_dr_alignment. If so, update data structures
1467 as needed and return true.
1469 B) If peeling wasn't possible and there is a data reference with an
1470 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1471 then see if loop versioning checks can be used to make all data
1472 references satisfy vect_supportable_dr_alignment. If so, update
1473 data structures as needed and return true.
1475 C) If neither peeling nor versioning were successful then return false if
1476 any data reference does not satisfy vect_supportable_dr_alignment.
1478 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1480 Note, Possibility 3 above (which is peeling and versioning together) is not
1481 being done at this time. */
1483 /* (1) Peeling to force alignment. */
1485 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1487 + How many accesses will become aligned due to the peeling
1488 - How many accesses will become unaligned due to the peeling,
1489 and the cost of misaligned accesses.
1490 - The cost of peeling (the extra runtime checks, the increase in code size).  */
1493 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1495 stmt = DR_STMT (dr);
1496 stmt_info = vinfo_for_stmt (stmt);
1498 if (!STMT_VINFO_RELEVANT (stmt_info))
1501 /* For interleaving, only the alignment of the first access matters.  */
1503 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
1504 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1507 /* For invariant accesses there is nothing to enhance. */
1508 if (integer_zerop (DR_STEP (dr)))
1511 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1512 do_peeling = vector_alignment_reachable_p (dr);
1515 if (known_alignment_for_access_p (dr))
1517 unsigned int npeel_tmp;
1518 bool negative = tree_int_cst_compare (DR_STEP (dr),
1519 size_zero_node) < 0;
1521 /* Save info about DR in the hash table. */
1522 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1523 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1524 htab_create (1, vect_peeling_hash,
1525 vect_peeling_hash_eq, free);
1527 vectype = STMT_VINFO_VECTYPE (stmt_info);
1528 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1529 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1530 TREE_TYPE (DR_REF (dr))));
1531 npeel_tmp = (negative
1532 ? (mis - nelements) : (nelements - mis))
1535 /* For multiple types, it is possible that the bigger type access
1536 will have more than one peeling option. E.g., a loop with two
1537 types: one of size (vector size / 4), and the other one of
1538 size (vector size / 8).  The vectorization factor will be 8.  If both
1539 accesses are misaligned by 3, the first one needs one scalar
1540 iteration to be aligned, and the second one needs 5.  But the
1541 first one will also be aligned by peeling 5 scalar
1542 iterations, and in that case both accesses will be aligned.
1543 Hence, except for the immediate peeling amount, we also want
1544 to try adding a full vector size, as long as we don't exceed the
1545 vectorization factor.
1546 We do this automatically when the cost model is used, since we
1547 calculate the cost for every peeling option.  */
1548 if (!flag_vect_cost_model)
1549 possible_npeel_number = vf / nelements;
1551 /* Handle the aligned case. We may decide to align some other
1552 access, making DR unaligned. */
1553 if (DR_MISALIGNMENT (dr) == 0)
1556 if (!flag_vect_cost_model)
1557 possible_npeel_number++;
1560 for (j = 0; j < possible_npeel_number; j++)
1562 gcc_assert (npeel_tmp <= vf);
1563 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1564 npeel_tmp += nelements;
1567 all_misalignments_unknown = false;
1568 /* The data-ref that was chosen for the case that all the
1569 misalignments are unknown is not relevant anymore, since we
1570 have a data-ref with known alignment.  */
1575 /* If we don't know all the misalignment values, we prefer
1576 peeling for the data-ref that has the maximum number of data-refs
1577 with the same alignment, unless the target prefers to align
1578 stores over loads.  */
1579 if (all_misalignments_unknown)
1581 if (same_align_drs_max < VEC_length (dr_p,
1582 STMT_VINFO_SAME_ALIGN_REFS (stmt_info))
1585 same_align_drs_max = VEC_length (dr_p,
1586 STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1590 if (!first_store && DR_IS_WRITE (dr))
1594 /* If there are both known and unknown misaligned accesses in the
1595 loop, we choose the peeling amount according to the known accesses.  */
1599 if (!supportable_dr_alignment)
1602 if (!first_store && DR_IS_WRITE (dr))
1609 if (!aligned_access_p (dr))
1611 if (vect_print_dump_info (REPORT_DETAILS))
1612 fprintf (vect_dump, "vector alignment may not be reachable");
1619 vect_versioning_for_alias_required
1620 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1622 /* Temporarily, if versioning for alias is required, we disable peeling
1623 until we support peeling and versioning. Often peeling for alignment
1624 will require peeling for loop-bound, which in turn requires that we
1625 know how to adjust the loop ivs after the loop. */
1626 if (vect_versioning_for_alias_required
1627 || !vect_can_advance_ivs_p (loop_vinfo)
1628 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1631 if (do_peeling && all_misalignments_unknown
1632 && vect_supportable_dr_alignment (dr0, false))
1635 /* Check if the target requires us to prefer stores over loads, i.e., if
1636 misaligned stores are more expensive than misaligned loads (taking
1637 drs with same alignment into account). */
1638 if (first_store && DR_IS_READ (dr0))
1640 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1641 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1642 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1643 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1645 vect_get_data_access_cost (dr0, &load_inside_cost,
1646 &load_outside_cost);
1647 vect_get_data_access_cost (first_store, &store_inside_cost,
1648 &store_outside_cost);
1650 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1651 aligning the load DR0). */
1652 load_inside_penalty = store_inside_cost;
1653 load_outside_penalty = store_outside_cost;
1654 for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1655 (vinfo_for_stmt (DR_STMT (first_store))),
1658 if (DR_IS_READ (dr))
1660 load_inside_penalty += load_inside_cost;
1661 load_outside_penalty += load_outside_cost;
1665 load_inside_penalty += store_inside_cost;
1666 load_outside_penalty += store_outside_cost;
1669 /* Calculate the penalty for leaving DR0 unaligned (by
1670 aligning the FIRST_STORE). */
1671 store_inside_penalty = load_inside_cost;
1672 store_outside_penalty = load_outside_cost;
1673 for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1674 (vinfo_for_stmt (DR_STMT (dr0))),
1677 if (DR_IS_READ (dr))
1679 store_inside_penalty += load_inside_cost;
1680 store_outside_penalty += load_outside_cost;
1684 store_inside_penalty += store_inside_cost;
1685 store_outside_penalty += store_outside_cost;
1688 if (load_inside_penalty > store_inside_penalty
1689 || (load_inside_penalty == store_inside_penalty
1690 && load_outside_penalty > store_outside_penalty))
1694 /* In case there are only loads with different unknown misalignments, use
1695 peeling only if it may help to align other accesses in the loop. */
1696 if (!first_store && !VEC_length (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1697 (vinfo_for_stmt (DR_STMT (dr0))))
1698 && vect_supportable_dr_alignment (dr0, false)
1699 != dr_unaligned_supported)
1703 if (do_peeling && !dr0)
1705 /* Peeling is possible, but no data access strictly requires alignment
1706 to be supported.  So we try to choose the best possible peeling.  */
1708 /* We should get here only if there are drs with known misalignment. */
1709 gcc_assert (!all_misalignments_unknown);
1711 /* Choose the best peeling from the hash table. */
1712 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel);
1719 stmt = DR_STMT (dr0);
1720 stmt_info = vinfo_for_stmt (stmt);
1721 vectype = STMT_VINFO_VECTYPE (stmt_info);
1722 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1724 if (known_alignment_for_access_p (dr0))
1726 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1727 size_zero_node) < 0;
1730 /* Since it's known at compile time, compute the number of
1731 iterations in the peeled loop (the peeling factor) for use in
1732 updating DR_MISALIGNMENT values. The peeling factor is the
1733 vectorization factor minus the misalignment as an element count.  */
1735 mis = DR_MISALIGNMENT (dr0);
1736 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1737 npeel = ((negative ? mis - nelements : nelements - mis)
1741 /* For interleaved data access every iteration accesses all the
1742 members of the group, therefore we divide the number of iterations
1743 by the group size. */
1744 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1745 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
1746 npeel /= GROUP_SIZE (stmt_info);
1748 if (vect_print_dump_info (REPORT_DETAILS))
1749 fprintf (vect_dump, "Try peeling by %d", npeel);
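/* Worked example (editor's note): for a forward V4SI access
   (nelements 4) with DR_MISALIGNMENT 8 bytes, mis is 8/4 = 2 elements
   and npeel = 4 - 2 = 2 peeled scalar iterations; for an interleaving
   group of size 2, npeel is further divided to 1, since each
   iteration walks the whole group.  */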
1752 /* Ensure that all data refs can be vectorized after the peel. */
1753 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1755 int save_misalignment;
1760 stmt = DR_STMT (dr);
1761 stmt_info = vinfo_for_stmt (stmt);
1762 /* For interleaving, only the alignment of the first access matters.  */
1764 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
1765 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1768 save_misalignment = DR_MISALIGNMENT (dr);
1769 vect_update_misalignment_for_peel (dr, dr0, npeel);
1770 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1771 SET_DR_MISALIGNMENT (dr, save_misalignment);
1773 if (!supportable_dr_alignment)
1780 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1782 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1791 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1792 If the misalignment of DR_i is identical to that of dr0 then set
1793 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1794 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1795 by the peeling factor times the element size of DR_i (MOD the
1796 vectorization factor times the size). Otherwise, the
1797 misalignment of DR_i must be set to unknown. */
1798 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1800 vect_update_misalignment_for_peel (dr, dr0, npeel);
1802 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1804 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1806 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1807 SET_DR_MISALIGNMENT (dr0, 0);
1808 if (vect_print_dump_info (REPORT_ALIGNMENT))
1809 fprintf (vect_dump, "Alignment of access forced using peeling.");
1811 if (vect_print_dump_info (REPORT_DETAILS))
1812 fprintf (vect_dump, "Peeling for alignment will be applied.");
1814 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1821 /* (2) Versioning to force alignment. */
1823 /* Try versioning if:
1824 1) flag_tree_vect_loop_version is TRUE
1825 2) optimize loop for speed
1826 3) there is at least one unsupported misaligned data ref with an unknown misalignment, and
1828 4) all misaligned data refs with a known misalignment are supported, and
1829 5) the number of runtime alignment checks is within reason. */
1832 flag_tree_vect_loop_version
1833 && optimize_loop_nest_for_speed_p (loop)
1834 && (!loop->inner); /* FORNOW */
1838 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1840 stmt = DR_STMT (dr);
1841 stmt_info = vinfo_for_stmt (stmt);
1843 /* For interleaving, only the alignment of the first access matters.  */
1845 if (aligned_access_p (dr)
1846 || (STMT_VINFO_STRIDED_ACCESS (stmt_info)
1847 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1850 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1852 if (!supportable_dr_alignment)
1858 if (known_alignment_for_access_p (dr)
1859 || VEC_length (gimple,
1860 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
1861 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1863 do_versioning = false;
1867 stmt = DR_STMT (dr);
1868 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1869 gcc_assert (vectype);
1871 /* The rightmost bits of an aligned address must be zeros.
1872 Construct the mask needed for this test. For example,
1873 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1874 mask must be 15 = 0xf. */
1875 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1877 /* FORNOW: use the same mask to test all potentially unaligned
1878 references in the loop. The vectorizer currently supports
1879 a single vector size, see the reference to
1880 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1881 vectorization factor is computed. */
1882 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1883 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1884 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1885 VEC_safe_push (gimple, heap,
1886 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo),
1891 /* Versioning requires at least one misaligned data reference. */
1892 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1893 do_versioning = false;
1894 else if (!do_versioning)
1895 VEC_truncate (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo), 0);
1900 VEC(gimple,heap) *may_misalign_stmts
1901 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1904 /* It can now be assumed that the data references in the statements
1905 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1906 of the loop being vectorized. */
1907 FOR_EACH_VEC_ELT (gimple, may_misalign_stmts, i, stmt)
1909 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1910 dr = STMT_VINFO_DATA_REF (stmt_info);
1911 SET_DR_MISALIGNMENT (dr, 0);
1912 if (vect_print_dump_info (REPORT_ALIGNMENT))
1913 fprintf (vect_dump, "Alignment of access forced using versioning.");
1916 if (vect_print_dump_info (REPORT_DETAILS))
1917 fprintf (vect_dump, "Versioning for alignment will be applied.");
1919 /* Peeling and versioning can't be done together at this time. */
1920 gcc_assert (! (do_peeling && do_versioning));
1922 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1927 /* This point is reached if neither peeling nor versioning is being done. */
1928 gcc_assert (! (do_peeling || do_versioning));
1930 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
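/* A minimal standalone sketch (not vectorizer code) of the runtime test
   that versioning for alignment creates.  The two-pointer signature and
   the function name are hypothetical; the real check ORs the address of
   every statement recorded in LOOP_VINFO_MAY_MISALIGN_STMTS and tests the
   result against LOOP_VINFO_PTR_MASK in a single comparison.  */
static int
example_all_refs_aligned_p (const void *p1, const void *p2,
                            unsigned long mask /* vector size - 1 */)
{
  /* All addresses are vector-aligned iff the OR of their low bits,
     masked by MASK, is zero.  */
  return ((((unsigned long) p1) | ((unsigned long) p2)) & mask) == 0;
}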
1935 /* Function vect_find_same_alignment_drs.
1937 Update group and alignment relations according to the chosen
1938 vectorization factor. */
1941 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1942 loop_vec_info loop_vinfo)
1945 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1946 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1947 struct data_reference *dra = DDR_A (ddr);
1948 struct data_reference *drb = DDR_B (ddr);
1949 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1950 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1951 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1952 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1953 lambda_vector dist_v;
1954 unsigned int loop_depth;
1956 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1962 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1965 /* Loop-based vectorization and known data dependence. */
1966 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1969 /* Data-dependence analysis reports a distance vector of zero
1970 for data-references that overlap only in the first iteration
1971 but have different sign step (see PR45764).
1972 So as a sanity check require equal DR_STEP. */
1973 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1976 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1977 FOR_EACH_VEC_ELT (lambda_vector, DDR_DIST_VECTS (ddr), i, dist_v)
1979 int dist = dist_v[loop_depth];
1981 if (vect_print_dump_info (REPORT_DR_DETAILS))
1982 fprintf (vect_dump, "dependence distance = %d.", dist);
1984 /* Same loop iteration. */
1986 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1988 /* Two references with distance zero have the same alignment. */
1989 VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a), drb);
1990 VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b), dra);
1991 if (vect_print_dump_info (REPORT_ALIGNMENT))
1992 fprintf (vect_dump, "accesses have the same alignment.");
1993 if (vect_print_dump_info (REPORT_DR_DETAILS))
1995 fprintf (vect_dump, "dependence distance modulo vf == 0 between ");
1996 print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
1997 fprintf (vect_dump, " and ");
1998 print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
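/* Worked example (illustrative): with a vectorization factor of 4 and
   4-byte elements, accesses to a[i] and a[i+4] have dependence distance 4;
   4 % 4 == 0 and the sizes match, so the two references are 16 bytes
   apart (one full vector) and share the same misalignment.  A distance
   of 2 would not qualify, since 2 % 4 != 0.  */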
2005 /* Function vect_analyze_data_refs_alignment
2007 Analyze the alignment of the data-references in the loop.
2008 Return FALSE if a data reference is found that cannot be vectorized. */
2011 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2012 bb_vec_info bb_vinfo)
2014 if (vect_print_dump_info (REPORT_DETAILS))
2015 fprintf (vect_dump, "=== vect_analyze_data_refs_alignment ===");
2017 /* Mark groups of data references with same alignment using
2018 data dependence information. */
2021 VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2022 struct data_dependence_relation *ddr;
2025 FOR_EACH_VEC_ELT (ddr_p, ddrs, i, ddr)
2026 vect_find_same_alignment_drs (ddr, loop_vinfo);
2029 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2031 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2033 "not vectorized: can't calculate alignment for data ref.");
2041 /* Analyze groups of strided accesses: check that DR belongs to a group of
2042 strided accesses of legal size, step, etc. Detect gaps, single element
2043 interleaving, and other special cases. Set strided access info.
2044 Collect groups of strided stores for further use in SLP analysis. */
2047 vect_analyze_group_access (struct data_reference *dr)
2049 tree step = DR_STEP (dr);
2050 tree scalar_type = TREE_TYPE (DR_REF (dr));
2051 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2052 gimple stmt = DR_STMT (dr);
2053 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2055 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2056 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2057 HOST_WIDE_INT stride, last_accessed_element = 1;
2058 bool slp_impossible = false;
2059 struct loop *loop = NULL;
2062 loop = LOOP_VINFO_LOOP (loop_vinfo);
2064 /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the
2065 interleaving group (including gaps). */
2066 stride = dr_step / type_size;
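/* Worked example (illustrative): for a load of int a[4*i], dr_step is 16
   bytes and type_size is 4, so STRIDE is 4 -- each iteration accesses one
   element of a group of four (the other three are gaps).  */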
2068 /* A non-consecutive access is possible only as part of interleaving. */
2069 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2071 /* Check whether this DR is part of an interleaving group, and is the
2072 single element of the group that is accessed in the loop. */
2074 /* Gaps are supported only for loads. STEP must be a multiple of the type
2075 size. The size of the group must be a power of 2. */
2077 && (dr_step % type_size) == 0
2079 && exact_log2 (stride) != -1)
2081 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2082 GROUP_SIZE (vinfo_for_stmt (stmt)) = stride;
2083 if (vect_print_dump_info (REPORT_DR_DETAILS))
2085 fprintf (vect_dump, "Detected single element interleaving ");
2086 print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
2087 fprintf (vect_dump, " step ");
2088 print_generic_expr (vect_dump, step, TDF_SLIM);
2093 if (vect_print_dump_info (REPORT_DETAILS))
2094 fprintf (vect_dump, "Data access with gaps requires scalar "
2095 "epilogue loop");
2098 if (vect_print_dump_info (REPORT_DETAILS))
2099 fprintf (vect_dump, "Peeling for outer loop is not"
2100 " supported");
2104 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2110 if (vect_print_dump_info (REPORT_DETAILS))
2112 fprintf (vect_dump, "not consecutive access ");
2113 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2118 /* Mark the statement as unvectorizable. */
2119 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2126 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2128 /* First stmt in the interleaving chain. Check the chain. */
2129 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2130 struct data_reference *data_ref = dr;
2131 unsigned int count = 1;
2133 tree prev_init = DR_INIT (data_ref);
2135 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2139 /* Skip same data-refs. If two or more stmts share a data-ref
2140 (supported only for loads), we vectorize only the first stmt, and
2141 the rest get their vectorized loads from the first stmt. */
2143 if (!tree_int_cst_compare (DR_INIT (data_ref),
2144 DR_INIT (STMT_VINFO_DATA_REF (
2145 vinfo_for_stmt (next)))))
2147 if (DR_IS_WRITE (data_ref))
2149 if (vect_print_dump_info (REPORT_DETAILS))
2150 fprintf (vect_dump, "Two store stmts share the same dr.");
2154 /* Check that there are no load-store dependencies for these loads,
2155 to prevent a load-store-load sequence to the same location. */
2156 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2157 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2159 if (vect_print_dump_info (REPORT_DETAILS))
2161 "READ_WRITE dependence in interleaving.");
2165 /* For load use the same data-ref load. */
2166 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2169 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2175 /* Check that all the accesses have the same STEP. */
2176 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2177 if (tree_int_cst_compare (step, next_step))
2179 if (vect_print_dump_info (REPORT_DETAILS))
2180 fprintf (vect_dump, "not consecutive access in interleaving");
2184 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2185 /* Check that the distance between two accesses is equal to the type
2186 size. Otherwise, we have gaps. */
2187 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2188 - TREE_INT_CST_LOW (prev_init)) / type_size;
2191 /* FORNOW: SLP of accesses with gaps is not supported. */
2192 slp_impossible = true;
2193 if (DR_IS_WRITE (data_ref))
2195 if (vect_print_dump_info (REPORT_DETAILS))
2196 fprintf (vect_dump, "interleaved store with gaps");
2203 last_accessed_element += diff;
2205 /* Store the gap from the previous member of the group. If there is no
2206 gap in the access, GROUP_GAP is always 1. */
2207 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2209 prev_init = DR_INIT (data_ref);
2210 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2211 /* Count the number of data-refs in the chain. */
2215 /* COUNT is the number of accesses found; we multiply it by the size of
2216 the type to get COUNT_IN_BYTES. */
2217 count_in_bytes = type_size * count;
2219 /* Check that the size of the interleaving (including gaps) is not
2220 greater than STEP. */
2221 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2223 if (vect_print_dump_info (REPORT_DETAILS))
2225 fprintf (vect_dump, "interleaving size is greater than step for ");
2226 print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
2231 /* Check that the size of the interleaving is equal to STEP for stores,
2232 i.e., that there are no gaps. */
2233 if (dr_step && dr_step != count_in_bytes)
2235 if (DR_IS_READ (dr))
2237 slp_impossible = true;
2238 /* There is a gap after the last load in the group. This gap is the
2239 difference between the stride and the number of elements. When
2240 there is no gap, this difference should be 0. */
2241 GROUP_GAP (vinfo_for_stmt (stmt)) = stride - count;
2245 if (vect_print_dump_info (REPORT_DETAILS))
2246 fprintf (vect_dump, "interleaved store with gaps");
2251 /* Check that STEP is a multiple of type size. */
2252 if (dr_step && (dr_step % type_size) != 0)
2254 if (vect_print_dump_info (REPORT_DETAILS))
2256 fprintf (vect_dump, "step is not a multiple of type size: step ");
2257 print_generic_expr (vect_dump, step, TDF_SLIM);
2258 fprintf (vect_dump, " size ");
2259 print_generic_expr (vect_dump, TYPE_SIZE_UNIT (scalar_type),
2268 GROUP_SIZE (vinfo_for_stmt (stmt)) = stride;
2269 if (vect_print_dump_info (REPORT_DETAILS))
2270 fprintf (vect_dump, "Detected interleaving of size %d", (int)stride);
2272 /* SLP: create an SLP data structure for every interleaving group of
2273 stores for further analysis in vect_analyze_slp. */
2274 if (DR_IS_WRITE (dr) && !slp_impossible)
2277 VEC_safe_push (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo),
2280 VEC_safe_push (gimple, heap, BB_VINFO_STRIDED_STORES (bb_vinfo),
2284 /* There is a gap at the end of the group. */
2285 if (stride - last_accessed_element > 0 && loop_vinfo)
2287 if (vect_print_dump_info (REPORT_DETAILS))
2288 fprintf (vect_dump, "Data access with gaps requires scalar "
2289 "epilogue loop");
2292 if (vect_print_dump_info (REPORT_DETAILS))
2293 fprintf (vect_dump, "Peeling for outer loop is not supported");
2297 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
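/* A minimal standalone sketch (not vectorizer code) of the group and gap
   accounting performed above, assuming 4-byte elements and the two-member
   group a[3*i], a[3*i+1]: dr_step is 12 and type_size is 4, so the stride
   (group size including gaps) is 3; the DR_INITs differ by 4 bytes, giving
   diff == 1 (no gap between the members), and one element of gap remains
   after the last access.  All names here are hypothetical.  */
static void
example_group_gap_accounting (void)
{
  long dr_step = 12, type_size = 4;
  long inits[2] = { 0, 4 };           /* DR_INITs of the two group members.  */
  long stride = dr_step / type_size;  /* 3: GROUP_SIZE including gaps.  */
  long count = 2;                     /* Accesses actually present.  */
  long diff = (inits[1] - inits[0]) / type_size;  /* 1: GROUP_GAP, no gap.  */
  long tail_gap = stride - count;     /* 1: gap after the last load.  */
  (void) diff;
  (void) tail_gap;
}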
2305 /* Analyze the access pattern of the data-reference DR.
2306 In case of non-consecutive accesses call vect_analyze_group_access() to
2307 analyze groups of strided accesses. */
2310 vect_analyze_data_ref_access (struct data_reference *dr)
2312 tree step = DR_STEP (dr);
2313 tree scalar_type = TREE_TYPE (DR_REF (dr));
2314 gimple stmt = DR_STMT (dr);
2315 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2316 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2317 struct loop *loop = NULL;
2318 HOST_WIDE_INT dr_step;
2321 loop = LOOP_VINFO_LOOP (loop_vinfo);
2323 if (loop_vinfo && !step)
2325 if (vect_print_dump_info (REPORT_DETAILS))
2326 fprintf (vect_dump, "bad data-ref access in loop");
2330 /* Allow invariant loads in loops. */
2331 dr_step = TREE_INT_CST_LOW (step);
2332 if (loop_vinfo && dr_step == 0)
2334 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2335 return DR_IS_READ (dr);
2338 if (loop && nested_in_vect_loop_p (loop, stmt))
2340 /* Interleaved accesses are not yet supported within outer-loop
2341 vectorization for references in the inner-loop. */
2342 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2344 /* For the rest of the analysis we use the outer-loop step. */
2345 step = STMT_VINFO_DR_STEP (stmt_info);
2346 dr_step = TREE_INT_CST_LOW (step);
2350 if (vect_print_dump_info (REPORT_ALIGNMENT))
2351 fprintf (vect_dump, "zero step in outer loop.");
2352 if (DR_IS_READ (dr))
2360 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2362 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2364 /* Mark that it is not interleaving. */
2365 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2369 if (loop && nested_in_vect_loop_p (loop, stmt))
2371 if (vect_print_dump_info (REPORT_ALIGNMENT))
2372 fprintf (vect_dump, "strided access in outer loop.");
2376 /* Non-consecutive access - check whether it is part of an interleaving group. */
2377 return vect_analyze_group_access (dr);
2381 /* Function vect_analyze_data_ref_accesses.
2383 Analyze the access pattern of all the data references in the loop.
2385 FORNOW: the only access pattern that is considered vectorizable is a
2386 simple step 1 (consecutive) access.
2388 FORNOW: handle only arrays and pointer accesses. */
2391 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2394 VEC (data_reference_p, heap) *datarefs;
2395 struct data_reference *dr;
2397 if (vect_print_dump_info (REPORT_DETAILS))
2398 fprintf (vect_dump, "=== vect_analyze_data_ref_accesses ===");
2401 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2403 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2405 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
2406 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2407 && !vect_analyze_data_ref_access (dr))
2409 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2410 fprintf (vect_dump, "not vectorized: complicated access pattern.");
2414 /* Mark the statement as not vectorizable. */
2415 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2425 /* Function vect_prune_runtime_alias_test_list.
2427 Prune a list of ddrs to be tested at run-time by versioning for alias.
2428 Return FALSE if the resulting list of ddrs is longer than allowed by
2429 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS; otherwise return TRUE. */
2432 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2434 VEC (ddr_p, heap) * ddrs =
2435 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2438 if (vect_print_dump_info (REPORT_DETAILS))
2439 fprintf (vect_dump, "=== vect_prune_runtime_alias_test_list ===");
2441 for (i = 0; i < VEC_length (ddr_p, ddrs); )
2446 ddr_i = VEC_index (ddr_p, ddrs, i);
2449 for (j = 0; j < i; j++)
2451 ddr_p ddr_j = VEC_index (ddr_p, ddrs, j);
2453 if (vect_vfa_range_equal (ddr_i, ddr_j))
2455 if (vect_print_dump_info (REPORT_DR_DETAILS))
2457 fprintf (vect_dump, "found equal ranges ");
2458 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr_i)), TDF_SLIM);
2459 fprintf (vect_dump, ", ");
2460 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr_i)), TDF_SLIM);
2461 fprintf (vect_dump, " and ");
2462 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr_j)), TDF_SLIM);
2463 fprintf (vect_dump, ", ");
2464 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr_j)), TDF_SLIM);
2473 VEC_ordered_remove (ddr_p, ddrs, i);
2479 if (VEC_length (ddr_p, ddrs) >
2480 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2482 if (vect_print_dump_info (REPORT_DR_DETAILS))
2485 "disable versioning for alias - max number of generated "
2486 "checks exceeded.");
2489 VEC_truncate (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), 0);
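/* A minimal standalone sketch (not vectorizer code) of the pruning loop
   above: drop every later entry whose runtime test range duplicates an
   earlier one, then apply the cap on the number of remaining checks.  The
   int-keyed array stands in for the ddr vector, plain equality for
   vect_vfa_range_equal, and all names are hypothetical.  */
static unsigned
example_prune_alias_checks (int *range_keys, unsigned n, unsigned max_checks)
{
  unsigned i = 0, j, k;
  while (i < n)
    {
      int duplicate = 0;
      for (j = 0; j < i; j++)
        if (range_keys[j] == range_keys[i])
          {
            duplicate = 1;
            break;
          }
      if (duplicate)
        {
          /* Ordered removal, mirroring VEC_ordered_remove.  */
          for (k = i; k + 1 < n; k++)
            range_keys[k] = range_keys[k + 1];
          n--;
        }
      else
        i++;
    }
  /* Returning 0 models giving up on versioning for alias.  */
  return n <= max_checks ? n : 0;
}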
2497 /* Check whether a non-affine read in stmt is suitable for gather load
2498 and if so, return a builtin decl for that operation. */
2501 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2502 tree *offp, int *scalep)
2504 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2506 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2507 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2508 tree offtype = NULL_TREE;
2509 tree decl, base, off;
2510 enum machine_mode pmode;
2511 int punsignedp, pvolatilep;
2513 /* The gather builtins need address of the form
2514 loop_invariant + vector * {1, 2, 4, 8}
2516 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2517 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2518 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2519 multiplications and additions in it. To get a vector, we need
2520 a single SSA_NAME that will be defined in the loop and will
2521 contain everything that is not loop invariant and that can be
2522 vectorized. The following code attempts to find such a preexisting
2523 SSA_NAME OFF and put the loop invariants into a tree BASE
2524 that can be gimplified before the loop. */
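/* Worked example (illustrative): for a read b[idx[i]] with int *b and a
   loop-varying int idx[i], the decomposition below ends with BASE as the
   loop-invariant b, OFF as the SSA_NAME holding the extended idx[i], and
   SCALE as 4, matching the form loop_invariant + sign_extend (vector) * 4.  */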
2525 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2526 &pmode, &punsignedp, &pvolatilep, false);
2527 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2529 if (TREE_CODE (base) == MEM_REF)
2531 if (!integer_zerop (TREE_OPERAND (base, 1)))
2533 if (off == NULL_TREE)
2535 double_int moff = mem_ref_offset (base);
2536 off = double_int_to_tree (sizetype, moff);
2539 off = size_binop (PLUS_EXPR, off,
2540 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2542 base = TREE_OPERAND (base, 0);
2545 base = build_fold_addr_expr (base);
2547 if (off == NULL_TREE)
2548 off = size_zero_node;
2550 /* If base is not loop invariant, then either off is 0, in which case we
2551 start with just the constant offset in the loop invariant BASE and
2552 continue with base as OFF, or we give up.
2553 We could handle the latter case by gimplifying the addition of base + off
2554 into some SSA_NAME and using that as off, but for now punt. */
2555 if (!expr_invariant_in_loop_p (loop, base))
2557 if (!integer_zerop (off))
2560 base = size_int (pbitpos / BITS_PER_UNIT);
2562 /* Otherwise put base + constant offset into the loop invariant BASE
2563 and continue with OFF. */
2566 base = fold_convert (sizetype, base);
2567 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2570 /* OFF at this point may be either an SSA_NAME or some tree expression
2571 from get_inner_reference. Try to peel off loop invariants from it
2572 into BASE as long as possible. */
2574 while (offtype == NULL_TREE)
2576 enum tree_code code;
2577 tree op0, op1, add = NULL_TREE;
2579 if (TREE_CODE (off) == SSA_NAME)
2581 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2583 if (expr_invariant_in_loop_p (loop, off))
2586 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2589 op0 = gimple_assign_rhs1 (def_stmt);
2590 code = gimple_assign_rhs_code (def_stmt);
2591 op1 = gimple_assign_rhs2 (def_stmt);
2595 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2597 code = TREE_CODE (off);
2598 extract_ops_from_tree (off, &code, &op0, &op1);
2602 case POINTER_PLUS_EXPR:
2604 if (expr_invariant_in_loop_p (loop, op0))
2609 add = fold_convert (sizetype, add);
2611 add = size_binop (MULT_EXPR, add, size_int (scale));
2612 base = size_binop (PLUS_EXPR, base, add);
2615 if (expr_invariant_in_loop_p (loop, op1))
2623 if (expr_invariant_in_loop_p (loop, op1))
2625 add = fold_convert (sizetype, op1);
2626 add = size_binop (MINUS_EXPR, size_zero_node, add);
2632 if (scale == 1 && host_integerp (op1, 0))
2634 scale = tree_low_cst (op1, 0);
2643 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2644 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2646 if (TYPE_PRECISION (TREE_TYPE (op0))
2647 == TYPE_PRECISION (TREE_TYPE (off)))
2652 if (TYPE_PRECISION (TREE_TYPE (op0))
2653 < TYPE_PRECISION (TREE_TYPE (off)))
2656 offtype = TREE_TYPE (off);
2667 /* If at the end OFF still isn't an SSA_NAME or isn't
2668 defined in the loop, punt. */
2669 if (TREE_CODE (off) != SSA_NAME
2670 || expr_invariant_in_loop_p (loop, off))
2673 if (offtype == NULL_TREE)
2674 offtype = TREE_TYPE (off);
2676 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2678 if (decl == NULL_TREE)
2691 /* Function vect_analyze_data_refs.
2693 Find all the data references in the loop or basic block.
2695 The general structure of the analysis of data refs in the vectorizer is as
2696 follows:
2697 1- vect_analyze_data_refs(loop/bb): call
2698 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2699 in the loop/bb and their dependences.
2700 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2701 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2702 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2707 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2708 bb_vec_info bb_vinfo,
2711 struct loop *loop = NULL;
2712 basic_block bb = NULL;
2714 VEC (data_reference_p, heap) *datarefs;
2715 struct data_reference *dr;
2717 bool res, stop_bb_analysis = false;
2719 if (vect_print_dump_info (REPORT_DETAILS))
2720 fprintf (vect_dump, "=== vect_analyze_data_refs ===\n");
2724 loop = LOOP_VINFO_LOOP (loop_vinfo);
2725 res = compute_data_dependences_for_loop
2727 &LOOP_VINFO_LOOP_NEST (loop_vinfo),
2728 &LOOP_VINFO_DATAREFS (loop_vinfo),
2729 &LOOP_VINFO_DDRS (loop_vinfo));
2733 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2734 fprintf (vect_dump, "not vectorized: loop contains function calls"
2735 " or data references that cannot be analyzed");
2739 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2743 bb = BB_VINFO_BB (bb_vinfo);
2744 res = compute_data_dependences_for_bb (bb, true,
2745 &BB_VINFO_DATAREFS (bb_vinfo),
2746 &BB_VINFO_DDRS (bb_vinfo));
2749 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2750 fprintf (vect_dump, "not vectorized: basic block contains function"
2751 " calls or data references that cannot be analyzed");
2755 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2758 /* Go through the data-refs, check that the analysis succeeded. Update
2759 pointer from stmt_vec_info struct to DR and vectype. */
2761 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
2764 stmt_vec_info stmt_info;
2765 tree base, offset, init;
2766 bool gather = false;
2769 if (!dr || !DR_REF (dr))
2771 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2772 fprintf (vect_dump, "not vectorized: unhandled data-ref ");
2777 stmt = DR_STMT (dr);
2778 stmt_info = vinfo_for_stmt (stmt);
2780 if (stop_bb_analysis)
2782 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2786 /* Check that analysis of the data-ref succeeded. */
2787 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2790 /* If the target supports vector gather loads, see if they can be used. */
2794 && !TREE_THIS_VOLATILE (DR_REF (dr))
2795 && targetm.vectorize.builtin_gather != NULL
2796 && !nested_in_vect_loop_p (loop, stmt))
2798 struct data_reference *newdr
2799 = create_data_ref (NULL, loop_containing_stmt (stmt),
2800 DR_REF (dr), stmt, true);
2801 gcc_assert (newdr != NULL && DR_REF (newdr));
2802 if (DR_BASE_ADDRESS (newdr)
2803 && DR_OFFSET (newdr)
2806 && integer_zerop (DR_STEP (newdr)))
2812 free_data_ref (newdr);
2817 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2819 fprintf (vect_dump, "not vectorized: data ref analysis "
2821 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2826 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2827 stop_bb_analysis = true;
2835 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2837 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2838 fprintf (vect_dump, "not vectorized: base addr of dr is a "
2843 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2844 stop_bb_analysis = true;
2853 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2855 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2857 fprintf (vect_dump, "not vectorized: volatile type ");
2858 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2863 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2864 stop_bb_analysis = true;
2871 base = unshare_expr (DR_BASE_ADDRESS (dr));
2872 offset = unshare_expr (DR_OFFSET (dr));
2873 init = unshare_expr (DR_INIT (dr));
2875 if (stmt_can_throw_internal (stmt))
2877 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2879 fprintf (vect_dump, "not vectorized: statement can throw an "
2881 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2886 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2887 stop_bb_analysis = true;
2896 if (is_gimple_call (stmt))
2898 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
2900 fprintf (vect_dump, "not vectorized: dr in a call ");
2901 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2906 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2907 stop_bb_analysis = true;
2916 /* Update DR field in stmt_vec_info struct. */
2918 /* If the dataref is in an inner-loop of the loop that is considered
2919 for vectorization, we also want to analyze the access relative to
2920 the outer-loop (DR contains information only relative to the
2921 inner-most enclosing loop). We do that by building a reference to the
2922 first location accessed by the inner-loop, and analyze it relative to
2924 if (loop && nested_in_vect_loop_p (loop, stmt))
2926 tree outer_step, outer_base, outer_init;
2927 HOST_WIDE_INT pbitsize, pbitpos;
2929 enum machine_mode pmode;
2930 int punsignedp, pvolatilep;
2931 affine_iv base_iv, offset_iv;
2934 /* Build a reference to the first location accessed by the
2935 inner-loop: *(BASE+INIT). (The first location is actually
2936 BASE+INIT+OFFSET, but we add OFFSET separately later). */
2937 tree inner_base = build_fold_indirect_ref
2938 (fold_build_pointer_plus (base, init));
2940 if (vect_print_dump_info (REPORT_DETAILS))
2942 fprintf (vect_dump, "analyze in outer-loop: ");
2943 print_generic_expr (vect_dump, inner_base, TDF_SLIM);
2946 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
2947 &poffset, &pmode, &punsignedp, &pvolatilep, false);
2948 gcc_assert (outer_base != NULL_TREE);
2950 if (pbitpos % BITS_PER_UNIT != 0)
2952 if (vect_print_dump_info (REPORT_DETAILS))
2953 fprintf (vect_dump, "failed: bit offset alignment.\n");
2957 outer_base = build_fold_addr_expr (outer_base);
2958 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
2961 if (vect_print_dump_info (REPORT_DETAILS))
2962 fprintf (vect_dump, "failed: evolution of base is not affine.\n");
2969 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
2977 offset_iv.base = ssize_int (0);
2978 offset_iv.step = ssize_int (0);
2980 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
2983 if (vect_print_dump_info (REPORT_DETAILS))
2984 fprintf (vect_dump, "evolution of offset is not affine.\n");
2988 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
2989 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
2990 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
2991 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
2992 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
2994 outer_step = size_binop (PLUS_EXPR,
2995 fold_convert (ssizetype, base_iv.step),
2996 fold_convert (ssizetype, offset_iv.step));
2998 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
2999 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3000 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3001 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3002 STMT_VINFO_DR_OFFSET (stmt_info) =
3003 fold_convert (ssizetype, offset_iv.base);
3004 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3005 size_int (highest_pow2_factor (offset_iv.base));
3007 if (vect_print_dump_info (REPORT_DETAILS))
3009 fprintf (vect_dump, "\touter base_address: ");
3010 print_generic_expr (vect_dump, STMT_VINFO_DR_BASE_ADDRESS (stmt_info), TDF_SLIM);
3011 fprintf (vect_dump, "\n\touter offset from base address: ");
3012 print_generic_expr (vect_dump, STMT_VINFO_DR_OFFSET (stmt_info), TDF_SLIM);
3013 fprintf (vect_dump, "\n\touter constant offset from base address: ");
3014 print_generic_expr (vect_dump, STMT_VINFO_DR_INIT (stmt_info), TDF_SLIM);
3015 fprintf (vect_dump, "\n\touter step: ");
3016 print_generic_expr (vect_dump, STMT_VINFO_DR_STEP (stmt_info), TDF_SLIM);
3017 fprintf (vect_dump, "\n\touter aligned to: ");
3018 print_generic_expr (vect_dump, STMT_VINFO_DR_ALIGNED_TO (stmt_info), TDF_SLIM);
3022 if (STMT_VINFO_DATA_REF (stmt_info))
3024 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
3027 "not vectorized: more than one data ref in stmt: ");
3028 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
3033 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3034 stop_bb_analysis = true;
3043 STMT_VINFO_DATA_REF (stmt_info) = dr;
3045 /* Set vectype for STMT. */
3046 scalar_type = TREE_TYPE (DR_REF (dr));
3047 STMT_VINFO_VECTYPE (stmt_info) =
3048 get_vectype_for_scalar_type (scalar_type);
3049 if (!STMT_VINFO_VECTYPE (stmt_info))
3051 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
3054 "not vectorized: no vectype for stmt: ");
3055 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
3056 fprintf (vect_dump, " scalar_type: ");
3057 print_generic_expr (vect_dump, scalar_type, TDF_DETAILS);
3062 /* Mark the statement as not vectorizable. */
3063 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3064 stop_bb_analysis = true;
3070 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3076 /* Adjust the minimal vectorization factor according to the
3077 vector type. */
3078 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3084 unsigned int j, k, n;
3085 struct data_reference *olddr
3086 = VEC_index (data_reference_p, datarefs, i);
3087 VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
3088 struct data_dependence_relation *ddr, *newddr;
3091 VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
3093 if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL)
3094 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3096 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
3099 "not vectorized: not suitable for gather ");
3100 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
3105 n = VEC_length (data_reference_p, datarefs) - 1;
3106 for (j = 0, k = i - 1; j < i; j++)
3108 ddr = VEC_index (ddr_p, ddrs, k);
3109 gcc_assert (DDR_B (ddr) == olddr);
3110 newddr = initialize_data_dependence_relation (DDR_A (ddr), dr,
3112 VEC_replace (ddr_p, ddrs, k, newddr);
3113 free_dependence_relation (ddr);
3115 && DR_IS_WRITE (DDR_A (newddr))
3116 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3122 n = k + VEC_length (data_reference_p, datarefs) - i - 1;
3125 ddr = VEC_index (ddr_p, ddrs, k);
3126 gcc_assert (DDR_A (ddr) == olddr);
3127 newddr = initialize_data_dependence_relation (dr, DDR_B (ddr),
3129 VEC_replace (ddr_p, ddrs, k, newddr);
3130 free_dependence_relation (ddr);
3132 && DR_IS_WRITE (DDR_B (newddr))
3133 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3137 k = VEC_length (ddr_p, ddrs)
3138 - VEC_length (data_reference_p, datarefs) + i;
3139 ddr = VEC_index (ddr_p, ddrs, k);
3140 gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr);
3141 newddr = initialize_data_dependence_relation (dr, dr, nest);
3142 VEC_replace (ddr_p, ddrs, k, newddr);
3143 free_dependence_relation (ddr);
3144 VEC_replace (data_reference_p, datarefs, i, dr);
3148 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
3151 "not vectorized: data dependence conflict"
3152 " prevents gather");
3153 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
3158 STMT_VINFO_GATHER_P (stmt_info) = true;
3166 /* Function vect_get_new_vect_var.
3168 Returns a name for a new variable. The current naming scheme uses the
3169 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) for
3170 vectorizer-generated variables, prepending it to NAME when NAME is
3171 provided. */
3174 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3181 case vect_simple_var:
3184 case vect_scalar_var:
3187 case vect_pointer_var:
3196 char* tmp = concat (prefix, name, NULL);
3197 new_vect_var = create_tmp_var (type, tmp);
3201 new_vect_var = create_tmp_var (type, prefix);
3203 /* Mark vector typed variable as a gimple register variable. */
3204 if (TREE_CODE (type) == VECTOR_TYPE)
3205 DECL_GIMPLE_REG_P (new_vect_var) = true;
3207 return new_vect_var;
3211 /* Function vect_create_addr_base_for_vector_ref.
3213 Create an expression that computes the address of the first memory location
3214 that will be accessed for a data reference.
3217 STMT: The statement containing the data reference.
3218 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3219 OFFSET: Optional. If supplied, it is added to the initial address.
3220 LOOP: Specifies the loop-nest relative to which the address is computed.
3221 For example, when the dataref is in an inner-loop nested in an
3222 outer-loop that is now being vectorized, LOOP can be either the
3223 outer-loop, or the inner-loop. The first memory location accessed
3224 by the following dataref ('in' points to short):
3231 if LOOP=i_loop: &in (relative to i_loop)
3232 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3235 1. Return an SSA_NAME whose value is the address of the memory location of
3236 the first vector of the data reference.
3237 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3238 these statement(s) which define the returned SSA_NAME.
3240 FORNOW: We are only handling array accesses with step 1. */
3243 vect_create_addr_base_for_vector_ref (gimple stmt,
3244 gimple_seq *new_stmt_list,
3248 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3249 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3250 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3252 tree data_ref_base_var;
3254 tree addr_base, addr_expr;
3256 gimple_seq seq = NULL;
3257 tree base_offset = unshare_expr (DR_OFFSET (dr));
3258 tree init = unshare_expr (DR_INIT (dr));
3260 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3261 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3264 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3266 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3268 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3270 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3271 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3272 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3276 base_name = build_fold_indirect_ref (data_ref_base);
3279 base_offset = ssize_int (0);
3280 init = ssize_int (0);
3281 base_name = build_fold_indirect_ref (unshare_expr (DR_REF (dr)));
3284 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
3285 add_referenced_var (data_ref_base_var);
3286 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
3288 gimple_seq_add_seq (new_stmt_list, seq);
3290 /* Create base_offset */
3291 base_offset = size_binop (PLUS_EXPR,
3292 fold_convert (sizetype, base_offset),
3293 fold_convert (sizetype, init));
3294 dest = create_tmp_var (sizetype, "base_off");
3295 add_referenced_var (dest);
3296 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
3297 gimple_seq_add_seq (new_stmt_list, seq);
3301 tree tmp = create_tmp_var (sizetype, "offset");
3303 add_referenced_var (tmp);
3304 offset = fold_build2 (MULT_EXPR, sizetype,
3305 fold_convert (sizetype, offset), step);
3306 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3307 base_offset, offset);
3308 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
3309 gimple_seq_add_seq (new_stmt_list, seq);
3312 /* base + base_offset */
3314 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3317 addr_base = build1 (ADDR_EXPR,
3318 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3319 unshare_expr (DR_REF (dr)));
3322 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3323 base = get_base_address (DR_REF (dr));
3325 && TREE_CODE (base) == MEM_REF)
3327 = build_qualified_type (vect_ptr_type,
3328 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3330 vec_stmt = fold_convert (vect_ptr_type, addr_base);
3331 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
3332 get_name (base_name));
3333 add_referenced_var (addr_expr);
3334 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr);
3335 gimple_seq_add_seq (new_stmt_list, seq);
3337 if (DR_PTR_INFO (dr)
3338 && TREE_CODE (vec_stmt) == SSA_NAME)
3340 duplicate_ssa_name_ptr_info (vec_stmt, DR_PTR_INFO (dr));
3343 SSA_NAME_PTR_INFO (vec_stmt)->align = 1;
3344 SSA_NAME_PTR_INFO (vec_stmt)->misalign = 0;
3348 if (vect_print_dump_info (REPORT_DETAILS))
3350 fprintf (vect_dump, "created ");
3351 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
3358 /* Function vect_create_data_ref_ptr.
3360 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3361 location accessed in the loop by STMT, along with the def-use update
3362 chain to appropriately advance the pointer through the loop iterations.
3363 Also set aliasing information for the pointer. This pointer is used by
3364 the callers to this function to create a memory reference expression for
3365 vector load/store access.
3368 1. STMT: a stmt that references memory. Expected to be of the form
3369 GIMPLE_ASSIGN <name, data-ref> or
3370 GIMPLE_ASSIGN <data-ref, name>.
3371 2. AGGR_TYPE: the type of the reference, which should be either a vector
3373 3. AT_LOOP: the loop where the vector memref is to be created.
3374 4. OFFSET (optional): an offset to be added to the initial address accessed
3375 by the data-ref in STMT.
3376 5. BSI: location where the new stmts are to be placed if there is no loop
3377 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or to remain
3378 pointing to the initial address.
3381 1. Declare a new ptr to vector_type, and have it point to the base of the
3382 data reference (the initial address accessed by the data reference).
3383 For example, for vector of type V8HI, the following code is generated:
3386 ap = (v8hi *)initial_address;
3388 if OFFSET is not supplied:
3389 initial_address = &a[init];
3390 if OFFSET is supplied:
3391 initial_address = &a[init + OFFSET];
3393 Return the initial_address in INITIAL_ADDRESS.
3395 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3396 update the pointer in each iteration of the loop.
3398 Return the increment stmt that updates the pointer in PTR_INCR.
3400 3. Set INV_P to true if the access pattern of the data reference in the
3401 vectorized loop is invariant. Set it to false otherwise.
3403 4. Return the pointer. */
3406 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3407 tree offset, tree *initial_address,
3408 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3409 bool only_init, bool *inv_p)
3412 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3413 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3414 struct loop *loop = NULL;
3415 bool nested_in_vect_loop = false;
3416 struct loop *containing_loop = NULL;
3421 gimple_seq new_stmt_list = NULL;
3425 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3427 gimple_stmt_iterator incr_gsi;
3430 tree indx_before_incr, indx_after_incr;
3433 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3436 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3437 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3441 loop = LOOP_VINFO_LOOP (loop_vinfo);
3442 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3443 containing_loop = (gimple_bb (stmt))->loop_father;
3444 pe = loop_preheader_edge (loop);
3448 gcc_assert (bb_vinfo);
3453 /* Check the step (evolution) of the load in LOOP, and record
3454 whether it's invariant. */
3455 if (nested_in_vect_loop)
3456 step = STMT_VINFO_DR_STEP (stmt_info);
3458 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3460 if (tree_int_cst_compare (step, size_zero_node) == 0)
3464 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3466 /* Create an expression for the first address accessed by this load
3468 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
3470 if (vect_print_dump_info (REPORT_DETAILS))
3472 tree data_ref_base = base_name;
3473 fprintf (vect_dump, "create %s-pointer variable to type: ",
3474 tree_code_name[(int) TREE_CODE (aggr_type)]);
3475 print_generic_expr (vect_dump, aggr_type, TDF_SLIM);
3476 if (TREE_CODE (data_ref_base) == VAR_DECL
3477 || TREE_CODE (data_ref_base) == ARRAY_REF)
3478 fprintf (vect_dump, " vectorizing an array ref: ");
3479 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
3480 fprintf (vect_dump, " vectorizing a record based array ref: ");
3481 else if (TREE_CODE (data_ref_base) == SSA_NAME)
3482 fprintf (vect_dump, " vectorizing a pointer ref: ");
3483 print_generic_expr (vect_dump, base_name, TDF_SLIM);
3486 /* (1) Create the new aggregate-pointer variable. */
3487 aggr_ptr_type = build_pointer_type (aggr_type);
3488 base = get_base_address (DR_REF (dr));
3490 && TREE_CODE (base) == MEM_REF)
3492 = build_qualified_type (aggr_ptr_type,
3493 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3494 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3495 get_name (base_name));
3497 /* Vector and array types inherit the alias set of their component
3498 type by default, so we need to use a ref-all pointer if the data
3499 reference does not conflict with the created aggregated data
3500 reference because it is not addressable. */
3501 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3502 get_alias_set (DR_REF (dr))))
3505 = build_pointer_type_for_mode (aggr_type,
3506 TYPE_MODE (aggr_ptr_type), true);
3507 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3508 get_name (base_name));
3511 /* Likewise for any of the data references in the stmt group. */
3512 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3514 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3517 tree lhs = gimple_assign_lhs (orig_stmt);
3518 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3519 get_alias_set (lhs)))
3522 = build_pointer_type_for_mode (aggr_type,
3523 TYPE_MODE (aggr_ptr_type), true);
3525 = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3526 get_name (base_name));
3530 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo_for_stmt (orig_stmt));
3535 add_referenced_var (aggr_ptr);
3537 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3538 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3539 def-use update cycles for the pointer: one relative to the outer-loop
3540 (LOOP), which is what step (3) below does. The other is relative
3541 to the inner-loop (which is the inner-most loop containing the dataref),
3542 and this is done by step (4) below.
3544 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3545 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3546 redundant. Steps (3),(4) create the following:
3549 LOOP: vp1 = phi(vp0,vp2)
3555 If there is an inner-loop nested in loop, then step (5) will also be
3556 applied, and an additional update in the inner-loop will be created:
3559 LOOP: vp1 = phi(vp0,vp2)
3561 inner: vp3 = phi(vp1,vp4)
3562 vp4 = vp3 + inner_step
3568 /* (2) Calculate the initial address of the aggregate-pointer, and set
3569 the aggregate-pointer to point to it before the loop. */
3571 /* Create: &(base[init_val+offset]) in the loop preheader. */
3573 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3579 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3580 gcc_assert (!new_bb);
3583 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3586 *initial_address = new_temp;
3588 /* Create: p = (aggr_type *) initial_base */
3589 if (TREE_CODE (new_temp) != SSA_NAME
3590 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3592 vec_stmt = gimple_build_assign (aggr_ptr,
3593 fold_convert (aggr_ptr_type, new_temp));
3594 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3595 /* Copy the points-to information if it exists. */
3596 if (DR_PTR_INFO (dr))
3597 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3598 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3601 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3602 gcc_assert (!new_bb);
3605 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3608 aggr_ptr_init = new_temp;
3610 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3611 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3612 inner-loop nested in LOOP (during outer-loop vectorization). */
3614 /* No update in loop is required. */
3615 if (only_init && (!loop_vinfo || at_loop == loop))
3616 aptr = aggr_ptr_init;
3619 /* The step of the aggregate pointer is the type size. */
3620 tree step = TYPE_SIZE_UNIT (aggr_type);
3621 /* One exception to the above is when the scalar step of the load in
3622 LOOP is zero. In this case the step here is also zero. */
3624 step = size_zero_node;
3626 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3628 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3630 create_iv (aggr_ptr_init,
3631 fold_convert (aggr_ptr_type, step),
3632 aggr_ptr, loop, &incr_gsi, insert_after,
3633 &indx_before_incr, &indx_after_incr);
3634 incr = gsi_stmt (incr_gsi);
3635 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3637 /* Copy the points-to information if it exists. */
3638 if (DR_PTR_INFO (dr))
3640 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3641 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3646 aptr = indx_before_incr;
3649 if (!nested_in_vect_loop || only_init)
3653 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3654 nested in LOOP, if it exists. */
3656 gcc_assert (nested_in_vect_loop);
3659 standard_iv_increment_position (containing_loop, &incr_gsi,
3661 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3662 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3664 incr = gsi_stmt (incr_gsi);
3665 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3667 /* Copy the points-to information if it exists. */
3668 if (DR_PTR_INFO (dr))
3670 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3671 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3676 return indx_before_incr;
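/* A minimal standalone sketch (not vectorizer code) of the code shape
   created above for a simple consecutive access: the aggregate pointer
   starts at the initial address and is bumped by the aggregate type size
   once per vectorized iteration.  The typedef and names are hypothetical.  */
typedef int example_v4si __attribute__ ((vector_size (16)));

static void
example_data_ref_ptr_update (int *a, example_v4si *out, int vec_iters)
{
  example_v4si *vp = (example_v4si *) a;   /* vp0 = (vectype *) init_addr  */
  int i;
  for (i = 0; i < vec_iters; i++)
    {
      out[i] = *vp;                        /* vector access through vp     */
      vp = vp + 1;                         /* vp2 = vp1 + step             */
    }
}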
3683 /* Function bump_vector_ptr
3685 Increment a pointer (to a vector type) by vector-size. If requested,
3686 i.e. if PTR_INCR is given, then also connect the new increment stmt
3687 to the existing def-use update-chain of the pointer, by modifying
3688 the PTR_INCR as illustrated below:
3690 The pointer def-use update-chain before this function:
3691 DATAREF_PTR = phi (p_0, p_2)
3693 PTR_INCR: p_2 = DATAREF_PTR + step
3695 The pointer def-use update-chain after this function:
3696 DATAREF_PTR = phi (p_0, p_2)
3698 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3700 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3703 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3705 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3706 the loop. The increment amount across iterations is expected
3707 to be vector_size.
3708 BSI - location where the new update stmt is to be placed.
3709 STMT - the original scalar memory-access stmt that is being vectorized.
3710 BUMP - optional. The offset by which to bump the pointer. If not given,
3711 the offset is assumed to be vector_size.
3713 Output: Return NEW_DATAREF_PTR as illustrated above.
3718 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3719 gimple stmt, tree bump)
3721 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3722 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3723 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3724 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
3725 tree update = TYPE_SIZE_UNIT (vectype);
3728 use_operand_p use_p;
3729 tree new_dataref_ptr;
3734 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
3735 dataref_ptr, update);
3736 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
3737 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
3738 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3740 /* Copy the points-to information if it exists. */
3741 if (DR_PTR_INFO (dr))
3743 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3744 SSA_NAME_PTR_INFO (new_dataref_ptr)->align = 1;
3745 SSA_NAME_PTR_INFO (new_dataref_ptr)->misalign = 0;
3749 return new_dataref_ptr;
3751 /* Update the vector-pointer's cross-iteration increment. */
3752 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3754 tree use = USE_FROM_PTR (use_p);
3756 if (use == dataref_ptr)
3757 SET_USE (use_p, new_dataref_ptr);
3759 gcc_assert (tree_int_cst_compare (use, update) == 0);
3762 return new_dataref_ptr;
3766 /* Function vect_create_destination_var.
3768 Create a new temporary of type VECTYPE. */
3771 vect_create_destination_var (tree scalar_dest, tree vectype)
3774 const char *new_name;
3776 enum vect_var_kind kind;
3778 kind = vectype ? vect_simple_var : vect_scalar_var;
3779 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3781 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3783 new_name = get_name (scalar_dest);
3786 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3787 add_referenced_var (vec_dest);
3792 /* Function vect_strided_store_supported.
3794 Returns TRUE if interleave high and interleave low permutations
3795 are supported, and FALSE otherwise. */
3798 vect_strided_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3800 enum machine_mode mode = TYPE_MODE (vectype);
3802 /* vect_permute_store_chain requires the group size to be a power of two. */
3803 if (exact_log2 (count) == -1)
3805 if (vect_print_dump_info (REPORT_DETAILS))
3806 fprintf (vect_dump, "the size of the group of strided accesses"
3807 " is not a power of 2");
3811 /* Check that the permutation is supported. */
3812 if (VECTOR_MODE_P (mode))
3814 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3815 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3816 for (i = 0; i < nelt / 2; i++)
3819 sel[i * 2 + 1] = i + nelt;
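/* Worked example (illustrative): for nelt == 4 the loop above builds the
   interleave-high selector {0, 4, 1, 5}; the second loop below derives
   the interleave-low selector {2, 6, 3, 7} from it.  */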
3821 if (can_vec_perm_p (mode, false, sel))
3823 for (i = 0; i < nelt; i++)
3825 if (can_vec_perm_p (mode, false, sel))
3830 if (vect_print_dump_info (REPORT_DETAILS))
3831 fprintf (vect_dump, "interleave op not supported by target.");
3836 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3837 type VECTYPE. */
3840 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3842 return vect_lanes_optab_supported_p ("vec_store_lanes",
3843 vec_store_lanes_optab,
3848 /* Function vect_permute_store_chain.
3850 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3851 a power of 2, generate interleave_high/low stmts to reorder the data
3852 correctly for the stores. Return the final references for stores in
3855 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3856 The input is 4 vectors each containing 8 elements. We assign a number to
3857 each element; the input sequence is:
3859 1st vec: 0 1 2 3 4 5 6 7
3860 2nd vec: 8 9 10 11 12 13 14 15
3861 3rd vec: 16 17 18 19 20 21 22 23
3862 4th vec: 24 25 26 27 28 29 30 31
3864 The output sequence should be:
3866 1st vec: 0 8 16 24 1 9 17 25
3867 2nd vec: 2 10 18 26 3 11 19 27
3868 3rd vec: 4 12 20 28 5 13 21 29
3869 4th vec: 6 14 22 30 7 15 23 31
3871 i.e., we interleave the contents of the four vectors in their order.
3873 We use interleave_high/low instructions to create such output. The input of
3874 each interleave_high/low operation is two vectors:
3877 the even elements of the result vector are obtained left-to-right from the
3878 high/low elements of the first vector. The odd elements of the result are
3879 obtained left-to-right from the high/low elements of the second vector.
3880 The output of interleave_high will be: 0 4 1 5
3881 and of interleave_low: 2 6 3 7
3884 The permutation is done in log LENGTH stages. In each stage interleave_high
3885 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3886 where the first argument is taken from the first half of DR_CHAIN and the
3887 second argument from its second half.
3890 I1: interleave_high (1st vec, 3rd vec)
3891 I2: interleave_low (1st vec, 3rd vec)
3892 I3: interleave_high (2nd vec, 4th vec)
3893 I4: interleave_low (2nd vec, 4th vec)
3895 The output for the first stage is:
3897 I1: 0 16 1 17 2 18 3 19
3898 I2: 4 20 5 21 6 22 7 23
3899 I3: 8 24 9 25 10 26 11 27
3900 I4: 12 28 13 29 14 30 15 31
3902 The output of the second stage, i.e. the final result is:
3904 I1: 0 8 16 24 1 9 17 25
3905 I2: 2 10 18 26 3 11 19 27
3906 I3: 4 12 20 28 5 13 21 29
3907 I4: 6 14 22 30 7 15 23 31. */
3910 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
3911 unsigned int length,
3913 gimple_stmt_iterator *gsi,
3914 VEC(tree,heap) **result_chain)
3916 tree perm_dest, vect1, vect2, high, low;
3918 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3919 tree perm_mask_low, perm_mask_high;
3921 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3922 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3924 *result_chain = VEC_copy (tree, heap, dr_chain);
3926 for (i = 0, n = nelt / 2; i < n; i++)
3929 sel[i * 2 + 1] = i + nelt;
3931 perm_mask_high = vect_gen_perm_mask (vectype, sel);
3932 gcc_assert (perm_mask_high != NULL);
3934 for (i = 0; i < nelt; i++)
3936 perm_mask_low = vect_gen_perm_mask (vectype, sel);
3937 gcc_assert (perm_mask_low != NULL);
3939 for (i = 0, n = exact_log2 (length); i < n; i++)
3941 for (j = 0; j < length/2; j++)
3943 vect1 = VEC_index (tree, dr_chain, j);
3944 vect2 = VEC_index (tree, dr_chain, j+length/2);
3946 /* Create interleaving stmt:
3947 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
3948 perm_dest = create_tmp_var (vectype, "vect_inter_high");
3949 DECL_GIMPLE_REG_P (perm_dest) = 1;
3950 add_referenced_var (perm_dest);
3951 high = make_ssa_name (perm_dest, NULL);
3953 = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, high,
3954 vect1, vect2, perm_mask_high);
3955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3956 VEC_replace (tree, *result_chain, 2*j, high);
3958 /* Create interleaving stmt:
3959 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
3960 nelt*3/2+1, ...}> */
3961 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3962 DECL_GIMPLE_REG_P (perm_dest) = 1;
3963 add_referenced_var (perm_dest);
3964 low = make_ssa_name (perm_dest, NULL);
3966 = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, low,
3967 vect1, vect2, perm_mask_low);
3968 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3969 VEC_replace (tree, *result_chain, 2*j+1, low);
3971 dr_chain = VEC_copy (tree, heap, *result_chain);
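/* A minimal standalone sketch (not vectorizer code) of the whole-chain
   permutation implemented above, on plain int arrays with NELT == 8 and
   LENGTH == 4 (so log2 (LENGTH) == 2 stages).  Feeding it the vectors
   {0..7}, {8..15}, {16..23}, {24..31} reproduces the output sequence in
   the comment before vect_permute_store_chain.  The names are
   hypothetical.  */
#define EXAMPLE_NELT 8

static void
example_interleave (const int *v1, const int *v2, int *hi, int *lo)
{
  int i;
  for (i = 0; i < EXAMPLE_NELT / 2; i++)
    {
      hi[2 * i] = v1[i];                       /* {0, nelt, 1, nelt+1, ...} */
      hi[2 * i + 1] = v2[i];
      lo[2 * i] = v1[EXAMPLE_NELT / 2 + i];    /* same, from the low halves */
      lo[2 * i + 1] = v2[EXAMPLE_NELT / 2 + i];
    }
}

static void
example_permute_store_chain (int chain[4][EXAMPLE_NELT])
{
  int tmp[4][EXAMPLE_NELT];
  int stage, j, k;
  for (stage = 0; stage < 2; stage++)          /* log2 (LENGTH) stages */
    {
      /* Pair vector J with vector J + LENGTH/2; the results land in
         positions 2*J and 2*J + 1, as in the loop above.  */
      for (j = 0; j < 2; j++)
        example_interleave (chain[j], chain[j + 2],
                            tmp[2 * j], tmp[2 * j + 1]);
      for (j = 0; j < 4; j++)
        for (k = 0; k < EXAMPLE_NELT; k++)
          chain[j][k] = tmp[j][k];
    }
}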
3975 /* Function vect_setup_realignment
3977 This function is called when vectorizing an unaligned load using
3978 the dr_explicit_realign[_optimized] scheme.
3979 This function generates the following code at the loop prolog:
3982 x msq_init = *(floor(p)); # prolog load
3983 realignment_token = call target_builtin;
3985 x msq = phi (msq_init, ---)
3987 The stmts marked with x are generated only for the case of
3988 dr_explicit_realign_optimized.
3990 The code above sets up a new (vector) pointer, pointing to the first
3991 location accessed by STMT, and a "floor-aligned" load using that pointer.
3992 It also generates code to compute the "realignment-token" (if the relevant
3993 target hook was defined), and creates a phi-node at the loop-header bb
3994 whose arguments are the result of the prolog-load (created by this
3995 function) and the result of a load that takes place in the loop (to be
3996 created by the caller to this function).
3998 For the case of dr_explicit_realign_optimized:
3999 The caller to this function uses the phi-result (msq) to create the
4000 realignment code inside the loop, and sets up the missing phi argument,
4003 msq = phi (msq_init, lsq)
4004 lsq = *(floor(p')); # load in loop
4005 result = realign_load (msq, lsq, realignment_token);
4007 For the case of dr_explicit_realign:
4009 msq = *(floor(p)); # load in loop
4011 lsq = *(floor(p')); # load in loop
4012 result = realign_load (msq, lsq, realignment_token);
4015 STMT - (scalar) load stmt to be vectorized. This load accesses
4016 a memory location that may be unaligned.
4017 BSI - place where new code is to be inserted.
4018 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4022 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4023 target hook, if defined.
4024 Return value - the result of the loop-header phi node. */
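/* Worked example (illustrative): with a 16-byte vector size and
   p == base + 4, the prolog load msq reads the 16 aligned bytes at base
   (floor (p)); each iteration then loads lsq from the next aligned slot,
   and realign_load combines the pair, shifting by the 4-byte misalignment
   to produce the 16 bytes that start at the unaligned address p.  */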
tree
vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple inc;
  tree ptr;
  tree data_ref;
  gimple new_stmt;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gimple phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);
  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
	done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only does the
     optimized realignment scheme become inapplicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) has to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }
  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);
  /* 3. For the case of the optimized realignment, create the first vector
	load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      &inv_p);
      new_stmt = gimple_build_assign_with_ops
		   (BIT_AND_EXPR, NULL_TREE, ptr,
		    build_int_cst (TREE_TYPE (ptr),
				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
      new_temp = make_ssa_name (SSA_NAME_VAR (ptr), new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      mark_symbols_for_renaming (new_stmt);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }
  /* 4. Create realignment token using a target builtin, if available.
	It is done either inside the containing loop, or before LOOP (as
	determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE, loop);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
  /* 5. Create msq = phi <msq_init, lsq> in loop.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest, NULL);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  SSA_NAME_DEF_STMT (msq) = phi_stmt;
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
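
/* Expository sketch, not part of GCC (compiled out): a byte-level model of
   what the REALIGN_LOAD emitted by our caller computes from the two
   floor-aligned loads MSQ and LSQ set up above.  OFF models the byte
   misalignment of the original address and VS the vector size in bytes;
   real targets encode OFF in the realignment token returned by
   builtin_mask_for_load rather than passing it as an integer.  */
#if 0
static void
model_realign_load (const unsigned char *msq, const unsigned char *lsq,
		    unsigned int vs, unsigned int off,
		    unsigned char *result)
{
  unsigned int k;

  /* The unaligned vector at floor(p) + off consists of the last vs - off
     bytes of MSQ followed by the first off bytes of LSQ.  */
  for (k = 0; k < vs; k++)
    result[k] = k < vs - off ? msq[off + k] : lsq[k - (vs - off)];
}
#endif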
/* Function vect_strided_load_supported.

   Returns TRUE if even and odd permutations are supported,
   and FALSE otherwise.  */

bool
vect_strided_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_load_chain requires the group size to be a power of two.  */
  if (exact_log2 (count) == -1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	fprintf (vect_dump, "the size of the group of strided accesses"
		 " is not a power of 2");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      /* Extract even: select elements {0, 2, 4, ...} of the concatenation
	 of the two input vectors.  */
      for (i = 0; i < nelt; i++)
	sel[i] = i * 2;
      if (can_vec_perm_p (mode, false, sel))
	{
	  /* Extract odd: select elements {1, 3, 5, ...}.  */
	  for (i = 0; i < nelt; i++)
	    sel[i] = i * 2 + 1;
	  if (can_vec_perm_p (mode, false, sel))
	    return true;
	}
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "extract even/odd not supported by target");
  return false;
}
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2, generate extract_even/odd stmts to reorder the input data
   correctly.  Return the final references for loads in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7


   The permutation is done in log2(LENGTH) stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */
static void
vect_permute_load_chain (VEC(tree,heap) *dr_chain,
			 unsigned int length,
			 gimple stmt,
			 gimple_stmt_iterator *gsi,
			 VEC(tree,heap) **result_chain)
{
  tree perm_dest, data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  *result_chain = VEC_copy (tree, heap, dr_chain);

  /* Build the extract-even selector {0, 2, 4, ...}.  */
  for (i = 0; i < nelt; ++i)
    sel[i] = i * 2;
  perm_mask_even = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_even != NULL);

  /* Build the extract-odd selector {1, 3, 5, ...}.  */
  for (i = 0; i < nelt; ++i)
    sel[i] = i * 2 + 1;
  perm_mask_odd = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_odd != NULL);

  for (i = 0; i < log_length; i++)
    {
      for (j = 0; j < length; j += 2)
	{
	  first_vect = VEC_index (tree, dr_chain, j);
	  second_vect = VEC_index (tree, dr_chain, j+1);

	  /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	  perm_dest = create_tmp_var (vectype, "vect_perm_even");
	  DECL_GIMPLE_REG_P (perm_dest) = 1;
	  add_referenced_var (perm_dest);

	  perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest,
						     first_vect, second_vect,
						     perm_mask_even);

	  data_ref = make_ssa_name (perm_dest, perm_stmt);
	  gimple_assign_set_lhs (perm_stmt, data_ref);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  mark_symbols_for_renaming (perm_stmt);

	  VEC_replace (tree, *result_chain, j/2, data_ref);

	  /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	  perm_dest = create_tmp_var (vectype, "vect_perm_odd");
	  DECL_GIMPLE_REG_P (perm_dest) = 1;
	  add_referenced_var (perm_dest);

	  perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest,
						     first_vect, second_vect,
						     perm_mask_odd);

	  data_ref = make_ssa_name (perm_dest, perm_stmt);
	  gimple_assign_set_lhs (perm_stmt, data_ref);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  mark_symbols_for_renaming (perm_stmt);

	  VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
	}
      dr_chain = VEC_copy (tree, heap, *result_chain);
    }
}
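
/* Expository sketch, not part of GCC (compiled out): a scalar model of one
   extract_even/extract_odd stage generated above, assuming a hypothetical
   element type of int and an even number NELT of elements per vector.
   EVEN receives elements {0, 2, 4, ...} and ODD elements {1, 3, 5, ...}
   of the concatenation of FIRST and SECOND; applying the stage
   log2(LENGTH) times, with the RESULT_CHAIN-to-DR_CHAIN copy in between,
   yields the de-interleaved vectors of the running example.  */
#if 0
static void
model_extract_even_odd (const int *first, const int *second,
			unsigned int nelt, int *even, int *odd)
{
  unsigned int k;

  for (k = 0; k < nelt / 2; k++)
    {
      even[k] = first[2 * k];
      even[nelt / 2 + k] = second[2 * k];
      odd[k] = first[2 * k + 1];
      odd[nelt / 2 + k] = second[2 * k + 1];
    }
}
#endif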
/* Function vect_transform_strided_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
			     gimple_stmt_iterator *gsi)
{
  VEC(tree,heap) *result_chain = NULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain = VEC_alloc (tree, heap, size);
  vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_strided_load_vectors (stmt, result_chain);
  VEC_free (tree, heap, result_chain);
}
/* RESULT_CHAIN contains the output of a group of strided loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_strided_load_vectors (gimple stmt, VEC(tree,heap) *result_chain)
{
  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple next_stmt, new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (tree, result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT field.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  gimple prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}
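
/* Expository sketch, not part of GCC (compiled out): the GROUP_GAP skipping
   above, reduced to its core.  GAPS[m] models GROUP_GAP for group member m
   (1 when there is no gap; GAPS[0] is never consulted), and the result maps
   each group member to the index of its vector in RESULT_CHAIN.  The
   GROUP_SAME_DR_STMT handling for multiple copies is deliberately left out
   of the model.  */
#if 0
static void
model_assign_group_vectors (unsigned int nvectors, unsigned int nmembers,
			    const unsigned int *gaps,
			    unsigned int *vector_of_member)
{
  unsigned int i, member = 0, gap_count = 1;

  for (i = 0; i < nvectors && member < nmembers; i++)
    {
      /* Vectors loaded for a gap are skipped; DCE removes them later.  */
      if (member != 0 && gap_count < gaps[member])
	{
	  gap_count++;
	  continue;
	}
      vector_of_member[member++] = i;
      gap_count = 1;
    }
}
#endif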
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  if (DECL_EXTERNAL (decl))
    return false;

  if (TREE_ASM_WRITTEN (decl))
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
			       bool check_aligned_accesses)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  enum machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }
  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */
  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	  if ((nested_in_vect_loop
	       && (TREE_INT_CST_LOW (DR_STEP (dr))
		   != GET_MODE_SIZE (TYPE_MODE (vectype))))
	      || !loop_vinfo)
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = contains_packed_reference (DR_REF (dr));

      if (targetm.vectorize.
	  support_vector_misalignment (mode, type,
				       DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = contains_packed_reference (DR_REF (dr));

      if (targetm.vectorize.
	  support_vector_misalignment (mode, type,
				       DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
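
/* Expository sketch, not part of GCC (compiled out): the misalignment
   evolution the comment above reasons about.  For an inner-loop access
   whose address advances by STEP elements per iteration and starts with
   misalignment OFF0, the misalignment of the N-th access, counted in
   elements modulo the vector size VS, is (OFF0 + N * STEP) % VS.  It is
   loop-invariant iff STEP % VS == 0; the code above applies the stricter
   test that DR_STEP equals the vector size when choosing
   dr_explicit_realign_optimized over dr_explicit_realign.  */
#if 0
static unsigned int
model_misalignment (unsigned int off0, unsigned int step,
		    unsigned int vs, unsigned int n)
{
  return (off0 + n * step) % vs;
}
#endif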