From: hubicka Date: Tue, 5 Oct 2010 17:57:09 +0000 (+0000) Subject: * doc/invoke.texi (-flto-partition, lto-partitions, lto-minpartition): X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=commitdiff_plain;h=48e3ea525e2a656e64fe9b011db184249bd0d3b7;hp=f777731452a61b365196ab1c9a9090f087b8596b * doc/invoke.texi (-flto-partition, lto-partitions, lto-minpartition): Document. * opts.c (decode_options): Handle lto partitions. * common.opt (flto-partition=1to1, flto-partition=balanced): New. * params.def (PARAM_LTO_PARTITIONS, MIN_PARTITION_SIZE): New. * lto.c: Include params.h. (add_cgraph_node_to_partition, add_varpool_node_to_partition): Do refcounting in aux field. (undo_partition, partition_cgraph_node_p, partition_varpool_node_p): New functions. (lto_1_to_1_map): Simplify. (lto_balanced_map): New function. (do_whole_program_analysis): Chose proper partitioning alg. * Make-lang.in (lto.o): Add dependency on params.h git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@164995 138bc75d-0d04-0410-961f-82ee72b054a4 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 924eb98e029..de2ab3b421a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,4 +1,12 @@ -2010-09-29 Jan Hubicka +2010-10-05 Jan Hubicka + + * doc/invoke.texi (-flto-partition, lto-partitions, lto-minpartition): + Document. + * opts.c (decode_options): Handle lto partitions. + * common.opt (flto-partition=1to1, flto-partition=balanced): New. + * params.def (PARAM_LTO_PARTITIONS, MIN_PARTITION_SIZE): New. + +2010-10-05 Jan Hubicka * cgraphunit.c (assemble_function): Output thunks and aliases before the function itself. diff --git a/gcc/common.opt b/gcc/common.opt index 2c1bd833f8a..4bb648d4477 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1049,6 +1049,14 @@ flto Common Var(flag_lto) Enable link-time optimization. +flto-partition=1to1 +Common Var(flag_lto_partition_1to1) +Partition functions and vars at linktime based on object files they originate from + +flto-partition=balanced +Common Var(flag_lto_partition_balanced) +Partition functions and vars at linktime into approximately same sized buckets + ; The initial value of -1 comes from Z_DEFAULT_COMPRESSION in zlib.h. flto-compression-level= Common Joined RejectNegative UInteger Var(flag_lto_compression_level) Init(-1) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index af0c90f0d76..12866435d24 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -354,10 +354,10 @@ Objective-C and Objective-C++ Dialects}. -fno-ira-share-spill-slots -fira-verbose=@var{n} @gol -fivopts -fkeep-inline-functions -fkeep-static-consts @gol -floop-block -floop-flatten -floop-interchange -floop-strip-mine @gol --floop-parallelize-all -flto -flto-compression-level -flto-report @gol --fltrans -fltrans-output-list -fmerge-all-constants -fmerge-constants @gol --fmodulo-sched -fmodulo-sched-allow-regmoves -fmove-loop-invariants @gol --fmudflap -fmudflapir -fmudflapth -fno-branch-count-reg @gol +-floop-parallelize-all -flto -flto-compression-level -flto-partition=@var{alg} @gol +-flto-report -fltrans -fltrans-output-list -fmerge-all-constants @gol +-fmerge-constants -fmodulo-sched -fmodulo-sched-allow-regmoves @gol +-fmove-loop-invariants fmudflap -fmudflapir -fmudflapth -fno-branch-count-reg @gol -fno-default-inline @gol -fno-defer-pop -fno-function-cse -fno-guess-branch-probability @gol -fno-inline -fno-math-errno -fno-peephole -fno-peephole2 @gol @@ -7596,6 +7596,13 @@ GNU make. Disabled by default. +@item -flto-partition=@var{alg} +@opindex flto-partition +Specify partitioning algorithm used by @option{-fwhopr} mode. The value is +either @code{1to1} to specify partitioning corresponding to source files +or @code{balanced} to specify partitioning into, if possible, equally sized +chunks. The default value is @code{balanced}. + @item -fwpa @opindex fwpa This is an internal option used by GCC when compiling with @@ -8789,6 +8796,16 @@ parameter in order to perform devirtualization. @option{devirt-type-list-size} is the maximum number of types it stores per a single formal parameter of a function. +@item lto-partitions +Specify desired nuber of partitions produced during WHOPR copmilation. +Number of partitions should exceed number of CPUs used for compilatoin. +Default value is 32. + +@item lto-minpartition +Size of minimal paritition for WHOPR (in estimated instructions). +This prevents expenses of splitting very small programs into too many +partitions. + @end table @end table diff --git a/gcc/lto/ChangeLog b/gcc/lto/ChangeLog index 49e2d75c880..29b29cca553 100644 --- a/gcc/lto/ChangeLog +++ b/gcc/lto/ChangeLog @@ -1,3 +1,15 @@ +2010-10-05 Jan Hubicka + + * lto.c: Include params.h. + (add_cgraph_node_to_partition, add_varpool_node_to_partition): Do + refcounting in aux field. + (undo_partition, partition_cgraph_node_p, partition_varpool_node_p): + New functions. + (lto_1_to_1_map): Simplify. + (lto_balanced_map): New function. + (do_whole_program_analysis): Chose proper partitioning alg. + * Make-lang.in (lto.o): Add dependency on params.h + 2010-10-04 Andi Kleen * Make-lang.in (lto1): Add + to build rule. diff --git a/gcc/lto/Make-lang.in b/gcc/lto/Make-lang.in index 9391405c077..21652315269 100644 --- a/gcc/lto/Make-lang.in +++ b/gcc/lto/Make-lang.in @@ -85,7 +85,7 @@ lto/lto.o: lto/lto.c $(CONFIG_H) $(SYSTEM_H) coretypes.h opts.h \ $(CGRAPH_H) $(GGC_H) tree-ssa-operands.h $(TREE_PASS_H) \ langhooks.h $(VEC_H) $(BITMAP_H) pointer-set.h $(IPA_PROP_H) \ $(COMMON_H) debug.h $(TIMEVAR_H) $(GIMPLE_H) $(LTO_H) $(LTO_TREE_H) \ - $(LTO_TAGS_H) $(LTO_STREAMER_H) $(SPLAY_TREE_H) gt-lto-lto.h + $(LTO_TAGS_H) $(LTO_STREAMER_H) $(SPLAY_TREE_H) gt-lto-lto.h $(PARAMS_H) lto/lto-elf.o: lto/lto-elf.c $(CONFIG_H) coretypes.h $(SYSTEM_H) \ toplev.h $(LTO_H) $(TM_H) $(LIBIBERTY_H) $(GGC_H) $(LTO_STREAMER_H) lto/lto-coff.o: lto/lto-coff.c $(CONFIG_H) coretypes.h $(SYSTEM_H) \ diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c index 2fbea7332d4..6623704fa53 100644 --- a/gcc/lto/lto.c +++ b/gcc/lto/lto.c @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3. If not see #include "lto-tree.h" #include "lto-streamer.h" #include "splay-tree.h" +#include "params.h" /* This needs to be included after config.h. Otherwise, _GNU_SOURCE will not be defined in time to set __USE_GNU in the system headers, and strsignal @@ -759,12 +760,8 @@ add_cgraph_node_to_partition (ltrans_partition part, struct cgraph_node *node) part->insns += node->local.inline_summary.self_size; if (node->aux) - { - gcc_assert (node->aux != part); - node->in_other_partition = 1; - } - else - node->aux = part; + node->in_other_partition = 1; + node->aux = (void *)((size_t)node->aux + 1); cgraph_node_set_add (part->cgraph_set, node); @@ -788,12 +785,8 @@ add_varpool_node_to_partition (ltrans_partition part, struct varpool_node *vnode varpool_node_set_add (part->varpool_set, vnode); if (vnode->aux) - { - gcc_assert (vnode->aux != part); - vnode->in_other_partition = 1; - } - else - vnode->aux = part; + vnode->in_other_partition = 1; + vnode->aux = (void *)((size_t)vnode->aux + 1); add_references_to_partition (part, &vnode->ref_list); @@ -802,6 +795,70 @@ add_varpool_node_to_partition (ltrans_partition part, struct varpool_node *vnode add_varpool_node_to_partition (part, vnode->same_comdat_group); } +/* Undo all additions until number of cgraph nodes in PARITION is N_CGRAPH_NODES + and number of varpool nodes is N_VARPOOL_NODES. */ + +static void +undo_partition (ltrans_partition partition, unsigned int n_cgraph_nodes, + unsigned int n_varpool_nodes) +{ + while (VEC_length (cgraph_node_ptr, partition->cgraph_set->nodes) > + n_cgraph_nodes) + { + struct cgraph_node *node = VEC_index (cgraph_node_ptr, + partition->cgraph_set->nodes, + n_cgraph_nodes); + partition->insns -= node->local.inline_summary.self_size; + cgraph_node_set_remove (partition->cgraph_set, node); + node->aux = (void *)((size_t)node->aux - 1); + } + while (VEC_length (varpool_node_ptr, partition->varpool_set->nodes) > + n_varpool_nodes) + { + struct varpool_node *node = VEC_index (varpool_node_ptr, + partition->varpool_set->nodes, + n_varpool_nodes); + varpool_node_set_remove (partition->varpool_set, node); + node->aux = (void *)((size_t)node->aux - 1); + } +} + +/* Return true if NODE should be partitioned. + This means that partitioning algorithm should put NODE into one of partitions. + This apply to most functions with bodies. Functions that are not partitions + are put into every unit needing them. This is the case of i.e. COMDATs. */ + +static bool +partition_cgraph_node_p (struct cgraph_node *node) +{ + /* We will get proper partition based on function they are inlined to. */ + if (node->global.inlined_to) + return false; + /* Nodes without a body do not need partitioning. */ + if (!node->analyzed) + return false; + /* Extern inlines and comdat are always only in partitions they are needed. */ + if (DECL_EXTERNAL (node->decl) + || DECL_COMDAT (node->decl)) + return false; + return true; +} + +/* Return true if VNODE should be partitioned. + This means that partitioning algorithm should put VNODE into one of partitions. */ + +static bool +partition_varpool_node_p (struct varpool_node *vnode) +{ + if (vnode->alias || !vnode->needed) + return false; + /* Constant pool and comdat are always only in partitions they are needed. */ + if (DECL_IN_CONSTANT_POOL (vnode->decl) + || DECL_COMDAT (vnode->decl)) + return false; + return true; +} + /* Group cgrah nodes by input files. This is used mainly for testing right now. */ @@ -822,15 +879,7 @@ lto_1_to_1_map (void) for (node = cgraph_nodes; node; node = node->next) { - /* We will get proper partition based on function they are inlined to. */ - if (node->global.inlined_to) - continue; - /* Nodes without a body do not need partitioning. */ - if (!node->analyzed) - continue; - /* Extern inlines and comdat are always only in partitions they are needed. */ - if (DECL_EXTERNAL (node->decl) - || DECL_COMDAT (node->decl)) + if (!partition_cgraph_node_p (node)) continue; file_data = node->local.lto_file_data; @@ -865,11 +914,7 @@ lto_1_to_1_map (void) for (vnode = varpool_nodes; vnode; vnode = vnode->next) { - if (vnode->alias || !vnode->needed) - continue; - /* Constant pool and comdat are always only in partitions they are needed. */ - if (DECL_IN_CONSTANT_POOL (vnode->decl) - || DECL_COMDAT (vnode->decl)) + if (!partition_varpool_node_p (vnode)) continue; file_data = vnode->lto_file_data; slot = pointer_map_contains (pmap, file_data); @@ -903,6 +948,300 @@ lto_1_to_1_map (void) ltrans_partitions); } + +/* Group cgraph nodes in qually sized partitions. + + The algorithm deciding paritions are simple: nodes are taken in predefined + order. The order correspond to order we wish to have functions in final + output. In future this will be given by function reordering pass, but at + the moment we use topological order that serve a good approximation. + + The goal is to partition this linear order into intervals (partitions) such + that all partitions have approximately the same size and that the number of + callgraph or IPA reference edgess crossing boundaries is minimal. + + This is a lot faster (O(n) in size of callgraph) than algorithms doing + priority based graph clustering that are generally O(n^2) and since WHOPR + is designed to make things go well across partitions, it leads to good results. + + We compute the expected size of partition as + max (total_size / lto_partitions, min_partition_size). + We use dynamic expected size of partition, so small programs + are partitioning into enough partitions to allow use of multiple CPUs while + large programs are not partitioned too much. Creating too many partition + increase streaming overhead significandly. + + In the future we would like to bound maximal size of partition to avoid + ltrans stage consuming too much memory. At the moment however WPA stage is + most memory intensive phase at large benchmark since too many types and + declarations are read into memory. + + The function implement simple greedy algorithm. Nodes are begin added into + current partition until 3/4th of expected partition size is reached. + After this threshold we keep track of boundary size (number of edges going to + other partitions) and continue adding functions until the current partition + grows into a double of expected partition size. Then the process is undone + till the point when minimal ration of boundary size and in partition calls + was reached. */ + +static void +lto_balanced_map (void) +{ + int n_nodes = 0; + struct cgraph_node **postorder = + XCNEWVEC (struct cgraph_node *, cgraph_n_nodes); + struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid); + int i, postorder_len; + struct cgraph_node *node; + int total_size = 0; + int partition_size; + ltrans_partition partition; + unsigned int last_visited_cgraph_node = 0, last_visited_varpool_node = 0; + struct varpool_node *vnode; + int cost = 0, internal = 0; + int best_n_nodes = 0, best_n_varpool_nodes = 0, best_i = 0, best_cost = + INT_MAX, best_internal = 0; + int npartitions; + + /* Until we have better ordering facility, use toplogical order. + Include only nodes we will partition and compute estimate of program + size. Note that since nodes that are not partitioned might be put into + multiple partitions, this is just an estimate of real size. This is why + we keep partition_size updated after every partition is finalized. */ + postorder_len = cgraph_postorder (postorder); + for (i = 0; i < postorder_len; i++) + { + node = postorder[i]; + if (partition_cgraph_node_p (node)) + { + order[n_nodes++] = node; + total_size += node->local.inline_summary.self_size; + } + } + free (postorder); + + /* Compute partition size and create the first partition. */ + partition_size = total_size / PARAM_VALUE (PARAM_LTO_PARTITIONS); + if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE)) + partition_size = PARAM_VALUE (MIN_PARTITION_SIZE); + npartitions = 1; + partition = new_partition (""); + if (cgraph_dump_file) + fprintf (cgraph_dump_file, "Total unit size: %i, partition size: %i\n", + total_size, partition_size); + + for (i = 0; i < n_nodes; i++) + { + add_cgraph_node_to_partition (partition, order[i]); + + /* Once we added a new node to the partition, we also want to add + all referenced variables unless they was already added into some + earlier partition. + add_cgraph_node_to_partition adds possibly multiple nodes and + variables that are needed to satisfy needs of ORDER[i]. + We remember last visited cgraph and varpool node from last iteration + of outer loop that allows us to process every new addition. + + At the same time we compute size of the boundary into COST. Every + callgraph or IPA reference edge leaving the partition contributes into + COST. Every edge inside partition was earlier computed as one leaving + it and thus we need to subtract it from COST. */ + while (last_visited_cgraph_node < + VEC_length (cgraph_node_ptr, partition->cgraph_set->nodes) + || last_visited_varpool_node < VEC_length (varpool_node_ptr, + partition->varpool_set-> + nodes)) + { + struct ipa_ref_list *refs; + int j; + struct ipa_ref *ref; + bool cgraph_p = false; + + if (last_visited_cgraph_node < + VEC_length (cgraph_node_ptr, partition->cgraph_set->nodes)) + { + struct cgraph_edge *edge; + + cgraph_p = true; + node = VEC_index (cgraph_node_ptr, partition->cgraph_set->nodes, + last_visited_cgraph_node); + refs = &node->ref_list; + + total_size -= node->local.inline_summary.self_size; + last_visited_cgraph_node++; + + gcc_assert (node->analyzed); + + /* Compute boundary cost of callgrpah edges. */ + for (edge = node->callees; edge; edge = edge->next_callee) + if (edge->callee->analyzed) + { + int edge_cost = edge->frequency; + cgraph_node_set_iterator csi; + + if (!edge_cost) + edge_cost = 1; + gcc_assert (edge_cost > 0); + csi = cgraph_node_set_find (partition->cgraph_set, edge->callee); + if (!csi_end_p (csi) + && csi.index < last_visited_cgraph_node - 1) + cost -= edge_cost, internal+= edge_cost; + else + cost += edge_cost; + } + for (edge = node->callers; edge; edge = edge->next_caller) + { + int edge_cost = edge->frequency; + cgraph_node_set_iterator csi; + + gcc_assert (edge->caller->analyzed); + if (!edge_cost) + edge_cost = 1; + gcc_assert (edge_cost > 0); + csi = cgraph_node_set_find (partition->cgraph_set, edge->caller); + if (!csi_end_p (csi) + && csi.index < last_visited_cgraph_node) + cost -= edge_cost; + else + cost += edge_cost; + } + } + else + { + refs = + &VEC_index (varpool_node_ptr, partition->varpool_set->nodes, + last_visited_varpool_node)->ref_list; + last_visited_varpool_node++; + } + + /* Compute boundary cost of IPA REF edges and at the same time look into + variables referenced from current partition and try to add them. */ + for (j = 0; ipa_ref_list_reference_iterate (refs, j, ref); j++) + if (ref->refered_type == IPA_REF_VARPOOL) + { + varpool_node_set_iterator vsi; + + vnode = ipa_ref_varpool_node (ref); + if (!vnode->finalized) + continue; + if (!vnode->aux && partition_varpool_node_p (vnode)) + add_varpool_node_to_partition (partition, vnode); + vsi = varpool_node_set_find (partition->varpool_set, vnode); + if (!vsi_end_p (vsi) + && vsi.index < last_visited_varpool_node - !cgraph_p) + cost--, internal++; + else + cost++; + } + else + { + cgraph_node_set_iterator csi; + + node = ipa_ref_node (ref); + if (!node->analyzed) + continue; + csi = cgraph_node_set_find (partition->cgraph_set, node); + if (!csi_end_p (csi) + && csi.index < last_visited_cgraph_node - cgraph_p) + cost--, internal++; + else + cost++; + } + for (j = 0; ipa_ref_list_refering_iterate (refs, j, ref); j++) + if (ref->refering_type == IPA_REF_VARPOOL) + { + varpool_node_set_iterator vsi; + + vnode = ipa_ref_refering_varpool_node (ref); + gcc_assert (vnode->finalized); + if (!vnode->aux && partition_varpool_node_p (vnode)) + add_varpool_node_to_partition (partition, vnode); + vsi = varpool_node_set_find (partition->varpool_set, vnode); + if (!vsi_end_p (vsi) + && vsi.index < last_visited_varpool_node) + cost--; + else + cost++; + } + else + { + cgraph_node_set_iterator csi; + + node = ipa_ref_refering_node (ref); + gcc_assert (node->analyzed); + csi = cgraph_node_set_find (partition->cgraph_set, node); + if (!csi_end_p (csi) + && csi.index < last_visited_cgraph_node) + cost--; + else + cost++; + } + } + + /* If the partition is large enough, start looking for smallest boundary cost. */ + if (partition->insns < partition_size * 3 / 4 + || best_cost == INT_MAX + || ((!cost + || (best_internal * (HOST_WIDE_INT) cost + > (internal * (HOST_WIDE_INT)best_cost))) + && partition->insns < partition_size * 5 / 4)) + { + best_cost = cost; + best_internal = internal; + best_i = i; + best_n_nodes = VEC_length (cgraph_node_ptr, + partition->cgraph_set->nodes); + best_n_varpool_nodes = VEC_length (varpool_node_ptr, + partition->varpool_set->nodes); + } + if (cgraph_dump_file) + fprintf (cgraph_dump_file, "Step %i: added %s, size %i, cost %i/%i best %i/%i, step %i\n", i, + cgraph_node_name (order[i]), partition->insns, cost, internal, + best_cost, best_internal, best_i); + /* Partition is too large, unwind into step when best cost was reached and + start new partition. */ + if (partition->insns > 2 * partition_size) + { + if (best_i != i) + { + if (cgraph_dump_file) + fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n", + i - best_i, best_i); + undo_partition (partition, best_n_nodes, best_n_varpool_nodes); + } + i = best_i; + partition = new_partition (""); + last_visited_cgraph_node = 0; + last_visited_varpool_node = 0; + cost = 0; + + if (cgraph_dump_file) + fprintf (cgraph_dump_file, "New partition\n"); + best_n_nodes = 0; + best_n_varpool_nodes = 0; + best_cost = INT_MAX; + + /* Since the size of partitions is just approximate, update the size after + we finished current one. */ + if (npartitions < PARAM_VALUE (PARAM_LTO_PARTITIONS)) + partition_size = total_size + / (PARAM_VALUE (PARAM_LTO_PARTITIONS) - npartitions); + else + partition_size = INT_MAX; + + if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE)) + partition_size = PARAM_VALUE (MIN_PARTITION_SIZE); + npartitions ++; + } + } + + /* Varables that are not reachable from the code go into last partition. */ + for (vnode = varpool_nodes; vnode; vnode = vnode->next) + if (partition_varpool_node_p (vnode) && !vnode->aux) + add_varpool_node_to_partition (partition, vnode); + free (order); +} + /* Promote variable VNODE to be static. */ static bool @@ -1990,7 +2329,10 @@ do_whole_program_analysis (void) /* We are about to launch the final LTRANS phase, stop the WPA timer. */ timevar_pop (TV_WHOPR_WPA); - lto_1_to_1_map (); + if (flag_lto_partition_1to1) + lto_1_to_1_map (); + else + lto_balanced_map (); if (!quiet_flag) { diff --git a/gcc/opts.c b/gcc/opts.c index 05603358170..03098b53772 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -1081,6 +1081,13 @@ decode_options (unsigned int argc, const char **argv, error ("LTO support has not been enabled in this configuration"); #endif } + if (flag_lto_partition_balanced || flag_lto_partition_1to1) + { + if (flag_lto_partition_balanced && flag_lto_partition_1to1) + error ("Only one -flto-partitoin value can be specified"); + if (!flag_whopr) + error ("-flto-partitoin has effect only with -fwhopr"); + } /* Reconcile -flto and -fwhopr. Set additional flags as appropriate and check option consistency. */ diff --git a/gcc/params.def b/gcc/params.def index fef75dd5b56..49a6185d48b 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -838,6 +838,17 @@ DEFPARAM (PARAM_DEVIRT_TYPE_LIST_SIZE, "devirtualization", 8, 0, 0) +/* WHOPR partitioning configuration. */ + +DEFPARAM (PARAM_LTO_PARTITIONS, + "lto-partitions", + "Number of paritions program should be split to", + 32, 0, 0) + +DEFPARAM (MIN_PARTITION_SIZE, + "lto-min-partition", + "Size of minimal paritition for WHOPR (in estimated instructions)", + 1000, 0, 0) /* Local variables: mode:c