static bool arm_cirrus_insn_p (rtx);
static void cirrus_reorg (rtx);
static void arm_init_builtins (void);
-static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
static void arm_init_iwmmxt_builtins (void);
static rtx safe_vector_operand (rtx, enum machine_mode);
static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx);
/* The processor for which instructions should be scheduled. */
enum processor_type arm_tune = arm_none;
+/* The current tuning set. */
+const struct tune_params *current_tune;
+
/* The default processor used if not overridden by commandline. */
static enum processor_type arm_default_cpu = arm_none;
the next function. */
static int after_arm_reorg = 0;
-/* The maximum number of insns to be used when loading a constant. */
-static int arm_constant_limit = 3;
-
static enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
enum processor_type core;
const char *arch;
const unsigned long flags;
- bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool);
+ const struct tune_params *const tune;
+};
+
+const struct tune_params arm_slowmul_tune =
+{
+ arm_slowmul_rtx_costs,
+ 3   /* Constant limit. */
+};
+
+const struct tune_params arm_fastmul_tune =
+{
+ arm_fastmul_rtx_costs,
+ 1   /* Constant limit. */
+};
+
+const struct tune_params arm_xscale_tune =
+{
+ arm_xscale_rtx_costs,
+ 2   /* Constant limit. */
+};
+
+const struct tune_params arm_9e_tune =
+{
+ arm_9e_rtx_costs,
+ 1   /* Constant limit. */
};
/* Not all of these give usefully different compilation alternatives,
{
/* ARM Cores */
#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \
- {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs},
+ {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune},
#include "arm-cores.def"
#undef ARM_CORE
{NULL, arm_none, NULL, 0, NULL}
static const struct processors all_architectures[] =
{
/* ARM Architectures */
- /* We don't specify rtx_costs here as it will be figured out
+ /* We don't specify tuning costs here as they will be figured out
from the core. */
{"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL},
TLS_LE32
};
+/* Return the maximum number of insns to be used when loading a constant. */
+inline static int
+arm_constant_limit (bool size_p)
+{
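+ /* When optimizing for size, allow at most one insn before falling
+ back to loading the constant from the literal pool. */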
+ return size_p ? 1 : current_tune->constant_limit;
+}
+
/* Emit an insn that's a simple single-set. Both the operands must be known
to be valid. */
inline static rtx
gcc_assert (arm_tune != arm_none);
tune_flags = all_cores[(int)arm_tune].flags;
+ current_tune = all_cores[(int)arm_tune].tune;
if (target_fp16_format_name)
{
if (optimize_size)
{
- arm_constant_limit = 1;
-
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM). */
max_insns_skipped = 6;
}
else
{
- /* For processors with load scheduling, it never costs more than
- 2 cycles to load a constant, and the load scheduler may well
- reduce that to 1. */
- if (arm_ld_sched)
- arm_constant_limit = 1;
-
- /* On XScale the longer latency of a load makes it more difficult
- to achieve a good schedule, so it's faster to synthesize
- constants that can be done in two insns. */
- if (arm_tune_xscale)
- arm_constant_limit = 2;
-
/* StrongARM has early execution of branches, so a sequence
that is worth skipping is shorter. */
if (arm_tune_strongarm)
&& !cond
&& (arm_gen_constant (code, mode, NULL_RTX, val, target, source,
1, 0)
- > arm_constant_limit + (code != SET)))
+ > (arm_constant_limit (optimize_function_for_size_p (cfun))
+ + (code != SET))))
{
if (code == SET)
{
return arm_size_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code, total);
else
- return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code,
- (enum rtx_code) outer_code,
- total, speed);
+ return current_tune->rtx_costs (x, (enum rtx_code) code,
+ (enum rtx_code) outer_code,
+ total, speed);
}
/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
supported on any "slowmul" cores, so it can be ignored. */
static bool
-arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed)
+arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ int *total, bool speed)
{
enum machine_mode mode = GET_MODE (x);
}
}
+/* Given the stack offsets and register mask in OFFSETS, decide
+ how many additional registers to push instead of subtracting
+ a constant from SP. */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
+{
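+ /* The SP adjustment that will remain after the saved registers have
+ been pushed. */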
+ HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
+ unsigned long live_regs_mask = offsets->saved_regs_mask;
+ /* Extract a mask of the ones we can give to the Thumb's push instruction
+ (the low registers r0-r7, plus LR as bit 14). */
+ unsigned long l_mask = live_regs_mask & 0x40ff;
+ /* Then count how many other high registers (r8-r11) will need to be pushed. */
+ unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+ int n_free;
+
+ /* If the stack frame size is 512 exactly, we can save one load
+ instruction, which should make this a win even when optimizing
+ for speed. */
+ if (!optimize_size && amount != 512)
+ return 0;
+
+ /* Can't do this if there are high registers to push, or if we
+ are not going to do a push at all. */
+ if (high_regs_pushed != 0 || l_mask == 0)
+ return 0;
+
+ /* Don't do this if thumb1_expand_prologue wants to emit instructions
+ between the push and the stack frame allocation. */
+ if ((flag_pic && arm_pic_register != INVALID_REGNUM)
+ || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
+ return 0;
+
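+ /* Count how many low registers, starting at r0, are free (not live). */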
+ for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
+ n_free++;
+
+ if (n_free == 0)
+ return 0;
+ gcc_assert (amount / 4 * 4 == amount);
+
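+ /* A Thumb-1 "sub sp" immediate can encode at most 508, so if pushing
+ the free registers would bring the adjustment below 512, push just
+ enough of them to reduce it to exactly 508; if they can absorb the
+ whole adjustment, no subtraction is needed at all. */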
+ if (amount >= 512 && (amount - n_free * 4) < 512)
+ return (amount - 508) / 4;
+ if (amount <= n_free * 4)
+ return amount / 4;
+ return 0;
+}
+
/* Generate the rest of a function's prologue. */
void
thumb1_expand_prologue (void)
stack_pointer_rtx);
amount = offsets->outgoing_args - offsets->saved_regs;
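+ /* Registers pushed by thumb1_extra_regs_pushed already account for
+ part of the adjustment, so subtract them from the amount. */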
+ amount -= 4 * thumb1_extra_regs_pushed (offsets);
if (amount)
{
if (amount < 512)
register. */
else if ((l_mask & 0xff) != 0
|| (high_regs_pushed == 0 && l_mask))
- thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask);
+ {
+ unsigned long mask = l_mask;
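+ /* thumb1_extra_regs_pushed counts free registers upwards from r0,
+ so fold them in as a mask of the lowest bits. */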
+ mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+ thumb_pushpop (f, mask, 1, &cfa_offset, mask);
+ }
if (high_regs_pushed)
{