From 8f32c760a0b58712d8cd4fc71bf55dbbd7505c34 Mon Sep 17 00:00:00 2001
From: jules <jules@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Tue, 18 Oct 2011 10:49:44 +0000
Subject: [PATCH] gcc/

* config/arm/arm.c (arm_block_move_unaligned_straight)
(arm_adjust_block_mem, arm_block_move_unaligned_loop)
(arm_movmemqi_unaligned): New.
(arm_gen_movmemqi): Support unaligned block copies.

gcc/testsuite/
* lib/target-supports.exp (check_effective_target_arm_unaligned): New.
* gcc.target/arm/unaligned-memcpy-1.c: New.
* gcc.target/arm/unaligned-memcpy-2.c: New.
* gcc.target/arm/unaligned-memcpy-3.c: New.
* gcc.target/arm/unaligned-memcpy-4.c: New.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@180131 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog                                     |   7 +
 gcc/config/arm/arm.c                              | 338 +++++++++++++++++++++-
 gcc/testsuite/ChangeLog                           |   8 +
 gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c |  19 ++
 gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c |  21 ++
 gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c |  21 ++
 gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c |  18 ++
 gcc/testsuite/lib/target-supports.exp             |  12 +
 8 files changed, 442 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1cc34b217e2..2968fb0de5e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2011-10-18  Julian Brown  <julian@codesourcery.com>
+
+	* config/arm/arm.c (arm_block_move_unaligned_straight)
+	(arm_adjust_block_mem, arm_block_move_unaligned_loop)
+	(arm_movmemqi_unaligned): New.
+	(arm_gen_movmemqi): Support unaligned block copies.
+
 2011-10-18  Ira Rosen  <ira.rosen@linaro.org>
 
 	* doc/md.texi (vec_widen_ushiftl_hi, vec_widen_ushiftl_lo,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index a429c192fdb..f1ada6f9a73 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -10766,6 +10766,335 @@ gen_const_stm_seq (rtx *operands, int nops)
   return true;
 }
 
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+   unaligned copies on processors which support unaligned semantics for those
+   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
+   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+   An interleave factor of 1 (the minimum) will perform no interleaving.
+   Load/store multiple are used for aligned addresses where possible.  */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+                                   HOST_WIDE_INT length,
+                                   unsigned int interleave_factor)
+{
+  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+  int *regnos = XALLOCAVEC (int, interleave_factor);
+  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+  HOST_WIDE_INT i, j;
+  HOST_WIDE_INT remaining = length, words;
+  rtx halfword_tmp = NULL, byte_tmp = NULL;
+  rtx dst, src;
+  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  HOST_WIDE_INT src_autoinc, dst_autoinc;
+  rtx mem, addr;
+
+  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+
+  /* Use hard registers if we have aligned source or destination so we can use
+     load/store multiple with contiguous registers.  */
+  if (dst_aligned || src_aligned)
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_rtx_REG (SImode, i);
+  else
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_reg_rtx (SImode);
+
+  dst = copy_addr_to_reg (XEXP (dstbase, 0));
+  src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+  srcoffset = dstoffset = 0;
+
+  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+     For copying the last bytes we want to subtract this offset again.  */
+  src_autoinc = dst_autoinc = 0;
+
+  for (i = 0; i < interleave_factor; i++)
+    regnos[i] = i;
+
+  /* Copy BLOCK_SIZE_BYTES chunks.  */
+
+  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+    {
+      /* Load words.  */
+      if (src_aligned && interleave_factor > 1)
+        {
+          emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+                                            TRUE, srcbase, &srcoffset));
+          src_autoinc += UNITS_PER_WORD * interleave_factor;
+        }
+      else
+        {
+          for (j = 0; j < interleave_factor; j++)
+            {
+              addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+                                         - src_autoinc);
+              mem = adjust_automodify_address (srcbase, SImode, addr,
+                                               srcoffset + j * UNITS_PER_WORD);
+              emit_insn (gen_unaligned_loadsi (regs[j], mem));
+            }
+          srcoffset += block_size_bytes;
+        }
+
+      /* Store words.  */
+      if (dst_aligned && interleave_factor > 1)
+        {
+          emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+                                             TRUE, dstbase, &dstoffset));
+          dst_autoinc += UNITS_PER_WORD * interleave_factor;
+        }
+      else
+        {
+          for (j = 0; j < interleave_factor; j++)
+            {
+              addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+                                         - dst_autoinc);
+              mem = adjust_automodify_address (dstbase, SImode, addr,
+                                               dstoffset + j * UNITS_PER_WORD);
+              emit_insn (gen_unaligned_storesi (mem, regs[j]));
+            }
+          dstoffset += block_size_bytes;
+        }
+
+      remaining -= block_size_bytes;
+    }
+
+  /* Copy any whole words left (note these aren't interleaved with any
+     subsequent halfword/byte load/stores in the interests of simplicity).  */
+
+  words = remaining / UNITS_PER_WORD;
+
+  gcc_assert (words < interleave_factor);
+
+  if (src_aligned && words > 1)
+    {
+      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+                                        &srcoffset));
+      src_autoinc += UNITS_PER_WORD * words;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+          addr = plus_constant (src,
+                                srcoffset + j * UNITS_PER_WORD - src_autoinc);
+          mem = adjust_automodify_address (srcbase, SImode, addr,
+                                           srcoffset + j * UNITS_PER_WORD);
+          emit_insn (gen_unaligned_loadsi (regs[j], mem));
+        }
+      srcoffset += words * UNITS_PER_WORD;
+    }
+
+  if (dst_aligned && words > 1)
+    {
+      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+                                         &dstoffset));
+      dst_autoinc += words * UNITS_PER_WORD;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+          addr = plus_constant (dst,
+                                dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+          mem = adjust_automodify_address (dstbase, SImode, addr,
+                                           dstoffset + j * UNITS_PER_WORD);
+          emit_insn (gen_unaligned_storesi (mem, regs[j]));
+        }
+      dstoffset += words * UNITS_PER_WORD;
+    }
+
+  remaining -= words * UNITS_PER_WORD;
+
+  gcc_assert (remaining < 4);
+
+  /* Copy a halfword if necessary.  */
+
+  if (remaining >= 2)
+    {
+      halfword_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+      /* Either write out immediately, or delay until we've loaded the last
+         byte, depending on interleave factor.  */
+      if (interleave_factor == 1)
+        {
+          addr = plus_constant (dst, dstoffset - dst_autoinc);
+          mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+          emit_insn (gen_unaligned_storehi (mem,
+                       gen_lowpart (HImode, halfword_tmp)));
+          halfword_tmp = NULL;
+          dstoffset += 2;
+        }
+
+      remaining -= 2;
+      srcoffset += 2;
+    }
+
+  gcc_assert (remaining < 2);
+
+  /* Copy last byte.  */
+
+  if ((remaining & 1) != 0)
+    {
+      byte_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+      if (interleave_factor == 1)
+        {
+          addr = plus_constant (dst, dstoffset - dst_autoinc);
+          mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+          emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+          byte_tmp = NULL;
+          dstoffset++;
+        }
+
+      remaining--;
+      srcoffset++;
+    }
+
+  /* Store last halfword if we haven't done so already.  */
+
+  if (halfword_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+      emit_insn (gen_unaligned_storehi (mem,
+                   gen_lowpart (HImode, halfword_tmp)));
+      dstoffset += 2;
+    }
+
+  /* Likewise for last byte.  */
+
+  if (byte_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+      dstoffset++;
+    }
+
+  gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+   Helper function for doing a loop-based block operation on memory
+   reference MEM.  Each iteration of the loop will operate on LENGTH
+   bytes of MEM.
+
+   Create a new base register for use within the loop and point it to
+   the start of MEM.  Create a new memory reference that uses this
+   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+                      rtx *loop_mem)
+{
+  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+
+  /* Although the new mem does not refer to a known location,
+     it does keep up to LENGTH bytes of alignment.  */
+  *loop_mem = change_address (mem, BLKmode, *loop_reg);
+  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
+   the memory regions do not overlap.  */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+                               unsigned int interleave_factor,
+                               HOST_WIDE_INT bytes_per_iter)
+{
+  rtx label, src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT leftover;
+
+  leftover = length % bytes_per_iter;
+  length -= leftover;
+
+  /* Create registers and memory references for use within the loop.  */
+  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+
+  /* Calculate the value that SRC_REG should have after the last iteration of
+     the loop.  */
+  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+                                   0, 0, OPTAB_WIDEN);
+
+  /* Emit the start of the loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+
+  /* Emit the loop body.  */
+  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+                                     interleave_factor);
+
+  /* Move on to the next block.  */
+  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+
+  /* Emit the loop condition.  */
+  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+
+  /* Mop up any left-over bytes.  */
+  if (leftover)
+    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+   aligned to a four-byte boundary).  This may need further tuning depending on
+   core type, optimize_size setting, etc.  */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+  HOST_WIDE_INT length = INTVAL (operands[2]);
+
+  if (optimize_size)
+    {
+      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+         size of code if optimizing for size.  We'll use ldm/stm if src_aligned
+         or dst_aligned though: allow more interleaving in those cases since the
+         resulting code can be smaller.  */
+      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+
+      if (length > 12)
+        arm_block_move_unaligned_loop (operands[0], operands[1], length,
+                                       interleave_factor, bytes_per_iter);
+      else
+        arm_block_move_unaligned_straight (operands[0], operands[1], length,
+                                           interleave_factor);
+    }
+  else
+    {
+      /* Note that the loop created by arm_block_move_unaligned_loop may be
+         subject to loop unrolling, which makes tuning this condition a little
+         redundant.  */
+      if (length > 32)
+        arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+      else
+        arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+    }
+
+  return 1;
+}
+
 int
 arm_gen_movmemqi (rtx *operands)
 {
@@ -10778,8 +11107,13 @@ arm_gen_movmemqi (rtx *operands)
 
   if (GET_CODE (operands[2]) != CONST_INT
       || GET_CODE (operands[3]) != CONST_INT
-      || INTVAL (operands[2]) > 64
-      || INTVAL (operands[3]) & 3)
+      || INTVAL (operands[2]) > 64)
+    return 0;
+
+  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+    return arm_movmemqi_unaligned (operands);
+
+  if (INTVAL (operands[3]) & 3)
     return 0;
 
   dstbase = operands[0];
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index db9417b9ddb..3216bfcb47a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2011-10-18  Julian Brown  <julian@codesourcery.com>
+
+	* lib/target-supports.exp (check_effective_target_arm_unaligned): New.
+	* gcc.target/arm/unaligned-memcpy-1.c: New.
+	* gcc.target/arm/unaligned-memcpy-2.c: New.
+	* gcc.target/arm/unaligned-memcpy-3.c: New.
+	* gcc.target/arm/unaligned-memcpy-4.c: New.
+
 2011-10-18  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/47023
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
new file mode 100644
index 00000000000..c4f56404225
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+void unknown_alignment (char *dest, char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We should see three unaligned word loads and store pairs, one unaligned
+   ldrh/strh pair, and an ldrb/strb pair.  Sanity check that.  */
+
+/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
new file mode 100644
index 00000000000..c7d24c9c5c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char dest[16];
+
+void aligned_dest (char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word store for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
new file mode 100644
index 00000000000..5f0413738c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+
+void aligned_src (char *dest)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word load for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c
new file mode 100644
index 00000000000..99957086e7e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+char dest[16];
+
+void aligned_both (void)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We know both src and dest to be aligned: expect multiword loads/stores.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index c4077ffabb6..f19c3c566c6 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1973,6 +1973,18 @@ proc check_effective_target_arm_dsp { } {
     }]
 }
 
+# Return 1 if this is an ARM target that supports unaligned word/halfword
+# load/store instructions.
+
+proc check_effective_target_arm_unaligned { } {
+    return [check_no_compiler_messages arm_unaligned assembly {
+        #ifndef __ARM_FEATURE_UNALIGNED
+        #error no unaligned support
+        #endif
+        int i;
+    }]
+}
+
 # Add the options needed for NEON.  We need either -mfloat-abi=softfp
 # or -mfloat-abi=hard, but if one is already specified by the
 # multilib, use it.  Similarly, if a -mfpu option already enables
-- 
2.11.0
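
Reviewer aside (not part of the commit): the size/alignment heuristics in
arm_movmemqi_unaligned and the word/halfword/byte tail split performed by
arm_block_move_unaligned_straight can be sanity-checked off-line with a small
host-side model.  The C program below is a minimal sketch, not GCC source:
all names (model_expansion, the UNITS_PER_WORD macro here) are invented for
the illustration, the constants are taken from the patch, and it counts
word-sized moves in total, ignoring how ldm/stm would group them.

#include <stdbool.h>
#include <stdio.h>

#define UNITS_PER_WORD 4	/* ARM word size in bytes.  */

/* Model the copy strategy the patch picks for a constant-length memcpy
   whose source/destination may be unaligned.  */
static void
model_expansion (long length, bool src_aligned, bool dst_aligned,
                 bool optimize_size)
{
  /* Strategy selection, mirroring arm_movmemqi_unaligned: at -Os use a
     modest interleave factor (ldm/stm only pay off when one side is
     aligned); at -O2 always interleave by 4.  */
  unsigned interleave = optimize_size
                        ? ((src_aligned || dst_aligned) ? 2 : 1) : 4;
  long bytes_per_iter = optimize_size
                        ? ((src_aligned || dst_aligned) ? 8 : 4) : 16;
  long threshold = optimize_size ? 12 : 32;
  long straight = length;	/* Bytes handled by straight-line code.  */

  if (length > threshold)
    {
      printf ("loop: %ld bytes, %ld per iteration, interleave %u\n",
              length - length % bytes_per_iter, bytes_per_iter, interleave);
      straight = length % bytes_per_iter;	/* Left-over, mopped up straight.  */
    }

  /* Tail split, as in arm_block_move_unaligned_straight: whole words
     first, then at most one halfword, then at most one byte.  */
  long words = straight / UNITS_PER_WORD;
  long rem = straight % UNITS_PER_WORD;
  printf ("straight: %ld word move(s), %d halfword, %d byte\n",
          words, rem >= 2 ? 1 : 0, (rem & 1) ? 1 : 0);
}

int
main (void)
{
  /* memcpy (dest, src, 15) with unknown alignment at -O2, as in
     unaligned-memcpy-1.c: predicts 3 word moves + 1 halfword + 1 byte.  */
  model_expansion (15, false, false, false);
  return 0;
}

For the 15-byte copy this prints "straight: 3 word move(s), 1 halfword,
1 byte", which is exactly what unaligned-memcpy-1.c asserts: three unaligned
ldr/str pairs plus an ldrh/strh pair give the eight "@ unaligned" accesses,
while the trailing ldrb/strb pair needs no unaligned annotation.  The same
model also shows why the -Os loop threshold is lower: with neither side
aligned the loop moves only 4 bytes per iteration, so it is worth entering
for anything longer than 12 bytes.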