{
/* We're only using DImode here because it's a convenient size. */
ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
- ops[1] = adjust_address (mem, SImode, 8 * i);
+ ops[1] = adjust_address (mem, DImode, 8 * i);
if (reg_overlap_mentioned_p (ops[0], mem))
{
gcc_assert (overlap == -1);
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2 -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon" } */
+
+#include <arm_neon.h>
+#include <stddef.h>
+
+void *
+memset (DST, C, LENGTH)
+ void *DST;
+ int C;
+ size_t LENGTH;
+{
+ void* DST0 = DST;
+ unsigned char C_BYTE = C;
+
+
+ if (__builtin_expect(LENGTH < 4, 1)) {
+ size_t i = 0;
+ while (i < LENGTH) {
+ ((char*)DST)[i] = C_BYTE;
+ i++;
+ }
+ return DST;
+ }
+
+ const char* DST_end = (char*)DST + LENGTH;
+
+
+ while ((uintptr_t)DST % 4 != 0) {
+ *(char*) (DST++) = C_BYTE;
+ }
+
+
+ uint32_t C_SHORTWORD = (uint32_t)(unsigned char)(C_BYTE) * 0x01010101;
+
+
+ if (__builtin_expect(DST_end - (char*)DST >= 16, 0)) {
+ while ((uintptr_t)DST % 16 != 0) {
+ *((uint32_t*)((char*)(DST) + (0))) = C_SHORTWORD;
+ DST += 4;
+ }
+
+
+ uint8x16_t C_WORD = vdupq_n_u8(C_BYTE);
+
+
+
+
+
+ size_t i = 0;
+ LENGTH = DST_end - (char*)DST;
+ while (i + 16 * 16 <= LENGTH) {
+ *((uint8x16_t*)((char*)(DST) + (i))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 1))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 2))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 3))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 4))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 5))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 6))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 7))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 8))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 9))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 10))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 11))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 12))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 13))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 14))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 15))) = C_WORD;
+ i += 16 * 16;
+ }
+ while (i + 16 * 4 <= LENGTH) {
+ *((uint8x16_t*)((char*)(DST) + (i))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 1))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 2))) = C_WORD;
+ *((uint8x16_t*)((char*)(DST) + (i + 16 * 3))) = C_WORD;
+ i += 16 * 4;
+ }
+ while (i + 16 <= LENGTH) {
+ *((uint8x16_t*)((char*)(DST) + (i))) = C_WORD;
+ i += 16;
+ }
+ DST += i;
+ }
+
+ while (4 <= DST_end - (char*)DST) {
+ *((uint32_t*)((char*)(DST) + (0))) = C_SHORTWORD;
+ DST += 4;
+ }
+
+
+ while ((char*)DST < DST_end) {
+ *((char*)DST) = C_BYTE;
+ DST++;
+ }
+
+ return DST0;
+}