[v2] Add big-endian support to ARM assembler memcpy

Submitted by Andre McCurdy on Jan. 21, 2020, 6:52 p.m.

Details

Message ID 20200121185215.5958-1-armccurdy@gmail.com
State New
Series "Add big-endian support to ARM assembler memcpy"

Commit Message

Andre McCurdy Jan. 21, 2020, 6:52 p.m.
Allow the existing ARM assembler memcpy implementation to be used for
both big and little endian targets.
---

Exactly the same changes as before but rebased to account for
whitespace changes in the preceding patch to add Thumb2 support.

 COPYRIGHT                                |   2 +-
 src/string/arm/{memcpy_le.S => memcpy.S} | 101 ++++++++++++++++++++++-
 src/string/arm/memcpy.c                  |   3 -
 3 files changed, 98 insertions(+), 8 deletions(-)
 rename src/string/arm/{memcpy_le.S => memcpy.S} (82%)
 delete mode 100644 src/string/arm/memcpy.c
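
The heart of the change shows up in the misaligned-copy loops below: when
source and destination have different word alignment, the code reads aligned
32-bit words and merges each one with the leftover bits of the previous load,
and the shift directions have to swap between little- and big-endian because
the first byte in memory order sits at opposite ends of the register. A
minimal C sketch of that idea (illustrative only, not the actual assembly;
merge(), prev, cur and off are made-up names):

/* Merge two aligned 32-bit loads when the source is offset by
 * `off` bytes (1-3) within a word. The asm keeps the carry
 * pre-shifted in r3; this sketch redoes both shifts per call. */
#include <stdint.h>

static uint32_t merge(uint32_t prev, uint32_t cur, unsigned off)
{
	unsigned lo = 8 * off;  /* bits consumed from prev */
	unsigned hi = 32 - lo;  /* bits taken from cur */
#if __ARMEB__
	/* big-endian: the first byte in memory is the most-significant byte */
	return (prev << lo) | (cur >> hi);
#else
	/* little-endian: the first byte in memory is the least-significant byte */
	return (prev >> lo) | (cur << hi);
#endif
}

The byte-at-a-time head and tail stores swap in the same way: on
little-endian the next byte in memory order is the register's low byte
(strb, then lsr #8), while on big-endian it is the high byte, which the
patch rotates down with ror #24 before each strb.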

Patch

diff --git a/COPYRIGHT b/COPYRIGHT
index e6472371..d3edc2a2 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -127,7 +127,7 @@  Copyright © 2017-2018 Arm Limited
 and labelled as such in comments in the individual source files. All
 have been licensed under extremely permissive terms.
 
-The ARM memcpy code (src/string/arm/memcpy_el.S) is Copyright © 2008
+The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
 The Android Open Source Project and is licensed under a two-clause BSD
 license. It was taken from Bionic libc, used on Android.
 
diff --git a/src/string/arm/memcpy_le.S b/src/string/arm/memcpy.S
similarity index 82%
rename from src/string/arm/memcpy_le.S
rename to src/string/arm/memcpy.S
index 7b35d305..869e3448 100644
--- a/src/string/arm/memcpy_le.S
+++ b/src/string/arm/memcpy.S
@@ -1,5 +1,3 @@ 
-#if !__ARMEB__
-
 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
@@ -42,7 +40,7 @@ 
  * code safely callable from thumb mode, adjusting the return
  * instructions to be compatible with pre-thumb ARM cpus, removal of
  * prefetch code that is not compatible with older cpus and support for
- * building as thumb 2.
+ * building as thumb 2 and big-endian.
  */
 
 .syntax unified
@@ -227,24 +225,45 @@  non_congruent:
 	 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
 	 */
 	movs    r5, r5, lsl #31
+
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
+#endif
 
 	cmp     r2, #4
 	blo     partial_word_tail
 
+#if __ARMEB__
+	mov	r3, r3, lsr r12
+	mov	r3, r3, lsl r12
+#endif
+
 	/* Align destination to 32 bytes (cache line boundary) */
 1:      tst     r0, #0x1c
 	beq     2f
 	ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3, r5,                 lsl r12
+#else
 	mov     r4, r5,                 lsl lr
 	orr     r4, r4, r3
 	mov     r3, r5,                 lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -270,6 +289,25 @@  loop16:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #16
+	mov     r4, r4, lsl #16
+	orr     r4, r4, r5, lsr #16
+	mov     r5, r5, lsl #16
+	orr     r5, r5, r6, lsr #16
+	mov     r6, r6, lsl #16
+	orr     r6, r6, r7, lsr #16
+	mov     r7, r7, lsl #16
+	orr     r7, r7, r8, lsr #16
+	mov     r8, r8, lsl #16
+	orr     r8, r8, r9, lsr #16
+	mov     r9, r9, lsl #16
+	orr     r9, r9, r10, lsr #16
+	mov     r10, r10,               lsl #16
+	orr     r10, r10, r11, lsr #16
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #16
+#else
 	orr     r3, r3, r4, lsl #16
 	mov     r4, r4, lsr #16
 	orr     r4, r4, r5, lsl #16
@@ -287,6 +325,7 @@  loop16:
 	orr     r10, r10, r11, lsl #16
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #16
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -296,6 +335,25 @@  loop8:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #24
+	mov     r4, r4, lsl #8
+	orr     r4, r4, r5, lsr #24
+	mov     r5, r5, lsl #8
+	orr     r5, r5, r6, lsr #24
+	mov     r6, r6,  lsl #8
+	orr     r6, r6, r7, lsr #24
+	mov     r7, r7,  lsl #8
+	orr     r7, r7, r8,             lsr #24
+	mov     r8, r8,  lsl #8
+	orr     r8, r8, r9,             lsr #24
+	mov     r9, r9,  lsl #8
+	orr     r9, r9, r10,    lsr #24
+	mov     r10, r10, lsl #8
+	orr     r10, r10, r11,  lsr #24
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #8
+#else
 	orr     r3, r3, r4, lsl #24
 	mov     r4, r4, lsr #8
 	orr     r4, r4, r5, lsl #24
@@ -313,6 +371,7 @@  loop8:
 	orr     r10, r10, r11,  lsl #24
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #8
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -322,6 +381,25 @@  loop24:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #8
+	mov     r4, r4, lsl #24
+	orr     r4, r4, r5, lsr #8
+	mov     r5, r5, lsl #24
+	orr     r5, r5, r6, lsr #8
+	mov     r6, r6, lsl #24
+	orr     r6, r6, r7, lsr #8
+	mov     r7, r7, lsl #24
+	orr     r7, r7, r8, lsr #8
+	mov     r8, r8, lsl #24
+	orr     r8, r8, r9, lsr #8
+	mov     r9, r9, lsl #24
+	orr     r9, r9, r10, lsr #8
+	mov     r10, r10, lsl #24
+	orr     r10, r10, r11, lsr #8
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #24
+#else
 	orr     r3, r3, r4, lsl #8
 	mov     r4, r4, lsr #24
 	orr     r4, r4, r5, lsl #8
@@ -339,6 +417,7 @@  loop24:
 	orr     r10, r10, r11, lsl #8
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #24
+#endif
 	bhs     1b
 
 less_than_thirtytwo:
@@ -350,9 +429,15 @@  less_than_thirtytwo:
 
 1:      ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3,     r5,                     lsl r12
+#else
 	mov     r4, r5,                 lsl lr
 	orr     r4, r4, r3
 	mov     r3,     r5,                     lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -360,11 +445,20 @@  less_than_thirtytwo:
 partial_word_tail:
 	/* we have a partial word in the input buffer */
 	movs    r5, lr, lsl #(31-3)
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
+#endif
 
 	/* Refill spilled registers from the stack. Don't update sp. */
 	ldmfd   sp, {r5-r11}
@@ -383,4 +477,3 @@  copy_last_3_and_return:
 	ldmfd   sp!, {r0, r4, lr}
 	bx      lr
 
-#endif
diff --git a/src/string/arm/memcpy.c b/src/string/arm/memcpy.c
deleted file mode 100644
index 041614f4..00000000
--- a/src/string/arm/memcpy.c
+++ /dev/null
@@ -1,3 +0,0 @@ 
-#if __ARMEB__
-#include "../memcpy.c"
-#endif
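
Deleting src/string/arm/memcpy.c closes out the old dispatch: big-endian
builds previously fell back to the generic C memcpy via that three-line
wrapper, while memcpy_le.S compiled to nothing under __ARMEB__. With this
patch the assembler version covers both byte orders. For a quick sanity
check of which paths a given toolchain will take, something like the
following works (the file name is illustrative; __ARMEB__ is predefined by
GCC and Clang when targeting big-endian ARM, e.g. with -mbig-endian):

/* check-endian.c: report which memcpy.S branches this build selects. */
#include <stdio.h>

int main(void)
{
#if __ARMEB__
	puts("big-endian: the #if __ARMEB__ paths in memcpy.S apply");
#else
	puts("little-endian: the #else paths in memcpy.S apply");
#endif
	return 0;
}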

Comments

Rich Felker Jan. 22, 2020, 12:36 a.m.
On Tue, Jan 21, 2020 at 10:52:15AM -0800, Andre McCurdy wrote:
> Allow the existing ARM assembler memcpy implementation to be used for
> both big and little endian targets.
> ---
> 
> Exactly the same changes as before but rebased to account for
> whitespace changes in the preceding patch to add Thumb2 support.

Thanks. I'm not sure if I'll ever apply this, since I think there's a
better memcpy we may be adopting, but it's nice to have in case we
want to.

Rich
Andre McCurdy Jan. 22, 2020, 12:47 a.m.
On Tue, Jan 21, 2020 at 4:36 PM Rich Felker <dalias@libc.org> wrote:
> On Tue, Jan 21, 2020 at 10:52:15AM -0800, Andre McCurdy wrote:
> > Allow the existing ARM assembler memcpy implementation to be used for
> > both big and little endian targets.
> > ---
> >
> > Exactly the same changes as before but rebased to account for
> > whitespace changes in the preceding patch to add Thumb2 support.
>
> Thanks. I'm not sure if I'll ever apply this, since I think there's a
> better memcpy we may be adopting, but it's nice to have in case we
> want to.

OK. Slightly disappointed to hear that. I've already been carrying
this patch for many months in my own builds, so hopefully the new
memcpy() which will allow it to be dropped is imminent!
Rich Felker Jan. 22, 2020, 1:31 a.m.
On Tue, Jan 21, 2020 at 04:47:42PM -0800, Andre McCurdy wrote:
> On Tue, Jan 21, 2020 at 4:36 PM Rich Felker <dalias@libc.org> wrote:
> > On Tue, Jan 21, 2020 at 10:52:15AM -0800, Andre McCurdy wrote:
> > > Allow the existing ARM assembler memcpy implementation to be used for
> > > both big and little endian targets.
> > > ---
> > >
> > > Exactly the same changes as before but rebased to account for
> > > whitespace changes in the preceding patch to add Thumb2 support.
> >
> > Thanks. I'm not sure if I'll ever apply this, since I think there's a
> > better memcpy we may be adopting, but it's nice to have in case we
> > want to.
> 
> OK. Slightly disappointed to hear that. I've already been carrying
> this patch for many months in my own builds, so hopefully the new
> memcpy() which will allow it to be dropped is imminent!

If it's not immediately clear that we should go with the new one,
I'll go ahead and merge this after release. I wasn't aware that anyone
was using or cared about BE, but since you do, I don't want to hold it
back just because it might no longer be relevant in the future.

Rich