[2/2] Add big-endian support to ARM assembler memcpy

Submitted by Andre McCurdy on Sept. 13, 2019, 6:44 p.m.

Details

Message ID 20190913184432.29753-2-armccurdy@gmail.com
State New
Series "Series without cover letter"

Commit Message

Andre McCurdy Sept. 13, 2019, 6:44 p.m.
Allow the existing ARM assembler memcpy implementation to be used for
both big and little endian targets.
---
 COPYRIGHT                                |   2 +-
 src/string/arm/{memcpy_le.S => memcpy.S} | 101 ++++++++++++++++++++++-
 src/string/arm/memcpy.c                  |   3 -
 3 files changed, 98 insertions(+), 8 deletions(-)
 rename src/string/arm/{memcpy_le.S => memcpy.S} (83%)
 delete mode 100644 src/string/arm/memcpy.c


diff --git a/COPYRIGHT b/COPYRIGHT
index 2525ffb5..96c2b070 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -126,7 +126,7 @@  Copyright © 2008 Stephen L. Moshier
 and labelled as such in comments in the individual source files. All
 have been licensed under extremely permissive terms.
 
-The ARM memcpy code (src/string/arm/memcpy_el.S) is Copyright © 2008
+The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
 The Android Open Source Project and is licensed under a two-clause BSD
 license. It was taken from Bionic libc, used on Android.
 
diff --git a/src/string/arm/memcpy_le.S b/src/string/arm/memcpy.S
similarity index 83%
rename from src/string/arm/memcpy_le.S
rename to src/string/arm/memcpy.S
index 64bc5f9e..766a88a5 100644
--- a/src/string/arm/memcpy_le.S
+++ b/src/string/arm/memcpy.S
@@ -1,5 +1,3 @@ 
-#if !__ARMEB__
-
 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
@@ -42,7 +40,7 @@ 
  * code safely callable from thumb mode, adjusting the return
  * instructions to be compatible with pre-thumb ARM cpus, removal of
  * prefetch code that is not compatible with older cpus and support for
- * building as thumb 2.
+ * building as thumb 2 and big-endian.
  */
 
 .syntax unified
@@ -227,24 +225,45 @@  non_congruent:
 	 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
 	 */
 	movs    r5, r5, lsl #31
+
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
+#endif
 
 	cmp     r2, #4
 	blo     partial_word_tail
 
+#if __ARMEB__
+	mov	r3, r3, lsr r12
+	mov	r3, r3, lsl r12
+#endif
+
 	/* Align destination to 32 bytes (cache line boundary) */
 1:      tst     r0, #0x1c
 	beq     2f
 	ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5, lsr lr
+	orr     r4, r4, r3
+	mov     r3, r5, lsl r12
+#else
 	mov     r4, r5, lsl lr
 	orr     r4, r4, r3
 	mov     r3, r5, lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -270,6 +289,25 @@  loop16:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #16
+	mov     r4, r4, lsl #16
+	orr     r4, r4, r5, lsr #16
+	mov     r5, r5, lsl #16
+	orr     r5, r5, r6, lsr #16
+	mov     r6, r6, lsl #16
+	orr     r6, r6, r7, lsr #16
+	mov     r7, r7, lsl #16
+	orr     r7, r7, r8, lsr #16
+	mov     r8, r8, lsl #16
+	orr     r8, r8, r9, lsr #16
+	mov     r9, r9, lsl #16
+	orr     r9, r9, r10, lsr #16
+	mov     r10, r10, lsl #16
+	orr     r10, r10, r11, lsr #16
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #16
+#else
 	orr     r3, r3, r4, lsl #16
 	mov     r4, r4, lsr #16
 	orr     r4, r4, r5, lsl #16
@@ -287,6 +325,7 @@  loop16:
 	orr     r10, r10, r11, lsl #16
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #16
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -296,6 +335,25 @@  loop8:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #24
+	mov     r4, r4, lsl #8
+	orr     r4, r4, r5, lsr #24
+	mov     r5, r5, lsl #8
+	orr     r5, r5, r6, lsr #24
+	mov     r6, r6, lsl #8
+	orr     r6, r6, r7, lsr #24
+	mov     r7, r7, lsl #8
+	orr     r7, r7, r8, lsr #24
+	mov     r8, r8, lsl #8
+	orr     r8, r8, r9, lsr #24
+	mov     r9, r9, lsl #8
+	orr     r9, r9, r10, lsr #24
+	mov     r10, r10, lsl #8
+	orr     r10, r10, r11, lsr #24
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #8
+#else
 	orr     r3, r3, r4, lsl #24
 	mov     r4, r4, lsr #8
 	orr     r4, r4, r5, lsl #24
@@ -313,6 +371,7 @@  loop8:
 	orr     r10, r10, r11,  lsl #24
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #8
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -322,6 +381,25 @@  loop24:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #8
+	mov     r4, r4, lsl #24
+	orr     r4, r4, r5, lsr #8
+	mov     r5, r5, lsl #24
+	orr     r5, r5, r6, lsr #8
+	mov     r6, r6, lsl #24
+	orr     r6, r6, r7, lsr #8
+	mov     r7, r7, lsl #24
+	orr     r7, r7, r8, lsr #8
+	mov     r8, r8, lsl #24
+	orr     r8, r8, r9, lsr #8
+	mov     r9, r9, lsl #24
+	orr     r9, r9, r10, lsr #8
+	mov     r10, r10, lsl #24
+	orr     r10, r10, r11, lsr #8
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #24
+#else
 	orr     r3, r3, r4, lsl #8
 	mov     r4, r4, lsr #24
 	orr     r4, r4, r5, lsl #8
@@ -339,6 +417,7 @@  loop24:
 	orr     r10, r10, r11, lsl #8
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #24
+#endif
 	bhs     1b
 
 less_than_thirtytwo:
@@ -350,9 +429,15 @@  less_than_thirtytwo:
 
 1:      ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5, lsr lr
+	orr     r4, r4, r3
+	mov     r3, r5, lsl r12
+#else
 	mov     r4, r5, lsl lr
 	orr     r4, r4, r3
 	mov     r3, r5, lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -360,11 +445,20 @@  less_than_thirtytwo:
 partial_word_tail:
 	/* we have a partial word in the input buffer */
 	movs    r5, lr, lsl #(31-3)
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
+#endif
 
 	/* Refill spilled registers from the stack. Don't update sp. */
 	ldmfd   sp, {r5-r11}
@@ -383,4 +477,3 @@  copy_last_3_and_return:
 	ldmfd   sp!, {r0, r4, lr}
 	bx      lr
 
-#endif
diff --git a/src/string/arm/memcpy.c b/src/string/arm/memcpy.c
deleted file mode 100644
index 041614f4..00000000
--- a/src/string/arm/memcpy.c
+++ /dev/null
@@ -1,3 +0,0 @@ 
-#if __ARMEB__
-#include "../memcpy.c"
-#endif

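The heart of the change is the shift-and-merge scheme used on the
non_congruent path: bytes left over from the previous aligned load are
carried in r3 and combined with the next word using a complementary
pair of shift amounts held in lr and r12. Big-endian reverses the byte
order within a word, so each lsl/lsr pair swaps direction, and the
byte-store tails rotate the word with ror #24 instead of shifting it
down with lsr #8. A rough C model of the byte-order logic (a sketch
for orientation only, not musl code; the function name is made up, and
the asm's trick of keeping the carry pre-shifted in r3 is simplified
away):

#include <stddef.h>
#include <stdint.h>

/* Sketch of the shift-and-merge idea, not musl source: dst is word
 * aligned, src is misaligned by 'off' bytes (1..3, never 0, matching
 * the non_congruent path) and has already been rounded down to the
 * previous word boundary. */
static void merge_copy(uint32_t *dst, const uint32_t *src, size_t nwords,
                       unsigned off)
{
	unsigned rsh = 8 * off;   /* bits already consumed from w0 */
	unsigned lsh = 32 - rsh;  /* bits still needed from w1 */
	uint32_t w0 = *src++;

	for (size_t i = 0; i < nwords; i++) {
		uint32_t w1 = *src++;
#if __ARMEB__
		/* big-endian: earlier bytes live in the high bits */
		dst[i] = (w0 << rsh) | (w1 >> lsh);
#else
		/* little-endian: earlier bytes live in the low bits */
		dst[i] = (w0 >> rsh) | (w1 << lsh);
#endif
		w0 = w1;
	}
}

In the asm the carry is kept pre-shifted so the inner loop needs only
a handful of data-processing instructions per word, but the endian
difference is exactly this swap of shift directions.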
Comments

Rich Felker Sept. 13, 2019, 6:59 p.m.
On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> Allow the existing ARM assembler memcpy implementation to be used for
> both big and little endian targets.

Nice. I don't want to merge this just before release, but as long as
it looks ok I should be able to review and merge it afterward.

Note that I'd really like to replace this giant file with C using
inline asm just for the inner block copies and C for all the flow
control, but I don't mind merging this first as long as it's correct.

Rich
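The structure Rich describes would keep all flow control in C and drop
to asm only for the aligned block moves. Something along these lines
(purely a hypothetical sketch of that idea, not a proposed patch; it
omits the misaligned shift-and-merge path entirely):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch, not a proposed implementation: C handles the
 * head, tail and all flow control; inline asm is used only for the
 * aligned 16-byte inner block copy. */
void *memcpy_sketch(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* byte copy until dest is word aligned */
	while (n && ((uintptr_t)d & 3)) {
		*d++ = *s++;
		n--;
	}

	/* fast path only when src is now word aligned too */
	if (!((uintptr_t)s & 3)) {
		while (n >= 16) {
#ifdef __arm__
			__asm__ __volatile__(
				"ldmia %1!, {r4,r5,r6,r7}\n\t"
				"stmia %0!, {r4,r5,r6,r7}"
				: "+r"(d), "+r"(s)
				:
				: "r4", "r5", "r6", "r7", "memory");
#else
			/* portable stand-in so the sketch still compiles */
			for (int i = 0; i < 16; i++) d[i] = s[i];
			d += 16; s += 16;
#endif
			n -= 16;
		}
	}

	while (n--) *d++ = *s++;
	return dest;
}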
Andre McCurdy Sept. 13, 2019, 8:38 p.m.
On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:
>
> On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> > Allow the existing ARM assembler memcpy implementation to be used for
> > both big and little endian targets.
>
> Nice. I don't want to merge this just before release, but as long as
> it looks ok I should be able to review and merge it afterward.
>
> Note that I'd really like to replace this giant file with C using
> inline asm just for the inner block copies and C for all the flow
> control, but I don't mind merging this first as long as it's correct.

Sounds good. I'll wait for your feedback after the upcoming release.
Rich Felker Jan. 15, 2020, 3:45 p.m.
On Fri, Sep 13, 2019 at 01:38:34PM -0700, Andre McCurdy wrote:
> On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:
> >
> > On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> > > Allow the existing ARM assembler memcpy implementation to be used for
> > > both big and little endian targets.
> >
> > Nice. I don't want to merge this just before release, but as long as
> > it looks ok I should be able to review and merge it afterward.
> >
> > Note that I'd really like to replace this giant file with C using
> > inline asm just for the inner block copies and C for all the flow
> > control, but I don't mind merging this first as long as it's correct.
> 
> Sounds good. I'll wait for your feedback after the upcoming release.

Sorry this dropped off my radar. I'd like to merge at least the thumb
part since it's simple enough to review quickly and users have
actually complained about memcpy being slow on armv7 with -mthumb as
default.

Rich
Andre McCurdy Jan. 15, 2020, 6:41 p.m.
On Wed, Jan 15, 2020 at 7:46 AM Rich Felker <dalias@libc.org> wrote:
> On Fri, Sep 13, 2019 at 01:38:34PM -0700, Andre McCurdy wrote:
> > On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:
> > > On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> > > > Allow the existing ARM assembler memcpy implementation to be used for
> > > > both big and little endian targets.
> > >
> > > Nice. I don't want to merge this just before release, but as long as
> > > it looks ok I should be able to review and merge it afterward.
> > >
> > > Note that I'd really like to replace this giant file with C using
> > > inline asm just for the inner block copies and C for all the flow
> > > control, but I don't mind merging this first as long as it's correct.
> >
> > Sounds good. I'll wait for your feedback after the upcoming release.
>
> Sorry this dropped off my radar. I'd like to merge at least the thumb
> part since it's simple enough to review quickly and users have
> actually complained about memcpy being slow on armv7 with -mthumb as
> default.

Interesting. I wonder what the reference was against which the musl C
code was compared? From my own benchmarking I didn't find the musl
assembler to be much faster than the C code. There are armv6 and maybe
early armv7 CPUs where explicit prefetch instructions make a huge
difference (much more so than C -vs- assembler). Did the users who
complained about musl memcpy() compare against a memcpy() which uses
prefetch? For armv7 using NEON may help, although the latest armv7
cores seem to perform very well with plain old C code too. There are
lots of trade-offs, so it's impossible for a single implementation to
be universally optimal. The "arm-mem" routines used on Raspberry Pi
seem to be very fast for many targets, but unfortunately the armv6
memcpy generates misaligned accesses and so isn't suitable for armv5.

  https://github.com/bavison/arm-mem/
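For reference, the explicit prefetch being discussed is the ARM PLD
hint, which GCC exposes as __builtin_prefetch. A minimal sketch of
what that looks like in a copy loop (illustrative only, not taken from
any of the implementations mentioned):

#include <stddef.h>
#include <stdint.h>

/* Illustration only: copy words while hinting upcoming cache lines
 * into L1 ahead of the loads. On cores that honor PLD (armv5te and
 * later) this can hide much of the memory latency; on cores with good
 * hardware prefetch it makes little difference. */
static void copy_words_pld(uint32_t *d, const uint32_t *s, size_t nwords)
{
	for (size_t i = 0; i < nwords; i++) {
		if ((i & 7) == 0)
			/* 64 bytes ahead, for reading, low temporal locality */
			__builtin_prefetch(s + i + 16, 0, 0);
		d[i] = s[i];
	}
}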
Rich Felker Jan. 15, 2020, 7:22 p.m.
On Wed, Jan 15, 2020 at 10:41:08AM -0800, Andre McCurdy wrote:
> On Wed, Jan 15, 2020 at 7:46 AM Rich Felker <dalias@libc.org> wrote:
> > On Fri, Sep 13, 2019 at 01:38:34PM -0700, Andre McCurdy wrote:
> > > On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:
> > > > On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> > > > > Allow the existing ARM assembler memcpy implementation to be used for
> > > > > both big and little endian targets.
> > > >
> > > > Nice. I don't want to merge this just before release, but as long as
> > > > it looks ok I should be able to review and merge it afterward.
> > > >
> > > > Note that I'd really like to replace this giant file with C using
> > > > inline asm just for the inner block copies and C for all the flow
> > > > control, but I don't mind merging this first as long as it's correct.
> > >
> > > Sounds good. I'll wait for your feedback after the upcoming release.
> >
> > Sorry this dropped off my radar. I'd like to merge at least the thumb
> > part since it's simple enough to review quickly and users have
> > actually complained about memcpy being slow on armv7 with -mthumb as
> > default.
> 
> Interesting. I wonder what the reference was against which the musl C
> code was compared? From my own benchmarking I didn't find the musl
> assembler to be much faster than the C code. There are armv6 and maybe
> early armv7 CPUs where explicit prefetch instructions make a huge
> difference (much more so than C -vs- assembler). Did the users who
> complained about musl memcpy() compare against a memcpy() which uses
> prefetch? For armv7 using NEON may help, although the latest armv7
> cores seem to perform very well with plain old C code too. There are
> lots of trade-offs, so it's impossible for a single implementation to
> be universally optimal. The "arm-mem" routines used on Raspberry Pi
> seem to be very fast for many targets, but unfortunately the armv6
> memcpy generates misaligned accesses and so isn't suitable for armv5.
> 
>   https://github.com/bavison/arm-mem/

I'm not sure of the details but the comparison was just between the
armv6 version of Alpine and the armv7 version (so using musl's
memcpy_le.S vs memcpy.c).

Rich
Andre McCurdy Jan. 15, 2020, 8:54 p.m.
On Wed, Jan 15, 2020 at 11:22 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Jan 15, 2020 at 10:41:08AM -0800, Andre McCurdy wrote:
> > On Wed, Jan 15, 2020 at 7:46 AM Rich Felker <dalias@libc.org> wrote:
> > > On Fri, Sep 13, 2019 at 01:38:34PM -0700, Andre McCurdy wrote:
> > > > On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:
> > > > > On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:
> > > > > > Allow the existing ARM assembler memcpy implementation to be used for
> > > > > > both big and little endian targets.
> > > > >
> > > > > Nice. I don't want to merge this just before release, but as long as
> > > > > it looks ok I should be able to review and merge it afterward.
> > > > >
> > > > > Note that I'd really like to replace this giant file with C using
> > > > > inline asm just for the inner block copies and C for all the flow
> > > > > control, but I don't mind merging this first as long as it's correct.
> > > >
> > > > Sounds good. I'll wait for your feedback after the upcoming release.
> > >
> > > Sorry this dropped off my radar. I'd like to merge at least the thumb
> > > part since it's simple enough to review quickly and users have
> > > actually complained about memcpy being slow on armv7 with -mthumb as
> > > default.
> >
> > Interesting. I wonder what the reference was against which the musl C
> > code was compared? From my own benchmarking I didn't find the musl
> > assembler to be much faster than the C code. There are armv6 and maybe
> > early armv7 CPUs where explicit prefetch instructions make a huge
> > difference (much more so than C -vs- assembler). Did the users who
> > complained about musl memcpy() compare against a memcpy() which uses
> > prefetch? For armv7 using NEON may help, although the latest armv7
> > cores seem to perform very well with plain old C code too. There are
> > lots of trade-offs, so it's impossible for a single implementation to
> > be universally optimal. The "arm-mem" routines used on Raspberry Pi
> > seem to be very fast for many targets, but unfortunately the armv6
> > memcpy generates misaligned accesses and so isn't suitable for armv5.
> >
> >   https://github.com/bavison/arm-mem/
>
> I'm not sure of the details but the comparison was just between the
> armv6 version of Alpine and the armv7 version (so using musl's
> memcpy_le.S vs memcpy.c).

OK. A comparison with the arm-mem version would be interesting too.
The arm-mem library is designed to be preloaded (ahead of glibc on
Raspberry Pi distros), so it should be possible to preload it ahead of
musl on Alpine, which makes testing slightly easier.
Natanael Copa Jan. 16, 2020, 3:21 p.m.
On Wed, 15 Jan 2020 10:41:08 -0800
Andre McCurdy <armccurdy@gmail.com> wrote:

> On Wed, Jan 15, 2020 at 7:46 AM Rich Felker <dalias@libc.org> wrote:
> > On Fri, Sep 13, 2019 at 01:38:34PM -0700, Andre McCurdy wrote:  
> > > On Fri, Sep 13, 2019 at 11:59 AM Rich Felker <dalias@libc.org> wrote:  
> > > > On Fri, Sep 13, 2019 at 11:44:32AM -0700, Andre McCurdy wrote:  
> > > > > Allow the existing ARM assembler memcpy implementation to be used for
> > > > > both big and little endian targets.  
> > > >
> > > > Nice. I don't want to merge this just before release, but as long as
> > > > it looks ok I should be able to review and merge it afterward.
> > > >
> > > > Note that I'd really like to replace this giant file with C using
> > > > inline asm just for the inner block copies and C for all the flow
> > > > control, but I don't mind merging this first as long as it's correct.  
> > >
> > > Sounds good. I'll wait for your feedback after the upcoming release.  
> >
> > Sorry this dropped off my radar. I'd like to merge at least the thumb
> > part since it's simple enough to review quickly and users have
> > actually complained about memcpy being slow on armv7 with -mthumb as
> > default.  
> 
> Interesting. I wonder what the reference was against which the musl C
> code was compared? From my own benchmarking I didn't find the musl
> assembler to be much faster than the C code. There are armv6 and maybe
> early armv7 CPUs where explicit prefetch instructions make a huge
> difference (much more so than C -vs- assembler). Did the users who
> complained about musl memcpy() compare against a memcpy() which uses
> prefetch? For armv7 using NEON may help, although the latest armv7
> cores seem to perform very well with plain old C code too. There are
> lots of trade-offs, so it's impossible for a single implementation to
> be universally optimal. The "arm-mem" routines used on Raspberry Pi
> seem to be very fast for many targets, but unfortunately the armv6
> memcpy generates misaligned accesses and so isn't suitable for armv5.
> 
>   https://github.com/bavison/arm-mem/

The Alpine user reported it here:
https://gitlab.alpinelinux.org/alpine/aports/issues/11128

I don't know whether the user ended up with __builtin_memcpy or the
libc version. I do know that the qemu developers were once surprised
that `memcpy` resolved to libc's non-atomic version instead of gcc's
atomic __builtin_memcpy. That happened because Alpine uses
fortify-headers as its FORTIFY_SOURCE implementation.

Not sure if something similar happened here.

-nc
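
For background, a FORTIFY_SOURCE implementation like fortify-headers
works by shadowing the string.h functions with inline checking
wrappers, which can end up routing calls to the compiler builtin
rather than to libc's symbol. A simplified illustration of the pattern
(not the actual fortify-headers source; the name is made up):

#include <string.h>

/* Simplified illustration of the FORTIFY_SOURCE wrapper pattern, not
 * the actual fortify-headers code. The wrapper checks the destination
 * size the compiler can prove, then forwards to the builtin, which may
 * be expanded inline so the call never reaches libc's memcpy at all. */
static inline __attribute__((always_inline))
void *memcpy_checked(void *restrict d, const void *restrict s, size_t n)
{
	size_t bos = __builtin_object_size(d, 0);
	if (bos != (size_t)-1 && n > bos)
		__builtin_trap();             /* overflow detected at run time */
	return __builtin_memcpy(d, s, n); /* may inline, bypassing libc */
}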