From 26cc89d99cc9d783859eb9d38e067fad5d6bbb60 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Fri, 18 Aug 2017 01:25:25 +0300 Subject: [PATCH] arc: Merge ARCv2 string routines in generic ARC .S files In cde74b83f9b2 "ARC: remove special CFLAGS/LDFLAGS handling" we got rid of CONFIG_ARC_CPU_HS, which was used to select the ARCv2-specific implementations of the optimized string routines. So now the ARCv2-tuned memset/memcpy/strcmp are not used; instead, those for ARC700 are used for both ARC700 and ARCHS. Without a uClibc config option we can only tell which CPU type we're targeting from GCC's built-in defines. I.e. there is no more conditional file inclusion in Makefiles. That leaves us with only one option: merge both implementations into one file and use ifdefs. Signed-off-by: Alexey Brodkin --- extra/Configs/Config.in | 1 - libc/string/arc/arcv2/strcmp.S | 83 ------------------------------------ libc/string/arc/{arcv2 => }/memcpy.S | 75 ++++++++++++++++++++++++++++++-- libc/string/arc/{arcv2 => }/memset.S | 55 ++++++++++++++++++++++-- libc/string/arc/strcmp.S | 81 ++++++++++++++++++++++++++++++++++- 5 files changed, 202 insertions(+), 93 deletions(-) delete mode 100644 libc/string/arc/arcv2/strcmp.S rename libc/string/arc/{arcv2 => }/memcpy.S (76%) rename libc/string/arc/{arcv2 => }/memset.S (69%) diff --git a/extra/Configs/Config.in b/extra/Configs/Config.in index 59ef31c47..ce832b55b 100644 --- a/extra/Configs/Config.in +++ b/extra/Configs/Config.in @@ -250,7 +250,6 @@ config TARGET_SUBARCH default "i486" if CONFIG_486 default "i586" if CONFIG_586 default "i686" if CONFIG_686 - default "arcv2" if CONFIG_ARC_CPU_HS default "" source "extra/Configs/Config.in.arch" diff --git a/libc/string/arc/arcv2/strcmp.S b/libc/string/arc/arcv2/strcmp.S deleted file mode 100644 index 2e0e64a0c..000000000 --- a/libc/string/arc/arcv2/strcmp.S +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2014-15 Synopsys, Inc.
(www.synopsys.com) - * - * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. - */ - -#include -#include - -ENTRY(strcmp) - or r2, r0, r1 - bmsk_s r2, r2, 1 - brne r2, 0, @.Lcharloop - -;;; s1 and s2 are word aligned - ld.ab r2, [r0, 4] - - mov_s r12, 0x01010101 - ror r11, r12 - .align 4 -.LwordLoop: - ld.ab r3, [r1, 4] - ;; Detect NULL char in str1 - sub r4, r2, r12 - ld.ab r5, [r0, 4] - bic r4, r4, r2 - and r4, r4, r11 - brne.d.nt r4, 0, .LfoundNULL - ;; Check if the read locations are the same - cmp r2, r3 - beq.d .LwordLoop - mov.eq r2, r5 - - ;; A match is found, spot it out -#ifdef __LITTLE_ENDIAN__ - swape r3, r3 - mov_s r0, 1 - swape r2, r2 -#else - mov_s r0, 1 -#endif - cmp_s r2, r3 - j_s.d [blink] - bset.lo r0, r0, 31 - - .align 4 -.LfoundNULL: -#ifdef __BIG_ENDIAN__ - swape r4, r4 - swape r2, r2 - swape r3, r3 -#endif - ;; Find null byte - ffs r0, r4 - bmsk r2, r2, r0 - bmsk r3, r3, r0 - swape r2, r2 - swape r3, r3 - ;; make the return value - sub.f r0, r2, r3 - mov.hi r0, 1 - j_s.d [blink] - bset.lo r0, r0, 31 - - .align 4 -.Lcharloop: - ldb.ab r2, [r0, 1] - ldb.ab r3, [r1, 1] - nop - breq r2, 0, .Lcmpend - breq r2, r3, .Lcharloop - - .align 4 -.Lcmpend: - j_s.d [blink] - sub r0, r2, r3 -END(strcmp) -libc_hidden_def(strcmp) - -#ifndef __UCLIBC_HAS_LOCALE__ -strong_alias(strcmp,strcoll) -libc_hidden_def(strcoll) -#endif diff --git a/libc/string/arc/arcv2/memcpy.S b/libc/string/arc/memcpy.S similarity index 76% rename from libc/string/arc/arcv2/memcpy.S rename to libc/string/arc/memcpy.S index ba29e8790..69d7220b8 100644 --- a/libc/string/arc/arcv2/memcpy.S +++ b/libc/string/arc/memcpy.S @@ -1,12 +1,81 @@ /* - * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
*/ -#include #include +#if !defined(__ARC700__) && !defined(__ARCHS__) +#error "Neither ARC700 nor ARCHS is defined!" +#endif + +ENTRY(memcpy) + +#ifdef __ARC700__ +/* This memcpy implementation does not support objects of 1GB or larger - + the check for alignment does not work then. */ +/* We assume that most sources and destinations are aligned, and + that also lengths are mostly a multiple of four, although to a lesser + extent. */ + or r3,r0,r1 + asl_s r3,r3,30 + mov_s r5,r0 + brls.d r2,r3,.Lcopy_bytewise + sub.f r3,r2,1 + ld_s r12,[r1,0] + asr.f lp_count,r3,3 + bbit0.d r3,2,.Lnox4 + bmsk_s r2,r2,1 + st.ab r12,[r5,4] + ld.a r12,[r1,4] +.Lnox4: + lppnz .Lendloop + ld_s r3,[r1,4] + st.ab r12,[r5,4] + ld.a r12,[r1,8] + st.ab r3,[r5,4] +.Lendloop: + breq r2,0,.Last_store + ld r3,[r5,0] +#ifdef __LITTLE_ENDIAN__ + add3 r2,-1,r2 + ; uses long immediate + xor_s r12,r12,r3 + bmsk r12,r12,r2 + xor_s r12,r12,r3 +#else /* BIG ENDIAN */ + sub3 r2,31,r2 + ; uses long immediate + xor_s r3,r3,r12 + bmsk r3,r3,r2 + xor_s r12,r12,r3 +#endif /* ENDIAN */ +.Last_store: + j_s.d [blink] + st r12,[r5,0] + + .balign 4 +.Lcopy_bytewise: + jcs [blink] + ldb_s r12,[r1,0] + lsr.f lp_count,r3 + bhs_s .Lnox1 + stb.ab r12,[r5,1] + ldb.a r12,[r1,1] +.Lnox1: + lppnz .Lendbloop + ldb_s r3,[r1,1] + stb.ab r12,[r5,1] + ldb.a r12,[r1,2] + stb.ab r3,[r5,1] +.Lendbloop: + j_s.d [blink] + stb r12,[r5,0] +#endif /* __ARC700__ */ + +#ifdef __ARCHS__ #ifdef __LITTLE_ENDIAN__ # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> @@ -39,7 +108,6 @@ # define ZOLAND 0xF #endif -ENTRY(memcpy) prefetch [r1] ; Prefetch the read location prefetchw [r0] ; Prefetch the write location mov.f 0, r2 @@ -231,6 +299,7 @@ ENTRY(memcpy) stb.ab r6, [r3,1] .Lcopybytewise_3: j [blink] +#endif /* __ARCHS__ */ END(memcpy) libc_hidden_def(memcpy) diff --git a/libc/string/arc/arcv2/memset.S b/libc/string/arc/memset.S similarity index 69% rename from libc/string/arc/arcv2/memset.S 
rename to libc/string/arc/memset.S index 343cfaf81..0b74ddc7f 100644 --- a/libc/string/arc/arcv2/memset.S +++ b/libc/string/arc/memset.S @@ -1,20 +1,66 @@ - /* - * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. */ -#include #include +#if !defined(__ARC700__) && !defined(__ARCHS__) +#error "Neither ARC700 nor ARCHS is defined!" +#endif + +ENTRY(memset) + +#ifdef __ARC700__ +#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ + + mov_s r4,r0 + or r12,r0,r2 + bmsk.f r12,r12,1 + extb_s r1,r1 + asl r3,r1,8 + beq.d .Laligned + or_s r1,r1,r3 + brls r2,SMALL,.Ltiny + add r3,r2,r0 + stb r1,[r3,-1] + bclr_s r3,r3,0 + stw r1,[r3,-2] + bmsk.f r12,r0,1 + add_s r2,r2,r12 + sub.ne r2,r2,4 + stb.ab r1,[r4,1] + and r4,r4,-2 + stw.ab r1,[r4,2] + and r4,r4,-4 +.Laligned: ; This code address should be aligned for speed. + asl r3,r1,16 + lsr.f lp_count,r2,2 + or_s r1,r1,r3 + lpne .Loop_end + st.ab r1,[r4,4] +.Loop_end: + j_s [blink] + + + .balign 4 +.Ltiny: + mov.f lp_count,r2 + lpne .Ltiny_end + stb.ab r1,[r4,1] +.Ltiny_end: + j_s [blink] +#endif /* __ARC700__ */ + +#ifdef __ARCHS__ #ifdef DONT_USE_PREALLOC #define PREWRITE(A,B) prefetchw [(A),(B)] #else #define PREWRITE(A,B) prealloc [(A),(B)] #endif -ENTRY(memset) prefetchw [r0] ; Prefetch the write location mov.f 0, r2 ;;; if size is zero @@ -110,6 +156,7 @@ ENTRY(memset) .Lcopy3bytes: j [blink] +#endif /* __ARCHS__ */ END(memset) libc_hidden_def(memset) diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S index 5a0e56045..ad38d9e00 100644 --- a/libc/string/arc/strcmp.S +++ b/libc/string/arc/strcmp.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. 
(www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -8,6 +8,13 @@ #include #include +#if !defined(__ARC700__) && !defined(__ARCHS__) +#error "Neither ARC700 nor ARCHS is defined!" +#endif + +ENTRY(strcmp) + +#ifdef __ARC700__ /* This is optimized primarily for the ARC700. It would be possible to speed up the loops by one cycle / word respective one cycle / byte by forcing double source 1 alignment, unrolling @@ -15,7 +22,6 @@ source 1; however, that would increase the overhead for loop setup / finish, and strcmp might often terminate early. */ -ENTRY(strcmp) or r2,r0,r1 bmsk_s r2,r2,1 brne r2,0,.Lcharloop @@ -93,6 +99,77 @@ ENTRY(strcmp) .Lcmpend: j_s.d [blink] sub r0,r2,r3 +#endif /* __ARC700__ */ + +#ifdef __ARCHS__ + or r2, r0, r1 + bmsk_s r2, r2, 1 + brne r2, 0, @.Lcharloop + +;;; s1 and s2 are word aligned + ld.ab r2, [r0, 4] + + mov_s r12, 0x01010101 + ror r11, r12 + .align 4 +.LwordLoop: + ld.ab r3, [r1, 4] + ;; Detect NULL char in str1 + sub r4, r2, r12 + ld.ab r5, [r0, 4] + bic r4, r4, r2 + and r4, r4, r11 + brne.d.nt r4, 0, .LfoundNULL + ;; Check if the read locations are the same + cmp r2, r3 + beq.d .LwordLoop + mov.eq r2, r5 + + ;; A mismatch is found, spot it out +#ifdef __LITTLE_ENDIAN__ + swape r3, r3 + mov_s r0, 1 + swape r2, r2 +#else + mov_s r0, 1 +#endif + cmp_s r2, r3 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.LfoundNULL: +#ifdef __BIG_ENDIAN__ + swape r4, r4 + swape r2, r2 + swape r3, r3 +#endif + ;; Find null byte + ffs r0, r4 + bmsk r2, r2, r0 + bmsk r3, r3, r0 + swape r2, r2 + swape r3, r3 + ;; make the return value + sub.f r0, r2, r3 + mov.hi r0, 1 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.Lcharloop: + ldb.ab r2, [r0, 1] + ldb.ab r3, [r1, 1] + nop + breq r2, 0, .Lcmpend + breq r2, r3, .Lcharloop + + .align 4 +.Lcmpend: + j_s.d [blink] + sub r0, r2, r3 +#endif /* __ARCHS__ */ + END(strcmp) libc_hidden_def(strcmp) --
2.11.4.GIT