From 8b9ef29fb126c618d0c3451b112ec76d792cbabc Mon Sep 17 00:00:00 2001 From: amiconn Date: Mon, 24 Nov 2008 18:40:49 +0000 Subject: [PATCH] Branch optimisation in both C (giving hints to gcc - verified using -fprofile-arcs and gcov) and asm files. Biggest effect on coldfire (-c1000: +8%, -c2000: +5%), but ARM also profits a bit (less than 1% on ARM7TDMI, around 1% on ARM1136). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19199 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/demac_config.h | 9 ++ apps/codecs/demac/libdemac/entropy.c | 17 ++-- apps/codecs/demac/libdemac/filter.c | 36 ++++---- apps/codecs/demac/libdemac/predictor-arm.S | 27 +++--- apps/codecs/demac/libdemac/predictor-cf.S | 46 +++++----- apps/codecs/demac/libdemac/predictor.c | 137 +++++++++++++++-------------- 6 files changed, 147 insertions(+), 125 deletions(-) diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h index dd3aaa3f9..986e5376c 100644 --- a/apps/codecs/demac/libdemac/demac_config.h +++ b/apps/codecs/demac/libdemac/demac_config.h @@ -70,6 +70,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #define ICODE_ATTR #define ICODE_ATTR_DEMAC +/* Used to give gcc hints about which branch is most likely to be taken */ +#if defined(__GNUC__) && __GNUC__ >= 3 +#define LIKELY(x) __builtin_expect(!!(x), 1) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + #endif /* !ROCKBOX */ /* Defaults */ diff --git a/apps/codecs/demac/libdemac/entropy.c b/apps/codecs/demac/libdemac/entropy.c index 54ff226bc..e8561122a 100644 --- a/apps/codecs/demac/libdemac/entropy.c +++ b/apps/codecs/demac/libdemac/entropy.c @@ -283,13 +283,13 @@ static inline void update_rice(struct rice_t* rice, int x) { rice->ksum += ((x + 1) / 2) - ((rice->ksum + 16) >> 5); - if (rice->k == 0) { + if (UNLIKELY(rice->k == 0)) { rice->k = 1; } else { uint32_t lim = 1 << (rice->k + 4); - if
(rice->ksum < lim) { + if (UNLIKELY(rice->ksum < lim)) { rice->k--; - } else if (rice->ksum >= 2 * lim) { + } else if (UNLIKELY(rice->ksum >= 2 * lim)) { rice->k++; } } @@ -300,11 +300,12 @@ static inline int entropy_decode3980(struct rice_t* rice) int base, x, pivot, overflow; pivot = rice->ksum >> 5; - if (pivot == 0) pivot=1; + if (UNLIKELY(pivot == 0)) + pivot=1; overflow = range_get_symbol_3980(); - if (overflow == (MODEL_ELEMENTS-1)) { + if (UNLIKELY(overflow == (MODEL_ELEMENTS-1))) { overflow = range_decode_short() << 16; overflow |= range_decode_short(); } @@ -352,7 +353,7 @@ static inline int entropy_decode3970(struct rice_t* rice) int overflow = range_get_symbol_3970(); - if (overflow == (MODEL_ELEMENTS - 1)) { + if (UNLIKELY(overflow == (MODEL_ELEMENTS - 1))) { tmpk = range_decode_bits(5); overflow = 0; } else { @@ -435,13 +436,13 @@ int ICODE_ATTR_DEMAC entropy_decode(struct ape_ctx_t* ape_ctx, memset(decoded1, 0, blockstodecode * sizeof(int32_t)); } else { if (ape_ctx->fileversion > 3970) { - while (blockstodecode--) { + while (LIKELY(blockstodecode--)) { *(decoded0++) = entropy_decode3980(&riceY); if (decoded1 != NULL) *(decoded1++) = entropy_decode3980(&riceX); } } else { - while (blockstodecode--) { + while (LIKELY(blockstodecode--)) { *(decoded0++) = entropy_decode3970(&riceY); if (decoded1 != NULL) *(decoded1++) = entropy_decode3970(&riceX); diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 5601fffcd..d66bdc69b 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c @@ -100,7 +100,7 @@ struct filter_t { #if defined(CPU_ARM) && (ARM_ARCH >= 6) #define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; }) #else -#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF); +#define SATURATE(x) (LIKELY((x) == (int16_t)(x)) ? 
(x) : ((x) >> 31) ^ 0x7FFF); #endif /* Apply the filter with state f to count entries in data[] */ @@ -109,20 +109,22 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, int32_t* data, int count) { int res; - int absres; + int absres; #ifdef PREPARE_SCALARPRODUCT PREPARE_SCALARPRODUCT #endif - while(count--) + while(LIKELY(count--)) { res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); - if (*data < 0) - vector_add(f->coeffs, f->adaptcoeffs - ORDER); - else if (*data > 0) - vector_sub(f->coeffs, f->adaptcoeffs - ORDER); + if (LIKELY(*data != 0)) { + if (*data < 0) + vector_add(f->coeffs, f->adaptcoeffs - ORDER); + else + vector_sub(f->coeffs, f->adaptcoeffs - ORDER); + } res += *data; @@ -136,11 +138,11 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, /* Update the adaption coefficients */ absres = (res < 0 ? -res : res); - if (absres > (f->avg * 3)) + if (UNLIKELY(absres > (f->avg * 3))) *f->adaptcoeffs = ((res >> 25) & 64) - 32; else if (absres > (f->avg * 4) / 3) *f->adaptcoeffs = ((res >> 26) & 32) - 16; - else if (absres > 0) + else if (LIKELY(absres > 0)) *f->adaptcoeffs = ((res >> 27) & 16) - 8; else *f->adaptcoeffs = 0; @@ -154,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, f->adaptcoeffs++; /* Have we filled the history buffer? 
*/ - if (f->delay == f->history_end) { + if (UNLIKELY(f->delay == f->history_end)) { memmove(f->coeffs + ORDER, f->delay - (ORDER*2), (ORDER*2) * sizeof(filter_int)); f->adaptcoeffs = f->coeffs + ORDER*2; @@ -172,14 +174,16 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, PREPARE_SCALARPRODUCT #endif - while(count--) + while(LIKELY(count--)) { res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); - if (*data < 0) - vector_add(f->coeffs, f->adaptcoeffs - ORDER); - else if (*data > 0) - vector_sub(f->coeffs, f->adaptcoeffs - ORDER); + if (LIKELY(*data != 0)) { + if (*data < 0) + vector_add(f->coeffs, f->adaptcoeffs - ORDER); + else + vector_sub(f->coeffs, f->adaptcoeffs - ORDER); + } /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an integer (rounding to nearest) and add the input value to @@ -199,7 +203,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, f->adaptcoeffs++; /* Have we filled the history buffer? */ - if (f->delay == f->history_end) { + if (UNLIKELY(f->delay == f->history_end)) { memmove(f->coeffs + ORDER, f->delay - (ORDER*2), (ORDER*2) * sizeof(filter_int)); f->adaptcoeffs = f->coeffs + ORDER*2; diff --git a/apps/codecs/demac/libdemac/predictor-arm.S b/apps/codecs/demac/libdemac/predictor-arm.S index dfeba0dcc..d62b6186f 100644 --- a/apps/codecs/demac/libdemac/predictor-arm.S +++ b/apps/codecs/demac/libdemac/predictor-arm.S @@ -468,14 +468,24 @@ loop: add r11, r12, #historybuffer @ r11 := &p->historybuffer[0] - sub r10, r14, #PREDICTOR_HISTORY_SIZE*4 + sub r10, r14, #PREDICTOR_HISTORY_SIZE*4 @ r10 := p->buf - PREDICTOR_HISTORY_SIZE cmp r10, r11 - bne endofloop + beq move_hist @ The history buffer is full, we need to do a memmove - @ The history buffer is full, we need to do a memmove: + @ Check loop count + ldr r0, [sp, #8] + subs r0, r0, #1 + strne r0, [sp, #8] + bne loop + +done: + str r14, [r12] @ Save value of p->buf + add sp, sp, #12 @ Don't bother restoring r1-r3 + ldmia sp!, 
{r4-r11, pc} +move_hist: @ dest = r11 (p->historybuffer) @ src = r14 (p->buf) @ n = 200 @@ -493,15 +503,10 @@ loop: add r14, r12, #historybuffer @ p->buf = &p->historybuffer[0] - -endofloop: -@ Check loop count + @ Check loop count ldr r0, [sp, #8] subs r0, r0, #1 strne r0, [sp, #8] bne loop - -done: - str r14, [r12] @ Save value of p->buf - add sp, sp, #12 @ Don't bother restoring r1-r3 - ldmia sp!, {r4-r11, pc} + + b done diff --git a/apps/codecs/demac/libdemac/predictor-cf.S b/apps/codecs/demac/libdemac/predictor-cf.S index b12d0932b..0a1ffe9ea 100644 --- a/apps/codecs/demac/libdemac/predictor-cf.S +++ b/apps/codecs/demac/libdemac/predictor-cf.S @@ -486,10 +486,18 @@ predictor_decode_stereo: | %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE] cmp.l %a3, %a5 - bne.s .endofloop + beq.s .move_hist | The history buffer is full, we need to do a memmove - | The history buffer is full, we need to do a memmove: + subq.l #1, (8,%sp) | decrease loop count + bne.w .loop +.done: + move.l %a5, (%a6) | Save value of p->buf + movem.l (3*4,%sp), %d2-%d7/%a2-%a6 + lea.l (14*4,%sp), %sp + rts + +.move_hist: lea.l (historybuffer,%a6), %a3 | dest = %a3 (p->historybuffer) @@ -497,33 +505,19 @@ predictor_decode_stereo: | n = 200 movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes - lea.l (40,%a5), %a5 movem.l %d0-%d7/%a0-%a1, (%a3) - lea.l (40,%a3), %a3 - movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes - lea.l (40,%a5), %a5 - movem.l %d0-%d7/%a0-%a1, (%a3) - lea.l (40,%a3), %a3 - movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes - lea.l (40,%a5), %a5 - movem.l %d0-%d7/%a0-%a1, (%a3) - lea.l (40,%a3), %a3 - movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes - lea.l (40,%a5), %a5 - movem.l %d0-%d7/%a0-%a1, (%a3) - lea.l (40,%a3), %a3 - movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes - lea.l (40,%a5), %a5 - movem.l %d0-%d7/%a0-%a1, (%a3) - lea.l (40,%a3), %a3 + movem.l (40,%a5), %d0-%d7/%a0-%a1 | 40 bytes + movem.l %d0-%d7/%a0-%a1, (40,%a3) + movem.l (80,%a5), %d0-%d7/%a0-%a1 | 40 bytes + movem.l %d0-%d7/%a0-%a1, (80,%a3) + 
movem.l (120,%a5), %d0-%d7/%a0-%a1 | 40 bytes + movem.l %d0-%d7/%a0-%a1, (120,%a3) + movem.l (160,%a5), %d0-%d7/%a0-%a1 | 40 bytes + movem.l %d0-%d7/%a0-%a1, (160,%a3) - lea.l (historybuffer,%a6), %a5 | p->buf = &p->historybuffer[0] + move.l %a3, %a5 | p->buf = &p->historybuffer[0] -.endofloop: subq.l #1, (8,%sp) | decrease loop count bne.w .loop - move.l %a5, (%a6) | Save value of p->buf - movem.l (3*4,%sp), %d2-%d7/%a2-%a6 - lea.l (14*4,%sp), %sp - rts + bra.s .done diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c index 1a9b48e0c..67a17f4b5 100644 --- a/apps/codecs/demac/libdemac/predictor.c +++ b/apps/codecs/demac/libdemac/predictor.c @@ -75,7 +75,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p, { int32_t predictionA, predictionB; - while (count--) + while (LIKELY(count--)) { /* Predictor Y */ p->buf[YDELAYA] = p->YlastA; @@ -134,60 +134,66 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p, p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10); p->XfilterA = p->XlastA + ((p->XfilterA * 31) >> 5); - if (*decoded0 > 0) + if (LIKELY(*decoded0 != 0)) { - p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; - p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; - p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; - p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; - - p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB]; - p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1]; - p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2]; - p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3]; - p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4]; - } - else if (*decoded0 < 0) - { - p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; - p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; - p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; - p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; - - p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB]; - p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1]; - p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2]; - p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3]; - p->YcoeffsB[4] += 
p->buf[YADAPTCOEFFSB-4]; + if (*decoded0 > 0) + { + p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; + + p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB]; + p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1]; + p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2]; + p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3]; + p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4]; + } + else + { + p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; + + p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB]; + p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1]; + p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2]; + p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3]; + p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4]; + } } *(decoded0++) = p->YfilterA; - if (*decoded1 > 0) - { - p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA]; - p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1]; - p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2]; - p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3]; - - p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB]; - p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1]; - p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2]; - p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3]; - p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4]; - } - else if (*decoded1 < 0) + if (LIKELY(*decoded1 != 0)) { - p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA]; - p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1]; - p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2]; - p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3]; - - p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB]; - p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1]; - p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2]; - p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3]; - p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4]; + if (*decoded1 > 0) + { + p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA]; + p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1]; + p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2]; + p->XcoeffsA[3] -= 
p->buf[XADAPTCOEFFSA-3]; + + p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB]; + p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1]; + p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2]; + p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3]; + p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4]; + } + else + { + p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA]; + p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1]; + p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2]; + p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3]; + + p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB]; + p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1]; + p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2]; + p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3]; + p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4]; + } } *(decoded1++) = p->XfilterA; @@ -196,7 +202,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p, p->buf++; /* Have we filled the history buffer? */ - if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) { + if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) { memmove(p->historybuffer, p->buf, PREDICTOR_SIZE * sizeof(int32_t)); p->buf = p->historybuffer; @@ -215,7 +221,7 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, currentA = p->YlastA; - while (count--) + while (LIKELY(count--)) { A = *decoded0; @@ -232,25 +238,28 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]); p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]); - if (A > 0) - { - p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; - p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; - p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; - p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; - } - else if (A < 0) + if (LIKELY(A != 0)) { - p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; - p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; - p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; - p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; + if (A > 0) + { + p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; + 
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; + } + else + { + p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; + } } p->buf++; /* Have we filled the history buffer? */ - if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) { + if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) { memmove(p->historybuffer, p->buf, PREDICTOR_SIZE * sizeof(int32_t)); p->buf = p->historybuffer; -- 2.11.4.GIT