From 5ccaf4eafbd00c7a660afbf6ee1f4475b48598c6 Mon Sep 17 00:00:00 2001 From: malc Date: Tue, 17 Feb 2009 04:06:51 +0300 Subject: [PATCH] Make Altivec fast --- skin.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++------------------- tbs | 3 +- vec.c | 24 +++--- 3 files changed, 200 insertions(+), 90 deletions(-) diff --git a/skin.c b/skin.c index 8209e7e..569ac69 100644 --- a/skin.c +++ b/skin.c @@ -12,38 +12,32 @@ enum {V_IDX, N_IDX, UV_IDX, C_IDX, COUNT}; +#define ALNN(n, i) (((i)+(n-1))&~(n-1)) + +#ifdef G4 +#define DSTAL 32 +#else +#define DSTAL 16 +#endif + #ifdef USE_ALTIVEC /* Altivec code derived from: */ /* http://www.freevec.org/category/simd/algorithms/algebra/matrix_operations */ - #include -#ifndef __APPLE__ #include -#define simd_alloc(s) memalign (16, s) -#else -#define simd_alloc malloc -#endif +#define simd_alloc(b, s) memalign (b, s) #define A16 __attribute__ ((aligned (16))) -#define STRIDE 16 -#define V_ELEMS 4 -#define AL16(i) (((i)+15)&~15) - +#define AL16(i) ALNN (16, i) +#define AL32(i) ALNN (32, i) #else - -#define STRIDE 0 -#define V_ELEMS 3 -#define simd_alloc(s) malloc (s) +#define simd_alloc(b, s) malloc (s) #define A16 #define AL16(i) (i) - +#define AL32(i) (i) #endif struct skin { -#ifdef USE_ALTIVEC - float weights[12]; -#else float weights[3]; -#endif int boneindices[3]; int num_bones; } A16; @@ -80,13 +74,10 @@ static void copy_vertices (float *p, int num_vertices, value a_v) { int i, k; - for (i = 0, k = 0; i < num_vertices; ++i, p += V_ELEMS) { + for (i = 0, k = 0; i < num_vertices; ++i, p += 3) { p[0] = Double_field (a_v, k++); p[1] = Double_field (a_v, k++); p[2] = Double_field (a_v, k++); -#ifdef USE_ALTIVEC - p[3] = 1.0; -#endif } } @@ -123,13 +114,7 @@ static void set_geom (State *s, void **ptrs, value vertexa_v, value normala_v, skin[i].boneindices[j] = (int) val; w = val - skin[i].boneindices[j]; -#ifdef USE_ALTIVEC - vector float vw = {w,w,w,w}; - - vec_st (vw, j*16, skin[i].weights); -#else skin[i].weights[j] = w; -#endif skin[i].boneindices[j] += 1; } } @@ -144,13 +129,13 @@ static void skin_init (State *s, value vertexa_v, value normala_v, s->num_vertices = Wosize_val (vertexa_v) / (Double_wosize * 3); - sizev = V_ELEMS * sizeof (GLfloat) * s->num_vertices; + sizev = AL32 (3 * sizeof (GLfloat) * s->num_vertices); sizeu = 2 * sizeof (GLfloat) * s->num_vertices; sizec = 4 * s->num_vertices; sizevn = sizev * 2; - p = simd_alloc (AL16 (sizevn) + s->num_vertices * sizeof (struct skin)); + p = simd_alloc (16, AL16 (sizevn) + s->num_vertices * sizeof (struct skin)); s->skin = (struct skin *) (p + AL16 (sizevn)); s->ptrs[0] = ptrs[V_IDX] = p; ptrs[N_IDX] = p + sizev; @@ -180,7 +165,7 @@ static void skin_init (State *s, value vertexa_v, value normala_v, s->bufs[C_IDX] = p + sizeu; } else { - p = simd_alloc (sizevn); + p = simd_alloc (DSTAL, sizevn); s->bufs[V_IDX] = p; s->bufs[N_IDX] = p + sizev; s->bufs[UV_IDX] = ptrs[UV_IDX]; @@ -202,8 +187,8 @@ CAMLprim value ml_skin_draw_begin (value unit_v) glEnableClientState (GL_COLOR_ARRAY); if (use_vbo) glBindBuffer (GL_ARRAY_BUFFER, s->bufid[0]); - glVertexPointer (3, GL_FLOAT, V_ELEMS * sizeof (GLfloat), s->bufs[V_IDX]); - glNormalPointer (GL_FLOAT, V_ELEMS * sizeof (GLfloat), s->bufs[N_IDX]); + glVertexPointer (3, GL_FLOAT, 3 * sizeof (GLfloat), s->bufs[V_IDX]); + glNormalPointer (GL_FLOAT, 3 * sizeof (GLfloat), s->bufs[N_IDX]); if (use_vbo) glBindBuffer (GL_ARRAY_BUFFER, s->bufid[1]); glTexCoordPointer (2, GL_FLOAT, 0, s->bufs[UV_IDX]); @@ -262,64 +247,187 @@ static double now (void) } #endif -static void translate (State *s, float *vdst, float *ndst) +#ifdef USE_ALTIVEC + +#define DCB(o, b, i) __asm__ __volatile__ (#o " %0, %1" ::"b"(b),"r"(i)) + +static vector float appbones (State *s, + struct skin *skin, + vector float x, + vector float y, + vector float z, + vector float nx, + vector float ny, + vector float nz, + vector float *np) { - int i, j; + int j; struct bone *b; - float *vsrc = s->ptrs[0]; - float *nsrc = vsrc + s->num_vertices * V_ELEMS; - struct skin *skin = s->skin; - -#ifdef TIMING - double S = now (), E; -#endif + vector float vz = (vector float) vec_splat_u32 (0); + vector float v, w, n; + vector unsigned char S = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4<<3}; -#ifdef USE_ALTIVEC - for (i = 0; i < s->num_vertices; ++i, ++skin) { - vector float v, n, vs, ns, vz; - vector float r0, r1, r2, r3, nx, ny, nz; + v = n = vz; + w = vec_ld (0, skin->weights); - v = n = vz = (vector float) vec_splat_u32 (0); + j = skin->num_bones; + for (j = 0; j < skin->num_bones; ++j) { + vector float t0, t1, t2, t3, t4, t5, r0, r1, r2, r3, vw; - vs = vec_ld (i<<4, vsrc); - ns = vec_ld (i<<4, nsrc); + b = &s->bones[skin->boneindices[j]]; + vw = vec_splat (w, 0); + w = vec_slo (w, S); - nx = vec_splat (ns, 0); - ny = vec_splat (ns, 1); - nz = vec_splat (ns, 2); + r0 = vec_ld ( 0, b->cm); + r1 = vec_ld (16, b->cm); + r2 = vec_ld (32, b->cm); + r3 = vec_ld (48, b->cm); - for (j = 0; j < skin->num_bones; ++j) { - vector float vw, x, y, z, t0, t1, t2; + t0 = vec_madd (r0, x, r3); + t1 = vec_madd (r1, y, t0); + t2 = vec_madd (r2, z, t1); + v = vec_madd (t2, vw, v); - b = &s->bones[skin->boneindices[j]]; + t3 = vec_madd (r0, nx, vz); + t4 = vec_madd (r1, ny, t0); + t5 = vec_madd (r2, nz, t4); + n = vec_madd (t5, vw, n); - vw = vec_ld (j<<4, skin->weights); + } - r0 = vec_ld ( 0, b->cm); - r1 = vec_ld (16, b->cm); - r2 = vec_ld (32, b->cm); - r3 = vec_ld (48, b->cm); + *np = n; + return v; +} +#endif - x = vec_splat (vs, 0); - y = vec_splat (vs, 1); - z = vec_splat (vs, 2); +static void translate (State *s, float *vdst, float *ndst) +{ + int i, j; + struct bone *b; + float *vsrc = s->ptrs[0]; + float *nsrc = vsrc + ALNN (32, s->num_vertices * 3); + struct skin *skin = s->skin; - t0 = vec_madd (r0, x, r3); - t1 = vec_madd (r1, y, t0); - t2 = vec_madd (r2, z, t1); - v = vec_madd (t2, vw, v); +#ifdef TIMING + double S = now (), E; +#endif - t0 = vec_madd (r0, nx, vz); - t1 = vec_madd (r1, ny, t0); - t2 = vec_madd (r2, nz, t1); - n = vec_madd (t2, vw, n); +#ifdef USE_ALTIVEC + vector unsigned char p0 = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 }; + vector unsigned char p1 = + { 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23 }; + vector unsigned char p2 = + { 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }; + + for (i = 0, j = 0; i < s->num_vertices >> 2; ++i, j += 48) { + vector float v0, v1, v2, n0, n1, n2; + vector float vx, vy, vz, nx, ny, nz; + vector float vr0, vr1, vr2, vr3; + vector float nr0, nr1, nr2, nr3; + +#ifdef G4 + if (!(i & 3)) { + DCB (dcbz, vdst, j); + DCB (dcbz, ndst, j); } - vec_st (v, i<<4, vdst); - vec_st (n, i<<4, ndst); + + DCB (dcbz, vdst, j + 32); + DCB (dcbz, ndst, j + 32); +#endif + + DCB (dcbt, skin, 0); + DCB (dcbt, skin + 1, 0); + DCB (dcbt, skin + 2, 0); + DCB (dcbt, skin + 3, 0); + + DCB (dcbt, vsrc, j + 64); + DCB (dcbt, nsrc, j + 64); + DCB (dcbt, vsrc, j + 96); + DCB (dcbt, nsrc, j + 96); + + /* Load */ + v0 = vec_ld (j, vsrc); + v1 = vec_ld (j + 16, vsrc); + v2 = vec_ld (j + 32, vsrc); + n0 = vec_ld (j, nsrc); + n1 = vec_ld (j + 16, nsrc); + n2 = vec_ld (j + 32, nsrc); + + /* First vertex/normal */ + vx = vec_splat (v0, 0); + vy = vec_splat (v0, 1); + vz = vec_splat (v0, 2); + nx = vec_splat (n0, 0); + ny = vec_splat (n0, 1); + nz = vec_splat (n0, 2); + + vr0 = appbones (s, skin, vx, vy, vz, nx, ny, nz, &nr0); + skin++; + + /* Second vertex/normal */ + vx = vec_splat (v0, 3); + vy = vec_splat (v1, 0); + vz = vec_splat (v1, 1); + nx = vec_splat (n0, 3); + ny = vec_splat (n1, 0); + nz = vec_splat (n1, 1); + + vr1 = appbones (s, skin, vx, vy, vz, nx, ny, nz, &nr1); + skin++; + + /* Third vertex/normal */ + vx = vec_splat (v1, 2); + vy = vec_splat (v1, 3); + vz = vec_splat (v2, 0); + nx = vec_splat (n1, 2); + ny = vec_splat (n1, 3); + nz = vec_splat (n2, 0); + + vr2 = appbones (s, skin, vx, vy, vz, nx, ny, nz, &nr2); + skin++; + + /* Fourth vertex/normal */ + vx = vec_splat (v2, 1); + vy = vec_splat (v2, 2); + vz = vec_splat (v2, 3); + nx = vec_splat (n2, 1); + ny = vec_splat (n2, 2); + nz = vec_splat (n2, 3); + + vr3 = appbones (s, skin, vx, vy, vz, nx, ny, nz, &nr3); + skin++; + + /* Assemble */ + v0 = vec_perm (vr0, vr1, p0); + v1 = vec_perm (vr1, vr2, p1); + v2 = vec_perm (vr2, vr3, p2); + + n0 = vec_perm (nr0, nr1, p0); + n1 = vec_perm (nr1, nr2, p1); + n2 = vec_perm (nr2, nr3, p2); + + /* Store */ + vec_st (v0, j, vdst); + vec_st (v1, j + 16, vdst); + vec_st (v2, j + 32, vdst); + + vec_st (n0, j, ndst); + vec_st (n1, j + 16, ndst); + vec_st (n2, j + 32, ndst); } + + i <<= 2; + vsrc += i*3; + nsrc += i*3; + vdst += i*3; + ndst += i*3; #else - for (i = 0; i < s->num_vertices; ++i, - vsrc += 3, nsrc += 3, vdst += 3, ndst += 3, ++skin) + i = 0; +#endif + + for (; i < s->num_vertices; ++i, vsrc += 3, nsrc += 3, vdst += 3, ndst += 3, + ++skin) { float v[3] = {0,0,0}, n[3] = {0,0,0}, v0[4], v1[4], w; @@ -344,7 +452,6 @@ static void translate (State *s, float *vdst, float *ndst) vcopy (vdst, v); vcopy (ndst, n); } -#endif #ifdef TIMING E = now (); @@ -363,7 +470,7 @@ CAMLprim value ml_skin_set_skel (value skel_v) s->num_bones = Wosize_val (skel_v); size = (s->num_bones + 1) * sizeof (struct bone); - s->bones = b = simd_alloc (size); + s->bones = b = simd_alloc (16, size); memset (b, 0, size); b->parent = -1; diff --git a/tbs b/tbs index 9fd9f24..8861804 100644 --- a/tbs +++ b/tbs @@ -29,7 +29,8 @@ test $(hostname) = "linmac" && { ccopt="-Wall -Werror -Wextra -O3 -mabi=altivec -maltivec -Wno-unused-function"; ccopt="$ccopt -fprefetch-loop-arrays -mtune=power6 -mcpu=G4 -funroll-all-loops" ccopt="$ccopt -ftree-loop-linear -ftree-vectorize" -# ccopt="$ccopt -DUSE_ALTIVEC" + ccopt="$ccopt --param l1-cache-line-size=32" + ccopt="$ccopt -DUSE_ALTIVEC -DG4" ccopt="$ccopt -DTIMING" } diff --git a/vec.c b/vec.c index 2b3d366..77be45a 100644 --- a/vec.c +++ b/vec.c @@ -90,25 +90,28 @@ static void q2matrixt (float *mat, float *q, float *v) mat[10] = 1 - 2 * ( xx + yy ); #ifdef USE_ALTIVEC - mat[12] = v[0]; - mat[13] = v[1]; - mat[14] = v[2]; +#define MAT_V0 12 +#define MAT_V1 13 +#define MAT_V2 14 #else - mat[3] = v[0]; - mat[7] = v[1]; - mat[11] = v[2]; +#define MAT_V0 3 +#define MAT_V1 7 +#define MAT_V2 11 #endif + + mat[MAT_V0] = v[0]; + mat[MAT_V1] = v[1]; + mat[MAT_V2] = v[2]; } -#ifndef USE_ALTIVEC static void mapply_to_point (float *res, float *m, float *v) { float x = v[0]; float y = v[1]; float z = v[2]; - res[0] = x*m[0] + y*m[4] + z*m[8] + m[3]; - res[1] = x*m[1] + y*m[5] + z*m[9] + m[7]; - res[2] = x*m[2] + y*m[6] + z*m[10] + m[11]; + res[0] = x*m[0] + y*m[4] + z*m[8] + m[MAT_V0]; + res[1] = x*m[1] + y*m[5] + z*m[9] + m[MAT_V1]; + res[2] = x*m[2] + y*m[6] + z*m[10] + m[MAT_V2]; } static void mapply_to_vector (float *res, float *m, float *v) @@ -134,7 +137,6 @@ static void vcopy (float *res, float *v) *res++ = *v++; *res++ = *v++; } -#endif #else -- 2.11.4.GIT