Build script now builds an oggenc binary with flac support.
[vorbis-lancer-gcc.git] / aotuv-b5_20061024 / vq / latticepare.c
blob63905911e7c1ad0c3616748e6aa02be9d103d6ad
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
9 * by the XIPHOPHORUS Company http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function: utility for paring low hit count cells from lattice codebook
14 last mod: $Id: latticepare.c 7187 2004-07-20 07:24:27Z xiphmont $
16 ********************************************************************/
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <math.h>
21 #include <string.h>
22 #include <errno.h>
23 #include "../lib/scales.h"
24 #include "bookutil.h"
25 #include "vqgen.h"
26 #include "vqsplit.h"
27 #include "../lib/os.h"
29 /* Lattice codebooks have two strengths: important fetaures that are
30 poorly modelled by global error minimization training (eg, strong
31 peaks) are not neglected 2) compact quantized representation.
33 A fully populated lattice codebook, however, swings point 1 too far
34 in the opposite direction; rare features need not be modelled quite
35 so religiously and as such, we waste bits unless we eliminate the
36 least common cells. The codebook rep supports unused cells, so we
37 need to tag such cells and build an auxiliary (non-thresh) search
38 mechanism to find the proper match quickly */
40 /* two basic steps; first is pare the cell for which dispersal creates
41 the least additional error. This will naturally choose
42 low-population cells and cells that have not taken on points from
43 neighboring paring (but does not result in the lattice collapsing
44 inward and leaving low population ares totally unmodelled). After
45 paring has removed the desired number of cells, we need to build an
46 auxiliary search for each culled point */
48 /* Although lattice books (due to threshhold-based matching) do not
49 actually use error to make cell selections (in fact, it need not
50 bear any relation), the 'secondbest' entry finder here is in fact
51 error/distance based, so latticepare is only useful on such books */
53 /* command line:
54 latticepare latticebook.vqh input_data.vqd <target_cells>
56 produces a new output book on stdout
59 static float _dist(int el,float *a, float *b){
60 int i;
61 float acc=0.f;
62 for(i=0;i<el;i++){
63 float val=(a[i]-b[i]);
64 acc+=val*val;
66 return(acc);
69 static float *pointlist;
70 static long points=0;
72 void add_vector(codebook *b,float *vec,long n){
73 int dim=b->dim,i,j;
74 int step=n/dim;
75 for(i=0;i<step;i++){
76 for(j=i;j<n;j+=step){
77 pointlist[points++]=vec[j];
82 static int bestm(codebook *b,float *vec){
83 encode_aux_threshmatch *tt=b->c->thresh_tree;
84 int dim=b->dim;
85 int i,k,o;
86 int best=0;
88 /* what would be the closest match if the codebook was fully
89 populated? */
91 for(k=0,o=dim-1;k<dim;k++,o--){
92 int i;
93 for(i=0;i<tt->threshvals-1;i++)
94 if(vec[o]<tt->quantthresh[i])break;
95 best=(best*tt->quantvals)+tt->quantmap[i];
97 return(best);
100 static int closest(codebook *b,float *vec,int current){
101 encode_aux_threshmatch *tt=b->c->thresh_tree;
102 int dim=b->dim;
103 int i,k,o;
105 float bestmetric=0;
106 int bestentry=-1;
107 int best=bestm(b,vec);
109 if(current<0 && b->c->lengthlist[best]>0)return best;
111 for(i=0;i<b->entries;i++){
112 if(b->c->lengthlist[i]>0 && i!=best && i!=current){
113 float thismetric=_dist(dim, vec, b->valuelist+i*dim);
114 if(bestentry==-1 || thismetric<bestmetric){
115 bestentry=i;
116 bestmetric=thismetric;
121 return(bestentry);
124 static float _heuristic(codebook *b,float *ppt,int secondbest){
125 float *secondcell=b->valuelist+secondbest*b->dim;
126 int best=bestm(b,ppt);
127 float *firstcell=b->valuelist+best*b->dim;
128 float error=_dist(b->dim,firstcell,secondcell);
129 float *zero=alloca(b->dim*sizeof(float));
130 float fromzero;
132 memset(zero,0,b->dim*sizeof(float));
133 fromzero=sqrt(_dist(b->dim,firstcell,zero));
135 return(error/fromzero);
138 static int longsort(const void *a, const void *b){
139 return **(long **)b-**(long **)a;
142 void usage(void){
143 fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
144 "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
145 "where <target_cells> is the desired number of final cells (or -1\n"
146 " for no change)\n"
147 " <protected_cells> is the number of highest-hit count cells\n"
148 " to protect from dispersal\n"
149 " base is the base name (not including .vqh) of the new\n"
150 " book\n\n");
151 exit(1);
154 int main(int argc,char *argv[]){
155 char *basename;
156 codebook *b=NULL;
157 int entries=0;
158 int dim=0;
159 long i,j,target=-1,protect=-1;
160 FILE *out=NULL;
162 int argnum=0;
164 argv++;
165 if(*argv==NULL){
166 usage();
167 exit(1);
170 while(*argv){
171 if(*argv[0]=='-'){
173 argv++;
175 }else{
176 switch (argnum++){
177 case 0:case 1:
179 /* yes, this is evil. However, it's very convenient to parse file
180 extentions */
182 /* input file. What kind? */
183 char *dot;
184 char *ext=NULL;
185 char *name=strdup(*argv++);
186 dot=strrchr(name,'.');
187 if(dot)
188 ext=dot+1;
189 else{
190 ext="";
195 /* codebook */
196 if(!strcmp(ext,"vqh")){
198 basename=strrchr(name,'/');
199 if(basename)
200 basename=strdup(basename)+1;
201 else
202 basename=strdup(name);
203 dot=strrchr(basename,'.');
204 if(dot)*dot='\0';
206 b=codebook_load(name);
207 dim=b->dim;
208 entries=b->entries;
211 /* data file; we do actually need to suck it into memory */
212 /* we're dealing with just one book, so we can de-interleave */
213 if(!strcmp(ext,"vqd") && !points){
214 int cols;
215 long lines=0;
216 char *line;
217 float *vec;
218 FILE *in=fopen(name,"r");
219 if(!in){
220 fprintf(stderr,"Could not open input file %s\n",name);
221 exit(1);
224 reset_next_value();
225 line=setup_line(in);
226 /* count cols before we start reading */
228 char *temp=line;
229 while(*temp==' ')temp++;
230 for(cols=0;*temp;cols++){
231 while(*temp>32)temp++;
232 while(*temp==' ')temp++;
235 vec=alloca(cols*sizeof(float));
236 /* count, then load, to avoid fragmenting the hell out of
237 memory */
238 while(line){
239 lines++;
240 for(j=0;j<cols;j++)
241 if(get_line_value(in,vec+j)){
242 fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
243 exit(1);
245 if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
246 line=setup_line(in);
248 pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float));
250 rewind(in);
251 line=setup_line(in);
252 while(line){
253 lines--;
254 for(j=0;j<cols;j++)
255 if(get_line_value(in,vec+j)){
256 fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
257 exit(1);
259 /* deinterleave, add to heap */
260 add_vector(b,vec,cols);
261 if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
263 line=setup_line(in);
265 fclose(in);
268 break;
269 case 2:
270 target=atol(*argv++);
271 if(target==0)target=entries;
272 break;
273 case 3:
274 protect=atol(*argv++);
275 break;
276 case 4:
278 char *buff=alloca(strlen(*argv)+5);
279 sprintf(buff,"%s.vqh",*argv);
280 basename=*argv++;
282 out=fopen(buff,"w");
283 if(!out){
284 fprintf(stderr,"unable ot open %s for output",buff);
285 exit(1);
288 break;
289 default:
290 usage();
294 if(!entries || !points || !out)usage();
295 if(target==-1)usage();
297 /* add guard points */
298 for(i=0;i<entries;i++)
299 for(j=0;j<dim;j++)
300 pointlist[points++]=b->valuelist[i*dim+j];
302 points/=dim;
304 /* set up auxiliary vectors for error tracking */
306 encode_aux_nearestmatch *nt=NULL;
307 long pointssofar=0;
308 long *pointindex;
309 long indexedpoints=0;
310 long *entryindex;
311 long *reventry;
312 long *membership=_ogg_malloc(points*sizeof(long));
313 long *firsthead=_ogg_malloc(entries*sizeof(long));
314 long *secondary=_ogg_malloc(points*sizeof(long));
315 long *secondhead=_ogg_malloc(entries*sizeof(long));
317 long *cellcount=_ogg_calloc(entries,sizeof(long));
318 long *cellcount2=_ogg_calloc(entries,sizeof(long));
319 float *cellerror=_ogg_calloc(entries,sizeof(float));
320 float *cellerrormax=_ogg_calloc(entries,sizeof(float));
321 long cellsleft=entries;
322 for(i=0;i<points;i++)membership[i]=-1;
323 for(i=0;i<entries;i++)firsthead[i]=-1;
324 for(i=0;i<points;i++)secondary[i]=-1;
325 for(i=0;i<entries;i++)secondhead[i]=-1;
327 for(i=0;i<points;i++){
328 /* assign vectors to the nearest cell. Also keep track of second
329 nearest for error statistics */
330 float *ppt=pointlist+i*dim;
331 int firstentry=closest(b,ppt,-1);
332 int secondentry=closest(b,ppt,firstentry);
333 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
334 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
336 if(!(i&0xff))spinnit("initializing... ",points-i);
338 membership[i]=firsthead[firstentry];
339 firsthead[firstentry]=i;
340 secondary[i]=secondhead[secondentry];
341 secondhead[secondentry]=i;
343 if(i<points-entries){
344 cellerror[firstentry]+=secondmetric-firstmetric;
345 cellerrormax[firstentry]=max(cellerrormax[firstentry],
346 _heuristic(b,ppt,secondentry));
347 cellcount[firstentry]++;
348 cellcount2[secondentry]++;
352 /* which cells are most heavily populated? Protect as many from
353 dispersal as the user has requested */
355 long **countindex=_ogg_calloc(entries,sizeof(long *));
356 for(i=0;i<entries;i++)countindex[i]=cellcount+i;
357 qsort(countindex,entries,sizeof(long *),longsort);
358 for(i=0;i<protect;i++){
359 int ptr=countindex[i]-cellcount;
360 cellerrormax[ptr]=9e50f;
365 fprintf(stderr,"\r");
366 for(i=0;i<entries;i++){
367 /* decompose index */
368 int entry=i;
369 for(j=0;j<dim;j++){
370 fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
371 entry/=b->c->thresh_tree->quantvals;
374 fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
376 fprintf(stderr,"\n");
379 /* do the automatic cull request */
380 while(cellsleft>target){
381 int bestcell=-1;
382 float besterror=0;
383 float besterror2=0;
384 long head=-1;
385 char spinbuf[80];
386 sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
388 /* find the cell with lowest removal impact */
389 for(i=0;i<entries;i++){
390 if(b->c->lengthlist[i]>0){
391 if(bestcell==-1 || cellerrormax[i]<=besterror2){
392 if(bestcell==-1 || cellerrormax[i]<besterror2 ||
393 besterror>cellerror[i]){
394 besterror=cellerror[i];
395 besterror2=cellerrormax[i];
396 bestcell=i;
402 fprintf(stderr,"\reliminating cell %d \n"
403 " dispersal error of %g max/%g total (%ld hits)\n",
404 bestcell,besterror2,besterror,cellcount[bestcell]);
406 /* disperse it. move each point out, adding it (properly) to
407 the second best */
408 b->c->lengthlist[bestcell]=0;
409 head=firsthead[bestcell];
410 firsthead[bestcell]=-1;
411 while(head!=-1){
412 /* head is a point number */
413 float *ppt=pointlist+head*dim;
414 int firstentry=closest(b,ppt,-1);
415 int secondentry=closest(b,ppt,firstentry);
416 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
417 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
418 long next=membership[head];
420 if(head<points-entries){
421 cellcount[firstentry]++;
422 cellcount[bestcell]--;
423 cellerror[firstentry]+=secondmetric-firstmetric;
424 cellerrormax[firstentry]=max(cellerrormax[firstentry],
425 _heuristic(b,ppt,secondentry));
428 membership[head]=firsthead[firstentry];
429 firsthead[firstentry]=head;
430 head=next;
431 if(cellcount[bestcell]%128==0)
432 spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
436 /* now see that all points that had the dispersed cell as second
437 choice have second choice reassigned */
438 head=secondhead[bestcell];
439 secondhead[bestcell]=-1;
440 while(head!=-1){
441 float *ppt=pointlist+head*dim;
442 /* who are we assigned to now? */
443 int firstentry=closest(b,ppt,-1);
444 /* what is the new second closest match? */
445 int secondentry=closest(b,ppt,firstentry);
446 /* old second closest is the cell being disbanded */
447 float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
448 /* new second closest error */
449 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
450 long next=secondary[head];
452 if(head<points-entries){
453 cellcount2[secondentry]++;
454 cellcount2[bestcell]--;
455 cellerror[firstentry]+=secondmetric-oldsecondmetric;
456 cellerrormax[firstentry]=max(cellerrormax[firstentry],
457 _heuristic(b,ppt,secondentry));
460 secondary[head]=secondhead[secondentry];
461 secondhead[secondentry]=head;
462 head=next;
464 if(cellcount2[bestcell]%128==0)
465 spinnit(spinbuf,cellcount2[bestcell]);
468 cellsleft--;
471 /* paring is over. Build decision trees using points that now fall
472 through the thresh matcher. */
473 /* we don't free membership; we flatten it in order to use in lp_split */
475 for(i=0;i<entries;i++){
476 long head=firsthead[i];
477 spinnit("rearranging membership cache... ",entries-i);
478 while(head!=-1){
479 long next=membership[head];
480 membership[head]=i;
481 head=next;
485 free(secondhead);
486 free(firsthead);
487 free(cellerror);
488 free(cellerrormax);
489 free(secondary);
491 pointindex=_ogg_malloc(points*sizeof(long));
492 /* make a point index of fall-through points */
493 for(i=0;i<points;i++){
494 int best=_best(b,pointlist+i*dim,1);
495 if(best==-1)
496 pointindex[indexedpoints++]=i;
497 spinnit("finding orphaned points... ",points-i);
500 /* make an entry index */
501 entryindex=_ogg_malloc(entries*sizeof(long));
502 target=0;
503 for(i=0;i<entries;i++){
504 if(b->c->lengthlist[i]>0)
505 entryindex[target++]=i;
508 /* make working space for a reverse entry index */
509 reventry=_ogg_malloc(entries*sizeof(long));
511 /* do the split */
512 nt=b->c->nearest_tree=
513 _ogg_calloc(1,sizeof(encode_aux_nearestmatch));
515 nt->alloc=4096;
516 nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc);
517 nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc);
518 nt->p=_ogg_malloc(sizeof(long)*nt->alloc);
519 nt->q=_ogg_malloc(sizeof(long)*nt->alloc);
520 nt->aux=0;
522 fprintf(stderr,"Leaves added: %d \n",
523 lp_split(pointlist,points,
524 b,entryindex,target,
525 pointindex,indexedpoints,
526 membership,reventry,
527 0,&pointssofar));
528 free(membership);
529 free(reventry);
530 free(pointindex);
532 /* hack alert. I should just change the damned splitter and
533 codebook writer */
534 for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
535 for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
537 /* recount hits. Build new lengthlist. reuse entryindex storage */
538 for(i=0;i<entries;i++)entryindex[i]=1;
539 for(i=0;i<points-entries;i++){
540 int best=_best(b,pointlist+i*dim,1);
541 float *a=pointlist+i*dim;
542 if(!(i&0xff))spinnit("counting hits...",i);
543 if(best==-1){
544 fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
545 "codebook entry. The new decision tree is broken.\n");
546 exit(1);
548 entryindex[best]++;
550 for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
551 for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
553 /* the lengthlist builder doesn't actually deal with 0 hit entries.
554 So, we pack the 'sparse' hit list into a dense list, then unpack
555 the lengths after the build */
557 int upper=0;
558 long *lengthlist=_ogg_calloc(entries,sizeof(long));
559 for(i=0;i<entries;i++){
560 if(b->c->lengthlist[i]>0)
561 entryindex[upper++]=entryindex[i];
562 else{
563 if(entryindex[i]>1){
564 fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
565 exit(1);
570 /* sanity check */
571 if(upper != target){
572 fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
573 exit(1);
576 build_tree_from_lengths(upper,entryindex,lengthlist);
578 upper=0;
579 for(i=0;i<entries;i++){
580 if(b->c->lengthlist[i]>0)
581 b->c->lengthlist[i]=lengthlist[upper++];
586 /* we're done. write it out. */
587 write_codebook(out,basename,b->c);
589 fprintf(stderr,"\r \nDone.\n");
590 return(0);