mySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / myisam / myisampack.c
blob29cdfa9ffd49e76ecabbd148f595e74f60d36411
1 /* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 /* Pack MyISAM file */
19 #ifndef USE_MY_FUNC
20 #define USE_MY_FUNC /* We need at least my_malloc */
21 #endif
23 #include "myisamdef.h"
24 #include <queues.h>
25 #include <my_tree.h>
26 #include "mysys_err.h"
27 #ifdef MSDOS
28 #include <io.h>
29 #endif
30 #ifndef __GNU_LIBRARY__
31 #define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
32 #endif
33 #include <my_getopt.h>
34 #include <assert.h>
36 #if SIZEOF_LONG_LONG > 4
37 #define BITS_SAVED 64
38 #else
39 #define BITS_SAVED 32
40 #endif
42 #define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
43 #define HEAD_LENGTH 32
44 #define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
46 #define DATA_TMP_EXT ".TMD"
47 #define OLD_EXT ".OLD"
48 #define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
50 struct st_file_buffer {
51 File file;
52 uchar *buffer,*pos,*end;
53 my_off_t pos_in_file;
54 int bits;
55 ulonglong bitbucket;
58 struct st_huff_tree;
59 struct st_huff_element;
61 typedef struct st_huff_counts {
62 uint field_length,max_zero_fill;
63 uint pack_type;
64 uint max_end_space,max_pre_space,length_bits,min_space;
65 ulong max_length;
66 enum en_fieldtype field_type;
67 struct st_huff_tree *tree; /* Tree for field */
68 my_off_t counts[256];
69 my_off_t end_space[8];
70 my_off_t pre_space[8];
71 my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
72 TREE int_tree; /* Tree for detecting distinct column values. */
73 uchar *tree_buff; /* Column values, 'field_length' each. */
74 uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
75 } HUFF_COUNTS;
77 typedef struct st_huff_element HUFF_ELEMENT;
80 WARNING: It is crucial for the optimizations in calc_packed_length()
81 that 'count' is the first element of 'HUFF_ELEMENT'.
83 struct st_huff_element {
84 my_off_t count;
85 union un_element {
86 struct st_nod {
87 HUFF_ELEMENT *left,*right;
88 } nod;
89 struct st_leaf {
90 HUFF_ELEMENT *null;
91 uint element_nr; /* Number of element */
92 } leaf;
93 } a;
97 typedef struct st_huff_tree {
98 HUFF_ELEMENT *root,*element_buffer;
99 HUFF_COUNTS *counts;
100 uint tree_number;
101 uint elements;
102 my_off_t bytes_packed;
103 uint tree_pack_length;
104 uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
105 ulonglong *code;
106 uchar *code_len;
107 } HUFF_TREE;
110 typedef struct st_isam_mrg {
111 MI_INFO **file,**current,**end;
112 uint free_file;
113 uint count;
114 uint min_pack_length; /* Theese is used by packed data */
115 uint max_pack_length;
116 uint ref_length;
117 uint max_blob_length;
118 my_off_t records;
119 /* true if at least one source file has at least one disabled index */
120 my_bool src_file_has_indexes_disabled;
121 } PACK_MRG_INFO;
124 extern int main(int argc,char * *argv);
125 static void get_options(int *argc,char ***argv);
126 static MI_INFO *open_isam_file(char *name,int mode);
127 static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
128 static int compress(PACK_MRG_INFO *file,char *join_name);
129 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
130 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
131 uint trees,
132 HUFF_COUNTS *huff_counts,
133 uint fields);
134 static int compare_tree(void* cmp_arg __attribute__((unused)),
135 const uchar *s,const uchar *t);
136 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
137 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
138 my_off_t records);
139 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
140 uint max_space_length,my_off_t *space_counts,
141 my_off_t tot_space_count,
142 enum en_fieldtype field_type);
143 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
144 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
145 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
146 static int save_counts_in_queue(uchar *key,element_count count,
147 HUFF_TREE *tree);
148 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
149 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
150 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
151 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
152 HUFF_ELEMENT *element,uint size,
153 ulonglong code);
154 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
155 my_off_t tot_elements,my_off_t filelength);
156 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
157 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
158 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
159 HUFF_ELEMENT *element,
160 uint *offset);
161 static uint max_bit(uint value);
162 static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
163 static char *make_new_name(char *new_name,char *old_name);
164 static char *make_old_name(char *new_name,char *old_name);
165 static void init_file_buffer(File file,pbool read_buffer);
166 static int flush_buffer(ulong neaded_length);
167 static void end_file_buffer(void);
168 static void write_bits(ulonglong value, uint bits);
169 static void flush_bits(void);
170 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
171 ha_checksum crc);
172 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
173 ha_checksum crc);
174 static int mrg_close(PACK_MRG_INFO *mrg);
175 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
176 static void mrg_reset(PACK_MRG_INFO *mrg);
177 #if !defined(DBUG_OFF)
178 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
179 static int fakecmp(my_off_t **count1, my_off_t **count2);
180 #endif
183 static int error_on_write=0,test_only=0,verbose=0,silent=0,
184 write_loop=0,force_pack=0, isamchk_neaded=0;
185 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
186 static my_bool backup, opt_wait;
188 tree_buff_length is somewhat arbitrary. The bigger it is the better
189 the chance to win in terms of compression factor. On the other hand,
190 this table becomes part of the compressed file header. And its length
191 is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
193 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
194 static char tmp_dir[FN_REFLEN]={0},*join_table;
195 static my_off_t intervall_length;
196 static ha_checksum glob_crc;
197 static struct st_file_buffer file_buffer;
198 static QUEUE queue;
199 static HUFF_COUNTS *global_count;
200 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
201 static const char *load_default_groups[]= { "myisampack",0 };
203 /* The main program */
205 int main(int argc, char **argv)
207 int error,ok;
208 PACK_MRG_INFO merge;
209 char **default_argv;
210 MY_INIT(argv[0]);
212 load_defaults("my",load_default_groups,&argc,&argv);
213 default_argv= argv;
214 get_options(&argc,&argv);
216 error=ok=isamchk_neaded=0;
217 if (join_table)
218 { /* Join files into one */
219 if (open_isam_files(&merge,argv,(uint) argc) ||
220 compress(&merge,join_table))
221 error=1;
223 else while (argc--)
225 MI_INFO *isam_file;
226 if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
227 error=1;
228 else
230 merge.file= &isam_file;
231 merge.current=0;
232 merge.free_file=0;
233 merge.count=1;
234 if (compress(&merge,0))
235 error=1;
236 else
237 ok=1;
240 if (ok && isamchk_neaded && !silent)
241 puts("Remember to run myisamchk -rq on compressed tables");
242 VOID(fflush(stdout));
243 VOID(fflush(stderr));
244 free_defaults(default_argv);
245 my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
246 exit(error ? 2 : 0);
247 #ifndef _lint
248 return 0; /* No compiler warning */
249 #endif
252 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
254 static struct my_option my_long_options[] =
256 #ifdef __NETWARE__
257 {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
258 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
259 #endif
260 {"backup", 'b', "Make a backup of the table as table_name.OLD.",
261 &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
262 {"character-sets-dir", OPT_CHARSETS_DIR_MP,
263 "Directory where character sets are.", &charsets_dir,
264 &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
265 {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
266 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
267 {"force", 'f',
268 "Force packing of table even if it gets bigger or if tempfile exists.",
269 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
270 {"join", 'j',
271 "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
272 &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
273 0, 0, 0},
274 {"help", '?', "Display this help and exit.",
275 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
276 {"silent", 's', "Be more silent.",
277 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
278 {"tmpdir", 'T', "Use temporary directory to store temporary table.",
279 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
280 {"test", 't', "Don't pack table, only test packing it.",
281 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
282 {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
283 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
284 {"version", 'V', "Output version information and exit.",
285 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
286 {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
287 &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
288 { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
291 #include <help_start.h>
293 static void print_version(void)
295 VOID(printf("%s Ver 1.23 for %s on %s\n",
296 my_progname, SYSTEM_TYPE, MACHINE_TYPE));
297 NETWARE_SET_SCREEN_MODE(1);
301 static void usage(void)
303 print_version();
304 puts("Copyright 2002-2008 MySQL AB, 2008 Sun Microsystems, Inc.");
305 puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
306 puts("and you are welcome to modify and redistribute it under the GPL license\n");
308 puts("Pack a MyISAM-table to take much less space.");
309 puts("Keys are not updated, you must run myisamchk -rq on the datafile");
310 puts("afterwards to update the keys.");
311 puts("You should give the .MYI file as the filename argument.");
313 VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
314 my_print_help(my_long_options);
315 print_defaults("my", load_default_groups);
316 my_print_variables(my_long_options);
319 #include <help_end.h>
321 static my_bool
322 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
323 char *argument)
325 uint length;
327 switch(optid) {
328 #ifdef __NETWARE__
329 case OPT_AUTO_CLOSE:
330 setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
331 break;
332 #endif
333 case 'f':
334 force_pack= 1;
335 tmpfile_createflag= O_RDWR | O_TRUNC;
336 break;
337 case 's':
338 write_loop= verbose= 0;
339 silent= 1;
340 break;
341 case 't':
342 test_only= 1;
343 /* Avoid to reset 'verbose' if it was already set > 1. */
344 if (! verbose)
345 verbose= 1;
346 break;
347 case 'T':
348 length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
349 if (length != dirname_length(tmp_dir))
351 tmp_dir[length]=FN_LIBCHAR;
352 tmp_dir[length+1]=0;
354 break;
355 case 'v':
356 verbose++; /* Allow for selecting the level of verbosity. */
357 silent= 0;
358 break;
359 case '#':
360 DBUG_PUSH(argument ? argument : "d:t:o");
361 break;
362 case 'V':
363 print_version();
364 exit(0);
365 case 'I':
366 case '?':
367 usage();
368 exit(0);
370 return 0;
373 /* reads options */
374 /* Initiates DEBUG - but no debugging here ! */
376 static void get_options(int *argc,char ***argv)
378 int ho_error;
380 my_progname= argv[0][0];
381 if (isatty(fileno(stdout)))
382 write_loop=1;
384 if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
385 exit(ho_error);
387 if (!*argc)
389 usage();
390 exit(1);
392 if (join_table)
394 backup=0; /* Not needed */
395 tmp_dir[0]=0;
397 return;
401 static MI_INFO *open_isam_file(char *name,int mode)
403 MI_INFO *isam_file;
404 MYISAM_SHARE *share;
405 DBUG_ENTER("open_isam_file");
407 if (!(isam_file=mi_open(name,mode,
408 (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
409 HA_OPEN_ABORT_IF_LOCKED))))
411 VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
412 DBUG_RETURN(0);
414 share=isam_file->s;
415 if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
417 if (!force_pack)
419 VOID(fprintf(stderr, "%s is already compressed\n", name));
420 VOID(mi_close(isam_file));
421 DBUG_RETURN(0);
423 if (verbose)
424 puts("Recompressing already compressed table");
425 share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
427 if (! force_pack && share->state.state.records != 0 &&
428 (share->state.state.records <= 1 ||
429 share->state.state.data_file_length < 1024))
431 VOID(fprintf(stderr, "%s is too small to compress\n", name));
432 VOID(mi_close(isam_file));
433 DBUG_RETURN(0);
435 VOID(mi_lock_database(isam_file,F_WRLCK));
436 DBUG_RETURN(isam_file);
440 static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
442 uint i,j;
443 mrg->count=0;
444 mrg->current=0;
445 mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
446 mrg->free_file=1;
447 mrg->src_file_has_indexes_disabled= 0;
448 for (i=0; i < count ; i++)
450 if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
451 goto error;
453 mrg->src_file_has_indexes_disabled|=
454 ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
455 mrg->file[i]->s->base.keys);
457 /* Check that files are identical */
458 for (j=0 ; j < count-1 ; j++)
460 MI_COLUMNDEF *m1,*m2,*end;
461 if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
462 mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
463 goto diff_file;
464 m1=mrg->file[j]->s->rec;
465 end=m1+mrg->file[j]->s->base.fields;
466 m2=mrg->file[j+1]->s->rec;
467 for ( ; m1 != end ; m1++,m2++)
469 if (m1->type != m2->type || m1->length != m2->length)
470 goto diff_file;
473 mrg->count=count;
474 return 0;
476 diff_file:
477 VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
478 my_progname, names[j], names[j+1]));
479 error:
480 while (i--)
481 mi_close(mrg->file[i]);
482 my_free((uchar*) mrg->file,MYF(0));
483 return 1;
487 static int compress(PACK_MRG_INFO *mrg,char *result_table)
489 int error;
490 File new_file,join_isam_file;
491 MI_INFO *isam_file;
492 MYISAM_SHARE *share;
493 char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
494 uint i,header_length,fields,trees,used_trees;
495 my_off_t old_length,new_length,tot_elements;
496 HUFF_COUNTS *huff_counts;
497 HUFF_TREE *huff_trees;
498 DBUG_ENTER("compress");
500 isam_file=mrg->file[0]; /* Take this as an example */
501 share=isam_file->s;
502 new_file=join_isam_file= -1;
503 trees=fields=0;
504 huff_trees=0;
505 huff_counts=0;
507 /* Create temporary or join file */
509 if (backup)
510 VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
511 else
512 VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
513 if (!test_only && result_table)
515 /* Make a new indexfile based on first file in list */
516 uint length;
517 uchar *buff;
518 strmov(org_name,result_table); /* Fix error messages */
519 VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
520 if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
521 < 0)
522 goto err;
523 length=(uint) share->base.keystart;
524 if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
525 goto err;
526 if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
527 my_write(join_isam_file,buff,length,
528 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
530 my_free(buff,MYF(0));
531 goto err;
533 my_free(buff,MYF(0));
534 VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
536 else if (!tmp_dir[0])
537 VOID(make_new_name(new_name,org_name));
538 else
539 VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
540 if (!test_only &&
541 (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
542 goto err;
544 /* Start calculating statistics */
546 mrg->records=0;
547 for (i=0 ; i < mrg->count ; i++)
548 mrg->records+=mrg->file[i]->s->state.state.records;
550 DBUG_PRINT("info", ("Compressing %s: (%lu records)",
551 result_table ? new_name : org_name,
552 (ulong) mrg->records));
553 if (write_loop || verbose)
555 VOID(printf("Compressing %s: (%lu records)\n",
556 result_table ? new_name : org_name, (ulong) mrg->records));
558 trees=fields=share->base.fields;
559 huff_counts=init_huff_count(isam_file,mrg->records);
560 QUICK_SAFEMALLOC;
563 Read the whole data file(s) for statistics.
565 DBUG_PRINT("info", ("- Calculating statistics"));
566 if (write_loop || verbose)
567 VOID(printf("- Calculating statistics\n"));
568 if (get_statistic(mrg,huff_counts))
569 goto err;
570 NORMAL_SAFEMALLOC;
571 old_length=0;
572 for (i=0; i < mrg->count ; i++)
573 old_length+= (mrg->file[i]->s->state.state.data_file_length -
574 mrg->file[i]->s->state.state.empty);
577 Create a global priority queue in preparation for making
578 temporary Huffman trees.
580 if (init_queue(&queue,256,0,0,compare_huff_elements,0))
581 goto err;
584 Check each column if we should use pre-space-compress, end-space-
585 compress, empty-field-compress or zero-field-compress.
587 check_counts(huff_counts,fields,mrg->records);
590 Build a Huffman tree for each column.
592 huff_trees=make_huff_trees(huff_counts,trees);
595 If the packed lengths of combined columns is less then the sum of
596 the non-combined columns, then create common Huffman trees for them.
597 We do this only for byte compressed columns, not for distinct values
598 compressed columns.
600 if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
601 goto err;
604 Assign codes to all byte or column values.
606 if (make_huff_decode_table(huff_trees,fields))
607 goto err;
609 /* Prepare a file buffer. */
610 init_file_buffer(new_file,0);
613 Reserve space in the target file for the fixed compressed file header.
615 file_buffer.pos_in_file=HEAD_LENGTH;
616 if (! test_only)
617 VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
620 Write field infos: field type, pack type, length bits, tree number.
622 write_field_info(huff_counts,fields,used_trees);
625 Write decode trees.
627 if (!(tot_elements=write_huff_tree(huff_trees,trees)))
628 goto err;
631 Calculate the total length of the compression info header.
632 This includes the fixed compressed file header, the column compression
633 type descriptions, and the decode trees.
635 header_length=(uint) file_buffer.pos_in_file+
636 (uint) (file_buffer.pos-file_buffer.buffer);
639 Compress the source file into the target file.
641 DBUG_PRINT("info", ("- Compressing file"));
642 if (write_loop || verbose)
643 VOID(printf("- Compressing file\n"));
644 error=compress_isam_file(mrg,huff_counts);
645 new_length=file_buffer.pos_in_file;
646 if (!error && !test_only)
648 uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
649 bzero(buff,sizeof(buff));
650 error=my_write(file_buffer.file,buff,sizeof(buff),
651 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
655 Write the fixed compressed file header.
657 if (!error)
658 error=write_header(mrg,header_length,used_trees,tot_elements,
659 new_length);
661 /* Flush the file buffer. */
662 end_file_buffer();
664 /* Display statistics. */
665 DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
666 "Mean total length: %6ld\n",
667 mrg->min_pack_length, mrg->max_pack_length,
668 (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
669 if (verbose && mrg->records)
670 VOID(printf("Min record length: %6d Max length: %6d "
671 "Mean total length: %6ld\n", mrg->min_pack_length,
672 mrg->max_pack_length, (ulong) (new_length/mrg->records)));
674 /* Close source and target file. */
675 if (!test_only)
677 error|=my_close(new_file,MYF(MY_WME));
678 if (!result_table)
680 error|=my_close(isam_file->dfile,MYF(MY_WME));
681 isam_file->dfile= -1; /* Tell mi_close file is closed */
685 /* Cleanup. */
686 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
687 if (! test_only && ! error)
689 if (result_table)
691 error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
693 else
695 if (backup)
697 if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
698 MYF(MY_WME)))
699 error=1;
700 else
702 if (tmp_dir[0])
703 error=my_copy(new_name,org_name,MYF(MY_WME));
704 else
705 error=my_rename(new_name,org_name,MYF(MY_WME));
706 if (!error)
708 VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
709 if (tmp_dir[0])
710 VOID(my_delete(new_name,MYF(MY_WME)));
714 else
716 if (tmp_dir[0])
718 error=my_copy(new_name,org_name,
719 MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
720 if (!error)
721 VOID(my_delete(new_name,MYF(MY_WME)));
723 else
724 error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
726 if (! error)
727 error=save_state(isam_file,mrg,new_length,glob_crc);
730 error|=mrg_close(mrg);
731 if (join_isam_file >= 0)
732 error|=my_close(join_isam_file,MYF(MY_WME));
733 if (error)
735 VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
736 VOID(my_delete(new_name,MYF(MY_WME)));
737 DBUG_RETURN(-1);
739 if (write_loop || verbose)
741 if (old_length)
742 VOID(printf("%.4g%% \n",
743 (((longlong) (old_length - new_length)) * 100.0 /
744 (longlong) old_length)));
745 else
746 puts("Empty file saved in compressed format");
748 DBUG_RETURN(0);
750 err:
751 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
752 if (new_file >= 0)
753 VOID(my_close(new_file,MYF(0)));
754 if (join_isam_file >= 0)
755 VOID(my_close(join_isam_file,MYF(0)));
756 mrg_close(mrg);
757 VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
758 DBUG_RETURN(-1);
761 /* Init a huff_count-struct for each field and init it */
763 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
765 reg2 uint i;
766 reg1 HUFF_COUNTS *count;
767 if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
768 sizeof(HUFF_COUNTS),
769 MYF(MY_ZEROFILL | MY_WME))))
771 for (i=0 ; i < info->s->base.fields ; i++)
773 enum en_fieldtype type;
774 count[i].field_length=info->s->rec[i].length;
775 type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
776 if (type == FIELD_INTERVALL ||
777 type == FIELD_CONSTANT ||
778 type == FIELD_ZERO)
779 type = FIELD_NORMAL;
780 if (count[i].field_length <= 8 &&
781 (type == FIELD_NORMAL ||
782 type == FIELD_SKIP_ZERO))
783 count[i].max_zero_fill= count[i].field_length;
785 For every column initialize a tree, which is used to detect distinct
786 column values. 'int_tree' works together with 'tree_buff' and
787 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
788 This is accomplished by '-1' as the element size.
790 init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
791 NULL);
792 if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
793 count[i].tree_pos=count[i].tree_buff =
794 my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
795 MYF(MY_WME));
798 return count;
802 /* Free memory used by counts and trees */
804 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
805 HUFF_COUNTS *huff_counts,
806 uint fields)
808 register uint i;
810 if (huff_trees)
812 for (i=0 ; i < trees ; i++)
814 if (huff_trees[i].element_buffer)
815 my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
816 if (huff_trees[i].code)
817 my_free((uchar*) huff_trees[i].code,MYF(0));
819 my_free((uchar*) huff_trees,MYF(0));
821 if (huff_counts)
823 for (i=0 ; i < fields ; i++)
825 if (huff_counts[i].tree_buff)
827 my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
828 delete_tree(&huff_counts[i].int_tree);
831 my_free((uchar*) huff_counts,MYF(0));
833 delete_queue(&queue); /* This is safe to free */
834 return;
837 /* Read through old file and gather some statistics */
839 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
841 int error;
842 uint length;
843 ulong reclength,max_blob_length;
844 uchar *record,*pos,*next_pos,*end_pos,*start_pos;
845 ha_rows record_count;
846 my_bool static_row_size;
847 HUFF_COUNTS *count,*end_count;
848 TREE_ELEMENT *element;
849 DBUG_ENTER("get_statistic");
851 reclength=mrg->file[0]->s->base.reclength;
852 record=(uchar*) my_alloca(reclength);
853 end_count=huff_counts+mrg->file[0]->s->base.fields;
854 record_count=0; glob_crc=0;
855 max_blob_length=0;
857 /* Check how to calculate checksum */
858 static_row_size=1;
859 for (count=huff_counts ; count < end_count ; count++)
861 if (count->field_type == FIELD_BLOB ||
862 count->field_type == FIELD_VARCHAR)
864 static_row_size=0;
865 break;
869 mrg_reset(mrg);
870 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
872 ulong tot_blob_length=0;
873 if (! error)
875 /* glob_crc is a checksum over all bytes of all records. */
876 if (static_row_size)
877 glob_crc+=mi_static_checksum(mrg->file[0],record);
878 else
879 glob_crc+=mi_checksum(mrg->file[0],record);
881 /* Count the incidence of values separately for every column. */
882 for (pos=record,count=huff_counts ;
883 count < end_count ;
884 count++,
885 pos=next_pos)
887 next_pos=end_pos=(start_pos=pos)+count->field_length;
890 Put the whole column value in a tree if there is room for it.
891 'int_tree' is used to quickly check for duplicate values.
892 'tree_buff' collects as many distinct column values as
893 possible. If the field length is > 1, it is tree_buff_length,
894 else 2 bytes. Each value is 'field_length' bytes big. If there
895 are more distinct column values than fit into the buffer, we
896 give up with this tree. BLOBs and VARCHARs do not have a
897 tree_buff as it can only be used with fixed length columns.
898 For the special case of field length == 1, we handle only the
899 case that there is only one distinct value in the table(s).
900 Otherwise, we can have a maximum of 256 distinct values. This
901 is then handled by the normal Huffman tree build.
903 Another limit for collecting distinct column values is the
904 number of values itself. Since we would need to build a
905 Huffman tree for the values, we are limited by the 'IS_OFFSET'
906 constant. This constant expresses a bit which is used to
907 determine if a tree element holds a final value or an offset
908 to a child element. Hence, all values and offsets need to be
909 smaller than 'IS_OFFSET'. A tree element is implemented with
910 two integer values, one for the left branch and one for the
911 right branch. For the extreme case that the first element
912 points to the last element, the number of integers in the tree
913 must be less or equal to IS_OFFSET. So the number of elements
914 must be less or equal to IS_OFFSET / 2.
916 WARNING: At first, we insert a pointer into the record buffer
917 as the key for the tree. If we got a new distinct value, which
918 is really inserted into the tree, instead of being counted
919 only, we will copy the column value from the record buffer to
920 'tree_buff' and adjust the key pointer of the tree accordingly.
922 if (count->tree_buff)
924 global_count=count;
925 if (!(element=tree_insert(&count->int_tree,pos, 0,
926 count->int_tree.custom_arg)) ||
927 (element->count == 1 &&
928 (count->tree_buff + tree_buff_length <
929 count->tree_pos + count->field_length)) ||
930 (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
931 (count->field_length == 1 &&
932 count->int_tree.elements_in_tree > 1))
934 delete_tree(&count->int_tree);
935 my_free(count->tree_buff,MYF(0));
936 count->tree_buff=0;
938 else
941 If tree_insert() succeeds, it either creates a new element
942 or increments the counter of an existing element.
944 if (element->count == 1)
946 /* Copy the new column value into 'tree_buff'. */
947 memcpy(count->tree_pos,pos,(size_t) count->field_length);
948 /* Adjust the key pointer in the tree. */
949 tree_set_pointer(element,count->tree_pos);
950 /* Point behind the last column value so far. */
951 count->tree_pos+=count->field_length;
956 /* Save character counters and space-counts and zero-field-counts */
957 if (count->field_type == FIELD_NORMAL ||
958 count->field_type == FIELD_SKIP_ENDSPACE)
960 /* Ignore trailing space. */
961 for ( ; end_pos > pos ; end_pos--)
962 if (end_pos[-1] != ' ')
963 break;
964 /* Empty fields are just counted. Go to the next record. */
965 if (end_pos == pos)
967 count->empty_fields++;
968 count->max_zero_fill=0;
969 continue;
972 Count the total of all trailing spaces and the number of
973 short trailing spaces. Remember the longest trailing space.
975 length= (uint) (next_pos-end_pos);
976 count->tot_end_space+=length;
977 if (length < 8)
978 count->end_space[length]++;
979 if (count->max_end_space < length)
980 count->max_end_space = length;
983 if (count->field_type == FIELD_NORMAL ||
984 count->field_type == FIELD_SKIP_PRESPACE)
986 /* Ignore leading space. */
987 for (pos=start_pos; pos < end_pos ; pos++)
988 if (pos[0] != ' ')
989 break;
990 /* Empty fields are just counted. Go to the next record. */
991 if (end_pos == pos)
993 count->empty_fields++;
994 count->max_zero_fill=0;
995 continue;
998 Count the total of all leading spaces and the number of
999 short leading spaces. Remember the longest leading space.
1001 length= (uint) (pos-start_pos);
1002 count->tot_pre_space+=length;
1003 if (length < 8)
1004 count->pre_space[length]++;
1005 if (count->max_pre_space < length)
1006 count->max_pre_space = length;
1009 /* Calculate pos, end_pos, and max_length for variable length fields. */
1010 if (count->field_type == FIELD_BLOB)
1012 uint field_length=count->field_length -portable_sizeof_char_ptr;
1013 ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1014 memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
1015 end_pos=pos+blob_length;
1016 tot_blob_length+=blob_length;
1017 set_if_bigger(count->max_length,blob_length);
1019 else if (count->field_type == FIELD_VARCHAR)
1021 uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1022 length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1023 uint2korr(start_pos));
1024 pos= start_pos+pack_length;
1025 end_pos= pos+length;
1026 set_if_bigger(count->max_length,length);
1029 /* Evaluate 'max_zero_fill' for short fields. */
1030 if (count->field_length <= 8 &&
1031 (count->field_type == FIELD_NORMAL ||
1032 count->field_type == FIELD_SKIP_ZERO))
1034 uint i;
1035 /* Zero fields are just counted. Go to the next record. */
1036 if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1038 count->zero_fields++;
1039 continue;
1042 max_zero_fill starts with field_length. It is decreased every
1043 time a shorter "zero trailer" is found. It is set to zero when
1044 an empty field is found (see above). This suggests that the
1045 variable should be called 'min_zero_fill'.
1047 for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1048 i++) ;
1049 if (i < count->max_zero_fill)
1050 count->max_zero_fill=i;
1053 /* Ignore zero fields and check fields. */
1054 if (count->field_type == FIELD_ZERO ||
1055 count->field_type == FIELD_CHECK)
1056 continue;
1059 Count the incidence of every byte value in the
1060 significant field value.
1062 for ( ; pos < end_pos ; pos++)
1063 count->counts[(uchar) *pos]++;
1065 /* Step to next field. */
1068 if (tot_blob_length > max_blob_length)
1069 max_blob_length=tot_blob_length;
1070 record_count++;
1071 if (write_loop && record_count % WRITE_COUNT == 0)
1073 VOID(printf("%lu\r", (ulong) record_count));
1074 VOID(fflush(stdout));
1077 else if (error != HA_ERR_RECORD_DELETED)
1079 VOID(fprintf(stderr, "Got error %d while reading rows", error));
1080 break;
1083 /* Step to next record. */
1085 if (write_loop)
1087 VOID(printf(" \r"));
1088 VOID(fflush(stdout));
1092 If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1093 codes.
1095 DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1097 DBUG_PRINT("info", ("Found the following number of incidents "
1098 "of the byte codes:"));
1099 if (verbose >= 2)
1100 VOID(printf("Found the following number of incidents "
1101 "of the byte codes:\n"));
1102 for (count= huff_counts ; count < end_count; count++)
1104 uint idx;
1105 my_off_t total_count;
1106 char llbuf[32];
1108 DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1109 if (verbose >= 2)
1110 VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1111 if (count->tree_buff)
1113 DBUG_PRINT("info", ("number of distinct values: %u",
1114 (uint) ((count->tree_pos - count->tree_buff) /
1115 count->field_length)));
1116 if (verbose >= 2)
1117 VOID(printf("number of distinct values: %u\n",
1118 (uint) ((count->tree_pos - count->tree_buff) /
1119 count->field_length)));
1121 total_count= 0;
1122 for (idx= 0; idx < 256; idx++)
1124 if (count->counts[idx])
1126 total_count+= count->counts[idx];
1127 DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1128 llstr((longlong) count->counts[idx], llbuf)));
1129 if (verbose >= 2)
1130 VOID(printf("counts[0x%02x]: %12s\n", idx,
1131 llstr((longlong) count->counts[idx], llbuf)));
1134 DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1135 llbuf)));
1136 if ((verbose >= 2) && total_count)
1138 VOID(printf("total: %12s\n",
1139 llstr((longlong) total_count, llbuf)));
1143 mrg->records=record_count;
1144 mrg->max_blob_length=max_blob_length;
1145 my_afree((uchar*) record);
1146 DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1149 static int compare_huff_elements(void *not_used __attribute__((unused)),
1150 uchar *a, uchar *b)
1152 return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1153 (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1156 /* Check each tree if we should use pre-space-compress, end-space-
1157 compress, empty-field-compress or zero-field-compress */
1159 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1160 my_off_t records)
1162 uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1163 my_off_t old_length,new_length,length;
1164 DBUG_ENTER("check_counts");
1166 bzero((uchar*) field_count,sizeof(field_count));
1167 space_fields=fill_zero_fields=0;
1169 for (; trees-- ; huff_counts++)
1171 if (huff_counts->field_type == FIELD_BLOB)
1173 huff_counts->length_bits=max_bit(huff_counts->max_length);
1174 goto found_pack;
1176 else if (huff_counts->field_type == FIELD_VARCHAR)
1178 huff_counts->length_bits=max_bit(huff_counts->max_length);
1179 goto found_pack;
1181 else if (huff_counts->field_type == FIELD_CHECK)
1183 huff_counts->bytes_packed=0;
1184 huff_counts->counts[0]=0;
1185 goto found_pack;
1188 huff_counts->field_type=FIELD_NORMAL;
1189 huff_counts->pack_type=0;
1191 /* Check for zero-filled records (in this column), or zero records. */
1192 if (huff_counts->zero_fields || ! records)
1194 my_off_t old_space_count;
1196 If there are only zero filled records (in this column),
1197 or no records at all, we are done.
1199 if (huff_counts->zero_fields == records)
1201 huff_counts->field_type= FIELD_ZERO;
1202 huff_counts->bytes_packed=0;
1203 huff_counts->counts[0]=0;
1204 goto found_pack;
1206 /* Remeber the number of significant spaces. */
1207 old_space_count=huff_counts->counts[' '];
1208 /* Add all leading and trailing spaces. */
1209 huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1210 huff_counts->tot_pre_space +
1211 huff_counts->empty_fields *
1212 huff_counts->field_length);
1213 /* Check, what the compressed length of this would be. */
1214 old_length=calc_packed_length(huff_counts,0)+records/8;
1215 /* Get the number of zero bytes. */
1216 length=huff_counts->zero_fields*huff_counts->field_length;
1217 /* Add it to the counts. */
1218 huff_counts->counts[0]+=length;
1219 /* Check, what the compressed length of this would be. */
1220 new_length=calc_packed_length(huff_counts,0);
1221 /* If the compression without the zeroes would be shorter, we are done. */
1222 if (old_length < new_length && huff_counts->field_length > 1)
1224 huff_counts->field_type=FIELD_SKIP_ZERO;
1225 huff_counts->counts[0]-=length;
1226 huff_counts->bytes_packed=old_length- records/8;
1227 goto found_pack;
1229 /* Remove the insignificant spaces, but keep the zeroes. */
1230 huff_counts->counts[' ']=old_space_count;
1232 /* Check, what the compressed length of this column would be. */
1233 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1236 If there are enough empty records (in this column),
1237 treating them specially may pay off.
1239 if (huff_counts->empty_fields)
1241 if (huff_counts->field_length > 2 &&
1242 huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1243 (1+max_bit(max(huff_counts->max_pre_space,
1244 huff_counts->max_end_space))) <
1245 records * max_bit(huff_counts->field_length))
1247 huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1249 else
1251 length=huff_counts->empty_fields*huff_counts->field_length;
1252 if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1254 huff_counts->tot_end_space+=length;
1255 huff_counts->max_end_space=huff_counts->field_length;
1256 if (huff_counts->field_length < 8)
1257 huff_counts->end_space[huff_counts->field_length]+=
1258 huff_counts->empty_fields;
1260 if (huff_counts->tot_pre_space)
1262 huff_counts->tot_pre_space+=length;
1263 huff_counts->max_pre_space=huff_counts->field_length;
1264 if (huff_counts->field_length < 8)
1265 huff_counts->pre_space[huff_counts->field_length]+=
1266 huff_counts->empty_fields;
1272 If there are enough trailing spaces (in this column),
1273 treating them specially may pay off.
1275 if (huff_counts->tot_end_space)
1277 huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1278 if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1279 huff_counts->end_space,
1280 huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1281 goto found_pack;
1282 huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1286 If there are enough leading spaces (in this column),
1287 treating them specially may pay off.
1289 if (huff_counts->tot_pre_space)
1291 if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1292 huff_counts->pre_space,
1293 huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1294 goto found_pack;
1297 found_pack: /* Found field-packing */
1299 /* Test if we can use zero-fill */
1301 if (huff_counts->max_zero_fill &&
1302 (huff_counts->field_type == FIELD_NORMAL ||
1303 huff_counts->field_type == FIELD_SKIP_ZERO))
1305 huff_counts->counts[0]-=huff_counts->max_zero_fill*
1306 (huff_counts->field_type == FIELD_SKIP_ZERO ?
1307 records - huff_counts->zero_fields : records);
1308 huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1309 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1312 /* Test if intervall-field is better */
1314 if (huff_counts->tree_buff)
1316 HUFF_TREE tree;
1318 DBUG_EXECUTE_IF("forceintervall",
1319 huff_counts->bytes_packed= ~ (my_off_t) 0;);
1320 tree.element_buffer=0;
1321 if (!make_huff_tree(&tree,huff_counts) &&
1322 tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1324 if (tree.elements == 1)
1325 huff_counts->field_type=FIELD_CONSTANT;
1326 else
1327 huff_counts->field_type=FIELD_INTERVALL;
1328 huff_counts->pack_type=0;
1330 else
1332 my_free((uchar*) huff_counts->tree_buff,MYF(0));
1333 delete_tree(&huff_counts->int_tree);
1334 huff_counts->tree_buff=0;
1336 if (tree.element_buffer)
1337 my_free((uchar*) tree.element_buffer,MYF(0));
1339 if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1340 space_fields++;
1341 if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1342 fill_zero_fields++;
1343 field_count[huff_counts->field_type]++;
1345 DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1346 "empty-zero: %3d empty-fill: %3d",
1347 field_count[FIELD_NORMAL],space_fields,
1348 field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1349 DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1350 "intervall-fields: %3d zero: %3d",
1351 field_count[FIELD_SKIP_PRESPACE],
1352 field_count[FIELD_SKIP_ENDSPACE],
1353 field_count[FIELD_INTERVALL],
1354 field_count[FIELD_ZERO]));
1355 if (verbose)
1356 VOID(printf("\nnormal: %3d empty-space: %3d "
1357 "empty-zero: %3d empty-fill: %3d\n"
1358 "pre-space: %3d end-space: %3d "
1359 "intervall-fields: %3d zero: %3d\n",
1360 field_count[FIELD_NORMAL],space_fields,
1361 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1362 field_count[FIELD_SKIP_PRESPACE],
1363 field_count[FIELD_SKIP_ENDSPACE],
1364 field_count[FIELD_INTERVALL],
1365 field_count[FIELD_ZERO]));
1366 DBUG_VOID_RETURN;
1369 /* Test if we can use space-compression and empty-field-compression */
1371 static int
1372 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1373 uint max_space_length, my_off_t *space_counts,
1374 my_off_t tot_space_count, enum en_fieldtype field_type)
1376 int min_pos;
1377 uint length_bits,i;
1378 my_off_t space_count,min_space_count,min_pack,new_length,skip;
1380 length_bits=max_bit(max_space_length);
1382 /* Default no end_space-packing */
1383 space_count=huff_counts->counts[(uint) ' '];
1384 min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1385 min_pack=calc_packed_length(huff_counts,0);
1386 min_pos= -2;
1387 huff_counts->counts[(uint) ' ']=space_count;
1389 /* Test with allways space-count */
1390 new_length=huff_counts->bytes_packed+length_bits*records/8;
1391 if (new_length+1 < min_pack)
1393 min_pos= -1;
1394 min_pack=new_length;
1395 min_space_count=space_count;
1397 /* Test with length-flag */
1398 for (skip=0L, i=0 ; i < 8 ; i++)
1400 if (space_counts[i])
1402 if (i)
1403 huff_counts->counts[(uint) ' ']+=space_counts[i];
1404 skip+=huff_counts->pre_space[i];
1405 new_length=calc_packed_length(huff_counts,0)+
1406 (records+(records-skip)*(1+length_bits))/8;
1407 if (new_length < min_pack)
1409 min_pos=(int) i;
1410 min_pack=new_length;
1411 min_space_count=huff_counts->counts[(uint) ' '];
1416 huff_counts->counts[(uint) ' ']=min_space_count;
1417 huff_counts->bytes_packed=min_pack;
1418 switch (min_pos) {
1419 case -2:
1420 return(0); /* No space-compress */
1421 case -1: /* Always space-count */
1422 huff_counts->field_type=field_type;
1423 huff_counts->min_space=0;
1424 huff_counts->length_bits=max_bit(max_space_length);
1425 break;
1426 default:
1427 huff_counts->field_type=field_type;
1428 huff_counts->min_space=(uint) min_pos;
1429 huff_counts->pack_type|=PACK_TYPE_SELECTED;
1430 huff_counts->length_bits=max_bit(max_space_length);
1431 break;
1433 return(1); /* Using space-compress */
1437 /* Make a huff_tree of each huff_count */
1439 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1441 uint tree;
1442 HUFF_TREE *huff_tree;
1443 DBUG_ENTER("make_huff_trees");
1445 if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1446 MYF(MY_WME | MY_ZEROFILL))))
1447 DBUG_RETURN(0);
1449 for (tree=0 ; tree < trees ; tree++)
1451 if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1453 while (tree--)
1454 my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1455 my_free((uchar*) huff_tree,MYF(0));
1456 DBUG_RETURN(0);
1459 DBUG_RETURN(huff_tree);
1463 Build a Huffman tree.
1465 SYNOPSIS
1466 make_huff_tree()
1467 huff_tree The Huffman tree.
1468 huff_counts The counts.
1470 DESCRIPTION
1471 Build a Huffman tree according to huff_counts->counts or
1472 huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1473 tree_buff_length of distinct column values. In that case, whole
1474 values can be Huffman encoded instead of single bytes.
1476 RETURN
1477 0 OK
1478 != 0 Error
1481 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1483 uint i,found,bits_packed,first,last;
1484 my_off_t bytes_packed;
1485 HUFF_ELEMENT *a,*b,*new_huff_el;
1487 first=last=0;
1488 if (huff_counts->tree_buff)
1490 /* Calculate the number of distinct values in tree_buff. */
1491 found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1492 huff_counts->field_length;
1493 first=0; last=found-1;
1495 else
1497 /* Count the number of byte codes found in the column. */
1498 for (i=found=0 ; i < 256 ; i++)
1500 if (huff_counts->counts[i])
1502 if (! found++)
1503 first=i;
1504 last=i;
1507 if (found < 2)
1508 found=2;
1511 /* When using 'tree_buff' we can have more that 256 values. */
1512 if (queue.max_elements < found)
1514 delete_queue(&queue);
1515 if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1516 return -1;
1519 /* Allocate or reallocate an element buffer for the Huffman tree. */
1520 if (!huff_tree->element_buffer)
1522 if (!(huff_tree->element_buffer=
1523 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1524 return 1;
1526 else
1528 HUFF_ELEMENT *temp;
1529 if (!(temp=
1530 (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1531 found*2*sizeof(HUFF_ELEMENT),
1532 MYF(MY_WME))))
1533 return 1;
1534 huff_tree->element_buffer=temp;
1537 huff_counts->tree=huff_tree;
1538 huff_tree->counts=huff_counts;
1539 huff_tree->min_chr=first;
1540 huff_tree->max_chr=last;
1541 huff_tree->char_bits=max_bit(last-first);
1542 huff_tree->offset_bits=max_bit(found-1)+1;
1544 if (huff_counts->tree_buff)
1546 huff_tree->elements=0;
1547 huff_tree->tree_pack_length=(1+15+16+5+5+
1548 (huff_tree->char_bits+1)*found+
1549 (huff_tree->offset_bits+1)*
1550 (found-2)+7)/8 +
1551 (uint) (huff_tree->counts->tree_pos-
1552 huff_tree->counts->tree_buff);
1554 Put a HUFF_ELEMENT into the queue for every distinct column value.
1556 tree_walk() calls save_counts_in_queue() for every element in
1557 'int_tree'. This takes elements from the target trees element
1558 buffer and places references to them into the buffer of the
1559 priority queue. We insert in column value order, but the order is
1560 in fact irrelevant here. We will establish the correct order
1561 later.
1563 tree_walk(&huff_counts->int_tree,
1564 (int (*)(void*, element_count,void*)) save_counts_in_queue,
1565 (uchar*) huff_tree, left_root_right);
1567 else
1569 huff_tree->elements=found;
1570 huff_tree->tree_pack_length=(9+9+5+5+
1571 (huff_tree->char_bits+1)*found+
1572 (huff_tree->offset_bits+1)*
1573 (found-2)+7)/8;
1575 Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1577 The elements are taken from the target trees element buffer.
1578 Instead of using queue_insert(), we just place references to the
1579 elements into the buffer of the priority queue. We insert in byte
1580 value order, but the order is in fact irrelevant here. We will
1581 establish the correct order later.
1583 for (i=first, found=0 ; i <= last ; i++)
1585 if (huff_counts->counts[i])
1587 new_huff_el=huff_tree->element_buffer+(found++);
1588 new_huff_el->count=huff_counts->counts[i];
1589 new_huff_el->a.leaf.null=0;
1590 new_huff_el->a.leaf.element_nr=i;
1591 queue.root[found]=(uchar*) new_huff_el;
1595 If there is only a single byte value in this field in all records,
1596 add a second element with zero incidence. This is required to enter
1597 the loop, which builds the Huffman tree.
1599 while (found < 2)
1601 new_huff_el=huff_tree->element_buffer+(found++);
1602 new_huff_el->count=0;
1603 new_huff_el->a.leaf.null=0;
1604 if (last)
1605 new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1606 else
1607 new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1608 queue.root[found]=(uchar*) new_huff_el;
1612 /* Make a queue from the queue buffer. */
1613 queue.elements=found;
1616 Make a priority queue from the queue. Construct its index so that we
1617 have a partially ordered tree.
1619 for (i=found/2 ; i > 0 ; i--)
1620 _downheap(&queue,i);
1622 /* The Huffman algorithm. */
1623 bytes_packed=0; bits_packed=0;
1624 for (i=1 ; i < found ; i++)
1627 Pop the top element from the queue (the one with the least incidence).
1628 Popping from a priority queue includes a re-ordering of the queue,
1629 to get the next least incidence element to the top.
1631 a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1633 Copy the next least incidence element. The queue implementation
1634 reserves root[0] for temporary purposes. root[1] is the top.
1636 b=(HUFF_ELEMENT*) queue.root[1];
1637 /* Get a new element from the element buffer. */
1638 new_huff_el=huff_tree->element_buffer+found+i;
1639 /* The new element gets the sum of the two least incidence elements. */
1640 new_huff_el->count=a->count+b->count;
1642 The Huffman algorithm assigns another bit to the code for a byte
1643 every time that bytes incidence is combined (directly or indirectly)
1644 to a new element as one of the two least incidence elements.
1645 This means that one more bit per incidence of that byte is required
1646 in the resulting file. So we add the new combined incidence as the
1647 number of bits by which the result grows.
1649 bits_packed+=(uint) (new_huff_el->count & 7);
1650 bytes_packed+=new_huff_el->count/8;
1651 /* The new element points to its children, lesser in left. */
1652 new_huff_el->a.nod.left=a;
1653 new_huff_el->a.nod.right=b;
1655 Replace the copied top element by the new element and re-order the
1656 queue.
1658 queue.root[1]=(uchar*) new_huff_el;
1659 queue_replaced(&queue);
1661 huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1662 huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1663 return 0;
1666 static int compare_tree(void* cmp_arg __attribute__((unused)),
1667 register const uchar *s, register const uchar *t)
1669 uint length;
1670 for (length=global_count->field_length; length-- ;)
1671 if (*s++ != *t++)
1672 return (int) s[-1] - (int) t[-1];
1673 return 0;
1677 Organize distinct column values and their incidences into a priority queue.
1679 SYNOPSIS
1680 save_counts_in_queue()
1681 key The column value.
1682 count The incidence of this value.
1683 tree The Huffman tree to be built later.
1685 DESCRIPTION
1686 We use the element buffer of the targeted tree. The distinct column
1687 values are organized in a priority queue first. The Huffman
1688 algorithm will later organize the elements into a Huffman tree. For
1689 the time being, we just place references to the elements into the
1690 queue buffer. The buffer will later be organized into a priority
1691 queue.
1693 RETURN
1697 static int save_counts_in_queue(uchar *key, element_count count,
1698 HUFF_TREE *tree)
1700 HUFF_ELEMENT *new_huff_el;
1702 new_huff_el=tree->element_buffer+(tree->elements++);
1703 new_huff_el->count=count;
1704 new_huff_el->a.leaf.null=0;
1705 new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1706 tree->counts->field_length;
1707 queue.root[tree->elements]=(uchar*) new_huff_el;
1708 return 0;
1713 Calculate length of file if given counts should be used.
1715 SYNOPSIS
1716 calc_packed_length()
1717 huff_counts The counts for a column of the table(s).
1718 add_tree_lenght If the decode tree length should be added.
1720 DESCRIPTION
1721 We need to follow the Huffman algorithm until we know, how many bits
1722 are required for each byte code. But we do not need the resulting
1723 Huffman tree. Hence, we can leave out some steps which are essential
1724 in make_huff_tree().
1726 RETURN
1727 Number of bytes required to compress this table column.
1730 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1731 uint add_tree_lenght)
1733 uint i,found,bits_packed,first,last;
1734 my_off_t bytes_packed;
1735 HUFF_ELEMENT element_buffer[256];
1736 DBUG_ENTER("calc_packed_length");
1739 WARNING: We use a small hack for efficiency: Instead of placing
1740 references to HUFF_ELEMENTs into the queue, we just insert
1741 references to the counts of the byte codes which appeared in this
1742 table column. During the Huffman algorithm they are successively
1743 replaced by references to HUFF_ELEMENTs. This works, because
1744 HUFF_ELEMENTs have the incidence count at their beginning.
1745 Regardless, wether the queue array contains references to counts of
1746 type my_off_t or references to HUFF_ELEMENTs which have the count of
1747 type my_off_t at their beginning, it always points to a count of the
1748 same type.
1750 Instead of using queue_insert(), we just copy the references into
1751 the buffer of the priority queue. We insert in byte value order, but
1752 the order is in fact irrelevant here. We will establish the correct
1753 order later.
1755 first=last=0;
1756 for (i=found=0 ; i < 256 ; i++)
1758 if (huff_counts->counts[i])
1760 if (! found++)
1761 first=i;
1762 last=i;
1763 /* We start with root[1], which is the queues top element. */
1764 queue.root[found]=(uchar*) &huff_counts->counts[i];
1767 if (!found)
1768 DBUG_RETURN(0); /* Empty tree */
1770 If there is only a single byte value in this field in all records,
1771 add a second element with zero incidence. This is required to enter
1772 the loop, which follows the Huffman algorithm.
1774 if (found < 2)
1775 queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1777 /* Make a queue from the queue buffer. */
1778 queue.elements=found;
1780 bytes_packed=0; bits_packed=0;
1781 /* Add the length of the coding table, which would become part of the file. */
1782 if (add_tree_lenght)
1783 bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1784 (max_bit(found-1)+1+1)*(found-2) +7)/8;
1787 Make a priority queue from the queue. Construct its index so that we
1788 have a partially ordered tree.
1790 for (i=(found+1)/2 ; i > 0 ; i--)
1791 _downheap(&queue,i);
1793 /* The Huffman algorithm. */
1794 for (i=0 ; i < found-1 ; i++)
1796 my_off_t *a;
1797 my_off_t *b;
1798 HUFF_ELEMENT *new_huff_el;
1801 Pop the top element from the queue (the one with the least
1802 incidence). Popping from a priority queue includes a re-ordering
1803 of the queue, to get the next least incidence element to the top.
1805 a= (my_off_t*) queue_remove(&queue, 0);
1807 Copy the next least incidence element. The queue implementation
1808 reserves root[0] for temporary purposes. root[1] is the top.
1810 b= (my_off_t*) queue.root[1];
1811 /* Create a new element in a local (automatic) buffer. */
1812 new_huff_el= element_buffer + i;
1813 /* The new element gets the sum of the two least incidence elements. */
1814 new_huff_el->count= *a + *b;
1816 The Huffman algorithm assigns another bit to the code for a byte
1817 every time that bytes incidence is combined (directly or indirectly)
1818 to a new element as one of the two least incidence elements.
1819 This means that one more bit per incidence of that byte is required
1820 in the resulting file. So we add the new combined incidence as the
1821 number of bits by which the result grows.
1823 bits_packed+=(uint) (new_huff_el->count & 7);
1824 bytes_packed+=new_huff_el->count/8;
1826 Replace the copied top element by the new element and re-order the
1827 queue. This successively replaces the references to counts by
1828 references to HUFF_ELEMENTs.
1830 queue.root[1]=(uchar*) new_huff_el;
1831 queue_replaced(&queue);
1833 DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1837 /* Remove trees that don't give any compression */
1839 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1841 uint k,tree_number;
1842 HUFF_COUNTS count,*i,*j,*last_count;
1844 last_count=huff_counts+trees;
1845 for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1847 if (!i->tree->tree_number)
1849 i->tree->tree_number= ++tree_number;
1850 if (i->tree_buff)
1851 continue; /* Don't join intervall */
1852 for (j=i+1 ; j < last_count ; j++)
1854 if (! j->tree->tree_number && ! j->tree_buff)
1856 for (k=0 ; k < 256 ; k++)
1857 count.counts[k]=i->counts[k]+j->counts[k];
1858 if (calc_packed_length(&count,1) <=
1859 i->tree->bytes_packed + j->tree->bytes_packed+
1860 i->tree->tree_pack_length+j->tree->tree_pack_length+
1861 ALLOWED_JOIN_DIFF)
1863 memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1864 sizeof(count.counts[0])*256);
1865 my_free((uchar*) j->tree->element_buffer,MYF(0));
1866 j->tree->element_buffer=0;
1867 j->tree=i->tree;
1868 bmove((uchar*) i->counts,(uchar*) count.counts,
1869 sizeof(count.counts[0])*256);
1870 if (make_huff_tree(i->tree,i))
1871 return (uint) -1;
1877 DBUG_PRINT("info", ("Original trees: %d After join: %d",
1878 trees, tree_number));
1879 if (verbose)
1880 VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1881 return tree_number; /* Return trees left */
1886 Fill in huff_tree encode tables.
1888 SYNOPSIS
1889 make_huff_decode_table()
1890 huff_tree An array of HUFF_TREE which are to be encoded.
1891 trees The number of HUFF_TREE in the array.
1893 RETURN
1894 0 success
1895 != 0 error
1898 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1900 uint elements;
1901 for ( ; trees-- ; huff_tree++)
1903 if (huff_tree->tree_number > 0)
1905 elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1906 if (!(huff_tree->code =
1907 (ulonglong*) my_malloc(elements*
1908 (sizeof(ulonglong) + sizeof(uchar)),
1909 MYF(MY_WME | MY_ZEROFILL))))
1910 return 1;
1911 huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1912 make_traverse_code_tree(huff_tree, huff_tree->root,
1913 8 * sizeof(ulonglong), LL(0));
1916 return 0;
1920 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1921 HUFF_ELEMENT *element,
1922 uint size, ulonglong code)
1924 uint chr;
1925 if (!element->a.leaf.null)
1927 chr=element->a.leaf.element_nr;
1928 huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1929 huff_tree->code[chr]= (code >> size);
1930 if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1931 huff_tree->height= 8 * sizeof(ulonglong) - size;
1933 else
1935 size--;
1936 make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1937 make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1938 code + (((ulonglong) 1) << size));
1940 return;
1945 Convert a value into binary digits.
1947 SYNOPSIS
1948 bindigits()
1949 value The value.
1950 length The number of low order bits to convert.
1952 NOTE
1953 The result string is in static storage. It is reused on every call.
1954 So you cannot use it twice in one expression.
1956 RETURN
1957 A pointer to a static NUL-terminated string.
1960 static char *bindigits(ulonglong value, uint bits)
1962 static char digits[72];
1963 char *ptr= digits;
1964 uint idx= bits;
1966 DBUG_ASSERT(idx < sizeof(digits));
1967 while (idx)
1968 *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1969 *ptr= '\0';
1970 return digits;
1975 Convert a value into hexadecimal digits.
1977 SYNOPSIS
1978 hexdigits()
1979 value The value.
1981 NOTE
1982 The result string is in static storage. It is reused on every call.
1983 So you cannot use it twice in one expression.
1985 RETURN
1986 A pointer to a static NUL-terminated string.
1989 static char *hexdigits(ulonglong value)
1991 static char digits[20];
1992 char *ptr= digits;
1993 uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1995 DBUG_ASSERT(idx < sizeof(digits));
1996 while (idx)
1998 if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1999 *(ptr - 1)+= 'a' - '9' - 1;
2001 *ptr= '\0';
2002 return digits;
2006 /* Write header to new packed data file */
2008 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2009 my_off_t tot_elements,my_off_t filelength)
2011 uchar *buff= (uchar*) file_buffer.pos;
2013 bzero(buff,HEAD_LENGTH);
2014 memcpy_fixed(buff,myisam_pack_file_magic,4);
2015 int4store(buff+4,head_length);
2016 int4store(buff+8, mrg->min_pack_length);
2017 int4store(buff+12,mrg->max_pack_length);
2018 int4store(buff+16,tot_elements);
2019 int4store(buff+20,intervall_length);
2020 int2store(buff+24,trees);
2021 buff[26]=(char) mrg->ref_length;
2022 /* Save record pointer length */
2023 buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2024 if (test_only)
2025 return 0;
2026 VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
2027 return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2028 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2031 /* Write fieldinfo to new packed file */
2033 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2035 reg1 uint i;
2036 uint huff_tree_bits;
2037 huff_tree_bits=max_bit(trees ? trees-1 : 0);
2039 DBUG_PRINT("info", (" "));
2040 DBUG_PRINT("info", ("column types:"));
2041 DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2042 DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2043 DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2044 DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2045 DBUG_PRINT("info", ("FIELD_BLOB 4"));
2046 DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2047 DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2048 DBUG_PRINT("info", ("FIELD_ZERO 7"));
2049 DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2050 DBUG_PRINT("info", ("FIELD_CHECK 9"));
2051 DBUG_PRINT("info", (" "));
2052 DBUG_PRINT("info", ("pack type as a set of flags:"));
2053 DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2054 DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2055 DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2056 DBUG_PRINT("info", (" "));
2057 if (verbose >= 2)
2059 VOID(printf("\n"));
2060 VOID(printf("column types:\n"));
2061 VOID(printf("FIELD_NORMAL 0\n"));
2062 VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
2063 VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
2064 VOID(printf("FIELD_SKIP_ZERO 3\n"));
2065 VOID(printf("FIELD_BLOB 4\n"));
2066 VOID(printf("FIELD_CONSTANT 5\n"));
2067 VOID(printf("FIELD_INTERVALL 6\n"));
2068 VOID(printf("FIELD_ZERO 7\n"));
2069 VOID(printf("FIELD_VARCHAR 8\n"));
2070 VOID(printf("FIELD_CHECK 9\n"));
2071 VOID(printf("\n"));
2072 VOID(printf("pack type as a set of flags:\n"));
2073 VOID(printf("PACK_TYPE_SELECTED 1\n"));
2074 VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
2075 VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
2076 VOID(printf("\n"));
2078 for (i=0 ; i++ < fields ; counts++)
2080 write_bits((ulonglong) (int) counts->field_type, 5);
2081 write_bits(counts->pack_type,6);
2082 if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2083 write_bits(counts->max_zero_fill,5);
2084 else
2085 write_bits(counts->length_bits,5);
2086 write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2087 DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2088 "lbits: %2u tree: %2u length: %4u",
2089 i , counts->field_type, counts->pack_type,
2090 counts->max_zero_fill, counts->length_bits,
2091 counts->tree->tree_number, counts->field_length));
2092 if (verbose >= 2)
2093 VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2094 "tree: %2u length: %4u\n", i , counts->field_type,
2095 counts->pack_type, counts->max_zero_fill, counts->length_bits,
2096 counts->tree->tree_number, counts->field_length));
2098 flush_bits();
2099 return;
2102 /* Write all huff_trees to new datafile. Return tot count of
2103 elements in all trees
2104 Returns 0 on error */
2106 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2108 uint i,int_length;
2109 uint tree_no;
2110 uint codes;
2111 uint errors= 0;
2112 uint *packed_tree,*offset,length;
2113 my_off_t elements;
2115 /* Find the highest number of elements in the trees. */
2116 for (i=length=0 ; i < trees ; i++)
2117 if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2118 length=huff_tree[i].elements;
2120 Allocate a buffer for packing a decode tree. Two numbers per element
2121 (left child and right child).
2123 if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2125 my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2126 return 0;
2129 DBUG_PRINT("info", (" "));
2130 if (verbose >= 2)
2131 VOID(printf("\n"));
2132 tree_no= 0;
2133 intervall_length=0;
2134 for (elements=0; trees-- ; huff_tree++)
2136 /* Skip columns that have been joined with other columns. */
2137 if (huff_tree->tree_number == 0)
2138 continue; /* Deleted tree */
2139 tree_no++;
2140 DBUG_PRINT("info", (" "));
2141 if (verbose >= 3)
2142 VOID(printf("\n"));
2143 /* Count the total number of elements (byte codes or column values). */
2144 elements+=huff_tree->elements;
2145 huff_tree->max_offset=2;
2146 /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2147 if (huff_tree->elements <= 1)
2148 offset=packed_tree;
2149 else
2150 offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2152 /* This should be the same as 'length' above. */
2153 huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2156 Since we check this during collecting the distinct column values,
2157 this should never happen.
2159 if (huff_tree->max_offset >= IS_OFFSET)
2160 { /* This should be impossible */
2161 VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2162 huff_tree->max_offset));
2163 my_afree((uchar*) packed_tree);
2164 return 0;
2167 DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
2168 "char_bits: %u\n",
2169 (ulong) (file_buffer.pos - file_buffer.buffer),
2170 huff_tree->elements, (ulong) (offset - packed_tree),
2171 huff_tree->char_bits));
2172 if (!huff_tree->counts->tree_buff)
2174 /* We do a byte compression on this column. Mark with bit 0. */
2175 write_bits(0,1);
2176 write_bits(huff_tree->min_chr,8);
2177 write_bits(huff_tree->elements,9);
2178 write_bits(huff_tree->char_bits,5);
2179 write_bits(huff_tree->offset_bits,5);
2180 int_length=0;
2182 else
2184 int_length=(uint) (huff_tree->counts->tree_pos -
2185 huff_tree->counts->tree_buff);
2186 /* We have distinct column values for this column. Mark with bit 1. */
2187 write_bits(1,1);
2188 write_bits(huff_tree->elements,15);
2189 write_bits(int_length,16);
2190 write_bits(huff_tree->char_bits,5);
2191 write_bits(huff_tree->offset_bits,5);
2192 intervall_length+=int_length;
2194 DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2195 "offset_bits: %2u %s: %5u codelen: %2u",
2196 tree_no, huff_tree->elements, huff_tree->char_bits,
2197 huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2198 "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2199 int_length : huff_tree->min_chr, huff_tree->height));
2200 if (verbose >= 2)
2201 VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2202 "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2203 huff_tree->char_bits, huff_tree->offset_bits,
2204 huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2205 huff_tree->counts->tree_buff ? int_length :
2206 huff_tree->min_chr, huff_tree->height));
2208 /* Check that the code tree length matches the element count. */
2209 length=(uint) (offset-packed_tree);
2210 if (length != huff_tree->elements*2-2)
2212 VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2213 length, huff_tree->elements * 2 - 2));
2214 errors++;
2215 break;
2218 for (i=0 ; i < length ; i++)
2220 if (packed_tree[i] & IS_OFFSET)
2221 write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2222 huff_tree->offset_bits+1);
2223 else
2224 write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2225 DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2226 i, (packed_tree[i] & IS_OFFSET) ?
2227 " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2228 packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2229 if (verbose >= 3)
2230 VOID(printf("tree[0x%04x]: %s0x%04x\n",
2231 i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2232 (packed_tree[i] & IS_OFFSET) ?
2233 packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2235 flush_bits();
2238 Display coding tables and check their correctness.
2240 codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2241 for (i= 0; i < codes; i++)
2243 ulonglong code;
2244 uint bits;
2245 uint len;
2246 uint idx;
2248 if (! (len= huff_tree->code_len[i]))
2249 continue;
2250 DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2251 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2252 bindigits(huff_tree->code[i],
2253 huff_tree->code_len[i])));
2254 if (verbose >= 3)
2255 VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2256 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2257 bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2259 /* Check that the encode table decodes correctly. */
2260 code= 0;
2261 bits= 0;
2262 idx= 0;
2263 DBUG_EXECUTE_IF("forcechkerr1", len--;);
2264 DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2265 DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2266 for (;;)
2268 if (! len)
2270 VOID(fflush(stdout));
2271 VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2272 hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2273 errors++;
2274 break;
2276 code<<= 1;
2277 code|= (huff_tree->code[i] >> (--len)) & 1;
2278 bits++;
2279 if (bits > 8 * sizeof(code))
2281 VOID(fflush(stdout));
2282 VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2283 bits, (uint) (8 * sizeof(code))));
2284 errors++;
2285 break;
2287 idx+= (uint) code & 1;
2288 if (idx >= length)
2290 VOID(fflush(stdout));
2291 VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2292 idx, length));
2293 errors++;
2294 break;
2296 if (packed_tree[idx] & IS_OFFSET)
2297 idx+= packed_tree[idx] & ~IS_OFFSET;
2298 else
2299 break; /* Hit a leaf. This contains the result value. */
2301 if (errors)
2302 break;
2304 DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2305 if (packed_tree[idx] != i)
2307 VOID(fflush(stdout));
2308 VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2309 packed_tree[idx], i));
2310 errors++;
2311 break;
2313 } /*end for (codes)*/
2314 if (errors)
2315 break;
2317 /* Write column values in case of distinct column value compression. */
2318 if (huff_tree->counts->tree_buff)
2320 for (i=0 ; i < int_length ; i++)
2322 write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2323 DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2324 i, (uchar) huff_tree->counts->tree_buff[i]));
2325 if (verbose >= 3)
2326 VOID(printf("column_values[0x%04x]: 0x%02x\n",
2327 i, (uchar) huff_tree->counts->tree_buff[i]));
2330 flush_bits();
2332 DBUG_PRINT("info", (" "));
2333 if (verbose >= 2)
2334 VOID(printf("\n"));
2335 my_afree((uchar*) packed_tree);
2336 if (errors)
2338 VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2339 return 0;
2341 return elements;
2345 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2346 uint *offset)
2348 uint *prev_offset;
2350 prev_offset= offset;
2352 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2353 then there is no left child and, hence no right child either. This
2354 is a property of a binary tree. An element is either a node with two
2355 childs, or a leaf without childs.
2357 The current element is always a node with two childs. Go left first.
2359 if (!element->a.nod.left->a.leaf.null)
2361 /* Store the byte code or the index of the column value. */
2362 prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2363 offset+=2;
2365 else
2368 Recursively traverse the tree to the left. Mark it as an offset to
2369 another tree node (in contrast to a byte code or column value index).
2371 prev_offset[0]= IS_OFFSET+2;
2372 offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2375 /* Now, check the right child. */
2376 if (!element->a.nod.right->a.leaf.null)
2378 /* Store the byte code or the index of the column value. */
2379 prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2380 return offset;
2382 else
2385 Recursively traverse the tree to the right. Mark it as an offset to
2386 another tree node (in contrast to a byte code or column value index).
2388 uint temp=(uint) (offset-prev_offset-1);
2389 prev_offset[1]= IS_OFFSET+ temp;
2390 if (huff_tree->max_offset < temp)
2391 huff_tree->max_offset = temp;
2392 return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2396 /* Get number of bits neaded to represent value */
2398 static uint max_bit(register uint value)
2400 reg2 uint power=1;
2402 while ((value>>=1))
2403 power++;
2404 return (power);
2408 static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2410 int error;
2411 uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2412 intervall,field_length,max_pack_length,pack_blob_length;
2413 my_off_t record_count;
2414 char llbuf[32];
2415 ulong length,pack_length;
2416 uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2417 HUFF_COUNTS *count,*end_count;
2418 HUFF_TREE *tree;
2419 MI_INFO *isam_file=mrg->file[0];
2420 uint pack_version= (uint) isam_file->s->pack.version;
2421 DBUG_ENTER("compress_isam_file");
2423 /* Allocate a buffer for the records (excluding blobs). */
2424 if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2425 return -1;
2427 end_count=huff_counts+isam_file->s->base.fields;
2428 min_record_length= (uint) ~0;
2429 max_record_length=0;
2432 Calculate the maximum number of bits required to pack the records.
2433 Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2434 The tree height determines the maximum number of bits per value.
2435 Some fields skip leading or trailing spaces or zeroes. The skipped
2436 number of bytes is encoded by 'length_bits' bits.
2437 Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2438 and varchar get a leading 0 bit.
2440 for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2442 if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2443 huff_counts[i].max_zero_fill=0;
2444 if (huff_counts[i].field_type == FIELD_CONSTANT ||
2445 huff_counts[i].field_type == FIELD_ZERO ||
2446 huff_counts[i].field_type == FIELD_CHECK)
2447 continue;
2448 if (huff_counts[i].field_type == FIELD_INTERVALL)
2449 max_calc_length+=huff_counts[i].tree->height;
2450 else if (huff_counts[i].field_type == FIELD_BLOB ||
2451 huff_counts[i].field_type == FIELD_VARCHAR)
2452 max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2453 else
2454 max_calc_length+=
2455 (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2456 huff_counts[i].tree->height+huff_counts[i].length_bits;
2458 max_calc_length= (max_calc_length + 7) / 8;
2459 pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2460 record_count=0;
2461 /* 'max_blob_length' is the max length of all blobs of a record. */
2462 pack_blob_length= isam_file->s->base.blobs ?
2463 calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2464 max_pack_length=pack_ref_length+pack_blob_length;
2466 DBUG_PRINT("fields", ("==="));
2467 mrg_reset(mrg);
2468 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2470 ulong tot_blob_length=0;
2471 if (! error)
2473 if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2474 break;
2475 record_pos= (uchar*) file_buffer.pos;
2476 file_buffer.pos+=max_pack_length;
2477 for (start_pos=record, count= huff_counts; count < end_count ; count++)
2479 end_pos=start_pos+(field_length=count->field_length);
2480 tree=count->tree;
2482 DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2483 "lbits: %2u tree: %2u length: %4u",
2484 (ulong) (count - huff_counts + 1),
2485 count->field_type,
2486 count->pack_type, count->max_zero_fill,
2487 count->length_bits, count->tree->tree_number,
2488 count->field_length));
2490 /* Check if the column contains spaces only. */
2491 if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2493 for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2494 if (pos == end_pos)
2496 DBUG_PRINT("fields",
2497 ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2498 DBUG_PRINT("fields", ("---"));
2499 write_bits(1,1);
2500 start_pos=end_pos;
2501 continue;
2503 DBUG_PRINT("fields",
2504 ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2505 write_bits(0,1);
2507 end_pos-=count->max_zero_fill;
2508 field_length-=count->max_zero_fill;
2510 switch (count->field_type) {
2511 case FIELD_SKIP_ZERO:
2512 if (!memcmp((uchar*) start_pos,zero_string,field_length))
2514 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2515 write_bits(1,1);
2516 start_pos=end_pos;
2517 break;
2519 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2520 write_bits(0,1);
2521 /* Fall through */
2522 case FIELD_NORMAL:
2523 DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2524 (ulong) (end_pos - start_pos)));
2525 for ( ; start_pos < end_pos ; start_pos++)
2527 DBUG_PRINT("fields",
2528 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2529 (uchar) *start_pos,
2530 hexdigits(tree->code[(uchar) *start_pos]),
2531 (uint) tree->code_len[(uchar) *start_pos],
2532 bindigits(tree->code[(uchar) *start_pos],
2533 (uint) tree->code_len[(uchar) *start_pos])));
2534 write_bits(tree->code[(uchar) *start_pos],
2535 (uint) tree->code_len[(uchar) *start_pos]);
2537 break;
2538 case FIELD_SKIP_ENDSPACE:
2539 for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2540 length= (ulong) (end_pos - pos);
2541 if (count->pack_type & PACK_TYPE_SELECTED)
2543 if (length > count->min_space)
2545 DBUG_PRINT("fields",
2546 ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2547 DBUG_PRINT("fields",
2548 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2549 length, field_length, count->length_bits));
2550 write_bits(1,1);
2551 write_bits(length,count->length_bits);
2553 else
2555 DBUG_PRINT("fields",
2556 ("FIELD_SKIP_ENDSPACE not more than min_space, "
2557 "bits: 1"));
2558 write_bits(0,1);
2559 pos=end_pos;
2562 else
2564 DBUG_PRINT("fields",
2565 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2566 length, field_length, count->length_bits));
2567 write_bits(length,count->length_bits);
2569 /* Encode all significant bytes. */
2570 DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2571 (ulong) (pos - start_pos)));
2572 for ( ; start_pos < pos ; start_pos++)
2574 DBUG_PRINT("fields",
2575 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2576 (uchar) *start_pos,
2577 hexdigits(tree->code[(uchar) *start_pos]),
2578 (uint) tree->code_len[(uchar) *start_pos],
2579 bindigits(tree->code[(uchar) *start_pos],
2580 (uint) tree->code_len[(uchar) *start_pos])));
2581 write_bits(tree->code[(uchar) *start_pos],
2582 (uint) tree->code_len[(uchar) *start_pos]);
2584 start_pos=end_pos;
2585 break;
2586 case FIELD_SKIP_PRESPACE:
2587 for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2588 length= (ulong) (pos - start_pos);
2589 if (count->pack_type & PACK_TYPE_SELECTED)
2591 if (length > count->min_space)
2593 DBUG_PRINT("fields",
2594 ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2595 DBUG_PRINT("fields",
2596 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2597 length, field_length, count->length_bits));
2598 write_bits(1,1);
2599 write_bits(length,count->length_bits);
2601 else
2603 DBUG_PRINT("fields",
2604 ("FIELD_SKIP_PRESPACE not more than min_space, "
2605 "bits: 1"));
2606 pos=start_pos;
2607 write_bits(0,1);
2610 else
2612 DBUG_PRINT("fields",
2613 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2614 length, field_length, count->length_bits));
2615 write_bits(length,count->length_bits);
2617 /* Encode all significant bytes. */
2618 DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2619 (ulong) (end_pos - start_pos)));
2620 for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2622 DBUG_PRINT("fields",
2623 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2624 (uchar) *start_pos,
2625 hexdigits(tree->code[(uchar) *start_pos]),
2626 (uint) tree->code_len[(uchar) *start_pos],
2627 bindigits(tree->code[(uchar) *start_pos],
2628 (uint) tree->code_len[(uchar) *start_pos])));
2629 write_bits(tree->code[(uchar) *start_pos],
2630 (uint) tree->code_len[(uchar) *start_pos]);
2632 break;
2633 case FIELD_CONSTANT:
2634 case FIELD_ZERO:
2635 case FIELD_CHECK:
2636 DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2637 start_pos=end_pos;
2638 break;
2639 case FIELD_INTERVALL:
2640 global_count=count;
2641 pos=(uchar*) tree_search(&count->int_tree, start_pos,
2642 count->int_tree.custom_arg);
2643 intervall=(uint) (pos - count->tree_buff)/field_length;
2644 DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2645 DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2646 intervall, hexdigits(tree->code[intervall]),
2647 (uint) tree->code_len[intervall]));
2648 write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2649 start_pos=end_pos;
2650 break;
2651 case FIELD_BLOB:
2653 ulong blob_length=_mi_calc_blob_length(field_length-
2654 portable_sizeof_char_ptr,
2655 start_pos);
2656 /* Empty blobs are encoded with a single 1 bit. */
2657 if (!blob_length)
2659 DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2660 write_bits(1,1);
2662 else
2664 uchar *blob,*blob_end;
2665 DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2666 write_bits(0,1);
2667 /* Write the blob length. */
2668 DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2669 blob_length, count->length_bits));
2670 write_bits(blob_length,count->length_bits);
2671 memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2672 sizeof(char*));
2673 blob_end=blob+blob_length;
2674 /* Encode the blob bytes. */
2675 for ( ; blob < blob_end ; blob++)
2677 DBUG_PRINT("fields",
2678 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2679 (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2680 (uint) tree->code_len[(uchar) *blob],
2681 bindigits(tree->code[(uchar) *start_pos],
2682 (uint)tree->code_len[(uchar) *start_pos])));
2683 write_bits(tree->code[(uchar) *blob],
2684 (uint) tree->code_len[(uchar) *blob]);
2686 tot_blob_length+=blob_length;
2688 start_pos= end_pos;
2689 break;
2691 case FIELD_VARCHAR:
2693 uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2694 ulong col_length= (var_pack_length == 1 ?
2695 (uint) *(uchar*) start_pos :
2696 uint2korr(start_pos));
2697 /* Empty varchar are encoded with a single 1 bit. */
2698 if (!col_length)
2700 DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2701 write_bits(1,1); /* Empty varchar */
2703 else
2705 uchar *end= start_pos + var_pack_length + col_length;
2706 DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2707 write_bits(0,1);
2708 /* Write the varchar length. */
2709 DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2710 col_length, count->length_bits));
2711 write_bits(col_length,count->length_bits);
2712 /* Encode the varchar bytes. */
2713 for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2715 DBUG_PRINT("fields",
2716 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2717 (uchar) *start_pos,
2718 hexdigits(tree->code[(uchar) *start_pos]),
2719 (uint) tree->code_len[(uchar) *start_pos],
2720 bindigits(tree->code[(uchar) *start_pos],
2721 (uint)tree->code_len[(uchar) *start_pos])));
2722 write_bits(tree->code[(uchar) *start_pos],
2723 (uint) tree->code_len[(uchar) *start_pos]);
2726 start_pos= end_pos;
2727 break;
2729 case FIELD_LAST:
2730 case FIELD_enum_val_count:
2731 abort(); /* Impossible */
2733 start_pos+=count->max_zero_fill;
2734 DBUG_PRINT("fields", ("---"));
2736 flush_bits();
2737 length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2738 pack_length= save_pack_length(pack_version, record_pos, length);
2739 if (pack_blob_length)
2740 pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2741 tot_blob_length);
2742 DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2743 "length-bytes: %lu", (ulong) record_count, length,
2744 tot_blob_length, pack_length));
2745 DBUG_PRINT("fields", ("==="));
2747 /* Correct file buffer if the header was smaller */
2748 if (pack_length != max_pack_length)
2750 bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2751 file_buffer.pos-= (max_pack_length-pack_length);
2753 if (length < (ulong) min_record_length)
2754 min_record_length=(uint) length;
2755 if (length > (ulong) max_record_length)
2756 max_record_length=(uint) length;
2757 record_count++;
2758 if (write_loop && record_count % WRITE_COUNT == 0)
2760 VOID(printf("%lu\r", (ulong) record_count));
2761 VOID(fflush(stdout));
2764 else if (error != HA_ERR_RECORD_DELETED)
2765 break;
2767 if (error == HA_ERR_END_OF_FILE)
2768 error=0;
2769 else
2771 VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2772 my_progname, error));
2774 if (verbose >= 2)
2775 VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
2777 my_afree((uchar*) record);
2778 mrg->ref_length=max_pack_length;
2779 mrg->min_pack_length=max_record_length ? min_record_length : 0;
2780 mrg->max_pack_length=max_record_length;
2781 DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2785 static char *make_new_name(char *new_name, char *old_name)
2787 return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2790 static char *make_old_name(char *new_name, char *old_name)
2792 return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2795 /* rutines for bit writing buffer */
2797 static void init_file_buffer(File file, pbool read_buffer)
2799 file_buffer.file=file;
2800 file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2801 MYF(MY_WME));
2802 file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2803 file_buffer.pos_in_file=0;
2804 error_on_write=0;
2805 if (read_buffer)
2808 file_buffer.pos=file_buffer.end;
2809 file_buffer.bits=0;
2811 else
2813 file_buffer.pos=file_buffer.buffer;
2814 file_buffer.bits=BITS_SAVED;
2816 file_buffer.bitbucket= 0;
2820 static int flush_buffer(ulong neaded_length)
2822 ulong length;
2825 file_buffer.end is 8 bytes lower than the real end of the buffer.
2826 This is done so that the end-of-buffer condition does not need to be
2827 checked for every byte (see write_bits()). Consequently,
2828 file_buffer.pos can become greater than file_buffer.end. The
2829 algorithms in the other functions ensure that there will never be
2830 more than 8 bytes written to the buffer without an end-of-buffer
2831 check. So the buffer cannot be overrun. But we need to check for the
2832 near-to-buffer-end condition to avoid a negative result, which is
2833 casted to unsigned and thus becomes giant.
2835 if ((file_buffer.pos < file_buffer.end) &&
2836 ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2837 return 0;
2838 length=(ulong) (file_buffer.pos-file_buffer.buffer);
2839 file_buffer.pos=file_buffer.buffer;
2840 file_buffer.pos_in_file+=length;
2841 if (test_only)
2842 return 0;
2843 if (error_on_write|| my_write(file_buffer.file,
2844 (const uchar*) file_buffer.buffer,
2845 length,
2846 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2848 error_on_write=1;
2849 return 1;
2852 if (neaded_length != ~(ulong) 0 &&
2853 (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2855 char *tmp;
2856 neaded_length+=256; /* some margin */
2857 tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2858 if (!tmp)
2859 return 1;
2860 file_buffer.pos= ((uchar*) tmp +
2861 (ulong) (file_buffer.pos - file_buffer.buffer));
2862 file_buffer.buffer= (uchar*) tmp;
2863 file_buffer.end= (uchar*) (tmp+neaded_length-8);
2865 return 0;
2869 static void end_file_buffer(void)
2871 my_free((uchar*) file_buffer.buffer,MYF(0));
2874 /* output `bits` low bits of `value' */
2876 static void write_bits(register ulonglong value, register uint bits)
2878 DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2879 (bits == 8 * sizeof(value)));
2881 if ((file_buffer.bits-= (int) bits) >= 0)
2883 file_buffer.bitbucket|= value << file_buffer.bits;
2885 else
2887 reg3 ulonglong bit_buffer;
2888 bits= (uint) -file_buffer.bits;
2889 bit_buffer= (file_buffer.bitbucket |
2890 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2891 #if BITS_SAVED == 64
2892 *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2893 *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2894 *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2895 *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2896 #endif
2897 *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2898 *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2899 *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2900 *file_buffer.pos++= (uchar) (bit_buffer);
2902 if (bits != 8 * sizeof(value))
2903 value&= (((ulonglong) 1) << bits) - 1;
2904 if (file_buffer.pos >= file_buffer.end)
2905 VOID(flush_buffer(~ (ulong) 0));
2906 file_buffer.bits=(int) (BITS_SAVED - bits);
2907 file_buffer.bitbucket= value << (BITS_SAVED - bits);
2909 return;
2912 /* Flush bits in bit_buffer to buffer */
2914 static void flush_bits(void)
2916 int bits;
2917 ulonglong bit_buffer;
2919 bits= file_buffer.bits & ~7;
2920 bit_buffer= file_buffer.bitbucket >> bits;
2921 bits= BITS_SAVED - bits;
2922 while (bits > 0)
2924 bits-= 8;
2925 *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2927 if (file_buffer.pos >= file_buffer.end)
2928 VOID(flush_buffer(~ (ulong) 0));
2929 file_buffer.bits= BITS_SAVED;
2930 file_buffer.bitbucket= 0;
2934 /****************************************************************************
2935 ** functions to handle the joined files
2936 ****************************************************************************/
2938 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2939 ha_checksum crc)
2941 MYISAM_SHARE *share=isam_file->s;
2942 uint options=mi_uint2korr(share->state.header.options);
2943 uint key;
2944 DBUG_ENTER("save_state");
2946 options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2947 mi_int2store(share->state.header.options,options);
2949 share->state.state.data_file_length=new_length;
2950 share->state.state.del=0;
2951 share->state.state.empty=0;
2952 share->state.dellink= HA_OFFSET_ERROR;
2953 share->state.split=(ha_rows) mrg->records;
2954 share->state.version=(ulong) time((time_t*) 0);
2955 if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2958 Some indexes are disabled, cannot use current key_file_length value
2959 as an estimate of upper bound of index file size. Use packed data file
2960 size instead.
2962 share->state.state.key_file_length= new_length;
2965 If there are no disabled indexes, keep key_file_length value from
2966 original file so "myisamchk -rq" can use this value (this is necessary
2967 because index size cannot be easily calculated for fulltext keys)
2969 mi_clear_all_keys_active(share->state.key_map);
2970 for (key=0 ; key < share->base.keys ; key++)
2971 share->state.key_root[key]= HA_OFFSET_ERROR;
2972 for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2973 share->state.key_del[key]= HA_OFFSET_ERROR;
2974 isam_file->state->checksum=crc; /* Save crc here */
2975 share->changed=1; /* Force write of header */
2976 share->state.open_count=0;
2977 share->global_changed=0;
2978 VOID(my_chsize(share->kfile, share->base.keystart, 0, MYF(0)));
2979 if (share->base.keys)
2980 isamchk_neaded=1;
2981 DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
2985 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2986 ha_checksum crc)
2988 MI_STATE_INFO state;
2989 MI_INFO *isam_file=mrg->file[0];
2990 uint options;
2991 DBUG_ENTER("save_state_mrg");
2993 state= isam_file->s->state;
2994 options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2995 HA_OPTION_READ_ONLY_DATA);
2996 mi_int2store(state.header.options,options);
2997 state.state.data_file_length=new_length;
2998 state.state.del=0;
2999 state.state.empty=0;
3000 state.state.records=state.split=(ha_rows) mrg->records;
3001 /* See comment above in save_state about key_file_length handling. */
3002 if (mrg->src_file_has_indexes_disabled)
3004 isam_file->s->state.state.key_file_length=
3005 max(isam_file->s->state.state.key_file_length, new_length);
3007 state.dellink= HA_OFFSET_ERROR;
3008 state.version=(ulong) time((time_t*) 0);
3009 mi_clear_all_keys_active(state.key_map);
3010 state.state.checksum=crc;
3011 if (isam_file->s->base.keys)
3012 isamchk_neaded=1;
3013 state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3014 DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3018 /* reset for mrg_rrnd */
3020 static void mrg_reset(PACK_MRG_INFO *mrg)
3022 if (mrg->current)
3024 mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3025 mrg->current=0;
3029 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3031 int error;
3032 MI_INFO *isam_info;
3033 my_off_t filepos;
3035 if (!info->current)
3037 isam_info= *(info->current=info->file);
3038 info->end=info->current+info->count;
3039 mi_reset(isam_info);
3040 mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3041 filepos=isam_info->s->pack.header_length;
3043 else
3045 isam_info= *info->current;
3046 filepos= isam_info->nextpos;
3049 for (;;)
3051 isam_info->update&= HA_STATE_CHANGED;
3052 if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3053 filepos, 1)) ||
3054 error != HA_ERR_END_OF_FILE)
3055 return (error);
3056 mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3057 if (info->current+1 == info->end)
3058 return(HA_ERR_END_OF_FILE);
3059 info->current++;
3060 isam_info= *info->current;
3061 filepos=isam_info->s->pack.header_length;
3062 mi_reset(isam_info);
3063 mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3068 static int mrg_close(PACK_MRG_INFO *mrg)
3070 uint i;
3071 int error=0;
3072 for (i=0 ; i < mrg->count ; i++)
3073 error|=mi_close(mrg->file[i]);
3074 if (mrg->free_file)
3075 my_free((uchar*) mrg->file,MYF(0));
3076 return error;
3080 #if !defined(DBUG_OFF)
3082 Fake the counts to get big Huffman codes.
3084 SYNOPSIS
3085 fakebigcodes()
3086 huff_counts A pointer to the counts array.
3087 end_count A pointer past the counts array.
3089 DESCRIPTION
3091 Huffman coding works by removing the two least frequent values from
3092 the list of values and add a new value with the sum of their
3093 incidences in a loop until only one value is left. Every time a
3094 value is reused for a new value, it gets one more bit for its
3095 encoding. Hence, the least frequent values get the longest codes.
3097 To get a maximum code length for a value, two of the values must
3098 have an incidence of 1. As their sum is 2, the next infrequent value
3099 must have at least an incidence of 2, then 4, 8, 16 and so on. This
3100 means that one needs 2**n bytes (values) for a code length of n
3101 bits. However, using more distinct values forces the use of longer
3102 codes, or reaching the code length with less total bytes (values).
3104 To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3105 I assign counts of 1 to the two most frequent values, a count of 2
3106 for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3107 the remaining values get 1. That way every possible byte has an
3108 assigned code, though not all codes are used if not all byte values
3109 are present in the column.
3111 This strategy would work with distinct column values too, but
3112 requires that at least 64(32) values are present. To make things
3113 easier here, I cancel all distinct column values and force byte
3114 compression for all columns.
3116 RETURN
3117 void
3120 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3122 HUFF_COUNTS *count;
3123 my_off_t *cur_count_p;
3124 my_off_t *end_count_p;
3125 my_off_t **cur_sort_p;
3126 my_off_t **end_sort_p;
3127 my_off_t *sort_counts[256];
3128 my_off_t total;
3129 DBUG_ENTER("fakebigcodes");
3131 for (count= huff_counts; count < end_count; count++)
3134 Remove distinct column values.
3136 if (huff_counts->tree_buff)
3138 my_free((uchar*) huff_counts->tree_buff, MYF(0));
3139 delete_tree(&huff_counts->int_tree);
3140 huff_counts->tree_buff= NULL;
3141 DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3145 Sort counts by decreasing incidence.
3147 cur_count_p= count->counts;
3148 end_count_p= cur_count_p + 256;
3149 cur_sort_p= sort_counts;
3150 while (cur_count_p < end_count_p)
3151 *(cur_sort_p++)= cur_count_p++;
3152 (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3155 Assign faked counts.
3157 cur_sort_p= sort_counts;
3158 #if SIZEOF_LONG_LONG > 4
3159 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3160 #else
3161 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3162 #endif
3163 /* Most frequent value gets a faked count of 1. */
3164 **(cur_sort_p++)= 1;
3165 total= 1;
3166 while (cur_sort_p < end_sort_p)
3168 **(cur_sort_p++)= total;
3169 total<<= 1;
3171 /* Set the last value. */
3172 **(cur_sort_p++)= --total;
3174 Set the remaining counts.
3176 end_sort_p= sort_counts + 256;
3177 while (cur_sort_p < end_sort_p)
3178 **(cur_sort_p++)= 1;
3180 DBUG_VOID_RETURN;
3185 Compare two counts for reverse sorting.
3187 SYNOPSIS
3188 fakecmp()
3189 count1 One count.
3190 count2 Another count.
3192 RETURN
3193 1 count1 < count2
3194 0 count1 == count2
3195 -1 count1 > count2
3198 static int fakecmp(my_off_t **count1, my_off_t **count2)
3200 return ((**count1 < **count2) ? 1 :
3201 (**count1 > **count2) ? -1 : 0);
3203 #endif