1 /* rewrite in C and caml stubs by b8_bavard (2002) */
2 /* rewrite to class without glib by Mathias Küster (2002) */
4 /* DCTC - a Direct Connect text clone for Linux
5 * Copyright (C) 2001 Eric Prevoteau
7 * he3.c: Copyright (C) Eric Prevoteau <www@ac2i.tzo.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
29 #include <dclib/clist.h>
30 #include <dclib/cbytearray.h>
34 /******************************************************/
35 /* patch from David Toth for */
36 /* File sharing list Windows (DC) compatibiliy bugfix */
37 /* (byte 4 meaning and computation). */
38 /******************************************************/
40 /*********************************************************************/
41 /* When the file list of another user is retrieved, it is compressed */
42 /* The data format is described below.
47 byte 4: 8 bit parity of the decoded data (the result of all bytes "xor"-ed)
48 byte 5 - 8 : it is a number encoded in little endian. It it the number of bytes to produce.
49 byte 9 - 10: it is a number encoded in little endian. This number gives the number of couples.
50 Let's name this number as N
51 byte 11 - (10+2*N): this is a list of couple. The first byte is a character (the one before compression),
52 the second byte gives the length of the compressed pattern. Let's name this value L.
53 byte (11+2*N) - (11+2*N+ ((sum(L)+7)&~7): a bit barbare :) To simplify. Let's name K, the sum of all L values.
54 Here, you find a string of K bits (byte aligned). The L0 first bits are the encoded value of
55 caracter of couple 0. The next Lx bits are the encoded value of caractere of couple x.
56 Note: each byte are used from the lower bit to the upper bit.
57 after this string, you have the encoded string.
58 When data are finally decoded. The format is always:
60 \x9 filename | filesize \xD\xA
61 \x9 filename | filesize \xD\xA
62 \x9 filename | filesize \xD\xA
63 \x9 filename | filesize \xD\xA
64 \x9 filename | filesize \xD\xA
69 /* ============= Decoding functions ================ */
71 /******************************************************/
72 /*get 1 bit from the current bit position inside data */
73 /******************************************************/
74 unsigned long get_bit(unsigned char *data
, unsigned long *cur_pos
)
78 out
=((unsigned long)(data
[(*cur_pos
)/8]>>((*cur_pos
)&7)))&1;
85 /*********************************************************/
86 /* get nb_bits from the current bit position inside data */
87 /*********************************************************/
88 unsigned long get_bits(unsigned char *data
, unsigned long *cur_pos
, int nb_bit
)
95 res
=(res
<<1)|get_bit(data
,cur_pos
);
100 /**************************************************/
101 /* decompress data compressed using HE3 algorithm */
102 /**********************************************************/
103 /* input: a GByteArray containing HE3 compressed data */
104 /* output: a GString containing uncompressed data or NULL */
105 /**********************************************************/
106 char *decode_he3_data(char *data_string
, int data_len
, int* final_len
)
108 char *output_string
= NULL
;
112 if((data_string
[0]=='H')&&(data_string
[1]=='E')&&(data_string
[2]=='3')&&(data_string
[3]==0xD))
114 char *decode_array
=NULL
;
117 int max_len
=0; /* max size of encoded pattern */
118 int ttl_len
=0; /* total size of all encoded patterns */
119 unsigned long offset_pattern
;
120 unsigned long offset_encoded
;
124 /* compute the number of bytes to produce */
125 nb_output
=(((int)(data_string
[8]))&255);
127 nb_output
|=(((int)(data_string
[7]))&255);
129 nb_output
|=(((int)(data_string
[6]))&255);
131 nb_output
|=(((int)(data_string
[5]))&255);
133 /* compute the number of couples */
134 nb_couple
=data_string
[9];
135 nb_couple
+=((((int)(data_string
[10]))&255)<<8);
137 for(pos
=0;pos
<nb_couple
;pos
++)
141 v
=(((int)(data_string
[12+pos
*2]))&255);
147 //decode_array=g_byte_array_new();
149 /* theorytically, max_len could reach up to 255 */
152 decode_array
= malloc(1<<(max_len
+1));
154 /* but really, a value above 15 is not oftenly encountered */
155 /* thus the algorithm needs no more than ... 64KB */
156 /* raisonnable value is upto 23 (requires 8MB) */
158 if(decode_array
!=NULL
)
160 /* clear the decode array */
161 /* the decode array is technically a binary tree */
162 /* if the depth of the tree is big (let's say more than 10), */
163 /* storing the binary tree inside an array becomes very memory consumming */
164 /* but I am too lazy to program all binary tree creation/addition/navigation/destruction functions :) */
165 memset(decode_array
,0,1<<(max_len
+1));
167 offset_pattern
=8*(11+nb_couple
*2); /* position of the pattern block, it is just after the list of couples */
168 offset_encoded
=offset_pattern
+ ((ttl_len
+7)&~7); /* the encoded data are just after */
169 /* the pattern block (rounded to upper full byte) */
171 /* decode_array is a binary tree. byte 0 is the level 0 of the tree. byte 2-3, the level 1, byte 4-7, the level 2, */
172 /* in decode array, a N bit length pattern having the value K is its data at the position: */
173 /* 2^N + (K&((2^N)-1)) */
174 /* due to the fact K has always N bit length, the formula can be simplified into: */
176 for(pos
=0;pos
<nb_couple
;pos
++)
181 v_len
=(((int)(data_string
[12+pos
*2]))&255); /* the number of bit required */
183 value
=get_bits(data_string
,&offset_pattern
,v_len
);
184 decode_array
[(1<<v_len
)+ value
]=data_string
[11+pos
*2]; /* the character */
187 /* now, its time to decode */
188 //while(output->len!=nb_output)
189 output_string
= malloc(nb_output
+1);
190 output_string
[nb_output
] = 0;
191 while(output_len
!= nb_output
)
192 //while(l!=nb_output)
194 unsigned long cur_val
;
195 unsigned int nb_bit_val
;
197 cur_val
=get_bit(data_string
,&offset_encoded
); /* get one bit */
200 while(decode_array
[(1<<nb_bit_val
) + cur_val
]==0)
202 cur_val
=(cur_val
<<1)|get_bit(data_string
,&offset_encoded
);
206 //output=g_string_append_c(output,decode_array->data[(1<<nb_bit_val) + cur_val ]);
207 output_string
[output_len
++] = decode_array
[(1<<nb_bit_val
) + cur_val
];
211 //g_byte_array_free(decode_array,TRUE);
218 unsigned char parity
=0;
220 for(i
=0;i
<output_len
;i
++)
221 parity
^=output_string
[i
];
225 *final_len
= output_len
;
226 return output_string
;
231 /* ================= Encoding functions ======================== */
233 //static gint huf_insert_glist(gconstpointer a, gconstpointer b)
234 int huf_insert_glist(void * a
, void * b
)
236 if(((const HUFNODE
*)a
)->occur
<((const HUFNODE
*)b
)->occur
)
237 return -1; /* a is must be before b */
238 else if(((const HUFNODE
*)a
)->occur
>((const HUFNODE
*)b
)->occur
)
239 return 1; /* a is must be after b */
241 /* if both have the same occurences, the one without son comes before */
242 if( (((const HUFNODE
*)a
)->left
==NULL
) && (((const HUFNODE
*)b
)->left
==NULL
) )
245 if(((const HUFNODE
*)a
)->left
==NULL
)
252 /**************************************************************************/
253 /* recursively scan all nodes of the huffman tree and fill encoding table */
254 /**************************************************************************/
255 void use_hufnode(HUFENCODE tbl_enc
[256], HUFNODE
*node
,unsigned int bits_len
, unsigned long bits
)
258 { /* it is a node, right is always != NULL */
259 use_hufnode(tbl_enc
,node
->left
,bits_len
+1,(bits
<<1)|0);
260 use_hufnode(tbl_enc
,node
->right
,bits_len
+1,(bits
<<1)|1);
263 { /* it is a leaf, right is always NULL */
264 int idx
=((int)node
->val
)&255;
265 tbl_enc
[idx
].bits_len
=bits_len
;
266 tbl_enc
[idx
].bits
=bits
;
267 // printf("huf: bits_len: %u pattern: %lu ",bits_len,bits);
268 // printf("leaf: %d (%c)\n",idx,node->val);
270 // if(bits_len>8*sizeof(unsigned long)) /* compared with the size of unsigned long in bits */
271 // disp_msg(ERR_MSG,"user_hufnode","encoded value cannot fit into unsigned long",NULL);
277 /*****************************************************************/
278 /* recursively free memory used by all nodes of the huffman tree */
279 /*****************************************************************/
280 void free_hufnode(HUFNODE
*node
)
282 // printf("free node %c\n",node->val);
286 free_hufnode(node
->left
);
287 if(node
->right
!=NULL
)
288 free_hufnode(node
->right
);
293 void add_bit(char *data
, unsigned long *bit_pos
, unsigned char bit_value
)
295 if(((*bit_pos
)&7)==0) /* starting a new byte ? */
297 //data=g_byte_array_append(data,&v,1);
298 data
[(*bit_pos
)/8]|= 0;
301 /* due to the fact we always add 0 as new byte, we don't have to set bit having 0 as value */
305 data
[(*bit_pos
)/8]|= (1<<((*bit_pos
)&7));
311 /**************************************/
312 /* append the pattern to data@bit_pos */
313 /**************************************/
314 void add_bits(char *data
, unsigned long *bit_pos
, unsigned long pattern
, unsigned int pattern_length
)
318 for(i
=0;i
<pattern_length
;i
++)
320 add_bit(data
, bit_pos
,(unsigned char)(pattern
>>(pattern_length
-1-i
))&1 );
321 /* use pattern from upper to lower bit */
327 static void disp_huf(GList
*pre_tree
)
332 //for(i=0;i<g_list_length(pre_tree);i++)
333 for(i
=0;i
<pre_tree
->size
;i
++)
335 w
=g_list_nth_data(pre_tree
,i
); /* get the first HUFNODE of the list */
337 printf("occur: %ld, left=%p, right=%p, val=%02X\n",w
->occur
,w
->left
,w
->right
,w
->val
);
341 /*****************************************************************************************/
342 /* compress data compressed using an Huffman algorithm and store it in HE3 usable format */
343 /*****************************************************************************************/
344 /* input: a GString containing a string to compress */
345 /* output: a GByteArray containing compressed data or NULL */
346 /***********************************************************/
347 static HUFNODE
*pre_tree
[256];
348 static int pre_tree_len
= 0;
350 void insert_node(HUFNODE
*nw
){
355 while(j
>0 && (pre_tree
[j
-1]->occur
< nw
->occur
|| (pre_tree
[j
-1]->occur
== nw
->occur
&& pre_tree
[j
-1]->left
== NULL
&& nw
->left
!= NULL
))){
356 pre_tree
[j
] = pre_tree
[j
-1];
357 // printf("move %d -> %d\n", j- 1, j);
361 // printf("insert at %d\n", j);
364 HUFNODE
*remove_node()
367 // printf("remove %d\n", pre_tree_len);
368 return pre_tree
[pre_tree_len
];
371 char *encode_he3_data(char *str
, int len
, int* final_len
)
373 unsigned long occur
[256];
374 HUFENCODE tbl_enc
[256];
378 HUFNODE
*root_huf
=NULL
;
380 unsigned long bit_pos
;
383 if((str
==NULL
)||(len
==0)) {
388 /* count the number of times each character appears */
389 memset(occur
,0,sizeof(occur
));
393 occur
[((int)(str
[i
]))&255]++;
396 /* now, we will build the huffman tree */
399 /* stage 1: create all the leafs */
406 nw
=(HUFNODE
*)malloc(sizeof(HUFNODE
));
410 nw
->val
=(unsigned char)i
;
416 while(pre_tree_len
>1)
420 nw
=(HUFNODE
*)malloc(sizeof(HUFNODE
));
422 nw
->left
=remove_node();
423 nw
->right
=remove_node ();
425 nw
->occur
=nw
->left
->occur
+nw
->right
->occur
;
433 /* now, the huffman tree is done */
434 //root_huf=g_list_nth_data(pre_tree,0);
435 root_huf
=pre_tree
[0];
438 memset(tbl_enc
,0,sizeof(tbl_enc
));
440 /* now, we will compute encoding pattern from huffman tree */
441 use_hufnode(tbl_enc
,root_huf
,0,0);
443 /* well, encoding table is now done, we can encode the data */
444 //data=g_byte_array_new();
445 data
= malloc(len
+ 256 * 10);
447 /* set the initial HE3 header */
449 unsigned char he3_header
[]={'H','E','3',0xD,
450 0, /* parity: all uncompressed bytes xor'ed */
451 0,0,0,0, /* number of bytes to produce (little-endian) */
452 0,0}; /* number of couples */
454 /* patch from David Toth (byte 4: parity computation) */
455 unsigned char parity
=0;
459 he3_header
[4]=(unsigned char)(parity
&255);
461 //he3_header[5]=str->len&255;
462 he3_header
[5]=(unsigned char)(len
&255);
463 he3_header
[6]=(unsigned char)((len
>>8)&255);
464 he3_header
[7]=(unsigned char)((len
>>16)&255);
465 he3_header
[8]=(unsigned char)((len
>>24)&255);
466 he3_header
[9]=(unsigned char)(nb_val
&255);
467 he3_header
[10]=(unsigned char)((nb_val
>>8)&255);
469 //data=g_byte_array_append(data,he3_header,11);
470 memmove(pos
, he3_header
, 11);
474 /* add the couple list (character, pattern length) */
479 *pos
++ = (unsigned char)i
;
480 *pos
++ = (unsigned char)tbl_enc
[i
].bits_len
;
484 /* pos--; *//* increment pos when bit_pos mod 8 = 0. The semantics of pos
485 change: instead of the next byte to write, it is the previous byte written. */
487 /* and now, add all the patterns */
488 bit_pos
=(pos
- data
)*8;
493 add_bits(data
,&bit_pos
,tbl_enc
[i
].bits
, tbl_enc
[i
].bits_len
);
497 bit_pos
=(bit_pos
+7)&~7; /* move to the beginning of the next byte */
499 /* now, we encode the string */
502 int idx
=((int)str
[i
])&255;
504 add_bits(data
,&bit_pos
,tbl_enc
[idx
].bits
, tbl_enc
[idx
].bits_len
);
507 bit_pos
=(bit_pos
+7)&~7; /* move to the beginning of the next byte */
508 *final_len
= (bit_pos
/ 8)+1;
510 // printf("final len %d\n", *final_len);
512 /* free huffman tree */
513 free_hufnode(root_huf
);
520 #include "caml/mlvalues.h"
521 #include "caml/fail.h"
522 #include "caml/alloc.h"
525 value
ml_che3_decompress(value s_v
)
527 char *s
= String_val(s_v
);
528 int len
= string_length(s_v
);
533 result
= decode_he3_data(s
, len
, &final_len
);
535 res
= alloc_string(final_len
);
536 memmove(String_val(res
), result
, final_len
);
541 value
ml_che3_compress(value s_v
)
543 char *s
= String_val(s_v
);
544 int len
= string_length(s_v
);
549 result
= encode_he3_data(s
, len
, &final_len
);
551 res
= alloc_string(final_len
);
552 memmove(String_val(res
), result
, final_len
);
559 int main(int argc
,char **argv
)
564 unsigned char tst
[54]={0x48,0x45,0x33,0x0D,0x12,0x24,0x00,0x00,0x00,0x0B,0x00,0x09,0x04,0x0A,0x03,0x0D,0x04,0x2E,0x04,0x32,0x04,0x61,0x02,0x62,0x03,0x65,0x05,0x73,0x05,0x74,0x03,0x7C,0x04,0xA6,0xF4,0xE2,0x21,0xAE,0x01,0x61,0x71,0x29,0xFB,0xFF,0xFF,0xBE,0x75,0xA5,0x0C,0x00,0xF0,0xAD,0x2B,0x05};
566 out
=decode_he3_data(tst
, 54, &final_len
);
568 printf("LEN[%d] = [%s]\n",final_len
, out
);
573 int main(int argc
,char **argv
)
581 in
= encode_he3_data(argv
[1], strlen(argv
[1]), &start_len
);
582 for(i
=0; i
<start_len
;i
++)
583 printf("%4x,", in
[i
]);
585 out
=decode_he3_data(in
, start_len
, &final_len
);
587 printf("LEN[%d] = [%s]\n",final_len
, out
);