new file: extract_bam_tag.sh
[GalaxyCodeBases.git] / c_cpp / readscorr / sdleft.c
blob803d1e5797c1bfb9373b85604117b3b596715e00
1 #include <stdlib.h> //calloc
2 #include <stdint.h> // uint_fast8_t
3 #include <math.h> //log2, ceil
4 #include <stdio.h> //fprintf, fseek
5 #include <err.h>
6 #include <string.h> //strcmp, strncpy
7 #include <sys/mman.h>
8 #include <endian.h> //BYTE_ORDER, LITTLE_ENDIAN 1234
9 //#include <asm/byteorder.h> // __LITTLE_ENDIAN_BITFIELD or __BIG_ENDIAN_BITFIELD
10 #ifdef PTHREAD
11 #include <pthread.h>
12 #endif
13 #include "sdleft.h"
14 //#include "sdleftTF.h"
15 #include "MurmurHash3.h"
16 #include "2bitseq.h"
17 #include "chrseq.h"
19 #define HASHSEED (0x3ab2ae35)
21 #if BYTE_ORDER != LITTLE_ENDIAN
22 #error We focus on Little Endian now.
23 #endif
25 unsigned char GETitemByte_PADrBit_trimSubItemCount(unsigned char CountBit, unsigned char *prBit, uint16_t *pSubItemCount){
26 unsigned char itemByte = (CountBit+*prBit+7u) >> 3; // 2^3=8
27 *prBit = (itemByte<<3) - CountBit;
28 #ifndef TEST /* SubItemCount is trimed on rBit only */
29 double maxSubItemByR = floor(pow(2.0,(double)*prBit));
30 if ( *pSubItemCount > maxSubItemByR ) *pSubItemCount = (uint16_t)maxSubItemByR; // safe since *pSubItemCount was bigger
31 #endif
32 return itemByte;
35 // the smarter one
36 SDLeftArray_t *dleft_arraynew(const unsigned char CountBit, const SDLConfig * const psdlcfg){
37 unsigned char rBit;
38 size_t ArraySize;
39 uint16_t SubItemCount;
41 return dleft_arrayinit(CountBit, ArraySize, SubItemCount);
44 // the native one
45 SDLeftArray_t *dleft_arrayinit(unsigned char CountBit, size_t ArraySize, uint16_t SubItemCount) {
46 if (ArraySize<2u || ArraySize>(1LLU<<63) || CountBit<3u || CountBit>8u*sizeof(uint64_t) || SubItemCount<1u ) {
47 err(EXIT_FAILURE, "[x]Wrong D Left Array Parameters:(%d)[%u]x%zd ",CountBit,SubItemCount,ArraySize);
48 } // CountBit+rBit >= 9, makes uint16_t always OK
49 unsigned char ArrayBit = ceil(log2(ArraySize));
50 unsigned char rBit = ArrayBit + ENCODEDBIT_ENTROPY_PAD;
51 /* if (ArraySize<2u || CountBit<3u || rBit<6u || rBit>8u*sizeof(uint64_t) || CountBit>8u*sizeof(uint64_t) || SubItemCount<1u ) {
52 err(EXIT_FAILURE, "[x]Wrong D Left Array Parameters:(%d+%d)[%u]x%zd ",rBit,CountBit,SubItemCount,ArraySize);
53 } // CountBit+rBit >= 9, makes uint16_t always OK
54 */
55 #ifdef TEST /* Test mode, keep rBit, pad CountBit */
56 unsigned char itemByte = GETitemByte_PADrBit_trimSubItemCount(rBit,&CountBit,&SubItemCount);
57 #else /* Normal, keep CountBit, pad rBit */
58 unsigned char itemByte = GETitemByte_PADrBit_trimSubItemCount(CountBit,&rBit,&SubItemCount);
59 #endif
60 SDLeftArray_t *dleftobj = calloc(1,sizeof(SDLeftArray_t)); // set other int to 0
61 dleftobj->SDLAbyte = (SubItemCount*itemByte*ArraySize+127u)&(~(size_t)127u); // We are reading in uint128_t now.
62 dleftobj->pDLA = calloc(1,dleftobj->SDLAbyte);
63 int mlock_r = mlock(dleftobj->pDLA,dleftobj->SDLAbyte);
64 if (mlock_r) warn("[!]Cannot lock SDL array in memory. Performance maybe reduced.");
65 //unsigned char SDLArray[ArraySize][SubItemCount][itemByte];
66 //the GNU C Compiler allocates memory for VLAs on the stack, >_<
67 dleftobj->CountBit = CountBit;
68 dleftobj->rBit = rBit;
69 dleftobj->ArrayBit = ArrayBit;
70 dleftobj->itemByte = itemByte;
71 dleftobj->ArraySize = ArraySize;
72 dleftobj->SubItemCount = SubItemCount;
73 dleftobj->SubItemByUnit = SubItemCount/SDL_SUBARRAY_UNIT;
74 dleftobj->maxCountSeen = 1; //if SDLA is not empty, maxCountSeen>=1.
75 /* only one 128bit hash needed.
76 dleftobj->ArrayCount = ArrayCount;
77 dleftobj->HashCnt = (rBit+ArrayBit+(HASH_LENB-1))/HASH_LENB;
78 dleftobj->outhash = malloc(dleftobj->HashCnt * HASH_LENB/8);*/
79 dleftobj->FalsePositiveRatio = exp2(-rBit)/(double)ArraySize;
80 dleftobj->ItemInsideAll=0;
81 dleftobj->CellOverflowCount=0;
82 dleftobj->Item_rBitMask=(uint128_t)((1LLU<<rBit)-1u) << CountBit;
83 dleftobj->Item_CountBitMask=(1LLU<<CountBit)-1u; // == maxCount
84 dleftobj->Hash_ArrayBitMask=(1LLU<<ArrayBit)-1u;
85 dleftobj->Hash_rBitMask=(1LLU<<rBit)-1u;
86 return dleftobj;
89 void fprintSDLAnfo(FILE *stream, const SDLeftArray_t * dleftobj){
90 fprintf(stream,"SDLA(%#zx) -> {\n\
91 Size:[r:%uB+cnt:%uB]*Item:%zd(%.3f~%uB)*subArray:%u = %g MiB\n\
92 Hash:%u*%uB ItemByte:%u MaxCountSeen:%lu%s\n\
93 Designed Capacity:%lu ItemCount:%lu, with Overflow:%lu\n\
94 FP:%g, estimated FP item count:%.10g\n\
95 Mem:%zu bytes\n\
97 (size_t)dleftobj,
98 dleftobj->rBit,dleftobj->CountBit,dleftobj->ArraySize,log2(dleftobj->ArraySize),
99 dleftobj->ArrayBit,dleftobj->SubItemCount,(double)dleftobj->SubItemCount*dleftobj->itemByte*dleftobj->ArraySize/1048576,
100 1,HASH_LENB,dleftobj->itemByte,dleftobj->maxCountSeen,(dleftobj->ItemInsideAll)?"":"(=0, as SDLA is empty)",
101 dleftobj->ArraySize*(dleftobj->SubItemCount*3/4),
102 dleftobj->ItemInsideAll,dleftobj->CellOverflowCount,
103 dleftobj->FalsePositiveRatio,dleftobj->ItemInsideAll*dleftobj->FalsePositiveRatio,
104 dleftobj->SDLAbyte);
106 fprintf(stream," Item_rBitMask:[%016lX %016lX]\n", (uint64_t)(dleftobj->Item_rBitMask>>64), (uint64_t)dleftobj->Item_rBitMask);
107 fprintf(stream," Item_CountBitMask:[%016lX]\n", dleftobj->Item_CountBitMask);
108 uint128_t check = dleftobj->Item_rBitMask + dleftobj->Item_CountBitMask;
109 fprintf(stream," Sum:[%016lX %016lX]\n", (uint64_t)(check>>64), (uint64_t)check);
110 fprintf(stream," Hash_ArrayBitMask:[%016lX]\n", dleftobj->Hash_ArrayBitMask);
111 fprintf(stream," Hash_rBitMask:[%016lX]\n", dleftobj->Hash_rBitMask);
113 fputs("}\n", stream);
116 FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r ){
117 return (x << r) | (x >> (32 - r));
120 // this is POP, thus the input *pdat will be changed/shifted.
121 // Max bits is the same as size_t
122 // this is a static function, bits has already been <= 8*sizeof(size_t)
123 // static function, thus the input parameters should be clean
124 FORCE_INLINE uint64_t popLowestBits(unsigned char bits, uint64_t *pdat, uint_fast8_t *pdatLenu64t){
125 //unsigned char lastBits = bits % (8*sizeof(size_t));
126 unsigned char nullBits = 8*sizeof(uint64_t)-bits;
127 //char MoreUnit = (bits-lastBits)/sizeof(size_t);
128 uint64_t bitMask = (1LLU<<bits)-1U;
129 uint64_t outUnit = pdat[0] & bitMask;
130 //printf("[b]%d,%d,%016zx,%016zx\n",bits,nullBits,bitMask,outUnit);
131 uint_fast8_t i;
132 for(i=1;i<*pdatLenu64t;i++){
133 uint64_t tmpUnit = *(pdat+i) & bitMask;
134 *(pdat+i-1) = (*(pdat+i-1) >> bits) | (tmpUnit << nullBits);
136 *(pdat+i-1) = *(pdat+i-1) >> bits;
137 *pdatLenu64t -= bits/(8*sizeof(uint64_t));
138 //printf("[c][%016lx,%016lx]\n",pdat[0],pdat[1]);
139 return outUnit;
142 void *searchSubArray(unsigned char* pChunk){}
144 // rBits is (0,64]
145 FORCE_INLINE void incSDLArray(size_t ArrayBits, uint64_t rBits, SDLeftArray_t *dleftobj){
146 //size_t ArrayPos = ArrayBits % dleftobj->ArraySize;
147 size_t ArrayPos = ((uint128_t)ArrayBits*(uint128_t)dleftobj->ArraySize) >> dleftobj->ArrayBit;
148 size_t relAddr = ArrayPos*dleftobj->SubItemCount*dleftobj->itemByte;
149 unsigned char* pChunk = (unsigned char*)dleftobj->pDLA + relAddr;
150 unsigned char* pEndChunk = (unsigned char*)pChunk + dleftobj->SubItemCount*dleftobj->itemByte;
151 //unsigned char BoundaryByteRel = rBits % 8; // Well, I will not touch the egg-head ...
152 uint64_t Item_rBits, Item_CountBits;
153 uint128_t theItem;
155 union u128t {
156 uint128_t all;
157 unsigned char byte[16];
158 } theItem;
160 #ifdef PTHREAD
161 uint16_t SubItemByUnit=dleftobj->SubItemByUnit;
162 unsigned char itemByte=dleftobj->itemByte;
163 pthread_t *ptWorkers=malloc(sizeof(pthread_t)*SubItemByUnit);
164 for (uint16_t i=0;i<SubItemByUnit;i++) {
165 pthread_create(ptWorkers+i,NULL,&searchSubArray,pChunk+i*itemByte);
168 for (uint16_t i=0;i<SubItemByUnit;i++) {
169 pthread_join(*(ptWorkers+i),NULL);
171 free(ptWorkers);
172 #else
173 #endif
174 while (pChunk < pEndChunk) {
175 #ifdef OLD
176 theItem = 0;
177 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
178 theItem |= ((uint128_t)*(pChunk+i)) << (i*8u);
179 //theItem.byte[i] = *(pChunk+i);
181 #elif defined NEW /* faster for one less register shift operation for memory uint8_t */
182 theItem = 0;
183 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
184 theItem = theItem << 8u;
185 theItem |= *(pChunk+i);
186 //theItem.byte[i] = *(pChunk+i);
188 #elif BYTE_ORDER == LITTLE_ENDIAN
189 theItem = *(uint128_t*)pChunk;
190 #else
191 #error Faster version is Little Endian, choose OLD or NEW to define !
192 #endif
193 Item_CountBits = theItem & dleftobj->Item_CountBitMask;
194 if (Item_CountBits == 0) { // reaching the pre-end
195 Item_CountBits = 1;
196 ++dleftobj->ItemInsideAll;
197 //printf("<%lu>",dleftobj->ItemInsideAll);
198 break;
200 Item_rBits = (theItem & dleftobj->Item_rBitMask) >> dleftobj->CountBit;
201 if (Item_rBits == rBits) {
202 if (Item_CountBits < dleftobj->Item_CountBitMask) {
203 ++Item_CountBits;
204 if (Item_CountBits > dleftobj->maxCountSeen) {
205 dleftobj->maxCountSeen = Item_CountBits;
206 #ifdef DEBUG
207 fprintf(stderr,"[sdlm][%zu][%lu]:[%lx],[%lu] [%016lx %016lx]\n",
208 ArrayPos,dleftobj->SubItemCount-(pEndChunk-pChunk)/dleftobj->itemByte,rBits,Item_CountBits,(uint64_t)(theItem>>64u),(uint64_t)theItem);
209 #endif
210 } // if SDLA is not empty, maxCountSeen>=1, no need to check when Item_CountBits == 0.
211 break;
212 } else {
213 ++dleftobj->CountBitOverflow;
214 return;
217 pChunk += dleftobj->itemByte;
218 } // reaching the structure-end
219 if (pChunk < pEndChunk) {
220 //printf("Old:%zu[%lx %lx]<-[%lx][%lx][%lx %lx] ",(size_t)((char*)pChunk - (char*)dleftobj->pDLA)-relAddr,(uint64_t)(theItem>>64),(uint64_t)theItem,rBits,Item_CountBits,(uint64_t)((((uint128_t)rBits)<<dleftobj->CountBit)>>64),(uint64_t)(((uint128_t)rBits)<<dleftobj->CountBit));
222 theItem &= ~dleftobj->CountBit;
223 theItem |= Item_CountBits;
224 *(uint128_t*)pChunk = theItem;
226 theItem = (((uint128_t)rBits)<<dleftobj->CountBit) | Item_CountBits;
227 #ifdef NEW
228 //printf("Old: %lx, %lx\n",(uint64_t)(theItem>>64u),(uint64_t)theItem);
229 pChunk += dleftobj->itemByte-1;
230 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
231 *pChunk-- = (uint8_t)(theItem>>(i*8u));
234 ++pChunk;
235 theItem = 0;
236 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
237 theItem = theItem << 8u;
238 theItem |= (uint128_t)*(pChunk+i);
239 //theItem.byte[i] = *(pChunk+i);
241 printf("New: %lx, %lx\n",(uint64_t)(theItem>>64u),(uint64_t)theItem);
243 #elif (BYTE_ORDER == LITTLE_ENDIAN) || (defined OLD)
244 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
245 uint128_t tmpMask = ((uint128_t)0xffLLU) << (i*8u);
246 *pChunk++ = (theItem & tmpMask)>>(i*8u);
248 #else
249 #error Faster version is Little Endian, choose OLD or NEW to define !
250 #endif
251 /*printf("New:%zu[%lx %lx] ",(size_t)((char*)pChunk - (char*)dleftobj->pDLA)-relAddr,(uint64_t)(theItem>>64),(uint64_t)theItem);
252 printf("Mem:%zu[",(size_t)((char*)pChunk - (char*)dleftobj->pDLA)-relAddr);
253 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
254 printf("%x ",*(pChunk-dleftobj->itemByte+i));
256 puts("]");*/
257 } else { // reaching the structure-end
258 ++dleftobj->CellOverflowCount;
259 // deal(*pextree);
262 // return 0 for not found
263 FORCE_INLINE uint64_t querySDLArray(size_t ArrayBits, uint64_t rBits, SDLeftArray_t *dleftobj){
264 size_t ArrayPos = ((uint128_t)ArrayBits*(uint128_t)dleftobj->ArraySize) >> dleftobj->ArrayBit;
265 size_t relAddr = ArrayPos*dleftobj->SubItemCount*dleftobj->itemByte;
266 unsigned char* pChunk = (unsigned char*)dleftobj->pDLA + relAddr;
267 unsigned char* pEndChunk = (unsigned char*)pChunk + dleftobj->SubItemCount*dleftobj->itemByte;
268 uint64_t Item_rBits;
269 uint64_t Item_CountBits=0; // set value in case SubItemCount*dleftobj->itemByte == 0 (EP ?)
270 uint128_t theItem;
271 while (pChunk < pEndChunk) {
272 #ifdef OLD
273 theItem = 0;
274 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
275 theItem |= ((uint128_t)*(pChunk+i)) << (i*8u);
277 #elif defined NEW /* faster for one less register shift operation for memory uint8_t */
278 theItem = 0;
279 for (uint_fast8_t i=0;i<dleftobj->itemByte;i++) {
280 theItem = theItem << 8u;
281 theItem |= *(pChunk+i);
282 //theItem.byte[i] = *(pChunk+i);
284 #elif BYTE_ORDER == LITTLE_ENDIAN
285 theItem = *(uint128_t*)pChunk;
286 #else
287 #error Faster version is Little Endian, choose OLD or NEW to define !
288 #endif
289 Item_CountBits = theItem & dleftobj->Item_CountBitMask;
290 if (Item_CountBits == 0) { // reaching the pre-end, not found
291 break;
293 Item_rBits = (theItem & dleftobj->Item_rBitMask) >> dleftobj->CountBit;
294 if (Item_rBits == rBits) { // found
295 break;
297 pChunk += dleftobj->itemByte;
299 if (pChunk >= pEndChunk) { // reaching the structure-end
300 // deal(*pextree);
302 return Item_CountBits;
307 // return 0 for not found
308 void travelSDLArray(SDLeftArray_t *dleftobj, FILE *fpdat){
309 size_t ArrayBits;
310 uint64_t rBits;
311 uint64_t Item_rBits;
312 uint64_t Item_CountBits=0; // set value in case SubItemCount*dleftobj->itemByte == 0 (EP ?)
313 uint128_t theItem;
314 unsigned char* pChunk = (unsigned char*)dleftobj->pDLA;
315 unsigned char* pEndChunk = (unsigned char*)pChunk + dleftobj->SDLAbyte;
316 fputs("ItemNo,rBit\tCount\n",fpdat);
317 while (pChunk < pEndChunk) {
318 #if BYTE_ORDER == LITTLE_ENDIAN
319 theItem = *(uint128_t*)pChunk;
320 #else
321 #error Faster version is Little Endian, choose OLD or NEW to define !
322 #endif
323 Item_CountBits = theItem & dleftobj->Item_CountBitMask;
324 Item_rBits = (theItem & dleftobj->Item_rBitMask) >> dleftobj->CountBit;
325 if (Item_CountBits) {
326 fprintf(fpdat, "%zx,%lX\t%lu\n",pChunk, Item_rBits, Item_CountBits);
328 pChunk += dleftobj->itemByte;
330 fputs("\n\4",fpdat);
335 FORCE_INLINE int_fast8_t dleft_insert_kmer(const char *const kmer, const size_t len, SDLeftArray_t *dleftobj,
336 uint64_t **dibskmer,size_t * const uint64cnt) {
337 char* revcomkmer = ChrSeqRevComp(kmer,len);
338 const char *const smallerkmer = (strcmp(kmer,revcomkmer)<=0)?kmer:revcomkmer; // not strncmp since the first odd len bytes mush be different.
339 #ifdef DEBUGMORE
340 printf("[%zd]->[%s,%s,%s]\n",len,kmer,revcomkmer,smallerkmer);
341 #endif
342 size_t bytelen = (len+3u)/4;
343 size_t Ncount = ChrSeq2dib(smallerkmer,len,dibskmer,uint64cnt);
344 //printf("%zd:%zd:[%016lx][%016lx]->[%s] (%zd)\n",*uint64cnt,bytelen,*dibskmer[0],*dibskmer[1],dib2basechr(*dibskmer,len),Ncount);
345 //uint64_t *ptmpout = dleftobj->outhash;
346 //for(uint_fast8_t i=0;i<dleftobj->HashCnt;i++){
347 //uint32_t seed=rotl32(0x3ab2ae35-i,i);
348 //MurmurHash3_x64_128(*dibskmer,bytelen,seed,ptmpout);
349 free(revcomkmer);
350 if (! Ncount) {
351 MurmurHash3_x64_128(*dibskmer,bytelen,HASHSEED,dleftobj->outhash);
352 //printf("[%016lx,%016lx]\n",dleftobj->outhash[0],dleftobj->outhash[1]);
353 //ptmpout += 2;
355 uint_fast8_t datLenu64t = HASH_LENB/(8*sizeof(uint64_t));
356 //printf("[x][%016lx,%016lx]\n",popLowestBits(60,dleftobj->outhash,&datLenu64t),popLowestBits(56,dleftobj->outhash,&datLenu64t));
357 uint64_t rBits = popLowestBits(dleftobj->rBit,dleftobj->outhash,&datLenu64t);
358 size_t ArrayBits = popLowestBits(dleftobj->ArrayBit,dleftobj->outhash,&datLenu64t);
359 incSDLArray(ArrayBits, rBits, dleftobj);
360 return 1;
362 return 0;
365 size_t dleft_insert_read(unsigned int k, char const *const inseq, size_t len, SDLeftArray_t *dleftobj) {
366 if (len<k) return 0;
367 size_t insertedCount=0;
368 uint64_t *dibskmer=NULL;
369 size_t uint64cnt = 0;
370 for (size_t i=0;i<=len-k;i++) {
371 if (dleft_insert_kmer(inseq+i,k,dleftobj,&dibskmer,&uint64cnt)) {
372 ++insertedCount;
375 free(dibskmer);
376 return insertedCount;
379 void dleft_dump(const SDLeftArray_t * const dleftobj, SDLdumpHead * const pSDLeftStat, FILE *stream){
380 rewind(stream); // for Binary file, position is important.
381 strncpy(pSDLeftStat->FileID,"GDSD",4);
382 pSDLeftStat->FileVersion[0]=0u;
383 pSDLeftStat->FileVersion[1]=1u;
384 //pSDLeftStat->extreebyte = 0;
385 pSDLeftStat->SubItemCount=dleftobj->SubItemCount;
386 pSDLeftStat->CountBit=dleftobj->CountBit;
387 pSDLeftStat->rBit=dleftobj->rBit;
388 pSDLeftStat->ArraySize=dleftobj->ArraySize;
389 pSDLeftStat->SDLAbyte=dleftobj->SDLAbyte;
390 pSDLeftStat->ItemInsideAll=dleftobj->ItemInsideAll;
391 pSDLeftStat->CellOverflowCount=dleftobj->CellOverflowCount;
392 pSDLeftStat->CountBitOverflow=dleftobj->CountBitOverflow;
393 pSDLeftStat->maxCountSeen=dleftobj->maxCountSeen;
394 pSDLeftStat->crc32c=0xffffffff; //later
395 size_t unitwritten;
396 unitwritten=fwrite(pSDLeftStat,sizeof(SDLdumpHead),1u,stream);
397 if (unitwritten != 1)
398 err(EXIT_FAILURE, "Fail to write dat file ! [%zd]",unitwritten);
399 unitwritten=fwrite(dleftobj->pDLA,pSDLeftStat->SDLAbyte,1u,stream);
400 if (unitwritten != 1)
401 err(EXIT_FAILURE, "Cannot write to dat file ! [%zd]",unitwritten);
402 printf("Cb %x rB %x AS %lx Size %lx HMC %lx HMH %lx\n",pSDLeftStat->CountBit,pSDLeftStat->rBit,pSDLeftStat->ArraySize,pSDLeftStat->SDLAbyte,pSDLeftStat->HistMaxCntVal,pSDLeftStat->HistMaxHistVal);
405 void dleft_arraydestroy(SDLeftArray_t * const dleftobj){
406 free(dleftobj->pDLA);
407 //free(dleftobj->outhash);
408 free(dleftobj);
411 #ifdef TEST
412 SDLeftStat_t * dleft_stat(SDLeftArray_t * const dleftobj, FILE *stream, FILE *fpdat) {
413 #else
414 SDLeftStat_t * dleft_stat(SDLeftArray_t * const dleftobj, FILE *stream) {
415 #endif
416 SDLeftStat_t *pSDLeftStat = malloc(sizeof(SDLeftStat_t));
417 uint64_t * const pCountHistArray = calloc(sizeof(uint64_t),1+dleftobj->maxCountSeen);
418 size_t totalDLAsize = dleftobj->SubItemCount * dleftobj->itemByte * dleftobj->ArraySize;
419 const unsigned char * const pDLA = dleftobj->pDLA;
420 uint64_t Item_CountBits=0; // set value in case SDLA_ITEMARRAY*dleftobj->itemByte == 0 (EP ?)
421 uint128_t theItem;
422 #ifdef TEST
423 uint16_t SubItemCount = dleftobj->SubItemCount;
424 uint64_t * const pCountSubArray = calloc(sizeof(uint64_t),1+SubItemCount);
425 uint64_t * const pCountSubNoOneArray = calloc(sizeof(uint64_t),1+SubItemCount);
426 uint16_t SubItemUsed=0;
427 uint16_t SubItemUsedCountIsOne=0;
428 uint64_t ArraySize = dleftobj->ArraySize;
429 size_t mainArrayID=0;
430 rewind(fpdat);
431 travelSDLArray(dleftobj, fpdat);
433 size_t unitwritten;
434 unitwritten=fwrite(&ArraySize,sizeof(uint64_t),1u,fpdat);
435 if (unitwritten != 1)
436 err(EXIT_FAILURE, "Fail to write dat file ! [%zd]",unitwritten);
438 #endif
439 for (size_t i=0;i<totalDLAsize;i+=dleftobj->itemByte) {
440 //const unsigned char * pChunk = pDLA + i;
441 #ifdef OLD
442 theItem = 0;
443 for (uint_fast8_t j=0;j<dleftobj->itemByte;j++) {
444 theItem |= ((uint128_t)*(pDLA + i + j)) << (j*8u);
446 #elif defined NEW
447 theItem = 0;
448 for (uint_fast8_t j=0;j<dleftobj->itemByte;j++) {
449 theItem = theItem << 8u;
450 theItem |= *(pDLA + i + j);
452 #elif BYTE_ORDER == LITTLE_ENDIAN
453 theItem = *(uint128_t*)(pDLA+i);
454 #else
455 #error Faster version is Little Endian, choose OLD or NEW to define !
456 #endif
457 Item_CountBits = (uint64_t)theItem & dleftobj->Item_CountBitMask; // Item_CountBitMask is uint64_t.
458 ++pCountHistArray[Item_CountBits];
459 #ifdef TEST
460 ++mainArrayID;
461 if (Item_CountBits) {
462 ++SubItemUsed;
463 if (Item_CountBits==1) ++SubItemUsedCountIsOne;
465 if (!(mainArrayID % SubItemCount)) {
466 ++pCountSubArray[SubItemUsed];
467 ++pCountSubNoOneArray[SubItemUsed-SubItemUsedCountIsOne];
468 fwrite(&SubItemUsed,sizeof(uint8_t),1u,fpdat); // Well, just write 1 byte each. (LE)
469 //printf("-%zd %u\n",mainArrayID,SubItemUsed);
470 SubItemUsed=0;
471 SubItemUsedCountIsOne=0;
472 } else { // mainArrayID mod SubItemCount != 0. mainArrayID == (i+1)/dleftobj->itemByte .
473 //printf("---%zd %% %d = %zd %d\n",mainArrayID,SubItemCount,mainArrayID % SubItemCount,SubItemUsed);
475 #endif
477 //THETYPE HistSum=0; // HistSum == dleftobj->ItemInsideAll
478 float128 HistSumSquared=0.0;
479 double SStd; // We need to return in a fixed type for printf
480 for (size_t p=1;p<=dleftobj->maxCountSeen;p++) {
481 //HistSum += pCountHistArray[p];
482 HistSumSquared += pCountHistArray[p] * pCountHistArray[p];
484 //http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
485 SStd = sqrtl( ( HistSumSquared-((long double)dleftobj->ItemInsideAll*(long double)dleftobj->ItemInsideAll/(long double)dleftobj->maxCountSeen) ) / (long double)(dleftobj->maxCountSeen -1) );
486 pSDLeftStat->HistSStd = SStd;
487 pSDLeftStat->HistMean = (double)dleftobj->ItemInsideAll / (double)dleftobj->maxCountSeen;
488 pSDLeftStat->HistMaxCntVal = 1; //later
489 pSDLeftStat->HistMaxHistVal = 1; //later
490 fprintf(stream,"#Kmer_real_count: %ld\n#Kmer_count_hist: %ld\n"
491 "#Kmer_depth_mean: %f\n#Kmer_depth_sStd: %f\n"
492 "#CountBit_overflow: %lu\n"
493 "\n#Kmer_frequence\tHist_value\tKmer_count\tHist_ratio\n",
494 dleftobj->ItemInsideAll,dleftobj->maxCountSeen,pSDLeftStat->HistMean,SStd,dleftobj->CountBitOverflow);
495 for (size_t p=1;p<=dleftobj->maxCountSeen;p++) {
496 fprintf(stream,"%zu\t%lu\t%lu\t%g\n",p,(uint64_t)pCountHistArray[p],
497 (uint64_t)pCountHistArray[p]*(uint64_t)p,(double)pCountHistArray[p]/(double)dleftobj->ItemInsideAll);
499 free(pCountHistArray);
500 // deal(*pextree);
501 #ifdef TEST
502 fprintf(stream,"#-----------------------------------------------\n\n#Array_size: %lu\n\n"
503 "SubArrayFilled\tCount\tCount_without_1\n",ArraySize);
504 for (uint16_t i=0;i<=SubItemCount;i++) {
505 fprintf(stream,"%u\t%lu, %g\t%lu, %g\n",i,pCountSubArray[i],(double)pCountSubArray[i]/(double)ArraySize,pCountSubNoOneArray[i],(double)pCountSubNoOneArray[i]/(double)ArraySize);
507 free(pCountSubArray);
508 #endif
509 return pSDLeftStat;
513 Speed test with OLD 2bitseqinline.h:
514 ./readscorr Saccharomyces_cerevisiae.cfg -o ttmp 2> ttmp.log
516 ==> tnew.log <==
517 User: 170.445088(s), System: 0.400939(s). Real: 174.093140(s).
518 Sleep: 3.247113(s). Block I/O times: 0/0. MaxRSS: 0 kiB.
519 Wait(s): 62(nvcsw) + 54770(nivcsw). Page Fault(s): 41294(minflt) + 0(majflt).
521 ==> told.log <==
522 User: 204.444919(s), System: 2.459626(s). Real: 215.708506(s).
523 Sleep: 8.803961(s). Block I/O times: 0/0. MaxRSS: 0 kiB.
524 Wait(s): 143(nvcsw) + 58522(nivcsw). Page Fault(s): 41294(minflt) + 0(majflt).
526 ==> ttmp.log <==
527 User: 133.809657(s), System: 1.160823(s). Real: 137.159722(s).
528 Sleep: 2.189242(s). Block I/O times: 0/0. MaxRSS: 0 kiB.
529 Wait(s): 64(nvcsw) + 82900(nivcsw). Page Fault(s): 41301(minflt) + 0(majflt).
533 zz <- file("oSCa8ms64r25k17.dat", "rb")
534 l=readBin(zz,"integer",size=8,signed=F)
535 a=readBin(zz,"integer",l,size=1,signed=F)
536 dim(a)<-c(l/8192,8192)
537 b=colMeans(a)
538 png("oSCa8ms64r25k17.png",2048,600)
539 plot(x=1:length(b),y=b)
540 dev.off();
541 close(zz);