2 #include <stdlib.h> //EXIT_FAILURE
10 #include "MurmurHash3.h"
12 #include "2bitarray.h"
16 #include "cfgparser.h"
20 #define SUBARRAY_MAX 8192u
22 const char *argp_program_version
=
23 "readscorr 0.1 @"__TIME__
"," __DATE__
;
24 const char *argp_program_bug_address
=
25 "<huxuesong@genomics.org.cn>";
27 /* Program documentation. */
29 "a Solexa Reads Kmer Counter using simplied D-Left hashing"
38 /* A description of the arguments we accept. */
39 static char args_doc
[] = "input_config (of nfo & FASTA or FASTQ files)";
41 /* The options we understand. */
42 static struct argp_option options
[] = {
43 {"interactive", 'i', 0, 0, "Prompt arguments before procedure"},
44 {"verbose", 'v', 0, 0, "Produce verbose output" },
45 {"quiet", 'q', 0, 0, "Don't produce any output" },
46 //{"silent", 's', 0, OPTION_ALIAS },
47 {"outprefix",'o', "./out",0, "Output to [./out.{dat,stat,log}]" },
48 {"kmersize", 'k', "21", 0, "K-mer size" },
49 {"countbit",'c', "9", 0, "length of kmer freq. Count in bit" },
50 {"arrayklen",'a', "512", 0, "Size in kilo(x1024) for Simplied D-Left Array" },
51 {"subarray",'s', "32", 0, "Size of single sub array of the SDLA (mod " CAT(SDL_SUBARRAY_UNIT
) ")" },
52 {"remainkeybit",'r', "31", 0, "length of SDLA Remain-Key in bit.\nWill elongate to align(8) with countbit." },
53 {"example", 'e', 0, 0, "OVERWRITE an example to [input_config]"},
57 /* Used by main to communicate with parse_opt. */
59 char *args
[1]; /* arg1 */
60 uint_fast8_t silent
, verbose
, interactive
, writexample
; //_Bool is the same under amd64, as typedef unsigned char uint_fast8_t;
62 unsigned char CountBit
, rBit
;
64 uint16_t SubItemCount
;
68 /* Parse a single option. */
70 parse_opt (int key
, char *arg
, struct argp_state
*state
) {
71 /* Get the input argument from argp_parse, which we
72 know is a pointer to our arguments structure. */
73 struct arguments
*arguments
= state
->input
;
78 arguments
->silent
= 1;
81 arguments
->verbose
= 1;
84 arguments
->writexample
= 1;
87 arguments
->interactive
= 1;
90 arguments
->outprefix
= arg
;
93 tmpArgValue
= atoi(arg
);
94 if (tmpArgValue
>=2 && tmpArgValue
<= MAX_KMER_LEN
) {
95 arguments
->kmersize
= tmpArgValue
;
97 errx(2,"-k \"%s\"=%i is not between [2,%d] !",arg
,tmpArgValue
,MAX_KMER_LEN
);
101 tmpArgValue
= atoi(arg
);
102 if (tmpArgValue
>2 && tmpArgValue
<= 64) {
103 arguments
->CountBit
= tmpArgValue
;
105 errx(2,"-c \"%s\"=%i is not a integer of [3,64] !",arg
,tmpArgValue
);
109 tmpArgValue
= atoi(arg
);
110 if (tmpArgValue
>0 && tmpArgValue
<= SIZE_MAX
/1024) {
111 arguments
->ArraySizeK
= tmpArgValue
;
113 errx(2,"-a \"%s\"=%i is not a integer of [1,%zu] !",arg
,tmpArgValue
,SIZE_MAX
/1024);
117 tmpArgValue
= atoi(arg
);
118 if (tmpArgValue
>0 && tmpArgValue
<= SUBARRAY_MAX
) {
119 arguments
->SubItemCount
= (tmpArgValue
+SDL_SUBARRAY_UNIT
-1)&(~(uint16_t)(SDL_SUBARRAY_UNIT
-1));
121 errx(2,"-s \"%s\"=%i is not a integer of [1,%u] !",arg
,tmpArgValue
,SUBARRAY_MAX
);
125 tmpArgValue
= atoi(arg
);
126 if (tmpArgValue
>5 && tmpArgValue
<= 64) {
127 arguments
->rBit
= tmpArgValue
;
129 errx(2,"-r \"%s\"=%i is not a integer of [6,64] !",arg
,tmpArgValue
);
134 if (state
->arg_num
>= 1)
135 /* Too many arguments. */
137 arguments
->args
[state
->arg_num
] = arg
;
141 if (state
->arg_num
< 1) /* Not enough arguments. */
142 //argp_usage (state);
143 argp_state_help (state
,stderr
,ARGP_HELP_STD_HELP
);
147 return ARGP_ERR_UNKNOWN
;
152 /* Our argp parser. */
153 static struct argp argp
= { options
, parse_opt
, args_doc
, doc
};
155 char *strlinker(char *main
, char *suffix
) {
156 char *outstr
= malloc(strlen(main
)+strlen(suffix
)+1);
158 strcat(outstr
,suffix
);
162 int main (int argc
, char **argv
) {
163 struct arguments arguments
;
166 arguments
.silent
= 0;
167 arguments
.verbose
= 0;
168 arguments
.interactive
= 0;
169 arguments
.writexample
= 0;
170 arguments
.outprefix
= "./out";
171 arguments
.kmersize
= 21;
172 arguments
.CountBit
= 9;
174 arguments
.ArraySizeK
= 512;
175 arguments
.SubItemCount
= 32;
177 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
178 argp_parse (&argp
, argc
, argv
, 0, 0, &arguments
);
180 if (arguments
.writexample
) {
181 printf("[!] Going to OVERWRITE an example to [%s] !\n",arguments
.args
[0]);
183 write_example_cfg(arguments
.args
[0]);
186 if (arguments
.interactive
) {
187 unsigned char rBit
= arguments
.rBit
;
188 uint16_t SubItemCount
= arguments
.SubItemCount
;
189 unsigned char itemByte
= GETitemByte_PADrBit_trimSubItemCount(arguments
.CountBit
,&rBit
,&SubItemCount
);
190 uint64_t ArraySize
= arguments
.ArraySizeK
*1024;
191 uint64_t Capacity
= ArraySize
*SubItemCount
;
192 uint64_t SDLsize
= itemByte
*Capacity
;
193 double HashBit
= rBit
+ log(ArraySize
)/log(2);
194 printf ("input_config = %s\nOutputPrefix = %s\n"
195 "VERBOSE = %s\nSILENT = %s\n"
196 "SDLA { CountBit:%u. rBit:%u(%u), ArraySize:%lu(k), SubItemCount:%u(%u) }\n"
197 "kmersize = %i, SDLAsize = %.1f MiB (%s Bytes)\n"
198 "HashBit:%.3g, ErrRate:%.12g\nMaxCapacity: %s\n",
201 arguments
.verbose
? "yes" : "no",
202 arguments
.silent
? "yes" : "no",
203 arguments
.CountBit
,rBit
,arguments
.rBit
,arguments
.ArraySizeK
,SubItemCount
,arguments
.SubItemCount
,
204 arguments
.kmersize
,(double)SDLsize
/1048576.0,commaprint(SDLsize
),
205 HashBit
,pow(0.5,rBit
)/ArraySize
,commaprint(Capacity
)
209 char *outStat
= strlinker(arguments
.outprefix
, ".stat");
210 char *outDat
= strlinker(arguments
.outprefix
, ".dat");
211 char *outLog
= strlinker(arguments
.outprefix
, ".log");
212 //printf("Out[%s][%s][%s]\n",outStat,outDat,outLog);
213 SDLConfig
*psdlcfg
=read_SDL_cfg((double)arguments
.kmersize
,arguments
.args
[0]);
216 uint64_t insertedCount
=0;
218 SDLeftArray_t
*dleftp
= dleft_arrayinit(arguments
.CountBit
,arguments
.ArraySizeK
*1024,arguments
.SubItemCount
,arguments
.kmersize
); // (9,31,5000000,48);
219 fputs("SDLA nfo: ", stderr
);
220 fprintSDLAnfo(stderr
,dleftp
);
224 fputs("\nParsing Sequence Files:\n", stderr
);
225 while ((read
= get_next_seqfile(psdlcfg
)) != -1) {
227 line
=psdlcfg
->seqfilename
;
228 fprintf(stderr
, " <%s> ...", line
);
229 //sleep(1); // the Call ...
230 SeqFileObj
*seqobj
= inSeqFinit(line
,GFIOCHRBASE
);
232 while ( (readlength
= (*seqobj
->getNextSeq
)(seqobj
)) >= 0 ) {
235 printf("-ID:[%s,%s] %zu %zu\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
236 seqobj
->name
,seqobj
->comment
,seqobj
->readlength
,seqobj
->binMallocedQQWord
,
237 seqobj
->seq
,seqobj
->qual
,(size_t)seqobj
->seq
,seqobj
->type
);
239 //NormalizeChrSeq((char*)seqobj->seq);
240 printf("rc[%s]\n",tmpseq=ChrSeqRevComp(seqobj->seq,seqobj->readlength));
243 size_t insertedKmer
= dleft_insert_read(arguments
.kmersize
,seqobj
->seq
,seqobj
->readlength
,dleftp
);
244 insertedCount
+= insertedKmer
;
246 printf("Inserted:[%zu of %lu]\n",insertedKmer
,insertedCount
);
250 fputs("\b\b\b\b, done !\n", stderr
);
251 fprintf(stderr
, "[!]Accumulated Inserted Kmer:[%lu] times\n", insertedCount
);
252 inSeqFdestroy(seqobj
);
256 fputs("\nSecond pass:\n", stderr);
257 while ((read = getline(&line, &len, fp)) != -1) {
258 if (*(line+read-1)=='\n') *(line+(--read))='\0';
259 printf("[%zu]+%zu <%s>\n", read, len, line);
262 fputs("\nHashing Done!\n", stderr
);
263 //__clock_diff = clock() - __clock_start;
264 //fprintf(stderr,"\nHashing Done with CPU time: %zd.%06zd s\n", __clock_diff/CLOCKS_PER_SEC, __clock_diff%CLOCKS_PER_SEC);
266 destory_seqfile(psdlcfg
);
267 fputs("SDLA nfo: ", stderr
);
268 fprintSDLAnfo(stderr
,dleftp
);
269 FILE *fpstat
, *fpdat
;
270 fpstat
= fopen(outStat
, "w");
271 fpdat
= fopen(outDat
, "w");
272 fprintf(fpstat
,"#KmerSize: %d\n#Kmer_theory_count: %.34Lg\n",arguments
.kmersize
,powl(4.0,(long double)arguments
.kmersize
)); // 34 from http://en.wikipedia.org/wiki/Quadruple_precision
274 #ifdef TEST /* in TEST mode, "out.stat" changed to subArray filling count */
275 SDLeftStat_t
*SDLeftStat
= dleft_stat(dleftp
,fpstat
,fpdat
);
276 #else /* Normal out.stat */
277 SDLeftStat_t
*SDLeftStat
= dleft_stat(dleftp
,fpstat
);
279 SDLdumpHead
*DatHead
= malloc(sizeof(SDLdumpHead
));
280 DatHead
->kmersize
= arguments
.kmersize
;
281 DatHead
->HistMaxCntVal
= SDLeftStat
->HistMaxCntVal
;
282 DatHead
->HistMaxHistVal
= SDLeftStat
->HistMaxHistVal
;
283 DatHead
->HistMean
= SDLeftStat
->HistMean
;
284 DatHead
->HistSStd
= SDLeftStat
->HistSStd
;
286 dleft_dump(dleftp
,DatHead
,fpdat
);
293 dleft_arraydestroy(dleftp
);
294 free(outStat
); free(outDat
); free(outLog
); // just filenames