2 #include <stdlib.h> //EXIT_FAILURE
12 #define MAXHOMOPOLYLEN (128ul*1024)
19 uint64_t HomoPoly
[6][MAXHOMOPOLYLEN
];
20 uint64_t maxHomoPoly
[6],BaseCounter
[5];
21 unsigned char theBase
;
23 const char *argp_program_version
=
24 "HomoPolymer Counter 0.1 @"__TIME__
"," __DATE__
;
25 const char *argp_program_bug_address
=
26 "<huxuesong@genomics.org.cn>";
28 /* Program documentation. */
36 /* A description of the arguments we accept. */
37 static char args_doc
[] = "input_files (FASTA or FASTQ)";
39 /* The options we understand. */
40 static struct argp_option options
[] = {
41 {"outfile", 'o', "./out.stat",0, "Output file" },
45 /* Used by main to communicate with parse_opt. */
51 /* Parse a single option. */
53 parse_opt (int key
, char *arg
, struct argp_state
*state
) {
54 /* Get the input argument from argp_parse, which we
55 know is a pointer to our arguments structure. */
56 struct arguments
*arguments
= state
->input
;
60 arguments
->outfile
= arg
;
64 arguments
->args
[state
->arg_num
] = arg
;
68 if (state
->arg_num
< 1) /* Not enough arguments. */
70 argp_state_help (state
,stderr
,ARGP_HELP_STD_HELP
);
74 return ARGP_ERR_UNKNOWN
;
79 /* Our argp parser. */
80 static struct argp argp
= { options
, parse_opt
, args_doc
, doc
};
82 int main (int argc
, char **argv
) {
83 struct arguments arguments
;
84 arguments
.args
=calloc(sizeof(size_t),argc
);
87 arguments
.outfile
= "./out.stat";
89 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
90 argp_parse (&argp
, argc
, argv
, 0, 0, &arguments
);
93 ssize_t lastpos
,polymerLen
;
96 char *const*line
= arguments
.args
-1;
100 fputs("\nParsing Sequence Files:\n", stderr
);
101 while ( *(++line
) ) {
103 fprintf(stderr
, " <%s>:\n", *line
);
104 SeqFileObj
*seqobj
= inSeqFinit(*line
,GFIOCHRBASE
);
106 while ( (readlength
= (*seqobj
->getNextSeq
)(seqobj
)) >= 0 ) {
108 printf("-ID:[%s,%s] %zu %zu %zd\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
109 seqobj
->name
,seqobj
->comment
,seqobj
->readlength
,seqobj
->binMallocedQQWord
,readlength
,
110 seqobj
->seq
,seqobj
->qual
,(size_t)seqobj
->seq
,seqobj
->type
);
112 fprintf(stderr
, " >%s %s,%zu ...",seqobj
->name
,seqobj
->comment
,seqobj
->readlength
);
113 lastbase
=seqobj
->seq
[0];
115 allbases
+= readlength
;
116 for (ssize_t i
=1;i
<=readlength
;i
++) { // *(readlength+1)=='\0'
117 if (seqobj
->seq
[i
] != lastbase
) {
118 polymerLen
= i
-lastpos
;
136 if (polymerLen
<MAXHOMOPOLYLEN
) {
137 ++HomoPoly
[theBase
][polymerLen
];
138 if (lastbase
!= 'N') ++HomoPoly
[BaseATCG
][polymerLen
];
139 if (maxHomoPoly
[theBase
]<polymerLen
) {
140 maxHomoPoly
[theBase
]=polymerLen
;
141 if (lastbase
!= 'N' && maxHomoPoly
[BaseATCG
]<polymerLen
)
142 maxHomoPoly
[BaseATCG
]=polymerLen
;
145 ++HomoPoly
[theBase
][0];
146 if (lastbase
!= 'N') ++HomoPoly
[BaseATCG
][0];
148 BaseCounter
[theBase
] += polymerLen
;
149 lastbase
=seqobj
->seq
[i
];
153 fputs("\b\b\b\b, done !\n", stderr
);
156 inSeqFdestroy(seqobj
);
158 free(arguments
.args
);
159 fputs("\nStat Done!\n", stderr
);
161 FILE *fp
= fopen(arguments
.outfile
, "w");
162 fprintf(fp
,"# Total_Bases: %llu\n"
163 "# A: %llu\tT: %llu\tC: %llu\tG: %llu\tN: %llu\n"
164 "# A%%: %.3f\tT%%: %.3f\tC%%: %.3f\tG%%: %.3f\tN%%: %.3f\n"
165 "\n#Base\tRepeat_Count\tTimes\tBaseRatio\n",
167 BaseCounter
[BaseA
],BaseCounter
[BaseT
],BaseCounter
[BaseC
],BaseCounter
[BaseG
],BaseCounter
[BaseN
],
168 100*(double)BaseCounter
[BaseA
]/allbases
,100*(double)BaseCounter
[BaseT
]/allbases
,
169 100*(double)BaseCounter
[BaseC
]/allbases
,100*(double)BaseCounter
[BaseG
]/allbases
,
170 100*(double)BaseCounter
[BaseN
]/allbases
);
171 for (unsigned char base
=0;base
<6;base
++) {
192 for (size_t polymerLen
=1;polymerLen
<=maxHomoPoly
[base
];polymerLen
++) {
193 if (HomoPoly
[base
][polymerLen
])
194 fprintf(fp
,"%c\t%lu\t%llu\t%g\n",theBase
,polymerLen
,HomoPoly
[base
][polymerLen
],
195 (double)HomoPoly
[base
][polymerLen
]*polymerLen
/allbases
);
197 if (HomoPoly
[base
][0]) {
198 fprintf(fp
,"#%c\t>=%lu\t%llu\n",theBase
,MAXHOMOPOLYLEN
,HomoPoly
[base
][0]);
200 fprintf(fp
,"#----------------------------------------------------------------\n");