modified: faststater
[GalaxyCodeBases.git] / c_cpp / faststater / gcount.c
blobf02b043a1adff6a3f8f1a9c09bb5aff9652dd63c
1 #define _GNU_SOURCE
2 #include <stdlib.h> //EXIT_FAILURE
3 #include <stdio.h>
4 #include <stdint.h>
5 #include <string.h>
6 #include <err.h>
7 #include <argp.h>
8 #include "gFileIO.h"
9 #include "chrseq.h"
10 #include "timer.h"
12 #define MAXHOMOPOLYLEN (128ul*1024)
13 #define BaseA 1
14 #define BaseT 3
15 #define BaseC 2
16 #define BaseG 4
17 #define BaseATCG 5
18 #define BaseN 0
19 uint64_t HomoPoly[6][MAXHOMOPOLYLEN];
20 uint64_t maxHomoPoly[6],BaseCounter[5];
21 unsigned char theBase;
23 const char *argp_program_version =
24 "HomoPolymer Counter 0.1 @"__TIME__ "," __DATE__;
25 const char *argp_program_bug_address =
26 "<huxuesong@genomics.org.cn>";
28 /* Program documentation. */
29 static char doc[] =
30 "HomoPolymer Counter"
31 #ifdef DEBUG
32 " (Debug Version)"
33 #endif
36 /* A description of the arguments we accept. */
37 static char args_doc[] = "input_files (FASTA or FASTQ)";
39 /* The options we understand. */
40 static struct argp_option options[] = {
41 {"outfile", 'o', "./out.stat",0, "Output file" },
42 { 0 }
45 /* Used by main to communicate with parse_opt. */
46 struct arguments {
47 char **args;
48 char *outfile;
51 /* Parse a single option. */
52 static error_t
53 parse_opt (int key, char *arg, struct argp_state *state) {
54 /* Get the input argument from argp_parse, which we
55 know is a pointer to our arguments structure. */
56 struct arguments *arguments = state->input;
58 switch (key) {
59 case 'o':
60 arguments->outfile = arg;
61 break;
63 case ARGP_KEY_ARG:
64 arguments->args[state->arg_num] = arg;
65 break;
67 case ARGP_KEY_END:
68 if (state->arg_num < 1) /* Not enough arguments. */
69 //argp_usage (state);
70 argp_state_help (state,stderr,ARGP_HELP_STD_HELP);
71 break;
73 default:
74 return ARGP_ERR_UNKNOWN;
76 return 0;
79 /* Our argp parser. */
80 static struct argp argp = { options, parse_opt, args_doc, doc };
82 int main (int argc, char **argv) {
83 struct arguments arguments;
84 arguments.args=calloc(sizeof(size_t),argc);
86 // Default values.
87 arguments.outfile = "./out.stat";
89 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
90 argp_parse (&argp, argc, argv, 0, 0, &arguments);
92 char lastbase;
93 ssize_t lastpos,polymerLen;
94 uint64_t allbases=0;
96 char *const*line = arguments.args-1;
98 G_TIMER_START;
100 fputs("\nParsing Sequence Files:\n", stderr);
101 while ( *(++line) ) {
102 ssize_t readlength;
103 fprintf(stderr, " <%s>:\n", *line);
104 SeqFileObj *seqobj = inSeqFinit(*line,GFIOCHRBASE);
105 if (seqobj) {
106 while ( (readlength = (*seqobj->getNextSeq)(seqobj)) >= 0 ) {
107 #ifdef DEBUG
108 printf("-ID:[%s,%s] %zu %zu %zd\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
109 seqobj->name,seqobj->comment,seqobj->readlength,seqobj->binMallocedQQWord,readlength,
110 seqobj->seq,seqobj->qual,(size_t)seqobj->seq,seqobj->type);
111 #endif
112 fprintf(stderr, " >%s %s,%zu ...",seqobj->name,seqobj->comment,seqobj->readlength);
113 lastbase=seqobj->seq[0];
114 lastpos=0;
115 allbases += readlength;
116 for (ssize_t i=1;i<=readlength;i++) { // *(readlength+1)=='\0'
117 if (seqobj->seq[i] != lastbase) {
118 polymerLen = i-lastpos;
119 switch (lastbase) {
120 case 'A':
121 theBase=BaseA;
122 break;
123 case 'T':
124 theBase=BaseT;
125 break;
126 case 'C':
127 theBase=BaseC;
128 break;
129 case 'G':
130 theBase=BaseG;
131 break;
132 default:
133 theBase=BaseN;
134 break;
136 if (polymerLen<MAXHOMOPOLYLEN) {
137 ++HomoPoly[theBase][polymerLen];
138 if (lastbase != 'N') ++HomoPoly[BaseATCG][polymerLen];
139 if (maxHomoPoly[theBase]<polymerLen) {
140 maxHomoPoly[theBase]=polymerLen;
141 if (lastbase != 'N' && maxHomoPoly[BaseATCG]<polymerLen)
142 maxHomoPoly[BaseATCG]=polymerLen;
144 } else {
145 ++HomoPoly[theBase][0];
146 if (lastbase != 'N') ++HomoPoly[BaseATCG][0];
148 BaseCounter[theBase] += polymerLen;
149 lastbase=seqobj->seq[i];
150 lastpos=i;
153 fputs("\b\b\b\b, done !\n", stderr);
155 } else continue;
156 inSeqFdestroy(seqobj);
158 free(arguments.args);
159 fputs("\nStat Done!\n", stderr);
161 FILE *fp = fopen(arguments.outfile, "w");
162 fprintf(fp,"# Total_Bases: %llu\n"
163 "# A: %llu\tT: %llu\tC: %llu\tG: %llu\tN: %llu\n"
164 "# A%%: %.3f\tT%%: %.3f\tC%%: %.3f\tG%%: %.3f\tN%%: %.3f\n"
165 "\n#Base\tRepeat_Count\tTimes\tBaseRatio\n",
166 allbases,
167 BaseCounter[BaseA],BaseCounter[BaseT],BaseCounter[BaseC],BaseCounter[BaseG],BaseCounter[BaseN],
168 100*(double)BaseCounter[BaseA]/allbases,100*(double)BaseCounter[BaseT]/allbases,
169 100*(double)BaseCounter[BaseC]/allbases,100*(double)BaseCounter[BaseG]/allbases,
170 100*(double)BaseCounter[BaseN]/allbases);
171 for (unsigned char base=0;base<6;base++) {
172 switch (base) {
173 case BaseA:
174 theBase='A';
175 break;
176 case BaseT:
177 theBase='T';
178 break;
179 case BaseC:
180 theBase='C';
181 break;
182 case BaseG:
183 theBase='G';
184 break;
185 case BaseATCG:
186 theBase='H';
187 break;
188 default:
189 theBase='N';
190 break;
192 for (size_t polymerLen=1;polymerLen<=maxHomoPoly[base];polymerLen++) {
193 if (HomoPoly[base][polymerLen])
194 fprintf(fp,"%c\t%lu\t%llu\t%g\n",theBase,polymerLen,HomoPoly[base][polymerLen],
195 (double)HomoPoly[base][polymerLen]*polymerLen/allbases);
197 if (HomoPoly[base][0]) {
198 fprintf(fp,"#%c\t>=%lu\t%llu\n",theBase,MAXHOMOPOLYLEN,HomoPoly[base][0]);
200 fprintf(fp,"#----------------------------------------------------------------\n");
202 fclose(fp);
204 G_TIMER_END;
205 G_TIMER_PRINT;
207 exit(EXIT_SUCCESS);