3 * Sequence handler library by Huzefa Rangwala
16 /*********************************************************/
17 /*! \brief Initializes the <tt>gk_seq_t</tt> variable
22 \param seq A pointer to the gk_seq_t variable to initialize
25 /***********************************************************************/
/* gk_seq_init takes a pointer to the gk_seq_t to initialize and returns nothing.
 * NOTE(review): only the signature is present in this excerpt; the statements
 * of the body are not shown here, so the fields it sets cannot be listed. */
27 void gk_seq_init(gk_seq_t
*seq
)
40 /***********************************************************************/
41 /*! \brief This function creates the localizations for the various sequences
43 \param alphabet The string of symbols to index, e.g., amino acids, nucleotides, sequences
44 \returns A pointer to a gk_i2cc2i_t variable
46 /*********************************************************************/
/* Builds a pair of lookup tables for the symbols of `alphabet`:
 *   t->i2c[i]           holds the i-th character of `alphabet`
 *   t->c2i[(int)symbol] holds the position i of that character in `alphabet`
 * Both tables are allocated with 256 entries. */
48 gk_i2cc2i_t
*gk_i2cc2i_create_common(char *alphabet
)
56 nsymbols
= strlen(alphabet
);
57 t
= gk_malloc(sizeof(gk_i2cc2i_t
),"gk_i2c_create_common");
59 t
->i2c
= gk_cmalloc(256, "gk_i2c_create_common");
60 t
->c2i
= gk_imalloc(256, "gk_i2c_create_common");
/* Both tables are handed to the set helpers with the value -1 before any
 * symbol is stored -- presumably so that unused slots read as -1; confirm
 * against gk_cset/gk_iset. */
63 gk_cset(256, -1, t
->i2c
);
64 gk_iset(256, -1, t
->c2i
);
/* Record each alphabet symbol in both directions.
 * NOTE(review): the character is cast to int and used directly as an index;
 * if char is signed and a symbol has its high bit set this index would be
 * negative -- confirm alphabets are plain ASCII. */
66 for(i
=0;i
<nsymbols
;i
++){
67 t
->i2c
[i
] = alphabet
[i
];
68 t
->c2i
[(int)alphabet
[i
]] = i
;
76 /*********************************************************************/
77 /*! \brief This function reads a pssm in the format of gkmod pssm
79 \param filename The name of the PSSM file
82 /********************************************************************/
/* Reads the text file `filename` with fgets and fills a newly allocated
 * gk_seq_t: one `sequence` entry plus one `pssm` row and one `psfm` row per
 * data line, together with `nsymbols`, `name` and `len`.
 * Read failures are reported through errexit. */
83 gk_seq_t
*gk_seq_ReadGKMODPSSM(char *filename
)
87 size_t ntokens
, nbytes
, len
;
/* AAORDER is the alphabet passed to gk_i2cc2i_create_common below.
 * PSSMWIDTH (20) is the number of columns per matrix row and the number of
 * header labels that are read. */
92 static char *AAORDER
= "ARNDCQEGHILKMFPSTWYVBZX*";
93 static int PSSMWIDTH
= 20;
94 char *header
, line
[MAXLINELEN
];
95 gk_i2cc2i_t
*converter
;
97 header
= gk_cmalloc(PSSMWIDTH
, "gk_seq_ReadGKMODPSSM: header");
99 converter
= gk_i2cc2i_create_common(AAORDER
);
/* gk_getfilestats fills len, ntokens and nbytes; len is used below as the
 * number of rows to allocate and read -- presumably the file's line count,
 * TODO confirm against gk_getfilestats. */
101 gk_getfilestats(filename
, &len
, &ntokens
, NULL
, &nbytes
);
104 seq
= gk_malloc(sizeof(gk_seq_t
),"gk_seq_ReadGKMODPSSM");
/* Per-row storage: one sequence entry and two len x PSSMWIDTH integer
 * matrices (the third argument 0 is presumably the initial value). */
108 seq
->sequence
= gk_imalloc(len
, "gk_seq_ReadGKMODPSSM");
109 seq
->pssm
= gk_iAllocMatrix(len
, PSSMWIDTH
, 0, "gk_seq_ReadGKMODPSSM");
110 seq
->psfm
= gk_iAllocMatrix(len
, PSSMWIDTH
, 0, "gk_seq_ReadGKMODPSSM");
112 seq
->nsymbols
= PSSMWIDTH
;
113 seq
->name
= gk_getbasename(filename
);
115 fpin
= gk_fopen(filename
,"r","gk_seq_ReadGKMODPSSM");
118 /* Read the header line */
119 if (fgets(line
, MAXLINELEN
-1, fpin
) == NULL
)
120 errexit("Unexpected end of file: %s\n", filename
);
/* Split the header on spaces, tabs and newlines. */
122 gk_strtokenize(line
, " \t\n", &tokens
);
/* Keep the first character of each of the first PSSMWIDTH header tokens as
 * the label of that column.
 * NOTE(review): nothing visible here checks that the header line actually
 * has PSSMWIDTH tokens -- confirm the input format guarantees it. */
124 for (i
=0; i
<PSSMWIDTH
; i
++)
125 header
[i
] = tokens
.list
[i
][0];
127 gk_freetokenslist(&tokens
);
130 /* Read the rest of the lines */
131 for (i
=0, ii
=0; ii
<len
; ii
++) {
132 if (fgets(line
, MAXLINELEN
-1, fpin
) == NULL
)
133 errexit("Unexpected end of file: %s\n", filename
);
135 gk_strtokenize(line
, " \t\n", &tokens
);
/* The first character of token 1 is mapped through c2i and stored as the
 * sequence entry for row i. */
137 seq
->sequence
[i
] = converter
->c2i
[(int)tokens
.list
[1][0]];
/* Tokens 2 .. 2+PSSMWIDTH-1 are stored in pssm and the following PSSMWIDTH
 * tokens in psfm. The destination column is c2i of the j-th header label,
 * so columns are placed by symbol rather than by file order.
 * NOTE(review): atoi reports no conversion errors, and the token count of
 * the line is not checked in the visible code -- presumably each data line
 * carries at least 2+2*PSSMWIDTH tokens; confirm. */
139 for (j
=0; j
<PSSMWIDTH
; j
++) {
140 seq
->pssm
[i
][converter
->c2i
[(int)header
[j
]]] = atoi(tokens
.list
[2+j
]);
141 seq
->psfm
[i
][converter
->c2i
[(int)header
[j
]]] = atoi(tokens
.list
[2+PSSMWIDTH
+j
]);
146 gk_freetokenslist(&tokens
);
/* NOTE(review): the statement that advances i is not visible in this
 * excerpt -- confirm against the full source that i counts the stored rows. */
150 seq
->len
= i
; /* Reset the length if certain characters were skipped */
152 gk_free((void **)&header
, LTERM
);
159 /**************************************************************************/
160 /*! \brief This function frees the memory allocated to the seq structure.
165 /**************************************************************************/
/* Releases the storage reachable from `seq`: the pssm and psfm matrices,
 * the name and sequence arrays, and finally the pointer held in `seq`. */
166 void gk_seq_free(gk_seq_t
*seq
)
/* Both matrices are freed using seq->len and seq->nsymbols as their
 * dimensions. */
168 gk_iFreeMatrix(&seq
->pssm
, seq
->len
, seq
->nsymbols
);
169 gk_iFreeMatrix(&seq
->psfm
, seq
->len
, seq
->nsymbols
);
/* name and sequence are released in one call; LTERM ends the argument list. */
170 gk_free((void **)&seq
->name
, &seq
->sequence
, LTERM
);
171 //gk_free((void **)&seq, LTERM);
/* NOTE(review): the address passed here is that of the local parameter
 * `seq`, so if gk_free resets the pointers it is given (presumably it does;
 * confirm) only this local copy is reset, not the caller's pointer. */
172 gk_free((void **) &seq
, LTERM
);