new file: read10x.py
[GalaxyCodeBases.git] / R / FGI / rmd3 / doAll.r
blob9f55bcd44004f89daabbb6f9cab5e6249eae2fda
1 #!/usr/bin/env littler
3 ########################################################################################################
5 # Usage: Rscript gene_id_run.R -i input_samples.csv -f allele_freq.csv -d database.csv -m input_info.txt
7 ########################################################################################################
9 # -i files/msSNP.csv -m files/meta.txt -d files/database.csv -f files/allele_freq.csv -p out
11 # 设置数值显示位数
12 options(scipen = 200)
14 # 读取命令行参数
15 library(getopt)
16 spec = matrix(
17 c("input","i",2,"character","the input genotype csv of suspect",
18 "allele_freq","f",2,"character","allele frequence csv of SNPs",
19 "database","d",2,"character","the database csv stored SNPs genotype",
20 "outpath","p",2,"character","out dir.",
21 "input_info","m",2,"character","meta infomation of input samples"),
22 byrow = TRUE, ncol = 5
25 opt = getopt(spec = spec)
27 library(RCircos)
28 data(UCSC.HG19.Human.CytoBandIdeogram)
30 # 定义一个计算两个样本相似度的核心函数
31 calculate = function(input_geno_df, sample_geno_df, allele_freq_df){
32 likehood = 1
33 for (rsid in colnames(input_geno_df)){
34 ig = input_geno_df[1,rsid]
35 sg = sample_geno_df[1,rsid]
36 if ((ig == 'Null') || (sg == 'Null')){pm_value = 1}
37 else{
38 if (ig != sg){pm_value = 0.0001}
39 if (ig == sg){
40 iw = c(substr(ig,1,1), substr(ig,2,2))
41 sw = c(substr(sg,1,1), substr(sg,2,2))
42 if (iw[1] == iw[2]){pm_value = 1 / ((allele_freq_df[rsid,iw[1]])**2)}
43 if (iw[1] != iw[2]){
44 if ((allele_freq_df[rsid,iw[1]]==0.01) || (allele_freq_df[rsid,iw[2]]==0.01)){pm_value = 100}
45 else{pm_value = 1 / (2 * allele_freq_df[rsid,iw[1]] * allele_freq_df[rsid,iw[2]])}
49 sample_geno_df[1,rsid] = paste(sg, pm_value)
50 likehood = likehood * pm_value
52 sample_geno_df$tpm = likehood
53 return(sample_geno_df)
56 # 读入文件
57 input_geno_df = read.csv(opt$input, row.names = 1)
58 #allele_freq_df = read.csv(opt$allele_freq, row.names = 1)
59 #database_df = read.csv(opt$database, row.names = 1)
60 input_info_df = suppressWarnings(read.delim(opt$input_info, fileEncoding = "UTF-16", row.names = 1))
61 #input_geno_df = read.csv("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv", row.names = 1)
62 allele_freq_df = read.csv("files/allele_freq.csv", row.names = 1)
63 database_df = read.csv("files/database.csv", row.names = 1)
64 #input_info_df = read.delim("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_info.txt", fileEncoding = "UTF-16", row.names = 1)
66 # 创建样本比对结果输出文件夹
67 input_file_name = strsplit(basename(opt$input),".", fixed = T)[[1]][1]
68 #input_file_name = strsplit(basename("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv"),".", fixed = T)[[1]][1]
69 #out_dir_name = paste(input_file_name, "_out", sep = '')
70 #out_dir_name = paste(opt$outpath)
71 out_dir_name <- 'out'
72 dir.create(out_dir_name,showWarnings=F)
73 dir.create('pdf',showWarnings=F)
75 # 对数据预处理,将等位基因频率0替换成0.01,将纯合‘A’替换成’AA‘
76 allele_freq_df[allele_freq_df==0] = 0.01
77 input_geno_df[input_geno_df == "TRUE"] = "T"
78 for (r in rownames(input_geno_df)){
79 for (c in colnames(input_geno_df)){
80 if (nchar(input_geno_df[r,c]) == 1){
81 input_geno_df[r,c] = paste(input_geno_df[r,c], input_geno_df[r,c], sep = '')
86 for (r in rownames(database_df)){
87 for (c in colnames(database_df)){
88 if (nchar(database_df[r,c]) == 1){
89 database_df[r,c] = paste(database_df[r,c], database_df[r,c], sep = '')
94 # 计算每个样本与数据库的比对结果,并返回比对总表、基因型报告两个表格
95 NeedRerun = c()
96 for (suspect in rownames(input_geno_df)){
97 match_file = paste(suspect, "_match.csv", sep = '')
98 report_file = paste(suspect, "_report.csv", sep = '')
99 suspect_geno_df = input_geno_df[suspect,]
100 cat('Processing',suspect,':')
101 # 计算该样本与数据库中每个样本的比对结果
102 samples = rownames(database_df)
103 sample_geno_df_one = calculate(suspect_geno_df, database_df[samples[1],], allele_freq_df)
104 for (sample_name in samples[c(2:length(samples))]){
105 sample_geno_df_two = calculate(suspect_geno_df, database_df[sample_name,], allele_freq_df)
106 sample_geno_df_one = rbind(sample_geno_df_one, sample_geno_df_two)
108 sorted_all_result_df = sample_geno_df_one[order(sample_geno_df_one[,"tpm"], decreasing = T), ]
109 # 对匹配结果进行分级,做判断陈述
110 QCstate = 'PASS'
111 if (sorted_all_result_df[1,"tpm"] < 0.0001){statement = "无匹配对象"}else{
112 if (sorted_all_result_df[1,"tpm"] <= 8000000000){
113 mid_match = subset(sorted_all_result_df, (tpm <= 8000000000) && (tpm >= 0.0001))
114 samples = paste(rownames(mid_match), collapse = ',')
115 statement = paste("与样本", samples, "可能一致, 应增加其它检测")
116 QCstate = 'FAIL'
117 NeedRerun <- c(NeedRerun,suspect)
118 }else{
119 top_match = subset(sorted_all_result_df, tpm > 8000000000)
120 samples = paste(rownames(top_match), collapse = ',')
121 statement = paste("与样本", samples, "一致")
124 sorted_all_result_df$tpm = format(sorted_all_result_df$tpm, scientific = T)
125 # 输出单个样本比对总表,要根据系统调整文件名分隔符,‘/’或者‘\\’
126 write.csv(sorted_all_result_df, file.path(out_dir_name, match_file), fileEncoding = "UTF-8")
128 # 输出单个样本的基因分型报告
129 best_rate_person = rownames(sorted_all_result_df)[1]
130 best_rate = sorted_all_result_df[best_rate_person, "tpm"]
132 # 合成初步的报告表格
133 trans_suspect_geno_df = as.data.frame(t(suspect_geno_df))
134 names(trans_suspect_geno_df) = "GT"
135 report_df = cbind(trans_suspect_geno_df, allele_freq_df[,c("Number","Chr","Pos","A","C","G","T")])
137 # 计算人群基因型频率百分比
138 for (rsid in rownames(report_df)){
139 genotype = report_df[rsid,"GT"]
140 if (genotype == "Null"){report_df[rsid, "GT%"] = "0%"}
141 else{
142 allele_1 = substr(genotype, 1, 1)
143 allele_2 = substr(genotype, 2, 2)
144 if (allele_1 == allele_2){report_df[rsid, "GT%"] = paste(format((report_df[rsid, allele_1]) ** 2 * 100, digits = 4),"%", sep = '')}
145 else{report_df[rsid, "GT%"] = paste(format(2 * report_df[rsid, allele_1] * report_df[rsid, allele_2] * 100, digits = 4), "%", sep = '')}
149 report_df$rsID = rownames(report_df)
150 report_df[rownames(report_df)[1],"report"] = suspect
151 report_df[rownames(report_df)[2],"report"] = best_rate
152 report_df[rownames(report_df)[3],"report"] = best_rate_person
153 report_df[rownames(report_df)[4],"report"] = input_info_df[suspect, "Name"]
154 report_df[rownames(report_df)[5],"report"] = input_info_df[suspect, "Sex"]
155 report_df[rownames(report_df)[6],"report"] = input_info_df[suspect, "Birth"]
156 report_df[rownames(report_df)[7],"report"] = input_info_df[suspect, "Date"]
157 report_df[rownames(report_df)[8],"report"] = input_info_df[suspect, "Tel"]
158 report_df[rownames(report_df)[9],"report"] = input_info_df[suspect, "Address"]
159 report_df[rownames(report_df)[10],"report"] = statement
160 report_df[rownames(report_df)[11],"report"] = QCstate
161 output_report_df = report_df[,c("Number","GT","GT%","rsID","Chr","Pos","report")]
163 # 输出单个样本基因型报告表格,要根据系统调整文件名分隔符,‘/’或者‘\\’
164 write.csv(output_report_df, file.path(out_dir_name, report_file), row.names = F, fileEncoding = "UTF-8")
166 # Plot
167 sample_report_df <- output_report_df
168 # 准备数据框为目标画图形式
169 sample_report_df$End = sample_report_df$Pos
170 for (rsid in rownames(sample_report_df)){
171 genotype = sample_report_df[rsid,"GT"]
172 if (genotype == "Null"){sample_report_df[rsid, "Val"] = 0}
173 else{
174 allele_1 = substr(genotype, 1, 1)
175 allele_2 = substr(genotype, 2, 2)
176 if (allele_1 == allele_2){sample_report_df[rsid, "Val"] = 1}
177 else{sample_report_df[rsid, "Val"] = 0.5}
180 # 分成4个碱基文件
181 base_group = function(df, base){
182 bool = c()
183 for (i in 1:length(rownames(df))){
184 geno = df[i,2]
185 if (geno == "Null"){bool = append(bool,FALSE)}
186 else{
187 alleles = c(substr(geno, 1, 1), substr(geno, 2, 2))
188 if (base %in% alleles){bool = append(bool,TRUE)}
189 else{bool = append(bool,FALSE)}
192 return(df[rownames(df)[bool],c("Chr","Pos","End","Val")])
195 matrix_A = base_group(sample_report_df, "A")
196 matrix_A$PlotColor = "#5050FF"
198 matrix_T = base_group(sample_report_df, "T")
199 matrix_T$PlotColor = "#CC9900"
201 matrix_G = base_group(sample_report_df, "G")
202 matrix_G$PlotColor = "#00C000"
204 matrix_C = base_group(sample_report_df, "C")
205 matrix_C$PlotColor = "#E00000"
207 #out.file <- "nCircos.pdf"
208 png_file <- paste(suspect, "_report.png", sep = '')
209 png(file=file.path(out_dir_name, png_file), height=2160, width=2160,pointsize=48)
211 #RCircos.Set.Core.Components(cyto.info=UCSC.HG19.Human.CytoBandIdeogram, tracks.inside=4, tracks.outside=0, chr.exclude=c("chrX", "chrY"))
212 suppressMessages(
213 RCircos.Set.Core.Components(cyto.info=UCSC.HG19.Human.CytoBandIdeogram, tracks.inside=4, tracks.outside=0, chr.exclude=c("chrX", "chrY"))
214 , classes = "message")
216 rcircos.params <- RCircos.Get.Plot.Parameters()
217 rcircos.params$track.background <- 'white'
218 rcircos.params$grid.line.color <- 'white'
219 rcircos.params$hist.width <- 160
220 rcircos.params$track.padding <- 0
221 suppressMessages(
222 RCircos.Reset.Plot.Parameters(rcircos.params)
223 , classes = "message")
225 RCircos.Set.Plot.Area(margins=0)
226 legend("center", legend=c("A", "C", "G", "T"), col=c("#5050FF", "#E00000","#00C000","#CC9900"), lwd=16)
228 RCircos.Chromosome.Ideogram.Plot()
230 RCircos.Histogram.Plot(hist.data=matrix_A, data.col=4,track.num=1, side="in",is.sorted=F)
231 RCircos.Histogram.Plot(hist.data=matrix_C, data.col=4,track.num=2, side="in",is.sorted=F)
232 RCircos.Histogram.Plot(hist.data=matrix_G, data.col=4,track.num=3, side="in",is.sorted=F)
233 RCircos.Histogram.Plot(hist.data=matrix_T, data.col=4,track.num=4, side="in",is.sorted=F)
235 dev.off()
237 png_trimed <- paste(suspect, ".png", sep = '')
238 imagemagickCLI <- paste('convert -trim +repage -bordercolor none -border 0x20',file.path(out_dir_name, png_file),file.path(out_dir_name, png_trimed))
239 system(imagemagickCLI)
241 thePrefix <- paste(suspect, "_report", sep = '')
242 invisible(capture.output(
243 rmarkdown::render('20200608rep.Rmd',output_dir=out_dir_name,output_file=paste0(thePrefix,'.pdf'),quiet=TRUE,clean=TRUE)
246 mergeCLI <- paste('qpdf --empty --pages inc/Cover.pdf ',file.path(out_dir_name, paste0(thePrefix,'.pdf')),' -- ', file.path('pdf', paste0(thePrefix,'.pdf')) )
247 system(mergeCLI)
249 cat(' done.\n')
252 qcfilecon <- file(file.path(out_dir_name, '_QC.txt'),'wt',encoding='UTF-8')
253 if (length(NeedRerun)) {
254 QCstate <- paste(sep="\n",'下列样品需要重测:',NeedRerun,'')
255 } else {
256 QCstate <- '这批样品正常。\n'
258 cat('\n=================\n',QCstate,'=================\n',sep='')
259 writeLines(QCstate,con=qcfilecon)
260 close(qcfilecon)