new file: dodseq.r
[GalaxyCodeBases.git] / R / vcfdensity.r
blob5db8edf5149b0d8a1ab2186f91b182212bac1ad7
1 #!/usr/bin/env littler
3 library('parallel')
4 NumofCore <- detectCores(logical = TRUE)
6 # 这是实验室同学委托咱写的,统计给定尺寸的窗口内,SNP出现次数的分布。
7 # 咱觉得那篇文章统计这个纯属无聊,只能证明该物种基因组很大,能被分成50个以上的100k的窗口。即“中心极限定理”。
8 # 好吧,刚才确认定理名时看到:“独立同分布的中心极限定理”,于是,算是证明该个体,在100k的尺度上,没有明显的连锁,吧。额,貌似有点用?
9 # 貌似美元符号会按LaTex解析,额,反正下面有原文本,大家将就看吧。为这个折腾转义也太EP了。
10 # 话说,SNP频率不独立,和连锁没关系吧。那是证明啥?在这个尺度上没有“鱼”和“饵”的关系?本来真核生物中,也就酵母等微生物才把同一代谢通路的基因挨着放吧。于是,这统计到底有啥用?
11 # http://pastebin.com/Ks0F3r3y
12 # https://twitter.com/galaxy001/status/630356226782564353
14 if (!interactive()) {
15 if (is.null(argv) | length(argv)<3) {
16 cat("Usage: ./vcfdensity.r <WinSize> <input.vcf.gz> <output.prefix>\n")
17 q(status=1)
18 } else {
19 WinSize <- as.integer(argv[1])
20 if (is.na(WinSize)) WinSize <- 0L
21 InVCFgz <- trimws(argv[2],'both')
22 OutP <- trimws(argv[3],'both')
23 cat("[!] WinSize=[",WinSize,"], [",InVCFgz,"]->[",OutP,"].*, Core:[",NumofCore,"]\n",sep='')
24 if (!file.exists(InVCFgz)) {
25 cat("[x] File not found:[", InVCFgz, "] !\n",sep='')
26 q(status=-1)
28 if ( WinSize < 1L ) {
29 cat("[x] WinSize=[", WinSize, "] is smaller than [1] !\n",sep='')
30 q(status=-1)
33 } else { # for `source("vcfdensity.r")`
34 InVCFgz <- 't.vcf'
35 WinSize <- 100L
36 OutP <- 'test'
37 cat("[!] WinSize=[",WinSize,"], [",InVCFgz,"]->[",OutP,"].*, Core:[",NumofCore,"]\n",sep='')
40 library('data.table')
41 #suppressPackageStartupMessages(library('zoo'))
42 # http://www.r-bloggers.com/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
43 wapply <- function(x, width, by = NULL, FUN = NULL, ...) {
44 FUN <- match.fun(FUN)
45 if (is.null(by)) by <- width
46 lenX <- length(x)
47 SEQ1 <- seq(1, lenX - width + 1, by = by)
48 SEQ2 <- lapply(SEQ1, function(x) x:(x + width - 1))
49 OUT <- lapply(SEQ2, function(a) FUN(x[a], ...))
50 #OUT <- mclapply(SEQ2, function(a) FUN(x[a], ...), mc.cores = getOption("mc.cores", NumofCore))
51 OUT <- base:::simplify2array(OUT, higher = TRUE)
52 return(OUT)
55 options(datatable.verbose=T)
57 #tab5rows <- read.table(pipe(paste0("gzip -dc ",InVCFgz," | cut -f1,2")),sep="\t",nrows=5)
58 #classes <- sapply(tab5rows, class)
59 classes <- c(V1="factor",V2="integer")
60 #tabAll <- read.table(pipe("zcat vcf.gz | cut -f1,2|head -300"),sep="\t", colClasses = classes,col.names=c('Chr','Pos'))
62 if (grepl("\\.gz$",InVCFgz,ignore.case=T)) {
63 fileIn <- paste0("gzip -dc ",InVCFgz,"|awk '!/^#|\tINDEL;/'|cut -f1,2")
64 # grep -ve '^#' 也可,但 MacOS X 下没有 grep -vP '\tINDEL'
65 # Piped file will be in `/dev/shm/file46583ba3b517` tempory, thus add `|cut -f1,2` to reduce its footprint.
66 } else if (grepl("\\.vcf$",InVCFgz,ignore.case=T)) {
67 fileIn <- InVCFgz
68 } else {
69 fileIn <- paste0("bcftools view -H -vsnps ",InVCFgz,"|cut -f1,2")
72 tabAll <- fread(fileIn, header=F,stringsAsFactors=T,sep="\t",autostart=100,select=c(1,2), colClasses=classes, data.table=T)
73 setnames(tabAll,1,'Chr')
74 setnames(tabAll,2,'Pos')
75 print(head(tabAll))
76 cat("...\t...\t...\n")
77 setkey(tabAll,Chr)
78 print(tail(tabAll))
80 dorolling <- function(x, rollwin) {
81 chrdat <- integer(0) # No need to `integer(max(x))` as we use `sum(na.rm=T)`
82 chrdat[x] <- 1L
83 #thelen <- length(chrdat)
84 #length(chrdat) <- ceiling(thelen/WinSize)*WinSize # 补齐末端会造成 bias
85 #chrdat[is.na(chrdat)] <- 0L
86 res0 <- wapply(chrdat, rollwin, FUN = sum,na.rm = TRUE)
87 return(res0)
89 #resArr <- tabAll[, dorolling(Pos,rollwin=WinSize), by=Chr]
90 #resAll <- unlist(resArr$V1,use.names=F)
92 #resArr <- tabAll[, .(Density=dorolling(Pos,rollwin=WinSize)), by=Chr]
93 # lapply optimization changed j from 'lapply(.SD, dorolling, rollwin = WinSize)' to 'list(dorolling(Pos, rollwin = WinSize))'
95 # http://stackoverflow.com/questions/19082794/speed-up-data-table-group-by-using-multiple-cores-and-parallel-programming
96 resArr <- mclapply(unique(tabAll[["Chr"]]),
97 function(x) {
98 tabAll[.(x),.(Density=dorolling(Pos,rollwin=WinSize)), by=Chr]
99 }, mc.cores=NumofCore)
100 resArr <- rbindlist(resArr)
102 resAll <- unlist(resArr$Density,use.names=F)
103 cat("[!] Stat done.\n")
105 tbl <- table(resAll)
106 print(head(tbl,80))
107 write.table(tbl, paste0(OutP,".tsv"), sep = "\t", quote=F,row.names=F,col.names=F)
109 reshist <- hist(resAll,plot=F)
110 pdf(file=paste0(OutP,".pdf"),title='Histogram of VCF Density')
111 plot(reshist,freq=F,main='Histogram of SNP windowed density',xlab=paste0('SNP Count in every ',WinSize,' bps'))
112 dev.off()
114 resPhist <- hist(resAll[resAll!=0],plot=F)
115 pdf(file=paste0(OutP,".nonZero.pdf"),title='Histogram of VCF Density')
116 plot(resPhist,freq=F,main='Histogram of SNP windowed density (+)',xlab=paste0('SNP Count in every ',WinSize,' bps'))
117 dev.off()