4 NumofCore
<- detectCores(logical
= TRUE)
6 # 这是实验室同学委托咱写的,统计给定尺寸的窗口内,SNP出现次数的分布。
7 # 咱觉得那篇文章统计这个纯属无聊,只能证明该物种基因组很大,能被分成50个以上的100k的窗口。即“中心极限定理”。
8 # 好吧,刚才确认定理名时看到:“独立同分布的中心极限定理”,于是,算是证明该个体,在100k的尺度上,没有明显的连锁,吧。额,貌似有点用?
9 # 貌似美元符号会按LaTex解析,额,反正下面有原文本,大家将就看吧。为这个折腾转义也太EP了。
10 # 话说,SNP频率不独立,和连锁没关系吧。那是证明啥?在这个尺度上没有“鱼”和“饵”的关系?本来真核生物中,也就酵母等微生物才把同一代谢通路的基因挨着放吧。于是,这统计到底有啥用?
11 # http://pastebin.com/Ks0F3r3y
12 # https://twitter.com/galaxy001/status/630356226782564353
15 if (is
.null(argv
) | length(argv
)<3) {
16 cat("Usage: ./vcfdensity.r <WinSize> <input.vcf.gz> <output.prefix>\n")
19 WinSize
<- as
.integer(argv
[1])
20 if (is
.na(WinSize
)) WinSize
<- 0L
21 InVCFgz
<- trimws(argv
[2],'both')
22 OutP
<- trimws(argv
[3],'both')
23 cat("[!] WinSize=[",WinSize
,"], [",InVCFgz
,"]->[",OutP
,"].*, Core:[",NumofCore
,"]\n",sep
='')
24 if (!file
.exists(InVCFgz
)) {
25 cat("[x] File not found:[", InVCFgz
, "] !\n",sep
='')
29 cat("[x] WinSize=[", WinSize
, "] is smaller than [1] !\n",sep
='')
33 } else { # for `source("vcfdensity.r")`
37 cat("[!] WinSize=[",WinSize
,"], [",InVCFgz
,"]->[",OutP
,"].*, Core:[",NumofCore
,"]\n",sep
='')
41 #suppressPackageStartupMessages(library('zoo'))
42 # http://www.r-bloggers.com/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
43 wapply
<- function(x
, width
, by
= NULL, FUN
= NULL, ...) {
45 if (is
.null(by
)) by
<- width
47 SEQ1
<- seq(1, lenX
- width
+ 1, by
= by
)
48 SEQ2
<- lapply(SEQ1
, function(x
) x
:(x
+ width
- 1))
49 OUT
<- lapply(SEQ2
, function(a
) FUN(x
[a
], ...))
50 #OUT <- mclapply(SEQ2, function(a) FUN(x[a], ...), mc.cores = getOption("mc.cores", NumofCore))
51 OUT
<- base
:::simplify2array(OUT
, higher
= TRUE)
55 options(datatable
.verbose
=T)
57 #tab5rows <- read.table(pipe(paste0("gzip -dc ",InVCFgz," | cut -f1,2")),sep="\t",nrows=5)
58 #classes <- sapply(tab5rows, class)
59 classes
<- c(V1
="factor",V2
="integer")
60 #tabAll <- read.table(pipe("zcat vcf.gz | cut -f1,2|head -300"),sep="\t", colClasses = classes,col.names=c('Chr','Pos'))
62 if (grepl("\\.gz$",InVCFgz
,ignore
.case
=T)) {
63 fileIn
<- paste0("gzip -dc ",InVCFgz
,"|awk '!/^#|\tINDEL;/'|cut -f1,2")
64 # grep -ve '^#' 也可,但 MacOS X 下没有 grep -vP '\tINDEL'
65 # Piped file will be in `/dev/shm/file46583ba3b517` tempory, thus add `|cut -f1,2` to reduce its footprint.
66 } else if (grepl("\\.vcf$",InVCFgz
,ignore
.case
=T)) {
69 fileIn
<- paste0("bcftools view -H -vsnps ",InVCFgz
,"|cut -f1,2")
72 tabAll
<- fread(fileIn
, header
=F,stringsAsFactors
=T,sep
="\t",autostart
=100,select
=c(1,2), colClasses
=classes
, data
.table
=T)
73 setnames(tabAll
,1,'Chr')
74 setnames(tabAll
,2,'Pos')
76 cat("...\t...\t...\n")
80 dorolling
<- function(x
, rollwin
) {
81 chrdat
<- integer(0) # No need to `integer(max(x))` as we use `sum(na.rm=T)`
83 #thelen <- length(chrdat)
84 #length(chrdat) <- ceiling(thelen/WinSize)*WinSize # 补齐末端会造成 bias
85 #chrdat[is.na(chrdat)] <- 0L
86 res0
<- wapply(chrdat
, rollwin
, FUN
= sum
,na
.rm
= TRUE)
89 #resArr <- tabAll[, dorolling(Pos,rollwin=WinSize), by=Chr]
90 #resAll <- unlist(resArr$V1,use.names=F)
92 #resArr <- tabAll[, .(Density=dorolling(Pos,rollwin=WinSize)), by=Chr]
93 # lapply optimization changed j from 'lapply(.SD, dorolling, rollwin = WinSize)' to 'list(dorolling(Pos, rollwin = WinSize))'
95 # http://stackoverflow.com/questions/19082794/speed-up-data-table-group-by-using-multiple-cores-and-parallel-programming
96 resArr
<- mclapply(unique(tabAll
[["Chr"]]),
98 tabAll
[.(x
),.(Density
=dorolling(Pos
,rollwin
=WinSize
)), by
=Chr
]
99 }, mc
.cores
=NumofCore
)
100 resArr
<- rbindlist(resArr
)
102 resAll
<- unlist(resArr$Density
,use
.names
=F)
103 cat("[!] Stat done.\n")
107 write
.table(tbl
, paste0(OutP
,".tsv"), sep
= "\t", quote
=F,row
.names
=F,col
.names
=F)
109 reshist
<- hist(resAll
,plot
=F)
110 pdf(file
=paste0(OutP
,".pdf"),title
='Histogram of VCF Density')
111 plot(reshist
,freq
=F,main
='Histogram of SNP windowed density',xlab
=paste0('SNP Count in every ',WinSize
,' bps'))
114 resPhist
<- hist(resAll
[resAll
!=0],plot
=F)
115 pdf(file
=paste0(OutP
,".nonZero.pdf"),title
='Histogram of VCF Density')
116 plot(resPhist
,freq
=F,main
='Histogram of SNP windowed density (+)',xlab
=paste0('SNP Count in every ',WinSize
,' bps'))