R/vcfdensity.r

   1 #!/usr/bin/env littler
   2
   3 library('parallel')
   4 NumofCore <- detectCores(logical = TRUE)
   5
   6 # 这是实验室同学委托咱写的，统计给定尺寸的窗口内，SNP出现次数的分布。
   7 # 咱觉得那篇文章统计这个纯属无聊，只能证明该物种基因组很大，能被分成50个以上的100k的窗口。即“中心极限定理”。
   8 # 好吧，刚才确认定理名时看到：“独立同分布的中心极限定理”，于是，算是证明该个体，在100k的尺度上，没有明显的连锁，吧。额，貌似有点用？
   9 # 貌似美元符号会按LaTex解析，额，反正下面有原文本，大家将就看吧。为这个折腾转义也太EP了。
  10 # 话说，SNP频率不独立，和连锁没关系吧。那是证明啥？在这个尺度上没有“鱼”和“饵”的关系？本来真核生物中，也就酵母等微生物才把同一代谢通路的基因挨着放吧。于是，这统计到底有啥用？
  11 # http://pastebin.com/Ks0F3r3y
  12 # https://twitter.com/galaxy001/status/630356226782564353
  13
  14 if (!interactive()) {
  15         if (is.null(argv) | length(argv)<3) {
  16                 cat("Usage: ./vcfdensity.r <WinSize> <input.vcf.gz> <output.prefix>\n")
  17                 q(status=1)
  18         } else {
  19                 WinSize <- as.integer(argv[1])
  20                 if (is.na(WinSize)) WinSize <- 0L
  21                 InVCFgz <- trimws(argv[2],'both')
  22                 OutP <- trimws(argv[3],'both')
  23                 cat("[!] WinSize=[",WinSize,"], [",InVCFgz,"]->[",OutP,"].*, Core:[",NumofCore,"]\n",sep='')
  24                 if (!file.exists(InVCFgz)) {
  25                         cat("[x] File not found:[", InVCFgz, "] !\n",sep='')
  26                         q(status=-1)
  27                 }
  28                 if ( WinSize < 1L ) {
  29                         cat("[x] WinSize=[", WinSize, "] is smaller than [1] !\n",sep='')
  30                         q(status=-1)
  31                 }
  32         }
  33 } else {        # for `source("vcfdensity.r")`
  34         InVCFgz <- 't.vcf'
  35         WinSize <- 100L
  36         OutP <- 'test'
  37         cat("[!] WinSize=[",WinSize,"], [",InVCFgz,"]->[",OutP,"].*, Core:[",NumofCore,"]\n",sep='')
  38 }
  39
  40 library('data.table')
  41 #suppressPackageStartupMessages(library('zoo'))
  42 # http://www.r-bloggers.com/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
  43 wapply <- function(x, width, by = NULL, FUN = NULL, ...) {
  44         FUN <- match.fun(FUN)
  45         if (is.null(by)) by <- width
  46         lenX <- length(x)
  47         SEQ1 <- seq(1, lenX - width + 1, by = by)
  48         SEQ2 <- lapply(SEQ1, function(x) x:(x + width - 1))
  49         OUT <- lapply(SEQ2, function(a) FUN(x[a], ...))
  50         #OUT <- mclapply(SEQ2, function(a) FUN(x[a], ...), mc.cores = getOption("mc.cores", NumofCore))
  51         OUT <- base:::simplify2array(OUT, higher = TRUE)
  52         return(OUT)
  53 }
  54
  55 options(datatable.verbose=T)
  56
  57 #tab5rows <- read.table(pipe(paste0("gzip -dc ",InVCFgz," | cut -f1,2")),sep="\t",nrows=5)
  58 #classes <- sapply(tab5rows, class)
  59 classes <- c(V1="factor",V2="integer")
  60 #tabAll <- read.table(pipe("zcat vcf.gz | cut -f1,2|head -300"),sep="\t", colClasses = classes,col.names=c('Chr','Pos'))
  61
  62 if (grepl("\\.gz$",InVCFgz,ignore.case=T)) {
  63         fileIn <- paste0("gzip -dc ",InVCFgz,"|awk '!/^#|\tINDEL;/'|cut -f1,2")
  64 # grep -ve '^#' 也可，但 MacOS X 下没有 grep -vP '\tINDEL'
  65 # Piped file will be in `/dev/shm/file46583ba3b517` tempory, thus add `|cut -f1,2` to reduce its footprint.
  66 } else if (grepl("\\.vcf$",InVCFgz,ignore.case=T)) {
  67         fileIn <- InVCFgz
  68 } else {
  69         fileIn <- paste0("bcftools view -H -vsnps ",InVCFgz,"|cut -f1,2")
  70 }
  71
  72 tabAll <- fread(fileIn, header=F,stringsAsFactors=T,sep="\t",autostart=100,select=c(1,2), colClasses=classes, data.table=T)
  73 setnames(tabAll,1,'Chr')
  74 setnames(tabAll,2,'Pos')
  75 print(head(tabAll))
  76 cat("...\t...\t...\n")
  77 setkey(tabAll,Chr)
  78 print(tail(tabAll))
  79
  80 dorolling <- function(x, rollwin) {
  81         chrdat <- integer(0)    # No need to `integer(max(x))` as we use `sum(na.rm=T)`
  82         chrdat[x] <- 1L
  83         #thelen <- length(chrdat)
  84         #length(chrdat) <- ceiling(thelen/WinSize)*WinSize      # 补齐末端会造成 bias
  85         #chrdat[is.na(chrdat)] <- 0L
  86         res0 <- wapply(chrdat, rollwin, FUN = sum,na.rm = TRUE)
  87         return(res0)
  88 }
  89 #resArr <- tabAll[, dorolling(Pos,rollwin=WinSize), by=Chr]
  90 #resAll <- unlist(resArr$V1,use.names=F)
  91
  92 #resArr <- tabAll[, .(Density=dorolling(Pos,rollwin=WinSize)), by=Chr]
  93 # lapply optimization changed j from 'lapply(.SD, dorolling, rollwin = WinSize)' to 'list(dorolling(Pos, rollwin = WinSize))'
  94
  95 # http://stackoverflow.com/questions/19082794/speed-up-data-table-group-by-using-multiple-cores-and-parallel-programming
  96 resArr <- mclapply(unique(tabAll[["Chr"]]),
  97         function(x) {
  98                 tabAll[.(x),.(Density=dorolling(Pos,rollwin=WinSize)), by=Chr]
  99         }, mc.cores=NumofCore)
 100 resArr <- rbindlist(resArr)
 101
 102 resAll <- unlist(resArr$Density,use.names=F)
 103 cat("[!] Stat done.\n")
 104
 105 tbl <- table(resAll)
 106 print(head(tbl,80))
 107 write.table(tbl, paste0(OutP,".tsv"), sep = "\t", quote=F,row.names=F,col.names=F)
 108
 109 reshist <- hist(resAll,plot=F)
 110 pdf(file=paste0(OutP,".pdf"),title='Histogram of VCF Density')
 111 plot(reshist,freq=F,main='Histogram of SNP windowed density',xlab=paste0('SNP Count in every ',WinSize,' bps'))
 112 dev.off()
 113
 114 resPhist <- hist(resAll[resAll!=0],plot=F)
 115 pdf(file=paste0(OutP,".nonZero.pdf"),title='Histogram of VCF Density')
 116 plot(resPhist,freq=F,main='Histogram of SNP windowed density (+)',xlab=paste0('SNP Count in every ',WinSize,' bps'))
 117 dev.off()
 118