3 # Copyright (c) 2018 Leiden University Medical Center
5 # Permission is hereby granted, free of charge, to any person obtaining a copy
6 # of this software and associated documentation files (the "Software"), to deal
7 # in the Software without restriction, including without limitation the rights
8 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 # copies of the Software, and to permit persons to whom the Software is
10 # furnished to do so, subject to the following conditions:
12 # The above copyright notice and this permission notice shall be included in
13 # all copies or substantial portions of the Software.
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 import "tasks/gatk.wdl" as gatk
24 import "tasks/picard.wdl" as picard
26 workflow GatkPreprocess {
30 String bamName = "recalibrated"
31 String outputDir = "."
33 File referenceFastaFai
34 File referenceFastaDict
35 Boolean splitSplicedReads = false
39 Map[String, String] dockerImages = {
40 "picard":"quay.io/biocontainers/picard:2.23.2--0",
41 "gatk4": "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
44 meta {allowNestedInputs: true}
46 String scatterDir = outputDir + "/gatk_preprocess_scatter/"
48 Int scatterNumber = length(scatters)
49 Int baseRecalibratorTimeEstimate = 10 + ceil(size(bam, "G") * 36 / scatterNumber)
50 # splitNCigar does two passes and is a lot slower.
51 Int splitNCigarTimeEstimate = 6 * baseRecalibratorTimeEstimate
52 Int applyBqsrTimeEstimate = baseRecalibratorTimeEstimate
54 Boolean scattered = scatterNumber > 1
55 String reportName = outputDir + "/" + bamName + ".bqsr"
57 scatter (bed in scatters) {
58 String scatteredReportName = scatterDir + "/" + basename(bed) + ".bqsr"
60 if (splitSplicedReads) {
61 call gatk.SplitNCigarReads as splitNCigarReads {
64 referenceFasta = referenceFasta,
65 referenceFastaFai = referenceFastaFai,
66 referenceFastaDict = referenceFastaDict,
68 inputBamIndex = bamIndex,
69 outputBam = scatterDir + "/" + basename(bed) + ".split.bam",
70 dockerImage = dockerImages["gatk4"],
71 timeMinutes = splitNCigarTimeEstimate
75 call gatk.BaseRecalibrator as baseRecalibrator {
77 sequenceGroupInterval = [bed],
78 referenceFasta = referenceFasta,
79 referenceFastaFai = referenceFastaFai,
80 referenceFastaDict = referenceFastaDict,
81 inputBam = select_first([splitNCigarReads.bam, bam]),
82 inputBamIndex = select_first([splitNCigarReads.bamIndex, bamIndex]),
83 recalibrationReportPath = if scattered then scatteredReportName else reportName,
85 dbsnpVCFIndex = dbsnpVCFIndex,
86 dockerImage = dockerImages["gatk4"],
87 timeMinutes = baseRecalibratorTimeEstimate
93 call gatk.GatherBqsrReports as gatherBqsr {
95 inputBQSRreports = baseRecalibrator.recalibrationReport,
96 outputReportPath = reportName,
97 dockerImage = dockerImages["gatk4"]
100 File recalibrationReport = select_first([gatherBqsr.outputBQSRreport, baseRecalibrator.recalibrationReport[0]])
102 String recalibratedBamName = outputDir + "/" + bamName + ".bam"
104 scatter (index in range(length(scatters))) {
105 String scatterBamName = if splitSplicedReads
106 then scatterDir + "/" + basename(scatters[index]) + ".split.bqsr.bam"
107 else scatterDir + "/" + basename(scatters[index]) + ".bqsr.bam"
109 call gatk.ApplyBQSR as applyBqsr {
111 sequenceGroupInterval = [scatters[index]],
112 referenceFasta = referenceFasta,
113 referenceFastaFai = referenceFastaFai,
114 referenceFastaDict = referenceFastaDict,
115 inputBam = select_first([splitNCigarReads.bam[index], bam]),
116 inputBamIndex = select_first([splitNCigarReads.bamIndex[index], bamIndex]),
117 recalibrationReport = recalibrationReport,
118 outputBamPath = if scattered then scatterBamName else recalibratedBamName,
119 dockerImage = dockerImages["gatk4"],
120 timeMinutes = applyBqsrTimeEstimate
125 call picard.GatherBamFiles as gatherBamFiles {
127 inputBams = applyBqsr.recalibratedBam,
128 inputBamsIndex = applyBqsr.recalibratedBamIndex,
129 outputBamPath = outputDir + "/" + bamName + ".bam",
130 dockerImage = dockerImages["picard"]
135 File recalibratedBam = select_first([gatherBamFiles.outputBam, applyBqsr.recalibratedBam[0]])
136 File recalibratedBamIndex = select_first([gatherBamFiles.outputBamIndex, applyBqsr.recalibratedBamIndex[0]])
137 File BQSRreport = recalibrationReport
141 bam: {description: "The BAM file which should be processed", category: "required"}
142 bamIndex: {description: "The index for the BAM file", category: "required"}
143 bamName: {description: "The basename for the produced BAM files. This should not include any parent direcoties, use `outputDir` if the output directory should be changed.",
145 outputDir: {description: "The directory to which the outputs will be written.", category: "common"}
146 referenceFasta: {description: "The reference fasta file", category: "required"}
147 referenceFastaFai: {description: "Fasta index (.fai) for the reference fasta file", category: "required"}
148 referenceFastaDict: {description: "Sequence dictionary (.dict) for the reference fasta file", category: "required"}
149 splitSplicedReads: {description: "Whether or not gatk's SplitNCgarReads should be run to split spliced reads. This should be enabled for RNAseq samples.",
151 dbsnpVCF: {description: "A dbSNP vcf.", category: "required"}
152 dbsnpVCFIndex: {description: "Index for dbSNP vcf.", category: "required"}
154 scatters: {description: "The bed files to be used"}
155 dockerImages: {description: "The docker images used. Changing this may result in errors which the developers may choose not to address.",
156 category: "advanced"}