modified: myjupyterlab.sh
[GalaxyCodeBases.git] / etc / gatk-wdl / fm2gatk / gatk-preprocess / gatk-preprocess.wdl
blob4a6c872dc0b3a32c72a4cba2e68926631fd86eb2
1 version 1.0
3 # Copyright (c) 2018 Leiden University Medical Center
5 # Permission is hereby granted, free of charge, to any person obtaining a copy
6 # of this software and associated documentation files (the "Software"), to deal
7 # in the Software without restriction, including without limitation the rights
8 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 # copies of the Software, and to permit persons to whom the Software is
10 # furnished to do so, subject to the following conditions:
12 # The above copyright notice and this permission notice shall be included in
13 # all copies or substantial portions of the Software.
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 # SOFTWARE.
23 import "tasks/gatk.wdl" as gatk
24 import "tasks/picard.wdl" as picard
26 workflow GatkPreprocess {
27     input{
28         File bam
29         File bamIndex
30         String bamName = "recalibrated"
31         String outputDir = "."
32         File referenceFasta
33         File referenceFastaFai
34         File referenceFastaDict
35         Boolean splitSplicedReads = false
36         File dbsnpVCF
37         File dbsnpVCFIndex
38         Array[File] scatters
39         Map[String, String] dockerImages = {
40           "picard":"quay.io/biocontainers/picard:2.23.2--0",
41           "gatk4": "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
42         }
43     }
44     meta {allowNestedInputs: true}
46     String scatterDir = outputDir +  "/gatk_preprocess_scatter/"
48     Int scatterNumber = length(scatters)
49     Int baseRecalibratorTimeEstimate = 10 + ceil(size(bam, "G") * 36 / scatterNumber)
50     # splitNCigar does two passes and is a lot slower.
51     Int splitNCigarTimeEstimate = 6 * baseRecalibratorTimeEstimate
52     Int applyBqsrTimeEstimate = baseRecalibratorTimeEstimate
54     Boolean scattered = scatterNumber > 1
55     String reportName = outputDir + "/" + bamName + ".bqsr"
57     scatter (bed in scatters) {
58         String scatteredReportName = scatterDir + "/" + basename(bed) + ".bqsr"
60         if (splitSplicedReads) {
61             call gatk.SplitNCigarReads as splitNCigarReads {
62                 input:
63                     intervals = [bed],
64                     referenceFasta = referenceFasta,
65                     referenceFastaFai = referenceFastaFai,
66                     referenceFastaDict = referenceFastaDict,
67                     inputBam = bam,
68                     inputBamIndex = bamIndex,
69                     outputBam = scatterDir + "/" + basename(bed) + ".split.bam",
70                     dockerImage = dockerImages["gatk4"],
71                     timeMinutes = splitNCigarTimeEstimate
72             }
73         }
75         call gatk.BaseRecalibrator as baseRecalibrator {
76             input:
77                 sequenceGroupInterval = [bed],
78                 referenceFasta = referenceFasta,
79                 referenceFastaFai = referenceFastaFai,
80                 referenceFastaDict = referenceFastaDict,
81                 inputBam = select_first([splitNCigarReads.bam, bam]),
82                 inputBamIndex = select_first([splitNCigarReads.bamIndex, bamIndex]),
83                 recalibrationReportPath = if scattered then scatteredReportName else reportName,
84                 dbsnpVCF = dbsnpVCF,
85                 dbsnpVCFIndex = dbsnpVCFIndex,
86                 dockerImage = dockerImages["gatk4"],
87                 timeMinutes = baseRecalibratorTimeEstimate
88         }
89     }
92     if (scattered) {
93         call gatk.GatherBqsrReports as gatherBqsr {
94             input:
95                 inputBQSRreports = baseRecalibrator.recalibrationReport,
96                 outputReportPath = reportName,
97                 dockerImage = dockerImages["gatk4"]
98         }
99     }
100     File recalibrationReport = select_first([gatherBqsr.outputBQSRreport, baseRecalibrator.recalibrationReport[0]])
101      
102     String recalibratedBamName = outputDir + "/" + bamName + ".bam"
103     
104     scatter (index in range(length(scatters))) {
105         String scatterBamName = if splitSplicedReads
106                     then scatterDir + "/" + basename(scatters[index]) + ".split.bqsr.bam"
107                     else scatterDir + "/" + basename(scatters[index]) + ".bqsr.bam"
109         call gatk.ApplyBQSR as applyBqsr {
110             input:
111                 sequenceGroupInterval = [scatters[index]],
112                 referenceFasta = referenceFasta,
113                 referenceFastaFai = referenceFastaFai,
114                 referenceFastaDict = referenceFastaDict,
115                 inputBam = select_first([splitNCigarReads.bam[index], bam]),
116                 inputBamIndex = select_first([splitNCigarReads.bamIndex[index], bamIndex]),
117                 recalibrationReport = recalibrationReport,
118                 outputBamPath = if scattered then scatterBamName else recalibratedBamName,
119                 dockerImage = dockerImages["gatk4"],
120                 timeMinutes = applyBqsrTimeEstimate
121         }
122     }
124     if (scattered) {
125         call picard.GatherBamFiles as gatherBamFiles {
126             input:
127                 inputBams = applyBqsr.recalibratedBam,
128                 inputBamsIndex = applyBqsr.recalibratedBamIndex,
129                 outputBamPath = outputDir + "/" + bamName + ".bam",
130                 dockerImage = dockerImages["picard"]
131         }
132     }
134     output {
135         File recalibratedBam = select_first([gatherBamFiles.outputBam, applyBqsr.recalibratedBam[0]])
136         File recalibratedBamIndex = select_first([gatherBamFiles.outputBamIndex, applyBqsr.recalibratedBamIndex[0]])
137         File BQSRreport = recalibrationReport
138     }
140     parameter_meta {
141         bam: {description: "The BAM file which should be processed", category: "required"}
142         bamIndex: {description: "The index for the BAM file", category: "required"}
143         bamName: {description: "The basename for the produced BAM files. This should not include any parent direcoties, use `outputDir` if the output directory should be changed.",
144                   category: "common"}
145         outputDir: {description: "The directory to which the outputs will be written.", category: "common"}
146         referenceFasta: {description: "The reference fasta file", category: "required"}
147         referenceFastaFai: {description: "Fasta index (.fai) for the reference fasta file", category: "required"}
148         referenceFastaDict: {description: "Sequence dictionary (.dict) for the reference fasta file", category: "required"}
149         splitSplicedReads: {description: "Whether or not gatk's SplitNCgarReads should be run to split spliced reads. This should be enabled for RNAseq samples.",
150                             category: "common"}
151         dbsnpVCF: {description: "A dbSNP vcf.", category: "required"}
152         dbsnpVCFIndex: {description: "Index for dbSNP vcf.", category: "required"}
154         scatters: {description: "The bed files to be used"}
155         dockerImages: {description: "The docker images used. Changing this may result in errors which the developers may choose not to address.",
156                        category: "advanced"}
157     }