etc/gatk-wdl/fm2gatk/BamMetrics/tasks/picard.wdl

   1 version 1.0
   2
   3 # Copyright (c) 2017 Leiden University Medical Center
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a copy
   6 # of this software and associated documentation files (the "Software"), to deal
   7 # in the Software without restriction, including without limitation the rights
   8 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 # copies of the Software, and to permit persons to whom the Software is
  10 # furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice shall be included in
  13 # all copies or substantial portions of the Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 # SOFTWARE.
  22
  23 task BedToIntervalList {
         # Runs `picard BedToIntervalList` on `bedFile`, passing `dict` as the
         # sequence dictionary (SD=) and writing the result to `outputPath`.
  24     input {
  25         File bedFile
  26         File dict
  27         String outputPath = "regions.interval_list"
  28
  29         String memory = "4G"
  30         String javaXmx = "3G"
  31         Int timeMinutes = 5
             # NOTE(review): this task pins picard 2.20.5 while the other tasks visible
             # in this file use 2.23.2 -- confirm whether the older image is intentional.
  32         String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
  33     }
  34
         # The parent directory of `outputPath` is created before picard is invoked.
  35     command {
  36         set -e
  37         mkdir -p "$(dirname ~{outputPath})"
  38         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
  39         BedToIntervalList \
  40         I=~{bedFile} \
  41         O=~{outputPath} \
  42         SD=~{dict}
  43     }
  44
  45     output {
             # The file picard was told to write via O=.
  46         File intervalList = outputPath
  47     }
  48
  49     runtime {
  50         docker: dockerImage
  51         time_minutes: timeMinutes
  52         memory: memory
  53     }
  54
  55     parameter_meta {
  56         # inputs
  57         bedFile: {description: "A bed file.", category: "required"}
  58         dict: {description: "A sequence dict file.", category: "required"}
  59         outputPath: {description: "The location the output interval list should be written to.", category: "advanced"}
  60         memory: {description: "The amount of memory this job will use.", category: "advanced"}
  61         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
  62                   category: "advanced"}
  63         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
  64         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
  65                       category: "advanced"}
  66     }
  67 }
  68
  69 task CollectMultipleMetrics {
         # Runs `picard CollectMultipleMetrics` on `inputBam` against `referenceFasta`.
         # Each Boolean input below adds one PROGRAM=<name> argument when true; all
         # result files share the `basename` prefix.
  70     input {
  71         File inputBam
             # inputBamIndex, referenceFastaDict and referenceFastaFai are not referenced
             # in the command; presumably they are declared so the engine localizes them
             # next to the BAM/fasta -- TODO confirm.
  72         File inputBamIndex
  73         File referenceFasta
  74         File referenceFastaDict
  75         File referenceFastaFai
  76         String basename
  77
  78         Boolean collectAlignmentSummaryMetrics = true
  79         Boolean collectInsertSizeMetrics = true
  80         Boolean qualityScoreDistribution = true
  81         Boolean meanQualityByCycle = true
  82         Boolean collectBaseDistributionByCycle = true
  83         Boolean collectGcBiasMetrics = true
  84         #FIXME: Boolean rnaSeqMetrics = false # There is a bug in picard https://github.com/broadinstitute/picard/issues/999
  85         Boolean collectSequencingArtifactMetrics = true
  86         Boolean collectQualityYieldMetrics = true
  87
             # memoryMb defaults to the JVM heap size (javaXmxMb, declared on the next
             # line) plus 512 MB.
  88         Int memoryMb = javaXmxMb + 512
  89         Int javaXmxMb = 3072
  90         # Additional * 2 because picard multiple metrics reads the reference fasta twice.
  91         Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
  92         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
  93     }
  94
  95
         # PROGRAM=null is passed before the per-program arguments; presumably this
         # clears picard's default program list so only the enabled ones run -- TODO
         # confirm against the picard documentation.
  96     command {
  97         set -e
  98         mkdir -p "$(dirname ~{basename})"
  99         picard -Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1 \
 100         CollectMultipleMetrics \
 101         I=~{inputBam} \
 102         R=~{referenceFasta} \
 103         O=~{basename} \
 104         PROGRAM=null \
 105         ~{true="PROGRAM=CollectAlignmentSummaryMetrics" false="" collectAlignmentSummaryMetrics} \
 106         ~{true="PROGRAM=CollectInsertSizeMetrics" false="" collectInsertSizeMetrics} \
 107         ~{true="PROGRAM=QualityScoreDistribution" false="" qualityScoreDistribution} \
 108         ~{true="PROGRAM=MeanQualityByCycle" false="" meanQualityByCycle} \
 109         ~{true="PROGRAM=CollectBaseDistributionByCycle" false="" collectBaseDistributionByCycle} \
 110         ~{true="PROGRAM=CollectGcBiasMetrics" false="" collectGcBiasMetrics} \
 111         ~{true="PROGRAM=CollectSequencingArtifactMetrics" false=""
 112             collectSequencingArtifactMetrics} \
 113         ~{true="PROGRAM=CollectQualityYieldMetrics" false="" collectQualityYieldMetrics}
 114     }
 115
 116     output {
             # Every individual output is declared optional (File?); presumably a file is
             # absent when its program was disabled -- TODO confirm the exact file set
             # written by each PROGRAM.
 117         File? alignmentSummary = basename + ".alignment_summary_metrics"
 118         File? baitBiasDetail = basename + ".bait_bias_detail_metrics"
 119         File? baitBiasSummary = basename + ".bait_bias_summary_metrics"
 120         File? baseDistributionByCycle = basename + ".base_distribution_by_cycle_metrics"
 121         File? baseDistributionByCyclePdf = basename + ".base_distribution_by_cycle.pdf"
 122         File? errorSummary = basename + ".error_summary_metrics"
 123         File? gcBiasDetail = basename + ".gc_bias.detail_metrics"
 124         File? gcBiasPdf = basename + ".gc_bias.pdf"
 125         File? gcBiasSummary = basename + ".gc_bias.summary_metrics"
 126         File? insertSizeHistogramPdf = basename + ".insert_size_histogram.pdf"
 127         File? insertSize = basename + ".insert_size_metrics"
 128         File? preAdapterDetail = basename + ".pre_adapter_detail_metrics"
 129         File? preAdapterSummary = basename + ".pre_adapter_summary_metrics"
 130         File? qualityByCycle = basename + ".quality_by_cycle_metrics"
 131         File? qualityByCyclePdf = basename + ".quality_by_cycle.pdf"
 132         File? qualityDistribution = basename + ".quality_distribution_metrics"
 133         File? qualityDistributionPdf = basename + ".quality_distribution.pdf"
 134         File? qualityYield = basename + ".quality_yield_metrics"
 135         # Using a glob is easier. But will lead to very ugly output directories.
             # allStats gathers whichever of the optional outputs above are defined.
 136         Array[File] allStats = select_all([
 137             alignmentSummary,
 138             baitBiasDetail,
 139             baitBiasSummary,
 140             baseDistributionByCycle,
 141             baseDistributionByCyclePdf,
 142             errorSummary,
 143             gcBiasDetail,
 144             gcBiasPdf,
 145             gcBiasSummary,
 146             insertSizeHistogramPdf,
 147             insertSize,
 148             preAdapterDetail,
 149             preAdapterSummary,
 150             qualityByCycle,
 151             qualityByCyclePdf,
 152             qualityDistribution,
 153             qualityDistributionPdf,
 154             qualityYield
 155         ])
 156     }
 157
 158     runtime {
 159         docker: dockerImage
 160         time_minutes: timeMinutes
             # The Int number of megabytes is rendered with an "M" unit suffix.
 161         memory: "~{memoryMb}M"
 162     }
 163
 164     parameter_meta {
 165         # inputs
 166         inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"}
 167         inputBamIndex: {description: "The index of the input BAM file.", category: "required"}
 168         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
 169         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
 170                              category: "required"}
 171         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
 172         basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"}
 173         collectAlignmentSummaryMetrics: {description: "Equivalent to the `PROGRAM=CollectAlignmentSummaryMetrics` argument.",
 174                                          category: "advanced"}
 175         collectInsertSizeMetrics: {description: "Equivalent to the `PROGRAM=CollectInsertSizeMetrics` argument.",
 176                                    category: "advanced"}
 177         qualityScoreDistribution: {description: "Equivalent to the `PROGRAM=QualityScoreDistribution` argument.",
 178                                    category: "advanced"}
 179         meanQualityByCycle: {description: "Equivalent to the `PROGRAM=MeanQualityByCycle` argument.", category: "advanced"}
 180         collectBaseDistributionByCycle: {description: "Equivalent to the `PROGRAM=CollectBaseDistributionByCycle` argument.",
 181                                          category: "advanced"}
 182         collectGcBiasMetrics: {description: "Equivalent to the `PROGRAM=CollectGcBiasMetrics` argument.", category: "advanced"}
 183         collectSequencingArtifactMetrics: {description: "Equivalent to the `PROGRAM=CollectSequencingArtifactMetrics` argument.",
 184                                            category: "advanced"}
 185         collectQualityYieldMetrics: {description: "Equivalent to the `PROGRAM=CollectQualityYieldMetrics` argument.",
 186                                      category: "advanced"}
 187         memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
 188         javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.",
 189                   category: "advanced"}
 190         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 191         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 192                       category: "advanced"}
 193     }
 194 }
 195
 196 task CollectRnaSeqMetrics {
         # Runs `picard CollectRnaSeqMetrics` on `inputBam` with the gene annotations in
         # `refRefflat`, writing `<basename>.RNA_Metrics` and a chart PDF next to it.
 197     input {
 198         File inputBam
             # Not referenced in the command; presumably declared so the index is
             # localized next to the BAM -- TODO confirm.
 199         File inputBamIndex
 200         File refRefflat
 201         String basename
 202         String strandSpecificity = "NONE"
 203
 204         String memory = "9G"
 205         String javaXmx =  "8G"
 206         # With 6 minutes per G there were several timeouts.
 207         Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12)
 208         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 209     }
 210
         # JVM options (-Xmx, -XX) are placed before the tool name, matching every
         # other picard invocation in this file.
 211     command {
 212         set -e
 213         mkdir -p "$(dirname ~{basename})"
 214         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 215         CollectRnaSeqMetrics \
 216         I=~{inputBam} \
 217         O=~{basename}.RNA_Metrics \
 218         CHART_OUTPUT=~{basename}.RNA_Metrics.pdf \
 219         STRAND_SPECIFICITY=~{strandSpecificity} \
 220         REF_FLAT=~{refRefflat}
 221     }
 222
 223     output {
             # The chart is declared optional; the metrics file is required.
 224         File? chart = basename + ".RNA_Metrics.pdf"
 225         File metrics = basename + ".RNA_Metrics"
 226     }
 227
 228     runtime {
 229         docker: dockerImage
 230         time_minutes: timeMinutes
 231         memory: memory
 232     }
 233
 234     parameter_meta {
 235         # inputs
 236         inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"}
 237         inputBamIndex: {description: "The index of the input BAM file.", category: "required"}
 238         refRefflat: {description: "A refflat file containing gene annotations.", category: "required"}
 239         basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"}
 240         strandSpecificity: {description: "Equivalent to the `STRAND_SPECIFICITY` option of picard's CollectRnaSeqMetrics.",
 241                             category: "common"}
 242
 243         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 244         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 245                   category: "advanced"}
 246         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 247         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 248                       category: "advanced"}
 249     }
 250 }
 251
 252 task CollectTargetedPcrMetrics {
         # Runs `picard CollectTargetedPcrMetrics` on `inputBam`, producing a metrics
         # file plus per-base and per-target coverage files prefixed with `basename`.
 253     input {
 254         File inputBam
             # inputBamIndex, referenceFastaDict and referenceFastaFai are not referenced
             # in the command; presumably they are declared so the engine localizes them
             # next to the BAM/fasta -- TODO confirm.
 255         File inputBamIndex
 256         File referenceFasta
 257         File referenceFastaDict
 258         File referenceFastaFai
 259         File ampliconIntervals
 260         Array[File]+ targetIntervals
 261         String basename
 262
 263         String memory = "4G"
 264         String javaXmx = "3G"
 265         Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
 266         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 267     }
 268
         # Each element of `targetIntervals` becomes its own TARGET_INTERVALS= argument
         # through the sep= placeholder.
 269     command {
 270         set -e
 271         mkdir -p "$(dirname ~{basename})"
 272         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 273         CollectTargetedPcrMetrics \
 274         I=~{inputBam} \
 275         R=~{referenceFasta} \
 276         AMPLICON_INTERVALS=~{ampliconIntervals} \
 277         TARGET_INTERVALS=~{sep=" TARGET_INTERVALS=" targetIntervals} \
 278         O=~{basename}.targetPcrMetrics \
 279         PER_BASE_COVERAGE=~{basename}.targetPcrPerBaseCoverage \
 280         PER_TARGET_COVERAGE=~{basename}.targetPcrPerTargetCoverage
 281     }
 282
 283     output {
 284         File perTargetCoverage = basename + ".targetPcrPerTargetCoverage"
 285         File perBaseCoverage = basename + ".targetPcrPerBaseCoverage"
 286         File metrics = basename + ".targetPcrMetrics"
 287     }
 288
 289     runtime {
 290         docker: dockerImage
 291         time_minutes: timeMinutes
 292         memory: memory
 293     }
 294
 295     parameter_meta {
 296         # inputs
 297         inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"}
 298         inputBamIndex: {description: "The index of the input BAM file.", category: "required"}
 299         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
 300         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
 301                              category: "required"}
 302         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
 303         ampliconIntervals: {description: "An interval list describing the coordinates of the amplicons sequenced.",
 304                            category: "required"}
 305         targetIntervals: {description: "One or more interval lists describing the coordinates of the targets sequenced.",
 306                           category: "required"}
 307         basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"}
 308
 309         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 310         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 311                   category: "advanced"}
 312         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 313         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 314                       category: "advanced"}
 315     }
 316 }
 317
 318 task CreateSequenceDictionary {
         # Runs `picard CreateSequenceDictionary` on `inputFile` and writes
         # `<outputDir>/<file name of inputFile>.dict`.
 319     input {
 320         File inputFile
 321         String outputDir
 322
 323         String memory = "3G"
 324         String javaXmx = "2G"
 325         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 326     }
 327
         # The output name keeps the input's full file name (including its extension)
         # and appends ".dict"; the shell `basename` here mirrors the WDL basename()
         # used in the output section.
 328     command {
 329         set -e
 330         mkdir -p "~{outputDir}"
 331         picard -Xmx~{javaXmx} \
 332         -XX:ParallelGCThreads=1 \
 333         CreateSequenceDictionary \
 334         REFERENCE=~{inputFile} \
 335         OUTPUT="~{outputDir}/$(basename ~{inputFile}).dict"
 336     }
 337
 338     output {
             # Must resolve to the same path as the OUTPUT= argument in the command.
 339         File outputDict = outputDir + "/" + basename(inputFile) + ".dict"
 340     }
 341
         # Unlike most tasks in this file, no time_minutes is set here.
 342     runtime {
 343         memory: memory
 344         docker: dockerImage
 345     }
 346
 347     parameter_meta {
 348         # inputs
 349         inputFile: {description: "The input fasta file.", category: "required"}
 350         outputDir: {description: "Output directory path.", category: "required"}
 351         memory: {description: "The amount of memory available to the job.", category: "advanced"}
 352         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
 353         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 354
 355         # outputs
 356         outputDict: {description: "Dictionary of the input fasta file."}
 357     }
 358 }
 359
 360 # Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
 361 task GatherBamFiles {
         # Runs `picard GatherBamFiles`, passing every element of `inputBams` as its own
         # INPUT= argument and writing a single BAM to `outputBamPath`.
 362     input {
 363         Array[File]+ inputBams
             # Not referenced in the command; presumably declared so the indexes are
             # localized next to the BAMs -- TODO confirm.
 364         Array[File]+ inputBamsIndex
 365         String outputBamPath
 366
             # memoryMb defaults to the JVM heap size (javaXmxMb, declared on the next
             # line) plus 512 MB.
 367         Int memoryMb = javaXmxMb + 512
 368         Int javaXmxMb = 1024
 369         Int? compressionLevel
 370         Boolean createMd5File = false
 371         # One minute per input gigabyte.
 372         Int timeMinutes = 1 + ceil(size(inputBams, "G") * 1)
 373         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 374     }
 375
         # COMPRESSION_LEVEL= is only emitted when `compressionLevel` is defined;
         # CREATE_INDEX=true is always passed.
 376     command {
 377         set -e
 378         mkdir -p "$(dirname ~{outputBamPath})"
 379         picard -Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1 \
 380         GatherBamFiles \
 381         INPUT=~{sep=' INPUT=' inputBams} \
 382         OUTPUT=~{outputBamPath} \
 383         ~{"COMPRESSION_LEVEL=" + compressionLevel} \
 384         CREATE_INDEX=true \
 385         CREATE_MD5_FILE=~{true="true" false="false" createMd5File}
 386     }
 387
 388     output {
 389         File outputBam = outputBamPath
             # The index path is derived by replacing a trailing ".bam" with ".bai".
 390         File outputBamIndex = sub(outputBamPath, "\.bam$", ".bai")
             # Optional; presumably only present when createMd5File is true.
 391         File? outputBamMd5 = outputBamPath + ".md5"
 392     }
 393
 394     runtime {
 395         docker: dockerImage
 396         time_minutes: timeMinutes
 397         memory: "~{memoryMb}M"
 398     }
 399
 400     parameter_meta {
 401         # inputs
 402         inputBams: {description: "The BAM files to be merged together.", category: "required"}
 403         inputBamsIndex: {description: "The indexes of the input BAM files.", category: "required"}
 404         outputBamPath: {description: "The path where the merged BAM file will be written.", category: "required"}
 405         compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"}
 406         createMd5File: {description: "Whether to create an md5 file of the output BAM.", category: "advanced"}
 407         memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
 408         javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.",
 409                   category: "advanced"}
 410         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 411         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 412                       category: "advanced"}
 413     }
 414 }
 415
 416 task GatherVcfs {
         # Runs `picard GatherVcfs`, passing every element of `inputVcfs` as its own
         # INPUT= argument and writing a single VCF to `outputVcfPath`.
 417     input {
 418         Array[File]+ inputVcfs
             # Not referenced in the command; presumably declared so the indexes are
             # localized next to the VCFs -- TODO confirm.
 419         Array[File]+ inputVcfIndexes
 420         String outputVcfPath = "out.vcf.gz"
 421
 422         String memory = "5G"
 423         String javaXmx = "4G"
 424         Int timeMinutes = 1 + ceil(size(inputVcfs, "G") * 2)
 425         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 426     }
 427
 428     command {
 429         set -e
 430         mkdir -p "$(dirname ~{outputVcfPath})"
 431         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 432         GatherVcfs \
 433         INPUT=~{sep=' INPUT=' inputVcfs} \
 434         OUTPUT=~{outputVcfPath}
 435     }
 436
 437     output {
             # Only the VCF itself is exposed; no index output is declared by this task.
 438         File outputVcf = outputVcfPath
 439     }
 440
 441     runtime {
 442         docker: dockerImage
 443         memory: memory
 444         time_minutes: timeMinutes
 445     }
 446
 447     parameter_meta {
 448         # inputs
 449         inputVcfs: {description: "The VCF files to be merged together.", category: "required"}
 450         inputVcfIndexes: {description: "The indexes of the input VCF files.", category: "required"}
 451         outputVcfPath: {description: "The path where the merged VCF file will be written.", category: "required"}
 452
 453         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 454         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 455                   category: "advanced"}
 456         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 457         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 458                       category: "advanced"}
 459     }
 460 }
 461
 462 # Mark duplicate reads to avoid counting non-independent observations
 463 task MarkDuplicates {
         # Runs `picard MarkDuplicates` over all `inputBams`, writing the marked BAM to
         # `outputBamPath` (with CREATE_INDEX=true) and the metrics to `metricsPath`.
 464     input {
 465         Array[File]+ inputBams
 466         String outputBamPath
 467         String metricsPath
 468         Int compressionLevel = 1
 469         Boolean createMd5File = false
 470         Boolean useJdkInflater = true  # Slightly faster than the intel one.
 471         # Better results for compression level 1 (much smaller). Higher compression levels similar to intel deflater.
 472         # NOTE: this might change in the future when the intel deflater is updated!
 473         Boolean useJdkDeflater = true
 474
 475         # In GATK Best practices pipeline MarkDuplicates is given a 7G VM.
 476         # https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L1040
 477         Int javaXmxMb =  6656  # 6.5G
             # NOTE(review): declared as String here, whereas CollectMultipleMetrics and
             # GatherBamFiles declare memoryMb as Int. Left unchanged to keep the input
             # type stable for existing callers -- confirm whether Int was intended.
 478         String memoryMb = javaXmxMb + 512
 479
 480         Int timeMinutes = 1 + ceil(size(inputBams, "G") * 8)
 481         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 482
 483         # The program default for READ_NAME_REGEX is appropriate in nearly every case.
 484         # Sometimes we wish to supply "null" in order to turn off optical duplicate detection
 485         # This can be desirable if you don't mind the estimated library size being wrong and
 486         # optical duplicate detection is taking >7 days and failing
 487         String? read_name_regex
 488     }
 489
 490     # Task is assuming query-sorted input so that the Secondary and Supplementary reads get
 491     # marked correctly. This works because the output of BWA is query-grouped and therefore,
 492     # so is the output of MergeBamAlignment. While query-grouped isn't actually query-sorted,
 493     # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
         # NOTE(review): the command below does not pass ASSUME_SORT_ORDER, so the
         # remark above does not describe the current invocation -- confirm whether the
         # argument should be added or the remark removed.
 494
 495     command {
 496         set -e
 497         mkdir -p "$(dirname ~{outputBamPath})"
 498         picard -Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1 \
 499         MarkDuplicates \
 500         INPUT=~{sep=' INPUT=' inputBams} \
 501         OUTPUT=~{outputBamPath} \
 502         METRICS_FILE=~{metricsPath} \
 503         COMPRESSION_LEVEL=~{compressionLevel} \
 504         VALIDATION_STRINGENCY=SILENT \
 505         ~{"READ_NAME_REGEX=" + read_name_regex} \
 506         OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
 507         CLEAR_DT="false" \
 508         CREATE_INDEX=true \
 509         ADD_PG_TAG_TO_READS=false \
 510         CREATE_MD5_FILE=~{true="true" false="false" createMd5File} \
 511         USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
 512         USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater}
 513     }
 514
 515     output {
 516         File outputBam = outputBamPath
             # The index path is derived by replacing a trailing ".bam" with ".bai".
 517         File outputBamIndex = sub(outputBamPath, "\.bam$", ".bai")
             # Optional; presumably only present when createMd5File is true.
 518         File? outputBamMd5 = outputBamPath + ".md5"
 519         File metricsFile = metricsPath
 520     }
 521
 522     runtime {
 523         docker: dockerImage
 524         time_minutes: timeMinutes
 525         memory: "~{memoryMb}M"
 526     }
 527
 528     parameter_meta {
 529         # inputs
 530         inputBams: {description: "The BAM files for which the duplicate reads should be marked.", category: "required"}
 531         outputBamPath: {description: "The location where the output BAM file should be written.", category: "required"}
 532         metricsPath: {description: "The location where the output metrics file should be written.", category: "required"}
 533         read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"}
 534         createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"}
 535         useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
 536         useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"}
 537         compressionLevel: {description: "The compression level at which the BAM files are written", category: "advanced"}
 538         memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
 539         javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.",
 540                   category: "advanced"}
 541         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 542         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 543                       category: "advanced"}
 544     }
 545 }
 546
 547 # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
 548 task MergeVCFs {
         # Runs `picard MergeVcfs`, passing every element of `inputVCFs` as its own
         # INPUT= argument and writing the merged file to `outputVcfPath`.
 549     input {
 550         Array[File]+ inputVCFs
             # Not referenced in the command; presumably declared so the indexes are
             # localized next to the VCFs -- TODO confirm.
 551         Array[File]+ inputVCFsIndexes
 552         String outputVcfPath
 553
 554         String memory = "5G"
 555         String javaXmx = "4G"
             # NOTE(review): here "* 2" is applied after ceil(), whereas GatherVcfs uses
             # ceil(size(...) * 2). Left unchanged because moving it inside would lower
             # the default time limit -- confirm which form was intended.
 556         Int timeMinutes = 1 + ceil(size(inputVCFs, "G")) * 2
 557         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 558         Int compressionLevel = 1
 559         Boolean useJdkInflater = true  # Slightly faster than the intel one.
 560         # Better results for compression level 1 (much smaller). Higher compression levels similar to intel deflater.
 561         # NOTE: this might change in the future when the intel deflater is updated!
 562         Boolean useJdkDeflater = true
 563
 564     }
 565
 566     # Using MergeVcfs instead of GatherVcfs so we can create indices
 567     # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket
 568
 569     command {
 570         set -e
 571         mkdir -p "$(dirname ~{outputVcfPath})"
 572         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 573         MergeVcfs \
 574         INPUT=~{sep=' INPUT=' inputVCFs} \
 575         OUTPUT=~{outputVcfPath} \
 576         COMPRESSION_LEVEL=~{compressionLevel} \
 577         USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
 578         USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater}
 579     }
 580
 581     output {
 582         File outputVcf = outputVcfPath
             # The index is expected at `outputVcfPath` + ".tbi"; presumably this only
             # holds for compressed (.vcf.gz) output paths -- TODO confirm.
 583         File outputVcfIndex = outputVcfPath + ".tbi"
 584     }
 585
 586     runtime {
 587         docker: dockerImage
 588         time_minutes: timeMinutes
 589         memory: memory
 590     }
 591
 592     parameter_meta {
 593         # inputs
 594         inputVCFs: {description: "The VCF files to be merged.", category: "required"}
 595         inputVCFsIndexes: {description: "The indexes of the VCF files.", category: "required"}
 596         outputVcfPath: {description: "The location the output VCF file should be written to.", category: "required"}
 597
 598         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 599         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 600                   category: "advanced"}
 601         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 602         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 603                       category: "advanced"}
 604         useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
 605         useJdkDeflater: {description: "True, uses the java deflator to compress the output VCF file. False uses the optimized intel deflater.", category: "advanced"}
 606         compressionLevel: {description: "The compression level at which the output VCF file is written.", category: "advanced"}
 607     }
 608 }
 609
 610 task SamToFastq {
         # Runs `picard SamToFastq` on `inputBam`. Output names are derived from the
         # input's file name with its .bam/.sam extension removed: <name>_R1.fastq.gz
         # always, plus <name>_R2.fastq.gz and <name>_unpaired.fastq.gz when `paired`.
 611     input {
 612         File inputBam
             # Not referenced in the command; presumably declared so the index is
             # localized next to the BAM -- TODO confirm.
 613         File inputBamIndex
 614         Boolean paired = true
 615
 616         String memory = "17G"
 617         String javaXmx = "16G" # High memory default to avoid crashes.
 618         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
             # Placeholder that is expected to stay undefined; it is used in the output
             # section as the "no file" value for read2/unpairedRead when not paired.
 619         File? NONE
 620     }
 621
         # basename()'s optional second argument is a literal suffix, not a regular
         # expression, so a pattern like "\.[bs]am" is never stripped by it. The
         # extension is removed with sub(), which does take a regex.
 622     String outputRead1 = sub(basename(inputBam), "\.[bs]am$", "") + "_R1.fastq.gz"
 623     String outputRead2 = sub(basename(inputBam), "\.[bs]am$", "") + "_R2.fastq.gz"
 624     String outputUnpaired = sub(basename(inputBam), "\.[bs]am$", "") + "_unpaired.fastq.gz"
 625
 626     command {
 627         set -e
 628         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 629         SamToFastq \
 630         I=~{inputBam} \
 631         ~{"FASTQ=" + outputRead1} \
 632         ~{if paired then "SECOND_END_FASTQ=" + outputRead2 else ""} \
 633         ~{if paired then "UNPAIRED_FASTQ=" + outputUnpaired else ""}
 634     }
 635
 636     output {
 637         File read1 = outputRead1
             # Only defined when `paired` is true, matching the arguments passed above.
 638         File? read2 = if paired then outputRead2 else NONE
 639         File? unpairedRead = if paired then outputUnpaired else NONE
 640     }
 641
 642     runtime {
 643         docker: dockerImage
 644         memory: memory
 645     }
 646 }
 647
 648 task ScatterIntervalList {
 649     # Asks picard IntervalListTools to scatter an interval list into
 650     # scatter_count parts under ./scatter_list and reports the files produced.
 651     input {
 652         File interval_list
 653         Int scatter_count
 654
 655         String memory = "4G"
 656         String javaXmx = "3G"
 657         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 658     }
 659
 660     command {
 661         set -e
 662         mkdir scatter_list
 663         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 664         IntervalListTools \
 665         SCATTER_COUNT=~{scatter_count} \
 666         SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
 667         UNIQUE=true \
 668         SORT=true \
 669         INPUT=~{interval_list} \
 670         OUTPUT=scatter_list
 671     }
 672
 673     output {
 674         Array[File] out = glob("scatter_list/*/*.interval_list")
 675         # Derived from the scattered files instead of parsing stdout, since the
 676         # command does not ask picard to print a count.
 677         Int interval_count = length(glob("scatter_list/*/*.interval_list"))
 678     }
 679
 680     runtime {
 681         docker: dockerImage
 682         memory: memory
 683     }
 684
 685     parameter_meta {
 686         # inputs
 687         interval_list: {description: "The interval list to scatter.", category: "required"}
 688         scatter_count: {description: "The requested number of scattered interval lists, passed to picard as SCATTER_COUNT.", category: "required"}
 689         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 690         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 691                   category: "advanced"}
 692         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 693                       category: "advanced"}
 694     }
 695 }
 681
 682 task SortSam {
 683     # Sorts a BAM file with picard SortSam, by coordinate or by query name.
 684     input {
 685         File inputBam
 686         String outputPath
 687         Boolean sortByName = false
 688         Boolean createMd5File = false
 689         Int maxRecordsInRam = 500000
 690         Int compressionLevel = 1
 691
 692         # Default ram of 4 GB. Using 125001.0  to prevent an answer of
 693         # 4.000000001 which gets rounded to 5.
 694         # GATK Best practices uses 75000 here: https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L778
 695         Int XmxGb = ceil(maxRecordsInRam / 125001.0)
 696         Int timeMinutes = 1 + ceil(size(inputBam, "G") * 3)
 697         String dockerImage = "quay.io/biocontainers/picard:2.23.2--0"
 698     }
 699
 700     command {
 701         set -e
 702         mkdir -p "$(dirname ~{outputPath})"
 703         picard -Xmx~{XmxGb}G -XX:ParallelGCThreads=1 SortSam \
 704         INPUT=~{inputBam} \
 705         OUTPUT=~{outputPath} \
 706         MAX_RECORDS_IN_RAM=~{maxRecordsInRam} \
 707         SORT_ORDER=~{true="queryname" false="coordinate" sortByName} \
 708         CREATE_INDEX=true \
 709         COMPRESSION_LEVEL=~{compressionLevel} \
 710         VALIDATION_STRINGENCY=SILENT \
 711         CREATE_MD5_FILE=~{true="true" false="false" createMd5File}
 712     }
 713
 714     output {
 715         File outputBam = outputPath
 716         # NOTE(review): the index is expected next to the BAM with ".bam" replaced
 717         # by ".bai"; confirm picard also writes it when sortByName is true.
 718         File outputBamIndex = sub(outputPath, "\.bam$", ".bai")
 719     }
 720
 721     runtime {
 722         cpu: 1
 723         # One extra gigabyte on top of the Java heap.
 724         memory: "~{1 + XmxGb}G"
 725         time_minutes: timeMinutes
 726         docker: dockerImage
 727     }
 728
 729     parameter_meta {
 730         # inputs
 731         inputBam: {description: "The unsorted input BAM file", category: "required"}
 732         outputPath: {description: "The location the output BAM file should be written to.", category: "required"}
 733         sortByName: {description: "If true, the output is sorted by queryname instead of by coordinate.", category: "advanced"}
 734         createMd5File: {description: "Passed to picard as CREATE_MD5_FILE.", category: "advanced"}
 735         maxRecordsInRam: {description: "Passed to picard as MAX_RECORDS_IN_RAM. The default of XmxGb is derived from this value.", category: "advanced"}
 736         compressionLevel: {description: "The compression level at which the BAM file is written.", category: "advanced"}
 737         XmxGb: {description: "The maximum memory available to picard SortSam, in gigabytes. The job requests this amount plus 1G to accommodate JVM overhead.",
 738                   category: "advanced"}
 739         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 740         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 741                       category: "advanced"}
 742     }
 743 }
 736
 737 task SortVcf {
 738     # Feeds one or more VCF files to picard SortVcf and writes a single output
 739     # VCF, optionally using the supplied sequence dictionary.
 740     input {
 741         Array[File]+ vcfFiles
 742         File? dict
 743         String outputVcfPath
 744
 745         String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
 746         Int timeMinutes = 1 + ceil(size(vcfFiles, "G") * 5)
 747         String javaXmx = "8G"
 748         String memory = "9G"
 749     }
 750
 751     # Every input file gets its own I= argument; SEQUENCE_DICTIONARY is only
 752     # emitted when dict is defined.
 753     command <<<
 754         set -e
 755         mkdir -p "$(dirname ~{outputVcfPath})"
 756         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 757         SortVcf \
 758         I=~{sep=" I=" vcfFiles} \
 759         ~{"SEQUENCE_DICTIONARY=" + dict} \
 760         O=~{outputVcfPath}
 761     >>>
 762
 763     output {
 764         File outputVcf = outputVcfPath
 765         # The index is looked up next to the output with a ".tbi" suffix.
 766         File outputVcfIndex = outputVcfPath + ".tbi"
 767     }
 768
 769     runtime {
 770         memory: memory
 771         time_minutes: timeMinutes
 772         docker: dockerImage
 773     }
 774
 775     parameter_meta {
 776         # data inputs
 777         vcfFiles: {description: "The VCF files to merge and sort.", category: "required"}
 778         dict: {description: "A sequence dictionary matching the VCF files.", category: "advanced"}
 779         outputVcfPath: {description: "The location the sorted VCF files should be written to.", category: "required"}
 780
 781         # resources and environment
 782         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
 783                       category: "advanced"}
 784         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 785         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
 786                   category: "advanced"}
 787         memory: {description: "The amount of memory this job will use.", category: "advanced"}
 788     }
 789 }
 785
 786 task RenameSample {
 787     # Rewrites the sample name of a VCF file with picard RenameSampleInVcf.
 788     input {
 789         File inputVcf
 790         String newSampleName
 791         String outputPath = "./picard/renamed.vcf"
 792
 793         String dockerImage = "quay.io/biocontainers/picard:2.19.0--0"
 794         Int timeMinutes = 1 + ceil(size(inputVcf, "G") * 2)
 795         String javaXmx = "8G"
 796         String memory = "9G"
 797     }
 798
 799     # The output directory is created up front so picard can write into it.
 800     command <<<
 801         set -e
 802         mkdir -p "$(dirname ~{outputPath})"
 803         picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
 804         RenameSampleInVcf \
 805         I=~{inputVcf} \
 806         O=~{outputPath} \
 807         NEW_SAMPLE_NAME=~{newSampleName}
 808     >>>
 809
 810     output {
 811         File renamedVcf = outputPath
 812     }
 813
 814     runtime {
 815         memory: memory
 816         time_minutes: timeMinutes
 817         docker: dockerImage
 818     }
 819
 820     parameter_meta {
 821         # data inputs
 822         inputVcf: {description: "The VCF file to process.", category: "required"}
 823         newSampleName: {description: "A string to replace the old sample name.", category: "required"}
 824         outputPath: {description: "The location the output VCF file should be written.", category: "common"}
 825
 826         # resources and environment
 827         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 828         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
 829         javaXmx: {description: "The max. memory allocated for JAVA", category: "advanced"}
 830         memory: {description: "The memory required to run the programs", category: "advanced"}
 831     }
 832 }