| | 1 | Below is the current GoNL pipeline commands along with rough execution times when available on the cluster as of January 25th 2011: |
| | 2 | |
| | 3 | This analysis pipeline consists of two parts. The first one is based on lane analysis. Afterwards lanes are merged and analysis per lane is conducted. |
| | 4 | |
| | 5 | = Lane analysis = |
| | 6 | '''L1. Fastq to check read quality''' |
| | 7 | |
| | 8 | {{{ |
| | 9 | /tools/FastQC/fastqc /data/filename_1.fq.gz \ -Dfastqc.output_dir=/output \ -Dfastqc.unzip=false |
| | 10 | |
| | 11 | /tools/FastQC/fastqc /data/filename_2.fq.gz \ -Dfastqc.output_dir=/output \ -Dfastqc.unzip=false |
| | 12 | }}} |
| | 13 | '''L2. Align both pairs of FASTQ files using BWA '''(~6.5h)'''[[BR]] ''' |
| | 14 | |
| | 15 | {{{ |
| | 16 | /tools/bwa-0.5.8c_patched/bwa aln \ /resources//hg19/indices/b37_1kg.fa \ /data/filename_1.fq.gz -t 4 \ -f /output/filename.b37_1kg.1.sai |
| | 17 | |
| | 18 | /tools/bwa-0.5.8c_patched/bwa aln \ /resources//hg19/indices/b37_1kg.fa \ /data/filename_2.fq.gz -t 4 \ -f /output/filename.b37_1kg.2.sai |
| | 19 | }}} |
| | 20 | '''L3. Create SAM file from both .sai files using BWA sampe '''(~3.5h)'''[[BR]] ''' |
| | 21 | |
| | 22 | {{{ |
| | 23 | /tools/bwa_45_patched/bwa sampe -P -p illumina -i lane -m sample_id -l library \ /resources//hg19/indices/b37_1kg.fa \ /output/filename.b37_1kg.1.sai /output/filename.b37_1kg.2.sai \ /data/filename_1.fq.gz /data/filename_2.fq.gz \ -f /output/filename.b37_1kg.sam |
| | 24 | }}} |
| | 25 | '''L4. Convert SAM to BAM file using Picard '''(~0.5h)'''[[BR]] ''' |
| | 26 | |
| | 27 | {{{ |
| | 28 | java -jar -Xmx3g /tools/picard-tools-1.32/SamFormatConverter.jar \ INPUT=/output/filename.b37_1kg.sam \ OUTPUT=/output/filename.b37_1kg.bam \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 29 | }}} |
| | 30 | '''L5. Sort BAM file and create corresponding index '''(~5h)'''[[BR]] ''' |
| | 31 | |
| | 32 | {{{ |
| | 33 | java -jar -Xmx3g /tools/picard-tools-1.32/SortSam.jar \ INPUT=/output/filename.b37_1kg.bam \ OUTPUT=/output/filename.b37_1kg.sorted.bam \ SORT_ORDER=coordinate \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 34 | |
| | 35 | java -jar -Xmx3g /tools/picard-tools-1.32/BuildBamIndex.jar \ INPUT=/output/filename.b37_1kg.sorted.bam \ OUTPUT=/output/filename.b37_1kg.sorted.bam.bai \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 36 | }}} |
| | 37 | '''L6. Calculate QC metrics on alignment using Picard''' (~3.5h) |
| | 38 | |
| | 39 | This Step is currently updated as: |
| | 40 | |
| | 41 | * [http://www.bbmriwiki.nl/search/opensearch?q=wiki%3ACalculateHsMetrics Picard CalculateHsMetrics] should be replaced by GATK DepthOfCoverage |
| | 42 | * !MeanQualityByCycle and !QualityScoreDistribution should be moved after the recalibration step |
| | 43 | |
| | 44 | {{{ |
| | 45 | java -jar -Xmx4g /tools/picard-tools-1.32/CollectAlignmentSummaryMetrics.jar \ I=/output/filename.b37_1kg.sorted.bam \ O=/output/filename.b37_1kg.AlignmentSummaryMetrics \ R=/resources//hg19/indices/b37_1kg.fa \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 46 | |
| | 47 | java -jar /tools/picard-tools-1.32/CollectGcBiasMetrics.jar \ R=/resources//hg19/indices/b37_1kg.fa \ I=/output/filename.b37_1kg.sorted.bam \ O=/output/filename.b37_1kg.GcBiasMetrics \ CHART=/output/filename.b37_1kg.GcBiasMetrics.pdf \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 48 | |
| | 49 | java -jar /tools/picard-tools-1.32/CollectInsertSizeMetrics.jar \ I=/output/filename.b37_1kg.sorted.bam \ O=/output/filename.b37_1kg.CollectInsertSizeMetrics \ H=/output/filename.b37_1kg.CollectInsertSizeMetrics.pdf \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 50 | |
| | 51 | java -jar /tools/picard-tools-1.32/MeanQualityByCycle.jar \ I=/output/filename.b37_1kg.sorted.bam \ O=/output/filename.b37_1kg.MeanQualityByCycle \ CHART=/output/filename.b37_1kg.MeanQualityByCycle.pdf \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local java -jar /tools/picard-tools-1.32/QualityScoreDistribution.jar \ I=/output/filename.b37_1kg.sorted.bam \ O=/output/filename.b37_1kg.[wiki:QualityScoreDistribution] \ CHART=/output/filename.b37_1kg.[wiki:QualityScoreDistribution].pdf \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 52 | |
| | 53 | java -jar /tools/picard-tools-1.32/BamIndexStats.jar \ INPUT=/output/filename.b37_1kg.sorted.bam \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 54 | |
| | 55 | java -jar -Xmx3g /tools/picard-tools-1.32/CalculateHsMetricsWholeGenome.jar \ INPUT=/output/filename.b37_1kg.sorted.bam \ OUTPUT=/output/filename.b37_1kg.HsMetrics \ BAIT_INTERVALS=/resources//hg19/intervals/GoNL.interval_list \ TARGET_INTERVALS=/resources//hg19/intervals/GoNL.interval_list \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 56 | }}} |
| | 57 | '''L7. Mark duplicates after alignment''' |
| | 58 | |
| | 59 | {{{ |
| | 60 | java -Xmx4g -jar /tools/picard-tools-1.32/MarkDuplicates.jar \ INPUT=/output/filename.b37_1kg.sorted.bam \ OUTPUT=/output/filename.b37_1kg.dedup.bam \ METRICS_FILE=/output/filename.b37_1kg.dedup.metrics \ REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 61 | }}} |
| | 62 | '''L8. Realignment using knowns only''' |
| | 63 | |
| | 64 | {{{ |
| | 65 | java -Djava.io.tmpdir=/local -Xmx8g -jar \ /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO -T IndelRealigner \ -U ALLOW_UNINDEXED_BAM -I /output/filename.b37_1kg.dedup.bam \ -targetIntervals /resources//hg19/intervals/realign_intervals_hg19_b37_1kg.intervals \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -[B:indels,VCF B:indels,VCF] /resources//hg19/indels/1kg.pilot_release.merged.indels.sites./hg19.b37_1kg.vcf \ -o /output/filename.b37_1kg.realigned.bam -knownsOnly -LOD 0.4 -compress 0 |
| | 66 | }}} |
| | 67 | '''L9. Fix mates after realignment''' |
| | 68 | |
| | 69 | {{{ |
| | 70 | java -jar -Xmx6g /tools/picard-tools-1.32/FixMateInformation.jar \ INPUT=/output/filename.b37_1kg.realigned.bam \ OUTPUT=/output/filename.b37_1kg.matefixed.bam \ SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/local |
| | 71 | }}} |
| | 72 | '''L10. Calculate covariates before realignment ''' |
| | 73 | |
| | 74 | {{{ |
| | 75 | java -jar -Xmx2g /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO \ -T CountCovariates -U ALLOW_UNINDEXED_BAM \ -R /resources//hg19/indices/b37_1kg.fa \ --DBSNP /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -I /output/filename.b37_1kg.matefixed.bam \ -cov ReadGroupcovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate \ -recalFile /output/filename.b37_1kg.matefixed.covariate_table.csv |
| | 76 | }}} |
| | 77 | '''L11. Recalibrate mate fixed and realigned alignment''' |
| | 78 | |
| | 79 | {{{ |
| | 80 | java -jar -Xmx4g /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO \ -T TableRecalibration -U ALLOW_UNINDEXED_BAM \ -R /resources//hg19/indices/b37_1kg.fa -I /output/filename.b37_1kg.matefixed.bam \ --recal_file /output/filename.b37_1kg.matefixed.covariate_table.csv \ --out /output/filename.b37_1kg.recal.bam |
| | 81 | }}} |
| | 82 | '''L12. Sort and index recalibrated alignment''' |
| | 83 | |
| | 84 | {{{ |
| | 85 | java -jar -Xmx3g /tools/picard-tools-1.32/SortSam.jar \ INPUT=/output/filename.b37_1kg.recal.bam \ OUTPUT=/output/filename.b37_1kg.recal.sorted.bam \ SORT_ORDER=coordinate \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 86 | |
| | 87 | java -jar -Xmx3g /tools/picard-tools-1.32/BuildBamIndex.jar \ INPUT=/output/filename.b37_1kg.recal.sorted.bam \ OUTPUT=/output/filename.b37_1kg.recal.sorted.bam.bai \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 88 | }}} |
| | 89 | '''L13. Calculate covariates after realignment and recalibration''' |
| | 90 | |
| | 91 | {{{ |
| | 92 | java -jar -Xmx2g /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO \ -T CountCovariates -U ALLOW_UNINDEXED_BAM \ -R /resources//hg19/indices/b37_1kg.fa \ --DBSNP /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -I /output/filename.b37_1kg.recal.sorted.bam \ -cov ReadGroupcovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate \ -recalFile /output/filename.b37_1kg.recal.covariate_table.csv |
| | 93 | }}} |
| | 94 | '''L14. Analyze covariates before and after''' |
| | 95 | |
| | 96 | {{{ |
| | 97 | java -jar -Xmx4g /tools/Sting/dist/AnalyzeCovariates.jar -l INFO \ -resources /resources//hg19/indices/b37_1kg.fa \ --recal_file /output/filename.b37_1kg.matefixed.covariate_table.csv \ -outputDir /output/filename.b37_1kg.recal.stats_before/ \ -Rscript ${rscript} -ignoreQ 5 |
| | 98 | |
| | 99 | java -jar -Xmx4g /tools/Sting/dist/AnalyzeCovariates.jar -l INFO \ -resources /resources//hg19/indices/b37_1kg.fa \ --recal_file /output/filename.b37_1kg.recal.covariate_table.csv \ -outputDir /output/filename.b37_1kg.recal.stats_after/ \ -Rscript ${rscript} -ignoreQ 5 |
| | 100 | }}} |
| | 101 | = Sample analysis = |
| | 102 | '''S1. Merging three lanes to one''' |
| | 103 | |
| | 104 | {{{ |
| | 105 | java -jar -Xmx4g /tools/picard-tools-1.32/MergeSamFiles.jar \ INPUT/output/filename.b37_1kg.recal.sorted.ba \ INPUT/output/filename2.b37_1kg.recal.sorted.ba \ INPUT/output/filename3.b37_1kg.recal.sorted.ba \ ASSUME_SORTED=true USE_THREADING=true \ TMP_DIR=/local MAX_RECORDS_IN_RAM=30000000 \ OUTPUT=/output/sample_id.b37_1kg.bam SORT_ORDER=coordinate \ VALIDATION_STRINGENCY=SILENT |
| | 106 | }}} |
| | 107 | == To perform QC on initial data SNP calling is performed. This procedure consists of three steps. == |
| | 108 | '''S2. SNP calling using Unified Genotyper''' |
| | 109 | |
| | 110 | {{{ |
| | 111 | java -Xmx4g -jar /tools/Sting/dist/GenomeAnalysisTK.jar \ -l INFO -T UnifiedGenotyper -I /output/sample_id.b37_1kg.bam \ --out /output/sample_id.b37_1kg.qc_check_snps.vcf \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -stand_call_conf 30.0 -stand_emit_conf 10.0 |
| | 112 | }}} |
| | 113 | '''S3. Filter SNPs using variant filtration''' |
| | 114 | |
| | 115 | {{{ |
| | 116 | java -jar -Xmx4g /tools/Sting/dist/GenomeAnalysisTK.jar \ -l INFO -T VariantFiltration \ -[B:variant,VCF B:variant,VCF] /output/sample_id.b37_1kg.qc_check_snps.vcf \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ --out /output/sample_id.b37_1kg.qc_check_snps.filtered.vcf \ --maskName InDel --clusterWindowSize 10 \ --filterName GATK_standard \ --filterExpression "AB > 0.75 && DP > 40 || DP > 100 || MQ0 > 40 || SB > -0.10"|| |
| | 117 | }}} |
| | 118 | '''S4. Generate statistics on called SNPs using variant eval ''' |
| | 119 | |
| | 120 | {{{ |
| | 121 | java -Xmx4g -jar /tools/Sting/dist/GenomeAnalysisTK.jar \ -T VariantEval -R /resources//hg19/indices/b37_1kg.fa \ -l INFO \ -[B:eval,VCF B:eval,VCF] /output/sample_id.b37_1kg.qc_check_snps.filtered.vcf \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -o /output/sample_id.b37_1kg.qc_check_snps.filtered.eval |
| | 122 | }}} |
| | 123 | == After this QC check the indel and SNP calling takes place using the next commands. == |
| | 124 | '''S5. Create targets for realignment''' |
| | 125 | |
| | 126 | {{{ |
| | 127 | java -Xmx4g -jar -Djava.io.tmpdir=/local /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO \ -T RealignerTargetCreator \ -I /output/sample_id.b37_1kg.bam \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -[B:indels,VCF B:indels,VCF] /resources//hg19/indels/1kg.pilot_release.merged.indels.sites./hg19.b37_1kg.vcf \ -o /output/sample_id.b37_1kg.realign.intervals |
| | 128 | }}} |
| | 129 | '''S6. Realignment using created targets, dbSNP and indels from 1kg project''' |
| | 130 | |
| | 131 | {{{ |
| | 132 | java -Djava.io.tmpdir=/local –Xmx6g -jar \ /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO -T IndelRealigner \ -I /output/sample_id.b37_1kg.bam \ -targetIntervals /output/sample_id.b37_1kg.realign.intervals \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -[B:indels,VCF B:indels,VCF] /resources//hg19/indels/1kg.pilot_release.merged.indels.sites./hg19.b37_1kg.vcf \ --out /output/sample_id.b37_1kg.realigned.bam \ -maxReads 500000 |
| | 133 | }}} |
| | 134 | '''S7. Fix mates after realignment and create corresponding BAM index''' |
| | 135 | |
| | 136 | {{{ |
| | 137 | java -jar -Xmx6g /tools/picard-tools-1.32/FixMateInformation.jar \ INPUT=/output/sample_id.b37_1kg.realigned.bam \ OUTPUT=/output/sample_id.b37_1kg.matesfixed.bam \ SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/local |
| | 138 | |
| | 139 | java -jar -Xmx3g /tools/picard-tools-1.32/BuildBamIndex.jar \ INPUT=/output/sample_id.b37_1kg.matesfixed.bam \ OUTPUT=/output/sample_id.b37_1kg.matesfixed.bam.bai \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=/local |
| | 140 | }}} |
| | 141 | '''S8. Indel calling on realigned BAM file''' |
| | 142 | |
| | 143 | {{{ |
| | 144 | java -Xmx4g -jar /tools/Sting/dist/GenomeAnalysisTK.jar -l INFO \ -T IndelGenotyperV2 -I /output/sample_id.b37_1kg.matesfixed.bam \ --out /output/sample_id.b37_1kg.indels.vcf \ --bedOutput /output/sample_id.b37_1kg.indels.bed \ -R /resources//hg19/indices/b37_1kg.fa \ -verbose /output/sample_id.b37_1kg.indels.verboseoutput.txt |
| | 145 | }}} |
| | 146 | '''S9. Filter indels ''' |
| | 147 | |
| | 148 | {{{ |
| | 149 | perl /tools/filterSingleSampleCalls.pl \ --calls /output/sample_id.b37_1kg.indels.bed > /output/sample_id.b37_1kg.indels.filtered.bed \ --max_cons_av_mm 3.0 --max_cons_nqs_av_mm 0.5 --mode ANNOTATE |
| | 150 | }}} |
| | 151 | '''S10. SNP calling on realigned BAM file''' |
| | 152 | |
| | 153 | {{{ |
| | 154 | java -Xmx4g -jar /tools/Sting/dist/GenomeAnalysisTK.jar \ -l INFO -T UnifiedGenotyper -I /output/sample_id.b37_1kg.matesfixed.bam \ --out /output/sample_id.b37_1kg.snps.vcf \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -stand_call_conf 30.0 -stand_emit_conf 10.0 |
| | 155 | }}} |
| | 156 | '''S11. Create indel mask used in SNP calling python ''' |
| | 157 | |
| | 158 | {{{ |
| | 159 | /tools/makeIndelMask.py \ /output/sample_id.b37_1kg.indels.filtered.bed 10 \ /output/sample_id.b37_1kg.indels.mask.bed |
| | 160 | }}} |
| | 161 | '''S12. Filter SNP calls using the indel mask, dbsnp and indels from 1kg project''' |
| | 162 | |
| | 163 | {{{ |
| | 164 | java -jar -Xmx4g /tools/Sting/dist/GenomeAnalysisTK.jar \ -l INFO -T VariantFiltration \ -[B:variant,VCF B:variant,VCF] /output/sample_id.b37_1kg.snps.vcf \ -[B:mask,Bed B:mask,Bed] /output/sample_id.b37_1kg.indels.mask.bed \ -R /resources//hg19/indices/b37_1kg.fa \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ --out /output/sample_id.b37_1kg.snps.filtered.vcf \ --maskName InDel --clusterWindowSize 10 \ --filterName GATK_standard \ --filterExpression "AB > 0.75 && DP > 40 || DP > 100 || MQ0 > 40 || SB > -0.10"|| |
| | 165 | }}} |
| | 166 | '''S13. Variant eval on detected SNPs''' |
| | 167 | |
| | 168 | {{{ |
| | 169 | java -Xmx4g -jar /tools/Sting/dist/GenomeAnalysisTK.jar \ -T VariantEval -R /resources//hg19/indices/b37_1kg.fa \ -l INFO \ -[B:eval,VCF B:eval,VCF] /output/sample_id.b37_1kg.snps.filtered.vcf \ -D /resources//hg19/dbsnp/dbsnp_129_b37_b37_1kg.rod \ -o /output/sample_id.b37_1kg.snps.filtered.eval |
| | 170 | }}} |