PageRenderTime 24ms
CodeModel.GetById 16ms
app.highlight 5ms
RepoModel.GetById 1ms
app.codeStats 0ms
/Phase/phase_trio.sh
github.com/BioinformaticsArchive/fCNV
Shell | 87 lines |
37 code |
21 blank |
29 comment |
2 complexity | 381d3181a62b5f4993d29ca92b939711 MD5 |
raw file
1#!/bin/bash 2# --> genotype & phase the trio 3 4# $1 - path to maternal bam files 5# $2 - path to paternal bam files 6# $3 - path to fetal bam files 7# $4 - reference sequence .fa file 8# $5 - region to extract 9 10#logfile=log_phaseTrio_$5.$$.log 11#exec > $logfile 2>&1 12 13date 14 15if [ $# -ne 5 ]; then 16 echo "Wrong number of arguments: got $#, expected 5" 17 exit 18fi 19 20bindir="/dupa-filer/laci/bin" 21 22#parse out chromosome ID 23region=$5 24chr=`echo $region | awk 'BEGIN {FS=":"}{print $1}'` 25chrno=`echo $chr | awk 'BEGIN {FS="chr"}{print $2}'` 26reference=$4 27 28<<comment 29# (1) extract the region and remove PCR duplicates 30echo "merging and removing PCR duplicates:" 31samtools merge -R $region - $1*/*/*.bam | samtools rmdup - __M.part.bam & 32samtools merge -R $region - $2*/*/*.bam | samtools rmdup - __P.part.bam & 33samtools merge -R $region - $3*/*/*.bam | samtools rmdup - __F.part.bam & 34wait 35samtools index __M.part.bam & 36samtools index __P.part.bam & 37samtools index __F.part.bam & 38wait 39#samtools view -h __M.part.bam > __M.part.sam & 40#samtools view -h __P.part.bam > __P.part.sam & 41#wait 42echo "-------- step 1 done ----------" 43comment 44 45# (2) genotype M, P, F, filter and phase 46prefix=trio 47echo "genotyping the trio" 48time samtools mpileup -uDSI -C50 -r $region -f $reference __M.allreads.part.bam __P.part.bam __F.part.bam | bcftools view -bvcg - > $prefix.genotype.raw.bcf 49 50time bcftools view $prefix.genotype.raw.bcf | vcfutils.pl varFilter -d60 -D200 -Q20 > $prefix.genotype.vcf 51# TODO: ???? what limit for depth of coverage to use? 52 53#annotate SNPs by rs-ids from dbSNP 54echo "annotating called SNPs" 55java -Xmx24000m -jar ~/apps/snpEff/SnpSift.jar annotate -v /dupa-filer/laci/dbSnp.vcf $prefix.genotype.vcf > $prefix.genotype.annot.vcf 56 57#extract only SNPs with reasonable quality score TODO: change qlimit depending on nummber of samples 58snpsFile=$prefix.snps.annot 59cat $prefix.genotype.annot.vcf | extract_annot_snps.awk -v qlimit=100 > $snpsFile.vcf 60 61#compress and index 62#bgzip $prefix.snps.annot.vcf 63#tabix -p vcf $prefix.snps.annot.vcf.gz 64 65refpanels=/dupa-filer/laci/reference_panels/$chr.1kg.ref.phase1_release_v3.20101123.vcf.gz 66 67#modify our trio VCF file so that its records are consistent with the reference VCF file 68#...this doesn't seems to work, rather use custom implementation - conform.py 69#time java -jar ~/apps/jar/conform-gt.jar gt=$prefix.snps.annot.vcf.gz out=conform chrom=$chr ref=$refpanels 70 71#exclude markers that are not in the reference 72echo "conforming markers with the reference panels" 73zcat $refpanels | conform.py $snpsFile.vcf /dev/stdin 74sed "s/$chr/$chrno/g" $snpsFile.ftr.vcf > $snpsFile.ftr2.vcf 75 76#phase by Beagle 77echo "invoking Beagle for trio phasing w/ reference panels" 78time java -Xmx24000m -jar $bindir/b4.r1128.jar gt=$snpsFile.ftr2.vcf ped=pedigree.txt out=$prefix.phase ref=$refpanels impute=false 79#time java -jar $bindir/b4.r1128.jar gt=$snpsFile.vcf.gz ped=pedigree.txt out=$prefix.phase 80 81zcat $prefix.phase.vcf.gz > $prefix.phase.vcf 82 83# CLEAN-UP 84#rm __*.* 85 86 87
Read more here: Source link