phase_trio.sh | searchcode








phase_trio.sh | searchcode






PageRenderTime 24ms
CodeModel.GetById 16ms
app.highlight 5ms
RepoModel.GetById 1ms
app.codeStats 0ms






/Phase/phase_trio.sh







github.com/BioinformaticsArchive/fCNV

Shell | 87 lines |
37 code |
21 blank |
29 comment |
2 complexity | 381d3181a62b5f4993d29ca92b939711 MD5 |
raw file




 1#!/bin/bash
 2# --> genotype & phase the trio
 3
 4# $1 - path to maternal bam files
 5# $2 - path to paternal bam files
 6# $3 - path to fetal bam files
 7# $4 - reference sequence .fa file
 8# $5 - region to extract
 9
10#logfile=log_phaseTrio_$5.$$.log
11#exec > $logfile 2>&1
12
13date
14
15if [ $# -ne 5 ]; then
16    echo "Wrong number of arguments: got $#, expected 5"
17    exit
18fi
19
20bindir="/dupa-filer/laci/bin"
21
22#parse out chromosome ID
23region=$5
24chr=`echo $region | awk 'BEGIN {FS=":"}{print $1}'`
25chrno=`echo $chr | awk 'BEGIN {FS="chr"}{print $2}'`
26reference=$4
27
28<<comment
29# (1) extract the region and remove PCR duplicates
30echo "merging and removing PCR duplicates:"
31samtools merge -R $region - $1*/*/*.bam | samtools rmdup - __M.part.bam &
32samtools merge -R $region - $2*/*/*.bam | samtools rmdup - __P.part.bam &
33samtools merge -R $region - $3*/*/*.bam | samtools rmdup - __F.part.bam &
34wait
35samtools index __M.part.bam &
36samtools index __P.part.bam &
37samtools index __F.part.bam &
38wait
39#samtools view -h __M.part.bam > __M.part.sam &
40#samtools view -h __P.part.bam > __P.part.sam &
41#wait
42echo "-------- step 1 done ----------"
43comment
44
45# (2) genotype M, P, F, filter and phase
46prefix=trio
47echo "genotyping the trio"
48time samtools mpileup -uDSI -C50 -r $region -f $reference __M.allreads.part.bam __P.part.bam __F.part.bam | bcftools view -bvcg - > $prefix.genotype.raw.bcf
49
50time bcftools view $prefix.genotype.raw.bcf | vcfutils.pl varFilter -d60 -D200 -Q20 > $prefix.genotype.vcf
51# TODO: ???? what limit for depth of coverage to use?
52
53#annotate SNPs by rs-ids from dbSNP
54echo  "annotating called SNPs"
55java -Xmx24000m -jar ~/apps/snpEff/SnpSift.jar annotate -v /dupa-filer/laci/dbSnp.vcf $prefix.genotype.vcf > $prefix.genotype.annot.vcf
56 
57#extract only SNPs with reasonable quality score TODO: change qlimit depending on nummber of samples
58snpsFile=$prefix.snps.annot
59cat $prefix.genotype.annot.vcf | extract_annot_snps.awk -v qlimit=100 > $snpsFile.vcf
60
61#compress and index
62#bgzip $prefix.snps.annot.vcf
63#tabix -p vcf $prefix.snps.annot.vcf.gz
64
65refpanels=/dupa-filer/laci/reference_panels/$chr.1kg.ref.phase1_release_v3.20101123.vcf.gz
66
67#modify our trio VCF file so that its records are consistent with the reference VCF file
68#...this doesn't seems to work, rather use custom implementation - conform.py
69#time java -jar ~/apps/jar/conform-gt.jar  gt=$prefix.snps.annot.vcf.gz out=conform chrom=$chr ref=$refpanels
70
71#exclude markers that are not in the reference
72echo  "conforming markers with the reference panels"
73zcat $refpanels | conform.py $snpsFile.vcf /dev/stdin
74sed "s/$chr/$chrno/g" $snpsFile.ftr.vcf > $snpsFile.ftr2.vcf
75
76#phase by Beagle
77echo  "invoking Beagle for trio phasing w/ reference panels"
78time java -Xmx24000m -jar $bindir/b4.r1128.jar gt=$snpsFile.ftr2.vcf ped=pedigree.txt out=$prefix.phase ref=$refpanels impute=false
79#time java -jar $bindir/b4.r1128.jar gt=$snpsFile.vcf.gz ped=pedigree.txt out=$prefix.phase
80
81zcat $prefix.phase.vcf.gz > $prefix.phase.vcf
82
83# CLEAN-UP
84#rm __*.*
85
86
87








searchcode is proudly made in Sydney by ben boyter







Read more here: Source link