annotation_file
##INFO=<ID=ClinVar_CLNSIG,Number=.,xxx
##INFO=<ID=ClinVar_CLNREVSTAT,Number=.,yyy
##INFO=<ID=ClinVar_CLNDN,Number=.,zzz
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 10145 . AAC A 101.83 . AC=2;AF=0.067;AN=30;aaa
chr1 10146 . AC A 98.25 . AC=2;AF=0.083;AN=24;bbb
chr1 10146 . AC * 79.25 . AC=2;AF=0.083;AN=24;ccc
chr1 10439 . AC A 81.33 . AC=1;AF=0.008333;AN=120;ddd
chr1 10450 . T G 53.09 . AC=2;AF=0.016;AN=124;eee
prioritization_file
#version 2.4
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
DDX11L1 100287596 . GRCh38 chr1 10146 10147 + 5'Flank DEL AC AC - novel TUMOR NORMAL AC AC
DDX11L1 100287596 . GRCh38 chr1 10147 10147 + 5'Flank DEL C C - rs779258992 TUMOR NORMAL C C
Unknown 0 . GRCh38 chr1 10146 10147 + Targeted_Region DEL AC AC * TUMOR NORMAL AC AC
DDX11L1 100287596 . GRCh38 chr1 10440 10440 + 5'Flank DEL C C - rs112766696 TUMOR NORMAL C C
DDX11L1 100287596 . GRCh38 chr1 10450 10450 + 5'Flank SNP T T G novel TUMOR NORMAL T T
DDX11L1 100287596 . GRCh38 chr1 10456 10456 + 5'Flank SNP T T G rs1447714920 TUMOR NORMAL T T
Expected output
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Allele_Count Allele_Freq Allele_Num
DDX11L1 100287596 . GRCh38 chr1 10146 10147 + 5'Flank DEL AC AC - novel TUMOR NORMAL AC AC NA NA NA
DDX11L1 100287596 . GRCh38 chr1 10147 10147 + 5'Flank DEL C C - rs779258992 TUMOR NORMAL C C NA NA NA
Unknown 0 . GRCh38 chr1 10146 10147 + Targeted_Region DEL AC AC * TUMOR NORMAL AC AC 2 0.083 24
DDX11L1 100287596 . GRCh38 chr1 10440 10440 + 5'Flank DEL C C - rs112766696 TUMOR NORMAL C C NA NA NA
DDX11L1 100287596 . GRCh38 chr1 10450 10450 + 5'Flank SNP T T G novel TUMOR NORMAL T T 2 0.016 124
DDX11L1 100287596 . GRCh38 chr1 10456 10456 + 5'Flank SNP T T G rs1447714920 TUMOR NORMAL T T NA NA NA
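I want to match the variants in two files using dictionaries and append the matching allele information from the VCF file to each line of the MAF file. I have already extracted the keys and values from both files, but I don’t know how to write the result (all MAF lines, with the allele information for matched variants and NA otherwise) to an output file. If you have an idea, please let me know. Thanks.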
(steps)
- From the VCF file, use ‘chr_pos_ref_alt’ as the key and the ‘AC, AF, AN’ values from the INFO column as the value.
- From the MAF file, use ‘chr_pos_ref_alt’ as the key and the whole line as the value.
- Write the header and then the contents to the output file.
Code
# Annotate a MAF file with allele statistics taken from a VCF file.
#
# Every MAF data row is written to the output in its original order, followed
# by three extra columns (Allele_Count, Allele_Freq, Allele_Num).  The values
# come from the VCF record that has the same chromosome, position, reference
# allele and alternate allele; rows without such a record get "NA" instead.
annotation_file_path = "annotation_file.vep.vcf"
prioritization_file_path = "./prioritization_file.maf"
output_file_path = "./temp.txt"

# Header text for the three columns appended to every MAF row.
vcf_header = "Allele_Count" + '\t' + 'Allele_Freq' + '\t' + 'Allele_Num'

# Placeholder used for a single missing value and for a whole unmatched row.
NO_VALUE = "NA"
NO_MATCH = '\t'.join([NO_VALUE] * 3)


def _normalize_allele_freq(text):
    """Return *text* in canonical float notation when it is a single number.

    Values that are not a plain float (for example a comma-separated list or
    the "NA" placeholder) are returned unchanged instead of raising.
    """
    try:
        return str(float(text))
    except ValueError:
        return text


def parse_vcf_allele_info(vcf_lines):
    """Build a lookup of allele statistics from the lines of a VCF file.

    Args:
        vcf_lines: iterable of text lines (an open file works).  Everything
            up to and including the line starting with "#CHROM" is skipped.

    Returns:
        dict mapping "chrom_pos_ref_alt" to "AC<TAB>AF<TAB>AN".  A tag that
        is absent from the INFO column is reported as "NA".  When a key
        occurs more than once, the last record wins.
    """
    vcf_dict = {}
    bool_start_variant = False
    for raw_line in vcf_lines:
        line = raw_line.rstrip('\r\n')
        if not bool_start_variant:
            if line.startswith("#CHROM"):
                bool_start_variant = True
            continue
        list_variant = line.split('\t')
        if len(list_variant) < 8:
            # Blank or truncated line: there is no INFO column to read.
            continue
        # ChrNum_Pos_RefAllele_AltAllele
        key_from_annotation_file = '_'.join(
            (list_variant[0], list_variant[1], list_variant[3], list_variant[4])
        )
        # Look tags up by name so the order of INFO entries does not matter.
        info = {}
        for entry in list_variant[7].split(';'):
            tag, _, value = entry.partition('=')
            info[tag] = value
        # Allele_Count '\t' Allele_Frequency '\t' Allele_Number
        vcf_dict[key_from_annotation_file] = '\t'.join((
            info.get("AC", NO_VALUE),
            _normalize_allele_freq(info.get("AF", NO_VALUE)),
            info.get("AN", NO_VALUE),
        ))
    return vcf_dict


def annotate_maf_lines(maf_lines, vcf_dict):
    """Yield the output lines (without newline) for the annotated MAF.

    Lines starting with "#" (such as "#version 2.4") and blank lines are
    dropped.  The first remaining line is treated as the column header and
    gets ``vcf_header`` appended; every later line is a variant row and gets
    its allele statistics, or "NA" columns when nothing matches.

    Args:
        maf_lines: iterable of text lines (an open file works).
        vcf_dict: lookup produced by ``parse_vcf_allele_info``.
    """
    header_written = False
    for raw_line in maf_lines:
        line = raw_line.rstrip('\r\n')
        if not line or line.startswith('#'):
            continue
        if not header_written:
            header_written = True
            yield line + '\t' + vcf_header
            continue
        list_variant = line.split('\t')
        if len(list_variant) > 12:
            # ChrNum_Pos_RefAllele_AltAllele
            # (Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2)
            key_from_prioritization_file = '_'.join(
                (list_variant[4], list_variant[5], list_variant[10], list_variant[12])
            )
            allele_information = vcf_dict.get(key_from_prioritization_file, NO_MATCH)
        else:
            # Too few columns to build a key; keep the row but mark it unmatched.
            allele_information = NO_MATCH
        yield line + '\t' + allele_information


def main():
    """Read the VCF and MAF files and write the annotated MAF to the output."""
    with open(annotation_file_path, "r") as annotation_file:
        vcf_dict = parse_vcf_allele_info(annotation_file)
    with open(prioritization_file_path, 'r') as prioritization_file:
        with open(output_file_path, 'w') as output_file:
            output_file.writelines(
                out_line + '\n'
                for out_line in annotate_maf_lines(prioritization_file, vcf_dict)
            )


if __name__ == "__main__":
    main()
Read more here: Source link