python – Matching two files (VCF to MAF) using dictionaries, and appending the contents

annotation_file

##INFO=<ID=ClinVar_CLNSIG,Number=.,xxx
##INFO=<ID=ClinVar_CLNREVSTAT,Number=.,yyy
##INFO=<ID=ClinVar_CLNDN,Number=.zzz
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    
chr1    10145   .       AAC     A       101.83  .       AC=2;AF=0.067;AN=30;aaa
chr1    10146   .       AC      A       98.25   .       AC=2;AF=0.083;AN=24;bbb
chr1    10146   .       AC      *       79.25   .       AC=2;AF=0.083;AN=24;ccc
chr1    10439   .       AC      A       81.33   .       AC=1;AF=0.008333;AN=120;ddd
chr1    10450   .       T       G       53.09   .       AC=2;AF=0.016;AN=124;eee

prioritization_file

#version 2.4
Hugo_Symbol     Entrez_Gene_Id  Center  NCBI_Build      Chromosome      Start_Position  End_Position    Strand  Variant_Classification  Variant_Type    Reference_Allele    Tumor_Seq_Allele1   Tumor_Seq_Allele2
DDX11L1 100287596       .       GRCh38  chr1    10146   10147   +       5'Flank DEL     AC      AC      -       novel           TUMOR   NORMAL  AC      AC                                              
DDX11L1 100287596       .       GRCh38  chr1    10147   10147   +       5'Flank DEL     C       C       -       rs779258992             TUMOR   NORMAL  C       C                                       
Unknown 0       .       GRCh38  chr1    10146   10147   +       Targeted_Region DEL     AC      AC      *                       TUMOR   NORMAL  AC      AC                                              
DDX11L1 100287596       .       GRCh38  chr1    10440   10440   +       5'Flank DEL     C       C       -       rs112766696             TUMOR   NORMAL  C       C                                       
DDX11L1 100287596       .       GRCh38  chr1    10450   10450   +       5'Flank SNP     T       T       G       novel           TUMOR   NORMAL  T       T                                               
DDX11L1 100287596       .       GRCh38  chr1    10456   10456   +       5'Flank SNP     T       T       G       rs1447714920            TUMOR   NORMAL  T       T      

Expected output

Hugo_Symbol     Entrez_Gene_Id  Center  NCBI_Build      Chromosome      Start_Position  End_Position    Strand  Variant_Classification  Variant_Type    Reference_Allele    Tumor_Seq_Allele1   Tumor_Seq_Allele2   Allele_Count   Allele_Freq    Allele_Num
DDX11L1 100287596       .       GRCh38  chr1    10146   10147   +       5'Flank DEL     AC      AC      -       novel           TUMOR   NORMAL  AC      AC  NA  NA  NA                                              
DDX11L1 100287596       .       GRCh38  chr1    10147   10147   +       5'Flank DEL     C       C       -       rs779258992             TUMOR   NORMAL  C       C  NA  NA  NA                                        
Unknown 0       .       GRCh38  chr1    10146   10147   +       Targeted_Region DEL     AC      AC      *                       TUMOR   NORMAL  AC      AC  2    0.083    24                                              
DDX11L1 100287596       .       GRCh38  chr1    10440   10440   +       5'Flank DEL     C       C       -       rs112766696             TUMOR   NORMAL  C       C  NA  NA  NA                                        
DDX11L1 100287596       .       GRCh38  chr1    10450   10450   +       5'Flank SNP     T       T       G       novel           TUMOR   NORMAL  T       T   2    0.016    124                                               
DDX11L1 100287596       .       GRCh38  chr1    10456   10456   +       5'Flank SNP     T       T       G       rs1447714920            TUMOR   NORMAL  T       T  NA  NA  NA                                        

I want to match two files using dictionaries and append the matched contents. I have already extracted all the keys and values from both files, but I don’t know how to write them all (the matched variants) to a file. If you have an idea, please let me know. Thanks.

(steps)

  1. get the ‘chr_pos_ref_alt’ information from the VCF file as ‘keys’, and the ‘AC,AF,AN’ entries from the INFO column of the VCF file as ‘values’
  2. get the ‘chr_pos_ref_alt’ information from the MAF file as ‘keys’, and each whole line as ‘values’
  3. write header and contents

Code

annotation_file_path = "annotation_file.vep.vcf"
prioritization_file_path = "./prioritization_file.maf"
output_file_path = "./temp.txt"

# INFO tags copied out of the VCF and the column names they get in the output.
INFO_TAGS = ("AC", "AF", "AN")
ALLELE_COLUMNS = ("Allele_Count", "Allele_Freq", "Allele_Num")
# Written in place of each allele column when a MAF row has no VCF match.
MISSING_VALUE = "NA"

# 0-based columns that make up the "chr_pos_ref_alt" matching key.
# VCF: CHROM, POS, REF, ALT.
VCF_KEY_COLUMNS = (0, 1, 3, 4)
VCF_INFO_COLUMN = 7
# MAF: Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2.
MAF_KEY_COLUMNS = (4, 5, 10, 12)


def _variant_key(fields, columns):
    """Join the selected columns with "_"; return None if the row is too short."""
    try:
        return "_".join(fields[index] for index in columns)
    except IndexError:
        return None


def read_vcf_allele_info(vcf_path):
    """Collect the AC/AF/AN values of every variant in a VCF file.

    Lines up to and including the one starting with "#CHROM" are skipped.
    Blank lines and lines with fewer than eight tab-separated columns are
    ignored.

    Args:
        vcf_path: Path of the tab-separated VCF file.

    Returns:
        A dict mapping "chr_pos_ref_alt" to an (AC, AF, AN) tuple of strings.
        Values are copied verbatim from the INFO column; a tag that is absent
        from a record is reported as MISSING_VALUE.
    """
    allele_info = {}
    in_body = False
    with open(vcf_path, "r") as vcf_file:
        for raw_line in vcf_file:
            line = raw_line.rstrip("\r\n")
            if not in_body:
                # Everything before the column-header line is meta-information.
                if line.startswith("#CHROM"):
                    in_body = True
                continue
            if not line.strip():
                continue

            fields = line.split("\t")
            key = _variant_key(fields, VCF_KEY_COLUMNS)
            if key is None or len(fields) <= VCF_INFO_COLUMN:
                continue

            # Look the tags up by name: their position inside INFO is not
            # fixed, and flag-style entries carry no "=" at all.
            info = {}
            for entry in fields[VCF_INFO_COLUMN].split(";"):
                tag, separator, value = entry.partition("=")
                if separator:
                    info[tag] = value

            allele_info[key] = tuple(
                info.get(tag, MISSING_VALUE) for tag in INFO_TAGS
            )
    return allele_info


def read_maf(maf_path):
    """Read a MAF file in a single pass.

    Lines starting with "#" (such as "#version 2.4") and blank lines are
    skipped. The first remaining line is taken as the column header.

    Args:
        maf_path: Path of the tab-separated MAF file.

    Returns:
        A (header, rows) tuple. ``header`` is the header line without its
        newline, or None if the file has no header. ``rows`` is a list of
        field lists in file order; a list rather than a dict is used so that
        rows sharing the same chr/pos/ref/alt are all kept.
    """
    header = None
    rows = []
    with open(maf_path, "r") as maf_file:
        for raw_line in maf_file:
            # Strip only the line ending: trailing tabs are empty MAF columns.
            line = raw_line.rstrip("\r\n")
            if not line.strip() or line.startswith("#"):
                continue
            if header is None:
                header = line
                continue
            rows.append(line.split("\t"))
    return header, rows


def annotate_maf(vcf_path, maf_path, out_path):
    """Write the MAF file with AC/AF/AN columns appended from the VCF file.

    Every MAF row is written, in its original order. A row whose
    "chr_pos_ref_alt" key equals that of a VCF record gets that record's
    values; any other row gets MISSING_VALUE in all three columns. Keys are
    compared as plain strings, so differently normalised representations of
    the same variant do not match.

    Args:
        vcf_path: Path of the annotation VCF file.
        maf_path: Path of the prioritization MAF file.
        out_path: Path of the file to create or overwrite.

    Returns:
        The number of MAF rows that matched a VCF record.

    Raises:
        ValueError: If the MAF file contains no header line.
    """
    allele_info = read_vcf_allele_info(vcf_path)
    header, rows = read_maf(maf_path)
    if header is None:
        raise ValueError("no header line found in MAF file: " + str(maf_path))

    unmatched = (MISSING_VALUE,) * len(INFO_TAGS)
    matched_rows = 0
    with open(out_path, "w") as output_file:
        output_file.write(header + "\t" + "\t".join(ALLELE_COLUMNS) + "\n")
        for fields in rows:
            key = _variant_key(fields, MAF_KEY_COLUMNS)
            values = allele_info.get(key)
            if values is None:
                values = unmatched
            else:
                matched_rows += 1
            output_file.write("\t".join(fields) + "\t" + "\t".join(values) + "\n")
    return matched_rows


if __name__ == "__main__":
    annotate_maf(annotation_file_path, prioritization_file_path, output_file_path)

                        

Read more here: Source link