High ref mismatch rate after liftOver from 23andme hg19 to hg38

I lifted some 23andme files from hg19 to hg38 using the following workflow in R calling samtools,plink and liftOver:

library(tidyverse)


#set working directory to data directory
trio_wd <- str_glue(here::here(),'/trio/K/')

#create file list for raw data
file_list <- str_c(trio_wd,dir(trio_wd)) %>% str_extract('genome.+\\d.txt') %>% str_extract('^(?:(?!admix).)+$') %>%
  unique() %>%
  {.[!is.na(.)]}  %>% str_c(trio_wd,.)

#liftover loop
for (x in file_list){
  #extract sample name
  name = str_extract(x,'(?<=genome_)(.*?)(?=(_v5|_Full))') %>% str_replace('_','')
  #recode to vcf
  call = (str_glue("plink --23file {x} {name} --snps-only just-acgt --recode vcf --out {trio_wd}{name}"))
  system(call)
  #recode to uscf bed format 
  #https://www.researchgate.net/post/How_can_I_create_a_UCSC_BED_file_from_plink_data
  call = str_c("grep -v '^#' ", trio_wd,name,".vcf | awk -F '\t' '{print $1,($2-1),$2,$3}' | sed -e 's/^/chr/' > ", trio_wd,name,".ucsc.bed ")
  system(call)
  #iftover
  #correct chain file : https://groups.google.com/a/soe.ucsc.edu/g/genome/c/BYVXWMpOEjc
  system(str_glue("liftOver  {trio_wd}{name}.ucsc.bed /reference/hg19ToHg38.over.chain.gz {trio_wd}{name}_liftOver.bed  {trio_wd}{name}unlifted.bed"))

  #update positions
#read in the original vcf file
vcf <- read_table(str_glue( "{trio_wd}{name}.vcf"), skip = 30)
#read in the liftOver output
lift <- read_table(str_glue("{trio_wd}{name}_liftOver.bed"),col_names = F)
#update the positions of the original vcf file to the lifted positions
vcf %>% filter(ID %in% lift$X4) %>% mutate(POS = lift$X3,
                                           `#CHROM` = lift$X1) %>% write_tsv(str_glue( "{trio_wd}{name}_lift.vcf"), col_names = T)
#recode to plink
call = str_glue("plink -vcf {trio_wd}{name}_lift.vcf --make-bed --aec --chr 1-22 -out {trio_wd}{name}_lift")
system(call)
}


#create list of lifted files
file_list <- list.files(trio_wd) %>%  str_extract('(.+((k|K)|P))_lift') %>%
  unique %>% {.[!is.na(.)]} %>% str_c(trio_wd,.)

#main loop for data cleaning 
for (x in file_list){
name = str_extract(x,'[A-Za-z]+(?=((k|K)|P)).+')
call = str_glue("plink2 --bfile {trio_wd}{name} --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_fixvar")
system(call)
call = str_glue("plink2 --bfile {trio_wd}{name}_fixvar --rm-dup --make-bed -out {trio_wd}{name}_dup")
system(call)
#a few (<10 SNPs): 'duplicate IDs with inconsistent genotype data or variant information', remove if there
if(file.exists(str_glue("{trio_wd}{name}_dup.rmdup.mismatch"))){
  call = str_glue("plink -bfile {trio_wd}{name}_fixvar --exclude {trio_wd}{name}_dup.rmdup.mismatch --make-bed -out {trio_wd}{name}_dup_mis")
  system(call)
  call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_dup_mis_id")
  system(call)
  call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis_id --rm-dup --make-bed -out {trio_wd}{name}_dup")
  system(call)
}
#ensure that there are no positions with same allele coding
call = str_c("awk '$5==$6 {print $2}' " ,trio_wd, name, "_dup.bim >", trio_wd, name, "_dup_noalle.txt")
system(call)
call = str_glue("plink -bfile {trio_wd}{name}_dup --exclude {trio_wd}{name}_dup_noalle.txt  --make-bed -out {trio_wd}{name}_dup_fixal")
system(call)
}


file_list <- list.files(trio_wd) %>%  str_extract('.+_dup_fixal.bed$') %>%
  str_remove('.bed')  %>%
    {.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))

# try to merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))

for(x in file_list){
  system(str_glue("plink --bfile {x} --exclude {trio_wd}K-merge.missnp --make-bed --out {str_replace(x, 'fixal', 'subset')}"))
}

file_list <- list.files(trio_wd) %>%  str_extract('.+_dup_subset.bed') %>%
  str_remove('.bed')  %>%
  {.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))

# merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))

If do some sanity check afterwards to verify that the liftover process went well I tried the fixref plugin from bcftools to get an overview of possible ref missmatches:

#use some filtering to improve genotype rates
     call <- str_glue('plink --bfile {trio_wd}K  --snps-only just-acgt -geno 0.05 --recode vcf bgz --out {trio_wd}K ')
      system(call)
#index the compressed vcf file
      call = str_glue('bcftools index -f {trio_wd}K.vcf.gz')
      system(call)
#call fixref
      call = str_glue("bcftools +fixref {trio_wd}K.vcf.gz -- -f reference/GRCh38_full_analysis_set_plus_decoy_hla_modified.fa")
      system(call)

The results can be seen in the following screenshot:

fixref plugin result

This clearly looks to me that something went wrong but I can’t figure out what the possible problem here could be. Do I use an incorrect hg38 reference file? Did I use some wrong filter? Is the liftOver procedure inccorect?

Read more here: Source link