python – Missing input files after defining them in function

I am trying to do QC on RNA-seq data that is tarballed. I am using Snakemake as a workflow manager and am aware that Snakemake does not handle one-to-many rules well. I thought defining a checkpoint would fix the problem, but when I run the script I get this error message from rule fastqc.

MissingInputException in line 49 of /home/user/2022-h1n1/Snakefile:
Missing input files for rule fastqc:
raw/[]_R1.fastq.gz
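
For reference, the checkpoint/aggregation pattern I am trying to follow (from the Snakemake docs on data-dependent conditional execution) looks roughly like this; somestep, aggregate, and the file names here are placeholders rather than my actual rules:

checkpoint somestep:
    output: directory("post")
    shell: "..."

def aggregate_input(wildcards):
    # .get() defers evaluation until the checkpoint has run; after that the
    # output directory exists and can be globbed for the real file names
    out_dir = checkpoints.somestep.get(**wildcards).output[0]
    return expand("post/{i}.txt",
                  i=glob_wildcards(os.path.join(out_dir, "{i}.txt")).i)

rule aggregate:
    input: aggregate_input
    output: "aggregated.txt"
    shell: "cat {input} > {output}"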

Here is my Snakefile; the data is linked in rule download_data. The file is 61.1 GB, so it may not be worth downloading if your system does not have much free disk space. Any advice would help!

# Snakemake file - input raw reads to generate quant files for analysis in R
configfile: "config.yaml"

import io 
import os
import pandas as pd
import pathlib
from snakemake.exceptions import print_exception, WorkflowError

#----SET VARIABLES----#
PROJ = config["proj_name"]
INPUTDIR = config["raw-data"]
SCRATCH = config["scratch"]
REFERENCE = config["ref"]
OUTPUTDIR = config["outputDIR"]

# Adapters
SE_ADAPTER = config['seq']['SE']
SE_SEQUENCE = config['seq']['trueseq-se']

# Organism
TRANSCRIPTOME = config['transcriptome']['human']
SPECIES = config['species']['human']

# glob_wildcards returns a Wildcards namedtuple; take the .basenames list,
# otherwise expand() formats the namedtuple itself (which is where the
# literal [] in the error message comes from)
SAMPLE_LIST = glob_wildcards(INPUTDIR + "{basenames}_R1.fastq.gz").basenames

rule all:
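    # default targets; note the expand() calls below use SAMPLE_LIST,
    # which is evaluated at parse time, before the checkpoint has run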
    input:
        "finished.txt",
        raw_multi_html = SCRATCH + "fastqc/raw_multiqc.html",
        raw_multi_stats = SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        raw_qc = expand( SCRATCH + "fastqc/{basenames}_R1_fastqc.zip", basenames=SAMPLE_LIST),
        trim_qc = expand( SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip", basenames=SAMPLE_LIST)

rule download_data:
    output: "high_quality_files.tgz"
    shell: "curl -L -o {output} https://osf.io/pcxfg/download"

checkpoint decompress_h1n1:
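    # a checkpoint rather than a plain rule, so the DAG is re-evaluated
    # once the tarball has been extracted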
    output: directory(INPUTDIR)
    input: "high_quality_files.tgz"
    params: INPUTDIR
    shell:
        '''
        mkdir -p {params}
        tar xzvf {input} -C {params}
        '''

rule fastqc:
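    # this is the rule the MissingInputException above points at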
    input:  INPUTDIR + "{basenames}_R1.fastq.gz"
    output:
        raw_html = SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
        raw_zip = SCRATCH + "fastqc/{basenames}_R1_fastqc.zip"
    wrapper:
        "0.80.3/bio/fastqc"

def aggregate_decompress_h1n1(wildcards):
    # wait for the checkpoint, then glob the decompressed directory
    # for the actual sample basenames
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
    filenames = expand([SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
                        SCRATCH + "fastqc/{basenames}_R1_fastqc.zip"],
                       basenames=glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
    return filenames

rule download_trimmomatic_adapter_file:
    output: REFERENCE + SE_ADAPTER
    shell:
        """
        curl -L {SE_SEQUENCE} -o {output}
        """

rule trimmomatic_se:
    input: 
        reads= INPUTDIR + "{basenames}_R1.fastq.gz",
        adapters= REFERENCE + SE_ADAPTER,
    output: 
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        unpaired = SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz"
    conda: "env/rnaseq.yml"
    shell:
        """
        trimmomatic SE {input.reads} \
        {output.reads} {output.unpaired} \
        ILLUMINACLIP:{input.adapters}:2:0:15 LEADING:2 TRAILING:2 \
        SLIDINGWINDOW:4:2 MINLEN:25
        """

rule fastqc_trim:
    input: SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz"
    output:
      html = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
      zip = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip"
    params: ""
    log:
      SCRATCH + "logs/fastqc/{basenames}_R1_trimmed.log"
    wrapper:
        "0.35.2/bio/fastqc"

rule multiqc:
    input:
        raw_qc = expand(SCRATCH + "fastqc/{basenames}_R1_fastqc.zip", basenames=SAMPLE_LIST),
        trim_qc = expand(SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip", basenames=SAMPLE_LIST)
    output:
        raw_multi_html = SCRATCH + "fastqc/raw_multiqc.html", 
        raw_multi_stats = SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        trim_multi_html = SCRATCH + "fastqc/trimmed_multiqc.html", 
        trim_multi_stats = SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"
    conda: "env/rnaseq.yml"
    shell: 
        """
        multiqc -n multiqc.html {input.raw_qc} #run multiqc
        mv multiqc.html {output.raw_multi_html} #rename html
        mv multiqc_data/multiqc_general_stats.txt {output.raw_multi_stats} #move and rename stats
        rm -rf multiqc_data #clean-up
        #repeat for trimmed data
        multiqc -n multiqc.html {input.trim_qc} #run multiqc
        mv multiqc.html {output.trim_multi_html} #rename html
        mv multiqc_data/multiqc_general_stats.txt {output.trim_multi_stats} #move and rename stats
        rm -rf multiqc_data #clean-up
        """ 

rule download_transcriptome:
    output: REFERENCE + SPECIES
    shell:
        """
        curl -L {TRANSCRIPTOME} -o {output}
        """

rule salmon_index:
    input:
        ref = REFERENCE + SPECIES
    output: directory(OUTPUTDIR + "quant/sc_ensembl_index")
    conda: "env/rnaseq.yml"
    shell:
        """
        salmon index --index {output} --transcripts {input.ref} # --type quasi
        """

rule salmon_quant:
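    # salmon quant writes quant.sf inside the directory passed to -o,
    # so pinning quant.sf here tracks the per-sample output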
    input:
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        index_dir = OUTPUTDIR + "quant/sc_ensembl_index"
    output: OUTPUTDIR + "{basenames}_quant/quant.sf"
    params:
        outdir = lambda wildcards: OUTPUTDIR + wildcards.basenames + "_quant"
    conda: "env/rnaseq.yml"
    shell:
        """
        salmon quant -i {input.index_dir} --libType A -r {input.reads} -o {params.outdir} --seqBias --gcBias --validateMappings
        """

rule finished:
    input: aggregate_decompress_h1n1
    output: "finished.txt"
    shell:
        '''
        touch {output}
        '''
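
In case it matters, I am running the workflow with something along these lines (the flags are just my usual defaults):

snakemake --use-conda --cores 4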
