#!/usr/bin/env python3

import tempfile
from pathlib import PosixPath
from functools import cache


def get_readfile(wildcards):
    """
    Input function for the collect_reads rule: return the raw read file
    that belongs to one value of the {read_file} wildcard.

    When reads_tarfile is set, the file is looked up under
    <workingdir>/readfiles/ (presumably populated by the rules included
    for tarfile mode -- confirm against rules/single_tarfile.smk).
    Otherwise the file is looked up by name among the paths in `reads`.

    Raises:
        ValueError: if neither reads_tarfile nor reads is set.
        KeyError: if the wildcard value is not the name of any file in
            `reads`.
    """
    if reads_tarfile:
        # Fill in the wildcard value here instead of returning a pattern
        # with a literal "{read_file}" placeholder, so the result is a
        # concrete path regardless of whether Snakemake formats the return
        # value of input functions.
        return Path(workingdir, "readfiles", f"{wildcards.read_file}.fastq.gz")

    if reads:
        read_file_to_name = map_read_file_to_name(reads)
        return read_file_to_name[wildcards.read_file]

    # Previously this fell through to `return readfile` with the variable
    # unbound, which surfaced as an UnboundLocalError.
    raise ValueError(
        "Cannot locate read files: neither reads_tarfile nor reads is set."
    )


def get_all_names(wildcards):
    """
    Return the names of all read files and restrict the {read_file}
    wildcard to exactly those names.

    When reads_tarfile is set the names come from get_tarfile_names()
    (not defined in this section; presumably provided by
    rules/single_tarfile.smk -- confirm).  Otherwise they are the sorted
    keys of the mapping built from `reads`.

    Raises:
        ValueError: if neither reads_tarfile nor reads is set.
    """
    import re

    if reads_tarfile:
        all_names = get_tarfile_names(wildcards)
    elif reads:
        read_file_to_name = map_read_file_to_name(reads)
        all_names = sorted(read_file_to_name)
    else:
        # Previously all_names was left unbound here, which surfaced as an
        # UnboundLocalError further down.
        raise ValueError(
            "Cannot list read names: neither reads_tarfile nor reads is set."
        )

    # Wildcard constraints are regular expressions, so escape each name;
    # otherwise a name containing e.g. "+" or "(" would not match itself
    # (or would make the pattern invalid).
    read_file_pattern = "|".join(re.escape(name) for name in all_names)

    wildcard_constraints:
        read_file=read_file_pattern,

    return all_names


def get_porechop_results(wildcards):
    """
    Input function for collect_porechop_results: one porechop output path
    under <workingdir>/porechop/ for every name from get_all_names().
    """
    names = get_all_names(wildcards)
    pattern = Path(workingdir, "porechop", "{read_file}.fastq")
    return expand(pattern, read_file=names)


def map_read_file_to_name(reads):
    """
    Build a dict that maps each read file's short name to its path.

    The short name is the part of the file's basename before the first
    ".".  `reads` must be a list of PosixPath objects, and every short
    name must be unique; otherwise a ValueError is raised.

    This gets run twice, but it cannot be wrapped in a cache because the
    argument is a list:
      TypeError: unhashable type: 'list'
    """
    if not isinstance(reads, list):
        raise ValueError("Input must be a list of read files.")

    read_file_to_name = {}

    for file in reads:
        if not isinstance(file, PosixPath):
            raise ValueError(f"File {file} is not a Path() object")

        # Everything before the first dot of the basename.
        name, _, _ = file.name.partition(".")

        if name not in read_file_to_name:
            read_file_to_name[name] = file
            continue

        # Two files collapse to the same short name: log the context that
        # is needed to track it down, then fail.
        logger.error(f"Read files:     {reads}")
        logger.error(f"Existing names: {sorted(set(read_file_to_name.keys()))}")
        logger.error(f"Duplicate name: {name}")
        raise ValueError(f"The name of readfile {file} is not unique.")

    return read_file_to_name


# Copy every key of the Snakemake config into this module's globals so the
# settings can be used as plain variable names.  Names that are used in this
# file but never assigned here (reads, reads_tarfile, reads_out, min_length,
# logs_directory) presumably arrive this way -- confirm against the config.
globals().update(config)
# Fresh temporary directory for this run's intermediate files.  Nothing in
# this section removes it again.
workingdir = tempfile.mkdtemp()

logger.debug(f"Using {workingdir} for intermediate files")

# With no logs directory configured, the logs are written into the temporary
# working directory.
if not logs_directory:
    logger.debug(f"Not keeping logs")
    logs_directory = workingdir
else:
    logger.debug(f"Saving logs to {logs_directory}")


# The tarfile rules are only pulled in when a tarfile of reads was given.
if reads_tarfile:

    include: "rules/single_tarfile.smk"


# Last processing step: stream filtlong's output from the pipe into
# reformat.sh, which writes the final reads file (reads_out) and a GC
# histogram (gchist).
rule compress_output:
    input:
        # Same path that rule filtlong declares as its pipe() output.
        pipe=Path(workingdir, "filtlong.fastq"),
    output:
        reads_out=reads_out,
        gchist=Path(logs_directory, "gchist.txt"),
    log:
        Path(logs_directory, "compress_output", "reformat.log"),
    # One less than the cores given to the workflow; presumably the spare
    # core is for filtlong, which feeds the pipe at the same time -- confirm.
    # NOTE(review): this evaluates to 0 when the workflow runs with one core.
    threads: workflow.cores - 1
    shell:
        "cat {input.pipe} | "
        "reformat.sh "
        "in=stdin.fastq "
        "int=f "
        "out={output.reads_out} "
        "gchist={output.gchist} "
        "threads={threads} "
        "zl=9 "
        "2>{log}"


# Run filtlong with a minimum read length on the combined porechop output and
# write the result into a pipe that rule compress_output reads from.
rule filtlong:
    input:
        Path(workingdir, "porechop.fastq"),
    output:
        # pipe(): the output is streamed to the consuming rule.
        pipe(Path(workingdir, "filtlong.fastq")),
    params:
        # min_length is not assigned in this file; presumably it comes from
        # the config -- confirm.
        min_length=min_length,
    log:
        Path(logs_directory, "filtlong.log"),
    shell:
        "filtlong "
        "--min_length {params.min_length} "
        "{input} "
        ">> {output} "
        "2> {log}"

# Concatenate all per-file porechop outputs into a single fastq file.
#
# The input files are passed to cat.sh as a glob rather than as an explicit
# list.  The glob doesn't get expanded by Snakemake and cat.sh is low memory,
# so I'm hoping this saves RAM usage for a large number of input files.
rule collect_porechop_results:
    input:
        # One porechop output per read file name (see get_porechop_results).
        get_porechop_results,
    output:
        temp(Path(workingdir, "porechop.fastq")),
    params:
        # Parent directory of the first input file.  The shell command globs
        # this directory, so it picks up every *.fastq file found there.
        reads_dir=lambda wildcards, input: subpath(input[0], parent=True),
    log:
        Path(logs_directory, "collect_porechop_results.log"),
    shell:
        "cat.sh "
        "{params.reads_dir}/*.fastq "
        "out={output} "
        "2> {log}"


# filtlong reads the input file twice, so you have to write it to disk :(
#
# Run porechop on one read file.  The input is the fastq written by rule
# collect_reads for the same {read_file}; the output is a temp() file under
# <workingdir>/porechop/.
rule porechop:
    input:
        Path(workingdir, "collect_reads", "{read_file}.fastq"),
    output:
        temp(Path(workingdir, "porechop", "{read_file}.fastq")),
    log:
        Path(logs_directory, "porechop", "{read_file}.log"),
    threads: 1
    shell:
        "porechop "
        "-i {input} "
        "-o {output} "
        "--verbosity 1 "
        "--threads {threads} "
        "--discard_middle "
        "&> {log}"


# Pass one raw read file through reformat.sh and write it as
# <workingdir>/collect_reads/{read_file}.fastq, the input of rule porechop.
rule collect_reads:
    input:
        # Resolved per {read_file} by get_readfile: a file under
        # <workingdir>/readfiles/ in tarfile mode, otherwise the matching
        # path from `reads`.
        get_readfile,
    output:
        fastq=temp(Path(workingdir, "collect_reads", "{read_file}.fastq")),
    log:
        Path(logs_directory, "collect_reads", "{read_file}.log"),
    threads: 1
    shell:
        "reformat.sh "
        "in={input} "
        "out={output.fastq} "
        "2>{log}"


include: "rules/stats.smk"


# Default target of the workflow: requests the final reads file and `stats`.
# `stats` is not assigned in this file; presumably it is defined by
# rules/stats.smk, included just above -- confirm.
rule target:
    default_target: True
    input:
        reads_out,
        stats,
