import csv
import logging
from io import StringIO
from typing import Dict, Union

from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import table
from multiqc.plots.table_object import TableConfig

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
    """The module parses the `quality_report.tsv` files generated by CheckM2.
    All statistics for all samples are saved to `multiqc_data/checkm2-first-table.txt`.

    Tested with CheckM2 v1.0.1 and v1.0.2
    """

    # Columns that hold free text; their values are never coerced to numbers.
    _TEXT_COLUMNS = frozenset({"Completeness_Model_Used", "Additional_Notes"})

    def __init__(self):
        super().__init__(
            name="CheckM2",
            anchor="checkm2",
            href="https://github.com/chklovski/CheckM2",
            info="Assesses microbial genome quality using machine learning.",
            doi=["10.1038/s41592-023-01940-w"],
        )

        # Maps sample (bin) name -> {column name -> parsed value}
        data_by_sample: Dict[str, Dict[str, Union[int, float, str]]] = {}
        for f in self.find_log_files("checkm2"):
            # Only register files that actually contributed at least one row
            if self.parse_file(f, data_by_sample):
                self.add_data_source(f)

        data_by_sample = self.ignore_samples(data_by_sample)
        if not data_by_sample:
            raise ModuleNoSamplesFound
        log.info(f"Found {len(data_by_sample)} reports")

        # Superfluous function call to confirm that it is used in this module
        # Pass the actual version here if it becomes available
        self.add_software_version()

        # Write parsed report data to a file
        self.write_data_file(data_by_sample, "multiqc_checkm2")

        self.mag_quality_table(data_by_sample)

    @classmethod
    def _coerce_value(cls, column: str, value: str) -> Union[int, float, str]:
        """Return `value` as an int or float when it parses as one, else unchanged.

        Values of the free-text columns listed in `_TEXT_COLUMNS` are always
        returned as-is, so that notes which happen to look numeric stay text.
        """
        if column in cls._TEXT_COLUMNS:
            return value
        # Try int first so that count-like columns are not turned into floats
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                continue
        return value

    def parse_file(self, f, data_by_sample: Dict[str, Dict[str, Union[int, float, str]]]) -> int:
        """Parse the quality_report.tsv output.

        Each row is stored in `data_by_sample` under the value of its `Name`
        column; a later row with the same name overwrites the earlier one.
        Cells containing the literal string "None" are dropped, and the
        remaining values are converted to numbers where possible.

        Args:
            f: File dictionary; the text of the report is read from `f["f"]`.
            data_by_sample: Mapping updated in place with the parsed rows.

        Returns:
            The number of rows stored from this file (0 if the file was skipped).
        """
        reader = csv.DictReader(StringIO(f["f"]), delimiter="\t")

        # An empty file gives `fieldnames is None`; a foreign TSV lacks "Name".
        # Either way, skip the file instead of crashing on the pop() below.
        if "Name" not in (reader.fieldnames or ()):
            log.warning(f"Skipping {f.get('fn', 'CheckM2 report')}: no 'Name' column found in header")
            return 0

        n_rows = 0
        for row in reader:
            sname = row.pop("Name")  # Remove and get the Name column
            if not sname:
                # Blank or truncated line without a usable sample name
                continue
            if sname in data_by_sample:
                log.debug(f"Duplicate sample name found! Overwriting: {sname}")
            # DictReader stores surplus cells under the key None and pads short
            # rows with None values; neither belongs in the parsed data.
            data_by_sample[sname] = {
                k: self._coerce_value(k, v)
                for k, v in row.items()
                if k is not None and v is not None and v != "None"
            }
            n_rows += 1
        return n_rows

    def mag_quality_table(self, data_by_sample: Dict[str, Dict[str, Union[int, float, str]]]) -> None:
        """Write some quality stats and measures into a table.

        Builds the per-column header configuration and adds a "Bin quality"
        section containing a table of `data_by_sample`.
        """
        headers = {
            "Completeness": {
                "title": "Predicted Completeness",
                "description": "The percentage of MAG length relative to predicted total MAG length.",
                "min": 0,
                "max": 100,
                "suffix": "%",
                "scale": "YlGn",
            },
            "Contamination": {
                "title": "Predicted Contamination",
                "description": "The length of the contaminating portion relative to the expected (complete, uncontaminated) genome length.",
                "min": 0,
                "suffix": "%",
                "format": "{:,.2f}",
                "scale": "YlOrRd",
            },
            "Completeness_Model_Used": {
                "title": "Completeness Model Used",
                "description": "Which ML model was used to predict completeness.",
                "hidden": True,
            },
            "Translation_Table_Used": {
                "title": "Translation Table Used",
                "description": "Genetic code translation table Prodigal used for gene prediction.",
                "scale": False,
                "hidden": True,
            },
            "Coding_Density": {
                "title": "Coding Density",
                "description": "Fraction of bases that are in predicted coding regions.",
                "min": 0,
                "max": 1,
                "scale": "YlGn",
                "format": "{:,.3f}",
            },
            "Contig_N50": {
                "title": "Contig N50",
                "description": "The contig length such that the sum of all contigs at least as long will be 50% of the total MAG length.",
                "hidden": True,
            },
            "Average_Gene_Length": {
                "title": "Average Gene Length",
                "description": "The average number of amino acids in predicted genes.",
                "suffix": "a.a.",
                "format": "{:,.0f}",
            },
            "Genome_Size": {
                "title": "Genome Size",
                "description": "The predicted size of the genome",
                "scale": "YlGn",
            },
            "GC_Content": {
                "title": "GC Content",
                "description": "The fraction of the binned contig sequence that is G or C.",
                "format": "{:,.2f}",
            },
            "Total_Coding_Sequences": {
                "title": "Total Coding Sequences",
                "description": "The number of predicted coding sequences from Prodigal.",
                "scale": "YlGn",
            },
            "Total_Contigs": {
                "title": "Total Contigs",
                "description": "The number of contigs in the bin.",
                "hidden": True,
            },
            "Max_Contig_Length": {
                "title": "Max Contig Length",
                "description": "The length of the largest contig.",
                "hidden": True,
                "scale": "YlGn",
            },
            "Additional_Notes": {
                "title": "Additional Notes",
                "description": "Any additional notes output by CheckM2.",
            },
        }
        pconfig = TableConfig(
            title="Genome Quality",
            id="checkm2-first-table",
        )
        self.add_section(
            name="Bin quality",
            anchor="checkm2-quality",
            description="Rapid assessment of genome bin quality using machine learning.",
            helptext="The main use of CheckM2 is to predict the completeness and contamination of metagenome-assembled genomes (MAGs) and single-amplified genomes (SAGs), although it can also be applied to isolate genomes.",
            plot=table.plot(data=data_by_sample, headers=headers, pconfig=pconfig),
        )
