import logging
import re

from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import table
from multiqc.plots.table_object import TableConfig
from multiqc.utils import mqc_colour

# Module-level logger, named after this module's import path (`__name__`).
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
    """The module parses the output files generated by CheckM.
    It will only parse an output file from `checkm lineage_wf`, `checkm taxonomy_wf`, and `checkm qa`.
    The output file needs to be in format 1 or 2 (`-o 1` or `-o 2`), either space- or tab-separated.
    All statistics for all samples are saved with `write_data_file` under the name `multiqc_checkm`.

    Tested with CheckM v1.2.1
    """

    # Columns are separated either by a tab (`--tab-file`) or by runs of three
    # or more spaces (default layout). Single spaces occur inside values.
    _FIELD_SEPARATOR = re.compile(r"\t| {3,}")

    # Column layout of `checkm qa -o 1`.
    _COLUMNS_FORMAT_1 = (
        "Bin Id",
        "Marker lineage",
        "# genomes",
        "# markers",
        "# marker sets",
        "0",
        "1",
        "2",
        "3",
        "4",
        "5+",
        "Completeness",
        "Contamination",
        "Strain heterogeneity",
    )

    # Column layout of `checkm qa -o 2`.
    _COLUMNS_FORMAT_2 = (
        "Bin Id",
        "Marker lineage",
        "# genomes",
        "# markers",
        "# marker sets",
        "Completeness",
        "Contamination",
        "Strain heterogeneity",
        "Genome size (bp)",
        "# ambiguous bases",
        "# scaffolds",
        "# contigs",
        "N50 (scaffolds)",
        "N50 (contigs)",
        "Mean scaffold length (bp)",
        "Mean contig length (bp)",
        "Longest scaffold (bp)",
        "Longest contig (bp)",
        "GC",
        "GC std (scaffolds > 1kbp)",
        "Coding density",
        "Translation table",
        "# predicted genes",
        "0",
        "1",
        "2",
        "3",
        "4",
        "5+",
    )

    # The sixth header column is the first one that differs between the two
    # layouts, so its name identifies the format.
    _COLUMNS_BY_SIXTH_HEADER = {
        "0": _COLUMNS_FORMAT_1,
        "Completeness": _COLUMNS_FORMAT_2,
    }

    def __init__(self):
        """Find CheckM reports, parse them, and add the quality table section.

        Raises:
            ModuleNoSamplesFound: if no sample remains after parsing and
                `ignore_samples` filtering.
        """
        super(MultiqcModule, self).__init__(
            name="CheckM",
            anchor="checkm",
            href="https://github.com/Ecogenomics/CheckM",
            info="Estimates genome completeness and contamination based on the presence or absence of marker genes.",
            doi=["10.1101/gr.186072.114"],
        )

        data_by_sample = {}
        for f in self.find_log_files("checkm"):
            self.parse_file(f, data_by_sample)
            self.add_data_source(f)

        data_by_sample = self.ignore_samples(data_by_sample)
        if not data_by_sample:
            raise ModuleNoSamplesFound
        log.info(f"Found {len(data_by_sample)} reports")

        # Superfluous function call to confirm that it is used in this module
        # Replace None with actual version if it is available
        self.add_software_version()

        # Write parsed report data to a file
        self.write_data_file(data_by_sample, "multiqc_checkm")

        self.mag_quality_table(data_by_sample)

    def parse_file(self, f, data_by_sample):
        """Parse one `checkm qa`-style table into `data_by_sample`.

        Outputs from this command can come in several formats and with spaces or tabs.
        This is tested with formats 1 and 2 `-o [1|2]`, and with spaces (default) and tabs `--tab-file`.

        Args:
            f: file record; `f["f"]` is read as the file contents (a string)
                and `f["fn"]` is used in log messages.
            data_by_sample: dict updated in place, keyed by the "Bin Id"
                column. Each value maps the remaining column names to the raw
                string values of that row.

        Files with no data rows or with an unrecognized header are skipped
        with a warning and leave `data_by_sample` untouched.
        """
        # Drop blank lines and the dashed separator lines around the header.
        lines = [line.strip() for line in f["f"].splitlines() if line.strip() and not line.startswith("--")]
        if len(lines) <= 1:
            log.warning(f"Skipping file {f['fn']} because it has no data")
            return

        header = lines[0]
        if not header.startswith("Bin Id"):
            log.warning(f"Unrecognized header in {f['fn']}: {header}")
            return

        # Check which format the data is in so we can grab the correct columns.
        # A header with fewer than six columns cannot be either known format;
        # treat it as unrecognized instead of failing on the index lookup.
        cols = self._FIELD_SEPARATOR.split(header)
        column_names = self._COLUMNS_BY_SIXTH_HEADER.get(cols[5]) if len(cols) > 5 else None
        if column_names is None:
            log.warning(f"Unrecognized header in {f['fn']}: {header}")
            return

        for line in lines[1:]:
            row = self._FIELD_SEPARATOR.split(line)
            sname = row[0]
            if sname in data_by_sample:
                log.debug(f"Duplicate sample name found! Overwriting: {sname}")
            # zip stops at the shorter sequence, so a short row simply yields
            # fewer keys for that sample.
            data_by_sample[sname] = dict(zip(column_names[1:], row[1:]))

    def mag_quality_table(self, data_by_sample):
        """Add the "Bin quality" section with a table of per-bin statistics.

        Args:
            data_by_sample: dict keyed by bin id, as filled by `parse_file`.
        """
        # Sort the distinct lineages so every run assigns the same colour to
        # the same lineage; skip samples that have no lineage value.
        lineages = sorted(
            {d["Marker lineage"] for d in data_by_sample.values() if d.get("Marker lineage") is not None}
        )
        scale = mqc_colour.mqc_colour_scale("Dark2")
        lineages_colors = [{v: scale.get_colour(i, lighten=0.5)} for i, v in enumerate(lineages)]
        headers = {
            "Marker lineage": {
                "title": "Marker lineage",
                "description": "indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command)",
                "cond_formatting_colours": lineages_colors,
                "cond_formatting_rules": {v: [{"s_eq": v}] for v in lineages},
            },
            "# genomes": {
                "title": "Genomes",
                "description": "Number of reference genomes used to infer marker set.",
                "min": 0,
            },
            "# markers": {
                "title": "Markers",
                "description": "Number of inferred marker genes.",
                "min": 0,
                "scale": "YlGn",
            },
            "# marker sets": {
                "title": "Marker sets",
                "description": "Number of inferred co-located marker sets",
                "min": 0,
                "scale": "YlOrRd-rev",
            },
            "Completeness": {
                "title": "Completeness",
                "description": "Estimated completeness of genome as determined from the presence/absence of marker genes and the expected collocalization of these genes",
                "min": 0,
                "max": 100,
                "suffix": "%",
                "scale": "Purples",
                "format": "{:,.2f}",
            },
            "Contamination": {
                "title": "Contamination",
                "description": "Estimated contamination of genome as determined by the presence of multi-copy marker genes and the expected collocalization of these genes",
                "min": 0,
                "max": 100,
                "suffix": "%",
                "scale": "Reds",
                "format": "{:,.2f}",
            },
        }
        pconfig = TableConfig(
            title="Genome Quality",
            id="checkm-first-table",
            col1_header="Bin Id",
        )
        self.add_section(
            name="Bin quality",
            anchor="checkm-quality",
            description="The quality of microbial genomes recovered from isolates, single cells, and metagenomes.",
            helptext="An automated method for assessing the quality of a genome using a broader set of marker genes specific to the position of a genome within a reference genome tree and information about the collocation of these genes.",
            plot=table.plot(data=data_by_sample, headers=headers, pconfig=pconfig),
        )
