from Bio.SeqFeature import FeatureLocation, SeqFeature
from collections import defaultdict
from operator import itemgetter
import csv
import decimal
import itertools
import json
import math
import os
from typing import Dict, List, Optional, Union, Any, Tuple, Generator, Set, TypedDict
import pickle
import shutil
import statistics
import subprocess
import sys
import traceback
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from scipy import stats
import pandas as pd
import pyhmmer
import tqdm
import xlsxwriter
from zol import data_dictionary, util
import peptides


# One-letter amino acid codes treated as ambiguous when comparing translations.
AMBIGUOUS_AMINO_ACIDS = {"B", "J", "Z", "X"}

def determine_ranges(i) -> Generator[Tuple[int, int], None, None]:
    """
    Collapse a sequence of integers into (first, last) pairs, one pair per run of consecutive
    values. A run continues for as long as each value is exactly one greater than the value
    before it, so the input should be sorted for the runs to be maximal. Approach adapted from
    the answers by user97370 and bossylobster on StackOverflow:
    https://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python
    """
    in_run = False
    run_offset = None
    run_first = None
    run_last = None
    for position, value in enumerate(i):
        # within a run of consecutive integers, value minus position stays constant
        offset = value - position
        if in_run and run_offset == offset:
            run_last = value
            continue
        if in_run:
            yield run_first, run_last
        in_run = True
        run_offset = offset
        run_first = value
        run_last = value
    if in_run:
        yield run_first, run_last


def split_by_idx(S, list_of_indices) -> Any:
    """
    Split a sliceable sequence into consecutive pieces at the given indices. Adapted from
    https://stackoverflow.com/questions/10851445/splitting-a-string-by-list-of-indices

    Parameters:
    - S: The sequence (e.g. a string) to split.
    - list_of_indices: Ascending cut positions. Each index ends one piece and starts the next.

    Yields:
    - len(list_of_indices) + 1 pieces: S[:i0], S[i0:i1], ..., S[i_last:]. An empty list of
      indices yields S as a single piece instead of failing with an IndexError.
    """
    left = 0
    for right in list_of_indices:
        yield S[left:right]
        left = right
    # whatever remains after the final cut position
    yield S[left:]


def perform_sl_full_protein_clustering(
    ortho_matrix_file, workspace_dir, full_prot_cluster_file, log_object
) -> None:
    """
    Description:
    Runs single-linkage clustering of full proteins based on domain / inter-domain sequence
    orthogroups: any two full proteins with a chopped piece in the same orthogroup are linked.
    ********************************************************************************************************************
    Parameters:
    - ortho_matrix_file: Tab-delimited orthogroup matrix. The first row is a header and the first
                         column the orthogroup identifier; remaining cells hold ", "-separated
                         chopped CDS identifiers. The full protein identifier is obtained by
                         dropping the last two "|"-separated fields of each identifier.
    - workspace_dir: Directory for intermediate files. It is joined by plain string concatenation,
                     so it must end with a path separator.
    - full_prot_cluster_file: Output file with one "<full protein>\tSL_<cluster id>" line per protein.
    - log_object: A logging object.
    ********************************************************************************************************************
    Any exception is written to stderr and the log, after which the process exits with status 1.
    """
    try:
        observed_fp_pairs = set()
        observed_fp = set()
        pair_file = workspace_dir + "Full_Protein_Pairs.txt"
        with open(pair_file, "w") as opf_handle, open(ortho_matrix_file) as oomf:
            for i, line in enumerate(oomf):
                if i == 0:
                    # header row
                    continue
                ls = line.strip("\n").split("\t")
                flat_fps = []
                for lts in ls[1:]:
                    lts = lts.strip()
                    if lts == "":
                        continue
                    for lt in lts.split(", "):
                        fp = "|".join(lt.split("|")[:-2])
                        flat_fps.append(fp)
                        observed_fp.add(fp)

                # Sort once per orthogroup rather than once per loop iteration. Each entry is
                # paired with every entry before it in sorted order, so fp2 <= fp1 always holds.
                sorted_fps = sorted(flat_fps)
                for j, fp1 in enumerate(sorted_fps):
                    for fp2 in sorted_fps[:j]:
                        pair_tuple = (fp2, fp1)
                        if pair_tuple in observed_fp_pairs:
                            continue
                        opf_handle.write(fp1 + f"\t{fp2}\n")
                        observed_fp_pairs.add(pair_tuple)

        clusters_file = workspace_dir + "Full_Protein_Clusters.txt"
        # NOTE(review): the "<" / ">" redirections only work if run_cmd_via_subprocess runs the
        # joined command through a shell - confirm against util.
        clust_cmd = ["slclust", "<", pair_file, ">", clusters_file]
        util.run_cmd_via_subprocess(clust_cmd, log_object=log_object, check_files=[clusters_file])

        cluster_id = 1
        with open(full_prot_cluster_file, "w") as res_handle:
            clustered_set = set()
            with open(clusters_file) as ocf:
                # every line of the slclust output lists the members of one cluster
                for line in ocf:
                    for prot in line.strip().split():
                        clustered_set.add(prot)
                        res_handle.write(f"{prot}\tSL_{cluster_id}\n")
                    cluster_id += 1

            # proteins that were never paired with anything become singleton clusters
            for prot in observed_fp:
                if prot not in clustered_set:
                    res_handle.write(f"{prot}\tSL_{cluster_id}\n")
                    cluster_id += 1

    except Exception:
        sys.stderr.write(traceback.format_exc() + "\n")
        log_object.error(traceback.format_exc())
        sys.exit(1)


def map_chunk_protein_coords_to_feature_coords(
    start_coord,
    end_coord,
    tg_seq_chunk,
    tg_coord_info,
    name,
    evalue,
    eukaryotic_gene_cluster_flag,
) -> SeqFeature:
    """
    Description:
    Determines coordinates of chopped protein region from feature nucleotide coordinate info - which could include
    skipped regions corresponding to introns.
    ********************************************************************************************************************
    Parameters:
    - start_coord: The start coordinate (1-based) of the chunk along the protein sequence.
    - end_coord: The end coordinate (inclusive) of the chunk along the protein sequence.
    - tg_seq_chunk: The protein chunk, used to validate the translation of the extracted nucleotides.
    - tg_coord_info: Coordinate info for the full feature from the GenBank file, as the sequence
                     [nucl_seq, all_coords, start, end, direction]. nucl_seq is indexed in coding
                     orientation; all_coords holds (start, end, direction) triples per exon / part.
    - name: The unique domain identifier.
    - evalue: The E-value from pyhmmer for the domain hit. NA if not a domain hit but inter-domain region or full protein.
    - eukaryotic_gene_cluster_flag: If not set, a leading V or L in the translation of the first chunk
                                    is replaced by M to account for alternate initiator codons.
    ********************************************************************************************************************
    Returns:
    - A SeqFeature of type "cCDS" with the qualifiers translation, paf_nucl_seq, locus_tag and
      ccds_pyhmmer_evalue set.
    ********************************************************************************************************************
    Raises:
    - RuntimeError: On any failure, including a translation that disagrees with tg_seq_chunk at a
                    position where neither residue is ambiguous. The traceback is written to stderr
                    and the underlying error message is carried in the raised exception.
    ********************************************************************************************************************
    """

    try:
        start_coord = start_coord - 1
        nucl_seq, all_coords, _start, _end, direction = tg_coord_info
        nucl_start_coord = start_coord * 3
        nucl_end_coord = end_coord * 3
        chunk_nucl_bases = []
        chunk_coords = []
        nucl_coord = 0

        # Walk genomic positions in coding order (descending for anything but the "+" strand)
        # and keep those whose offset along the coding sequence falls inside the chunk.
        descending = direction != "+"
        for sc, ec, _dc in sorted(all_coords, key=itemgetter(0), reverse=descending):
            positions = range(ec, sc - 1, -1) if descending else range(sc, ec + 1)
            for pos in positions:
                if nucl_start_coord <= nucl_coord < nucl_end_coord:
                    chunk_nucl_bases.append(nucl_seq[nucl_coord])
                    chunk_coords.append(pos)
                nucl_coord += 1
        chunk_nucl_seq = "".join(chunk_nucl_bases)

        translated_prot_seq = str(Seq(chunk_nucl_seq).translate())

        # account for alternate initiator codons "GTG" and "TTG"
        if (
            not eukaryotic_gene_cluster_flag
            and start_coord == 0
            and translated_prot_seq[0] in ("V", "L")
        ):
            translated_prot_seq = "M" + translated_prot_seq[1:]
            msg = \
     "Warning: changing starting protein residue from V or L to M. This is expected for bacteria, but if you are seeing this with running a eukaryotic dataset - it indicates an issue!"
            sys.stderr.write(msg + "\n")

        # Explicit comparison rather than `assert`, so the validation still runs under `python -O`.
        if translated_prot_seq != tg_seq_chunk:
            # further check if the translated sequence is just the
            # result of imputation of an amino acid with an ambiguous
            # base at the wobble position.
            mismatch_msg = f"Translation error for {name}, the trnaslsated sequence {translated_prot_seq} does not match translation in GenBank {tg_seq_chunk}"
            if len(translated_prot_seq) > len(tg_seq_chunk):
                raise RuntimeError(mismatch_msg)
            self_translation_amb_count = 0
            gbk_translation_amb_count = 0
            for i, aa in enumerate(translated_prot_seq):
                gbk_aa = tg_seq_chunk[i]
                gbk_is_ambiguous = gbk_aa in AMBIGUOUS_AMINO_ACIDS
                self_is_ambiguous = aa in AMBIGUOUS_AMINO_ACIDS
                if gbk_aa != aa and not (gbk_is_ambiguous or self_is_ambiguous):
                    raise RuntimeError(mismatch_msg)
                if gbk_is_ambiguous:
                    gbk_translation_amb_count += 1
                if self_is_ambiguous:
                    self_translation_amb_count += 1

            # conservatively go with the one with more ambiguous amino acids
            if self_translation_amb_count > gbk_translation_amb_count:
                tg_seq_chunk = translated_prot_seq

        fstrand = -1 if direction == "-" else 1

        # NOTE(review): run boundaries are passed to FeatureLocation unchanged - confirm they
        # follow the start / end convention expected by downstream readers of the cCDS features.
        coord_feat_locs = [
            FeatureLocation(rc[0], rc[1], strand=fstrand)
            for rc in determine_ranges(sorted(chunk_coords))
        ]

        # parts of a minus-strand feature are combined from the highest coordinates down
        if direction == "-":
            summed_coord_feat_locs = sum(coord_feat_locs[::-1])
        else:
            summed_coord_feat_locs = sum(coord_feat_locs)
        feature = SeqFeature(summed_coord_feat_locs, type="cCDS")

        feature.qualifiers["translation"] = Seq(tg_seq_chunk)
        feature.qualifiers["paf_nucl_seq"] = chunk_nucl_seq
        feature.qualifiers["locus_tag"] = name
        feature.qualifiers["ccds_pyhmmer_evalue"] = evalue
        return feature
    except Exception as e:
        sys.stderr.write(traceback.format_exc() + "\n")
        # keep the underlying reason so callers that report str(exception) are informative
        raise RuntimeError(
            f"Unable to map chunk {name} to feature coordinates: {e}"
        ) from e


def create_chopped_genbank(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    Create a chopped CDS GenBank file from a regular GenBank file - the per-file worker that
    batch_create_chopped_genbanks() runs for every input GenBank.
    Each CDS is searched against the Pfam HMMs with pyhmmer and split at the boundaries of the
    retained domain hits. Every piece of at least minimal_length residues becomes a "cCDS" feature
    named "<locus_tag>|<domain name>|<n>", "<locus_tag>|inter-domain_region|<n>" or
    "<locus_tag>|full_protein|1" (the latter for proteins without any retained domain hit).
    ********************************************************************************************************************
    Parameters:
    - inputs: A sequence holding, in order:
            - gbk: Input GenBank file.
            - prot_file: Protein FASTA file written here (one record per CDS, named by locus tag)
                         and used as the target for domain annotation with pyhmmer.
            - mapping_file: Output table of retained domain hits with the columns locus tag, start,
                            end + 1, domain name, score and independent E-value.
            - ccds_gbk_file: The chopped up GenBank file to write.
            - pfam_db_file: The Pfam HMM DB file.
            - pfam_z: The Pfam record count - for accurate E-value estimation.
            - minimal_length: The minimum length in amino acids for a domain matching or inter-domain region to be
                              kept and tagged as a chopped CDS feature.
            - eukaryotic_gene_cluster_flag: Passed on to map_chunk_protein_coords_to_feature_coords().
    ********************************************************************************************************************
    Returns:
    - ('success', None) if the chopped GenBank file was written, otherwise ('error', message). Exceptions
      are not propagated to the caller.
    ********************************************************************************************************************
    """
    (
        gbk,
        prot_file,
        mapping_file,
        ccds_gbk_file,
        pfam_db_file,
        pfam_z,
        minimal_length,
        eukaryotic_gene_cluster_flag,
    ) = inputs
    try:
        # locus tag -> [coding-orientation nucleotide sequence, coordinate parts, start, end, direction]
        lt_coord_info: Dict[str, Any] = {}
        with open(prot_file, "w") as pf_handle:
            # locus tag -> id of the GenBank record the CDS belongs to
            feat_record: Dict[str, Any] = {}
            with open(gbk) as ogbk:
                for rec in SeqIO.parse(ogbk, "genbank"):
                    full_sequence = str(rec.seq)
                    for feature in rec.features:
                        if feature.type == "CDS":
                            lt = feature.qualifiers.get("locus_tag")[0]
                            seq = feature.qualifiers.get("translation")[0]
                            nucl_seq = None
                            start, end, direction, all_coords = util.process_location_string(str(feature.location)) # type: ignore

                            # stitch the parts together in ascending coordinate order; sc is used
                            # as a 1-based start and ec as an inclusive end when slicing
                            nucl_seq = ""
                            for sc, ec, dc in sorted(
                                all_coords, key=itemgetter(0), reverse=False
                            ):
                                if ec >= len(full_sequence):
                                    nucl_seq += full_sequence[sc - 1 :]
                                else:
                                    nucl_seq += full_sequence[sc - 1 : ec]
                            if direction == "-":
                                nucl_seq = str(Seq(nucl_seq).reverse_complement())

                            feat_record[lt] = rec.id
                            lt_coord_info[lt] = [
                                nucl_seq,
                                all_coords,
                                start,
                                end,
                                direction
                            ]
                            pf_handle.write(f">{lt}\n" + str(seq) + "\n")

        # align Pfam domains and remove overlap similar to BiG-SCAPE
        alphabet = pyhmmer.easel.Alphabet.amino()
        sequences = []
        with pyhmmer.easel.SequenceFile(
            prot_file, digital=True, alphabet=alphabet
        ) as seq_file:
            sequences = list(seq_file)

        # locus tag -> list of [domain name, alignment start, alignment end, score, independent E-value]
        target_dom_hits = defaultdict(list)
        with pyhmmer.plan7.HMMFile(pfam_db_file) as hmm_file:
            for hits in pyhmmer.hmmsearch(
                hmm_file,
                sequences,
                bit_cutoffs="trusted",
                Z=int(pfam_z),
                cpus=1,
            ):
                for hit in hits:
                    for domain in hit.domains.included:
                        target_dom_hits[hit.name.decode()].append(
                            [
                                hits.query.name.decode(),
                                domain.alignment.target_from,
                                domain.alignment.target_to,
                                domain.score,
                                domain.i_evalue,
                            ]
                        )

        # chop up FASTA based on mostly non-overlapping domains, 10% leeway is given
        with open(mapping_file, "w") as map_handle:
            breakpoints = defaultdict(list)
            # keyed by "<locus tag>_<domain start>"; "NA" marks a position where no domain starts
            dom_start_names = defaultdict(lambda: "NA")
            dom_start_evalues = defaultdict(lambda: "NA")
            for tg in target_dom_hits:
                tg_dom_name_iter = defaultdict(int)
                accounted_coords = set([])
                # best scoring hits first, so they take precedence when hits overlap
                for dom_align_info in sorted(
                    target_dom_hits[tg], key=itemgetter(3), reverse=True
                ):
                    dom_name, start, end, score, i_evalue = dom_align_info
                    overlap_coords = accounted_coords.intersection(
                        set(range(start, end + 1))
                    )
                    # skip a hit if 10% or more of it, or at least minimal_length residues,
                    # are already covered by retained hits
                    if (len(overlap_coords) / float(end - start + 1) >= 0.1) or (
                        len(overlap_coords) >= minimal_length
                    ):
                        continue
                    accounted_coords = accounted_coords.union(
                        set(range(start, end + 1))
                    )
                    breakpoints[tg].append(start)
                    breakpoints[tg].append(end + 1)
                    map_handle.write(
                        "\t".join(
                            [
                                str(x)
                                for x in [
                                    tg,
                                    start,
                                    end + 1,
                                    dom_name,
                                    score,
                                    i_evalue,
                                ]
                            ]
                        )
                        + "\n"
                    )
                    # the counter distinguishes repeated hits to the same domain within a protein
                    dom_start_names[tg + "_" + str(start)] = (
                        tg
                        + f"|{dom_name}|"
                        + str(tg_dom_name_iter[dom_name] + 1)
                    )
                    tg_dom_name_iter[dom_name] += 1
                    dom_start_evalues[tg + "_" + str(start)] = str(i_evalue)


        # GenBank record id -> cCDS features to place on that record
        chopped_features = defaultdict(list)
        with open(prot_file) as ocf:
            for rec in SeqIO.parse(ocf, "fasta"):
                tg = rec.id
                record = feat_record[tg]
                tg_seq = str(rec.seq)
                prev_end_coord = 1
                tg_interdomain_index = 1
                if not tg in breakpoints and len(tg_seq) >= minimal_length:
                    # no retained domain hit: keep the protein as a single feature
                    dn = tg + "|full_protein|1"
                    de = "NA"
                    chopped_feature = map_chunk_protein_coords_to_feature_coords(
                        1,
                        len(tg_seq),
                        tg_seq,
                        lt_coord_info[tg],
                        dn,
                        de,
                        eukaryotic_gene_cluster_flag
                    )
                    chopped_features[record].append(chopped_feature)
                else:
                    # NOTE(review): the breakpoints come straight from the pyhmmer alignment
                    # coordinates (presumably 1-based) but are used as 0-based slice positions
                    # here - confirm the chunk boundaries land on the intended residues.
                    for tg_seq_chunk in split_by_idx(
                        tg_seq, ([0] + sorted(breakpoints[tg]))
                    ):
                        if tg_seq_chunk.strip() == "":
                            continue
                        end_coord = prev_end_coord + len(tg_seq_chunk) - 1
                        # limit chosen arbitrarily because avg. alpha-helix is ~10 residues and beta-strand 4-10 residues
                        if len(tg_seq_chunk) >= minimal_length:
                            dn = dom_start_names[
                                tg + "_" + str(prev_end_coord - 1)
                            ]
                            de = dom_start_evalues[
                                tg + "_" + str(prev_end_coord - 1)
                            ]
                            # no domain starts at this position, so the chunk lies between domains
                            if dn == "NA":
                                dn = (
                                    tg
                                    + "|inter-domain_region|"
                                    + str(tg_interdomain_index)
                                )
                                tg_interdomain_index += 1
                            chopped_feature = (
                                map_chunk_protein_coords_to_feature_coords(
                                    prev_end_coord,
                                    end_coord,
                                    tg_seq_chunk,
                                    lt_coord_info[tg],
                                    dn,
                                    de,
                                    eukaryotic_gene_cluster_flag
                                )
                            )
                            chopped_features[record].append(chopped_feature)

                        prev_end_coord = end_coord + 1

        # re-read the input and write each record with only the chopped features attached
        with open(ccds_gbk_file, "w") as occgf:
            with open(gbk) as ogbk:
                for rec in SeqIO.parse(ogbk, "genbank"):
                    seq = Seq(rec.seq)
                    gbk_rec = SeqRecord(
                        seq, id=rec.id, name=rec.id, description=rec.description
                    )
                    gbk_rec.annotations["molecule_type"] = "DNA"
                    gbk_rec.features = chopped_features[rec.id]
                    SeqIO.write(gbk_rec, occgf, "genbank")

        return ('success', None)

    except Exception as e:
        # failures are reported through the return value instead of being raised
        error_msg = f"An issue occurred with creating chopped up version of GenBank file {gbk}: {str(e)}"
        return ('error', error_msg)


def batch_create_chopped_genbanks(
    genbanks,
    minimal_length,
    dm_scratch_dir,
    modified_genbank_dir,
    dom_to_cds_relations_file,
    log_object,
    eukaryotic_gene_cluster_flag=False,
    threads=1,
) -> List[Any]:
    """
    Description:
    Create chopped CDS GenBank files from regular GenBank input with CDS features.
    ********************************************************************************************************************
    Parameters:
    - genbanks: Input GenBank files.
    - minimal_length: The minimum length in amino acids for a domain matching or inter-domain region to be kept and
                      tagged as a chopped CDS feature.
    - dm_scratch_dir: Scratch directory for generating chopped up CDS GenBank files.
    - modified_genbank_dir: The output directory where to write chopped CDS GenBank files.
    - dom_to_cds_relations_file: The chopped CDS to full CDS relationship file (columns: chopped CDS
                                 name, full CDS locus tag, chopped GenBank file).
    - log_object: A logging object.
    - eukaryotic_gene_cluster_flag: Flag for whether eukaryotic gene clusters are being investigated.
    - threads: The number of threads to use [Default is 1].
    ********************************************************************************************************************
    Returns:
    - chopped_genbanks: A python list containing paths to all files found in modified_genbank_dir
                        after processing, in sorted file name order.
    ********************************************************************************************************************
    The Pfam database location is read from "database_location_paths.txt" in the directory named by
    the ZOL_DATA_PATH environment variable. The process exits with status 1 if the database cannot
    be validated, if any GenBank file fails to be processed, or on any other error.
    ********************************************************************************************************************
    """

    chopped_genbanks = []
    try:
        dm_scratch_dir = os.path.abspath(dm_scratch_dir) + "/"

        pfam_db_file = None
        pfam_z = None
        try:
            # str() turns an unset variable into the literal "None"
            zol_data_directory = str(os.getenv("ZOL_DATA_PATH")).strip()
            db_locations = None
            if zol_data_directory != "None":
                zol_data_directory = os.path.abspath(zol_data_directory) + "/"
                db_locations = zol_data_directory + "database_location_paths.txt"

            if db_locations is None or not os.path.isfile(db_locations):
                sys.stderr.write(
                    "Warning: databases do not appear to be setup or setup properly - so unable to annotate!\n"
                )

            if db_locations is not None:
                with open(db_locations) as odl:
                    for line in odl:
                        fields = line.strip().split("\t")
                        if len(fields) != 4:
                            continue
                        name, _, db_file, z = fields
                        if name == "pfam":
                            pfam_db_file = db_file
                            pfam_z = int(z)

            # Explicit check rather than `assert`, so the validation still runs under `python -O`.
            if (
                pfam_db_file is None
                or pfam_z is None
                or not os.path.isfile(pfam_db_file)
            ):
                raise FileNotFoundError("Pfam database could not be located.")
        except Exception:
            msg = \
     "Issues validating that the Pfam database - please run setup_annotation_database.py to setup databases - note you might be interested in downloading the minimal database instead of the full one."
            log_object.warning(msg)
            sys.stderr.write(msg + "\n")
            sys.exit(1)

        proteome_dir = dm_scratch_dir + "GenBank_Proteins/"
        mapping_dir = dm_scratch_dir + "cCDS_to_CDS_Mappings/"
        util.setup_ready_directory([proteome_dir, mapping_dir])

        gbk_mod_inputs = []
        for gbk in genbanks:
            gbk_basename = gbk.split("/")[-1]
            gbk_prefix = ".".join(gbk_basename.split(".")[:-1])
            # os.path.join gives the same path as concatenation when the directory already ends
            # with a separator, and a valid one when it does not
            ccds_gbk_file = os.path.join(modified_genbank_dir, gbk_basename)
            prot_file = proteome_dir + gbk_prefix + ".faa"
            mapping_file = mapping_dir + gbk_prefix + ".txt"
            gbk_mod_inputs.append(
                [
                    gbk,
                    prot_file,
                    mapping_file,
                    ccds_gbk_file,
                    pfam_db_file,
                    pfam_z,
                    minimal_length,
                    eukaryotic_gene_cluster_flag,
                ]
            )

        # Use robust error handling for chopped GenBank creation
        result_summary = util.robust_multiprocess_executor(
            worker_function=create_chopped_genbank,
            inputs=gbk_mod_inputs,
            pool_size=threads,
            error_strategy="report_and_continue",  # Continue even if some GenBank processing fails
            log_object=log_object,
            description="domain-chopped GenBank file creation"
        )

        # Compare the counts directly; dividing them fails when no GenBank files were provided.
        if result_summary['success_count'] != result_summary['total_processed']:
            msg = "Issues with domain-chopped GenBank file creation for at least one GenBank file. Exiting now ..."
            sys.stderr.write(msg + '\n')
            log_object.error(msg)
            sys.exit(1)

        with open(dom_to_cds_relations_file, "w") as ccc_handle:
            # sorted so that the relations file and the returned list are reproducible
            for f in sorted(os.listdir(modified_genbank_dir)):
                gbk_file = os.path.join(modified_genbank_dir, f)
                chopped_genbanks.append(gbk_file)
                with open(gbk_file) as ogf:
                    for rec in SeqIO.parse(ogf, "genbank"):
                        for feat in rec.features:
                            if feat.type != "cCDS":
                                continue
                            name = feat.qualifiers.get("locus_tag")[0]
                            # the full CDS locus tag is the first "|"-separated field
                            tg = name.split("|")[0]
                            ccc_handle.write(
                                "\t".join([name, tg, gbk_file]) + "\n"
                            )

        return chopped_genbanks
    except Exception:
        msg = "Issue regenerating GenBank files with CDS features chopped up."
        log_object.warning(msg)
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write(msg + "\n")
        sys.exit(1)


def reinflate_orthogroups(
    ortho_matrix_file,
    prot_dir,
    rog_dir,
    log_object,
    diamond_params="--approx-id 98 --mutual-cover 95 -M 4G",
    threads=1,
) -> None:
    """
    Description:
    This function reinflates a matrix of ortholog groups to include all proteins in a given directory.
    The function first reads the ortholog group matrix and creates a set of protein IDs that are representatives of
    ortholog groups from the representative (dereplicated) set of gene clusters. The function then uses diamond linclust
    to cluster all proteins in the prot_dir directory and reads the diamond linclust clustering output to assign
    non-representative proteins to the ortholog group of the representatives they cluster with. The result is
    written to "Orthogroups.tsv" in rog_dir.

    *******************************************************************************************************************
    Parameters:
    - ortho_matrix_file: The ortholog group vs sample matrix file, where cells correspond to locus tag identifiers.
    - prot_dir: A directory containing protein FASTA files (joined by string concatenation, so it must end with
                a path separator). Sample names are the file names up to their last ".faa".
    - rog_dir: A directory to write temporary + result files pertaining to reinflation to (must also end with a
               path separator).
    - log_object: An object for logging messages.
    - diamond_params: Diamond linclust parameters. Default is: "--approx-id 98 --mutual-cover 95 -M 4G"
    - threads: The number of threads to use for the diamond linclust clustering step.
    *******************************************************************************************************************
    Any exception is reported to stderr and the log, after which the process exits with status 1.
    *******************************************************************************************************************
    """
    try:
        reps = set()
        rep_to_hg: Dict[str, Any] = {}
        with open(ortho_matrix_file) as oomf:
            for i, line in enumerate(oomf):
                if i == 0:
                    continue
                ls = line.strip("\n").split("\t")
                hg = ls[0]
                for pids in ls[1:]:
                    for pid in pids.split(","):
                        pid = pid.strip()
                        if pid == "":
                            # empty cell: the sample has no protein in this ortholog group
                            continue
                        reps.add(pid)
                        rep_to_hg[pid] = hg

        comp_prot_file = rog_dir + "All_Proteins.faa"
        with open(comp_prot_file, "w") as cf_handle:
            for f in os.listdir(prot_dir):
                with open(prot_dir + f) as olf:
                    for rec in SeqIO.parse(olf, "fasta"):
                        cf_handle.write(f">{rec.id}\n{rec.seq}\n")

        diamond_cluster_file = rog_dir + "diamond_linclust_clusters.tsv"
        # The user parameters are appended once, split into separate arguments. They were
        # previously also inserted as a single unsplit element, which duplicated every option.
        diamond_cmd = [
            "diamond",
            "linclust",
            "-d",
            comp_prot_file,
            "-o",
            diamond_cluster_file,
            "--threads",
            str(threads),
        ] + diamond_params.split()

        util.run_cmd_via_subprocess(diamond_cmd, log_object=log_object, check_files=[diamond_cluster_file])

        # cluster representative -> members, and member -> cluster representative
        clust_proteins = defaultdict(set)
        protein_to_clust: Dict[str, str] = {}
        with open(diamond_cluster_file) as occf:
            for line in occf:
                line = line.strip()
                if not line:
                    continue
                parts = line.split('\t')
                if len(parts) < 2:
                    continue
                rep_id = parts[0]
                member_id = parts[1]
                protein_to_clust[member_id] = rep_id
                clust_proteins[rep_id].add(member_id)

        all_samples = set()
        for f in os.listdir(prot_dir):
            all_samples.add(".faa".join(f.split(".faa")[:-1]))
        # the column order is identical for every row, so sort only once
        sorted_samples = sorted(all_samples)

        inflated_og_matrix_file = rog_dir + "Orthogroups.tsv"
        with open(inflated_og_matrix_file, "w") as iomf_handle:
            # proteins already considered for some ortholog group
            accounted = set()
            with open(ortho_matrix_file) as oomf:
                for i, line in enumerate(oomf):
                    ls = line.strip("\n").split("\t")
                    if i == 0:
                        iomf_handle.write(
                            "Sample\t" + "\t".join(sorted_samples) + "\n"
                        )
                        continue
                    hg = ls[0]
                    cluster_obs = set()
                    # the sample name is the first "|"-separated field of a protein identifier
                    all_pids_by_sample = defaultdict(set)
                    for pids in ls[1:]:
                        for pid in pids.split(","):
                            pid = pid.strip()
                            if pid == "":
                                continue
                            cluster_obs.add(protein_to_clust[pid])
                            all_pids_by_sample[pid.split("|")[0]].add(pid)
                    for clust in cluster_obs:
                        for pid in clust_proteins[clust]:
                            if pid in reps and rep_to_hg[pid] != hg:
                                sys.stderr.write(
                                    f"Warning: The protein {pid} is a representative of ortholog group {rep_to_hg[pid]}, but can potentially belong to multiple. Skipping its incorporation for ortholog group {hg}.\n"
                                )
                                log_object.warning(
                                    f"The protein {pid} is a representative of a ortholog group {rep_to_hg[pid]}, but can potentially belong to multiple. Skipping its incorporation for ortholog group {hg}."
                                )
                            elif pid in accounted:
                                sys.stderr.write(
                                    f"Warning: The protein {pid} has already been clustered into a ortholog group, but can potentially belong to multiple. Skipping its incorporation for ortholog group {hg}.\n"
                                )
                                log_object.warning(
                                    f"The protein {pid} has already been clustered into a ortholog group, but can potentially belong to multiple. Skipping its incorporation for ortholog group {hg}."
                                )
                            else:
                                all_pids_by_sample[pid.split("|")[0]].add(pid)
                            accounted.add(pid)
                    row = [hg]
                    for sample in sorted_samples:
                        samp_pids_for_hg = ""
                        if sample in all_pids_by_sample:
                            samp_pids_for_hg = ", ".join(
                                sorted(all_pids_by_sample[sample])
                            )
                        row.append(samp_pids_for_hg)
                    iomf_handle.write("\t".join(row) + "\n")

    except Exception as e:
        sys.stderr.write(
            "Issues with reinflation of orthogroups to full gene cluster set.\n"
        )
        log_object.error(
            "Issues with reinflation of orthogroups to full gene cluster set."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def dereplicate_using_skani(
    genbanks,
    focal_genbanks,
    derep_dir,
    kept_dir,
    log_object,
    skani_small_genomes_preset=False,
    skani_identiy_threshold=99.0,
    skani_coverage_threshold=95.0,
    mcl_inflation=None,
    threads=1,
) -> Tuple[Set[str], Dict[str, Set[str]]]:
    """
    Description:
    This function dereplicates a set of GenBank files using skani to estimate pairwise gene cluster ANI and either
    single-linkage clustering (slclust) or MCL to cluster and select representative gene clusters. If focal gene
    clusters are noted by the user, they are given priority as representatives, otherwise representatives are chosen
    based on length (longest record first, total sequence length as the tie-breaker).
    *******************************************************************************************************************
    Parameters:
    - genbanks: An iterable of paths to GenBank files (names must end in .gbk, .gbff or .genbank).
    - focal_genbanks: Path to a text file listing, one per line, GenBank files that should be preferred as the
                    representative of their cluster. Ignored if None or if the path is not an existing file.
    - derep_dir: Working directory where the FASTA conversions, skani sketches / results and clustering files
                 are written.
    - kept_dir: The directory to which the representative GenBank files are copied.
    - log_object: An object for logging messages.
    - skani_small_genomes_preset: Use the --small-genomes preset in skani dist.
    - skani_identiy_threshold: The minimum ANI reported by skani for two gene clusters to be considered similar.
    - skani_coverage_threshold: The minimum aligned fraction, passed to skani dist as --min-af.
    - mcl_inflation: The inflation factor to use for the MCL clustering algorithm. If not provided (default),
                     single-linkage clustering (via slclust) will be used instead.
    - threads: The number of threads to use for skani and MCL.
    *******************************************************************************************************************
    Returns:
    A tuple of two items:
            - A set with the paths (inside kept_dir) of the representative GenBank files.
            - A dictionary mapping the file prefix of each representative to the set of file prefixes of all
              members of its cluster.
    Any exception raised along the way is reported to stderr / the log and the process exits with status 1.
    *******************************************************************************************************************
    """

    derep_genbanks = set([])
    try:
        derep_dir = os.path.abspath(derep_dir) + "/"

        full_nucl_seq_dir = derep_dir + "FASTAs/"
        util.setup_ready_directory([full_nucl_seq_dir])

        # Per gene cluster (keyed by file prefix): length of the longest record and
        # summed length of all records; used later to rank representative candidates.
        longest_seq = defaultdict(int)
        tot_seq = defaultdict(int)

        fasta_listing_file = derep_dir + "Gene_Clusters_FASTA_Listing.txt"
        with open(fasta_listing_file, "w") as flf_handle:
            for gbk in genbanks:
                gbk_prefix = _genbank_file_prefix(gbk)
                gbk_fasta_file = full_nucl_seq_dir + (gbk_prefix + ".fasta")
                flf_handle.write(str(gbk_fasta_file) + "\n")
                # Both handles are context-managed so the FASTA is closed even if
                # parsing of the GenBank fails part way through.
                with open(gbk_fasta_file, "w") as gbk_fasta_handle, open(gbk) as ogbk:
                    for rec in SeqIO.parse(ogbk, "genbank"):
                        gbk_fasta_handle.write(f">{rec.id}\n{rec.seq}\n")
                        rec_len = len(str(rec.seq))
                        longest_seq[gbk_prefix] = max(
                            longest_seq[gbk_prefix], rec_len
                        )
                        tot_seq[gbk_prefix] += rec_len

        skani_sketch_db = derep_dir + "skani_sketch/"
        skani_sketch_cmd = [
            "skani",
            "sketch",
            "-t",
            str(threads),
            "-l",
            fasta_listing_file,
            "-o",
            skani_sketch_db,
        ]
        # For skani >= 0.3.0, list inputs require --separate-sketches
        try:
            if util.is_skani_version_at_least_0_3_0():
                skani_sketch_cmd.append("--separate-sketches")
        except Exception:
            # Deliberate best-effort: if the version cannot be determined, run
            # the sketch command without the extra flag.
            pass

        util.run_cmd_via_subprocess(
            skani_sketch_cmd,
            log_object=log_object,
            check_directories=[skani_sketch_db],
        )

        skani_result_file = derep_dir + "skani_results.tsv"
        skani_dist_cmd = [
            "skani",
            "dist",
            "-t",
            str(threads),
            "-q",
            str(skani_sketch_db) + "*",
            "-r",
            str(skani_sketch_db) + "*",
            "--min-af",
            str(skani_coverage_threshold),
            "-o",
            skani_result_file,
        ]
        if skani_small_genomes_preset:
            skani_dist_cmd += ["--small-genomes"]

        util.run_cmd_via_subprocess(
            skani_dist_cmd, log_object=log_object, check_files=[skani_result_file]
        )

        use_slclust = mcl_inflation is None

        similar_pairs_file = derep_dir + "Similar_Pairs.txt"
        # all_gcs: gene clusters seen with a qualifying self-hit.
        # paired_gcs: gene clusters with a qualifying hit to a different gene cluster.
        all_gcs = set([])
        paired_gcs = set([])
        visited = set([])
        with open(similar_pairs_file, "w") as similar_pairs_handle:
            with open(skani_result_file) as osrf:
                for i, line in enumerate(osrf):
                    if i == 0:
                        # header line
                        continue
                    line = line.strip()
                    if not line:
                        continue
                    f1, f2, ani, _, _, _, _ = line.split("\t")
                    # Strip the directory and everything from the last ".fasta" onwards.
                    s1 = ".fasta".join(f1.split("/")[-1].split(".fasta")[:-1])
                    s2 = ".fasta".join(f2.split("/")[-1].split(".fasta")[:-1])
                    if float(ani) < skani_identiy_threshold:
                        continue
                    pair = tuple(sorted([s1, s2]))
                    if pair not in visited:
                        # slclust takes plain pairs; MCL (--abc) takes weighted pairs.
                        fields = list(pair) if use_slclust else list(pair) + [str(ani)]
                        similar_pairs_handle.write("\t".join(fields) + "\n")
                        if s1 != s2:
                            paired_gcs.add(s1)
                            paired_gcs.add(s2)
                        else:
                            all_gcs.add(s1)
                    visited.add(pair)

        focal = None
        if focal_genbanks is not None and os.path.isfile(focal_genbanks):
            focal = set([])
            with open(focal_genbanks) as ofgf:
                for line in ofgf:
                    focal.add(_genbank_file_prefix(line.strip()))

        clusters_file = derep_dir + "Cluster_Families.txt"
        if use_slclust:
            # NOTE(review): the "<" / ">" tokens presumably rely on
            # util.run_cmd_via_subprocess running the command through a shell - confirm.
            clust_cmd = [
                "slclust",
                "<",
                similar_pairs_file,
                ">",
                clusters_file,
            ]
        else:
            clust_cmd = [
                "mcl",
                similar_pairs_file,
                "--abc",
                "-I",
                str(mcl_inflation),
                "-o",
                clusters_file,
                "-te",
                str(threads),
            ]

        util.run_cmd_via_subprocess(
            clust_cmd, log_object=log_object, check_files=[clusters_file]
        )

        representatives = set([])
        rep_genbank_members = defaultdict(set)
        with open(clusters_file) as ocf:
            for line in ocf:
                gcc = line.strip().split()
                if not gcc:
                    # tolerate blank lines in the clustering output
                    continue
                gcs = set(gcc)
                ranked = sorted(
                    [[gc, longest_seq[gc], tot_seq[gc]] for gc in gcc],
                    key=itemgetter(1, 2),
                    reverse=True,
                )
                # Restrict the candidates to focal members only when the cluster
                # actually contains at least one focal gene cluster.
                if focal is not None and len(gcs.intersection(focal)) > 0:
                    ranked = [x for x in ranked if x[0] in focal]
                rep = ranked[0][0]
                rep_genbank_members[rep] = gcs
                representatives.add(rep)

        # Gene clusters that only matched themselves are their own representatives.
        for gc in all_gcs.difference(paired_gcs):
            rep_genbank_members[gc] = set([gc])
            representatives.add(gc)

        for gbk in genbanks:
            if _genbank_file_prefix(gbk) not in representatives:
                continue
            shutil.copy(gbk, kept_dir)
            # os.path.join keeps the reported path valid whether or not kept_dir
            # was given with a trailing slash.
            derep_genbanks.add(os.path.join(kept_dir, gbk.split("/")[-1]))

        num_gbk = len(derep_genbanks)
        if num_gbk == 0:
            sys.stderr.write(
                "Issues with dereplicating GenBanks! All GenBanks deemed as redundant ...\n"
            )
            log_object.error(
                "Issues with dereplicating GenBanks! All GenBanks deemed as redundant ..."
            )
        else:
            sys.stdout.write(
                f"Found {num_gbk} GenBanks retained after dereplication.\n"
            )
            log_object.info(
                f"Found {num_gbk} GenBanks retained after dereplication."
            )

    except Exception as e:
        sys.stderr.write(
            "Issues with run skani based dereplication of input GenBanks.\n"
        )
        log_object.error(
            "Issues with run skani based dereplication of input GenBanks."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)

    return derep_genbanks, rep_genbank_members


def _genbank_file_prefix(gbk_path) -> str:
    """
    Return the file name of a GenBank path without its directory and final extension.

    Raises ValueError if the path does not end in .gbk, .gbff or .genbank.
    """
    file_name = gbk_path.split("/")[-1]
    if not file_name.endswith((".gbk", ".gbff", ".genbank")):
        raise ValueError(
            f"GenBank file {gbk_path} does not end with .gbk, .gbff, or .genbank."
        )
    return file_name.rsplit(".", 1)[0]

def partition_sequences_by_homolog_groups(
    ortho_matrix_file, prot_dir, nucl_dir, hg_prot_dir, hg_nucl_dir, log_object
) -> None:
    """
    Description:
    Splits per-sample protein and nucleotide FASTA files into one FASTA file per ortholog group, using the
    gene-to-ortholog-group assignments listed in the orthomatrix.
    *******************************************************************************************************************
    Parameters:
    - ortho_matrix_file: Tab-delimited orthomatrix; the first line is a header, the first column of every other
                         line is the ortholog group and the remaining cells hold ", "-separated gene identifiers.
    - prot_dir: A directory containing protein FASTA files (path is used as a prefix, so include the trailing slash).
    - nucl_dir: A directory containing DNA FASTA files (path is used as a prefix).
    - hg_prot_dir: A directory to which <ortholog group>.faa files are appended.
    - hg_nucl_dir: A directory to which <ortholog group>.fna files are appended.
    - log_object: A logging object.
    *******************************************************************************************************************
    """
    try:
        # gene identifier -> ortholog group
        gene_to_group: Dict[str, Any] = {}
        with open(ortho_matrix_file) as matrix_handle:
            for line_number, raw_line in enumerate(matrix_handle):
                if line_number == 0:
                    # header row only carries sample names
                    continue
                fields = raw_line.strip().split("\t")
                group_id = fields[0]
                for cell in fields[1:]:
                    for gene_id in cell.split(", "):
                        gene_to_group[gene_id.strip()] = group_id

        # Proteins are processed first, then nucleotide sequences.
        for source_dir, target_dir, extension in (
            (prot_dir, hg_prot_dir, ".faa"),
            (nucl_dir, hg_nucl_dir, ".fna"),
        ):
            for file_name in os.listdir(source_dir):
                with open(source_dir + file_name) as seq_handle:
                    for rec in SeqIO.parse(seq_handle, "fasta"):
                        if rec.id not in gene_to_group:
                            continue
                        target_file = target_dir + gene_to_group[rec.id] + extension
                        with open(target_file, "a+") as out_handle:
                            out_handle.write(
                                f">{rec.description}\n{str(rec.seq)}\n"
                            )

    except Exception as e:
        sys.stderr.write(
            "Issues with partitioning sequences to ortholog groups.\n"
        )
        log_object.error(
            "Issues with partitioning sequences to ortholog groups."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def partition_and_create_upstream_nucl_alignments(
    ortho_matrix_file,
    nucl_upstr_dir,
    hg_upst_dir,
    upst_algn_dir,
    log_object,
    threads=1,
    use_super5=False,
) -> None:
    """
    Description:
    Groups upstream DNA sequences by ortholog group and aligns each group with MUSCLE. Groups whose FASTA file is
    empty or whose shortest sequence is under 10 bp are not aligned.
    *******************************************************************************************************************
    Parameters:
    - ortho_matrix_file: A file containing an orthomatrix.
    - nucl_upstr_dir: A directory containing upstream DNA sequences (path is used as a prefix).
    - hg_upst_dir: A directory to which <ortholog group>.fna files with upstream sequences are appended.
    - upst_algn_dir: A directory to which <ortholog group>.msa.fna alignments are written.
    - log_object: A logging object.
    - threads: The number of threads to use for alignment.
    - use_super5: Whether to use the SUPER5 algorithm for MUSCLE alignment.
    """
    try:
        # gene identifier -> ortholog group
        gene_to_group: Dict[str, Any] = {}
        with open(ortho_matrix_file) as matrix_handle:
            for line_number, raw_line in enumerate(matrix_handle):
                if line_number == 0:
                    # header row only carries sample names
                    continue
                fields = raw_line.strip().split("\t")
                group_id = fields[0]
                for cell in fields[1:]:
                    for gene_id in cell.split(", "):
                        gene_to_group[gene_id.strip()] = group_id

        for file_name in os.listdir(nucl_upstr_dir):
            with open(nucl_upstr_dir + file_name) as upstream_handle:
                for rec in SeqIO.parse(upstream_handle, "fasta"):
                    if rec.id not in gene_to_group:
                        continue
                    group_file = hg_upst_dir + gene_to_group[rec.id] + ".fna"
                    with open(group_file, "a+") as group_handle:
                        group_handle.write(
                            f">{rec.description}\n{str(rec.seq)}\n"
                        )

        # Only the algorithm flag differs between the two MUSCLE invocations.
        muscle_mode = "-super5" if use_super5 else "-align"
        for file_name in os.listdir(hg_upst_dir):
            # Everything before the last ".fna" (empty if the name lacks it).
            prefix = file_name.rpartition(".fna")[0]
            upst_file = hg_upst_dir + file_name
            if os.path.getsize(upst_file) == 0:
                continue

            shortest = 10000
            with open(upst_file) as group_handle:
                for rec in SeqIO.parse(group_handle, "fasta"):
                    shortest = min(shortest, len(str(rec.seq)))
            if shortest < 10:
                continue

            upst_algn_file = upst_algn_dir + prefix + ".msa.fna"
            align_cmd = [
                "muscle",
                muscle_mode,
                upst_file,
                "-output",
                upst_algn_file,
                "-nt",
                "-threads",
                str(threads),
                "-perturb",
                "12345",
            ]
            util.run_cmd_via_subprocess(
                align_cmd,
                log_object=log_object,
                check_files=[upst_algn_file],
                verbose=False,
            )

    except Exception:
        msg = "Issues with partitioning / aligning upstream sequences."
        sys.stderr.write(msg + "\n")
        sys.stderr.write(traceback.format_exc() + "\n")
        log_object.error(msg)
        log_object.error(traceback.format_exc())
        sys.exit(1)


def create_protein_alignments(
    prot_dir, prot_algn_dir, log_object, use_super5=False, threads=1
) -> None:
    """
    Description:
    Runs MUSCLE on every file found in a directory of protein FASTA files and writes one alignment per input.
    *******************************************************************************************************************
    Parameters:
    - prot_dir: A directory containing protein sequences (path is used as a prefix).
    - prot_algn_dir: A directory to which <prefix>.msa.faa protein alignments are written.
    - log_object: A logging object.
    - use_super5: Whether to use the SUPER5 algorithm for MUSCLE alignment.
    - threads: The number of threads to use for alignment.
    """
    try:
        # Only the algorithm flag differs between the two MUSCLE invocations.
        muscle_mode = "-super5" if use_super5 else "-align"
        for file_name in os.listdir(prot_dir):
            # Everything before the last ".faa" (empty if the name lacks it).
            prefix = file_name.rpartition(".faa")[0]
            prot_file = prot_dir + file_name
            prot_algn_file = prot_algn_dir + prefix + ".msa.faa"
            align_cmd = [
                "muscle",
                muscle_mode,
                prot_file,
                "-output",
                prot_algn_file,
                "-amino",
                "-threads",
                str(threads),
                "-perturb",
                "12345",
            ]
            util.run_cmd_via_subprocess(
                align_cmd,
                log_object=log_object,
                check_files=[prot_algn_file],
                verbose=False,
            )
    except Exception:
        msg = "Issues with creating protein alignments."
        sys.stderr.write(msg + "\n")
        log_object.error(msg)
        sys.stderr.write(traceback.format_exc() + "\n")
        log_object.error(traceback.format_exc())
        sys.exit(1)

"""
# This function is still experimental but might switch to later if pal2nal
# is found to be slower or installation is problematic.

def convert_protein_alignments_to_codon_alignments(inputs):
    # Description:
    # This function uses Biopython's Align module to convert protein
    # alignments to codon alignments. It writes sequences to the output
    # file iteratively to minimize memory usage.
    try:
        prot_algn_file, nucl_file, codo_algn_file = inputs

        # Read protein alignment and nucleotide sequences
        prot_algn = AlignIO.read(prot_algn_file, "fasta")
        nucl_seqs = {rec.id: rec for rec in SeqIO.parse(nucl_file, "fasta")}

        def codon_record_generator():
            # A generator that yields codon sequence records.
            for prot_rec in prot_algn:
                nucl_rec = nucl_seqs.get(prot_rec.id)
                if not nucl_rec:
                    raise ValueError(f"Nucleotide sequence not found for protein {prot_rec.id}")
                
                codon_seq = ""
                nucl_idx = 0
                for aa in prot_rec.seq:
                    if aa == "-":
                        codon_seq += "---"
                    else:
                        codon_seq += str(nucl_rec.seq[nucl_idx:nucl_idx+3])
                        nucl_idx += 3
                
                yield SeqRecord(Seq(codon_seq), id=prot_rec.id, description="")

        # Write codon alignment to file iteratively
        with open(codo_algn_file, "w") as ocaf:
            SeqIO.write(codon_record_generator(), ocaf, "fasta")

    except Exception as e:
        sys.stderr.write(f"Issues with converting protein alignment {prot_algn_file} to a codon alignment.\n")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        return ('error', f"Failed to process {prot_algn_file} due to {str(e)}")
    return ('success', None)
"""

def create_codon_alignments(
    prot_algn_dir, nucl_dir, codo_algn_dir, log_object, threads=1
) -> None:
    """
    Description:
    This function creates codon alignments from a directory of protein alignments and a directory of DNA sequences
    by running pal2nal.pl once per protein alignment. The process exits with status 1 if any pal2nal run fails.
    *******************************************************************************************************************
    Parameters:
    - prot_algn_dir: A directory containing protein alignments (<prefix>.msa.faa).
    - nucl_dir: A directory containing the matching DNA sequences (<prefix>.fna).
    - codo_algn_dir: A directory to write codon alignments (<prefix>.msa.fna).
    - log_object: A logging object.
    - threads: The pool size handed to the multiprocess executor.
    """
    try:
        pal2nal_cmds = []
        for paf in os.listdir(prot_algn_dir):
            prefix = ".msa.faa".join(paf.split(".msa.faa")[:-1])
            prot_algn_file = prot_algn_dir + paf
            nucl_file = nucl_dir + prefix + ".fna"
            codo_algn_file = codo_algn_dir + prefix + ".msa.fna"
            pal2nal_cmds.append(
                [
                    "pal2nal.pl",
                    prot_algn_file,
                    nucl_file,
                    "-output",
                    "fasta",
                    ">",
                    codo_algn_file,
                ]
            )

        # Report progress the same way the trimming / tree building steps do;
        # previously this message was built but never emitted.
        msg = (
            "Converting protein alignments to codon alignments for %d ortholog groups"
            % len(pal2nal_cmds)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # NOTE(review): "report_and_continue" presumably lets the executor work
        # through all commands before reporting; any failure is still fatal below.
        result_summary = util.robust_multiprocess_executor(
            worker_function=util.multi_process_safe,
            inputs=pal2nal_cmds,
            pool_size=threads,
            error_strategy="report_and_continue",
            log_object=log_object,
            description="codon alignment creation"
        )

        # Compare counts directly rather than dividing, so an empty input
        # directory does not raise ZeroDivisionError.
        if result_summary['success_count'] != result_summary['total_processed']:
            msg = "Issues with pal2nal codon alignment creation for at least one ortholog group. Exiting now ..."
            sys.stderr.write(msg + '\n')
            log_object.error(msg)
            sys.exit(1)

    except Exception as e:
        sys.stderr.write("Issues with creating codon alignments.\n")
        log_object.error("Issues with creating codon alignments.")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def trim_alignments(
    prot_algn_dir,
    codo_algn_dir,
    prot_algn_trim_dir,
    codo_algn_trim_dir,
    log_object,
    threads=1,
) -> None:
    """
    Description:
    This function trims protein and codon alignments using TrimAl (invoked with "-keepseqs -gt 0.9"). Trimmed
    protein alignments in which a sequence has no sites left are removed, together with the matching trimmed codon
    alignment.
    *******************************************************************************************************************
    Parameters:
    - prot_algn_dir: The directory containing the protein alignments (<prefix>.msa.faa).
    - codo_algn_dir: The directory containing the codon alignments (<prefix>.msa.fna).
    - prot_algn_trim_dir: The directory where the trimmed protein alignments will be saved.
    - codo_algn_trim_dir: The directory where the trimmed codon alignments will be saved.
    - log_object: A logging object.
    - threads: The pool size handed to the multiprocess executor.
    *******************************************************************************************************************
    """
    try:
        trim_cmds = []
        for paf in os.listdir(prot_algn_dir):
            prefix = ".msa.faa".join(paf.split(".msa.faa")[:-1])
            codon_name = prefix + ".msa.fna"
            # One trimal command for the protein alignment, one for its codon alignment.
            for in_file, out_file in (
                (prot_algn_dir + paf, prot_algn_trim_dir + paf),
                (codo_algn_dir + codon_name, codo_algn_trim_dir + codon_name),
            ):
                trim_cmds.append(
                    [
                        "trimal",
                        "-in",
                        in_file,
                        "-out",
                        out_file,
                        "-keepseqs",
                        "-gt",
                        "0.9",
                    ]
                )

        # Two commands are queued per ortholog group.
        msg = (
            "Running trimal to generate trimmed protein and codon alignments for %d ortholog groups"
            % (len(trim_cmds) // 2)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Individual trimming failures are tolerated here; the outputs are
        # inspected below instead.
        util.robust_multiprocess_executor(
            worker_function=util.multi_process_safe,
            inputs=trim_cmds,
            pool_size=threads,
            error_strategy="report_and_continue",
            log_object=log_object,
            description="alignment trimming"
        )

        successfully_trimmed = 0
        for f in os.listdir(prot_algn_trim_dir):
            if not f.endswith(".msa.faa"):
                continue
            prot_trim_file = prot_algn_trim_dir + f
            with open(prot_trim_file) as oaf:
                no_sites = any(
                    len(rec.seq) == 0 for rec in SeqIO.parse(oaf, "fasta")
                )
            if no_sites:
                util.remove_file(prot_trim_file)
                # The codon alignment shares the prefix but carries the .msa.fna
                # suffix; looking it up under the .msa.faa name never matched.
                codo_trim_file = (
                    codo_algn_trim_dir + f[: -len(".msa.faa")] + ".msa.fna"
                )
                if os.path.isfile(codo_trim_file):
                    util.remove_file(codo_trim_file)
            else:
                successfully_trimmed += 1

        msg = f"Successfully trimmed {successfully_trimmed} protein / codon alignments."
        sys.stdout.write(msg + "\n")
        log_object.info(msg)

    except Exception as e:
        sys.stderr.write("Issues with trimming protein / codon alignments.\n")
        log_object.error("Issues with trimming protein / codon alignments.")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def create_gene_trees(
    codo_algn_trim_dir, codo_algn_dir, tree_dir, log_object, threads=1
) -> None:
    """
    Description:
    This function creates gene trees from trimmed codon alignments using FastTree2 for ortholog groups. When the
    trimmed alignment of a group is missing, unreadable or blank, the untrimmed codon alignment is used instead.
    The process exits with status 1 if any FastTree run fails.
    *******************************************************************************************************************
    Parameters:
    - codo_algn_trim_dir: The directory containing trimmed codon alignments.
    - codo_algn_dir: The directory containing untrimmed codon alignments (<prefix>.msa.fna); it defines which
                     ortholog groups are processed and provides the fallback alignments.
    - tree_dir: The directory where trees in Newick format will be saved (<prefix>.tre).
    - log_object: A logging object.
    - threads: The pool size handed to the multiprocess executor.
    *******************************************************************************************************************
    """
    try:
        fasttree_cmds = []
        for catf in os.listdir(codo_algn_dir):
            if not catf.endswith(".msa.fna"):
                continue
            prefix = catf[: -len(".msa.fna")]
            codo_algn_trim_file = codo_algn_trim_dir + catf
            codo_algn_file = codo_algn_dir + catf
            tree_file = tree_dir + prefix + ".tre"

            # Length of the last record in the trimmed alignment; stays 0 if the
            # file is missing, unreadable or has no records.
            seqlen = 0
            try:
                with open(codo_algn_trim_file) as ocatf:
                    for rec in SeqIO.parse(ocatf, "fasta"):
                        seqlen = len(str(rec.seq))
            except Exception:
                # Deliberate best-effort: any problem reading the trimmed
                # alignment triggers the untrimmed fallback below.
                pass

            if seqlen > 0:
                tree_input = codo_algn_trim_file
            else:
                tree_input = codo_algn_file
                msg = (
                    f"Trimmed codon alignment was blank so defaulting to untrimmed for constructing ortholog group tree for {prefix}."
                )
                log_object.warning(msg)
                sys.stderr.write(f"Warning: {msg}\n")
            fasttree_cmds.append(
                [
                    "fasttree",
                    "-nt",
                    tree_input,
                    ">",
                    tree_file
                ]
            )

        msg = (
            "Running FastTree 2 to generate gene trees (based on trimmed codon alignments) for %d ortholog groups"
            % len(fasttree_cmds)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # NOTE(review): the "report_and_stop" strategy name suggests the executor
        # halts on failure - confirm in util; any failure is fatal below either way.
        result_summary = util.robust_multiprocess_executor(
            worker_function=util.multi_process_safe,
            inputs=fasttree_cmds,
            pool_size=threads,
            error_strategy="report_and_stop",
            log_object=log_object,
            description="FastTree phylogeny construction"
        )

        # Compare counts directly rather than dividing, so having no alignments
        # to process does not raise ZeroDivisionError.
        if result_summary['success_count'] != result_summary['total_processed']:
            msg = "Issues with FastTree phylogeny construction for at least one ortholog group. Exiting now ..."
            sys.stderr.write(msg + '\n')
            log_object.error(msg)
            sys.exit(1)

    except Exception as e:
        sys.stderr.write("Issues with creating gene trees.\n")
        log_object.error("Issues with creating gene trees.")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def create_profile_hmms_and_consensus_seqs(
    prot_algn_dir, phmm_dir, cons_dir, log_object, threads=1
) -> None:
    """
    Description:
    Builds one profile HMM per protein alignment with HMMER3 hmmbuild and afterwards emits a consensus protein
    sequence from every profile with hmmemit. The process exits with status 1 as soon as either stage reports
    anything other than a 100% success rate.
    *******************************************************************************************************************
    Parameters:
    - prot_algn_dir: Directory with protein alignments; every entry is treated as "<name>.msa.faa". File names are
                     appended directly to this string, so it is expected to end with a path separator.
    - phmm_dir: Directory (same trailing separator expectation) that receives "<name>.hmm" profile HMMs.
    - cons_dir: Directory (same trailing separator expectation) that receives "<name>.cons.faa" consensus FASTAs.
    - log_object: A logging object.
    - threads: Pool size handed to the multiprocess executor (each hmmbuild call itself is given "--cpu 2").
    *******************************************************************************************************************
    """
    msa_suffix = ".msa.faa"
    try:
        build_jobs = []
        emit_jobs = []
        for msa_name in os.listdir(prot_algn_dir):
            # Everything before the final ".msa.faa" names the ortholog group.
            og_name = msa_suffix.join(msa_name.split(msa_suffix)[:-1])
            hmm_path = phmm_dir + og_name + ".hmm"
            build_jobs.append(
                [
                    "hmmbuild",
                    "--amino",
                    "--cpu",
                    "2",
                    "-n",
                    og_name,
                    hmm_path,
                    prot_algn_dir + msa_name,
                ]
            )
            emit_jobs.append(
                [
                    "hmmemit",
                    "-c",
                    "-o",
                    cons_dir + og_name + ".cons.faa",
                    hmm_path,
                ]
            )

        # (commands, executor error strategy, executor description, progress banner, failure message)
        # hmmbuild asks the executor to keep going past individual failures, but any failure is still
        # fatal once the stage has finished; hmmemit stops at the first failure.
        stages = (
            (
                build_jobs,
                "report_and_continue",
                "HMMER3 hmmbuild",
                "Running HMMER3 hmmbuild to generate profile HMMs for %d ortholog groups",
                "Issues with HMMER3 hmmbuild for at least one ortholog group. Exiting now ...",
            ),
            (
                emit_jobs,
                "report_and_stop",
                "HMMER3 hmmemit consensus generation",
                "Running HMMER3 hmmemit to generate consensus protein sequences for %d ortholog groups",
                "Issues with HMMER3 hmmemit consensus generation for at least one ortholog group. Exiting now ...",
            ),
        )

        for jobs, strategy, description, banner, failure_note in stages:
            announcement = banner % len(jobs)
            log_object.info(announcement)
            sys.stdout.write(announcement + "\n")

            summary = util.robust_multiprocess_executor(
                worker_function=util.multi_process_safe,
                inputs=jobs,
                pool_size=threads,
                error_strategy=strategy,
                log_object=log_object,
                description=description,
            )

            if summary['success_count'] / summary['total_processed'] != 1.0:
                sys.stderr.write(failure_note + '\n')
                log_object.error(failure_note)
                sys.exit(1)

    except Exception as e:
        failure_summary = "Issues with creating profile HMMs and consensus sequences."
        sys.stderr.write(failure_summary + "\n")
        log_object.error(failure_summary)
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)

# TypedDict for best hits structure in annotate_custom_database
class BestHitInfo(TypedDict):
    """Best-scoring DIAMOND hit(s) recorded so far for one query; ties on bitscore are kept together."""

    # Subject identifiers (second column of the DIAMOND table) that share the top bitscore.
    hits: List[str]
    # E-values of those hits, in the same order as `hits`.
    evalues: List[float]
    # Highest bitscore seen for the query; annotate_custom_database seeds new records with 0.0.
    bitscore: float

# TypedDict for custom annotation results
class CustomAnnotationResult(TypedDict):
    """Per-query annotation returned by annotate_custom_database; the three lists are index-aligned."""

    # FASTA description lines of the best-hit reference proteins.
    descriptions: List[str]
    # E-values of the best hits.
    evalues: List[float]
    # Identifiers of the best-hit reference proteins.
    hits: List[str]

def annotate_custom_database(
    protein_faa,
    custom_protein_db_faa,
    annotation_dir,
    log_object,
    threads=1,
    max_annotation_evalue=1e-5,
) -> Dict[str, CustomAnnotationResult]:
    """
    Description:
    Annotates ortholog group consensus sequences against a user-supplied protein FASTA. A DIAMOND database is built
    from the FASTA inside "<annotation_dir>/Custom_Annotation/", the consensus proteins are searched against it with
    DIAMOND blastp, and for every query the hit(s) with the highest bitscore among alignments passing the E-value
    cutoff are reported (hits tied on bitscore are all kept).
    *******************************************************************************************************************
    Parameters:
    - protein_faa: Consensus protein sequences for ortholog groups in FASTA format.
    - custom_protein_db_faa: Custom database of reference proteins in FASTA format.
    - annotation_dir: Directory where to perform annotation analysis.
    - log_object: A logging object.
    - threads: The number of threads handed to DIAMOND.
    - max_annotation_evalue: Alignments with an E-value above this value are ignored.
    *******************************************************************************************************************
    Returns:
    A dictionary keyed by ortholog group identifier whose values hold three index-aligned lists:
    - descriptions: FASTA description lines of the best-hit reference proteins
    - evalues: the respective E-values
    - hits: identifiers of the best-hit reference proteins
    On any failure the error is reported and the process exits with status 1.
    *******************************************************************************************************************
    """
    custom_annotations: Dict[str, CustomAnnotationResult] = {}
    try:
        work_dir = os.path.abspath(annotation_dir) + "/" + "Custom_Annotation/"
        util.setup_ready_directory([work_dir])

        diamond_db = work_dir + "Custom.dmnd"
        search_table = work_dir + "Custom.txt"
        thread_arg = str(threads)

        build_db_cmd = ["diamond", "makedb", "--ignore-warnings"]
        build_db_cmd += ["--in", custom_protein_db_faa]
        build_db_cmd += ["-d", diamond_db]
        build_db_cmd += ["--threads", thread_arg]

        blastp_cmd = ["diamond", "blastp", "--ignore-warnings"]
        blastp_cmd += ["-p", thread_arg]
        blastp_cmd += ["-d", diamond_db]
        blastp_cmd += ["-q", protein_faa]
        blastp_cmd += ["-o", search_table]

        util.run_cmd_via_subprocess(build_db_cmd, log_object=log_object, check_files=[diamond_db])
        util.run_cmd_via_subprocess(blastp_cmd, log_object=log_object, check_files=[search_table])

        # Reference identifier -> full FASTA description line.
        with open(custom_protein_db_faa) as reference_handle:
            reference_descriptions: Dict[str, str] = {
                record.id: record.description
                for record in SeqIO.parse(reference_handle, "fasta")
            }

        # Every query starts from an empty record with bitscore 0.0.
        top_hits: Dict[str, BestHitInfo] = defaultdict(
            lambda: {"hits": [], "evalues": [], "bitscore": 0.0}
        )

        with open(search_table) as table_handle:
            for raw_line in table_handle:
                fields = raw_line.strip().split("\t")
                query_id, subject_id = fields[:2]
                e_value = float(fields[10])
                if e_value > max_annotation_evalue:
                    continue
                bit_score = float(fields[11])

                leader = top_hits[query_id]
                if bit_score > leader["bitscore"]:
                    # Strictly better alignment: discard what was collected so far.
                    top_hits[query_id] = {
                        "hits": [subject_id],
                        "evalues": [e_value],
                        "bitscore": bit_score,
                    }
                elif bit_score == leader["bitscore"]:
                    # Tie with the current leader: report both.
                    leader["hits"].append(subject_id)
                    leader["evalues"].append(e_value)

        for query_id, leader in top_hits.items():
            custom_annotations[query_id] = {
                "descriptions": [reference_descriptions[subject] for subject in leader["hits"]],
                "evalues": leader["evalues"],
                "hits": leader["hits"],
            }

    except Exception as e:
        failure_summary = "Issues with annotating using custom database."
        sys.stderr.write(failure_summary + "\n")
        log_object.error(failure_summary)
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)
    return custom_annotations

def _pyhmmer_text(value) -> str:
    """Return a pyhmmer name or accession as text, whether it arrives as bytes or as str."""
    # NOTE(review): newer pyhmmer releases appear to expose these properties as str rather than bytes;
    # accepting both keeps the worker usable either way - confirm against the pinned pyhmmer version.
    if isinstance(value, bytes):
        return value.decode()
    return str(value)


def run_pyhmmer(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    Worker that searches every profile HMM of one database against a protein FASTA with pyhmmer's hmmsearch and
    writes one tab-delimited line per hit.
    *******************************************************************************************************************
    Parameters:
    - inputs: A six item sequence: (name, db_file, z, protein_faa, annotation_result_file, threads)
        - name: Database name; for "pfam" the trusted bit score cutoffs stored in the HMMs are applied.
        - db_file: Path to the HMM database file.
        - z: Value passed (as an integer) to hmmsearch's Z argument.
        - protein_faa: Protein sequences to search, in FASTA format.
        - annotation_result_file: Output path (overwritten).
        - threads: Number of CPUs handed to hmmsearch.
    *******************************************************************************************************************
    Returns:
    ('success', None) when the search completed, otherwise ('error', message). Output columns are:
    HMM name, HMM accession (or "NA"), "NA", protein name, "NA", E-value, score.
    *******************************************************************************************************************
    """
    name, db_file, z, protein_faa, annotation_result_file, threads = inputs
    try:
        alphabet = pyhmmer.easel.Alphabet.amino()
        with pyhmmer.easel.SequenceFile(
            protein_faa, digital=True, alphabet=alphabet
        ) as seq_file:
            sequences = list(seq_file)

        with open(annotation_result_file, "w") as outf:
            # The Pfam search differs from all others only by using the trusted cutoffs.
            search_options = {"Z": int(z), "cpus": threads}
            if name == "pfam":
                search_options["bit_cutoffs"] = "trusted"

            with pyhmmer.plan7.HMMFile(db_file) as hmm_file:
                for hits in pyhmmer.hmmsearch(hmm_file, sequences, **search_options):
                    # Query level fields are identical for every hit of this HMM.
                    query_name = _pyhmmer_text(hits.query.name)
                    accession = "NA"
                    try:
                        if hits.query.accession is not None:
                            accession = _pyhmmer_text(hits.query.accession)
                    except (AttributeError, UnicodeDecodeError):
                        # A missing or undecodable accession is not worth failing the search for.
                        accession = "NA"

                    for hit in hits:
                        row = [
                            query_name,
                            accession,
                            "NA",
                            _pyhmmer_text(hit.name),
                            "NA",
                            str(hit.evalue),
                            str(hit.score),
                        ]
                        outf.write("\t".join(row) + "\n")

        return ('success', None)

    except Exception as e:
        # Report instead of raising so the calling executor can apply its error strategy.
        error_msg = f"Problem running pyhmmer! {str(e)}"
        return ('error', error_msg)


# TypedDict for best hits structure in annotate_consensus_sequences
class BestHitInfoConsensus(TypedDict):
    """Best hit(s) recorded for one consensus protein against one annotation database."""

    # Identifiers of the retained database entries (HMM names or DIAMOND subject ids).
    hits: List[str]
    # E-values of the retained hits, in the same order as `hits`.
    evalues: List[Union[decimal.Decimal, str]]
    # HMM accessions of the retained hits; annotate_consensus_sequences stores an empty list for
    # DIAMOND based databases. Every record it builds carries this key, so it is declared here.
    accessions: List[str]
    # Bitscore (DIAMOND) or score (pyhmmer) of the most recently recorded hit.
    score: float

# TypedDict for consensus annotation results
class ConsensusAnnotationResult(TypedDict):
    """Per-protein, per-database annotation returned by annotate_consensus_sequences."""

    # Descriptions looked up for the retained hits; "NA" when a hit has no entry in the database's info file.
    descriptions: List[str]
    # E-values of the retained hits.
    evalues: List[Union[decimal.Decimal, str]]
    # Filled from the "accessions" list of the best-hit record in annotate_consensus_sequences, i.e. HMM
    # accessions for pyhmmer based databases and an empty list for DIAMOND based ones.
    hits: List[str]

def _annotation_success_fraction(result_summary) -> float:
    """Return success_count / total_processed, counting an empty batch as fully successful."""
    total = result_summary["total_processed"]
    if not total:
        return 1.0
    return result_summary["success_count"] / total


def _load_annotation_descriptions(annot_info_file) -> Dict[str, str]:
    """Map identifiers (first column) to descriptions (second column) of a tab-delimited info file."""
    id_to_description: Dict[str, str] = {}
    with open(annot_info_file) as oaif:
        for line in oaif:
            ls = line.strip().split("\t")
            if len(ls) < 2:
                # blank or incomplete line - nothing to record
                continue
            id_to_description[ls[0]] = ls[1]
    return id_to_description


def _best_pyhmmer_hits(
    result_file, keep_all_hits, max_annotation_evalue
) -> "Dict[str, BestHitInfoConsensus]":
    """
    Collect, per protein, the retained hits from a result table written by run_pyhmmer.

    With keep_all_hits (used for Pfam) every hit strictly below the E-value threshold is kept; otherwise only the
    top-scoring hit(s) with an E-value not above the threshold are kept, ties included.
    """
    best_hits: Dict[str, BestHitInfoConsensus] = {}
    with open(result_file) as oarf:
        for line in oarf:
            line = line.rstrip("\n")
            if not line.strip() or line.startswith("#"):
                continue
            ls = line.split()
            # columns: HMM name, HMM accession, NA, protein name, NA, E-value, score
            hit = ls[0]
            accession = ls[1]
            query = ls[3]
            evalue = decimal.Decimal(ls[5])
            score = float(ls[6])
            if evalue > max_annotation_evalue:
                continue

            current = best_hits.get(query)
            if keep_all_hits:
                if evalue >= max_annotation_evalue:
                    continue
                if current is None:
                    best_hits[query] = {
                        "hits": [hit],
                        "evalues": [evalue],
                        "accessions": [accession],
                        "score": score,
                    }
                else:
                    current["hits"].append(hit)
                    current["evalues"].append(evalue)
                    current["accessions"].append(accession)
                    current["score"] = score
            elif current is None or score > current["score"]:
                best_hits[query] = {
                    "hits": [hit],
                    "evalues": [evalue],
                    "accessions": [accession],
                    "score": score,
                }
            elif score == current["score"]:
                # Keep the three lists index-aligned for tied hits as well.
                current["hits"].append(hit)
                current["evalues"].append(evalue)
                current["accessions"].append(accession)
    return best_hits


def _best_diamond_hits(
    result_file, max_annotation_evalue
) -> "Dict[str, BestHitInfoConsensus]":
    """Collect, per protein, the top-bitscore hit(s) from a DIAMOND blastp tabular result file."""
    best_hits: Dict[str, BestHitInfoConsensus] = {}
    with open(result_file) as oarf:
        for line in oarf:
            line = line.strip()
            if not line:
                continue
            ls = line.split("\t")
            query = ls[0]
            hit = ls[1]
            bitscore = float(ls[11])
            evalue = decimal.Decimal(ls[10])
            if evalue > max_annotation_evalue:
                continue

            current = best_hits.get(query)
            if current is None or bitscore > current["score"]:
                best_hits[query] = {
                    "hits": [hit],
                    "evalues": [evalue],
                    "score": bitscore,
                    "accessions": [],
                }
            elif bitscore == current["score"]:
                current["hits"].append(hit)
                current["evalues"].append(evalue)
    return best_hits


def annotate_consensus_sequences(
    protein_faa,
    annotation_dir,
    log_object,
    threads=1,
    max_annotation_evalue=1e-5,
) -> Dict[str, Dict[str, ConsensusAnnotationResult]]:
    """
    Description:
    This function will attempt to annotate consensus sequences for ortholog groups with the default databases supported
    within zol assuming they have been (properly) setup. Databases are read from "database_location_paths.txt" in the
    directory named by the ZOL_DATA_PATH environment variable. HMM databases are searched with pyhmmer and DIAMOND
    databases (except "riboprots" and "mobsuite") with DIAMOND blastp. The best hit per ortholog group per database is
    selected based on bitscore (DIAMOND) or score (pHMM) provided that the E-value threshold is met; for Pfam every hit
    below the threshold is kept.
    *******************************************************************************************************************
    Parameters:
    - protein_faa: Consensus protein sequences for ortholog groups in FASTA format.
    - annotation_dir: Directory where to perform annotation analysis; every "*.txt" file in it that matches a
                      configured database is parsed as a result table.
    - log_object: A logging object.
    - threads: The number of threads to use.
    - max_annotation_evalue: The maximum E-value to regard an alignment between a consensus ortholog group
                             sequence and a database sequence.
    *******************************************************************************************************************
    Returns:
    A dictionary of dictionaries where the primary key is the name of the database and the secondary key is the
    ortholog group identifier and the values hold "descriptions", "evalues" and "hits" lists. "hits" carries the HMM
    accessions for pHMM databases and is empty for DIAMOND databases. An empty dictionary is returned (after a
    warning) when the databases are not set up. Any other failure is reported and the process exits with status 1.
    *******************************************************************************************************************
    """
    # os.getenv yields None when unset, which str() turns into the sentinel "None".
    zol_data_directory = str(os.getenv("ZOL_DATA_PATH")).strip()
    db_locations = None
    if zol_data_directory != "None":
        try:
            zol_data_directory = os.path.abspath(zol_data_directory) + "/"
            db_locations = zol_data_directory + "database_location_paths.txt"
        except (OSError, ValueError):
            # An unresolvable path is handled like missing databases below.
            db_locations = None
    if db_locations is None or not os.path.isfile(db_locations):
        msg = "databases do not appear to be setup or setup properly!"
        sys.stderr.write(f"Warning: {msg}\n")
        log_object.warning(msg)
        empty_annotations: Dict[str, Dict[str, ConsensusAnnotationResult]] = {}
        return empty_annotations

    # Databases that are never used for functional annotation here.
    skipped_databases = {"riboprots", "mobsuite"}

    try:
        # Run at most 5 DIAMOND / 4 pyhmmer searches at once and divide the threads among them.
        dmnd_individual_threads = 1
        dmnd_pool_size = threads
        if threads > 5:
            dmnd_individual_threads = math.floor(threads / 5)
            dmnd_pool_size = 5

        hmm_individual_threads = 1
        hmm_pool_size = threads
        if threads > 4:
            hmm_individual_threads = math.floor(threads / 4)
            hmm_pool_size = 4

        dmnd_search_cmds = []
        hmm_search_cmds = []
        name_to_info_file: Dict[str, str] = {}
        hmm_based_annotations = set()
        with open(db_locations) as odls:
            for line in odls:
                line = line.strip()
                if not line:
                    continue
                name, annot_info_file, db_file, z = line.split("\t")
                name_to_info_file[name] = annot_info_file
                annotation_result_file = os.path.join(annotation_dir, name + ".txt")
                if db_file.endswith(".hmm"):
                    hmm_based_annotations.add(name)
                    hmm_search_cmds.append(
                        [
                            name,
                            db_file,
                            z,
                            protein_faa,
                            annotation_result_file,
                            hmm_individual_threads,
                        ]
                    )
                elif db_file.endswith(".dmnd") and name not in skipped_databases:
                    dmnd_search_cmds.append(
                        [
                            "diamond",
                            "blastp",
                            "--ignore-warnings",
                            "-p",
                            str(dmnd_individual_threads),
                            "-d",
                            db_file,
                            "-q",
                            protein_faa,
                            "-o",
                            annotation_result_file,
                        ]
                    )

        msg = (
            "Running pyhmmer hmmsearch for functional annotation for %d databases"
            % len(hmm_search_cmds)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Individual search failures are tolerated; they are only reported.
        result_summary = util.robust_multiprocess_executor(
            worker_function=run_pyhmmer,
            inputs=hmm_search_cmds,
            pool_size=hmm_pool_size,
            error_strategy="report_and_continue",
            log_object=log_object,
            description="pyhmmer functional annotation",
        )

        success_prop = _annotation_success_fraction(result_summary)
        msg = f"{success_prop*100.0}% of pyhmmer-based database annotations were successful"
        if success_prop != 1.0:
            msg += " - this is not critical but unexpected - please report on GitHub issues."
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        msg = (
            "Running DIAMOND blastp for functional annotation for %d databases"
            % len(dmnd_search_cmds)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        result_summary = util.robust_multiprocess_executor(
            worker_function=util.multi_process_safe,
            inputs=dmnd_search_cmds,
            pool_size=dmnd_pool_size,
            error_strategy="report_and_continue",
            log_object=log_object,
            description="DIAMOND blastp functional annotation",
        )

        success_prop = _annotation_success_fraction(result_summary)
        msg = f"{success_prop*100.0}% of DIAMOND blastp-based database annotations were successful"
        if success_prop != 1.0:
            msg += " - this is not critical but unexpected - please report on GitHub issues."
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        annotations: Dict[str, Dict[str, ConsensusAnnotationResult]] = {}
        protein_ids: Optional[List[str]] = None
        for rf in os.listdir(annotation_dir):
            if not rf.endswith(".txt"):
                continue
            db_name = rf.split(".txt")[0]
            if db_name in skipped_databases:
                continue
            annot_info_file = name_to_info_file.get(db_name)
            if annot_info_file is None:
                msg = (
                    f"Skipping {rf} in the annotation directory because it does not "
                    "correspond to a configured database."
                )
                log_object.warning(msg)
                sys.stderr.write(f"Warning: {msg}\n")
                continue

            id_to_description = _load_annotation_descriptions(annot_info_file)

            result_file = os.path.join(annotation_dir, rf)
            if db_name in hmm_based_annotations:
                best_hits = _best_pyhmmer_hits(
                    result_file, db_name == "pfam", max_annotation_evalue
                )
            else:
                best_hits = _best_diamond_hits(result_file, max_annotation_evalue)

            # The consensus FASTA is the same for every database, so read its identifiers only once
            # (and only if there is at least one result table to report on).
            if protein_ids is None:
                with open(protein_faa) as opf:
                    protein_ids = [rec.id for rec in SeqIO.parse(opf, "fasta")]

            db_annotations = annotations.setdefault(db_name, {})
            for protein_id in protein_ids:
                best = best_hits.get(protein_id)
                if best is None:
                    continue
                db_annotations[protein_id] = {
                    "descriptions": [
                        id_to_description.get(x, "NA") for x in best["hits"]
                    ],
                    "evalues": best["evalues"],
                    # NOTE(review): "hits" carries the accessions, which are empty for DIAMOND
                    # databases - kept as in the original; confirm this is what consumers expect.
                    "hits": best["accessions"],
                }
        return annotations
    except Exception as e:
        sys.stderr.write("Issues with annotating consensus sequences.\n")
        log_object.error("Issues with annotating consensus sequences.")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def default_to_regular(d) -> Dict[str, Any]:
    """
    Recursively turn a defaultdict (and any defaultdicts nested as its values) into plain dicts.

    Anything that is not a defaultdict is handed back untouched, so defaultdicts stored inside a regular
    dict or a list are left as they are. Unexpected errors print a traceback and exit with status 1.

    Function taken from:
    https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o
    """
    try:
        if not isinstance(d, defaultdict):
            return d
        plain = {}
        for key, value in d.items():
            plain[key] = default_to_regular(value)
        return plain
    except Exception as e:
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def determine_consensus_order_of_ogs(
    genbanks,
    ortho_matrix_file,
    markovian_file,
    consensus_path_file,
    log_object,
    domain_mode=False,
) -> Dict:
    """
    Description:
    This function determines the consensus order and directionality of ortholog groups in a set of gene cluster
    GenBanks. It is closely based on code from lsaBGC.
    *******************************************************************************************************************
    Parameters:
    - genbanks: A list of gene cluster GenBank files.
    - orthogroup_matrix_file: The ortholog group vs sample matrix file, where cells correspond to locus tag identifiers.
    - log_object: A logging object.
    *******************************************************************************************************************
    Results:
    - hg_order_scores: A dictionary where keys are ortholog group identifiers and the values are a list of two items:
                       (1) consensus order and (2) consensus direction.
    *******************************************************************************************************************
    """
    try:
        gc_gene_to_hg: Dict[str, Any] = {}
        most_conserved_hgs = set([])
        hg_conservation_values = []
        max_hg_conservation = 0.0
        core_hgs = set([])
        single_copy_core_hgs = set([])
        with open(ortho_matrix_file) as omf:
            for i, line in enumerate(omf):
                if i == 0:
                    continue
                line = line.rstrip("\n")
                ls = line.split("\t")
                hg = ls[0]
                sample_count = 0
                sc_sample_count = 0
                for lts in ls[1:]:
                    for lt in lts.split(", "):
                        gc_gene_to_hg[lt] = hg
                    if lts.strip() != "":
                        sample_count += 1
                    if lts.strip() != "" and not ", " in lts:
                        sc_sample_count += 1
                hg_conservation = sample_count / float(len(ls[1:]))
                hg_conservation_values.append([hg, hg_conservation])
                if hg_conservation >= max_hg_conservation:
                    max_hg_conservation = hg_conservation
                if hg_conservation == 1.0:
                    core_hgs.add(hg)
                    if sc_sample_count / float(len(ls[1:])) == 1.0:
                        single_copy_core_hgs.add(hg)

        for hgv in hg_conservation_values:
            if hgv[1] == max_hg_conservation:
                most_conserved_hgs.add(hgv[0])

        gc_gene_counts = defaultdict(int)
        gc_genes = defaultdict(set)
        scaff_relative_gene_locations: Dict[str, Any] = {}
        for gbk in genbanks:
            prefix = ".".join(gbk.split("/")[-1].split(".")[:-1])
            if domain_mode:
                gene_locations = util.parse_gbk(
                    gbk, prefix, log_object, feature_type="cCDS"
                )
            else:
                gene_locations = util.parse_gbk(gbk, prefix, log_object)
            scaffolds = set(
                [gene_locations[x]["scaffold"] for x in gene_locations]
            )
            for scaff in scaffolds:
                gc_gene_locations: Dict[str, Any] = {}
                for g in gene_locations:
                    if gene_locations[g]["scaffold"] == scaff:
                        gc_gene_locations[g] = gene_locations[g]
                        scaff_relative_gene_locations[g] = gene_locations[g]
                gc_gene_counts[gbk + "|" + scaff] = len(gc_gene_locations)
                gc_genes[gbk + "|" + scaff] = set(gc_gene_locations.keys())

        ref_bgc = None
        for i, item in enumerate(
            sorted(gc_gene_counts.items(), key=itemgetter(1), reverse=True)
        ):
            ref_bgc = item[0]
            break

        gcs_ordered = [ref_bgc] + sorted(
            list(set(gc_genes.keys()).difference(set([ref_bgc])))
        )
        ref_hg_directions: Dict[str, Any] = {}
        hg_pair_score = defaultdict(int)
        hg_preceding_score = defaultdict(lambda: defaultdict(int))
        hg_following_score = defaultdict(lambda: defaultdict(int))
        all_hgs = set(["start", "end"])
        direction_forward_support = defaultdict(int)
        direction_reverse_support = defaultdict(int)
        for i, gc in enumerate(gcs_ordered):
            curr_gc_genes = gc_genes[gc]
            hg_directions: Dict[str, Any] = {}
            hg_lengths = defaultdict(list)
            hg_starts: Dict[str, Any] = {}
            for g in sorted(curr_gc_genes):
                ginfo = scaff_relative_gene_locations[g]
                gstart = ginfo["start"]
                gend = ginfo["end"]
                if g in gc_gene_to_hg:
                    hg = gc_gene_to_hg[g]
                    hg_directions[hg] = ginfo["direction"]
                    hg_lengths[hg].append(abs(gend - gstart))
                    hg_starts[hg] = ginfo["start"]
            reverse_flag = False
            if i == 0:
                ref_hg_directions = hg_directions
            else:
                flip_support = 0
                keep_support = 0
                for c in ref_hg_directions:
                    if not c in hg_directions:
                        continue
                    hg_weight = statistics.mean(hg_lengths[c])
                    if hg_directions[c] == ref_hg_directions[c]:
                        keep_support += hg_weight
                    else:
                        flip_support += hg_weight

                # reverse ordering
                if flip_support > keep_support:
                    reverse_flag = True

            hgs = []
            for c in sorted(
                hg_starts.items(), key=itemgetter(1), reverse=reverse_flag
            ):
                hgs.append(c[0])
                if reverse_flag == False:
                    if hg_directions[c[0]] == "+":
                        direction_forward_support[c[0]] += 1
                    elif hg_directions[c[0]] == "-":
                        direction_reverse_support[c[0]] += 1
                else:
                    if hg_directions[c[0]] == "+":
                        direction_reverse_support[c[0]] += 1
                    elif hg_directions[c[0]] == "-":
                        direction_forward_support[c[0]] += 1

            for j, hg in enumerate(hgs):
                all_hgs.add(hg)
                if j == 0:
                    hg_previ = "start"
                    hg_preceding_score[hg][hg_previ] += 1
                    hg_following_score[hg_previ][hg] += 1
                    hg_pair_score[tuple([hg_previ, hg])] += 1
                try:
                    hg_after = hgs[j + 1]
                    # make sure you don't get lost with broken / fragmented genes in BGCs that might be
                    # in the process being lost.
                    if hg != hg_after:
                        hg_preceding_score[hg_after][hg] += 1
                        hg_following_score[hg][hg_after] += 1
                        hg_pair_score[tuple([hg, hg_after])] += 1
                except Exception as e:
                    hg_after = "end"
                    hg_preceding_score[hg_after][hg] += 1
                    hg_following_score[hg][hg_after] += 1
                    hg_pair_score[tuple([hg, hg_after])] += 1

        with open(markovian_file, "w") as markovian_handle:
            markovian_handle.write(
    "og\tog_after\tsupport\tog_direction\tog_after_direction\n"
            )
            for hg in hg_following_score:
                for hg_after in hg_following_score[hg]:
                    hg_dir = "-"
                    if (
                        direction_forward_support[hg]
                        >= direction_reverse_support[hg]
                    ):
                        hg_dir = "+"
                    hg_after_dir = "-"
                    if (
                        direction_forward_support[hg_after]
                        >= direction_reverse_support[hg_after]
                    ):
                        hg_after_dir = "+"
                    markovian_handle.write(
                        f"{hg}\t{hg_after}\t{hg_following_score[hg][hg_after]}\t{hg_dir}\t{hg_after_dir}\n"
                    )
            

        with open(consensus_path_file, "w") as consensus_handle:
                curr_og = "start"
                visited_ogs = set([])
                while curr_og != "end":
                    next_og = ""
                    max_score = 0
                    for follow_og in hg_following_score[curr_og]:
                        if hg_following_score[curr_og][follow_og] >= max_score:
                            max_score = hg_following_score[curr_og][follow_og]
                            next_og = follow_og
                    consensus_handle.write(curr_og + f"\t{next_og}\n")
                    if next_og in visited_ogs:
                        break
                    visited_ogs.add(curr_og)
                    curr_og = next_og
                

        anchor_edge = None
        if len(single_copy_core_hgs) > 0:
            # first attempt to find anchor edge using a single copy core ortholog groups if any exist
            for hps in sorted(
                hg_pair_score.items(), key=itemgetter(1), reverse=True
            ):
                if (
                    hps[0][0] in single_copy_core_hgs
                    and hps[0][1] in single_copy_core_hgs
                ):
                    anchor_edge = hps[0]
                    break
        if len(core_hgs) > 0 and anchor_edge == None:
            # looks like that failed, now lets use any available core ortholog groups (not necessarily single copy) if any exist \
            for hps in sorted(
                hg_pair_score.items(), key=itemgetter(1), reverse=True
            ):
                if hps[0][0] in core_hgs and hps[0][1] in core_hgs:
                    anchor_edge = hps[0]
                    break
            try:
                assert anchor_edge != None
            except Exception as e:
                for hps in sorted(
                    hg_pair_score.items(), key=itemgetter(1), reverse=True
                ):
                    if hps[0][0] in core_hgs or hps[0][1] in core_hgs:
                        anchor_edge = hps[0]
                        break

        if anchor_edge == None:
            # ahh, that also failed welp - lets use the most conserved gene available and write a warning to the log file and console
            stars = "*" * 34 + "\n"
            sys.stderr.write(f"{stars}WARNING!!! No core ortholog groups were detected across homologous gene cluster\ninstances - the consensus order and direction predictions will likely be lower quality.{stars}")
            log_object.warning(
                "No core ortholog groups were detected across homologous gene cluster\ninstances - the quality of the consensus order and direction\npredictions will be lower.\n"
            )
            try:
                for hps in sorted(
                    hg_pair_score.items(), key=itemgetter(1), reverse=True
                ):
                    if (
                        hps[0][0] in most_conserved_hgs
                        and hps[0][1] in most_conserved_hgs
                    ):
                        anchor_edge = hps[0]
                        break
                try:
                    assert anchor_edge != None
                except Exception as e:
                    for hps in sorted(
                        hg_pair_score.items(), key=itemgetter(1), reverse=True
                    ):
                        if (
                            hps[0][0] in most_conserved_hgs
                            or hps[0][1] in most_conserved_hgs
                        ):
                            anchor_edge = hps[0]
                            break
                assert anchor_edge != None
            except Exception as e:
                sys.stderr.write(traceback.format_exc())
                sys.stderr.write(
                    "\nUnexpected error, no anchor edge found, could be because no protocore ortholog group exists, which shouldn't be the case!\n"
                )
                sys.exit(1)

        # use to keep track of which HGs have been accounted for already at different steps of assigning order
        accounted_hgs = set([anchor_edge[0], anchor_edge[1]])

        # primary expansion left
        curr_hg = anchor_edge[0]
        left_expansion = [curr_hg]
        while not curr_hg == "start":
            new_hg = None
            for i, hg in enumerate(
                sorted(
                    hg_preceding_score[curr_hg].items(),
                    key=itemgetter(1),
                    reverse=True,
                )
            ):
                if not hg[0] in accounted_hgs:
                    new_hg = hg[0]
                    left_expansion = [new_hg] + left_expansion
                    accounted_hgs.add(new_hg)
                    break
            if new_hg != None:
                curr_hg = new_hg
            else:
                # shouldn't ever be the case, but breaking just in case
                break

        # primary expansion right
        curr_hg = anchor_edge[1]
        right_expansion = [curr_hg]
        while not curr_hg == "end":
            new_hg = None
            for i, hg in enumerate(
                sorted(
                    hg_following_score[curr_hg].items(),
                    key=itemgetter(1),
                    reverse=True,
                )
            ):
                if not hg[0] in accounted_hgs:
                    new_hg = hg[0]
                    right_expansion.append(new_hg)
                    accounted_hgs.add(new_hg)
                    break
            if new_hg != None:
                curr_hg = new_hg
            else:
                # shouldn't ever be the case, but breaking just in case
                break

        primary_path_ordered = left_expansion + right_expansion
        ordered_hgs_list = primary_path_ordered

        # figure out where non - accounted for HGs belong best in the primary path.
        not_accounted_hgs = all_hgs.difference(accounted_hgs)
        while len(not_accounted_hgs) > 0:
            progress_made = False
            for hg in sorted(not_accounted_hgs):
                best_score = 0
                relative_pos = None
                neighboriest_hg = None
                for phg in sorted(
                    hg_preceding_score[hg].items(),
                    key=itemgetter(1),
                    reverse=True,
                ):
                    if best_score < phg[1] and phg[0] in accounted_hgs:
                        best_score = phg[1]
                        relative_pos = "after"
                        neighboriest_hg = phg[0]
                        break
                for fhg in sorted(
                    hg_following_score[hg].items(),
                    key=itemgetter(1),
                    reverse=True,
                ):
                    if best_score < fhg[1] and fhg[0] in accounted_hgs:
                        best_score = fhg[1]
                        relative_pos = "before"
                        neighboriest_hg = fhg[0]
                        break
                if best_score > 0:
                    neighboriest_hg_index = ordered_hgs_list.index(
                        neighboriest_hg
                    )

                    if relative_pos == "before":
                        ordered_hgs_list.insert(neighboriest_hg_index, hg)
                    elif relative_pos == "after":
                        ordered_hgs_list.insert(neighboriest_hg_index + 1, hg)
                    accounted_hgs.add(hg)
                    not_accounted_hgs = all_hgs.difference(accounted_hgs)
                    progress_made = True
                    break

            if not progress_made:
                break
        # these shouldn't really exist but just append them to the end if they do
        unaccountable_hgs = all_hgs.difference(accounted_hgs)
        ordered_hgs_list += list(sorted(unaccountable_hgs))

        hg_order_scores = {}
        i = 1
        for hg in ordered_hgs_list:
            if not hg in set(["start", "end"]):
                consensus_direction = "-"
                if (
                    direction_forward_support[hg]
                    >= direction_reverse_support[hg]
                ):
                    consensus_direction = "+"
                hg_order_scores[hg] = [i, consensus_direction]
                i += 1
        return hg_order_scores
    except Exception as e:
        msg = "Issues in attempting to calculate order score for each ortholog group."
        sys.stderr.write(msg + "\n")
        log_object.error(msg)
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def determine_og_stats(
    orthogroup_matrix_file,
    hg_nucl_dir,
    log_object,
    representative_associated_members=None,
    impute_broad_conservation=False,
) -> List[Any]:
    """
    Description:
    This function determines basic statistics for ortholog groups, including: (i) the proportion of gene clusters
    which feature them, (ii) the median GC content, (iii) the median GC skew, (iv) whether they are found at a max
    copy-count of one per gene cluster, and (v) the median length of nucleotide sequences.
    *******************************************************************************************************************
    Parameters:
    - orthogroup_matrix_file: The ortholog group vs sample matrix file (tab-delimited, first row is a header with
                              sample names), where cells correspond to ", "-separated locus tag identifiers.
    - hg_nucl_dir: The directory of FASTA files (named <ortholog group>.fna) with nucleotide sequences for each
                   ortholog group.
    - log_object: A logging object.
    - representative_associated_members: Optional mapping from sample name (matrix column) to a sized collection.
                                         Only used together with impute_broad_conservation, in which case each
                                         sample is weighted by the size of its collection.
    - impute_broad_conservation: Whether to compute the conservation proportion using the weights derived from
                                 representative_associated_members instead of counting each sample once.
    *******************************************************************************************************************
    Returns:
    - A list of six dictionaries, each keyed by ortholog group identifier:
            - hg_single_copy_status: True if no sample has more than one locus tag for the ortholog group.
            - hg_prop_samples: The (optionally weighted) proportion of samples carrying the ortholog group.
            - hg_median_lengths: The median nucleotide sequence length.
            - hg_median_gcskew: The median GC skew, computed per sequence as (G - C) / (G + C).
            - hg_median_gc: The median GC content, as a fraction of sequence length.
            - hg_lts: The set of locus tags belonging to the ortholog group.
    On any failure the error is logged / written to stderr and the program exits with status 1.
    *******************************************************************************************************************
    """
    try:
        use_member_weights = (
            representative_associated_members is not None
            and impute_broad_conservation
        )
        hg_single_copy_status: Dict[str, Any] = {}
        hg_prop_samples: Dict[str, Any] = {}
        hg_lts = defaultdict(set)
        samples = []
        with open(orthogroup_matrix_file) as omf:
            for i, line in enumerate(omf):
                ls = line.rstrip("\n").split("\t")
                if i == 0:
                    samples = ls[1:]
                    continue
                hg = ls[0]
                cells = ls[1:]
                is_single_copy = True
                sample_count = 0
                weighted_count = 0
                total_weighted_count = 0
                for j, lts in enumerate(cells):
                    samp = samples[j]
                    weight = 0
                    if use_member_weights:
                        weight = len(representative_associated_members[samp])
                        total_weighted_count += weight
                    # more than one locus tag in a cell means the group is multi-copy in that sample
                    if "," in lts:
                        is_single_copy = False
                    if lts.strip() != "":
                        sample_count += 1
                        weighted_count += weight
                    for lt in lts.split(", "):
                        if lt.strip() == "":
                            continue
                        hg_lts[hg].add(lt)
                hg_single_copy_status[hg] = is_single_copy
                if use_member_weights:
                    hg_prop_samples[hg] = weighted_count / float(
                        total_weighted_count
                    )
                else:
                    hg_prop_samples[hg] = sample_count / float(len(cells))

        hg_median_lengths: Dict[str, Any] = {}
        hg_median_gcskew: Dict[str, Any] = {}
        hg_median_gc: Dict[str, Any] = {}
        for f in os.listdir(hg_nucl_dir):
            hg = f.split(".fna")[0]
            lengths = []
            gcs = []
            gc_skews = []
            with open(os.path.join(hg_nucl_dir, f)) as ohpf:
                for rec in SeqIO.parse(ohpf, "fasta"):
                    seq = str(rec.seq)
                    tot_bases = len(seq)
                    g = seq.count("G")
                    c = seq.count("C")
                    gc_sum = g + c
                    gcs.append(gc_sum / tot_bases)
                    # GC skew is (G - C) / (G + C); undefined (NaN) when the sequence has no G or C
                    gc_skew = float("nan")
                    if gc_sum > 0:
                        gc_skew = (g - c) / gc_sum
                    gc_skews.append(gc_skew)
                    lengths.append(tot_bases)
            hg_median_lengths[hg] = statistics.median(lengths)
            hg_median_gcskew[hg] = statistics.median(gc_skews)
            hg_median_gc[hg] = statistics.median(gcs)
        return [
            hg_single_copy_status,
            hg_prop_samples,
            hg_median_lengths,
            hg_median_gcskew,
            hg_median_gc,
            dict(hg_lts),
        ]
    except Exception as e:
        msg = "Issues with determining basic stats for ortholog groups."
        log_object.error(msg)
        sys.stderr.write(msg + "\n")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def individual_hyphy_run(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    This function runs HyPhy based analyses (GARD, FUBAR and optionally BUSTED) for a single ortholog group.
    *******************************************************************************************************************
    Parameters:
    - inputs: a list which can be expanded to the following items:
            - hg: ortholog group identifier.
            - hg_codo_algn_file: ortholog group codon alignment file in FASTA format.
            - hg_full_codo_tree_file: ortholog group (approximate) phylogeny file in Newick format.
            - gard_output: output *.json file from GARD analysis.
            - best_gard_output: output *.best file from GARD analysis.
            - fubar_outdir: results directory for FUBAR analysis.
            - busted_outdir: results directory for BUSTED analysis.
            - skip_gard: boolean flag for whether to skip GARD analysis.
            - skip_busted: boolean flag for whether to skip BUSTED analysis.
            - gard_mode: analysis mode for GARD - either "Faster" or "Normal".
            - gard_timeout: timeout for running gard (in minutes).
    *******************************************************************************************************************
    Returns:
    - A tuple (status, detail): ('success', None) when the analyses ran, ('success', '<skip reason>') when the
      ortholog group was skipped, or ('error', '<message>') if an exception was raised.
    *******************************************************************************************************************
    """
    (
        hg,
        hg_codo_algn_file,
        hg_full_codo_tree_file,
        gard_output,
        best_gard_output,
        fubar_outdir,
        busted_outdir,
        skip_gard,
        skip_busted,
        gard_mode,
        gard_timeout,
    ) = inputs
    try:
        # single pass over the alignment to gather everything the skip checks need
        input_gbks_with_hg = set()
        unique_seqs = set()
        align_len = 0
        with open(hg_codo_algn_file) as ohcaf:
            for rec in SeqIO.parse(ohcaf, "fasta"):
                input_gbks_with_hg.add(rec.id.split("|")[0])
                seq = str(rec.seq)
                unique_seqs.add(seq)
                align_len = len(seq)

        if len(input_gbks_with_hg) < 4:
            return ('success', 'skipped_insufficient_genomes')

        if len(unique_seqs) == 1:
            return ('success', 'skipped_identical_sequences')

        if align_len <= 200:
            return ('success', 'skipped_short_alignment')

        # By default selection analyses use the original codon alignment plus the gene tree. If GARD
        # runs successfully, its *.best output is used instead and no separate tree is passed.
        selection_alignment = hg_codo_algn_file
        add_tree = True

        if not skip_gard:
            gard_cmd = [
                "hyphy",
                "CPU=1",
                "gard",
                "--mode",
                gard_mode,
                "--alignment",
                hg_codo_algn_file,
                "--output",
                gard_output,
                "--output-lf",
                best_gard_output,
            ]
            gard_succeeded = False
            try:
                # pass the argument list directly (no shell) so paths with spaces / metacharacters are safe
                subprocess.run(
                    gard_cmd,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=(60 * gard_timeout),
                )
            except subprocess.TimeoutExpired:
                msg = f"Timed out running GARD: {' '.join(gard_cmd)}, defaulting to using original alignment in downstream selection analyses."
                sys.stderr.write(msg + "\n")
            else:
                gard_succeeded = (
                    os.path.isfile(best_gard_output)
                    and os.path.getsize(best_gard_output) >= 100
                )
                if not gard_succeeded:
                    msg = f"Had an issue running GARD: {' '.join(gard_cmd)}, defaulting to using original alignment in downstream selection analyses."
                    sys.stderr.write(msg + "\n")

            if gard_succeeded:
                selection_alignment = best_gard_output
                add_tree = False

        tree_args = ["--tree", hg_full_codo_tree_file] if add_tree else []

        # HyPhy writes results next to the alignment it was given, so the file to relocate is named
        # after selection_alignment (which is the GARD *.best file when GARD succeeded).
        fubar_json = selection_alignment + ".FUBAR.json"
        fubar_cmd = [
            "hyphy",
            "CPU=1",
            "fubar",
            "--alignment",
            selection_alignment,
        ] + tree_args
        util.run_cmd_via_subprocess(fubar_cmd,
                                    check_files=[fubar_json],
                                    verbose=False)
        # explicit destination file so an existing result is overwritten, as `mv` would do
        shutil.move(
            fubar_json,
            os.path.join(fubar_outdir, os.path.basename(fubar_json)),
        )

        if not skip_busted:
            busted_json = selection_alignment + ".BUSTED.json"
            busted_cmd = [
                "hyphy",
                "CPU=1",
                "busted",
                "--alignment",
                selection_alignment,
            ] + tree_args
            util.run_cmd_via_subprocess(busted_cmd,
                                        check_files=[busted_json],
                                        verbose=False)
            shutil.move(
                busted_json,
                os.path.join(busted_outdir, os.path.basename(busted_json)),
            )

        return ('success', None)

    except Exception as e:
        error_msg = f"Issues with running HyPhy based analyses for ortholog group {hg}: {str(e)}"
        return ('error', error_msg)


def run_hyphy_analyses(
    codo_algn_dir,
    tree_dir,
    gard_results_dir,
    fubar_results_dir,
    busted_results_dir,
    log_object,
    skip_gard=False,
    skip_busted=False,
    gard_mode="Faster",
    gard_timeout=60,
    threads=1,
) -> List[Any]:
    """
    Description:
    This function oversees running of HyPhy based analyses (GARD, FUBAR and BUSTED) for ortholog groups and parses
    statistics from the resulting JSON files to include in the consolidated report created at the end of zol.
    *******************************************************************************************************************
    Parameters:
    - codo_algn_dir: The directory with codon alignments (*.msa.fna) for each ortholog group.
    - tree_dir: The directory with gene trees (*.tre) for each ortholog group (recall - these are made using
                FastTree2 on trimmed codon alignments).
    - gard_results_dir: The directory where GARD result files should be saved.
    - fubar_results_dir: The directory where FUBAR result files should be saved.
    - busted_results_dir: The directory where BUSTED result files should be saved.
    - log_object: A logging object.
    - skip_gard: Boolean indicating whether user has requested to skip GARD analysis.
    - skip_busted: Boolean indicating whether user has requested to skip BUSTED analysis.
    - gard_mode: Which mode to run GARD analysis using, can either be "Faster" or "Normal".
    - gard_timeout: timeout for running gard (in minutes).
    - threads: The number of threads to use.
    *******************************************************************************************************************
    Returns:
    - A list of five dictionaries, each keyed by ortholog group identifier:
            - gard_partitions: number of entries under "trees" in the GARD JSON (or "NA").
            - fubar_sel_props: proportion of selected sites which are positively selected (or "NA").
            - fubar_sel_sites: number of sites with posterior probability >= 0.9 for either alpha > beta
                               or alpha < beta (or "NA").
            - fubar_deba: average of (beta - alpha) across sites (or "NA").
            - busted_pval: BUSTED test p-value (or "NA").
    On any unexpected failure the error is logged / written to stderr and the program exits with status 1.
    *******************************************************************************************************************
    """
    try:
        hyphy_inputs = []
        for caf in os.listdir(codo_algn_dir):
            if not caf.endswith(".msa.fna"):
                continue
            hg = caf.split(".msa.fna")[0]
            hyphy_inputs.append(
                [
                    hg,
                    os.path.join(codo_algn_dir, caf),
                    os.path.join(tree_dir, hg + ".tre"),
                    os.path.join(gard_results_dir, hg + ".json"),
                    os.path.join(gard_results_dir, hg + ".best"),
                    fubar_results_dir,
                    busted_results_dir,
                    skip_gard,
                    skip_busted,
                    gard_mode,
                    gard_timeout,
                ]
            )

        msg = (
            "Running HyPhy recombination / selection analyses for %d ortholog groups"
            % len(hyphy_inputs)
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Use robust error handling for HyPhy analysis
        result_summary = util.robust_multiprocess_executor(
            worker_function=individual_hyphy_run,
            inputs=hyphy_inputs,
            pool_size=threads,
            error_strategy="report_and_continue",  # Continue even if some HyPhy runs fail
            log_object=log_object,
            description="HyPhy recombination & selection analysis"
        )

        total_processed = result_summary['total_processed']
        if total_processed > 0:
            success_prop = result_summary['success_count'] / total_processed
            msg = f"{success_prop*100.0}% of HyPhy runs were successful."
            sys.stdout.write(msg + '\n')
            log_object.info(msg)
        else:
            # nothing was processed (e.g. no codon alignments) - avoid dividing by zero and aborting
            msg = "No HyPhy runs were processed."
            sys.stdout.write(msg + '\n')
            log_object.warning(msg)

        gard_partitions: Dict[str, Any] = {}
        for f in os.listdir(gard_results_dir):
            if not f.endswith(".json"):
                continue
            hg = f.split(".json")[0]
            gard_json_result = os.path.join(gard_results_dir, f)
            # tiny files are treated as empty / failed GARD outputs
            if os.path.getsize(gard_json_result) < 100:
                continue
            try:
                with open(gard_json_result) as ogjr:
                    gard_results = json.load(ogjr)
                number_of_partitions = len(gard_results["trees"])
            except KeyError:
                number_of_partitions = "NA"
                log_object.warning(f"GARD results for {hg} missing 'trees' key - potentially due to timeout.")
            except json.JSONDecodeError:
                # incomplete JSON (e.g. GARD interrupted mid-write) should not abort the whole analysis
                number_of_partitions = "NA"
                log_object.warning(f"GARD results for {hg} could not be parsed as JSON - potentially due to timeout.")
            gard_partitions[hg] = number_of_partitions

        fubar_sel_props: Dict[str, Any] = {}
        fubar_sel_sites: Dict[str, Any] = {}
        fubar_deba: Dict[str, Any] = {}
        for f in os.listdir(fubar_results_dir):
            if not f.endswith(".json"):
                continue
            # results are named after the alignment FUBAR was run on: either the original codon
            # alignment (*.msa.fna) or the GARD output (*.best)
            hg = f.split(".msa.fna.FUBAR.json")[0]
            if f.endswith(".best.FUBAR.json"):
                hg = f.split(".best.FUBAR.json")[0]
            fubar_json_result = os.path.join(fubar_results_dir, f)
            try:
                with open(fubar_json_result) as ofjr:
                    fubar_results = json.load(ofjr)
                pos_selected_sites = 0
                neg_selected_sites = 0
                sum_deba = 0
                tot_sites = 0
                mle_content = fubar_results["MLE"]["content"]
                for partition in mle_content:
                    for site_mle_info in mle_content[partition]:
                        tot_sites += 1
                        (
                            alpha,
                            beta,
                            _diff,
                            prob_agb,
                            prob_alb,
                            _bayesfactor,
                            _,
                            _,
                        ) = site_mle_info
                        sum_deba += beta - alpha
                        # prob_agb / prob_alb: posterior probabilities of alpha > beta / alpha < beta
                        if prob_agb >= 0.9:
                            neg_selected_sites += 1
                        if prob_alb >= 0.9:
                            pos_selected_sites += 1
                tot_selected_sites = pos_selected_sites + neg_selected_sites
                prop_selected_sites_positive = "NA"
                if tot_selected_sites >= 1:
                    prop_selected_sites_positive = float(
                        pos_selected_sites
                    ) / float(tot_selected_sites)
                fubar_sel_props[hg] = prop_selected_sites_positive
                # TODO: make this a proportion (divide by tot_sites) - more useful!!
                fubar_sel_sites[hg] = tot_selected_sites
                avg_deba = "NA"
                if tot_sites > 0:
                    avg_deba = sum_deba / float(tot_sites)
                fubar_deba[hg] = avg_deba
                # TODO: process "grid" field in FUBAR results to get most probable dN / dS ratio
            except Exception:
                # best-effort: unreadable / unexpected FUBAR output is reported as missing values
                fubar_sel_props[hg] = "NA"
                fubar_sel_sites[hg] = "NA"
                fubar_deba[hg] = "NA"

        busted_pval: Dict[str, Any] = {}
        for f in os.listdir(busted_results_dir):
            if not f.endswith(".json"):
                continue
            hg = f.split(".msa.fna.BUSTED.json")[0]
            if f.endswith(".best.BUSTED.json"):
                hg = f.split(".best.BUSTED.json")[0]
            busted_json_file = os.path.join(busted_results_dir, f)
            try:
                with open(busted_json_file) as objr:
                    busted_results = json.load(objr)
                busted_pval[hg] = float(busted_results["test results"]["p-value"])
            except Exception:
                # best-effort: unreadable / unexpected BUSTED output is reported as a missing value
                busted_pval[hg] = "NA"

        return [
            gard_partitions,
            fubar_sel_props,
            fubar_sel_sites,
            fubar_deba,
            busted_pval,
        ]
    except Exception as e:
        msg = "Issues with running HyPhy GARD, BUSTED, or FUBAR analyses."
        sys.stderr.write(msg + "\n")
        log_object.error(msg)
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def determine_seq_sim_protein_alignment(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    This function computes the sequence similarity / identity between proteins in a MSA. For each pair of samples
    (the part of the sequence identifier before the first "|"), the highest identity observed between any of their
    proteins is kept, and the resulting nested dictionary is pickled to the output file.
    *******************************************************************************************************************
    Parameters:
    - inputs: A list which can be expanded to the following items:
            - hg: The ortholog group identifier.
            - protein_alignment_file: The protein multiple sequence alignment file for the ortholog group in FASTA format.
            - outf: The output file where to write pairwise sequence similarities.
    *******************************************************************************************************************
    Returns:
    - A tuple (status, detail): ('success', None) if the similarities were written, otherwise ('error', '<message>').
    *******************************************************************************************************************
    """
    # bound before the try block so the error message can always be built, even if unpacking inputs fails
    hg = "unknown"
    try:
        use_only_core = True  # hardcoded true at the moment
        hg, protein_alignment_file, outf = inputs
        protein_sequences: Dict[str, Any] = {}
        with open(protein_alignment_file) as ocaf:
            for rec in SeqIO.parse(ocaf, "fasta"):
                protein_sequences[rec.id] = str(rec.seq).upper()

        pair_seq_matching = defaultdict(lambda: defaultdict(lambda: 0.0))
        # every unordered pair of sequences exactly once, in sorted identifier order
        for g1, g2 in itertools.combinations(sorted(protein_sequences), 2):
            s1 = g1.split("|")[0]
            s2 = g2.split("|")[0]
            # only compare proteins from different samples
            if s1 == s2:
                continue
            g1s = protein_sequences[g1]
            g2s = protein_sequences[g2]
            tot_comp_pos = 0
            match_pos = 0
            for pos, g1a in enumerate(g1s):
                g2a = g2s[pos]
                # columns where both sequences are gaps are never informative
                if g1a == "-" and g2a == "-":
                    continue
                # in core mode, columns with a gap in either sequence are ignored as well
                if use_only_core and (g1a == "-" or g2a == "-"):
                    continue
                tot_comp_pos += 1
                if g1a == g2a:
                    match_pos += 1
            general_matching_percentage = 0.0
            if tot_comp_pos > 0:
                general_matching_percentage = float(match_pos) / float(
                    tot_comp_pos
                )
            # keep the best identity seen so far for this pair of samples (stored symmetrically)
            if (
                pair_seq_matching[s1][s2] < general_matching_percentage
                and pair_seq_matching[s2][s1] < general_matching_percentage
            ):
                pair_seq_matching[s1][s2] = general_matching_percentage
                pair_seq_matching[s2][s1] = general_matching_percentage

        # convert to a regular dict before pickling (the defaultdict factories are lambdas)
        pair_seq_matching_normal = default_to_regular(pair_seq_matching)
        with open(outf, "wb") as pickle_file:
            pickle.dump(
                pair_seq_matching_normal,
                pickle_file,
                protocol=pickle.HIGHEST_PROTOCOL,
            )

        return ('success', None)

    except Exception as e:
        error_msg = f"Error in determine_seq_sim_protein_alignment for ortholog group {hg}: {str(e)}"
        return ('error', error_msg)

def compute_beta_rd_gc(prot_algn_dir, evo_dir, log_object, threads=1) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Description:
    This function computes the BetaRD - gc statistic for each ortholog group - which is an estimate of how sequence
    similarity varies for the ortholog group in relation to other ortholog groups from the gene cluster.
    Note, Beta-RD gene cluster statistic here is being computed in a different manner than what we did in lsaBGC, it is
    a different statistic.

    For every pair of samples in an ortholog group, the pairwise similarity in that ortholog group is divided by the
    median similarity of the same sample pair across all ortholog groups. Sample pairs whose gene-cluster-wide median
    similarity is zero are skipped, because the ratio is undefined for them.
    *******************************************************************************************************************
    Parameters:
    - prot_algn_dir: The directory with protein alignments for ortholog groups.
    - evo_dir: The workspace / directory where evolutionary analyses are to be performed under.
    - log_object: A logging object.
    - threads: The number of threads to use.
    *******************************************************************************************************************
    Returns:
    - A tuple of two dictionaries:
            - hg_med_beta_rd: A dictionary mapping ortholog groups to the median BetaRD-gc statistic.
            - hg_max_beta_rd: A dictionary mapping ortholog groups to the max BetaRD-gc statistic.
      Ortholog groups without any usable sample pair are absent from both dictionaries.
    *******************************************************************************************************************
    Notes:
    - On any unexpected error the issue is reported to stderr / the log and the program exits with status 1.
    *******************************************************************************************************************
    """
    evo_dir = os.path.abspath(evo_dir) + "/"
    brd_results_dir = evo_dir + "BetaRDgc_Calculations/"
    util.setup_ready_directory([brd_results_dir])
    hg_med_beta_rd: Dict[str, Any] = {}
    hg_max_beta_rd: Dict[str, Any] = {}
    try:
        inputs = []
        for f in os.listdir(prot_algn_dir):
            hg = f.split(".msa.faa")[0]
            outf = brd_results_dir + hg + ".sims.pkl"
            # os.path.join works whether or not prot_algn_dir ends with a separator.
            inputs.append([hg, os.path.join(prot_algn_dir, f), outf])

        msg = f"Determining Beta - RDgc statistic for {len(inputs)} ortholog groups"
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Use robust error handling for sequence similarity analysis
        result_summary = util.robust_multiprocess_executor(
            worker_function=determine_seq_sim_protein_alignment,
            inputs=inputs,
            pool_size=threads,
            error_strategy="report_and_continue",  # Continue even if some similarity analysis fails
            log_object=log_object,
            description="protein sequence similarity analysis"
        )

        # Guard the percentage: nothing processed would otherwise raise ZeroDivisionError.
        total_processed = result_summary['total_processed']
        if total_processed > 0:
            success_prop = result_summary['success_count'] / total_processed
            msg = f"{success_prop*100.0}% of protein sequence similarity analysis runs were successful."
        else:
            msg = "No protein alignments were available for sequence similarity analysis."
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        hg_sims_dict: Dict[str, Any] = {}
        # sample -> sample -> list of similarities, one entry per ortholog group featuring both samples
        gc_wide_sims_dict = defaultdict(lambda: defaultdict(list))
        for f in os.listdir(brd_results_dir):
            hg = f.split(".sims.pkl")[0]
            # NOTE: pickle.load must only ever see files written by the worker function above;
            # never point this directory at untrusted data.
            with open(brd_results_dir + f, "rb") as handle:
                sims_dict = pickle.load(handle)
            if len(sims_dict) < 2:
                continue
            hg_sims_dict[hg] = sims_dict
            samples = sorted(sims_dict)
            for s1 in samples:
                for s2 in samples:
                    if s1 == s2:
                        # Self comparisons are never consulted below.
                        continue
                    gc_wide_sims_dict[s1][s2].append(sims_dict[s1].get(s2, 0.0))

        for hg, sims in hg_sims_dict.items():
            brdgc = []
            for s1, s2 in itertools.combinations(sorted(sims), 2):
                gc_wide_median = statistics.median(gc_wide_sims_dict[s1][s2])
                if gc_wide_median == 0:
                    # Ratio is undefined when the pair shows no similarity gene-cluster-wide.
                    continue
                brdgc.append(sims[s1][s2] / float(gc_wide_median))
            if brdgc:
                hg_med_beta_rd[hg] = statistics.median(brdgc)
                hg_max_beta_rd[hg] = max(brdgc)
    except Exception as e:
        sys.stderr.write(
            "Issues with calculating Beta-RD gene cluster for ortholog groups.\n"
        )
        log_object.error(
            "Issues with calculating Beta-RD gene cluster for ortholog groups."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)
    return hg_med_beta_rd, hg_max_beta_rd


def calculate_msa_entropy(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    This function computes the average per-site entropy for a nucleotide / codon MSA of an individual ortholog group.
    Sites where 10% or more of the sequences carry something other than A, C, G or T are ignored. For the remaining
    sites the entropy (base 4) of the A / C / G / T frequencies is calculated and averaged over all accounted sites.
    The result is written to the output file as "<hg>\\t<average entropy>", where the value is "NA" if no site
    qualified.
    *******************************************************************************************************************
    Parameters:
    - input: A list which can be expanded to the following items:
            - hg: The ortholog group identifier.
            - nucl_algn_fasta: The nucleotide / codon multiple sequence alignment file for the ortholog group in FASTA format.
            - outf: The output file where to write the average entropy calculated.
    *******************************************************************************************************************
    Returns:
    - ('success', None) if the output file was written, otherwise ('error', <message describing the failure>).
    *******************************************************************************************************************
    """
    hg, nucl_algn_fasta, outf = inputs
    try:
        with open(nucl_algn_fasta) as onaf:
            seqs = [str(rec.seq) for rec in SeqIO.parse(onaf, "fasta")]

        nucleotides = ("A", "C", "G", "T")
        accounted_sites = 0
        all_entropy = 0.0
        # zip(*seqs) walks the alignment column by column.
        for column in zip(*seqs):
            filt_als = [al for al in column if al in nucleotides]
            missing_prop = (len(column) - len(filt_als)) / float(len(column))
            if missing_prop >= 0.1:
                continue
            # filt_als cannot be empty here: more than 90% of the column is A/C/G/T.
            freqs = [
                filt_als.count(nt) / float(len(filt_als)) for nt in nucleotides
            ]
            all_entropy += stats.entropy(freqs, base=4)
            accounted_sites += 1

        avg_entropy = "NA"
        if accounted_sites > 0:
            avg_entropy = all_entropy / accounted_sites
        with open(outf, "w") as outf_handle:
            outf_handle.write(f"{hg}\t{avg_entropy}\n")
        return ('success', None)
    except Exception as e:
        error_msg = f"Error calculating MSA entropy for ortholog group {hg}: {str(e)}"
        return ('error', error_msg)

def run_entropy_analysis(
    codo_algn_trim_dir, upst_algn_dir, evo_dir, log_object, threads=1
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Description:
    This function computes the average entropy statistic for ortholog groups, both for their trimmed codon
    alignments and for the alignments of their upstream regions. Only files ending in ".msa.fna" are considered.
    *******************************************************************************************************************
    Parameters:
    - codo_algn_trim_dir: The directory with trimmed codon alignments for ortholog groups.
    - upst_algn_dir: The directory with upstream nucleotide alignments for ortholog groups.
    - evo_dir: The workspace / directory where evolutionary analyses are to be performed under.
    - log_object: A logging object.
    - threads: The number of threads to use.
    *******************************************************************************************************************
    Returns:
    - A tuple of two dictionaries:
            - hg_entropy: A dictionary mapping ortholog groups to the average entropy statistic.
            - hg_upst_entropy: A dictionary mapping ortholog groups to the upstream sequence average entropy statistic.
      Values are kept as the strings read back from the per-ortholog-group result files (possibly "NA").
    *******************************************************************************************************************
    Notes:
    - On any unexpected error the issue is reported to stderr / the log and the program exits with status 1.
    *******************************************************************************************************************
    """
    try:
        evo_dir = os.path.abspath(evo_dir) + "/"
        entropy_res_dir = evo_dir + "Entropy_Calculations/"
        util.setup_ready_directory([entropy_res_dir])

        # Codon alignments first, then upstream alignments; the result file suffix tells them apart later.
        inputs = []
        for algn_dir, result_suffix in (
            (codo_algn_trim_dir, "_codon.txt"),
            (upst_algn_dir, "_upstream.txt"),
        ):
            for f in os.listdir(algn_dir):
                if not f.endswith(".msa.fna"):
                    continue
                hg = f.split(".msa.fna")[0]
                # os.path.join works whether or not the directory ends with a separator.
                algn_file = os.path.join(algn_dir, f)
                outf = entropy_res_dir + hg + result_suffix
                inputs.append([hg, algn_file, outf])

        msg = "Computing sequence and upstream sequence entropy for ortholog groups"
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Use robust error handling for MSA entropy calculation
        result_summary = util.robust_multiprocess_executor(
            worker_function=calculate_msa_entropy,
            inputs=inputs,
            pool_size=threads,
            error_strategy="report_and_continue",  # Continue even if some entropy calculations fail
            log_object=log_object,
            description="MSA entropy calculation"
        )

        # Guard the percentage: nothing processed would otherwise raise ZeroDivisionError.
        total_processed = result_summary['total_processed']
        if total_processed > 0:
            success_prop = result_summary['success_count'] / total_processed
            msg = f"{success_prop*100.0}% of MSA entropy calculation runs were successful."
        else:
            msg = "No alignments were available for MSA entropy calculation."
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        hg_entropy: Dict[str, Any] = {}
        hg_upst_entropy: Dict[str, Any] = {}
        for f in os.listdir(entropy_res_dir):
            with open(entropy_res_dir + f) as oef:
                for line in oef:
                    line = line.strip()
                    hg, ep = line.split("\t")
                    if f.endswith("_upstream.txt"):
                        hg_upst_entropy[hg] = ep
                    elif f.endswith("_codon.txt"):
                        hg_entropy[hg] = ep
        return hg_entropy, hg_upst_entropy
    except Exception as e:
        sys.stderr.write(
            "Issues with calculating entropy for ortholog groups or their upstream regions.\n"
        )
        log_object.error(
            "Issues with calculating entropy for ortholog groups or their upstream regions."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def _ambiguous_site_proportions(algn_dir) -> Dict[str, Any]:
    """
    Map each ortholog group with a ".msa.fna" alignment in algn_dir to its proportion of ambiguous sites.

    A site is ambiguous when 10% or more of the sequences carry something other than A, C, G or T at it.
    Alignments without any site are mapped to NaN.
    """
    unambiguous = {"A", "C", "G", "T"}
    amb_prop_by_hg: Dict[str, Any] = {}
    for algn_name in os.listdir(algn_dir):
        if not algn_name.endswith(".msa.fna"):
            continue
        hg = algn_name.split(".msa.fna")[0]
        # os.path.join works whether or not algn_dir ends with a separator.
        with open(os.path.join(algn_dir, algn_name)) as algn_handle:
            sequences = [str(rec.seq) for rec in SeqIO.parse(algn_handle, "fasta")]
        tot = 0
        amb = 0
        # zip(*sequences) walks the alignment column by column.
        for column in zip(*sequences):
            tot += 1
            amb_count = sum(1 for x in column if x not in unambiguous)
            if amb_count / float(len(column)) >= 0.1:
                amb += 1
        amb_prop_by_hg[hg] = float(amb) / float(tot) if tot else float("nan")
    return amb_prop_by_hg


def calculate_ambiguity(codo_algn_dir, codo_algn_trim_dir, log_object) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Description:
    This function computes the proportion of ambiguous sites (>=10% of sequences not A, C, G or T, e.g. gaps) in full
    and trimmed codon alignments.
    *******************************************************************************************************************
    Parameters:
    - codo_algn_dir: The directory with codon alignments for ortholog groups.
    - codo_algn_trim_dir: The directory with trimmed codon alignments for ortholog groups.
    - log_object: A logging object.
    *******************************************************************************************************************
    Returns:
    - A tuple of two dictionaries:
            - full_amb_prop: A dictionary mapping ortholog groups to the proportion of ambiguous sites in the full codon
                             alignment.
            - trim_amb_prop: A dictionary mapping ortholog groups to the proportion of ambiguous sites in the trimmed codon
                             alignment.
      Alignments without any site are reported as NaN.
    *******************************************************************************************************************
    Notes:
    - On any unexpected error the issue is reported to stderr / the log and the program exits with status 1.
    *******************************************************************************************************************
    """
    full_amb_prop: Dict[str, Any] = {}
    trim_amb_prop: Dict[str, Any] = {}
    try:
        full_amb_prop = _ambiguous_site_proportions(codo_algn_dir)
        trim_amb_prop = _ambiguous_site_proportions(codo_algn_trim_dir)
    except Exception as e:
        sys.stderr.write(
            "Issues with calculating ambiguity for full or trimmed codon alignments.\n"
        )
        log_object.error(
            "Issues with calculating ambiguity for full or trimmed codon alignments."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)
    return full_amb_prop, trim_amb_prop


def run_tajimas_d_analysis_per_hg(inputs) -> Tuple[str, Optional[str]]:
    """
    Description:
    Worker which computes Tajima's D and the proportion of segregating sites for a single ortholog group and writes
    them, tab-separated after the ortholog group identifier, as one line to the output file. Both values are
    reported as "NA" unless the alignment holds at least 4 sequences and is longer than 60 positions.
    *******************************************************************************************************************
    Parameters:
    - input: A list which can be expanded to the following items:
            - hg: The ortholog group identifier.
            - trim_codon_align: The trimmed codon multiple sequence alignment file for the ortholog group in FASTA format.
            - outf: The output file where to write the Tajima's D statistic and the proportion of segregating sites for the
                    ortholog group.
    *******************************************************************************************************************
    Returns:
    - ('success', None) if the result line was written, otherwise ('error', <message describing the failure>).
    *******************************************************************************************************************
    """
    hg, trim_codon_align, outf = inputs
    try:
        # The result file is opened (and thereby created) before the alignment is read.
        with open(outf, "w") as result_handle:
            with open(trim_codon_align) as alignment_handle:
                aligned_seqs = [
                    str(record.seq)
                    for record in SeqIO.parse(alignment_handle, "fasta")
                ]

            taj_d = "NA"
            seg_sites_prop = "NA"
            # Require at least 4 sequences and more than 60 alignment positions.
            has_enough_data = len(aligned_seqs) >= 4 and len(aligned_seqs[0]) > 60
            if has_enough_data:
                taj_d, seg_sites = calculate_tajimas_d(aligned_seqs)
                seg_sites_prop = seg_sites / len(aligned_seqs[0])

            result_handle.write(f"{hg}\t{taj_d}\t{seg_sites_prop}\n")
    except Exception as e:
        return (
            'error',
            f"Issues with calculating Tajima's D for ortholog group {hg}: {str(e)}",
        )
    return ('success', None)


def determine_bgc_and_viral_scores(pfam_annotations, log_object) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Description:
    Maps Pfam annotations per homolog group to BGC and viral scores from GECCO weights and V-scores, respectively.
    For each homolog group the highest score among its Pfam hits is reported. The score files "GECCO_Weights.txt"
    and "VScoreDataNormalized.csv" are looked up in the directory given by the ZOL_DATA_PATH environment variable.
    ********************************************************************************************************************
    Parameters:
    - pfam_annotations: dictionary of pfam annotations for homolog groups; pfam_annotations[hg]["hits"] is iterated
                        for Pfam identifiers (an optional ".<version>" suffix is ignored).
    - log_object: a Python logging object.
    ********************************************************************************************************************
    Returns:
    - gecco_weights: dictionary mapping homolog groups to the maximum GECCO weight over their Pfam hits.
    - vscores: dictionary mapping homolog groups to the maximum V-score over their Pfam hits.
    Both dictionaries yield "NA" for homolog groups without a scored Pfam hit (and for unknown keys). If the score
    files cannot be located a warning is written to stderr and every lookup yields "NA".
    ********************************************************************************************************************
    Notes:
    - On an error while parsing the score files the issue is reported to stderr / the log and the program exits
      with status 1.
    ********************************************************************************************************************
    """

    gecco_weights = defaultdict(lambda: "NA")
    vscores = defaultdict(lambda: "NA")

    # os.getenv returns None when the variable is unset, which str() turns into "None".
    zol_data_directory = str(os.getenv("ZOL_DATA_PATH")).strip()
    gecco_weights_file = None
    vscore_file = None
    if zol_data_directory != "None":
        zol_data_directory = os.path.abspath(zol_data_directory) + "/"
        gecco_weights_file = zol_data_directory + "GECCO_Weights.txt"
        vscore_file = zol_data_directory + "VScoreDataNormalized.csv"

    if (
        gecco_weights_file is None
        or vscore_file is None
        or not os.path.isfile(gecco_weights_file)
        or not os.path.isfile(vscore_file)
    ):
        sys.stderr.write(
            "Warning: weight / score files do not appear to be setup or setup properly!\n"
        )
        return gecco_weights, vscores

    pf_vscores: Dict[str, float] = {}
    pf_gecco_weights: Dict[str, float] = {}
    try:
        with open(vscore_file) as ovsf:
            for i, ls in enumerate(csv.reader(ovsf)):
                # Skip the header row and blank lines (csv.reader yields [] for them).
                if i == 0 or not ls:
                    continue
                if not ls[0].startswith("PF"):
                    continue
                pf = ls[0].strip().split(".")[0]
                pf_vscores[pf] = float(ls[2])

        with open(gecco_weights_file) as ogwf:
            for line in ogwf:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines.
                    continue
                pf, weight = line.split("\t")
                pf = pf.strip().split(".")[0]
                pf_gecco_weights[pf] = float(weight)

        for hg in pfam_annotations:
            # None marks "no scored hit seen yet", so every real score can win the max,
            # however low it is.
            max_vscore = None
            max_bgc = None
            for pf in pfam_annotations[hg]["hits"]:
                if not pf.startswith("PF"):
                    continue
                pf = pf.split(".")[0]

                vscore = pf_vscores.get(pf)
                if vscore is not None and (max_vscore is None or vscore > max_vscore):
                    max_vscore = vscore
                weight = pf_gecco_weights.get(pf)
                if weight is not None and (max_bgc is None or weight > max_bgc):
                    max_bgc = weight

            gecco_weights[hg] = "NA" if max_bgc is None else max_bgc
            vscores[hg] = "NA" if max_vscore is None else max_vscore
        return gecco_weights, vscores

    except Exception as e:
        sys.stderr.write(
            "Issues with mapping pfam domain annotations to BGC and viral scores from GECCO and vScore.\n"
        )
        log_object.error(
            "Issues with mapping pfam domain annotations to BGC and viral scores from GECCO and vScore."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def run_tajimas_d_analysis(codo_algn_trim_dir, evo_dir, log_object, threads=1) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Description:
    This function runs Tajima's D analysis for all ortholog groups with a trimmed codon alignment (files ending in
    ".msa.fna") and collects the per-ortholog-group results.
    ********************************************************************************************************************
    Parameters:
    - codo_algn_trim_dir: The directory where trimmed codon alignments are stored for ortholog groups.
    - evo_dir: The workspace / directory where Tajima's D analyses should be performed under.
    - log_object: A logging object.
    - threads: The number of threads to use.
    ********************************************************************************************************************
    Returns:
    - A tuple with two items:
            - hg_tajimas_d: A dictionary mapping ortholog groups to their Tajima's D estimates.
            - hg_seg_sites_prop: A dictionary mapping ortholog groups to the proportion of segregating sites in their
                                 trimmed codon alignments.
      Values are kept as the strings read back from the per-ortholog-group result files (possibly "NA").
    ********************************************************************************************************************
    Notes:
    - On any unexpected error the issue is reported to stderr / the log and the program exits with status 1.
    ********************************************************************************************************************
    """
    try:
        evo_dir = os.path.abspath(evo_dir) + "/"
        codo_algn_trim_dir = os.path.abspath(codo_algn_trim_dir) + "/"

        tajd_resdir = evo_dir + "TajimasD_and_SegSites_Calculations/"
        util.setup_ready_directory([tajd_resdir])

        inputs = []
        for catf in os.listdir(codo_algn_trim_dir):
            if not catf.endswith(".msa.fna"):
                continue
            hg = catf.split(".msa.fna")[0]
            trim_codon_align = codo_algn_trim_dir + catf
            outf = tajd_resdir + hg + ".txt"
            inputs.append([hg, trim_codon_align, outf])

        msg = (
            "Computing Tajima's D statistic (using trimmed codon alignments) "
            f"for {len(inputs)} ortholog groups"
        )
        log_object.info(msg)
        sys.stdout.write(msg + "\n")

        # Use robust error handling for Tajima's D analysis
        result_summary = util.robust_multiprocess_executor(
            worker_function=run_tajimas_d_analysis_per_hg,
            inputs=inputs,
            pool_size=threads,
            error_strategy="report_and_continue",
            log_object=log_object,
            description="Tajima's D analysis"
        )

        # Guard the percentage: nothing processed would otherwise raise ZeroDivisionError.
        total_processed = result_summary['total_processed']
        if total_processed > 0:
            success_prop = result_summary['success_count'] / total_processed
            msg = f"{success_prop*100.0}% of Tajima's D computations were successful."
        else:
            msg = "No trimmed codon alignments were available for Tajima's D computations."
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        hg_tajimas_d: Dict[str, Any] = {}
        hg_seg_sites_prop: Dict[str, Any] = {}
        for f in os.listdir(tajd_resdir):
            with open(tajd_resdir + f) as otf:
                for line in otf:
                    line = line.strip()
                    hg, tajd, ssp = line.split("\t")
                    hg_tajimas_d[hg] = tajd
                    hg_seg_sites_prop[hg] = ssp
        return hg_tajimas_d, hg_seg_sites_prop
    except Exception as e:
        sys.stderr.write(
            "Issues with calculating Tajima's D for ortholog groups.\n"
        )
        log_object.error(
            "Issues with calculating Tajima's D for ortholog groups."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def calculate_tajimas_d(sequences) -> Tuple[Any, int]:
    """
    Description:
    Takes a list of aligned sequences (trimmed codon alignments) and computes Tajima's D statistic. The code for this
    functionality was largely taken from Tom Whalley's Tajima's D implementation in Python and further modified /
    corrected to better match the calculation of the statistic as described by Tajima 1989.

    Positions where either sequence of a pair has a gap ("-") are ignored for that pair.
    ********************************************************************************************************************
    Parameters:
    - sequences: a list of aligned nucleotide sequences (expected to all have the same length).
    ********************************************************************************************************************
    Returns:
    - A tuple (D, S):
            - D: The estimate of Tajima's D for the input, or the string "< 3 segregating sites!" if fewer than three
                 segregating sites were found.
            - S: The number of segregating sites.
    ********************************************************************************************************************
    """
    numseqs = len(sequences)
    divisor = math.comb(numseqs, 2)

    # A single pass over all sequence pairs collects both the pairwise differences (for pi)
    # and the set of alignment positions at which any pair differs (segregating sites).
    differences = 0
    segregating_positions = set()
    for seq_a, seq_b in itertools.combinations(sequences, 2):
        for pos, a in enumerate(seq_a):
            b = seq_b[pos]
            if a != b and a != "-" and b != "-":
                differences += 1
                segregating_positions.add(pos)

    # pi: average number of pairwise differences.
    pi = float(differences) / divisor
    # S: number of segregating sites.
    S = len(segregating_positions)

    # Now we have pi (pairwise differences) and S (number of segregating sites). This gives us
    # 'little d' (pi - S / a1), which still needs to be divided by the square root of its variance.
    a1 = sum([(1.0 / float(i)) for i in range(1, numseqs)])
    a2 = sum([(1.0 / (i**2)) for i in range(1, numseqs)])

    b1 = float(numseqs + 1) / (3 * (numseqs - 1))
    b2 = float(2 * ((numseqs**2) + numseqs + 3)) / (9 * numseqs * (numseqs - 1))

    c1 = b1 - (1.0 / a1)
    c2 = b2 - (float(numseqs + 2) / (a1 * numseqs)) + (float(a2) / (a1**2.0))

    e1 = float(c1) / a1
    e2 = float(c2) / ((a1**2) + a2)

    if S < 3:
        return "< 3 segregating sites!", S

    D = float(pi - (float(S) / a1)) / math.sqrt(
        (e1 * S) + ((e2 * S) * (S - 1))
    )
    return D, S


def compare_focal_and_comparator_gene_clusters(
    focal_genbank_ids,
    comparator_genbank_ids,
    codo_algn_trim_dir,
    upst_algn_dir,
    log_object,
    representative_associated_members=None,
    impute_broad_conservation=False,
) -> Dict[str, Dict[str, Any]]:
    """
    Description:
    This function performs comparative analyses between focal and comparator / complementary gene clusters if requested by
    the user. Will compute conservation percentages between the two sets and also FST for each ortholog group for the
    focal gene cluster and upstream regions.
    *******************************************************************************************************************
    Parameters:
    - focal_genbank_ids: A set of gene cluster identifiers which correspond to the focal set delineated by the user.
    - comparator_genbank_ids: A set of gene cluster identifiers which correspond to the comparator or complementary set
                              either specified by the user or automatically determined.
    - codo_algn_trim_dir: The directory where the trimmed codon alignments for ortholog groups are stored.
    - upst_algn_dir: The directory where the alignments of the upstream sequences for ortholog groups are stored.
    - log_object: A logging object.
    - representative_associated_members: A mapping of gene clusters (including those removed due to redundancy from
                                         dereplication) to representative gene clusters.
    - impute_broad_conservation: Whether to impute conservation comprehensively, regarding members of a set of
                                 similar gene clusters as featuring an ortholog group if their respetive representative
                                 gene cluster had the ortholog group.
    *******************************************************************************************************************
    Returns:
    - comp_stats: A dictionary which contains four dictionaries:
            - prop_foc_with: A dictionary which maps ortholog groups to the proportion of focal gene clusters with them.
            - prop_com_with: A dictionary which maps ortholog groups to the proportion of comparator gene clusters with them.
            - fst: A dictionary which maps ortholog groups to the FST for the focal gene clusters.
            - upst_fst:
    comp_stats[hg] = \
     {'prop_foc_with': prop_foc_with, 'prop_com_with': prop_com_with, 'fst': fst, 'fst_upst': upst_fst}
    *******************************************************************************************************************
    """
    comp_stats: Dict[str, Any] = {}
    try:
        total_foc_broad = set([])
        total_com_broad = set([])
        if (
            impute_broad_conservation
            and representative_associated_members != None
        ):
            for gc in focal_genbank_ids:
                total_foc_broad.add(gc)
                for orthogc in representative_associated_members[gc]:
                    total_foc_broad.add(orthogc)
            for gc in comparator_genbank_ids:
                total_com_broad.add(gc)
                for orthogc in representative_associated_members[gc]:
                    total_com_broad.add(orthogc)
        for f in os.listdir(codo_algn_trim_dir):
            hg = f.split(".msa.fna")[0]
            codo_algn_trim_file = codo_algn_trim_dir + f
            focal_samps_with_hg = set([])
            focal_samps_with_hg_broad = set([])
            compa_samps_with_hg = set([])
            compa_samps_with_hg_broad = set([])
            focal_seqs = []
            compa_seqs = []
            with open(codo_algn_trim_file) as opatf:
                for rec in SeqIO.parse(opatf, "fasta"):
                    sample = rec.id.split("|")[0]
                    seq = str(rec.seq)
                    if sample in focal_genbank_ids:
                        focal_samps_with_hg.add(sample)
                        focal_samps_with_hg_broad.add(sample)
                        if (
                            impute_broad_conservation
                            and representative_associated_members != None
                        ):
                            for orthogc in representative_associated_members[
                                sample
                            ]:
                                focal_samps_with_hg_broad.add(orthogc)
                        focal_seqs.append(seq)
                    elif sample in comparator_genbank_ids:
                        compa_samps_with_hg.add(sample)
                        compa_samps_with_hg_broad.add(sample)
                        if (
                            impute_broad_conservation
                            and representative_associated_members != None
                        ):
                            for orthogc in representative_associated_members[
                                sample
                            ]:
                                compa_samps_with_hg_broad.add(orthogc)
                        compa_seqs.append(seq)

            diff_between = 0
            pw_between = 0
            diff_foc_within = 0
            pw_foc_within = 0
            for i, s1 in enumerate(focal_seqs):
                for j, s2 in enumerate(focal_seqs):
                    if i >= j:
                        continue
                    diff_foc_within += sum(
                        1
                        for a, b in zip(s1, s2)
                        if a != b and a != "-" and b != "-"
                    )
                    pw_foc_within += 1

            for i, s1 in enumerate(focal_seqs):
                for j, s2 in enumerate(compa_seqs):
                    diff_between += sum(
                        1
                        for a, b in zip(s1, s2)
                        if a != b and a != "-" and b != "-"
                    )
                    pw_between += 1

            # Fst estimated according to Hudson, Slatkin and Maddison 1989
            # which is closely related to Nei and Chesser 1983.
            # While the derivations were specific to diploid organisms,
            # the concept of the estimation can more simply be applied
            # to haploid and that is what is assumed here.
            pi_between, pi_within, fst = ["NA"] * 3
            if pw_between > 0 and pw_foc_within > 0:
                pi_between = diff_between / float(pw_between)
                pi_within = (diff_foc_within) / float(pw_foc_within)
                if pi_between > 0:
                    fst = 1.0 - (float(pi_within) / float(pi_between))

            # pi_foc = diff_foc_within / float(pw_foc_within)
            # pi_com = diff_com_within / float(pw_com_within)

            if (
                impute_broad_conservation
                and representative_associated_members != None
            ):
                prop_foc_with = len(focal_samps_with_hg_broad) / float(
                    len(total_foc_broad)
                )
                prop_com_with = len(compa_samps_with_hg_broad) / float(
                    len(total_com_broad)
                )
            else:
                prop_foc_with = len(focal_samps_with_hg) / float(
                    len(focal_genbank_ids)
                )
                prop_com_with = len(compa_samps_with_hg) / float(
                    len(comparator_genbank_ids)
                )

            upst_algn_file = upst_algn_dir + hg + ".msa.fna"
            upst_fst = "NA"
            if (
                os.path.isfile(upst_algn_file)
                and os.path.getsize(upst_algn_file) > 0
            ):
                focal_samps_with_hg = set([])
                compa_samps_with_hg = set([])
                focal_seqs = []
                compa_seqs = []
                with open(upst_algn_file) as oaf:
                    for rec in SeqIO.parse(oaf, "fasta"):
                        sample = rec.id.split("|")[0]
                        seq = str(rec.seq)
                        if sample in focal_genbank_ids:
                            focal_samps_with_hg.add(sample)
                            focal_seqs.append(seq)
                        elif sample in comparator_genbank_ids:
                            compa_samps_with_hg.add(sample)
                            compa_seqs.append(seq)

                diff_between = 0
                pw_between = 0
                diff_foc_within = 0
                diff_com_within = 0
                pw_foc_within = 0
                pw_com_within = 0
                for i, s1 in enumerate(focal_seqs):
                    for j, s2 in enumerate(focal_seqs):
                        if i >= j:
                            continue
                        diff_foc_within += sum(
                            1
                            for a, b in zip(s1, s2)
                            if a != b and a != "-" and b != "-"
                        )
                        pw_foc_within += 1

                for i, s1 in enumerate(compa_seqs):
                    for j, s2 in enumerate(compa_seqs):
                        if i >= j:
                            continue
                        diff_com_within += sum(
                            1
                            for a, b in zip(s1, s2)
                            if a != b and a != "-" and b != "-"
                        )
                        pw_com_within += 1

                for i, s1 in enumerate(focal_seqs):
                    for j, s2 in enumerate(compa_seqs):
                        diff_between += sum(
                            1
                            for a, b in zip(s1, s2)
                            if a != b and a != "-" and b != "-"
                        )
                        pw_between += 1

                # print([diff_between, pw_between, pw_foc_within, pw_com_within, diff_foc_within, diff_com_within]) \
                if pw_between > 0 and pw_foc_within > 0:
                    pi_between = diff_between / float(pw_between)
                    pi_within = (diff_foc_within) / float(pw_foc_within)
                    if pi_between > 0:
                        upst_fst = 1.0 - (float(pi_within) / float(pi_between))

            comp_stats[hg] = {
                "prop_foc_with": prop_foc_with,
                "prop_com_with": prop_com_with,
                "fst": fst,
                "fst_upst": upst_fst,
            }

    except Exception as e:
        sys.stderr.write(
            "Issues with performing comparative analyses between user - defined Gene Cluster groups.\n"
        )
        log_object.error(
            "Issues with performing comparative analyses between user - defined Gene Cluster groups."
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)
    return comp_stats


def create_ortho_group_matrix_from_precomputed_file(
    precomputed_orthogroups_file, fo_prot_dir, ortho_matrix_file, log_object
) -> None:
    """
    Description:
    Create an orthogroup matrix from a file listing ortholog group designations for locus tags.

    The precomputed file is read as tab-separated text where the first column is a locus tag and the second column
    is its orthogroup; blank lines are skipped. Every FASTA record in every file of fo_prot_dir must have an
    identifier of the form "<gene cluster>|<locus tag>", where <gene cluster> is the file name with its ".faa"
    suffix removed. The resulting matrix has one row per orthogroup and one column per gene cluster, with each cell
    listing the (sorted, comma-separated) record identifiers of that orthogroup in that gene cluster.

    On any failure the problem is reported to stderr and the log, and the process exits with status 1.
    ********************************************************************************************************************
    Parameters:
    - precomputed_orthogroups_file: The file listing the precomputed designations of orthogroups per locus tag.
    - fo_prot_dir: The directory of proteins extracted from CDS features in gene cluster GenBank files. File names
                   are appended directly to this string, so it is expected to end with a path separator.
    - ortho_matrix_file: The output file to write the final orthogroup vs. sample matrix used by zol in subsequent steps.
    - log_object: A logging object.
    ********************************************************************************************************************
    """
    try:
        # Map each locus tag to its precomputed orthogroup designation.
        lt_to_og: Dict[str, str] = {}
        try:
            with open(precomputed_orthogroups_file) as opof:
                for line in opof:
                    line = line.strip()
                    if not line:
                        # Tolerate blank (e.g. trailing) lines instead of aborting on them.
                        continue
                    ls = line.split("\t")
                    lt_to_og[ls[0]] = ls[1]
        except Exception as e:
            msg = (
                f"Issues processing precomputed orthogroups designations in the file {precomputed_orthogroups_file}"
            )
            sys.stderr.write(msg + "\n")
            log_object.error(msg)
            # Carry the message and the original cause so the outer handler reports something useful.
            raise RuntimeError(msg) from e

        all_gcs = set()
        og_gc_lts = defaultdict(lambda: defaultdict(set))
        for prot_faa in os.listdir(fo_prot_dir):
            prot_faa_file = fo_prot_dir + prot_faa
            # Gene cluster name is the file name up to its last ".faa" occurrence.
            gc_name = ".faa".join(prot_faa.split(".faa")[:-1])
            all_gcs.add(gc_name)
            id_prefix = gc_name + "|"
            with open(prot_faa_file) as opff:
                for rec in SeqIO.parse(opff, "fasta"):
                    # Explicit check rather than `assert`, which is removed when running with -O.
                    if not rec.id.startswith(id_prefix):
                        raise ValueError(
                            f"Protein identifier {rec.id} in {prot_faa_file} does not start with {id_prefix}"
                        )
                    # Strip only the leading prefix; splitting on it would truncate locus tags
                    # that happen to contain the same text again.
                    lt = rec.id[len(id_prefix):]
                    try:
                        og = lt_to_og[lt]
                    except KeyError as e:
                        msg = (
                            f"Issues finding corresponding orthogroup designation for the locus tag {lt}"
                        )
                        sys.stderr.write(msg + "\n")
                        log_object.error(msg)
                        raise RuntimeError(msg) from e
                    og_gc_lts[og][gc_name].add(rec.id)

        sorted_gcs = sorted(all_gcs)
        # Context manager guarantees the output handle is closed even if a write fails.
        with open(ortho_matrix_file, "w") as outf:
            outf.write("\t".join(["Sample"] + sorted_gcs) + "\n")
            for og in sorted(og_gc_lts):
                gc_to_ids = og_gc_lts[og]
                printlist = [og]
                for gc_name in sorted_gcs:
                    # Sort members so the matrix is reproducible across runs (set order is not).
                    printlist.append(", ".join(sorted(gc_to_ids.get(gc_name, ()))))
                outf.write("\t".join(printlist) + "\n")

    except Exception as e:
        sys.stderr.write(
            "Issues with creating a sample vs. ortholog group matrix file from pre-computed locus tag to orthogroup designations.\n"
        )
        log_object.error(
            "Issues with creating a sample vs. ortholog group matrix file from pre-computed locus tag to orthogroup designations"
        )
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def consolidate_report(
    consensus_prot_seqs_faa,
    comp_stats,
    hg_stats,
    annotations,
    evo_stats,
    type_weights,
    final_report_xlsx,
    final_report_tsv,
    full_prot_clusters,
    log_object,
    domain_mode=False,
    run_hyphy=False,
    ces=False,
) -> None:
    """
    Description:
    This function creates the final consolidated TSV and XLSX report for zol where each row corresponds to an ortholog
    group.
    ********************************************************************************************************************
    Parameters:
    - consensus_prot_seqs_faa: FASTA file with consensus protein sequences for each ortholog group.
    - comp_stats: A dictionary with information from comparative analysis of gene clusters in focal vs. rest / comparator
                  set of gene cluster for each ortholog group.
    - hg_stats: A dictionary with general information / statistics for each ortholog group.
    - annotations: A dictionary containing annotation information for each ortholog group.
    - evo_stats: A dictionary containing evolutionary statistics for each ortholog group.
    - type_weights: A dictionary containing orthogroup specific weights for BGC or viral related.
    - final_report_xlsx: The path to the final XLSX report spreadsheet.
    - final_report_tsv: The path to the final TSV report table.
    - log_object: A logging object.
    - run_hyphy: Whether HyPhy analysis was requested by the user.
    - ces: Whether comprehensive reporting of stats for all ortholog groups, regardless of whether they are found in
           multiple copies in some gene clusters (some statistics are filtered by default for such ortholog groups). \
    ********************************************************************************************************************
    """

    try:
        # Note to self, eventually conditionally display all columns (e.g. FUBAR columns) when requested by user \
        # to avoid having columns with NA values.
        header = [
            "Ortholog Group (OG) ID",
            "OG is Single Copy?",
            "Proportion of Total Gene Clusters with OG",
            "OG Median Length (bp)",
            "OG Consensus Order",
            "OG Consensus Direction",
            "Custom Annotation (E-value)",
            "KO Annotation (E-value)",
            "Pfam Domains",
        ]
        if domain_mode != False:
            header = [
                "Ortholog Group (OG) ID",
                "OG is Single Copy?",
                "Proportion of Total Gene Clusters with OG",
                "OG Median Length (bp)",
                "Single-Linkage Full Protein Cluster",
                "OG Consensus Order",
                "OG Consensus Direction",
                "Custom Annotation (E-value)",
                "KO Annotation (E-value)",
                "Pfam Domains",
            ]
        if comp_stats != None:
            header += [
                "Proportion of Focal Gene Clusters with OG",
                "Proportion of Comparator Gene Clusters with OG",
                "Fixation Index",
                "Upstream Region Fixation Index",
            ]
        header += [
            "Tajima's D",
            "Proportion of Filtered Codon Alignment is Segregating Sites",
            "Entropy",
            "Upstream Region Entropy",
            "Median Beta-RD-gc",
            "Max Beta-RD-gc",
            "Proportion of sites which are highly ambiguous in codon alignment",
            "Proportion of sites which are highly ambiguous in trimmed codon alignment",
            "Median GC",
            "Median GC Skew",
            "BGC score (GECCO weights)",
            "Viral score (V-Score)",
            "Hydrophobicity Mean",
            "Hydrophobicity Std Dev",
            "Aliphatic Index Mean",
            "Aliphatic Index Std Dev",
            "m/z Mean",
            "m/z Std Dev",
        ]
        if run_hyphy:
            header += [
                "GARD Partitions Based on Recombination Breakpoints",
                "Number of Sites Identified as Under Positive or Negative Selection by FUBAR",
                "Average delta(Beta, Alpha) by FUBAR across sites",
                "Proportion of Sites Under Selection which are Positive",
                "P-value for gene-wide episodic selection by BUSTED",
            ]
        header += [
            "PGAP Annotation (E-value)",
            "PaperBLAST Annotation (E-value)",
            "CARD Annotation (E-value)",
            "IS Finder (E-value)",
            "MIBiG Annotation (E-value)",
            "VOG Annotation (E-value)",
            "VFDB Annotation (E-value)",
            "CDS Locus Tags",
            "OG Consensus Sequence",
        ]

        seqs: Dict[str, Any] = {}
        with open(consensus_prot_seqs_faa) as ocpsf:
            for rec in SeqIO.parse(ocpsf, "fasta"):
                seqs[rec.id] = str(rec.seq)

        frt_handle = open(final_report_tsv, "w")
        frt_handle.write("\t".join(header) + "\n")
        # traverse HG in consensus order
        num_rows = 1
        for hg_tup in sorted(
            hg_stats["hg_order_scores"].items(), key=lambda e: e[1][0]
        ):
            hg = hg_tup[0]
            if not hg in hg_stats["hg_median_lengths"]:
                continue
            hg_scs = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_single_copy_status"]
            )
            hg_cons = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_prop_samples"]
            )
            hg_mlen = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_median_lengths"]
            )
            hg_lts = "NA"
            if hg in hg_stats["hg_locus_tags"]:
                hg_lts = "; ".join(sorted(hg_stats["hg_locus_tags"][hg]))
            hg_full_amb = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_full_ambiguity"]
            )
            hg_trim_amb = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_trim_ambiguity"]
            )
            hg_gc = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_median_gc"]
            )
            hg_gcs = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_median_gcskew"]
            )
            hg_peptides_stats = util.gather_value_from_dict_for_homolog_group(
                hg, hg_stats["hg_peptides_stats"]
            )
            hg_ordr = hg_tup[1][0]
            hg_dire = '"' + hg_tup[1][1] + '"'
            (
                hg_tajd,
                hg_entr,
                hg_upst_entr,
                hg_segs,
                hg_gpar,
                hg_ssit,
                hg_deba,
                hg_spro,
                hg_med_brdgc,
                hg_max_brdgc,
                fst,
                fst_upst,
            ) = ["NA"] * 12
            if hg_scs == True or ces:
                hg_tajd = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["tajimas_d"]
                )
                hg_entr = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["entropy"]
                )
                hg_upst_entr = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["entropy_upst"]
                )
                hg_segs = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["segregating_sites_prop"]
                )
                hg_gpar = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["gard_partitions"]
                )
                hg_ssit = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["fubar_sel_sites"]
                )
                hg_spro = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["fubar_sel_props"]
                )
                hg_deba = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["fubar_dba"]
                )
                hg_bpva = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["busted_pvals"]
                )
                hg_med_brdgc = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["median_beta_rd_gc"]
                )
                hg_max_brdgc = util.gather_value_from_dict_for_homolog_group(
                    hg, evo_stats["max_beta_rd_gc"]
                )
            cust_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "custom", annotations
            )
            ko_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "ko", annotations
            )
            pgap_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "pgap", annotations
            )
            pb_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "paperblast", annotations
            )
            card_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "card", annotations
            )
            isf_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "isfinder", annotations
            )
            mibig_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "mibig", annotations
            )
            vog_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "vog", annotations
            )
            vfdb_annot = util.gather_annotation_from_dict_for_homolog_group(
                hg, "vfdb", annotations
            )
            pfam_annots = "NA"
            if "pfam" in annotations and hg in annotations["pfam"]:
                pfam_annots = "; ".join(annotations["pfam"][hg]["descriptions"])
            con_seq = seqs[hg]
            row = [hg, hg_scs, hg_cons, hg_mlen, hg_ordr, hg_dire, cust_annot, ko_annot, pfam_annots]
            if domain_mode:
                fp_clusters = set([])
                for dog in hg_stats["hg_locus_tags"][hg]:
                    fp = "|".join(dog.split("|")[:-2])
                    fpc = full_prot_clusters[fp]
                    fp_clusters.add(fpc)
                try:
                    assert len(fp_clusters) == 1
                except Exception as e:
                    msg = \
     "Issue with validating all domain instances belong to one full protein cluster - shouldn't happen - please report on Github issues!"
                    sys.stderr.write(msg + "\n")
                    log_object.error(msg)

                fp_cluster = list(fp_clusters)[0]
                row = [
                    "D" + hg,
                    hg_scs,
                    hg_cons,
                    hg_mlen,
                    fp_cluster,
                    hg_ordr,
                    hg_dire,
                    cust_annot,
                    ko_annot,
                    pfam_annots,
                ]
            if comp_stats != None:
                fp = comp_stats[hg]["prop_foc_with"]
                cp = comp_stats[hg]["prop_com_with"]
                if hg_scs == True or ces:
                    fst = comp_stats[hg]["fst"]
                    fst_upst = comp_stats[hg]["fst_upst"]
                row += [fp, cp, fst, fst_upst]

            hg_gw = type_weights["bgc"][hg]
            hg_vs = type_weights["viral"][hg]

            row += [
                hg_tajd,
                hg_segs,
                hg_entr,
                hg_upst_entr,
                hg_med_brdgc,
                hg_max_brdgc,
                hg_full_amb,
                hg_trim_amb,
                hg_gc,
                hg_gcs,
                hg_gw,
                hg_vs,
                hg_peptides_stats['hydrophobicity_mean'],
                hg_peptides_stats['hydrophobicity_std'],
                hg_peptides_stats['aliphatic_index_mean'],
                hg_peptides_stats['aliphatic_index_std'],
                hg_peptides_stats['mz_mean'],
                hg_peptides_stats['mz_std'],
            ]
            if run_hyphy:
                row += [hg_gpar, hg_ssit, hg_deba, hg_spro, hg_bpva]

            row += [
                pgap_annot,
                pb_annot,
                card_annot,
                isf_annot,
                mibig_annot,
                vog_annot,
                vfdb_annot,
                hg_lts,
                con_seq,
            ]
            row = [str(x) for x in row]
            frt_handle.write("\t".join(row) + "\n")
            num_rows += 1
        frt_handle.close()

        # Generate Excel spreadsheet
        writer = pd.ExcelWriter(final_report_xlsx, engine="xlsxwriter")
        workbook = writer.book
        dd_sheet = workbook.add_worksheet("Data Dictionary")
        dd_sheet.write(
            0,
            0,
            'Data Dictionary describing columns of "ZoL Results" spreadsheet can be found below and on zol\'s Wiki page at:',
        )
        dd_sheet.write(
            1,
            0,
            "https://github.com/Kalan-Lab/zol/wiki/3.-more-info-on-zol#explanation-of-report",
        )

        wrap_format = workbook.add_format(
            {
                "text_wrap": True,
                "valign": "vcenter",
                "align": "center",
                "border": 1,
            }
        )
        header_format = workbook.add_format(
            {
                "bold": True,
                "text_wrap": True,
                "valign": "top",
                "fg_color": "#FFFFFF",
                "border": 1,
                "border_color": "#000000",
            }
        )

        data_dict_zol = data_dictionary.zol_dd()
        data_dict_zol_df = util.load_table_in_panda_data_frame_from_string(
            data_dict_zol
        )
        worksheet_dd = writer.sheets["Data Dictionary"]
        worksheet_dd.set_column(1, 3, 50)

        for col_num, value in enumerate(data_dict_zol_df.columns.values):
            worksheet_dd.write(3, col_num + 1, value, header_format)

        colnames = ["Column", "Description", "Notes"]
        for index, row in data_dict_zol_df.iterrows():
            row_ind = index + 4 # type: ignore
            format = wrap_format
            for col_ind in range(0, 3):
                col_name = colnames[col_ind]
                worksheet_dd.write(row_ind, col_ind + 1, row[col_name], format)

        numeric_columns = {
            "Proportion of Total Gene Clusters with OG",
            "Proportion of Focal Gene Clusters with OG",
            "Proportion of Comparator Gene Clusters with OG",
            "Fixation Index",
            "Upstream Region Fixation Index",
            "OG Median Length (bp)",
            "OG Consensus Order",
            "Tajima's D",
            "Entropy",
            "Upstream Region Entropy",
            "GARD Partitions Based on Recombination Breakpoints",
            "Number of Sites Identified as Under Positive or Negative Selection by FUBAR",
            "Proportion of Sites Under Selection which are Positive",
            "P-value for gene-wide episodic selection by BUSTED",
            "Median Beta-RD-gc",
            "Max Beta-RD-gc",
            "Proportion of Filtered Codon Alignment is Segregating Sites",
            "Proportion of sites which are highly ambiguous in codon alignment",
            "Proportion of sites which are highly ambiguous in trimmed codon alignment",
            "Average delta(Beta, Alpha) by FUBAR across sites",
            "Median GC",
            "Median GC Skew",
            "BGC score (GECCO weights)",
            "Viral score (V-Score)",
            "Hydrophobicity Mean",
            "Hydrophobicity Std Dev",
            "Aliphatic Index Mean",
            "Aliphatic Index Std Dev",
            "m/z Mean",
            "m/z Std Dev",
        }

        warn_format = workbook.add_format(
            {"bg_color": "#bf241f", "bold": True, "font_color": "#FFFFFF", "border": 1, "border_color": "#DCDCDC"}
        )
        na_format = workbook.add_format(
            {"font_color": "#a6a6a6", "bg_color": "#FFFFFF", "italic": True, "border": 1, "border_color": "#DCDCDC"}
        )
        header_format = workbook.add_format(
            {
                "bold": True,
                "text_wrap": False,
                "valign": "top",
                "fg_color": "#FFFFFF",
                "font_color": "#000000",
                "border": 1,
                "border_color": "#DCDCDC",
            }
        )
        gecco_format = workbook.add_format(
            {"bg_color": "#d5abde", "border": 1, "border_color": "#DCDCDC"}
        )

        results_df = util.load_table_in_panda_data_frame(
            final_report_tsv, numeric_columns
        )
        results_df.to_excel(
            writer, sheet_name="ZoL Results", index=False, na_rep="NA"
        )

        worksheet = writer.sheets["ZoL Results"]
        border_format = workbook.add_format({'border': 1, 'border_color': '#DCDCDC'})
        worksheet.set_column(0, len(results_df.columns) - 1, None, border_format)
        
        # Apply header formatting directly to header row cells
        for col_num in range(len(results_df.columns)):
            worksheet.write(0, col_num, results_df.columns[col_num], header_format)
        worksheet.conditional_format(
            "B2:B" + str(num_rows),
            {
                "type": "cell",
                "criteria": "==",
                "value": '"False"',
                "format": warn_format,
            },
        )
        worksheet.conditional_format(
            f"A2:{util.get_excel_columns()[len(results_df.columns)-1]}{num_rows}",
            {
                "type": "cell",
                "criteria": "==",
                "value": '"NA"',
                "format": na_format,
            },
        )

        excel_cols = util.get_excel_columns()
        col_map = {col: excel_cols[i] for i, col in enumerate(results_df.columns)}

        # prop gene clusters with hg
        col = 'Proportion of Total Gene Clusters with OG'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#CCCCCC",
                    "min_value": 0.0,
                    "max_value": 1.0,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # gene lengths
        col = 'OG Median Length (bp)'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#5A8AC6",
                    "min_value": 100,
                    "max_value": 2500,
                    "min_type": "num",
                    "max_type": "num",
                },
            )

        if comp_stats:
            # prop focal gene clusters with hg
            col = 'Proportion of Focal Gene Clusters with OG'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "2_color_scale",
                        "min_color": "#FFFFFF",
                        "max_color": "#CCCCCC",
                        "min_type": "num",
                        "max_type": "num",
                        "min_value": 0.0,
                        "max_value": 1.0,
                    },
                )
            # prop comparator gene clusters with hg
            col = 'Proportion of Comparator Gene Clusters with OG'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "2_color_scale",
                        "min_color": "#FFFFFF",
                        "max_color": "#CCCCCC",
                        "min_type": "num",
                        "max_type": "num",
                        "min_value": 0.0,
                        "max_value": 1.0,
                    },
                )
            # Fixation Index
            col = 'Fixation Index'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "3_color_scale",
                        "min_color": "#5A8AC6",
                        "mid_color": "#FFFFFF",
                        "max_color": "#D98880",
                        "min_value": 0.0,
                        "mid_value": 0.5,
                        "max_value": 1.0,
                        "min_type": "num",
                        "mid_type": "num",
                        "max_type": "num",
                    },
                )
            # Upstream Region Fixation Index
            col = 'Upstream Region Fixation Index'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "3_color_scale",
                        "min_color": "#5A8AC6",
                        "mid_color": "#FFFFFF",
                        "max_color": "#D98880",
                        "min_value": 0.0,
                        "mid_value": 0.5,
                        "max_value": 1.0,
                        "min_type": "num",
                        "mid_type": "num",
                        "max_type": "num",
                    },
                )

        # Tajima's D
        col = "Tajima's D"
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "3_color_scale",
                    "min_color": "#E6B0AA",
                    "mid_color": "#FFFFFF",
                    "max_color": "#B8CCE4",
                    "min_value": -2.0,
                    "mid_value": 0.0,
                    "max_value": 2.0,
                    "min_type": "num",
                    "mid_type": "num",
                    "max_type": "num",
                },
            )
        # Entropy
        col = 'Entropy'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#F86B6B",
                    "min_value": 0.0,
                    "max_value": 1.0,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # Upstream Entropy
        col = 'Upstream Region Entropy'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#F86B6B",
                    "min_value": 0.0,
                    "max_value": 1.0,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
            
        # Beta-RD
        col1 = 'Median Beta-RD-gc'
        col2 = 'Max Beta-RD-gc'
        if col1 in col_map and col2 in col_map:
            cell_range = f"{col_map[col1]}2:{col_map[col2]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "3_color_scale",
                    "min_color": "#F8696B",
                    "mid_color": "#FFEB84",
                    "max_color": "#63BE7B",
                    "min_value": 0.75,
                    "mid_value": 1.0,
                    "max_value": 1.25,
                    "min_type": "num",
                    "mid_type": "num",
                    "max_type": "num",
                },
            )
        # proportion ambiguous
        col1 = 'Proportion of sites which are highly ambiguous in codon alignment'
        col2 = 'Proportion of sites which are highly ambiguous in trimmed codon alignment'
        if col1 in col_map and col2 in col_map:
            cell_range = f"{col_map[col1]}2:{col_map[col2]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#E6B0AA",
                    "min_value": 0.0,
                    "max_value": 1.0,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        
        if run_hyphy:
            # GARD Partitions
            col = 'GARD Partitions Based on Recombination Breakpoints'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "2_color_scale",
                        "min_color": "#FFFFFF",
                        "max_color": "#F8696B",
                        "min_value": 1,
                        "max_value": 5,
                        "min_type": "num",
                        "max_type": "num",
                    },
                )
            # FUBAR sites
            col = 'Number of Sites Identified as Under Positive or Negative Selection by FUBAR'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "2_color_scale",
                        "min_color": "#FFFFFF",
                        "max_color": "#B8D4B8",
                        "min_value": 0,
                        "max_value": 10,
                        "min_type": "num",
                        "max_type": "num",
                    },
                )
            # FUBAR dba
            col = 'Average delta(Beta, Alpha) by FUBAR across sites'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "3_color_scale",
                        "min_color": "#E6B0AA",
                        "mid_color": "#FFFFFF",
                        "max_color": "#B8D4B8",
                        "min_value": -5,
                        "mid_value": 0,
                        "max_value": 5,
                        "min_type": "num",
                        "mid_type": "num",
                        "max_type": "num",
                    },
                )
            # FUBAR prop
            col = 'Proportion of Sites Under Selection which are Positive'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "3_color_scale",
                        "min_color": "#E6B0AA",
                        "mid_color": "#FFFFFF",
                        "max_color": "#B8D4B8",
                        "min_value": 0,
                        "mid_value": 0.5,
                        "max_value": 1,
                        "min_type": "num",
                        "mid_type": "num",
                        "max_type": "num",
                    },
                )
            # BUSTED pval
            col = 'P-value for gene-wide episodic selection by BUSTED'
            if col in col_map:
                cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
                worksheet.conditional_format(
                    cell_range,
                    {
                        "type": "2_color_scale",
                        "min_color": "#63BE7B",
                        "max_color": "#FFFFFF",
                        "min_value": 0,
                        "max_value": 0.05,
                        "min_type": "num",
                        "max_type": "num",
                    },
                )
        
        # GC
        col = 'Median GC'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#63BE7B",
                    "min_value": 0.0,
                    "max_value": 1.0,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
            
        # Median GC Skew
        col = 'Median GC Skew'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "3_color_scale",
                    "min_color": "#E8D5D5",
                    "mid_color": "#FFFFFF",
                    "max_color": "#D5E8F5",
                    "min_value": -0.2,
                    "mid_value": 0.0,
                    "max_value": 0.2,
                    "min_type": "num",
                    "mid_type": "num",
                    "max_type": "num",
                },
            )
        # BGC score (GECCO)
        col = 'BGC score (GECCO weights)'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "cell",
                    "criteria": ">",
                    "value": 2,
                    "format": gecco_format,
                },
            )
        # Viral score (V-score)
        col = 'Viral score (V-Score)'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#E6B0AA",
                    "min_value": 0,
                    "max_value": 4,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # Hydrophobicity Mean
        col = 'Hydrophobicity Mean'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "3_color_scale",
                    "min_color": "#5A8AC6",
                    "mid_color": "#FFFFFF",
                    "max_color": "#F8696B",
                    "min_value": -2.5,
                    "mid_value": 0,
                    "max_value": 2.5,
                    "min_type": "num",
                    "mid_type": "num",
                    "max_type": "num",
                },
            )
        # Hydrophobicity Std Dev
        col = 'Hydrophobicity Std Dev'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#F8696B",
                    "min_value": 0,
                    "max_value": 2,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # Aliphatic Index Mean
        col = 'Aliphatic Index Mean'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#63BE7B",
                    "min_value": 50,
                    "max_value": 150,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # Aliphatic Index Std Dev
        col = 'Aliphatic Index Std Dev'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#FFFFFF",
                    "max_color": "#F8696B",
                    "min_value": 0,
                    "max_value": 30,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # m/z Mean
        col = 'm/z Mean'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#dcedde",
                    "max_color": "#87bba2",
                    "min_value": 5000,
                    "max_value": 75000,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        # m/z Std Dev
        col = 'm/z Std Dev'
        if col in col_map:
            cell_range = f"{col_map[col]}2:{col_map[col]}{num_rows}"
            worksheet.conditional_format(
                cell_range,
                {
                    "type": "2_color_scale",
                    "min_color": "#dcedde",
                    "max_color": "#87bba2",
                    "min_value": 0,
                    "max_value": 5000,
                    "min_type": "num",
                    "max_type": "num",
                },
            )
        
        worksheet.autofilter(f"A1:{excel_cols[len(results_df.columns) - 1]}{num_rows}")
        if 'Proportion of Total Gene Clusters with OG' in col_map:
            worksheet.filter_column(col_map["Proportion of Total Gene Clusters with OG"], "x >= 0.1")
        workbook.close()
    
    except Exception as e:
        log_object.error("Issues creating consolidated results files.")
        log_object.error(e)
        log_object.error(traceback.format_exc())
        sys.exit(1)


def plot_heatmap(
    hg_stats,
    genbanks,
    plot_result_pdf,
    work_dir,
    log_object,
    height=7,
    width=10,
    full_genbank_labels=False,
) -> None:
    """
    Description:
    This function creates a heatmap figure showing the presence of ortholog groups across gene clusters. It writes
    two tab-delimited track files into work_dir (median lengths per ortholog group and presence / copy count per
    ortholog group and gene cluster), generates an R script via util.cluster_heatmap_r and runs it with Rscript.
    ********************************************************************************************************************
    Parameters:
    - hg_stats: A dictionary containing general statistics for each ortholog group. The keys read here are
                "hg_order_scores" (values are indexed with [0] to get the order), "hg_median_lengths" and
                "hg_locus_tags" (locus tags are expected to look like "<gene cluster>|<locus tag>").
    - genbanks: The set of gene clusters to account for in the heatmap (paths ending in .gbk, .gbff or .genbank).
    - plot_result_pdf: The resulting heatmap plot PDF path.
    - work_dir: The workspace / directory where to store intermediate files for plotting. It is used as a plain
                string prefix, so it needs to end with a path separator.
    - log_object: A logging object.
    - height: The height of the plot in inches.
    - width: The width of the plot in inches.
    - full_genbank_labels: Whether to use the full gene cluster identifiers as labels. Otherwise labels are
                           truncated to their first 20 characters.
    ********************************************************************************************************************
    Notes:
    - Any failure is reported to stderr / the log and terminates the process with exit code 1.
    ********************************************************************************************************************
    """
    try:
        genbank_suffixes = (".gbk", ".gbff", ".genbank")
        representative_genbanks = set()
        for gbk in genbanks:
            # Explicit check instead of an assert so that the validation also runs under "python -O".
            if not gbk.endswith(genbank_suffixes):
                raise ValueError(
                    f"GenBank file {gbk} does not end with one of the expected suffixes: "
                    + ", ".join(genbank_suffixes)
                )
            representative_genbanks.add(".".join(gbk.split("/")[-1].split(".")[:-1]))

        # The label of a gene cluster does not depend on the ortholog group, so determine it once.
        gn_labels = {
            gn: (gn if full_genbank_labels else gn[:20])
            for gn in representative_genbanks
        }

        # create input tracks
        ml_track_file = work_dir + "OG_Median_Length_Info.txt"
        hm_track_file = work_dir + "OG_Heatmap_Info.txt"
        any_og_written = False
        # The context manager guarantees both handles are closed even if writing a track fails midway.
        with open(ml_track_file, "w") as ml_track_handle, open(
            hm_track_file, "w"
        ) as hm_track_handle:
            ml_track_handle.write("og\tog_order\tmed_length\n")
            hm_track_handle.write(
                "og\tog_order\tgenbank\tog_presence\tcopy_count\n"
            )
            for hg, hg_order_info in sorted(
                hg_stats["hg_order_scores"].items(), key=lambda e: e[1][0]
            ):
                if hg not in hg_stats["hg_median_lengths"]:
                    continue
                hg_mlen = hg_stats["hg_median_lengths"][hg]
                hg_lts = hg_stats["hg_locus_tags"][hg]
                hg_ordr = hg_order_info[0]

                # Count copies of the ortholog group per gene cluster of interest.
                sample_copy_counts = defaultdict(int)
                for lt in hg_lts:
                    gn = lt.split("|")[0]
                    if gn in representative_genbanks:
                        sample_copy_counts[gn] += 1
                # Ortholog groups absent from every requested gene cluster are left out of both tracks.
                if not sample_copy_counts:
                    continue

                ml_track_handle.write(
                    f"{hg}\t{hg_ordr}\t{float(hg_mlen) / 1000.0}\n"
                )
                for gn, gn_label in gn_labels.items():
                    count = sample_copy_counts[gn]
                    pres = "1" if count > 0 else "0"
                    # The copy count is only displayed for multi-copy ortholog groups.
                    copy_count = str(count) if count > 1 else ""
                    hm_track_handle.write(
                        f"{hg}\t{hg_ordr}\t{gn_label}\t{pres}\t{copy_count}\n"
                    )
                any_og_written = True

        # Labels only matter once at least one heatmap row was written; truncation must not have merged two
        # distinct gene clusters into the same label.
        if any_og_written and len(set(gn_labels.values())) != len(gn_labels):
            non_unique_msg = 'Non-unique labels resulted from truncating GenBank names. Please rerun zol with the "--full_genbank_labels" argument.'
            log_object.info(non_unique_msg)
            sys.stderr.write(non_unique_msg + "\n")
            sys.exit(1)

        rscript_path = work_dir + "clusterHeatmap.R"
        util.cluster_heatmap_r(
            ml_track_file,
            hm_track_file,
            plot_result_pdf,
            height,
            width,
            rscript_path,
            log_object,
        )
        plot_cmd = ["Rscript", rscript_path]
        util.run_cmd_via_subprocess(plot_cmd, log_object=log_object, check_files=[plot_result_pdf])

    except Exception as e:
        sys.stderr.write("Issues creating heatmap visualization in zol.\n")
        log_object.error("Issues creating heatmap visualization in zol.")
        sys.stderr.write(str(e) + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)


def _na_peptides_stats():
    """Return a fresh record in which every peptides.py statistic is marked as 'NA'."""
    return {
        'hydrophobicity_mean': 'NA',
        'hydrophobicity_std': 'NA',
        'aliphatic_index_mean': 'NA',
        'aliphatic_index_std': 'NA',
        'mz_mean': 'NA',
        'mz_std': 'NA'
    }


def _mean_and_std(values):
    """Return (mean, sample standard deviation) of values; the deviation is 0.0 for a single value."""
    std = statistics.stdev(values) if len(values) > 1 else 0.0
    return statistics.mean(values), std


def compute_peptides_stats(
    orthogroup_matrix_file,
    hg_prot_dir,
    log_object,
) -> Dict[str, Dict[str, float]]:
    """
    Description:
    This function computes peptides.py statistics (hydrophobicity, aliphatic_index, mz) 
    for each ortholog group, calculating mean and standard deviation for each property.
    *******************************************************************************************************************
    Parameters:
    - orthogroup_matrix_file: The ortholog group vs sample matrix file, where cells correspond to locus tag identifiers.
                              The first line is treated as a header and the first tab-delimited column of every
                              other line as the ortholog group ID.
    - hg_prot_dir: The directory of FASTA files with protein sequences for each ortholog group. Used as a plain
                   string prefix ("<hg_prot_dir><og>.faa"), so it needs to end with a path separator.
    - log_object: A logging object.
    *******************************************************************************************************************
    Results:
    - peptides_stats: A dictionary where keys are ortholog group IDs and values are dictionaries
      containing 'hydrophobicity_mean', 'hydrophobicity_std', 'aliphatic_index_mean', 
      'aliphatic_index_std', 'mz_mean', 'mz_std' for each ortholog group. All six values are the string
      'NA' when the protein file is missing, cannot be processed, or yields no usable sequence.
    *******************************************************************************************************************
    Notes:
    - Sequences flagged by peptides' detect_outlier(), or for which a property cannot be computed, are skipped.
    - IOError, ValueError and statistics.StatisticsError raised outside the per-file handling terminate the
      process with exit code 1.
    *******************************************************************************************************************
    """
    try:
        peptides_stats = {}

        # Get list of ortholog groups from matrix file
        ortholog_groups = []
        with open(orthogroup_matrix_file) as omf:
            for i, line in enumerate(omf):
                if i == 0:  # Skip header
                    continue
                # Strip the actual newline character (not the two characters backslash + "n").
                ortholog_groups.append(line.rstrip("\n").split("\t")[0])

        log_object.info(f"Computing peptides.py statistics for {len(ortholog_groups)} ortholog groups")

        for hg in ortholog_groups:
            prot_file = hg_prot_dir + hg + ".faa"

            if not os.path.isfile(prot_file):
                log_object.warning(f"Protein file not found for ortholog group {hg}: {prot_file}")
                peptides_stats[hg] = _na_peptides_stats()
                continue

            # Read protein sequences and compute peptides.py statistics
            hydrophobicity_values = []
            aliphatic_index_values = []
            mz_values = []

            try:
                with open(prot_file) as pf:
                    for rec in SeqIO.parse(pf, "fasta"):
                        seq = str(rec.seq)

                        try:
                            # Create Peptide object
                            peptide = peptides.Peptide(seq)

                            # Detect outliers
                            outlier_result = peptide.detect_outlier()
                            if outlier_result.is_outlier:
                                log_object.info(f"Skipping outlier sequence {rec.id} in {hg}: {', '.join(outlier_result.issues)}")
                                continue

                            # Compute all three properties before recording any of them, so that a failure
                            # partway through cannot leave the three value lists with different lengths.
                            hydrophobicity = peptide.hydrophobicity()
                            aliphatic_idx = peptide.aliphatic_index()
                            # NOTE(review): the values reported under the 'mz_*' keys come from
                            # molecular_weight(), not from an m/z calculation - confirm this is intended.
                            mw = peptide.molecular_weight()

                        except Exception as e:
                            log_object.warning(f"Error computing peptides.py stats for sequence {rec.id} in ortholog group {hg}: {e}")
                            continue

                        hydrophobicity_values.append(hydrophobicity)
                        aliphatic_index_values.append(aliphatic_idx)
                        mz_values.append(mw)

                # Calculate mean and standard deviation
                if hydrophobicity_values:
                    hydrophobicity_mean, hydrophobicity_std = _mean_and_std(hydrophobicity_values)
                    aliphatic_index_mean, aliphatic_index_std = _mean_and_std(aliphatic_index_values)
                    mz_mean, mz_std = _mean_and_std(mz_values)
                    peptides_stats[hg] = {
                        'hydrophobicity_mean': hydrophobicity_mean,
                        'hydrophobicity_std': hydrophobicity_std,
                        'aliphatic_index_mean': aliphatic_index_mean,
                        'aliphatic_index_std': aliphatic_index_std,
                        'mz_mean': mz_mean,
                        'mz_std': mz_std
                    }
                else:
                    peptides_stats[hg] = _na_peptides_stats()

            except Exception as e:
                # A problem with one ortholog group should not prevent the others from being processed.
                log_object.error(f"Error processing protein file for ortholog group {hg}: {e}")
                peptides_stats[hg] = _na_peptides_stats()

        log_object.info("Completed computing peptides.py statistics")
        return peptides_stats

    except ImportError:
        log_object.error("peptides.py library not found. Please install it with: pip install peptides")
        sys.stderr.write("peptides.py library not found. Please install it with: pip install peptides\n")
        sys.exit(1)
    except (IOError, ValueError, statistics.StatisticsError) as e:
        error_msg = f"Issues with computing peptides.py statistics for ortholog groups. Error: {e}"
        log_object.error(error_msg)
        sys.stderr.write(error_msg + "\n")
        sys.stderr.write(traceback.format_exc())
        sys.exit(1)

def run_fegenie(
    input_file,
    output_file,
    log_object,
    threads=1,
):
    """
    Description:
    This function runs the "fegenie" command-line program through util.run_cmd_via_subprocess and reports the
    outcome as a status tuple instead of raising.
    *******************************************************************************************************************
    Parameters:
    - input_file: The value passed to fegenie's "-i" argument.
    - output_file: The value passed to fegenie's "-o" argument; also handed to the subprocess helper as the
                   single entry of check_files (presumably verified to exist afterwards - confirm in util).
    - log_object: A logging object.
    - threads: The value passed to fegenie's "--threads" argument.
    *******************************************************************************************************************
    Results:
    - ('success', None) if the command ran without raising an exception, otherwise ('error', <message>) where
      the message describes the exception.
    *******************************************************************************************************************
    """
    try:
        fegenie_cmd = ["fegenie", "-i", input_file, "-o", output_file]
        fegenie_cmd += ["--threads", str(threads)]
        util.run_cmd_via_subprocess(
            fegenie_cmd,
            log_object=log_object,
            check_files=[output_file],
        )
    except Exception as err:
        # Failures are reported back to the caller rather than propagated.
        return ('error', f"Issues with running fegenie: {err!s}")
    else:
        return ('success', None)
