#!/opt/mambaforge/envs/bioconda/conda-bld/zol_1759109280278/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_/bin/python

"""
Program: prepTG
Author: Rauf Salamzade
Kalan Lab
UW Madison, Department of Medical Microbiology and Immunology
"""

from math import log
from Bio import SeqIO
from rich_argparse import RawTextRichHelpFormatter
from zol import util
import argparse
import os
import shutil
import sys

# BSD 3-Clause License
#
# Copyright (c) 2023-2025, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met: 
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, 
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Set the OMP_NUM_THREADS environment variable to "1" for this process.
os.environ.update({"OMP_NUM_THREADS": "1"})

# Constants

# GTDB releases that the workflow accepts (compared after upper-casing).
VALID_GTDB_RELEASES = {'R226', 'R220'}

# Every premade database is a "<genus>.tar.gz" file attached to a Zenodo
# record, so only the record identifier differs between download links.
_ZENODO_URL_TEMPLATE = 'https://zenodo.org/record/{record}/files/{genus}.tar.gz?download=1'

# (genus, Zenodo record identifier) pairs for the premade databases.
_PREMADE_ZENODO_RECORDS = (
    ('Acinetobacter', '10042148'),
    ('Bacillales', '10044725'),
    ('Corynebacterium', '10032209'),
    ('Enterobacter', '10042148'),
    ('Enterococcus', '10042148'),
    ('Escherichia', '10032209'),
    ('Klebsiella', '10042148'),
    ('Cutibacterium', '10032209'),
    ('Listeria', '10032209'),
    ('Lactobacillus', '10032209'),
    ('Micromonospora', '10044725'),
    ('Mycobacterium', '10032209'),
    ('Pseudomonas', '10042148'),
    ('Salmonella', '10032209'),
    ('Staphylococcus', '10042148'),
    ('Streptomyces', '10044725'),
    ('Neisseria', '10032209'),
    ('Streptococcus', '10032209'),
)

# Maps each premade database name to its download URL.
PREMADE_DOWNLOAD_LINKS = {
    genus: _ZENODO_URL_TEMPLATE.format(record=record, genus=genus)
    for genus, record in _PREMADE_ZENODO_RECORDS
}

# Names accepted when requesting a premade database; taken from the keys of
# the link table so the two can never disagree.
PREMADE_DB_SET = set(PREMADE_DOWNLOAD_LINKS)


def create_parser(argv=None):
    """Build the prepTG command-line interface and parse the arguments.

    Despite its name, this function returns the parsed arguments, not the
    parser object.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace with one attribute per option defined below.
    """
    description = """
Program: prepTG
Author: Rauf Salamzade
Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

Prepares a directory of target genomes for being searched for query gene clusters using fai.

Premade databases of representative genomes are available for the following genera:

Acinetobacter (n = 1,643), Bacillales (n = 3,150), Corynebacterium (n = 726),
Enterobacter (n = 878), Enterococcus (n = 937), Escherichia (n = 2,436),
Klebsiella (n = 1,022), Listeria (n = 353), Mycobacterium (n = 744),
Pseudomonas (n = 2,666), Salmonella (n = 308), Staphylococcus (n = 496),
Streptomyces (n = 1,555), Streptococcus (n = 2,452), Cutibacterium (n = 27),
Neisseria (n = 414), Lactobacillus (n = 541), and Micromonospora (n = 211).

In addition, users can simply request all genomes belonging to a specific species/genus
in GTDB to be downloaded.

---------------------------------------------------------------------------------------
> Example commands:

1. Setup a prepTG database which includes some local genomes in FASTA format

    $ prepTG -i User_Genomes_Directory/ -o prepTG_Database/

2. Setup a prepTG database which includes some local genomes and all Cutibacterium
   granulosum genomes in GTDB:

   $ prepTG -i User_Genomes_Directory/ -g "Cutibacterium granulosum" -o prepTG_Database/

3. Setup local prepTG database by downloading a premade one of representative
   Cutibacterium genomes:

   $ prepTG -d Cutibacterium -o prepTG_Database/

---------------------------------------------------------------------------------------
> Considerations
If FASTA format is provided, assumption is that genomes are prokaryotic and
pyrodigal/prodigal will be used to perform gene-calling. Eukaryotic genomes can
be provided as FASTA format but the --reference-proteome file should be used in
such case to map proteins from a reference proteome (from the same species ideally)
on to the target genomes. This will prevent detection of new genes in gene-clusters
detected by fai but synchronize gene-calling and allow for better similarity
assessment between orthologous genes.

If you are interested in inferring horizontal gene transfer using salt downstream
and are working with bacterial genomes - consider issuing the "--mge-annotation"
flag to annotate phage, plasmid and IS element associated proteins.

If GenBank files are provided, CDS features are expected and further each CDS
feature should contain a "translation" qualifier which features the protein sequence
and optionally a "locus_tag" qualifier. Options to consider when providing GenBank
files as input include --rename-locus-tags, --error-no-lt, and --error-no-translation.
Note, by default, if a CDS does not feature a locus tag, it will be given an arbitrary
one. Also, if a CDS does not feature a translation, the CDS feature will be skipped.
If > 10% of CDS features are skipped for lacking a translation, then the entire genome
will be skipped.
"""

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=RawTextRichHelpFormatter,
    )

    # --- Sources of genomes -------------------------------------------------
    parser.add_argument(
        '-i', '--input-dir',
        help="Directory with target genomes (either featuring GenBanks or FASTAs).",
    )
    parser.add_argument(
        '-g', '--gtdb-taxon',
        help="Name of a GTDB valid genus or species to incorporate genomes from.\n"
             "Should be surrounded by\nquotes (e.g. 'Escherichia coli').",
    )
    parser.add_argument(
        '-gr', '--gtdb-release',
        default='R226',
        help='GTDB release to use. [Current default is R226].',
    )
    parser.add_argument(
        '-d', '--download-premade',
        help="Download and setup pre-made databases of representative genomes\n"
             "for specific taxon/genus. Provide name of the taxon, \n"
             "e.g. \"Escherichia\"",
    )

    # --- Output (the only mandatory option) ---------------------------------
    parser.add_argument(
        '-o', '--output-dir',
        required=True,
        help='Output directory, which can then be provided as input for the\n'
             '"-tg" argument in fai.',
    )

    # --- GenBank input handling ---------------------------------------------
    parser.add_argument(
        '-r', '--rename-locus-tags',
        action='store_true',
        help="Whether to rename all locus tags for CDS features if provided\n"
             "GenBanks as input.",
    )
    parser.add_argument(
        '-enl', '--error-no-lt',
        action='store_true',
        help="Do not rename locus tags if not found/other issue -\n"
             "will result in skipping inclusion of entire genome.",
    )
    parser.add_argument(
        '-ent', '--error-no-translation',
        action='store_true',
        help="Do not skip CDS without translation -\n"
             "will result in skipping inclusion of entire genome.",
    )
    parser.add_argument(
        '-l', '--locus-tag-length',
        type=int,
        default=3,
        help="Length of locus tags to set. [Default is 3,\n"
             "allows for < ~18k genomes].",
    )

    # --- Gene calling -------------------------------------------------------
    parser.add_argument(
        '-gcm', '--gene-calling-method',
        default='pyrodigal',
        help="Method to use for gene calling. Options are: pyrodigal, prodigal,\n"
             "or prodigal-gv. [Default is pyrodigal].",
    )
    parser.add_argument(
        '-m', '--meta-mode',
        action='store_true',
        help="Flag to use meta mode instead of single for pyrodigal/prodigal.",
    )
    parser.add_argument(
        '-rp', '--reference-proteome',
        help="Provide path to a reference proteome to use for protein/\n"
             "gene-calling in target genomes - which should be in FASTA\n"
             "format.",
    )

    # --- Optional analyses --------------------------------------------------
    parser.add_argument(
        '-cst', '--create-species-tree',
        action='store_true',
        help="Use skani to infer a neighbor-joining based species\n"
             "tree for the genomes.",
    )
    parser.add_argument(
        '-ma', '--mge-annotation',
        action='store_true',
        help="Perform MGE annotation of proteins - for\n"
             "bacterial genomes only.",
    )

    # --- DIAMOND database construction --------------------------------------
    parser.add_argument(
        '-cp', '--collapsing-params',
        default="--approx-id 90 --mutual-cover 90",
        help="DIAMOND linclust parameters used for collapsing proteins for\n"
             "creating DIAMOND database. [Default is \"--approx-id 90\n"
             "--mutual-cover 90\"].",
    )
    parser.add_argument(
        '-sd', '--skip-diamond-linclust',
        action='store_true',
        help="Skip DIAMOND linclust clustering of proteins.",
    )

    # --- Resources and miscellaneous ----------------------------------------
    parser.add_argument(
        '-c', '--threads',
        type=int,
        default=1,
        help="The number of threads to use. [Default is 1].",
    )
    parser.add_argument(
        '-mm', '--max-memory',
        type=int,
        help="Uses resource module to set soft memory limit. Provide\n"
             "in Giga-bytes. Configured in the shell environment. For\n"
             "DIAMOND linclust if this parameter is not set it will default\n"
             "to using a memory of 16GB. [Default is None].",
    )
    parser.add_argument(
        '-v', '--version',
        action='store_true',
        help="Get version and exit.",
    )

    # argv=None makes argparse fall back to sys.argv[1:], which is exactly
    # what the argument-less call did before this parameter existed.
    return parser.parse_args(argv)


def prep_tg(): 
    """Run primary workflow for program."""
    # Get version
    version = util.get_version()

    if len(sys.argv) > 1 and ('-v' in set(sys.argv) or '--version' in set(sys.argv)): 
        sys.stdout.write(version + '\n')
        sys.exit(0)

    # Parse arguments
    args = create_parser()

    target_genomes_dir = args.input_dir
    gtdb_taxon = args.gtdb_taxon
    download_premade = args.download_premade
    outdir = os.path.abspath(args.output_dir) + '/'
    locus_tag_length = args.locus_tag_length
    rename_locus_tags = args.rename_locus_tags
    error_no_lt = args.error_no_lt
    error_no_translation = args.error_no_translation
    gtdb_release = args.gtdb_release
    threads = args.threads
    max_memory = args.max_memory
    gene_calling_method = args.gene_calling_method
    mge_annotation_flag = args.mge_annotation
    meta_mode = args.meta_mode
    create_species_tree = args.create_species_tree
    reference_proteome = args.reference_proteome
    collapsing_params = args.collapsing_params
    skip_diamond_linclust = args.skip_diamond_linclust
    if reference_proteome is not None: 
        gene_calling_method = 'miniprot reference proteome mapping'
        reference_proteome = os.path.abspath(args.reference_proteome)

    # create output directory if needed, or warn of over-writing
    if os.path.isdir(outdir):
        sys.stderr.write("Output directory exists. Files will be overwritten, but\n"
                       "checkpoints will be used to avoid redoing successfully\n"
                       "completed steps.\n"
                       "Do you wish to proceed? (yes/no): ")
        user_input = input().strip().lower()
        if user_input not in ['yes', 'y']:
            sys.stderr.write("Execution cancelled by user.\n")
            sys.exit(0)
    else:
        util.setup_ready_directory([outdir], delete_if_exist = False)

    if (target_genomes_dir is None and download_premade is None and
            gtdb_taxon is None): 
        sys.stderr.write('No input genomes provided or requested!\n')
        sys.exit(1)

    if download_premade is not None: 
        try: 
            assert download_premade in PREMADE_DB_SET
        except AssertionError: 
            sys.stderr.write("Non-valid genus specified for downloading a premade database.\n")
            sys.stderr.write(f"Valid options include: \n{', '.join(sorted(PREMADE_DB_SET))}\n")
            sys.exit(1)

    if gtdb_taxon is not None and download_premade is not None: 
        sys.stderr.write('Currently requesting genomes for a specific GTDB genus/species '
                        'and also requesting using a premade database is not supported.\n')
        sys.exit(1)

    if target_genomes_dir is not None and download_premade is not None: 
        sys.stderr.write('Currently providing genomes in a directory and also requesting '
                        'a premade database is not supported.\n')
        sys.exit(1)

    if target_genomes_dir is not None: 
        try: 
            assert os.path.isdir(target_genomes_dir)
            target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
        except AssertionError: 
            sys.stderr.write('Issue with validating directory with target genomes exists.\n')
            sys.exit(1)

    # Start workflow
    sys.stdout.write(f'Input command: \n{" ".join(sys.argv)}\n')

    # Create logging object
    log_file = outdir + 'Progress.log'
    log_object = util.create_logger_object(log_file)
    log_object.info("Saving parameters for future records.")
    parameters_file = outdir + 'Parameter_Inputs.txt'
    parameter_values = [
        download_premade, target_genomes_dir, gtdb_taxon, gtdb_release, outdir, 
        reference_proteome, gene_calling_method, meta_mode, create_species_tree, 
        threads, max_memory, mge_annotation_flag, rename_locus_tags, error_no_lt, 
        error_no_translation, collapsing_params, skip_diamond_linclust
    ]
    parameter_names = [
        "Download Premade Database", "Target Genomes Directory", "GTDB species/genus", 
        "GTDB Release", "Output Directory", "Reference Proteome Provided", 
        "Gene Calling Method", "Use Meta Mode?", "Create Species Tree?", 
        "Number of threads", "Maximum Memory in GB", "MGE Annotation Requested?", 
        "Rename Locus Tags in GenBank files?", "Error if no Locus Tags in GenBank files?", 
        "Error if no Translation in GenBank files?", 
        "DIAMOND linclust Collapsing Parameters", "Skip DIAMOND linclust Clustering?"
    ]
    util.log_parameters_to_file(parameters_file, parameter_names, parameter_values)
    log_object.info("Done saving parameters!")

    log_object.info(f'Running prepTG version {version}')
    sys.stdout.write(f'Running prepTG version {version}\n')

    gtdb_genomes_dir = outdir + 'GTDB_Genomes/'
    gtdb_listing_file = outdir + "GTDB_" + gtdb_release + "_Information_with_Genome_URLs.txt.gz"
    taxon_listing_file = outdir + 'GTDB_Genomes_to_Download.txt'

    if gtdb_taxon is not None: 
        # Step 0: Download GTDB listing file from a git repo, parse GTDB information
        # file, and download the genomes.
        gtdb_release = gtdb_release.upper()
        try: 
            assert gtdb_release in VALID_GTDB_RELEASES
        except AssertionError: 
            sys.stderr.write('GTDB release specified is not valid, options include: ' +
                           ', '.join(VALID_GTDB_RELEASES))
            sys.exit(1)

        try: 
            util.download_gtdb_genomes(
                gtdb_taxon, gtdb_release, gtdb_genomes_dir, gtdb_listing_file, 
                taxon_listing_file, log_object, sanity_check = False, 
                automated_download = False
            )
        except Exception: 
            sys.stderr.write('Had an issue downloading GTDB genomes.\n')
            sys.exit(1)

    if target_genomes_dir is not None: 
        try: 
            assert os.path.isdir(target_genomes_dir)
            target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
        except AssertionError: 
            sys.stderr.write('Issue with validating directory with target genomes exists.\n')
            sys.exit(1)

    # Set max memory limit
    if max_memory is not None: 
        log_object.info(f"Setting maximum memory usage to: {max_memory}GB")
        try: 
            util.memory_limit(max_memory)
        except Exception as e: 
            print(f"Error: {e}")
            log_object.info("Error setting memory limit")
            sys.stderr.write("Error setting memory limit\n")

    if download_premade is not None: 
        # Step 1: Download and setup premade prepTG database for genus
        curr_workspace_dir = os.getcwd()
        dl = PREMADE_DOWNLOAD_LINKS[download_premade]
        dl_base = dl.split('/')[-1].split('?')[0]
        curl_download_dbs_cmd = ['curl', '-L', dl, '-o', dl_base]
        extract_cmd = ['tar', '-zxvf', dl_base]
        reorganize_cmd = ['mv', download_premade + '/*', '.']
        rm_cmd = ['rm', '-f', dl_base]
        try: 
            os.chdir(outdir)
            os.system(' '.join(curl_download_dbs_cmd))
            os.system(' '.join(extract_cmd))
            assert os.path.isdir(download_premade)
            os.system(' '.join(reorganize_cmd))
            shutil.rmtree(download_premade)
            assert (os.path.isdir(outdir) and
                   os.path.isfile(outdir + 'Target_Genomes_DB.dmnd'))
            os.system(' '.join(rm_cmd))
            os.chdir(curr_workspace_dir)
        except Exception as e: 
            print(f"Error: {e}")
            log_object.error('Had an issue downloading/setting-up the premade database requested.')
            sys.stderr.write('Had an issue downloading/setting-up the premade database requested.')
            log_object.error(e)
            sys.exit(1)
    else: 
        # Step 1: List genomes in directory
        msg = '--------------------\nStep 1\n--------------------\n'
        msg += 'Processing genomes provided in input or requested for download from GTDB/NCBI.'
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        target_listing_file = outdir + 'Target_Genomes.txt'
        uncompress_dir = outdir + 'Uncompressed_Genomes/'

        if target_genomes_dir is not None: 
            list_cmd = ['listAllGenomesInDirectory.py', '-z', '-i', target_genomes_dir, 
                        '--uncompress_dir', uncompress_dir, ' > ', target_listing_file]
            util.run_cmd_via_subprocess(list_cmd, log_object=log_object, 
                                        check_files = [target_listing_file], 
                                        verbose = False)

        if gtdb_taxon is not None: 
            list_cmd = ['listAllGenomesInDirectory.py', '-i', gtdb_genomes_dir, '-z', 
                        '--uncompress_dir', uncompress_dir, ' >> ', target_listing_file]
            util.run_cmd_via_subprocess(list_cmd, log_object=log_object, 
                                        check_files = [target_listing_file], 
                                        verbose = False)

        genome_count = 0
        with open(target_listing_file) as otlf: 
            for line in otlf: 
                genome_count += 1
        if genome_count == 0: 
            log_object.error('No genomes found and/or could be downloaded!!! Exiting!')
            sys.stderr.write('No genomes found and/or could be downloaded!!! Exiting!\n')
            sys.exit(1)

        # Step 2: Setup files needed for fai
        msg = '--------------------\nStep 2\n--------------------\n'
        msg += 'Re-formatting (and gene-calling/gene-mapping) genomes for usage in fai downstream.'
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        format_assess_dir = outdir + 'Format_Assessment/'
        util.setup_ready_directory([format_assess_dir], delete_if_exist = True)
        format_predictions_file = outdir + 'All_Format_Inferences.txt'
        target_genome_annotation_listing_file = outdir + 'Target_Genome_Annotation_Files.txt'

        additional_sample_genomes, additional_format_prediction = util.parse_sample_genomes(
            target_listing_file, format_assess_dir, format_predictions_file, 
            log_object, threads = threads
        )

        if additional_format_prediction == 'mixed': 
            log_object.error('Format of additional genomes provided is not consistently '
                           'FASTA or Genbank, please check input.')
            raise RuntimeError('Format of additional genomes provided is not consistently '
                             'FASTA or Genbank, please check input.')

        additional_proteomes_directory = outdir + 'Predicted_Proteomes_Additional/'
        additional_genbanks_directory = outdir + 'Genomic_Genbanks_Additional/'
        additional_genbanks_directory_local = 'Genomic_Genbanks_Additional/'
        util.setup_ready_directory([additional_proteomes_directory, additional_genbanks_directory], 
                               delete_if_exist = True)

        if additional_format_prediction == 'fasta': 
            if reference_proteome is None: 
                additional_prodigal_outdir = outdir + 'Prodigal_Gene_Calling_Additional/'
                util.setup_ready_directory([additional_prodigal_outdir], 
                    delete_if_exist = True)
                util.process_genomes_using_prodigal(
                    additional_sample_genomes, additional_prodigal_outdir, 
                    additional_proteomes_directory, additional_genbanks_directory, 
                    log_object, threads = threads, locus_tag_length = locus_tag_length, 
                    gene_calling_method = gene_calling_method, meta_mode = meta_mode
                )
                shutil.rmtree(additional_prodigal_outdir)
                shutil.rmtree(uncompress_dir)
            else: 
                additional_miniprot_outdir = outdir + 'Miniprot_Gene_Calling_Additional/'
                util.setup_ready_directory([additional_miniprot_outdir], 
                    delete_if_exist = True)
                util.process_genomes_using_miniprot(
                    reference_proteome, additional_sample_genomes, 
                    additional_miniprot_outdir, additional_proteomes_directory, 
                    additional_genbanks_directory, log_object, threads = threads, 
                    locus_tag_length = locus_tag_length
                )
                shutil.rmtree(additional_miniprot_outdir)
                shutil.rmtree(uncompress_dir)
        else: 
            # Genomes are provided as Genbanks with CDS features
            gene_name_mapping_outdir = outdir + 'Mapping_of_New_Gene_Names_to_Original/'
            error_outdir = outdir + 'GenBank_Processing_Error_Files/'
            error_summary_file = outdir + 'Errors_from_GenBank_Processing.txt'
            util.setup_ready_directory([gene_name_mapping_outdir, error_outdir], delete_if_exist = True)
            util.process_genomes_as_genbanks(
                additional_sample_genomes, additional_proteomes_directory, 
                additional_genbanks_directory, gene_name_mapping_outdir, error_outdir,
                error_summary_file, log_object, threads = threads, 
                locus_tag_length = locus_tag_length, rename_locus_tags = rename_locus_tags, 
                error_no_lt = error_no_lt, error_no_translation = error_no_translation
            )
            shutil.rmtree(uncompress_dir)

        target_genome_annotation_listing_handle = open(target_genome_annotation_listing_file, 
            'w')
        target_genomes_concat_db_faa = outdir + 'Target_Genomes_DB.faa'
        target_genomes_concat_db_handle = open(target_genomes_concat_db_faa, 'w')

        for f in os.listdir(additional_proteomes_directory): 
            sample = f.split('.faa')[0]
            target_genome_annotation_listing_handle.write(
                sample + '\t' + additional_genbanks_directory_local + sample + '.gbk.gz\n'
            )
            try: 
                with open(additional_proteomes_directory + sample + '.faa') as ospf: 
                    for rec in SeqIO.parse(ospf, 'fasta'): 
                        target_genomes_concat_db_handle.write(
                            '>' + sample + '|' + rec.id + '\n' + str(rec.seq).rstrip('*') + '\n'
                        )
            except Exception: 
                pass

        target_genome_annotation_listing_handle.close()
        target_genomes_concat_db_handle.close()

        diamond_cluster_file = outdir + 'Target_Genomes_DB_clustered.tsv'
        if not skip_diamond_linclust:
            msg = '--------------------\nStep 3\n--------------------\n'
            msg += 'Clustering proteins using DIAMOND linclust to avoid redundancy in downstream DIAMOND database.'
            sys.stdout.write(msg + '\n')
            log_object.info(msg)

            # Run DIAMOND linclust to cluster proteins
            diamond_linclust_output_faa = outdir + 'Target_Genomes_DB_clustered.faa'
            diamond_linclust_cmd = [
                'diamond', 'linclust', '-d', target_genomes_concat_db_faa, '-o', diamond_cluster_file,
                collapsing_params, '--threads', str(threads)
            ]
            
            if max_memory is not None:
                diamond_linclust_cmd += ['-M', str(max_memory) + 'G']
                
            try:
                util.run_cmd_via_subprocess(diamond_linclust_cmd, log_object=log_object, 
                                            check_files=[diamond_cluster_file])

                # Recreate FASTA database for clustered proteins
                diamond_linclust_output_faa = outdir + 'Target_Genomes_DB_clustered.faa'

                diamond_linclust_reps = set([])
                with open(diamond_cluster_file) as dcf:
                    for line in dcf:
                        cluster_id, _ = line.strip().split('\t')
                        diamond_linclust_reps.add(cluster_id)

                with open(diamond_linclust_output_faa, 'w') as dcf:
                    with open(target_genomes_concat_db_faa) as otgcf:
                        for rec in SeqIO.parse(otgcf, 'fasta'):
                            if rec.id in diamond_linclust_reps:
                                dcf.write('>' + rec.id + '\n' + str(rec.seq).rstrip('*') + '\n')

                os.remove(target_genomes_concat_db_faa)

                # Use the clustered FASTA file for DIAMOND database creation
                target_genomes_concat_db_faa = diamond_linclust_output_faa
            except Exception as e:
                msg = f'Had an issue running or processing DIAMOND linclust: {" ".join(diamond_linclust_cmd)}'
                sys.stderr.write(msg + '\n')
                log_object.error(msg)
                log_object.error(e)
                sys.exit(1)
        else:
            util.create_fake_diamond_linclust_file(target_genomes_concat_db_faa, diamond_cluster_file, log_object)

        try:
            diamond_linclust_cluster_pickle_file = outdir + 'Target_Genomes_DB_clustered.pkl'
            util.process_diamond_linclust_cluster_file(diamond_cluster_file, diamond_linclust_cluster_pickle_file, log_object)
        except Exception as e:
            msg = f'Had an issue processing DIAMOND linclust cluster file: {diamond_cluster_file}'
            sys.stderr.write(msg + '\n')
            log_object.info(msg)
            log_object.error(e)
            sys.exit(1)

        # If requested, run MGE annotations
        if mge_annotation_flag: 
            mge_annot_dir = outdir + 'MGE_Annotations/'
            mge_annot_search_dir = mge_annot_dir + 'Search_Results/'
            mge_annot_summary_dir = mge_annot_dir + 'Summary_Results/'
            util.setup_ready_directory([mge_annot_dir, mge_annot_search_dir, mge_annot_summary_dir], 
                                   delete_if_exist = True)
            search_mge_annot_inputs = []
            process_mge_annot_inputs = []

            for f in os.listdir(additional_proteomes_directory): 
                sample = f.split('.faa')[0]
                faa_file = additional_proteomes_directory + f
                gbk_file = additional_genbanks_directory + sample + '.gbk.gz'
                vog_annot_file = mge_annot_search_dir + sample + '.vog.txt'
                mobsuite_annot_file = mge_annot_search_dir + sample + '.mobsuite.txt'
                is_annot_file = mge_annot_search_dir + sample + '.isfinder.txt'
                summary_file = mge_annot_summary_dir + sample + '.txt'
                search_mge_annot_inputs.append([
                    sample, faa_file, vog_annot_file, mobsuite_annot_file, 
                    is_annot_file
                ])
                process_mge_annot_inputs.append([
                    sample, gbk_file, summary_file, vog_annot_file, 
                    mobsuite_annot_file, is_annot_file
                ])

            # Use robust error handling for MGE annotation search
            result_summary1 = util.robust_multiprocess_executor(
                worker_function=util.annotate_mges,
                inputs=search_mge_annot_inputs,
                pool_size=threads,
                error_strategy="report_and_stop",
                log_object=log_object,
                description="MGE annotation search"
            )

            success_prop = result_summary1['success_count'] / result_summary1['total_processed']
            if success_prop != 1.0:
                msg = f'Issues with MGE annotation search for at least one sample. Exiting now ...'
                sys.stderr.write(msg + '\n')
                log_object.error(msg)
                sys.exit(1)

            # Use robust error handling for MGE annotation processing
            result_summary2 = util.robust_multiprocess_executor(
                worker_function=util.process_mge_annotations,
                inputs=process_mge_annot_inputs,
                pool_size=threads,
                error_strategy="report_and_stop",
                log_object=log_object,
                description="MGE annotation processing"
            )

            # Validate the processing run itself (result_summary2). Re-checking the
            # search summary (result_summary1) here would let processing failures
            # pass silently, because the search step was already verified above.
            success_prop = result_summary2['success_count'] / result_summary2['total_processed']
            if success_prop != 1.0:
                msg = f'Issues with MGE annotation processing for at least one sample. Exiting now ...'
                sys.stderr.write(msg + '\n')
                log_object.error(msg)
                sys.exit(1)

        # Remove individual proteome files and compress genomes
        msg = '--------------------\nStep 4\n--------------------\n'
        msg += 'Removing intermediate individual proteome files to save space and creating DIAMOND database.'
        sys.stdout.write(msg + '\n')
        log_object.info(msg)

        shutil.rmtree(additional_proteomes_directory)

        target_genomes_concat_db_dmnd = outdir + 'Target_Genomes_DB.dmnd'
        diamond_makedb_cmd = ['diamond', 'makedb', '--ignore-warnings', '--in', 
                              target_genomes_concat_db_faa, '-d', 
                              target_genomes_concat_db_dmnd, '--threads', str(threads)]
        util.run_cmd_via_subprocess(diamond_makedb_cmd, log_object=log_object, 
                                    check_files=[target_genomes_concat_db_dmnd])
        util.remove_file(target_genomes_concat_db_faa, log_object)
        
        target_genome_annotation_data = util.read_in_annotation_files_for_expanded_sample_set(target_genome_annotation_listing_file, additional_genbanks_directory)

        pickle_output_dir = outdir + 'Target_Genomes_Info_Pickled/'
        util.setup_ready_directory([pickle_output_dir], delete_if_exist = True)
        try:
            # One work item per sample: [sample name, GenBank path, output pickle path].
            sample_genbank_info = [
                [
                    sample,
                    target_genome_annotation_data[sample]['genbank'],
                    pickle_output_dir + sample + '.pkl',
                ]
                for sample in target_genome_annotation_data
            ]

            # Use robust error handling for GenBank parsing and boundary gene identification
            result_summary = util.robust_multiprocess_executor(
                worker_function=util.parse_genbank_and_find_boundary_genes,
                inputs=sample_genbank_info,
                pool_size=threads,
                error_strategy="report_and_stop",
                log_object=log_object,
                description="GenBank parsing and boundary gene identification"
            )

            # Every sample must succeed: anything short of 100% success aborts the run.
            success_prop = result_summary['success_count'] / result_summary['total_processed']
            if success_prop != 1.0:
                msg = f'Issues with GenBank parsing and boundary gene identification for at least one sample. Exiting now ...'
                sys.stderr.write(msg + '\n')
                log_object.error(msg)
                sys.exit(1)

        except Exception as e:
            # sys.exit() above raises SystemExit, which is not an Exception subclass,
            # so this handler only sees genuine failures from the steps above.
            # The trailing newline keeps the exception text on its own line.
            sys.stderr.write('Issues creating pickle files needed for fai.\n')
            sys.stderr.write(str(e) + '\n')
            log_object.error(e)
            sys.exit(1)

        # If requested - create a species tree based on skani ANI estimates and neighbor-joining
        species_tree_dir = outdir + 'Species_Tree_Workspace/'
        species_tree = outdir + 'Species_Tree.nwk'
        if create_species_tree: 
            util.setup_ready_directory([species_tree_dir], delete_if_exist = True)
            msg = '--------------------\nStep 5\n--------------------\n'    
            msg += 'Inferring species tree based on skani ANI estimates + neighbor-joining.'
            sys.stdout.write(msg + '\n')
            log_object.info(msg)

            util.create_nj_tree(additional_genbanks_directory, species_tree, species_tree_dir, 
                            log_object, threads = threads)

    # Close logging object and exit
    util.close_logger_object(log_object)
    sys.exit(0)

# Script entry point: run the prepTG workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__': 
    prep_tg()
