#!/opt/mambaforge/envs/bioconda/conda-bld/zol_1759105867893/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_/bin/python

"""
Program: listAllGenomesInDirectory.py
Author: Rauf Salamzade
Kalan Lab
UW Madison, Department of Medical Microbiology and Immunology
"""

# BSD 3-Clause License
#
# Copyright (c) 2023-2025, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import argparse
import sys


def create_parser(argv=None):
	"""
	Define the command-line interface and parse the arguments.

	Despite its name, this function returns the parsed arguments rather than
	the parser object itself.

	Args:
		argv: Optional list of argument strings to parse. When None (the
			default), argparse reads the arguments from sys.argv[1:].

	Returns:
		argparse.Namespace with the attributes input_genomes_dir, uncompress,
		uncompress_dir, list_cmds, bgc_prediction_dir, bgc_prediction_software,
		threads (int), taxon and dryrun_naming_file.
	"""
	parser = argparse.ArgumentParser(description="""
	Program: listAllGenomesInDirectory.py
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	Program to create genomic listing file needed for lsaBGC-Ready.py. Provided a directory 
	with genomes, such as those generated by ncbi-genome-download, it will create a two-column, 
	tab-delimited listing file where the first column is the sample name and the second is the 
	full path to the genomic assembly in FASTA or Genbank format. It will recursively search so they 
	do not need to be in the main level of the directory structure. 

	If --list_cmds is specified together with other complementary options (-o, -p, -c, -t),
	BGC prediction commands are printed, one per genome, instead of the listing file.

	""", formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-i', '--input_genomes_dir',
		help='Path to directory with genomic assemblies in FASTA/Genbank format.',
		required=True)
	parser.add_argument('-z', '--uncompress', action='store_true', help='Whether to uncompress genomes files.',
		required=False, default=False)
	parser.add_argument('-u', '--uncompress_dir',
		help='Path to temporary directory where to unzip genomic assembly files if requested and update paths listed accordingly.',
		required=False, default='./Uncompressed_Genomes/')
	parser.add_argument('-l', '--list_cmds', action='store_true',
		help='List BGC commands instead of producing a sample to genome mapping file needed for input into lsaBGC-Ready.py.',
		required=False, default=False)
	parser.add_argument('-o', '--bgc_prediction_dir',
		help='Path to output directory to list for BGC prediction output.',
		default='./BGC_Predictions/', required=False)
	parser.add_argument('-p', '--bgc_prediction_software',
		help='Software used to predict BGCs (Options: antiSMASH, DeepBGC, GECCO).\nDefault is antiSMASH.',
		default='antiSMASH', required=False)
	# type=int makes argparse reject non-numeric values up front instead of
	# letting them leak into the generated BGC prediction commands.
	parser.add_argument('-c', '--threads', type=int,
		help='The number of threads to use per BGC-prediction job [Default is 1].',
		required=False, default=1)
	parser.add_argument('-t', '--taxon',
		help='Taxon class to provide BGC prediction software, e.g. antiSMASH. Options: bacteria, fungi. Default: bacteria',
		default="bacteria", required=False)
	parser.add_argument('-d', '--dryrun_naming_file',
		help='Results from running ncbi-genome-download in dry-run mode to use for sample naming.',
		required=False, default=None)
	args = parser.parse_args(argv)
	return args


# File extensions (ignoring an optional trailing '.gz') accepted as genomes.
_GENOME_SUFFIXES = frozenset(['fasta', 'fna', 'fa', 'gbff', 'gbk'])

# Characters replaced by '_' and characters dropped when cleaning sample names.
_UNDERSCORED_CHARS = '*:; |"\'=-'
_REMOVED_CHARS = '#()/\\[],'
_SAMPLE_NAME_TRANSLATION = str.maketrans(
	_UNDERSCORED_CHARS, '_' * len(_UNDERSCORED_CHARS), _REMOVED_CHARS)


def _ensure_directory(path, error_message):
	"""Create directory `path` if it is missing; raise RuntimeError(error_message) on failure."""
	try:
		os.makedirs(path, exist_ok=True)
	except OSError as err:
		raise RuntimeError(error_message) from err


def _load_dryrun_sample_names(dryrun_naming_file):
	"""
	Map genome ID (first whitespace-delimited field of a line) to a sample
	name (all fields of that line joined by '_'). Blank lines are skipped.
	"""
	genome_id_to_sample_name = {}
	with open(dryrun_naming_file) as handle:
		for line in handle:
			fields = line.split()
			if fields:
				genome_id_to_sample_name[fields[0]] = '_'.join(fields)
	return genome_id_to_sample_name


def _sample_name_from_filename(filename):
	"""
	Derive the sample name from a genome file name, or return None when the
	file does not carry a recognised genome suffix (optionally gzipped).
	"""
	stem = filename[:-len('.gz')] if filename.endswith('.gz') else filename
	sample, dot, extension = stem.rpartition('.')
	if not dot or extension not in _GENOME_SUFFIXES:
		return None
	if sample.endswith('_genomic'):
		# Strip only the trailing marker, not everything after its first occurrence.
		sample = sample[:-len('_genomic')]
	# A name made up solely of the suffix leaves nothing to identify the sample by.
	return sample or None


def _collect_genomes(input_genomes_dir):
	"""Recursively walk `input_genomes_dir` and return a dict of sample name -> genome file path."""
	sample_to_genome = {}
	for dirpath, _dirnames, files in os.walk(input_genomes_dir):
		for filename in files:
			sample = _sample_name_from_filename(filename)
			if sample is None:
				sys.stderr.write(
					f'Warning, skipping file: {filename}, does not appear to have suffix expected of nucleotide FASTA files.\n')
			elif sample in sample_to_genome:
				sys.stderr.write(f'Warning, sample {sample} has more than one genome, skipping second instance\n')
			else:
				sample_to_genome[sample] = os.path.join(dirpath, filename)
	return sample_to_genome


def _gunzip_genome(genome_file, uncompress_dir, sample):
	"""
	Decompress gzipped `genome_file` into `uncompress_dir` and return the path
	of the uncompressed copy. The original file is left untouched.
	"""
	import gzip
	import shutil
	import zlib

	uncompressed_genome_file = os.path.join(uncompress_dir, os.path.basename(genome_file)[:-len('.gz')])
	try:
		with gzip.open(genome_file, 'rb') as source, open(uncompressed_genome_file, 'wb') as target:
			shutil.copyfileobj(source, target)
	except (OSError, EOFError, zlib.error) as err:
		raise RuntimeError(
			f'Had issues creating uncompressed genome {uncompressed_genome_file} for sample {sample}') from err
	return uncompressed_genome_file


def _build_bgc_command(bgc_prediction_software, taxon, threads, output_dir, sample, genome_file):
	"""Return the argument list of the BGC prediction command for one genome."""
	if bgc_prediction_software == 'ANTISMASH':
		gene_finding = 'glimmerhmm' if taxon == 'fungi' else 'prodigal'
		# Genbank input is passed with gene finding switched off.
		if genome_file.endswith(('.gbff.gz', '.gbk.gz', '.gbk', '.gbff')):
			gene_finding = 'none'
		return ['antismash', '--taxon', taxon, '--output-dir', output_dir, '-c', str(threads),
				'--genefinding-tool', gene_finding, '--output-basename', sample, genome_file]
	if bgc_prediction_software == 'DEEPBGC':
		return ['deepbgc', 'pipeline', '--output', output_dir, genome_file]
	if bgc_prediction_software == 'GECCO':
		if taxon == 'fungi':
			raise RuntimeError("Not recommended to run GECCO with fungal genomes.")
		return ['gecco', 'run', '-j', str(threads), '-o', output_dir, '-g', genome_file]
	raise RuntimeError('BGC prediction software option is not a valid option.')


def siftAndPrint():
	"""
	Void function which runs primary workflow for program.

	Parses the command line, validates the options, recursively collects the
	genome files under the input directory and prints to stdout either a
	tab-delimited "sample<TAB>genome path" listing or, with --list_cmds, one
	BGC prediction command per genome. Gzipped genomes are decompressed into
	the uncompress directory when --uncompress or --list_cmds is given.

	Raises:
		RuntimeError: if an option is invalid, a directory cannot be found or
			created, a genome cannot be decompressed, or GECCO is requested
			for fungal genomes.
	"""
	import shlex

	# PARSE INPUTS
	myargs = create_parser()

	input_genomes_dir = os.path.abspath(myargs.input_genomes_dir)
	uncompress_flag = myargs.uncompress
	uncompress_dir = os.path.abspath(myargs.uncompress_dir) + '/'
	bgc_prediction_dir = os.path.abspath(myargs.bgc_prediction_dir) + '/'
	list_cmds_flag = myargs.list_cmds
	bgc_prediction_software = myargs.bgc_prediction_software.upper()
	threads = myargs.threads
	taxon = myargs.taxon.lower()
	dryrun_naming_file = myargs.dryrun_naming_file

	# Explicit checks instead of assert: assertions are removed under `python -O`.
	if dryrun_naming_file is not None and not os.path.isfile(dryrun_naming_file):
		raise RuntimeError('Cannot locate the ncbi-genome-download dryrun naming file provided.')
	if not os.path.isdir(input_genomes_dir):
		raise RuntimeError('Cannot find input directory of genomes directory.')
	if bgc_prediction_software not in ('ANTISMASH', 'DEEPBGC', 'GECCO'):
		raise RuntimeError('BGC prediction software option is not a valid option.')
	if taxon not in ('bacteria', 'fungi'):
		raise RuntimeError('Taxon is not a valid option.')

	# Directories are only created once all options have been validated.
	_ensure_directory(uncompress_dir, 'Cannot find/create directory for uncompressing genomes.')
	if list_cmds_flag:
		_ensure_directory(bgc_prediction_dir, 'Cannot find/create output directory for BGC prediction commands.')

	# START WORKFLOW
	genome_id_to_sample_name = {}
	if dryrun_naming_file is not None:
		genome_id_to_sample_name = _load_dryrun_sample_names(dryrun_naming_file)

	sample_to_genome = _collect_genomes(input_genomes_dir)

	if uncompress_flag or list_cmds_flag:
		for sample, genome_file in list(sample_to_genome.items()):
			if genome_file.endswith('.gz'):
				sample_to_genome[sample] = _gunzip_genome(genome_file, uncompress_dir, sample)

	for sample, genome_file in sample_to_genome.items():
		if dryrun_naming_file is not None:
			# Genome ID is taken as the first two '_'-separated fields of the file name.
			genome_id = '_'.join(os.path.basename(genome_file).split('_')[:2])
			if genome_id in genome_id_to_sample_name:
				sample = genome_id_to_sample_name[genome_id]
			else:
				sys.stderr.write(
					f'Warning, genome {genome_id} not found in dryrun naming file, using file-based sample name {sample}\n')
		sample = sample.translate(_SAMPLE_NAME_TRANSLATION)
		if list_cmds_flag:
			bgc_cmd = _build_bgc_command(bgc_prediction_software, taxon, threads,
										 bgc_prediction_dir + sample + '/', sample, genome_file)
			# Quote each token so paths with spaces or shell metacharacters stay intact.
			print(' '.join(shlex.quote(token) for token in bgc_cmd))
		else:
			print(sample + '\t' + genome_file)


# Run the workflow only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
	siftAndPrint()