#!/opt/mambaforge/envs/bioconda/conda-bld/skder_1757269308801/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/bin/python

### Program: cidder
### Author: Rauf Salamzade
### Kalan Lab
### UW Madison, Department of Medical Microbiology and Immunology

# BSD 3-Clause License
#
# Copyright (c) 2024, Rauf Salamzade
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import sys
from skDER import util, cidder
from time import sleep
import argparse
import traceback

# Version string reported by -v/--version and written to the run log.
version = util.get_version()

# GTDB releases accepted for the -r/--gtdb-release option.
VALID_GTDB_RELEASES = {'R220', 'R226'}

def create_parser(argv=None):
	"""
	Build the CiDDER command-line parser and parse the arguments.

	Despite its name, this function returns the parsed arguments rather than
	the parser object.

	Args:
		argv: Optional list of argument strings to parse. When None (the
			default), argparse reads the arguments from sys.argv[1:].

	Returns:
		argparse.Namespace holding the parsed option values.

	Raises:
		SystemExit: raised by argparse when the arguments are invalid or
			when help is requested.
	"""
	parser = argparse.ArgumentParser(description="""
	Program: cidder
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	CiDDER: Performs genome dereplication based on CD-HIT clustering of proteins to
			select a representative set of genomes which adequately samples the
			pangenome space. Because gene prediction is performed using pyrodigal,
			CiDDER only works for bacterial genomes at the moment.

	The general algorithm is to first select the genome with the most number of distinct
	open-reading-frames (ORFS; predicted genes) and then iteratively add genomes based on
	which maximizes the number of new ORFs. This iterative addition of selected genomes
	is performed until: (i) the next genome to add does not have a minimum of X new distinct
	ORFs to add to the set of ORFs belonging, (ii) some percentage Y of the total distinct
	ORFs are found to have been sampled, or (iii) some percentage Z of the total multi-genome
	distinct ORFs are found to have been sampled. The "added-on" genomes from the iterative
	procedure are listed as representative genomes.

	For information on how to alter CD-HIT parameters, please see:
	https://github.com/weizhongli/cdhit/blob/master/doc/cdhit-user-guide.wiki#cd-hit

	Note, if --filter-mge is requested, the statistics reported in clustering reports (number
	of proteins overlapping, ANI) will all be based on processed (MGE filtered) genomes.
	However, the final representative genomes in the
	Dereplicated_Representative_Genomes/ folder will be the original unprocessed genomes.

	If you use CiDDER for your research, please kindly cite both:

	CD-HIT: accelerated for clustering the next-generation sequencing data. Bioinformatics.
	Fu et al. 2012.

	and

	skDER & CiDDER: microbial genome dereplication approaches for comparative genomic and
	metagenomic applications. Salamzade, Kottapalli, and Kalan, 2025.
    """, formatter_class=argparse.RawTextHelpFormatter)

	# Input sources
	parser.add_argument('-g', '--genomes', nargs='+', help='Genome assembly file paths or paths to containing\ndirectories. Files should be in FASTA format (accepted suffixes\nare: *.fasta, *.fa, *.fas, or *.fna) [Optional].', required=False, default=[])
	parser.add_argument('-p', '--proteomes', nargs='+', help='Proteome file paths or paths to containing\ndirectories. Files should be in FASTA format (accepted suffixes\nare: *.fasta, *.fa, or *.faa) [Optional].', required=False, default=[])
	parser.add_argument('-t', '--taxa-name', help='Genus or species identifier from GTDB for which to\ndownload genomes for and include in\ndereplication analysis [Optional].', required=False, default=None)

	# Representative selection stopping criteria
	parser.add_argument('-a', '--new-proteins-needed', type=int, help="The number of new protein clusters needed to add [Default is 0].", required=False, default=0)
	parser.add_argument('-ts', '--total-saturation', type=float, help="The percentage of total protein clusters needed to stop representative\ngenome selection [Default is 90.0].", required=False, default=90.0)
	parser.add_argument('-mgs', '--multi-genome-saturation', type=float, help="The percentage of total multi-genome protein clusters needed to stop\nrepresentative genome selection [Default is 100.0].", required=False, default=100.0)
	# '%%' is required because argparse applies %-formatting to help strings.
	parser.add_argument('-rs', '--require-similarity', type=float, help="Require non-representative genomes to have X%% of their protein\nclusters represented by an individual representative genome [Default is 0.0].", required=False, default=0.0)

	parser.add_argument('-o', '--output-directory', help='Output directory.', required=True)

	# Gene prediction / clustering behaviour
	parser.add_argument('-cdp', '--cd-hit-params', help="CD-HIT parameters to use for clustering proteins - select carefully\n(don't set threads or memory - those are done by default in cidder) and\nsurround by quotes [Default is: \"-n 5 -c 0.95 -aL 0.75 -aS 0.90\"]", required=False, default="-n 5 -c 0.95 -aL 0.75 -aS 0.90")
	parser.add_argument('-mg', '--metagenome-mode', action='store_true', help="Run pyrodigal using metagenome mode.",  required=False, default=False)
	parser.add_argument('-e', '--include-edge-orfs', action='store_true', help="Include proteins from ORFs that hang off the edge of a contig/scaffold.", required=False, default=False)
	parser.add_argument('-s', '--sanity-check', action='store_true', help="Confirm each FASTA file provided or downloaded is actually\na FASTA file. Makes it slower, but generally\ngood practice.", required=False, default=False)
	parser.add_argument('-fm', '--filter-mge', action='store_true', help="Filter predicted MGE coordinates along genomes before\ndereplication assessment but after N50\ncomputation.", required=False, default=False)
	parser.add_argument('-gd', '--genomad-database', help="If filter-mge is specified, it will by default use PhiSpy;\nhowever, if a database directory for\ngeNomad is provided - it will use that instead\nto predict MGEs.", default=None, required=False)
	parser.add_argument('-n', '--determine-clusters', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes based on shared\nprotein clusters.", required=False, default=False)
	parser.add_argument('-ns', '--determine-clusters-skani', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes based on skani-computed\nANI.", required=False, default=False)
	# NOTE: this flag uses an underscore, unlike the other long options; the
	# spelling is kept unchanged so existing command lines keep working.
	parser.add_argument('-mn', '--minimal_n50', type=int, help="Minimal N50 of genomes to be included in dereplication\n[Default is 0].", required=False, default=0)
	parser.add_argument('-l', '--symlink', action='store_true', help="Symlink representative genomes in results subdirectory\ninstead of performing a copy of the files.", required=False, default=False)
	parser.add_argument('-r', '--gtdb-release', help='Which GTDB release to use if -t argument issued [Default is R226].', default="R226")
	parser.add_argument('-auto', '--automate', action='store_true', help="Automatically skip warnings and download genomes from NCBI if -t\nargument issued. Automation off by default to prevent\nunexpected downloading of large genomes [Default\nis False].", required=False, default=False)

	# Resources
	parser.add_argument('-c', '--threads', type=int, help="Number of threads/processes to use [Default is 1].", required=False, default=1)
	parser.add_argument('-mm', '--max-memory', type=int, help="Max memory in Gigabytes [Default is 0 = unlimited].", required=False, default=0)
	parser.add_argument('-v', '--version', action='store_true', help="Report version of CiDDER.", required=False, default=False)

	# argv=None makes argparse fall back to sys.argv[1:].
	args = parser.parse_args(argv)
	return args

def _exit_with_error(msg, logObject=None, include_traceback=True):
	"""
	Report a fatal error and terminate the program with exit status 1.

	The message is written to stderr and, when a logger is supplied, logged
	at error level. When include_traceback is True, the traceback of the
	exception currently being handled is reported the same way, so it should
	only be left enabled when this is called from inside an except block.
	"""
	sys.stderr.write(msg + '\n')
	if logObject is not None:
		logObject.error(msg)
	if include_traceback:
		tb = traceback.format_exc()
		sys.stderr.write(tb + '\n')
		if logObject is not None:
			logObject.error(tb)
	sys.exit(1)

def cidder_main():
	"""
	Command-line entry point for CiDDER.

	In order: handles -v/--version, parses and validates the options, sets up
	the output directory and logger, gathers input genomes (GTDB download
	and/or user-provided paths), determines N50s and optionally filters on
	them, optionally filters MGEs, runs gene prediction, incorporates
	user-provided proteomes, runs CD-HIT clustering, selects representative
	genomes, optionally appends additional representatives and performs
	secondary clustering, and finally writes COMPLETED.txt.

	Always terminates via sys.exit: status 0 on success (or after reporting
	the version), status 1 after reporting any fatal error.
	"""
	# Version is handled before argparse because -o/--output-directory is a
	# required option and would otherwise make "cidder -v" fail.
	if len(sys.argv) > 1 and ('-v' in sys.argv or '--version' in sys.argv):
		sys.stderr.write('Version of CiDDER (skDER pacakge) being used is: ' + str(version) + '\n')
		sys.exit(0)

	# Parse arguments
	myargs = create_parser()

	genomes = myargs.genomes
	proteomes = myargs.proteomes
	taxa_name = None
	if myargs.taxa_name:
		taxa_name = myargs.taxa_name.strip('"')
	gtdb_release = myargs.gtdb_release.upper()
	outdir = os.path.abspath(myargs.output_directory) + '/'
	cd_hit_params = myargs.cd_hit_params.strip('"')
	metagenome_mode = myargs.metagenome_mode
	include_edge_orfs = myargs.include_edge_orfs
	new_proteins_needed = myargs.new_proteins_needed
	saturation_cutoff = myargs.total_saturation
	multigenome_saturation_cutoff = myargs.multi_genome_saturation
	sanity_check_flag = myargs.sanity_check
	symlink_flag = myargs.symlink
	threads = myargs.threads
	determine_clusters_flag = myargs.determine_clusters
	determine_clusters_ani_flag = myargs.determine_clusters_skani
	filter_mge_flag = myargs.filter_mge
	automate_flag = myargs.automate
	genomad_database = myargs.genomad_database
	require_similarity = myargs.require_similarity
	max_memory = myargs.max_memory
	minimal_n50 = myargs.minimal_n50

	# Validate options with explicit checks (not assert, which is removed
	# when Python runs with -O). The logger does not exist yet, so these
	# errors go to stderr only.
	if minimal_n50 < 0:
		_exit_with_error('Minimal N50 requested is less than 0. Exiting ...', include_traceback=False)

	if genomad_database is not None:
		sys.stdout.write('Attempting to use geNomad to predict and cut out MGEs from genome assembly!\n')
		if not os.path.isdir(genomad_database):
			_exit_with_error('geNomad requested as method but no or invalid path to the geNomad database provided via --genomad-database.', include_traceback=False)

	if gtdb_release not in VALID_GTDB_RELEASES:
		_exit_with_error('GTDB release requested is not valid. Valid options include: %s' % ' '.join(sorted(VALID_GTDB_RELEASES)), include_traceback=False)

	if os.path.isdir(outdir):
		sys.stderr.write("Output directory already exists!\n")

	util.setupDirectories([outdir], automate_flag=automate_flag)

	# Create logging object
	log_file = outdir + 'Progress.log'
	logObject = util.createLoggerObject(log_file)
	parameters_file = outdir + 'Command_Issued.txt'
	sys.stdout.write('Running CiDDER (skDER pacakge) version %s\n' % version)
	sys.stdout.write("Appending command issued for future records to: %s\n" % parameters_file)
	sys.stdout.write("Logging more details at: %s\n" % log_file)
	logObject.info("\nNEW RUN!!!\n**************************************")
	logObject.info('Running CiDDER (skDER pacakge) version %s' % version)
	logObject.info("Appending command issued for future records to: %s" % parameters_file)

	# Append (not overwrite) so repeated runs into the same directory keep a history.
	with open(parameters_file, 'a+') as parameters_handle:
		parameters_handle.write(' '.join(sys.argv) + '\n')

	# set max memory limit
	if max_memory != 0:
		msg = "Setting maximum memory usage to: %dGB" % max_memory
		logObject.info(msg)
		sys.stdout.write(msg + '\n')
		try:
			util.memory_limit(max_memory)
		except Exception as e:
			# Best effort: report why the limit could not be set and carry on without it.
			msg = "Error setting memory limit: %s" % e
			logObject.warning(msg)
			sys.stdout.write(msg + '\n')

	# Start from a fresh genome listing in case the output directory is being reused.
	all_genomes_listing_file = outdir + 'All_Genomes_Listing.txt'
	if os.path.isfile(all_genomes_listing_file):
		try:
			os.remove(all_genomes_listing_file)
		except OSError:
			_exit_with_error('Error: issues removing the file %s.' % all_genomes_listing_file, logObject)

	# process input
	if taxa_name is not None and taxa_name != "None":
		util.downloadGTDBGenomes(taxa_name, gtdb_release, outdir, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, automated_download=automate_flag, gunzip=True, threads=threads)
	if len(genomes) > 0:
		util.processInputGenomes(genomes, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, allow_gzipped=False)

	# Map genome name (file name minus its extension, and minus ".gz" if
	# present) to the path recorded in the listing file.
	number_of_genomes = 0
	genome_name_to_path = {}
	try:
		if os.path.isfile(all_genomes_listing_file):
			with open(all_genomes_listing_file) as oaglf:
				for line in oaglf:
					genome_path = line.strip()
					if not genome_path:
						# a blank line is not a genome
						continue
					name_parts = genome_path.split('/')[-1].split('.')
					suffix_count = 2 if genome_path.endswith('.gz') else 1
					genome_name = '.'.join(name_parts[:-suffix_count])
					genome_name_to_path[genome_name] = genome_path
					number_of_genomes += 1
	except Exception:
		_exit_with_error('Error: issue reading the genome listing file %s. Exiting ...' % all_genomes_listing_file, logObject)

	# With no proteomes provided, at least two genomes are needed to dereplicate.
	if len(proteomes) == 0 and number_of_genomes < 2:
		_exit_with_error('Error: fewer than 2 genomes downloaded / provided. Exiting ...', logObject, include_traceback=False)

	msg = 'Number of valid genomes provided as input: %d' % number_of_genomes
	sys.stdout.write(msg + '\n')
	logObject.info(msg)

	# calculate N50s
	n50s = None
	try:
		n50s = util.determineN50(all_genomes_listing_file, outdir, logObject, threads=threads)
	except Exception:
		_exit_with_error('Error: issue determining N50s. Exiting ...', logObject)

	if minimal_n50 > 0:
		msg = 'Filtering genomes with an N50 less than %d.' % minimal_n50
		sys.stdout.write(msg + '\n')
		logObject.info(msg)
		number_of_genomes = 0
		try:
			# Rewrite the listing file so that it only holds genomes passing the cutoff.
			# NOTE(review): genome_name_to_path is not pruned here - confirm that
			# downstream steps do not rely on it matching the filtered listing.
			with open(all_genomes_listing_file, 'w') as aglf_outf:
				for genome in n50s:
					if n50s[genome] >= minimal_n50:
						number_of_genomes += 1
						aglf_outf.write(genome + '\n')
		except Exception:
			_exit_with_error('Error: issue filtering genomes by N50. Exiting ...', logObject)

		if len(proteomes) == 0 and number_of_genomes < 2:
			_exit_with_error('Error: fewer than 2 genomes downloaded / provided / valid after filtering. Exiting ...', logObject, include_traceback=False)

		msg = 'Number of valid genomes provided as input after N50 filtering: %d' % number_of_genomes
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

	# filter genomes for MGEs if requested
	if number_of_genomes > 0 and filter_mge_flag:
		proc_genomes_listing_file = outdir + 'All_Genomes_Listing_mgecut_Processed.txt'
		try:
			_, _ = util.filterMGEs(all_genomes_listing_file, outdir, proc_genomes_listing_file, logObject, threads=threads, genomad_database=genomad_database)
		except Exception:
			_exit_with_error('Error: issue filtering MGEs. Exiting ...', logObject)
		# Downstream steps work from the processed genome listing.
		all_genomes_listing_file = proc_genomes_listing_file

	combined_proteome_faa = outdir + 'All_Proteins.faa'
	if number_of_genomes > 0:
		try:
			cidder.runPyrodigal(all_genomes_listing_file, outdir, combined_proteome_faa, logObject, metagenome_mode=metagenome_mode, include_edge_orfs=include_edge_orfs, threads=threads)
		except Exception:
			_exit_with_error('Error: running gene predictions. Exiting ...', logObject)

	proteome_name_to_path = {}
	if len(proteomes) > 0:
		try:
			proteome_name_to_path = util.processInputProteomes(proteomes, combined_proteome_faa, genome_name_to_path, logObject, sanity_check=sanity_check_flag, allow_gzipped=False)
		except Exception:
			_exit_with_error('Error: issues processing user-provided proteomes. Exiting ...', logObject)

	# Genomes and proteomes together must provide at least two inputs.
	number_of_genomes += len(proteome_name_to_path)
	if number_of_genomes < 2:
		# No exception is active here, so there is no traceback to report.
		_exit_with_error('Error: fewer than 2 genomes downloaded / provided. Exiting ...', logObject, include_traceback=False)
	if len(proteome_name_to_path) > 0:
		msg = 'Total number of valid genomes/proteomes provided as input: %d' % number_of_genomes
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

	cdhit_result_prefix = outdir + 'CD-HIT_clustering.faa'
	try:
		genome_protein_clusters, genome_cluster_counts, c_count, mgc_count, multi_genome_clusters = cidder.runAndProcessCDHIT(combined_proteome_faa, cdhit_result_prefix, cd_hit_params, max_memory, threads, logObject)
	except Exception:
		_exit_with_error('Error: running or processing CD-HIT clustering. Exiting ...', logObject)

	# Perform representative genome selection procedure
	rep_genomes = set()
	rep_appending_order_file = outdir + 'CiDDER_Results.txt'
	cidder_drep_dir = outdir + 'Dereplicated_Representative_Genomes/'
	util.setupDirectories([cidder_drep_dir])
	try:
		rep_genomes = cidder.performRepresentativeGenomeSelection(
			genome_protein_clusters, genome_cluster_counts, c_count,
			mgc_count, multi_genome_clusters, new_proteins_needed,
			saturation_cutoff, multigenome_saturation_cutoff,
			rep_appending_order_file, genome_name_to_path,
			proteome_name_to_path, cidder_drep_dir, logObject,
			symlink_flag=symlink_flag)
	except Exception:
		_exit_with_error('Error: issues with representative genome selection. Exiting ...', logObject)

	if require_similarity > 0.0:
		try:
			rep_genomes = cidder.appendAdditionalReps(
				rep_genomes, genome_protein_clusters, require_similarity,
				rep_appending_order_file, genome_name_to_path,
				proteome_name_to_path, cidder_drep_dir, logObject,
				symlink_flag=symlink_flag)
		except Exception:
			_exit_with_error('Error: issues with appending additional representative genomes based on required similarity between non-representative genomes to individual representative genomes. Exiting ...', logObject)

	msg = 'There were %d representative genomes selected from %d considered!' % (len(rep_genomes), number_of_genomes)
	sys.stdout.write(msg + '\n')
	logObject.info(msg)

	# perform secondary clustering
	if determine_clusters_flag:
		msg = 'Performing assignment of genomes to their nearest representative genomes based on protein cluster containment.'
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

		cidder_cluster_result_file = outdir + 'CiDDER_Clustering.txt'
		try:
			cidder.secondaryClustering(rep_genomes, genome_protein_clusters, cidder_cluster_result_file)
		except Exception:
			_exit_with_error('Error: issues with secondary clustering. Exiting ...', logObject)

	if determine_clusters_ani_flag:
		msg = 'Performing assignment of genomes to their nearest representative genomes based on ANI.'
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

		cluster_dir = outdir + 'skani_for_Clustering_Workspace/'
		util.setupDirectories([cluster_dir])
		skani_result_file = cluster_dir + 'Skani_Dist_Output.txt'
		cidder_cluster_result_file = outdir + 'CiDDER_skani_Clustering.txt'
		try:
			cidder.secondaryClusteringSkani(
				cluster_dir, all_genomes_listing_file, rep_genomes,
				genome_protein_clusters, skani_result_file,
				cidder_cluster_result_file, logObject, threads=threads)
		except Exception:
			_exit_with_error('Error: issues with secondary clustering based on skani. Exiting ...', logObject)

	# close logging object and exit
	logObject.info('******************\nCiDDER finished!\n******************\nDirectory with representative genomes can be found at: %s' % cidder_drep_dir)
	sys.stdout.write('******************\nCiDDER finished!\n******************\nDirectory with representative genomes can be found at: %s\n' % cidder_drep_dir)
	util.closeLoggerObject(logObject)

	# create completion file for workflows to check; written directly rather
	# than through a shell so output paths with spaces or shell
	# metacharacters are handled correctly.
	completion_file = outdir + 'COMPLETED.txt'
	try:
		with open(completion_file, 'w') as completion_handle:
			completion_handle.write('CiDDER completed successfully!\n')
	except OSError:
		# The logger is already closed at this point, so report on stderr only.
		_exit_with_error('Error: unable to write the completion file %s.' % completion_file)
	sys.exit(0)

if __name__ == '__main__':
    # Script entry point: run CiDDER and turn Ctrl-C into exit status 130.
    try:
        cidder_main()
    except KeyboardInterrupt:
        # os._exit() ends the process immediately without flushing stdio
        # buffers, so flush explicitly; otherwise the notice can be lost
        # when stdout is redirected to a file or pipe.
        print('Interrupted', flush=True)
        os._exit(130)