#!/opt/mambaforge/envs/bioconda/conda-bld/skder_1757269308801/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/bin/python

### Program: skder
### Author: Rauf Salamzade
### Kalan Lab
### UW Madison, Department of Medical Microbiology and Immunology

# BSD 3-Clause License
#
# Copyright (c) 2023, Rauf Salamzade
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import sys
from skDER import util, skder
from time import sleep
import argparse
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import traceback

# Value reported as the skDER version; resolved once when the module is loaded.
version = util.get_version()

# GTDB releases accepted for the -r/--gtdb-release option.
VALID_GTDB_RELEASES = {'R220', 'R226'}

# Grid of cutoffs evaluated when -tc/--test-cutoffs is requested
# (ANI values become heatmap rows, AF values become heatmap columns).
PRESELECTED_ANI_CUTOFFS = [90.0, 95.0, 97.0, 98.0, 99.0, 99.5]
PRESELECTED_AF_CUTOFFS = [10.0, 25.0, 50.0, 75.0, 90.0]

def create_parser():
	"""
	Build the skDER command-line interface and parse ``sys.argv``.

	Note that, despite its name, this function does not return the parser
	itself: it calls ``parse_args()`` and returns the resulting namespace.

	Returns:
		argparse.Namespace: parsed options. Attribute names are the long option
		names with hyphens replaced by underscores (e.g. ``output_directory``).

	Raises:
		SystemExit: raised by argparse on invalid arguments or ``-h/--help``.
	"""
	parser = argparse.ArgumentParser(description="""
	Program: skder
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	skDER: efficient & high-resolution dereplication of microbial genomes to select 
		   representative genomes.

	skDER will perform dereplication of genomes using skani average nucleotide identity 
	(ANI) and aligned fraction (AF) estimates and either a dynamic programming or 
	greedy-based approach. It assesses such pairwise ANI & AF estimates to determine 
	whether two genomes are similar to each other and then chooses which genome is better 
	suited to serve as a representative based on assembly N50 (favoring the more contiguous 
	assembly) and connectedness (favoring genomes deemed similar to a greater number of 
	alternate genomes).
	
	Note, if --filter-mge is requested, the original paths to genomes are reported but 
	the statistics reported in the clustering reports (e.g. ANI, AF) will all be based 
	on processed (MGE filtered) genomes. Importantly, computation of N50 is performed 
	before MGE filtering to not penalize genomes of high quality that simply have many 
	MGEs and enable them to still be selected as representatives.
	
	If you use skDER for your research, please kindly cite both:

	Fast and robust metagenomic sequence comparison through sparse chaining with skani.
	Nature Methods. Shaw and Yu, 2023.

	and
    
	skDER & CiDDER: two scalable approaches for microbial dereplication. Microbial
	Genomics. Salamzade, Kottapalli, and Kalan, 2025.
    """, formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-g', '--genomes', nargs='+', help='Genome assembly file paths or paths to containing\ndirectories. Files should be in FASTA format and can be gzipped\n(accepted suffixes are: *.fasta,\n*.fa, *.fas, or *.fna) [Optional].', required=False, default=[])
	parser.add_argument('-t', '--taxa-name', help='Genus or species identifier from GTDB for which to\ndownload genomes for and include in\ndereplication analysis [Optional].', required=False, default=None)
	parser.add_argument('-o', '--output-directory', help='Output directory.', required=True)
	parser.add_argument('-d', '--dereplication-mode', help='Whether to use a "dynamic" (more concise), "greedy" (more\ncomprehensive), or "low_mem_greedy" (currently\nexperimental) approach to selecting representative genomes.\n[Default is "greedy"]', required=False, default="greedy")
	parser.add_argument('-i', '--percent-identity-cutoff', type=float, help="ANI cutoff for dereplication [Default is 99.5].", required=False, default=99.5)
	parser.add_argument('-f', '--aligned-fraction-cutoff', type=float, help="Aligned cutoff threshold for dereplication - only needed by\none genome [Default is 50.0].", required=False, default=50.0)
	parser.add_argument('-a', '--max-af-distance-cutoff', type=float, help="Maximum difference for aligned fraction between a pair to\nautomatically disqualify the genome with a higher\nAF from being a representative [Default is 10.0].", required=False, default=10.0)
	parser.add_argument('-tc', '--test-cutoffs', action='store_true', help="Assess clustering using various pre-selected cutoffs.", required=False, default=False)
	parser.add_argument('-p', '--skani-triangle-parameters', help="Options for skani triangle. Note ANI and AF cutoffs\nare specified separately and the -E parameter is always\nrequested. [Default is \"-s X\", where X is\n10 below the ANI cutoff].", default="-s X", required=False)
	parser.add_argument('-s', '--sanity-check', action='store_true', help="Confirm each FASTA file provided or downloaded is actually\na FASTA file. Makes it slower, but generally\ngood practice.", required=False, default=False)
	parser.add_argument('-fm', '--filter-mge', action='store_true', help="Filter predicted MGE coordinates along genomes before\ndereplication assessment but after N50\ncomputation.", required=False, default=False)
	parser.add_argument('-gd', '--genomad-database', help="If filter-mge is specified, it will by default use PhiSpy;\nhowever, if a database directory for\ngeNomad is provided - it will use that instead\nto predict MGEs.", default=None, required=False)
	parser.add_argument('-n', '--determine-clusters', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes.", required=False, default=False)
	# Every other long option is hyphenated; accept the hyphenated spelling too
	# while keeping the historical underscore form working. The destination is
	# still ``minimal_n50`` (derived from the first long option string).
	parser.add_argument('-mn', '--minimal-n50', '--minimal_n50', type=int, help="Minimal N50 of genomes to be included in dereplication\n[Default is 0].", required=False, default=0)
	parser.add_argument('-l', '--symlink', action='store_true', help="Symlink representative genomes in results subdirectory\ninstead of performing a copy of the files.", required=False, default=False)
	parser.add_argument('-r', '--gtdb-release', help='Which GTDB release to use if -t argument issued [Default is R226].', default="R226")
	parser.add_argument('-auto', '--automate', action='store_true', help="Automatically skip warnings and download genomes from NCBI if -t\nargument issued. Automation off by default to prevent\nunexpected downloading of large genomes [Default\nis False].", required=False, default=False)
	parser.add_argument('-mm', '--max-memory', type=int, help="Max memory in Gigabytes [Default is 0 = unlimited].", required=False, default=0)
	parser.add_argument('-c', '--threads', type=int, help="Number of threads/processes to use [Default is 1].", required=False, default=1)
	parser.add_argument('-v', '--version', action='store_true', help="Report version of skDER.", required=False, default=False)
	args = parser.parse_args()
	return args

def _abort(logObject, msg, with_traceback=True):
	"""
	Report a fatal error on stderr and in the log, then exit with status 1.

	Parameters:
		logObject: logger returned by util.createLoggerObject (only ``.error`` is used).
		msg: error message, without trailing newline.
		with_traceback: also emit ``traceback.format_exc()``; only meaningful when
			called from inside an ``except`` block.

	Raises:
		SystemExit: always, with exit code 1.
	"""
	sys.stderr.write(msg + '\n')
	logObject.error(msg)
	if with_traceback:
		trace = traceback.format_exc()
		sys.stderr.write(trace + '\n')
		logObject.error(trace)
	sys.exit(1)

def _run_dereplication(selection_mode, skani_result_file, all_genomes_listing_file, concat_n50_result_file,
					   skder_result_file, outdir, ani_cutoff, af_cutoff, max_af_distance_cutoff, logObject,
					   mge_proc_to_unproc_mapping, threads, testing=False):
	"""
	Dispatch to the skder dereplication routine matching ``selection_mode``.

	Any exception raised by the selected routine is reported and the program exits
	with status 1. ``testing`` only affects the wording of that error message and
	whether the low-memory workspace is set up with ``automate_flag=True`` (as is
	done when the routine is invoked repeatedly for the cutoff grid).
	"""
	method_labels = {'dynamic': 'dynamic', 'greedy': 'greedy', 'low_mem_greedy': 'low memory greedy'}
	try:
		if selection_mode == 'dynamic':
			skder.dynamicDerep(skani_result_file, concat_n50_result_file, skder_result_file, outdir,
							   ani_cutoff, af_cutoff, max_af_distance_cutoff, logObject,
							   mge_proc_to_unproc_mapping=mge_proc_to_unproc_mapping)
		elif selection_mode == 'greedy':
			skder.greedyDerep(skani_result_file, concat_n50_result_file, skder_result_file, outdir,
							  ani_cutoff, af_cutoff, logObject,
							  mge_proc_to_unproc_mapping=mge_proc_to_unproc_mapping, threads=threads)
		elif selection_mode == 'low_mem_greedy':
			skder_lm_workspace = outdir + 'skDER_iterative_greedy_workspace/'
			if testing:
				util.setupDirectories([skder_lm_workspace], automate_flag=True)
			else:
				util.setupDirectories([skder_lm_workspace])
			skder.lowMemGreedyDerep(all_genomes_listing_file, skder_lm_workspace, concat_n50_result_file, skder_result_file,
									outdir, ani_cutoff, af_cutoff, logObject, mge_proc_to_unproc_mapping=mge_proc_to_unproc_mapping,
									threads=threads)
	except Exception:
		context = ' while testing parameters' if testing else ''
		_abort(logObject, 'Error: issue running skDER %s method%s. Exiting ...' % (method_labels[selection_mode], context))

def skder_main():
	"""
	Command-line entry point for skDER.

	Workflow, in order:
	  1. Report the version and exit if -v/--version is on the command line.
	  2. Parse and validate arguments, set up the output directory and logging.
	  3. Gather genomes (GTDB download and/or user-provided paths) into a listing file.
	  4. Compute N50s, optionally filter by minimal N50 and optionally filter MGEs.
	  5. Run skani triangle (for "greedy" / "dynamic" modes).
	  6. Either evaluate the pre-selected grid of ANI/AF cutoffs and draw a heatmap
	     (--test-cutoffs), or dereplicate with the requested cutoffs, copy/symlink
	     representatives and optionally perform secondary clustering.
	  7. Write COMPLETED.txt in the output directory.

	Raises:
		SystemExit: always; code 0 on success (or when the user declines to continue
		after the low-ANI warning) and 1 on any validation or processing failure.
	"""
	if len(sys.argv) > 1 and ('-v' in sys.argv or '--version' in sys.argv):
		sys.stderr.write('Version of skDER being used is: ' + str(version) + '\n')
		sys.exit(0)

	# Parse arguments
	myargs = create_parser()

	genomes = myargs.genomes
	taxa_name = None
	if myargs.taxa_name:
		taxa_name = myargs.taxa_name.strip('"')
	gtdb_release = myargs.gtdb_release.upper()
	outdir = os.path.abspath(myargs.output_directory) + '/'
	selection_mode = myargs.dereplication_mode.lower()
	percent_identity_cutoff = myargs.percent_identity_cutoff
	aligned_fraction_cutoff = myargs.aligned_fraction_cutoff
	skani_triangle_parameters = myargs.skani_triangle_parameters.strip('"')
	max_af_distance_cutoff = myargs.max_af_distance_cutoff
	test_cutoffs_flag = myargs.test_cutoffs
	threads = myargs.threads
	symlink_flag = myargs.symlink
	determine_clusters_flag = myargs.determine_clusters
	minimal_n50 = myargs.minimal_n50
	sanity_check_flag = myargs.sanity_check
	filter_mge_flag = myargs.filter_mge
	automate_flag = myargs.automate
	genomad_database = myargs.genomad_database
	max_memory = myargs.max_memory

	# Validate arguments with explicit checks (not ``assert``) so validation
	# still happens when Python is run with -O. The logger does not exist yet,
	# so problems are reported on stderr only.
	if minimal_n50 < 0:
		sys.stderr.write('Minimal N50 requested is less than 0. Exiting ...\n')
		sys.exit(1)

	if genomad_database is not None:
		sys.stdout.write('Attempting to use geNomad to predict and cut out MGEs from genome assembly!\n')
		if not os.path.isdir(genomad_database):
			sys.stderr.write('geNomad requested as method but no or invalid path to the geNomad database provided via --genomad-database.\n')
			sys.exit(1)

	if selection_mode not in ('dynamic', 'greedy', 'low_mem_greedy'):
		sys.stderr.write('Selection mode requested not valid, must be either "dynamic", "greedy", or "low_mem_greedy".\n')
		sys.exit(1)

	if gtdb_release not in VALID_GTDB_RELEASES:
		sys.stderr.write('GTDB release requested is not valid. Valid options include: %s\n' % ' '.join(sorted(VALID_GTDB_RELEASES)))
		sys.exit(1)

	if os.path.isdir(outdir):
		sys.stderr.write("Output directory already exists!\n")
	util.setupDirectories([outdir], automate_flag=automate_flag)

	# Create logging object
	log_file = outdir + 'Progress.log'
	logObject = util.createLoggerObject(log_file)
	parameters_file = outdir + 'Command_Issued.txt'
	sys.stdout.write('Running version %s\n' % version)
	sys.stdout.write("Appending command issued for future records to: %s\n" % parameters_file)
	sys.stdout.write("Logging more details at: %s\n" % log_file)
	logObject.info("\nNEW RUN!!!\n**************************************")
	logObject.info('Running version %s' % version)
	logObject.info("Appending command issued for future records to: %s" % parameters_file)

	# Append (rather than overwrite) so repeated runs into the same directory are all recorded.
	with open(parameters_file, 'a+') as parameters_handle:
		parameters_handle.write(' '.join(sys.argv) + '\n')

	# set max memory limit (best effort: a failure is reported but not fatal)
	if max_memory != 0:
		logObject.info("Setting maximum memory usage to: %dGB" % max_memory)
		sys.stdout.write("Setting maximum memory usage to: %dGB\n" % max_memory)
		try:
			util.memory_limit(max_memory)
		except Exception:
			logObject.info("Error setting memory limit")
			sys.stdout.write("Error setting memory limit\n")

	# "-s X" is the placeholder default: screen at 10 below the ANI cutoff (floored at 0).
	if skani_triangle_parameters == "-s X":
		screen_cutoff = max(percent_identity_cutoff - 10.0, 0.0)
		skani_triangle_parameters = "-s " + str(screen_cutoff)
		msg = "Setting ANI screening parameter to %f." % screen_cutoff
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

	if percent_identity_cutoff < 80.0:
		msg = "Warning: ANI cutoff requested is less than 80.0. This might lead to inaccurate estimates - see skani documentation."
		sys.stderr.write(msg + '\n')
		logObject.warning(msg)
		if not automate_flag:
			response = input("Do you wish to continue? (yes/no): ")
			if response.lower() != 'yes':
				sys.stderr.write("Exiting...\n")
				sys.exit(0)

	# Keep the user-specified cutoffs for dereplication / clustering; the AF cutoff
	# handed to skani is relaxed by 20 when secondary clustering is requested in dynamic mode.
	specified_ani_cutoff = percent_identity_cutoff
	specified_af_cutoff = aligned_fraction_cutoff
	if determine_clusters_flag and selection_mode == 'dynamic':
		aligned_fraction_cutoff = max([aligned_fraction_cutoff - 20.0, 0.0])

	# Start from a fresh genome listing file.
	all_genomes_listing_file = outdir + 'All_Genomes_Listing.txt'
	if os.path.isfile(all_genomes_listing_file):
		try:
			os.remove(all_genomes_listing_file)
		except OSError:
			_abort(logObject, 'Error: issues removing the file %s.' % all_genomes_listing_file)

	# process input
	if taxa_name is not None and taxa_name != "None":
		util.downloadGTDBGenomes(taxa_name, gtdb_release, outdir, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, automated_download=automate_flag)
	if len(genomes) > 0:
		util.processInputGenomes(genomes, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag)

	# The listing file holds one genome per line; a missing file counts as zero genomes.
	number_of_genomes = 0
	if os.path.isfile(all_genomes_listing_file):
		try:
			with open(all_genomes_listing_file) as oaglf:
				number_of_genomes = sum(1 for _ in oaglf)
		except OSError:
			_abort(logObject, 'Error: issue reading the genome listing file %s. Exiting ...' % all_genomes_listing_file)
	if number_of_genomes < 2:
		_abort(logObject, 'Error: fewer than 2 genomes downloaded / provided. Exiting ...', with_traceback=False)
	msg = 'Number of valid genomes provided as input: %d' % number_of_genomes
	sys.stdout.write(msg + '\n')
	logObject.info(msg)

	# calculate N50s
	n50s = None
	try:
		n50s = util.determineN50(all_genomes_listing_file, outdir, logObject, threads=threads)
	except Exception:
		_abort(logObject, 'Error: issue determining N50s. Exiting ...')

	if minimal_n50 > 0:
		msg = 'Filtering genomes with an N50 less than %d.' % minimal_n50
		sys.stdout.write(msg + '\n')
		logObject.info(msg)
		n50s_filt = {}
		try:
			# Rewrite the listing file so it only contains genomes passing the N50 filter.
			with open(all_genomes_listing_file, 'w') as aglf_outf:
				for genome in n50s:
					if n50s[genome] >= minimal_n50:
						aglf_outf.write(genome + '\n')
						n50s_filt[genome] = n50s[genome]
		except Exception:
			_abort(logObject, 'Error: issue filtering genomes by N50. Exiting ...')
		n50s = n50s_filt
		number_of_genomes = len(n50s)
		if number_of_genomes < 2:
			_abort(logObject, 'Error: fewer than 2 genomes downloaded / provided / valid after filtering. Exiting ...', with_traceback=False)
		msg = 'Number of valid genomes provided as input after N50 filtering: %d' % number_of_genomes
		sys.stdout.write(msg + '\n')
		logObject.info(msg)

	# filter genomes for MGEs if requested
	mge_proc_to_unproc_mapping = None
	mge_unproc_to_proc_mapping = None
	if filter_mge_flag:
		proc_genomes_listing_file = outdir + 'All_Genomes_Listing_mgecut_Processed.txt'
		try:
			mge_proc_to_unproc_mapping, mge_unproc_to_proc_mapping = util.filterMGEs(all_genomes_listing_file, outdir, proc_genomes_listing_file, logObject, threads=threads, genomad_database=genomad_database)
		except Exception:
			_abort(logObject, 'Error: issue filtering MGEs. Exiting ...')
		# Downstream steps operate on the MGE-processed genomes.
		all_genomes_listing_file = proc_genomes_listing_file

	# create N50 file with names matching file paths
	concat_n50_result_file = outdir + 'Concatenated_N50.txt'
	util.writeN50sToFile(n50s, all_genomes_listing_file, concat_n50_result_file, logObject)

	# run skani triangle (not used by the low_mem_greedy mode)
	skani_result_file = outdir + 'Skani_Triangle_Edge_Output.txt'
	if selection_mode == 'greedy' or selection_mode == 'dynamic':
		skder.runSkaniTriangle(all_genomes_listing_file, skani_result_file, skani_triangle_parameters,
							   aligned_fraction_cutoff, selection_mode, test_cutoffs_flag, logObject, threads=threads)

	if test_cutoffs_flag:
		skder_result_dir = outdir + 'skDER_Result/'
		util.setupDirectories([skder_result_dir])

		# One heatmap row per ANI cutoff, one column per AF cutoff; each cell is the
		# number of lines in the corresponding dereplication result file.
		headers = ['ANI/AF'] + [str(x) for x in PRESELECTED_AF_CUTOFFS]
		heatmap_rows = []
		for ani_cutoff in PRESELECTED_ANI_CUTOFFS:
			row_data = [str(ani_cutoff)]
			for af_cutoff in PRESELECTED_AF_CUTOFFS:
				skder_result_file = skder_result_dir + 'skDER_Results_ANI' + str(ani_cutoff) + '_AF' + str(af_cutoff) + '.txt'
				_run_dereplication(selection_mode, skani_result_file, all_genomes_listing_file, concat_n50_result_file,
								   skder_result_file, outdir, ani_cutoff, af_cutoff, max_af_distance_cutoff, logObject,
								   mge_proc_to_unproc_mapping, threads, testing=True)
				with open(skder_result_file) as osrf:
					row_data.append(sum(1 for _ in osrf))
			heatmap_rows.append(row_data)

		heatmap_pd_df = pd.DataFrame(heatmap_rows, columns=headers)
		heatmap_pd_df.set_index('ANI/AF', inplace=True)

		sys.stdout.write("\nNumber of representative genomes selected:\n")
		sys.stdout.write(str(heatmap_pd_df) + '\n')

		heatmap_pdf = outdir + 'Parameter_Impacts_Overview.pdf'
		sns.set_theme(style='white')
		sns.heatmap(heatmap_pd_df, annot=True, fmt=',d', cmap='Reds', linewidth=0.5).set_title('Number of representative genomes selected')
		plt.ylabel('Average Nucleotide Identity Cutoff')
		plt.xlabel('Aligned Fraction Cutoff')
		plt.savefig(heatmap_pdf, format='pdf')

		# The logger is closed once, at the end of the function.
		msg = '******************\nskDER in testing mode finished!\n******************\nHeatmap with number of representative genomes from different cutoffs can be found at: %s' % heatmap_pdf
		logObject.info(msg)
		sys.stdout.write(msg + '\n')
	else:
		skder_result_file = outdir + 'skDER_Results.txt'
		_run_dereplication(selection_mode, skani_result_file, all_genomes_listing_file, concat_n50_result_file,
						   skder_result_file, outdir, specified_ani_cutoff, specified_af_cutoff, max_af_distance_cutoff,
						   logObject, mge_proc_to_unproc_mapping, threads)

		try:
			util.copyRepresentativeGenomesToDirectory(skder_result_file, outdir, logObject, symlink_flag=symlink_flag)
		except Exception:
			_abort(logObject, 'Error: issue copying representative genomes to final results directory. Exiting ...')

		# assign non-representative genomes to representative genomes (i.e. determine clusters) if requested
		if determine_clusters_flag:
			skder_cluster_result_file = outdir + 'skDER_Clustering.txt'
			try:
				if selection_mode == 'low_mem_greedy':
					# No skani triangle output exists in this mode, so compute
					# distances separately before clustering.
					cluster_dir = outdir + 'skani_dist_Workspace/'
					util.setupDirectories([cluster_dir])
					# NOTE(review): this is a bare file name rather than a path under
					# cluster_dir/outdir - confirm how skder.runSkaniDist and
					# skder.determineClusters resolve it.
					skani_result_file = 'Skani_Dist_Output.txt'
					skder.runSkaniDist(cluster_dir, skder_result_file, all_genomes_listing_file, skani_result_file,
									   skani_triangle_parameters, aligned_fraction_cutoff, selection_mode, test_cutoffs_flag,
									   logObject, threads=threads)

				skder.determineClusters(skder_result_file, skani_result_file, mge_unproc_to_proc_mapping,
										mge_proc_to_unproc_mapping, skder_cluster_result_file, specified_af_cutoff,
										specified_ani_cutoff)
			except Exception:
				_abort(logObject, 'Error: issue performing secondary clustering of non-representative genomes to representative genomes. Exiting ...')

		msg = '******************\nskDER finished!\n******************\nDirectory with representative genomes can be found at: %s' % outdir
		logObject.info(msg)
		sys.stdout.write(msg + '\n')

	# close logging object and create completion file for workflows to check
	util.closeLoggerObject(logObject)
	completion_file = outdir + 'COMPLETED.txt'
	try:
		# Written directly instead of through a shell so output paths containing
		# spaces or shell metacharacters are handled correctly.
		with open(completion_file, 'w') as completion_handle:
			completion_handle.write('skDER completed successfully!\n')
	except OSError:
		sys.stderr.write('Error: unable to write completion file %s.\n' % completion_file)
		sys.stderr.write(traceback.format_exc() + '\n')
		sys.exit(1)
	sys.exit(0)

if __name__ == '__main__':
    # Script entry point: run skDER and treat Ctrl-C as a clean interruption.
    try:
        skder_main()
    except KeyboardInterrupt:
        print('Interrupted')
        # 130 is the conventional exit status for termination by SIGINT.
        # os._exit terminates the process immediately, bypassing normal
        # interpreter shutdown.
        os._exit(130)
