#!/opt/mambaforge/envs/bioconda/conda-bld/zol_1763327478657/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_/bin/python

"""
Program: cagecatProcess.py
Author: Rauf Salamzade
Kalan Lab
UW Madison, Department of Medical Microbiology and Immunology
"""

# BSD 3-Clause License
#
# Copyright (c) 2023-2025, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import sys
import argparse
from Bio import SeqIO
from zol import util

def create_parser():
	"""
	Build the command-line interface and parse sys.argv.

	Both options are required:
	  -i / --extract_clusters_zip : zip archive to process.
	  -o / --output_dir           : output directory.

	Returns:
		argparse.Namespace with attributes `extract_clusters_zip` and `output_dir`.

	Note: argparse itself exits the process (SystemExit) on -h/--help or when a
	required option is missing.
	"""
	# RawTextHelpFormatter keeps the line breaks of the description as written.
	parser = argparse.ArgumentParser(description="""               
	Program: cagecatProcess.py
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	
    CAGECAT (https://cagecat.bioinformatics.nl/) allows identification of homologous gene clusters to 
	a query set of co-located proteins. While the set of clusters should be directly compatible with 
	processing using zol, CAGECAT primarily uses protein_id features in gene cluster GenBank files as 
	protein identifiers whereas zol primarily expects locus_tag. This can be overcome using the "-r"
	option in zol which will create arbitrary locus tags with GenBank files to allow for 
	smooth processing in zol. However, if you wish to retain the protein_id information, this 
	script will take in as input the zip folder following gene cluster extraction from CAGECAT
	and create a directory of similar GenBank files with locus_tag features harboring the same value 
	as the protein_id features in the original GenBank files.
								  
	This is not designed for fungi/eukaryotes currently because CAGECAT CDS feature coordinates 
	do not appear to contain exon info needed for zol.
	""", formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-i', '--extract_clusters_zip',
						help='Path to the zip file of extracted gene clusters downloaded from CAGECAT.',
						required=True)
	parser.add_argument('-o', '--output_dir', help='Path to output directory.', required=True)
	args = parser.parse_args()
	return args

def cagecatProcess():
	"""
	Void function which runs primary workflow for program.

	Reads its inputs from the command line via create_parser(), then:
	  1. Creates the output directory (exits with status 1 if it already
	     exists), sets up Progress.log and appends the issued command to
	     Command_Issued.txt.
	  2. Unzips the CAGECAT archive into <output_dir>/CAGECAT_Results/.
	  3. For every '.gbk' file in CAGECAT_Results/results/, writes a copy to
	     <output_dir>/Processed_GenBank_Files/ in which each CDS feature has a
	     locus_tag qualifier set to its first protein_id value, or to
	     'CDS_<n>' (n = running CDS count within the file) when no protein_id
	     is present.

	Raises:
		RuntimeError: if a GenBank file cannot be read, parsed or written; the
			original exception is logged and attached as the cause.
	"""

	# PARSE INPUTS
	myargs = create_parser()

	extract_clusters_zip_file = os.path.abspath(myargs.extract_clusters_zip)
	outdir = os.path.abspath(myargs.output_dir) + '/'

	if os.path.isdir(outdir):
		sys.stderr.write('Note, output directory exists already! Exiting ...\n')
		sys.exit(1)
	# Create the directory directly rather than through a shell `mkdir`, so
	# paths with spaces or shell metacharacters work and failures raise
	# instead of being silently ignored.
	os.makedirs(outdir)

	# create logging object
	log_file = outdir + 'Progress.log'
	log_object = util.create_logger_object(log_file)

	version = util.get_version()
	sys.stdout.write(f'Running version: {version}\n')
	log_object.info(f"Running version: {version}")

	parameters_file = outdir + 'Command_Issued.txt'
	sys.stdout.write(f"Appending command issued for future records to: {parameters_file}\n")
	sys.stdout.write(f"Logging more details at: {log_file}\n")
	log_object.info("\nNEW RUN!!!\n**************************************")
	log_object.info(f'Running version {version}')
	log_object.info(f"Appending command issued for future records to: {parameters_file}")

	with open(parameters_file, 'a+') as parameters_handle:
		parameters_handle.write(' '.join(sys.argv) + '\n')

	# START WORKFLOW

	# Step 1: Uncompress zip folder into output directory
	msg = "--------------------\nStep 1\n--------------------\nUncompressing zipped cluster extraction from CAGECAT."
	sys.stdout.write(msg + "\n")
	log_object.info(msg)

	cagecat_dir = outdir + 'CAGECAT_Results/'
	cagecat_res_dir = cagecat_dir + 'results/'
	extraction_cmd = ['unzip', extract_clusters_zip_file, '-d', cagecat_dir]
	# NOTE(review): check_directories presumably makes the helper verify that
	# the 'results/' sub-directory exists after unzip - confirm in zol.util.
	util.run_cmd_via_subprocess(extraction_cmd, log_object=log_object,
								check_directories = [cagecat_res_dir])

	# Step 2: Process GenBank files and produce final versions with locus_tags
	msg = "--------------------\nStep 2\n--------------------\nProcessing GenBank files and producing final versions."
	sys.stdout.write(msg + "\n")
	log_object.info(msg)

	final_genbank_dir = outdir + 'Processed_GenBank_Files/'
	if not os.path.isdir(final_genbank_dir):
		os.mkdir(final_genbank_dir)

	for f in os.listdir(cagecat_res_dir):
		if not f.endswith('.gbk'):
			continue
		orig_gbk = cagecat_res_dir + f
		mod_gbk = final_genbank_dir + f
		# Fallback identifiers are numbered across all records of one file.
		cds_count = 1
		try:
			# Both handles are context-managed so they are closed even when
			# parsing or writing fails part-way through.
			with open(orig_gbk) as ocrf, open(mod_gbk, 'w') as mod_gbk_handle:
				for rec in SeqIO.parse(ocrf, 'genbank'):
					for feat in rec.features:
						if feat.type != 'CDS':
							continue
						protein_ids = feat.qualifiers.get('protein_id')
						if protein_ids:
							protein_id = protein_ids[0]
						else:
							# No protein_id on this CDS: use a positional name.
							protein_id = 'CDS_' + str(cds_count)
						feat.qualifiers['locus_tag'] = protein_id
						cds_count += 1
					SeqIO.write(rec, mod_gbk_handle, 'genbank')
		except Exception as e:
			log_object.error(f"Issue with processing GenBank file: {orig_gbk}")
			log_object.error(e)
			raise RuntimeError(e) from e

	# DONE!
	msg = "--------------------\nDONE!\n--------------------\n"
	msg += f"Directory of processed GenBank files from CAGECAT can be found at: {final_genbank_dir}\n"
	sys.stdout.write(msg + "\n")
	log_object.info(msg)

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == '__main__':
	cagecatProcess()

