#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2012 Tobias Marschall
# 
# This file is part of CLEVER.
# 
# CLEVER is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CLEVER is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.

from optparse import OptionParser
import io
import sys
import os
import gzip
import subprocess

__author__ = "Tobias Marschall"

# Usage/help text handed to OptionParser in main(); optparse replaces %prog
# with the name of the running script.
usage = """%prog [options] <output-prefix>

Reads alignment priors (e.g. as generated by bam-to-alignment-priors) from 
stdin and writes them to a separate file for each chromosome named
<output-prefix>.<chromosome>.aln-priors."""

def read_sample_dict(filename):
	"""Read the read group to sample assignment from the header of a BAM file.

	Args:
		filename: Path of the BAM file whose header is inspected.

	Returns:
		A tuple (result, samples): result maps each read group ID (the 'ID'
		entry of an 'RG' header record) to its sample name (the 'SM' entry),
		and samples is the set of all sample names that occur.

	Raises:
		KeyError: If the header has no 'RG' records or a record lacks 'ID'
			or 'SM'.
	"""
	# Imported lazily so that pysam is only required when splitting by sample.
	import pysam
	bam_file = pysam.Samfile(filename)
	try:
		# Extract everything needed while the file is still open, so nothing
		# depends on the header staying usable after close().
		read_groups = bam_file.header['RG']
		result = {}
		samples = set()
		for read_group in read_groups:
			sample = read_group['SM']
			result[read_group['ID']] = sample
			samples.add(sample)
	finally:
		# The original never closed the handle and leaked the descriptor.
		bam_file.close()
	return result, samples

def main():
	"""Split alignment priors read from stdin into one file per chromosome.

	Options and the single positional argument <output-prefix> are parsed
	from sys.argv. Every input line must consist of 14 or 15 whitespace
	separated fields; the chromosome name is taken from field 3 (14 fields)
	or field 4 (15 fields) and must equal the field five positions later.
	Lines are written unchanged to <output-prefix>.<file_id>.aln-priors
	(with a .gz suffix when -z is given), where file_id is the chromosome
	or, when splitting by sample (-s), <chromosome>.<sample>.

	If -c is given, the files for all requested chromosomes are created up
	front and lines belonging to any other chromosome are skipped.

	Raises:
		SystemExit: With status 1 (after printing the help text) if the
			number of positional arguments is not one or stdin is a
			terminal.
		ValueError: If an input line has an unexpected number of fields or
			its two chromosome fields disagree.
		KeyError: If splitting by sample and a line names a read group that
			is not present in the BAM header.
	"""
	# Local import: keeps this function self-contained without having to
	# touch the module-level import block.
	import shutil
	parser = OptionParser(usage=usage)
	parser.add_option("-z", action="store_true", dest="zip_output", default=False,
		help="GZIP output files")
	parser.add_option("-c", action="store", dest="chromosomes", default=None,
		help="Only extracts given chromosomes. Expects comma-separated list.")
	parser.add_option("-s", action="store", dest="samplewise", default=None,
		help="Read header of given BAM file and priors split by sample (encoded in read groups).")
	parser.add_option("-g", action="store_true", dest="gzip", default=False,
		help="Use gzip instead of python zip module. Can be faster, but will spawn one gzip process for every chromosome/sample, which can be a lot.")
	(options, args) = parser.parse_args()
	if (len(args) != 1) or os.isatty(0):
		parser.print_help()
		sys.exit(1)
	prefix = args[0]
	# External compressor to pipe through, or None to use the gzip module.
	# pigz is deliberately not used: running one instance per output file
	# spawns too many processes, whereas plain gzip is fine.
	zip_executable = None
	if options.zip_output and options.gzip and shutil.which('gzip') is not None:
		zip_executable = 'gzip'
	split_by_sample = options.samplewise is not None
	if split_by_sample:
		readgroup_dict, samples = read_sample_dict(options.samplewise)
	# Maps file_id to a writable binary stream.
	output_files = {}
	# gzip child processes that have to be waited for before returning.
	gzip_processes = []

	def open_file(file_id):
		"""Open the output stream for file_id and register it in output_files.

		Normally, file_id corresponds to the chromosome name. If splitting
		samplewise, it corresponds to <chromosome>.<sample>.
		"""
		if not options.zip_output:
			# Binary mode: the main loop writes UTF-8 encoded bytes to every
			# kind of output stream.
			output_files[file_id] = open('%s.%s.aln-priors' % (prefix, file_id), 'wb')
			return
		filename = '%s.%s.aln-priors.gz' % (prefix, file_id)
		if zip_executable is None:
			output_files[file_id] = gzip.open(filename, 'wb')
			return
		# The child inherits its own copy of the descriptor, so the parent's
		# handle can be closed as soon as the process has been started.
		with open(filename, 'wb') as target:
			process = subprocess.Popen([zip_executable], stdin=subprocess.PIPE, stdout=target)
		gzip_processes.append(process)
		output_files[file_id] = process.stdin

	try:
		restrict_chromosomes = options.chromosomes is not None
		if restrict_chromosomes:
			for chromosome in options.chromosomes.strip().split(','):
				if split_by_sample:
					for sample in samples:
						open_file(chromosome + '.' + sample)
				else:
					open_file(chromosome)
		lines = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
		for line_number, line in enumerate(lines, 1):
			fields = line.split()
			# Explicit checks instead of assert statements, which would be
			# stripped when running with -O.
			if len(fields) == 14:
				chromosome_column = 3
			elif len(fields) == 15:
				chromosome_column = 4
			else:
				raise ValueError('Line %d: expected 14 or 15 fields, found %d' % (line_number, len(fields)))
			chromosome = fields[chromosome_column]
			if chromosome != fields[chromosome_column + 5]:
				raise ValueError('Line %d: chromosome fields differ ("%s" vs. "%s")' % (line_number, chromosome, fields[chromosome_column + 5]))
			if split_by_sample:
				file_id = chromosome + '.' + readgroup_dict[fields[2]]
			else:
				file_id = chromosome
			if file_id not in output_files:
				if restrict_chromosomes:
					# Not among the requested chromosomes.
					continue
				open_file(file_id)
			output_files[file_id].write(line.encode('utf-8'))
	finally:
		# Flush and close everything (this also writes the gzip trailers and
		# signals end of input to the gzip children), then wait for the
		# children so that all output is complete when main() returns.
		for output_file in output_files.values():
			output_file.close()
		for process in gzip_processes:
			process.wait()

# Run main() only when executed as a script; its return value is passed to
# sys.exit() as the process exit status.
if __name__ == '__main__':
	sys.exit(main())
