#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from optparse import OptionParser, OptionGroup
import sys
import os
from collections import defaultdict
from bisect import bisect_right
import gzip

__author__ = "Tobias Marschall"

usage = """%prog [options] <deletions.vcf(.gz)>

Reads VCF with known deletions and writes a file in format to be used for recalibration
three columns (<chromosome> <start> <end> <type>), where coordinates are 0-based and inclusive."""

# Characters accepted in an allele string: A, C, G, T and N in either case.
allowed_dna_chars = set('ACGTNacgtn')

def valid_dna_string(s):
	"""Return True if every character of s is in allowed_dna_chars.

	An empty string contains no offending character and is therefore valid.
	"""
	return all(base in allowed_dna_chars for base in s)

def main():
	"""Command-line entry point: print the deletions found in a VCF file.

	Expects exactly one positional argument, the path of a VCF file; a path
	ending in '.gz' is read as gzip-compressed text. If the argument count is
	wrong, the help text is printed and the process exits with status 1.

	Two kinds of records are reported as deletions:
	  * records whose ALT is '.' or '<DEL>' (or whose REF is '.') and whose
	    INFO column carries SVTYPE=DEL together with SVLEN; the length is the
	    absolute value of SVLEN;
	  * records with plain nucleotide REF/ALT strings where REF is longer than
	    one character and ALT is exactly one character; the length is
	    len(REF) - 1.
	All other records are skipped silently.

	For each reported deletion one line is printed to standard output:
	chromosome, start, end, the literal 'DEL' and the ESUPPORT value from INFO
	(0.0 if absent), separated by spaces. With -g, a final column lists the
	genotype of every sample, comma-separated.

	Options:
	  -m  skip deletions longer than the given length
	  -i  also report deletions flagged IMPRECISE in INFO
	  -g  append the genotype column (requires FORMAT to start with GT)

	Raises:
	  AssertionError: with -g, if the FORMAT column does not start with 'GT'.
	  IndexError / ValueError: on records with missing columns or non-numeric
	    POS, SVLEN or ESUPPORT values.
	"""
	parser = OptionParser(usage=usage)
	parser.add_option("-m", action="store", dest="max_length", default=None, type=int,
			help='Maximal length to be considered.')
	parser.add_option("-i", action="store_true", dest="include_imprecise", default=False,
			help='Include deletions marked as IMPRECISE.')
	parser.add_option("-g", action="store_true", dest="print_genotypes", default=False,
			help='Print list of genotypes as comma-separated list in last column.')

	(options, args) = parser.parse_args()
	if len(args) != 1:
		parser.print_help()
		sys.exit(1)

	input_filename = args[0]
	if input_filename.endswith('.gz'):
		# gzip.open() defaults to binary mode, which would yield bytes lines and
		# break the str comparisons below; 'rt' makes it behave like open().
		input_file = gzip.open(input_filename, 'rt')
	else:
		input_file = open(input_filename)
	# The context manager guarantees the file is closed, also on errors.
	with input_file:
		for raw_line in input_file:
			line = raw_line.strip()
			# Skip blank lines, '##' meta-information and the '#CHROM' header.
			if not line or line.startswith('#'):
				continue
			fields = line.split()
			chromosome = fields[0]
			ref = fields[3]
			alt = fields[4]
			info_entries = fields[7].strip(';').split(';')
			is_imprecise = 'IMPRECISE' in info_entries
			if options.print_genotypes:
				assert fields[8].split(':')[0] == 'GT', 'fields[8] = %s'%fields[8]
				# GT is the first FORMAT key, so it is the first ':'-separated
				# item of every sample column.
				genotype_list = [x.split(':')[0] for x in fields[9:]]
			# Split on the first '=' only so values that contain '=' are kept intact.
			info_fields = dict(s.split('=', 1) for s in info_entries if '=' in s)
			if (alt == '.') or (ref == '.') or (alt == '<DEL>'):
				# No explicit alleles: rely on the SVTYPE/SVLEN annotations.
				if info_fields.get('SVTYPE') != 'DEL':
					continue
				if 'SVLEN' not in info_fields:
					continue
				svlen = abs(int(info_fields['SVLEN']))
			else:
				if (not valid_dna_string(ref)) or (not valid_dna_string(alt)):
					continue
				# Only REF longer than one base with a single-base ALT counts as
				# a deletion; the first REF base is not part of the deleted sequence.
				if not ((len(ref) > 1) and (len(alt) == 1)):
					continue
				svlen = len(ref) - 1
			# POS is used unchanged as the start coordinate; assuming the usual
			# VCF convention that the 1-based POS names the base just before the
			# deletion, this equals the 0-based index of the first deleted base.
			coord1 = int(fields[1])
			coord2 = coord1 + svlen
			expected_support = float(info_fields.get('ESUPPORT', '0.0'))
			if (options.max_length is not None) and (svlen > options.max_length):
				continue
			if is_imprecise and not options.include_imprecise:
				continue
			# coord2 is exclusive; print the inclusive end coordinate.
			if options.print_genotypes:
				print(chromosome, coord1, coord2-1, 'DEL', expected_support, ','.join(genotype_list))
			else:
				print(chromosome, coord1, coord2-1, 'DEL', expected_support)

# Run the converter only when this file is executed as a script; whatever
# main() returns is used as the process exit status.
if __name__ == '__main__':
	raise SystemExit(main())
