#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
It extracts the fusion genes generated by 'label_fusion_genes.py'.



Author: Daniel Nicorici, Daniel.Nicorici@gmail.com

Copyright (c) 2009-2022 Daniel Nicorici

This file is part of FusionCatcher.

FusionCatcher is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

FusionCatcher is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with FusionCatcher (see file 'COPYING.txt').  If not, see
<http://www.gnu.org/licenses/>.

By default, FusionCatcher is running BLAT aligner
<http://users.soe.ucsc.edu/~kent/src/> but it offers also the option to disable
all its scripts which make use of BLAT aligner if you choose explicitly to do so.
BLAT's license does not allow to be used for commercial activities. If BLAT
license does not allow to be used in your case then you may still use
FusionCatcher by forcing not use the BLAT aligner by specifying the option
'--skip-blat'. For more information regarding BLAT please see its license.

Please, note that FusionCatcher does not require BLAT in order to find
candidate fusion genes!

This file is not running/executing/using BLAT.
"""
import sys
import os
import optparse

def parse_labels(text):
    """
    Split a comma separated list of labels into a set.

    Empty entries (produced by an empty string, a trailing comma or two
    consecutive commas) are dropped, such that an unlabeled fusion never
    matches a label given on the command line.

    text -- the comma separated labels (may be empty or None)

    Returns the set of non-empty labels.
    """
    if not text:
        return set()
    return set(label for label in text.strip().split(',') if label)


def read_table(path):
    """
    Read a tab delimited text file.

    path -- the path of the file to be read

    Returns a list of rows (each row is a list of strings) where the empty
    lines are skipped. The header, if any, is the first row.
    """
    # 'with' guarantees that the file is closed even if the parsing fails
    with open(path, 'r') as handle:
        return [line.rstrip('\r\n').split('\t')
                for line in handle
                if line.rstrip('\r\n')]


def select_fusions(rows,
                   threshold_pairs,
                   threshold_pairs_known,
                   threshold_count,
                   skip_labels,
                   super_labels,
                   further_labels):
    """
    Decide which candidate fusion genes are kept for further analysis.

    rows -- the rows of the input file without the header, where column 3
            holds the number of supporting paired-end reads and column 6
            (which may be missing) holds the comma separated labels
    threshold_pairs -- minimum count of supporting pairs for a candidate
    threshold_pairs_known -- minimum count of supporting pairs for a
            candidate which carries one of the 'super_labels'
    threshold_count -- maximum number of candidates selected by thresholds
    skip_labels -- set of labels which disqualify a candidate
    super_labels -- set of labels which override 'skip_labels'
    further_labels -- set of labels which always select a candidate

    Returns the tuple (ids, lines) where 'ids' is the list of
    'gene_1<TAB>gene_2<NEWLINE>' of the selected candidates and 'lines' is
    the list of all the input rows with the analysis status appended
    ('further_analysis' or 'skipped').
    """
    ids = []
    lines = []
    selected = 0
    for row in rows:
        counts = int(row[2])
        if len(row) < 6:
            # the labels column is missing when a fusion has no labels and the
            # line has no trailing tab; pad it so that the status column
            # stays aligned with the header
            row = row + [''] * (6 - len(row))
        labels = parse_labels(row[5])

        is_further = bool(labels.intersection(further_labels))
        is_known = bool(labels.intersection(super_labels))
        is_skipped = bool(labels.intersection(skip_labels))
        # the cap counts every candidate selected so far, including the ones
        # selected because of 'further_labels'
        below_cap = selected < threshold_count

        keep = (
            is_further or
            (below_cap and
             counts >= threshold_pairs and
             (is_known or not is_skipped)) or
            (below_cap and
             counts >= threshold_pairs_known and
             is_known)
        )

        if keep:
            ids.append('\t'.join(row[:2]) + '\n')
            lines.append('\t'.join(row + ['further_analysis']) + '\n')
            selected = selected + 1
        else:
            lines.append('\t'.join(row + ['skipped']) + '\n')
    return (ids, lines)


def collect_read_ids(rows, selected_ids):
    """
    Collect the ids of the reads which support the selected fusion genes.

    rows -- the rows of the fusion reads file without the header, where
            columns 3 and 4 hold the ids of the genes and column 6 holds the
            comma separated ids of the supporting pairs of reads
    selected_ids -- the 'gene_1<TAB>gene_2<NEWLINE>' of the selected fusions

    Returns the sorted list of unique lines '<read>/1<NEWLINE>' and
    '<read>/2<NEWLINE>' for every supporting pair of reads.
    """
    wanted = set(selected_ids)
    reads = set()
    for row in rows:
        if len(row) < 6:
            # no column with reads => nothing to collect from this row
            continue
        if "%s\t%s\n" % (row[2], row[3]) in wanted:
            for read in row[5].split(','):
                if read:  # an empty column would give the bogus ids '/1' and '/2'
                    reads.add("%s/1\n" % (read,))
                    reads.add("%s/2\n" % (read,))
    return sorted(reads)


def main():
    """
    Parse the command line, select the candidate fusion genes and write the
    requested output files.
    """
    #command line parsing

    usage="%prog [options]"
    description="""It extracts the fusion genes generated by 'label_fusion_genes.py' for further analysis."""
    version="%prog 0.12 beta"

    parser=optparse.OptionParser(usage=usage,description=description,version=version)

    parser.add_option("--input",
                      action="store",
                      type="string",
                      dest="input_filename",
                      help="""The input file in text tab delimited format containing the fusion genes candidates produced by 'label_fusion_genes.py'. """)

    parser.add_option("--input_fusion_reads",
                      action="store",
                      type="string",
                      dest="input_fusion_reads_filename",
                      help="""The input file in text tab delimited format containing the fusion genes and supporting reads produced by find_fusion_genes_map.py. """)

    parser.add_option("--threshold_pairs",
                      action="store",
                      type="int",
                      dest="threshold_pairs",
                      default=3,
                      help="""The threshold for the number of paired-end reads necessary for considering the fusion gene candidate for further analysis. All gene fusions candidate with a larger and equal number of supporting paired-end reads are written in the output file. Default value is %default.""")

    parser.add_option("--threshold_pairs_known",
                      action="store",
                      type="int",
                      dest="threshold_pairs_known",
                      default=0,
                      help="""The threshold for the number of paired-end reads necessary for considering the fusion gene candidate, which was labeled using '--allowed_labels', for further analysis. All gene fusions candidate with a larger and equal number of supporting paired-end reads are written in the output file. Default value is %default.""")


    parser.add_option("--threshold_count",
                      action="store",
                      type="int",
                      dest="threshold_count",
                      default=5000,
                      help="""If --threshold_pairs selects more fusions than this threshold than this threshold will have priority (only the first top N will be selected). Default value is %default.""")


    parser.add_option("--skip_labels",
                      action="store",
                      type="string",
                      dest="skip_labels",
                      default="paralog",
                      help="""A fusion gene candidate which has been labeled in the input file with one of the following labels is skipped and not written in the output file. Default value is '%default'.""")

    parser.add_option("--allowed_labels",
                      action="store",
                      type="string",
                      dest="allowed_labels",
                      default="known",
                      help="""A fusion gene candidate which has been labeled in the input file with one of the following labels is considered for further analysis even if it has labels which disqualify it (from the --skip_labels). Default value is '%default'.""")

    parser.add_option("--further_labels",
                      action="store",
                      type="string",
                      dest="further_labels",
                      default="further_fusion",
                      help="""A fusion gene candidate which has been labeled in the input file with one of the following labels is considered for further analysis even if it has labels which disqualify it (from the --skip_labels) or is below the threshold. Default value is '%default'.""")



    parser.add_option("--output",
                      action="store",
                      type="string",
                      dest="output_filename",
                      help="""The text tab separated file containing the fusion genes candidates which meet the conditions imposed by options '--threshold_pairs' and '--skip_labels' are written.""")

    parser.add_option("--output_fusion",
                      action="store",
                      type="string",
                      dest="output_fusion_filename",
                      help="""The same file as the input file where and extra column is added for the genes which have been selected for further analysis.""")

    parser.add_option("--output_fusion_reads",
                      action="store",
                      type="string",
                      dest="output_fusion_reads_filename",
                      help="""The file containing the supporting reads which supports the gene considered for further analysis.""")


    (options,args)=parser.parse_args()

    # validate options
    if not (options.input_filename and
            options.output_filename
            ):
        parser.print_help()
        # parser.error() prints the message and exits (status 2) by itself
        parser.error("One of the options has not been specified.")

    skip_labels = parse_labels(options.skip_labels)
    super_labels = parse_labels(options.allowed_labels)
    further_labels = parse_labels(options.further_labels)

    # 0 (the default) means "use the same threshold as for the other fusions"
    if not options.threshold_pairs_known:
        options.threshold_pairs_known = options.threshold_pairs

    print("Reading... %s" % (options.input_filename,))
    # Assume format (tab separated):
    #Fusion_gene_1  Fusion_gene_2  Count_paired-end_reads  Fusion_gene_symbol_1  Fusion_gene_symbol_2  Information_fusion_genes
    #ENSG00000126351  ENSG00000235300  78  THRA    no_protein_product
    #ENSG00000132142  ENSG00000141750  56  ACACA  STAC2
    #ENSG00000187653  ENSG00000205542  16  TMSL3  TMSB4X  known_paralogs,similar_reads
    #...
    data = read_table(options.input_filename)
    if not data:
        # exit status 1, as for the uncaught exception raised previously here
        sys.exit("ERROR: The input file '%s' is empty (no header found)!" % (options.input_filename,))
    header = data.pop(0) # remove header

    print("Processing...")
    # col 3 holds the number of paired-ends reads
    # col 6 contains the labels
    (data_ids, data_status) = select_fusions(data,
                                             options.threshold_pairs,
                                             options.threshold_pairs_known,
                                             options.threshold_count,
                                             skip_labels,
                                             super_labels,
                                             further_labels)
    data_all = ['\t'.join(header+['Analysis_status'])+'\n'] + data_status

    print("Writing... %s" % (options.output_filename,))
    with open(options.output_filename,'w') as fout:
        fout.writelines(data_ids)

    if options.output_fusion_filename:
        print("Writing... %s" % (options.output_fusion_filename,))
        with open(options.output_fusion_filename,'w') as fout:
            fout.writelines(data_all)

    if options.output_fusion_reads_filename and options.input_fusion_reads_filename:
        print("Writing... %s" % (options.output_fusion_reads_filename,))
        # [1:] removes the header
        reads = collect_read_ids(read_table(options.input_fusion_reads_filename)[1:],
                                 data_ids)
        with open(options.output_fusion_reads_filename,'w') as fout:
            fout.writelines(reads)

    print("The end.")


if __name__ == '__main__':
    main()
