#!/opt/conda/conda-bld/cistrome_beta_1764199073615/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold/bin/python

#########################
####     Su Wang      ###
####    2-27-2015     ###
####   version 1.0.7   ###
#########################

"""Script Description:

BETA-Binding and Expression Targets Analysis: Use the ChIP-x data and Microarray/RNAseq data to predict a factor's targets

Part1:
use binding data to calc the score of each gene to be regulated by factor.
1. For each refseq gene in genome, input a distance (for example 100kb), then I get the peak center within 100kb from gene TSS. 
2. filter the peaks by p-value < 1e-5 from MACS, and only get top 10,000 peaks if it's more than 10,000
3. Then calculate a sum of 'Score' for each gene use this formula: 
  Sg = lambda ldx: sum([math.exp(-0.5-4*t) for t in ldx])
4. output is in bed format. the 5th column is score.

The input bed file should name chromosomes like 'chr1' instead of 'chrI'.

Part2:
use differential expression data to rank each gene by the t-score from the limma result
1. the input expression file needs to be in the standard limma result format
2. Rank each gene by the t-score calculated by limma; positive values represent up-regulated genes and negative values represent down-regulated genes.
3. Multiply the rank of the gene score by the rank of the gene expression t-score to get the rank product
4. Do the permutation to get the FDR, to decide the confidence of the target genes we found
5. use the regulatory potential score to do up- and down-regulated gene set association analysis, compared to the genes with no differential expression as a background

Part3:
Motif analysis on target regions

Update02/27/2015: BETA can do the CR/TF function prediction by both regulatory potential and distance
This code is free software; you can redistribute it and/or modify it.

@version: $1.0.7$
@author: Su Wang
@contact: wangsu0623@gmail.com
"""


import sys
import argparse as ap

def prepare_argparser():
    """Build the top-level BETA argument parser.

    The parser gets a ``--version`` flag and a set of sub-commands whose
    chosen name is stored in the ``subcommand_name`` attribute of the
    parsed namespace. The sub-commands themselves are registered by
    ``add_basic_parser``, ``add_plus_parser`` and ``add_minus_parser``,
    in that order. New sub-commands should be hooked in here first.

    Returns the configured ``argparse.ArgumentParser``.
    """
    beta_version = '1.0.7'

    top_parser = ap.ArgumentParser(
        description="BETA --- Binding Expression Target Analysis",
        epilog="For command line options of each command, type: %(prog)s COMMAND -h",
    )
    top_parser.add_argument("--version", action="version",
                            version="%(prog)s " + beta_version)
    command_parsers = top_parser.add_subparsers(help="sub-command help",
                                                dest='subcommand_name')

    # Registration order: 'basic' beta, 'super' (plus) beta, 'noexpre' (minus) beta.
    for register in (add_basic_parser, add_plus_parser, add_minus_parser):
        register(command_parsers)

    return top_parser

def add_basic_parser(subparsers):
    """Register the 'basic' sub-command (direct target prediction).

    subparsers -- the object returned by ``ArgumentParser.add_subparsers()``;
                  a parser named "basic" is added to it.

    ``-p``, ``-e`` and ``-k`` are required; every other option is optional
    and carries the default documented in its help text. Returns None.
    """
    # Adjacent string literals are used instead of backslash continuations so
    # that source indentation is not embedded in the description text. The
    # example is a command line that this parser actually accepts.
    basicparser = subparsers.add_parser(
        "basic",
        help="Main BETA Function: Transcription factors direct targets prediction.",
        description="BETA-basic: Predict Direct targets of TF and the active/repressive function prediction.\n"
                    "EXAMPLE: BETA basic -p 2723_peaks.bed -e gene_exp.diff -k CUF -g hg19 -n test -o basic")

    # Main arguments
    basicparser.add_argument("-p", "--peakfile", dest="peakfile", type=str, required=True,
                             help="The bed format of peaks binding  sites. (BETA support 3 or 5 columns bed format, CHROM, START, END (NAME, SCORE))")
    basicparser.add_argument("-e", "--diff_expr", dest="exprefile", type=str, required=True,
                             help="The differential expression file get from limma for MicroArray data and cuffdiff for RNAseq data")
    basicparser.add_argument("-k", "--kind", dest="kind", choices=("LIM", "CUF", "BSF", "O"), required=True,
                             help="the kind of your expression file,this is required,it can be LIM, CUF, BSF, O. LIM for LIMMA standard format. CUF for CUFDIFF standard format, BSF for BETA specific format and O for other formats, if is 'O', columns infor required via --info")
    basicparser.add_argument("-g", "--genome", dest="genome", choices=("hg38", "hg19", "hg18", "mm10", "mm9"),
                             help="Specify your species, hg38,hg19,hg18,mm10,mm9. for other genome assembily versions of human and mouse or other species, ignore this paramter")
    basicparser.add_argument("--gname2", dest="gname2", action="store_true", default=False,
                             help="If this switch is on, gene or transcript IDs in files given through -e will be considered as official gene symbols, DEFAULT=FALSE")
    basicparser.add_argument("--info", dest="expreinfo", type=str,
                             help="specify the geneID, up/down status and statistcal values column of your expression data. NOTE: use a comma as an connector. for example: 1,2,6 means geneID in the 1st column, logFC in 2nd column and FDR in 6th column. DEFAULT:1,2,6 for LIMMA; 2,10,13 for Cuffdiff and 1,2,3 for BETA specific format. You'd better set it based on your exact expression file")

    # Optional inputs, outputs and tuning knobs
    basicparser.add_argument("-r", "--reference", dest="reference", type=str,
                             help="the refgene info file downloaded from UCSC genome browser.input this file only if your genome is neither hg18, hg19,hg38 nor mm9,mm10")
    basicparser.add_argument("-o", "--output", dest="output", type=str,
                             help="the directory to store all the output files, if you don't set this, files will be output into the current directory")
    basicparser.add_argument("--bl", dest="boundarylimit", action="store_true", default=False,
                             help="whether or not use CTCF boundary to filter peaks around a gene, DEFAULT=FALSE")
    basicparser.add_argument("--bf", dest="boundaryfile", type=str,
                             help="CTCF conserved peaks bed file, use this only when you set --bl and the genome is neither hg19 nor mm9")
    basicparser.add_argument("--pn", dest="peaknumber", type=int, default=10000,
                             help="the number of peaks you want to consider, DEFAULT=10000")
    basicparser.add_argument("--method", dest="method", choices=("score", "distance"), default="score",
                             help="Define the method to do the TF/CR function prediction, score for regulatory potential, distance for the distance to the proximal binding peak. DEFAULT:SCORE")
    basicparser.add_argument("-n", "--name", dest="name", type=str,
                             help="this argument is used as the prefix for the result files. If not set, 'NA' will be used instead")
    basicparser.add_argument("-d", "--distance", dest="distance", type=int, default=100000,
                             help="Set a number which unit is 'base'. It will get peaks within this distance from gene TSS. default:100000 (100kb)")
    basicparser.add_argument("--df", dest="diff_fdr", type=float, default=1,
                             help="Input a number 0~1 as a threshold to pick out the most significant differential expressed genes by FDR, "
                                  "DEFAULT = 1, that is select all the genes")
    basicparser.add_argument("--da", dest="diff_amount", type=float, default=0.5,
                             help="Get the most significant differential expressed genes by the percentage(0-1) or number(larger than 1)Input a number between 0-1, the rank based on fdr "
                                  "for example, 2000, so that the script will only consider top 2000 genes as the differentially expressed genes. DEFAULT = 0.5, "
                                  "that is select top 50 percent genes of up and down seprately. NOTE: if you want to use diff_fdr, please set this parameter to 1, otherwise it will get the intersection of these two parameters")
    basicparser.add_argument("-c", "--cutoff", dest="cutoff", type=float, default=0.001,
                             help="Input a number between 0~1 as a threshold to select the closer target gene list(up regulate or down regulate or both) with the p value was called by one side ks-test, DEFAULT = 0.001")

def add_plus_parser(subparsers):
    """Register the 'plus' sub-command (target prediction and motif analysis).

    subparsers -- the object returned by ``ArgumentParser.add_subparsers()``;
                  a parser named "plus" is added to it.

    ``-p``, ``-e``, ``-k`` and ``--gs`` are required; every other option is
    optional and carries the default documented in its help text.
    Returns None.
    """
    # Adjacent string literals are used instead of backslash continuations so
    # that source indentation is not embedded in the description text. The
    # example is a command line that this parser actually accepts.
    plusparser = subparsers.add_parser(
        'plus',
        help="The super beta beta can not only do TF's direct targets and active or repressive prediction, but also do the motif analysis with the target regions",
        description="BETA-super: Predict Direct targets of TF and the active/repressive function prediction.Do motif analysis at targets region as well.\n"
                    "EXAMPLE: BETA plus -p 2723_peaks.bed -e gene_exp.diff -k CUF -g hg19 -n test -o super --gs ~/Documents/hg19.fa")

    # Main arguments
    plusparser.add_argument("-p", "--peakfile", dest="peakfile", type=str, required=True,
                            help="The bed format of peaks binding  sites.(BETA support 3 or 5 columns bed format, CHROM, START, END (NAME, SCORE))")
    plusparser.add_argument("-e", "--diff_expr", dest="exprefile", type=str, required=True,
                            help="The differential expression file get from limma for MicroArray data and cuffdiff for RNAseq data")
    plusparser.add_argument("-k", "--kind", dest="kind", choices=("LIM", "CUF", "BSF", "O"), required=True,
                            help="the kind of your expression file,this is required,it can be LIM, CUF, BSF, O. LIM for LIMMA standard format. CUF for CUFDIFF standard format, BSF for BETA specific format and O for other formats, if is 'O', columns infor required via --info")
    plusparser.add_argument("-g", "--genome", dest="genome", choices=("hg38", "hg19", "hg18", "mm10", "mm9", "hg", "mm"),
                            help="Specify your species, hg38,hg19,hg18,mm10,mm9. for other genome assembily versions of human and mouse, select hg or mm. Other species, ignore this paramter")
    # The expression file is supplied through -e (not -g, which selects the genome).
    plusparser.add_argument("--gname2", dest="gname2", action="store_true", default=False,
                            help="If this switch is on, gene or transcript IDs in files given through -e will be considered as official gene symbols, DEFAULT=FALSE")
    plusparser.add_argument("--info", dest="expreinfo", type=str,
                            help="specify the geneID, up/down status and statistcal values column of your expression data. "
                                 "NOTE: use a comma as an connector. for example: 1,2,6 means geneID in the 1st column, logFC in 2nd column and FDR in 6th column. "
                                 "DEFAULT:1,2,6 for LIMMA; 2,10,13 for Cuffdiff and 1,2,3 for BETA specific format. You'd better set it based on your exact expression file")
    plusparser.add_argument("--gs", dest="genomesequence", type=str, required=True,
                            help="genome sequence data")

    # Optional inputs, outputs and tuning knobs
    plusparser.add_argument("-r", "--reference", dest="reference", type=str,
                            help="the refgene info file downloaded from UCSC genome browser.input this file only if your genome is neither hg18,hg19,hg38 nor mm9,mm10")
    plusparser.add_argument("-o", "--output", dest="output", type=str,
                            help="the directory to store all the output files, if you don't set this, files will be output into the current directory")
    plusparser.add_argument("--bl", dest="boundarylimit", action="store_true", default=False,
                            help="whether or not use CTCF boundary to filter peaks around a gene, DEFAULT=FALSE")
    plusparser.add_argument("--bf", dest="boundaryfile", type=str,
                            help="CTCF conserved peaks bed file, use this only when you set --bl and the genome is neither hg19 nor mm9")
    plusparser.add_argument("--pn", dest="peaknumber", type=int, default=10000,
                            help="the number of peaks you want to consider, DEFAULT=10000")
    plusparser.add_argument("--method", dest="method", choices=("score", "distance"), default="score",
                            help="Define the method to do the TF/CR function prediction, score for regulatory potential, distance for the distance to the proximal binding peak. DEFAULT:SCORE")
    plusparser.add_argument("-n", "--name", dest="name", type=str,
                            help="this argument is used as the prefix of the result files. If not set, 'NA' will be used instead")
    plusparser.add_argument("-d", "--distance", dest="distance", type=int, default=100000,
                            help="Set a number which unit is 'base'. It will get peaks within this distance from gene TSS. default:100000 (100kb)")
    plusparser.add_argument("--df", dest="diff_fdr", type=float, default=1,
                            help="Input a number 0~1 as a threshold to pick out the most significant differential expressed genes by FDR, "
                                 "DEFAULT = 1, that is select all the genes")
    plusparser.add_argument("--da", dest="diff_amount", type=float, default=0.5,
                            help="Get the most significant differential expressed genes by the percentage(0-1) or number(larger than 1)Input a number between 0-1, the rank based on fdr "
                                 "for example, 2000, so that the script will only consider top 2000 genes as the differentially expressed genes. DEFAULT = 0.5, "
                                 "that is select top 50 percent genes of up and down seprately. NOTE: if you want to use diff_fdr, please set this parameter to 1, otherwise it will get the intersection of these two parameters")
    plusparser.add_argument("-c", "--cutoff", dest="cutoff", type=float, default=0.001,
                            help="Input a number between 0~1 as a threshold to select the closer target gene list(up regulate or down regulate or both) with the p value was called by one side ks-test, DEFAULT = 0.001")
    plusparser.add_argument("--mn", dest="motifnumber", type=float, default=10,
                            help="Input a number between 0~1 as the p-value cutoff or larger than 1 as the number to get the motif retrieved in BETA output in html file. DEFAULT: 10")

def add_minus_parser(subparsers):
    """Register the 'minus' sub-command (target genes from binding data only).

    subparsers -- the object returned by ``ArgumentParser.add_subparsers()``;
                  a parser named "minus" is added to it.

    Only ``-p`` is required. This sub-command defines no expression-file
    options (``-e``, ``-k``, ...), so its usage example must not mention
    them. Returns None.
    """
    # Description and example describe this sub-command rather than 'basic';
    # the example is a command line that this parser actually accepts.
    minusparser = subparsers.add_parser(
        "minus",
        help="Find Target Genes with only binding data: regulatory potential score",
        description="BETA-minus: Find target genes of TF with only binding data, ranked by regulatory potential score.\n"
                    "EXAMPLE: BETA minus -p 2723_peaks.bed -g hg19 -n test -o minus")

    # Main arguments
    minusparser.add_argument("-p", "--peakfile", dest="peakfile", type=str, required=True,
                             help="The bed format of peaks binding  sites. (BETA support 3 or 5 columns bed format, CHROM, START, END (NAME, SCORE))")
    minusparser.add_argument("-g", "--genome", dest="genome", choices=("hg38", "hg19", "hg18", "mm10", "mm9"),
                             help="Specify your species, hg38,hg19,hg18,mm10,mm9. For other genome assembily versions of human and mouse or other species, ignore this paramter")
    # Kept in line with the genomes accepted by -g above.
    minusparser.add_argument("-r", "--reference", dest="reference", type=str,
                             help="the refgene info file downloaded from UCSC genome browser.input this file only if your genome is neither hg18, hg19,hg38 nor mm9,mm10")
    minusparser.add_argument("-n", "--name", dest="name", type=str,
                             help="this argument is used as the prefix of the result file. If not set, 'NA' will be used instead")
    minusparser.add_argument("-o", "--output", dest="output", type=str,
                             help="the directory to store all the output files, if you don't set this, files will be output into the current directory")
    minusparser.add_argument("-d", "--distance", dest="distance", type=int, default=100000,
                             help="Set a number which unit is 'base'. It will get peaks within this distance from gene TSS. default:100000 (100kb)")
    minusparser.add_argument("--bl", dest="boundarylimit", action="store_true", default=False,
                             help="whether or not use CTCF boundary to filter peaks around a gene, DEFAULT=FALSE")
    minusparser.add_argument("--bf", dest="boundaryfile", type=str,
                             help="CTCF conserved peaks bed file, use this only when you set --bl and the genome is neither hg19 nor mm9")
    minusparser.add_argument("--pn", dest="peaknumber", type=int, default=10000,
                             help="the number of peaks you want to consider, DEFAULT=10000")

def main():
    """Entry point of the BETA pipeline: parse the command line and dispatch.

    The parser built by ``prepare_argparser`` is used to read the chosen
    sub-command, and that same parser object is then handed to the matching
    runner from ``BETA.runbeta``. A missing sub-command is treated like
    ``basic``. Runners are imported lazily, only for the branch taken.
    """
    argparser = prepare_argparser()
    chosen = argparser.parse_args().subcommand_name

    if not chosen or chosen == "basic":
        from BETA.runbeta import basicrun
        basicrun(argparser)
    elif chosen == "plus":
        from BETA.runbeta import plusrun
        plusrun(argparser)
    elif chosen == "minus":
        from BETA.runbeta import minusrun
        minusrun(argparser)
    
if __name__ == '__main__':
    # Run the pipeline only when executed as a script. A Ctrl-C is reported
    # on stderr and the process still exits with status 0.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrunpt me! ;-) Bye!\n")
        raise SystemExit(0)
