#!/usr/bin/env perl

use strict ;
use warnings ;

use Cwd qw(cwd abs_path) ;
use File::Basename ;
use File::Path qw(make_path) ;

my $progName = "run-t1k" ;
my $version = "v1.0.9-r251" ;

# Show the usage text and stop when the script is started without any
# command-line argument. The text is emitted through die, so it goes to STDERR.
# The lines that are commented out inside the concatenation describe options
# that are intentionally left out of the help text.
die "T1K $version usage: ./$progName [OPTIONS]:\n".
    "Required:\n".
    #"\t[Input]:\n".
    "\t-1 STRING -2 STRING: path to paired-end read files\n".
    "\t-u STRING: path to single-end read file\n".
    "\t-i STRING: path to interleaved read file\n".
    "\t-b STRING: path to BAM file\n".
    "\t-f STRING: path to the reference sequence file\n".
    "Optional:\n".
    "\t-c STRING: path to the gene coordinate file (required when -b input)\n".
    "\t-o STRING: prefix of output files. (default: inferred from file prefix)\n".
    "\t--od STRING: the directory for output files. (default: ./)\n".
    "\t-t INT: number of threads (default: 1)\n".
    "\t-s FLOAT: minimum alignment similarity (default: 0.8)\n".
    "\t-n INT: maximal number of alleles per read (default: 2000)\n".
    "\t--frac FLOAT: filter if abundance is less than the frac of dominant allele (default: 0.15)\n".
    "\t--cov FLOAT: filter genes with average coverage less than the specified value (default: 1.0)\n".
    "\t--crossGeneRate FLOAT: the effect from other gene's expression (0.04)\n".
    "\t--squaremMinAlpha FLOAT: minimum value (should be negative) for the alpha (step length) in the SQUAREM algorithm (default: not set)\n".
    "\t--alleleDigitUnits INT: the number of units in genotyping result (default: automatic)\n".
    "\t--alleleDelimiter CHR: the delimiter character for digit unit (default: automatic)\n".
    "\t--alleleWhitelist STRING: only consider read aligned to the listed allele series. (default: not used)\n".
    "\t--barcode STRING: if -b, BAM field for barcode; if -1 -2/-u, file containing barcodes (default: not used)\n".
    "\t--barcodeRange INT INT CHAR: start, end(-1 for length-1), strand in a barcode is the true barcode (default: 0 -1 +)\n".
    "\t--barcodeWhitelist STRING: path to the barcode whitelist (default: not used)\n".
    "\t--read1Range INT INT: start, end(-1 for length-1) in -1/-u files for genomic sequence (default: 0 -1)\n".
    "\t--read2Range INT INT: start, end(-1 for length-1) in -2 files for genomic sequence (default: 0 -1)\n".
    #"\t--UMI STRING: if -b, bam field for UMI; if -1 -2/-u, file containing UMIs (default: not used)\n".
    #"\t--umiRange INT INT CHAR: start, end(-1 for length-1), strand in a umi is the true umi (default: 0 -1 +)\n".
    "\t--mateIdSuffixLen INT: the suffix length in read id for mate. (default: not used)\n".
    "\t--abnormalUnmapFlag: the flag in BAM for the unmapped read-pair is nonconcordant (default: not set)\n".
    "\t--relaxIntronAlign: allow one more mismatch in intronic alignment (default: false)\n".
    "\t--preset STRING: preset parameters for cases requiring non-default settings:\n".
    "\t\thla: HLA genotyping in general\n".
    "\t\thla-wgs: HLA genotyping on WGS data\n".
    "\t\tkir-wgs: KIR genotyping on WGS data\n".
    "\t\tkir-wes: KIR genotyping on WES data\n".
    "\t--noExtraction: directly use the files from provided -1 -2/-u for genotyping (default: extraction first)\n".
    "\t--skipPostAnalysis: only conduct genotyping. (default: conduct the post analysis)\n".
    "\t--outputReadAssignment: output the allele assignment for each read to prefix_assign.tsv file (default: not used)\n".
    "\t--stage INT: start genotyping on specified stage (default: 0):\n".
    "\t\t0: start from beginning (candidate read extraction)\n".
    "\t\t1: start from genotype with candidate reads\n".
    "\t\t2: start from post analysis\n".
    "\tParameters for post analysis:\n".
    "\t\t--post-varMaxGroup INT: the maximum variant group size to call novel variant. -1 for no limitation (default: 8)\n"
    if ( @ARGV == 0 ) ;

# Run an external command, logging it to STDERR with a timestamp first.
#
# Arguments: passed straight to Perl's system(), so either one command string
#   or a program name followed by its arguments.
# Returns: 1 when the command finished with exit status 0.
# Dies: when the command could not be started, was terminated by a signal, or
#   exited with a non-zero status. The wait status in $? is decoded so the
#   message names which of the three happened instead of showing the raw value.
sub system_call
{
  print STDERR "[".localtime()."] SYSTEM CALL: ".join(" ",@_)."\n";
  return 1 if ( system(@_) == 0 ) ;

  my $reason ;
  if ( $? == -1 )
  {
    # system() itself failed (e.g. program not found); $! holds the cause.
    $reason = "failed to execute: $!" ;
  }
  elsif ( $? & 127 )
  {
    # The low 7 bits carry the number of the terminating signal.
    $reason = "died with signal ".( $? & 127 ) ;
  }
  else
  {
    # The exit code of the child is stored in the high byte.
    $reason = "exited with value ".( $? >> 8 ) ;
  }
  die "system @_ failed: $reason" ;
}

# Directory containing this script; the helper programs are invoked from it.
my $WD = dirname( abs_path( $0 ) ) ;

# Index variables shared by the command-line scanning loops.
my ( $i, $j ) ;

# process the options.
# Input file lists filled from -1/-u, -2, -i, -b and --barcode.
my ( @firstMateFiles, @secondMateFiles, @interleavedFiles, @bamFiles, @barcodeFiles ) ;

# Output naming (-o and --od).
my ( $prefix, $outputDirectory ) = ( "", "" ) ;

# Argument strings accumulated for each helper program.
my ( $bamExtractorArgs, $fastqExtractorArgs, $genotyperArgs, $analyzerArgs ) = ( "", "", "", "" ) ;

# Thread count (-t) and the stage to start from (--stage).
my ( $threadCnt, $stage ) = ( 1, 0 ) ;

# Switches set by --noExtraction, --skipPostAnalysis, --barcode and --UMI.
my ( $noExtraction, $skipPostAnalysis, $hasBarcode, $hasUmi ) = ( 0, 0, 0, 0 ) ;

# Reference files given by -c and -f.
my ( $refCoordFasta, $refSeqFasta ) = ( "", "" ) ;

# Name given to --preset.
my $preset = "" ;

#my $filterFrac = 0.15 ;
#my $filterCov = 1.0 ;
#my $crossGeneRate = 0.0005 ;

# Options that take one value and are forwarded to the genotyper. The option
# scan only tests whether a key is present with a defined value.
my %genotyperArgNames = (
	"--frac"             => 0,
	"--cov"              => 0,
	"--crossGeneRate"    => 0,
	"-s"                 => 0,
	"-n"                 => 2000,
	"--alleleDigitUnits" => 0,
	"--alleleDelimiter"  => 0,
	"--alleleWhitelist"  => 0,
	"--squaremMinAlpha"  => 0,
) ;

# Options that take one value and are forwarded to the analyzer.
my %analyzerArgNames = (
	"-s"                 => 0,
	"--alleleDigitUnits" => 0,
	"--alleleDelimiter"  => 0,
	"--post-varMaxGroup" => 8,
) ;

print STDERR "[", scalar( localtime() ), "] $progName $version begins.\n" ;
# Scan @ARGV once. Each branch consumes the option at position $i together with
# its value(s) and leaves $i on the last consumed element, so the loop's ++$i
# moves on to the next option. Any unrecognized option aborts the script.
for ( $i = 0 ; $i < @ARGV ; ++$i )
{
	if ( $ARGV[$i] eq "-1" )
	{
		# Every following argument up to the next one starting with "-" is a
		# first-mate file (or a glob pattern expanding to such files).
		for ( $j = $i + 1 ; $j < @ARGV ; ++$j )
		{
			last if ( $ARGV[$j] =~ /^-/ ) ;
			push @firstMateFiles, glob( $ARGV[$j] ) ;
		}
		$i = $j - 1 ;
	}
	elsif ( $ARGV[$i] eq "-2" )
	{
		# Same collection rule as -1, for the second-mate files.
		for ( $j = $i + 1 ; $j < @ARGV ; ++$j )
		{
			last if ( $ARGV[$j] =~ /^-/ ) ;
			push @secondMateFiles, glob( $ARGV[$j] ) ;
		}
		$i = $j - 1 ;
	}
	elsif ( $ARGV[$i] eq "-u" )
	{
		# Single-end files share the list used by -1; the run counts as
		# paired-end only when -2 or -i files are given as well.
		for ( $j = $i + 1 ; $j < @ARGV ; ++$j )
		{
			last if ( $ARGV[$j] =~ /^-/ ) ;
			push @firstMateFiles, glob( $ARGV[$j] ) ;
		}
		$i = $j - 1 ;
	}
	elsif ( $ARGV[$i] eq "-b" )
	{
		push @bamFiles, $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-i" )
	{
		push @interleavedFiles, $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-f" )
	{
		$refSeqFasta = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-c" )
	{
		$refCoordFasta = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-o" )
	{
		$prefix = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--od" )
	{
		$outputDirectory = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-t" )
	{
		$threadCnt = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--abnormalUnmapFlag" )
	{
		$bamExtractorArgs .= " -u" ;
	}
	elsif ( $ARGV[$i] eq "--mateIdSuffixLen" )
	{
		# The leading space keeps this option separated from anything that
		# was appended to the bam-extractor arguments before it.
		$bamExtractorArgs .= " --mateIdSuffixLen ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--noExtraction" )
	{
		$noExtraction = 1 ;
	}
	elsif ( $ARGV[$i] eq "--skipPostAnalysis" )
	{
		$skipPostAnalysis = 1 ;
	}
	elsif ( $ARGV[$i] eq "--preset" )
	{
		$preset = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--barcode" )
	{
		$hasBarcode = 1 ;
		# For BAM input the first value is handed to bam-extractor directly.
		$bamExtractorArgs .= " --barcode ".$ARGV[$i + 1] ;
		#$fastqExtractorArgs .= " --barcode ".$ARGV[$i + 1] ;
		# For FASTQ input the values are collected as barcode files and added
		# to the fastq-extractor command when it is built.
		for ( $j = $i + 1 ; $j < @ARGV ; ++$j )
		{
			last if ( $ARGV[$j] =~ /^-/ ) ;
			push @barcodeFiles, glob( $ARGV[$j] ) ;
		}
		$i = $j - 1 ;
	}
	elsif ( $ARGV[$i] eq "--barcodeRange" )
	{
		# Three values: start, end, strand. A "-" strand requests the
		# reverse complement of the barcode.
		$fastqExtractorArgs .= " --barcodeStart ".$ARGV[$i + 1]." --barcodeEnd ".$ARGV[$i + 2] ;
		if ( $ARGV[$i + 3] eq "-" )
		{
			$fastqExtractorArgs .= " --barcodeRevComp" ;
		}

		$i += 3 ;
	}
	elsif ( $ARGV[$i] eq "--read1Range" )
	{
		$fastqExtractorArgs .= " --read1Start ".$ARGV[$i + 1]." --read1End ".$ARGV[$i + 2] ;
		$i += 2 ;
	}
	elsif ( $ARGV[$i] eq "--read2Range" )
	{
		$fastqExtractorArgs .= " --read2Start ".$ARGV[$i + 1]." --read2End ".$ARGV[$i + 2] ;
		$i += 2 ;
	}
	elsif ( $ARGV[$i] eq "--barcodeWhitelist" )
	{
		$fastqExtractorArgs .= " --barcodeWhitelist ".$ARGV[$i + 1] ;
		$i += 1 ;
	}
	elsif ( $ARGV[$i] eq "--UMI" )
	{
		$hasUmi = 1 ;
		$bamExtractorArgs .= " --UMI ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--relaxIntronAlign" )
	{
		$genotyperArgs .= " ".$ARGV[$i] ;
	}
	elsif ( $ARGV[$i] eq "--outputReadAssignment" )
	{
		$genotyperArgs .= " ".$ARGV[$i] ;
	}
	elsif ( $ARGV[$i] eq "--stage" )
	{
		$stage = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( defined( $genotyperArgNames{$ARGV[$i]} )
		|| defined( $analyzerArgNames{$ARGV[$i]} ) )
	{
		# Single-value options forwarded to the genotyper, the analyzer, or
		# both, depending on which table lists them.
		if ( defined( $genotyperArgNames{$ARGV[$i]} ) )
		{
			$genotyperArgs .= " ".$ARGV[$i]." ".$ARGV[$i + 1] ;
		}
		if ( defined( $analyzerArgNames{$ARGV[$i]} ) )
		{
			# Post-analysis options carry a "--post-" prefix on this script's
			# command line; it is reduced to "--" before forwarding.
			$ARGV[$i] =~ s/^--post-/--/ ;
			$analyzerArgs .= " ".$ARGV[$i]." ".$ARGV[$i + 1] ;
		}
		++$i ;
	}
	else
	{
		die "Unknown parameter ".$ARGV[$i]."\n" ;
	}
}

# Check the parsed options; the first violated rule ends the script.

# At least one kind of read input has to be present.
die "Need to use -b/{-1,-2}/-u/-i to specify input reads.\n"
	unless ( @bamFiles || @firstMateFiles || @interleavedFiles ) ;

# BAM input cannot be combined with --noExtraction.
die "--noExtraction option can only be set when using -1 -2/-u as input.\n"
	if ( @bamFiles && $noExtraction == 1 ) ;

# The reference sequence file is always needed.
die "Need to use -f to specify the reference sequence file.\n"
	if ( $refSeqFasta eq "" ) ;

# BAM input additionally needs the gene coordinate file.
die "Need to use -c to specify gene coordinate file for BAM input.\n"
	if ( @bamFiles && $refCoordFasta eq "" ) ;

# Translate --preset into extra arguments for the helper programs. The preset
# arguments are appended after the ones collected from the command line. Each
# appended piece starts with a space so that it never fuses with the argument
# that precedes it.
if ( $preset ne "" )
{
	# Substring match: any preset name containing "hla" takes this branch.
	if ( $preset =~ /hla/ )
	{
		$genotyperArgs .= " -s 0.97" ;
		$analyzerArgs .= " -s 0.97" ;
		if ( $preset eq "hla-wgs" )
		{
			# Only hla-wgs also adds the similarity setting for fastq-extractor.
			$fastqExtractorArgs .= " -s 0.97" ;
		}
	}
	elsif ( $preset eq "kir-wgs" )
	{
		$genotyperArgs .= " -s 0.9 --relaxIntronAlign" ;
		$analyzerArgs .= " -s 0.9 --relaxIntronAlign" ;
	}
	elsif ( $preset eq "kir-wes" )
	{
		$genotyperArgs .= " --relaxIntronAlign" ;
		$analyzerArgs .= " --relaxIntronAlign" ;
	}
	else
	{
		die "Unknown preset parameter $preset.\n" ;
	}
}

# Choose the output prefix when -o was not given: "T1K_" followed by the part
# of a file name before its first dot, taken from the first BAM file or else
# from the first -1/-u file; plain "T1K" when neither list has an entry.
if ( $prefix eq "" )
{
	my @namingSources = ( @bamFiles, @firstMateFiles ) ;
	if ( @namingSources > 0 )
	{
		my $stem = ( split /\./, basename( $namingSources[0] ) )[0] ;
		$prefix = "T1K_".$stem ;
	}
	else
	{
		$prefix = "T1K" ;
	}
}

# Put the outputs under --od, creating the directory when it is missing.
if ( $outputDirectory ne "" )
{
	unless ( -d $outputDirectory )
	{
		make_path( $outputDirectory ) ;
	}
	$prefix = join( "/", $outputDirectory, $prefix ) ;
}

# Names of the candidate read files produced by the extraction stage. The
# paired-end pair is assumed until a single-end run is detected.
my $extractorPrefix = $prefix."_candidate" ;
my $candidateRd1 = $extractorPrefix."_1.fq" ;
my $candidateRd2 = $extractorPrefix."_2.fq" ;
my $candidateRd = $extractorPrefix.".fq" ;
my $candidateFiles = join( " ", $candidateRd1, $candidateRd2 ) ;

# Stage 0: extract the candidate reads, unless a later start stage was asked.
if ( $stage <= 0 )
{
	if ( @bamFiles > 0 )
	{
		# BAM input: only the first BAM file is passed to bam-extractor.
		system_call( join( " ", "$WD/bam-extractor", "-b", $bamFiles[0], "-t", $threadCnt,
				"-f", $refCoordFasta, "-o", $extractorPrefix, $bamExtractorArgs ) ) ;
	}
	elsif ( $noExtraction == 0 )
	{
		# FASTQ input counts as paired-end when -2 or -i files were given.
		my $isPaired = ( @secondMateFiles > 0 || @interleavedFiles > 0 ) ;
		if ( $isPaired )
		{
			$fastqExtractorArgs .= " -1 ".$_ for ( @firstMateFiles ) ;
			$fastqExtractorArgs .= " -2 ".$_ for ( @secondMateFiles ) ;
			$fastqExtractorArgs .= " -i ".$_ for ( @interleavedFiles ) ;
		}
		else
		{
			$fastqExtractorArgs .= " -u ".$_ for ( @firstMateFiles ) ;
		}
		$fastqExtractorArgs .= " --barcode ".$_ for ( @barcodeFiles ) ;

		system_call( "$WD/fastq-extractor -t $threadCnt -f $refSeqFasta -o $extractorPrefix $fastqExtractorArgs" ) ;

		# A single-end extraction writes one candidate file only.
		$candidateFiles = "$candidateRd" unless ( $isPaired ) ;
	}
}

# Decide which read files the genotyper gets and whether they are paired.
if ( $noExtraction != 0 )
{
	# Extraction was skipped: use the first of the files given on the
	# command line.
	if ( @secondMateFiles > 0 )
	{
		$candidateFiles = "$firstMateFiles[0] $secondMateFiles[0]" ;
	}
	elsif ( @firstMateFiles > 0 )
	{
		$candidateFiles = "$firstMateFiles[0]" ;
	}
}
elsif ( !( -e $candidateRd1 ) )
{
	# No paired-end candidate file on disk: fall back to the single-end one.
	# Missing candidates are fatal only when the genotyping stage will run.
	if ( -e $candidateRd )
	{
		$candidateFiles = "$candidateRd" ;
	}
	elsif ( $stage <= 1 )
	{
		die "Could not find files like ${extractorPrefix}*.fq\n" ;
	}
}

# With --barcode, give the genotyper and the analyzer their barcode files,
# both named after the output prefix.
if ( $hasBarcode == 1 )
{
	my $candidateBarcodes = "${prefix}_candidate_bc.fa" ;
	my $alignedBarcodes = "${prefix}_aligned_bc.fa" ;
	$genotyperArgs .= " --barcode ".$candidateBarcodes ;
	$analyzerArgs .= " --barcode ".$alignedBarcodes ;
}

# Stage 1: run the genotyper on the candidate reads. Two names in
# $candidateFiles mean paired-end input (-1/-2), one name means single-end (-u).
if ( $stage <= 1 )
{
	my @readFiles = split /\s/, $candidateFiles ;
	my $readArgs = ( scalar( @readFiles ) > 1 )
		? "-1 ".$readFiles[0]." -2 ".$readFiles[1]
		: "-u ".$readFiles[0] ;
	system_call( "$WD/genotyper $genotyperArgs -o $prefix -t $threadCnt -f $refSeqFasta $readArgs" ) ;
}

# Stage 2: post analysis on the aligned reads written under the output prefix,
# skipped with --skipPostAnalysis. Paired or single-end input is chosen by the
# number of names in $candidateFiles.
if ( $stage <= 2 && $skipPostAnalysis == 0 )
{
	my @readFiles = split /\s/, $candidateFiles ;
	my $alignedArgs = ( scalar( @readFiles ) > 1 )
		? "-1 ${prefix}_aligned_1.fa -2 ${prefix}_aligned_2.fa"
		: "-u ${prefix}_aligned.fa" ;
	system_call( "$WD/analyzer $analyzerArgs -o $prefix -t $threadCnt -f $refSeqFasta -a ${prefix}_allele.tsv $alignedArgs" ) ;
}

print STDERR "[", scalar( localtime() ), "] Finish.\n" ;
