#!/usr/bin/env perl

# $Id: rnazSort.pl,v 1.2 2006/03/24 15:43:14 wash Exp $

use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin/../lib/5.26.2";
use RNAz;
use Getopt::Long;
use Pod::Usage;


# Column names of the window lines, in the order in which they appear in
# the input; the same order is used when the lines are written back out.
my @fieldsList=qw(
  windowID locusID seqID start end strand N columns identity
  meanMFE consensusMFE energyTerm covarianceTerm combPerPair
  z SCI decValue P
);

# Command-line switches, all off unless given.
my ($reverse,$noClusters,$version,$man,$help)=(0,0,0,0,0);

# Each switch has a long name and (except --man) a one-letter alias.
GetOptions(
  'reverse|r' => \$reverse,
  'no-loci|n' => \$noClusters,
  'help|h'    => \$help,
  'version|v' => \$version,
  'man'       => \$man,
) or pod2usage(1);

pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

# --version: print the RNAz release this script belongs to and stop.
if ($version){
  print "\nrnazSort.pl is part of RNAz $RNAz::rnazVersion\n\n",
        "http://www.tbi.univie.ac.at/~wash/RNAz\n\n";
  exit(0);
}


# The sort key is the first argument left over after option parsing; any
# remaining arguments are input files read through <> further down.
my $sortKeyInput= shift @ARGV;
my $sortKey='';

# Without a key there is nothing to sort by. Report that explicitly (with
# the usage synopsis) instead of running lc() on undef, which produced one
# "uninitialized value" warning per field and a misleading
# "Unknown sort key." message.
if (!defined $sortKeyInput){
  pod2usage(-message => "No sort key given.",
            -exitval => 1,
            -verbose => 0,
            -output  => \*STDERR);
}

# Keys are matched case-insensitively; $sortKey keeps the spelling from
# @fieldsList because the records in %hits are keyed by those names.
my $wantedKey=lc($sortKeyInput);

foreach my $key (@fieldsList){
  if (lc($key) eq $wantedKey){
    $sortKey=$key;
    last;
  }
}

if (!$sortKey){
  print STDERR "Unknown sort key.\n";
  exit(1);
}

# Parsed input, filled by the read loop below:
#   %clustersHeader  locus ID  => the complete locus header line
#   %clusters        locus ID  => array ref of the window IDs of that locus
#   %hits            window ID => hash ref mapping field name to value
my (%clustersHeader,%clusters,%hits);

while (defined(my $row=<>)){

  # Ignore comment lines ('#', optionally preceded by one whitespace
  # character) and lines consisting of whitespace only.
  next if $row=~/^\s?\#/ or $row=~/^\s+$/;

  if ($row=~/^(locus\d+)/){

    # Locus header: kept verbatim (line ending included) for re-printing.
    $clustersHeader{$1}=$row;

  } elsif ($row=~/^(window\d+)\s+(locus\d+)/){

    my ($window,$locus)=($1,$2);

    # The array for a new locus is autovivified by the push.
    push @{$clusters{$locus}},$window;

    # Map the whitespace-separated columns onto their field names.
    my %record;
    @record{@fieldsList}=split(/\s+/,$row);
    $hits{$window}=\%record;
  }
}

if ($noClusters){

  # Flat mode: every window on its own, in sorted order.
  foreach my $windowID (sort sortFunction keys %hits){
    printLine($windowID);
  }

} else {

  # Locus mode: walk the windows in sorted order; the first window of a
  # locus that shows up pulls in the header and all windows of that locus.
  foreach my $windowID (sort sortFunction keys %hits){

    my $locus=$hits{$windowID}->{locusID};
    my $members=$clusters{$locus};

    # undef marks a locus that has been printed already.
    next unless defined $members;

    print $clustersHeader{$locus};

    foreach my $member (sort sortFunction @{$members}){
      printLine($member);
    }

    $clusters{$locus}=undef;
  }
}


# Comparison callback for sort(): orders two window IDs ($a and $b) by the
# field named in $sortKey, looking the values up in %hits.
#
# Default direction (inverted when $reverse is set):
#   windowID, locusID                       ascending by numeric suffix
#   seqID, strand                           ascending, string comparison
#   start, end, z, meanMFE, consensusMFE,
#   energyTerm, covarianceTerm              ascending, numeric
#   N, columns, identity, SCI, combPerPair,
#   decValue, P                             descending, numeric
#
# windowID and locusID used to be compared as plain strings, which put
# window10 before window9; they are now compared by their number.
# NOTE: the FIELDS section of the POD still lists that ordering as a BUG.
#
# Equal values are ordered by window number, so the result does not depend
# on the (randomised) order in which keys %hits returns the windows.
sub sortFunction{

  # Swapping the operands once applies --reverse to every field type.
  my ($left,$right)=$reverse ? ($b,$a) : ($a,$b);
  my $leftValue=$hits{$left}->{$sortKey};
  my $rightValue=$hits{$right}->{$sortKey};

  my $order;

  if ($sortKey eq "windowID" or
      $sortKey eq "locusID"){
    $order=(_idNumber($leftValue) <=> _idNumber($rightValue))
      || ($leftValue cmp $rightValue);
  } elsif ($sortKey eq "seqID" or
           $sortKey eq "strand"){
    $order=$leftValue cmp $rightValue;
  } elsif ($sortKey eq "N" or
           $sortKey eq "columns" or
           $sortKey eq "identity" or
           $sortKey eq "SCI" or
           $sortKey eq "combPerPair" or
           $sortKey eq "decValue" or
           $sortKey eq "P"){
    # Higher is "better" for these fields, so they sort descending.
    $order=$rightValue <=> $leftValue;
  } else {
    # start, end, z and the energy terms: lower values first.
    $order=$leftValue <=> $rightValue;
  }

  # Deterministic tie-break on the window ID itself (never reversed).
  return $order
    || (_idNumber($a) <=> _idNumber($b))
    || ($a cmp $b);
}

# Returns the trailing run of digits of an ID such as "window12" as a
# number, or 0 if the ID is undefined or does not end in digits.
sub _idNumber{
  my $id=shift;
  return 0 if !defined $id;
  return $id=~/(\d+)$/ ? $1 : 0;
}

# Writes the record of one window to STDOUT as a single tab-separated
# line, with the fields in @fieldsList order.
#   $id - window ID, used as key into %hits
sub printLine{
  my ($id)=@_;
  print join("\t", map { $hits{$id}->{$_} } @fieldsList), "\n";
}

__END__

=head1 NAME

C<rnazSort.pl> - Sorts output files from C<rnazCluster.pl> by different criteria

=head1 SYNOPSIS

 rnazSort.pl [options] key [file]

=head1 OPTIONS

=over 8


=item B<-r, --reverse>

Sort in reverse order. 

=item B<-n, --no-loci>

Do not preserve the locus grouping but simply sort the windows.

=item B<-v, --version>

Prints version information and exits.

=item B<-h, --help>

Prints a short help message and exits.

=item B<--man>

Prints a detailed manual page and exits.

=back

=head1 DESCRIPTION

C<rnazSort.pl> reads tab-delimited data files as generated by
C<rnazCluster.pl>. The files are sorted according to a key which is
given at the command line as a mandatory argument. See below for a
list of possible keys. By default "better" hits are listed first
(e.g. lower z-score or higher P). This can be changed by using the
C<--reverse> option. By default, the grouping in loci is preserved
during sorting. For example, if you sort by z-score, the locus that
contains the window with the lowest z-score is listed first. If you
simply want all windows sorted without considering the grouping use
the C<--no-loci> option.

=head1 FIELDS

=over 8

=item 1. B<windowID>

Consecutive numbered ID for each window. BUG: currently window10 comes
before window9 because it is sorted alphabetically.

=item 2. B<locusID>

The locus which this window belongs to. BUG: currently locus10 comes
before locus9 because it is sorted alphabetically.

=item 3. B<seqID>

Identifier of the sequence (e.g. human.chr1 or contig42)

=item 4. B<start>

Start position of the reference sequence in the window

=item 5. B<end>

End position of the reference sequence in the window

=item 6. B<strand>

Indicates if the reference sequence is from the positive or
negative strand

=item 7. B<N>

Number of sequences in the alignment

=item 8. B<columns>

Number of columns in the alignment

=item 9. B<identity>

Mean pairwise identity of the alignment

=item 10. B<meanMFE>

Mean minimum free energy of the single sequences as
calculated by the RNAfold algorithm

=item 11. B<consensusMFE>

"Consensus MFE" for the alignment as calculated by the
RNAalifold algorithm

=item 12. B<energyTerm>

Contribution to the consensus MFE which comes from the
energy part of the RNAalifold algorithm

=item 13. B<covarianceTerm>

Contribution to the consensus MFE which comes from the covariance part
of the RNAalifold algorithm

=item 14. B<combPerPair>

Number of different base combinations per predicted
pair in the consensus secondary structure

=item 15. B<z>

Mean z-score of the sequences in the alignment

=item 16. B<SCI>

Structure conservation index for the alignment

=item 17. B<decValue>

Support vector machine decision value

=item 18. B<P>

RNA class probability as calculated by the SVM

=back

=head1 EXAMPLES

 # rnazSort.pl combPerPair results.dat

Sort by the "combinations per pair" value, i.e. list the hits with
the most compensatory mutations first.

=head1 AUTHOR

Stefan Washietl <wash@tbi.univie.ac.at>

=cut

