#!/usr/bin/env perl

# $Id: rnazAnnotate.pl,v 1.3 2006/03/24 15:43:13 wash Exp $

use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin/../lib/5.26.2";
use RNAz;
use Getopt::Long;
use Pod::Usage;

# Command line state filled in by GetOptions below.
my $BEDfile='';   # path of the BED annotation file (-b/--bed)
my $version=0;    # print version information and exit (-v/--version)
my $man=0;        # show the full manual page (--man)
my $help='';      # show the short usage message (-h/--help)

# Short and long spellings are declared as aliases of one option each.
GetOptions('bed|b=s'   => \$BEDfile,
           'version|v' => \$version,
           'man'       => \$man,
           'help|h'    => \$help
          ) or pod2usage(1);

pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

if ($version){
  print "\nrnazAnnotate.pl is part of RNAz $RNAz::rnazVersion\n\n";
  print "http://www.tbi.univie.ac.at/~wash/RNAz\n\n";
  exit(0);
}

# Without a BED file there is nothing to annotate with; report this as a
# usage error on STDERR instead of failing later with a confusing
# message about an empty file name.
pod2usage(-message => "No BED annotation file given (use -b or --bed).",
          -exitval => 2,
          -verbose => 0) if $BEDfile eq '';


# Read the BED annotation file into %track:
#   chromosome name => array ref of {start, end, name} hashes,
# sorted by ascending start coordinate.
#
# Three-argument open with a lexical handle, so that special characters
# in the file name cannot be interpreted as an open mode or a pipe.
open(my $bedFH,'<',$BEDfile) or die("Could not read $BEDfile ($!)");

my %track=();

while (my $bedLine=<$bedFH>){

  # Skip blank lines, comments and "track"/"browser" header lines. The
  # pattern is anchored so that annotations merely containing the word
  # "track" (e.g. in their name) are not thrown away.
  next if $bedLine!~/\S/;
  next if $bedLine=~/^\s*(?:#|track\b|browser\b)/;

  my ($chrom,$start,$end,$name)=split(' ',$bedLine);

  # Entries without numeric coordinates cannot be sorted or compared.
  if (!defined($end) or $start!~/^\d+$/ or $end!~/^\d+$/){
    warn("Skipping malformed line $. of $BEDfile\n");
    next;
  }

  # push autovivifies the per-chromosome array on first use
  push @{$track{$chrom}},{start=>$start,end=>$end,name=>$name};
}

close($bedFH);

# Order each chromosome's entries by start position.
foreach my $key (keys %track){
  $track{$key}=[sort {$a->{start}<=>$b->{start}} @{$track{$key}}];
}

# Annotate the input read from the files named on the command line (or
# from STDIN): every "locus" line gets one additional tab-separated
# field, either the name of an overlapping BED entry in double quotes or
# "-" if there is none. All other lines are copied through unchanged.
while (my $line=<>){

  # Only "locus" entries are annotated; everything else (e.g. the
  # individual hits) is simply printed as it is.
  if ($line!~/^locus/){
    print $line;
    next;
  }

  chomp($line);

  # Fields: locus ID (not needed here), sequence name, start, end, ...
  my (undef,$findChrom,$findStart,$findEnd)=split(/\t/,$line);

  my $entries=undef;

  if (defined $findEnd){

    # In BED files usually only chromosome identifiers are stored
    # (e.g. chr6), while in MAFs and rnazCluster.pl output you find
    # "hg17.chr6". If the sequence identifier is of the form x.y,
    # only y is used for comparison with the BED.
    if ($findChrom=~/^.*\.(.*)$/){
      $findChrom=$1;
    }

    $entries=$track{$findChrom};
  }

  # No annotation at all for this sequence (or an incomplete locus
  # line): keep the line in the output and mark it as not annotated,
  # so that the hit lines following it do not lose their locus.
  if (!defined $entries){
    print "$line\t-\n";
    next;
  }

  # Find two neighbouring BED entries enclosing the locus start by
  # interval bisection (binary search on the sorted start positions).
  my $n1=0;
  my $n2=@{$entries}-1;

  while (($n2-$n1)>1){
    my $divide=$n1+int(($n2-$n1)/2);
    if ($entries->[$divide]->{start}<=$findStart){
      $n1=$divide;
    } else {
      $n2=$divide;
    }
  }

  # If the locus overlaps both BED entries the first one is taken
  my $hit=undef;

  foreach my $candidate ($n1,$n2){
    if (overlaps($findStart,$findEnd,
                 $entries->[$candidate]->{start},
                 $entries->[$candidate]->{end})){
      $hit=$candidate;
      last;
    }
  }

  # If there is overlap, add the name from the BED entry in "" as last
  # field to the locus line, otherwise add "-"
  if (defined $hit){
    my $ann=$entries->[$hit]->{name};
    $ann='' if !defined $ann;   # BED entry without a name column
    $ann=~s/\t/ /g;
    print "$line\t\"$ann\"\n";
  } else {
    print "$line\t-\n";
  }
}

# Tell whether two intervals overlap. Both intervals are treated as
# closed, i.e. intervals that merely touch at an end point count as
# overlapping.
#
# Arguments: query start, query end, subject start, subject end
# Returns:   1 if the intervals overlap, 0 otherwise
sub overlaps{
  my ($queryStart,$queryEnd,$subjectStart,$subjectEnd)=@_;

  # The intervals are disjoint exactly when the query lies completely
  # to the left or completely to the right of the subject.
  my $disjoint=($queryEnd<$subjectStart || $queryStart>$subjectEnd);

  return $disjoint ? 0 : 1;
}

__END__

=head1 NAME

C<rnazAnnotate.pl> - Compare a tab-delimited data file as generated by
C<rnazCluster.pl> to a BED annotation file.

=head1 SYNOPSIS

 rnazAnnotate.pl [options] [file]

=head1 OPTIONS

=over 8

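=item B<-b, --bed>

Set the annotation BED file with this option.

=item B<-v, --version>

Prints version information and exits.

=item B<-h, --help>

Prints a short help message and exits.

=item B<--man>

Prints a detailed manual page and exits.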

=back

=head1 DESCRIPTION

This simple program reads a tab-delimited data file as generated by
C<rnazCluster.pl>. It compares the genomic region of each predicted
locus to the annotations of a BED file. If there is some overlap, the
description field of the annotation line in the BED file is added in
double quotes as the last field to the locus line.

=head1 EXAMPLES

 # rnazAnnotate.pl -b annotation.bed results.dat

Annotates the loci in C<results.dat> with annotations in
C<annotation.bed>.

=head1 AUTHORS

Stefan Washietl <wash@tbi.univie.ac.at>

=cut
