#!/usr/bin/perl -w

################################################################################
# Provided as-is
# For any question contact mbrieuc at u.washington.edu
###########################################

#Command line: ./sequence_trim.pl

use strict;
use Getopt::Long;
use Data::Dumper;

my $file = undef;
my $clustal_dir = "clustal";
my $fasta_dir = "fasta";

GetOptions("file=s" => \$file,
           "clustal_dir=s" => \$clustal_dir,
           "fasta_dir=s" => \$fasta_dir);

if (!defined($file)) {
    usage();
    exit(1);
}

# main
open(INFILE, "<$file");
my $pairs = {};

# read data in and sort by pairs
classifyData($pairs);

# no need to keep file handle open, everything's in memory now
close(INFILE);

# now starts the data processing
processSequencePairs($pairs);

# write the clustal files out
writeClustalFiles($clustal_dir, $pairs);

# calculate 80th percentile sequences
my $percentiles = calculateMatchPercentage($pairs);

# dump everything over X percent into fasta
my @inclusion;
foreach my $keys (keys(%$percentiles)) {
    my $percent = $percentiles->{$keys};
    if ($percent >= 70) {
	push(@inclusion, $keys);
    }
}
writeFastaFiles($fasta_dir, $pairs, \@inclusion);

sub calculateMatchPercentage {
    my ($pairs) = @_;
    my $result = {};

    foreach my $gene (keys(%$pairs)) {
	my $total_length = 0;
	my $total_matches = 0;

        foreach my $seq (@{$pairs->{$gene}}) {
	    my @align = split(//, $seq->{'align'});

            $total_length += length($seq->{'align'});
	    $total_matches += count(\@align, "*");
	}

	my $percentage = ($total_matches / $total_length) * 100;

	# store percentages by gene
	$result->{$gene} = $percentage;
    }

    return $result;
}

sub writeFastaFiles {
    my ($fasta_dir, $pairs, $inclusion) = @_;

    my $FASTA_HEADER = ">";

    # check this for failure
    my $failed = mkdir("./$fasta_dir", 0777);
    my $err = $!;
    
    if ($failed && !($err =~ m/'File exists'/)) {
        die "[ERROR] Failed to create fasta directory $fasta_dir\n err: $!";
    }

    foreach my $gene (@$inclusion) {
	my $collection = {};

	# add all the data to collection
	foreach my $seq (@{$pairs->{$gene}}) {

	    foreach my $dataset (@{$seq->{'data'}}) {
		my $name = $dataset->{'name'};
		my $value = $dataset->{'value'};

		if (!defined($collection->{$name})) {
		    $collection->{$name} = $value;
		} else {
		    $collection->{$name} .= $value;
		}

	    }

	}

	open(FASTA, ">$fasta_dir/$gene.fasta") 
	  || print "[ERROR] Unable to open $gene.fasta";

	# print the actual sequence
	foreach my $entry (keys(%$collection)) {
	    my $value = $collection->{$entry};

	    print FASTA $FASTA_HEADER . $entry . "\n";
	    print FASTA $value;
	    print FASTA "\n";
        }

	close(FASTA);
    }
}

sub writeClustalFiles {
    my ($clustal_dir, $pairs) = @_;

    my $CLUSTAL_HEADER = "CLUSTAL 2.0.11 multiple sequence alignment\n\n\n";

    # don't check this for failure
    my $failed = mkdir("./$clustal_dir", 0777);
    my $err = $!;

    if ($failed && !($err =~ m/'File exists'/)) {
        die "[ERROR] Failed to create clustal directory $clustal_dir\n err: $!";
    }

    foreach my $gene (keys(%$pairs)) {
	open(CLUSTAL, ">$clustal_dir/$gene.aln") 
	  || print "[ERROR] Unable to open $gene.aln";

	print CLUSTAL $CLUSTAL_HEADER;

	foreach my $seq (@{$pairs->{$gene}}) {

	    # print the actual sequence
	    foreach my $dataset (@{$seq->{'data'}}) {

	        print CLUSTAL $dataset->{'name'};

		# format the sequence to start exactly 22 
		# characters from the left margin
		my $length = length($dataset->{'name'});

		for ( my $i = (22 - $length); $i >= 0; $i--) { 
		    print CLUSTAL " ";
	        }
		print CLUSTAL $dataset->{'value'};
		print CLUSTAL "\n";
	    }

	    # print the alignment data
	    print CLUSTAL "                     ";
	    print CLUSTAL $seq->{'align'};
	    print CLUSTAL "\n\n";
	}

	close(CLUSTAL);
    }
}

sub classifyData {
    my ($pairs) = @_;
    my $line = readLine();

    while ( defined($line) ) {
      my $type = getLineType($line);

      if ($type eq "unknown") {
        print "[ERROR] Found 'unknown type for line '$line'\n";
      }	elsif ($type eq "data") {
	  my @data;

	  # read as many data lines as possible
	  while($type eq "data") {
	      push(@data, $line);
	      $line = readLine();
	      $type = getLineType($line);
	  }

	  # process the sequence
	  processSequence(\@data, $pairs);
      } else {
	  $line = readLine();
      }
    }

    return $pairs;
}

sub processSequencePairs {
    my ($pairs) = @_;

    foreach my $pair (keys(%$pairs)) {
      my @arr = @{$pairs->{$pair}};
      my $start_index = 0;
      my $end_index = $#arr;

      # clean up first part of sequence
      while(cleanUpData($arr[$start_index++], "front")) {
          shift(@{$pairs->{$pair}});
      }

      # clean up last part of sequence IF STILL SOMETHING THERE!!!
           
          
                  while(cleanUpData($arr[$end_index--], "back")) {
                 pop(@{$pairs->{$pair}});
           }
        
    
    }
}

sub cleanUpData {
  my ($seq, $start_point) = @_;
  my $longestDashedSequence = -1;
  my $sequenceWraps = 0;

  foreach my $hash (@{$seq->{'data'}}) {

      # bail immediately if the whole line is dashes
      if ($hash->{'value'} =~ m/^-*$/) {
	  return 1;
      }

      # only inspect sequence if the first (or last) character is a '-'
      elsif (($hash->{'value'} =~ m/^-/ && ($start_point eq "front")) ||
          ($hash->{'value'} =~ m/-$/ && ($start_point eq "back"))) {
          my $lastDashIndex = 0;

	  # convert string to an array for processing a char at a time
          my @value = split(//, $hash->{'value'});


	  # store index of last dash
	  if ($start_point eq "front") {
	    $lastDashIndex = searchForLastIndexFromBeginning(\@value, "\-");
	  } elsif ($start_point eq "back") {
	    $lastDashIndex = searchForLastIndexFromEnd(\@value, "\-");
 	  } else {
	      print "[ERROR] Unexpected start point $start_point\n";
	  }

	  # record the length of the longest string of dashes
	  if ($lastDashIndex > $longestDashedSequence) {
	      $longestDashedSequence = $lastDashIndex;
	  }
      }
  }

  # now remove the actual values
  if ($longestDashedSequence > -1) {

      # remove the values from the alignment
      unless( $start_point eq "back" ) {
	# trim from the beginning of the string
        $seq->{'align'} = substr( $seq->{'align'}, $longestDashedSequence);
      } else {
	my $length = length($seq->{'align'});

	# trim from the end of the string
        $seq->{'align'} = substr( $seq->{'align'}, 0, ($length - $longestDashedSequence));
      }

      # now remove the values from the actual sequence
      foreach my $hash (@{$seq->{'data'}}) {
	  unless ( $start_point eq "back" ) {
	    # trim from the beginning of the string
            $hash->{'value'} = substr( $hash->{'value'}, $longestDashedSequence);
          } else {
	    my $length = length($hash->{'value'});

	    # trim from the end of the string
            $hash->{'value'} = substr( $hash->{'value'}, 0, ($length - $longestDashedSequence));
          }
      }
  }

  return 0;
}

sub length {
    my ($arr) = @_;
    my @val = @$arr;
    return $#val;
}

sub count {
    my ($arr, $char) = @_;
    my $count = 0;

    foreach my $c (@$arr) {
	if ($c eq $char) {
	    $count++;
	}
    }

    return $count;
}


sub searchForLastIndexFromBeginning {
    my ($input, $exp) = @_;
    my @value = @$input;
    my $lastDashIndex = 0;

    for (my $i = 0; $i <= $#value; $i++) {
        if ($value[$i] =~ m/$exp/) {
          $lastDashIndex++;
        } else {
          last;
        }
    }

    return $lastDashIndex;
}

sub searchForLastIndexFromEnd {
    my ($input, $exp) = @_;
    my @value = @$input;
    my $lastDashIndex = 0;

    for (my $i = $#value; $i >= 0; $i--) {
        if ($value[$i] =~ m/$exp/) {
          $lastDashIndex++;
        } else {
          last;
        }
    }

    return $lastDashIndex;
}

# need to tease out the lines to dump each sequence in a separate file
# need to preserve the "full" value (e.g. "lcl|<blah>) to reconstruct the line
sub processSequence {
  my ($sequence, $pairs) = @_;

  my $pair_key = ();
  my $pair_value = ();
  my $pair_alignment = undef;

  my $skip_sequence = 1;

  # slurp out the pair_name and pair_value
  foreach my $l (@$sequence) {
    my $key = $l; # internal bookkeeping
    $skip_sequence = 0;

    # values stored in result
    my $full_name = $l;
    my $value = $l;
    my $alignment = $l;

    # extract pair name
    #$full_name =~ s/(lcl\|\w+)\s+.*/$1/;
    $full_name =~ s/(lcl\|\S+)\s+.*/$1/;
    #$key =~ s/lcl\|(\w+|)\s+.*/$1/;
    $key =~ s/lcl\|(\S+)\s+.*/$1/;
    #$value =~ s/lcl\|\w+\s+(.*)/$1/;
    $value =~ s/lcl\|\S+\s+(.*)/$1/;

    # hard-coded to read everything after the 21st character
    $alignment =~ s/\s{21}(\s*\**)/$1/;

    # no keys or values on this line, but it is an alignment
    if ($key eq $l && $value eq $l && $alignment ne $l) {
      if (!defined($pair_alignment)) {
        $pair_alignment = $alignment;
      } else {
	print "[ERROR] Already found alignment for {$pair_key}, skipping sequence\n";
        $skip_sequence = 1;
      }
      next;
    }

    if ($key ne $l) {
      push(@$pair_key, $key);
    } else {
      print "[ERROR] Problem with '$l', unable to extract sequence name, skipping sequence\n";
      $skip_sequence = 1;
    }

    if ($full_name ne $l && $value ne $l) {
      my $seq = { 'name' => $full_name,
                  'value' => $value };
      push(@$pair_value, $seq);
    } else {
      print "[ERROR] Problem with '$l', unable to extract sequence value, skipping sequence\n";
      $skip_sequence = 1;
    }
  }

  unless($skip_sequence) {
    # add the sequence to the results
    my $pair_key = buildSequenceKey($pair_key);

    if (!defined($pairs->{$pair_key})) {
      $pairs->{$pair_key} = ();
    }

    my $result = { 'data' => $pair_value,
                   'align' => $pair_alignment };

    # put the values in an order preserving array
    push(@{$pairs->{$pair_key}}, $result);
  }

  return $pairs;
}

# read and validate $num_lines from the $file and validate against
# type specified by $expected_type
sub readData {
  my ($num_lines, $expected_type) = @_;
  my $lines = ();

  for (my $i = 0; $i < $num_lines; $i++) {
    my $line = readLine();
    my $type = getLineType($line);

    if ($type ne $expected_type) {
      print "[ERROR] Expected $expected_type, got $type for line \"$line\"\n";
    }

    push(@$lines, $line);
  }

  return $lines;
}

# method to build a key for any given sequence tuple
sub buildSequenceKey {
  my ($values) = @_;
  my $result = undef; 

  foreach my $k (@$values) {
    unless( !defined($result) ) {
      $result .= $k;
    } else {
      $result = $k;
    }
  }

  return $result;
}
  
# method to read a line from the file
sub readLine {
  my $line = <INFILE>;

  if (!defined($line)) {
      return undef;
  }

  chomp($line);
  return $line;
}

# method to determine line type
sub getLineType {
  my ($line) = @_;

  if (!defined($line)) {
      return "unknown";
  }

  if ($line =~ /^\>/ || $line =~ /^CLUSTAL/) {
    return "header";
  } elsif ($line =~ /^$/) {
    return "empty_line";
  } elsif ($line =~ /^lcl/ || $line =~ /^\s*|\**$/) {
    return "data";
  } else {
    return "unknown";
  }
}

sub usage {
  print "./marine.pl --file <filename> --clustal_dir <clustal output>\n";
}