#!/usr/bin/perl -w ################################################################################ # Provided as-is # For any question contact mbrieuc at u.washington.edu ########################################### #Command line: ./sequence_trim.pl use strict; use Getopt::Long; use Data::Dumper; my $file = undef; my $clustal_dir = "clustal"; my $fasta_dir = "fasta"; GetOptions("file=s" => \$file, "clustal_dir=s" => \$clustal_dir, "fasta_dir=s" => \$fasta_dir); if (!defined($file)) { usage(); exit(1); } # main open(INFILE, "<$file"); my $pairs = {}; # read data in and sort by pairs classifyData($pairs); # no need to keep file handle open, everything's in memory now close(INFILE); # now starts the data processing processSequencePairs($pairs); # write the clustal files out writeClustalFiles($clustal_dir, $pairs); # calculate 80th percentile sequences my $percentiles = calculateMatchPercentage($pairs); # dump everything over X percent into fasta my @inclusion; foreach my $keys (keys(%$percentiles)) { my $percent = $percentiles->{$keys}; if ($percent >= 70) { push(@inclusion, $keys); } } writeFastaFiles($fasta_dir, $pairs, \@inclusion); sub calculateMatchPercentage { my ($pairs) = @_; my $result = {}; foreach my $gene (keys(%$pairs)) { my $total_length = 0; my $total_matches = 0; foreach my $seq (@{$pairs->{$gene}}) { my @align = split(//, $seq->{'align'}); $total_length += length($seq->{'align'}); $total_matches += count(\@align, "*"); } my $percentage = ($total_matches / $total_length) * 100; # store percentages by gene $result->{$gene} = $percentage; } return $result; } sub writeFastaFiles { my ($fasta_dir, $pairs, $inclusion) = @_; my $FASTA_HEADER = ">"; # check this for failure my $failed = mkdir("./$fasta_dir", 0777); my $err = $!; if ($failed && !($err =~ m/'File exists'/)) { die "[ERROR] Failed to create fasta directory $fasta_dir\n err: $!"; } foreach my $gene (@$inclusion) { my $collection = {}; # add all the data to collection foreach my $seq (@{$pairs->{$gene}}) { foreach my $dataset (@{$seq->{'data'}}) { my $name = $dataset->{'name'}; my $value = $dataset->{'value'}; if (!defined($collection->{$name})) { $collection->{$name} = $value; } else { $collection->{$name} .= $value; } } } open(FASTA, ">$fasta_dir/$gene.fasta") || print "[ERROR] Unable to open $gene.fasta"; # print the actual sequence foreach my $entry (keys(%$collection)) { my $value = $collection->{$entry}; print FASTA $FASTA_HEADER . $entry . "\n"; print FASTA $value; print FASTA "\n"; } close(FASTA); } } sub writeClustalFiles { my ($clustal_dir, $pairs) = @_; my $CLUSTAL_HEADER = "CLUSTAL 2.0.11 multiple sequence alignment\n\n\n"; # don't check this for failure my $failed = mkdir("./$clustal_dir", 0777); my $err = $!; if ($failed && !($err =~ m/'File exists'/)) { die "[ERROR] Failed to create clustal directory $clustal_dir\n err: $!"; } foreach my $gene (keys(%$pairs)) { open(CLUSTAL, ">$clustal_dir/$gene.aln") || print "[ERROR] Unable to open $gene.aln"; print CLUSTAL $CLUSTAL_HEADER; foreach my $seq (@{$pairs->{$gene}}) { # print the actual sequence foreach my $dataset (@{$seq->{'data'}}) { print CLUSTAL $dataset->{'name'}; # format the sequence to start exactly 22 # characters from the left margin my $length = length($dataset->{'name'}); for ( my $i = (22 - $length); $i >= 0; $i--) { print CLUSTAL " "; } print CLUSTAL $dataset->{'value'}; print CLUSTAL "\n"; } # print the alignment data print CLUSTAL " "; print CLUSTAL $seq->{'align'}; print CLUSTAL "\n\n"; } close(CLUSTAL); } } sub classifyData { my ($pairs) = @_; my $line = readLine(); while ( defined($line) ) { my $type = getLineType($line); if ($type eq "unknown") { print "[ERROR] Found 'unknown type for line '$line'\n"; } elsif ($type eq "data") { my @data; # read as many data lines as possible while($type eq "data") { push(@data, $line); $line = readLine(); $type = getLineType($line); } # process the sequence processSequence(\@data, $pairs); } else { $line = readLine(); } } return $pairs; } sub processSequencePairs { my ($pairs) = @_; foreach my $pair (keys(%$pairs)) { my @arr = @{$pairs->{$pair}}; my $start_index = 0; my $end_index = $#arr; # clean up first part of sequence while(cleanUpData($arr[$start_index++], "front")) { shift(@{$pairs->{$pair}}); } # clean up last part of sequence IF STILL SOMETHING THERE!!! while(cleanUpData($arr[$end_index--], "back")) { pop(@{$pairs->{$pair}}); } } } sub cleanUpData { my ($seq, $start_point) = @_; my $longestDashedSequence = -1; my $sequenceWraps = 0; foreach my $hash (@{$seq->{'data'}}) { # bail immediately if the whole line is dashes if ($hash->{'value'} =~ m/^-*$/) { return 1; } # only inspect sequence if the first (or last) character is a '-' elsif (($hash->{'value'} =~ m/^-/ && ($start_point eq "front")) || ($hash->{'value'} =~ m/-$/ && ($start_point eq "back"))) { my $lastDashIndex = 0; # convert string to an array for processing a char at a time my @value = split(//, $hash->{'value'}); # store index of last dash if ($start_point eq "front") { $lastDashIndex = searchForLastIndexFromBeginning(\@value, "\-"); } elsif ($start_point eq "back") { $lastDashIndex = searchForLastIndexFromEnd(\@value, "\-"); } else { print "[ERROR] Unexpected start point $start_point\n"; } # record the length of the longest string of dashes if ($lastDashIndex > $longestDashedSequence) { $longestDashedSequence = $lastDashIndex; } } } # now remove the actual values if ($longestDashedSequence > -1) { # remove the values from the alignment unless( $start_point eq "back" ) { # trim from the beginning of the string $seq->{'align'} = substr( $seq->{'align'}, $longestDashedSequence); } else { my $length = length($seq->{'align'}); # trim from the end of the string $seq->{'align'} = substr( $seq->{'align'}, 0, ($length - $longestDashedSequence)); } # now remove the values from the actual sequence foreach my $hash (@{$seq->{'data'}}) { unless ( $start_point eq "back" ) { # trim from the beginning of the string $hash->{'value'} = substr( $hash->{'value'}, $longestDashedSequence); } else { my $length = length($hash->{'value'}); # trim from the end of the string $hash->{'value'} = substr( $hash->{'value'}, 0, ($length - $longestDashedSequence)); } } } return 0; } sub length { my ($arr) = @_; my @val = @$arr; return $#val; } sub count { my ($arr, $char) = @_; my $count = 0; foreach my $c (@$arr) { if ($c eq $char) { $count++; } } return $count; } sub searchForLastIndexFromBeginning { my ($input, $exp) = @_; my @value = @$input; my $lastDashIndex = 0; for (my $i = 0; $i <= $#value; $i++) { if ($value[$i] =~ m/$exp/) { $lastDashIndex++; } else { last; } } return $lastDashIndex; } sub searchForLastIndexFromEnd { my ($input, $exp) = @_; my @value = @$input; my $lastDashIndex = 0; for (my $i = $#value; $i >= 0; $i--) { if ($value[$i] =~ m/$exp/) { $lastDashIndex++; } else { last; } } return $lastDashIndex; } # need to tease out the lines to dump each sequence in a separate file # need to preserve the "full" value (e.g. "lcl|) to reconstruct the line sub processSequence { my ($sequence, $pairs) = @_; my $pair_key = (); my $pair_value = (); my $pair_alignment = undef; my $skip_sequence = 1; # slurp out the pair_name and pair_value foreach my $l (@$sequence) { my $key = $l; # internal bookkeeping $skip_sequence = 0; # values stored in result my $full_name = $l; my $value = $l; my $alignment = $l; # extract pair name #$full_name =~ s/(lcl\|\w+)\s+.*/$1/; $full_name =~ s/(lcl\|\S+)\s+.*/$1/; #$key =~ s/lcl\|(\w+|)\s+.*/$1/; $key =~ s/lcl\|(\S+)\s+.*/$1/; #$value =~ s/lcl\|\w+\s+(.*)/$1/; $value =~ s/lcl\|\S+\s+(.*)/$1/; # hard-coded to read everything after the 21st character $alignment =~ s/\s{21}(\s*\**)/$1/; # no keys or values on this line, but it is an alignment if ($key eq $l && $value eq $l && $alignment ne $l) { if (!defined($pair_alignment)) { $pair_alignment = $alignment; } else { print "[ERROR] Already found alignment for {$pair_key}, skipping sequence\n"; $skip_sequence = 1; } next; } if ($key ne $l) { push(@$pair_key, $key); } else { print "[ERROR] Problem with '$l', unable to extract sequence name, skipping sequence\n"; $skip_sequence = 1; } if ($full_name ne $l && $value ne $l) { my $seq = { 'name' => $full_name, 'value' => $value }; push(@$pair_value, $seq); } else { print "[ERROR] Problem with '$l', unable to extract sequence value, skipping sequence\n"; $skip_sequence = 1; } } unless($skip_sequence) { # add the sequence to the results my $pair_key = buildSequenceKey($pair_key); if (!defined($pairs->{$pair_key})) { $pairs->{$pair_key} = (); } my $result = { 'data' => $pair_value, 'align' => $pair_alignment }; # put the values in an order preserving array push(@{$pairs->{$pair_key}}, $result); } return $pairs; } # read and validate $num_lines from the $file and validate against # type specified by $expected_type sub readData { my ($num_lines, $expected_type) = @_; my $lines = (); for (my $i = 0; $i < $num_lines; $i++) { my $line = readLine(); my $type = getLineType($line); if ($type ne $expected_type) { print "[ERROR] Expected $expected_type, got $type for line \"$line\"\n"; } push(@$lines, $line); } return $lines; } # method to build a key for any given sequence tuple sub buildSequenceKey { my ($values) = @_; my $result = undef; foreach my $k (@$values) { unless( !defined($result) ) { $result .= $k; } else { $result = $k; } } return $result; } # method to read a line from the file sub readLine { my $line = ; if (!defined($line)) { return undef; } chomp($line); return $line; } # method to determine line type sub getLineType { my ($line) = @_; if (!defined($line)) { return "unknown"; } if ($line =~ /^\>/ || $line =~ /^CLUSTAL/) { return "header"; } elsif ($line =~ /^$/) { return "empty_line"; } elsif ($line =~ /^lcl/ || $line =~ /^\s*|\**$/) { return "data"; } else { return "unknown"; } } sub usage { print "./marine.pl --file --clustal_dir \n"; }