#!/usr/bin/perl -w ################################################################################ # Provided as-is # For any question contact mbrieuc at u.washington.edu ########################################### # Command line: # ./clustal_align.pl --file name_input_file # Align homologous sequences for 6 species ; the number of species can easily be adjusted # Change the name of the databases on line 61-66 use Getopt::Long; # Input data file - File with name of contig and names of ortholog sequences for the 6 species my $in_file = undef; GetOptions("file=s" => \$in_file); $out_file = $in_file . "_output.txt"; $stat_out_file = $in_file . "_StatFile.txt"; # Files used in data processing $sp1Seq = "sp1.fasta"; $sp2Seq = "sp2.fasta"; $sp3Seq = "sp3.fasta"; $sp4Seq = "sp4.fasta"; $sp5Seq = "sp5.fasta"; $sp6Seq = "sp6.fasta"; $seqPairFile = "seqPair.fa"; # Remove $out_file from any previous script outputs if (-e "$out_file") { unlink($out_file); } # Open the homology data file open (HOMOLOGY, $in_file) || die "Major problem: cannot open $in_file for reading: $!"; while () # Read one line at a time { chomp($_); # Get rid of new line at end of each line @data = split(/\t/, $_); # Split the line in to tab-delimited fields # Get sequence names for each species $contig_Name=$data[0]; $sp1_Name = $data[1]; $sp2_Name = $data[2]; $sp3_Name = $data[3]; $sp4_Name = $data[4]; $sp5_Name = $data[5]; $sp6_Name = $data[6]; print "$sp1_Name vs. $sp2_Name vs. $sp3_Name vs. $sp4_Name vs. $sp5_Name vs. $sp6_Name\n"; # Get a sequence from a BLAST-formatted database using the "fastacmd" syntax # fastacmd -d database -s sequence_name and append (>) to a file `./fastacmd -d db1.fasta -s $sp1_Name > $sp1Seq`; `./fastacmd -d db2.fa -s $sp2_Name > $sp2Seq`; `./fastacmd -d db3.fa -s $sp3_Name > $sp3Seq`; `./fastacmd -d db4.fa -s $sp4_Name > $sp4Seq`; `./fastacmd -d db5.fa -s $sp5_Name > $sp5Seq`; `./fastacmd -d db6.fa -s $sp6_Name > $sp6Seq`; # Concatenate all six sequences to same file (if doing clustal alignment) `cat $sp1Seq $sp2Seq $sp3Seq $sp4Seq $sp5Seq $sp6Seq > $seqPairFile`; # Do alignments (perform subroutines below) doClustal(); } # Close file handles and delete temporary files close (HOMOLOGY); unlink ($sp1Seq, $sp2Seq, $sp3Seq, $sp4Seq, $sp5Seq, $sp6Seq, $seqPairFile); print "\nSee $out_file for output\n\n"; ########## subroutines ########## sub doClustal { $clustalOut = "$contig_Name.aln"; $clustalStat= "$contig_Name.Stat.txt"; `./clustalw2 -INFILE=$seqPairFile -TYPE=DNA -OUTFILE=$clustalOut -STATS=$clustalStat -ENDGAPS`; open (STAT, $clustalStat) || die "Major problem: cannot open $clustalStat for reading: $!"; while () { chomp($_); # Get rid of new line at end of each line @data = split(/\:/, $_); # Split the line in to tab-delimited fields # Get names of sequences for each alignment $type = $data[0]; $value = $data[1]; if($type eq "aln pw-id lowest") { if($value != 0.00) { # Add sequence headers to big alignment file `grep ">" $seqPairFile >> $out_file`; # Append alignment to big file and delete the small files if there is an overlapping region to all 6 species `cat $clustalOut >> $out_file`; `cat $clustalStat >> $stat_out_file`; unlink($clustalOut); } } } unlink("seqPair.dnd",$clustalStat); close (STAT); }