Installation and use of blast_it.pl
Provided as-is. For any questions contact mbrieuc at u.washington.edu

1. Install Perl and the BioPerl module.
2. Download the blast (blastall) and formatdb executables from NCBI and place them in a folder (recommended: /bin/blast).
3. Create the databases using formatdb and place them in a databases folder (recommended: /data/databases).
4. Create one folder for each sequence file you would like to blast and place those folders in the /data folder. Each sequence file should be named NAME.fa and the folder named NAME (same name as the fasta file without the .fa extension).
5. Copy this script, save it as blast_it.sh, and place it in the main working directory.
-----------------------------------------------------------------------
#!/bin/sh
##############################
# Main Blast IT Script
# Author - Giles Goetz
##############################

# Run this script from the main working directory: blast_it.pl is looked
# up in the current directory, and so is the run script it generates.

# First Step
# Run the blast_it.pl script to generate
# the blast script and the run blast script

# Set the working directory
WKDIR=`pwd`

# Time stamp (yymmdd_HHMMSS) that ties the generated files of one run together
TIMESTAMP=`/bin/date +%y%m%d_%H%M%S`

# The expansions are quoted so a working directory containing spaces works.
# Stop here if the generator reports a failure.
/usr/bin/perl "${WKDIR}/blast_it.pl" "${TIMESTAMP}" || exit 1

# Next Run the run genomic script
RUN_SCRIPT="${WKDIR}/run_blast_it.${TIMESTAMP}.sh"

# Do not try to run a script that was never written.
if [ ! -f "${RUN_SCRIPT}" ]; then
    echo "ERROR: ${RUN_SCRIPT} was not generated." >&2
    exit 1
fi

/bin/sh "${RUN_SCRIPT}"
-----------------------------------------------------------------------
6.
Copy this script, save it as blast_it.pl, and place it in the main working directory. Set the working directory in the script (the $working_dir line); if the placeholder is left unchanged, the directory the script is run from is used.
-----------------------------------------------------------------------
#!/usr/bin/perl -w
###########################################
# Version 2.0 of Blast IT
# Automated Blast
# Provided as-is
# For any question contact mbrieuc at u.washington.edu
###########################################
#
# Interactive generator.  Reads blast_it.conf from the current directory,
# asks which data folders and which configured blasts to process, then
# writes two files into the working directory:
#   blast_it_TIMESTAMP.pl      the script that runs the selected blasts
#   run_blast_it.TIMESTAMP.sh  the launcher for that script
# TIMESTAMP is the first command-line argument (blast_it.sh supplies it).
#
# NOTE(review): the generated script calls bin/blast2spreadsheet.pl and
# bin/mergetable.pl under the working directory.  Neither is part of this
# document, and blast2table.pl (step 8) is only referenced by a
# commented-out call below - confirm which converter is intended.

use strict;
use warnings;
use Cwd;

# Read one line typed by the user, without its trailing newline.
# Returns undef at end of input.  readline() is spelled out instead of the
# angle-bracket operator so the read cannot be lost when this listing is
# passed through a tool that strips tag-like text.
sub read_answer {
    my $answer = readline(*STDIN);
    return unless defined $answer;
    chomp $answer;
    return $answer;
}

# Print a fatal message on STDOUT (where every other message goes) and
# stop with a non-zero status so that blast_it.sh does not carry on.
sub fail {
    print @_;
    exit 1;
}

# First step is show into screen and create timestamp
# as well as load the default configuration
print "\n\tWelcome to Blast IT 2.0\n";
print "\tAuthor - Giles Goetz\n\n";

# Set Working Directory
my $placeholder = "ADDRESS OF THE WORKING DIRECTORY HERE";
my $working_dir = "ADDRESS OF THE WORKING DIRECTORY HERE";
# An unedited placeholder can never be a usable path: use the current
# directory, which is also where blast_it.conf is read from.
if ($working_dir eq $placeholder) {
    $working_dir = getcwd();
}
print "Setting the Working Directory to Current Directory.\n";
print "Working Directory: $working_dir\n";

# Load configuration
my $data_dir;            # directory holding one folder per sequence file
my $blast_location;      # directory holding the blastall executable
my $blast_blastdbs;      # directory holding the formatted databases
my @blast_parameters;    # [N] = { type => ..., db => ... } from "BlastN:" lines

print "\nLoading Configuration\n";
open(my $config_fh, '<', "blast_it.conf")
    or fail("\nERROR: Unable to load configuration file blast_it.conf.\n",
            "Make sure it is in the same directory as this program.\n\n");

# Read each line of config file
my $in_blast_section = 0;
while (defined(my $line = readline($config_fh))) {
    # Remove the trailing line return (and any trailing blanks or CR)
    $line =~ s/\s+\z//;

    if (!$in_blast_section) {
        if ($line =~ /^DataDirectory:\s(.*)$/) {
            # Directory that stores all the data files for processing
            $data_dir = $working_dir . "/" . $1;
            print "DataDirectory: $data_dir\n";
        }
        elsif ($line =~ /^BlastParameters:$/) {
            $in_blast_section = 1;
        }
        next;
    }

    # Inside the BlastParameters section: blank lines and # comments are
    # skipped rather than treated as a bad parameter.
    next if $line =~ /^\s*(?:#.*)?$/;

    if ($line =~ /^Blast(\d+):\s(\S+)\s(\S+)$/) {
        print "Blast Parameter: $2 $3\n";
        $blast_parameters[$1]{"type"} = $2;
        $blast_parameters[$1]{"db"}   = $3;
    }
    elsif ($line =~ /^Location:\s(.*)$/) {
        print "Blast Parameter: Location $1\n";
        $blast_location = $working_dir . "/" . $1;
    }
    elsif ($line =~ /^BlastDBs:\s(.*)$/) {
        print "Blast Parameter: BlastDBs $1\n";
        $blast_blastdbs = $working_dir . "/" . $1;
    }
    elsif ($line =~ /^EndBlastParameters:$/) {
        $in_blast_section = 0;
    }
    else {
        fail("ERROR: Bad BlastParameter in config file.\n");
    }
}
close($config_fh);

if (!defined $data_dir) {
    fail("ERROR: No DataDirectory entry found in blast_it.conf.\n");
}
print "Configuration Loaded\n\n";

# Generate TimeStamp
print "Generating Time Stamp\n";
# Changed to get from commandline
my $timestamp = shift(@ARGV);
if (!defined $timestamp) {
    fail("ERROR: No time stamp given. Run blast_it.sh instead of blast_it.pl.\n");
}
print "TimeStamp: $timestamp\n";

# Ok Now we select our options...
# Select folders to process
print "\nType in the name(s) of the folder(s) you wish to process.\n";
print "Enter each one separately, when finished type 'done'.\n";
print "Enter name of directory > ";

# Loop to save names of folders, if user types done it ends loop
# @base_dirs = variable to save names of folders
my @base_dirs = ();
while (defined(my $folder = read_answer())) {
    # No more inputs, end this loop
    last if $folder eq "done";

    # An empty name would point at the data directory itself
    if ($folder eq "") {
        print "Enter name of directory > ";
        next;
    }

    push(@base_dirs, $folder);
    print "Added directory $folder\n";
    print "Enter another or type 'done' > ";
}

# Just to let the user know what directories will be processed
print "\nListing directories that will be processed.\n";
for my $indx (0 .. $#base_dirs) {
    print(($indx + 1) . ". $base_dirs[$indx]\n");
}
print "\n";

# Blast
my $blast     = 0;     # true when the user wants blasts to be run
my @blast_run = ();    # [N] is true when configured blast N was selected

print "Blast Parameters\n";
print "Do you wish to use Blast?\n";
print "Type 'y' or 'n' > ";
while (defined(my $answer = read_answer())) {
    if ($answer eq "y") {
        $blast = 1;
        print "Blast: Use (Yes)\n";
        print "\nSelect Blast(s) to run\n";
        print "List of Available Blast(s)\n";
        for my $indx (1 .. $#blast_parameters) {
            # Numbers missing from the config file are not listed
            next unless defined $blast_parameters[$indx];
            print "$indx: Blast Type: $blast_parameters[$indx]{'type'}\t"
                . "Database: $blast_parameters[$indx]{'db'}\n";
        }

        # Clear the blast_run variable
        @blast_run = (0) x scalar(@blast_parameters);

        print "Enter the number for each one separately or type 'all' for all\n";
        print "> ";
        while (defined(my $choice = read_answer())) {
            if ($choice =~ /^(\d+)$/) {
                my $number = $1;
                # Only accept numbers that were actually configured
                if (   $number >= 1
                    && $number <= $#blast_parameters
                    && defined $blast_parameters[$number])
                {
                    $blast_run[$number] = 1;
                    print "You set #$number -> $blast_parameters[$number]{'type'} "
                        . "$blast_parameters[$number]{'db'}\n";
                    print "Enter another number or type 'done'\n";
                }
                else {
                    print "ERROR: There is no Blast #$number in the list\n";
                }
                print "> ";
            }
            elsif ($choice eq 'all') {
                for my $indx (1 .. $#blast_parameters) {
                    $blast_run[$indx] = 1 if defined $blast_parameters[$indx];
                }
                print "All Blast(s) set\n";
                last;
            }
            elsif ($choice eq 'done') {
                last;
            }
            else {
                print "ERROR: Invalid Input, enter a number or type 'all'\n";
                print "> ";
            }
        }
        last;
    }
    elsif ($answer eq "n") {
        $blast = 0;
        print "Blast: Use (No)\n";
        last;
    }
    else {
        print "ERROR: Bad Input\n";
        print "Type either 'y' or 'n' > ";
    }
}
print "\n";

# The blast command lines below are built from these two settings
if ($blast && !(defined $blast_location && defined $blast_blastdbs)) {
    fail("ERROR: Location and BlastDBs must be set in the BlastParameters",
         " section of blast_it.conf.\n");
}

# Start Generation of Script
print "\nGenerating Blast IT Script\n";
my $script_file     = $working_dir . "/blast_it_" . $timestamp . ".pl";
my $log_file        = $working_dir . "/blast_it_" . $timestamp . ".log";
my $run_script_file = $working_dir . "/run_blast_it." . $timestamp . ".sh";
my $backup_dir      = $working_dir . "/backup";

# The generated script moves its files into the backup directory.  If that
# directory did not exist, mv would rename each file to "backup" instead.
if (!-d $backup_dir && !mkdir($backup_dir)) {
    fail("ERROR: Unable to create backup directory $backup_dir: $!\n");
}

open(my $script_fh, ">", $script_file)
    or fail("ERROR: Unable to open script file $script_file.\n");

# Write Header for script
print {$script_fh} "#!/usr/bin/perl\n";
print {$script_fh} "# Blast IT Script Generated $timestamp\n";
print {$script_fh} "print \"Blast IT 2.0\\n\";\n";
print {$script_fh} "print \"Author - Giles Goetz\\n\";\n\n";

# Same for every blast, so built once.  The backslashes end up in the
# generated script, where \% inside its double quotes is again a plain %.
my @date_args = ("/bin/date", "+\\\%y\\\%m\\\%d_\\\%H\\\%M\\\%S");

# Now need Loop to add for each directory processed
for my $base (@base_dirs) {
    my $base_dir = $data_dir . "/" . $base;

    # Start every folder with an empty mergetable list so file names from a
    # previous folder are never carried over (also when blast is not used).
    my @mergetable_args = ();

    # Now for each one we add the parts
    print {$script_fh} "print \"Working in directory " . $working_dir . "\\n\";\n";
    print {$script_fh} "chdir \"$working_dir\";\n\n";

    print {$script_fh} "print \"Using base of " . $base . "\\n\";\n";
    print {$script_fh} "print \"Changing directory to " . $base_dir . "\\n\";\n";
    print {$script_fh} "chdir \"$base_dir\";\n\n";

    # $phred_input = $base_dir . "/chromat";
    # $phred_out_fasta = $base_dir . "/phred_" . $base . ".fa";
    # $phred_out_qual = $base_dir . "/phred_" . $base . ".fa.qual";

    if ($blast) {
        # Loop to set the Blast Stuff
        for my $indx (1 .. $#blast_run) {
            next unless $blast_run[$indx];

            my $blast_type   = $blast_parameters[$indx]{'type'};
            my $blast_db     = $blast_parameters[$indx]{'db'};
            my $blast_db_loc = $blast_blastdbs . "/" . $blast_db;
            my $blast_input  = $base_dir . "/" . $base . ".fa";
            my $blast_output = $base_dir . "/" . $blast_type . "_" . $blast_db
                             . "_" . $base . ".out";
            my $blast_v = 1;    # value passed to blastall -v
            my $blast_b = 1;    # value passed to blastall -b

            print {$script_fh} "print \"Starting " . $blast_type . " with "
                             . $blast_db . " database.\\n\";\n";
            print {$script_fh} "system(\"@date_args\");\n";

            my @blast_args = ($blast_location . "/blastall",
                              "-p " . $blast_type,
                              "-d " . $blast_db_loc,
                              "-i " . $blast_input,
                              "-o " . $blast_output,
                              "-v " . $blast_v,
                              "-b " . $blast_b);
            print {$script_fh} "system(\"@blast_args\");\n";
            print {$script_fh} "print \"Blast Finished\\n\";\n";
            print {$script_fh} "system(\"@date_args\");\n\n";
            print {$script_fh} "print \"\\n\";\n";

            # Blast2Table Part goes here
            print {$script_fh} "print \"Gathering Blast data into .tab file\\n\\n\";\n";
            my $blast2table_input  = $blast_output;
            my $blast2table_output = $blast_output . ".tab";
            # @blast2table_args = ($working_dir . "/bin/blast2table.pl",
            #                      $blast2table_input,
            #                      $blast_type,
            #                      $blast_db);
            my @blast2table_args = ($working_dir . "/bin/blast2spreadsheet.pl",
                                    "-i $blast2table_input",
                                    "-h 1");
            print {$script_fh} "system(\"@blast2table_args\");\n\n";

            push(@mergetable_args, $blast2table_output);
        }
    }

    # MergeTable goes here
    # Command line is: mergetable.pl OUTPUT INPUT.tab [INPUT.tab ...]
    my $mergetable_loc    = $working_dir . "/bin/mergetable.pl";
    my $mergetable_output = $base_dir . "/blast_" . $base . ".out.tab";
    unshift(@mergetable_args, $mergetable_loc, $mergetable_output);
    print {$script_fh} "print \"Running Mergetable\\n\";\n";
    print {$script_fh} "system(\"@mergetable_args\");\n";
    print {$script_fh} "print \"Finished with Mergetable\\n\\n\";\n\n";
} # End For loop

# Ok cleanup here
print {$script_fh} "print \"Cleaning up files\\n\";\n\n";

# mv the run script to backup directory
my @mv_run_script_args = ("/bin/mv", $run_script_file, $backup_dir);
print {$script_fh} "print \"Moving Run Script File: $run_script_file\\n\";\n";
print {$script_fh} "system(\"@mv_run_script_args\");\n\n";

# mv the generated script to backup directory
my @mv_script_args = ("/bin/mv", $script_file, $backup_dir);
print {$script_fh} "print \"Moving Script File: $script_file\\n\";\n";
print {$script_fh} "system(\"@mv_script_args\");\n\n";

# mv the log file to backup directory
my @mv_log_args = ("/bin/mv", $log_file, $backup_dir);
print {$script_fh} "print \"Moving Log File: $log_file\\n\";\n";
print {$script_fh} "system(\"@mv_log_args\");\n\n";

close($script_fh)
    or fail("ERROR: Unable to finish writing script file $script_file.\n");

# Startup Genomic Run Program
# Asked before the run script is written, so an unanswered question never
# leaves a run script without its command line behind.
my $run_script_args;
print "Do you wish to run the script in the foreground or background?\n";
print "Type either 'f' or 'b' > ";
while (defined(my $mode = read_answer())) {
    if ($mode eq 'f') {
        # Foreground: output is shown on screen and copied to the log by tee
        $run_script_args = "/usr/bin/perl " . '${SCRIPT_FILE} 2>&1 | '
                         . "/usr/bin/tee " . '${LOG_FILE}' . "\n";
        print "Running Script in Foreground\n\n";
        last;
    }
    elsif ($mode eq 'b') {
        # Background: all output goes to the log file only
        $run_script_args = "/usr/bin/perl "
                         . '${SCRIPT_FILE} 1>${LOG_FILE} 2>&1 &' . "\n";
        print "Running Script in Background\n\n";
        last;
    }
    else {
        print "ERROR: Bad Input\n";
        print "Type either 'f' or 'b' > ";
    }
}
if (!defined $run_script_args) {
    fail("\nERROR: No run mode was selected.\n");
}

# Need to generate run script
open(my $run_fh, ">", $run_script_file)
    or fail("ERROR: Unable to open run script file $run_script_file.\n");

# Print Header
print {$run_fh} "#!/bin/sh\n";
print {$run_fh} "# Script automatically generated by Genomic IT\n";
print {$run_fh} "# Author - Giles Goetz\n\n";

# Print Variables
print {$run_fh} 'SCRIPT_FILE="' . $script_file . '"' . "\n";
print {$run_fh} 'LOG_FILE="' . $log_file . '"' . "\n\n";

# Print the run script line
print {$run_fh} $run_script_args;

# Close Run Script
close($run_fh)
    or fail("ERROR: Unable to finish writing run script file $run_script_file.\n");

# Quit
# EOF
---------------------------------------------------------------------------------
7. Create the blast_it.conf file and place it in the main working directory. Directory and folder entries are taken relative to the working directory. It should have the following layout:

DataDirectory: *Directory where the data folders can be found*
BlastParameters: *leave it empty*
Location: *Folder where the blast executable can be found*
BlastDBs: *Folder where the databases can be found*
Blast1: *Type of blast* *NAME_DATABASE1.fasta*
Blast2: *ex:blastn* *NAME_DATABASE2.fasta*
## Add as many databases as you want
---------------------------------------------------------------------------------
8. Copy and save the following script as blast2table.pl and place it in the /bin folder.

#!/usr/bin/perl -W
#####################################################
## Perl Script - Author: Giles Goetz
## Designed to parse blast output file to some format
#####################################################
#
# Usage: blast2table.pl FILE BLAST_TYPE BLAST_DB [FILE BLAST_TYPE BLAST_DB ...]
#
# For every (FILE, BLAST_TYPE, BLAST_DB) triple the blast report FILE is
# read and FILE.tab is written: a tab-separated table with a header line
# and one row per "Query= " record, holding that query's first listed hit
# or "No Hits Found".

use strict;
use warnings;

# Return the next line of the report without its newline.  A report that
# ends in the middle of a record is reported and the program stops.
# readline() is spelled out instead of the angle-bracket operator so the
# read cannot be lost when this listing is passed through a tool that
# strips tag-like text.
sub read_report_line {
    my ($fh, $file) = @_;
    my $line = readline($fh);
    if (!defined $line) {
        print "\nERROR: Bad End of line in Input File: " . $file . "\n";
        print "Check to see if blast finished.\n\n";
        exit 1;
    }
    chomp $line;
    return $line;
}

# Arguments are consumed three at a time; an incomplete triple would leave
# the blast type or database undefined.
if (@ARGV % 3 != 0) {
    print "Usage: blast2table.pl FILE BLAST_TYPE BLAST_DB",
          " [FILE BLAST_TYPE BLAST_DB ...]\n";
    exit 1;
}

while (@ARGV) {
    my $file       = shift(@ARGV);
    my $blast_type = shift(@ARGV);
    my $blast_db   = shift(@ARGV);
    my $table_file = $file . ".tab";

    my ($in, $out);
    if (!open($in, "<", $file)) {
        print "\nERROR: Unable to open Input File: $file: $!\n\n";
        exit 1;
    }
    if (!open($out, ">", $table_file)) {
        print "\nERROR: Unable to open Table File: $table_file: $!\n\n";
        exit 1;
    }

    # One entry per query: [seq_id, bps, db, acc_num, tbh, score, e_value].
    # Keeping each query's fields together means a line that fails to parse
    # leaves empty cells instead of shifting later rows.
    my @rows;

    while (defined(my $line = readline($in))) {
        chomp $line;

        # Look for statement 'Query= '
        next unless $line =~ /^Query= (.*)$/;

        # Parse the Values after Query statement
        my $query_string = $1;

        # A header in none of the known layouts is kept as it is, with an
        # empty length.
        my ($seq_id, $bps) = ($query_string, "");

        if ($query_string =~ /^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/) {
            # Check For Cross Match Query String
            ($seq_id, $bps) = ($1, $2);
        }
        elsif ($query_string =~ /^(\S+)\s+(\d+)$/) {
            # Check for RemoveXs Query String
            ($seq_id, $bps) = ($1, $2);
        }
        elsif ($query_string =~ /^[^|]+\|\d+\|[^|]+\|([^|]+)\|(.*)$/) {
            # Check for Generate Fasta Query String
            $seq_id = $1;
            # The length is on a following line of the form "  (NNN word)"
            while (1) {
                my $next = read_report_line($in, $file);
                if ($next =~ /^\s+\((\d+)\s\S+\)(.*)$/) {
                    $bps = $1;
                    last;
                }
            }
        }

        # Now find 'Search...done' string
        while (1) {
            my $next = read_report_line($in, $file);
            last if $next =~ /^Searching([.]*)done$/;
        }

        # Discard this line
        read_report_line($in, $file);

        # Test for no hits
        my $status = read_report_line($in, $file);
        my ($db, $acc_num, $tbh, $score, $e_value) = ("", "", "", "", "");
        if ($status =~ /^\Q ***** No hits found ******\E$/) {
            $tbh = "No Hits Found";
        }
        else {
            # Two more lines are skipped; the line after them is parsed as
            # the hit.
            read_report_line($in, $file);
            read_report_line($in, $file);
            my $data = read_report_line($in, $file);

            # Parse the Data Line
            if ($data =~ /^(\S+)\|(\S*)\|(.*)\s{2,3}(\d+)\s\s\s(.*)$/) {
                ($db, $acc_num, $tbh, $score, $e_value) = ($1, $2, $3, $4, $5);
            }
        }

        push(@rows, [$seq_id, $bps, $db, $acc_num, $tbh, $score, $e_value]);
    } # End of File Loop

    # Print Header for Table File
    print {$out} "SeqID\tBPs\tType of Blast\tDatabase\tAccession Number\t",
                 "Top Blast Hit\tScore\tE Value\n";

    # Foreach Loop to Print to File
    foreach my $row (@rows) {
        my ($seq_id, $bps, @hit_fields) = @{$row};
        print {$out} join("\t", $seq_id, $bps,
                          $blast_type . " " . $blast_db,
                          @hit_fields), "\n";
    } # End of Foreach

    close($in);
    if (!close($out)) {
        print "\nERROR: Unable to finish writing Table File: $table_file: $!\n\n";
        exit 1;
    }
} # End of ARGV Loop
-------------------------------------------------------------------------
9. To run Blast IT: do not run blast_it.pl itself, this will not work. Run blast_it.sh from the main working directory and follow the instructions on the screen.