Index: Bio/Tools/Run/RemoteBlast.pm =================================================================== RCS file: /home/repository/bioperl/bioperl-live/Bio/Tools/Run/RemoteBlast.pm,v retrieving revision 1.5 diff -a -u -r1.5 RemoteBlast.pm --- Bio/Tools/Run/RemoteBlast.pm 16 May 2001 14:57:48 -0000 1.5 +++ Bio/Tools/Run/RemoteBlast.pm 22 Mar 2005 16:50:36 -0000 @@ -1,8 +1,8 @@ -# $Id: RemoteBlast.pm,v 1.5 2001/05/16 14:57:48 heikki Exp $ +# $Id: RemoteBlast.pm,v 1.21 2004/11/17 23:54:50 bosborne Exp $ # -# BioPerl module for Bio::Tools::StandAloneBlast +# BioPerl module for Bio::Tools::Run::RemoteBlast # -# Cared for by Jason Stajich +# Cared for by Jason Stajich, Mat Wiepert # # Copyright Jason Stajich # @@ -12,28 +12,95 @@ =head1 NAME -Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast via HTTP +Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast +via HTTP =head1 SYNOPSIS -Remote-blast "factory object" creation and blast-parameter initialization: + #Remote-blast "factory object" creation and blast-parameter initialization - $factory = Bio::Tools::Run::RemoteBlast->new(@params); + use Bio::Tools::Run::RemoteBlast; + use strict; + my $prog = 'blastp'; + my $db = 'swissprot'; + my $e_val= '1e-10'; + + my @params = ( '-prog' => $prog, + '-data' => $db, + '-expect' => $e_val, + '-readmethod' => 'SearchIO' ); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + #change a query paramter + $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]'; + + #change a retrieval parameter + $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'DESCRIPTIONS'} = 1000; + + #remove a parameter + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; + + #$v is just to turn on and off the messages + my $v = 1; + + my $str = Bio::SeqIO->new(-file=>'amino.fa' , -format => 'fasta' ); + + while (my $input = $str->next_seq()){ + #Blast a sequence against a database: + + #Alternatively, you could pass in a file with many + #sequences rather than loop through sequence one at a time + #Remove the loop starting 'while (my $input = $str->next_seq())' + #and swap the two lines below for an example of that. + my $r = $factory->submit_blast($input); + #my $r = $factory->submit_blast('amino.fa'); + + print STDERR "waiting..." if( $v > 0 ); + while ( my @rids = $factory->each_rid ) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + print STDERR "." if ( $v > 0 ); + sleep 5; + } else { + my $result = $rc->next_result(); + #save the output + my $filename = $result->query_name()."\.out"; + $factory->save_output($filename); + $factory->remove_rid($rid); + print "\nQuery Name: ", $result->query_name(), "\n"; + while ( my $hit = $result->next_hit ) { + next unless ( $v > 0); + print "\thit name is ", $hit->name, "\n"; + while( my $hsp = $hit->next_hsp ) { + print "\t\tscore is ", $hsp->score, "\n"; + } + } + } + } + } + } -Blast a sequence against a database: + # This example shows how to change a CGI parameter: + $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25'; - $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta' ); - $input = $str->next_seq(); - $input2 = $str->next_seq(); - $blast_report = $factory->runblast($input); + # And this is how to delete a CGI parameter: + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; -Various additional options and input formats are available. See the -DESCRIPTION section for details. =head1 DESCRIPTION Class for remote execution of the NCBI Blast via HTTP. +For a description of the many CGI parameters see: +http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html + +Various additional options and input formats are available. + =head1 FEEDBACK =head2 Mailing Lists @@ -56,7 +123,7 @@ =head1 AUTHOR - Jason Stajich -Email jason@chg.mc.duke.edu +Email jason@bioperl.org =head1 APPENDIX @@ -67,93 +134,210 @@ package Bio::Tools::Run::RemoteBlast; -use vars qw($AUTOLOAD @ISA %BLAST_PARAMS $URLBASE %HEADER %RETRIEVALHEADER - $RIDLINE); +use vars qw($AUTOLOAD @ISA $URLBASE %HEADER %RETRIEVALHEADER + $RIDLINE $MODVERSION %PUTPARAMS %GETPARAMS); use strict; -use Bio::Root::RootI; +use Bio::Root::Root; use Bio::Root::IO; use Bio::SeqIO; use IO::String; use Bio::Tools::BPlite; -use Bio::Tools::Blast; +use Bio::SearchIO; use LWP; use HTTP::Request::Common; -BEGIN { + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +BEGIN { + $MODVERSION = $Bio::Root::Version::VERSION; $URLBASE = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi'; + + # In GET/PUTPARAMS the values are regexes which validate the input. + %PUTPARAMS = ( + 'AUTO_FORMAT' => '(Off|(Semi|Full)auto)', # Off, Semiauto, Fullauto + 'COMPOSITION_BASED_STATISTICS' => '(yes|no)', # yes, no + 'DATABASE' => '.*', + 'DB_GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22 + 'ENDPOINTS' => '(yes|no)', # yes,no + 'ENTREZ_QUERY' => '.*', + 'EXPECT' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double + 'FILTER' => '[LRm]', # L or R or m + 'GAPCOSTS' => '-?\d+(\.\d+)\s+i-?\d+(\.\d+)', + # Two space separated float values + 'GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22 + 'HITLIST_SIZE' => '\d+', # Positive integer + 'I_THRESH' => '-?\d+(\.\d+)([eE]-\d+)?', # float + 'LAYOUT' => '(One|Two)Windows?', # onewindow, twowindows + 'LCASE_MASK' => '(yes|no)', # yes, no + 'MATRIX_NAME' => '.*', + 'NUCL_PENALTY' => '-\d+', # Negative integer + 'NUCL_REWARD' => '-?\d+', # Integer + 'OTHER_ADVANCED' => '.*', + 'PERC_IDENT' => '\d\d+', # Integer, 0-99 inclusive + 'PHI_PATTERN' => '.*', + 'PROGRAM' => '(blastp|t?blast[nx])', + # tblastn, tblastx, blastp, blastn, blastx + 'QUERY' => '.*', + 'QUERY_FILE' => '.*', + 'QUERY_BELIEVE_DEFLINE' => '(yes|no)', # yes, no + 'QUERY_FROM' => '\d+', # Positive integer + 'QUERY_TO' => '\d+', # Positive integer + 'SEARCHSP_EFF' => '\d+', # Positive integer + 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)', + # plain,psi,phi,rpsblast,megablast + 'THRESHOLD' => '-?\d+', # Integer + 'UNGAPPED_ALIGNMENT' => '(yes|no)', # yes, no + 'WORD_SIZE' => '\d+' # Positive integer + ); + %GETPARAMS = ( + 'ALIGNMENTS' => '\d+', # Positive integer + 'ALIGNMENT_VIEW' => + '(Pairwise|(Flat)?QueryAnchored(NoIdentities)?|Tabular)', + # Pairwise, QueryAnchored, QueryAnchoredNoIdentities, + # FlatQueryAnchored, FlatQueryAnchoredNoIdentities, Tabular + 'DESCRIPTIONS' => '\d+', # Positive integer + 'ENTREZ_LINKS_NEW_WINDOW' => '(yes|no)', # yes, no + 'EXPECT_LOW' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double + 'EXPECT_HIGH' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double + 'FORMAT_ENTREZ_QUERY' => '', + 'FORMAT_OBJECT' => + '(Alignment|Neighbors|PSSM|SearchInfo|TaxBlast(Parent|MultiFrame)?)', + # Alignment, Neighbors, PSSM, SearchInfo + # TaxBlast, TaxblastParent, TaxBlastMultiFrame + 'FORMAT_TYPE' => '((HT|X)ML|ASN\.1|Text)', + # HTML, Text, ASN.1, XML + 'NCBI_GI' => '(yes|no)', # yes, no + 'RID' => '.*', + 'RESULTS_FILE' => '(yes|no)', # yes, no + 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)', + # plain,psi,phi,rpsblast,megablast + 'SHOW_OVERVIEW' => '(yes|no)' # yes, no + ); + + # Default values go in here for PUT %HEADER = ('CMD' => 'Put', - 'PROGRAM' => '', - 'DATABASE' => '', - 'FILTER' => 'L', - 'EXPECT' => '', - 'QUERY' => '', - 'CDD_SEARCH' => 'off', - 'COMPOSITION_BASED_STATISTICS' => 'off', 'FORMAT_OBJECT' => 'Alignment', - 'SERVICE' => 'plain', + 'COMPOSITION_BASED_STATISTICS' => 'off', + 'DATABASE' => 'nr', + 'EXPECT' => '1e-3', + 'FILTER' => 'L', + 'PROGRAM' => 'blastp', + 'SERVICE' => 'plain' ); - + + # Default values go in here for GET %RETRIEVALHEADER = ('CMD' => 'Get', - 'RID' => '', + 'ALIGNMENTS' => '50', 'ALIGNMENT_VIEW' => 'Pairwise', - 'DESCRIPTIONS' => 100, - 'ALIGNMENTS' => 50, - 'FORMAT_TYPE' => 'Text', + 'DESCRIPTIONS' => '100', + 'FORMAT_TYPE' => 'Text' ); - - $RIDLINE = 'RID\s+=\s+(\d+-\d+-\d+)'; - - %BLAST_PARAMS = ( 'prog' => 'blastp', - 'data' => 'nr', - 'expect' => '1e-3', - 'readmethod' => 'BPlite' - ); + $RIDLINE = 'RID\s+=\s+(\S+)'; } -@ISA = qw(Bio::Root::RootI Bio::Root::IO); - sub new { - my ($caller, @args) = @_; - # chained new - my $self = $caller->SUPER::new(@args); - # so that tempfiles are cleaned up - $self->_initialize_io(); - my ($prog, $data, $expect, - $readmethod) = $self->_rearrange([qw(PROG DATA - EXPECT - READMETHOD)], + my ($caller, @args) = @_; + # chained new + my $self = $caller->SUPER::new(@args); + # so that tempfiles are cleaned up + $self->_initialize_io(); + my ($prog, $data, + $readmethod) = $self->_rearrange([qw(PROG DATA + READMETHOD)], @args); - - $readmethod = $BLAST_PARAMS{'readmethod'} unless defined $readmethod; - $prog = $BLAST_PARAMS{'prog'} unless defined $prog; - $data = $BLAST_PARAMS{'data'} unless defined $data; - $expect = $BLAST_PARAMS{'expect'} unless defined $expect; - $self->readmethod($readmethod); - $self->program($prog); - $self->database($data); - $self->expect($expect); - - return $self; + # Use these two parameters for backward-compatibility. + # Overridden by PROGRAM and DATABASE if supplied. + $self->submit_parameter('PROGRAM',$prog) if $prog; + $self->submit_parameter('DATABASE',$data) if $data; + + $readmethod = 'SearchIO' unless defined $readmethod; + $self->readmethod($readmethod); + + # Now read the rest of the parameters and set them all + + # PUT parameters first + my @putValues = $self->_rearrange([keys %PUTPARAMS],@args); + my %putNames; + @putNames{keys %PUTPARAMS} = @putValues; + foreach my $putName (keys %putNames) { + $self->submit_parameter($putName,$putNames{$putName}); + } + # GET parameters second + my @getValues = $self->_rearrange([keys %GETPARAMS],@args); + my %getNames; + @getNames{keys %GETPARAMS} = @getValues; + foreach my $getName (keys %getNames) { + $self->retrieve_parameter($getName,$getNames{$getName}); + } + + return $self; +} + +=head2 retrieve_parameter + + Title : retrieve_parameter + Usage : my $db = $self->retrieve_parameter + Function: Get/Set the named parameter for the retrieve_blast operation. + Returns : string + Args : $name : name of GET parameter + $val : optional value to set the parameter to + +=cut + +sub retrieve_parameter { + my ($self, $name, $val) = @_; + $name = uc($name); + $self->throw($name." is not a valid GET parameter.") unless + exists $GETPARAMS{$name}; + if (defined $val) { + my $regex = $GETPARAMS{$name}; + $val =~ m/^$regex$/i or + $self->throw("Value ".$val." for GET parameter ".$name." does not match expression ".$regex.". Rejecting."); + $RETRIEVALHEADER{$name} = $val; + } + return $RETRIEVALHEADER{$name}; +} + +=head2 submit_parameter + + Title : submit_parameter + Usage : my $db = $self->submit_parameter + Function: Get/Set the named parameter for the submit_blast operation. + Returns : string + Args : $name : name of PUT parameter + $val : optional value to set the parameter to + +=cut + +sub submit_parameter { + my ($self, $name, $val) = @_; + $name = uc($name); + $self->throw($name." is not a valid PUT parameter.") unless + exists $PUTPARAMS{$name}; + if (defined $val) { + my $regex = $PUTPARAMS{$name}; + $val =~ m/^$regex$/i or + $self->throw("Value ".$val." for PUT parameter ".$name." does not match expression ".$regex.". Rejecting."); + $HEADER{$name} = $val; + } + return $HEADER{$name}; } =head2 header Title : header Usage : my $header = $self->header - Function: Get/Set HTTP header for blast query + Function: Get HTTP header for blast query Returns : string Args : none - + =cut sub header { my ($self) = @_; - my %h = %HEADER; - $h{'PROGRAM'} = $self->program; - $h{'DATABASE'} = $self->database; - $h{'EXPECT'} = $self->expect; - return %h; + return %HEADER; } =head2 readmethod @@ -179,7 +363,7 @@ Title : program Usage : my $prog = $self->program - Function: Get/Set the program to run + Function: Get/Set the program to run. Retained for backwards-compatibility. Returns : string Args : string [ blastp, blastn, blastx, tblastn, tblastx ] @@ -187,15 +371,7 @@ sub program { my ($self, $val) = @_; - if( defined $val ) { - $val = lc $val; - if( $val !~ /t?blast[pnx]/ ) { - $self->warn("trying to set program to an invalid program name ($val) -- defaulting to blastp"); - $val = 'blastp'; - } - $self->{'_program'} = $val; - } - return $self->{'_program'}; + return $self->submit_parameter('PROGRAM',$val); } @@ -203,7 +379,7 @@ Title : database Usage : my $db = $self->database - Function: Get/Set the database to search + Function: Get/Set the database to search. Retained for backwards-compatibility. Returns : string Args : string [ swissprot, nr, nt, etc... ] @@ -211,10 +387,7 @@ sub database { my ($self, $val) = @_; - if( defined $val ) { - $self->{'_database'} = $val; - } - return $self->{'_database'}; + return $self->submit_parameter('DATABASE',$val); } @@ -222,7 +395,7 @@ Title : expect Usage : my $expect = $self->expect - Function: Get/Set the E value cutoff + Function: Get/Set the E value cutoff. Retained for backwards-compatibility. Returns : string Args : string [ '1e-4' ] @@ -230,16 +403,13 @@ sub expect { my ($self, $val) = @_; - if( defined $val ) { - $self->{'_expect'} = $val; - } - return $self->{'_expect'}; + return $self->submit_parameter('EXPECT',$val); } =head2 ua Title : ua - Usage : my $ua = $self->ua or + Usage : my $ua = $self->ua or $self->ua($ua) Function: Get/Set a LWP::UserAgent for use Returns : reference to LWP::UserAgent Object @@ -249,9 +419,12 @@ =cut sub ua { - my ($self, $value) = @_; + my ($self, $value) = @_; if( ! defined $self->{'_ua'} ) { - $self->{'_ua'} = new LWP::UserAgent; + $self->{'_ua'} = new LWP::UserAgent(); + my $nm = ref($self); + $nm =~ s/::/_/g; + $self->{'_ua'}->agent("bioperl-$nm/$MODVERSION"); } return $self->{'_ua'}; } @@ -259,7 +432,7 @@ =head2 proxy Title : proxy - Usage : $httpproxy = $db->proxy('http') or + Usage : $httpproxy = $db->proxy('http') or $db->proxy(['http','ftp'], 'http://myproxy' ) Function: Get/Set a proxy for use of proxy Returns : a string indicating the proxy @@ -270,7 +443,7 @@ sub proxy { my ($self,$protocol,$proxy) = @_; - return undef if ( !defined $self->ua || !defined $protocol + return undef if ( !defined $self->ua || !defined $protocol || !defined $proxy ); return $self->ua->proxy($protocol,$proxy); } @@ -292,7 +465,7 @@ } sub each_rid { - my ($self) = @_; + my ($self) = @_; return keys %{$self->{'_rids'}}; } @@ -310,91 +483,156 @@ =cut sub submit_blast { - my ($self, $input) = @_; - my @seqs = $self->_load_input($input); + my ($self, $input) = @_; + my @seqs = $self->_load_input($input); return 0 unless ( @seqs ); my $tcount = 0; - my %header = $self->header; + my %header = $self->header; foreach my $seq ( @seqs ) { - $header{'QUERY'} = $seq->seq(); + #If query has a fasta header, the output has the query line. + $header{'QUERY'} = ">".(defined $seq->display_id() ? $seq->display_id() : ""). + " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq(); my $request = POST $URLBASE, [%header]; - $self->warn($request->as_string) if ( $self->verbose > 0); - my $response = $self->ua->request( $request); + $self->warn($request->as_string) if ($self->verbose > 0); + my $response = $self->ua->request($request); if( $response->is_success ) { - if( $self->verbose > 0 ) { - open(TMP, ">$ENV{TEMPDIR}/j.html"); - print TMP $response->content; - } my @subdata = split(/\n/, $response->content ); my $count = 0; foreach ( @subdata ) { - if( /$RIDLINE/ ) { - $count++; - print STDERR $_ if( $self->verbose > 0); - $self->add_rid($1); - last; - } + if( /$RIDLINE/ ) { + $count++; + print STDERR $_ if( $self->verbose > 0); + $self->add_rid($1); + last; + } } if( $count == 0 ) { - $self->warn("req was ". $request->as_string() . "\n"); - $self->warn(join('', @subdata)); - } + $self->warn("req was ". $request->as_string() . "\n"); + $self->warn(join('', @subdata)); + } $tcount += $count; - } else { + } else { # should try and be a little more verbose here $self->warn("req was ". $request->as_string() . "\n" . $response->error_as_HTML); - } + $tcount = -1; + } } return $tcount; } +=head2 retrieve_blast + + Title : retrieve_blast + Usage : my $blastreport = $blastfactory->retrieve_blast($rid); + Function: Attempts to retrieve a blast report from remote blast queue + Returns : -1 on error, + 0 on 'job not finished', + Bio::Tools::BPlite or Bio::Tools::Blast object + (depending on how object was initialized) on success + Args : Remote Blast ID (RID) + +=cut + sub retrieve_blast { - my($self, $rid) = @_; - my ($fh,$tempfile) = $self->tempfile(); - my %hdr = %RETRIEVALHEADER; - $hdr{'RID'} = $rid; - my $req = POST $URLBASE, [%hdr]; - if( $self->verbose > 0 ) { - $self->warn("retrieve request is " . $req->as_string()); - } - my $response = $self->ua->request($req, $tempfile); - if( $self->verbose > 0 ) { - open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); - while() { print $_ } - close TMP; - } - if( $response->is_success ) { - my $size = -s $tempfile; - if( $size > 1000 ) { - my $blastobj; - if( $self->readmethod =~ /Blast/ ) { - $blastobj = new Bio::Tools::Blast(-file => $tempfile); - } else { - $blastobj = new Bio::Tools::BPlite(-file => $tempfile); + my($self, $rid) = @_; + my ($fh,$tempfile) = $self->tempfile(); + close $fh; #explicit close + my %hdr = %RETRIEVALHEADER; + $hdr{'RID'} = $rid; + my $req = POST $URLBASE, [%hdr]; + if( $self->verbose > 0 ) { + $self->warn("retrieve request is " . $req->as_string()); + } + my $response = $self->ua->request($req, $tempfile); + if( $response->is_success ) { + # read file until QBlastInfoEnd to pull out status + my $status = ''; + my $junk = ''; + open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); + while( defined (my $line = ) ) { + last if ($line =~ /QBlastInfoEnd/); + ($junk, $status) = (split /=/, $line) if ($line =~ /waiting|ready/i); } - return $blastobj; - } elsif( $size < 500 ) { # search had a problem - open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile"); - $self->warn(join("", )); - close ERR; + close TMP; + + if( $self->verbose > 0 ) { + #print content of reply if verbose > 1 + open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); + while() { print $_; } + close TMP; + } + # Check job status + if ( $status =~ /waiting/i ) { + return 0; + } elsif ( $status =~ /ready/i ) { + my $blastobj; + if( $self->readmethod =~ /BPlite/ ) { + $blastobj = new Bio::Tools::BPlite(-file => $tempfile); + } else { + $blastobj = new Bio::SearchIO( -file => $tempfile, + -format => 'blast'); + } + + ## store filename in object ## + $self->file($tempfile); + return $blastobj; + } else { # search had a problem + open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile"); + $self->warn(join("", )); + close ERR; + return -1; + } + } else { + $self->warn($response->error_as_HTML); return -1; - } else { # still working - return 0; } - } else { - $self->warn($response->error_as_HTML); - return -1; - } +} + +=head2 save_output + + Title : saveoutput + Usage : my $saveoutput = $self->save_output($filename) + Function: Method to save the blast report + Returns : 1 (throws error otherwise) + Args : string [rid, filename] + +=cut + +sub save_output { + my ($self, $filename) = @_; + if( ! defined $filename ) { + $self->throw("Can't save blast output. You must specify a filename to save to."); + } + my $blastfile = $self->file; + #open temp file and output file, have to filter out some HTML + open(TMP, $blastfile) or $self->throw("cannot open $blastfile"); + + open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename"); + my $seentop = 0; + while(my $l = ) { + next if ($l =~ /
/);
+		if( $l =~ /^(?:[T]?BLAST[NPX])\s*.+$/i ||
+			 $l =~/^RPS-BLAST\s*.+$/i ) {
+			$seentop=1;
+		}
+		next if !$seentop;
+		if( $seentop ) {
+			print SAVEOUT $l;
+		}
+	}
+	close TMP;
+	close SAVEOUT;
+	return 1;
 }
 
 sub _load_input {
     my ($self, $input) = @_;
-    
-    if( ! defined $input ) { 
-	$self->throw("Calling runblast with no input");	
-    }    
+
+    if( ! defined $input ) {
+		$self->throw("Calling remote blast with no input");
+    }
     my @seqs;
     if( ! ref $input ) {
 	if( -e $input ) {