# $Id: LocalServerBlast.pm,v 0.01 2005/03/18 23:54:50 bosborne Exp $ # # BioPerl module for Bio::Tools::Run::LocalServerBlast # # Cared for by Madeleine Lemieux # # Copyright Jason Stajich, Max Wiepert, Madeleine Lemieux # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::LocalServerBlast - Object for local execution of wwwBlast via HTTP =head1 SYNOPSIS # LocalServerBlast "factory object" creation and Blast parameter initialization # DB needs to be in BLASTDIR/db use Bio::Tools::Run::LocalServerBlast; use strict; my @params = ( '-server' => 'http://127.0.0.1/blast/Blast.cgi', '-prog' => 'blastp', '-data' => 'my_db', '-expect' => '1e-6', '-readmethod' => 'SearchIO' ); my $factory = Bio::Tools::Run::LocalServerBlast->new(@params); #change a blast parameter $Bio::Tools::Run::LocalServerBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]'; #remove a parameter delete $Bio::Tools::Run::LocalServerBlast::HEADER{'FILTER'}; # $verbose is just to turn on and off the messages my $verbose = 1; # You can loop through sequences one at a time my $str = Bio::SeqIO->new(-file=>'amino.fa' , -format => 'fasta' ); while (my $input = $str->next_seq()){ my $r = $factory->submit_blast($input); my $result = $r->next_result(); #save the output my $filename = $result->query_name()."\.out"; $factory->save_output($filename); print "\nQuery Name: ", $result->query_name(), "\n"; while ( my $hit = $result->next_hit ) { next unless ( $verbose > 0); print "\thit name is ", $hit->name, "\n"; while( my $hsp = $hit->next_hsp ) { print "\t\tscore is ", $hsp->score, "\n"; } } } # Alternatively, Blast multiple sequences from a file against a database. Retrieve the # results as above. my $r = $factory->submit_blast('amino.fa'); # This example shows how to change a CGI parameter: $Bio::Tools::Run::LocalServerBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25'; # And this is how to delete a CGI parameter: delete $Bio::Tools::Run::LocalServerBlast::HEADER{'FILTER'}; =head1 DESCRIPTION Class for executing Blast from a local wwwBlast server via HTTP. For a description of the many CGI parameters see: http://athena.bioc.uvic.ca/blast/readme.html#Installation or the wwwBlast readme files. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bio.perl.org/MailList.html - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via email or the web: bioperl-bugs@bio.perl.org http://bio.perl.org/bioperl-bugs/ =head1 AUTHOR - Jason Stajich Email jason@bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::LocalServerBlast; use vars qw($AUTOLOAD @ISA $URLBASE %HEADER $MODVERSION %PARAMETERS); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::SeqIO; use IO::String; use Bio::Tools::BPlite; use Bio::SearchIO; use LWP; use HTTP::Request::Common; @ISA = qw(Bio::Root::Root Bio::Root::IO); BEGIN { $MODVERSION = $Bio::Root::Version::VERSION; $URLBASE = 'http://localhost/blast/Blast.cgi'; # The values of %PARAMETERS are regexes which validate the input. %PARAMETERS = ( 'AUTO_FORMAT' => '(Off|(Semi|Full)auto)', # Off, Semiauto, Fullauto 'COMPOSITION_BASED_STATISTICS' => '(yes|no)', # yes, no 'DATABASE' => '.*', # needs to be defined in blast.rc # default test_aa_db 'DB_GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22 'ENDPOINTS' => '(yes|no)', # yes,no 'ENTREZ_QUERY' => '.*', 'EXPECT' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double 'FILTER' => '[LRm]', # L or R or m 'GAPCOSTS' => '-?\d+(\.\d+)\s+i-?\d+(\.\d+)', # Two space separated float values 'GAP_EXTEND' => '\d+', # 0 or unset uses server default 'GAP_OPEN' => '\d+', # 0 or unset uses server default 'GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22 'HITLIST_SIZE' => '\d+', # Positive integer 'I_THRESH' => '-?\d+(\.\d+)([eE]-\d+)?', # float 'LAYOUT' => '(One|Two)Windows?', # onewindow, twowindows 'LCASE_MASK' => '(yes|no)', # yes, no 'MAT_PARAM' => '(PAM(30|70)|BLOSUM(80|62|45)) \d+ \d+', # sets matrix name and integer open & extend gap costs 'MATRIX' => '(PAM(30|70)|BLOSUM(80|62|45))', 'NUCL_PENALTY' => '-\d+', # Negative integer 'NUCL_REWARD' => '-?\d+', # Integer 'NUM_OF_BITS' => '\d+', # number of bits to trigger gapping 'OTHER_ADVANCED' => '.*', 'PERC_IDENT' => '\d\d+', # Integer, 0-99 inclusive 'PHI_PATTERN' => '.*', 'PROGRAM' => '(blastp|t?blast[nx])', # default is blastn # others: tblastn, tblastx, blastp, blastx 'QUERY' => '.*', 'QUERY_FILE' => '.*', 'QUERY_BELIEVE_DEFLINE' => '(yes|no)', # yes, no 'QUERY_FROM' => '\d+', # Positive integer 'QUERY_TO' => '\d+', # Positive integer 'SEARCHSP_EFF' => '\d+', # Positive integer 'SEQFILE' => '.*', 'SEQUENCE' => '.*', # SEQUENCE checked before SEQFILE 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)', # plain,psi,phi,rpsblast,megablast 'THRESHOLD_2' => '-?\d+', # Integer 'UNGAPPED_ALIGNMENT' => '.*', # gapped is default; if set, does ungapped 'WORD_SIZE' => '\d+', # Positive integer 'X_DROPOFF' => '\d+', # 0 or missing uses server default 'ALIGNMENTS' => '\d+', # Positive integer 'ALIGNMENT_VIEW' => '[0123479]', # Pairwise, QueryAnchored, QueryAnchoredNoIdentities, # FlatQueryAnchored, FlatQueryAnchoredNoIdentities, # BlastXML, Tabular 'BLAST_TYPE' => '.*', # ??? 'COLOR_SCHEMA' => '[1..6]', # see docs/color_schema.html 'DESCRIPTIONS' => '\d+', # Positive integer 'ENTREZ_LINKS_NEW_WINDOW' => '(yes|no)', # yes, no 'ENTREZ_QUERY' => '(yes|no)', 'EXPECT_LOW' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double 'EXPECT_HIGH' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double 'FORMAT_ENTREZ_QUERY' => '', 'FORMAT_OBJECT' => '(Alignment|Neighbors|PSSM|SearchInfo|TaxBlast(Parent|MultiFrame)?)', # Alignment, Neighbors, PSSM, SearchInfo # TaxBlast, TaxblastParent, TaxBlastMultiFrame 'FORMAT_TYPE' => '((HT|X)ML|ASN\.1|Text)', # HTML, Text, ASN.1, XML 'NCBI_GI' => '(yes|no)', # yes, no 'OOF_ALIGN' => '\d+', # non-zero value sets frame shift penalty 'OTHER_ADVANCED' => '.*', # cop-out: let server decide if valid 'OVERVIEW' => '(yes|no)', # yes, no 'RID' => '.*', 'RESULTS_FILE' => '(yes|no)', # yes, no 'RPSBLAST' => '(yes|no)', 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)', # plain,psi,phi,rpsblast,megablast 'SHOW_OVERVIEW' => '(yes|no)', # yes, no 'TAX_BLAST' => '(yes|no)', 'XML_OUTPUT' => '(yes|no)', ); # Default values go in here for PUT %HEADER = ('CMD' => 'Put', 'OVERVIEW' => 'no', 'COMPOSITION_BASED_STATISTICS' => 'off', 'DATABASE' => 'test_aa_db', 'EXPECT' => '1e-3', 'FILTER' => 'L', 'PROGRAM' => 'blastp', 'SERVICE' => 'plain', 'ALIGNMENTS' => '50', 'ALIGNMENT_VIEW' => '0', 'DESCRIPTIONS' => '100', 'FORMAT_TYPE' => 'Text' ); } sub new { my ($caller, @args) = @_; # chained new my $self = $caller->SUPER::new(@args); # so that tempfiles are cleaned up $self->_initialize_io(); my ($prog, $data, $readmethod) = $self->_rearrange([qw(PROG DATA READMETHOD)], @args); # Use these two parameters for backward-compatibility. # Overridden by PROGRAM and DATABASE if supplied. $self->submit_parameter('PROGRAM',$prog) if $prog; $self->submit_parameter('DATABASE',$data) if $data; $readmethod = 'SearchIO' unless defined $readmethod; $self->readmethod($readmethod); # Now read the rest of the parameters and set them all # Load parameters my @putValues = $self->_rearrange([keys %PARAMETERS],@args); my %putNames; @putNames{keys %PARAMETERS} = @putValues; foreach my $putName (keys %putNames) { $self->submit_parameter($putName,$putNames{$putName}); } return $self; } =head2 submit_parameter Title : submit_parameter Usage : my $db = $self->submit_parameter Function: Get/Set the named parameter for the submit_blast operation. Returns : string Args : $name : name of PUT parameter $val : optional value to set the parameter to =cut sub submit_parameter { my ($self, $name, $val) = @_; $name = uc($name); $self->throw($name." is not a valid parameter.") unless exists $PARAMETERS{$name}; if (defined $val) { my $regex = $PARAMETERS{$name}; $val =~ m/^$regex$/i or $self->throw("Value ".$val." for parameter ".$name." does not match expression ".$regex.". Rejecting."); $HEADER{$name} = $val; } return $HEADER{$name}; } =head2 header Title : header Usage : my $header = $self->header Function: Get HTTP header for blast query Returns : string Args : none =cut sub header { my ($self) = @_; return %HEADER; } =head2 readmethod Title : readmethod Usage : my $readmethod = $self->readmethod Function: Get/Set the method to read the blast report Returns : string Args : string [ Blast, BPlite ] =cut sub readmethod { my ($self, $val) = @_; if( defined $val ) { $self->{'_readmethod'} = $val; } return $self->{'_readmethod'}; } =head2 program Title : program Usage : my $prog = $self->program Function: Get/Set the program to run. Retained for backwards-compatibility. Returns : string Args : string [ blastp, blastn, blastx, tblastn, tblastx ] =cut sub program { my ($self, $val) = @_; return $self->submit_parameter('PROGRAM',$val); } =head2 database Title : database Usage : my $db = $self->database Function: Get/Set the database to search. Retained for backwards-compatibility. Returns : string Args : string [ swissprot, nr, nt, etc... ] =cut sub database { my ($self, $val) = @_; return $self->submit_parameter('DATABASE',$val); } =head2 expect Title : expect Usage : my $expect = $self->expect Function: Get/Set the E value cutoff. Retained for backwards-compatibility. Returns : string Args : string [ '1e-4' ] =cut sub expect { my ($self, $val) = @_; return $self->submit_parameter('EXPECT',$val); } =head2 ua Title : ua Usage : my $ua = $self->ua or $self->ua($ua) Function: Get/Set a LWP::UserAgent for use Returns : reference to LWP::UserAgent Object Args : none Comments: Will create a UserAgent if none has been requested before. =cut sub ua { my ($self, $value) = @_; if( ! defined $self->{'_ua'} ) { $self->{'_ua'} = new LWP::UserAgent(); my $nm = ref($self); $nm =~ s/::/_/g; $self->{'_ua'}->agent("bioperl-$nm/$MODVERSION"); } return $self->{'_ua'}; } =head2 proxy Title : proxy Usage : $httpproxy = $db->proxy('http') or $db->proxy(['http','ftp'], 'http://myproxy' ) Function: Get/Set a proxy for use of proxy Returns : a string indicating the proxy Args : $protocol : an array ref of the protocol(s) to set/get $proxyurl : url of the proxy to use for the specified protocol =cut sub proxy { my ($self,$protocol,$proxy) = @_; return undef if ( !defined $self->ua || !defined $protocol || !defined $proxy ); return $self->ua->proxy($protocol,$proxy); } =head2 submit_blast Title : submit_blast Usage : $self->submit_blast([$seq1,$seq2]); Function: Submit blast requests to a locally accessible wwwBlast server. Returns : 0 on error, An array of Bio::Tools::BPlite or Bio::Tools::Blast objects (depending on how object was initialized) on success Args : input can be: * sequence object * array ref of sequence objects * filename of file containing fasta-formatted sequences =cut sub submit_blast { my ($self, $input) = @_; my @seqs = $self->_load_input($input); return 0 unless ( @seqs ); my %header = $self->header; my @blasts; foreach my $seq ( @seqs ) { #If query has a fasta header, the output has the query line. $header{'SEQUENCE'} = ">".(defined $seq->display_id() ? $seq->display_id() : ""). " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq(); my $request = POST $URLBASE, [%header]; $self->warn($request->as_string) if ( $self->verbose > 0); my $response = $self->ua->request($request); if( $response->is_success ) { # strip HTML from response my $content = $response->content; my $stripped = $self->_strip_html(\$content); my ($tempfh, $tempfile) = $self->tempfile(); print $tempfh $content; close $tempfh; if( $self->verbose > 0 ) { #print content of reply if verbose > 1 print "No HTML in response\n" unless $stripped; open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); while() { print $_; } close TMP; } my $blastobj; if( $self->readmethod =~ /BPlite/ ) { $blastobj = new Bio::Tools::BPlite(-file => $tempfile); } else { $blastobj = new Bio::SearchIO( -file => $tempfile, -format => 'blast'); } # store filename in object $self->file($tempfile); my $result = $blastobj->next_result(); if ( $result ) { push @blasts, $result; } else { # error response from server $self->warn("Invalid request was ". $request->as_string . "\n" . $response->as_string); return 0; } } elsif ( $response->is_error ) { $self->warn("Server error. Request was ". $request->as_string . "\n" . $response->as_string); return 0; } else { # information or indirection $self->warn("Not sure what happened. Request was " . $request->as_string . "\n" . $response->as_string); return 0; } } return @blasts; } =head2 saveoutput Title : saveoutput Usage : my $saveoutput = $self->save_output($filename) Function: Method to save the blast report Returns : 1 (throws error otherwise) Args : filename =cut sub save_output { my ($self, $filename) = @_; if( ! defined $filename ) { $self->throw("Can't save blast output. You must specify a filename to save to."); } my $blastfile = $self->file; #open temp file and output file, have to filter out some HTML open(TMP, $blastfile) or $self->throw("cannot open $blastfile"); open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename"); my $seentop = 0; while(my $l = ) { next if ($l =~ /
/);
		if( $l =~ /^(?:[T]?BLAST[NPX])\s*.+$/i ||
			 $l =~/^RPS-BLAST\s*.+$/i ) {
			$seentop=1;
		}
		next if !$seentop;
		if( $seentop ) {
			print SAVEOUT $l;
		}
	}
	close TMP;
	close SAVEOUT;
	return 1;
}

sub _load_input {
    my ($self, $input) = @_;

    if( ! defined $input ) {
		$self->throw("Calling remote blast with no input");
    }
    my @seqs;
    if( ! ref $input ) {
	if( -e $input ) {
	    my $seqio = new Bio::SeqIO(-format => 'fasta', -file => $input);
	    while( my $seq = $seqio->next_seq ) {
		push @seqs, $seq;
	    }
	} else {
	    $self->throw("Input $input was not a valid filename");
	}	
    } elsif( ref($input) =~ /ARRAY/i ) {
	foreach ( @$input ) {
	    if( ref($_) && $_->isa('Bio::PrimarySeqI') ) {
		push @seqs, $_;
	    } else {
		$self->warn("Trying to add a " . ref($_) .
			    " but expected a Bio::PrimarySeqI");
	    }
	}
	if( ! @seqs) {
	    $self->throw("Did not pass in valid input -- no sequence objects found");
	}
    } elsif( $input->isa('Bio::PrimarySeqI') ) {
	push @seqs, $input;
    }
    return @seqs;
}

sub _strip_html {
    # This strips html tags, including line-spanning ones,
    # but doesn't touch singleton >.
      
    my ($self, $string_ref) = @_;

    ref $string_ref eq 'SCALAR' or 
	croak ("Can't strip HTML: ".
	       "Argument should be a SCALAR reference not a ${\ref $string_ref}");

    my $str = $$string_ref;
    my $stripped = 0;

    # remove simple and closing tags first then leftover tags
    $str =~ s/<(\/)?\w+>//g and $stripped = 1;
    $str =~ s/<\D+([^>]*\n*)*>//g and $stripped = 1;
    # clean up double newlines
    $str =~ s/(\n\n)+/\n/g;

    $$string_ref = $str;
    
    return $stripped;
}

1;
__END__