Index: Bio/Perl.pm =================================================================== RCS file: /home/repository/bioperl/bioperl-live/Bio/Perl.pm,v retrieving revision 1.5 diff -a -u -r1.5 Perl.pm --- Bio/Perl.pm 16 Feb 2002 21:16:43 -0000 1.5 +++ Bio/Perl.pm 22 Mar 2005 16:49:33 -0000 @@ -1,4 +1,4 @@ - +# $Id: Perl.pm,v 1.20 2003/09/15 13:38:07 bosborne Exp $ # # BioPerl module for Bio::Perl # @@ -16,47 +16,52 @@ =head1 SYNOPSIS - use Bio::Perl qw(read_sequence read_all_sequences write_sequence new_sequence get_sequence); - -# will guess file format from extension - $seq_object = read_sequence($filename); - -# forces genbank format - $seq_object = read_sequence($filename,'genbank'); - -# reads an array of sequences - @seq_object_array = read_all_sequences($filename,'fasta'); - -# sequences are Bio::Seq objects, so the following methods work -# for more info see L, or do 'perldoc Bio/Seq.pm' - - print "Sequence name is ",$seq_object->display_id,"\n"; - print "Sequence acc is ",$seq_object->accession_number,"\n"; - print "First 5 bases is ",$seq_object->subseq(1,5),"\n"; - -# get the whole sequence as a single string - - $sequence_as_a_string = $seq_object->seq(); - -# writing sequences - - write_sequence(">$filename",'genbank',$seq_object); + use Bio::Perl; - write_sequence(">$filename",'genbank',@seq_object_array); + # will guess file format from extension + $seq_object = read_sequence($filename); -# making a new sequence from just strings you have -# from something else + # forces genbank format + $seq_object = read_sequence($filename,'genbank'); - $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA","myname","AL12232"); + # reads an array of sequences + @seq_object_array = read_all_sequences($filename,'fasta'); + + # sequences are Bio::Seq objects, so the following methods work + # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm' + print "Sequence name is ",$seq_object->display_id,"\n"; + print "Sequence acc is ",$seq_object->accession_number,"\n"; + print "First 5 bases is ",$seq_object->subseq(1,5),"\n"; + + # get the whole sequence as a single string + $sequence_as_a_string = $seq_object->seq(); + + # writing sequences + write_sequence(">$filename",'genbank',$seq_object); + write_sequence(">$filename",'genbank',@seq_object_array); + + # making a new sequence from just a string + $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA", + "myname","AL12232"); + + # getting a sequence from a database (assumes internet connection) + $seq_object = get_sequence('swissprot',"ROA1_HUMAN"); + $seq_object = get_sequence('embl',"AI129902"); + $seq_object = get_sequence('genbank',"AI129902"); + + # BLAST a sequence (assumes an internet connection) + $blast_report = blast_sequence($seq_object); + + # BLAST one or more sequences (requires a local Blast server running wwwBlast) + # Default URL http://localhost/blast/Blast.cgi + my $seq_string = 'MKVDVGPDPSLVYRPDVDPEVAKDKASFRNYTSGPLLDRVFT'; + my $localServerURL = 'http://127.0.0.1/blast/Blast.cgi'; + my $localDB = 'test_aa_db'; + $blast_report = wwwBlast_sequence($seq_object, $localDB); + $blast_report = wwwBlast_sequence($seq_string, $localServerURL, $localDB); - -# getting a sequence from a database (assumes internet connection) - - $seq_object = get_sequence('swissprot',"ROA1_HUMAN"); - - $seq_object = get_sequence('embl',"AI129902"); - - $seq_object = get_sequence('genbank',"AI129902"); + # writing out a blast report + write_blast(">blast.out",$blast_report); =head1 DESCRIPTION @@ -80,7 +85,7 @@ or the web: bioperl-bugs@bio.perl.org - http://bio.perl.org/bioperl-bugs/ + http://bugzilla.bioperl.org/ =head1 AUTHOR - Ewan Birney @@ -90,7 +95,7 @@ =head1 APPENDIX -The rest of the documentation details each of the object methods. +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut @@ -100,19 +105,20 @@ package Bio::Perl; -use vars qw(@ISA @EXPORT_OK $DBOKAY); +use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY); use strict; use Carp; use Exporter; use Bio::SeqIO; use Bio::Seq; -BEGIN { - eval { +BEGIN { + eval { require Bio::DB::EMBL; require Bio::DB::GenBank; require Bio::DB::SwissProt; require Bio::DB::RefSeq; + require Bio::DB::GenPept; }; if( $@ ) { $DBOKAY = 0; @@ -123,8 +129,13 @@ @ISA = qw(Exporter); -@EXPORT_OK = qw(read_sequence read_all_sequences write_sequence - new_sequence get_sequence); +@EXPORT = qw(read_sequence read_all_sequences write_sequence + new_sequence get_sequence translate translate_as_string + reverse_complement revcom revcom_as_string + reverse_complement_as_string blast_sequence + wwwBlast_sequence write_blast); + +@EXPORT_OK = @EXPORT; =head2 read_sequence @@ -247,13 +258,18 @@ my $error = 0; my $seqname = "sequence1"; + # catch users who haven't passed us a filename we can open + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format); foreach my $seq ( @sequence_objects ) { my $seq_obj; if( !ref $seq ) { - if( length $seq > 50 ) { + if( length $seq > 50 ) { # odds are this is a sequence as a string, and someone has not figured out # how to make objects. Warn him/her and then make a sequence object # from this @@ -286,7 +302,7 @@ Usage : Function: Example : - Returns : + Returns : Args : @@ -308,14 +324,245 @@ return $seq_object; } +=head2 blast_sequence + + Title : blast_sequence + Usage : $blast_result = blast_sequence($seq) + $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE'); + + Function: If the computer has Internet accessibility, blasts + the sequence using the NCBI BLAST server against nrdb. + + Any additional parameters specified for the GET or PUT are passed + as is to Bio::Tools::Run::RemoteBlast. By default, runs blastp + with E-val = 1e-10. + + This function uses Bio::Tools::Run::RemoteBlast, which itself + use Bio::SearchIO - as soon as you want to know more, check out + these modules. + + Returns : Bio::Search::Result::GenericResult + + Args : $seq = A string of protein letters or nucleotides + or a Bio::Seq object. + 1 (verbose) or 0 (quiet) + @args = optional PUT and GET arguments to pass to RemoteBlast + (e.g. '-prog' => 'blastn', '-expect' => '1e-3', + '-descriptions' => '25'). + For a description of the many possible parameters see: + http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html or the BEGIN + block of Bio::Tools::Run::Tools::RemoteBlast. + +=cut + +sub blast_sequence { + my ($seq, $verbose, @params) = @_; + + if ( !defined $verbose ) { + $verbose = 1; + } elsif ( $verbose =~ /^-/ ) { + # $verbose is actually tag for first argument restore it + # where it belongs + unshift @params, $verbose; + $verbose = 1; + } + + if( !ref $seq ) { + $seq = Bio::Seq->new('-seq' => $seq, '-id' => 'blast-sequence-temp-id'); + } elsif ( !$seq->isa('Bio::PrimarySeqI') ) { + croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted"); + } + + require Bio::Tools::Run::RemoteBlast; + + # set some default values which are different from those in RemoteBlast + # unless their values are passed as parameters + my ($prog_found, $expect_found); + my $count = 0; + for my $arg ( @params ) { + if ( $arg =~ /-prog/ ) { + $prog_found = 1; + if ( ++$count > 1) { + last; + } + } elsif ( $arg =~ /-expect/ ) { + $expect_found = 1; + if ( ++$count > 1) { + last; + } + } + } + push @params, ('-prog', 'blastp') unless $prog_found; + push @params, ('-expect', '1e-10') unless $expect_found; + push @params, ('-readmethod', 'SearchIO'); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + my $r = $factory->submit_blast($seq); + if( $verbose ) { + print STDERR "Submitted Blast for [".$seq->id."] "; + } + sleep 5; + + my $result; + + LOOP : + while( my @rids = $factory->each_rid) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + if( $verbose ) { + print STDERR "."; + } + sleep 10; + } else { + $result = $rc->next_result(); + $factory->remove_rid($rid); + last LOOP; + } + } + } + + if( $verbose ) { + print STDERR "\n"; + } + return $result; +} + +=head2 wwwBlast_sequence + + Title : wwwBlast_sequence + + Usage : $blast_result = wwwBlast_sequence($seq_obj) + $blast_result = wwwBlast_sequence($seq, $localDB, $localServerURL) + $blast_result = wwwBlast_sequence($filename) + + Function: Blasts the sequence using an accessible wwwBlast server against + an optionally specified local database. Default server is + http://localhost/blast/Blast.cgi. Default db is test_na_db. Default + program is blastn. + + Any additional parameters specified for the GET or PUT are passed + as is to Bio::Tools::Run::LocalServerBlast. Default values are those + of Bio::Tools::Run::LocalServerBlast. + + This function uses Bio::Tools::Run::LocalServerBlast, which itself + uses Bio::SearchIO - as soon as you want to know more, check out + these modules. + + Returns : -1 on error + An array of Bio::Search::Result::GenericResult + + Args : $seq = A string of protein letters or nucleotides, the name of + of a file containing one or more Fasta-formatted sequences, + or a Bio::Seq object. + 1 (verbose) or 0 (quiet) + @args = optional arguments, including the URL of the local Blast + server ('-server' => 'http://localhost/blast/Blast.cgi'), + the name of the database to blast against ('-database' => + 'my_db'), or PUT and GET arguments to pass to LocalServerBlast + (e.g. HITLIST_SIZE => '250', EXPECT => '1e-6', + DESCRIPTIONS => '25'). + For a description of the many possible parameters see: + http://athena.bioc.uvic.ca/blast/readme.html#Installation or the BEGIN + block of Bio::Tools::Run::Tools::LocalServerBlast. + +=cut + +sub wwwBlast_sequence { + + my ($seq, $verbose, @params) = @_; + + # do a quick check on $seq + if ( !defined $seq ) { + confess("No sequence provided. Can't Blast.\n"); + } elsif ( ref $seq) { + $seq->isa('Bio::PrimarySeqI') or + croak("[$seq] is an object but not a Bio::Seq object so cannot be Blasted\n"); + } elsif ( ! -e $seq ) { + # not a valid filename so probably a sequence-in-a-string + $seq = Bio::Seq->new( '-seq' => $seq, '-id' => 'default ID'); + } + + if ( !defined $verbose ) { + $verbose = 1; + } elsif ( $verbose =~ /^\D+/ ) { + # $verbose is actually tag for first argument so restore it to + # where it belongs + unshift @params, $verbose; + $verbose = 1; + } + + require Bio::Tools::Run::LocalServerBlast; + # set the server address if it's been passed. Kludge for compatibility: also + # check whether ALIGNMENT_VIEW set using text rather than number and fix if so + my $index = 0; + my %alignment_options = ('Pairwise' => 0, + 'QueryAnchored' => 1, + 'QueryAnchoredNoIdentitites' => 2, + 'FlatQueryAnchored' => 3, + 'FlatQueryAnchoredNoIdentities' => 4, + 'BlastXML' => 7, + 'Tabular' => 9); + for my $arg ( @params ) { + if ( $arg =~ /-server/i ) { + $Bio::Tools::Run::LocalServerBlast::URLBASE = $params[$index]; + last; + } elsif ( ($arg =~ /alignment_view/i) && ($params[$index+1] !~ /[0-9]/) ) { + $params[$index+1] = $alignment_options{$params[$index+1]}; + } + $index++; + } + push @params, ('-readmethod', 'SearchIO'); + + my $factory = Bio::Tools::Run::LocalServerBlast->new(@params); + print STDOUT "Job\(s\) submitted.\n" if $verbose; + my @r = $factory->submit_blast($seq); + + return @r; +} + +=head2 write_blast + + Title : write_blast + Usage : write_blast($filename,$blast_report); + + Function: Writes a BLAST result object (or more formally + a SearchIO result object) out to a filename + in BLAST-like format + + Returns : none + + Args : filename as a string + Bio::SearchIO::Results object + +=cut + +sub write_blast { + my ($filename,$blast) = @_; + + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + + my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename); + + $output->write_result($blast); + +} + =head2 get_sequence Title : get_sequence Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN"); - Function: If the computer has Internet accessibility, gets + Function: If the computer has Internet access this method gets the sequence from Internet accessible databases. Currently - this supports Swissprot, EMBL, GenBank and RefSeq. + this supports Swissprot ('swiss'), EMBL ('embl'), GenBank + ('genbank'), GenPept ('genpept'), and RefSeq ('refseq'). Swissprot and EMBL are more robust than GenBank fetching. @@ -324,12 +571,13 @@ Returns : A Bio::Seq object - Args : database type - one of swiss, embl, genbank or refseq - identifier or accession number + Args : database type - one of swiss, embl, genbank, genpept, or + refseq =cut my $genbank_db = undef; +my $genpept_db = undef; my $embl_db = undef; my $swiss_db = undef; my $refseq_db = undef; @@ -337,38 +585,45 @@ sub get_sequence{ my ($db_type,$identifier) = @_; if( ! $DBOKAY ) { - confess("Your system does not have IO::String installed so the DB retrieval method is not available"); + confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String installed so the DB retrieval method is not available. \nFull error message is:\n $!\n"); return; } $db_type = lc($db_type); my $db; - if( $db_type =~ /gen/ ) { + if( $db_type =~ /genbank/ ) { if( !defined $genbank_db ) { $genbank_db = Bio::DB::GenBank->new(); - } + } $db = $genbank_db; } + if( $db_type =~ /genpept/ ) { + if( !defined $genpept_db ) { + $genpept_db = Bio::DB::GenPept->new(); + } + $db = $genpept_db; + } if( $db_type =~ /swiss/ ) { if( !defined $swiss_db ) { $swiss_db = Bio::DB::SwissProt->new(); - } + } $db = $swiss_db; } if( $db_type =~ /embl/ ) { if( !defined $embl_db ) { $embl_db = Bio::DB::EMBL->new(); - } + } $db = $embl_db; } - if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and $identifier =~ /_/)) { + if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and + $identifier =~ /^\s*N\S+_/)) { if( !defined $refseq_db ) { $refseq_db = Bio::DB::RefSeq->new(); - } + } $db = $refseq_db; } @@ -383,4 +638,181 @@ return $seq; } +=head2 translate + + Title : translate + Usage : $seqobj = translate($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->translate(); +} + + +=head2 translate_as_string + + Title : translate_as_string + Usage : $seqstring = translate_as_string($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A stirng of just amino acids + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate_as_string { + my ($scalar) = shift; + + my $obj = Bio::Perl::translate($scalar); + + return $obj->seq; +} + + +=head2 reverse_complement + + Title : reverse_complement + Usage : $seqobj = reverse_complement($seq_or_string_scalar) + + Function: reverse complements a string or sequence argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->revcom(); +} + +=head2 revcom + + Title : revcom + Usage : $seqobj = revcom($seq_or_string_scalar) + + Function: reverse complements a string or sequence argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + + This is an alias for reverse_complement + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom { + return &Bio::Perl::reverse_complement(@_); +} + + +=head2 reverse_complement_as_string + + Title : reverse_complement_as_string + Usage : $string = reverse_complement_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequence argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + +=head2 revcom_as_string + + Title : revcom_as_string + Usage : $string = revcom_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequence argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + 1;