#!/usr/local/bin/perl -w use strict; use Data::Dumper; ###################################################################################### # NOTE: # # # # 1. This code takes sequences/motifs array and # # 2. Returns PWM and their IC # # 3. IC is computed on the assumption that the distribution of the bases is uniform # # # # Copyright 2005 by Edward Wijaya # # This library is free software; you can redistribute it and/or modify it # # under the same terms as Perl itself. # # # # # ##################################################################################### #Here is the example: my @Array = ('aaa','atg','ttt','gtc'); my ($ic,$PWM) = compute_ic_pwm(@Array); print Dumper $ic; print Dumper $PWM; #The code that does the job sub compute_ic_pwm { my @mi = @_; my $motif_count; my $L; #-------Beginning of PWM processing ---------------- foreach my $mi (@mi) { chomp($mi); $mi =~ s/\s//g; $L = $mi; #for motif instances count my @words = split( /\W+/, $mi ); $motif_count += @words; } $motif_count =0; my $w = length($L); my @A = (); my @T = (); my @C = (); my @G = (); for ( my $j = 0 ; $j < $w ; $j++ ) { # Initialize the base counts. my $a = 0; my $c = 0; my $g = 0; my $t = 0; foreach my $mi (@mi) { chomp($mi); my $L = $mi; my $sb = substr( $L, $j, 1 ); while ( $sb =~ /a/ig ) { $a++ } while ( $sb =~ /t/ig ) { $t++ } while ( $sb =~ /c/ig ) { $c++ } while ( $sb =~ /g/ig ) { $g++ } } push( @A, $a ); push( @T, $t ); push( @C, $c ); push( @G, $g ); } my $sumA = sumArray(@A); my $sumT = sumArray(@T); my $sumC = sumArray(@C); my $sumG = sumArray(@G); #print "\nNumber of Motif Instances: $motif_count\n"; #print "Motif Length : $w\n"; #print "Frequency of A=$sumA T=$sumT C=$sumC G=$sumG\n\n"; #print "-------------- PWM (Actual Count) ---------------------\n"; #print "A = @A\n"; #print "T = @T\n"; #print "C = @C\n"; #print "G = @G\n"; #print "--------------------------------------------------------\n"; #----- Begin PWM normalization step 1 (divide elements of each column with the Sum of each column position ------------- my @m = (); my @Anm1 =(); my @Tnm1 =(); my @Cnm1 =(); my @Gnm1 =(); my @sPos =(); #summing up A,T,C,G for all position for ( my $b = 0 ; $b < $w ; $b++ ) { my $sumPos = $A[$b] + $T[$b] + $C[$b] + $G[$b]; push( @sPos, $sumPos ); my $nm1A = $A[$b] / $sPos[$b]; my $nm1T = $T[$b] / $sPos[$b]; my $nm1C = $C[$b] / $sPos[$b]; my $nm1G = $G[$b] / $sPos[$b]; push( @Anm1, $nm1A ); push( @Tnm1, $nm1T ); push( @Cnm1, $nm1C ); push( @Gnm1, $nm1G ); } my @PWM = pwm(\@Anm1,\@Tnm1,\@Cnm1,\@Gnm1); #----- End of PWM generation ------------------------------------------------------------------------- #----------------- Begin computing Information content for motif instances --------------------------- my @pLogpA = (); my @pLogpT = (); my @pLogpC = (); my @pLogpG = (); for ( my $i = 0 ; $i < $w ; $i++ ) { if ( $Anm1[$i] > 0 ) { my $plogpA = $Anm1[$i] * log_base( 2, 4 * $Anm1[$i] ); push( @pLogpA, $plogpA ); } if ( $Tnm1[$i] > 0 ) { my $plogpT = $Tnm1[$i] * log_base( 2, 4 * $Tnm1[$i] ); push( @pLogpT, $plogpT ); } if ( $Cnm1[$i] > 0 ) { my $plogpC = $Cnm1[$i] * log_base( 2, 4 * $Cnm1[$i] ); push( @pLogpC, $plogpC ); } if ( $Gnm1[$i] > 0 ) { my $plogpG = $Gnm1[$i] * log_base( 2, 4 * $Gnm1[$i] ); push( @pLogpG, $plogpG ); } } my $IC = ( sumArray(@pLogpA) + sumArray(@pLogpT) + sumArray(@pLogpC) + sumArray(@pLogpG) ); return ($IC,\@PWM); } #--------------- Subs of the subroutines ---------------------------- sub pwm { #PWM in forms of AoH # Result looks like this: # @AoHoA_ref = { { 'A' => [1,2,3], # 'T' => [2,3,4], # 'C' => [3,2,2], # 'G' => [1,1,1], }}; my ($A,$T,$C,$G) = @_; #input are array references my (%Ah,%Th,%Ch,%Gh); $Ah{'A'} = [@$A]; $Th{'T'} = [@$T]; $Ch{'C'} = [@$C]; $Gh{'G'} = [@$G]; my @PWM; #AoH push @PWM, {%Ah,%Th,%Ch,%Gh}; return @PWM; } sub zup { join "\n" => map { join " " => map { shift @$_ } @_ } @{ $_[0] }; } sub log_base { my ( $base, $value ) = @_; return (log($value) / log($base)); } sub sumArray { my @arr = @_; my $sum = 0; my $count = $#arr + 1; for(my $i=0;$i<$count;$i++){ $sum += $arr[$i]; } return $sum; } sub countArray { my @params = @_; my $Key; my $count; foreach $Key (@params) { $count++ } return $count; } sub get_file_data { my ($filename) = @_; use strict; use warnings; # Initialize variables my @filedata = (); unless ( open( GET_FILE_DATA, $filename ) ) { print STDERR "Cannot open file \"$filename\"\n\n"; exit; } @filedata = ; close GET_FILE_DATA; return @filedata; } sub extract_sequence_from_fasta_data { my (@fasta_file_data) = @_; use strict; use warnings; # Declare and initialize variables my $sequence = ''; foreach my $line (@fasta_file_data) { # discard blank line if ( $line =~ /^\s*$/ ) { next; # discard comment line } elsif ( $line =~ /^\s*#/ ) { next; # discard fasta header line } elsif ( $line =~ /^>/ ) { next; # keep line, add to sequence string } else { $sequence .= $line; } } # remove non-sequence data (in this case, whitespace) from $sequence string $sequence =~ s/\s//g; return $sequence; } sub enum_lmers { my ( $sequence, $lmer_size ) = @_; use strict; use warnings; my @lmer_array = (); my $lmer; my $enum_size; $enum_size = length($sequence) - $lmer_size + 1; #print "$enum_size\n"; for ( my $pos = 0 ; $pos < $enum_size ; $pos++ ) { $lmer = substr( $sequence, $pos, $lmer_size ); #print "$lmer\n"; push( @lmer_array, $lmer ); } return @lmer_array; }