"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/scripts/uniquePeptides.pl" (13 Sep 2019, 1969 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "uniquePeptides.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 #!/usr/bin/env perl
    2 
    3 # Creates a fasta file with unique peptides from a fasta input file
    4 # where identical peptide match multiple positions. The first found 
    5 # peptide occurrence is used for counting multiplicity.
    6 # Input format fasta header examples:
    7 # >scaffold4083_450 [15866 - 17011]  (65.81)
    8 # >au14.g511.t1 (46.17)
    9 # ... there is a score in the end of the header in round brackets!
   10 # Output format fasta header:
   11 # >peptideX mult=x
   12 # Do not use for long protein sequences, only for short peptides!!!
   13 # prefix
   14 
   15 my $usage = "uniquePeptides.pl in.fa prefix > out.fa\n\nRead script head comments for futher documentation!\n\n";
   16 
   17 if(@ARGV!=2){print STDERR $usage; exit -1;}
   18 
   19 my $in = $ARGV[0];
   20 my $prefix = $ARGV[1];
   21 
   22 # two hashes for the same peptide (hash of hashes would be more elegant, but I am too lazy for that), key is always the peptide sequence, trimmed of "-" characters that sometimes appear to occur in Harald's peptide files.
   23 my %locHash = ();
   24 my %multHash = ();
   25 my $maxLen = 100;
   26 
   27 my $header;
   28 
   29 open(IN, "<", $in) or die("Could not open file $in!\n");
   30 
   31 while ( <IN> ) { 
   32     chomp;
   33     if($_=~m/^>/){
   34         $_=~s/>//;
   35         $_=~s/\(\d+\.\d+\)//; # delete score
   36         $header = $_;
   37     }else{
   38         if(length($_) > $maxLen){
   39             print STDERR "The peptide $_ is longer $maxLen. Aborting.\n";
   40             exit -1;
   41         }
   42         while($_=~m/-/){ # delete weird dashes in sequences
   43             $_ = s/-//;
   44         }
   45         if(not(exists($locHash{$_}))){
   46             $locHash{$_} = "$header";
   47             $multHash{$_} = 1;          
   48         }elsif($locHash{$_} eq $header){
   49             $multHash{$_} = $multHash{$_} + 1;
   50         }
   51         
   52         
   53     }
   54 }
   55 
   56 close IN or die("Could not close file $in!\n");
   57 
   58 # print unique fasta file
   59 my $c = 1;
   60 while ( ($k,$v) = each %multHash ) {
   61     print ">$prefix$c mult=$v\n$k\n";
   62     $c++;
   63 }
   64 
   65 
   66 # print mapping file
   67 #open(OUT2, ">", $mapping) or die("Could not close file $mapping!\n");
   68 #for $k ( keys %mapHash ) {
   69 #   print OUT2 "$k:\n";
   70 #   foreach(@{$mapHash{$k}}){
   71 #       print OUT2 "$_\n";
   72 #   }
   73 #}
   74 #close(OUT2) or die("Could not close file $mapping!\n");
   75