"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/scripts/parseSim4Output.pl" (13 Sep 2019, 3610 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "parseSim4Output.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 #!/usr/bin/env perl
    2 #
    3 # parse the output of sim4, fiter the alignments and
    4 # output them to stdout
    5 #
    6 # Mario Stanke, August 17th, 2005
    7 
    8 use SplicedAlignment;
    9 use strict;
   10 
   11 my $usage = "$0 -- filter the output of sim4\n";
   12 $usage .= "Usage: $0 sim4.out seqfile.fa\n";
   13 $usage .= "output to stdout. first messages. then complete genes. then genes complete only at the 5' end\n";
   14   
   15 if ($#ARGV != 1) {
   16     die "Unknown option\n\n$usage";
   17 }
   18 my $simfilename = $ARGV[0];
   19 my $seqfilename = $ARGV[1];
   20 my @alignmentList=();
   21 
   22 ###########################################################################################
   23 #
   24 # read in the sim4 alignments
   25 #
   26 ###########################################################################################
   27 
   28 
   29 open(SIM, "<$simfilename") or die ("Could not open $simfilename");
   30 
   31 my ($sa, $estName, $estLength, $genomicName, $genomicLength, $estExonBegin, $estExonEnd, $exonBegin, $exonEnd, $intronFlag, $quality, $complement);
   32 my @genomicMatches;
   33 $/="--------\n";
   34 while (<SIM>) {
   35     next unless /seq1 =/;
   36     />(.*)\n/;
   37     $estName = $1;
   38     /seq1 = .*, (\d+) bp/;
   39     $estLength = $1;
   40     @genomicMatches = split(/seq2 = /, $_);
   41     shift @genomicMatches;
   42     foreach my $match (@genomicMatches) {
   43     $match =~ /\((.*)\), (\d+) bp/;
   44     $genomicName = $1;
   45     $genomicLength = $2;
   46     $complement = ($match =~ /complement/);
   47     $sa = new SplicedAlignment($estName, $estLength, $genomicName, $genomicLength, $complement);
   48     foreach (split (/\n/, $match)){
   49         if (/(\d+)-(\d+) +\((\d+)-(\d+)\) +(\d+)%(.*)/) {
   50         $estExonBegin = $1;
   51         $estExonEnd = $2;
   52         $exonBegin = $3;
   53         $exonEnd = $4;
   54         $quality = $5 / 100;
   55         $intronFlag = $6;
   56         $intronFlag =~ s/ //;
   57         $sa->addExon($estExonBegin-1, $estExonEnd-1, $exonBegin-1, $exonEnd-1 , $quality, $intronFlag);
   58         }
   59     }
   60     push @alignmentList, $sa;
   61     }
   62 }
   63 
   64 my @geneList=();
   65 
   66 ###########################################################################################
   67 #
   68 # filter and enrich the data
   69 #
   70 ###########################################################################################
   71 
   72 foreach $sa (@alignmentList){
   73     $sa->checkAlignment();
   74     print STDERR $sa->get_status(), "\n";
   75     if ($sa->get_status() eq "OK"){
   76     my $seq = getSequence($sa->get_contigname);
   77     $sa->makeGene($seq);
   78     $sa->findSplicedUTR();
   79     print STDERR "\t\t", $sa->get_status(), "\n"; 
   80     if ($sa->get_status() eq "OK"){
   81         push @geneList, $sa;
   82     }
   83     }
   84 }
   85 
   86 ###########################################################################################
   87 #
   88 # output the data
   89 #
   90 ###########################################################################################
   91 
   92 print "### complete genes\n";
   93 foreach $sa (@geneList){
   94     if ($sa->get_complete5prime() && $sa->get_complete3prime()){
   95     print $sa->output;
   96     }
   97 }
   98 print "### genes incomplete at the 3' end and complete at the 5' end\n";
   99 foreach $sa (@geneList){
  100     if ($sa->get_complete5prime() && !$sa->get_complete3prime()){
  101     print $sa->output;
  102     }
  103 }
  104 
  105 
  106 
  107 ###########################################################################################
  108 #
  109 # subroutines
  110 #
  111 ###########################################################################################
  112 
  113 sub getSequence {
  114     my $seqname = shift;
  115     my $seq;
  116     # let cdbyank use the index to search the sequence
  117     system("echo '$seqname' | cdbyank ${seqfilename}.cidx -d $seqfilename > genomic.fa");
  118     system ("perl -i.orig -p -e 's/^Incorrectly.*fixed.\n//' genomic.fa");
  119     open (SEQ, "<genomic.fa") or die ("Could not open genomic.fa");
  120     $/="\n";
  121     my @lines=<SEQ>;
  122     shift @lines;
  123     $seq = join ("", @lines);
  124     $seq =~ s/\n//g;
  125     return $seq;
  126 }