"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/scripts/evalCGP.pl" (13 Sep 2019, 9679 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "evalCGP.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 #!/usr/bin/env perl
    2 
    3 ###################################################################################################
    4 # evalCGP
    5 # evaluates a prediction in GTF format against an annotation
    6 # using the external evaluation package Eval by Evan Keibler and Michael R. Brent┬╣
    7 # and returns accuracy values (SN and SP on gene, exon and nucleotide level)
    8 # evalCGP only compares gene features on given genomic intervals that
    9 # are parsed from the prediction files.
   10 #
   11 # ┬╣(Eval: A software package for analysis of genome annotations. BMC Bioinformatics 4: 50 (2003))
   12 #
   13 # usage
   14 #
   15 # evalCGP.pl --anno=annotation.gtf --pred=prediction.gtf
   16 #
   17 # Stefanie Koenig, 11.08.2014
   18 ###################################################################################################
   19 
   20 
   21 use strict;
   22 use IO::File;
   23 
   24 my %cmdpars = ( 'pred'              => '',
   25         'anno'              => '',
   26         'joingenes'         => '',
   27         'wholeGenome'       => '',
   28                 'alternatives'      => '',
   29         'noselection'       => '',
   30         'nojoin'            => '',
   31         'jg_exec_dir'       => '',
   32         'eval_exec_dir'     => '');
   33 
   34 
   35 my $usage = <<'ENDUSAGE';
   36 
   37 evalCGP.pl      evaluates a prediction in GTF format against an annotation
   38                 using the external evaluation package Eval by Evan Keibler and Michael R. Brent
   39                 and returns accuracy values (SN and SP on gene, exon and nucleotide level)
   40                 evalCGP only compares gene features on given genomic intervals that
   41                 are parsed from the prediction files.
   42 
   43 USAGE
   44 
   45 evalCGP.pl --anno=annotation.gtf --pred=prediction.gtf
   46 
   47       annotation.gtf           Annotation file in GTF format.
   48       prediction.gtf           Prediction file in GTF format.
   49 
   50 OPTIONS
   51 
   52     --eval_exec_dir=d          Directory that contains the executable evaluate_gtf.pl from the eval package.
   53                                If not specified it must be in \$PATH environment variable.
   54     --joingenes=1              Use this option to merge genes in the prediction set and filter out duplicates (default: 0)
   55     --wholeGenome=1            If this flag is set evaluation is on the whole genome. Per default, evaluation
   56                                is restricted to the gene ranges
   57     --alternatives=1           Parameter of joingenes. If this flag is set, joingenes keeps alternative splice forms of a gene, otherwise
   58                                it only keeps the best splicing form. Per definition, alternative splice forms are either transcripts
   59                                with the same gene ID or the same coding start AND end coordinates (default: 0).
   60     --noselection=1            Parameter of joingenes. If this flag is set, joingenes does NOT select a single best transcripts
   61                                among multiple conflicting transcripts. Two transcripts are confliciting if they overlap
   62                                each other and are no alternative splice forms.
   63                                considered as conflicting.
   64     --nojoin=1                 Parameter of joingenes. If this flag is set, joingenes does NOT create new
   65                                transcripts by merging input transcripts, f.i. it does NOT combine two
   66                                incomplete transcripts to a single complete transcript, where possible.
   67                                
   68 
   69 
   70 ENDUSAGE
   71 
   72 ##############################################################
   73 # Check the command line
   74 ##############################################################
   75 
   76 if ($#ARGV<0) {
   77     print "$usage";
   78     exit;
   79 }
   80 
   81 foreach (@ARGV) {
   82     if (/--(\w+)=(.*)/){
   83     if (!exists($cmdpars{$1})){
   84         print "unknown parameter: " . $1 . "\n$usage";
   85         exit;
   86     }
   87     $cmdpars{$1}=$2;
   88     } 
   89 }
   90 if ($cmdpars{"pred"} eq ""){
   91     print "prediction file missing\n$usage";
   92     exit;
   93 }
   94 if ($cmdpars{"anno"} eq ""){
   95     print "annotation file missing\n$usage";
   96     exit;
   97 }
   98 if ($cmdpars{'eval_exec_dir'} =~ /.[^\/]$/) {
   99     $cmdpars{'eval_exec_dir'} .= '/';
  100 }
  101 
  102 if ($cmdpars{'jg_exec_dir'} =~ /.[^\/]$/) {
  103     $cmdpars{'jg_exec_dir'} .= '/';
  104 }
  105 
  106 my $joingenes=0;
  107 if ($cmdpars{'joingenes'} eq '1'){
  108     $joingenes=1;
  109 }
  110 
  111 my $wholegenome=0;
  112 if ($cmdpars{'wholeGenome'} eq '1'){
  113     $wholegenome=1;
  114 }
  115 
  116 my $jg_pars="";
  117 if ($cmdpars{'alternatives'} eq '1'){
  118     $jg_pars.=" -a";
  119 }
  120 if ($cmdpars{'noselection'} eq '1'){
  121     $jg_pars.=" -l";
  122 }
  123 if ($cmdpars{'nojoin'} eq '1'){
  124     $jg_pars.=" -j";
  125 }
  126 
  127 # check whether joingenes is properly installed
  128 if ($joingenes && qx(which "$cmdpars{'jg_exec_dir'}joingenes") !~ /joingenes$/){
  129     die ("joingenes is not executable. Please add the directory which contains the executable joingenes to the PATH environment variable or specify the path with --jg_exec_dir.");
  130 } 
  131 
  132 # check whether the eval package is properly installed
  133 if (qx(which "$cmdpars{'eval_exec_dir'}evaluate_gtf.pl") !~ /evaluate_gtf.pl$/){
  134     die ("eval is not executable. Please add the directory which contains the executable evaluate_gtf.pl to the PATH environment variable or specify the path with --eval_exec_dir.");
  135 }
  136 if (qx("$cmdpars{'eval_exec_dir'}evaluate_gtf.pl" 2>&1) =~ /^Can\'t\slocate\s(\w+\.pm)/ ){
  137     die ("eval is not executable. The perl library " . $1 . " cannot be located.\n" . 
  138      "Please add the directory which contains " . $1 . " to the PERL5LIB environment variable, e.g. add the following line to your .bashrc file:\n\n" . 
  139      "export PERL5LIB=\$PERL5LIB:/path/to/" . $1 . "\n\n");
  140 }
  141 
  142 my @gfflines = ();
  143 
  144 # reading in annotation file and checking if it is in a valid GTF format
  145 print STDERR "reading in annotation file $cmdpars{'anno'}...\n";
  146 open (ANNO, <$cmdpars{"anno"}>) or die ("Could not open $cmdpars{'anno'} for reading: $!");
  147 while(<ANNO>){
  148     if (/^\s*\#.*/ || /^\s*$/){ # skip comment lines and empty lines
  149     next;
  150     }
  151     my @line = split(/\t/,$_);
  152     if(@line < 9){
  153     die ("Not GTF format in the following line:\n$_\n");
  154     }
  155     if($line[2] eq "CDS" || $line[2] eq "stop_codon" || $line[2] eq "start_codon"){
  156     if ($line[8] !~ /transcript_id\s"?[^";]+"?;/){
  157         die ("Not GTF format in the following line:\n$_\ntranscript_id not found.\n");
  158     }
  159     if ($line[8] !~ /gene_id\s"?[^";]+"?;/){
  160         die ("Not GTF format in the following line:\n$_\ngene_id not found.\n");
  161     }
  162     push @gfflines, $_;
  163     }   
  164 }
  165 close(ANNO);
  166 
  167 # temporary directory that contains the prediction and the annotation split by seq
  168 my $gffDir = "tempGFF";
  169 system ("rm -rf $gffDir; mkdir $gffDir");
  170     
  171 my @intervals=(); # hash of genomic intervals
  172 my %seqlist=(); # hash of sequences (only keys, no values)
  173        
  174 open (PRED, <$cmdpars{"pred"}>) or die ("Could not open $cmdpars{'pred'} for reading: $!");
  175 open (JOINPRED, ">$gffDir/pred.gtf") or die("Could not open $gffDir/pred.gtf for writing: $!");
  176 
  177 while(<PRED>){
  178     if(/prediction on sequence range (\w+):(\d+)\-(\d+)/){
  179     push @intervals,[$1, $2, $3]; # store genomic intervals on which gene prediction is executed
  180     print JOINPRED $_;
  181     }
  182     if(/\t(CDS|stop_codon|start_codon)\t/){
  183     print JOINPRED $_;
  184     my $chr = (split /\t/, $_)[0];
  185     if (!$seqlist{$chr}){ # add new sequences to seqlist                                                                                                                         
  186         $seqlist{$chr} = 1;
  187     }
  188     }   
  189 }
  190 close(PRED);
  191 close(JOINPRED);
  192 
  193 # join overlapping genomic intervals
  194 # sort intervals by 1. chromosome, 2. start
  195 @intervals = sort {$a->[0] cmp $b->[0] || $a->[1] <=> $b->[1]} @intervals;
  196 
  197 my @joined=(); # array of joined genomic intervals
  198 my ($chr, $start, $end);
  199 foreach (@intervals){
  200     if(defined($chr) && $_->[0] eq $chr && $_->[1] - $end <= 0 ) { # overlap between the last and the current interval
  201     if($end < $_->[2]){
  202         $end = $_->[2];
  203     }
  204     } else {
  205     if (defined($chr)){
  206         push @joined,[$chr, $start, $end];
  207     }
  208     ($chr, $start, $end) = ($_->[0], $_->[1], $_->[2]);
  209     }
  210 }
  211 push @joined,[$chr, $start, $end];
  212 
  213 # make a new annotation file that only contains features from the training set that are completely contained in one of the intervals
  214 # (if necessary, this can be done faster with a single loop over the intervals, requires presorting of @gfflines)
  215 open(ANNO, '>', "$gffDir/anno.gtf") or die ("Could not open $gffDir/anno.gtf for writing: $!");
  216 if($wholegenome){
  217     foreach my $line (@gfflines){
  218     print ANNO $line;
  219     my $chr = (split /\t/, $line)[0];
  220     if (!$seqlist{$chr}){ # add new sequences to seqlist                                                                                                                         
  221         $seqlist{$chr} = 1;
  222     }
  223     }
  224 }
  225 else{
  226     foreach my $line (@gfflines){
  227     my @gffline = split(/\t/,$line);
  228     my ($chr, $start, $end)=($gffline[0], $gffline[3], $gffline[4]);
  229     foreach my $i (@joined){
  230         if($chr eq $i->[0] && !($start > $i->[2]) && !($end < $i->[1]) ){
  231         print ANNO $line;
  232         last;
  233         }
  234     }
  235     }
  236 }
  237 close(ANNO);
  238 
  239 # join genes
  240 if($joingenes){
  241     system("mv $gffDir/pred.gtf $gffDir/pred.unfiltered.gtf");
  242     system("$cmdpars{'jg_exec_dir'}joingenes $jg_pars -g $gffDir/pred.unfiltered.gtf -o $gffDir/pred.gtf");
  243 }
  244 
  245 # split annotation and prediction file by seqs and prepare
  246 # list files that contain the directories of the GTF files being compared (required by eval)
  247 system ("rm -f $gffDir/annotation_list");
  248 system ("rm -f $gffDir/prediction_list");
  249 foreach my $seq (keys %seqlist) {
  250     system ("echo '$gffDir/$seq.anno.gtf' >> $gffDir/annotation_list");
  251     system ("echo '$gffDir/$seq.pred.gtf' >> $gffDir/prediction_list");
  252     system ("grep \"^$seq\\b\" $gffDir/pred.gtf > $gffDir/$seq.pred.gtf");
  253     system ("grep \"^$seq\\b\" $gffDir/anno.gtf > $gffDir/$seq.anno.gtf");
  254 }
  255 
  256 # call evaluate_gtf
  257 system ("$cmdpars{'eval_exec_dir'}evaluate_gtf.pl $gffDir/annotation_list $gffDir/prediction_list");