"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/scripts/eval_dualdecomp.pl" (13 Sep 2019, 13251 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "eval_dualdecomp.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 #!/usr/bin/env perl
    2 
    3 use strict;
    4 use warnings;
    5 use Getopt::Long; # for parameter specification on the command line
    6 
    7 if  (eval {require Statistics::R;1;} ne 1) { # for making R plots
    8     # if module can't load
    9     print "perl module Statistics::R not installed.\n";
   10     print "Please install, e.g. via CPAN. On command line, type:\n\n";
   11     print "perl -MCPAN -e 'install Statistics::R'\n";
   12 } else {
   13     Statistics::R->import();
   14 }
   15 
   16 my $usage = <<'ENDUSAGE';
   17 eval_dualdecomp.pl    evaluate the effectivness of dual decomposition either on a single
   18                       AUGUSTUS output file or a directory of AUGUSTUS output files.
   19 SYNOPSIS
   20 
   21 eval_dualdecomp.pl [ --f=input_file | --d=input_directory ]
   22 
   23     --f=<file>                          intput AUGUSTUS file 
   24     --d=<dir>                           directory of input AUGUSTUS files (recognized by .out file extension)
   25 
   26 OPTIONS
   27 
   28     --help                              output this help message
   29     --hist_iterations=<out.pdf>         output histogram of iterations to out.pdf
   30     --hist_errors=<out.pdf>             output histogram of error estimates to out.pdf for all cases, where
   31                                         no convergence is achieved.
   32     --err_per_iter=<out.pdf>            plots the average percentage of initial error against the iterations to out.pdf.
   33                                         If after a certain number of iterations the error no further drops, 
   34                                         early stopping is recommended, i.e. in the next run, the number of 
   35                                         DD iterations can be restricted to this number of iterations.
   36     --t=<foat>                          threshold for percentage of initial error. For all cases with an estimated
   37                                         error higher than this threshold, the evolution of primal an dual values
   38                                         are plotted against the iterations. This helps debugging cases with a
   39                                         high error estimate. The threshold is between [0-100] (default: 5)
   40     --outdir=<dir>                      put all plots in this output directory
   41 
   42 
   43 DESCRIPTION
   44       
   45   Example:
   46 
   47     eval_dualdecomp.pl --d=augouts --hist_iterations=iterations.pdf --hist_errors=errors.pdf
   48     eval_dualdecomp.pl --d=augouts --hist_iterations=iterations.pdf --hist_errors=errors.pdf --err_per_iter=error_per_iter.pdf --outdir=out
   49     eval_dualdecomp.pl --f=aug.out --t=0
   50 
   51 ENDUSAGE
   52 
   53 
   54 my ($hist_iter, $hist_err, $err_per_iter, $DIR, $FILE, $help); # options
   55 my $t = 5;  # threshold for percentage of initial error
   56 my @docs = ();
   57 my $outdir = "";
   58 
   59 GetOptions('d=s'=>\$DIR,
   60        'f=s'=>\$FILE,
   61        'hist_iterations=s' =>\$hist_iter,
   62        'hist_errors=s' =>\$hist_err,
   63        'err_per_iter=s' =>\$err_per_iter,
   64        't=f' =>\$t,
   65        'outdir=s' =>\$outdir,
   66        'help!'=>\$help);
   67 
   68 if (defined($DIR)){
   69     $DIR =~ s/\/$//g;
   70     opendir(my $DH, $DIR) or die "cannot open directory $DIR: $!";
   71     @docs = grep(/\.out$/,readdir($DH));
   72     @docs = map "$DIR/$_", @docs;
   73 }
   74 if (defined($FILE)){
   75     push @docs, $FILE;
   76 }
   77 
   78 if(!defined($DIR) && !defined($FILE)){
   79     print "either an input directory (option --d) or input file (option --f) is required.\n$usage";
   80     exit(0);
   81 }
   82 if ($outdir ne ""){
   83     system ("rm -rf $outdir; mkdir $outdir");
   84     $outdir =~ s/([^\/])$/$1\//g;
   85 }
   86 
   87 
   88 my $id = 0;                 # gene range ID
   89 my @avgErr = ();            # average error per iteration
   90 
   91 my @conv_iter = ();         # stores number of iterations of all cases that did converge
   92 my @conv_best_p = ();       # stores the iterations of the best primal values of all cases that did converge
   93 
   94 my @not_conv_iter = ();     # stores number of iterations of all cases that did not converge
   95 my @not_conv_best_p = ();   # stores the iterations of the best primal values of all cases that did not converge
   96 my @not_conv_error = ();    # stores the estimated errors of all cases that did not converge
   97 
   98 my $alreadyOpt = 0; # counts number of cases that are already optimal without DD
   99 
  100 my @rounds = (); # number of cases that converged per round
  101 
  102 # hash of gene ranges
  103 #    keys: gene range nr
  104 #    values: hash reference
  105 #            keys: iter               array of iterations
  106 #                  primal             array of primal values
  107 #                  dual               array of dual values
  108 #                  stepsize           array if step sizes
  109 #                  inconsistencies    array of inconsistencies
  110 
  111 my %geneRanges = ();
  112 
  113 
  114 foreach my $file (@docs) {
  115     open (RES, $file) or die "could not open file $file\n";
  116     my $contents = 0;
  117     while(<RES>){
  118     if(/dual decomposition on gene Range (\d+)/){
  119         $id++;
  120         $geneRanges{$id} = {"iter"=>[], "primal"=>[], "dual"=>[], "stepsize"=>[]};
  121     }
  122     next if(!/^round\titer/ && !$contents);
  123     if(/^dual decomposition reduced/){
  124         my $numIter = scalar @{$geneRanges{$id}{"primal"}};
  125         if($numIter <= 1){ # already optimal cases, no DD required
  126         $alreadyOpt++;
  127         $contents=0;
  128         delete $geneRanges{$id};
  129         next;
  130         }       
  131         my ($idx_max, $max) = findMaxValueIndex(\@{$geneRanges{$id}{"primal"}});
  132         my ($idx_min, $min) = findMinValueIndex(\@{$geneRanges{$id}{"dual"}});
  133         
  134         # calculate percentage of initial error
  135         my $perc_initial_err = ($min - @{$geneRanges{$id}{"primal"}}[0]) > 0 ? ($min - $max)*100 / ($min - $geneRanges{$id}{"primal"}->[0]) : 0;
  136         if($perc_initial_err < 0){ # rounding error
  137         $perc_initial_err = 0;
  138         }
  139         # plot dual and primal values against iterations for all gene ranges with an
  140         # error greater than threshold t
  141         if($perc_initial_err > $t){
  142         plot_dual_vs_primal($perc_initial_err);
  143         }
  144         if($geneRanges{$id}{"inconsistencies"} == 0 || $min-$max < 1e-8){ # convergence achieved
  145         push @conv_iter, $numIter;
  146         push @conv_best_p, $idx_max;
  147         $rounds[$geneRanges{$id}{"round"}]++;
  148         }
  149         else{ # not converged, stores errors 
  150         push @not_conv_error, $perc_initial_err;
  151         push @not_conv_iter, $numIter;
  152         push @not_conv_best_p, $idx_max; 
  153         }
  154         delete $geneRanges{$id};
  155         $contents=0;
  156         next;
  157     }
  158     if($contents){
  159         my @l = split(/\s+/,$_);
  160         my ($round, $iteration, $stepsize, $primal, $dual, $inconsist) = ($l[0], $l[1], $l[2], $l[3], $l[4], $l[5]);
  161         if(!defined($geneRanges{$id}{"total_iter"})){
  162         $geneRanges{$id}{"total_iter"}=0;
  163         }
  164         else{
  165         $geneRanges{$id}{"total_iter"}++;
  166         }
  167         $geneRanges{$id}{"round"} = $round;
  168         push @{$geneRanges{$id}{"primal"}}, $primal;
  169         push @{$geneRanges{$id}{"dual"}}, $dual;
  170         push @{$geneRanges{$id}{"stepsize"}}, $stepsize;
  171         $geneRanges{$id}{"inconsistencies"}=$inconsist;
  172         if(!defined($geneRanges{$id}{"best_primal"}) || $geneRanges{$id}{"best_primal"} < $primal){
  173         $geneRanges{$id}{"best_primal"} = $primal;
  174         }
  175         if(!defined($geneRanges{$id}{"best_dual"}) || $geneRanges{$id}{"best_dual"} > $dual){
  176         $geneRanges{$id}{"best_dual"} = $dual;
  177         }
  178         my $best_d = $geneRanges{$id}{"best_dual"};
  179         my $best_p = $geneRanges{$id}{"best_primal"};
  180         my $initial_p = $geneRanges{$id}{"primal"}->[0];
  181         my $e = 0;
  182         if(($best_d - $initial_p) > 0){
  183         $e = ($best_d - $best_p)*100 / ($best_d - $initial_p);
  184         }
  185         if($e < 0){
  186         $e = 0;  # rounding error
  187         }
  188         push @{$avgErr[$geneRanges{$id}{"total_iter"}]} , $e;
  189     }
  190     $contents=1;
  191     }
  192 }
  193 my @iter = (@conv_iter, @not_conv_iter);
  194 my @best_p = (@conv_best_p, @not_conv_best_p);
  195 
  196 my ($idx_error, $max_error) = (0,0);
  197 if(@not_conv_error){
  198  ($idx_error, $max_error) = findMaxValueIndex(\@not_conv_error);
  199 }
  200 my @total_error = (@not_conv_error, ((0) x (scalar @conv_iter)));
  201 
  202 print "\n$alreadyOpt gene Ranges were discarded, because they were already optimal prior to Dual Decomposition\n\n";
  203 printf("+-------------------------------------+----------+-----------+-----------+----------------------+ \n");
  204 printf("| gene Ranges      |        No.       | avg iter | avg error | max error | avg iter best primal |\n");
  205 printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n");
  206 printf("| e-convergence    | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n", scalar @conv_iter, (scalar @conv_iter *100 / scalar @iter),avg(\@conv_iter),0,0,avg(\@conv_best_p));
  207 printf("| no e-convergence | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n", scalar @not_conv_iter, (scalar @not_conv_iter *100 / scalar @iter), avg(\@not_conv_iter), avg(\@not_conv_error), $max_error, avg(\@not_conv_best_p));
  208 printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n");
  209 printf("| total            | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n",  scalar @iter, 100, avg(\@iter), avg(\@total_error), $max_error, avg(\@best_p));
  210 printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n\n");
  211 print "No. of e-convergences per round of Dual Decomposition\n\n";
  212 foreach my $i (0 .. $#rounds) {
  213     print "round $i - $rounds[$i]\n";
  214 }
  215 print "\nIf after n rounds the No. of e-convergences is only very small, the rounds\n";
  216 print "can be restricted to n in the next run.\n\n";
  217     
  218 # plot histogram of iterations
  219 if(defined($hist_iter)){
  220     my $R = Statistics::R->new();
  221     $R->set('font',"Helvetica");
  222     $R->set('filename', $outdir . $hist_iter);
  223     $R->set('data',\@iter);
  224     $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
  225     $R->run(q`pdf(filename, family=font)`);
  226     $R->run(q`par(cex.main=1.5, cex.lab=1.5, cex.axis=1.5, cex=1)`);
  227     $R->run(q`hist(data,breaks=50,col="red",xlab="Iterations", main="")`);
  228     $R->run(q`dev.off()`);
  229 }
  230 
  231 # plot histogram of errors
  232 if(defined($hist_err) && @not_conv_error){
  233     my $R = Statistics::R->new();
  234     $R->set('filename', $outdir . $hist_err);
  235     $R->set('font',"Helvetica");
  236     $R->set('data',\@not_conv_error);
  237     $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
  238     $R->run(q`pdf(filename, family=font)`);
  239     $R->run(q`par(cex.main=1.5, cex.lab=1.5, cex.axis=1.5, cex=1)`);
  240     $R->run(q`hist(data,breaks=50,col="red",xlab="% of initial error", main="")`);
  241     $R->run(q`dev.off()`);
  242 }
  243 
  244 # plot average percentage of initial error against iterations
  245 if(defined($err_per_iter)){
  246     my $n = scalar @{$avgErr[0]};
  247     my @avgs = ();
  248     for my $i (@avgErr){ # compute average error per iteration
  249     my $avg = sum($i) / $n;
  250     push @avgs, $avg;
  251     if( scalar @avgs > 1  && ($avgs[$#avgs-1] - $avgs[$#avgs]  < 0.00001)){
  252         last;
  253     }
  254     }
  255     my @x = (0..$#avgs);
  256 
  257     my $R = Statistics::R->new();
  258     $R->set('font',"Helvetica");
  259     $R->set('avgs',\@avgs);
  260     $R->set('iter',\@x);
  261     $R->set('filename', $outdir . $err_per_iter);
  262     $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
  263     $R->run(q`pdf(filename, family=font)`);
  264     $R->run(q`plot(iter,avgs, type="l", lwd=2, col="blue", ylim=c(min(avgs), max(avgs)), xlim=c(0,max(iter)), xlab=expression(paste("iteration ",italic(t))), ylab="average % of initial error", cex.axis=2, cex.lab=2, cex.main=2)`);
  265     $R->run(q`dev.off()`);      
  266 }
  267 
  268 # plot dual and primal values against iterations
  269 sub plot_dual_vs_primal{
  270     my $err = shift;
  271     my $filename = $outdir . "gr_" . $id . ".pdf";
  272     my @x = (0..$geneRanges{$id}{"total_iter"});
  273     my $R = Statistics::R->new();
  274     $R->set('font',"Helvetica");
  275     $R->set('iter',\@x);
  276     $R->set('primal',\@{$geneRanges{$id}{"primal"}});
  277     $R->set('dual',\@{$geneRanges{$id}{"dual"}});
  278     
  279     $R->set('filename', $filename);
  280     $R->set('error', $err);
  281     $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
  282     $R->run(q`pdf(filename, family=font)`);
  283     $R->run(q`plot(iter,primal, type="l", lwd=2, col="blue", ylim=c(min(primal), max(dual)), xlab=expression(paste("iteration ",italic(t))), ylab="value", cex.axis=2, cex.lab=2, main=bquote("Percentage of initial error" ~ italic(epsilon) == .(error) ~ "%"), cex.main=2)`);
  284     $R->run(q`lines(iter,dual, type="l", lwd=2, col="red")`);
  285     $R->run(q`legend("bottomright",c(expression(paste("current primal ",italic(p^t))),expression(paste("current dual ",italic(d^t)))),col=c("blue", "red"), lwd=c(2,2), cex=2, bty="n")`);
  286     $R->run(q`points(iter[which.max(primal)], max(primal), cex = 1.5, pch = 20)`);
  287     $R->run(q`text(iter[which.max(primal)], max(primal), labels = expression(italic(p)[best]), cex= 1.5, pos=3)`);
  288     $R->run(q`dev.off()`);  
  289 }
  290 
  291 sub findMaxValueIndex{
  292     my $idx;
  293     my $max;
  294     my @array = @{$_[0]};
  295     for my $i (0 .. $#array){
  296     if(!defined($max) || $max < $array[$i]){
  297         $idx = $i;
  298         $max = $array[$i];
  299     }
  300     }
  301     return ($idx, $max);
  302 }
  303 
  304 sub findMinValueIndex{
  305     my $idx;
  306     my $min;
  307     my @array = @{$_[0]};
  308     for my $i (0 .. $#array){
  309     if(!defined($min) || $min > $array[$i]){
  310         $idx = $i;
  311         $min = $array[$i];
  312     }
  313     }
  314     return ($idx, $min);
  315 }
  316 
  317 sub avg{
  318     my @array = @{$_[0]};
  319     if(@array){
  320     my $sum = sum(\@array);
  321     $sum /= scalar @array;
  322     return $sum;
  323     }
  324     return 0;
  325 }
  326 
  327 sub sum{
  328     my $sum = 0;
  329     my @array = @{$_[0]};
  330     for my $i (@array){
  331     $sum+=$i;
  332     }
  333     return $sum;
  334 }