"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/scripts/filterShrimp.pl" (13 Sep 2019, 4469 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "filterShrimp.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 #!/usr/bin/env perl
    2 #
    3 # filter a file from the Shrimp alignment program
    4 #
    5 #
    6 #
    7 use strict;
    8 use Getopt::Long;
    9 
   10 my $usage = "$0 -- filter a SHRIMP format file\n";
   11 $usage .= "\n";
   12 $usage .= "Usage: $0 <in.psl >out.f.psl\n";
   13 $usage .= "  PREREQUISITE: input file must be sorted by query name\n";
   14 $usage .= "  \n";
   15 $usage .= "  options:\n";
   16 $usage .= "  --minScore=n       minimal percentage of identity (default 300)\n";
   17 $usage .= "  --uniq             take only best match and only, when second best is much worse (default false)\n";
   18 $usage .= "  --uniqthresh=f     the best alignment is considered uniquen when the second best has a score at most this much fraction of the best (default 0.9)\n";
   19 $usage .= "  --best             take (all) best alignment(s) if they pass the minScore filter.\n";
   20 $usage .= "  --commongenefile=s file name in which to write cases where one read maps to several different genes\n";
   21 $usage .= "  --verbose          output debugging info (default false)\n";
   22 
   23 
   24 my $minScore = 300;;
   25 my $uniqthresh = 0.9; # a match is considered unique if the second best match has less than this percentage of the best
   26 my $uniq = 0;
   27 my $best = 0;
   28 my $commongenefile;
   29 my $verbose = 0;
   30 my $help = 0;
   31 my $cmdline = join(" ", @ARGV);
   32 
   33 GetOptions(
   34     'help!'=>\$help,
   35     'minScore:i'=>\$minScore,
   36     'uniqthresh:f'=>\$uniqthresh,
   37     'uniq!'=>\$uniq,
   38     'best!'=>\$best,
   39     'commongenefile:s'=>\$commongenefile,
   40     'verbose!'=>\$verbose);
   41 
   42 if ($help) {
   43     print "$usage";
   44     exit(0);
   45 }
   46 if ($uniq && $best){
   47     print "--uniq and --best cannot be used at the same time\n";
   48     exit(0);
   49 }
   50 
   51 my ($readname, $contigname, $strand, $contigstart, $contigend, $readstart, $readend, $readlength, $score, $editstring);
   52 my $oldreadname = "";
   53 my @f;
   54 my ($outMinScore, $outUnique, $outBest) = (0,0,0); # number of reasons for filtering (nested, this order)
   55 my @qali = (); # array of array references: lines for each query (pair)
   56 open (COMMON, ">$commongenefile") or die ("Could not open $commongenefile for writing.") if (defined($commongenefile));
   57 
   58 while (<>) {
   59     s/^#.*//;
   60     next unless /\S/;
   61 
   62     @f = split /\t/, $_;
   63     if (@f < 10) { warn "Not SHRIMP format"; next }
   64     # ($readname, $contigname, $strand, $contigstart, $contigend, $readstart, $readend, $readlength, $score, $editstring) = @f;
   65     $readname = $f[0];
   66     $contigname = $f[1];
   67     $score    = $f[8];
   68     
   69     processQuery() if ($oldreadname ne $readname && $oldreadname ne "");
   70 
   71     # filter for minimum score
   72     if ($score < $minScore){
   73     $outMinScore++;
   74     next;
   75     }
   76 
   77     push @qali, [$_, $score, $contigname];
   78     $oldreadname = $readname;
   79 }
   80 processQuery() if ($readname ne "");
   81 
   82 close COMMON if (defined($commongenefile));
   83 
   84 
   85 print STDERR "\n        filtered:\n";
   86 print STDERR "----------------:\n";
   87 print STDERR "mininum score   : $outMinScore\n";
   88 print STDERR "uniqueness      : $outUnique\n";
   89 print STDERR "best            : $outBest\n";
   90 print STDERR "command line: $cmdline\n";
   91 
   92 sub processQuery(){
   93     # print "processing " . scalar(@qali) . " alignments\n";
   94     
   95     if (($uniq || $best) && @qali>1){
   96     my %rscores;
   97     foreach my $ali (@qali){
   98         $rscores{$ali} = scoreAli($ali); # store scores in hash so later sorting is faster
   99     }
  100     @qali = sort {-($rscores{$a} <=> $rscores{$b})} @qali;
  101     if ($uniq){
  102         # let pass only best alignment, and only if second is significantly worse
  103         my $ratio = $rscores{$qali[1]}/$rscores{$qali[0]};
  104         if ($verbose){
  105         print "best two alignments\n";
  106         print "" . $qali[0]->[0] ."\nscore=$rscores{$qali[0]}\n";
  107         print "" . $qali[1]->[0] ."\nscore=$rscores{$qali[1]}\n";
  108         print "ratio = $ratio\n";
  109         }
  110         if ($ratio < $uniqthresh){
  111         print $qali[0]->[0];
  112         } else {
  113         $outUnique++;
  114         }
  115     } else { # best filtering, take all alignments that have the same score as the best
  116         my $bestscore = $rscores{$qali[0]};
  117         my @bestTnames = ();
  118         while ($rscores{$qali[0]} == $bestscore){
  119         print $qali[0]->[0];
  120         push @bestTnames, $qali[0]->[2];
  121         shift @qali;
  122         }
  123         $outBest += @qali;
  124         if (@bestTnames > 1){
  125         my %genenames = ();
  126         foreach my $Tname (@bestTnames) { $Tname =~ s/\.t\d+//; $genenames{$Tname}=1; }
  127         print COMMON $oldreadname . "\t" . join(" ", keys %genenames) . "\n" if (%genenames>1 && defined($commongenefile));
  128         }
  129     }
  130     } else {
  131     foreach my $ali (@qali){
  132         print $ali->[0];
  133     }
  134     }
  135     @qali = ();
  136 }
  137 
  138 # for comparing quality of two alignments
  139 sub scoreAli{
  140     my $ali = shift;
  141     return $ali->[1];
  142 }