[LON-CAPA-cvs] cvs: modules /gerd/harvesting problem_taxonomy.pl

Fri Sep 9 20:45:44 EDT 2011

www		Sat Sep 10 00:45:44 2011 EDT

  Added files:                 
    /modules/gerd/harvesting	problem_taxonomy.pl 
  Log:
  Final step: add taxonomies to problems
  
  

Index: modules/gerd/harvesting/problem_taxonomy.pl
+++ modules/gerd/harvesting/problem_taxonomy.pl
use strict;

# %st maps the first tab-separated field of each line in seq_taxonomy.dat
# to the second field (presumably sequence id => taxonomy string; the
# variable names suggest this, the file format is not shown here).
my %st=();
# Three-argument open with a lexical handle; abort if the input is missing
# rather than silently continuing with an empty map.
open(my $seq_tax_fh,'<','seq_taxonomy.dat')
   or die "Cannot open seq_taxonomy.dat: $!\n";
while (my $line=<$seq_tax_fh>) {
   chomp($line);
   my ($seq,$tax)=split(/\t/,$line);
   $st{$seq}=$tax;
}
close($seq_tax_fh);

# @taxo is indexed by problem number. Each element accumulates a
# comma-prefixed list of the %st values of every sequence whose line in
# probs_in_seq.dat names that problem. Each line is
# "<sequence>\t<comma-separated problem numbers>".
my @taxo=();
# Three-argument open with a lexical handle; abort if the input is missing
# rather than silently producing no taxonomy data.
open(my $probs_fh,'<','probs_in_seq.dat')
   or die "Cannot open probs_in_seq.dat: $!\n";
while (my $line=<$probs_fh>) {
   chomp($line);
   my ($seq,$probs)=split(/\t/,$line);
   foreach my $pn (split(/\,/,$probs)) {
      # A sequence without an entry in %st contributes an empty item,
      # which is ignored later when the counts are printed.
      $taxo[$pn].=','.$st{$seq};
   }
}
close($probs_fh);

# @keywords is indexed by the id in the first tab-separated field of
# keywords.dat. Each element holds the comma-separated, lower-cased values
# of the second field for that id.
# NOTE(review): lines for one id are assumed to be contiguous; a later
# group with the same id overwrites the earlier one -- confirm against the
# file that produces keywords.dat.
my @keywords=();

my $keystr='';
my ($id,$key)=(1,'');
# Three-argument open with a lexical handle; abort if the input is missing.
open(my $keywords_fh,'<','keywords.dat')
   or die "Cannot open keywords.dat: $!\n";
while (my $line=<$keywords_fh>) {
   chomp($line);
   my ($newid,$newkey)=split(/\t/,$line);
   # Skip blank lines: they carry no id and would otherwise split a group
   # and store an empty entry at index 0.
   next unless (defined($newid) && $newid=~/\S/);
   my $oid=$id;
   ($id,$key)=($newid,$newkey);
   if ($id!=$oid) {
      # The id changed: store the finished group under the previous id.
      $keystr=~s/^\,//;
      $keywords[$oid]=$keystr;
      $keystr='';
   }
   $keystr.=','.lc($key);
}
close($keywords_fh);
# Store the final group: inside the loop a group is only written when the
# NEXT id is seen, so without this the last id's keywords would be lost.
if ($keystr ne '') {
   $keystr=~s/^\,//;
   $keywords[$id]=$keystr;
}

# For every problem index up to the highest one in @taxo, print
# "<index>\t<tax>:<count>,..." with the taxonomy entries sorted as strings.
# When a problem has more than one distinct taxonomy entry, also print its
# keywords on a "KEYS:" line.
foreach my $prob (0..$#taxo) {
   # Count how often each taxonomy string occurs for this problem.
   my %count=();
   $count{$_}++ foreach (split(/\,/,$taxo[$prob]));
   # Keep only entries that contain a word character (drops empty items).
   my @entries=grep { /\w/ } sort(keys(%count));
   my $summary=join(',',map { $_.':'.$count{$_} } @entries);
   print $prob."\t".$summary."\n";
   print "KEYS: $prob $keywords[$prob]\n" if (scalar(@entries)>1);
}