[LON-CAPA-cvs] cvs: modules /gerd/harvesting combine.pl finalize.pl

www www at source.lon-capa.org
Wed Sep 14 13:16:53 EDT 2011


www		Wed Sep 14 17:16:53 2011 EDT

  Added files:                 
    /modules/gerd/harvesting	combine.pl finalize.pl 
  Log:
  Two more helper scripts for inital round of taxonomy.
  
  

Index: modules/gerd/harvesting/combine.pl
+++ modules/gerd/harvesting/combine.pl
use strict;

my @seq=();
my @dir=();
my @top=();

open(IN,'problem_taxonomy.dat') || die("Could not open seq file");
while (my $line=<IN>) {
   chomp($line);
   my ($idx,$tax)=split(/\t/,$line);
   $seq[$idx]=$tax;
}
close(IN);

open(IN,'direct_taxonomy.dat') || die("Could not open direct file");
while (my $line=<IN>) {
   chomp($line);
   my ($idx,$tax)=split(/\t/,$line);
   $dir[$idx]=$tax;
}
close(IN);


open(IN,'res_topics_from_courses.dat') || die("Could not open course file");
while (my $line=<IN>) {
   chomp($line);
   my ($idx,$tax)=split(/\t/,$line);
   $top[$idx]=$tax;
}
close(IN);

for (my $i=0; $i<=$#dir; $i++) {
   my $comb=$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
#   print "-> $comb\n";
   my %cats=();
   foreach my $keycnt (split(/\,/,$comb)) {
      if ($keycnt=~/\w/) {
         my ($key,$cnt)=($keycnt=~/^(.+)\:(\d+)$/);
         $cats{$key}+=$cnt;
      }
   }
   my $newcats='';
   foreach my $key (sort(keys(%cats))) {
      $newcats.=','.$key.':'.$cats{$key};
   }
   $newcats=~s/^\,//;
   print "$i\t$newcats\n";
}

Index: modules/gerd/harvesting/finalize.pl
+++ modules/gerd/harvesting/finalize.pl
use strict;

open(IN,"combined_taxonomy.dat");
while (my $line=<IN>) {
   chomp($line);
   my ($idx,$tax)=split(/\t/,$line);
   my %lev1=();
   my %lev2=();
   my %lev3=();
   foreach my $taxcnt (split(/\,/,$tax)) {
      my ($itax,$cnt)=($taxcnt=~/^(.+)\:(\d+)$/);
      my @levels=split(/\:/,$itax);
      $lev1{$levels[0]}+=3.*$cnt;
      if ($levels[1]) {
         $lev2{$levels[0].':'.$levels[1]}+=2.*$cnt;
      }
      if ($levels[2]) {
         $lev3{$levels[0].':'.$levels[1].':'.$levels[2]}+=$cnt;
      }
   }
#   print "-> $tax\n";
   my $winners='';
# We do not want more than two level 3 taxonomies
   my $levthree=0;
   foreach my $le3 (sort(keys(%lev3))) {
      if ($lev3{$le3}>=4) {
         $levthree++;
      }
   }
   if ($levthree<3) {
      foreach my $le3 (sort(keys(%lev3))) {
         if ($lev3{$le3}>=4) {
            $winners.=','.$le3;
            my ($l1,$l2)=split(/\:/,$le3);
            $lev2{$l1.':'.$l2}=0;
            $lev1{$l1}=0;
         }
      }
   }
# We do not want more than two level 2 taxonomies, either
   my $levtwo=0;
   foreach my $le2 (sort(keys(%lev2))) {
      if ($lev2{$le2}>=4) {
         $levtwo++;
      }
   }
   if ($levtwo<3) {
      foreach my $le2 (sort(keys(%lev2))) {
         if ($lev2{$le2}>=4) {
            $winners.=','.$le2;
            my ($l1)=split(/\:/,$le2);
            $lev1{$l1}=0;
         }
      }
   }
   foreach my $le1 (sort(keys(%lev1))) {
      if ($lev1{$le1}>=4) {
         $winners.=','.$le1;
      }
   }
   $winners=~s/^\,//;
   print $idx."\t".$winners."\n";
}
close(IN);




More information about the LON-CAPA-cvs mailing list