[LON-CAPA-cvs] cvs: modules /gerd/harvesting combine.pl finalize.pl
www
www at source.lon-capa.org
Wed Sep 14 13:16:53 EDT 2011
www Wed Sep 14 17:16:53 2011 EDT
Added files:
/modules/gerd/harvesting combine.pl finalize.pl
Log:
Two more helper scripts for inital round of taxonomy.
Index: modules/gerd/harvesting/combine.pl
+++ modules/gerd/harvesting/combine.pl
use strict;
my @seq=();
my @dir=();
my @top=();
open(IN,'problem_taxonomy.dat') || die("Could not open seq file");
while (my $line=<IN>) {
chomp($line);
my ($idx,$tax)=split(/\t/,$line);
$seq[$idx]=$tax;
}
close(IN);
open(IN,'direct_taxonomy.dat') || die("Could not open direct file");
while (my $line=<IN>) {
chomp($line);
my ($idx,$tax)=split(/\t/,$line);
$dir[$idx]=$tax;
}
close(IN);
open(IN,'res_topics_from_courses.dat') || die("Could not open course file");
while (my $line=<IN>) {
chomp($line);
my ($idx,$tax)=split(/\t/,$line);
$top[$idx]=$tax;
}
close(IN);
for (my $i=0; $i<=$#dir; $i++) {
my $comb=$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
# print "-> $comb\n";
my %cats=();
foreach my $keycnt (split(/\,/,$comb)) {
if ($keycnt=~/\w/) {
my ($key,$cnt)=($keycnt=~/^(.+)\:(\d+)$/);
$cats{$key}+=$cnt;
}
}
my $newcats='';
foreach my $key (sort(keys(%cats))) {
$newcats.=','.$key.':'.$cats{$key};
}
$newcats=~s/^\,//;
print "$i\t$newcats\n";
}
Index: modules/gerd/harvesting/finalize.pl
+++ modules/gerd/harvesting/finalize.pl
use strict;
open(IN,"combined_taxonomy.dat");
while (my $line=<IN>) {
chomp($line);
my ($idx,$tax)=split(/\t/,$line);
my %lev1=();
my %lev2=();
my %lev3=();
foreach my $taxcnt (split(/\,/,$tax)) {
my ($itax,$cnt)=($taxcnt=~/^(.+)\:(\d+)$/);
my @levels=split(/\:/,$itax);
$lev1{$levels[0]}+=3.*$cnt;
if ($levels[1]) {
$lev2{$levels[0].':'.$levels[1]}+=2.*$cnt;
}
if ($levels[2]) {
$lev3{$levels[0].':'.$levels[1].':'.$levels[2]}+=$cnt;
}
}
# print "-> $tax\n";
my $winners='';
# We do not want more than two level 3 taxonomies
my $levthree=0;
foreach my $le3 (sort(keys(%lev3))) {
if ($lev3{$le3}>=4) {
$levthree++;
}
}
if ($levthree<3) {
foreach my $le3 (sort(keys(%lev3))) {
if ($lev3{$le3}>=4) {
$winners.=','.$le3;
my ($l1,$l2)=split(/\:/,$le3);
$lev2{$l1.':'.$l2}=0;
$lev1{$l1}=0;
}
}
}
# We do not want more than two level 2 taxonomies, either
my $levtwo=0;
foreach my $le2 (sort(keys(%lev2))) {
if ($lev2{$le2}>=4) {
$levtwo++;
}
}
if ($levtwo<3) {
foreach my $le2 (sort(keys(%lev2))) {
if ($lev2{$le2}>=4) {
$winners.=','.$le2;
my ($l1)=split(/\:/,$le2);
$lev1{$l1}=0;
}
}
}
foreach my $le1 (sort(keys(%lev1))) {
if ($lev1{$le1}>=4) {
$winners.=','.$le1;
}
}
$winners=~s/^\,//;
print $idx."\t".$winners."\n";
}
close(IN);
More information about the LON-CAPA-cvs
mailing list