[LON-CAPA-cvs] cvs: modules /gerd/harvesting combine.pl cor_sanity.pl finalize.pl reconcile.pl taxonomy.dat

www www at source.lon-capa.org
Thu Sep 15 14:40:39 EDT 2011


www		Thu Sep 15 18:40:39 2011 EDT

  Added files:                 
    /modules/gerd/harvesting	cor_sanity.pl reconcile.pl 

  Modified files:              
    /modules/gerd/harvesting	combine.pl finalize.pl taxonomy.dat 
  Log:
  Almost done with taxonomy!
  
  
-------------- next part --------------
Index: modules/gerd/harvesting/combine.pl
diff -u modules/gerd/harvesting/combine.pl:1.1 modules/gerd/harvesting/combine.pl:1.2
--- modules/gerd/harvesting/combine.pl:1.1	Wed Sep 14 17:16:52 2011
+++ modules/gerd/harvesting/combine.pl	Thu Sep 15 18:40:39 2011
@@ -3,6 +3,7 @@
 my @seq=();
 my @dir=();
 my @top=();
+my @man=();
 
 open(IN,'problem_taxonomy.dat') || die("Could not open seq file");
 while (my $line=<IN>) {
@@ -29,8 +30,17 @@
 }
 close(IN);
 
+open(IN,'manual.dat') || die("Could not open manual file");
+while (my $line=<IN>) {
+   chomp($line);
+   my ($idx,$tax)=split(/\t/,$line);
+   $man[$idx]=$tax;
+}
+close(IN);
+
+
 for (my $i=0; $i<=$#dir; $i++) {
-   my $comb=$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
+   my $comb=$man[$i].','.$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
 #   print "-> $comb\n";
    my %cats=();
    foreach my $keycnt (split(/\,/,$comb)) {
Index: modules/gerd/harvesting/finalize.pl
diff -u modules/gerd/harvesting/finalize.pl:1.1 modules/gerd/harvesting/finalize.pl:1.2
--- modules/gerd/harvesting/finalize.pl:1.1	Wed Sep 14 17:16:52 2011
+++ modules/gerd/harvesting/finalize.pl	Thu Sep 15 18:40:39 2011
@@ -10,55 +10,60 @@
    foreach my $taxcnt (split(/\,/,$tax)) {
       my ($itax,$cnt)=($taxcnt=~/^(.+)\:(\d+)$/);
       my @levels=split(/\:/,$itax);
-      $lev1{$levels[0]}+=3.*$cnt;
+      $lev1{$levels[0]}+=$cnt;
       if ($levels[1]) {
-         $lev2{$levels[0].':'.$levels[1]}+=2.*$cnt;
+         $lev2{$levels[0].':'.$levels[1]}+=$cnt;
       }
       if ($levels[2]) {
          $lev3{$levels[0].':'.$levels[1].':'.$levels[2]}+=$cnt;
       }
    }
-#   print "-> $tax\n";
-   my $winners='';
-# We do not want more than two level 3 taxonomies
-   my $levthree=0;
+   print "------ $tax\n";
+
+
+
+# Three levels deep
+
+   my $t3=0;
+   my $win3='';
    foreach my $le3 (sort(keys(%lev3))) {
-      if ($lev3{$le3}>=4) {
-         $levthree++;
-      }
+      print $le3."\t".$lev3{$le3}."\n";
+      $t3+=$lev3{$le3};
    }
-   if ($levthree<3) {
-      foreach my $le3 (sort(keys(%lev3))) {
-         if ($lev3{$le3}>=4) {
-            $winners.=','.$le3;
-            my ($l1,$l2)=split(/\:/,$le3);
-            $lev2{$l1.':'.$l2}=0;
-            $lev1{$l1}=0;
-         }
+   foreach my $le3 (sort(keys(%lev3))) {
+      if ($lev3{$le3}>$t3/2.) {
+         $win3=$le3;
       }
    }
-# We do not want more than two level 2 taxonomies, either
-   my $levtwo=0;
+
+# Two levels deep
+
+   my $t2=0;
+   my $win2='';
    foreach my $le2 (sort(keys(%lev2))) {
-      if ($lev2{$le2}>=4) {
-         $levtwo++;
-      }
+      print $le2."\t".$lev2{$le2}."\n";
+      $t2+=$lev2{$le2};
    }
-   if ($levtwo<3) {
-      foreach my $le2 (sort(keys(%lev2))) {
-         if ($lev2{$le2}>=4) {
-            $winners.=','.$le2;
-            my ($l1)=split(/\:/,$le2);
-            $lev1{$l1}=0;
-         }
+   foreach my $le2 (sort(keys(%lev2))) {
+      if ($lev2{$le2}>$t2/2.) {
+         $win2=$le2;
       }
    }
+
+
+# One level deep
+
+   my $t1=0;
+   my $win1='';
+   foreach my $le1 (sort(keys(%lev1))) {
+      print $le1."\t".$lev1{$le1}."\n";
+      $t1+=$lev1{$le1};
+   }
    foreach my $le1 (sort(keys(%lev1))) {
-      if ($lev1{$le1}>=4) {
-         $winners.=','.$le1;
+      if ($lev1{$le1}>$t1/2.) {
+         $win1=$le1;
       }
    }
-   $winners=~s/^\,//;
-   print $idx."\t".$winners."\n";
+   print "WIN: $win3, $win2, $win1\n";
 }
 close(IN);
Index: modules/gerd/harvesting/taxonomy.dat
diff -u modules/gerd/harvesting/taxonomy.dat:1.7 modules/gerd/harvesting/taxonomy.dat:1.8
--- modules/gerd/harvesting/taxonomy.dat:1.7	Tue Sep 13 19:45:04 2011
+++ modules/gerd/harvesting/taxonomy.dat	Thu Sep 15 18:40:39 2011
@@ -4,6 +4,7 @@
 physics:mechanics:rotationalkinematics	rotation,turn,turning,angular,speed,velocity,acceleration,angle,angles,degree,degrees,radians,displacement,balance	work,force,torque,atom,quantum	
 physics:mechanics:lineardynamics	force,forces,free,diagram,acceleration,mass,newton,weight	torque,angle,angular,charge,magnetic,atom,quantum,work
 physics:mechanics:rotationaldynamics	torque,angular,acceleration,inertia,moment,rolling,rotate,rotation,rotating,rotational,torques	atom,quantum,momentum
+physics:mechanics:statics
 physics:mechanics:gravity	gravity,gravitational,gravitation,universal,newton,mass,field
 physics:mechanics:linearmomentum	momentum,velocity,mass,collision,collisions,elastic,inelastic,impulse	angular,atom,quantum,inertia,torque
 physics:mechanics:angularmomentum	momentum,velocity,angular,inertia,moment,torque	atom,quantum
@@ -17,6 +18,8 @@
 physics:thermodynamics:heat	capacity,heat,combustion,specific,temperature
 physics:thermodynamics:gaslaws	energy,work,pressure,temperature,gas,ideal,gas,volume,atm
 physics:thermodynamics:processes	adiabatic,isobaric,isometric,process,carnot,isothermal,gas,cylinder,engine,diagram,processes,internal,energy,thermodynamics,exhaust,absorb,work,reservoir,heat,entropy
+physics:thermodynamics:kinetictheory
+physics:thermodynamics:engines
 physics:electromagnetism:electrostatics	electric,charge,coulomb,coulombs,field,charged,charges	magnet,capacitor
 physics:electromagnetism:potentials	voltage,potential,field	current,ampere,amperes,rlc
 physics:electromagnetism:capacitance	capacitance,capacitor,capacitors,charge,plate,farad,field,electric,charged,charging,discharging,plates,dielectric
@@ -31,6 +34,8 @@
 physics:modern:relativity	frame,relative,dilation,contraction,time,space,transform,light,observer,speed,relativity,invariant,invariance,momentum,energy	lens,lenses,diffraction,eye,focus,friction
 physics:modern:quantum	spin,quantum,level,energy,black,body,bohr,heisenberg,atom,atoms,electron,electrons,wave,state,states,uncertainty,spectrum,line,photon,emission,absorption,emitted	isotope,nuclear,neutron,neutrons,compound
 physics:modern:nuclear	nucleus,proton,neutron,decay,halflife,atoms,atomic,fission,fusion,decays,protons,neutrons,isotope,radioactive,beta,gamma,activity,nuclear,mass,radiation,activity,alpha
+physics:modern:solidstate
+physics:modern:highenergy
 chemistry:introduction:units	unit,measurement,mass,fahrenheit,celsius,kelvin,joule,rounding,significant
 chemistry:introduction:states			physics:thermodynamics:states
 chemistry:introduction:mixtures	substance,matter,pure,mixture,mixtures,homogenous,heterogenous
@@ -38,6 +43,7 @@
 chemistry:introduction:periodictable	elements,element,periodic,table,alkali,alkaline,metals,halogens,noble,transition
 chemistry:introduction:massconservation	mass,conservation,reaction,reactions
 chemistry:introduction:energy		exothermic,endothermic,energy,reaction,reactions,heat
+chemistry:thermodynamics:states			physics:thermodynamics:states
 chemistry:matter:nuclear			physics:modern:nuclear
 chemistry:matter:shell			physics:modern:quantum
 chemistry:bonding:ionic	bonding,bond,ions,ionic,compound,compounds
@@ -90,6 +96,7 @@
 mathematics:functions:exponential	function,functions,exponent,exponential,power,exp,exponential
 mathematics:functions:logarithmic	function,functions,logarithm,logarithmic,hyperbolic,logarithmus
 mathematics:functions:extrema	function,minimum,maximum,minima,maxima,extrema,local,global,derivative,ableitung,derivatives,relative,absolute
+mathematics:functions:linear
 mathematics:calculus:extrema			mathematics:functions:extrema
 mathematics:calculus:series	function,series,expansion,taylor,reihe
 mathematics:calculus:integrals	integral,integrals,stammfunktion,integral,grenze,grenzen,area,curve
@@ -116,3 +123,23 @@
 geology:waterresources	water,resource,resources,aquifer,ground,gravel,saturated
 geology:weatheringerosion	weather,weathering,erosion
 geology:windsdeserts	desert,deserts,wind,air,sand,sands,erosion
+history
+accounting
+botany
+computerscience
+computerscience:finiteautomata
+computerscience:booleanlogic
+computerscience:multimedia
+computerscience:programming
+geometry
+astronomy
+design
+medicine
+finance
+advertising
+engineering:civil
+engineering:electrical
+languages:english
+languages:french
+ecology
+nutrition

Index: modules/gerd/harvesting/cor_sanity.pl
+++ modules/gerd/harvesting/cor_sanity.pl
use strict;
use warnings;

# Sanity check for corrections.dat.
#
# Reads the category names from the first tab-separated column of
# taxonomy.dat, then reads corrections.dat (url <TAB> corrected category)
# and reports on STDOUT:
#   - "No correction for <url>"        when the second column is empty
#   - "Cannot find <url>: <category>"  when the category is not known
#
# A corrected category is accepted if it equals a taxonomy entry or one of
# its ancestor levels (for "a:b:c" also "a:b" and "a"). It has to end on a
# level boundary, so a truncated name such as "a:b:c" cut to "a:b:x" or
# "a:" no longer slips through the way a plain prefix match would allow.

# Every known category plus all of its ancestor levels.
my %cats=();
open(my $taxfh,'<','taxonomy.dat') or die("Could not open taxonomy.dat: $!");
while (my $line=<$taxfh>) {
   chomp($line);
   my ($cat)=split(/\t/,$line);
   # Skip blank lines: they carry no category name.
   next unless (defined($cat) && length($cat));
   my @levels=split(/\:/,$cat);
   foreach my $depth (0..$#levels) {
      $cats{join(':',@levels[0..$depth])}=1;
   }
}
close($taxfh);

open(my $corfh,'<','corrections.dat') or die("Could not open corrections.dat: $!");
while (my $line=<$corfh>) {
   chomp($line);
   my ($url,$corrected)=split(/\t/,$line);
   # A blank line yields no fields at all; report it with an empty URL.
   $url='' unless (defined($url));
   unless ($corrected) {
      print "No correction for $url\n";
      # Nothing to look up, so do not also report it as unknown.
      next;
   }
   unless ($cats{$corrected}) {
      print "Cannot find $url: $corrected\n";
   }
}
close($corfh);

Index: modules/gerd/harvesting/reconcile.pl
+++ modules/gerd/harvesting/reconcile.pl
use strict;
use warnings;

# Interactive helper for recording manual taxonomy corrections.
#
# Usage: perl reconcile.pl [filter]
#
# Walks through zeroth.dat (first tab-separated column is a URL). When a
# filter string is given, only lines containing it literally are shown.
# For each shown line the user is prompted with "? ":
#   - an answer containing "q" (any case) or end-of-input ends the session
#   - an answer containing any word character selects the line for correction
#   - anything else (e.g. just Enter) moves on to the next line
# For a selected line every cumulative path level of the URL is listed with
# its index, and at the "> " prompt the user enters "<index> <category>".
# The chosen path (without its leading slash) and the category are appended
# to corrections.dat as one tab-separated record.
my $filter=shift;

open(my $out,'>>','corrections.dat')
   or die("Could not open corrections.dat for appending: $!");
open(my $in,'<','zeroth.dat') or die("Could not open zeroth.dat: $!");
while (my $line=<$in>) {
   if (($filter) && ($line!~/\Q$filter\E/)) { next; }
   print($line);
   print "? ";
   my $judge=<STDIN>;
   # STDIN is exhausted: stop instead of running through all remaining
   # records without any input.
   unless (defined($judge)) { last; }
   if ($judge=~/q/i) { last; }
   unless ($judge=~/\w/) { next; }

   chomp($line);
   my ($url)=split(/\t/,$line);
   $url='' unless (defined($url));

   # Build and display the cumulative path for every level of the URL.
   my $build='';
   my @urllevel=();
   foreach my $part (split(/\//,$url)) {
      $build.='/'.$part;
      push(@urllevel,$build);
      print $#urllevel."\t".$build."\n";
   }

   print "> ";
   my $new=<STDIN>;
   unless (defined($new)) { last; }
   if ($new=~/^(\d+)\s+(\S+)/) {
      my ($level,$class)=($1,$2);
      # Refuse indices that were not listed; they would otherwise produce
      # a record with an empty URL.
      if ($level>$#urllevel) {
         print "No such level: $level\n";
         next;
      }
      my $target=$urllevel[$level];
      $target=~s/^\///;
      print $out $target."\t".$class."\n";
   } elsif ($new=~/^\d/) {
      # A level without a category would produce a record with an empty
      # correction, so nothing is written.
      print "Expected: <level> <category>\n";
   }
}
close($in);
# Buffered write errors surface at close, so check it for the output file.
close($out) or die("Could not write corrections.dat: $!");


More information about the LON-CAPA-cvs mailing list