[LON-CAPA-cvs] cvs: modules /gerd/harvesting combine.pl cor_sanity.pl finalize.pl reconcile.pl taxonomy.dat
www
www at source.lon-capa.org
Thu Sep 15 14:40:39 EDT 2011
www Thu Sep 15 18:40:39 2011 EDT
Added files:
/modules/gerd/harvesting cor_sanity.pl reconcile.pl
Modified files:
/modules/gerd/harvesting combine.pl finalize.pl taxonomy.dat
Log:
Almost done with taxonomy!
-------------- next part --------------
Index: modules/gerd/harvesting/combine.pl
diff -u modules/gerd/harvesting/combine.pl:1.1 modules/gerd/harvesting/combine.pl:1.2
--- modules/gerd/harvesting/combine.pl:1.1 Wed Sep 14 17:16:52 2011
+++ modules/gerd/harvesting/combine.pl Thu Sep 15 18:40:39 2011
@@ -3,6 +3,7 @@
my @seq=();
my @dir=();
my @top=();
+my @man=();
open(IN,'problem_taxonomy.dat') || die("Could not open seq file");
while (my $line=<IN>) {
@@ -29,8 +30,17 @@
}
close(IN);
+open(IN,'manual.dat') || die("Could not open manual file");
+while (my $line=<IN>) {
+ chomp($line);
+ my ($idx,$tax)=split(/\t/,$line);
+ $man[$idx]=$tax;
+}
+close(IN);
+
+
for (my $i=0; $i<=$#dir; $i++) {
- my $comb=$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
+ my $comb=$man[$i].','.$seq[$i].','.$dir[$i].','.($top[$i]?$top[$i].':4':'');
# print "-> $comb\n";
my %cats=();
foreach my $keycnt (split(/\,/,$comb)) {
Index: modules/gerd/harvesting/finalize.pl
diff -u modules/gerd/harvesting/finalize.pl:1.1 modules/gerd/harvesting/finalize.pl:1.2
--- modules/gerd/harvesting/finalize.pl:1.1 Wed Sep 14 17:16:52 2011
+++ modules/gerd/harvesting/finalize.pl Thu Sep 15 18:40:39 2011
@@ -10,55 +10,60 @@
foreach my $taxcnt (split(/\,/,$tax)) {
my ($itax,$cnt)=($taxcnt=~/^(.+)\:(\d+)$/);
my @levels=split(/\:/,$itax);
- $lev1{$levels[0]}+=3.*$cnt;
+ $lev1{$levels[0]}+=$cnt;
if ($levels[1]) {
- $lev2{$levels[0].':'.$levels[1]}+=2.*$cnt;
+ $lev2{$levels[0].':'.$levels[1]}+=$cnt;
}
if ($levels[2]) {
$lev3{$levels[0].':'.$levels[1].':'.$levels[2]}+=$cnt;
}
}
-# print "-> $tax\n";
- my $winners='';
-# We do not want more than two level 3 taxonomies
- my $levthree=0;
+ print "------ $tax\n";
+
+
+
+# Three levels deep
+
+ my $t3=0;
+ my $win3='';
foreach my $le3 (sort(keys(%lev3))) {
- if ($lev3{$le3}>=4) {
- $levthree++;
- }
+ print $le3."\t".$lev3{$le3}."\n";
+ $t3+=$lev3{$le3};
}
- if ($levthree<3) {
- foreach my $le3 (sort(keys(%lev3))) {
- if ($lev3{$le3}>=4) {
- $winners.=','.$le3;
- my ($l1,$l2)=split(/\:/,$le3);
- $lev2{$l1.':'.$l2}=0;
- $lev1{$l1}=0;
- }
+ foreach my $le3 (sort(keys(%lev3))) {
+ if ($lev3{$le3}>$t3/2.) {
+ $win3=$le3;
}
}
-# We do not want more than two level 2 taxonomies, either
- my $levtwo=0;
+
+# Two levels deep
+
+ my $t2=0;
+ my $win2='';
foreach my $le2 (sort(keys(%lev2))) {
- if ($lev2{$le2}>=4) {
- $levtwo++;
- }
+ print $le2."\t".$lev2{$le2}."\n";
+ $t2+=$lev2{$le2};
}
- if ($levtwo<3) {
- foreach my $le2 (sort(keys(%lev2))) {
- if ($lev2{$le2}>=4) {
- $winners.=','.$le2;
- my ($l1)=split(/\:/,$le2);
- $lev1{$l1}=0;
- }
+ foreach my $le2 (sort(keys(%lev2))) {
+ if ($lev2{$le2}>$t2/2.) {
+ $win2=$le2;
}
}
+
+
+# One level deep
+
+ my $t1=0;
+ my $win1='';
+ foreach my $le1 (sort(keys(%lev1))) {
+ print $le1."\t".$lev1{$le1}."\n";
+ $t1+=$lev1{$le1};
+ }
foreach my $le1 (sort(keys(%lev1))) {
- if ($lev1{$le1}>=4) {
- $winners.=','.$le1;
+ if ($lev1{$le1}>$t1/2.) {
+ $win1=$le1;
}
}
- $winners=~s/^\,//;
- print $idx."\t".$winners."\n";
+ print "WIN: $win3, $win2, $win1\n";
}
close(IN);
Index: modules/gerd/harvesting/taxonomy.dat
diff -u modules/gerd/harvesting/taxonomy.dat:1.7 modules/gerd/harvesting/taxonomy.dat:1.8
--- modules/gerd/harvesting/taxonomy.dat:1.7 Tue Sep 13 19:45:04 2011
+++ modules/gerd/harvesting/taxonomy.dat Thu Sep 15 18:40:39 2011
@@ -4,6 +4,7 @@
physics:mechanics:rotationalkinematics rotation,turn,turning,angular,speed,velocity,acceleration,angle,angles,degree,degrees,radians,displacement,balance work,force,torque,atom,quantum
physics:mechanics:lineardynamics force,forces,free,diagram,acceleration,mass,newton,weight torque,angle,angular,charge,magnetic,atom,quantum,work
physics:mechanics:rotationaldynamics torque,angular,acceleration,inertia,moment,rolling,rotate,rotation,rotating,rotational,torques atom,quantum,momentum
+physics:mechanics:statics
physics:mechanics:gravity gravity,gravitational,gravitation,universal,newton,mass,field
physics:mechanics:linearmomentum momentum,velocity,mass,collision,collisions,elastic,inelastic,impulse angular,atom,quantum,inertia,torque
physics:mechanics:angularmomentum momentum,velocity,angular,inertia,moment,torque atom,quantum
@@ -17,6 +18,8 @@
physics:thermodynamics:heat capacity,heat,combustion,specific,temperature
physics:thermodynamics:gaslaws energy,work,pressure,temperature,gas,ideal,gas,volume,atm
physics:thermodynamics:processes adiabatic,isobaric,isometric,process,carnot,isothermal,gas,cylinder,engine,diagram,processes,internal,energy,thermodynamics,exhaust,absorb,work,reservoir,heat,entropy
+physics:thermodynamics:kinetictheory
+physics:thermodynamics:engines
physics:electromagnetism:electrostatics electric,charge,coulomb,coulombs,field,charged,charges magnet,capacitor
physics:electromagnetism:potentials voltage,potential,field current,ampere,amperes,rlc
physics:electromagnetism:capacitance capacitance,capacitor,capacitors,charge,plate,farad,field,electric,charged,charging,discharging,plates,dielectric
@@ -31,6 +34,8 @@
physics:modern:relativity frame,relative,dilation,contraction,time,space,transform,light,observer,speed,relativity,invariant,invariance,momentum,energy lens,lenses,diffraction,eye,focus,friction
physics:modern:quantum spin,quantum,level,energy,black,body,bohr,heisenberg,atom,atoms,electron,electrons,wave,state,states,uncertainty,spectrum,line,photon,emission,absorption,emitted isotope,nuclear,neutron,neutrons,compound
physics:modern:nuclear nucleus,proton,neutron,decay,halflife,atoms,atomic,fission,fusion,decays,protons,neutrons,isotope,radioactive,beta,gamma,activity,nuclear,mass,radiation,activity,alpha
+physics:modern:solidstate
+physics:modern:highenergy
chemistry:introduction:units unit,measurement,mass,fahrenheit,celsius,kelvin,joule,rounding,significant
chemistry:introduction:states physics:thermodynamics:states
chemistry:introduction:mixtures substance,matter,pure,mixture,mixtures,homogenous,heterogenous
@@ -38,6 +43,7 @@
chemistry:introduction:periodictable elements,element,periodic,table,alkali,alkaline,metals,halogens,noble,transition
chemistry:introduction:massconservation mass,conservation,reaction,reactions
chemistry:introduction:energy exothermic,endothermic,energy,reaction,reactions,heat
+chemistry:thermodynamics:states physics:thermodynamics:states
chemistry:matter:nuclear physics:modern:nuclear
chemistry:matter:shell physics:modern:quantum
chemistry:bonding:ionic bonding,bond,ions,ionic,compound,compounds
@@ -90,6 +96,7 @@
mathematics:functions:exponential function,functions,exponent,exponential,power,exp,exponential
mathematics:functions:logarithmic function,functions,logarithm,logarithmic,hyperbolic,logarithmus
mathematics:functions:extrema function,minimum,maximum,minima,maxima,extrema,local,global,derivative,ableitung,derivatives,relative,absolute
+mathematics:functions:linear
mathematics:calculus:extrema mathematics:functions:extrema
mathematics:calculus:series function,series,expansion,taylor,reihe
mathematics:calculus:integrals integral,integrals,stammfunktion,integral,grenze,grenzen,area,curve
@@ -116,3 +123,23 @@
geology:waterresources water,resource,resources,aquifer,ground,gravel,saturated
geology:weatheringerosion weather,weathering,erosion
geology:windsdeserts desert,deserts,wind,air,sand,sands,erosion
+history
+accounting
+botany
+computerscience
+computerscience:finiteautomata
+computerscience:booleanlogic
+computerscience:multimedia
+computerscience:programming
+geometry
+astronomy
+design
+medicine
+finance
+advertising
+engineering:civil
+engineering:electrical
+languages:english
+languages:french
+ecology
+nutrition
Index: modules/gerd/harvesting/cor_sanity.pl
+++ modules/gerd/harvesting/cor_sanity.pl
use strict;
use warnings;

# Sanity check for corrections.dat.
#
# Reads the category paths from the first tab-separated column of
# taxonomy.dat, then reads corrections.dat (URL <tab> corrected category)
# and reports on STDOUT
#   - rows that carry no corrected category, and
#   - rows whose corrected category is neither a taxonomy entry nor a
#     complete leading part of one (e.g. "physics" or "physics:mechanics"
#     for the entry "physics:mechanics:statics").
# Both files are expected in the current directory.

# Every taxonomy entry plus each of its ancestors, for direct lookup.
my %known=();

open(my $taxfh,'<','taxonomy.dat')
    or die("Could not open taxonomy.dat: $!");
while (my $line=<$taxfh>) {
    chomp($line);
    my ($cat)=split(/\t/,$line);
    # Blank lines carry no category
    next unless (defined($cat) && $cat ne '');
    # Register the entry and all its ancestors. Matching on whole levels
    # keeps truncated names such as "physics:mech" from passing as valid.
    my @levels=split(/\:/,$cat);
    foreach my $depth (0..$#levels) {
        $known{join(':',@levels[0..$depth])}=1;
    }
}
close($taxfh);

open(my $corfh,'<','corrections.dat')
    or die("Could not open corrections.dat: $!");
while (my $line=<$corfh>) {
    chomp($line);
    my ($url,$corrected)=split(/\t/,$line);
    $url='' unless (defined($url));
    unless (defined($corrected) && $corrected ne '') {
        print "No correction for $url\n";
        # Nothing to look up for this row
        next;
    }
    unless ($known{$corrected}) {
        print "Cannot find $url: $corrected\n";
    }
}
close($corfh);
Index: modules/gerd/harvesting/reconcile.pl
+++ modules/gerd/harvesting/reconcile.pl
use strict;
use warnings;
use IO::Handle;

# Interactive helper that appends manual corrections to corrections.dat.
#
# Usage: reconcile.pl [filter]
#
# Walks through zeroth.dat (URL <tab> assigned value) line by line; when a
# filter string is given, only lines containing it are shown. For every
# line shown:
#   "? " prompt - an answer containing "q" (or end of input) stops the run,
#                 an empty answer moves on to the next line, any other
#                 answer starts a correction for the line.
#   "> " prompt - the cumulative path levels of the URL are listed with
#                 their index; answer "<index> <class>" to append
#                 "<path at that index> <tab> <class>" to corrections.dat.
#                 Answers that do not start with a digit write nothing.
# All files are expected in the current directory.

my $filter=shift;

open(my $out,'>>','corrections.dat')
    or die("Could not open corrections.dat for appending: $!");
# Write each correction through immediately so an interrupted session
# does not lose the answers already given.
$out->autoflush(1);

open(my $in,'<','zeroth.dat')
    or die("Could not open zeroth.dat: $!");
while (my $line=<$in>) {
    if (($filter) && ($line!~/\Q$filter\E/)) { next; }
    print($line);
    print "? ";
    my $judge=<STDIN>;
    # End of input counts as quitting; otherwise every remaining line
    # would scroll past without anybody answering.
    last unless (defined($judge));
    if ($judge=~/q/i) { last; }
    next unless ($judge=~/\w/);

    chomp($line);
    my ($url)=split(/\t/,$line);
    next unless (defined($url));

    # List every cumulative path level of the URL with its index
    my $pi=0;
    my $build='';
    my @urllevel=();
    foreach my $part (split(/\//,$url)) {
        $build.='/'.$part;
        print $pi."\t".$build."\n";
        $urllevel[$pi]=$build;
        $pi++;
    }

    print "> ";
    my $new=<STDIN>;
    last unless (defined($new));
    if ($new=~/^\d/) {
        # Require a plain index followed by a class, so that no
        # malformed row ends up in corrections.dat
        my ($level,$class)=($new=~/^(\d+)\s+(\S+)/);
        unless (defined($class)) {
            print "Expected: <index> <class> - nothing written\n";
            next;
        }
        if ($level>$#urllevel) {
            print "No level $level for this URL - nothing written\n";
            next;
        }
        # The listing is built with a leading slash; the stored path
        # does not carry it
        (my $target=$urllevel[$level])=~s/^\///;
        print {$out} $target."\t".$class."\n";
    }
}
close($in);
close($out) or die("Could not write corrections.dat: $!");
More information about the LON-CAPA-cvs
mailing list