[LON-CAPA-cvs] cvs: modules /gerd/harvesting rectrans.pl /gerd/harvesting/dbfiles inspect.pl
www
www at source.lon-capa.org
Sat Jun 23 20:27:39 EDT 2012
www Sun Jun 24 00:27:39 2012 EDT
Added files:
/modules/gerd/harvesting rectrans.pl
/modules/gerd/harvesting/dbfiles inspect.pl
Log:
Scripts to generate and inspect database files for the recommender script
Index: modules/gerd/harvesting/rectrans.pl
+++ modules/gerd/harvesting/rectrans.pl
#!/usr/bin/perl
use strict;
use HTML::Entities;
use GDBM_File;
my $datapath='/home/www/loncapa/modules/gerd/harvesting/';
if (0) {
print("Resource data ...\n");
# Get resource URL, etc
my %count=();
tie(%count, 'GDBM_File', $datapath.'dbfiles/count.db',&GDBM_WRCREAT(), 0640);
my %url=();
tie(%url,'GDBM_File', $datapath.'dbfiles/url.db',&GDBM_WRCREAT(), 0640);
open(IN,$datapath.'resource.dat') || print("Could not read resource ...\n");
while (my $line=<IN>) {
my @entries=split(/\t/,$line);
$url{$entries[0]}=$entries[4];
$url{$entries[4]}=$entries[0];
my $cnt=$entries[17];
if ($cnt>0) { $count{$entries[0]}=$entries[17]; }
}
close(IN);
untie(%count);
untie(%url);
# Get keywords
my %keywords=();
tie(%keywords, 'GDBM_File', $datapath.'dbfiles/keywords.db',&GDBM_WRCREAT(), 0640);
#my %revkeywords=();
#tie(%revkeywords, 'GDBM_File', $datapath.'dbfiles/rev_keywords.db',&GDBM_WRCREAT(), 0640);
print("Keywords ...\n");
my %keystr=();
my ($id,$key)=(1,'');
open(IN,$datapath.'keywords.dat') || print("Could not read keywords ...\n");
while (my $line=<IN>) {
my $oid=$id;
chomp($line);
($id,$key)=split(/\t/,$line);
$key=~s/\W//gs;
$key=lc($key);
# if ($key=~/\w/) {
# my %thesekeys=();
# foreach my $tkey (split(/\,/,$revkeywords{$key})) {
# $thesekeys{$tkey}=1;
# }
# $thesekeys{$id}=1;
# $revkeywords{$key}=join(",",keys(%thesekeys));
# }
if ($id!=$oid) {
$keywords{$oid}=join(",",keys(%keystr));
%keystr=();
}
$keystr{$key}=1;
}
close(IN);
untie(%keywords);
#untie(%revkeywords);
# Get combined problems
print("Associations ...\n");
my %assoc=();
tie(%assoc, 'GDBM_File', $datapath.'dbfiles/associations.db',&GDBM_WRCREAT(), 0640);
open(IN,$datapath.'associations.dat') || print("Could not read associations ...\n");
while (my $line=<IN>) {
chomp($line);
my @entries=split(/\t/,$line);
my ($idx1,$idx2)=split(/\,/,$entries[0]);
$assoc{$idx1}.=",".$idx2.':'.$entries[1];
$assoc{$idx2}.=",".$idx1.':'.$entries[1];
}
close(IN);
untie(%assoc);
# Blocked above
}
# Get taxonomy
print("Taxonomies ...\n");
my %taxo1=();
tie(%taxo1, 'GDBM_File', $datapath.'dbfiles/taxo_level1.db',&GDBM_WRCREAT(), 0640);
my %taxo2=();
tie(%taxo2, 'GDBM_File', $datapath.'dbfiles/taxo_level2.db',&GDBM_WRCREAT(), 0640);
my %taxo3=();
tie(%taxo3, 'GDBM_File', $datapath.'dbfiles/taxo_level3.db',&GDBM_WRCREAT(), 0640);
my %restaxo=();
tie(%restaxo, 'GDBM_File', $datapath.'dbfiles/taxonomy.db',&GDBM_WRCREAT(), 0640);
my %totals=();
tie(%totals,'GDBM_File', $datapath.'dbfiles/taxo_categories.db',&GDBM_WRCREAT(), 0640);
my $tid=0;
open(IN,$datapath.'res_taxonomy.dat') || print("Could not read taxonomy ...\n");
while (my $line=<IN>) {
chomp($line);
my @entries=split(/\t/,$line);
my $turl=$entries[0];
my $where1=$turl;
my $where2=$turl;
my $cid1;
my $cid2;
my $cid3;
foreach my $indi (split(/\,/,$entries[1])) {
#print "==== $turl -> $indi\n";
my @sub=split(/\:/,$indi);
if ($sub[0]) {
unless ($totals{$sub[0]}) {
$tid++;
$totals{$sub[0]}=$tid;
}
$cid1=$totals{$sub[0]};
$totals{$cid1}++;
#print "- Level 1: $cid1\n";
if ($sub[1]) {
unless ($totals{$sub[0].':'.$sub[1]}) {
$tid++;
$totals{$sub[0].':'.$sub[1]}=$tid;
}
$cid2=$totals{$sub[0].':'.$sub[1]};
$totals{$cid2}++;
#print "-- Level 2: $cid2\n";
if ($sub[2]) {
unless ($totals{$sub[0].':'.$sub[1].':'.$sub[2]}) {
$tid++;
$totals{$sub[0].':'.$sub[1].':'.$sub[2]}=$tid;
}
$cid3=$totals{$sub[0].':'.$sub[1].':'.$sub[2]};
$totals{$cid3}++;
#print "--- Level 3: $cid3\n";
#print "--- Adding level 3: $turl\n";
$taxo3{$cid3}.=",".$turl;
#print "--- Now level 3: $cid3 -> $taxo3{$cid3}\n";
$where1="3:$cid3";
$where2="3:$cid3";
} else {
$where1="2:$cid2";
}
#print "-- Adding level 2: $where2\n";
my %secondlevel=();
foreach my $entry (split(/\,/,$taxo2{$cid2})) {
$secondlevel{$entry}=1;
}
$secondlevel{$where2}=1;
$taxo2{$cid2}=join(",",keys(%secondlevel));
#print "-- Now level 2: $cid2 -> $taxo2{$cid2}\n";
}
#print "- Adding level 1: $where1\n";
if ($where1=~/\:/) {
my %firstlevel=();
foreach my $entry (split(/\,/,$taxo1{$cid1})) {
$firstlevel{$entry}=1;
}
$firstlevel{$where1}=1;
$taxo1{$cid1}=join(",",sort(keys(%firstlevel)));
} else {
my ($first,$last)=split(/\-/,(split(/\,/,$taxo1{$cid1}))[-1]);
if (($last) && ($where1==$last+1)) {
$taxo1{$cid1}=~s/\,[^\,]+$/\,$first\-$where1/;
} elsif ($where1==$first+1) {
$taxo1{$cid1}.='-'.$where1;
} else {
$taxo1{$cid1}.=','.$where1;
}
}
#print "- Now level 1: $cid1 -> $taxo1{$cid1}\n";
}
$restaxo{$turl}.=",".$totals{$indi};
}
}
untie(%taxo1);
untie(%taxo2);
untie(%taxo3);
untie(%totals);
untie(%restaxo);
Index: modules/gerd/harvesting/dbfiles/inspect.pl
+++ modules/gerd/harvesting/dbfiles/inspect.pl
use GDBM_File;
my $fn=shift;
my $skey=shift;
my %hash;
tie(%hash, 'GDBM_File', $fn,&GDBM_READER(),0640);
if ($skey) {
print "[".$skey."]\t".$hash{$skey}."\n";
} else {
foreach my $key (sort(keys(%hash))) {
print "[".$key."]\t".$hash{$key}."\n";
}
}
untie(%hash);
More information about the LON-CAPA-cvs
mailing list