[LON-CAPA-cvs] cvs: modules /gerd/harvesting rectrans.pl /gerd/harvesting/dbfiles inspect.pl

Sat Jun 23 20:27:39 EDT 2012

www		Sun Jun 24 00:27:39 2012 EDT

  Added files:                 
    /modules/gerd/harvesting	rectrans.pl 
    /modules/gerd/harvesting/dbfiles	inspect.pl 
  Log:
  Scripts to generate and inspect database files for the recommender script
  
  
Index: modules/gerd/harvesting/rectrans.pl
+++ modules/gerd/harvesting/rectrans.pl
#!/usr/bin/perl
use strict;
use HTML::Entities;
use GDBM_File;

my $datapath='/home/www/loncapa/modules/gerd/harvesting/';

if (0) {

print("Resource data ...\n");

# Get resource URL, etc
my %count=();
tie(%count, 'GDBM_File', $datapath.'dbfiles/count.db',&GDBM_WRCREAT(), 0640);

my %url=();
tie(%url,'GDBM_File', $datapath.'dbfiles/url.db',&GDBM_WRCREAT(), 0640);

open(IN,$datapath.'resource.dat') || print("Could not read resource ...\n");
while (my $line=<IN>) {
   my @entries=split(/\t/,$line);
   $url{$entries[0]}=$entries[4];
   $url{$entries[4]}=$entries[0];
   my $cnt=$entries[17];
   if ($cnt>0) { $count{$entries[0]}=$entries[17]; }
}
close(IN);
untie(%count);
untie(%url);

# Get keywords

my %keywords=();
tie(%keywords, 'GDBM_File', $datapath.'dbfiles/keywords.db',&GDBM_WRCREAT(), 0640);
#my %revkeywords=();
#tie(%revkeywords, 'GDBM_File', $datapath.'dbfiles/rev_keywords.db',&GDBM_WRCREAT(), 0640);


print("Keywords ...\n");
my %keystr=();
my ($id,$key)=(1,'');
open(IN,$datapath.'keywords.dat') || print("Could not read keywords ...\n");
while (my $line=<IN>) {
   my $oid=$id;
   chomp($line);
   ($id,$key)=split(/\t/,$line);
   $key=~s/\W//gs;
   $key=lc($key);
#   if ($key=~/\w/) {
#      my %thesekeys=();
#      foreach my $tkey (split(/\,/,$revkeywords{$key})) {
#         $thesekeys{$tkey}=1;
#      }
#      $thesekeys{$id}=1;
#      $revkeywords{$key}=join(",",keys(%thesekeys));
#   }
   if ($id!=$oid) {
      $keywords{$oid}=join(",",keys(%keystr));
      %keystr=();
   }
   $keystr{$key}=1;
}
close(IN);
untie(%keywords);
#untie(%revkeywords);

# Get combined problems
print("Associations ...\n");
my %assoc=();
tie(%assoc, 'GDBM_File', $datapath.'dbfiles/associations.db',&GDBM_WRCREAT(), 0640);

open(IN,$datapath.'associations.dat') || print("Could not read associations ...\n");
while (my $line=<IN>) {
   chomp($line);
   my @entries=split(/\t/,$line);
   my ($idx1,$idx2)=split(/\,/,$entries[0]);
   $assoc{$idx1}.=",".$idx2.':'.$entries[1];
   $assoc{$idx2}.=",".$idx1.':'.$entries[1];
}
close(IN);
untie(%assoc);

# Blocked above
}

# Get taxonomy

print("Taxonomies ...\n");
my %taxo1=();
tie(%taxo1, 'GDBM_File', $datapath.'dbfiles/taxo_level1.db',&GDBM_WRCREAT(), 0640);

my %taxo2=();
tie(%taxo2, 'GDBM_File', $datapath.'dbfiles/taxo_level2.db',&GDBM_WRCREAT(), 0640);

my %taxo3=();
tie(%taxo3, 'GDBM_File', $datapath.'dbfiles/taxo_level3.db',&GDBM_WRCREAT(), 0640);


my %restaxo=();
tie(%restaxo, 'GDBM_File', $datapath.'dbfiles/taxonomy.db',&GDBM_WRCREAT(), 0640);

my %totals=();
tie(%totals,'GDBM_File', $datapath.'dbfiles/taxo_categories.db',&GDBM_WRCREAT(), 0640);

my $tid=0;

open(IN,$datapath.'res_taxonomy.dat') || print("Could not read taxonomy ...\n");
while (my $line=<IN>) {
   chomp($line);
   my @entries=split(/\t/,$line);
   my $turl=$entries[0];
   my $where1=$turl;
   my $where2=$turl;
   my $cid1;
   my $cid2;
   my $cid3;
   foreach my $indi (split(/\,/,$entries[1])) {
      #print "==== $turl -> $indi\n";
      my @sub=split(/\:/,$indi);
      if ($sub[0]) {
         unless ($totals{$sub[0]}) {
            $tid++;
            $totals{$sub[0]}=$tid;
         }
         $cid1=$totals{$sub[0]};
         $totals{$cid1}++;
         #print "- Level 1: $cid1\n";
         if ($sub[1]) {
            unless ($totals{$sub[0].':'.$sub[1]}) {
               $tid++;
               $totals{$sub[0].':'.$sub[1]}=$tid;
            }
            $cid2=$totals{$sub[0].':'.$sub[1]};
            $totals{$cid2}++;
            #print "-- Level 2: $cid2\n";
            if ($sub[2]) {
               unless ($totals{$sub[0].':'.$sub[1].':'.$sub[2]}) {
                  $tid++;
                  $totals{$sub[0].':'.$sub[1].':'.$sub[2]}=$tid;
               }
               $cid3=$totals{$sub[0].':'.$sub[1].':'.$sub[2]};
               $totals{$cid3}++;
               #print "--- Level 3: $cid3\n";
               #print "--- Adding level 3: $turl\n";
               $taxo3{$cid3}.=",".$turl;
               #print "--- Now level 3: $cid3 -> $taxo3{$cid3}\n";
               $where1="3:$cid3";
               $where2="3:$cid3";
            } else {
               $where1="2:$cid2";
            }
            #print "-- Adding level 2: $where2\n";
            my %secondlevel=();
            foreach my $entry (split(/\,/,$taxo2{$cid2})) {
               $secondlevel{$entry}=1;
            }
            $secondlevel{$where2}=1;
            $taxo2{$cid2}=join(",",keys(%secondlevel));
            #print "-- Now level 2: $cid2 -> $taxo2{$cid2}\n";
         }
         #print "- Adding level 1: $where1\n";
         if ($where1=~/\:/) {
            my %firstlevel=();
            foreach my $entry (split(/\,/,$taxo1{$cid1})) {
               $firstlevel{$entry}=1;
            }
            $firstlevel{$where1}=1;
            $taxo1{$cid1}=join(",",sort(keys(%firstlevel)));
         } else {
            my ($first,$last)=split(/\-/,(split(/\,/,$taxo1{$cid1}))[-1]);
            if (($last) && ($where1==$last+1)) {
                $taxo1{$cid1}=~s/\,[^\,]+$/\,$first\-$where1/;
            } elsif ($where1==$first+1) {
                $taxo1{$cid1}.='-'.$where1;
            } else {
                $taxo1{$cid1}.=','.$where1;
            }
         }
         #print "- Now level 1: $cid1 -> $taxo1{$cid1}\n";
      }
      $restaxo{$turl}.=",".$totals{$indi};
   }
}
untie(%taxo1);
untie(%taxo2);
untie(%taxo3);
untie(%totals);
untie(%restaxo);

Index: modules/gerd/harvesting/dbfiles/inspect.pl
+++ modules/gerd/harvesting/dbfiles/inspect.pl
use GDBM_File;
my $fn=shift;
my $skey=shift;
my %hash;
tie(%hash, 'GDBM_File', $fn,&GDBM_READER(),0640);
if ($skey) {
   print "[".$skey."]\t".$hash{$skey}."\n";
} else {
   foreach my $key (sort(keys(%hash))) {
      print "[".$key."]\t".$hash{$key}."\n";
   }
}
untie(%hash);