[LON-CAPA-cvs] cvs: modules /droeschl/Metadata create.sql harvestrun.pl /droeschl/Metadata/lib/Metadata Clean.pm Harvest.pm Load.pm /droeschl/Metadata/t 10-clean 10-harvest clean.test corrupt.test misc.t misc.t notcorrupt.test pod-coverage.t pod.t useforloadtest.test
droeschl
droeschl@source.lon-capa.org
Mon, 16 Aug 2010 20:54:26 -0000
This is a MIME encoded message
--droeschl1281992066
Content-Type: text/plain
droeschl Mon Aug 16 20:54:26 2010 EDT
Added files:
/modules/droeschl/Metadata create.sql harvestrun.pl
/modules/droeschl/Metadata/lib/Metadata Clean.pm Harvest.pm Load.pm
/modules/droeschl/Metadata/t misc.t misc.t clean.test corrupt.test
notcorrupt.test pod-coverage.t pod.t
useforloadtest.test
Modified files:
/modules/droeschl/Metadata/t 10-clean 10-harvest
Log:
Metadata Harvesting.
--droeschl1281992066
Content-Type: text/plain
Content-Disposition: attachment; filename="droeschl-20100816205426.txt"
Index: modules/droeschl/Metadata/create.sql
+++ modules/droeschl/Metadata/create.sql
-- Save current session settings and relax integrity checks while the
-- schema is (re)created; the saved values are restored at the end of the script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';
CREATE SCHEMA IF NOT EXISTS `metadata` DEFAULT CHARACTER SET utf8 ;
USE `metadata` ;
-- -----------------------------------------------------
-- Table `metadata`.`resource`
-- -----------------------------------------------------
-- Main table: one row per harvested resource.  `key` is a surrogate primary
-- key (assigned externally by the loader; presumably the input line number --
-- see Metadata::Load); `url` is the natural key and therefore UNIQUE.
-- NOTE(review): ENGINE = MyISAM parses but does NOT enforce the FOREIGN KEY
-- constraints declared in the child tables below; they act as documentation
-- only and integrity must be guaranteed by the loading code.
DROP TABLE IF EXISTS `metadata`.`resource` ;
CREATE TABLE IF NOT EXISTS `metadata`.`resource` (
`key` INT NOT NULL ,
`title` TEXT NULL ,
`author` TEXT NULL ,
`subject` TEXT NULL ,
`url` VARCHAR(255) NOT NULL ,
`note` TEXT NULL ,
`abstract` TEXT NULL ,
`mime` TEXT NULL ,
`language` TEXT NULL ,
`creationdate` DATETIME NULL ,
`lastrevisiondate` DATETIME NULL ,
`owner` TEXT NULL ,
`domain` VARCHAR(45) NULL ,
`dependencies` TEXT NULL ,
`lowestgradelevel` INT NULL ,
`highestgradelevel` INT NULL ,
`standards` TEXT NULL ,
`count` INT NULL ,
`course` INT NULL ,
`goto` INT NULL ,
`comefrom` INT NULL ,
`sequsage` INT NULL ,
`stdno` INT NULL ,
`avetries` FLOAT NULL ,
`difficulty` FLOAT NULL ,
PRIMARY KEY (`key`) ,
UNIQUE INDEX `url_UNIQUE` (`url` ASC) )
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`authorspace`
-- -----------------------------------------------------
-- 1:1 detail table: the authorspace (user + domain) a resource belongs to.
-- Shares its primary key with `resource`.
DROP TABLE IF EXISTS `metadata`.`authorspace` ;
CREATE TABLE IF NOT EXISTS `metadata`.`authorspace` (
`key` INT NOT NULL ,
`domain` VARCHAR(45) NOT NULL ,
`user` VARCHAR(45) NOT NULL ,
PRIMARY KEY (`key`) ,
INDEX `fk_authorspace_resource1` (`key` ASC) ,
CONSTRAINT `fk_authorspace_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`avetries_list`
-- -----------------------------------------------------
-- 1:n detail table: distinct average-tries values per resource with an
-- occurrence count; (avetries, key) together form the primary key.
DROP TABLE IF EXISTS `metadata`.`avetries_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`avetries_list` (
`key` INT NOT NULL ,
`avetries` FLOAT NOT NULL ,
`count` INT NULL ,
PRIMARY KEY (`avetries`, `key`) ,
INDEX `fk_avetries_list_Resource1` (`key` ASC) ,
CONSTRAINT `fk_avetries_list_Resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`comefrom_list`
-- -----------------------------------------------------
-- 1:n detail table: URLs of resources that link TO this resource.
-- The second FK references `resource`.`url` (covered by its UNIQUE index).
DROP TABLE IF EXISTS `metadata`.`comefrom_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`comefrom_list` (
`key` INT NOT NULL ,
`comefrom_url` VARCHAR(255) NOT NULL ,
PRIMARY KEY (`key`, `comefrom_url`) ,
INDEX `fk_comefrom_list_Resource1` (`key` ASC) ,
INDEX `fk_comefrom_list_Resource2` (`comefrom_url` ASC) ,
CONSTRAINT `fk_comefrom_list_Resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_comefrom_list_Resource2`
FOREIGN KEY (`comefrom_url` )
REFERENCES `metadata`.`resource` (`url` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`copyright`
-- -----------------------------------------------------
-- 1:n detail table: copyright tags attached to a resource.
DROP TABLE IF EXISTS `metadata`.`copyright` ;
CREATE TABLE IF NOT EXISTS `metadata`.`copyright` (
`key` INT NOT NULL ,
`copyright` VARCHAR(45) NOT NULL ,
INDEX `fk_copyright_resource1` (`key` ASC) ,
PRIMARY KEY (`key`, `copyright`) ,
CONSTRAINT `fk_copyright_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`course_list`
-- -----------------------------------------------------
-- 1:n detail table: courses a resource is used in.
DROP TABLE IF EXISTS `metadata`.`course_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`course_list` (
`key` INT NOT NULL ,
`course` VARCHAR(45) NOT NULL ,
INDEX `fk_course_list_resource1` (`key` ASC) ,
PRIMARY KEY (`course`, `key`) ,
CONSTRAINT `fk_course_list_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`difficulty_list`
-- -----------------------------------------------------
-- 1:n detail table: distinct difficulty values per resource with a count.
DROP TABLE IF EXISTS `metadata`.`difficulty_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`difficulty_list` (
`key` INT NOT NULL ,
`difficulty` FLOAT NOT NULL ,
`count` INT NULL ,
INDEX `fk_difficulty_list_resource1` (`key` ASC) ,
PRIMARY KEY (`key`, `difficulty`) ,
CONSTRAINT `fk_difficulty_list_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`disc_list`
-- -----------------------------------------------------
-- 1:n detail table: distinct discrimination values per resource with a count.
DROP TABLE IF EXISTS `metadata`.`disc_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`disc_list` (
`key` INT NOT NULL ,
`disc` FLOAT NOT NULL ,
`count` INT NULL ,
PRIMARY KEY (`key`, `disc`) ,
INDEX `fk_disc_list_resource1` (`key` ASC) ,
CONSTRAINT `fk_disc_list_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`evaluation`
-- -----------------------------------------------------
-- 1:1 detail table: user-evaluation scores for a resource
-- (clear/correct/depth/helpful/technical).
DROP TABLE IF EXISTS `metadata`.`evaluation` ;
CREATE TABLE IF NOT EXISTS `metadata`.`evaluation` (
`key` INT NOT NULL ,
`clear` FLOAT NULL ,
`correct` FLOAT NULL ,
`depth` FLOAT NULL ,
`helpful` FLOAT NULL ,
`technical` FLOAT NULL ,
INDEX `fk_evaluation_resource1` (`key` ASC) ,
PRIMARY KEY (`key`) ,
CONSTRAINT `fk_evaluation_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`goto_list`
-- -----------------------------------------------------
-- 1:n detail table: URLs this resource links to; second FK targets the
-- UNIQUE `resource`.`url` column.
DROP TABLE IF EXISTS `metadata`.`goto_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`goto_list` (
`key` INT NOT NULL ,
`goto_url` VARCHAR(255) NOT NULL ,
PRIMARY KEY (`key`, `goto_url`) ,
INDEX `fk_goto_list_resource1` (`key` ASC) ,
INDEX `fk_goto_list_resource2` (`goto_url` ASC) ,
CONSTRAINT `fk_goto_list_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_goto_list_resource2`
FOREIGN KEY (`goto_url` )
REFERENCES `metadata`.`resource` (`url` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`keywords`
-- -----------------------------------------------------
-- 1:n detail table: individual keywords attached to a resource.
DROP TABLE IF EXISTS `metadata`.`keywords` ;
CREATE TABLE IF NOT EXISTS `metadata`.`keywords` (
`key` INT NOT NULL ,
`keyword` VARCHAR(100) NOT NULL ,
PRIMARY KEY (`key`, `keyword`) ,
INDEX `fk_table1_resource1` (`key` ASC) ,
CONSTRAINT `fk_table1_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`modifyinguser`
-- -----------------------------------------------------
-- 1:1 detail table: last modifying user (user + domain) of a resource.
DROP TABLE IF EXISTS `metadata`.`modifyinguser` ;
CREATE TABLE IF NOT EXISTS `metadata`.`modifyinguser` (
`key` INT NOT NULL ,
`user` VARCHAR(45) NULL ,
`domain` VARCHAR(45) NULL ,
PRIMARY KEY (`key`) ,
INDEX `fk_modifyinguser_resource1` (`key` ASC) ,
CONSTRAINT `fk_modifyinguser_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- -----------------------------------------------------
-- Table `metadata`.`sequsage_list`
-- -----------------------------------------------------
-- 1:n detail table: URLs of sequences this resource is used in; second FK
-- targets the UNIQUE `resource`.`url` column.
DROP TABLE IF EXISTS `metadata`.`sequsage_list` ;
CREATE TABLE IF NOT EXISTS `metadata`.`sequsage_list` (
`key` INT NOT NULL ,
`sequsage_url` VARCHAR(255) NOT NULL ,
PRIMARY KEY (`key`, `sequsage_url`) ,
INDEX `fk_sequsage_list_resource1` (`key` ASC) ,
INDEX `fk_sequsage_list_resource2` (`sequsage_url` ASC) ,
CONSTRAINT `fk_sequsage_list_resource1`
FOREIGN KEY (`key` )
REFERENCES `metadata`.`resource` (`key` )
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_sequsage_list_resource2`
FOREIGN KEY (`sequsage_url` )
REFERENCES `metadata`.`resource` (`url` )
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;
-- Restore the session settings saved at the top of the script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
Index: modules/droeschl/Metadata/harvestrun.pl
+++ modules/droeschl/Metadata/harvestrun.pl
#!/usr/bin/perl
# Driver script: harvest (currently disabled for local runs), clean, sort and
# load the LON-CAPA metadata.  Paths are hard-coded for a local test setup
# (see the /Users/stefan/... constants below).
use strict;
use warnings;
use lib 'lib/';
#use Metadata::Harvest;
use Metadata::Clean;
use Metadata::Load;
my $file;
my $dir = '/Users/stefan/tests/';
#disabled locally
#my $h = Metadata::Harvest->new();
#print "Harvesting...\n";
#$file = $h->harvest();
# when harvesting is disabled, start from a previously merged datafile
$file = "/Users/stefan/tests/merge.dat";
my $c = Metadata::Clean->new(file =>$file,
out =>$dir,
debug =>10,
remove=>1, );
print "Cleaning...\n";
$file = $c->clean();
print "Sorting...\n"; #sort -u similar to sort $file | uniq
system("/usr/bin/sort","-u", "-o",$file, $file) == 0 #but uniq shortens lines!
or die $?; #unfortunately this has to be done 'manually' as of yet
my $l = Metadata::Load->new( debug =>10,
intodb =>0,
out =>$dir,
file =>$file,
remove =>1, );
print "Loading...\n";
print "Files can be found in:" . $l->load();
Index: modules/droeschl/Metadata/lib/Metadata/Clean.pm
+++ modules/droeschl/Metadata/lib/Metadata/Clean.pm
package Metadata::Clean;
use warnings;
use strict;
use lib '/home/httpd/lib/perl';
use Encode qw(decode encode_utf8 decode_utf8);
use Data::Dumper;
eval "use LONCAPA::Configuration";
=head1 NAME
Metadata::Clean - cleans harvested metadata
=head1 VERSION
Version 1.00
=cut
our $VERSION = '1.00';
=head1 SYNOPSIS
use Metadata::Clean;
my $c = Metadata::Clean->new(file=>$filename);
...
=head1 SUBROUTINES/METHODS
=head2 new
This is the constructor.
my $h = Metadata::Clean->new();
=over
=item file (mandatory)
Filename of the file to clean.
Most probably output of Metadata::Harvest::harvest()
=item remove
If set to a true value $file will be deleted after processing.
=item debug
The debug option sets the debug level. Default is 0.
=item out
The out option determines the output directory.
By default it is set to the tmp directory.
=item log
Sets logfile. Default /home/httpd/perl/logs/clean.log
=back
=cut
#TODO remove duplicate rows
#TODO remove directories
#TODO remove code for local execution
sub new {
    my $class  = shift;
    my $config = shift;
    my $self   = bless {}, $class;
    # arguments may be passed as new({key=>val}) or new(key=>val)
    $config = { $config, @_ } if ref($config) ne 'HASH' and @_;
    die "File $config->{file} does not exist" unless -e $config->{file};
    my %lcconf;
    # read_conf only exists when LONCAPA::Configuration could be loaded
    # (it is pulled in via eval at the top of this module)
    if (defined &LONCAPA::Configuration::read_conf){
        %lcconf = %{LONCAPA::Configuration::read_conf('loncapa.conf')};
    }
    $self->{FILE}   = $config->{file};
    $self->{DEBUG}  = $config->{debug}  || 0 ;
    $self->{UNLINK} = $config->{remove} || 0 ;
    # BUG FIX: '||' binds tighter than '?:', so the previous
    #   $config->{log} || %lcconf ? A : B
    # parsed as ($config->{log} || %lcconf) ? A : B and silently discarded a
    # caller-supplied log/out option.  Parentheses restore the intent:
    # use the caller's value if given, otherwise fall back to the LON-CAPA
    # directories (or a local default when no loncapa.conf is available).
    $self->{LOG} = $config->{log} || (%lcconf ? $lcconf{lonDaemons} . '/logs/clean.log' : 'clean.log');
    $self->{OUT} = $config->{out} || (%lcconf ? $lcconf{lonDaemons} . '/tmp/'           : '');
    return $self;
}
=head2 clean
This does the actual cleaning.
=cut
sub clean{
my ($self) = @_;
# Reads FILE line by line, converts it from comma-separated to tab-separated
# with MySQL-style \N NULL markers, unescapes URL escapes, and writes the
# result to OUT/clean.dat.  Returns the output filename (or undef on error).
my $output = $self->{OUT} . 'clean.dat';
$self->_log(0, "Start: $self->{FILE}");
# NOTE(review): 'or _log(...) and return' only returns when _log() yields a
# true value; at level 0 with the default DEBUG=0 it does log and return,
# but the construct is fragile -- confirm before relying on it.
open my $I, '<', $self->{FILE}
or $self->_log(0, "Cannot read $self->{FILE}:$!") and return;
open my $O, '>', $output
or $self->_log(0, "Cannot open output file:$!") and return;
# we read and write utf-8 data
binmode $I, ':encoding(utf8)'; # w/o these settings perl handles input as binary
binmode $O, ':encoding(utf8)'; # and may (en|de)code as ISO-8859-1. -> garbage output
#TODO optimize
while(<$I>){
# order matters: escape backslashes and insert \N markers while the line
# is still comma-separated, then switch the delimiter to tabs
s/%5[cC]/\\\\/g; # \ => \\
s/^,/\\N,/g; # \N => mysql NULL start of line
s/,$/,\\N/g; # \N => mysql NULL end of line ($ matches before the newline)
s/,(?=,)/,\\N/g; # \N => mysql NULL in between (lookahead keeps the 2nd comma)
tr/,/\t/;
$_ = $self->_url_unescape($_);
print $O $_;
}
close $O;
close $I;
$self->_log(0, "done.");
# optionally remove the input file once it has been processed
unlink $self->{FILE} if $self->{UNLINK};
return $output;
}
=head2 _url_unescape
--internal-- unescapes URL escapes.
It doesn't unescape control characters (00-1F)
which is important in order to avoid conflicts
=cut
sub _url_unescape {
    my ( $self, $escaped ) = @_;

    # pack()/chr() work on bytes, so turn the character string into its
    # UTF-8 byte representation first.
    my $bytes = encode_utf8($escaped);

    # Unescape every %XX at or above 0x20; escaped control characters
    # (%00 - %1F) are deliberately left alone to avoid conflicts.
    $bytes =~ s{%([a-fA-F2-9][a-fA-F0-9])}{ pack( "C", hex($1) ) }eg;

    # Second pass for doubly escaped space, dot and dash (e.g. %2520 -> %20 -> ' ').
    $bytes =~ s{%(20|2e|2d)}{ pack( "C", hex($1) ) }eg;

    # Drop escaped CR and LF entirely.
    #FIXME something else should happen with those, convenience for now
    $bytes =~ s/%0(?:d|a)//g;

    # Reinterpret the (mostly good utf-8) bytes as characters again.
    return decode_utf8($bytes);
}
=head2 _log
--internal-- logging method
=cut
sub _log {
my ($self, $level, $msg) = @_;
# append $msg to the configured logfile, but only when the message's
# $level does not exceed the configured debug level
return unless $level <= $self->{DEBUG};
open my $LOGFILE, '>>', $self->{LOG}
or die "Couldn't open logfile: $!";
# capture the calling sub's short name into $1 (side effect of the match)
# NOTE(review): if this match fails, $1 keeps a stale value from an earlier
# regex elsewhere, so the fallback below can print a wrong name -- verify.
( caller 1 )[3] =~ /::(\w+)$/ if ( caller 1 )[3];
print $LOGFILE time . " ".($1?$1:caller).": $msg \n" ;
close $LOGFILE;
}
=head2 DESTROY
This is the destructor.
=cut
sub DESTROY {
# intentionally empty: this object holds no resources that need explicit
# cleanup (filehandles are closed where they are used)
my ($self) = @_;
}
=head1 AUTHOR
Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>
=head1 BUGS
Please report any bugs or feature requests to L<bugs.loncapa.org>.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Metadata::Clean
You can also look for information at:
=over 4
=item * LON-CAPA: LON-CAPA Project Website
L<http://www.loncapa.org>
=back
=head1 LICENSE AND COPYRIGHT
Copyright 2010 Stefan Droeschler.
This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.
See http://dev.perl.org/licenses/ for more information.
=cut
1; # End of Metadata::Clean
Index: modules/droeschl/Metadata/lib/Metadata/Harvest.pm
+++ modules/droeschl/Metadata/lib/Metadata/Harvest.pm
package Metadata::Harvest;
use warnings;
use strict;
use lib '/home/httpd/lib/perl';
use LONCAPA::Configuration;
use Apache::lonnet;
use File::ReadBackwards;
use Tie::File;
use Data::Dumper;
=head1 NAME
Metadata::Harvest - harvests the LON-CAPA network's metadata
=head1 VERSION
Version 1.00
=cut
our $VERSION = '1.00';
#TODO add methods to add/remove library servers
=head1 SYNOPSIS
use Metadata::Harvest;
my $h = Metadata::Harvest->new();
my $datafile = $h->harvest();
--or--
my $datafile = Metadata::Harvest->new()->harvest();
...
=head1 SUBROUTINES/METHODS
=head2 new
This is the constructor.
my $h = Metadata::Harvest->new();
my $h = Metadata::Harvest->new(debug => 1);
my $h = Metadata::Harvest->new({debug => 1});
=over
=item debug
The debug option sets the debug level. Default is 0.
=over
0: general messages are printed.
1: prints messages from _file_ok if an error occurred.
2: prints messages from _file_ok while waiting on datafiles.
=back
=item timeout
Timeout corresponds to the number of seconds between query was send
and datafile is received (over one query cycle).
Servers not responding within this timeout are considered 'dead'.
Default is 119.
=item dss
The dss option (datasetsize) determines how many records are requested.
Default is 200.
=item percycle
Sets the number of servers that are queried in each cycle.
Default is 5.
=item once
If once is set to a true value, each server will only be queried once.
(for testing purposes)
=item tmp
The tmp option determines the tmp directory to use.
Default is LON-CAPA's tmp directory.
=item out
The out option determines the output directory.
By default it is set to the tmp directory.
=item log
Sets logfile. Default /home/httpd/perl/logs/harvest.log
=back
=cut
#TODO provide reasonable values for dss and timeout
#TODO add possibility to pass servers
sub new {
    my $class  = shift;
    my $config = shift;
    my $self   = bless {}, $class;

    # Accept both new({key=>val}) and new(key=>val) calling styles.
    $config = { $config, @_ } if ref($config) ne 'HASH' and @_;

    # LON-CAPA configuration supplies the default tmp/log directories.
    my %lcconf = %{ LONCAPA::Configuration::read_conf('loncapa.conf') };

    $self->{DEBUG}    = $config->{debug}    || 0;
    $self->{TOINIT}   = $config->{timeout}  || 119;  # change if 0 is wanted
    $self->{DSS}      = $config->{dss}      || 200;
    $self->{ONCE}     = $config->{once}     || 0;
    $self->{PERCYCLE} = $config->{percycle} || 5;
    $self->{TMP}      = $config->{tmp}      || $lcconf{lonDaemons} . '/tmp/';
    $self->{LOG}      = $config->{log}      || $lcconf{lonDaemons} . '/logs/harvest.log';
    $self->{OUT}      = $config->{out}      || $self->{TMP};
    #$self->_log(5,"Debug: $self->{DEBUG}, timeout: $self->{TIMEOUT}, dss: $self->{DSS}");
    return $self;
}
=head2 harvest
Harvests all library servers.
Returns filename to merged datafile.
=cut
sub harvest {
    my ($self) = @_;

    my $servers = _load_libraries();
    $self->_log( 0, scalar(@$servers) . " library servers found." );

    # Work through the server list in batches of PERCYCLE.
    my @received;
    while (@$servers) {
        my @batch = splice @$servers, 0, $self->{PERCYCLE};
        push @received, $self->_query_libraries(@batch);
    }

    $self->_log( 5, "Received files:" . Dumper( \@received ) );
    $self->_log( 0, "Harvest done." );

    # Combine all received datafiles into a single file and hand back its name.
    return $self->_merge(@received);
}
=head2 _load_libraries
--internal-- used to load library servers
Returns host IDs of library servers.
=cut
sub _load_libraries{
    # all_library() yields hostid => hostname pairs; domains hosted on the
    # same physical server share a hostname, so keying the hash by hostname
    # (via reverse) collapses duplicates to a single host ID.
    # lonnet.pm >= 1.1062 offers unique_library() for the same purpose:
    #   return [ keys Apache::lonnet::unique_library() ]
    my %by_hostname = reverse Apache::lonnet::all_library();
    return [ values %by_hostname ];
}
=head2 _query_libraries
--internal-- used to query library servers.
Discards empty files and servers that caused an error.
Returns an array containing filenames of received datafiles.
=cut
sub _query_libraries{
my ($self, @libs) = @_;
# Repeatedly queries the given servers in DSS-sized slices, dropping dead
# servers and repairing/re-requesting corrupt replies.  Returns the names
# of all successfully received datafiles.
my @files = ();
# start one dataset below zero so the first pass queries offset 0
my $offset = -$self->{DSS};
# dead servers are delete()d below, leaving undef holes; this grep compacts
# the list and ends the loop once no live server remains
while( @libs = grep { defined } @libs ) {
$offset += $self->{DSS};
$self->{TIMEOUT} = $self->{TOINIT}; # reset timeout
my $reply = $self->_query($offset, \@libs, $self->{DSS});
my $oklines = 0;
foreach my $l (0 .. $#libs){
$self->_log(0, "$libs[$l] returned $$reply{$libs[$l]}");
if ( $$reply{$libs[$l]} =~ /^(con_lost|refused|rejected|no_such_host|error)/
or not $self->_file_ok($self->{TMP} . $$reply{$libs[$l]}) )
{
# this one is dead -> no reason to go on
delete $libs[$l] and next;
}
if ( $self->_file_corrupt($self->{TMP} . $$reply{$libs[$l]}) )
{
# take what's good, and...
$oklines += $self->_file_repair($self->{TMP} . $$reply{$libs[$l]});
push @files, $$reply{$libs[$l]};
# ...retry to get the rest: redo restarts this foreach iteration with
# the fresh reply for the same server
# NOTE(review): if a server keeps returning corrupt files this can
# loop for a long time -- confirm the timeout covers that case
my $r = $self->_query($offset+$oklines, [ $libs[$l] ], $self->{DSS}-$oklines);
$$reply{$libs[$l]} = $$r{$libs[$l]};
$self->{TIMEOUT} = $self->{TOINIT}; # reset timeout
redo;
}
# else: received a good file
$oklines = 0; # reset necessary
push @files, $$reply{$libs[$l]};
}
# in 'once' mode (testing) stop after a single query cycle
undef @libs if $self->{ONCE};
}
return @files;
}
=head2 _query
--internal-- wrapper for Apache::lonnet::metadata_query
=cut
sub _query{
    my ( $self, $offset, $libs, $dss ) = @_;

    # A non-positive dataset size means "fetch everything".
    my $sql = "SELECT * FROM metadata";
    $sql .= " LIMIT $offset, $dss" if $dss > 0;

    $self->_log( 1, "Offset: $offset, dss: $dss" );
    return Apache::lonnet::metadata_query( $sql, undef, undef, $libs );
}
=head2 _file_ok
--internal-- checks datafiles for existence and size
=cut
sub _file_ok{
my ($self, $file) = @_;
# A remote server signals completion by creating "$file.end"; poll once a
# second until it appears or the shared TIMEOUT budget runs out.
while (! -e "$file.end" && $self->{TIMEOUT}) { # print every 20 sec
$self->_log(2, "$self->{TIMEOUT} seconds left for $file.end.") unless $self->{TIMEOUT} % 20;
$self->{TIMEOUT}--;
sleep(1); # zzZZZ
}
if( not $self->{TIMEOUT} ) {
# budget exhausted -> treat the server as dead for this cycle
$self->_log(1, "ERROR: Couldn't find $file.end within $self->{TOINIT} seconds.");
return;
}elsif ( -z $file ) {
# end marker exists but the datafile is empty: discard both files
$self->_log(1, "Empty file received.");
unlink $file, $file . '.end';
return;
}
return 1;
}
=head2 _file_corrupt
--internal-- checks datafiles for exit at the end.
This might occur when the remote server runs into a timeout.
Returns:
=over
0: file not corrupt.
1: file contains /exit$/ -> corrupt
2: file not readable
=back
=cut
#TODO file might be corrupt w/o 'exit' at the end,
# need a smart way to check for this
sub _file_corrupt{
    my ($self, $file) = @_;
    # Checks whether the datafile ends in 'exit', which indicates the remote
    # server ran into a timeout while producing it.
    # Returns: 0 = not corrupt, 1 = /exit$/ found (corrupt), 2 = not readable.
    my $f = File::ReadBackwards->new($file)
        or $self->_log(0, "Couldn't read file $file:$!") and return 2;
    # BUG FIX: the original returned 1 from inside the if-block without ever
    # calling $f->close, leaking the filehandle for every corrupt file.
    my $corrupt = ( $f->readline =~ /exit$/ ) ? 1 : 0;
    $f->close;
    $self->_log(0, "'exit' was found in $file.") if $corrupt;
    return $corrupt;
}
=head2 _file_repair
--internal-- removes last line of corrupt files.
Returns number of lines left
see: Perl Cookbook, 8.10
=cut
sub _file_repair{
my ($self, $file) = @_;
# Edits the file in place via Tie::File (see Perl Cookbook 8.10) and
# removes the trailing 'exit' marker a timed-out server leaves behind.
# Returns the number of lines remaining.
tie my @lines, "Tie::File", $file or $self->_log(0, "Cannot read file $file: $!") and return;
# NOTE(review): delete on an array element is deprecated; for the tied last
# element it truncates the file here, but splice(@lines, -1) would be the
# safer spelling.  Also, "Repaired" is logged even when nothing was removed.
delete $lines[-1] if $lines[-1] =~ /exit$/;
$self->_log(0, "Repaired $file.");
return scalar @lines;
}
=head2 _merge
--internal-- merge files and delete source files.
=cut
sub _merge {
    my ( $self, @files ) = @_;
    # Concatenates all received datafiles (relative to TMP) into OUT/merge.dat,
    # deleting each source file and its .end marker afterwards.
    # Returns the name of the merged file.
    my $out = $self->{OUT} . 'merge.dat';
    open my $MERGE, '>', $out
        or $self->_log(0, "Cannot merge: $!");
    for my $f (@files){
        # BUG FIX: the original logged "Cannot open $_" ($_ is not set here --
        # the loop variable is $f) and then fell through to read from the
        # never-opened handle; log the right name and skip the file instead.
        open my $F, '<', $self->{TMP} . $f
            or do { $self->_log(0, "Cannot open $f: $!"); next; };
        # re-add the newline explicitly so every record ends in exactly one
        while(<$F>){ chomp; print $MERGE "$_\n"; }
        close $F;
        unlink $self->{TMP} . $f, $self->{TMP} . $f . '.end'; # clean up
    }
    close $MERGE;
    return $out;
}
=head2 _log
--internal-- logging method
=cut
sub _log {
my ($self, $level, $msg) = @_;
# append $msg to the configured logfile, but only when the message's
# $level does not exceed the configured debug level
return unless $level <= $self->{DEBUG};
open my $LOGFILE, '>>', $self->{LOG}
or die "Couldn't open logfile: $!";
# capture the calling sub's short name into $1 (side effect of the match)
# NOTE(review): if this match fails, $1 keeps a stale value from an earlier
# regex elsewhere, so the fallback below can print a wrong name -- verify.
( caller 1 )[3] =~ /::(\w+)$/ if ( caller 1 )[3];
print $LOGFILE time . " ".($1?$1:caller).": $msg \n" ;
close $LOGFILE;
}
=head2 DESTROY
This is the destructor.
=cut
sub DESTROY {
# intentionally empty: this object holds no resources that need explicit
# cleanup (filehandles are closed where they are used)
my ($self) = @_;
}
=head1 AUTHOR
Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>
=head1 BUGS
Please report any bugs or feature requests to L<bugs.loncapa.org>.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Metadata::Harvest
You can also look for information at:
=over 4
=item * LON-CAPA: LON-CAPA Project Website
L<http://www.loncapa.org>
=back
=head1 LICENSE AND COPYRIGHT
Copyright 2010 Stefan Droeschler.
This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.
See http://dev.perl.org/licenses/ for more information.
=cut
1; # End of Metadata::Harvest
Index: modules/droeschl/Metadata/lib/Metadata/Load.pm
+++ modules/droeschl/Metadata/lib/Metadata/Load.pm
package Metadata::Load;
use warnings;
use strict;
use lib '/home/httpd/lib/perl';
use Encode qw(decode encode_utf8 decode_utf8);
use Data::Dumper;
use DBI;
eval "use LONCAPA::Configuration";
=head1 NAME
Metadata::Load - Loads cleaned metadata into database.
=head1 VERSION
Version 1.00
=cut
our $VERSION = '1.00';
=head1 SYNOPSIS
use Metadata::Load;
my $c = Metadata::Load->new(file=>$filename);
=head1 SUBROUTINES/METHODS
=head2 new
This is the constructor.
my $h = Metadata::Load->new(file=>$filename);
=over
=item file (mandatory)
Filename of the file containing cleaned data.
Most probably output of Metadata::Clean::clean()
=item intodb
Automatically insert data into database.
Default is currently 0 for testing.
=item remove
If set to a true value $file will be deleted after processing.
=item debug
The debug option sets the debug level. Default is 0.
=item log
Sets logfile. Default /home/httpd/perl/logs/load.log
=back
=cut
#TODO remove code for local execution
sub new {
    my $class  = shift;
    my $config = shift;
    my $self   = bless {}, $class;
    # arguments may be passed as new({key=>val}) or new(key=>val)
    $config = { $config, @_ } if ref($config) ne 'HASH' and @_;
    die "File $config->{file} does not exist" unless -e $config->{file};
    my %lcconf;
    # read_conf only exists when LONCAPA::Configuration could be loaded
    # (it is pulled in via eval at the top of this module)
    if (defined &LONCAPA::Configuration::read_conf){
        %lcconf = %{LONCAPA::Configuration::read_conf('loncapa.conf')};
    }
    $self->{FILE}   = $config->{file};
    $self->{DEBUG}  = $config->{debug}  || 0 ;
    $self->{UNLINK} = $config->{remove} || 0 ;
    $self->{INTODB} = $config->{intodb} || 0 ;
    # BUG FIX: '||' binds tighter than '?:', so the previous
    #   $config->{log} || %lcconf ? A : B
    # parsed as ($config->{log} || %lcconf) ? A : B and silently discarded a
    # caller-supplied log/out option.  Parentheses restore the intent:
    # use the caller's value if given, otherwise fall back to the LON-CAPA
    # directories (or a local default when no loncapa.conf is available).
    $self->{LOG} = $config->{log} || (%lcconf ? $lcconf{lonDaemons} . '/logs/load.log' : 'load.log');
    $self->{OUT} = $config->{out} || (%lcconf ? $lcconf{lonDaemons} . '/tmp/'          : '');
    return $self;
}
=head2 load
Maps cleaned metadata input to output for database.
Currently it has an intermediate step in which it saves
output data to files that are loaded into database afterwards.
See Metadata::Clean for more information about how to 'clean' files.
=cut
#TODO needs some descriptive text
#TODO refactor
#TODO improve logging
#TODO use _trim
sub load{
my ($self) = @_;
# Converts each cleaned input row (tab separated, \N for NULL) into one row
# of resource.dat plus rows in per-field ".dat" files for the fields that
# have a transformation sub in %map below.  The input line number serves as
# the shared primary key across all output files.
# INPUT fields, see db:loncapa table:metadata
my @fields = qw(title author subject url keywords
version note abstract mime language
creationdate lastrevisiondate owner copyright domain
dependencies modifyinguser authorspace lowestgradelevel highestgradelevel
standards count course course_list goto
goto_list comefrom comefrom_list sequsage sequsage_list
stdno stdno_list avetries avetries_list difficulty
difficulty_list disc disc_list clear technical
correct helpful depth );
# TRANSFORMATION functions
# they map input to output
# $_[0] : ref to current field
# $_[1] : ref to entire hash
#TODO describe output
# split at ,
my %map = ();
for (qw( copyright course_list ))
{
$map{ $_ } = sub { return unless ${$_[0]} =~ /,/; ${$_[0]} = [ split /,/, ${$_[0]} ]};
}
# split at , and also count
for (qw( avetries_list difficulty_list disc_list ))
{
$map{ $_ } = sub { return unless ${$_[0]} =~ /,/;
# 0 + to get rid of -0.000
my %c; $c{ 0+ sprintf("%.3f", $_) }++ for grep { not /^(\s*|nan)$/ } split /,/, ${$_[0]};
my @a; push @a, [ $_, $c{$_} ] for keys %c;
${$_[0]} = \@a;
};
}
# similar to above but these contain URLs which can contain ,
# and are delimited by ,
# so until this is fixed in the source database on the remote server
# this code has to stay in. it doesn't solve all problems, but many
#TODO also for dependencies
for (qw( goto_list comefrom_list sequsage_list))
{
$map{ $_ } = sub { return unless ${$_[0]} =~ /,/;
my @split = split /,/, ${$_[0]};
# re-join fragments until every element looks like a valid URL
# (user@domain/... or one of the adm/ ext/ sarasota/ prefixes)
while ( grep { not m{^(([\w@.-]+/){2}|adm/|ext/|sarasota/)} } @split )
{
for (0..$#split){ #no idea?!
next if not $split[$_] or $split[$_] =~ m{^(([\w@.-]+/){2}|adm/|ext/|sarasota/)};
delete $split[$_] and next if $split[$_] eq 'Chemistry B (2ndSEM)'; #no idea ?! -> remove it
$split[ $_ - 1 ] .= ',' . $split[$_]; # it's part of the prev. element
delete $split[$_];
}
@split = grep {defined} @split;
}
${$_[0]} = \@split ;
};
}
$map{keywords} = sub { ${$_[0]} = [ split /[,\s]/, ${$_[0]} ]};
# "user:domain" / "user@domain" -> { user => ..., domain => ... }
$map{modifyinguser} = sub { my @split = split /[:@]/, ${$_[0]}; ${$_[0]} = @split ? { user=>$split[0], domain=>$split[1] } : undef };
$map{authorspace} = sub { my @split = split /[:@]/, ${$_[0]}; ${$_[0]} = @split ? { user=>$split[0], domain=>$split[1] } : undef };
# these input fields are dropped from the output entirely
$map{version} = sub { undef ${$_[0]} };
$map{stdno_list} = sub { undef ${$_[0]} };
# collect the five evaluation scores from the row into one hash
$map{evaluation} = sub { ${$_[0]} = { map {$_ =>$_[1]{$_}} grep { $_[1]{$_} } qw(clear technical correct helpful depth) } };
#FIXME reactivate to save some storage space, but
# verify first
# $map{clear} = sub { undef ${$_[0]} };
# $map{technical} = sub { undef ${$_[0]} };
# $map{correct} = sub { undef ${$_[0]} };
# $map{helpful} = sub { undef ${$_[0]} };
# $map{depth} = sub { undef ${$_[0]} };
my @addfields = qw(evaluation); # additional fields, each of them will
# end up in an own table (if sub provided)
open my $res, '>', $self->{OUT} . "resource.dat" or die $!; # the resource.dat is kinda special,
binmode $res, ':encoding(utf8)'; # so we open it first
unless ($self->{INTODB}){
print $res "key\t"; # print header line - convenience
print $res "$_\t" for grep { ! $map{$_} } @fields;
print $res "\n";
}
my %handles; # pre-open all necessary filehandles
for (@addfields, @fields){ # use them with
next unless ( ref $map{$_} eq 'CODE' ); # print {$handles{$field}} "super duper";
open $handles{$_}, '>', $self->{OUT} . "$_.dat" or die $!;
binmode $handles{$_}, ':encoding(utf8)'; # !
}
# input file
open my $IN, '<', $self->{FILE} or $self->_log(0, "Cannot read $self->{FILE}:$!") and die $!;
binmode $IN, ':encoding(utf8)'; # !
my $line = 0;
while(<$IN>) {
$line++; # primary key for resource (and other)
s/\\N//g; # confuses perl sometimes
my %data; # split input row into data hash
@data{ (@fields) } = split /\t/, $_;
for my $field (@addfields,@fields){
# if there's a mapping function for this field, invoke it
# $data{$_} will be replaced within mapping function (most likely array ref)
if (ref $map{$field} eq 'CODE'){ # outsourced table
# do the mapping input -> output
$map{ $field }->( \$data{$field}, \%data );
# OUTPUT
if ( ref $data{$field} eq 'ARRAY') { # these end up as:
# primkey \t value \n primkey \t value ...
for my $val (@{ $data{$field}}) {
if ( ref $val eq 'ARRAY' ){
print {$handles{$field}} $line;
print {$handles{$field}} "\t$_" for @$val;
print {$handles{$field}} "\n";
}else{
print {$handles{$field}} $line . "\t" . $val . "\n";
}
}
# these are more complicated: primkey \t val1 \t val2 ... \n ...
} elsif (ref $data{$field} eq 'HASH' and (keys %{ $data{$field}}) > 0) {
print {$handles{$field}} $line ;
for my $key (sort keys %{ $data{$field}}){
print {$handles{$field}} "\t" . $data{$field}->{$key};
}
print {$handles{$field}} "\n";
}
} else { # one of the resource attributes
# use line number as primary key
print $res $line . "\t" if $field eq $fields[0];
print $res ( $data{$field} ? $data{$field} : '\N' ); # \N: MySQL NULL
print $res ( $field eq $fields[-1] ? "\n" : "\t" ); # last field?
}
}
}
close $IN;
close $handles{$_} for grep {defined $handles{$_} } (@addfields, @fields);
close $res;
for (@addfields, @fields){ # remove empty files
next unless ( ref $map{$_} eq 'CODE' );
unlink $self->{OUT} . "$_.dat" if -z $self->{OUT} . "$_.dat";
}
$self->_load_in_db("resource", @addfields, @fields) if $self->{INTODB};
unlink $self->{FILE} if $self->{UNLINK};
return $self->{OUT};
}
=head2 _load_in_db
--internal-- loads data into database
Files need to be accessible by mysql.
=cut
#TODO verify everything went ok
#TODO create tables if necessary
#TODO verify table structure if tables present
#TODO load config from loncapa config
sub _load_in_db {
my ($self, @files) = @_;
# Truncates each target table and bulk-loads the corresponding OUT/<name>.dat
# via LOAD DATA INFILE; tables listed in %opt get IGNORE to tolerate
# duplicate keys.  Files must be readable by the mysql server process.
#TODO :----config: should be fetched from central loncapa config later
my $db_source = 'dbi:mysql:';
my $db_db = 'metadata';
my $db_user = 'root';
my $db_pass = '';
my $dbh = DBI->connect($db_source.$db_db, $db_user, $db_pass, { RaiseError =>1, })
or die "Can't connect to $db_source: $DBI::errstr";
my %opt = ( keywords => 'IGNORE', # tables where IGNORE or REPLACE
goto_list => 'IGNORE', # is necessary
comefrom_list => 'IGNORE',
);
for (@files){
my $file = $self->{OUT} . $_ . ".dat";
next unless -e $file;
print "processing: $_\n"; #FIXME only debug
# NOTE(review): with RaiseError=>1 a failing execute() dies, so the
# 'or print "ALARM!!"' fallbacks below never run.  The table and file
# names interpolated into the SQL come from the internal field list,
# not from user input.
my $sth = $dbh->prepare("truncate table $_");
$sth->execute() or print "ALARM!!\n";
my $opt = $opt{$_} || '';
$sth = $dbh->prepare("LOAD DATA INFILE '$file' $opt INTO TABLE $_ "
."CHARACTER SET UTF8 FIELDS TERMINATED BY '\t' "
."LINES TERMINATED BY '\n';");
$sth->execute() or print "ALARM!!\n";
}
$dbh->disconnect;
}
=head2 _trim
--internal-- Trims whitespace at the beginning/end of a string.
Usage:
trim; trims $_ inplace
$new = trim; trims (and returns) a copy of $_
trim $str; trims $str inplace
$new = trim $str; trims (and returns) a copy of $str
trim @list; trims @list inplace
@new = trim @list; trims (and returns) a copy of @list
source: http://www.perlmonks.org/?node_id=36684
=cut
#TODO integrate
sub _trim {
    # Context-sensitive trim (see POD above).  The implementation leans on
    # the fact that @_ elements are *aliases* to the caller's arguments:
    # assigning to @_ replaces those aliases with plain copies, so the
    # substitutions below then edit copies instead of the caller's data.

    # No arguments but a return value is wanted: work on a copy of $_.
    @_ = $_ if not @_ and defined wantarray;
    # Arguments given and a return value is wanted: copy them so the
    # caller's originals are left untouched.
    @_ = @_ if defined wantarray;
    # In void context @_ still aliases the caller's variables -> in-place.
    # With no args at all, fall back to trimming $_ itself.
    for (@_ ? @_ : $_) { s/^\s+//, s/\s+$// }
    # List context: all trimmed values; scalar context: the first one.
    return wantarray ? @_ : $_[0] if defined wantarray;
}
=head2 _log
--internal-- logging method
=cut
sub _log {
    my ($self, $level, $msg) = @_;

    # --internal-- Appends a timestamped line to $self->{LOG} when $level
    # is within the configured verbosity ($self->{DEBUG}).  The entry is
    # tagged with the short name of the sub two frames up (our caller's
    # caller), falling back to the calling package.

    return unless $level <= $self->{DEBUG};

    open my $LOGFILE, '>>', $self->{LOG}
        or die "Couldn't open logfile: $!";

    # BUG FIX: the old code read $1 after a *conditional* match, so a
    # failed/skipped match left $1 holding a stale capture from some
    # unrelated earlier regex.  Capture into a lexical instead.
    my $frame  = ( caller 1 )[3];                       # e.g. "Metadata::Load::load"
    my ($name) = defined $frame ? $frame =~ /::(\w+)$/ : ();

    print $LOGFILE time . " " . ( $name ? $name : caller ) . ": $msg \n";
    close $LOGFILE;
}
=head2 DESTROY
This is the destructor.
=cut
sub DESTROY {
    # No-op destructor: this object holds no resources that need explicit
    # cleanup when it goes out of scope.
    my $self = shift;
    return;
}
=head1 AUTHOR
Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>
=head1 BUGS
Please report any bugs or feature requests to L<bugs.loncapa.org>.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Metadata::Load
You can also look for information at:
=over 4
=item * LON-CAPA: LON-CAPA Project Website
L<http://www.loncapa.org>
=back
=head1 LICENSE AND COPYRIGHT
Copyright 2010 Stefan Droeschler.
This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.
See http://dev.perl.org/licenses/ for more information.
=cut
1; # End of Metadata::Load
Index: modules/droeschl/Metadata/t/misc.t
+++ modules/droeschl/Metadata/t/misc.t
Index: modules/droeschl/Metadata/t/misc.t
+++ modules/droeschl/Metadata/t/misc.t
Index: modules/droeschl/Metadata/t/clean.test
+++ modules/droeschl/Metadata/t/clean.test
add content
Index: modules/droeschl/Metadata/t/corrupt.test
+++ modules/droeschl/Metadata/t/corrupt.test
content doesn't matter
content doesn't matter
content doesn't matterexit
Index: modules/droeschl/Metadata/t/notcorrupt.test
+++ modules/droeschl/Metadata/t/notcorrupt.test
content doesn't matter
content doesn't matter
content doesn't matter
Index: modules/droeschl/Metadata/t/pod-coverage.t
+++ modules/droeschl/Metadata/t/pod-coverage.t
use strict;
use warnings;
use Test::More;

# Skip everything unless a sufficiently recent Test::Pod::Coverage is around.
my $min_tpc = 1.08;
eval "use Test::Pod::Coverage $min_tpc";
if ($@) {
    plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage";
}

# Test::Pod::Coverage doesn't require a minimum Pod::Coverage version,
# but older versions don't recognize some common documentation styles.
my $min_pc = 0.18;
eval "use Pod::Coverage $min_pc";
if ($@) {
    plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage";
}

all_pod_coverage_ok();
Index: modules/droeschl/Metadata/t/pod.t
+++ modules/droeschl/Metadata/t/pod.t
#!perl -T
use strict;
use warnings;
use Test::More;

# Skip the whole file unless a sufficiently recent Test::Pod is installed.
my $min_tp = 1.22;
eval "use Test::Pod $min_tp";
if ($@) {
    plan skip_all => "Test::Pod $min_tp required for testing POD";
}

all_pod_files_ok();
Index: modules/droeschl/Metadata/t/useforloadtest.test
+++ modules/droeschl/Metadata/t/useforloadtest.test
\N \N \N /res/MIT/RELATE/MAPS_course/System_Constituents/ \N current \N \N \N notset 0000-00-00 00:00:00 0000-00-00 00:00:00 \N public \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N
--droeschl1281992066--