[LON-CAPA-cvs] cvs: modules /droeschl/Metadata create.sql harvestrun.pl /droeschl/Metadata/lib/Metadata Clean.pm Harvest.pm Load.pm /droeschl/Metadata/t 10-clean 10-harvest clean.test corrupt.test misc.t misc.t notcorrupt.test pod-coverage.t pod.t useforloadtest.test

droeschl droeschl@source.lon-capa.org
Mon, 16 Aug 2010 20:54:26 -0000


This is a MIME encoded message

--droeschl1281992066
Content-Type: text/plain

droeschl		Mon Aug 16 20:54:26 2010 EDT

  Added files:                 
    /modules/droeschl/Metadata	create.sql harvestrun.pl 
    /modules/droeschl/Metadata/lib/Metadata	Clean.pm Harvest.pm Load.pm 
    /modules/droeschl/Metadata/t	misc.t misc.t clean.test corrupt.test 
                                	notcorrupt.test pod-coverage.t pod.t 
                                	useforloadtest.test 

  Modified files:              
    /modules/droeschl/Metadata/t	10-clean 10-harvest 
  Log:
  Metadata Harvesting.
  
  
--droeschl1281992066
Content-Type: text/plain
Content-Disposition: attachment; filename="droeschl-20100816205426.txt"


Index: modules/droeschl/Metadata/create.sql
+++ modules/droeschl/Metadata/create.sql
-- Save the current session settings and relax uniqueness / FK checking so
-- the schema can be (re)created in any order; restored at end of script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';

CREATE SCHEMA IF NOT EXISTS `metadata` DEFAULT CHARACTER SET utf8 ;
USE `metadata` ;

-- -----------------------------------------------------
-- Table `metadata`.`resource`
-- -----------------------------------------------------
-- Central table: one row per harvested resource. `key` is a surrogate
-- integer primary key; `url` is the unique natural key.
DROP TABLE IF EXISTS `metadata`.`resource` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`resource` (
  `key` INT NOT NULL ,
  `title` TEXT NULL ,
  `author` TEXT NULL ,
  `subject` TEXT NULL ,
  `url` VARCHAR(255) NOT NULL ,
  `note` TEXT NULL ,
  `abstract` TEXT NULL ,
  `mime` TEXT NULL ,
  `language` TEXT NULL ,
  `creationdate` DATETIME NULL ,
  `lastrevisiondate` DATETIME NULL ,
  `owner` TEXT NULL ,
  `domain` VARCHAR(45) NULL ,
  `dependencies` TEXT NULL ,
  `lowestgradelevel` INT NULL ,
  `highestgradelevel` INT NULL ,
  `standards` TEXT NULL ,
  `count` INT NULL ,
  `course` INT NULL ,
  `goto` INT NULL ,
  `comefrom` INT NULL ,
  `sequsage` INT NULL ,
  `stdno` INT NULL ,
  `avetries` FLOAT NULL ,
  `difficulty` FLOAT NULL ,
  PRIMARY KEY (`key`) ,
  UNIQUE INDEX `url_UNIQUE` (`url` ASC) )
-- NOTE(review): the schema uses ENGINE=MyISAM throughout. MyISAM parses
-- FOREIGN KEY clauses but silently ignores them, so none of the FK
-- constraints in this script are enforced. Switch to InnoDB if
-- referential integrity should actually be enforced.
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`authorspace`
-- -----------------------------------------------------
-- One domain/user pair per resource (PK on `key` makes this 1:1).
DROP TABLE IF EXISTS `metadata`.`authorspace` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`authorspace` (
  `key` INT NOT NULL ,
  `domain` VARCHAR(45) NOT NULL ,
  `user` VARCHAR(45) NOT NULL ,
  PRIMARY KEY (`key`) ,
  INDEX `fk_authorspace_resource1` (`key` ASC) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_authorspace_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`avetries_list`
-- -----------------------------------------------------
-- Per-resource distribution of average tries: one (avetries, count) pair
-- per row; composite PK allows many rows per resource.
DROP TABLE IF EXISTS `metadata`.`avetries_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`avetries_list` (
  `key` INT NOT NULL ,
  `avetries` FLOAT NOT NULL ,
  `count` INT NULL ,
  PRIMARY KEY (`avetries`, `key`) ,
  INDEX `fk_avetries_list_Resource1` (`key` ASC) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_avetries_list_Resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`comefrom_list`
-- -----------------------------------------------------
-- Many-to-many link: resources reached *from* other resources, stored by
-- URL (FK to resource.url) rather than by surrogate key.
DROP TABLE IF EXISTS `metadata`.`comefrom_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`comefrom_list` (
  `key` INT NOT NULL ,
  `comefrom_url` VARCHAR(255) NOT NULL ,
  PRIMARY KEY (`key`, `comefrom_url`) ,
  INDEX `fk_comefrom_list_Resource1` (`key` ASC) ,
  INDEX `fk_comefrom_list_Resource2` (`comefrom_url` ASC) ,
  -- NOTE(review): FKs not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_comefrom_list_Resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION,
  CONSTRAINT `fk_comefrom_list_Resource2`
    FOREIGN KEY (`comefrom_url` )
    REFERENCES `metadata`.`resource` (`url` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`copyright`
-- -----------------------------------------------------
-- Copyright tags per resource (composite PK: several tags per resource).
DROP TABLE IF EXISTS `metadata`.`copyright` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`copyright` (
  `key` INT NOT NULL ,
  `copyright` VARCHAR(45) NOT NULL ,
  INDEX `fk_copyright_resource1` (`key` ASC) ,
  PRIMARY KEY (`key`, `copyright`) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_copyright_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`course_list`
-- -----------------------------------------------------
-- Courses a resource is used in (composite PK: many courses per resource).
DROP TABLE IF EXISTS `metadata`.`course_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`course_list` (
  `key` INT NOT NULL ,
  `course` VARCHAR(45) NOT NULL ,
  INDEX `fk_course_list_resource1` (`key` ASC) ,
  PRIMARY KEY (`course`, `key`) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_course_list_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`difficulty_list`
-- -----------------------------------------------------
-- Per-resource distribution of difficulty values: (difficulty, count) pairs.
DROP TABLE IF EXISTS `metadata`.`difficulty_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`difficulty_list` (
  `key` INT NOT NULL ,
  `difficulty` FLOAT NOT NULL ,
  `count` INT NULL ,
  INDEX `fk_difficulty_list_resource1` (`key` ASC) ,
  PRIMARY KEY (`key`, `difficulty`) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_difficulty_list_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`disc_list`
-- -----------------------------------------------------
-- Per-resource distribution of discrimination values: (disc, count) pairs.
DROP TABLE IF EXISTS `metadata`.`disc_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`disc_list` (
  `key` INT NOT NULL ,
  `disc` FLOAT NOT NULL ,
  `count` INT NULL ,
  PRIMARY KEY (`key`, `disc`) ,
  INDEX `fk_disc_list_resource1` (`key` ASC) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_disc_list_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`evaluation`
-- -----------------------------------------------------
-- Evaluation scores per resource (PK on `key` makes this 1:1).
DROP TABLE IF EXISTS `metadata`.`evaluation` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`evaluation` (
  `key` INT NOT NULL ,
  `clear` FLOAT NULL ,
  `correct` FLOAT NULL ,
  `depth` FLOAT NULL ,
  `helpful` FLOAT NULL ,
  `technical` FLOAT NULL ,
  INDEX `fk_evaluation_resource1` (`key` ASC) ,
  PRIMARY KEY (`key`) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_evaluation_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`goto_list`
-- -----------------------------------------------------
-- Many-to-many link: resources jumped *to* from a resource, stored by URL.
DROP TABLE IF EXISTS `metadata`.`goto_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`goto_list` (
  `key` INT NOT NULL ,
  `goto_url` VARCHAR(255) NOT NULL ,
  PRIMARY KEY (`key`, `goto_url`) ,
  INDEX `fk_goto_list_resource1` (`key` ASC) ,
  INDEX `fk_goto_list_resource2` (`goto_url` ASC) ,
  -- NOTE(review): FKs not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_goto_list_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION,
  CONSTRAINT `fk_goto_list_resource2`
    FOREIGN KEY (`goto_url` )
    REFERENCES `metadata`.`resource` (`url` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`keywords`
-- -----------------------------------------------------
-- Keywords per resource (composite PK: many keywords per resource).
DROP TABLE IF EXISTS `metadata`.`keywords` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`keywords` (
  `key` INT NOT NULL ,
  `keyword` VARCHAR(100) NOT NULL ,
  PRIMARY KEY (`key`, `keyword`) ,
  INDEX `fk_table1_resource1` (`key` ASC) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM; also the constraint
  -- name `fk_table1_resource1` looks like a designer-tool default --
  -- consider renaming to fk_keywords_resource1 for greppability
  CONSTRAINT `fk_table1_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`modifyinguser`
-- -----------------------------------------------------
-- Last modifying user per resource (PK on `key` makes this 1:1).
DROP TABLE IF EXISTS `metadata`.`modifyinguser` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`modifyinguser` (
  `key` INT NOT NULL ,
  `user` VARCHAR(45) NULL ,
  `domain` VARCHAR(45) NULL ,
  PRIMARY KEY (`key`) ,
  INDEX `fk_modifyinguser_resource1` (`key` ASC) ,
  -- NOTE(review): not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_modifyinguser_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;


-- -----------------------------------------------------
-- Table `metadata`.`sequsage_list`
-- -----------------------------------------------------
-- Many-to-many link: sequences using a resource, stored by URL.
DROP TABLE IF EXISTS `metadata`.`sequsage_list` ;

CREATE  TABLE IF NOT EXISTS `metadata`.`sequsage_list` (
  `key` INT NOT NULL ,
  `sequsage_url` VARCHAR(255) NOT NULL ,
  PRIMARY KEY (`key`, `sequsage_url`) ,
  INDEX `fk_sequsage_list_resource1` (`key` ASC) ,
  INDEX `fk_sequsage_list_resource2` (`sequsage_url` ASC) ,
  -- NOTE(review): FKs not enforced under ENGINE=MyISAM
  CONSTRAINT `fk_sequsage_list_resource1`
    FOREIGN KEY (`key` )
    REFERENCES `metadata`.`resource` (`key` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION,
  CONSTRAINT `fk_sequsage_list_resource2`
    FOREIGN KEY (`sequsage_url` )
    REFERENCES `metadata`.`resource` (`url` )
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8
COLLATE = utf8_bin;



-- Restore the session settings saved at the top of this script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

Index: modules/droeschl/Metadata/harvestrun.pl
+++ modules/droeschl/Metadata/harvestrun.pl
#!/usr/bin/perl 
# Driver script for the metadata pipeline: harvest -> clean -> sort -> load.
# NOTE(review): contains hard-coded local paths and a disabled harvest step;
# this is a local-testing version (see the TODOs in Metadata::Clean/Load
# about removing local-execution code).
use strict;
use warnings;
use lib 'lib/';
#use Metadata::Harvest;
use Metadata::Clean;
use Metadata::Load;

my $file;
my $dir = '/Users/stefan/tests/';

#disabled locally
#my $h = Metadata::Harvest->new();
#print "Harvesting...\n";
#$file = $h->harvest();
# stand-in for the harvest step: a previously merged datafile
$file = "/Users/stefan/tests/merge.dat";
my $c = Metadata::Clean->new(file  =>$file,
                             out   =>$dir,
                             debug =>10,
                             remove=>1, );
print "Cleaning...\n";
$file = $c->clean();

print "Sorting...\n";                                    #sort -u similar to sort $file | uniq                   
system("/usr/bin/sort","-u", "-o",$file, $file) == 0     #but uniq shortens lines!                               
    or die $?;                                           #unfortunately this has to be done 'manually' as of yet 


# intodb=>0: Load writes the intermediate files only, no DB insert yet
my $l = Metadata::Load->new( debug  =>10,
                             intodb =>0,
                             out    =>$dir,
                             file   =>$file,
                             remove =>1, );

print "Loading...\n";
print "Files can be found in:" . $l->load();

Index: modules/droeschl/Metadata/lib/Metadata/Clean.pm
+++ modules/droeschl/Metadata/lib/Metadata/Clean.pm
package Metadata::Clean;

use warnings;
use strict;

use lib '/home/httpd/lib/perl'; 
use Encode qw(decode encode_utf8 decode_utf8);
use Data::Dumper;
eval "use LONCAPA::Configuration";

=head1 NAME

Metadata::Clean - cleans harvested metadata

=head1 VERSION

Version 1.00

=cut

our $VERSION = '1.00';


=head1 SYNOPSIS


    use Metadata::Clean;

    my $c = Metadata::Clean->new(file=>$filename);

    ...

=head1 SUBROUTINES/METHODS

=head2 new

This is the constructor. 

  my $h = Metadata::Clean->new();

=over

=item file (mandatory)

Filename of the file to clean. 
Most probably output of Metadata::Harvest::harvest()

=item remove

If set to a true value $file will be deleted after processing.

=item debug

The debug option sets the debug level. Default is 0.

=item out

The out option determines the output directory. 
By default it is set to the tmp directory.

=item log

Sets logfile. Default /home/httpd/perl/logs/clean.log

=back

=cut

#TODO remove duplicate rows
#TODO remove directories
#TODO remove code for local execution
sub new {
    my $class  = shift;
    my $config = shift;

    my $self = bless {}, $class;

    # arguments may be passed as new({key=>val}) or new(key=>val)
    $config = { $config, @_ } if ref($config) ne 'HASH' and @_;
    die "File $config->{file} does not exist" unless -e $config->{file};

    # LON-CAPA config is optional (the module is loaded via eval at the
    # top of the file); fall back to the current directory without it
    my %lcconf;
    if (defined &LONCAPA::Configuration::read_conf){
       %lcconf = %{LONCAPA::Configuration::read_conf('loncapa.conf')};
    }

    $self->{FILE}     = $config->{file};  
    $self->{DEBUG}    = $config->{debug}    || 0  ;
    $self->{UNLINK}   = $config->{remove}   || 0  ;
    # BUGFIX: '||' binds tighter than '?:', so the unparenthesized form
    # "$config->{log} || %lcconf ? A : B" evaluated as
    # "($config->{log} || %lcconf) ? A : B" and discarded caller-supplied
    # log/out values. Parentheses restore the intended fallback chain.
    $self->{LOG}      = $config->{log} || (%lcconf ? $lcconf{lonDaemons} . '/logs/clean.log' : 'clean.log');
    $self->{OUT}      = $config->{out} || (%lcconf ? $lcconf{lonDaemons} . '/tmp/' : '');

    return $self;
}

=head2 clean

This does the actual cleaning.

=cut 

sub clean{
    my ($self) = @_; 
    # cleaned output always goes to <out dir>/clean.dat
    my $output = $self->{OUT}  . 'clean.dat';

    $self->_log(0, "Start: $self->{FILE}");

    # the 'and return' relies on _log returning a true value (it does:
    # the close() result is its last statement)
    open my $I, '<', $self->{FILE}
        or $self->_log(0, "Cannot read $self->{FILE}:$!") and return;

    open my $O, '>', $output
        or $self->_log(0, "Cannot open output file:$!") and return;

                                      # we read and write utf-8 data                    
    binmode $I, ':encoding(utf8)';    # w/o these settings perl handles input as binary 
    binmode $O, ':encoding(utf8)';    # and may (en|de)code as ISO-8859-1. -> garbage output                 

    #TODO optimize
    # Transform comma-separated harvest rows into tab-separated rows for
    # MySQL LOAD DATA, with empty fields encoded as \N (NULL).
    # Order matters: NULL markers must be inserted before commas become tabs.
    while(<$I>){
        s/%5[cC]/\\\\/g; # \ => \\
        s/^,/\\N,/g;     # \N => mysql NULL start of line 
        s/,$/,\\N/g;     # \N => mysql NULL end of line
        s/,(?=,)/,\\N/g; # \N => mysql NULL in between
        tr/,/\t/;        
        $_ = $self->_url_unescape($_);

        print $O $_;  
    }

    close $O;
    close $I;

    $self->_log(0, "done.");

    # optionally remove the (harvested) input file once it is consumed
    unlink $self->{FILE} if $self->{UNLINK};
    return $output;
}

=head2 _url_unescape

--internal-- unescapes URL escapes. 
It doesn't unescape control characters (00-1F)
which is important in order to avoid conflicts  

=cut 

sub _url_unescape {
    my ($self, $text) = @_ ;

    # work on the raw utf-8 octets so pack() builds single bytes
    $text = encode_utf8($text);

    # decode %XX escapes, deliberately leaving control characters
    # (%00-%1F) escaped to avoid clashing with separators
    $text =~ s/%([a-fA-F2-9][a-fA-F0-9])/pack 'C', hex $1/eg;
    # second pass catches double-escaped space, dot and dash
    # (%25xx became %xx above and is resolved here)
    $text =~ s/%(20|2e|2d)/pack 'C', hex $1/eg;
    # drop escaped CR / LF (lower-case escapes only, as harvested)
    #FIXME something else should happen with those, convenience for now
    $text =~ s/%0[da]//g;

    # re-interpret the octets as a utf-8 character string
    return decode_utf8($text);
} 

=head2 _log

--internal-- logging method

=cut 

sub _log {
    my ($self, $level, $msg) = @_;
    # drop messages above the configured verbosity
    return unless $level <= $self->{DEBUG};

    open my $LOGFILE, '>>', $self->{LOG}
        or die "Couldn't open logfile: $!";

    # capture the calling sub's bare name into $1
    # NOTE(review): if the match fails, $1 keeps its value from an earlier
    # match anywhere in the program, so a stale name could be logged --
    # consider capturing into a lexical instead
    ( caller 1 )[3] =~ /::(\w+)$/ if ( caller 1 )[3];
    print $LOGFILE time . " ".($1?$1:caller).": $msg \n" ;

    # callers rely on _log returning true (the close() result) for the
    # 'open ... or _log(...) and return' idiom
    close $LOGFILE;
}  

=head2 DESTROY

This is the destructor.

=cut 

sub DESTROY {
    # explicit destructor; currently a no-op (no resources are held
    # open between method calls)
    my ($self) = @_;
}

=head1 AUTHOR

Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>

=head1 BUGS

Please report any bugs or feature requests to L<bugs.loncapa.org>.


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Metadata::Clean

You can also look for information at:

=over 4

=item * LON-CAPA: LON-CAPA Project Website

L<http://www.loncapa.org>

=back

=head1 LICENSE AND COPYRIGHT

Copyright 2010 Stefan Droeschler.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Metadata::Clean

Index: modules/droeschl/Metadata/lib/Metadata/Harvest.pm
+++ modules/droeschl/Metadata/lib/Metadata/Harvest.pm
package Metadata::Harvest;

use warnings;
use strict;

use lib '/home/httpd/lib/perl'; 
use LONCAPA::Configuration;
use Apache::lonnet;
use File::ReadBackwards;
use Tie::File;
use Data::Dumper;

=head1 NAME

Metadata::Harvest - harvests the LON-CAPA network's metadata

=head1 VERSION

Version 1.00

=cut

our $VERSION = '1.00';

#TODO add methods to add/remove library servers

=head1 SYNOPSIS


    use Metadata::Harvest;

    my $h = Metadata::Harvest->new();
    my $datafile = $h->harvest();

    --or--

    my $datafile = Metadata::Harvest->new()->harvest();

    ...

=head1 SUBROUTINES/METHODS

=head2 new

This is the constructor. 

  my $h = Metadata::Harvest->new();
  my $h = Metadata::Harvest->new(debug => 1);
  my $h = Metadata::Harvest->new({debug => 1});

=over

=item debug

The debug option sets the debug level. Default is 0.

=over

 0: general messages are printed.
 1: prints messages from _file_ok if an error occurred.
 2: prints messages from _file_ok while waiting on datafiles.

=back


=item timeout

Timeout corresponds to the number of seconds between query was send
and datafile is received (over one query cycle).
Servers not responding within this timeout are considered 'dead'.
Default is 119.

=item dss

The dss option (datasetsize) determines how many records are requested. 
Default is 200.

=item percycle

Sets the number of servers that are queried in each cycle.
Default is 5.

=item once

If once is set to a true value, each server will only be queried once. 
(for testing purposes)

=item tmp

The tmp option determines the tmp directory to use. 
Default is LON-CAPA's tmp directory.

=item out

The out option determines the output directory. 
By default it is set to the tmp directory.

=item log

Sets logfile. Default /home/httpd/perl/logs/harvest.log

=back

=cut

#TODO provide reasonable values for dss and timeout
#TODO add possibility to pass servers
sub new {
    my $class  = shift;
    my $config = shift;

    # support both new(key => val) and new({key => val}) calling styles
    $config = { $config, @_ } if @_ and ref($config) ne 'HASH';

    my %lcconf = %{LONCAPA::Configuration::read_conf('loncapa.conf')};

    # harvest parameters, with defaults
    my $self = bless {
        DEBUG    => $config->{debug}    || 0,
        TOINIT   => $config->{timeout}  || 119, # change if 0 is wanted
        DSS      => $config->{dss}      || 200,
        ONCE     => $config->{once}     || 0,
        PERCYCLE => $config->{percycle} || 5,
    }, $class;

    # file locations (OUT falls back to TMP)
    $self->{TMP} = $config->{tmp} || $lcconf{lonDaemons} . '/tmp/';
    $self->{LOG} = $config->{log} || $lcconf{lonDaemons} . '/logs/harvest.log';
    $self->{OUT} = $config->{out} || $self->{TMP};

    #$self->_log(5,"Debug: $self->{DEBUG}, timeout: $self->{TIMEOUT}, dss: $self->{DSS}");
    return $self;
}

=head2 harvest

Harvests all library servers. 
Returns filename to merged datafile.

=cut

sub harvest {
    my ($self) = @_;
    
    my $libraries = _load_libraries();
    $self->_log(0, scalar @$libraries  . " library servers found.");
    
    # query the servers in batches of PERCYCLE until the list is consumed
    my @files;
    while(@$libraries){
        push @files, $self->_query_libraries(  ( splice @$libraries, 0, $self->{PERCYCLE} ) ); 
    }
    
    $self->_log(5, "Received files:" . Dumper(\@files));
    $self->_log(0, "Harvest done.");
    # concatenate the per-server files and return the merged filename
    return $self->_merge(@files);
}             

=head2 _load_libraries

--internal-- used to load library servers
Returns host IDs of library servers.

=cut

sub _load_libraries{
    # domains on the same physical server share the same hostname
    # this removes duplicate entries
    # (reverse flips host=>domain pairs so identical hostnames collapse
    # into one hash key; which domain survives is arbitrary)

    # lonnet.pm >= 1.1062 offers unique_library():
    # return [ keys  Apache::lonnet::unique_library() ]

    my %libraries = reverse Apache::lonnet::all_library();
    return [ values %libraries ];
}

=head2 _query_libraries

--internal-- used to query library servers.
Discards empty files and servers that caused an error.
Returns an array containing filenames of received datafiles.

=cut

sub _query_libraries{
    my ($self, @libs) = @_;

    my @files  = ();
    my $offset = -$self->{DSS};   # first loop pass brings this to 0

    # dead servers are delete()d below, leaving undef holes in @libs;
    # the grep compacts the list and the loop ends when it is empty
    while( @libs = grep { defined } @libs ) {
        $offset  += $self->{DSS};

        $self->{TIMEOUT} = $self->{TOINIT}; # reset timeout

        # ask every remaining server for the next window of DSS records
        my $reply = $self->_query($offset, \@libs, $self->{DSS});

        my $oklines = 0;
        foreach my $l (0 .. $#libs){

            $self->_log(0, "$libs[$l] returned $$reply{$libs[$l]}");

            if (   $$reply{$libs[$l]} =~ /^(con_lost|refused|rejected|no_such_host|error)/
                or not $self->_file_ok($self->{TMP} . $$reply{$libs[$l]}) )
            {
                # this one is dead -> no reason to go on
                # NOTE(review): 'and next' only fires because delete
                # returns the removed hostname (true); would break for a
                # hostname of '' or '0'
                delete $libs[$l] and next;  
            } 
            
            if ( $self->_file_corrupt($self->{TMP} . $$reply{$libs[$l]}) )
            {
                # take what's good, and...
                $oklines += $self->_file_repair($self->{TMP} . $$reply{$libs[$l]});
                push @files, $$reply{$libs[$l]};

                # ...retry to get the rest; redo re-runs the loop body
                # for the same $l with the fresh single-server reply
                my $r = $self->_query($offset+$oklines, [ $libs[$l] ], $self->{DSS}-$oklines);
                $$reply{$libs[$l]} = $$r{$libs[$l]};

                $self->{TIMEOUT} = $self->{TOINIT}; # reset timeout

                redo;
            } 

            # else: received a good file

            $oklines = 0; # reset necessary

            push @files, $$reply{$libs[$l]};
        }
        
        # in testing mode every server is queried exactly once
        undef @libs if $self->{ONCE};
    }
                                     
    return @files;
} 

=head2 _query

--internal-- wrapper for Apache::lonnet::metadata_query

=cut

sub _query{
    my ($self, $offset, $libs, $dss) = @_;

    # request one window of records; a non-positive dss means "everything"
    my $query = "SELECT * FROM  metadata";
    $query .= " LIMIT $offset, $dss" if $dss > 0;

    $self->_log(1, "Offset: $offset, dss: $dss");

    return Apache::lonnet::metadata_query($query,undef,undef,$libs);
}

=head2 _file_ok

--internal-- checks datafiles for existence and size

=cut

sub _file_ok{
    my ($self, $file) = @_;

    # poll (1s granularity) until the remote server drops the "$file.end"
    # marker; $self->{TIMEOUT} is shared state counted down across all
    # _file_ok calls within one query cycle
    while (! -e "$file.end" && $self->{TIMEOUT}) {                     # print every 20 sec
        $self->_log(2, "$self->{TIMEOUT} seconds left for $file.end.") unless $self->{TIMEOUT} % 20; 
        $self->{TIMEOUT}--; 
        sleep(1);   # zzZZZ
    }

    if( not $self->{TIMEOUT} ) {
        # ran out of time -> caller treats the server as dead
        $self->_log(1, "ERROR: Couldn't find $file.end within $self->{TOINIT} seconds.");
        return;
    }elsif ( -z $file ) {
        # zero-byte datafile -> nothing usable, discard file and marker
        $self->_log(1, "Empty file received.");
        unlink $file, $file . '.end';
        return;
    }

    return 1; 
} 

=head2 _file_corrupt

--internal-- checks datafiles for exit at the end.
This might occur when the remote server runs into a timeout.
Returns:

=over

    0: file not corrupt.
    1: file contains /exit$/ -> corrupt
    2: file not readable

=back

=cut

#TODO file might be corrupt w/o 'exit' at the end, 
#     need a smart way to check for this
sub _file_corrupt{
    my ($self, $file) = @_;

    my $f = File::ReadBackwards->new($file) 
        or $self->_log(0, "Couldn't read file $file:$!") and return 2;

    # only the last line needs inspecting: a timed-out remote run leaves
    # a trailing 'exit'. readline is undef for an empty file.
    my $last = $f->readline;
    # BUGFIX: close the handle on every path -- the original leaked the
    # File::ReadBackwards handle when 'exit' was found (early return 1)
    $f->close;

    if ( defined $last and $last =~ /exit$/ ){
        $self->_log(0, "'exit' was found in $file.");
        return 1;
    }
    return 0; 
} 

=head2 _file_repair

--internal-- removes last line of corrupt files.
Returns number of lines left
see: Perl Cookbook, 8.10

=cut

sub _file_repair{
    my ($self, $file) = @_;

    tie my @lines, "Tie::File", $file or $self->_log(0, "Cannot read file $file: $!") and return;

    # BUGFIX: guard against an empty file (undef-warning on $lines[-1])
    # and only log "Repaired" when a line was actually removed; pop is
    # the idiomatic way to drop the last element of a tied array
    if ( @lines and $lines[-1] =~ /exit$/ ){
        pop @lines;
        $self->_log(0, "Repaired $file.");
    }

    # remember the count, then untie to flush and release the file
    my $left = scalar @lines;
    untie @lines;
    return $left;
} 

=head2 _merge

--internal-- merge files and delete source files.

=cut 

sub _merge { 
    my ($self, @files) = @_;

    # concatenate all received datafiles into one merge file; source
    # files (and their .end markers) are removed afterwards
    my $out =  $self->{OUT} . 'merge.dat';
    # BUGFIX: bail out if the output cannot be opened -- the original
    # logged the error and then kept printing to an invalid handle
    open my $MERGE, '>', $out
        or $self->_log(0, "Cannot merge: $!") and return;
    
    for my $f (@files){
        # BUGFIX: report $f (the original interpolated $_, which is not
        # set here) and skip the file instead of reading an unopened handle
        open my $F, '<', $self->{TMP}  . $f
            or $self->_log(0, "Cannot open $f: $!") and next;

        # normalize line endings while copying
        while(<$F>){ chomp; print $MERGE "$_\n"; }

        close $F;

        unlink $self->{TMP} . $f, $self->{TMP} . $f . '.end'; # clean up
    }

    close $MERGE;

    return $out;
}

=head2 _log

--internal-- logging method

=cut 

sub _log {
    my ($self, $level, $msg) = @_;
    # drop messages above the configured verbosity
    return unless $level <= $self->{DEBUG};

    open my $LOGFILE, '>>', $self->{LOG}
        or die "Couldn't open logfile: $!";

    # capture the calling sub's bare name into $1
    # NOTE(review): if the match fails, $1 keeps its value from an earlier
    # match anywhere in the program, so a stale name could be logged
    ( caller 1 )[3] =~ /::(\w+)$/ if ( caller 1 )[3];
    print $LOGFILE time . " ".($1?$1:caller).": $msg \n" ; 
    # callers rely on _log returning true (the close() result) for the
    # 'open ... or _log(...) and return' idiom
    close $LOGFILE;
}

=head2 DESTROY

This is the destructor.

=cut 

sub DESTROY {
    # explicit destructor; currently a no-op (no resources are held
    # open between method calls)
    my ($self) = @_;
}

=head1 AUTHOR

Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>

=head1 BUGS

Please report any bugs or feature requests to L<bugs.loncapa.org>.


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Metadata::Harvest

You can also look for information at:

=over 4

=item * LON-CAPA: LON-CAPA Project Website

L<http://www.loncapa.org>

=back

=head1 LICENSE AND COPYRIGHT

Copyright 2010 Stefan Droeschler.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Metadata::Harvest

Index: modules/droeschl/Metadata/lib/Metadata/Load.pm
+++ modules/droeschl/Metadata/lib/Metadata/Load.pm
package Metadata::Load;

use warnings;
use strict;

use lib '/home/httpd/lib/perl'; 
use Encode qw(decode encode_utf8 decode_utf8);
use Data::Dumper;
use DBI;
eval "use LONCAPA::Configuration";

=head1 NAME

Metadata::Load - Loads cleaned metadata into database.

=head1 VERSION

Version 1.00

=cut

our $VERSION = '1.00';


=head1 SYNOPSIS

    use Metadata::Load;

    my $c = Metadata::Load->new(file=>$filename);


=head1 SUBROUTINES/METHODS

=head2 new

This is the constructor. 

  my $h = Metadata::Load->new(file=>$filename);

=over

=item file (mandatory)

Filename of the file containing cleaned data. 
Most probably output of Metadata::Clean::clean()

=item intodb

Automatically insert data into database. 
Default is currently 0 for testing.

=item remove

If set to a true value $file will be deleted after processing.  

=item debug

The debug option sets the debug level. Default is 0.

=item log

Sets logfile. Default /home/httpd/perl/logs/load.log

=back

=cut
#TODO remove code for local execution
sub new {
    my $class  = shift;
    my $config = shift;

    my $self = bless {}, $class;

    # arguments may be passed as new({key=>val}) or new(key=>val)
    $config = { $config, @_ } if ref($config) ne 'HASH' and @_;
    die "File $config->{file} does not exist" unless -e $config->{file}; 

    # LON-CAPA config is optional (the module is loaded via eval at the
    # top of the file); fall back to the current directory without it
    my %lcconf;
    if (defined &LONCAPA::Configuration::read_conf){
       %lcconf = %{LONCAPA::Configuration::read_conf('loncapa.conf')};
    }

    $self->{FILE}     = $config->{file}; 
    $self->{DEBUG}    = $config->{debug}    || 0  ;
    $self->{UNLINK}   = $config->{remove}   || 0  ;
    $self->{INTODB}   = $config->{intodb}   || 0  ;
    # BUGFIX: '||' binds tighter than '?:', so the unparenthesized form
    # "$config->{log} || %lcconf ? A : B" evaluated as
    # "($config->{log} || %lcconf) ? A : B" and discarded caller-supplied
    # log/out values. Parentheses restore the intended fallback chain.
    $self->{LOG}      = $config->{log} || (%lcconf ? $lcconf{lonDaemons} . '/logs/load.log' : 'load.log');
    $self->{OUT}      = $config->{out} || (%lcconf ? $lcconf{lonDaemons} . '/tmp/' : '');

    return $self;
}

=head2 load

Maps cleaned metadata input to output for database. 
Currently it has an intermediate step in which it saves 
output data to files that are loaded into database afterwards.

See Metadata::Clean for more information about how to 'clean' files.

=cut 
#TODO needs some descriptive text
#TODO refactor
#TODO improve logging
#TODO use _trim
sub load{
       # Reads the cleaned, tab-separated input file ($self->{FILE}) line by
       # line, applies per-field transformation subs (%map below) and writes
       # one tab-separated .dat file per "outsourced" table plus resource.dat
       # into $self->{OUT}. Optionally bulk-loads the files via _load_in_db.
       # Returns the output directory ($self->{OUT}).
       my ($self) = @_; 

       # INPUT fields, see db:loncapa table:metadata
       # Order matters: it must match the column order of the input rows.
       my @fields = qw(title             author              subject         url                 keywords
                       version           note                abstract        mime                language
                       creationdate      lastrevisiondate    owner           copyright           domain
                       dependencies      modifyinguser       authorspace     lowestgradelevel    highestgradelevel
                       standards         count               course          course_list         goto          
                       goto_list         comefrom            comefrom_list   sequsage            sequsage_list
                       stdno             stdno_list          avetries        avetries_list       difficulty
                       difficulty_list   disc                disc_list       clear               technical
                       correct           helpful             depth           ); 

       # TRANSFORMATION functions 
       # they map input to output
       # $_[0] : ref to current field (value is replaced in place)
       # $_[1] : ref to entire hash
       # A field with a map sub ends up in its own table/file;
       # a field without one stays a column of resource.dat.
       #TODO describe output

       # split at ,
       my %map = ();                 
       for (qw( copyright  course_list ))
       {
           $map{ $_ } = sub { return unless ${$_[0]} =~ /,/;   ${$_[0]} = [ split /,/, ${$_[0]} ]};
       }

       # split at , and also count occurrences of each (rounded) value;
       # result is a list of [value, count] pairs
       for (qw( avetries_list  difficulty_list  disc_list ))
       {
           $map{ $_ } = sub {  return unless ${$_[0]} =~ /,/;
                                        # 0 + to get rid of -0.000
                               my %c; $c{ 0+ sprintf("%.3f", $_) }++ for grep { not /^(\s*|nan)$/ } split /,/, ${$_[0]};
                               my @a; push @a, [ $_, $c{$_} ] for keys %c;
                               ${$_[0]} = \@a;
                            };
       }

       # similar to above but these contain URLs which can contain ,
       # and are delimited by ,
       # so until this is fixed in the source database on the remote server
       # this code has to stay in. it doesn't solve all problems, but many
       #TODO also for dependencies
       for  (qw( goto_list  comefrom_list  sequsage_list))
       {
           $map{ $_ } = sub {  return unless ${$_[0]} =~ /,/;
                               my @split = split /,/, ${$_[0]}; 

                               # NOTE(review): re-joins fragments that were split on a comma
                               # embedded in a URL: anything not looking like a valid path
                               # start (domain/user/ or adm/, ext/, sarasota/) is glued back
                               # onto its predecessor. Loops until all elements look valid.
                               while ( grep { not m{^(([\w@.-]+/){2}|adm/|ext/|sarasota/)} } @split )
                               {
                                   for (0..$#split){                                      #no idea?!
                                       next if not $split[$_] or $split[$_] =~ m{^(([\w@.-]+/){2}|adm/|ext/|sarasota/)};
                                       delete $split[$_] and next if $split[$_] eq 'Chemistry B (2ndSEM)'; #no idea ?! -> remove it

                                       $split[ $_ - 1 ] .= ',' . $split[$_]; # it's part of the prev. element
                                       delete $split[$_];                                       
                                   }
                                   @split = grep {defined} @split;
                               }
                               ${$_[0]} = \@split ;
                           };
       } 

       # keywords are delimited by comma and/or whitespace
       $map{keywords}      = sub { ${$_[0]} = [ split /[,\s]/, ${$_[0]} ]};
       # user:domain (or user@domain) -> { user => ..., domain => ... }
       $map{modifyinguser} = sub { my @split = split /[:@]/, ${$_[0]}; ${$_[0]} = @split ? { user=>$split[0], domain=>$split[1] } : undef };
       $map{authorspace}   = sub { my @split = split /[:@]/, ${$_[0]}; ${$_[0]} = @split ? { user=>$split[0], domain=>$split[1] } : undef };
       # dropped fields: mapped to undef so nothing is written for them
       $map{version}       = sub { undef ${$_[0]} };
       $map{stdno_list}    = sub { undef ${$_[0]} };

       # synthesized field: collects the non-empty evaluation columns
       # of the current row into one hash (written as one table row)
       $map{evaluation}    = sub { ${$_[0]} = { map {$_ =>$_[1]{$_}} grep { $_[1]{$_} } qw(clear technical correct helpful depth) } };

       #FIXME reactivate to save some storage space, but 
       #      verify first
       #  $map{clear}         = sub { undef ${$_[0]} };
       #  $map{technical}     = sub { undef ${$_[0]} };
       #  $map{correct}       = sub { undef ${$_[0]} };
       #  $map{helpful}       = sub { undef ${$_[0]} };
       #  $map{depth}         = sub { undef ${$_[0]} };

       my @addfields = qw(evaluation);  # additional fields, each of them will 
                                        # end up in an own table (if sub provided)
       open my $res, '>', $self->{OUT} . "resource.dat" or die $!;  # the resource.dat is kinda special,  
       binmode $res, ':encoding(utf8)';                             # so we open it first                 

       unless ($self->{INTODB}){
           print $res "key\t";  # print header line - convenience
           print $res "$_\t" for grep { ! $map{$_} } @fields;
           print $res "\n";
       }

       my %handles;                                   # pre-open all necessary filehandles      
       for (@addfields, @fields){                     # use them with                           
           next unless ( ref $map{$_} eq 'CODE' );    # print {$handles{$field}} "super duper"; 

           open $handles{$_}, '>', $self->{OUT} . "$_.dat" or die $!;
           binmode $handles{$_}, ':encoding(utf8)';   # !
       }
       
       # input file
       open my $IN, '<', $self->{FILE} or $self->_log(0, "Cannot read $self->{FILE}:$!") and die $!; 
       binmode $IN, ':encoding(utf8)'; # !

       my $line = 0;
       while(<$IN>) {
           $line++;   # primary key for resource (and other)
           s/\\N//g;  # strip MySQL NULL markers (\N) - confuses perl sometimes

           my %data;  # split input row into data hash keyed by field name
           @data{ (@fields) } = split /\t/, $_;
           for my $field (@addfields,@fields){

               # if there's a mapping function for this field, invoke it
               # $data{$_} will be replaced within mapping function (most likely array ref)

               if (ref $map{$field} eq 'CODE'){           # outsourced table

                   # do the mapping input -> output
                   $map{ $field }->( \$data{$field}, \%data );

                   # OUTPUT
                   if ( ref $data{$field} eq 'ARRAY') {   # these end up as: 
                                                          # primkey \t value \n primkey \t value ...
                       for my $val (@{ $data{$field}}) {
                           if ( ref $val eq 'ARRAY' ){
                               # nested pair, e.g. [value, count] -> primkey \t value \t count
                               print {$handles{$field}} $line;
                               print {$handles{$field}} "\t$_" for @$val;
                               print {$handles{$field}} "\n";
                           }else{
                               print {$handles{$field}} $line . "\t" . $val . "\n";
                           }
                       }
                           # these are more complicated: primkey \t val1 \t val2 ... \n ...
                   } elsif (ref $data{$field} eq 'HASH' and (keys %{ $data{$field}}) > 0) {

                       print {$handles{$field}} $line ;

                       # sorted keys give a deterministic column order
                       for my $key (sort keys %{ $data{$field}}){
                           print {$handles{$field}} "\t" . $data{$field}->{$key};
                       } 

                       print {$handles{$field}} "\n";
                   }
               } else {                                  # one of the resource attributes

                   # use line number as primary key
                   print $res $line . "\t" if $field eq $fields[0];

                   print $res ( $data{$field} ? $data{$field} : '\N' );  # \N: MySQL NULL

                   print $res ( $field eq $fields[-1] ? "\n" : "\t" );   # last field?
               }   
           }
       }
       close $IN;
       close $handles{$_} for grep {defined $handles{$_} } (@addfields, @fields);
       close $res;

       for (@addfields, @fields){ # remove empty files               
           next unless ( ref $map{$_} eq 'CODE' );
           unlink $self->{OUT} . "$_.dat" if -z $self->{OUT} . "$_.dat";
       }

    $self->_load_in_db("resource", @addfields, @fields) if $self->{INTODB};
    unlink $self->{FILE} if $self->{UNLINK};
    return $self->{OUT};
}

=head2 _load_in_db

--internal-- loads data into database
Files need to be accessible by mysql.

=cut 
#TODO verify everything went ok
#TODO create tables if necessary
#TODO verify table structure if tables present
#TODO load config from loncapa config
sub _load_in_db {
    my ($self, @files) = @_;

    #TODO :----config: should be fetched from central loncapa config later
    my $db_source = 'dbi:mysql:';
    my $db_db     = 'metadata';
    my $db_user   = 'root';
    my $db_pass   = '';

    # RaiseError => 1 makes every failed prepare/do die with the DBI error,
    # so the former '... or print "ALARM!!"' fallbacks were dead code and
    # have been removed.
    my $dbh = DBI->connect($db_source.$db_db, $db_user, $db_pass, { RaiseError => 1, })
                        or die "Can't connect to $db_source: $DBI::errstr";

    my %opt = ( keywords      => 'IGNORE',       # tables where IGNORE or REPLACE
                goto_list     => 'IGNORE',       # is necessary
                comefrom_list => 'IGNORE',
              );
    for my $table (@files){
        my $file = $self->{OUT} . $table . ".dat";
        next unless -e $file;    # load() already unlinked empty files

        # was: print "processing: ..." (FIXME only debug) -> use the logger
        $self->_log(1, "loading $file into table $table");

        # Start from an empty table. $table comes from our own field list,
        # never from user input, so interpolating it here is safe.
        $dbh->do("truncate table $table");

        my $opt = $opt{$table} || '';
        $dbh->do("LOAD DATA INFILE '$file' $opt INTO TABLE $table "
                ."CHARACTER SET UTF8 FIELDS TERMINATED BY '\t' "
                ."LINES TERMINATED BY '\n';");
    }

    $dbh->disconnect;

}

=head2 _trim

--internal-- Trims whitespace at the beginning/end of a string.
Usage:
    _trim;                trims $_ inplace
    $new = _trim;         trims (and returns) a copy of $_
    _trim $str;           trims $str inplace
    $new = _trim $str;    trims (and returns) a copy of $str
    _trim @list;          trims @list inplace
    @new = _trim @list;   trims (and returns) a copy of @list

    source: http://www.perlmonks.org/?node_id=36684
=cut
#TODO integrate
sub _trim {
  # Called with no args but a return value wanted: work on a copy of $_.
  @_ = $_ if not @_ and defined wantarray;
  # Return value wanted: reassigning @_ to itself breaks the aliasing to
  # the caller's variables, so the originals stay untouched (copy mode).
  @_ = @_ if defined wantarray;
  # In void context @_ still aliases the caller's arguments,
  # so the substitution trims them inplace.
  for (@_ ? @_ : $_) { s/^\s+//, s/\s+$// }
  return wantarray ? @_ : $_[0] if defined wantarray;
}

=head2 _log

--internal-- logging method

=cut 

sub _log {
    # Appends "<epoch> <caller-sub>: <msg>" to $self->{LOG} when $level
    # does not exceed the configured debug level.
    my ($self, $level, $msg) = @_;
    return unless $level <= $self->{DEBUG};

    open my $LOGFILE, '>>', $self->{LOG}
        or die "Couldn't open logfile: $!";

    # BUGFIX: the old code matched into $1 only conditionally and then read
    # $1 unconditionally - if the match never ran (or failed), $1 could hold
    # a stale capture from an earlier, unrelated regex. Capture explicitly.
    my $caller_sub = ( caller 1 )[3];                          # e.g. Metadata::Load::load
    my $who = ( defined $caller_sub and $caller_sub =~ /::(\w+)$/ )
            ? $1                                               # short sub name
            : scalar caller;                                   # fallback: calling package
    print $LOGFILE time . " $who: $msg \n";

    close $LOGFILE;
}

=head2 DESTROY

This is the destructor.

=cut 

sub DESTROY {
    # Nothing to release explicitly: all filehandles used by this class are
    # lexically scoped and close themselves when they go out of scope.
    my $self = shift;
    return;
}

=head1 AUTHOR

Stefan Droeschler, C<< <st.droeschler at ostfalia.de> >>

=head1 BUGS

Please report any bugs or feature requests to L<bugs.loncapa.org>.


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Metadata::Load

You can also look for information at:

=over 4

=item * LON-CAPA: LON-CAPA Project Website

L<http://www.loncapa.org>

=back

=head1 LICENSE AND COPYRIGHT

Copyright 2010 Stefan Droeschler.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Metadata::Load

Index: modules/droeschl/Metadata/t/misc.t
+++ modules/droeschl/Metadata/t/misc.t

Index: modules/droeschl/Metadata/t/misc.t
+++ modules/droeschl/Metadata/t/misc.t

Index: modules/droeschl/Metadata/t/clean.test
+++ modules/droeschl/Metadata/t/clean.test
add content

Index: modules/droeschl/Metadata/t/corrupt.test
+++ modules/droeschl/Metadata/t/corrupt.test
content doesn't matter
content doesn't matter
content doesn't matterexit

Index: modules/droeschl/Metadata/t/notcorrupt.test
+++ modules/droeschl/Metadata/t/notcorrupt.test
content doesn't matter
content doesn't matter
content doesn't matter

Index: modules/droeschl/Metadata/t/pod-coverage.t
+++ modules/droeschl/Metadata/t/pod-coverage.t
use strict;
use warnings;
use Test::More;

# Author test: verifies that every public subroutine in the distribution's
# modules is documented in POD. Skips (rather than fails) when the required
# testing modules are not installed.

# Ensure a recent version of Test::Pod::Coverage
my $min_tpc = 1.08;
eval "use Test::Pod::Coverage $min_tpc";
plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage"
    if $@;

# Test::Pod::Coverage doesn't require a minimum Pod::Coverage version,
# but older versions don't recognize some common documentation styles
my $min_pc = 0.18;
eval "use Pod::Coverage $min_pc";
plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage"
    if $@;

# One test per module found under blib/ (or lib/).
all_pod_coverage_ok();

Index: modules/droeschl/Metadata/t/pod.t
+++ modules/droeschl/Metadata/t/pod.t
#!perl -T

use strict;
use warnings;
use Test::More;

# Author test: checks that the POD in every file of the distribution is
# syntactically valid. Skips when Test::Pod is not installed.

# Ensure a recent version of Test::Pod
my $min_tp = 1.22;
eval "use Test::Pod $min_tp";
plan skip_all => "Test::Pod $min_tp required for testing POD" if $@;

all_pod_files_ok();

Index: modules/droeschl/Metadata/t/useforloadtest.test
+++ modules/droeschl/Metadata/t/useforloadtest.test
\N	\N	\N	/res/MIT/RELATE/MAPS_course/System_Constituents/	\N	current	\N	\N	\N	notset 	0000-00-00 00:00:00	0000-00-00 00:00:00	\N	public	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	

--droeschl1281992066--