[LON-CAPA-cvs] cvs: loncom /metadata_database searchcat.pl
harris41
lon-capa-cvs@mail.lon-capa.org
Mon, 03 Feb 2003 05:39:37 -0000
This is a MIME encoded message
--harris411044250777
Content-Type: text/plain
harris41 Mon Feb 3 00:39:37 2003 EDT
Modified files:
/loncom/metadata_database searchcat.pl
Log:
beautified; BUG 1215 FIXED: documentation that was horrible is now non-horrible,
and it now exists in every place that it should; BUG 1216 FIXED: stale records
that are not accounted for on the system are now deleted; BUG 1217 FIXED: the
loncapa:metadata table can now change structure, and this script will verify
the table structure and regenerate it if necessary; BUG 1218 PARTIALLY FIXED
(a small, solid step toward The Great Metadata Overhaul); also using File::Find
now instead of find.pl.
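
For context on BUG 1216 and BUG 1217, here is a minimal Perl/DBI sketch (not
the committed code) of the two maintenance ideas described above: detecting
table-structure drift by comparing the live table against a throw-away
CONTROL table built from the same CREATE statement, and mark-and-sweep
cleanup of stale rows via the utilitysemaphore column.  The connection
values, the shortened column list, and the $describe helper are illustrative
assumptions; the real script reads its settings from loncapa.conf and
creates the full metadata column set.

    #!/usr/bin/perl
    use strict;
    use warnings;
    use DBI;

    # Hypothetical credentials for the sketch; searchcat.pl takes these
    # from loncapa.conf (user 'www', password in lonSqlAccess).
    my $dbh = DBI->connect('DBI:mysql:loncapa', 'www', 'SECRET',
                           { RaiseError => 1, PrintError => 0 });

    # Shortened column list; TYPE=MYISAM matches the 2003-era MySQL syntax
    # used in the patch (modern MySQL expects ENGINE=MyISAM).
    my $create = 'CREATE TABLE IF NOT EXISTS metadata ('.
                 'title TEXT, url TEXT, utilitysemaphore BOOL) TYPE=MYISAM';

    # BUG 1217 idea: build a control table from the same CREATE statement,
    # DESCRIBE both, and rebuild the real table only if they differ.
    (my $create_control = $create) =~
        s/EXISTS metadata/EXISTS CONTROL_metadata/;
    $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
    $dbh->do($create);
    $dbh->do($create_control);

    # Flatten a DESCRIBE result into one comparable string.
    my $describe = sub {
        my ($table) = @_;
        return join '',
               map { join(',', map { defined($_) ? $_ : 'NULL' } @$_)."\n" }
               @{ $dbh->selectall_arrayref('DESCRIBE '.$table) };
    };
    if ($describe->('metadata') ne $describe->('CONTROL_metadata')) {
        $dbh->do('DROP TABLE IF EXISTS metadata');   # structure drifted
        $dbh->do($create);                           # regenerate it
    }
    $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');

    # BUG 1216 idea: mark-and-sweep with utilitysemaphore.  Zero the flag
    # everywhere, refresh rows (flag = 1) for every .meta file still on
    # disk, then delete whatever was never touched.
    $dbh->do('UPDATE metadata SET utilitysemaphore = 0');
    # ... per-file refresh of metadata rows goes here ...
    $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');

    $dbh->disconnect();
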
--harris411044250777
Content-Type: text/plain
Content-Disposition: attachment; filename="harris41-20030203003937.txt"
Index: loncom/metadata_database/searchcat.pl
diff -u loncom/metadata_database/searchcat.pl:1.27 loncom/metadata_database/searchcat.pl:1.28
--- loncom/metadata_database/searchcat.pl:1.27 Sat Jan 4 14:23:31 2003
+++ loncom/metadata_database/searchcat.pl Mon Feb 3 00:39:37 2003
@@ -2,29 +2,30 @@
# The LearningOnline Network
# searchcat.pl "Search Catalog" batch script
#
-# $Id: searchcat.pl,v 1.27 2003/01/04 19:23:31 www Exp $
+# $Id: searchcat.pl,v 1.28 2003/02/03 05:39:37 harris41 Exp $
#
# Copyright Michigan State University Board of Trustees
#
-# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
+# This file is part of the LearningOnline Network with a
+# Computer assisted personalized approach (loncapa).
#
-# LON-CAPA is free software; you can redistribute it and/or modify
+# Loncapa is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
-# LON-CAPA is distributed in the hope that it will be useful,
+# Loncapa is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with LON-CAPA; if not, write to the Free Software
+# along with loncapa; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# /home/httpd/html/adm/gpl.txt
#
-# http://www.lon-capa.org/
+# http://www.loncapa.org/
#
# YEAR=2001
# 04/14/2001, 04/16/2001 Scott Harrison
@@ -32,11 +33,51 @@
# YEAR=2002
# 05/11/2002 Scott Harrison
#
+# YEAR=2003
+# Scott Harrison
+#
###
-# This script goes through a LON-CAPA resource
-# directory and gathers metadata.
-# The metadata is entered into a SQL database.
+=pod
+
+=head1 NAME
+
+B<searchcat.pl> - put authoritative filesystem data into the SQL database.
+
+=head1 SYNOPSIS
+
+Ordinarily this script is to be called from a loncapa cron job
+(CVS source location: F<loncapa/loncom/cron/loncapa>; typical
+filesystem installation location: F</etc/cron.d/loncapa>).
+
+Here is the cron job entry.
+
+C<# Repopulate and refresh the metadata database used for the search catalog.>
+
+C<10 1 * * 7 www /home/httpd/perl/searchcat.pl>
+
+This script only allows itself to be run as the user C<www>.
+
+=head1 DESCRIPTION
+
+This script goes through a loncapa resource directory and gathers metadata.
+The metadata is entered into a SQL database.
+
+This script also does general database maintenance such as reformatting
+the C<loncapa:metadata> table if it is deprecated.
+
+This script also builds dynamic temporal metadata and stores this inside
+a F<nohist_resevaldata.db> database file.
+
+This script is playing an increasingly important role for a loncapa
+library server. The proper operation of this script is critical for a smooth
+and correct user experience.
+
+=cut
+
+# ========================================================== Setting things up.
+
+# ------------------------------------------------------ Use external modules.
use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
@@ -47,346 +88,891 @@
use GDBM_File;
use POSIX qw(strftime mktime);
+# ----------------- Code to enable 'find' subroutine listing of the .meta files
+use File::Find;
+
+# List of .meta files (used on a per-user basis).
my @metalist;
+# --------------- Read loncapa_apache.conf and loncapa.conf and get variables.
+my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
+my %perlvar = %{$perlvarref};
+undef($perlvarref); # Remove since sensitive and not needed.
+delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.
+
+# ------------------------------------- Only run if machine is a library server
+if ($perlvar{'lonRole'} ne 'library')
+ {
+ exit(0);
+ }
+
+# ------------------------------ Make sure this process is running as user=www.
+my $wwwid = getpwnam('www');
+if ($wwwid != $<)
+ {
+ $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
+ $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
+ system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | ".
+ "mailto $emailto -s '$subj' > /dev/null");
+ exit(1);
+ }
+
+# ------------------------------------------------------ Initialize log output.
+open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
+print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n");
+
+my $dbh; # Database object reference handle.
+
+# ----------------------------- Verify connection to loncapa:metadata database.
+unless (
+ $dbh = DBI->connect('DBI:mysql:loncapa','www',
+ $perlvar{'lonSqlAccess'},
+ { RaiseError => 0,PrintError => 0})
+ )
+ {
+ print(LOG '**** ERROR **** Cannot connect to database!'."\n");
+ exit(0);
+ }
+
+# ------------------------------ Create loncapa:metadata table if non-existent.
+my $make_metadata_table = 'CREATE TABLE IF NOT EXISTS metadata ('.
+ 'title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, '.
+ 'version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, '.
+ 'creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, '.
+ 'copyright TEXT, utilitysemaphore BOOL, FULLTEXT idx_title (title), '.
+ 'FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), '.
+ 'FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), '.
+ 'FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), '.
+ 'FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), '.
+ 'FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), '.
+ 'FULLTEXT idx_copyright (copyright)) TYPE=MYISAM';
+
+$dbh->do($make_metadata_table); # Generate the table.
+
+# ----------------------------- Verify format of the loncapa:metadata database.
+# (delete and recreate database if necessary).
+
+# Make a positive control for verifying table structure.
+my $make_metadata_table_CONTROL = $make_metadata_table;
+$make_metadata_table_CONTROL =~
+ s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;
+
+$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
+$dbh->do($make_metadata_table_CONTROL);
+
+my $table_description; # selectall reference to the table description.
+
+my $CONTROL_table_string; # What the table description should look like.
+my $table_string; # What the table description does look like.
+
+# Calculate the CONTROL table description (what it should be).
+$table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
+foreach my $table_row (@{$table_description})
+ {
+ $CONTROL_table_string .= join(',',@{$table_row})."\n";
+ }
+
+# Calculate the current table description (what it currently looks like).
+$table_description = $dbh->selectall_arrayref('describe metadata');
+foreach my $table_row (@{$table_description})
+ {
+ $table_string .= join(',',@{$table_row})."\n";
+ }
+
+if ($table_string ne $CONTROL_table_string)
+ {
+ # Log this incident.
+ print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'.
+ '.'."\n");
+ # Delete the table.
+ $dbh->do('DROP TABLE IF EXISTS metadata');
+ # Generate the table.
+ $dbh->do($make_metadata_table);
+ }
-# ----------------------------------------------------- Un-Escape Special Chars
+$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay. Done with control.
-sub unescape {
- my $str=shift;
+# ----------------------------------------------- Set utilitysemaphore to zero.
+$dbh->do('UPDATE metadata SET utilitysemaphore = 0');
+
+# ========================================================= Main functionality.
+
+# - Determine home authors on this server based on resources dir and user tree.
+
+# RESOURCES: the resources directory (subdirs correspond to author usernames).
+opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or
+ (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n")
+ and exit(0));
+
+# query_home_server_status will look for user home directories on this machine.
+my @homeusers =
+ grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
+ $perlvar{'lonDefDomain'}.'/'.$_)
+ } grep {!/^\.\.?$/} readdir(RESOURCES);
+closedir(RESOURCES);
+
+unless (@homeusers)
+ {
+ print(LOG '=== No home users found on this server.'."\n");
+ }
+
+# Consider each author individually.
+foreach my $user (@homeusers)
+ {
+ # Make a log entry.
+ print(LOG "\n".'=== User: '.$user."\n\n");
+
+ # Get filesystem path to this user's directory.
+ my $user_directory =
+ &construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);
+
+ # Remove left-over db-files from a potentially crashed searchcat run.
+ unlink($user_directory.'/nohist_new_resevaldata.db');
+
+ # Cleanup the metalist array.
+ undef(@metalist);
+ @metalist = ();
+
+ # This will add entries to the @metalist array.
+ &File::Find::find(\&wanted,
+ $perlvar{'lonDocRoot'}.'/res/'.
+ $perlvar{'lonDefDomain'}.'/'.$user);
+
+ # -- process file to get metadata and put into search catalog SQL database
+ # Also, build and store dynamic metadata.
+ # Also, delete record entries before refreshing.
+ foreach my $m (@metalist)
+ {
+ # Log this action.
+ print(LOG "- ".$m."\n");
+
+ # Get metadata from the file.
+ my $ref = get_metadata_from_file($m);
+
+ # Make a datarecord identifier for this resource.
+ my $m2 = '/res/'.declutter($m);
+ $m2 =~ s/\.meta$//;
+
+ # Build and store dynamic metadata inside nohist_resevaldata.db.
+ build_on_the_fly_dynamic_metadata($m2);
+
+ # Delete record if it already exists.
+ my $q2 = 'select * from metadata where url like binary '."'".$m2."'";
+ my $sth = $dbh->prepare($q2);
+ $sth->execute();
+ my $r1 = $sth->fetchall_arrayref;
+ if (@$r1)
+ {
+ $sth =
+ $dbh->prepare('delete from metadata where url like binary '.
+ "'".$m2."'");
+ $sth->execute();
+ }
+
+ # Add new/replacement record into the loncapa:metadata table.
+ $sth = $dbh->prepare('insert into metadata values ('.
+ '"'.delete($ref->{'title'}).'"'.','.
+ '"'.delete($ref->{'author'}).'"'.','.
+ '"'.delete($ref->{'subject'}).'"'.','.
+ '"'.$m2.'"'.','.
+ '"'.delete($ref->{'keywords'}).'"'.','.
+ '"'.'current'.'"'.','.
+ '"'.delete($ref->{'notes'}).'"'.','.
+ '"'.delete($ref->{'abstract'}).'"'.','.
+ '"'.delete($ref->{'mime'}).'"'.','.
+ '"'.delete($ref->{'language'}).'"'.','.
+ '"'.sql_formatted_time(
+ delete($ref->{'creationdate'})).'"'.','.
+ '"'.sql_formatted_time(
+ delete($ref->{'lastrevisiondate'})).'"'.','.
+ '"'.delete($ref->{'owner'}).'"'.','.
+ '"'.delete($ref->{'copyright'}).'"'.','.
+ '1'.')');
+ $sth->execute();
+ }
+
+# ----------------------- Clean up database, remove stale SQL database records.
+ $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');
+
+# -------------------------------------------------- Copy over the new db-files
+ system('mv '.$user_directory.'/nohist_new_resevaldata.db '.
+ $user_directory.'/nohist_resevaldata.db');
+ }
+
+# --------------------------------------------------- Close database connection
+$dbh->disconnect;
+print LOG "\n==== Searchcat completed ".localtime()." ====\n";
+close(LOG);
+exit(0);
+
+# ================================================================ Subroutines.
+
+=pod
+
+=head1 SUBROUTINES
+
+=cut
+
+=pod
+
+B<unescape> - translate unstrange escaped syntax to strange characters.
+
+=over 4
+
+Parameters:
+
+=item I<$str> - string with unweird characters.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - string with potentially weird characters.
+
+=back
+
+=cut
+
+sub unescape ($)
+ {
+ my $str = shift(@_);
$str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
- return $str;
-}
+ return($str);
+ }
+
+=pod
+
+B<escape> - translate strange characters to unstrange escaped syntax.
+
+=over 4
+
+Parameters:
-# -------------------------------------------------------- Escape Special Chars
+=item I<$str> - string with potentially weird characters to unweird-ify.
-sub escape {
- my $str=shift;
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - unweird-ified string.
+
+=back
+
+=cut
+
+sub escape ($)
+ {
+ my $str = shift(@_);
$str =~ s/(\W)/"%".unpack('H2',$1)/eg;
- return $str;
-}
+ return($str);
+ }
+
+=pod
+
+B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata.
+
+Dynamic metadata is stored in a nohist_resevaldata GDBM database.
+Most of the calculations in this subroutine are totally pointless
+and not useful for anything that this subroutine does.
+(THIS IS A FRUSTRATED SUBROUTINE THAT IS NON-OPTIMAL, *&*&!.)
+The only thing that this subroutine really makes happen is adjusting
+a 'count' value inside the F<nohist_new_resevaldata.db> as well
+as updating F<nohist_new_resevaldata.db> with information from
+F<nohist_resevaldata.db>.
+
+=over 4
+
+Parameters:
+
+=item I<$url> - the filesystem path (url may be a misnomer...)
+
+=back
+
+=over 4
+
+Returns:
+=item C<hash> - key-value table of dynamically evaluated metadata.
-# ------------------------------------------- Code to evaluate dynamic metadata
+=back
-sub dynamicmeta {
+=cut
- my $url=&declutter(shift);
- $url=~s/\.meta$//;
- my %returnhash=();
- my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
- my $prodir=&propath($adomain,$aauthor);
+sub build_on_the_fly_dynamic_metadata ($)
+ {
+ # BEWARE ALL WHO TRY TO UNDERSTAND THIS ABSURDLY HORRIBLE SUBROUTINE.
+
+ # Do all sorts of mumbo-jumbo to compute the user's directory.
+ my $url = &declutter(shift(@_));
+ $url =~ s/\.meta$//;
+ my %returnhash = ();
+ my ($adomain,$aauthor) = ($url =~ m!^(\w+)/(\w+)/!);
+ my $user_directory = &construct_path_to_user_directory($adomain,$aauthor);
+
+ # Attempt a GDBM database instantiation inside users directory and proceed.
if ((tie(%evaldata,'GDBM_File',
- $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
+ $user_directory.
+ '/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
(tie(%newevaldata,'GDBM_File',
- $prodir.'/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) {
- my %sum=();
- my %cnt=();
- my %listitems=('count' => 'add',
- 'course' => 'add',
- 'avetries' => 'avg',
- 'stdno' => 'add',
- 'difficulty' => 'avg',
- 'clear' => 'avg',
- 'technical' => 'avg',
- 'helpful' => 'avg',
- 'correct' => 'avg',
- 'depth' => 'avg',
- 'comments' => 'app',
- 'usage' => 'cnt'
- );
- my $regexp=$url;
- $regexp=~s/(\W)/\\$1/g;
- $regexp='___'.$regexp.'___([a-z]+)$';
- foreach (keys %evaldata) {
- my $key=&unescape($_);
- if ($key=~/$regexp/) {
- my $ctype=$1;
- if (defined($cnt{$ctype})) {
- $cnt{$ctype}++;
- } else {
- $cnt{$ctype}=1;
- }
- unless ($listitems{$ctype} eq 'app') {
- if (defined($sum{$ctype})) {
- $sum{$ctype}+=$evaldata{$_};
- } else {
- $sum{$ctype}=$evaldata{$_};
- }
- } else {
- if (defined($sum{$ctype})) {
- if ($evaldata{$_}) {
- $sum{$ctype}.='<hr>'.$evaldata{$_};
- }
- } else {
- $sum{$ctype}=''.$evaldata{$_};
- }
- }
- if ($ctype ne 'count') {
- $newevaldata{$_}=$evaldata{$_};
- }
- }
+ $user_directory.
+ '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640)))
+ {
+ # For different variables, track the running sum and counts.
+ my %sum = ();
+ my %cnt = ();
+
+ # Define computed items as a sum (add) or an average (avg) or a raw
+ # count (cnt) or 'app'?
+ my %listitems=('count' => 'add',
+ 'course' => 'add',
+ 'avetries' => 'avg',
+ 'stdno' => 'add',
+ 'difficulty' => 'avg',
+ 'clear' => 'avg',
+ 'technical' => 'avg',
+ 'helpful' => 'avg',
+ 'correct' => 'avg',
+ 'depth' => 'avg',
+ 'comments' => 'app',
+ 'usage' => 'cnt'
+ );
+
+ # Untaint the url and use as part of a regular expression.
+ my $regexp = $url;
+ $regexp =~ s/(\W)/\\$1/g;
+ $regexp = '___'.$regexp.'___([a-z]+)$';
+
+ # Check existing nohist database for this url.
+ # THE ONLY TIME THIS IS IMPORTANT FOR THIS AWFUL SUBROUTINE
+ # IS FOR 'count' ENTRIES
+ # AND FOR REFRESHING non-'count' ENTRIES INSIDE nohist_new DATABASE.
+ foreach (keys %evaldata)
+ {
+ my $key = &unescape($_);
+ if ($key =~ /$regexp/) # If url-based entry exists.
+ {
+ my $ctype = $1; # Set to specific category type.
+
+ # Do an increment for this category type.
+ if (defined($cnt{$ctype}))
+ {
+ $cnt{$ctype}++;
+ }
+ else
+ {
+ $cnt{$ctype} = 1;
+ }
+ unless ($listitems{$ctype} eq 'app') # WHAT DOES 'app' MEAN?
+ {
+ # Increment the sum based on the evaluated data in the db.
+ if (defined($sum{$ctype}))
+ {
+ $sum{$ctype} += $evaldata{$_};
+ }
+ else
+ {
+ $sum{$ctype} = $evaldata{$_};
+ }
+ }
+ else # 'app' mode, means to use '<hr />' as a separator
+ {
+ if (defined($sum{$ctype}))
+ {
+ if ($evaldata{$_})
+ {
+ $sum{$ctype} .= '<hr />'.$evaldata{$_};
+ }
+ }
+ else
+ {
+ $sum{$ctype} = ''.$evaldata{$_};
+ }
+ }
+ if ($ctype ne 'count')
+ {
+ # ALERT! THIS HORRIBLE LOOP IS ACTUALLY DOING SOMETHING
+ # USEFUL!
+ $newevaldata{$_} = $evaldata{$_};
+ }
+ }
+ }
+
+ # THE ONLY OTHER TIME THIS LOOP IS USEFUL IS FOR THE 'count' HASH
+ # ELEMENT.
+ foreach (keys %cnt)
+ {
+ if ($listitems{$_} eq 'avg')
+ {
+ $returnhash{$_} = int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
+ }
+ elsif ($listitems{$_} eq 'cnt')
+ {
+ $returnhash{$_} = $cnt{$_};
+ }
+ else
+ {
+ $returnhash{$_} = $sum{$_};
+ }
+ }
+
+ # A RARE MOMENT OF DOING ANYTHING USEFUL INSIDE THIS
+ # BLEEPING SUBROUTINE.
+ if ($returnhash{'count'})
+ {
+ my $newkey = $$.'_'.time.'_searchcat___'.&escape($url).'___count';
+ $newevaldata{$newkey} = $returnhash{'count'};
+ }
+
+ untie(%evaldata); # Close/release the original nohist database.
+ untie(%newevaldata); # Close/release the new nohist database.
}
- foreach (keys %cnt) {
- if ($listitems{$_} eq 'avg') {
- $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
- } elsif ($listitems{$_} eq 'cnt') {
- $returnhash{$_}=$cnt{$_};
- } else {
- $returnhash{$_}=$sum{$_};
- }
- }
- if ($returnhash{'count'}) {
- my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';
- $newevaldata{$newkey}=$returnhash{'count'};
- }
- untie(%evaldata);
- untie(%newevaldata);
- }
- return %returnhash;
-}
-
-# ----------------- Code to enable 'find' subroutine listing of the .meta files
-require "find.pl";
-sub wanted {
+ return(%returnhash);
+ # Celebrate! We have now accomplished some simple calculations using
+ # 1000% bloated functionality in our subroutine. Go wash your eyeballs
+ # out now.
+ }
+
+=pod
+
+B<wanted> - used by B<File::Find::find> subroutine.
+
+This evaluates whether a file is wanted, and pushes it onto the
+I<@metalist> array. This subroutine was, for the most part, auto-generated
+by the B<find2perl> command.
+
+=over 4
+
+Parameters:
+
+=item I<$file> - a path to the file.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<boolean> - true or false based on logical statement.
+
+=back
+
+=cut
+
+sub wanted ($)
+ {
(($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
- -f _ &&
+ -f $_ &&
/^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
- push(@metalist,"$dir/$_");
-}
+ push(@metalist,$File::Find::dir.'/'.$_);
+ }
-# --------------- Read loncapa_apache.conf and loncapa.conf and get variables
-my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
-my %perlvar=%{$perlvarref};
-undef $perlvarref; # remove since sensitive and not needed
-delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed
+=pod
-# ------------------------------------- Only run if machine is a library server
-exit unless $perlvar{'lonRole'} eq 'library';
+B<get_metadata_from_file> - read xml-tagged file and return parsed metadata.
-# ----------------------------- Make sure this process is running from user=www
+I<Note that this is significantly altered from a subroutine present in lonnet.>
-my $wwwid=getpwnam('www');
-if ($wwwid!=$<) {
- $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
- $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
- system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
- mailto $emailto -s '$subj' > /dev/null");
- exit 1;
-}
+=over 4
+Parameters:
-# ---------------------------------------------------------- We are in business
+=item I<$file> - a path to the file.
-open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
-print LOG '==== Searchcat Run '.localtime()."====\n\n";
-my $dbh;
-# ------------------------------------- Make sure that database can be accessed
-{
- unless (
- $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})
- ) {
- print LOG "Cannot connect to database!\n";
- exit;
- }
- my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (".
- "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ".
- "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ".
- "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ".
- "copyright TEXT, FULLTEXT idx_title (title), ".
- "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ".
- "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ".
- "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ".
- "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ".
- "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ".
- "FULLTEXT idx_copyright (copyright)) TYPE=MYISAM";
- # It would sure be nice to have some logging mechanism.
- $dbh->do($make_metadata_table);
-}
+=back
-# ------------------------------------------------------------- get .meta files
-opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");
-my @homeusers=grep
- {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")}
- grep {!/^\.\.?$/} readdir(RESOURCES);
-closedir RESOURCES;
-foreach my $user (@homeusers) {
- print LOG "\n=== User: ".$user."\n\n";
-# Remove left-over db-files from potentially crashed searchcat run
- my $prodir=&propath($perlvar{'lonDefDomain'},$user);
- unlink($prodir.'/nohist_new_resevaldata.db');
-# Use find.pl
- undef @metalist;
- @metalist=();
- &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");
-
-# -- process each file to get metadata and put into search catalog SQL database
-# Also, check to see if already there.
-# I could just delete (without searching first), but this works for now.
-foreach my $m (@metalist) {
- print LOG "- ".$m."\n";
- my $ref=&metadata($m);
- my $m2='/res/'.&declutter($m);
- $m2=~s/\.meta$//;
- &dynamicmeta($m2);
- my $q2="select * from metadata where url like binary '$m2'";
- my $sth = $dbh->prepare($q2);
- $sth->execute();
- my $r1=$sth->fetchall_arrayref;
- if (@$r1) {
- $sth=$dbh->prepare("delete from metadata where url like binary '$m2'");
- $sth->execute();
- }
- $sth=$dbh->prepare('insert into metadata values ('.
- '"'.delete($ref->{'title'}).'"'.','.
- '"'.delete($ref->{'author'}).'"'.','.
- '"'.delete($ref->{'subject'}).'"'.','.
- '"'.$m2.'"'.','.
- '"'.delete($ref->{'keywords'}).'"'.','.
- '"'.'current'.'"'.','.
- '"'.delete($ref->{'notes'}).'"'.','.
- '"'.delete($ref->{'abstract'}).'"'.','.
- '"'.delete($ref->{'mime'}).'"'.','.
- '"'.delete($ref->{'language'}).'"'.','.
- '"'.sqltime(delete($ref->{'creationdate'})).'"'.','.
- '"'.sqltime(delete($ref->{'lastrevisiondate'})).'"'.','.
- '"'.delete($ref->{'owner'}).'"'.','.
- '"'.delete($ref->{'copyright'}).'"'.')');
- $sth->execute();
-}
-
-# ----------------------------------------------------------- Clean up database
-# Need to, perhaps, remove stale SQL database records.
-# ... not yet implemented
+=over 4
+Returns:
-# -------------------------------------------------- Copy over the new db-files
- system('mv '.$prodir.'/nohist_new_resevaldata.db '.
- $prodir.'/nohist_resevaldata.db');
-}
-# --------------------------------------------------- Close database connection
-$dbh->disconnect;
-print LOG "\n==== Searchcat completed ".localtime()." ====\n";
-close(LOG);
-exit 0;
-# =============================================================================
+=item C<hash reference> - a hash array (keys and values).
+
+=back
-# ---------------------------------------------------------------- Get metadata
-# significantly altered from subroutine present in lonnet
-sub metadata {
- my ($uri,$what)=@_;
- my %metacache;
- $uri=&declutter($uri);
- my $filename=$uri;
- $uri=~s/\.meta$//;
- $uri='';
- unless ($metacache{$uri.'keys'}) {
- unless ($filename=~/\.meta$/) { $filename.='.meta'; }
- my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
- my $parser=HTML::TokeParser->new(\$metastring);
- my $token;
- while ($token=$parser->get_token) {
- if ($token->[0] eq 'S') {
- my $entry=$token->[1];
- my $unikey=$entry;
- if (defined($token->[2]->{'part'})) {
- $unikey.='_'.$token->[2]->{'part'};
+=cut
+
+sub get_metadata_from_file ($)
+ {
+ my ($filename) = @_;
+ my %metatable; # Used to store return value of hash-tabled metadata.
+ $filename = &declutter($filename); # Remove non-identifying filesystem info
+ my $uri = ''; # The URI is not relevant in this scenario.
+ unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
+ {
+ $filename .= '.meta'; # Append a .meta suffix.
+ }
+ # Get the file contents.
+ my $metadata_string =
+ &get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);
+
+ # Parse the file based on its XML tags.
+ my $parser = HTML::TokeParser->new(\$metadata_string);
+ my $token;
+ while ($token = $parser->get_token) # Loop through tokens.
+ {
+ if ($token->[0] eq 'S') # If it is a start token.
+ {
+ my $entry = $token->[1];
+ my $unikey = $entry; # A unique identifier for this xml tag key.
+ if (defined($token->[2]->{'part'}))
+ {
+ $unikey .= '_'.$token->[2]->{'part'};
}
- if (defined($token->[2]->{'name'})) {
- $unikey.='_'.$token->[2]->{'name'};
+ if (defined($token->[2]->{'name'}))
+ {
+ $unikey .= '_'.$token->[2]->{'name'};
}
- if ($metacache{$uri.'keys'}) {
- $metacache{$uri.'keys'}.=','.$unikey;
- } else {
- $metacache{$uri.'keys'}=$unikey;
+ # Append $unikey to metatable's keys entry.
+ if ($metatable{$uri.'keys'})
+ {
+ $metatable{$uri.'keys'} .= ','.$unikey;
}
- map {
- $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};
- } @{$token->[3]};
- unless (
- $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)
- ) { $metacache{$uri.''.$unikey}=
- $metacache{$uri.''.$unikey.'.default'};
- }
- }
- }
- }
- return \%metacache;
-}
-
-# ------------------------------------------------------------ Serves up a file
-# returns either the contents of the file or a -1
-sub getfile {
- my $file=shift;
- if (! -e $file ) { return -1; };
- my $fh=IO::File->new($file);
- my $a='';
- while (<$fh>) { $a .=$_; }
- return $a
-}
-
-# ------------------------------------------------------------- Declutters URLs
-sub declutter {
- my $thisfn=shift;
- $thisfn=~s/^$perlvar{'lonDocRoot'}//;
- $thisfn=~s/^\///;
- $thisfn=~s/^res\///;
- return $thisfn;
-}
-
-# --------------------------------------- Is this the home server of an author?
-# (copied from lond, modification of the return value)
-sub ishome {
- my $author=shift;
- $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
- my ($udom,$uname)=split(/\//,$author);
- my $proname=propath($udom,$uname);
- if (-e $proname) {
- return 1;
- } else {
- return 0;
- }
-}
-
-# -------------------------------------------- Return path to profile directory
-# (copied from lond)
-sub propath {
- my ($udom,$uname)=@_;
- $udom=~s/\W//g;
- $uname=~s/\W//g;
- my $subdir=$uname.'__';
- $subdir =~ s/(.)(.)(.).*/$1\/$2\/$3/;
- my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname";
- return $proname;
-}
+ else
+ {
+ $metatable{$uri.'keys'} = $unikey;
+ }
+ # Insert contents into metatable entry for the unikey.
+ foreach my $t3 (@{$token->[3]})
+ {
+ $metatable{$uri.''.$unikey.'.'.$t3} = $token->[2]->{$t3};
+ }
+ # If there was no text contained inside the tags, set = default.
+ unless
+ (
+ $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry)
+ )
+ {
+ $metatable{$uri.''.$unikey} =
+ $metatable{$uri.''.$unikey.'.default'};
+ }
+ }
+ }
+ # Return with a key-value table of XML tags and their tag contents.
+ return(\%metatable);
+ }
-# ---------------------------- convert 'time' format into a datetime sql format
-sub sqltime {
+=pod
+
+B<get_file_contents> - returns either the contents of the file or a -1.
+
+=over 4
+
+Parameters:
+
+=item I<$file> - a complete filesystem path to the file.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - file contents or a -1.
+
+=back
+
+=cut
+
+sub get_file_contents ($)
+ {
+ my $file = shift(@_);
+
+ # If file does not exist, then return a -1 value.
+ unless (-e $file)
+ {
+ return(-1);
+ }
+
+ # Read in file contents.
+ my $file_handle = IO::File->new($file);
+ my $file_contents = '';
+ while (<$file_handle>)
+ {
+ $file_contents .= $_;
+ }
+
+ # Return file contents.
+ return($file_contents);
+ }
+
+=pod
+
+B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path).
+
+=over 4
+
+Parameters:
+
+=item I<$filesystem_path> - a complete filesystem path.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - remnants of the filesystem path (beginning portion removed).
+
+=back
+
+=cut
+
+sub declutter
+ {
+ my $filesystem_path = shift(@_);
+
+ # Remove beginning portions of the filesystem path.
+ $filesystem_path =~ s/^$perlvar{'lonDocRoot'}//;
+ $filesystem_path =~ s!^/!!;
+ $filesystem_path =~ s!^res/!!;
+
+ # Return what is remaining for the filesystem path.
+ return($filesystem_path);
+ }
+
+=pod
+
+B<query_home_server_status> - Is this the home server of an author's directory?
+
+=over 4
+
+Parameters:
+
+=item I<$author_filesystem_path> - directory path for a user.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<boolean> - 1 if true; 0 if false.
+
+=back
+
+=cut
+
+sub query_home_server_status ($)
+ {
+ my $author_filesystem_path = shift(@_);
+
+ # Remove beginning portion of this filesystem path.
+ $author_filesystem_path =~ s!/home/httpd/html/res/([^/]*)/([^/]*).*!$1/$2!;
+
+ # Construct path to the author's ordinary user directory.
+ my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
+ my $user_directory_path = construct_path_to_user_directory($user_domain,
+ $username);
+
+ # Return status of whether the user directory path is defined.
+ if (-e $user_directory_path)
+ {
+ return(1); # True.
+ }
+ else
+ {
+ return(0); # False.
+ }
+ }
+
+=pod
+
+B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir.
+
+=over 4
+
+Parameters:
+
+=item I<$user_domain> - the loncapa domain of the user.
+
+=item I<$username> - the unique username (user id) of the user.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - representing the path on the filesystem.
+
+=back
+
+=cut
+
+sub construct_path_to_user_directory ($$)
+ {
+ my ($user_domain,$username) = @_;
+
+ # Untaint.
+ $user_domain =~ s/\W//g;
+ $username =~ s/\W//g;
+
+ # Create three levels of sub-directoried filesystem path
+ # based on the first three characters of the username.
+ my $sub_filesystem_path = $username.'__';
+ $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!;
+
+ # Use the sub-directoried levels and other variables to generate
+ # the complete filesystem path.
+ my $complete_filesystem_path =
+ join('/',($perlvar{'lonUsersDir'},
+ $user_domain,
+ $sub_filesystem_path,
+ $username));
+
+ # Return the complete filesystem path.
+ return($complete_filesystem_path);
+ }
+
+=pod
+
+B<sql_formatted_time> ($) - turns seconds since epoch into datetime sql format.
+
+=over 4
+
+Parameters:
+
+=item I<$epochtime> - time in seconds since epoch (may need to be sanitized).
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<string> - datetime sql formatted string.
+
+=back
+
+=cut
+
+sub sql_formatted_time ($)
+ {
+ # Sanitize the time argument and convert to localtime array.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
- localtime(&unsqltime(@_[0]));
- $mon++; $year+=1900;
- return "$year-$mon-$mday $hour:$min:$sec";
-}
-
-sub maketime {
- my %th=@_;
- return POSIX::mktime(
- ($th{'seconds'},$th{'minutes'},$th{'hours'},
- $th{'day'},$th{'month'}-1,$th{'year'}-1900,0,0,$th{'dlsav'}));
-}
-
-
-#########################################
-#
-# Retro-fixing of un-backward-compatible time format
-
-sub unsqltime {
- my $timestamp=shift;
- if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {
- $timestamp=&maketime(
- 'year'=>$1,'month'=>$2,'day'=>$3,
- 'hours'=>$4,'minutes'=>$5,'seconds'=>$6);
- }
- return $timestamp;
-}
+ localtime(&sanitize_time(shift(@_)));
+
+ # Convert month from (0..11) to (1..12).
+ $mon += 1;
+
+ # Make the year compatible with A.D. specification.
+ $year += 1900;
+
+ # Return a date which is compatible with MySQL's "DATETIME" format.
+ return(join('-',($year,$mon,$mday)).
+ ' '.
+ join(':',($hour,$min,$sec))
+ );
+ }
+
+
+# ==================================== The following two subroutines are needed
+# for accommodating incorrect time formats inside the metadata.
+
+=pod
+
+B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch.
+
+=over 4
+
+Parameters:
+
+=item I<%time_metadata> - a key-value listing characterizing month, year, etc.
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<integer> - seconds since epoch.
+
+=back
+
+=cut
+
+sub make_seconds_since_epoch (@)
+ {
+ # Keytable of time metadata.
+ my %time_metadata = @_;
+
+ # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC).
+ return(POSIX::mktime(
+ ($time_metadata{'seconds'},
+ $time_metadata{'minutes'},
+ $time_metadata{'hours'},
+ $time_metadata{'day'},
+ $time_metadata{'month'}-1,
+ $time_metadata{'year'}-1900,
+ 0,
+ 0,
+ $time_metadata{'dlsav'})
+ )
+ );
+ }
+
+=pod
+
+B<sanitize_time> - if time looks sql-formatted, make it seconds since epoch.
+
+Somebody described this subroutine as
+"retro-fixing of un-backward-compatible time format".
+
+What this means, is that a part of this code expects to get UTC seconds
+since the epoch (beginning of 1970). Yet, some of the .meta files have
+sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch
+integers (e.g. 1044147435). These time strings do not encode the timezone
+and, in this sense, can be considered "un-backwards-compatible".
+
+=over 4
+
+Parameters:
+
+=item I<$potentially_badformat_string> - string to "retro-fix".
+
+=back
+
+=over 4
+
+Returns:
+
+=item C<integer> - seconds since epoch.
+
+=back
+
+=cut
+
+sub sanitize_time ($)
+ {
+ my $timestamp = shift(@_);
+ # If timestamp is in this unexpected format....
+ if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/)
+ {
+ # then convert into seconds since epoch (the expected format).
+ $timestamp = &make_seconds_since_epoch(
+ 'year' => $1,
+ 'month' => $2,
+ 'day' => $3,
+ 'hours' => $4,
+ 'minutes' => $5,
+ 'seconds' => $6
+ );
+ }
+ # Otherwise we assume timestamp to be as expected.
+ return($timestamp);
+ }
+
+=pod
+
+=head1 AUTHOR
+
+Written to help the loncapa project.
+
+Scott Harrison, sharrison@users.sourceforge.net
+
+This is distributed under the same terms as loncapa (i.e. "freeware").
+=cut
--harris411044250777--