[LON-CAPA-cvs] cvs: nsdl /nsdlloncapaorg harvester.pl
www
lon-capa-cvs@mail.lon-capa.org
Fri, 25 Nov 2005 19:29:58 -0000
www Fri Nov 25 14:29:58 2005 EDT
Modified files:
/nsdl/nsdlloncapaorg harvester.pl
Log:
Updated harvester script
- new hosts
- delete obsolete hosts
- keep stats
- detect all English docs (senisoUS, etc.)
Index: nsdl/nsdlloncapaorg/harvester.pl
diff -u nsdl/nsdlloncapaorg/harvester.pl:1.7 nsdl/nsdlloncapaorg/harvester.pl:1.8
--- nsdl/nsdlloncapaorg/harvester.pl:1.7 Tue Oct 21 11:58:26 2003
+++ nsdl/nsdlloncapaorg/harvester.pl Fri Nov 25 14:29:56 2005
@@ -29,25 +29,40 @@
my $debug = 0;
+# Stats
+my %allstats=();
+my %filterstats=();
+my %knockout=();
+my %knockoutlang=();
+
# The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
my @servers = (
-'newscience.westshore.cc.mi.us',
+'newscience.westshore.edu',
's10.lite.msu.edu',
's12.lite.msu.edu',
-'lon-capa.chem.sunysb.edu',
'schubert.tmcc.edu',
'dalton.chem.sfu.ca',
'capa2.phy.ohiou.edu',
'pollux.physics.fsu.edu',
-'loncapa.physics.sc.edu',
-'loncapa.math.ucf.edu',
+'loncapa3.physics.sc.edu',
'zappa.ags.udel.edu',
'loncapa.gwu.edu',
'neptune.physics.ndsu.nodak.edu',
'capa1.uwsp.edu',
-'natasha.it.fit.edu',
'loncapa.Mines.EDU',
-'loncapa.chm.nau.edu');
+'loncapa.chm.nau.edu',
+'library1.lon-capa.uiuc.edu',
+'lon-capa.bsu.edu',
+'psblnx03.bd.psu.edu',
+'lon-capa.acadiau.ca',
+'harvard.lon-capa.org',
+'capa1.cc.huji.ac.il',
+'lon-capa.phy.cmich.edu',
+'meitner.physics.hope.edu',
+'loncapa.vcu.edu',
+'lon-capa.ucsc.edu',
+'lon-capa.bsu.edu'
+);
foreach (@servers) {
my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
@@ -85,14 +100,20 @@
my %records = ();;
+my %stats=();
+
foreach my $metadata (@loncapa) {
chomp $metadata;
$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
my @tkline = split('\|', $metadata);
- my $title = $tkline[0];
- next if ( $title eq '' );
+ my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
+ $rawtype=~tr/A-Z/a-z/;
+ $allstats{$rawtype}++;
+
+ my $title = $tkline[0];
+ if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
my $author = $tkline[1];
- next if ( $author eq '' );
+ if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
my @authorname = split(' ', $author);
my $author_fname = $authorname[0];
my $author_lname = $authorname[1];
@@ -113,6 +134,9 @@
my $fileid=md5_hex($baseid);
next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
+# Too many fragment files out there: only keep resources whose URL ends in one of the extensions below
+ next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
+
my $keywords = $tkline[4];
my $version = $tkline[5];
my $notes = $tkline[6];
@@ -120,7 +144,9 @@
unless ($abstract) { $abstract=$subject; }
unless ($abstract) { $abstract=$title; }
unless ($abstract) { $abstract=$keywords; }
- my $type = $tkline[8];
+ my $type = $rawtype;
+ if ($type=~/htm/) { $type='htm'; }
+
my $learning_resource_type;
if ( $type eq 'problem' ) {
$learning_resource_type = 114;
@@ -153,8 +179,11 @@
$media_format = 0;
}
- my $language = $tkline[9]; # Look only for seniso
- next if ( $language ne 'seniso');
+ my $language = $tkline[9];
+# An unset or English-looking language value most likely means English (authors would bother to set it otherwise), so treat it as seniso
+ if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
+# NSDL only does English
+ if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; }
my $primary_language='en-US';
my $creation_date = $tkline[10];
my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
@@ -175,10 +204,16 @@
# Domain means restricted to a particular LON-CAPA domain
# Defaults mean access open to any registered LON-CAPA user
# Private means open only to author of material
- next if ( $copyright eq 'private');
- next if ( $copyright eq 'domain');
+ if ( $copyright eq 'private') { $knockout{'private_'.$rawtype}++; next; }
+ if ( $copyright eq 'domain') { $knockout{'domain_'.$rawtype}++; next; }
+ if ( $copyright eq 'custom') { $knockout{'custom_'.$rawtype}++; next; }
my $platform = "5"; # HTML Browser (not specified but construed from metadata)
#
+# This record passed the filters above and will actually be harvested, so count it
+#
+ $stats{$type}++;
+ $filterstats{$type}++;
+#
# Create path
#
unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
@@ -207,4 +242,17 @@
ENDMETA
close (XML);
}
+foreach my $thistype (sort keys %stats) {
+ print "\n$thistype: $stats{$thistype}";
+}
+print "\n----\n";
+}
+print "\nDone.\n";
+foreach my $thistype (sort keys %allstats) {
+ print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
+}
+print "\n----\n";
+foreach my $thislang (sort keys %knockoutlang) {
+print "\n>$thislang<: $knockoutlang{$thislang}";
}
+print "\n";