[LON-CAPA-cvs] cvs: nsdl /nsdlloncapaorg harvester.pl

www lon-capa-cvs@mail.lon-capa.org
Fri, 25 Nov 2005 19:29:58 -0000


www		Fri Nov 25 14:29:58 2005 EDT

  Modified files:              
    /nsdl/nsdlloncapaorg	harvester.pl 
  Log:
  Updated harvester script
  - new hosts
  - delete obsolete hosts
  - keep stats
  - detect all English docs (senisoUS, etc)
  
  
Index: nsdl/nsdlloncapaorg/harvester.pl
diff -u nsdl/nsdlloncapaorg/harvester.pl:1.7 nsdl/nsdlloncapaorg/harvester.pl:1.8
--- nsdl/nsdlloncapaorg/harvester.pl:1.7	Tue Oct 21 11:58:26 2003
+++ nsdl/nsdlloncapaorg/harvester.pl	Fri Nov 25 14:29:56 2005
@@ -29,25 +29,40 @@
 
 my $debug = 0;
 
+# Stats
+my %allstats=();
+my %filterstats=();
+my %knockout=();
+my %knockoutlang=();
+
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = (
-'newscience.westshore.cc.mi.us',
+'newscience.westshore.edu',
 's10.lite.msu.edu',
 's12.lite.msu.edu',
-'lon-capa.chem.sunysb.edu',
 'schubert.tmcc.edu',
 'dalton.chem.sfu.ca',
 'capa2.phy.ohiou.edu',
 'pollux.physics.fsu.edu',
-'loncapa.physics.sc.edu',
-'loncapa.math.ucf.edu',
+'loncapa3.physics.sc.edu',
 'zappa.ags.udel.edu',
 'loncapa.gwu.edu',
 'neptune.physics.ndsu.nodak.edu',
 'capa1.uwsp.edu',
-'natasha.it.fit.edu',
 'loncapa.Mines.EDU',
-'loncapa.chm.nau.edu');
+'loncapa.chm.nau.edu',
+'library1.lon-capa.uiuc.edu',
+'lon-capa.bsu.edu',
+'psblnx03.bd.psu.edu',
+'lon-capa.acadiau.ca',
+'harvard.lon-capa.org',
+'capa1.cc.huji.ac.il',
+'lon-capa.phy.cmich.edu',
+'meitner.physics.hope.edu',
+'loncapa.vcu.edu',
+'lon-capa.ucsc.edu',
+'lon-capa.bsu.edu'
+);
 
 foreach (@servers) {
     my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
@@ -85,14 +100,20 @@
 
 my %records = ();;
 
+my %stats=();
+
 foreach my $metadata (@loncapa) {
 	chomp $metadata;
 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
 	my @tkline = split('\|', $metadata);
-	my $title = $tkline[0];
-	next if ( $title eq '' );
+        my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
+        $rawtype=~tr/A-Z/a-z/;
+        $allstats{$rawtype}++;
+        
+        my $title = $tkline[0];
+	if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
 	my $author = $tkline[1];
-	next if ( $author eq '' );
+	if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
 	my @authorname = split(' ', $author);
 	my $author_fname = $authorname[0];
 	my $author_lname = $authorname[1];
@@ -113,6 +134,9 @@
 	my $fileid=md5_hex($baseid);
 
 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
+# too many fragments out there
+        next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
+
 	my $keywords = $tkline[4];
 	my $version = $tkline[5];
 	my $notes = $tkline[6];
@@ -120,7 +144,9 @@
 	unless ($abstract) { $abstract=$subject; }
 	unless ($abstract) { $abstract=$title; }
 	unless ($abstract) { $abstract=$keywords; }
-	my $type = $tkline[8];
+	my $type = $rawtype;
+        if ($type=~/htm/) { $type='htm'; }
+
 	my $learning_resource_type;
 	if ( $type eq 'problem' ) {
 		$learning_resource_type = 114;
@@ -153,8 +179,11 @@
 		$media_format = 0;
 	}
 
-	my $language = $tkline[9]; # Look only for seniso
-	next if ( $language ne 'seniso');
+	my $language = $tkline[9];
+# likelihood is that the following is true (people would bother if it is not)
+        if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
+# NSDL only does English
+        if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
 	my $primary_language='en-US';
 	my $creation_date = $tkline[10];
 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
@@ -175,10 +204,16 @@
 	# Domain means restricted to a particular LON-CAPA domain
 	# Defaults mean access open to any registered LON-CAPA user
 	# Private means open only to author of material
-	next if ( $copyright eq 'private');
-	next if ( $copyright eq 'domain');
+	if ( $copyright eq 'private') { $knockout{'private_'.$rawtype}++; next; } 
+	if ( $copyright eq 'domain') { $knockout{'domain_'.$rawtype}++; next; }
+        if ( $copyright eq 'custom') { $knockout{'custom_'.$rawtype}++; next; }
 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
 #
+# We actually do this
+#
+        $stats{$type}++;
+        $filterstats{$type}++;
+#
 # Create path
 #
 	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
@@ -207,4 +242,17 @@
 ENDMETA
       close (XML);
 }
+foreach my $thistype (sort keys %stats) {
+   print "\n$thistype: $stats{$thistype}";
+}
+print "\n----\n";
+}
+print "\nDone.\n";
+foreach my $thistype (sort keys %allstats) {
+   print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
+}
+print "\n----\n";
+foreach my $thislang (sort keys %knockoutlang) {
+print "\n>$thislang<: $knockoutlang{$thislang}";
 }
+print "\n";