[LON-CAPA-cvs] cvs: nsdl /nsdlloncapaorg harvester.pl

www lon-capa-cvs@mail.lon-capa.org
Mon, 28 Jul 2003 20:14:17 -0000


www		Mon Jul 28 16:14:17 2003 EDT

  Modified files:              
    /nsdl/nsdlloncapaorg	harvester.pl 
  Log:
  Harvesting script
  
  
Index: nsdl/nsdlloncapaorg/harvester.pl
diff -u nsdl/nsdlloncapaorg/harvester.pl:1.1 nsdl/nsdlloncapaorg/harvester.pl:1.2
--- nsdl/nsdlloncapaorg/harvester.pl:1.1	Mon Jul 28 10:27:05 2003
+++ nsdl/nsdlloncapaorg/harvester.pl	Mon Jul 28 16:14:17 2003
@@ -11,32 +11,7 @@
 use strict;
 use LWP::UserAgent;
 use Getopt::Std;
-
-use DBI;
-use DBD::ODBC;
-
-require OAIcataloging_v2;
-
-# -u flag specifies [u]pdate database; otherwise output to STDOUT
-
-my $usage = << "EOT";
-Usage: lon-capa.pl -u
-
-    -u (U)pdate the database
-
-    Without -u it simply prints SQL UPDATE statements to STDOUT
-EOT
-
-my %args;
-getopts('u', \%args) || die $usage;
-
-my $useDatabase = 1 if ($args{'u'});
-
-#my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1_dev.odbc';
-my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1.odbc';
-my $DBI_USER='autocataloger';
-my $DBI_PWD='regolatacotua';
-my $dbh;
+use Digest::MD5 qw(md5_hex);
 
 my $pub_month;
 my $pub_year;
@@ -50,43 +25,44 @@
 # Configuration
 
 my $debug = 0;
-my $url = 'http://data.lite.msu.edu/cgi-bin/metadata_harvest.pl';
+my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
 
 # End Configuration
 
-#my $ua = new LWP::UserAgent;
-#$ua->timeout(600);
+my $ua = new LWP::UserAgent;
+$ua->timeout(600);
 
-#my $request = new HTTP::Request GET => $url;
-#$request->authorization_basic('reaper', 'cat4u');
+my $request = new HTTP::Request GET => $url;
+$request->authorization_basic('reaper', 'cat4u');
 
-#my $response = $ua->request( $request );
+my $response = $ua->request( $request );
 
-#if ( $response->is_success ) {
-#	$content = $response->content;
+if ( $response->is_success ) {
+	$content = $response->content;
 # Delete all blank lines
-#	$content =~ s/(?<!.)\n//g;
+	$content =~ s/(?<!.)\n//g;
 # Replace all ^M with spaces
-#	$content =~ s/\r/\s/g;
+	$content =~ s/\r/\s/g;
 # Push the content into an array
-#	@loncapa = split /\n/, $content;
-#} else {
-#	die 'LON-CAPA request failed: ' . $response->message;
-#}
+	@loncapa = split /\n/, $content;
+} else {
+	die 'LON-CAPA request failed: ' . $response->message;
+}
 
-@loncapa=undef;
-open (LON_FILE, 'metadata_harvest.txt') || die;
+#@loncapa=undef;
+#open (LON_FILE, 'metadata_harvest.txt') || die;
 
-while (<LON_FILE>) {
-       chomp;
-       push(@loncapa,$_);
-}
+#while (<LON_FILE>) {
+#       chomp;
+#       push(@loncapa,$_);
+#}
 
 my %records = ();;
 foreach my $metadata (@loncapa) {
 	chomp $metadata;
+	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
 	my @tkline = split('\|', $metadata);
 	my $title = $tkline[0];
 	next if ( $title eq '' );
@@ -104,7 +80,11 @@
 	}
 	my $subject = $tkline[2];
 	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
-	my $resourceurl = 'http://lon-capa.smete.org' . $tkline[3];
+	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
+        my $baseid=$tkline[3];
+	$baseid=~s/\W/\_/g;
+	$baseid=~s/^\_res\_//g;
+
 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
 	my $keywords = $tkline[4];
 	my $version = $tkline[5];
@@ -167,65 +147,18 @@
 	# Private means open only to author of material
 	next if ( $copyright eq 'private');
 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
+	print (<<ENDMETA);
+<rdf about="lon-capa.nsdl.collections/$baseid">
+    <dc:title>$title</dc:title>
+    <dc:creator>$author_fname $author_lname</dc:creator>
+    <dc:subject>$keywords</dc:subject>
+    <dc:subject>$subject</dc:subject>
+    <dc:identifier scheme="URI">$resourceurl</dc:identifier>
+    <dc:language>$primary_language</dc:language>
+    <dc:description>$abstract</dc:description>
+    <dc:date>$revision_date</dc:date>
+</rdf>
 
-# Connect to database
-if ( $useDatabase ) {
-	$dbh= DBI->connect($DBI_DSN, $DBI_USER, $DBI_PWD, { RaiseError => 1, AutoCommit => 0 }) || die "Unable to connect to database $DBI_DSN as $DBI_USER: ($DBI::err) $DBI::errstr\n";;
-	# Configuration information for LON-CAPA
-	my $collection_id = OAIc_orgexists($dbh,'LearningOnline Network with CAPA');
-	my $submitter_id = OAIc_personexists($dbh,'adong@smete.org');
-	my $image = 'http://www.lite.msu.edu/liteani.gif';
-	my $cost = 1; # version.purchase_license_type_id
-	my $collection = 'LearningOnline Network with CAPA';
-	# LON-CAPA has single authors
-	my $reg_key;
-	if ( $object_type eq 'organization' ) {
-		if ( ! ($reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname))) ) {
-		printf("Inserting new organization %s\n", join(' ',$author_fname, $author_lname));
-		my $success = OAIc_insert_org($dbh,$collection_id,$submitter_id,'',join(' ',$author_fname,$author_lname),'','','','','','','','');
-		$reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname));
-		}
-	} else {
-		if ( ! ($reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname))) ) {
-		printf("Inserting new person(author) %s\n", join(' ',$author_fname, $author_lname));
-		my $success = OAIc_insert_person($dbh,$collection_id,$submitter_id,$author_lname,$author_fname,'','');
-		$reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname));
-		}
-	}
-	my $updated;
-	my $inserted;
-	if ( my $general_key = OAIc_loexists($dbh,$title) ) {
-		# Do nothing
-		$updated = $updated + 1;
-	} else {
-		printf("Inserting new record for %s\n",$title);	
-		my $success = OAIc_insert_lo($dbh, $title, $primary_language, $abstract, $image, $pub_month, $pub_year, $keywords, $submitter_id, $reg_key, $collection_id, $collection_id, $media_format, $platform, , '', $resourceurl, '', 1, $reg_key, $collection_id, $collection_id, '', '', '', $learning_resource_type, $rights_description, $cost);
-		$inserted = $inserted + 1;
-	}
+ENDMETA
 }
 
-if (! $useDatabase ) { # Print information if no database updates requested
-	printf("Title: %s\n", $title);
-	printf("Author First Name: %s\n", $author_fname);
-	printf("Author Last Name: %s\n", $author_lname);
-	printf("Subject: %s\n", $subject);
-	printf("URL: %s\n", $resourceurl);
-	printf("Keywords: %s\n", $keywords);
-	printf("Version: %s\n", $version);
-	printf("Notes: %s\n", $notes);
-	printf("Abstract: %s\n", $abstract);
-	printf("Learning Resource Type: %d\n", $learning_resource_type);
-	printf("Media Format: %d\n", $media_format);
-	printf("Primary Language: %s\n", $primary_language);
-	printf("Creation Date: %s\n", $creation_date);
-	printf("Revision Date: %s\n", $revision_date);
-	printf("Copyright: %s\n", $copyright);
-	printf("Publication Year: %4d\tPublication Month: %02d\n", $pub_year, $pub_month);
-}
-
-if ( $useDatabase ) {
-	$dbh->commit;
-	$dbh->disconnect;
-}
-
-}