File:  [LON-CAPA] / nsdl / nsdlloncapaorg / harvester.pl
Revision 1.2: download - view: text, annotated - select for diffs
Mon Jul 28 20:14:17 2003 UTC (20 years, 9 months ago) by www
Branches: MAIN
CVS tags: HEAD
Harvesting script

    1: #!/usr/local/bin/perl
    2: 
    3: #
    4: # lon-capa.pl
    5: # Parse the LON-CAPA metadata
    6: #
    7: # Andy Dong <adong@smete.org> 10/23/2002
    8: #
    9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
   10: 
   11: use strict;
   12: use LWP::UserAgent;
   13: use Getopt::Std;
   14: use Digest::MD5 qw(md5_hex);
   15: 
   16: my $pub_month;
   17: my $pub_year;
   18: my @loncapa;
   19: 
   20: # HTTP requests
   21: 
   22: my $content;
   23: my $content_regex = 'File Not Found';
   24: 
   25: # Configuration
   26: 
   27: my $debug = 0;
   28: my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
   29: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
   30: my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
   31: 
   32: # End Configuration
   33: 
   34: my $ua = new LWP::UserAgent;
   35: $ua->timeout(600);
   36: 
   37: my $request = new HTTP::Request GET => $url;
   38: $request->authorization_basic('reaper', 'cat4u');
   39: 
   40: my $response = $ua->request( $request );
   41: 
   42: if ( $response->is_success ) {
   43: 	$content = $response->content;
   44: # Delete all blank lines
   45: 	$content =~ s/(?<!.)\n//g;
   46: # Replace all ^M with spaces
   47: 	$content =~ s/
/\s/g;
   48: # Push the content into an array
   49: 	@loncapa = split /\n/, $content;
   50: } else {
   51: 	die 'LON-CAPA request failed: ' . $response->message;
   52: }
   53: 
   54: #@loncapa=undef;
   55: #open (LON_FILE, 'metadata_harvest.txt') || die;
   56: 
   57: #while (<LON_FILE>) {
   58: #       chomp;
   59: #       push(@loncapa,$_);
   60: #}
   61: 
   62: my %records = ();;
   63: foreach my $metadata (@loncapa) {
   64: 	chomp $metadata;
   65: 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
   66: 	my @tkline = split('\|', $metadata);
   67: 	my $title = $tkline[0];
   68: 	next if ( $title eq '' );
   69: 	my $author = $tkline[1];
   70: 	next if ( $author eq '' );
   71: 	my @authorname = split(' ', $author);
   72: 	my $author_fname = $authorname[0];
   73: 	my $author_lname = $authorname[1];
   74: 	# We have to make an exception for Multimedia Physics which is an organization not a person
   75: 	my $object_type;
   76: 	if ( $author_lname eq 'Physics' ) {
   77: 		$object_type = 'organization';
   78: 	} else {
   79: 		$object_type = 'person';
   80: 	}
   81: 	my $subject = $tkline[2];
   82: 	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
   83: 	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
   84:         my $baseid=$tkline[3];
   85: 	$baseid=~s/\W/\_/g;
   86: 	$baseid=~s/^\_res\_//g;
   87: 
   88: 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
   89: 	my $keywords = $tkline[4];
   90: 	my $version = $tkline[5];
   91: 	my $notes = $tkline[6];
   92: 	my $abstract = $tkline[7];
   93: 	next if ($abstract eq '');
   94: 	my $type = $tkline[8];
   95: 	my $learning_resource_type;
   96: 	if ( $type eq 'problem' ) {
   97: 		$learning_resource_type = 114;
   98: 	} elsif ( $type eq 'exam' ) {
   99: 		$learning_resource_type = 114;
  100: 	} elsif ( $type eq 'quiz' ) {
  101: 		$learning_resource_type = 114;
  102: 	} elsif ( $type eq 'assess' ) {
  103: 		$learning_resource_type = 114;
  104: 	} elsif ( $type eq 'survey' ) {
  105: 		$learning_resource_type = 114;
  106: 	} elsif ( $type eq 'form' ) {
  107: 		$learning_resource_type = 114;
  108: 	} elsif ( $type eq 'library' ) {
  109: 		$learning_resource_type = 107;
  110: 	} elsif ( $type eq 'page' ) {
  111: 		$learning_resource_type = 104;
  112: 	} elsif ( $type eq 'sequence' ) {
  113: 		$learning_resource_type = 104;
  114: 	} elsif ( $type eq 'spreadsheet' ) {
  115: 		$learning_resource_type = 114;
  116: 	} else {
  117: 		$learning_resource_type = 0;
  118: 	}
  119: 	
  120: 	my $media_format;
  121: 	if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
  122: 		$media_format = 70;
  123: 	} else {
  124: 		$media_format = 0;
  125: 	}
  126: 
  127: 	my $language = $tkline[9]; # Look only for seniso
  128: 	next if ( $language ne 'seniso');
  129: 	my $primary_language='en-US';
  130: 	my $creation_date = $tkline[10];
  131: 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4})-(\d{2})-(\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
  132: 	my $revision_date = $tkline[11];
  133: 	my $owner = $tkline[12];
  134: 	my $rights_description;
  135: 	my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
  136: 	# Public means no login required
  137: 
  138: 	if ( $copyright eq 'public' ) {
  139: 		$rights_description = 'LON-CAPA Public Resource. No login required.';
  140: 	} elsif ($copyright eq 'domain') {
  141: 		$rights_description = 'Restricted to certain LON-CAPA domains.';
  142: 	} else {
  143: 		$rights_description = 'LON-CAPA Default Use Restriction. Login required.';
  144: 	}
  145: 	# Domain means restricted to a particular LON-CAPA domain
  146: 	# Defaults mean access open to any registered LON-CAPA user
  147: 	# Private means open only to author of material
  148: 	next if ( $copyright eq 'private');
  149: 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
  150: 	print (<<ENDMETA);
  151: <rdf about="lon-capa.nsdl.collections/$baseid">
  152:     <dc:title>$title</dc:title>
  153:     <dc:creator>$author_fname $author_lname</dc:creator>
  154:     <dc:subject>$keywords</dc:subject>
  155:     <dc:subject>$subject</dc:subject>
  156:     <dc:identifier scheme="URI">$resourceurl</dc:identifier>
  157:     <dc:language>$primary_language</dc:language>
  158:     <dc:description>$abstract<dc:description>
  159:     <dc:date>$revision_date</dc:date>
  160: </rdf>
  161: 
  162: ENDMETA
  163: }
  164: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>