Annotation of nsdl/nsdlloncapaorg/harvester.pl, revision 1.4

1.1       www         1: #!/usr/local/bin/perl
                      2: 
                      3: #
                      4: # lon-capa.pl
                      5: # Parse the LON-CAPA metadata
                      6: #
                      7: # Andy Dong <adong@smete.org> 10/23/2002
                      8: #
                      9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
                     10: 
                     11: use strict;
                     12: use LWP::UserAgent;
                     13: use Getopt::Std;
1.2       www        14: use Digest::MD5 qw(md5_hex);
1.4     ! www        15: use IO::File;
        !            16: 
        !            17: my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
1.1       www        18: 
                     19: my $pub_month;
                     20: my $pub_year;
                     21: my @loncapa;
                     22: 
                     23: # HTTP requests
                     24: 
                     25: my $content;
                     26: my $content_regex = 'File Not Found';
                     27: 
                     28: # Configuration
                     29: 
                     30: my $debug = 0;
1.2       www        31: my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
1.1       www        32: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
                     33: my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
                     34: 
                     35: # End Configuration
                     36: 
1.2       www        37: my $ua = new LWP::UserAgent;
                     38: $ua->timeout(600);
1.1       www        39: 
1.2       www        40: my $request = new HTTP::Request GET => $url;
                     41: $request->authorization_basic('reaper', 'cat4u');
1.1       www        42: 
1.2       www        43: my $response = $ua->request( $request );
1.1       www        44: 
1.2       www        45: if ( $response->is_success ) {
                     46: 	$content = $response->content;
1.1       www        47: # Delete all blank lines
1.2       www        48: 	$content =~ s/(?<!.)\n//g;
1.1       www        49: # Replace all ^M with spaces
1.2       www        50: 	$content =~ s/
/\s/g;
1.1       www        51: # Push the content into an array
1.2       www        52: 	@loncapa = split /\n/, $content;
                     53: } else {
                     54: 	die 'LON-CAPA request failed: ' . $response->message;
                     55: }
1.1       www        56: 
1.2       www        57: #@loncapa=undef;
                     58: #open (LON_FILE, 'metadata_harvest.txt') || die;
1.1       www        59: 
1.2       www        60: #while (<LON_FILE>) {
                     61: #       chomp;
                     62: #       push(@loncapa,$_);
                     63: #}
1.1       www        64: 
                     65: my %records = ();;
1.3       www        66: 
1.1       www        67: foreach my $metadata (@loncapa) {
                     68: 	chomp $metadata;
1.2       www        69: 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
1.1       www        70: 	my @tkline = split('\|', $metadata);
                     71: 	my $title = $tkline[0];
                     72: 	next if ( $title eq '' );
                     73: 	my $author = $tkline[1];
                     74: 	next if ( $author eq '' );
                     75: 	my @authorname = split(' ', $author);
                     76: 	my $author_fname = $authorname[0];
                     77: 	my $author_lname = $authorname[1];
                     78: 	# We have to make an exception for Multimedia Physics which is an organization not a person
                     79: 	my $object_type;
                     80: 	if ( $author_lname eq 'Physics' ) {
                     81: 		$object_type = 'organization';
                     82: 	} else {
                     83: 		$object_type = 'person';
                     84: 	}
                     85: 	my $subject = $tkline[2];
                     86: 	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
1.2       www        87: 	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
                     88:         my $baseid=$tkline[3];
1.4     ! www        89: 	my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
1.2       www        90: 	$baseid=~s/\W/\_/g;
                     91: 	$baseid=~s/^\_res\_//g;
1.4     ! www        92: 	my $fileid=md5_hex($baseid);
1.2       www        93: 
1.1       www        94: 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
                     95: 	my $keywords = $tkline[4];
                     96: 	my $version = $tkline[5];
                     97: 	my $notes = $tkline[6];
                     98: 	my $abstract = $tkline[7];
                     99: 	next if ($abstract eq '');
                    100: 	my $type = $tkline[8];
                    101: 	my $learning_resource_type;
                    102: 	if ( $type eq 'problem' ) {
                    103: 		$learning_resource_type = 114;
                    104: 	} elsif ( $type eq 'exam' ) {
                    105: 		$learning_resource_type = 114;
                    106: 	} elsif ( $type eq 'quiz' ) {
                    107: 		$learning_resource_type = 114;
                    108: 	} elsif ( $type eq 'assess' ) {
                    109: 		$learning_resource_type = 114;
                    110: 	} elsif ( $type eq 'survey' ) {
                    111: 		$learning_resource_type = 114;
                    112: 	} elsif ( $type eq 'form' ) {
                    113: 		$learning_resource_type = 114;
                    114: 	} elsif ( $type eq 'library' ) {
                    115: 		$learning_resource_type = 107;
                    116: 	} elsif ( $type eq 'page' ) {
                    117: 		$learning_resource_type = 104;
                    118: 	} elsif ( $type eq 'sequence' ) {
                    119: 		$learning_resource_type = 104;
                    120: 	} elsif ( $type eq 'spreadsheet' ) {
                    121: 		$learning_resource_type = 114;
                    122: 	} else {
                    123: 		$learning_resource_type = 0;
                    124: 	}
                    125: 	
                    126: 	my $media_format;
                    127: 	if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
                    128: 		$media_format = 70;
                    129: 	} else {
                    130: 		$media_format = 0;
                    131: 	}
                    132: 
                    133: 	my $language = $tkline[9]; # Look only for seniso
                    134: 	next if ( $language ne 'seniso');
                    135: 	my $primary_language='en-US';
                    136: 	my $creation_date = $tkline[10];
1.3       www       137: 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1       www       138: 	my $revision_date = $tkline[11];
1.3       www       139: 	my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1       www       140: 	my $owner = $tkline[12];
                    141: 	my $rights_description;
                    142: 	my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
                    143: 	# Public means no login required
                    144: 
                    145: 	if ( $copyright eq 'public' ) {
                    146: 		$rights_description = 'LON-CAPA Public Resource. No login required.';
                    147: 	} elsif ($copyright eq 'domain') {
                    148: 		$rights_description = 'Restricted to certain LON-CAPA domains.';
                    149: 	} else {
                    150: 		$rights_description = 'LON-CAPA Default Use Restriction. Login required.';
                    151: 	}
                    152: 	# Domain means restricted to a particular LON-CAPA domain
                    153: 	# Defaults mean access open to any registered LON-CAPA user
                    154: 	# Private means open only to author of material
                    155: 	next if ( $copyright eq 'private');
                    156: 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
1.4     ! www       157: #
        !           158: # Create path
        !           159: #
        !           160: 	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
        !           161: 	unless (-e $basepath.'/'.$adom.'/'.$auname) { 
        !           162: 	    mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
        !           163: 	}
        !           164: 	open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
        !           165: 	print XML (<<ENDMETA);
        !           166: <?xml version="1.0" encoding="UTF-8"?>
        !           167: 
1.3       www       168: <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" 
                    169:           xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" 
                    170:           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
                    171:           xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ 
                    172:                               http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
                    173: >
                    174:     <title>$title</title>
                    175:     <creator>$author_fname $author_lname</creator>
                    176:     <identifier>$resourceurl</identifier>
                    177:     <subject>$keywords</subject>
                    178:     <subject>$subject</subject>
                    179:     <language>$primary_language</language>
                    180:     <description>$abstract</description>
                    181:     <date>$rev_year-$rev_month-$rev_day</date>
                    182: </oaidc:dc>
1.2       www       183: ENDMETA
1.4     ! www       184:       close (XML);
1.1       www       185: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>