Annotation of nsdl/nsdlloncapaorg/harvester.pl, revision 1.9

1.1       www         1: #!/usr/local/bin/perl
                      2: 
                      3: #
                      4: # lon-capa.pl
                      5: # Parse the LON-CAPA metadata
                      6: #
                      7: # Andy Dong <adong@smete.org> 10/23/2002
                      8: #
                      9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
                     10: 
                     11: use strict;
                     12: use LWP::UserAgent;
                     13: use Getopt::Std;
1.2       www        14: use Digest::MD5 qw(md5_hex);
1.4       www        15: use IO::File;
                     16: 
                     17: my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
1.1       www        18: 
                     19: my $pub_month;
                     20: my $pub_year;
                     21: my @loncapa;
                     22: 
                     23: # HTTP requests
                     24: 
                     25: my $content;
                     26: my $content_regex = 'File Not Found';
                     27: 
                     28: # Configuration
                     29: 
                     30: my $debug = 0;
1.5       www        31: 
1.8       www        32: # Stats
                     33: my %allstats=();
                     34: my %filterstats=();
                     35: my %knockout=();
                     36: my %knockoutlang=();
                     37: 
1.1       www        38: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
1.5       www        39: my @servers = (
1.8       www        40: 'newscience.westshore.edu',
1.5       www        41: 's10.lite.msu.edu',
                     42: 's12.lite.msu.edu',
                     43: 'schubert.tmcc.edu',
                     44: 'dalton.chem.sfu.ca',
                     45: 'capa2.phy.ohiou.edu',
                     46: 'pollux.physics.fsu.edu',
1.8       www        47: 'loncapa3.physics.sc.edu',
1.5       www        48: 'zappa.ags.udel.edu',
                     49: 'loncapa.gwu.edu',
                     50: 'neptune.physics.ndsu.nodak.edu',
1.6       www        51: 'capa1.uwsp.edu',
                     52: 'loncapa.Mines.EDU',
1.8       www        53: 'loncapa.chm.nau.edu',
                     54: 'library1.lon-capa.uiuc.edu',
                     55: 'lon-capa.bsu.edu',
                     56: 'psblnx03.bd.psu.edu',
                     57: 'lon-capa.acadiau.ca',
                     58: 'harvard.lon-capa.org',
                     59: 'capa1.cc.huji.ac.il',
                     60: 'lon-capa.phy.cmich.edu',
                     61: 'meitner.physics.hope.edu',
                     62: 'loncapa.vcu.edu',
                     63: 'lon-capa.ucsc.edu',
1.9     ! www        64: 'lon-capa.bsu.edu',
        !            65: 'harvard.lon-capa.org'
1.8       www        66: );
1.1       www        67: 
1.5       www        68: foreach (@servers) {
                     69:     my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
1.1       www        70: # End Configuration
                     71: 
1.2       www        72: my $ua = new LWP::UserAgent;
                     73: $ua->timeout(600);
1.1       www        74: 
1.2       www        75: my $request = new HTTP::Request GET => $url;
                     76: $request->authorization_basic('reaper', 'cat4u');
1.1       www        77: 
1.2       www        78: my $response = $ua->request( $request );
1.1       www        79: 
1.2       www        80: if ( $response->is_success ) {
1.5       www        81:      print 'SUCCESS: ' . $response->message.' for '.$url."\n\n";
1.2       www        82: 	$content = $response->content;
1.1       www        83: # Delete all blank lines
1.2       www        84: 	$content =~ s/(?<!.)\n//g;
1.1       www        85: # Replace all ^M with spaces
1.2       www        86: 	$content =~ s/
/\s/g;
1.1       www        87: # Push the content into an array
1.2       www        88: 	@loncapa = split /\n/, $content;
                     89: } else {
1.5       www        90:      print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n";
                     91:      next;
1.2       www        92: }
1.1       www        93: 
1.2       www        94: #@loncapa=undef;
                     95: #open (LON_FILE, 'metadata_harvest.txt') || die;
1.1       www        96: 
1.2       www        97: #while (<LON_FILE>) {
                     98: #       chomp;
                     99: #       push(@loncapa,$_);
                    100: #}
1.1       www       101: 
                    102: my %records = ();;
1.3       www       103: 
1.8       www       104: my %stats=();
                    105: 
1.1       www       106: foreach my $metadata (@loncapa) {
                    107: 	chomp $metadata;
1.2       www       108: 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
1.1       www       109: 	my @tkline = split('\|', $metadata);
1.8       www       110:         my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
                    111:         $rawtype=~tr/A-Z/a-z/;
                    112:         $allstats{$rawtype}++;
                    113:         
                    114:         my $title = $tkline[0];
                    115: 	if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
1.1       www       116: 	my $author = $tkline[1];
1.8       www       117: 	if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
1.1       www       118: 	my @authorname = split(' ', $author);
                    119: 	my $author_fname = $authorname[0];
                    120: 	my $author_lname = $authorname[1];
                    121: 	# We have to make an exception for Multimedia Physics which is an organization not a person
                    122: 	my $object_type;
                    123: 	if ( $author_lname eq 'Physics' ) {
                    124: 		$object_type = 'organization';
                    125: 	} else {
                    126: 		$object_type = 'person';
                    127: 	}
                    128: 	my $subject = $tkline[2];
                    129: 	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
1.2       www       130: 	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
                    131:         my $baseid=$tkline[3];
1.4       www       132: 	my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
1.2       www       133: 	$baseid=~s/\W/\_/g;
                    134: 	$baseid=~s/^\_res\_//g;
1.4       www       135: 	my $fileid=md5_hex($baseid);
1.2       www       136: 
1.1       www       137: 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
1.8       www       138: # too many fragments out there
                    139:         next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
                    140: 
1.1       www       141: 	my $keywords = $tkline[4];
                    142: 	my $version = $tkline[5];
                    143: 	my $notes = $tkline[6];
                    144: 	my $abstract = $tkline[7];
1.9     ! www       145:         $abstract=~s/ s / /gs;
        !           146:         $abstract=~s/\s+/ /gs;
        !           147:         my $postsubject=$subject;
        !           148:         unless ($postsubject) {
        !           149:            $postsubject=$keywords;
        !           150:         } else {
        !           151:            $postsubject.=' ('.$keywords.')';
        !           152:         }
        !           153:         unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; }
        !           154:         unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; }
1.8       www       155: 	my $type = $rawtype;
                    156:         if ($type=~/htm/) { $type='htm'; }
                    157: 
1.1       www       158: 	my $learning_resource_type;
                    159: 	if ( $type eq 'problem' ) {
                    160: 		$learning_resource_type = 114;
                    161: 	} elsif ( $type eq 'exam' ) {
                    162: 		$learning_resource_type = 114;
                    163: 	} elsif ( $type eq 'quiz' ) {
                    164: 		$learning_resource_type = 114;
                    165: 	} elsif ( $type eq 'assess' ) {
                    166: 		$learning_resource_type = 114;
                    167: 	} elsif ( $type eq 'survey' ) {
                    168: 		$learning_resource_type = 114;
                    169: 	} elsif ( $type eq 'form' ) {
                    170: 		$learning_resource_type = 114;
                    171: 	} elsif ( $type eq 'library' ) {
                    172: 		$learning_resource_type = 107;
                    173: 	} elsif ( $type eq 'page' ) {
                    174: 		$learning_resource_type = 104;
                    175: 	} elsif ( $type eq 'sequence' ) {
                    176: 		$learning_resource_type = 104;
                    177: 	} elsif ( $type eq 'spreadsheet' ) {
                    178: 		$learning_resource_type = 114;
                    179: 	} else {
                    180: 		$learning_resource_type = 0;
                    181: 	}
                    182: 	
                    183: 	my $media_format;
                    184: 	if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
                    185: 		$media_format = 70;
                    186: 	} else {
                    187: 		$media_format = 0;
                    188: 	}
                    189: 
1.8       www       190: 	my $language = $tkline[9];
                    191: # likelihood is that the following is true (people would bother if it is not)
                    192:         if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
                    193: # NSDL only does English
                    194:         if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
1.1       www       195: 	my $primary_language='en-US';
                    196: 	my $creation_date = $tkline[10];
1.3       www       197: 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1       www       198: 	my $revision_date = $tkline[11];
1.3       www       199: 	my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1       www       200: 	my $owner = $tkline[12];
                    201: 	my $rights_description;
                    202: 	my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
                    203: 	# Public means no login required
                    204: 
                    205: 	if ( $copyright eq 'public' ) {
                    206: 		$rights_description = 'LON-CAPA Public Resource. No login required.';
                    207: 	} elsif ($copyright eq 'domain') {
                    208: 		$rights_description = 'Restricted to certain LON-CAPA domains.';
                    209: 	} else {
                    210: 		$rights_description = 'LON-CAPA Default Use Restriction. Login required.';
                    211: 	}
                    212: 	# Domain means restricted to a particular LON-CAPA domain
                    213: 	# Defaults mean access open to any registered LON-CAPA user
                    214: 	# Private means open only to author of material
1.9     ! www       215:         unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; }
1.1       www       216: 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
1.4       www       217: #
1.8       www       218: # We actually do this
                    219: #
                    220:         $stats{$type}++;
                    221:         $filterstats{$type}++;
                    222: #
1.4       www       223: # Create path
                    224: #
                    225: 	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
                    226: 	unless (-e $basepath.'/'.$adom.'/'.$auname) { 
                    227: 	    mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
                    228: 	}
                    229: 	open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
                    230: 	print XML (<<ENDMETA);
                    231: <?xml version="1.0" encoding="UTF-8"?>
                    232: 
1.3       www       233: <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" 
                    234:           xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" 
                    235:           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
                    236:           xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ 
                    237:                               http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
                    238: >
                    239:     <title>$title</title>
1.9     ! www       240:     <creator>$author</creator>
1.3       www       241:     <identifier>$resourceurl</identifier>
1.9     ! www       242:     <subject>$postsubject</subject>
1.3       www       243:     <language>$primary_language</language>
                    244:     <description>$abstract</description>
                    245:     <date>$rev_year-$rev_month-$rev_day</date>
                    246: </oaidc:dc>
1.2       www       247: ENDMETA
1.4       www       248:       close (XML);
1.5       www       249: }
1.8       www       250: foreach my $thistype (sort keys %stats) {
                    251:    print "\n$thistype: $stats{$thistype}";
                    252: }
                    253: print "\n----\n";
                    254: }
                    255: print "\nDone.\n";
                    256: foreach my $thistype (sort keys %allstats) {
                    257:    print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
                    258: }
                    259: print "\n----\n";
                    260: foreach my $thislang (sort keys %knockoutlang) {
                    261: print "\n>$thislang<: $knockoutlang{$thislang}";
1.1       www       262: }
1.8       www       263: print "\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>