File:  [LON-CAPA] / nsdl / nsdlloncapaorg / harvester.pl
Revision 1.8: download - view: text, annotated - select for diffs
Fri Nov 25 19:29:56 2005 UTC (18 years, 5 months ago) by www
Branches: MAIN
CVS tags: HEAD
Updated harvester script
- new hosts
- delete obsolete hosts
- keep stats
- detect all English docs (senisoUS, etc)

    1: #!/usr/local/bin/perl
    2: 
    3: #
    4: # lon-capa.pl
    5: # Parse the LON-CAPA metadata
    6: #
    7: # Andy Dong <adong@smete.org> 10/23/2002
    8: #
    9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
   10: 
   11: use strict;
   12: use LWP::UserAgent;
   13: use Getopt::Std;
   14: use Digest::MD5 qw(md5_hex);
   15: use IO::File;
   16: 
   17: my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
   18: 
   19: my $pub_month;
   20: my $pub_year;
   21: my @loncapa;
   22: 
   23: # HTTP requests
   24: 
   25: my $content;
   26: my $content_regex = 'File Not Found';
   27: 
   28: # Configuration
   29: 
   30: my $debug = 0;
   31: 
   32: # Stats
   33: my %allstats=();
   34: my %filterstats=();
   35: my %knockout=();
   36: my %knockoutlang=();
   37: 
   38: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
   39: my @servers = (
   40: 'newscience.westshore.edu',
   41: 's10.lite.msu.edu',
   42: 's12.lite.msu.edu',
   43: 'schubert.tmcc.edu',
   44: 'dalton.chem.sfu.ca',
   45: 'capa2.phy.ohiou.edu',
   46: 'pollux.physics.fsu.edu',
   47: 'loncapa3.physics.sc.edu',
   48: 'zappa.ags.udel.edu',
   49: 'loncapa.gwu.edu',
   50: 'neptune.physics.ndsu.nodak.edu',
   51: 'capa1.uwsp.edu',
   52: 'loncapa.Mines.EDU',
   53: 'loncapa.chm.nau.edu',
   54: 'library1.lon-capa.uiuc.edu',
   55: 'lon-capa.bsu.edu',
   56: 'psblnx03.bd.psu.edu',
   57: 'lon-capa.acadiau.ca',
   58: 'harvard.lon-capa.org',
   59: 'capa1.cc.huji.ac.il',
   60: 'lon-capa.phy.cmich.edu',
   61: 'meitner.physics.hope.edu',
   62: 'loncapa.vcu.edu',
   63: 'lon-capa.ucsc.edu',
   64: 'lon-capa.bsu.edu'
   65: );
   66: 
   67: foreach (@servers) {
   68:     my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
   69: # End Configuration
   70: 
   71: my $ua = new LWP::UserAgent;
   72: $ua->timeout(600);
   73: 
   74: my $request = new HTTP::Request GET => $url;
   75: $request->authorization_basic('reaper', 'cat4u');
   76: 
   77: my $response = $ua->request( $request );
   78: 
   79: if ( $response->is_success ) {
   80:      print 'SUCCESS: ' . $response->message.' for '.$url."\n\n";
   81: 	$content = $response->content;
   82: # Delete all blank lines
   83: 	$content =~ s/(?<!.)\n//g;
   84: # Replace all ^M with spaces
   85: 	$content =~ s/
/\s/g;
   86: # Push the content into an array
   87: 	@loncapa = split /\n/, $content;
   88: } else {
   89:      print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n";
   90:      next;
   91: }
   92: 
   93: #@loncapa=undef;
   94: #open (LON_FILE, 'metadata_harvest.txt') || die;
   95: 
   96: #while (<LON_FILE>) {
   97: #       chomp;
   98: #       push(@loncapa,$_);
   99: #}
  100: 
  101: my %records = ();;
  102: 
  103: my %stats=();
  104: 
  105: foreach my $metadata (@loncapa) {
  106: 	chomp $metadata;
  107: 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
  108: 	my @tkline = split('\|', $metadata);
  109:         my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
  110:         $rawtype=~tr/A-Z/a-z/;
  111:         $allstats{$rawtype}++;
  112:         
  113:         my $title = $tkline[0];
  114: 	if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
  115: 	my $author = $tkline[1];
  116: 	if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
  117: 	my @authorname = split(' ', $author);
  118: 	my $author_fname = $authorname[0];
  119: 	my $author_lname = $authorname[1];
  120: 	# We have to make an exception for Multimedia Physics which is an organization not a person
  121: 	my $object_type;
  122: 	if ( $author_lname eq 'Physics' ) {
  123: 		$object_type = 'organization';
  124: 	} else {
  125: 		$object_type = 'person';
  126: 	}
  127: 	my $subject = $tkline[2];
  128: 	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
  129: 	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
  130:         my $baseid=$tkline[3];
  131: 	my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
  132: 	$baseid=~s/\W/\_/g;
  133: 	$baseid=~s/^\_res\_//g;
  134: 	my $fileid=md5_hex($baseid);
  135: 
  136: 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
  137: # too many fragments out there
  138:         next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
  139: 
  140: 	my $keywords = $tkline[4];
  141: 	my $version = $tkline[5];
  142: 	my $notes = $tkline[6];
  143: 	my $abstract = $tkline[7];
  144: 	unless ($abstract) { $abstract=$subject; }
  145: 	unless ($abstract) { $abstract=$title; }
  146: 	unless ($abstract) { $abstract=$keywords; }
  147: 	my $type = $rawtype;
  148:         if ($type=~/htm/) { $type='htm'; }
  149: 
  150: 	my $learning_resource_type;
  151: 	if ( $type eq 'problem' ) {
  152: 		$learning_resource_type = 114;
  153: 	} elsif ( $type eq 'exam' ) {
  154: 		$learning_resource_type = 114;
  155: 	} elsif ( $type eq 'quiz' ) {
  156: 		$learning_resource_type = 114;
  157: 	} elsif ( $type eq 'assess' ) {
  158: 		$learning_resource_type = 114;
  159: 	} elsif ( $type eq 'survey' ) {
  160: 		$learning_resource_type = 114;
  161: 	} elsif ( $type eq 'form' ) {
  162: 		$learning_resource_type = 114;
  163: 	} elsif ( $type eq 'library' ) {
  164: 		$learning_resource_type = 107;
  165: 	} elsif ( $type eq 'page' ) {
  166: 		$learning_resource_type = 104;
  167: 	} elsif ( $type eq 'sequence' ) {
  168: 		$learning_resource_type = 104;
  169: 	} elsif ( $type eq 'spreadsheet' ) {
  170: 		$learning_resource_type = 114;
  171: 	} else {
  172: 		$learning_resource_type = 0;
  173: 	}
  174: 	
  175: 	my $media_format;
  176: 	if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
  177: 		$media_format = 70;
  178: 	} else {
  179: 		$media_format = 0;
  180: 	}
  181: 
  182: 	my $language = $tkline[9];
  183: # likelihood is that the following is true (people would bother if it is not)
  184:         if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
  185: # NSDL only does English
  186:         if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
  187: 	my $primary_language='en-US';
  188: 	my $creation_date = $tkline[10];
  189: 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
  190: 	my $revision_date = $tkline[11];
  191: 	my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
  192: 	my $owner = $tkline[12];
  193: 	my $rights_description;
  194: 	my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
  195: 	# Public means no login required
  196: 
  197: 	if ( $copyright eq 'public' ) {
  198: 		$rights_description = 'LON-CAPA Public Resource. No login required.';
  199: 	} elsif ($copyright eq 'domain') {
  200: 		$rights_description = 'Restricted to certain LON-CAPA domains.';
  201: 	} else {
  202: 		$rights_description = 'LON-CAPA Default Use Restriction. Login required.';
  203: 	}
  204: 	# Domain means restricted to a particular LON-CAPA domain
  205: 	# Defaults mean access open to any registered LON-CAPA user
  206: 	# Private means open only to author of material
  207: 	if ( $copyright eq 'private') { $knockout{'private_'.$rawtype}++; next; } 
  208: 	if ( $copyright eq 'domain') { $knockout{'domain_'.$rawtype}++; next; }
  209:         if ( $copyright eq 'custom') { $knockout{'custom_'.$rawtype}++; next; }
  210: 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
  211: #
  212: # We actually do this
  213: #
  214:         $stats{$type}++;
  215:         $filterstats{$type}++;
  216: #
  217: # Create path
  218: #
  219: 	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
  220: 	unless (-e $basepath.'/'.$adom.'/'.$auname) { 
  221: 	    mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
  222: 	}
  223: 	open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
  224: 	print XML (<<ENDMETA);
  225: <?xml version="1.0" encoding="UTF-8"?>
  226: 
  227: <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" 
  228:           xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" 
  229:           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
  230:           xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ 
  231:                               http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
  232: >
  233:     <title>$title</title>
  234:     <creator>$author_fname $author_lname</creator>
  235:     <identifier>$resourceurl</identifier>
  236:     <subject>$keywords</subject>
  237:     <subject>$subject</subject>
  238:     <language>$primary_language</language>
  239:     <description>$abstract</description>
  240:     <date>$rev_year-$rev_month-$rev_day</date>
  241: </oaidc:dc>
  242: ENDMETA
  243:       close (XML);
  244: }
  245: foreach my $thistype (sort keys %stats) {
  246:    print "\n$thistype: $stats{$thistype}";
  247: }
  248: print "\n----\n";
  249: }
  250: print "\nDone.\n";
  251: foreach my $thistype (sort keys %allstats) {
  252:    print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
  253: }
  254: print "\n----\n";
  255: foreach my $thislang (sort keys %knockoutlang) {
  256: print "\n>$thislang<: $knockoutlang{$thislang}";
  257: }
  258: print "\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>