File:  [LON-CAPA] / nsdl / nsdlloncapaorg / harvester.pl
Revision 1.9: download - view: text, annotated - select for diffs
Wed May 10 16:28:56 2006 UTC (17 years, 10 months ago) by www
Branches: MAIN
CVS tags: HEAD
More NSDLy.

#!/usr/local/bin/perl

#
# lon-capa.pl
# Parse the LON-CAPA metadata
#
# Andy Dong <adong@smete.org> 10/23/2002
#
# Contact Gerd Kortemeyer (korte@lite.msu.edu)

use strict;
use LWP::UserAgent;
use Getopt::Std;
use Digest::MD5 qw(md5_hex);
use IO::File;

my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';

my $pub_month;
my $pub_year;
my @loncapa;

# HTTP requests

my $content;
my $content_regex = 'File Not Found';

# Configuration

my $debug = 0;

# Stats
my %allstats=();
my %filterstats=();
my %knockout=();
my %knockoutlang=();

# The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
my @servers = (
'newscience.westshore.edu',
's10.lite.msu.edu',
's12.lite.msu.edu',
'schubert.tmcc.edu',
'dalton.chem.sfu.ca',
'capa2.phy.ohiou.edu',
'pollux.physics.fsu.edu',
'loncapa3.physics.sc.edu',
'zappa.ags.udel.edu',
'loncapa.gwu.edu',
'neptune.physics.ndsu.nodak.edu',
'capa1.uwsp.edu',
'loncapa.Mines.EDU',
'loncapa.chm.nau.edu',
'library1.lon-capa.uiuc.edu',
'lon-capa.bsu.edu',
'psblnx03.bd.psu.edu',
'lon-capa.acadiau.ca',
'harvard.lon-capa.org',
'capa1.cc.huji.ac.il',
'lon-capa.phy.cmich.edu',
'meitner.physics.hope.edu',
'loncapa.vcu.edu',
'lon-capa.ucsc.edu',
'lon-capa.bsu.edu',
'harvard.lon-capa.org'
);

foreach (@servers) {
    my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
# End Configuration

my $ua = new LWP::UserAgent;
$ua->timeout(600);

my $request = new HTTP::Request GET => $url;
$request->authorization_basic('reaper', 'cat4u');

my $response = $ua->request( $request );

if ( $response->is_success ) {
     print 'SUCCESS: ' . $response->message.' for '.$url."\n\n";
	$content = $response->content;
# Delete all blank lines
	$content =~ s/(?<!.)\n//g;
# Replace all ^M with spaces
	$content =~ s/
/\s/g;
# Push the content into an array
	@loncapa = split /\n/, $content;
} else {
     print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n";
     next;
}

#@loncapa=undef;
#open (LON_FILE, 'metadata_harvest.txt') || die;

#while (<LON_FILE>) {
#       chomp;
#       push(@loncapa,$_);
#}

my %records = ();;

my %stats=();

foreach my $metadata (@loncapa) {
	chomp $metadata;
	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
	my @tkline = split('\|', $metadata);
        my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
        $rawtype=~tr/A-Z/a-z/;
        $allstats{$rawtype}++;
        
        my $title = $tkline[0];
	if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
	my $author = $tkline[1];
	if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
	my @authorname = split(' ', $author);
	my $author_fname = $authorname[0];
	my $author_lname = $authorname[1];
	# We have to make an exception for Multimedia Physics which is an organization not a person
	my $object_type;
	if ( $author_lname eq 'Physics' ) {
		$object_type = 'organization';
	} else {
		$object_type = 'person';
	}
	my $subject = $tkline[2];
	next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
	my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
        my $baseid=$tkline[3];
	my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
	$baseid=~s/\W/\_/g;
	$baseid=~s/^\_res\_//g;
	my $fileid=md5_hex($baseid);

	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
# too many fragments out there
        next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);

	my $keywords = $tkline[4];
	my $version = $tkline[5];
	my $notes = $tkline[6];
	my $abstract = $tkline[7];
        $abstract=~s/ s / /gs;
        $abstract=~s/\s+/ /gs;
        my $postsubject=$subject;
        unless ($postsubject) {
           $postsubject=$keywords;
        } else {
           $postsubject.=' ('.$keywords.')';
        }
        unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; }
        unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; }
	my $type = $rawtype;
        if ($type=~/htm/) { $type='htm'; }

	my $learning_resource_type;
	if ( $type eq 'problem' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'exam' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'quiz' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'assess' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'survey' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'form' ) {
		$learning_resource_type = 114;
	} elsif ( $type eq 'library' ) {
		$learning_resource_type = 107;
	} elsif ( $type eq 'page' ) {
		$learning_resource_type = 104;
	} elsif ( $type eq 'sequence' ) {
		$learning_resource_type = 104;
	} elsif ( $type eq 'spreadsheet' ) {
		$learning_resource_type = 114;
	} else {
		$learning_resource_type = 0;
	}
	
	my $media_format;
	if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
		$media_format = 70;
	} else {
		$media_format = 0;
	}

	my $language = $tkline[9];
# likelihood is that the following is true (people would bother if it is not)
        if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
# NSDL only does English
        if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
	my $primary_language='en-US';
	my $creation_date = $tkline[10];
	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
	my $revision_date = $tkline[11];
	my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
	my $owner = $tkline[12];
	my $rights_description;
	my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
	# Public means no login required

	if ( $copyright eq 'public' ) {
		$rights_description = 'LON-CAPA Public Resource. No login required.';
	} elsif ($copyright eq 'domain') {
		$rights_description = 'Restricted to certain LON-CAPA domains.';
	} else {
		$rights_description = 'LON-CAPA Default Use Restriction. Login required.';
	}
	# Domain means restricted to a particular LON-CAPA domain
	# Defaults mean access open to any registered LON-CAPA user
	# Private means open only to author of material
        unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; }
	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
#
# We actually do this
#
        $stats{$type}++;
        $filterstats{$type}++;
#
# Create path
#
	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
	unless (-e $basepath.'/'.$adom.'/'.$auname) { 
	    mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
	}
	open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
	print XML (<<ENDMETA);
<?xml version="1.0" encoding="UTF-8"?>

<oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" 
          xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" 
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
          xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ 
                              http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
>
    <title>$title</title>
    <creator>$author</creator>
    <identifier>$resourceurl</identifier>
    <subject>$postsubject</subject>
    <language>$primary_language</language>
    <description>$abstract</description>
    <date>$rev_year-$rev_month-$rev_day</date>
</oaidc:dc>
ENDMETA
      close (XML);
}
foreach my $thistype (sort keys %stats) {
   print "\n$thistype: $stats{$thistype}";
}
print "\n----\n";
}
print "\nDone.\n";
foreach my $thistype (sort keys %allstats) {
   print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
}
print "\n----\n";
foreach my $thislang (sort keys %knockoutlang) {
print "\n>$thislang<: $knockoutlang{$thislang}";
}
print "\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>