Diff for /nsdl/nsdlloncapaorg/harvester.pl between versions 1.2 and 1.7

version 1.2, 2003/07/28 20:14:17 | version 1.7, 2003/10/21 15:58:26
Line 12  use strict; Line 12  use strict;
 use LWP::UserAgent;  use LWP::UserAgent;
 use Getopt::Std;  use Getopt::Std;
 use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
   use IO::File;
   
   my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
   
 my $pub_month;  my $pub_month;
 my $pub_year;  my $pub_year;
Line 25  my $content_regex = 'File Not Found'; Line 28  my $content_regex = 'File Not Found';
 # Configuration  # Configuration
   
 my $debug = 0;  my $debug = 0;
 my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';  
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab  # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');  my @servers = (
   'newscience.westshore.cc.mi.us',
   's10.lite.msu.edu',
   's12.lite.msu.edu',
   'lon-capa.chem.sunysb.edu',
   'schubert.tmcc.edu',
   'dalton.chem.sfu.ca',
   'capa2.phy.ohiou.edu',
   'pollux.physics.fsu.edu',
   'loncapa.physics.sc.edu',
   'loncapa.math.ucf.edu',
   'zappa.ags.udel.edu',
   'loncapa.gwu.edu',
   'neptune.physics.ndsu.nodak.edu',
   'capa1.uwsp.edu',
   'natasha.it.fit.edu',
   'loncapa.Mines.EDU',
   'loncapa.chm.nau.edu');
   
   foreach (@servers) {
       my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
 # End Configuration  # End Configuration
   
 my $ua = new LWP::UserAgent;  my $ua = new LWP::UserAgent;
Line 40  $request->authorization_basic('reaper', Line 62  $request->authorization_basic('reaper',
 my $response = $ua->request( $request );  my $response = $ua->request( $request );
   
 if ( $response->is_success ) {  if ( $response->is_success ) {
        print 'SUCCESS: ' . $response->message.' for '.$url."\n\n";
  $content = $response->content;   $content = $response->content;
 # Delete all blank lines  # Delete all blank lines
  $content =~ s/(?<!.)\n//g;   $content =~ s/(?<!.)\n//g;
Line 48  if ( $response->is_success ) { Line 71  if ( $response->is_success ) {
 # Push the content into an array  # Push the content into an array
  @loncapa = split /\n/, $content;   @loncapa = split /\n/, $content;
 } else {  } else {
  die 'LON-CAPA request failed: ' . $response->message;       print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n";
        next;
 }  }
   
 #@loncapa=undef;  #@loncapa=undef;
Line 60  if ( $response->is_success ) { Line 84  if ( $response->is_success ) {
 #}  #}
   
 my %records = ();;  my %records = ();;
   
 foreach my $metadata (@loncapa) {  foreach my $metadata (@loncapa) {
  chomp $metadata;   chomp $metadata;
  $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;   $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
Line 82  foreach my $metadata (@loncapa) { Line 107  foreach my $metadata (@loncapa) {
  next if ( ($subject eq 'Sample') || ($subject eq 'Something') );   next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
  my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];   my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
         my $baseid=$tkline[3];          my $baseid=$tkline[3];
    my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
  $baseid=~s/\W/\_/g;   $baseid=~s/\W/\_/g;
  $baseid=~s/^\_res\_//g;   $baseid=~s/^\_res\_//g;
    my $fileid=md5_hex($baseid);
   
  next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );   next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
  my $keywords = $tkline[4];   my $keywords = $tkline[4];
  my $version = $tkline[5];   my $version = $tkline[5];
  my $notes = $tkline[6];   my $notes = $tkline[6];
  my $abstract = $tkline[7];   my $abstract = $tkline[7];
  next if ($abstract eq '');   unless ($abstract) { $abstract=$subject; }
    unless ($abstract) { $abstract=$title; }
    unless ($abstract) { $abstract=$keywords; }
  my $type = $tkline[8];   my $type = $tkline[8];
  my $learning_resource_type;   my $learning_resource_type;
  if ( $type eq 'problem' ) {   if ( $type eq 'problem' ) {
Line 128  foreach my $metadata (@loncapa) { Line 157  foreach my $metadata (@loncapa) {
  next if ( $language ne 'seniso');   next if ( $language ne 'seniso');
  my $primary_language='en-US';   my $primary_language='en-US';
  my $creation_date = $tkline[10];   my $creation_date = $tkline[10];
  my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4})-(\d{2})-(\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );   my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
  my $revision_date = $tkline[11];   my $revision_date = $tkline[11];
    my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
  my $owner = $tkline[12];   my $owner = $tkline[12];
  my $rights_description;   my $rights_description;
  my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)   my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
Line 146  foreach my $metadata (@loncapa) { Line 176  foreach my $metadata (@loncapa) {
  # Defaults mean access open to any registered LON-CAPA user   # Defaults mean access open to any registered LON-CAPA user
  # Private means open only to author of material   # Private means open only to author of material
  next if ( $copyright eq 'private');   next if ( $copyright eq 'private');
    next if ( $copyright eq 'domain');
  my $platform = "5";     # HTML Browser (not specified but construed from metadata)   my $platform = "5";     # HTML Browser (not specified but construed from metadata)
  print (<<ENDMETA);  #
 <rdf about="lon-capa.nsdl.collections/$baseid">  # Create path
     <dc:title>$title</dc:title>  #
     <dc:creator>$author_fname $author_lname</dc:creator>   unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
     <dc:subject>$keywords</dc:subject>   unless (-e $basepath.'/'.$adom.'/'.$auname) { 
     <dc:subject>$subject</dc:subject>      mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
     <dc:identifier scheme="URI">$resourceurl</dc:identifier>   }
     <dc:language>$primary_language</dc:language>   open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
     <dc:description>$abstract<dc:description>   print XML (<<ENDMETA);
     <dc:date>$revision_date</dc:date>  <?xml version="1.0" encoding="UTF-8"?>
 </rdf>  
   <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/
             xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ 
                                 http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
   >
       <title>$title</title>
       <creator>$author_fname $author_lname</creator>
       <identifier>$resourceurl</identifier>
       <subject>$keywords</subject>
       <subject>$subject</subject>
       <language>$primary_language</language>
       <description>$abstract</description>
       <date>$rev_year-$rev_month-$rev_day</date>
   </oaidc:dc>
 ENDMETA  ENDMETA
         close (XML);
   }
 }  }
   

Legend: Removed from v.1.2 | changed lines | Added in v.1.7


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>