Diff for /nsdl/nsdlloncapaorg/harvester.pl between versions 1.1 and 1.2

version 1.1, 2003/07/28 14:27:05 | version 1.2, 2003/07/28 20:14:17
Line 11 Line 11
 use strict;  use strict;
 use LWP::UserAgent;  use LWP::UserAgent;
 use Getopt::Std;  use Getopt::Std;
   use Digest::MD5 qw(md5_hex);
 use DBI;  
 use DBD::ODBC;  
   
 require OAIcataloging_v2;  
   
 # -u flag specifies [u]pdate database; otherwise output to STDOUT  
   
 my $usage = << "EOT";  
 Usage: lon-capa.pl -u  
   
     -u (U)pdate the database  
   
     Without -u it simply prints SQL UPDATE statements to STDOUT  
 EOT  
   
 my %args;  
 getopts('u', \%args) || die $usage;  
   
 my $useDatabase = 1 if ($args{'u'});  
   
 #my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1_dev.odbc';  
 my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1.odbc';  
 my $DBI_USER='autocataloger';  
 my $DBI_PWD='regolatacotua';  
 my $dbh;  
   
 my $pub_month;  my $pub_month;
 my $pub_year;  my $pub_year;
Line 50  my $content_regex = 'File Not Found'; Line 25  my $content_regex = 'File Not Found';
 # Configuration  # Configuration
   
 my $debug = 0;  my $debug = 0;
 my $url = 'http://data.lite.msu.edu/cgi-bin/metadata_harvest.pl';  my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab  # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');  my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
   
 # End Configuration  # End Configuration
   
 #my $ua = new LWP::UserAgent;  my $ua = new LWP::UserAgent;
 #$ua->timeout(600);  $ua->timeout(600);
   
 #my $request = new HTTP::Request GET => $url;  my $request = new HTTP::Request GET => $url;
 #$request->authorization_basic('reaper', 'cat4u');  $request->authorization_basic('reaper', 'cat4u');
   
 #my $response = $ua->request( $request );  my $response = $ua->request( $request );
   
 #if ( $response->is_success ) {  if ( $response->is_success ) {
 # $content = $response->content;   $content = $response->content;
 # Delete all blank lines  # Delete all blank lines
 # $content =~ s/(?<!.)\n//g;   $content =~ s/(?<!.)\n//g;
 # Replace all ^M with spaces  # Replace all ^M with spaces
 # $content =~ s/ /\s/g;   $content =~ s/ /\s/g;
 # Push the content into an array  # Push the content into an array
 # @loncapa = split /\n/, $content;   @loncapa = split /\n/, $content;
 #} else {  } else {
 # die 'LON-CAPA request failed: ' . $response->message;   die 'LON-CAPA request failed: ' . $response->message;
 #}  }
   
 @loncapa=undef;  #@loncapa=undef;
 open (LON_FILE, 'metadata_harvest.txt') || die;  #open (LON_FILE, 'metadata_harvest.txt') || die;
   
 while (<LON_FILE>) {  #while (<LON_FILE>) {
        chomp;  #       chomp;
        push(@loncapa,$_);  #       push(@loncapa,$_);
 }  #}
   
 my %records = ();;  my %records = ();;
 foreach my $metadata (@loncapa) {  foreach my $metadata (@loncapa) {
  chomp $metadata;   chomp $metadata;
    $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
  my @tkline = split('\|', $metadata);   my @tkline = split('\|', $metadata);
  my $title = $tkline[0];   my $title = $tkline[0];
  next if ( $title eq '' );   next if ( $title eq '' );
Line 104  foreach my $metadata (@loncapa) { Line 80  foreach my $metadata (@loncapa) {
  }   }
  my $subject = $tkline[2];   my $subject = $tkline[2];
  next if ( ($subject eq 'Sample') || ($subject eq 'Something') );   next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
  my $resourceurl = 'http://lon-capa.smete.org' . $tkline[3];   my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
           my $baseid=$tkline[3];
    $baseid=~s/\W/\_/g;
    $baseid=~s/^\_res\_//g;
   
  next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );   next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
  my $keywords = $tkline[4];   my $keywords = $tkline[4];
  my $version = $tkline[5];   my $version = $tkline[5];
Line 167  foreach my $metadata (@loncapa) { Line 147  foreach my $metadata (@loncapa) {
  # Private means open only to author of material   # Private means open only to author of material
  next if ( $copyright eq 'private');   next if ( $copyright eq 'private');
  my $platform = "5";     # HTML Browser (not specified but construed from metadata)   my $platform = "5";     # HTML Browser (not specified but construed from metadata)
    print (<<ENDMETA);
   <rdf about="lon-capa.nsdl.collections/$baseid">
       <dc:title>$title</dc:title>
       <dc:creator>$author_fname $author_lname</dc:creator>
       <dc:subject>$keywords</dc:subject>
       <dc:subject>$subject</dc:subject>
       <dc:identifier scheme="URI">$resourceurl</dc:identifier>
       <dc:language>$primary_language</dc:language>
       <dc:description>$abstract<dc:description>
       <dc:date>$revision_date</dc:date>
   </rdf>
   
 # Connect to database  ENDMETA
 if ( $useDatabase ) {  
  $dbh= DBI->connect($DBI_DSN, $DBI_USER, $DBI_PWD, { RaiseError => 1, AutoCommit => 0 }) || die "Unable to connect to database $DBI_DSN as $DBI_USER: ($DBI::err) $DBI::errstr\n";;  
  # Configuration information for LON-CAPA  
  my $collection_id = OAIc_orgexists($dbh,'LearningOnline Network with CAPA');  
  my $submitter_id = OAIc_personexists($dbh,'adong@smete.org');  
  my $image = 'http://www.lite.msu.edu/liteani.gif';  
  my $cost = 1; # version.purchase_license_type_id  
  my $collection = 'LearningOnline Network with CAPA';  
  # LON-CAPA has single authors  
  my $reg_key;  
  if ( $object_type eq 'organization' ) {  
  if ( ! ($reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname))) ) {  
  printf("Inserting new organization %s\n", join(' ',$author_fname, $author_lname));  
  my $success = OAIc_insert_org($dbh,$collection_id,$submitter_id,'',join(' ',$author_fname,$author_lname),'','','','','','','','');  
  $reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname));  
  }  
  } else {  
  if ( ! ($reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname))) ) {  
  printf("Inserting new person(author) %s\n", join(' ',$author_fname, $author_lname));  
  my $success = OAIc_insert_person($dbh,$collection_id,$submitter_id,$author_lname,$author_fname,'','');  
  $reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname));  
  }  
  }  
  my $updated;  
  my $inserted;  
  if ( my $general_key = OAIc_loexists($dbh,$title) ) {  
  # Do nothing  
  $updated = $updated + 1;  
  } else {  
  printf("Inserting new record for %s\n",$title);  
  my $success = OAIc_insert_lo($dbh, $title, $primary_language, $abstract, $image, $pub_month, $pub_year, $keywords, $submitter_id, $reg_key, $collection_id, $collection_id, $media_format, $platform, , '', $resourceurl, '', 1, $reg_key, $collection_id, $collection_id, '', '', '', $learning_resource_type, $rights_description, $cost);  
  $inserted = $inserted + 1;  
  }  
 }  }
   
 if (! $useDatabase ) { # Print information if no database updates requested  
  printf("Title: %s\n", $title);  
  printf("Author First Name: %s\n", $author_fname);  
  printf("Author Last Name: %s\n", $author_lname);  
  printf("Subject: %s\n", $subject);  
  printf("URL: %s\n", $resourceurl);  
  printf("Keywords: %s\n", $keywords);  
  printf("Version: %s\n", $version);  
  printf("Notes: %s\n", $notes);  
  printf("Abstract: %s\n", $abstract);  
  printf("Learning Resource Type: %d\n", $learning_resource_type);  
  printf("Media Format: %d\n", $media_format);  
  printf("Primary Language: %s\n", $primary_language);  
  printf("Creation Date: %s\n", $creation_date);  
  printf("Revision Date: %s\n", $revision_date);  
  printf("Copyright: %s\n", $copyright);  
  printf("Publication Year: %4d\tPublication Month: %02d\n", $pub_year, $pub_month);  
 }  
   
 if ( $useDatabase ) {  
  $dbh->commit;  
  $dbh->disconnect;  
 }  
   
 }  

Legend: Removed from v.1.1 | changed lines | Added in v.1.2


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>