--- nsdl/nsdlloncapaorg/harvester.pl 2003/07/28 14:27:05 1.1 +++ nsdl/nsdlloncapaorg/harvester.pl 2005/11/25 19:29:56 1.8 @@ -11,32 +11,10 @@ use strict; use LWP::UserAgent; use Getopt::Std; +use Digest::MD5 qw(md5_hex); +use IO::File; -use DBI; -use DBD::ODBC; - -require OAIcataloging_v2; - -# -u flag specifies [u]pdate database; otherwise output to STDOUT - -my $usage = << "EOT"; -Usage: lon-capa.pl -u - - -u (U)pdate the database - - Without -u it simply prints SQL UPDATE statements to STDOUT -EOT - -my %args; -getopts('u', \%args) || die $usage; - -my $useDatabase = 1 if ($args{'u'}); - -#my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1_dev.odbc'; -my $DBI_DSN='dbi:ODBC:needs2_mel_needs_3_1.odbc'; -my $DBI_USER='autocataloger'; -my $DBI_PWD='regolatacotua'; -my $dbh; +my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data'; my $pub_month; my $pub_year; @@ -50,48 +28,92 @@ my $content_regex = 'File Not Found'; # Configuration my $debug = 0; -my $url = 'http://data.lite.msu.edu/cgi-bin/metadata_harvest.pl'; + +# Stats +my %allstats=(); +my %filterstats=(); +my %knockout=(); +my %knockoutlang=(); + # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab -my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu'); +my @servers = ( +'newscience.westshore.edu', +'s10.lite.msu.edu', +'s12.lite.msu.edu', +'schubert.tmcc.edu', +'dalton.chem.sfu.ca', +'capa2.phy.ohiou.edu', +'pollux.physics.fsu.edu', +'loncapa3.physics.sc.edu', +'zappa.ags.udel.edu', +'loncapa.gwu.edu', +'neptune.physics.ndsu.nodak.edu', +'capa1.uwsp.edu', +'loncapa.Mines.EDU', +'loncapa.chm.nau.edu', +'library1.lon-capa.uiuc.edu', +'lon-capa.bsu.edu', +'psblnx03.bd.psu.edu', +'lon-capa.acadiau.ca', +'harvard.lon-capa.org', +'capa1.cc.huji.ac.il', +'lon-capa.phy.cmich.edu', +'meitner.physics.hope.edu', +'loncapa.vcu.edu', +'lon-capa.ucsc.edu', +'lon-capa.bsu.edu' +); +foreach (@servers) { + my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl'; # End Configuration -#my $ua = new LWP::UserAgent; -#$ua->timeout(600); +my $ua = new LWP::UserAgent; +$ua->timeout(600); -#my $request = new HTTP::Request GET => $url; -#$request->authorization_basic('reaper', 'cat4u'); +my $request = new HTTP::Request GET => $url; +$request->authorization_basic('reaper', 'cat4u'); -#my $response = $ua->request( $request ); +my $response = $ua->request( $request ); -#if ( $response->is_success ) { -# $content = $response->content; +if ( $response->is_success ) { + print 'SUCCESS: ' . $response->message.' for '.$url."\n\n"; + $content = $response->content; # Delete all blank lines -# $content =~ s/(?message; -#} + @loncapa = split /\n/, $content; +} else { + print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n"; + next; +} -@loncapa=undef; -open (LON_FILE, 'metadata_harvest.txt') || die; +#@loncapa=undef; +#open (LON_FILE, 'metadata_harvest.txt') || die; -while () { - chomp; - push(@loncapa,$_); -} +#while () { +# chomp; +# push(@loncapa,$_); +#} my %records = ();; + +my %stats=(); + foreach my $metadata (@loncapa) { chomp $metadata; + $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs; my @tkline = split('\|', $metadata); - my $title = $tkline[0]; - next if ( $title eq '' ); + my ($rawtype)=($tkline[3]=~/\.(\w+)$/); + $rawtype=~tr/A-Z/a-z/; + $allstats{$rawtype}++; + + my $title = $tkline[0]; + if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; } my $author = $tkline[1]; - next if ( $author eq '' ); + if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; } my @authorname = split(' ', $author); my $author_fname = $authorname[0]; my $author_lname = $authorname[1]; @@ -104,14 +126,27 @@ foreach my $metadata (@loncapa) { } my $subject = $tkline[2]; next if ( ($subject eq 'Sample') || ($subject eq 'Something') ); - my $resourceurl = 'http://lon-capa.smete.org' . $tkline[3]; + my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3]; + my $baseid=$tkline[3]; + my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//); + $baseid=~s/\W/\_/g; + $baseid=~s/^\_res\_//g; + my $fileid=md5_hex($baseid); + next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ ); +# too many fragments out there + next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i); + my $keywords = $tkline[4]; my $version = $tkline[5]; my $notes = $tkline[6]; my $abstract = $tkline[7]; - next if ($abstract eq ''); - my $type = $tkline[8]; + unless ($abstract) { $abstract=$subject; } + unless ($abstract) { $abstract=$title; } + unless ($abstract) { $abstract=$keywords; } + my $type = $rawtype; + if ($type=~/htm/) { $type='htm'; } + my $learning_resource_type; if ( $type eq 'problem' ) { $learning_resource_type = 114; @@ -144,12 +179,16 @@ foreach my $metadata (@loncapa) { $media_format = 0; } - my $language = $tkline[9]; # Look only for seniso - next if ( $language ne 'seniso'); + my $language = $tkline[9]; +# likelihood is that the following is true (people would bother if it is not) + if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; } +# NSDL only does English + if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } my $primary_language='en-US'; my $creation_date = $tkline[10]; - my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4})-(\d{2})-(\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); + my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); my $revision_date = $tkline[11]; + my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); my $owner = $tkline[12]; my $rights_description; my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain) @@ -165,67 +204,55 @@ foreach my $metadata (@loncapa) { # Domain means restricted to a particular LON-CAPA domain # Defaults mean access open to any registered LON-CAPA user # Private means open only to author of material - next if ( $copyright eq 'private'); + if ( $copyright eq 'private') { $knockout{'private_'.$rawtype}++; next; } + if ( $copyright eq 'domain') { $knockout{'domain_'.$rawtype}++; next; } + if ( $copyright eq 'custom') { $knockout{'custom_'.$rawtype}++; next; } my $platform = "5"; # HTML Browser (not specified but construed from metadata) - -# Connect to database -if ( $useDatabase ) { - $dbh= DBI->connect($DBI_DSN, $DBI_USER, $DBI_PWD, { RaiseError => 1, AutoCommit => 0 }) || die "Unable to connect to database $DBI_DSN as $DBI_USER: ($DBI::err) $DBI::errstr\n";; - # Configuration information for LON-CAPA - my $collection_id = OAIc_orgexists($dbh,'LearningOnline Network with CAPA'); - my $submitter_id = OAIc_personexists($dbh,'adong@smete.org'); - my $image = 'http://www.lite.msu.edu/liteani.gif'; - my $cost = 1; # version.purchase_license_type_id - my $collection = 'LearningOnline Network with CAPA'; - # LON-CAPA has single authors - my $reg_key; - if ( $object_type eq 'organization' ) { - if ( ! ($reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname))) ) { - printf("Inserting new organization %s\n", join(' ',$author_fname, $author_lname)); - my $success = OAIc_insert_org($dbh,$collection_id,$submitter_id,'',join(' ',$author_fname,$author_lname),'','','','','','','',''); - $reg_key = OAIc_orgexists($dbh,join(' ',$author_fname,$author_lname)); - } - } else { - if ( ! ($reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname))) ) { - printf("Inserting new person(author) %s\n", join(' ',$author_fname, $author_lname)); - my $success = OAIc_insert_person($dbh,$collection_id,$submitter_id,$author_lname,$author_fname,'',''); - $reg_key = OAIc_personexists_name($dbh,join(' ',$author_fname,$author_lname)); - } - } - my $updated; - my $inserted; - if ( my $general_key = OAIc_loexists($dbh,$title) ) { - # Do nothing - $updated = $updated + 1; - } else { - printf("Inserting new record for %s\n",$title); - my $success = OAIc_insert_lo($dbh, $title, $primary_language, $abstract, $image, $pub_month, $pub_year, $keywords, $submitter_id, $reg_key, $collection_id, $collection_id, $media_format, $platform, , '', $resourceurl, '', 1, $reg_key, $collection_id, $collection_id, '', '', '', $learning_resource_type, $rights_description, $cost); - $inserted = $inserted + 1; - } +# +# We actually do this +# + $stats{$type}++; + $filterstats{$type}++; +# +# Create path +# + unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); } + unless (-e $basepath.'/'.$adom.'/'.$auname) { + mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname; + } + open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml'); + print XML (< + + + $title + $author_fname $author_lname + $resourceurl + $keywords + $subject + $primary_language + $abstract + $rev_year-$rev_month-$rev_day + +ENDMETA + close (XML); } - -if (! $useDatabase ) { # Print information if no database updates requested - printf("Title: %s\n", $title); - printf("Author First Name: %s\n", $author_fname); - printf("Author Last Name: %s\n", $author_lname); - printf("Subject: %s\n", $subject); - printf("URL: %s\n", $resourceurl); - printf("Keywords: %s\n", $keywords); - printf("Version: %s\n", $version); - printf("Notes: %s\n", $notes); - printf("Abstract: %s\n", $abstract); - printf("Learning Resource Type: %d\n", $learning_resource_type); - printf("Media Format: %d\n", $media_format); - printf("Primary Language: %s\n", $primary_language); - printf("Creation Date: %s\n", $creation_date); - printf("Revision Date: %s\n", $revision_date); - printf("Copyright: %s\n", $copyright); - printf("Publication Year: %4d\tPublication Month: %02d\n", $pub_year, $pub_month); -} - -if ( $useDatabase ) { - $dbh->commit; - $dbh->disconnect; +foreach my $thistype (sort keys %stats) { + print "\n$thistype: $stats{$thistype}"; } - +print "\n----\n"; +} +print "\nDone.\n"; +foreach my $thistype (sort keys %allstats) { + print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}"; +} +print "\n----\n"; +foreach my $thislang (sort keys %knockoutlang) { +print "\n>$thislang<: $knockoutlang{$thislang}"; } +print "\n";