--- nsdl/nsdlloncapaorg/harvester.pl 2003/07/29 14:37:51 1.4 +++ nsdl/nsdlloncapaorg/harvester.pl 2006/05/10 16:28:56 1.9 @@ -28,10 +28,45 @@ my $content_regex = 'File Not Found'; # Configuration my $debug = 0; -my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl'; + +# Stats +my %allstats=(); +my %filterstats=(); +my %knockout=(); +my %knockoutlang=(); + # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab -my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu'); +my @servers = ( +'newscience.westshore.edu', +'s10.lite.msu.edu', +'s12.lite.msu.edu', +'schubert.tmcc.edu', +'dalton.chem.sfu.ca', +'capa2.phy.ohiou.edu', +'pollux.physics.fsu.edu', +'loncapa3.physics.sc.edu', +'zappa.ags.udel.edu', +'loncapa.gwu.edu', +'neptune.physics.ndsu.nodak.edu', +'capa1.uwsp.edu', +'loncapa.Mines.EDU', +'loncapa.chm.nau.edu', +'library1.lon-capa.uiuc.edu', +'lon-capa.bsu.edu', +'psblnx03.bd.psu.edu', +'lon-capa.acadiau.ca', +'harvard.lon-capa.org', +'capa1.cc.huji.ac.il', +'lon-capa.phy.cmich.edu', +'meitner.physics.hope.edu', +'loncapa.vcu.edu', +'lon-capa.ucsc.edu', +'lon-capa.bsu.edu', +'harvard.lon-capa.org' +); +foreach (@servers) { + my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl'; # End Configuration my $ua = new LWP::UserAgent; @@ -43,6 +78,7 @@ $request->authorization_basic('reaper', my $response = $ua->request( $request ); if ( $response->is_success ) { + print 'SUCCESS: ' . $response->message.' for '.$url."\n\n"; $content = $response->content; # Delete all blank lines $content =~ s/(?is_success ) { # Push the content into an array @loncapa = split /\n/, $content; } else { - die 'LON-CAPA request failed: ' . $response->message; + print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n"; + next; } #@loncapa=undef; @@ -64,14 +101,20 @@ if ( $response->is_success ) { my %records = ();; +my %stats=(); + foreach my $metadata (@loncapa) { chomp $metadata; $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs; my @tkline = split('\|', $metadata); - my $title = $tkline[0]; - next if ( $title eq '' ); + my ($rawtype)=($tkline[3]=~/\.(\w+)$/); + $rawtype=~tr/A-Z/a-z/; + $allstats{$rawtype}++; + + my $title = $tkline[0]; + if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; } my $author = $tkline[1]; - next if ( $author eq '' ); + if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; } my @authorname = split(' ', $author); my $author_fname = $authorname[0]; my $author_lname = $authorname[1]; @@ -92,12 +135,26 @@ foreach my $metadata (@loncapa) { my $fileid=md5_hex($baseid); next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ ); +# too many fragments out there + next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i); + my $keywords = $tkline[4]; my $version = $tkline[5]; my $notes = $tkline[6]; my $abstract = $tkline[7]; - next if ($abstract eq ''); - my $type = $tkline[8]; + $abstract=~s/ s / /gs; + $abstract=~s/\s+/ /gs; + my $postsubject=$subject; + unless ($postsubject) { + $postsubject=$keywords; + } else { + $postsubject.=' ('.$keywords.')'; + } + unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; } + unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; } + my $type = $rawtype; + if ($type=~/htm/) { $type='htm'; } + my $learning_resource_type; if ( $type eq 'problem' ) { $learning_resource_type = 114; @@ -130,8 +187,11 @@ foreach my $metadata (@loncapa) { $media_format = 0; } - my $language = $tkline[9]; # Look only for seniso - next if ( $language ne 'seniso'); + my $language = $tkline[9]; +# likelihood is that the following is true (people would bother if it is not) + if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; } +# NSDL only does English + if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } my $primary_language='en-US'; my $creation_date = $tkline[10]; my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); @@ -152,9 +212,14 @@ foreach my $metadata (@loncapa) { # Domain means restricted to a particular LON-CAPA domain # Defaults mean access open to any registered LON-CAPA user # Private means open only to author of material - next if ( $copyright eq 'private'); + unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; } my $platform = "5"; # HTML Browser (not specified but construed from metadata) # +# We actually do this +# + $stats{$type}++; + $filterstats{$type}++; +# # Create path # unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); } @@ -172,10 +237,9 @@ foreach my $metadata (@loncapa) { http://www.openarchives.org/OAI/2.0/oai_dc.xsd" > $title - $author_fname $author_lname + $author $resourceurl - $keywords - $subject + $postsubject $primary_language $abstract $rev_year-$rev_month-$rev_day @@ -183,3 +247,17 @@ foreach my $metadata (@loncapa) { ENDMETA close (XML); } +foreach my $thistype (sort keys %stats) { + print "\n$thistype: $stats{$thistype}"; +} +print "\n----\n"; +} +print "\nDone.\n"; +foreach my $thistype (sort keys %allstats) { + print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}"; +} +print "\n----\n"; +foreach my $thislang (sort keys %knockoutlang) { +print "\n>$thislang<: $knockoutlang{$thislang}"; +} +print "\n";