Diff for /nsdl/nsdlloncapaorg/harvester.pl between versions 1.5 and 1.9

version 1.5, 2003/07/29 15:10:31 version 1.9, 2006/05/10 16:28:56
Line 29  my $content_regex = 'File Not Found'; Line 29  my $content_regex = 'File Not Found';
   
 my $debug = 0;  my $debug = 0;
   
   # Stats
   my %allstats=();
   my %filterstats=();
   my %knockout=();
   my %knockoutlang=();
   
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab  # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = (  my @servers = (
 'newscience.westshore.cc.mi.us',  'newscience.westshore.edu',
 's10.lite.msu.edu',  's10.lite.msu.edu',
 's12.lite.msu.edu',  's12.lite.msu.edu',
 'lon-capa.chem.sunysb.edu',  
 'schubert.tmcc.edu',  'schubert.tmcc.edu',
 'dalton.chem.sfu.ca',  'dalton.chem.sfu.ca',
 'capa2.phy.ohiou.edu',  'capa2.phy.ohiou.edu',
 'pollux.physics.fsu.edu',  'pollux.physics.fsu.edu',
 'loncapa.physics.sc.edu',  'loncapa3.physics.sc.edu',
 'loncapa.math.ucf.edu',  
 'zappa.ags.udel.edu',  'zappa.ags.udel.edu',
 'loncapa.gwu.edu',  'loncapa.gwu.edu',
 'neptune.physics.ndsu.nodak.edu',  'neptune.physics.ndsu.nodak.edu',
 'capa1.uwsp.edu');  'capa1.uwsp.edu',
   'loncapa.Mines.EDU',
   'loncapa.chm.nau.edu',
   'library1.lon-capa.uiuc.edu',
   'lon-capa.bsu.edu',
   'psblnx03.bd.psu.edu',
   'lon-capa.acadiau.ca',
   'harvard.lon-capa.org',
   'capa1.cc.huji.ac.il',
   'lon-capa.phy.cmich.edu',
   'meitner.physics.hope.edu',
   'loncapa.vcu.edu',
   'lon-capa.ucsc.edu',
   'lon-capa.bsu.edu',
   'harvard.lon-capa.org'
   );
   
 foreach (@servers) {  foreach (@servers) {
     my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';      my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
Line 82  if ( $response->is_success ) { Line 101  if ( $response->is_success ) {
   
 my %records = ();;  my %records = ();;
   
   my %stats=();
   
 foreach my $metadata (@loncapa) {  foreach my $metadata (@loncapa) {
  chomp $metadata;   chomp $metadata;
  $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;   $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
  my @tkline = split('\|', $metadata);   my @tkline = split('\|', $metadata);
  my $title = $tkline[0];          my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
  next if ( $title eq '' );          $rawtype=~tr/A-Z/a-z/;
           $allstats{$rawtype}++;
           
           my $title = $tkline[0];
    if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
  my $author = $tkline[1];   my $author = $tkline[1];
  next if ( $author eq '' );   if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
  my @authorname = split(' ', $author);   my @authorname = split(' ', $author);
  my $author_fname = $authorname[0];   my $author_fname = $authorname[0];
  my $author_lname = $authorname[1];   my $author_lname = $authorname[1];
Line 110  foreach my $metadata (@loncapa) { Line 135  foreach my $metadata (@loncapa) {
  my $fileid=md5_hex($baseid);   my $fileid=md5_hex($baseid);
   
  next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );   next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
   # too many fragments out there
           next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
   
  my $keywords = $tkline[4];   my $keywords = $tkline[4];
  my $version = $tkline[5];   my $version = $tkline[5];
  my $notes = $tkline[6];   my $notes = $tkline[6];
  my $abstract = $tkline[7];   my $abstract = $tkline[7];
  next if ($abstract eq '');          $abstract=~s/ s / /gs;
  my $type = $tkline[8];          $abstract=~s/\s+/ /gs;
           my $postsubject=$subject;
           unless ($postsubject) {
              $postsubject=$keywords;
           } else {
              $postsubject.=' ('.$keywords.')';
           }
           unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; }
           unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; }
    my $type = $rawtype;
           if ($type=~/htm/) { $type='htm'; }
   
  my $learning_resource_type;   my $learning_resource_type;
  if ( $type eq 'problem' ) {   if ( $type eq 'problem' ) {
  $learning_resource_type = 114;   $learning_resource_type = 114;
Line 148  foreach my $metadata (@loncapa) { Line 187  foreach my $metadata (@loncapa) {
  $media_format = 0;   $media_format = 0;
  }   }
   
  my $language = $tkline[9]; # Look only for seniso   my $language = $tkline[9];
  next if ( $language ne 'seniso');  # A language that was never set is most likely English (authors of non-English material would bother to set it)
           if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
   # NSDL only does English
           if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
  my $primary_language='en-US';   my $primary_language='en-US';
  my $creation_date = $tkline[10];   my $creation_date = $tkline[10];
  my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );   my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
Line 170  foreach my $metadata (@loncapa) { Line 212  foreach my $metadata (@loncapa) {
  # Domain means restricted to a particular LON-CAPA domain   # Domain means restricted to a particular LON-CAPA domain
  # Defaults mean access open to any registered LON-CAPA user   # Defaults mean access open to any registered LON-CAPA user
  # Private means open only to author of material   # Private means open only to author of material
  next if ( $copyright eq 'private');          unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; }
  my $platform = "5";     # HTML Browser (not specified but construed from metadata)   my $platform = "5";     # HTML Browser (not specified but construed from metadata)
 #  #
   # We actually do this
   #
           $stats{$type}++;
           $filterstats{$type}++;
   #
 # Create path  # Create path
 #  #
  unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }   unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
Line 190  foreach my $metadata (@loncapa) { Line 237  foreach my $metadata (@loncapa) {
                               http://www.openarchives.org/OAI/2.0/oai_dc.xsd"                                http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
 >  >
     <title>$title</title>      <title>$title</title>
     <creator>$author_fname $author_lname</creator>      <creator>$author</creator>
     <identifier>$resourceurl</identifier>      <identifier>$resourceurl</identifier>
     <subject>$keywords</subject>      <subject>$postsubject</subject>
     <subject>$subject</subject>  
     <language>$primary_language</language>      <language>$primary_language</language>
     <description>$abstract</description>      <description>$abstract</description>
     <date>$rev_year-$rev_month-$rev_day</date>      <date>$rev_year-$rev_month-$rev_day</date>
Line 201  foreach my $metadata (@loncapa) { Line 247  foreach my $metadata (@loncapa) {
 ENDMETA  ENDMETA
       close (XML);        close (XML);
 }  }
   foreach my $thistype (sort keys %stats) {
      print "\n$thistype: $stats{$thistype}";
   }
   print "\n----\n";
   }
   print "\nDone.\n";
   foreach my $thistype (sort keys %allstats) {
      print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
   }
   print "\n----\n";
   foreach my $thislang (sort keys %knockoutlang) {
   print "\n>$thislang<: $knockoutlang{$thislang}";
 }  }
   print "\n";

Removed from v.1.5  
changed lines
  Added in v.1.9


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>