Diff for /loncom/metadata_database/searchcat.pl between versions 1.3 and 1.40

version 1.3, 2001/04/16 12:07:36 version 1.40, 2003/09/26 00:23:09
Line 1 Line 1
 #!/usr/bin/perl  #!/usr/bin/perl
 # The LearningOnline Network  # The LearningOnline Network
 # searchcat.pl "Search Catalog" batch script  # searchcat.pl "Search Catalog" batch script
   #
   # $Id$
   #
   # Copyright Michigan State University Board of Trustees
   #
   # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
   #
   # LON-CAPA is free software; you can redistribute it and/or modify
   # it under the terms of the GNU General Public License as published by
   # the Free Software Foundation; either version 2 of the License, or
   # (at your option) any later version.
   #
   # LON-CAPA is distributed in the hope that it will be useful,
   # but WITHOUT ANY WARRANTY; without even the implied warranty of
   # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   # GNU General Public License for more details.
   #
   # You should have received a copy of the GNU General Public License
   # along with LON-CAPA; if not, write to the Free Software
   # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   #
   # /home/httpd/html/adm/gpl.txt
   #
   # http://www.lon-capa.org/
   #
   ###
   
   =pod
   
   =head1 NAME
   
   B<searchcat.pl> - put authoritative filesystem data into sql database.
   
   =head1 SYNOPSIS
   
   Ordinarily this script is to be called from a loncapa cron job
   (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
   filesystem installation location: F</etc/cron.d/loncapa>).
   
   Here is the cron job entry.
   
   C<# Repopulate and refresh the metadata database used for the search catalog.>
   C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
   
   This script only allows itself to be run as the user C<www>.
   
   =head1 DESCRIPTION
   
   This script goes through a loncapa resource directory and gathers metadata.
   The metadata is entered into a SQL database.
   
   This script also does general database maintenance such as reformatting
   the C<loncapa:metadata> table if it is deprecated.
   
   This script evaluates dynamic metadata from the authors'
   F<nohist_resevaldata.db> database file in order to store it in MySQL, as
   well as to compress the filesize (add up all "count"-type metadata).
   
   This script is playing an increasingly important role for a loncapa
   library server.  The proper operation of this script is critical for a smooth
   and correct user experience.
   
 # 04/14/2001 Scott Harrison  =cut
   
 # This script goes through a LON-CAPA resource  use lib '/home/httpd/lib/perl/';
 # directory and gathers metadata.  use LONCAPA::Configuration;
 # The metadata is entered into a SQL database.  
   
 use IO::File;  use IO::File;
 use HTML::TokeParser;  use HTML::TokeParser;
   use DBI;
   use GDBM_File;
   use POSIX qw(strftime mktime);
   
 my @metalist;  my @metalist;
   
   $simplestatus='';
   my %countext=();
   
   sub writesimple {
       open(SMP,'>/home/httpd/html/lon-status/mysql.txt');
       print SMP $simplestatus."\n";
       close(SMP);
   }
   
   sub writecount {
       open(RSMP,'>/home/httpd/html/lon-status/rescount.txt');
       foreach (keys %countext) {
    print RSMP $_.'='.$countext{$_}.'&';
       }
       print RSMP 'time='.time."\n";
       close(RSMP);
   }
   
   sub count {
       my $file=shift;
       $file=~/\.(\w+)$/;
       my $ext=lc($1);
       if (defined($countext{$ext})) {
    $countext{$ext}++;
       } else {
    $countext{$ext}=1;
       }
   }
   # ----------------------------------------------------- Un-Escape Special Chars
   
   sub unescape {
       my $str=shift;
       $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
       return $str;
   }
   
   # -------------------------------------------------------- Escape Special Chars
   
   sub escape {
       my $str=shift;
       $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
       return $str;
   }
   
   
   # ------------------------------------------- Code to evaluate dynamic metadata
   
   sub dynamicmeta {
   
       my $url=&declutter(shift);
       $url=~s/\.meta$//;
       my %returnhash=();
       my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
       my $prodir=&propath($adomain,$aauthor);
       if ((tie(%evaldata,'GDBM_File',
               $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
           (tie(%newevaldata,'GDBM_File',
               $prodir.'/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) {
          my %sum=();
          my %cnt=();
          my %listitems=('count'        => 'add',
                         'course'       => 'add',
                         'avetries'     => 'avg',
                         'stdno'        => 'add',
                         'difficulty'   => 'avg',
                         'clear'        => 'avg',
                         'technical'    => 'avg',
                         'helpful'      => 'avg',
                         'correct'      => 'avg',
                         'depth'        => 'avg',
                         'comments'     => 'app',
                         'usage'        => 'cnt'
                         );
          my $regexp=$url;
          $regexp=~s/(\W)/\\$1/g;
          $regexp='___'.$regexp.'___([a-z]+)$';
          foreach (keys %evaldata) {
    my $key=&unescape($_);
    if ($key=~/$regexp/) {
       my $ctype=$1;
               if (defined($cnt{$ctype})) { 
                  $cnt{$ctype}++; 
               } else { 
                  $cnt{$ctype}=1; 
               }
               unless ($listitems{$ctype} eq 'app') {
                  if (defined($sum{$ctype})) {
                     $sum{$ctype}+=$evaldata{$_};
             } else {
                     $sum{$ctype}=$evaldata{$_};
          }
               } else {
                  if (defined($sum{$ctype})) {
                     if ($evaldata{$_}) {
                        $sum{$ctype}.='<hr>'.$evaldata{$_};
             }
           } else {
                $sum{$ctype}=''.$evaldata{$_};
          }
       }
       if ($ctype ne 'count') {
          $newevaldata{$_}=$evaldata{$_};
      }
    }
         }
         foreach (keys %cnt) {
            if ($listitems{$_} eq 'avg') {
        $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
            } elsif ($listitems{$_} eq 'cnt') {
                $returnhash{$_}=$cnt{$_};
            } else {
                $returnhash{$_}=$sum{$_};
            }
        }
        if ($returnhash{'count'}) {
            my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';
            $newevaldata{$newkey}=$returnhash{'count'};
        }
        untie(%evaldata);
        untie(%newevaldata);
      }
      return %returnhash;
   }
     
 # ----------------- Code to enable 'find' subroutine listing of the .meta files  # ----------------- Code to enable 'find' subroutine listing of the .meta files
 require "find.pl";  require "find.pl";
 sub wanted {  sub wanted {
     (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&      (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
     -f _ &&          -f _ &&
     /^.*\.meta$/ &&          /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
     push(@metalist,"$dir/$_");          push(@metalist,"$dir/$_");
 }  }
   
 # ------------------------------------ Read httpd access.conf and get variables  # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables
 open (CONFIG,"/etc/httpd/conf/access.conf") || die "Can't read access.conf";  my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
   my %perlvar=%{$perlvarref};
 while ($configline=<CONFIG>) {  undef $perlvarref; # remove since sensitive and not needed
     if ($configline =~ /PerlSetVar/) {  delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed
  my ($dummy,$varname,$varvalue)=split(/\s+/,$configline);  
         chomp($varvalue);  # ------------------------------------- Only run if machine is a library server
         $perlvar{$varname}=$varvalue;  exit unless $perlvar{'lonRole'} eq 'library';
     }  
   # ----------------------------- Make sure this process is running from user=www
   
   my $wwwid=getpwnam('www');
   if ($wwwid!=$<) {
       $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
       $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
       system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
    mailto $emailto -s '$subj' > /dev/null");
       exit 1;
 }  }
 close(CONFIG);  
   
   
   # ---------------------------------------------------------- We are in business
   
   open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
   print LOG '==== Searchcat Run '.localtime()."====\n\n";
   $simplestatus='time='.time.'&';
 my $dbh;  my $dbh;
 # ------------------------------------- Make sure that database can be accessed  # ------------------------------------- Make sure that database can be accessed
 {  {
     unless (      unless (
     $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})      $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})
     ) {       ) { 
  print "Cannot connect to database!\n";   print LOG "Cannot connect to database!\n";
    $simplestatus.='mysql=defunct';
    &writesimple();
  exit;   exit;
     }      }
   
       my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (".
           "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ".
           "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ".
           "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ".
           "copyright TEXT, FULLTEXT idx_title (title), ".
           "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ".
           "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ".
           "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ".
           "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ".
           "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ".
           "FULLTEXT idx_copyright (copyright)) TYPE=MYISAM";
       # It would sure be nice to have some logging mechanism.
       $dbh->do($make_metadata_table);
 }  }
   
 # ------------------------------------------------------------- get .meta files  # ------------------------------------------------------------- get .meta files
 opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");  opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");
 my @homeusers=grep  my @homeusers = grep {
           {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")}      &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")
           grep {!/^\.\.?$/} readdir(RESOURCES);      } grep {!/^\.\.?$/} readdir(RESOURCES);
 closedir RESOURCES;  closedir RESOURCES;
   
   #
   # Create the statement handlers we need
   my $delete_sth = $dbh->prepare
       ("DELETE FROM metadata WHERE url LIKE BINARY ?");
   
   my $insert_sth = $dbh->prepare
       ("INSERT INTO metadata VALUES (".
        "?,".   # title
        "?,".   # author
        "?,".   # subject
        "?,".   # m2???
        "?,".   # version
        "?,".   # current
        "?,".   # notes
        "?,".   # abstract
        "?,".   # mime
        "?,".   # language
        "?,".   # creationdate
        "?,".   # revisiondate
        "?,".   # owner
        "?)"    # copyright
        );
   
 foreach my $user (@homeusers) {  foreach my $user (@homeusers) {
       print LOG "\n=== User: ".$user."\n\n";
       # Remove left-over db-files from potentially crashed searchcat run
       my $prodir=&propath($perlvar{'lonDefDomain'},$user);
       unlink($prodir.'/nohist_new_resevaldata.db');
       # Use find.pl
       undef @metalist;
       @metalist=();
     &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");      &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");
       # -- process each file to get metadata and put into search catalog SQL
       # database.  Also, check to see if already there.
       # I could just delete (without searching first), but this works for now.
       foreach my $m (@metalist) {
           print LOG "- ".$m."\n";
           my $ref=&metadata($m);
     if ($ref->{'obsolete'}) { print LOG "obsolete\n"; next; }
    if ($ref->{'copyright'} eq 'private') { print LOG "private\n"; next; }
           my $m2='/res/'.&declutter($m);
           $m2=~s/\.meta$//;
   #        &dynamicmeta($m2);
    &count($m2);
           $delete_sth->execute($m2);
           $insert_sth->execute($ref->{'title'},
                                $ref->{'author'},
                                $ref->{'subject'},
                                $m2,
                                $ref->{'keywords'},
                                'current',
                                $ref->{'notes'},
                                $ref->{'abstract'},
                                $ref->{'mime'},
                                $ref->{'language'},
                                sqltime($ref->{'creationdate'}),
                                sqltime($ref->{'lastrevisiondate'}),
                                $ref->{'owner'},
                                $ref->{'copyright'});
   #        if ($dbh->err()) {
   #            print STDERR "Error:".$dbh->errstr()."\n";
   #        }
           $ref = undef;
       }
       
       # --------------------------------------------------- Clean up database
       # Need to, perhaps, remove stale SQL database records.
       # ... not yet implemented
           
       # ------------------------------------------- Copy over the new db-files
       #
       # Check the size of nohist_new_resevaldata.db compared to 
       # nohist_resevaldata.db
   #    my @stat_result = stat($prodir.'/nohist_new_resevaldata.db');
   #    my $new_size = $stat_result[7];
   #    @stat_result = stat($prodir.'/nohist_resevaldata.db');
   #    my $old_size = $stat_result[7];
   #    if ($old_size) {
   # if ($new_size/$old_size > 0.15 ) {
   #    system('mv '.$prodir.'/nohist_new_resevaldata.db '.
   #   $prodir.'/nohist_resevaldata.db');
   # } else {
   #    print LOG "Size of '$user' old nohist_reseval: $old_size ".
   # "Size of new: $new_size.  Not overwriting.\n";
   #    my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
   #    my $subj="LON: $perlvar{'lonHostID'} searchcat.pl $user reseval ".
   # "modification error.";
   #    system("echo ".
   # "'See /home/httpd/perl/logs/searchcat.txt for information.' ".
   #   "| mailto $emailto -s '$subj' > /dev/null");
   # }
   #    }   
 }  }
   
 # -- process each file to get metadata and put into search catalog SQL database  
 foreach my $m (@metalist) {  
     my $ref=&metadata($m);  
     my $sth=$dbh->prepare('insert into metadata values ('.  
   delete($ref->{'title'}),  
   delete($ref->{'author'}).','.  
   delete($ref->{'subject'}).','.  
   delete($ref->{'url'}).','.  
   delete($ref->{'keywords'}).','.  
   delete($ref->{'version'}).','.  
   delete($ref->{'notes'}).','.  
   delete($ref->{'abstract'}).','.  
   delete($ref->{'mime'}).','.  
   delete($ref->{'language'}).','.  
   delete($ref->{'creationdate'}).','.  
   delete($ref->{'lastrevisiondate'}).','.  
   delete($ref->{'owner'}).','.  
   delete($ref->{'copyright'}).  
   ')';  
     $sth->execute();  
 }  
   
 # ----------------------------------------------------------- Clean up database  
 # Need to, perhaps, remove stale SQL database records.  
 # ... not yet implemented  
   
 # --------------------------------------------------- Close database connection  # --------------------------------------------------- Close database connection
 $dbh->disconnect;  $dbh->disconnect;
   print LOG "\n==== Searchcat completed ".localtime()." ====\n";
   close(LOG);
   &writesimple();
   &writecount();
   exit 0;
   
   
   
   # =============================================================================
   
 # ---------------------------------------------------------------- Get metadata  # ---------------------------------------------------------------- Get metadata
 # significantly altered from subroutine present in lonnet  # significantly altered from subroutine present in lonnet
Line 98  sub metadata { Line 389  sub metadata {
         my $parser=HTML::TokeParser->new(\$metastring);          my $parser=HTML::TokeParser->new(\$metastring);
         my $token;          my $token;
         while ($token=$parser->get_token) {          while ($token=$parser->get_token) {
            if ($token->[0] eq 'S') {              if ($token->[0] eq 'S') {
       my $entry=$token->[1];                  my $entry=$token->[1];
               my $unikey=$entry;                  my $unikey=$entry;
               if (defined($token->[2]->{'part'})) {                   if (defined($token->[2]->{'part'})) { 
                  $unikey.='_'.$token->[2]->{'part'};                       $unikey.='_'.$token->[2]->{'part'}; 
       }                  }
               if (defined($token->[2]->{'name'})) {                   if (defined($token->[2]->{'name'})) { 
                  $unikey.='_'.$token->[2]->{'name'};                       $unikey.='_'.$token->[2]->{'name'}; 
       }                  }
               if ($metacache{$uri.'keys'}) {                  if ($metacache{$uri.'keys'}) {
                  $metacache{$uri.'keys'}.=','.$unikey;                      $metacache{$uri.'keys'}.=','.$unikey;
               } else {                  } else {
                  $metacache{$uri.'keys'}=$unikey;                      $metacache{$uri.'keys'}=$unikey;
       }                  }
               map {                  map {
   $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};                      $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};
               } @{$token->[3]};                  } @{$token->[3]};
               unless (                  unless (
                  $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)                          $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)
       ) { $metacache{$uri.''.$unikey}=                          ) { $metacache{$uri.''.$unikey}=
       $metacache{$uri.''.$unikey.'.default'};                                  $metacache{$uri.''.$unikey.'.default'};
       }                          }
           }              }
        }          }
     }      }
     return \%metacache;      return \%metacache;
 }  }
Line 129  sub metadata { Line 420  sub metadata {
 # ------------------------------------------------------------ Serves up a file  # ------------------------------------------------------------ Serves up a file
 # returns either the contents of the file or a -1  # returns either the contents of the file or a -1
 sub getfile {  sub getfile {
   my $file=shift;      my $file=shift;
   if (! -e $file ) { return -1; };      if (! -e $file ) { return -1; };
   my $fh=IO::File->new($file);      my $fh=IO::File->new($file);
   my $a='';      my $a='';
   while (<$fh>) { $a .=$_; }      while (<$fh>) { $a .=$_; }
   return $a      return $a;
 }  }
   
 # ------------------------------------------------------------- Declutters URLs  # ------------------------------------------------------------- Declutters URLs
Line 171  sub propath { Line 462  sub propath {
     my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname";      my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname";
     return $proname;      return $proname;
 }   } 
   
   # ---------------------------- convert 'time' format into a datetime sql format
   sub sqltime {
       my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
    localtime(&unsqltime(@_[0]));
       $mon++; $year+=1900;
       return "$year-$mon-$mday $hour:$min:$sec";
   }
   
   sub maketime {
       my %th=@_;
       return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'},
                             $th{'day'},$th{'month'}-1,
                             $th{'year'}-1900,0,0,$th{'dlsav'}));
   }
   
   
   #########################################
   #
   # Retro-fixing of un-backward-compatible time format
   
   sub unsqltime {
       my $timestamp=shift;
       if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {
           $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3,
                                'hours'=>$4,'minutes'=>$5,'seconds'=>$6);
       }
       return $timestamp;
   }
   

Removed from v.1.3  
changed lines
  Added in v.1.40


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>