#!/usr/bin/perl
# The LearningOnline Network
# searchcat.pl "Search Catalog" batch script
#
# $Id: searchcat.pl,v 1.55 2004/04/08 15:57:32 matthew Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
###

=pod

=head1 NAME

B<searchcat.pl> - put authoritative filesystem data into sql database.

=head1 SYNOPSIS

Ordinarily this script is to be called from a loncapa cron job
(CVS source location: F<loncapa/loncom/cron/loncapa>; typical
filesystem installation location: F</etc/cron.d/loncapa>).

Here is the cron job entry.

C<# Repopulate and refresh the metadata database used for the search catalog.>
C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>

This script only allows itself to be run as the user C<www>.

=head1 DESCRIPTION

This script goes through a loncapa resource directory and gathers metadata.
The metadata is entered into a SQL database.

This script also does general database maintenance such as reformatting
the C<metadata> table if it is deprecated.

This script evaluates dynamic metadata from the authors'
F<nohist_resevaldata.db> database file in order to store it in MySQL.

This script is playing an increasingly important role for a loncapa
library server.  The proper operation of this script is critical for a smooth
and correct user experience.

=cut

use strict;
use DBI;
use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
use LONCAPA::lonmetadata;
use IO::File;
use HTML::TokeParser;
use GDBM_File;
use POSIX qw(strftime mktime);
use File::Find;

##
## Use variables for table names so we can test this routine a little easier
my $oldname = 'metadata';
my $newname = 'newmetadata';

#
# Read loncapa_apache.conf and loncapa.conf
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
my %perlvar=%{$perlvarref};
undef $perlvarref;
delete $perlvar{'lonReceipt'}; # remove since sensitive (really?) & not needed
#
# Only run if machine is a library server
exit if ($perlvar{'lonRole'} ne 'library');
#
# Make sure this process is running from user=www.
# getpwnam returns undef when there is no such user; without the defined()
# test undef would compare numerically equal to uid 0 and root would pass.
my $wwwid=getpwnam('www');
if (! defined($wwwid) || $wwwid!=$<) {
    my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
    my $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | mailto $emailto -s '$subj' > /dev/null");
    exit 1;
}
#
# Let people know we are running.
# LOG stays a bareword handle because the subroutines below print to it.
# A failure to open the log is reported but is not fatal: the catalog
# refresh is more important than the log file.
open(LOG,'>',$perlvar{'lonDaemons'}.'/logs/searchcat.log')
    or warn "Unable to open ".$perlvar{'lonDaemons'}.
            "/logs/searchcat.log for writing: $!\n";
print LOG '==== Searchcat Run '.localtime()."====\n";
#
# Connect to database
my $dbh;
if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},
                           { RaiseError =>0,PrintError=>0}))) {
    print LOG "Cannot connect to database!\n";
    die "MySQL Error: Cannot connect to database!\n";
}
# This can return an error and still be okay, so we do not bother checking.
# (perhaps it should be more robust and check for specific errors)
$dbh->do('DROP TABLE IF EXISTS '.$newname);
#
# Create the new table
my $request = &LONCAPA::lonmetadata::create_metadata_storage($newname);
$dbh->do($request);
if ($dbh->err) {
    # Save the message before disconnecting; the handle's error state is
    # not reliable once the connection has been torn down.
    my $error = $dbh->errstr;
    $dbh->disconnect();
    print LOG "\nMySQL Error Create: ".$error."\n";
    die $error;
}
#
# find out which users we need to examine
my $resdir = "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}";
if (! opendir(RESOURCES,$resdir)) {
    # Abort here: carrying on with an empty user list would replace the
    # live metadata table with an empty one in the rename step below.
    my $error = "Unable to read resource directory ".$resdir.": ".$!;
    print LOG $error."\n";
    $dbh->disconnect();
    die $error."\n";
}
my @homeusers = grep {
    &ishome("$resdir/$_");
} grep {
    !/^\.\.?$/;
} readdir(RESOURCES);
closedir RESOURCES;
#
# Loop through the users
foreach my $user (@homeusers) {
    print LOG "=== User: ".$user."\n";
    #
    # Use File::Find to get the files we need to read/modify
    find(
         {preprocess => \&only_meta_files,
#          wanted     => \&print_filename,
#          wanted     => \&log_metadata,
          wanted     => \&process_meta_file, },
         "$resdir/$user");
}
#
# Rename the table
$dbh->do('DROP TABLE IF EXISTS '.$oldname);
if (! $dbh->do('RENAME TABLE '.$newname.' TO '.$oldname)) {
    print LOG "MySQL Error Rename: ".$dbh->errstr."\n";
    die $dbh->errstr;
}
if (! $dbh->disconnect) {
    print LOG "MySQL Error Disconnect: ".$dbh->errstr."\n";
    die $dbh->errstr;
}
##
## Finished!
print LOG "==== Searchcat completed ".localtime()." ====\n";
close(LOG);

&write_type_count();
&write_copyright_count();

exit 0;

########################################################
########################################################
###                                                  ###
###  File::Find support routines                     ###
###                                                  ###
########################################################
########################################################
##
## &only_meta_files
##
## Called by File::Find.
## Takes a list of files/directories in and returns a list of files/directories
## to search.
sub only_meta_files {
    # Filter for File::Find's 'preprocess' hook: given the entries of one
    # directory, return the ones worth visiting.  An entry is kept when it
    # is a directory (any directory, no name check is made here) or when it
    # ends in .meta without being the metadata of a prior version
    # (name.<digits>.<ext>.meta).
    return grep {
        (/\.meta$/ && !/\.\d+\.[^\.]+\.meta$/) || (-d $_)
    } @_;
}

##
##
## Debugging routines, use these for 'wanted' in the File::Find call
##
sub print_filename {
    # Log whether the current File::Find entry is a directory or a file.
    # $_ is saved on entry and put back before returning.
    my $entry = $_;
    print LOG ((-d $entry) ? " Got directory ".$File::Find::name."\n"
                           : " Got file ".$File::Find::name."\n");
    $_ = $entry;
}

sub log_metadata {
    # Dump every key/value pair of the current entry's metadata to LOG and
    # count its copyright.  Directories are skipped silently.
    my $entry = $_;
    my $path  = $File::Find::name;
    return if (-d $path); # No need to do anything here for directories
    print LOG $path."\n";
    my $meta = &metadata($path);
    unless (defined($meta)) {
        print LOG " No data\n";
        return;
    }
    foreach my $field (keys(%$meta)) {
        print LOG " ".$field." => ".$meta->{$field}."\n";
    }
    &count_copyright($meta->{'copyright'});
    $_ = $entry;
}

##
## process_meta_file
##   Called by File::Find.
##   Only input is the filename in $_.
sub process_meta_file {
    # File::Find 'wanted' callback: store one .meta file in the new table.
    #   Input:  $_ (entry name) and $File::Find::name (full path).
    #   Effect: logs the resource url, counts its copyright and file type,
    #           and inserts one row through
    #           LONCAPA::lonmetadata::store_metadata.
    #   Dies if the insert reports an error or stores fewer than one row.
    # Directories, files without readable metadata, obsolete resources and
    # private resources produce no row.
    my ($file) = $_;
    my $filename = $File::Find::name;
    return if (-d $filename); # No need to do anything here for directories
    #
    my $ref=&metadata($filename);
    if (! defined($ref)) {
        # Same handling as log_metadata: without this guard the hash
        # dereferences below would autovivify an empty record and an
        # all-empty row would be inserted.
        print LOG " No data\n";
        return;
    }
    #
    # $url is the original file url, not the metadata file
    my $url='/res/'.&declutter($filename);
    $url=~s/\.meta$//;
    print LOG " ".$url."\n";
    #
    # Ignore some files based on their metadata
    if ($ref->{'obsolete'}) {
        print LOG "obsolete\n";
        return;
    }
    # Private resources are counted before they are skipped.
    &count_copyright($ref->{'copyright'});
    if ($ref->{'copyright'} eq 'private') {
        print LOG "private\n";
        return;
    }
    #
    # Find the dynamic metadata
    my %dyn;
    if ($url=~ m:/default$:) {
        $url=~ s:/default$:/:;
    } else {
        # %dyn=&dynamicmeta($url);
        &count_type($url);
    }
    #
    $ref->{'creationdate'}     = &sqltime($ref->{'creationdate'});
    $ref->{'lastrevisiondate'} = &sqltime($ref->{'lastrevisiondate'});
    my %Data = (
                %$ref,
                %dyn,
                'url'=>$url,
                'version'=>'current');
    my ($count,$err) = &LONCAPA::lonmetadata::store_metadata($dbh,$newname,
                                                             \%Data);
    if ($err) {
        print LOG "\nMySQL Error Insert: ".$err."\n";
        die $err;
    }
    if ($count < 1) {
        print LOG "Unable to insert record into MySQL database for $url\n";
        die "Unable to insert record into MySQl database for $url";
    } else {
        print LOG "Count = ".$count."\n";
    }
    #
    # Reset $_ before leaving
    $_ = $file;
}

########################################################
########################################################
###                                                  ###
###  &metadata($uri)                                 ###
###   Retrieve metadata for the given file           ###
###                                                  ###
########################################################
########################################################
sub metadata {
    # Parse the .meta file belonging to $uri.
    #   $uri: file name or url; '.meta' is appended when it is missing.
    #   Returns undef when the file cannot be read, otherwise a hash ref:
    #     'keys'           comma separated list of every entry key seen
    #     '<key>.<attr>'   value of attribute <attr> on the entry's tag
    #     '<key>'          text inside the tag, or the tag's 'default'
    #                      attribute when the text is empty
    #   where <key> is the tag name followed by '_<part>' and '_<name>' when
    #   the tag carries those attributes.
    my ($uri)=@_;
    my %metacache=();
    my $filename=&declutter($uri);
    if ($filename !~ /\.meta$/) {
        $filename.='.meta';
    }
    my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
    return undef if (! defined($metastring));
    my $parser=HTML::TokeParser->new(\$metastring);
    while (my $token=$parser->get_token) {
        next if ($token->[0] ne 'S');   # only start tags open an entry
        my ($entry,$attr,$attrseq) = @{$token}[1,2,3];
        my $unikey=$entry;
        if (defined($attr->{'part'})) {
            $unikey.='_'.$attr->{'part'};
        }
        if (defined($attr->{'name'})) {
            $unikey.='_'.$attr->{'name'};
        }
        if ($metacache{'keys'}) {
            $metacache{'keys'}.=','.$unikey;
        } else {
            $metacache{'keys'}=$unikey;
        }
        foreach my $name (@{$attrseq}) {
            $metacache{$unikey.'.'.$name}=$attr->{$name};
        }
        # Test for an empty string rather than a false value so that a tag
        # whose text is "0" keeps that text instead of getting the default.
        my $text=$parser->get_text('/'.$entry);
        if (defined($text) && $text ne '') {
            $metacache{$unikey}=$text;
        } else {
            $metacache{$unikey}=$metacache{$unikey.'.default'};
        }
    }
    return \%metacache;
}

##
## &getfile($filename)
##   Slurps up an entire file into a scalar.
##   Returns undef if the file does not exist or cannot be opened.
sub getfile {
    my $file = shift();
    if (! -e $file ) {
        return undef;
    }
    # An explicit read mode keeps odd characters in the file name from
    # being taken as an open mode.
    my $fh=IO::File->new($file,'r');
    if (! defined($fh)) {
        return undef;
    }
    # Slurp through a localized $/ instead of looping with 'while (<$fh>)':
    # that loop assigns to the global $_, which File::Find callers rely on.
    local $/;
    my $contents = <$fh>;
    $fh->close();
    return defined($contents) ? $contents : '';
}

########################################################
########################################################
###                                                  ###
###  Dynamic Metadata                                ###
###                                                  ###
########################################################
########################################################
sub dynamicmeta {
    # Collect the evaluation data recorded for one resource.
    #   Argument: resource url or file name ('.meta' suffix is dropped).
    #   Returns a key/value list (hash) holding, per category found in the
    #   author's nohist_resevaldata.db, the aggregated value and a
    #   '<category>_list' of the contributing items, plus 'count' when
    #   nohist_accesscount.db can be opened.
    #   Returns an empty list when nohist_resevaldata.db cannot be opened.
    my $url = &declutter(shift());
    $url =~ s/\.meta$//;
    my ($dom,$auth)=($url=~/^(\w+)\/(\w+)\//);
    my $prodir=&propath($dom,$auth);
    #
    # Get metadata except counts
    my %evaldata;
    if (! tie(%evaldata,'GDBM_File',
              $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
        # Empty list, so that '%h = &dynamicmeta(...)' yields an empty hash
        # and not an odd-sized list holding a single undef.
        return;
    }
    my %sum=();
    my %count=();
    my %concat=();
    # How the values of each category are combined.
    my %listitems=(
                   'course'       => 'add',
                   'goto'         => 'add',
                   'comefrom'     => 'add',
                   'avetries'     => 'average',
                   'stdno'        => 'add',
                   'difficulty'   => 'average',
                   'clear'        => 'average',
                   'technical'    => 'average',
                   'helpful'      => 'average',
                   'correct'      => 'average',
                   'depth'        => 'average',
                   'comments'     => 'append',
                   'usage'        => 'count'
                   );
    #
    # Keys look like <item>___<url>___<category> once unescaped.
    my $regexp='___'.quotemeta($url).'___([a-z]+)$';
    while (my ($esckey,$value)=each %evaldata) {
        my $key=&unescape($esckey);
        next if ($key !~ /$regexp/);
        my ($item,undef,$cat)=split(/___/,$key);
        $count{$cat}++;
        if ($listitems{$cat} ne 'append') {
            if (defined($sum{$cat})) {
                $sum{$cat}+=&unescape($value);
                $concat{$cat}.=','.$item;
            } else {
                $sum{$cat}=&unescape($value);
                $concat{$cat}=$item;
            }
        } elsif (defined($sum{$cat})) {
            # NOTE(review): the separator is a bare line break in the source
            # text; markup may have been lost in transit - confirm.
            if ($value=~/\w/) {
                $sum{$cat}.="\n".&unescape($value);
            }
        } else {
            $sum{$cat}=''.&unescape($value);
        }
    }
    untie(%evaldata);
    # transfer gathered data to returnhash, calculate averages where applicable
    my %returnhash;
    foreach my $cat (keys(%count)) {
        next if ($count{$cat} eq 'nan');
        next if ($sum{$cat} eq 'nan');
        if ($listitems{$cat} eq 'average') {
            if ($count{$cat}) {
                # average rounded to two decimals
                $returnhash{$cat}=int(($sum{$cat}/$count{$cat})*100.0+0.5)/100.0;
            } else {
                $returnhash{$cat}='NULL';
            }
        } elsif ($listitems{$cat} eq 'count') {
            $returnhash{$cat}=$count{$cat};
        } else {
            $returnhash{$cat}=$sum{$cat};
        }
        $returnhash{$cat.'_list'}=$concat{$cat};
    }
    #
    # get count
    my %accesscount;
    if (tie(%accesscount,'GDBM_File',
            $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
        my $escurl=&escape($url);
        if (! exists($accesscount{$escurl})) {
            $returnhash{'count'}=0;
        } else {
            $returnhash{'count'}=$accesscount{$escurl};
        }
        untie(%accesscount);
    }
    return %returnhash;
}

########################################################
########################################################
###                                                  ###
###  Counts                                          ###
###                                                  ###
########################################################
########################################################
{
    # Shared by the two subs below.  Do not add an initializer: this block
    # sits after the script's 'exit 0' and is never executed at run time.
    my %countext;

sub count_type {
    # Count one file under its lower-cased extension; a name without an
    # extension is counted under the empty string.
    my $file=shift;
    # Take the capture from the match's own return value: $1 would still
    # hold an earlier match's capture when this match fails.
    my ($ext) = ($file =~ /\.(\w+)$/);
    $ext = defined($ext) ? lc($ext) : '';
    $countext{$ext}++;
}

sub write_type_count {
    # Write the extension counts as 'ext=count&...time=<epoch>' on one line.
    #   $file (optional): output file; defaults to the lon-status location.
    # A file that cannot be opened is reported with warn and skipped.
    my ($file) = @_;
    if (! defined($file)) {
        $file = '/home/httpd/html/lon-status/rescount.txt';
    }
    my $fh;
    if (! open($fh,'>',$file)) {
        warn "Unable to write ".$file.": ".$!."\n";
        return;
    }
    foreach my $extension (sort(keys(%countext))) {
        print $fh $extension.'='.$countext{$extension}.'&';
    }
    print $fh 'time='.time."\n";
    return close($fh);
}

} # end of scope for %countext

{
    # Shared by the two subs below; see the note on %countext about
    # initializers.
    my %copyrights;

sub count_copyright {
    # Count one resource under its copyright value; an undefined value is
    # counted under the empty string.
    my ($copyright) = @_;
    $copyrights{defined($copyright) ? $copyright : ''}++;
}

sub write_copyright_count {
    # Write the copyright counts as 'copyright=count&...time=<epoch>' on
    # one line.
    #   $file (optional): output file; defaults to the lon-status location.
    # A file that cannot be opened is reported with warn and skipped.
    my ($file) = @_;
    if (! defined($file)) {
        $file = '/home/httpd/html/lon-status/copyrightcount.txt';
    }
    my $fh;
    if (! open($fh,'>',$file)) {
        warn "Unable to write ".$file.": ".$!."\n";
        return;
    }
    foreach my $copyright (sort(keys(%copyrights))) {
        print $fh $copyright.'='.$copyrights{$copyright}.'&';
    }
    print $fh 'time='.time."\n";
    return close($fh);
}

} # end of scope for %copyrights

########################################################
########################################################
###
###
###  Miscellaneous Utility Routines                  ###
###                                                  ###
########################################################
########################################################
##
## &ishome($username)
##   Returns 1 if $username is a LON-CAPA author, 0 otherwise
##   (copied from lond, modification of the return value)
sub ishome {
    # Reduce a path under /home/httpd/html/res/ to "domain/user", then
    # report whether that user's directory (see propath) exists.
    my $path=shift;
    $path=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
    my ($domain,$user)=split(/\//,$path);
    return (-e propath($domain,$user)) ? 1 : 0;
}

##
## &propath($udom,$uname)
##   Returns the path to the users LON-CAPA directory
##   (copied from lond)
sub propath {
    my ($udom,$uname)=@_;
    # Strip every non-word character from both parts.
    foreach my $part ($udom,$uname) {
        $part=~s/\W//g;
    }
    # The first three characters of the name, padded with '_', become
    # three nested directory levels: albertel -> a/l/b
    my $levels=$uname.'__';
    $levels =~ s/(.)(.)(.).*/$1\/$2\/$3/;
    return "$perlvar{'lonUsersDir'}/$udom/$levels/$uname";
}

##
## &sqltime($timestamp)
##
## Convert perl $timestamp to MySQL time.  MySQL expects YYYY-MM-DD HH:MM:SS
##
sub sqltime {
    my ($time) = @_;
    my $layout = '%04d-%02d-%02d %02d:%02d:%02d';
    # Some of the .meta files have the time in mysql format already, so
    # just make sure the fields are 0 padded and pass them back.
    if ($time =~ /(\d+)-(\d+)-(\d+) # YYYY-MM-DD
                  \s                # a space
                  (\d+):(\d+):(\d+) # HH:MM::SS
                  /x ) {
        return sprintf($layout,$1,$2,$3,$4,$5,$6);
    }
    # A bare number is taken as seconds since the epoch and broken down
    # with gmtime; month is shifted to 1-12 and year to four digits.
    if ($time =~ /^\d+$/) {
        my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($time);
        return sprintf($layout,$year+1900,$mon+1,$mday,$hour,$min,$sec);
    }
    # Anything else is logged and reported as 0.
    print LOG " Unable to decode time ".$time."\n";
    return 0;
}

##
## &declutter($filename)
##   Given a filename, returns a url for the filename.
sub declutter {
    # Turn a file name into a resource-relative url: remove the leading
    # document root ($perlvar{'lonDocRoot'}), then one leading '/', then a
    # leading 'res/'.
    my $thisfn=shift;
    # \Q...\E makes the document root match literally; unquoted, a '.' or
    # other regex metacharacter in the path would match unrelated prefixes.
    $thisfn=~s/^\Q$perlvar{'lonDocRoot'}\E//;
    $thisfn=~s/^\///;
    $thisfn=~s/^res\///;
    return $thisfn;
}

##
## Escape / Unescape special characters
sub unescape {
    # Replace every %XX (two hex digits) with the character of that code.
    my $str=shift;
    $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
    return $str;
}

sub escape {
    # Replace every non-word character with % and its two-digit lower-case
    # hex code.
    my $str=shift;
    $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
    return $str;
}