--- loncom/metadata_database/searchcat.pl 2001/04/16 13:48:14 1.12 +++ loncom/metadata_database/searchcat.pl 2004/04/08 15:57:32 1.55 @@ -1,165 +1,546 @@ #!/usr/bin/perl # The LearningOnline Network # searchcat.pl "Search Catalog" batch script +# +# $Id: searchcat.pl,v 1.55 2004/04/08 15:57:32 matthew Exp $ +# +# Copyright Michigan State University Board of Trustees +# +# This file is part of the LearningOnline Network with CAPA (LON-CAPA). +# +# LON-CAPA is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# LON-CAPA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LON-CAPA; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# /home/httpd/html/adm/gpl.txt +# +# http://www.lon-capa.org/ +# +### + +=pod + +=head1 NAME + +B - put authoritative filesystem data into sql database. + +=head1 SYNOPSIS + +Ordinarily this script is to be called from a loncapa cron job +(CVS source location: F; typical +filesystem installation location: F). + +Here is the cron job entry. + +C<# Repopulate and refresh the metadata database used for the search catalog.> +C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> + +This script only allows itself to be run as the user C. + +=head1 DESCRIPTION + +This script goes through a loncapa resource directory and gathers metadata. +The metadata is entered into a SQL database. + +This script also does general database maintenance such as reformatting +the C table if it is deprecated. + +This script evaluates dynamic metadata from the authors' +F database file in order to store it in MySQL. + +This script is playing an increasingly important role for a loncapa +library server. The proper operation of this script is critical for a smooth +and correct user experience. -# 04/14/2001 Scott Harrison +=cut -# This script goes through a LON-CAPA resource -# directory and gathers metadata. -# The metadata is entered into a SQL database. +use strict; -use IO::File; -use HTML::TokeParser; use DBI; +use lib '/home/httpd/lib/perl/'; +use LONCAPA::Configuration; +use LONCAPA::lonmetadata; -my @metalist; -# ----------------- Code to enable 'find' subroutine listing of the .meta files -require "find.pl"; -sub wanted { - (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && - -f _ && - /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && - push(@metalist,"$dir/$_"); -} - -# ------------------------------------ Read httpd access.conf and get variables -open (CONFIG,"/etc/httpd/conf/access.conf") || die "Can't read access.conf"; - -while ($configline=) { - if ($configline =~ /PerlSetVar/) { - my ($dummy,$varname,$varvalue)=split(/\s+/,$configline); - chomp($varvalue); - $perlvar{$varname}=$varvalue; - } +use IO::File; +use HTML::TokeParser; +use GDBM_File; +use POSIX qw(strftime mktime); +use File::Find; + +## +## Use variables for table names so we can test this routine a little easier +my $oldname = 'metadata'; +my $newname = 'newmetadata'; + +# +# Read loncapa_apache.conf and loncapa.conf +my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); +my %perlvar=%{$perlvarref}; +undef $perlvarref; +delete $perlvar{'lonReceipt'}; # remove since sensitive (really?) & not needed +# +# Only run if machine is a library server +exit if ($perlvar{'lonRole'} ne 'library'); +# +# Make sure this process is running from user=www +my $wwwid=getpwnam('www'); +if ($wwwid!=$<) { + my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; + my $subj="LON: $perlvar{'lonHostID'} User ID mismatch"; + system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ + mailto $emailto -s '$subj' > /dev/null"); + exit 1; } -close(CONFIG); - +# +# Let people know we are running +open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); +print LOG '==== Searchcat Run '.localtime()."====\n"; +# +# Connect to database my $dbh; -# ------------------------------------- Make sure that database can be accessed -{ - unless ( - $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) - ) { - print "Cannot connect to database!\n"; - exit; - } +if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'}, + { RaiseError =>0,PrintError=>0}))) { + print LOG "Cannot connect to database!\n"; + die "MySQL Error: Cannot connect to database!\n"; } - -# ------------------------------------------------------------- get .meta files +# This can return an error and still be okay, so we do not bother checking. +# (perhaps it should be more robust and check for specific errors) +$dbh->do('DROP TABLE IF EXISTS '.$newname); +# +# Create the new table +my $request = &LONCAPA::lonmetadata::create_metadata_storage($newname); +$dbh->do($request); +if ($dbh->err) { + $dbh->disconnect(); + print LOG "\nMySQL Error Create: ".$dbh->errstr."\n"; + die $dbh->errstr; +} +# +# find out which users we need to examine opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"); -my @homeusers=grep - {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")} - grep {!/^\.\.?$/} readdir(RESOURCES); +my @homeusers = + grep { + &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_"); + } grep { + !/^\.\.?$/; + } readdir(RESOURCES); closedir RESOURCES; +# +# Loop through the users foreach my $user (@homeusers) { - &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); + print LOG "=== User: ".$user."\n"; + my $prodir=&propath($perlvar{'lonDefDomain'},$user); + # + # Use File::Find to get the files we need to read/modify + find( + {preprocess => \&only_meta_files, +# wanted => \&print_filename, +# wanted => \&log_metadata, + wanted => \&process_meta_file, + }, + "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); +} +# +# Rename the table +$dbh->do('DROP TABLE IF EXISTS '.$oldname); +if (! $dbh->do('RENAME TABLE '.$newname.' TO '.$oldname)) { + print LOG "MySQL Error Rename: ".$dbh->errstr."\n"; + die $dbh->errstr; +} +if (! $dbh->disconnect) { + print LOG "MySQL Error Disconnect: ".$dbh->errstr."\n"; + die $dbh->errstr; +} +## +## Finished! +print LOG "==== Searchcat completed ".localtime()." ====\n"; +close(LOG); + +&write_type_count(); +&write_copyright_count(); + +exit 0; + +######################################################## +######################################################## +### ### +### File::Find support routines ### +### ### +######################################################## +######################################################## +## +## &only_meta_files +## +## Called by File::Find. +## Takes a list of files/directories in and returns a list of files/directories +## to search. +sub only_meta_files { + my @PossibleFiles = @_; + my @ChosenFiles; + foreach my $file (@PossibleFiles) { + if ( ($file =~ /\.meta$/ && # Ends in meta + $file !~ /\.\d+\.[^\.]+\.meta$/ # is not for a prior version + ) || (-d $file )) { # directories are okay + # but we do not want /. or /.. + push(@ChosenFiles,$file); + } + } + return @ChosenFiles; } -# -- process each file to get metadata and put into search catalog SQL database -# Also, check to see if already there. -# I could just delete (without searching first), but this works for now. -foreach my $m (@metalist) { - my $ref=&metadata($m); - my $m2='/res/'.&declutter($m); - $m2=~s/\.meta$//; - my $q2="select * from metadata where url like binary '$m2'"; - my $sth = $dbh->prepare($q2); - $sth->execute(); - my $r1=$sth->fetchall_arrayref; - if (@$r1) { - $sth=$dbh->prepare("delete from metadata where url like binary '$m2'"); - $sth->execute(); - } - $sth=$dbh->prepare('insert into metadata values ('. - '"'.delete($ref->{'title'}).'"'.','. - '"'.delete($ref->{'author'}).'"'.','. - '"'.delete($ref->{'subject'}).'"'.','. - '"'.$m2.'"'.','. - '"'.delete($ref->{'keywords'}).'"'.','. - '"'.'current'.'"'.','. - '"'.delete($ref->{'notes'}).'"'.','. - '"'.delete($ref->{'abstract'}).'"'.','. - '"'.delete($ref->{'mime'}).'"'.','. - '"'.delete($ref->{'language'}).'"'.','. - '"'.delete($ref->{'creationdate'}).'"'.','. - '"'.delete($ref->{'lastrevisiondate'}).'"'.','. - '"'.delete($ref->{'owner'}).'"'.','. - '"'.delete($ref->{'copyright'}).'"'.')'); - $sth->execute(); -} - -# ----------------------------------------------------------- Clean up database -# Need to, perhaps, remove stale SQL database records. -# ... not yet implemented +## +## +## Debugging routines, use these for 'wanted' in the File::Find call +## +sub print_filename { + my ($file) = $_; + my $fullfilename = $File::Find::name; + if (-d $file) { + print LOG " Got directory ".$fullfilename."\n"; + } else { + print LOG " Got file ".$fullfilename."\n"; + } + $_=$file; +} -# --------------------------------------------------- Close database connection -$dbh->disconnect; +sub log_metadata { + my ($file) = $_; + my $fullfilename = $File::Find::name; + return if (-d $fullfilename); # No need to do anything here for directories + print LOG $fullfilename."\n"; + my $ref=&metadata($fullfilename); + if (! defined($ref)) { + print LOG " No data\n"; + return; + } + while (my($key,$value) = each(%$ref)) { + print LOG " ".$key." => ".$value."\n"; + } + &count_copyright($ref->{'copyright'}); + $_=$file; +} -# ---------------------------------------------------------------- Get metadata -# significantly altered from subroutine present in lonnet + +## +## process_meta_file +## Called by File::Find. +## Only input is the filename in $_. +sub process_meta_file { + my ($file) = $_; + my $filename = $File::Find::name; + return if (-d $filename); # No need to do anything here for directories + # + print LOG $filename."\n"; + # + my $ref=&metadata($filename); + # + # $url is the original file url, not the metadata file + my $url='/res/'.&declutter($filename); + $url=~s/\.meta$//; + print LOG " ".$url."\n"; + # + # Ignore some files based on their metadata + if ($ref->{'obsolete'}) { + print LOG "obsolete\n"; + return; + } + &count_copyright($ref->{'copyright'}); + if ($ref->{'copyright'} eq 'private') { + print LOG "private\n"; + return; + } + # + # Find the dynamic metadata + my %dyn; + if ($url=~ m:/default$:) { + $url=~ s:/default$:/:; + } else { + # %dyn=&dynamicmeta($url); + &count_type($url); + } + # + $ref->{'creationdate'} = &sqltime($ref->{'creationdate'}); + $ref->{'lastrevisiondate'} = &sqltime($ref->{'lastrevisiondate'}); + my %Data = ( + %$ref, + %dyn, + 'url'=>$url, + 'version'=>'current'); + my ($count,$err) = &LONCAPA::lonmetadata::store_metadata($dbh,$newname, + \%Data); + if ($err) { + print LOG "\nMySQL Error Insert: ".$err."\n"; + die $err; + } + if ($count < 1) { + print LOG "Unable to insert record into MySQL database for $url\n"; + die "Unable to insert record into MySQl database for $url"; + } else { + print LOG "Count = ".$count."\n"; + } + # + # Reset $_ before leaving + $_ = $file; +} + +######################################################## +######################################################## +### ### +### &metadata($uri) ### +### Retrieve metadata for the given file ### +### ### +######################################################## +######################################################## sub metadata { - my ($uri,$what)=@_; - my %metacache; + my ($uri)=@_; + my %metacache=(); $uri=&declutter($uri); my $filename=$uri; $uri=~s/\.meta$//; $uri=''; - unless ($metacache{$uri.'keys'}) { - unless ($filename=~/\.meta$/) { $filename.='.meta'; } - my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename); - my $parser=HTML::TokeParser->new(\$metastring); - my $token; - while ($token=$parser->get_token) { - if ($token->[0] eq 'S') { - my $entry=$token->[1]; - my $unikey=$entry; - if (defined($token->[2]->{'part'})) { - $unikey.='_'.$token->[2]->{'part'}; - } - if (defined($token->[2]->{'name'})) { - $unikey.='_'.$token->[2]->{'name'}; - } - if ($metacache{$uri.'keys'}) { - $metacache{$uri.'keys'}.=','.$unikey; - } else { - $metacache{$uri.'keys'}=$unikey; - } - map { - $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; - } @{$token->[3]}; - unless ( - $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) - ) { $metacache{$uri.''.$unikey}= - $metacache{$uri.''.$unikey.'.default'}; - } - } - } + if ($filename !~ /\.meta$/) { + $filename.='.meta'; + } + my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename); + return undef if (! defined($metastring)); + my $parser=HTML::TokeParser->new(\$metastring); + my $token; + while ($token=$parser->get_token) { + if ($token->[0] eq 'S') { + my $entry=$token->[1]; + my $unikey=$entry; + if (defined($token->[2]->{'part'})) { + $unikey.='_'.$token->[2]->{'part'}; + } + if (defined($token->[2]->{'name'})) { + $unikey.='_'.$token->[2]->{'name'}; + } + if ($metacache{$uri.'keys'}) { + $metacache{$uri.'keys'}.=','.$unikey; + } else { + $metacache{$uri.'keys'}=$unikey; + } + foreach ( @{$token->[3]}) { + $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; + } + if (! ($metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry))){ + $metacache{$uri.''.$unikey} = + $metacache{$uri.''.$unikey.'.default'}; + } + } # End of ($token->[0] eq 'S') } return \%metacache; } -# ------------------------------------------------------------ Serves up a file -# returns either the contents of the file or a -1 +## +## &getfile($filename) +## Slurps up an entire file into a scalar. +## Returns undef if the file does not exist sub getfile { - my $file=shift; - if (! -e $file ) { return -1; }; - my $fh=IO::File->new($file); - my $a=''; - while (<$fh>) { $a .=$_; } - return $a + my $file = shift(); + if (! -e $file ) { + return undef; + } + my $fh=IO::File->new($file); + my $contents = ''; + while (<$fh>) { + $contents .= $_; + } + return $contents; } -# ------------------------------------------------------------- Declutters URLs -sub declutter { - my $thisfn=shift; - $thisfn=~s/^$perlvar{'lonDocRoot'}//; - $thisfn=~s/^\///; - $thisfn=~s/^res\///; - return $thisfn; +######################################################## +######################################################## +### ### +### Dynamic Metadata ### +### ### +######################################################## +######################################################## +sub dynamicmeta { + my $url = &declutter(shift()); + $url =~ s/\.meta$//; + my %data = ('count' => 0, + 'course' => 0, + 'course_list' => '', + 'avetries' => 'NULL', + 'avetries_list' => '', + 'stdno' => 0, + 'stdno_list' => '', + 'usage' => 0, + 'usage_list' => '', + 'goto' => 0, + 'goto_list' => '', + 'comefrom' => 0, + 'comefrom_list' => '', + 'difficulty' => 'NULL', + 'difficulty_list' => '', + 'sequsage' => '0', + 'sequsage_list' => '', + 'clear' => 'NULL', + 'technical' => 'NULL', + 'correct' => 'NULL', + 'helpful' => 'NULL', + 'depth' => 'NULL', + 'comments' => '', + ); + my ($dom,$auth)=($url=~/^(\w+)\/(\w+)\//); + my $prodir=&propath($dom,$auth); + # + # Get metadata except counts + my %evaldata; + if (! tie(%evaldata,'GDBM_File', + $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) { + return (undef); + } + my %sum=(); + my %count=(); + my %concat=(); + my %listitems=( + 'course' => 'add', + 'goto' => 'add', + 'comefrom' => 'add', + 'avetries' => 'average', + 'stdno' => 'add', + 'difficulty' => 'average', + 'clear' => 'average', + 'technical' => 'average', + 'helpful' => 'average', + 'correct' => 'average', + 'depth' => 'average', + 'comments' => 'append', + 'usage' => 'count' + ); + # + my $regexp=$url; + $regexp=~s/(\W)/\\$1/g; + $regexp='___'.$regexp.'___([a-z]+)$'; + while (my ($esckey,$value)=each %evaldata) { + my $key=&unescape($esckey); + if ($key=~/$regexp/) { + my ($item,$purl,$cat)=split(/___/,$key); + $count{$cat}++; + if ($listitems{$cat} ne 'append') { + if (defined($sum{$cat})) { + $sum{$cat}+=&unescape($value); + $concat{$cat}.=','.$item; + } else { + $sum{$cat}=&unescape($value); + $concat{$cat}=$item; + } + } else { + if (defined($sum{$cat})) { + if ($evaldata{$esckey}=~/\w/) { + $sum{$cat}.='
'.&unescape($evaldata{$esckey}); + } + } else { + $sum{$cat}=''.&unescape($evaldata{$esckey}); + } + } + } + } + untie(%evaldata); + # transfer gathered data to returnhash, calculate averages where applicable + my %returnhash; + while (my $cat=each(%count)) { + if ($count{$cat} eq 'nan') { next; } + if ($sum{$cat} eq 'nan') { next; } + if ($listitems{$cat} eq 'average') { + if ($count{$cat}) { + $returnhash{$cat}=int(($sum{$cat}/$count{$cat})*100.0+0.5)/100.0; + } else { + $returnhash{$cat}='NULL'; + } + } elsif ($listitems{$cat} eq 'count') { + $returnhash{$cat}=$count{$cat}; + } else { + $returnhash{$cat}=$sum{$cat}; + } + $returnhash{$cat.'_list'}=$concat{$cat}; + } + # + # get count + if (tie(my %evaldata,'GDBM_File', + $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) { + my $escurl=&escape($url); + if (! exists($evaldata{$escurl})) { + $returnhash{'count'}=0; + } else { + $returnhash{'count'}=$evaldata{$escurl}; + } + untie %evaldata; + } + return %returnhash; +} + +######################################################## +######################################################## +### ### +### Counts ### +### ### +######################################################## +######################################################## +{ + +my %countext; + +sub count_type { + my $file=shift; + $file=~/\.(\w+)$/; + my $ext=lc($1); + $countext{$ext}++; +} + +sub write_type_count { + open(RESCOUNT,'>/home/httpd/html/lon-status/rescount.txt'); + while (my ($extension,$count) = each(%countext)) { + print RESCOUNT $extension.'='.$count.'&'; + } + print RESCOUNT 'time='.time."\n"; + close(RESCOUNT); +} + +} # end of scope for %countext + +{ + +my %copyrights; + +sub count_copyright { + $copyrights{@_[0]}++; +} + +sub write_copyright_count { + open(COPYCOUNT,'>/home/httpd/html/lon-status/copyrightcount.txt'); + while (my ($copyright,$count) = each(%copyrights)) { + print COPYCOUNT $copyright.'='.$count.'&'; + } + print COPYCOUNT 'time='.time."\n"; + close(COPYCOUNT); } -# --------------------------------------- Is this the home server of an author? -# (copied from lond, modification of the return value) +} # end of scope for %copyrights + +######################################################## +######################################################## +### ### +### Miscellanous Utility Routines ### +### ### +######################################################## +######################################################## +## +## &ishome($username) +## Returns 1 if $username is a LON-CAPA author, 0 otherwise +## (copied from lond, modification of the return value) sub ishome { my $author=shift; $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/; @@ -172,8 +553,10 @@ sub ishome { } } -# -------------------------------------------- Return path to profile directory -# (copied from lond) +## +## &propath($udom,$uname) +## Returns the path to the users LON-CAPA directory +## (copied from lond) sub propath { my ($udom,$uname)=@_; $udom=~s/\W//g; @@ -183,3 +566,61 @@ sub propath { my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname"; return $proname; } + +## +## &sqltime($timestamp) +## +## Convert perl $timestamp to MySQL time. MySQL expects YYYY-MM-DD HH:MM:SS +## +sub sqltime { + my ($time) = @_; + my $mysqltime; + if ($time =~ + /(\d+)-(\d+)-(\d+) # YYYY-MM-DD + \s # a space + (\d+):(\d+):(\d+) # HH:MM::SS + /x ) { + # Some of the .meta files have the time in mysql + # format already, so just make sure they are 0 padded and + # pass them back. + $mysqltime = sprintf('%04d-%02d-%02d %02d:%02d:%02d', + $1,$2,$3,$4,$5,$6); + } elsif ($time =~ /^\d+$/) { + my @TimeData = gmtime($time); + # Alter the month to be 1-12 instead of 0-11 + $TimeData[4]++; + # Alter the year to be from 0 instead of from 1900 + $TimeData[5]+=1900; + $mysqltime = sprintf('%04d-%02d-%02d %02d:%02d:%02d', + @TimeData[5,4,3,2,1,0]); + } else { + print LOG " Unable to decode time ".$time."\n"; + $mysqltime = 0; + } + return $mysqltime; +} + +## +## &declutter($filename) +## Given a filename, returns a url for the filename. +sub declutter { + my $thisfn=shift; + $thisfn=~s/^$perlvar{'lonDocRoot'}//; + $thisfn=~s/^\///; + $thisfn=~s/^res\///; + return $thisfn; +} + +## +## Escape / Unescape special characters +sub unescape { + my $str=shift; + $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; + return $str; +} + +sub escape { + my $str=shift; + $str =~ s/(\W)/"%".unpack('H2',$1)/eg; + return $str; +}