#!/usr/bin/perl
# The LearningOnline Network
# searchcat.pl "Search Catalog" batch script
#
# $Id: searchcat.pl,v 1.30 2003/02/03 17:01:55 www Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
# YEAR=2001
# 04/14/2001, 04/16/2001 Scott Harrison
#
# YEAR=2002
# 05/11/2002 Scott Harrison
#
# YEAR=2003
# Scott Harrison
#
###

=pod

=head1 NAME

B<searchcat.pl> - put authoritative filesystem data into sql database.

=head1 SYNOPSIS

Ordinarily this script is to be called from a loncapa cron job
(CVS source location: F<searchcat.pl>; typical filesystem installation
location: F</home/httpd/perl/searchcat.pl>).

Here is the cron job entry.

C<# Repopulate and refresh the metadata database used for the search catalog.>

C<10 1 * * 7 www /home/httpd/perl/searchcat.pl>

This script only allows itself to be run as the user C<www>.

=head1 DESCRIPTION

This script goes through a loncapa resource directory and gathers metadata.
The metadata is entered into a SQL database.

This script also does general database maintenance such as reformatting
the C<metadata> table if it is deprecated.

This script also builds dynamic temporal metadata and stores this inside
a F<nohist_resevaldata.db> database file.
This script is playing an increasingly important role for a loncapa
library server.  The proper operation of this script is critical for a
smooth and correct user experience.

=cut

# ========================================================== Setting things up.

# ------------------------------------------------------ Use external modules.
# NOTE(review): this file predates strict/warnings; several subroutines
# below rely on package globals, so the pragmas are not enabled here.

use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
use IO::File;
use HTML::TokeParser;
use DBI;
use GDBM_File;
use POSIX qw(strftime mktime);

# ----------------- Code to enable 'find' subroutine listing of the .meta files
use File::Find;

# List of .meta files (refilled by &wanted on a per-user basis).
my @metalist;

# --------------- Read loncapa_apache.conf and loncapa.conf and get variables.
my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
my %perlvar = %{$perlvarref};
undef($perlvarref);             # Remove since sensitive and not needed.
delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.

# ------------------------------------- Only run if machine is a library server
if ($perlvar{'lonRole'} ne 'library')
  {
    exit(0);
  }

# ------------------------------ Make sure this process is running as user=www.
my $wwwid = getpwnam('www');
if ($wwwid != $<)
  {
    # Mail the administrators and bail out; running as any other user
    # would create files with the wrong ownership.
    my $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
    my $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch.  searchcat.pl must be run as user www.' | ".
	   "mailto $emailto -s '$subj' > /dev/null");
    exit(1);
  }

# ------------------------------------------------------ Initialize log output.
# The bareword handle LOG is deliberate: it is printed to throughout the
# rest of this file.  Three-argument open plus an explicit error check.
open(LOG, '>', $perlvar{'lonDaemons'}.'/logs/searchcat.log') or
    die('searchcat.pl: cannot open log file: '.$!."\n");
print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n");

my $dbh; # Database object reference handle.

# ----------------------------- Verify connection to loncapa:metadata database.
unless ($dbh = DBI->connect('DBI:mysql:loncapa','www',
			    $perlvar{'lonSqlAccess'},
			    { RaiseError => 0, PrintError => 0 }))
  {
    print(LOG '**** ERROR **** Cannot connect to database!'."\n");
    exit(0);
  }

# ------------------------------ Create loncapa:metadata table if non-existent.
# Every text column carries a FULLTEXT index for the search catalog; the
# column order must match the INSERT statement in the main loop below.
my @fulltext_columns = qw(title author subject url keywords version
			  notes abstract mime language owner copyright);
my $make_metadata_table =
    'CREATE TABLE IF NOT EXISTS metadata ('.
    join(', ',
	 (map { $_.' TEXT' } qw(title author subject url keywords
				version notes abstract mime language)),
	 'creationdate DATETIME',
	 'lastrevisiondate DATETIME',
	 'owner TEXT',
	 'copyright TEXT',
	 'utilitysemaphore BOOL',
	 (map { 'FULLTEXT idx_'.$_.' ('.$_.')' } @fulltext_columns)).
    ') TYPE=MYISAM';
$dbh->do($make_metadata_table); # Generate the table.

# ----------------------------- Verify format of the loncapa:metadata database.
# (delete and recreate database if necessary).

# Make a positive control for verifying table structure: build an
# identically-defined table under another name and compare "describe"
# output of the two.
my $make_metadata_table_CONTROL = $make_metadata_table;
$make_metadata_table_CONTROL =~
    s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;
$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
$dbh->do($make_metadata_table_CONTROL);

my $table_description;    # selectall reference to the table description.
my $CONTROL_table_string; # What the table description should look like.
my $table_string;         # What the table description does look like.

# Calculate the CONTROL table description (what it should be).
$table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
$CONTROL_table_string .= join(',',@{$_})."\n" for @{$table_description};

# Calculate the current table description (what it currently looks like).
$table_description = $dbh->selectall_arrayref('describe metadata');
foreach my $table_row (@{$table_description})
  {
    $table_string .= join(',',@{$table_row})."\n";
  }

# If the live layout differs from the control layout, the table is from
# an older release: drop it and regenerate it from scratch.
if ($table_string ne $CONTROL_table_string)
  {
    print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'.
	  '.'."\n");
    $dbh->do('DROP TABLE IF EXISTS metadata'); # Delete the table.
    $dbh->do($make_metadata_table);            # Generate the table.
  }
$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay. Done with control.

# ----------------------------------------------- Set utilitysemaphore to zero.
# Records still at zero after the per-user refresh below are stale and
# are removed by the DELETE at the bottom of the loop.
$dbh->do('UPDATE metadata SET utilitysemaphore = 0');

# ========================================================= Main functionality.

# - Determine home authors on this server based on resources dir and user tree.

# RESOURCES: the resources directory (subdirs correspond to author usernames).
opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or
    (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n")
     and exit(0));

# query_home_server_status will look for user home directories on this machine.
my @homeusers =
    grep { &query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
				     $perlvar{'lonDefDomain'}.'/'.$_) }
    grep { !/^\.\.?$/ } readdir(RESOURCES);
closedir(RESOURCES);

unless (@homeusers)
  {
    print(LOG '=== No home users found on this server.'."\n");
  }

# Consider each author individually.
foreach my $user (@homeusers)
  {
    print(LOG "\n".'=== User: '.$user."\n\n"); # Make a log entry.

    # Get filesystem path to this user's directory.
    my $user_directory =
	&construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);

    # Remove left-over db-files from a potentially crashed searchcat run.
    unlink($user_directory.'/nohist_new_resevaldata.db');

    # Cleanup the metalist array; &wanted (via File::Find) refills it.
    undef(@metalist);
    @metalist = ();
    &File::Find::find(\&wanted, $perlvar{'lonDocRoot'}.'/res/'.
		      $perlvar{'lonDefDomain'}.'/'.$user);

    # -- process file to get metadata and put into search catalog SQL database
    #    Also, build and store dynamic metadata.
    #    Also, delete record entries before refreshing.
    foreach my $m (@metalist)
      {
	print(LOG '- '.$m."\n"); # Log this action.

	# Get metadata from the file.
	my $ref = get_metadata_from_file($m);

	# Make a datarecord identifier for this resource.
	my $m2 = '/res/'.declutter($m);
	$m2 =~ s/\.meta$//;

	# Build and store dynamic metadata inside nohist_resevaldata.db.
	build_on_the_fly_dynamic_metadata($m2);

	# Delete record if it already exists.  Placeholders (?) are used
	# throughout so that quotes or backslashes inside metadata values
	# cannot corrupt the SQL statements (the old code interpolated
	# the raw values directly into the SQL text).
	my $sth =
	    $dbh->prepare('select * from metadata where url like binary ?');
	$sth->execute($m2);
	my $r1 = $sth->fetchall_arrayref;
	if (@$r1)
	  {
	    $sth =
		$dbh->prepare('delete from metadata where url like binary ?');
	    $sth->execute($m2);
	  }

	# Add new/replacement record into the loncapa:metadata table.
	# Column order matches the CREATE TABLE statement above.
	my @values = (delete($ref->{'title'}),
		      delete($ref->{'author'}),
		      delete($ref->{'subject'}),
		      $m2,                         # url
		      delete($ref->{'keywords'}),
		      'current',                   # version
		      delete($ref->{'notes'}),
		      delete($ref->{'abstract'}),
		      delete($ref->{'mime'}),
		      delete($ref->{'language'}),
		      sql_formatted_time(delete($ref->{'creationdate'})),
		      sql_formatted_time(delete($ref->{'lastrevisiondate'})),
		      delete($ref->{'owner'}),
		      delete($ref->{'copyright'}),
		      1);                          # utilitysemaphore
	# Missing metadata fields become empty strings (matching the old
	# string-interpolation behavior), not SQL NULLs.
	@values = map { defined($_) ? $_ : '' } @values;
	$sth = $dbh->prepare('insert into metadata values ('.
			     join(',',('?') x scalar(@values)).')');
	$sth->execute(@values);
      }

# ----------------------- Clean up database, remove stale SQL database records.
    $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');

# -------------------------------------------------- Copy over the new db-files
    system('mv '.$user_directory.'/nohist_new_resevaldata.db '.
	   $user_directory.'/nohist_resevaldata.db');
  }

# --------------------------------------------------- Close database connection
$dbh->disconnect;
print LOG "\n==== Searchcat completed ".localtime()." ====\n";
close(LOG);
exit(0);

# ================================================================ Subroutines.

=pod

=head1 SUBROUTINES

=cut

=pod

B<unescape> - translate unstrange escaped syntax to strange characters.

=over 4

Parameters:

=item I<$str> - string with CGI-style C<%XX> escape sequences.

=back

=over 4

Returns:

=item C<$str> - string with the escapes decoded to raw characters.

=back

=cut

# Decode %XX hex escapes (inverse of &escape below).
sub unescape ($)
{
    my $str = shift(@_);
    $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
    return($str);
}

=pod

B<escape> - translate strange characters to unstrange escaped syntax.

=over 4

Parameters:

=item I<$str> - string with potentially weird characters to unweird-ify.

=back

=over 4

Returns:

=item C<$str> - string with every non-word character as a C<%XX> escape.

=back

=cut

# Encode every non-word character as %XX (inverse of &unescape above).
sub escape ($)
{
    my $str = shift(@_);
    $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
    return($str);
}

=pod

B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata.

Returns the dynamic metadata for an author, which will later be added to
the MySQL database (not yet implemented).

The vast majority of entries in F<nohist_resevaldata.db>, which contains
the dynamic metadata for an author's resources, are "count", which make
the file really large and evaluation really slow.  While computing the
current value of all dynamic metadata for later insertion into the MySQL
metadata cache (not yet implemented), this routine also simply adds up all
"count" type fields and replaces them by one new field with the to-date
count.

Only after successful completion of working with one author, copy new
file to original file.  Copy to tmp-"new"-db-file was necessary since
db-file size would not shrink after "delete" of key.

=over 4

Parameters:

=item I<$url> - the filesystem path (url may be a misnomer...)

=back

=over 4

Returns:

=item C<%returnhash> - key-value table of dynamically evaluated metadata.
=back

=cut

sub build_on_the_fly_dynamic_metadata
{
    # Need to compute the user's directory from the decluttered url.
    my $url = &declutter(shift);
    $url =~ s/\.meta$//;
    my %returnhash = ();
    my ($adomain,$aauthor) = ($url =~ /^(\w+)\/(\w+)\//);
    my $user_directory = &construct_path_to_user_directory($adomain,$aauthor);

    # The two tied hashes are lexical (the originals were implicit
    # package globals; they are used nowhere else in this file).
    my %evaldata;    # Existing dynamic metadata (read-only).
    my %newevaldata; # Compacted copy being built (created/written).

    # Attempt a GDBM database instantiation inside users directory and proceed.
    if ((tie(%evaldata,'GDBM_File',
	     $user_directory.'/nohist_resevaldata.db',
	     &GDBM_READER(),0640)) &&
	(tie(%newevaldata,'GDBM_File',
	     $user_directory.'/nohist_new_resevaldata.db',
	     &GDBM_WRCREAT(),0640)))
      {
	# For different variables, track the running sum and counts.
	my %sum = ();
	my %cnt = ();
	# Define computed items as a sum (add) or an average (avg) or a
	# raw count (cnt) or append (app).
	my %listitems = ('count'      => 'add',
			 'course'     => 'add',
			 'avetries'   => 'avg',
			 'stdno'      => 'add',
			 'difficulty' => 'avg',
			 'clear'      => 'avg',
			 'technical'  => 'avg',
			 'helpful'    => 'avg',
			 'correct'    => 'avg',
			 'depth'      => 'avg',
			 'comments'   => 'app',
			 'usage'      => 'cnt'
			 );
	# Untaint the url and use as part of a regular expression.
	my $regexp = $url;
	$regexp =~ s/(\W)/\\$1/g;
	$regexp = '___'.$regexp.'___([a-z]+)$'; #' emacs
	# Check existing database for this author: this is modifying the
	# 'count' entries and copying all other entries over.
	foreach (keys %evaldata)
	  {
	    my $key = &unescape($_);
	    if ($key =~ /$regexp/) # If url-based entry exists.
	      {
		my $ctype = $1; # Set to specific category type.
		# Do an increment for this category type.
		if (defined($cnt{$ctype}))
		  {
		    $cnt{$ctype}++;
		  }
		else
		  {
		    $cnt{$ctype} = 1;
		  }
		unless ($listitems{$ctype} eq 'app') # append comments
		  {
		    # Increment the sum based on the evaluated db data.
		    if (defined($sum{$ctype}))
		      {
			$sum{$ctype} += $evaldata{$_};
		      }
		    else
		      {
			$sum{$ctype} = $evaldata{$_};
		      }
		  }
		else
		  {
		    # 'app' mode: join the comment strings with a newline
		    # (the original wrote the separator as a literal
		    # embedded newline inside a single-quoted string).
		    if (defined($sum{$ctype}))
		      {
			if ($evaldata{$_})
			  {
			    $sum{$ctype} .= "\n".$evaldata{$_};
			  }
		      }
		    else
		      {
			$sum{$ctype} = ''.$evaldata{$_};
		      }
		  }
		if ($ctype ne 'count')
		  {
		    # Copy over all data except 'count' attributes.
		    $newevaldata{$_} = $evaldata{$_};
		  }
	      }
	  }
	# These values will be returned (currently still unused).
	foreach (keys %cnt)
	  {
	    if ($listitems{$_} eq 'avg')
	      {
		# Average, rounded to two decimal places.
		$returnhash{$_} = int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
	      }
	    elsif ($listitems{$_} eq 'cnt')
	      {
		$returnhash{$_} = $cnt{$_};
	      }
	    else
	      {
		$returnhash{$_} = $sum{$_};
	      }
	  }
	# Generate new count key in resevaldata, insert sum.
	if ($returnhash{'count'})
	  {
	    my $newkey = $$.'_'.time.'_searchcat___'.&escape($url).'___count';
	    $newevaldata{$newkey} = $returnhash{'count'};
	  }
	untie(%evaldata);    # Close/release the original nohist database.
	untie(%newevaldata); # Close/release the new nohist database.
      }
    return %returnhash;
}

=pod

B<wanted> - used by the C<File::Find::find> subroutine.

This evaluates whether a file is wanted, and pushes it onto the
I<@metalist> array.  This subroutine was, for the most part,
auto-generated by the B<find2perl> command.

=over 4

Parameters:

=item I<$file> - a path to the file (File::Find sets C<$_>).

=back

=over 4

Returns:

=item C<boolean> - true or false based on logical statement.

=back

=cut

sub wanted ($)
{
    # Keep plain files ending in .meta, but skip versioned copies of the
    # form name.<number>.<ext>.meta.  The lstat variables are lexical
    # here (they were implicit package globals in the original).
    (my ($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
	-f $_ &&
	/^.*\.meta$/ &&
	!/^.+\.\d+\.[^\.]+\.meta$/ &&
	push(@metalist,$File::Find::dir.'/'.$_);
}

=pod

B<get_metadata_from_file> - read xml-tagged file and return parsed metadata.

=over 4

Parameters:

=item I<$file> - a path to the file.

=back

=over 4

Returns:

=item C<hash reference> - a hash array (keys and values).

=back

=cut

sub get_metadata_from_file ($)
{
    my ($filename) = @_;
    my %metatable; # Used to store return value of hash-tabled metadata.
    $filename = &declutter($filename); # Remove non-identifying filesystem info
    my $uri = ''; # The URI is not relevant in this scenario.
    unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
      {
	$filename .= '.meta'; # Append a .meta suffix.
      }
    # Get the file contents.
    my $metadata_string =
	&get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);
    # Parse the file based on its XML tags.
my $parser = HTML::TokeParser->new(\$metadata_string);
    my $token;
    while ($token = $parser->get_token) # Loop through tokens.
      {
	if ($token->[0] eq 'S') # If it is a start token.
	  {
	    my $entry = $token->[1];
	    my $unikey = $entry; # A unique identifier for this xml tag key.
	    if (defined($token->[2]->{'part'}))
	      {
		$unikey .= '_'.$token->[2]->{'part'};
	      }
	    if (defined($token->[2]->{'name'}))
	      {
		$unikey .= '_'.$token->[2]->{'name'};
	      }
	    # Append $unikey to metatable's keys entry.
	    if ($metatable{$uri.'keys'})
	      {
		$metatable{$uri.'keys'} .= ','.$unikey;
	      }
	    else
	      {
		$metatable{$uri.'keys'} = $unikey;
	      }
	    # Store each tag attribute under "<unikey>.<attribute>".
	    # BUG FIX: the original indexed the hash with $_ instead of
	    # the loop variable $t3, storing attribute values under the
	    # wrong keys and breaking the '.default' fallback below.
	    foreach my $t3 (@{$token->[3]})
	      {
		$metatable{$uri.''.$unikey.'.'.$t3} = $token->[2]->{$t3};
	      }
	    # If there was no text contained inside the tags, set = default.
	    unless ($metatable{$uri.''.$unikey} =
		    $parser->get_text('/'.$entry))
	      {
		$metatable{$uri.''.$unikey} =
		    $metatable{$uri.''.$unikey.'.default'};
	      }
	  }
      }
    # Return with a key-value table of XML tags and their tag contents.
    return(\%metatable);
}

=pod

B<get_file_contents> - returns either the contents of the file or a -1.

=over 4

Parameters:

=item I<$file> - a complete filesystem path to the file.

=back

=over 4

Returns:

=item C<scalar> - file contents or a -1.

=back

=cut

sub get_file_contents ($)
{
    my $file = shift(@_);
    # If file does not exist, then return a -1 value.
    unless (-e $file)
      {
	return(-1);
      }
    # Read in file contents.
    my $file_handle = IO::File->new($file);
    my $file_contents = '';
    while (<$file_handle>)
      {
	$file_contents .= $_;
      }
    # Return file contents.
    return($file_contents);
}

=pod

B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path).

=over 4

Parameters:

=item I<$filesystem_path> - a complete filesystem path.

=back

=over 4

Returns:

=item C<scalar> - remnants of the filesystem path (beginning portion removed).

=back

=cut

sub declutter
{
    my $filesystem_path = shift(@_);
    # Remove beginning portions of the filesystem path.
$filesystem_path =~ s/^$perlvar{'lonDocRoot'}//;
    $filesystem_path =~ s!^/!!;
    $filesystem_path =~ s!^res/!!;
    # Return what is remaining for the filesystem path.
    return($filesystem_path);
}

=pod

B<query_home_server_status> - Is this the home server of an author's directory?

=over 4

Parameters:

=item I<$author_filesystem_path> - directory path for a user.

=back

=over 4

Returns:

=item C<boolean> - 1 if true; 0 if false.

=back

=cut

sub query_home_server_status ($)
{
    my $author_filesystem_path = shift(@_);
    # Reduce ".../res/<domain>/<user>..." to "<domain>/<user>".
    # BUG FIX: use $perlvar{'lonDocRoot'} instead of the hard-coded
    # /home/httpd/html prefix, consistent with &declutter above (the
    # caller always passes lonDocRoot-based paths).
    $author_filesystem_path =~
	s!^\Q$perlvar{'lonDocRoot'}\E/res/([^/]*)/([^/]*).*!$1/$2!;
    # Construct path to the author's ordinary user directory.
    my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
    my $user_directory_path = construct_path_to_user_directory($user_domain,
							       $username);
    # Return status of whether the user directory path is defined.
    if (-e $user_directory_path)
      {
	return(1); # True.
      }
    else
      {
	return(0); # False.
      }
}

=pod

B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir.

=over 4

Parameters:

=item I<$user_domain> - the loncapa domain of the user.

=item I<$username> - the unique username (user id) of the user.

=back

=over 4

Returns:

=item C<scalar> - representing the path on the filesystem.

=back

=cut

sub construct_path_to_user_directory ($$)
{
    my ($user_domain,$username) = @_;
    # Untaint.
    $user_domain =~ s/\W//g;
    $username =~ s/\W//g;
    # Create three levels of sub-directoried filesystem path
    # based on the first three characters of the username.
    my $sub_filesystem_path = $username.'__';
    $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!;
    # Use the sub-directoried levels and other variables to generate
    # the complete filesystem path.
    my $complete_filesystem_path =
	join('/',($perlvar{'lonUsersDir'},
		  $user_domain,
		  $sub_filesystem_path,
		  $username));
    # Return the complete filesystem path.
    return($complete_filesystem_path);
}

=pod

B<sql_formatted_time> ($) - turns seconds since epoch into datetime sql format.
=over 4

Parameters:

=item I<$epochtime> - time in seconds since epoch (may need to be sanitized).

=back

=over 4

Returns:

=item C<scalar> - datetime sql formatted string.

=back

=cut

sub sql_formatted_time ($)
{
    # Sanitize the argument, then unpack the relevant local-time fields.
    my ($sec,$min,$hour,$mday,$mon,$year) =
	(localtime(&sanitize_time(shift(@_))))[0..5];
    # MySQL DATETIME: months are 1..12 and years are A.D.; fields are
    # intentionally not zero-padded (matches the historical output).
    return(($year+1900).'-'.($mon+1).'-'.$mday.
	   ' '.
	   join(':',($hour,$min,$sec)));
}

# ==================================== The following two subroutines are needed
# for accommodating incorrect time formats inside the metadata.

=pod

B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch.

=over 4

Parameters:

=item I<%time_metadata> - a key-value listing characterizing month, year, etc.

=back

=over 4

Returns:

=item C<scalar> - seconds since epoch.

=back

=cut

sub make_seconds_since_epoch (@)
{
    # Keytable of time metadata.
    my %time_metadata = @_;
    # POSIX::mktime wants (sec, min, hour, mday, mon 0..11, year-1900,
    # wday, yday, isdst); wday/yday are ignored on input.
    return(POSIX::mktime($time_metadata{'seconds'},
			 $time_metadata{'minutes'},
			 $time_metadata{'hours'},
			 $time_metadata{'day'},
			 $time_metadata{'month'}-1,
			 $time_metadata{'year'}-1900,
			 0,
			 0,
			 $time_metadata{'dlsav'}));
}

=pod

B<sanitize_time> ($) - if time looks sql-formatted, make it seconds since epoch.

Somebody described this subroutine as "retro-fixing of un-backward-compatible
time format".

What this means, is that a part of this code expects to get UTC seconds
since the epoch (beginning of 1970).  Yet, some of the .meta files have
sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch
integers (e.g. 1044147435).  These time strings do not encode the timezone
and, in this sense, can be considered "un-backwards-compatible".

=over 4

Parameters:

=item I<$potentially_badformat_string> - string to "retro-fix".
=back

=over 4

Returns:

=item C<scalar> - seconds since epoch.

=back

=cut

sub sanitize_time ($)
{
    my $timestamp = shift(@_);
    # Some metadata stores sql-style "YYYY-MM-DD HH:MM:SS" strings
    # instead of epoch seconds; convert those, pass everything else
    # through untouched.
    if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/)
      {
	$timestamp = &make_seconds_since_epoch('year'    => $1,
					       'month'   => $2,
					       'day'     => $3,
					       'hours'   => $4,
					       'minutes' => $5,
					       'seconds' => $6);
      }
    # Otherwise we assume timestamp to be as expected.
    return($timestamp);
}

=pod

=head1 AUTHOR

Written to help the loncapa project.

Scott Harrison, sharrison@users.sourceforge.net

This is distributed under the same terms as loncapa (i.e. "freeware").

=cut