--- loncom/metadata_database/searchcat.pl 2002/10/08 18:45:33 1.23 +++ loncom/metadata_database/searchcat.pl 2003/12/25 15:20:00 1.49 @@ -2,7 +2,7 @@ # The LearningOnline Network # searchcat.pl "Search Catalog" batch script # -# $Id: searchcat.pl,v 1.23 2002/10/08 18:45:33 albertel Exp $ +# $Id: searchcat.pl,v 1.49 2003/12/25 15:20:00 www Exp $ # # Copyright Michigan State University Board of Trustees # @@ -26,17 +26,45 @@ # # http://www.lon-capa.org/ # -# YEAR=2001 -# 04/14/2001, 04/16/2001 Scott Harrison -# -# YEAR=2002 -# 05/11/2002 Scott Harrison -# ### -# This script goes through a LON-CAPA resource -# directory and gathers metadata. -# The metadata is entered into a SQL database. +=pod + +=head1 NAME + +B - put authoritative filesystem data into sql database. + +=head1 SYNOPSIS + +Ordinarily this script is to be called from a loncapa cron job +(CVS source location: F; typical +filesystem installation location: F). + +Here is the cron job entry. + +C<# Repopulate and refresh the metadata database used for the search catalog.> +C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> + +This script only allows itself to be run as the user C. + +=head1 DESCRIPTION + +This script goes through a loncapa resource directory and gathers metadata. +The metadata is entered into a SQL database. + +This script also does general database maintenance such as reformatting +the C table if it is deprecated. + +This script evaluates dynamic metadata from the authors' +F database file in order to store it in MySQL. + +This script is playing an increasingly important role for a loncapa +library server. The proper operation of this script is critical for a smooth +and correct user experience. + +=cut + +use strict; use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; @@ -45,10 +73,42 @@ use IO::File; use HTML::TokeParser; use DBI; use GDBM_File; +use POSIX qw(strftime mktime); + +require "find.pl"; my @metalist; +my $simplestatus=''; +my %countext=(); + +# ----------------------------------------------------- write out simple status +sub writesimple { + open(SMP,'>/home/httpd/html/lon-status/mysql.txt'); + print SMP $simplestatus."\n"; + close(SMP); +} + +sub writecount { + open(RSMP,'>/home/httpd/html/lon-status/rescount.txt'); + foreach (keys %countext) { + print RSMP $_.'='.$countext{$_}.'&'; + } + print RSMP 'time='.time."\n"; + close(RSMP); +} +# -------------------------------------- counts files with different extensions +sub count { + my $file=shift; + $file=~/\.(\w+)$/; + my $ext=lc($1); + if (defined($countext{$ext})) { + $countext{$ext}++; + } else { + $countext{$ext}=1; + } +} # ----------------------------------------------------- Un-Escape Special Chars sub unescape { @@ -65,100 +125,119 @@ sub escape { return $str; } - # ------------------------------------------- Code to evaluate dynamic metadata sub dynamicmeta { -# -# -# Do nothing for now ... -# -# - return; -# -# ..., but stuff below already works -# my $url=&declutter(shift); $url=~s/\.meta$//; - my %returnhash=(); + my %returnhash=( + 'count' => 0, + 'course' => 0, + 'course_list' => '', + 'avetries' => 'NULL', + 'avetries_list' => '', + 'stdno' => 0, + 'stdno_list' => '', + 'usage' => 0, + 'usage_list' => '', + 'goto' => 0, + 'goto_list' => '', + 'comefrom' => 0, + 'comefrom_list' => '', + 'difficulty' => 'NULL', + 'difficulty_list' => '', + 'clear' => 'NULL', + 'technical' => 'NULL', + 'correct' => 'NULL', + 'helpful' => 'NULL', + 'depth' => 'NULL', + 'comments' => '' + ); my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//); my $prodir=&propath($adomain,$aauthor); - if (tie(%evaldata,'GDBM_File', - $prodir.'/nohist_resevaldata.db',&GDBM_WRCREAT(),0640)) { - my %sum=(); - my %cnt=(); - my %listitems=('count' => 'add', - 'course' => 'add', - 'avetries' => 'avg', - 'stdno' => 'add', - 'difficulty' => 'avg', - 'clear' => 'avg', - 'technical' => 'avg', - 'helpful' => 'avg', - 'correct' => 'avg', - 'depth' => 'avg', - 'comments' => 'app', - 'usage' => 'cnt' - ); - my $regexp=$url; - $regexp=~s/(\W)/\\$1/g; - $regexp='___'.$regexp.'___([a-z]+)$'; - foreach (keys %evaldata) { - my $key=&unescape($_); - if ($key=~/$regexp/) { - my $ctype=$1; - if (defined($cnt{$ctype})) { - $cnt{$ctype}++; - } else { - $cnt{$ctype}=1; - } - unless ($listitems{$ctype} eq 'app') { - if (defined($sum{$ctype})) { - $sum{$ctype}+=$evaldata{$_}; - } else { - $sum{$ctype}=$evaldata{$_}; - } - } else { - if (defined($sum{$ctype})) { - if ($evaldata{$_}) { - $sum{$ctype}.='
'.$evaldata{$_}; - } - } else { - $sum{$ctype}=''.$evaldata{$_}; - } + +# Get metadata except counts + if (tie(my %evaldata,'GDBM_File', + $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) { + my %sum=(); + my %cnt=(); + my %concat=(); + my %listitems=( + 'course' => 'add', + 'goto' => 'add', + 'comefrom' => 'add', + 'avetries' => 'avg', + 'stdno' => 'add', + 'difficulty' => 'avg', + 'clear' => 'avg', + 'technical' => 'avg', + 'helpful' => 'avg', + 'correct' => 'avg', + 'depth' => 'avg', + 'comments' => 'app', + 'usage' => 'cnt' + ); + + my $regexp=$url; + $regexp=~s/(\W)/\\$1/g; + $regexp='___'.$regexp.'___([a-z]+)$'; + while (my ($esckey,$value)=each %evaldata) { + my $key=&unescape($esckey); + if ($key=~/$regexp/) { + my ($item,$purl,$cat)=split(/___/,$key); + if (defined($cnt{$cat})) { $cnt{$cat}++; } else { $cnt{$cat}=1; } + unless ($listitems{$cat} eq 'app') { + if (defined($sum{$cat})) { + $sum{$cat}+=$evaldata{$esckey}; + $concat{$cat}.=','.$item; + } else { + $sum{$cat}=$evaldata{$esckey}; + $concat{$cat}=$item; + } + } else { + if (defined($sum{$cat})) { + if ($evaldata{$esckey}=~/\w/) { + $sum{$cat}.='
'.$evaldata{$esckey}; + } + } else { + $sum{$cat}=''.$evaldata{$esckey}; + } + } } - if ($ctype eq 'count') { - delete($evaldata{$_}); - } - } - } - foreach (keys %cnt) { - if ($listitems{$_} eq 'avg') { - $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0; - } elsif ($listitems{$_} eq 'cnt') { - $returnhash{$_}=$cnt{$_}; - } else { - $returnhash{$_}=$sum{$_}; - } - } - if ($returnhash{'count'}) { - my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count'; - $evaldata{$newkey}=$returnhash{'count'}; - } - untie(%evaldata); - } - return %returnhash; + } + untie(%evaldata); +# transfer gathered data to returnhash, calculate averages where applicable + while (my $cat=each(%cnt)) { + if ($cnt{$cat} eq 'nan') { next; } + if ($sum{$cat} eq 'nan') { next; } + if ($listitems{$cat} eq 'avg') { + if ($cnt{$cat}) { + $returnhash{$cat}=int(($sum{$cat}/$cnt{$cat})*100.0+0.5)/100.0; + } else { + $returnhash{$cat}='NULL'; + } + } elsif ($listitems{$cat} eq 'cnt') { + $returnhash{$cat}=$cnt{$cat}; + } else { + $returnhash{$cat}=$sum{$cat}; + } + $returnhash{$cat.'_list'}=$concat{$cat}; + } + } +# get count + if (tie(my %evaldata,'GDBM_File', + $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) { + my $escurl=&escape($url); + if (! exists($evaldata{$escurl})) { + $returnhash{'count'}=0; + } else { + $returnhash{'count'}=$evaldata{$escurl}; + } + untie %evaldata; + } + return %returnhash; } -# ----------------- Code to enable 'find' subroutine listing of the .meta files -require "find.pl"; -sub wanted { - (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && - -f _ && - /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && - push(@metalist,"$dir/$_"); -} - # --------------- Read loncapa_apache.conf and loncapa.conf and get variables my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); my %perlvar=%{$perlvarref}; @@ -168,86 +247,205 @@ delete $perlvar{'lonReceipt'}; # remove # ------------------------------------- Only run if machine is a library server exit unless $perlvar{'lonRole'} eq 'library'; +# ----------------------------- Make sure this process is running from user=www + +my $wwwid=getpwnam('www'); +if ($wwwid!=$<) { + my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; + my $subj="LON: $perlvar{'lonHostID'} User ID mismatch"; + system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ + mailto $emailto -s '$subj' > /dev/null"); + exit 1; +} + + +# ---------------------------------------------------------- We are in business + +open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); +print LOG '==== Searchcat Run '.localtime()."====\n\n"; +$simplestatus='time='.time.'&'; my $dbh; # ------------------------------------- Make sure that database can be accessed { unless ( $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) ) { - print "Cannot connect to database!\n"; + print LOG "Cannot connect to database!\n"; + $simplestatus.='mysql=defunct'; + &writesimple(); exit; } - my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (". + +# Make temporary table + $dbh->do("DROP TABLE IF EXISTS newmetadata"); + my $make_metadata_table = "CREATE TABLE IF NOT EXISTS newmetadata (". "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ". "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ". "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ". - "copyright TEXT, FULLTEXT idx_title (title), ". + "copyright TEXT, ". + "count INTEGER UNSIGNED, ". + "course INTEGER UNSIGNED, course_list TEXT, ". + "goto INTEGER UNSIGNED, goto_list TEXT, ". + "comefrom INTEGER UNSIGNED, comefrom_list TEXT, ". + "sequsage INTEGER UNSIGNED, sequsage_list TEXT, ". + "stdno INTEGER UNSIGNED, stdno_list TEXT, ". + "avetries FLOAT, avetries_list TEXT, ". + "difficulty FLOAT, difficulty_list TEXT, ". + "clear FLOAT, technical FLOAT, correct FLOAT, helpful FLOAT, depth FLOAT, ". + "comments TEXT, ". + "FULLTEXT idx_title (title), ". "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ". "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ". "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ". "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ". "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ". - "FULLTEXT idx_copyright (copyright)) TYPE=MYISAM"; + "FULLTEXT idx_copyright (copyright)) ". + "TYPE=MyISAM"; # It would sure be nice to have some logging mechanism. - $dbh->do($make_metadata_table); + unless ($dbh->do($make_metadata_table)) { + print LOG "\nMySQL Error Create: ".$dbh->errstr."\n"; + die $dbh->errstr; + } } # ------------------------------------------------------------- get .meta files opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"); -my @homeusers=grep - {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")} - grep {!/^\.\.?$/} readdir(RESOURCES); +my @homeusers = grep { + &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_") + } grep {!/^\.\.?$/} readdir(RESOURCES); closedir RESOURCES; + +# +# Create the statement handlers we need + +my $insert_sth = $dbh->prepare + ("INSERT INTO newmetadata VALUES (". + "?,". # title + "?,". # author + "?,". # subject + "?,". # declutter url + "?,". # version + "?,". # current + "?,". # notes + "?,". # abstract + "?,". # mime + "?,". # language + "?,". # creationdate + "?,". # revisiondate + "?,". # owner + "?,". # copyright + "?,". # count + "?,". # course + "?,". # course_list + "?,". # goto + "?,". # goto_list + "?,". # comefrom + "?,". # comefrom_list + "?,". # usage + "?,". # usage_list + "?,". # stdno + "?,". # stdno_list + "?,". # avetries + "?,". # avetries_list + "?,". # difficulty + "?,". # difficulty_list + "?,". # clear + "?,". # technical + "?,". # correct + "?,". # helpful + "?,". # depth + "?". # comments + ")" + ); + foreach my $user (@homeusers) { + print LOG "\n=== User: ".$user."\n\n"; + + my $prodir=&propath($perlvar{'lonDefDomain'},$user); + # Use find.pl + undef @metalist; + @metalist=(); &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); + # -- process each file to get metadata and put into search catalog SQL + # database. Also, check to see if already there. + # I could just delete (without searching first), but this works for now. + foreach my $m (@metalist) { + print LOG "- ".$m."\n"; + my $ref=&metadata($m); + my $m2='/res/'.&declutter($m); + $m2=~s/\.meta$//; + if ($ref->{'obsolete'}) { print LOG "obsolete\n"; next; } + if ($ref->{'copyright'} eq 'private') { print LOG "private\n"; next; } + my %dyn=&dynamicmeta($m2); + &count($m2); + unless ($insert_sth->execute( + $ref->{'title'}, + $ref->{'author'}, + $ref->{'subject'}, + $m2, + $ref->{'keywords'}, + 'current', + $ref->{'notes'}, + $ref->{'abstract'}, + $ref->{'mime'}, + $ref->{'language'}, + sqltime($ref->{'creationdate'}), + sqltime($ref->{'lastrevisiondate'}), + $ref->{'owner'}, + $ref->{'copyright'}, + $dyn{'count'}, + $dyn{'course'}, + $dyn{'course_list'}, + $dyn{'goto'}, + $dyn{'goto_list'}, + $dyn{'comefrom'}, + $dyn{'comefrom_list'}, + $dyn{'usage'}, + $dyn{'usage_list'}, + $dyn{'stdno'}, + $dyn{'stdno_list'}, + $dyn{'avetries'}, + $dyn{'avetries_list'}, + $dyn{'difficulty'}, + $dyn{'difficulty_list'}, + $dyn{'clear'}, + $dyn{'technical'}, + $dyn{'correct'}, + $dyn{'helpful'}, + $dyn{'depth'}, + $dyn{'comments'} + )) { + print LOG "\nMySQL Error Insert: ".$dbh->errstr."\n"; + die $dbh->errstr; + } + $ref = undef; + } +} +# --------------------------------------------------- Close database connection +$dbh->do("DROP TABLE IF EXISTS metadata"); +unless ($dbh->do("RENAME TABLE newmetadata TO metadata")) { + print LOG "\nMySQL Error Rename: ".$dbh->errstr."\n"; + die $dbh->errstr; } +unless ($dbh->disconnect) { + print LOG "\nMySQL Error Disconnect: ".$dbh->errstr."\n"; + die $dbh->errstr; +} +print LOG "\n==== Searchcat completed ".localtime()." ====\n"; +close(LOG); +&writesimple(); +&writecount(); +exit 0; -# -- process each file to get metadata and put into search catalog SQL database -# Also, check to see if already there. -# I could just delete (without searching first), but this works for now. -foreach my $m (@metalist) { - my $ref=&metadata($m); - my $m2='/res/'.&declutter($m); - $m2=~s/\.meta$//; - &dynamicmeta($m2); - my $q2="select * from metadata where url like binary '$m2'"; - my $sth = $dbh->prepare($q2); - $sth->execute(); - my $r1=$sth->fetchall_arrayref; - if (@$r1) { - $sth=$dbh->prepare("delete from metadata where url like binary '$m2'"); - $sth->execute(); - } - $sth=$dbh->prepare('insert into metadata values ('. - '"'.delete($ref->{'title'}).'"'.','. - '"'.delete($ref->{'author'}).'"'.','. - '"'.delete($ref->{'subject'}).'"'.','. - '"'.$m2.'"'.','. - '"'.delete($ref->{'keywords'}).'"'.','. - '"'.'current'.'"'.','. - '"'.delete($ref->{'notes'}).'"'.','. - '"'.delete($ref->{'abstract'}).'"'.','. - '"'.delete($ref->{'mime'}).'"'.','. - '"'.delete($ref->{'language'}).'"'.','. - '"'.sqltime(delete($ref->{'creationdate'})).'"'.','. - '"'.sqltime(delete($ref->{'lastrevisiondate'})).'"'.','. - '"'.delete($ref->{'owner'}).'"'.','. - '"'.delete($ref->{'copyright'}).'"'.')'); - $sth->execute(); -} - -# ----------------------------------------------------------- Clean up database -# Need to, perhaps, remove stale SQL database records. -# ... not yet implemented -# --------------------------------------------------- Close database connection -$dbh->disconnect; + +# ============================================================================= # ---------------------------------------------------------------- Get metadata # significantly altered from subroutine present in lonnet sub metadata { my ($uri,$what)=@_; - my %metacache; + my %metacache=(); $uri=&declutter($uri); my $filename=$uri; $uri=~s/\.meta$//; @@ -258,30 +456,30 @@ sub metadata { my $parser=HTML::TokeParser->new(\$metastring); my $token; while ($token=$parser->get_token) { - if ($token->[0] eq 'S') { - my $entry=$token->[1]; - my $unikey=$entry; - if (defined($token->[2]->{'part'})) { - $unikey.='_'.$token->[2]->{'part'}; - } - if (defined($token->[2]->{'name'})) { - $unikey.='_'.$token->[2]->{'name'}; - } - if ($metacache{$uri.'keys'}) { - $metacache{$uri.'keys'}.=','.$unikey; - } else { - $metacache{$uri.'keys'}=$unikey; - } - map { - $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; - } @{$token->[3]}; - unless ( - $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) - ) { $metacache{$uri.''.$unikey}= - $metacache{$uri.''.$unikey.'.default'}; - } - } - } + if ($token->[0] eq 'S') { + my $entry=$token->[1]; + my $unikey=$entry; + if (defined($token->[2]->{'part'})) { + $unikey.='_'.$token->[2]->{'part'}; + } + if (defined($token->[2]->{'name'})) { + $unikey.='_'.$token->[2]->{'name'}; + } + if ($metacache{$uri.'keys'}) { + $metacache{$uri.'keys'}.=','.$unikey; + } else { + $metacache{$uri.'keys'}=$unikey; + } + map { + $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; + } @{$token->[3]}; + unless ( + $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) + ) { $metacache{$uri.''.$unikey}= + $metacache{$uri.''.$unikey.'.default'}; + } + } + } } return \%metacache; } @@ -289,12 +487,12 @@ sub metadata { # ------------------------------------------------------------ Serves up a file # returns either the contents of the file or a -1 sub getfile { - my $file=shift; - if (! -e $file ) { return -1; }; - my $fh=IO::File->new($file); - my $a=''; - while (<$fh>) { $a .=$_; } - return $a + my $file=shift; + if (! -e $file ) { return -1; }; + my $fh=IO::File->new($file); + my $a=''; + while (<$fh>) { $a .=$_; } + return $a; } # ------------------------------------------------------------- Declutters URLs @@ -335,7 +533,39 @@ sub propath { # ---------------------------- convert 'time' format into a datetime sql format sub sqltime { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = - localtime(@_[0]); + localtime(&unsqltime(@_[0])); $mon++; $year+=1900; return "$year-$mon-$mday $hour:$min:$sec"; } + +sub maketime { + my %th=@_; + return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'}, + $th{'day'},$th{'month'}-1, + $th{'year'}-1900,0,0,$th{'dlsav'})); +} + + +######################################### +# +# Retro-fixing of un-backward-compatible time format + +sub unsqltime { + my $timestamp=shift; + if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) { + $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3, + 'hours'=>$4,'minutes'=>$5,'seconds'=>$6); + } + return $timestamp; +} + +# ----------------- Code to enable 'find' subroutine listing of the .meta files + +no strict "vars"; + +sub wanted { + (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && + -f _ && + /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && + push(@metalist,"$dir/$_"); +}