--- loncom/metadata_database/searchcat.pl 2002/05/11 21:28:20 1.17 +++ loncom/metadata_database/searchcat.pl 2003/12/26 16:27:20 1.51 @@ -2,7 +2,7 @@ # The LearningOnline Network # searchcat.pl "Search Catalog" batch script # -# $Id: searchcat.pl,v 1.17 2002/05/11 21:28:20 harris41 Exp $ +# $Id: searchcat.pl,v 1.51 2003/12/26 16:27:20 www Exp $ # # Copyright Michigan State University Board of Trustees # @@ -26,17 +26,45 @@ # # http://www.lon-capa.org/ # -# YEAR=2001 -# 04/14/2001, 04/16/2001 Scott Harrison -# -# YEAR=2002 -# 05/11/2002 Scott Harrison -# ### -# This script goes through a LON-CAPA resource -# directory and gathers metadata. -# The metadata is entered into a SQL database. +=pod + +=head1 NAME + +B - put authoritative filesystem data into sql database. + +=head1 SYNOPSIS + +Ordinarily this script is to be called from a loncapa cron job +(CVS source location: F; typical +filesystem installation location: F). + +Here is the cron job entry. + +C<# Repopulate and refresh the metadata database used for the search catalog.> +C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> + +This script only allows itself to be run as the user C. + +=head1 DESCRIPTION + +This script goes through a loncapa resource directory and gathers metadata. +The metadata is entered into a SQL database. + +This script also does general database maintenance such as reformatting +the C table if it is deprecated. + +This script evaluates dynamic metadata from the authors' +F database file in order to store it in MySQL. + +This script is playing an increasingly important role for a loncapa +library server. The proper operation of this script is critical for a smooth +and correct user experience. + +=cut + +use strict; use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; @@ -44,92 +72,395 @@ use LONCAPA::Configuration; use IO::File; use HTML::TokeParser; use DBI; +use GDBM_File; +use POSIX qw(strftime mktime); -my @metalist; -# ----------------- Code to enable 'find' subroutine listing of the .meta files require "find.pl"; -sub wanted { - (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && - -f _ && - /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && - push(@metalist,"$dir/$_"); + +my @metalist; + +my $simplestatus=''; +my %countext=(); + +# ----------------------------------------------------- write out simple status +sub writesimple { + open(SMP,'>/home/httpd/html/lon-status/mysql.txt'); + print SMP $simplestatus."\n"; + close(SMP); +} + +sub writecount { + open(RSMP,'>/home/httpd/html/lon-status/rescount.txt'); + foreach (keys %countext) { + print RSMP $_.'='.$countext{$_}.'&'; + } + print RSMP 'time='.time."\n"; + close(RSMP); +} + +# -------------------------------------- counts files with different extensions +sub count { + my $file=shift; + $file=~/\.(\w+)$/; + my $ext=lc($1); + if (defined($countext{$ext})) { + $countext{$ext}++; + } else { + $countext{$ext}=1; + } +} +# ----------------------------------------------------- Un-Escape Special Chars + +sub unescape { + my $str=shift; + $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; + return $str; } -# ------------------------- Read access.conf and loncapa.conf and get variables -my $perlvarref=LONCAPA::Configuration::read_conf('access.conf','loncapa.conf'); +# -------------------------------------------------------- Escape Special Chars + +sub escape { + my $str=shift; + $str =~ s/(\W)/"%".unpack('H2',$1)/eg; + return $str; +} + +# ------------------------------------------- Code to evaluate dynamic metadata + +sub dynamicmeta { + my $url=&declutter(shift); + $url=~s/\.meta$//; + my %returnhash=( + 'count' => 0, + 'course' => 0, + 'course_list' => '', + 'avetries' => 'NULL', + 'avetries_list' => '', + 'stdno' => 0, + 'stdno_list' => '', + 'usage' => 0, + 'usage_list' => '', + 'goto' => 0, + 'goto_list' => '', + 'comefrom' => 0, + 'comefrom_list' => '', + 'difficulty' => 'NULL', + 'difficulty_list' => '', + 'clear' => 'NULL', + 'technical' => 'NULL', + 'correct' => 'NULL', + 'helpful' => 'NULL', + 'depth' => 'NULL', + 'comments' => '' + ); + my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//); + my $prodir=&propath($adomain,$aauthor); + +# Get metadata except counts + if (tie(my %evaldata,'GDBM_File', + $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) { + my %sum=(); + my %cnt=(); + my %concat=(); + my %listitems=( + 'course' => 'add', + 'goto' => 'add', + 'comefrom' => 'add', + 'avetries' => 'avg', + 'stdno' => 'add', + 'difficulty' => 'avg', + 'clear' => 'avg', + 'technical' => 'avg', + 'helpful' => 'avg', + 'correct' => 'avg', + 'depth' => 'avg', + 'comments' => 'app', + 'usage' => 'cnt' + ); + + my $regexp=$url; + $regexp=~s/(\W)/\\$1/g; + $regexp='___'.$regexp.'___([a-z]+)$'; + while (my ($esckey,$value)=each %evaldata) { + my $key=&unescape($esckey); + if ($key=~/$regexp/) { + my ($item,$purl,$cat)=split(/___/,$key); + if (defined($cnt{$cat})) { $cnt{$cat}++; } else { $cnt{$cat}=1; } + unless ($listitems{$cat} eq 'app') { + if (defined($sum{$cat})) { + $sum{$cat}+=&unescape($evaldata{$esckey}); + $concat{$cat}.=','.$item; + } else { + $sum{$cat}=&unescape($evaldata{$esckey}); + $concat{$cat}=$item; + } + } else { + if (defined($sum{$cat})) { + if ($evaldata{$esckey}=~/\w/) { + $sum{$cat}.='
'.&unescape($evaldata{$esckey}); + } + } else { + $sum{$cat}=''.&unescape($evaldata{$esckey}); + } + } + } + } + untie(%evaldata); +# transfer gathered data to returnhash, calculate averages where applicable + while (my $cat=each(%cnt)) { + if ($cnt{$cat} eq 'nan') { next; } + if ($sum{$cat} eq 'nan') { next; } + if ($listitems{$cat} eq 'avg') { + if ($cnt{$cat}) { + $returnhash{$cat}=int(($sum{$cat}/$cnt{$cat})*100.0+0.5)/100.0; + } else { + $returnhash{$cat}='NULL'; + } + } elsif ($listitems{$cat} eq 'cnt') { + $returnhash{$cat}=$cnt{$cat}; + } else { + $returnhash{$cat}=$sum{$cat}; + } + $returnhash{$cat.'_list'}=$concat{$cat}; + } + } +# get count + if (tie(my %evaldata,'GDBM_File', + $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) { + my $escurl=&escape($url); + if (! exists($evaldata{$escurl})) { + $returnhash{'count'}=0; + } else { + $returnhash{'count'}=$evaldata{$escurl}; + } + untie %evaldata; + } + return %returnhash; +} + +# --------------- Read loncapa_apache.conf and loncapa.conf and get variables +my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); my %perlvar=%{$perlvarref}; -undef $perlvarref; # remove since sensitive and not needed +undef $perlvarref; delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed # ------------------------------------- Only run if machine is a library server exit unless $perlvar{'lonRole'} eq 'library'; +# ----------------------------- Make sure this process is running from user=www + +my $wwwid=getpwnam('www'); +if ($wwwid!=$<) { + my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; + my $subj="LON: $perlvar{'lonHostID'} User ID mismatch"; + system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ + mailto $emailto -s '$subj' > /dev/null"); + exit 1; +} + + +# ---------------------------------------------------------- We are in business + +open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); +print LOG '==== Searchcat Run '.localtime()."====\n\n"; +$simplestatus='time='.time.'&'; my $dbh; # ------------------------------------- Make sure that database can be accessed { unless ( $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) ) { - print "Cannot connect to database!\n"; + print LOG "Cannot connect to database!\n"; + $simplestatus.='mysql=defunct'; + &writesimple(); exit; } + +# Make temporary table + $dbh->do("DROP TABLE IF EXISTS newmetadata"); + my $make_metadata_table = "CREATE TABLE IF NOT EXISTS newmetadata (". + "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ". + "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ". + "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ". + "copyright TEXT, dependencies TEXT, ". + "modifyinguser TEXT, authorspace TEXT, ". + "lowestgradelevel INTEGER UNSIGNED, highestgradelevel INTEGER UNSIGNED, ". + "standards TEXT, ". + "count INTEGER UNSIGNED, ". + "course INTEGER UNSIGNED, course_list TEXT, ". + "goto INTEGER UNSIGNED, goto_list TEXT, ". + "comefrom INTEGER UNSIGNED, comefrom_list TEXT, ". + "sequsage INTEGER UNSIGNED, sequsage_list TEXT, ". + "stdno INTEGER UNSIGNED, stdno_list TEXT, ". + "avetries FLOAT, avetries_list TEXT, ". + "difficulty FLOAT, difficulty_list TEXT, ". + "clear FLOAT, technical FLOAT, correct FLOAT, helpful FLOAT, depth FLOAT, ". + "comments TEXT, ". + "FULLTEXT idx_title (title), ". + "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ". + "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ". + "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ". + "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ". + "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ". + "FULLTEXT idx_copyright (copyright)) ". + "TYPE=MyISAM"; + # It would sure be nice to have some logging mechanism. + unless ($dbh->do($make_metadata_table)) { + print LOG "\nMySQL Error Create: ".$dbh->errstr."\n"; + die $dbh->errstr; + } } # ------------------------------------------------------------- get .meta files opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"); -my @homeusers=grep - {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")} - grep {!/^\.\.?$/} readdir(RESOURCES); +my @homeusers = grep { + &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_") + } grep {!/^\.\.?$/} readdir(RESOURCES); closedir RESOURCES; + +# +# Create the statement handlers we need + +my $insert_sth = $dbh->prepare + ("INSERT INTO newmetadata VALUES (". + "?,". # title + "?,". # author + "?,". # subject + "?,". # declutter url + "?,". # version + "?,". # current + "?,". # notes + "?,". # abstract + "?,". # mime + "?,". # language + "?,". # creationdate + "?,". # revisiondate + "?,". # owner + "?,". # copyright + "?,". # dependencies + "?,". # modifyinguser + "?,". # authorspace + "?,". # lowestgradelevel + "?,". # highestgradelevel + "?,". # standards + "?,". # count + "?,". # course + "?,". # course_list + "?,". # goto + "?,". # goto_list + "?,". # comefrom + "?,". # comefrom_list + "?,". # usage + "?,". # usage_list + "?,". # stdno + "?,". # stdno_list + "?,". # avetries + "?,". # avetries_list + "?,". # difficulty + "?,". # difficulty_list + "?,". # clear + "?,". # technical + "?,". # correct + "?,". # helpful + "?,". # depth + "?". # comments + ")" + ); + foreach my $user (@homeusers) { + print LOG "\n=== User: ".$user."\n\n"; + + my $prodir=&propath($perlvar{'lonDefDomain'},$user); + # Use find.pl + undef @metalist; + @metalist=(); &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); + # -- process each file to get metadata and put into search catalog SQL + # database. Also, check to see if already there. + # I could just delete (without searching first), but this works for now. + foreach my $m (@metalist) { + print LOG "- ".$m."\n"; + my $ref=&metadata($m); + my $m2='/res/'.&declutter($m); + $m2=~s/\.meta$//; + if ($ref->{'obsolete'}) { print LOG "obsolete\n"; next; } + if ($ref->{'copyright'} eq 'private') { print LOG "private\n"; next; } + my %dyn=&dynamicmeta($m2); + &count($m2); + unless ($insert_sth->execute( + $ref->{'title'}, + $ref->{'author'}, + $ref->{'subject'}, + $m2, + $ref->{'keywords'}, + 'current', + $ref->{'notes'}, + $ref->{'abstract'}, + $ref->{'mime'}, + $ref->{'language'}, + sqltime($ref->{'creationdate'}), + sqltime($ref->{'lastrevisiondate'}), + $ref->{'owner'}, + $ref->{'copyright'}, + $ref->{'dependencies'}, + $ref->{'modifyinguser'}, + $ref->{'authorspace'}, + $ref->{'lowestgradelevel'}, + $ref->{'highestgradelevel'}, + $ref->{'standards'}, + $dyn{'count'}, + $dyn{'course'}, + $dyn{'course_list'}, + $dyn{'goto'}, + $dyn{'goto_list'}, + $dyn{'comefrom'}, + $dyn{'comefrom_list'}, + $dyn{'usage'}, + $dyn{'usage_list'}, + $dyn{'stdno'}, + $dyn{'stdno_list'}, + $dyn{'avetries'}, + $dyn{'avetries_list'}, + $dyn{'difficulty'}, + $dyn{'difficulty_list'}, + $dyn{'clear'}, + $dyn{'technical'}, + $dyn{'correct'}, + $dyn{'helpful'}, + $dyn{'depth'}, + $dyn{'comments'} + )) { + print LOG "\nMySQL Error Insert: ".$dbh->errstr."\n"; + die $dbh->errstr; + } + $ref = undef; + } +} +# --------------------------------------------------- Close database connection +$dbh->do("DROP TABLE IF EXISTS metadata"); +unless ($dbh->do("RENAME TABLE newmetadata TO metadata")) { + print LOG "\nMySQL Error Rename: ".$dbh->errstr."\n"; + die $dbh->errstr; } +unless ($dbh->disconnect) { + print LOG "\nMySQL Error Disconnect: ".$dbh->errstr."\n"; + die $dbh->errstr; +} +print LOG "\n==== Searchcat completed ".localtime()." ====\n"; +close(LOG); +&writesimple(); +&writecount(); +exit 0; -# -- process each file to get metadata and put into search catalog SQL database -# Also, check to see if already there. -# I could just delete (without searching first), but this works for now. -foreach my $m (@metalist) { - my $ref=&metadata($m); - my $m2='/res/'.&declutter($m); - $m2=~s/\.meta$//; - my $q2="select * from metadata where url like binary '$m2'"; - my $sth = $dbh->prepare($q2); - $sth->execute(); - my $r1=$sth->fetchall_arrayref; - if (@$r1) { - $sth=$dbh->prepare("delete from metadata where url like binary '$m2'"); - $sth->execute(); - } - $sth=$dbh->prepare('insert into metadata values ('. - '"'.delete($ref->{'title'}).'"'.','. - '"'.delete($ref->{'author'}).'"'.','. - '"'.delete($ref->{'subject'}).'"'.','. - '"'.$m2.'"'.','. - '"'.delete($ref->{'keywords'}).'"'.','. - '"'.'current'.'"'.','. - '"'.delete($ref->{'notes'}).'"'.','. - '"'.delete($ref->{'abstract'}).'"'.','. - '"'.delete($ref->{'mime'}).'"'.','. - '"'.delete($ref->{'language'}).'"'.','. - '"'.sqltime(delete($ref->{'creationdate'})).'"'.','. - '"'.sqltime(delete($ref->{'lastrevisiondate'})).'"'.','. - '"'.delete($ref->{'owner'}).'"'.','. - '"'.delete($ref->{'copyright'}).'"'.')'); - $sth->execute(); -} - -# ----------------------------------------------------------- Clean up database -# Need to, perhaps, remove stale SQL database records. -# ... not yet implemented -# --------------------------------------------------- Close database connection -$dbh->disconnect; + +# ============================================================================= # ---------------------------------------------------------------- Get metadata # significantly altered from subroutine present in lonnet sub metadata { my ($uri,$what)=@_; - my %metacache; + my %metacache=(); $uri=&declutter($uri); my $filename=$uri; $uri=~s/\.meta$//; @@ -140,30 +471,30 @@ sub metadata { my $parser=HTML::TokeParser->new(\$metastring); my $token; while ($token=$parser->get_token) { - if ($token->[0] eq 'S') { - my $entry=$token->[1]; - my $unikey=$entry; - if (defined($token->[2]->{'part'})) { - $unikey.='_'.$token->[2]->{'part'}; - } - if (defined($token->[2]->{'name'})) { - $unikey.='_'.$token->[2]->{'name'}; - } - if ($metacache{$uri.'keys'}) { - $metacache{$uri.'keys'}.=','.$unikey; - } else { - $metacache{$uri.'keys'}=$unikey; - } - map { - $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; - } @{$token->[3]}; - unless ( - $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) - ) { $metacache{$uri.''.$unikey}= - $metacache{$uri.''.$unikey.'.default'}; - } - } - } + if ($token->[0] eq 'S') { + my $entry=$token->[1]; + my $unikey=$entry; + if (defined($token->[2]->{'part'})) { + $unikey.='_'.$token->[2]->{'part'}; + } + if (defined($token->[2]->{'name'})) { + $unikey.='_'.$token->[2]->{'name'}; + } + if ($metacache{$uri.'keys'}) { + $metacache{$uri.'keys'}.=','.$unikey; + } else { + $metacache{$uri.'keys'}=$unikey; + } + map { + $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; + } @{$token->[3]}; + unless ( + $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) + ) { $metacache{$uri.''.$unikey}= + $metacache{$uri.''.$unikey.'.default'}; + } + } + } } return \%metacache; } @@ -171,12 +502,12 @@ sub metadata { # ------------------------------------------------------------ Serves up a file # returns either the contents of the file or a -1 sub getfile { - my $file=shift; - if (! -e $file ) { return -1; }; - my $fh=IO::File->new($file); - my $a=''; - while (<$fh>) { $a .=$_; } - return $a + my $file=shift; + if (! -e $file ) { return -1; }; + my $fh=IO::File->new($file); + my $a=''; + while (<$fh>) { $a .=$_; } + return $a; } # ------------------------------------------------------------- Declutters URLs @@ -217,7 +548,39 @@ sub propath { # ---------------------------- convert 'time' format into a datetime sql format sub sqltime { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = - localtime(@_[0]); + localtime(&unsqltime(@_[0])); $mon++; $year+=1900; return "$year-$mon-$mday $hour:$min:$sec"; } + +sub maketime { + my %th=@_; + return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'}, + $th{'day'},$th{'month'}-1, + $th{'year'}-1900,0,0,$th{'dlsav'})); +} + + +######################################### +# +# Retro-fixing of un-backward-compatible time format + +sub unsqltime { + my $timestamp=shift; + if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) { + $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3, + 'hours'=>$4,'minutes'=>$5,'seconds'=>$6); + } + return $timestamp; +} + +# ----------------- Code to enable 'find' subroutine listing of the .meta files + +no strict "vars"; + +sub wanted { + (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && + -f _ && + /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && + push(@metalist,"$dir/$_"); +}