loncom/metadata_database/searchcat.pl - diff

Return to searchcat.pl CVS log

Up to [LON-CAPA] / loncom / metadata_database

Diff for /loncom/metadata_database/searchcat.pl between versions 1.28 and 1.44

-version 1.28, 2003/02/03 05:39:37
+version 1.44, 2003/12/23 15:47:26
  Line 6
  #
  # Copyright Michigan State University Board of Trustees
  #
- # This file is part of the LearningOnline Network with a
+ # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
- # Computer assisted personalized approach (loncapa).
  #
- # Loncapa is free software; you can redistribute it and/or modify
+ # LON-CAPA is free software; you can redistribute it and/or modify
  # it under the terms of the GNU General Public License as published by
  # the Free Software Foundation; either version 2 of the License, or
  # (at your option) any later version.
  #
- # Loncapa is distributed in the hope that it will be useful,
+ # LON-CAPA is distributed in the hope that it will be useful,
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  # GNU General Public License for more details.
  #
  # You should have received a copy of the GNU General Public License
- # along with loncapa; if not, write to the Free Software
+ # along with LON-CAPA; if not, write to the Free Software
  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  #
  # /home/httpd/html/adm/gpl.txt
  #
- # http://www.loncapa.org/
+ # http://www.lon-capa.org/
- #
- # YEAR=2001
- # 04/14/2001, 04/16/2001 Scott Harrison
- #
- # YEAR=2002
- # 05/11/2002 Scott Harrison
- #
- # YEAR=2003
- # Scott Harrison
  #
  ###
- Line 53  filesystem installation location: F</etc
+ Line 43  filesystem installation location: F</etc
  Here is the cron job entry.
  C<# Repopulate and refresh the metadata database used for the search catalog.>
  C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
  This script only allows itself to be run as the user C<www>.
- Line 66  The metadata is entered into a SQL datab
+ Line 55  The metadata is entered into a SQL datab
  This script also does general database maintenance such as reformatting
  the C<loncapa:metadata> table if it is deprecated.
- This script also builds dynamic temporal metadata and stores this inside
+ This script evaluates dynamic metadata from the authors'
- a F<nohist_resevaldata.db> database file.
+ F<nohist_resevaldata.db> database file in order to store it in MySQL, as
+ well as to compress the filesize (add up all "count"-type metadata).
  This script is playing an increasingly important role for a loncapa
  library server.  The proper operation of this script is critical for a smooth
- Line 75  and correct user experience.
+ Line 65  and correct user experience.
  =cut
- # ========================================================== Setting things up.
- # ------------------------------------------------------  Use external modules.
  use lib '/home/httpd/lib/perl/';
  use LONCAPA::Configuration;
- Line 88  use DBI;
+ Line 74  use DBI;
  use GDBM_File;
  use POSIX qw(strftime mktime);
- # ----------------- Code to enable 'find' subroutine listing of the .meta files
- use File::Find;
- # List of .meta files (used on a per-user basis).
  my @metalist;
- # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables.
+ $simplestatus='';
- my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
+ my %countext=();
- my %perlvar = %{$perlvarref};
- undef($perlvarref); # Remove since sensitive and not needed.
- delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.
- # ------------------------------------- Only run if machine is a library server
- if ($perlvar{'lonRole'} ne 'library')
-   {
-     exit(0);
-   }
- # ------------------------------ Make sure this process is running as user=www.
- my $wwwid = getpwnam('www');
- if ($wwwid != $<)
-   {
-     $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
-     $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
-     system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | ".
- 	   "mailto $emailto -s '$subj' > /dev/null");
-     exit(1);
-   }
- # ------------------------------------------------------ Initialize log output.
- open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
- print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n");
- my $dbh; # Database object reference handle.
- # ----------------------------- Verify connection to loncapa:metadata database.
- unless (
- 	$dbh = DBI->connect('DBI:mysql:loncapa','www',
- 			    $perlvar{'lonSqlAccess'},
- 			    { RaiseError => 0,PrintError => 0})
- 	)
-   {
-     print(LOG '**** ERROR **** Cannot connect to database!'."\n");
-     exit(0);
-   }
- # ------------------------------ Create loncapa:metadata table if non-existent.
- my $make_metadata_table = 'CREATE TABLE IF NOT EXISTS metadata ('.
-     'title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, '.
-     'version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, '.
-     'creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, '.
-     'copyright TEXT, utilitysemaphore BOOL, FULLTEXT idx_title (title), '.
-     'FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), '.
-     'FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), '.
-     'FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), '.
-     'FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), '.
-     'FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), '.
-     'FULLTEXT idx_copyright (copyright)) TYPE=MYISAM';
- $dbh->do($make_metadata_table); # Generate the table.
- # ----------------------------- Verify format of the loncapa:metadata database.
- #                               (delete and recreate database if necessary).
- # Make a positive control for verifying table structure.
- my $make_metadata_table_CONTROL = $make_metadata_table;
- $make_metadata_table_CONTROL =~
-     s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;
- $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
- $dbh->do($make_metadata_table_CONTROL);
- my $table_description; # selectall reference to the table description.
- my $CONTROL_table_string; # What the table description should look like.
- my $table_string; # What the table description does look like.
- # Calculate the CONTROL table description (what it should be).
- $table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
- foreach my $table_row (@{$table_description})
-   {
-     $CONTROL_table_string .= join(',',@{$table_row})."\n";
-   }
- # Calculate the current table description (what it currently looks like).
- $table_description = $dbh->selectall_arrayref('describe metadata');
- foreach my $table_row (@{$table_description})
-   {
-     $table_string .= join(',',@{$table_row})."\n";
-   }
- if ($table_string ne $CONTROL_table_string)
-   {
-     # Log this incident.
-     print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'.
- 	  '.'."\n");
-     # Delete the table.
-     $dbh->do('DROP TABLE IF EXISTS metadata');
-     # Generate the table.
-     $dbh->do($make_metadata_table);
-   }
- $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay.  Done with control.
- # ----------------------------------------------- Set utilitysemaphore to zero.
- $dbh->do('UPDATE metadata SET utilitysemaphore = 0');
- # ========================================================= Main functionality.
- # - Determine home authors on this server based on resources dir and user tree.
- # RESOURCES: the resources directory (subdirs correspond to author usernames).
- opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or
-     (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n")
-      and exit(0));
- # query_home_server_status will look for user home directories on this machine.
- my @homeusers =
-     grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
- 				    $perlvar{'lonDefDomain'}.'/'.$_)
- 	  } grep {!/^\.\.?$/} readdir(RESOURCES);
- closedir(RESOURCES);
- unless (@homeusers)
-   {
-     print(LOG '=== No home users found on this server.'."\n");
-   }
- # Consider each author individually.
- foreach my $user (@homeusers)
-   {
-     # Make a log entry.
-     print(LOG "\n".'=== User: '.$user."\n\n");
-     # Get filesystem path to this user's directory.
-     my $user_directory =
- 	&construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);
-     # Remove left-over db-files from a potentially crashed searchcat run.
-     unlink($user_directory.'/nohist_new_resevaldata.db');
-     # Cleanup the metalist array.
-     undef(@metalist);
-     @metalist = ();
-     # This will add entries to the @metalist array.
-     &File::Find::find(\&wanted,
- 		      $perlvar{'lonDocRoot'}.'/res/'.
- 		      $perlvar{'lonDefDomain'}.'/'.$user);
-     # -- process file to get metadata and put into search catalog SQL database
-     # Also, build and store dynamic metadata.
-     # Also, delete record entries before refreshing.
-     foreach my $m (@metalist)
-       {
- 	# Log this action.
- 	print(LOG "- ".$m."\n");
- 	# Get metadata from the file.
- 	my $ref = get_metadata_from_file($m);
- 	# Make a datarecord identifier for this resource.
- 	my $m2 = '/res/'.declutter($m);
- 	$m2 =~ s/\.meta$//;
- 	# Build and store dynamic metadata inside nohist_resevaldata.db.
- 	build_on_the_fly_dynamic_metadata($m2);
- 	# Delete record if it already exists.
- 	my $q2 = 'select * from metadata where url like binary '."'".$m2."'";
- 	my $sth = $dbh->prepare($q2);
- 	$sth->execute();
- 	my $r1 = $sth->fetchall_arrayref;
- 	if (@$r1)
- 	  {
- 	    $sth =
- 		$dbh->prepare('delete from metadata where url like binary '.
- 			      "'".$m2."'");
- 	    $sth->execute();
- 	  }
- 	# Add new/replacement record into the loncapa:metadata table.
- 	$sth = $dbh->prepare('insert into metadata values ('.
- 			     '"'.delete($ref->{'title'}).'"'.','.
- 			     '"'.delete($ref->{'author'}).'"'.','.
- 			     '"'.delete($ref->{'subject'}).'"'.','.
- 			     '"'.$m2.'"'.','.
- 			     '"'.delete($ref->{'keywords'}).'"'.','.
- 			     '"'.'current'.'"'.','.
- 			     '"'.delete($ref->{'notes'}).'"'.','.
- 			     '"'.delete($ref->{'abstract'}).'"'.','.
- 			     '"'.delete($ref->{'mime'}).'"'.','.
- 			     '"'.delete($ref->{'language'}).'"'.','.
- 			     '"'.sql_formatted_time(
- 				       delete($ref->{'creationdate'})).'"'.','.
- 			     '"'.sql_formatted_time(
- 				   delete($ref->{'lastrevisiondate'})).'"'.','.
- 			     '"'.delete($ref->{'owner'}).'"'.','.
- 			     '"'.delete($ref->{'copyright'}).'"'.','.
- 			     '1'.')');
- 	$sth->execute();
-       }
- # ----------------------- Clean up database, remove stale SQL database records.
-     $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');
- # -------------------------------------------------- Copy over the new db-files
-     system('mv '.$user_directory.'/nohist_new_resevaldata.db '.
- 	         $user_directory.'/nohist_resevaldata.db');
-   }
- # --------------------------------------------------- Close database connection
- $dbh->disconnect;
- print LOG "\n==== Searchcat completed ".localtime()." ====\n";
- close(LOG);
- exit(0);
- # ================================================================ Subroutines.
- =pod
- =head1 SUBROUTINES
+ sub writesimple {
+     open(SMP,'>/home/httpd/html/lon-status/mysql.txt');
+     print SMP $simplestatus."\n";
+     close(SMP);
+ }
+ sub writecount {
+     open(RSMP,'>/home/httpd/html/lon-status/rescount.txt');
+     foreach (keys %countext) {
+ 	print RSMP $_.'='.$countext{$_}.'&';
+     }
+     print RSMP 'time='.time."\n";
+     close(RSMP);
+ }
+ sub count {
+     my $file=shift;
+     $file=~/\.(\w+)$/;
+     my $ext=lc($1);
+     if (defined($countext{$ext})) {
+ 	$countext{$ext}++;
+     } else {
+ 	$countext{$ext}=1;
+     }
+ }
+ # ----------------------------------------------------- Un-Escape Special Chars
- =cut
+ sub unescape {
+     my $str=shift;
- =pod
- B<unescape> - translate to unstrange escaped syntax to strange characters.
- =over 4
- Parameters:
- =item I<$str> - string with unweird characters.
- =back
- =over 4
- Returns:
- =item C<string> - string with potentially weird characters.
- =back
- =cut
- sub unescape ($)
-   {
-     my $str = shift(@_);
      $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
-     return($str);
+     return $str;
-   }
+ }
- =pod
- B<escape> - translate strange characters to unstrange escaped syntax.
- =over 4
- Parameters:
- =item I<$str> - string with potentially weird characters to unweird-ify.
+ # -------------------------------------------------------- Escape Special Chars
- =back
+ sub escape {
+     my $str=shift;
- =over 4
- Returns:
- =item C<string> - unweird-ified string.
- =back
- =cut
- sub escape ($)
-   {
-     my $str = shift(@_);
      $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
-     return($str);
+     return $str;
-   }
+ }
- =pod
- B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata.
- Dynamic metadata is stored in a nohist_resevaldata GDBM database.
- Most of the calculations in this subroutine are totally pointless
- and not useful for anything that this subroutine does.
- (THIS IS A FRUSTRATED SUBROUTINE THAT IS NON-OPTIMAL, *&*&!.)
- The only thing that this subroutine really makes happen is adjusting
- a 'count' value inside the F<nohist_new_resevaldata.db> as well
- as updating F<nohist_new_resevaldata.db> with information from
- F<nohist_resevaldata.db>.
- =over 4
- Parameters:
- =item I<$url> - the filesystem path (url may be a misnomer...)
- =back
+ # ------------------------------------------- Code to evaluate dynamic metadata
- =over 4
- Returns:
- =item C<hash> - key-value table of dynamically evaluated metadata.
- =back
- =cut
- sub build_on_the_fly_dynamic_metadata ($)
+ sub dynamicmeta {
-   {
+     my $url=&declutter(shift);
-     # BEWARE ALL WHO TRY TO UNDERSTAND THIS ABSURDLY HORRIBLE SUBROUTINE.
+     $url=~s/\.meta$//;
+     my %returnhash=();
-     # Do all sorts of mumbo-jumbo to compute the user's directory.
+     my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
-     my $url = &declutter(shift(@_));
+     my $prodir=&propath($adomain,$aauthor);
-     $url =~ s/\.meta$//;
+ # Get metadata except counts
-     my %returnhash = ();
+     if (tie(%evaldata,'GDBM_File',
-     my ($adomain,$aauthor) = ($url =~ m!^(\w+)/(\w+)/!);
+             $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
-     my $user_directory = &construct_path_to_user_directory($adomain,$aauthor);
+ 	my %sum=();
+ 	my %cnt=();
-     # Attempt a GDBM database instantiation inside users directory and proceed.
+ 	my %concat=();
-     if ((tie(%evaldata,'GDBM_File',
+ 	my %listitems=(
-             $user_directory.
- 	     '/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
-         (tie(%newevaldata,'GDBM_File',
- 	     $user_directory.
- 	     '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640)))
-       {
- 	# For different variables, track the running sum and counts.
- 	my %sum = ();
- 	my %cnt = ();
- 	# Define computed items as a sum (add) or an average (avg) or a raw
- 	# count (cnt) or 'app'?
- 	my %listitems=('count'        => 'add',
  		       'course'       => 'add',
+ 		       'goto'         => 'add',
+ 		       'comefrom'     => 'add',
  		       'avetries'     => 'avg',
  		       'stdno'        => 'add',
  		       'difficulty'   => 'avg',
- Line 441  sub build_on_the_fly_dynamic_metadata ($
+ Line 151  sub build_on_the_fly_dynamic_metadata ($
  		       'usage'        => 'cnt'
  		       );
- 	# Untaint the url and use as part of a regular expression.
+ 	my $regexp=$url;
- 	my $regexp = $url;
+ 	$regexp=~s/(\W)/\\$1/g;
- 	$regexp =~ s/(\W)/\\$1/g;
+ 	$regexp='___'.$regexp.'___([a-z]+)$';
- 	$regexp = '___'.$regexp.'___([a-z]+)$';
+ 	while (my ($esckey,$value)=each %evaldata) {
+ 	    $key=&unescape($esckey);
- 	# Check existing nohist database for this url.
+ 	    if ($key=~/$regexp/) {
- 	# THE ONLY TIME THIS IS IMPORTANT FOR THIS AWFUL SUBROUTINE
+ 		my ($item,$purl,$cat)=split(/___/,$_);
- 	# IS FOR 'count' ENTRIES
+ 		if (defined($cnt{$cat})) { $cnt{$cat}++; } else { $cnt{$cat}=1; }
- 	# AND FOR REFRESHING non-'count' ENTRIES INSIDE nohist_new DATABASE.
+ 		unless ($listitems{$cat} eq 'app') {
- 	foreach (keys %evaldata)
+ 		    if (defined($sum{$cat})) {
- 	  {
+ 			$sum{$cat}+=$evaldata{$_};
- 	    my $key = &unescape($_);
+ 			$concat{$cat}.=','.$item;
- 	    if ($key =~ /$regexp/) # If url-based entry exists.
+ 		    } else {
- 	      {
+ 			$sum{$cat}=$evaldata{$_};
- 		my $ctype = $1; # Set to specific category type.
+ 			$concat{$cat}=$item;
+ 		    }
- 		# Do an increment for this category type.
+ 		} else {
- 		if (defined($cnt{$ctype}))
+ 		    if (defined($sum{$cat})) {
- 		  {
+ 			if ($evaldata{$_}) {
- 		    $cnt{$ctype}++;
+ 			    $sum{$cat}.='<hr>'.$evaldata{$_};
- 		  }
+ 			}
- 		else
+ 		    } else {
- 		  {
+ 			$sum{$cat}=''.$evaldata{$_};
- 		    $cnt{$ctype} = 1;
+ 		    }
- 		  }
+ 		}
-                 unless ($listitems{$ctype} eq 'app') # WHAT DOES 'app' MEAN?
+ 	    }
- 		  {
+ 	}
- 		    # Increment the sum based on the evaluated data in the db.
+ 	untie(%evaldata);
- 		    if (defined($sum{$ctype}))
+     }
- 		      {
+ # construct the return hash for non-count data
- 			$sum{$ctype} += $evaldata{$_};
+     my %returnhash=();
- 		      }
+     while ($_=each(%cnt)) {
- 		    else
+ 	if ($listitems{$_} eq 'avg') {
- 		      {
+ 	    $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
- 			$sum{$ctype} = $evaldata{$_};
+ 	} elsif ($listitems{$_} eq 'cnt') {
- 		      }
+ 	    $returnhash{$_}=$cnt{$_};
-  		  }
+ 	} else {
- 		else # 'app' mode, means to use '<hr />' as a separator
+ 	    $returnhash{$_}=$sum{$_};
- 		  {
+ 	}
- 		    if (defined($sum{$ctype}))
+ 	$returnhash{$_.'_list'}=$concat{$_};
- 		      {
+     }
- 			if ($evaldata{$_})
+ # get count
- 			  {
+     if (tie(%evaldata,'GDBM_File',
- 			    $sum{$ctype} .= '<hr />'.$evaldata{$_};
+             $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
- 			  }
+ 	if (! exists($evaldata{$uri})) {
- 		      }
+ 	    $returnhash{'count'}='Not Available';
- 		    else
+ 	} else {
- 		      {
+ 	    $returnhash{'count'}=$evaldata{$uri};
- 			$sum{$ctype} = ''.$evaldata{$_};
+ 	}
- 		      }
+ 	untie %evaldata;
- 		  }
+     }
- 		if ($ctype ne 'count')
+     return %returnhash;
- 		  {
+ }
- 		    # ALERT!  THIS HORRIBLE LOOP IS ACTUALLY DOING SOMETHING
- 		    # USEFUL!
+ # ----------------- Code to enable 'find' subroutine listing of the .meta files
- 		    $newevaldata{$_} = $evaldata{$_};
+ require "find.pl";
- 		  }
+ sub wanted {
- 	      }
- 	  }
- 	# THE ONLY OTHER TIME THIS LOOP IS USEFUL IS FOR THE 'count' HASH
- 	# ELEMENT.
- 	foreach (keys %cnt)
- 	  {
- 	    if ($listitems{$_} eq 'avg')
- 	      {
- 		$returnhash{$_} = int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
- 	      }
- 	    elsif ($listitems{$_} eq 'cnt')
- 	      {
- 		$returnhash{$_} = $cnt{$_};
- 	      }
- 	    else
- 	      {
- 		$returnhash{$_} = $sum{$_};
- 	      }
- 	  }
- 	# A RARE MOMENT OF DOING ANYTHING USEFUL INSIDE THIS
- 	# BLEEPING SUBROUTINE.
- 	if ($returnhash{'count'})
- 	  {
- 	    my $newkey = $$.'_'.time.'_searchcat___'.&escape($url).'___count';
- 	    $newevaldata{$newkey} = $returnhash{'count'};
- 	  }
- 	untie(%evaldata); # Close/release the original nohist database.
- 	untie(%newevaldata); # Close/release the new nohist database.
-       }
-     return(%returnhash);
-     # Celebrate!  We have now accomplished some simple calculations using
-     # 1000% bloated functionality in our subroutine.  Go wash your eyeballs
-     # out now.
-   }
- =pod
- B<wanted> - used by B<File::Find::find> subroutine.
- This evaluates whether a file is wanted, and pushes it onto the
- I<@metalist> array.  This subroutine was, for the most part, auto-generated
- by the B<find2perl> command.
- =over 4
- Parameters:
- =item I<$file> - a path to the file.
- =back
- =over 4
- Returns:
- =item C<boolean> - true or false based on logical statement.
- =back
- =cut
- sub wanted ($)
-   {
      (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
-     -f $_ &&
+         -f _ &&
-     /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
+         /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
-     push(@metalist,$File::Find::dir.'/'.$_);
+         push(@metalist,"$dir/$_");
-   }
+ }
- =pod
+ # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables
+ my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
- B<get_metadata_from_file> - read xml-tagged file and return parsed metadata.
+ my %perlvar=%{$perlvarref};
+ undef $perlvarref; # remove since sensitive and not needed
- I<Note that this is significantly altered from a subroutine present in lonnet.>
+ delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed
- =over 4
- Parameters:
- =item I<$file> - a path.to the file.
- =back
- =over 4
- Returns:
- =item C<hash reference> - a hash array (keys and values).
- =back
- =cut
- sub get_metadata_from_file ($)
-   {
-     my ($filename) = @_;
-     my %metatable; # Used to store return value of hash-tabled metadata.
-     $filename = &declutter($filename); # Remove non-identifying filesystem info
-     my $uri = ''; # The URI is not relevant in this scenario.
-     unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
-       {
- 	$filename .= '.meta'; # Append a .meta suffix.
-       }
-     # Get the file contents.
-     my $metadata_string =
- 	&get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);
-     # Parse the file based on its XML tags.
-     my $parser = HTML::TokeParser->new(\$metadata_string);
-     my $token;
-     while ($token = $parser->get_token) # Loop through tokens.
-       {
- 	if ($token->[0] eq 'S') # If it is a start token.
- 	  {
- 	    my $entry = $token->[1];
- 	    my $unikey = $entry; # A unique identifier for this xml tag key.
- 	    if (defined($token->[2]->{'part'}))
- 	      {
- 		$unikey .= '_'.$token->[2]->{'part'};
- 	      }
- 	    if (defined($token->[2]->{'name'}))
- 	      {
- 		$unikey .= '_'.$token->[2]->{'name'};
- 	      }
- 	    # Append $unikey to metatable's keys entry.
- 	    if ($metatable{$uri.'keys'})
- 	      {
- 		$metatable{$uri.'keys'} .= ','.$unikey;
- 	      }
- 	    else
- 	      {
- 		$metatable{$uri.'keys'} = $unikey;
- 	      }
- 	    # Insert contents into metatable entry for the unikey.
- 	    foreach my $t3 (@{$token->[3]})
- 	      {
- 		$metatable{$uri.''.$unikey.'.'.$_} = $token->[2]->{$t3};
- 	      }
- 	    # If there was no text contained inside the tags, set = default.
- 	    unless
- 	      (
- 	        $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry)
- 	      )
- 	      {
- 		$metatable{$uri.''.$unikey} =
- 		    $metatable{$uri.''.$unikey.'.default'};
- 	      }
- 	  }
-       }
-     # Return with a key-value table of XML tags and their tag contents.
-     return(\%metatable);
-   }
- =pod
- B<get_file_contents> - returns either the contents of the file or a -1.
- =over 4
- Parameters:
- =item I<$file> - a complete filesystem path.to the file.
- =back
- =over 4
- Returns:
+ # ------------------------------------- Only run if machine is a library server
+ exit unless $perlvar{'lonRole'} eq 'library';
- =item C<string> - file contents or a -1.
- =back
- =cut
- sub get_file_contents ($)
-   {
-     my $file = shift(@_);
-     # If file does not exist, then return a -1 value.
-     unless (-e $file)
-       {
- 	return(-1);
-       }
-     # Read in file contents.
-     my $file_handle = IO::File->new($file);
-     my $file_contents = '';
-     while (<$file_handle>)
-       {
- 	$file_contents .= $_;
-       }
-     # Return file contents.
-     return($file_contents);
-   }
- =pod
- B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path).
- =over 4
- Parameters:
- =item I<$filesystem_path> - a complete filesystem path.
- =back
- =over 4
- Returns:
- =item C<string> - remnants of the filesystem path (beginning portion removed).
- =back
- =cut
- sub declutter
-   {
-     my $filesystem_path = shift(@_);
-     # Remove beginning portions of the filesystem path.
-     $filesystem_path =~ s/^$perlvar{'lonDocRoot'}//;
-     $filesystem_path =~ s!^/!!;
-     $filesystem_path =~ s!^res/!!;
-     # Return what is remaining for the filesystem path.
-     return($filesystem_path);
-   }
- =pod
- B<query_home_server_status> - Is this the home server of an author's directory?
- =over 4
- Parameters:
- =item I<$author_filesystem_path> - directory path for a user.
- =back
- =over 4
- Returns:
- =item C<boolean> - 1 if true; 0 if false.
- =back
- =cut
- sub query_home_server_status ($)
-   {
-     my $author_filesystem_path = shift(@_);
-     # Remove beginning portion of this filesystem path.
-     $author_filesystem_path =~ s!/home/httpd/html/res/([^/]*)/([^/]*).*!$1/$2!;
-     # Construct path to the author's ordinary user directory.
-     my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
-     my $user_directory_path = construct_path_to_user_directory($user_domain,
- 							       $username);
-     # Return status of whether the user directory path is defined.
-     if (-e $user_directory_path)
-       {
- 	return(1); # True.
-       }
-     else
-       {
-         return(0); # False.
-       }
-   }
- =pod
- B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir.
- =over 4
- Parameters:
- =item I<$user_domain> - the loncapa domain of the user.
- =item I<$username> - the unique username (user id) of the user.
- =back
- =over 4
- Returns:
- =item C<string> - representing the path on the filesystem.
- =back
- =cut
- sub construct_path_to_user_directory ($$)
-   {
-     my ($user_domain,$username) = @_;
-     # Untaint.
-     $user_domain =~ s/\W//g;
-     $username =~ s/\W//g;
-     # Create three levels of sub-directoried filesystem path
-     # based on the first three characters of the username.
-     my $sub_filesystem_path = $username.'__';
-     $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!;
-     # Use the sub-directoried levels and other variables to generate
-     # the complete filesystem path.
-     my $complete_filesystem_path =
- 	join('/',($perlvar{'lonUsersDir'},
- 		  $user_domain,
- 		  $sub_filesystem_path,
- 		  $username));
-     # Return the complete filesystem path.
-     return($complete_filesystem_path);
-   }
- =pod
- B<sql_formatted_time> (@) - turns seconds since epoch into datetime sql format.
- =over 4
- Parameters:
- =item I<$epochtime> - time in seconds since epoch (may need to be sanitized).
- =back
+ # ----------------------------- Make sure this process is running from user=www
- =over 4
+ my $wwwid=getpwnam('www');
+ if ($wwwid!=$<) {
+     $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
+     $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
+     system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
+  mailto $emailto -s '$subj' > /dev/null");
+     exit 1;
+ }
- Returns:
- =item C<string> - datetime sql formatted string.
+ # ---------------------------------------------------------- We are in business
- =back
+ open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
+ print LOG '==== Searchcat Run '.localtime()."====\n\n";
+ $simplestatus='time='.time.'&';
+ my $dbh;
+ # ------------------------------------- Make sure that database can be accessed
+ {
+     unless (
+ 	    $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})
+ 	    ) {
+ 	print LOG "Cannot connect to database!\n";
+ 	$simplestatus.='mysql=defunct';
+ 	&writesimple();
+ 	exit;
+     }
+     my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (".
+         "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ".
+         "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ".
+         "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ".
+         "copyright TEXT, FULLTEXT idx_title (title), ".
+         "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ".
+         "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ".
+         "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ".
+         "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ".
+         "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ".
+         "FULLTEXT idx_copyright (copyright)) TYPE=MYISAM";
+     # It would sure be nice to have some logging mechanism.
+     $dbh->do($make_metadata_table);
+ }
- =cut
+ # ------------------------------------------------------------- get .meta files
+ opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");
+ my @homeusers = grep {
+     &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")
+     } grep {!/^\.\.?$/} readdir(RESOURCES);
+ closedir RESOURCES;
+ #
+ # Create the statement handlers we need
+ my $delete_sth = $dbh->prepare
+     ("DELETE FROM metadata WHERE url LIKE BINARY ?");
+ my $insert_sth = $dbh->prepare
+     ("INSERT INTO metadata VALUES (".
+      "?,".   # title
+      "?,".   # author
+      "?,".   # subject
+      "?,".   # m2???
+      "?,".   # version
+      "?,".   # current
+      "?,".   # notes
+      "?,".   # abstract
+      "?,".   # mime
+      "?,".   # language
+      "?,".   # creationdate
+      "?,".   # revisiondate
+      "?,".   # owner
+      "?)"    # copyright
+      );
+ foreach my $user (@homeusers) {
+     print LOG "\n=== User: ".$user."\n\n";
+     my $prodir=&propath($perlvar{'lonDefDomain'},$user);
+     # Use find.pl
+     undef @metalist;
+     @metalist=();
+     &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");
+     # -- process each file to get metadata and put into search catalog SQL
+     # database.  Also, check to see if already there.
+     # I could just delete (without searching first), but this works for now.
+     foreach my $m (@metalist) {
+         print LOG "- ".$m."\n";
+         my $ref=&metadata($m);
+         my $m2='/res/'.&declutter($m);
+         $m2=~s/\.meta$//;
+         &dynamicmeta($m2);
+  	if ($ref->{'obsolete'}) { print LOG "obsolete\n"; next; }
+ 	if ($ref->{'copyright'} eq 'private') { print LOG "private\n"; next; }
+ 	&count($m2);
+         $delete_sth->execute($m2);
+         $insert_sth->execute($ref->{'title'},
+                              $ref->{'author'},
+                              $ref->{'subject'},
+                              $m2,
+                              $ref->{'keywords'},
+                              'current',
+                              $ref->{'notes'},
+                              $ref->{'abstract'},
+                              $ref->{'mime'},
+                              $ref->{'language'},
+                              sqltime($ref->{'creationdate'}),
+                              sqltime($ref->{'lastrevisiondate'}),
+                              $ref->{'owner'},
+                              $ref->{'copyright'});
+ #        if ($dbh->err()) {
+ #            print STDERR "Error:".$dbh->errstr()."\n";
+ #        }
+         $ref = undef;
+     }
+     # --------------------------------------------------- Clean up database
+     # Need to, perhaps, remove stale SQL database records.
+     # ... not yet implemented
+ }
+ # --------------------------------------------------- Close database connection
+ $dbh->disconnect;
+ print LOG "\n==== Searchcat completed ".localtime()." ====\n";
+ close(LOG);
+ &writesimple();
+ &writecount();
+ exit 0;
+ # =============================================================================
+ # ---------------------------------------------------------------- Get metadata
+ # significantly altered from subroutine present in lonnet
+ sub metadata {
+     my ($uri,$what)=@_;
+     my %metacache;
+     $uri=&declutter($uri);
+     my $filename=$uri;
+     $uri=~s/\.meta$//;
+     $uri='';
+     unless ($metacache{$uri.'keys'}) {
+         unless ($filename=~/\.meta$/) { $filename.='.meta'; }
+ 	my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
+         my $parser=HTML::TokeParser->new(\$metastring);
+         my $token;
+         while ($token=$parser->get_token) {
+             if ($token->[0] eq 'S') {
+                 my $entry=$token->[1];
+                 my $unikey=$entry;
+                 if (defined($token->[2]->{'part'})) {
+                     $unikey.='_'.$token->[2]->{'part'};
+                 }
+                 if (defined($token->[2]->{'name'})) {
+                     $unikey.='_'.$token->[2]->{'name'};
+                 }
+                 if ($metacache{$uri.'keys'}) {
+                     $metacache{$uri.'keys'}.=','.$unikey;
+                 } else {
+                     $metacache{$uri.'keys'}=$unikey;
+                 }
+                 map {
+                     $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};
+                 } @{$token->[3]};
+                 unless (
+                         $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)
+                         ) { $metacache{$uri.''.$unikey}=
+                                 $metacache{$uri.''.$unikey.'.default'};
+                         }
+             }
+         }
+     }
+     return \%metacache;
+ }
+ # ------------------------------------------------------------ Serves up a file
+ # returns either the contents of the file or a -1
+ sub getfile {
+     my $file=shift;
+     if (! -e $file ) { return -1; };
+     my $fh=IO::File->new($file);
+     my $a='';
+     while (<$fh>) { $a .=$_; }
+     return $a;
+ }
+ # ------------------------------------------------------------- Declutters URLs
+ sub declutter {
+     my $thisfn=shift;
+     $thisfn=~s/^$perlvar{'lonDocRoot'}//;
+     $thisfn=~s/^\///;
+     $thisfn=~s/^res\///;
+     return $thisfn;
+ }
+ # --------------------------------------- Is this the home server of an author?
+ # (copied from lond, modification of the return value)
+ sub ishome {
+     my $author=shift;
+     $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
+     my ($udom,$uname)=split(/\//,$author);
+     my $proname=propath($udom,$uname);
+     if (-e $proname) {
+ 	return 1;
+     } else {
+         return 0;
+     }
+ }
+ # -------------------------------------------- Return path to profile directory
+ # (copied from lond)
+ sub propath {
+     my ($udom,$uname)=@_;
+     $udom=~s/\W//g;
+     $uname=~s/\W//g;
+     my $subdir=$uname.'__';
+     $subdir =~ s/(.)(.)(.).*/$1\/$2\/$3/;
+     my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname";
+     return $proname;
+ }
- sub sql_formatted_time ($)
+ # ---------------------------- convert 'time' format into a datetime sql format
-   {
+ sub sqltime {
-     # Sanitize the time argument and convert to localtime array.
      my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
- 	localtime(&sanitize_time(shift(@_)));
+ 	localtime(&unsqltime(@_[0]));
+     $mon++; $year+=1900;
-     # Convert month from (0..11) to (1..12).
+     return "$year-$mon-$mday $hour:$min:$sec";
-     $mon += 1;
+ }
-     # Make the year compatible with A.D. specification.
+ sub maketime {
-     $year += 1900;
+     my %th=@_;
+     return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'},
-     # Return a date which is compatible with MySQL's "DATETIME" format.
+                           $th{'day'},$th{'month'}-1,
-     return(join('-',($year,$mon,$mday)).
+                           $th{'year'}-1900,0,0,$th{'dlsav'}));
- 	   ' '.
+ }
- 	   join(':',($hour,$min,$sec))
- 	   );
-   }
+ #########################################
+ #
+ # Retro-fixing of un-backward-compatible time format
- # ==================================== The following two subroutines are needed
- #                 for accommodating incorrect time formats inside the metadata.
+ sub unsqltime {
+     my $timestamp=shift;
- =pod
+     if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {
+         $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3,
- B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch.
+                              'hours'=>$4,'minutes'=>$5,'seconds'=>$6);
+     }
- =over 4
+     return $timestamp;
+ }
- Parameters:
- =item I<%time_metadata> - a key-value listing characterizing month, year, etc.
- =back
- =over 4
- Returns:
- =item C<integer> - seconds since epoch.
- =back
- =cut
- sub make_seconds_since_epoch (@)
-   {
-     # Keytable of time metadata.
-     my %time_metadata = @_;
-     # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC).
-     return(POSIX::mktime(
- 			 ($time_metadata{'seconds'},
- 			  $time_metadata{'minutes'},
- 			  $time_metadata{'hours'},
- 			  $time_metadata{'day'},
- 			  $time_metadata{'month'}-1,
- 			  $time_metadata{'year'}-1900,
-,
-,
- 			  $time_metadata{'dlsav'})
- 			 )
- 	   );
-   }
- =pod
- B<sanitize_time> - if time looks sql-formatted, make it seconds since epoch.
- Somebody described this subroutine as
- "retro-fixing of un-backward-compatible time format".
- What this means, is that a part of this code expects to get UTC seconds
- since the epoch (beginning of 1970).  Yet, some of the .meta files have
- sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch
- integers (e.g. 1044147435).  These time strings do not encode the timezone
- and, in this sense, can be considered "un-backwards-compatible".
- =over 4
- Parameters:
- =item I<$potentially_badformat_string> - string to "retro-fix".
- =back
- =over 4
- Returns:
- =item C<integer> - seconds since epoch.
- =back
- =cut
- sub sanitize_time ($)
-   {
-     my $timestamp = shift(@_);
-     # If timestamp is in this unexpected format....
-     if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/)
-       {
- 	# then convert into seconds since epoch (the expected format).
- 	$timestamp = &make_seconds_since_epoch(
- 					       'year' => $1,
- 					       'month' => $2,
- 					       'day' => $3,
- 					       'hours' => $4,
- 					       'minutes' => $5,
- 					       'seconds' => $6
- 					       );
-       }
-     # Otherwise we assume timestamp to be as expected.
-     return($timestamp);
-   }
- =pod
- =head1 AUTHOR
- Written to help the loncapa project.
- Scott Harrison, sharrison@users.sourceforge.net
- This is distributed under the same terms as loncapa (i.e. "freeware").
- =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.28
changed lines
	Added in v.1.44