File:  [LON-CAPA] / loncom / metadata_database / searchcat.pl
Revision 1.77: download - view: text, annotated - select for diffs
Wed Jul 25 23:17:43 2007 UTC (16 years, 8 months ago) by raeburn
Branches: MAIN
CVS tags: version_2_9_99_0, version_2_8_X, version_2_8_99_0, version_2_8_2, version_2_8_1, version_2_8_0, version_2_7_X, version_2_7_99_1, version_2_7_99_0, version_2_7_1, version_2_7_0, version_2_6_X, version_2_6_99_1, version_2_6_99_0, version_2_6_3, version_2_6_2, version_2_6_1, version_2_6_0, version_2_5_X, version_2_5_99_1, version_2_5_99_0, version_2_5_2, version_2_5_1, version_2_5_0, version_2_4_99_0, bz6209-base, bz6209, bz5969, bz2851, HEAD, GCI_3, GCI_2, GCI_1, BZ5971-printing-apage, BZ5434-fox
searchcat.pl
- Additional MySQL table to build: allusers
- &descend_tree() arguments changed. $dom added as first arg, and reference to %allusers hash added as last arg.
- %allusers keys are usernames for all users in the domain which are not courses.
- user information put into allusers MySQL table.

LONCAPA/lonmetadata.pm
- description of allusers table
- &update_metadata(), &lookup_metadata() and &delete_metadata() modified to allow more flexibility in the WHERE condition in the SQL query (no longer forced to be url = ).
- &process_allusers_data() added to add/modify the contents of the allusers table.

#!/usr/bin/perl
# The LearningOnline Network
# searchcat.pl "Search Catalog" batch script
#
# $Id: searchcat.pl,v 1.77 2007/07/25 23:17:43 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
###

=pod

=head1 NAME

B<searchcat.pl> - put authoritative filesystem data into sql database.

=head1 SYNOPSIS

Ordinarily this script is to be called from a loncapa cron job
(CVS source location: F<loncapa/loncom/cron/loncapa>; typical
filesystem installation location: F</etc/cron.d/loncapa>).

Here is the cron job entry.

C<# Repopulate and refresh the metadata database used for the search catalog.>
C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>

This script only allows itself to be run as the user C<www>.

=head1 DESCRIPTION

This script goes through a loncapa resource directory and gathers metadata.
The metadata is entered into a SQL database.

This script also does general database maintenance such as reformatting
the C<loncapa:metadata> table if it is deprecated.

This script evaluates dynamic metadata from the authors'
F<nohist_resevaldata.db> database file in order to store it in MySQL.

This script is playing an increasingly important role for a loncapa
library server.  The proper operation of this script is critical for a smooth
and correct user experience.

=cut

use strict;
use DBI;
use lib '/home/httpd/lib/perl/';
use LONCAPA::lonmetadata;
use LONCAPA;
use Getopt::Long;
use IO::File;
use HTML::TokeParser;
use GDBM_File;
use POSIX qw(strftime mktime);

use Apache::lonnet();

use File::Find;

#
# Set up configuration options
#
# Command line switches (see the -help text below):
#   $help      print the usage text and exit
#   $simulate  do not modify the database
#   $oneuser   only compute for this user; forces $simulate
#   $verbose   numeric logging level compared against by &log()
#   $debug     turns on the additional debugging messages
my ($simulate,$oneuser,$help,$verbose,$logfile,$debug);
if (! GetOptions (
            'help'      => \$help,
            'simulate'  => \$simulate,
            'only=s'    => \$oneuser,
            'verbose=i' => \$verbose,
            'debug'     => \$debug,
            )) {
    # GetOptions has already reported the offending option on STDERR.
    # Stop here rather than carry on: a mistyped switch must not turn an
    # intended dry run into a real rebuild of the tables.
    print STDERR "$0: invalid options, use -help for usage\n";
    exit 1;
}

if ($help) {
    print <<"ENDHELP";
$0
Rebuild and update the LON-CAPA metadata database. 
Options:
    -help          Print this help
    -simulate      Do not modify the database.
    -only=user     Only compute for the given user.  Implies -simulate   
    -verbose=val   Sets logging level, val must be a number
    -debug         Turns on debugging output
ENDHELP
    exit 0;
}

# Switches that were not given default to "off".
if (! defined($debug)) {
    $debug = 0;
}

if (! defined($verbose)) {
    $verbose = 0;
}

# Processing a single user would leave the rebuilt tables incomplete,
# so -only always implies -simulate.
if (defined($oneuser)) {
    $simulate=1;
}

##
## Table names are held in variables so this routine is a little easier
## to test.  %oldnames maps a short key to the name of the live table.
my %oldnames = (
    'metadata'    => 'metadata',
    'portfolio'   => 'portfolio_metadata',
    'access'      => 'portfolio_access',
    'addedfields' => 'portfolio_addedfields',
    'allusers'    => 'allusers',
);

# %newnames holds, under the same keys, the names of the temporary tables
# built during this run: 'new' + live name + process id, so that the
# temporary tables of each run are uniquely named.
my %newnames = map { ($_ => 'new'.$oldnames{$_}.$$) } keys(%oldnames);

#
# Only run if machine is a library server
# On any other role the script stops right here, before anything is
# logged or any database connection is made (a bare exit reports status 0).
exit if ($Apache::lonnet::perlvar{'lonRole'} ne 'library');
#
#  Make sure this process is running from user=www
# ($< is the real user id of the current process; $wwwid is the user id
#  that getpwnam() reports for the account named 'www'.)
my $wwwid=getpwnam('www');
if ($wwwid!=$<) {
    # Not running as www: notify the addresses configured in lonAdmEMail
    # and lonSysEMail by piping echo into mail through the shell, then
    # give up with a non-zero exit status.
    my $emailto="$Apache::lonnet::perlvar{'lonAdmEMail'},$Apache::lonnet::perlvar{'lonSysEMail'}";
    my $subj="LON: $Apache::lonnet::perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
 mail -s '$subj' $emailto > /dev/null");
    exit 1;
}
#
# Let people know we are running
#
# LOG is the global handle that &log() prints to; it is opened in append
# mode here and closed once the run has completed.
my $logpath = $Apache::lonnet::perlvar{'lonDaemons'}.'/logs/searchcat.log';
if (! open(LOG,'>>',$logpath)) {
    # The run itself does not depend on the log, so carry on without
    # it - but say so instead of losing every message silently.
    warn "searchcat.pl: unable to append to $logpath: $!\n";
}
&log(0,'==== Searchcat Run '.localtime()."====");


# With -debug, record the settings this run is using.
if ($debug) {
    &log(0,'simulating') if ($simulate);
    &log(0,'only processing user '.$oneuser) if ($oneuser);
    &log(0,'verbosity level = '.$verbose);
}
#
# Connect to database
#
# RaiseError and PrintError are both switched off, so every DBI call
# below has to be checked by hand.
my $dbh;
if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$Apache::lonnet::perlvar{'lonSqlAccess'},
                          { RaiseError =>0,PrintError=>0}))) {
    &log(0,"Cannot connect to database!");
    die "MySQL Error: Cannot connect to database!\n";
}
# Remove any temporary table left over under the names this run will use.
# This can return an error and still be okay, so we do not bother checking.
# (perhaps it should be more robust and check for specific errors)
foreach my $key (keys(%newnames)) {
    if ($newnames{$key} ne '') {
        $dbh->do('DROP TABLE IF EXISTS '.$newnames{$key});
    }
}

#
# Create the new metadata, portfolio and allusers tables
foreach my $key (keys(%newnames)) {
    if ($newnames{$key} ne '') {
        my $request =
             &LONCAPA::lonmetadata::create_metadata_storage($newnames{$key},$oldnames{$key});
        $dbh->do($request);
        if ($dbh->err) {
            # Save the message first: the error state of the handle is
            # reset by the next method call, including disconnect().
            my $errstr = $dbh->errstr;
            &log(0,"MySQL Error Create: ".$errstr);
            $dbh->disconnect();
            die $errstr;
        }
    }
}

#
# find out which users we need to examine
#
# For every domain returned by current_machine_domains():
#   1. collect the entries of res/$dom for which &ishome() is true,
#   2. load each such user's dynamic metadata and walk their resource
#      tree with File::Find, handing every file to &process_meta_file,
#   3. process access controls and metadata for portfolio files,
#   4. add name and contact details of the users to the allusers table.
my @domains = sort(&Apache::lonnet::current_machine_domains());
&log(9,'domains ="'.join('","',@domains).'"');

foreach my $dom (@domains) {
    &log(9,'domain = '.$dom);
    my $resdir = "$Apache::lonnet::perlvar{'lonDocRoot'}/res/$dom";
    # A domain whose resource directory cannot be read simply has no
    # home users; its portfolio files and users are still handled below.
    my @homeusers;
    if (opendir(my $resources,$resdir)) {
        @homeusers =
            grep {
                &ishome($resdir.'/'.$_);
            } grep {
                !/^\.\.?$/;
            } readdir($resources);
        closedir($resources);
    } else {
        &log(0,"Unable to read resource directory $resdir: $!");
    }
    &log(5,'users = '.$dom.':'.join(',',@homeusers));
    #
    # -only=user replaces whatever was found with that single user
    if ($oneuser) {
        @homeusers=($oneuser);
    }
    #
    # Loop through the users
    foreach my $user (@homeusers) {
        &log(0,"=== User: ".$user);
        &process_dynamic_metadata($user,$dom);
        #
        # Use File::Find to get the files we need to read/modify
        find(
             {preprocess => \&only_meta_files,
              #wanted     => \&print_filename,
              #wanted     => \&log_metadata,
              wanted     => \&process_meta_file,
              no_chdir   => 1,
             }, join('/',($Apache::lonnet::perlvar{'lonDocRoot'},'res',$dom,$user)) );
    }
    # Search for all users and public portfolio files
    # (the keys of both hashes are usernames, the values are unused)
    my (%allusers,%portusers);
    if ($oneuser) {
        %portusers = (
                        $oneuser => '',
                       );
        %allusers = (
                        $oneuser => '',
                       );
    } else {
        my $dir = $Apache::lonnet::perlvar{lonUsersDir}.'/'.$dom;
        &descend_tree($dom,$dir,0,\%portusers,\%allusers);
    }
    foreach my $uname (keys(%portusers)) {
        my $urlstart = '/uploaded/'.$dom.'/'.$uname;
        my $pathstart = &propath($dom,$uname).'/userfiles';
        my $is_course = &Apache::lonnet::is_course($dom,$uname);
        my $curr_perm = &Apache::lonnet::get_portfile_permissions($dom,$uname);
        my %access = &Apache::lonnet::get_access_controls($curr_perm);
        foreach my $file (keys(%access)) {
            my ($group,$url,$fullpath);
            if ($is_course) {
                # For a course the key is split into a leading group
                # name and the remaining path, and the file is located
                # under groups/<group>/portfolio.
                ($group, my ($path)) = ($file =~ /^(\w+)(\/.+)$/);
                $fullpath = $pathstart.'/groups/'.$group.'/portfolio'.$path;
                $url = $urlstart.'/groups/'.$group.'/portfolio'.$path;
            } else {
                $fullpath = $pathstart.'/portfolio'.$file;
                $url = $urlstart.'/portfolio'.$file;
            }
            if (ref($access{$file}) eq 'HASH') {
                my %portaccesslog =
                    &LONCAPA::lonmetadata::process_portfolio_access_data($dbh,
                           $simulate,\%newnames,$url,$fullpath,$access{$file});
                &portfolio_logging(%portaccesslog);
            }
            my %portmetalog = &LONCAPA::lonmetadata::process_portfolio_metadata($dbh,$simulate,\%newnames,$url,$fullpath,$is_course,$dom,$uname,$group);
            &portfolio_logging(%portmetalog);
        }
    }
    # Update allusers
    foreach my $uname (keys(%allusers)) {
        my %userdata =
            &Apache::lonnet::get('environment',['firstname','lastname',
                'middlename','generation','id','permanentemail'],$dom,$uname);
        $userdata{'username'} = $uname;
        $userdata{'domain'} = $dom;
        my %alluserslog =
            &LONCAPA::lonmetadata::process_allusers_data($dbh,$simulate,
                \%newnames,$uname,$dom,\%userdata);
        # Every value returned is written to the log unconditionally.
        foreach my $item (keys(%alluserslog)) {
            &log(0,$alluserslog{$item});
        }
    }
}

#
# Rename the tables
#
# Unless this is a simulation, each live table is dropped and the table
# built during this run is renamed to take its place.
unless ($simulate) {
    foreach my $key (keys(%oldnames)) {
        my ($live,$fresh) = ($oldnames{$key},$newnames{$key});
        next if (($live eq '') || ($fresh eq ''));
        $dbh->do('DROP TABLE IF EXISTS '.$live);
        if ($dbh->do('RENAME TABLE '.$fresh.' TO '.$live)) {
            &log(1,"MySQL table rename successful for $key.");
        } else {
            &log(0,"MySQL Error Rename: ".$dbh->errstr);
            die $dbh->errstr;
        }
    }
}
unless ($dbh->disconnect) {
    &log(0,"MySQL Error Disconnect: ".$dbh->errstr);
    die $dbh->errstr;
}
##
## Finished!
&log(0,"==== Searchcat completed ".localtime()." ====");
close(LOG);

# The per-extension and per-copyright totals gathered during the run
# are written out after the log has been closed.
&write_type_count();
&write_copyright_count();

exit 0;

##
## &log($level,$message)
##
## Appends $message to the LOG handle when the -verbose setting is at
## least $level.
##
## $level 0 should be used for normal output and error messages; an
## undefined $level is treated as 0.
##
## $message does not need to end with \n (the value of $/ is appended).
## In the case of errors the message should contain as much information
## as possible to help in diagnosing the problem.
##
sub log {
    my ($threshold,$text) = @_;
    $threshold = 0 unless (defined($threshold));
    print LOG $text.$/ unless ($verbose < $threshold);
}

##
## &portfolio_logging(%portlog)
##   Takes a hash whose values may be hash references and writes every
##   value found inside those inner hashes to the log at level 0.
##   Values of %portlog that are not hash references are ignored.
sub portfolio_logging {
    my (%portlog) = @_;
    foreach my $entries (values(%portlog)) {
        next unless (ref($entries) eq 'HASH');
        foreach my $message (values(%{$entries})) {
            &log(0,$message);
        }
    }
}

##
## &descend_tree($dom,$dir,$depth,$allportusers,$alldomusers)
##   Walks the directory tree below $dir.  Entries found four levels
##   below the directory of the initial call (made with $depth 0) are
##   treated as usernames:
##     - a username is added to %$allportusers when its directory holds
##       a file_permissions.db file,
##     - a username is added to %$alldomusers when
##       &Apache::lonnet::is_course($dom,username) is false.
##   Entries whose names begin with a dot are skipped at every level.
##   Both hashes are filled in place (values are empty strings); nothing
##   useful is returned.
sub descend_tree {
    my ($dom,$dir,$depth,$allportusers,$alldomusers) = @_;
    return if (! -d $dir);
    # A lexical handle keeps each level of the recursion independent of
    # the others; a directory that cannot be opened is logged and skipped.
    my $dirhandle;
    if (! opendir($dirhandle,$dir)) {
        &log(0,"Unable to open directory $dir: $!");
        return;
    }
    my @contents = grep(!/^\./,readdir($dirhandle));
    closedir($dirhandle);
    $depth ++;
    foreach my $item (@contents) {
        if ($depth < 4) {
            &descend_tree($dom,$dir.'/'.$item,$depth,$allportusers,$alldomusers);
        } else {
            if (-e $dir.'/'.$item.'/file_permissions.db') {
                $$allportusers{$item} = '';
            }
            if (!&Apache::lonnet::is_course($dom,$item)) {
                $$alldomusers{$item} = '';
            }
        }
    }
    return;
}

########################################################
########################################################
###                                                  ###
###          File::Find support routines             ###
###                                                  ###
########################################################
########################################################
##
## &only_meta_files(@entries)
##
## 'preprocess' callback for File::Find.
## Receives the names found in the directory $File::Find::dir and returns
## the ones worth visiting:
##   - names ending in .meta, except those of the form
##     <name>.<digits>.<ext>.meta, which belong to a prior version;
##   - names of subdirectories.
sub only_meta_files {
    my $current_dir = $File::Find::dir;
    return grep {
        (/\.meta$/ && !/\.\d+\.[^\.]+\.meta$/) || (-d $current_dir."/".$_)
    } @_;
}

##
##
## Debugging routines, use these for 'wanted' in the File::Find call
##
## &print_filename
##   With -debug, logs (at level 5) whether the current File::Find entry
##   is a directory or a file, followed by its full name.
sub print_filename {
    my $saved = $_;
    my $fullfilename = $File::Find::name;
    if ($debug) {
        &log(5,((-d $saved) ? " Got directory " : " Got file ").$fullfilename);
    }
    # Leave $_ as it was for File::Find
    $_ = $saved;
}

## &log_metadata
##   With -debug, logs (at level 6) the name of the current File::Find
##   entry and every key/value pair returned for it by &metadata, then
##   counts its copyright.  Directories are ignored.
sub log_metadata {
    my $saved = $_;
    my $fullfilename = $File::Find::name;
    return if (-d $fullfilename); # directories have nothing to report
    if ($debug) {
        &log(6,$fullfilename);
        my $meta = &metadata($fullfilename);
        unless (defined($meta)) {
            &log(6,"    No data");
            return;
        }
        foreach my $key (keys(%{$meta})) {
            &log(6,"    ".$key." => ".$meta->{$key});
        }
        &count_copyright($meta->{'copyright'});
    }
    # Leave $_ as it was for File::Find
    $_ = $saved;
}

##
## &process_meta_file
##   'wanted' callback for File::Find; the entry to handle is taken from
##   $File::Find::name.  Directories are skipped.  For a metadata file
##   the static metadata is read, combined with the dynamic metadata of
##   the resource it describes and, unless simulating, stored in the new
##   metadata table.  Resources flagged obsolete, or whose copyright is
##   'private', are not stored.
sub process_meta_file {
    my $saved_topic = $_;
    my $metafile = $File::Find::name; # full name of the .meta file
    return if (-d $metafile); # nothing to store for a directory
    #
    &log(3,$metafile) if ($debug);
    #
    my $meta = &metadata($metafile);
    #
    # The url stored is that of the resource itself, so the .meta
    # suffix is removed before the url is built.
    (my $target = $metafile) =~ s/\.meta$//;
    my $url = '/res/'.&declutter($target);
    &log(3,"    ".$url) if ($debug);
    #
    # Some resources are left out because of what their metadata says
    if ($meta->{'obsolete'}) {
        &log(3,"obsolete") if ($debug);
        return;
    }
    # Private resources are still counted, but not stored
    &count_copyright($meta->{'copyright'});
    if ($meta->{'copyright'} eq 'private') {
        &log(3,"private") if ($debug);
        return;
    }
    #
    # Dynamic metadata: a url ending in /default is rewritten to end in
    # a slash and gets no dynamic data (and is not counted by type).
    my %dyn;
    if ($url =~ s:/default$:/:) {
        &log(3,"Skipping dynamic data") if ($debug);
    } else {
        &log(3,"Retrieving dynamic data") if ($debug);
        %dyn = &get_dynamic_metadata($url);
        &count_type($url);
    }
    &LONCAPA::lonmetadata::getfiledates($meta,$target);
    #
    # Later entries win: dynamic values override static ones, and the
    # url and version given here override both.
    my %Data = (
                %$meta,
                %dyn,
                'url'     => $url,
                'version' => 'current',
               );
    unless ($simulate) {
        my ($count,$err) =
          &LONCAPA::lonmetadata::store_metadata($dbh,$newnames{'metadata'},
                                                'metadata',\%Data);
        &log(0,"MySQL Error Insert: ".$err) if ($err);
        &log(0,"Unable to insert record into MySQL database for $url")
            if ($count < 1);
    }
    #
    # Leave $_ as it was for File::Find
    $_ = $saved_topic;
}

########################################################
########################################################
###                                                  ###
###  &metadata($uri)                                 ###
###   Retrieve metadata for the given file           ###
###                                                  ###
########################################################
########################################################
##
## &metadata($uri)
##   Reads the metadata file belonging to $uri ('.meta' is appended when
##   $uri does not already end in it) from below lonDocRoot/res and
##   parses it with HTML::TokeParser.
##   Returns undef when the file contents could not be obtained,
##   otherwise a reference to a hash with
##     keys          comma separated list of all entries seen
##     <entry>       text of the entry, or the value of its 'default'
##                   attribute when the text is empty
##     <entry>.<a>   value of attribute <a> of the entry
##   where <entry> is the tag name followed by _<part> and _<name> when
##   the tag carries those attributes.
sub metadata {
    my ($uri) = @_;
    my %metacache;
    my $filename = &declutter($uri);
    $filename .= '.meta' unless ($filename =~ /\.meta$/);
    my $metastring =
        &LONCAPA::lonmetadata::getfile($Apache::lonnet::perlvar{'lonDocRoot'}.'/res/'.$filename);
    return undef if (! defined($metastring));
    my $parser = HTML::TokeParser->new(\$metastring);
    while (my $token = $parser->get_token) {
        # Only start tags are of interest
        next if ($token->[0] ne 'S');
        my ($entry,$attr,$attrseq) = @{$token}[1,2,3];
        my $unikey = $entry;
        foreach my $qualifier ('part','name') {
            if (defined($attr->{$qualifier})) {
                $unikey .= '_'.$attr->{$qualifier};
            }
        }
        if ($metacache{'keys'}) {
            $metacache{'keys'} .= ','.$unikey;
        } else {
            $metacache{'keys'} = $unikey;
        }
        foreach my $attrname (@{$attrseq}) {
            $metacache{$unikey.'.'.$attrname} = $attr->{$attrname};
        }
        # Text up to the matching end tag; fall back on the 'default'
        # attribute when there is none.
        my $text = $parser->get_text('/'.$entry);
        $metacache{$unikey} = $text ? $text : $metacache{$unikey.'.default'};
    }
    return \%metacache;
}

########################################################
########################################################
###                                                  ###
###    Dynamic Metadata                              ###
###                                                  ###
########################################################
########################################################
##
## Dynamic metadata description (incomplete)
##
## For a full description of all fields,
## see LONCAPA::lonmetadata
##
##   Field             Type
##-----------------------------------------------------------
##   count             integer
##   course            integer
##   course_list       comma separated list of course ids
##   avetries          real                                
##   avetries_list     comma separated list of real numbers
##   stdno             real
##   stdno_list        comma separated list of real numbers
##   usage             integer   
##   usage_list        comma separated list of resources
##   goto              scalar
##   goto_list         comma separated list of resources
##   comefrom          scalar
##   comefrom_list     comma separated list of resources
##   difficulty        real
##   difficulty_list   comma separated list of real numbers
##   sequsage          scalar
##   sequsage_list     comma separated list of resources
##   clear             real
##   technical         real
##   correct           real
##   helpful           real
##   depth             real
##   comments          html of all the comments made
##
{

# Both hashes describe the user most recently handed to
# &process_dynamic_metadata and are read by &get_dynamic_metadata.
my %DynamicData;   # result of process_reseval_data(), plus 'domain'
my %Counts;        # unescaped access count key => access count

##
## &process_dynamic_metadata($user,$dom)
##   Discards the data held for the previous user, then reads the user's
##   nohist_resevaldata.db and nohist_accesscount.db files.
##   Returns 1 when both files could be read, and 0 as soon as one of
##   them cannot be tied (in which case the remaining steps are skipped).
sub process_dynamic_metadata {
    my ($user,$dom) = @_;
    undef(%DynamicData);
    undef(%Counts);
    #
    my $prodir = &propath($dom,$user);
    #
    # Read in the dynamic metadata
    my %evaldata;
    if (! tie(%evaldata,'GDBM_File',
              $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
        return 0;
    }
    #
    %DynamicData = &LONCAPA::lonmetadata::process_reseval_data(\%evaldata);
    untie(%evaldata);
    $DynamicData{'domain'} = $dom;
    #print('user = '.$user.' domain = '.$dom.$/);
    #
    # Read in the access count data
    &log(7,'Reading access count data') if ($debug);
    my %countdata;
    if (! tie(%countdata,'GDBM_File',
              $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
        return 0;
    }
    while (my ($key,$count) = each(%countdata)) {
        # Only keys starting with the domain name are kept.  The name is
        # quoted so that characters such as '.' in it are matched
        # literally instead of acting as regular expression operators.
        next if ($key !~ /^\Q$dom\E/);
        $key = &unescape($key);
        &log(8,'    Count '.$key.' = '.$count) if ($debug);
        $Counts{$key}=$count;
    }
    untie(%countdata);
    if ($debug) {
        &log(7,scalar(keys(%Counts)).
             " Counts read for ".$user."@".$dom);
        &log(7,scalar(keys(%DynamicData)).
             " Dynamic metadata read for ".$user."@".$dom);
    }
    #
    return 1;
}

##
## &get_dynamic_metadata($url)
##   Returns a hash with the dynamic metadata of $url, taken from the
##   data loaded by the last call to &process_dynamic_metadata.  A
##   leading /res/ is removed from $url before it is looked up.  The
##   'count' entry holds the access count (undefined when none is known).
sub get_dynamic_metadata {
    my ($url) = @_;
    $url =~ s:^/res/::;
    my %data = &LONCAPA::lonmetadata::process_dynamic_metadata($url,
                                                               \%DynamicData);
    # find the count
    $data{'count'} = $Counts{$url};
    #
    # Log the dynamic metadata
    if ($debug) {
        while (my($k,$v)=each(%data)) {
            &log(8,"    ".$k." => ".$v);
        }
    }
    return %data;
}

} # End of %DynamicData and %Counts scope

########################################################
########################################################
###                                                  ###
###   Counts                                         ###
###                                                  ###
########################################################
########################################################
{

# Number of resources seen so far, keyed by lower case file extension.
my %countext;

##
## &count_type($file)
##   Adds one to the count kept for the extension of $file (the word
##   characters after its last dot, lower cased).  A name without such
##   an extension is counted under the empty string.
##   Returns the count held for that extension before this call.
sub count_type {
    my $file=shift;
    # Only use the capture when this very match succeeded: after a
    # failed match $1 still holds whatever an earlier match left behind.
    my $ext = '';
    if (defined($file) && $file =~ /\.(\w+)$/) {
        $ext = lc($1);
    }
    return $countext{$ext}++;
}

##
## &write_type_count()
##   Writes the counts as extension=count pairs joined by '&', followed
##   by the current time, to lon-status/rescount.txt.  A file that
##   cannot be written is reported with a warning.
sub write_type_count {
    my $countfile = '/home/httpd/html/lon-status/rescount.txt';
    my $fh;
    if (! open($fh,'>',$countfile)) {
        warn "Unable to write $countfile: $!\n";
        return;
    }
    while (my ($extension,$count) = each(%countext)) {
        print $fh $extension.'='.$count.'&';
    }
    print $fh 'time='.time."\n";
    if (! close($fh)) {
        warn "Unable to close $countfile: $!\n";
    }
    return;
}

} # end of scope for %countext

{

# Number of resources seen so far, keyed by copyright value.
my %copyrights;

##
## &count_copyright($copyright)
##   Adds one to the count kept for $copyright.  An undefined value is
##   counted under the empty string.
##   Returns the count held for that value before this call.
sub count_copyright {
    my ($copyright) = @_;
    $copyright = '' if (! defined($copyright));
    return $copyrights{$copyright}++;
}

##
## &write_copyright_count()
##   Writes the counts as copyright=count pairs joined by '&', followed
##   by the current time, to lon-status/copyrightcount.txt.  A file that
##   cannot be written is reported with a warning.
sub write_copyright_count {
    my $countfile = '/home/httpd/html/lon-status/copyrightcount.txt';
    my $fh;
    if (! open($fh,'>',$countfile)) {
        warn "Unable to write $countfile: $!\n";
        return;
    }
    while (my ($copyright,$count) = each(%copyrights)) {
        print $fh $copyright.'='.$count.'&';
    }
    print $fh 'time='.time."\n";
    if (! close($fh)) {
        warn "Unable to close $countfile: $!\n";
    }
    return;
}

} # end of scope for %copyrights

########################################################
########################################################
###                                                  ###
###   Miscellaneous Utility Routines                 ###
###                                                  ###
########################################################
########################################################
##
## &ishome($respath)
##   $respath is the filesystem path of a directory below the resource
##   space, <document root>/res/<domain>/<username>[/...].  The document
##   root is taken from lonDocRoot; the path /home/httpd/html is still
##   recognised as well.  A string that is already of the form
##   <domain>/<username> is used as it is.
##   Returns 1 if the directory &propath() gives for that domain and
##   username exists, 0 otherwise.
##   (copied from lond, modification of the return value)
sub ishome {
    my $author=shift;
    # Reduce the path to "domain/username", trying the configured
    # document root before the historical hard-coded one.
    foreach my $root ($Apache::lonnet::perlvar{'lonDocRoot'},
                      '/home/httpd/html') {
        next if (! defined($root) || $root eq '');
        last if ($author =~ s{\Q$root\E/res/([^/]*)/([^/]*).*}{$1/$2});
    }
    my ($udom,$uname)=split(/\//,$author);
    my $proname=propath($udom,$uname);
    return (-e $proname) ? 1 : 0;
}

##
## &declutter($filename)
##   Given a filename, returns it with a leading lonDocRoot, a leading
##   slash and a leading res/ removed (in that order, each only when
##   present), e.g. <lonDocRoot>/res/dom/user/file becomes dom/user/file.
sub declutter {
    my $thisfn=shift;
    my $docroot = $Apache::lonnet::perlvar{'lonDocRoot'};
    $docroot = '' if (! defined($docroot));
    # The document root is a plain path, not a pattern, so it is quoted
    # before being used inside the regular expression.
    $thisfn=~s/^\Q$docroot\E//;
    $thisfn=~s/^\///;
    $thisfn=~s/^res\///;
    return $thisfn;
}


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>