#!/usr/bin/perl
# The LearningOnline Network
# searchcat.pl "Search Catalog" batch script
#
# $Id: searchcat.pl,v 1.55 2004/04/08 15:57:32 matthew Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
###

=pod

=head1 NAME

B<searchcat.pl> - put authoritative filesystem data into sql database.

=head1 SYNOPSIS

Ordinarily this script is to be called from a loncapa cron job
(CVS source location: F<loncapa/loncom/cron/loncapa>; typical
filesystem installation location: F</etc/cron.d/loncapa>).

Here is the cron job entry.

    # Repopulate and refresh the metadata database used for the search catalog.
    10 1 * * 7    www    /home/httpd/perl/searchcat.pl

This script only allows itself to be run as the user C<www>.

=head1 DESCRIPTION

This script goes through a loncapa resource directory and gathers metadata.
The metadata is entered into a SQL database.

This script also does general database maintenance such as reformatting
the C<loncapa:metadata> table if it is deprecated.

This script evaluates dynamic metadata from the authors'
F<nohist_resevaldata.db> database file in order to store it in MySQL.

This script is playing an increasingly important role for a loncapa
library server.  The proper operation of this script is critical for a smooth
and correct user experience.

=cut

use strict;

use DBI;
use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
use LONCAPA::lonmetadata;

use IO::File;
use HTML::TokeParser;
use GDBM_File;
use POSIX qw(strftime mktime);
use File::Find;

##
## Use variables for table names so we can test this routine a little easier
my $oldname = 'metadata';
my $newname = 'newmetadata';

#
# Read loncapa_apache.conf and loncapa.conf
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
my %perlvar=%{$perlvarref};
undef $perlvarref;
delete $perlvar{'lonReceipt'}; # remove since sensitive (really?) & not needed
#
# Only run if machine is a library server
exit if ($perlvar{'lonRole'} ne 'library');
#
#  Make sure this process is running from user=www.
#  getpwnam returns undef when there is no 'www' account, and undef compares
#  numerically equal to 0; without the defined() test an invocation by root
#  (real uid 0) would slip past this check.
my $wwwid=getpwnam('www');
if (! defined($wwwid) || $wwwid!=$<) {
    my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
    my $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
 mailto $emailto -s '$subj' > /dev/null");
    exit 1;
}
#
# Let people know we are running.
# A log that cannot be opened is reported on STDERR but does not stop the
# run; later prints to LOG are then simply dropped.
my $logfile=$perlvar{'lonDaemons'}.'/logs/searchcat.log';
if (! open(LOG,'>',$logfile)) {
    warn "searchcat.pl: cannot open $logfile for writing: $!\n";
}
print LOG '==== Searchcat Run '.localtime()."====\n";
#
# Connect to database
my $dbh;
if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},
                          { RaiseError =>0,PrintError=>0}))) {
    print LOG "Cannot connect to database!\n";
    die "MySQL Error: Cannot connect to database!\n";
}
# This can return an error and still be okay, so we do not bother checking.
# (perhaps it should be more robust and check for specific errors)
$dbh->do('DROP TABLE IF EXISTS '.$newname);
#
# Create the new table
my $request = &LONCAPA::lonmetadata::create_metadata_storage($newname);
$dbh->do($request);
if ($dbh->err) {
    # Capture the message before any further DBI call: DBI resets err/errstr
    # at the start of most method calls, so reading errstr after disconnect()
    # would lose the reason the CREATE failed.
    my $create_error = $dbh->errstr;
    print LOG "\nMySQL Error Create: ".$create_error."\n";
    $dbh->disconnect();
    die $create_error;
}
#
# find out which users we need to examine
#
# The directory listing must succeed: with an unreadable resource directory
# the user list would be empty, and the script would go on to replace the
# live metadata table with an empty one.  Stop here instead.
my $resdir = "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}";
if (! opendir(RESOURCES,$resdir)) {
    my $opendir_error = "Cannot read resource directory $resdir: $!";
    print LOG $opendir_error."\n";
    die $opendir_error."\n";
}
my @homeusers = 
    grep {
        &ishome("$resdir/$_");
    } grep { 
        !/^\.\.?$/;
    } readdir(RESOURCES);
closedir RESOURCES;
#
# Loop through the users
foreach my $user (@homeusers) {
    print LOG "=== User: ".$user."\n";
    #
    # Use File::Find to get the files we need to read/modify.
    # 'preprocess' trims each directory listing to current .meta files and
    # subdirectories; 'wanted' stores one record per remaining entry.
    # (\&print_filename and \&log_metadata can be substituted for 'wanted'
    # when debugging.)
    find(
         {preprocess => \&only_meta_files,
          wanted     => \&process_meta_file,
          }, 
         "$resdir/$user");
}
#
# Swap the freshly populated table into place under the live table name
$dbh->do('DROP TABLE IF EXISTS '.$oldname);
unless ($dbh->do('RENAME TABLE '.$newname.' TO '.$oldname)) {
    print LOG "MySQL Error Rename: ".$dbh->errstr."\n";
    die $dbh->errstr;
}
unless ($dbh->disconnect) {
    print LOG "MySQL Error Disconnect: ".$dbh->errstr."\n";
    die $dbh->errstr;
}
##
## Finished!  Stamp the log, then write out the per-extension and
## per-copyright tallies gathered while the .meta files were processed.
my $finished_at = localtime();
print LOG "==== Searchcat completed ".$finished_at." ====\n";
close(LOG);

write_type_count();
write_copyright_count();

exit 0;

########################################################
########################################################
###                                                  ###
###          File::Find support routines             ###
###                                                  ###
########################################################
########################################################
##
## &only_meta_files(@names)
##
## 'preprocess' callback for File::Find.
## Receives the entries of the directory being scanned and returns the ones
## worth visiting, in their original order:
##   - names ending in .meta, except those of the form
##     <name>.<digits>.<ext>.meta (metadata kept for a prior version)
##   - anything that tests as a directory (-d), so the search can descend
sub only_meta_files {
    return grep {
        my $name = $_;
        my $is_current_meta = ($name =~ /\.meta$/ &&
                               $name !~ /\.\d+\.[^\.]+\.meta$/);
        $is_current_meta || (-d $name);
    } @_;
}

##
##
## Debugging routines, use these for 'wanted' in the File::Find call
##
## &print_filename
##   Writes one line to LOG for the current File::Find entry, saying whether
##   it is a directory or a file, followed by its full name
##   ($File::Find::name).  The entry name in $_ is put back before returning.
sub print_filename {
    my $entry = $_;
    my $label = (-d $entry) ? " Got directory " : " Got file ";
    print LOG $label.$File::Find::name."\n";
    $_ = $entry;
}

## &log_metadata
##   Debugging 'wanted' routine: writes the full name of the current
##   File::Find entry to LOG followed by every key/value pair that
##   &metadata returns for it ("No data" when &metadata returns undef),
##   and passes the 'copyright' value to &count_copyright.
##   Directories are skipped.
sub log_metadata {
    my $entry = $_;
    my $path  = $File::Find::name;
    if (-d $path) {
        return; # nothing to report for directories
    }
    print LOG $path."\n";
    my $meta = &metadata($path);
    unless (defined($meta)) {
        print LOG "    No data\n";
        return;
    }
    while (my ($field,$content) = each(%{$meta})) {
        print LOG "    ".$field." => ".$content."\n";
    }
    &count_copyright($meta->{'copyright'});
    # put the entry name back in $_ for File::Find
    $_ = $entry;
}


##
## process_meta_file
##   Called by File::Find as the 'wanted' routine.
##   Only input is the filename in $_ (full name in $File::Find::name).
##
##   Reads the .meta file, derives the resource url from its name, and
##   stores one record (version 'current') in the table named by $newname.
##   Nothing is stored for directories, for resources whose metadata marks
##   them obsolete, for resources whose copyright is 'private', or when no
##   metadata could be read.  The copyright value is tallied for every
##   non-obsolete resource; the file extension is tallied for every stored
##   resource whose url does not end in /default.
##
##   Dies (after logging) when the insert reports an error or inserts
##   fewer than one row.
sub process_meta_file {
    my ($file) = $_;
    my $filename = $File::Find::name;
    return if (-d $filename); # No need to do anything here for directories
    #
    print LOG $filename."\n";
    #
    my $ref=&metadata($filename);
    # Reading the file may overwrite the global $_; restore it right away so
    # that every return path below hands File::Find back the value it set.
    $_ = $file;
    if (! defined($ref)) {
        # The file could not be read; do not store an empty record for it
        print LOG "    No data\n";
        return;
    }
    #
    # $url is the original file url, not the metadata file
    my $url='/res/'.&declutter($filename);
    $url=~s/\.meta$//;
    print LOG "    ".$url."\n";
    #
    # Ignore some files based on their metadata
    if ($ref->{'obsolete'}) { 
        print LOG "obsolete\n"; 
        return; 
    }
    &count_copyright($ref->{'copyright'});
    if ($ref->{'copyright'} eq 'private') { 
        print LOG "private\n"; 
        return; 
    }
    #
    # Find the dynamic metadata
    # (the &dynamicmeta lookup is disabled, so %dyn stays empty)
    my %dyn;
    if ($url=~ m:/default$:) {
        # metadata named 'default' describes the directory itself
        $url=~ s:/default$:/:;
    } else {
        # %dyn=&dynamicmeta($url);
        &count_type($url);
    }
    #
    $ref->{'creationdate'}     = &sqltime($ref->{'creationdate'});
    $ref->{'lastrevisiondate'} = &sqltime($ref->{'lastrevisiondate'});
    # Later entries win: 'url' and 'version' override anything of the same
    # name that came from the .meta file
    my %Data = (
                %$ref,
                %dyn,
                'url'=>$url,
                'version'=>'current');
    my ($count,$err) = &LONCAPA::lonmetadata::store_metadata($dbh,$newname,
                                                             \%Data);
    if ($err) {
        print LOG "\nMySQL Error Insert: ".$err."\n";
        die $err;
    }
    if ($count < 1) {
        print LOG "Unable to insert record into MySQL database for $url\n";
        die "Unable to insert record into MySQl database for $url";
    } else {
        print LOG "Count = ".$count."\n";
    }
    #
    # Reset $_ before leaving
    $_ = $file;
}

########################################################
########################################################
###                                                  ###
###  &metadata($uri)                                 ###
###   Retrieve metadata for the given file           ###
###                                                  ###
########################################################
########################################################
##
## &metadata($uri)
##   $uri names a resource or its .meta file, as a filesystem path or a
##   path relative to /res ('.meta' is appended when missing).
##   Returns a reference to a hash describing the .meta file, or undef when
##   &getfile returns undef for it.
##
##   For every start tag, a key is built from the tag name plus, when
##   present, '_<part attribute>' and '_<name attribute>'.  The hash holds:
##     keys            comma separated list of all keys, in file order
##     <key>.<attr>    the value of each attribute of the tag
##     <key>           the text up to the matching end tag; when that text
##                     is false in Perl (empty or '0') the value of the
##                     'default' attribute is used instead
sub metadata {
    my ($uri) = @_;
    my $filename = &declutter($uri);
    unless ($filename =~ /\.meta$/) {
        $filename .= '.meta';
    }
    my $metastring = &getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
    return undef if (! defined($metastring));
    my %metacache = ();
    my $parser = HTML::TokeParser->new(\$metastring);
    while (my $token = $parser->get_token) {
        next if ($token->[0] ne 'S'); # only start tags are of interest
        my ($entry,$attributes,$attribute_order) = @{$token}[1,2,3];
        my $unikey = $entry;
        foreach my $qualifier ('part','name') {
            if (defined($attributes->{$qualifier})) {
                $unikey .= '_'.$attributes->{$qualifier};
            }
        }
        $metacache{'keys'} = $metacache{'keys'}
                           ? $metacache{'keys'}.','.$unikey
                           : $unikey;
        foreach my $attribute (@{$attribute_order}) {
            $metacache{$unikey.'.'.$attribute} = $attributes->{$attribute};
        }
        my $text = $parser->get_text('/'.$entry);
        $metacache{$unikey} = $text ? $text
                                    : $metacache{$unikey.'.default'};
    }
    return \%metacache;
}

##
## &getfile($filename)
##   Slurps up an entire file into a scalar.
##   Returns undef if the file does not exist or cannot be opened for
##   reading; returns '' for an empty file.
##   The caller's $_ and $/ are left untouched.
sub getfile {
    my $file = shift();
    if (! -e $file ) { 
        return undef; 
    }
    # An explicit read mode keeps characters in the filename from being
    # interpreted as an open mode.
    my $fh=IO::File->new($file,'r');
    if (! defined($fh)) {
        return undef;
    }
    # Read everything in one go.  $/ is localized so the change ends with
    # this sub, and no read loop assigns to the global $_.
    local $/ = undef;
    my $contents = <$fh>;
    $fh->close();
    return defined($contents) ? $contents : '';
}

########################################################
########################################################
###                                                  ###
###    Dynamic Metadata                              ###
###                                                  ###
########################################################
########################################################
##
## &dynamicmeta($url)
##   Gathers the evaluation data recorded for one resource from its author's
##   nohist_resevaldata.db and nohist_accesscount.db GDBM files.
##
##   $url is passed through &declutter and has a trailing '.meta' removed;
##   its first two path components are taken as the author's domain and
##   username and handed to &propath to locate the database files.
##
##   Returns a hash (as a list).  For each category found in the evaluation
##   data there is a '<category>' entry and a '<category>_list' entry, plus
##   'count' when the access count database could be opened.
##   Returns the one-element list (undef) when nohist_resevaldata.db cannot
##   be tied.
##   NOTE(review): assigning that (undef) to a hash gives a single
##   undef => undef pair; an empty list may have been intended - confirm
##   before re-enabling the (currently commented out) call.
sub dynamicmeta {
    my $url = &declutter(shift());
    $url =~ s/\.meta$//;
    # NOTE(review): %data is filled with default values but never read
    # again in this sub; presumably it was meant to seed the returned hash -
    # confirm.
    my %data = ('count'         => 0,
                'course'        => 0,
                'course_list'   => '',
                'avetries'      => 'NULL',
                'avetries_list' => '',
                'stdno'         => 0,
                'stdno_list'    => '',
                'usage'         => 0,
                'usage_list'    => '',
                'goto'          => 0,
                'goto_list'     => '',
                'comefrom'      => 0,
                'comefrom_list' => '',
                'difficulty'    => 'NULL',
                'difficulty_list' => '',
                'sequsage'      => '0',
                'sequsage_list' => '',
                'clear'         => 'NULL',
                'technical'     => 'NULL',
                'correct'       => 'NULL',
                'helpful'       => 'NULL',
                'depth'         => 'NULL',
                'comments'      => '',                
                );
    # domain and author are the first two path components of the url
    my ($dom,$auth)=($url=~/^(\w+)\/(\w+)\//);
    my $prodir=&propath($dom,$auth);
    #
    # Get metadata except counts
    my %evaldata;
    if (! tie(%evaldata,'GDBM_File',
              $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
        return (undef);
    }
    # Per category: running total (or joined text), number of entries seen,
    # and comma separated list of the items that contributed
    my %sum=();
    my %count=();
    my %concat=();
    # How the entries of each category are combined.  'add' has no branch of
    # its own below: it falls through to the final else and yields the sum.
    my %listitems=(
                   'course'       => 'add',
                   'goto'         => 'add',
                   'comefrom'     => 'add',
                   'avetries'     => 'average',
                   'stdno'        => 'add',
                   'difficulty'   => 'average',
                   'clear'        => 'average',
                   'technical'    => 'average',
                   'helpful'      => 'average',
                   'correct'      => 'average',
                   'depth'        => 'average',
                   'comments'     => 'append',
                   'usage'        => 'count'
                   );
    #
    # Build a pattern for (unescaped) keys that end in
    # ___<this url>___<category>; every non-word character of the url is
    # backslash-escaped so it matches literally
    my $regexp=$url;
    $regexp=~s/(\W)/\\$1/g;
    $regexp='___'.$regexp.'___([a-z]+)$';
    while (my ($esckey,$value)=each %evaldata) {
        my $key=&unescape($esckey);
        if ($key=~/$regexp/) {
            # the key has the form <item>___<url>___<category>
            my ($item,$purl,$cat)=split(/___/,$key);
            $count{$cat}++;
            if ($listitems{$cat} ne 'append') {
                # numeric categories: accumulate the value, remember the item
                if (defined($sum{$cat})) {
                    $sum{$cat}+=&unescape($value);
                    $concat{$cat}.=','.$item;
                } else {
                    $sum{$cat}=&unescape($value);
                    $concat{$cat}=$item;
                }
            } else {
                # text categories: join the entries with '<hr />'.  Entries
                # after the first are only added when they contain a word
                # character; %concat is not filled in for these categories.
                if (defined($sum{$cat})) {
                    if ($evaldata{$esckey}=~/\w/) {
                        $sum{$cat}.='<hr />'.&unescape($evaldata{$esckey});
                    }
                } else {
                    $sum{$cat}=''.&unescape($evaldata{$esckey});
		    }
            }
        }
    }
    untie(%evaldata);
    # transfer gathered data to returnhash, calculate averages where applicable
    my %returnhash;
    # each() in scalar context yields just the key
    while (my $cat=each(%count)) {
        # categories whose count or sum is the string 'nan' are left out
        if ($count{$cat} eq 'nan') { next; }
        if ($sum{$cat} eq 'nan') { next; }
        if ($listitems{$cat} eq 'average') {
            if ($count{$cat}) {
                # mean of the entries, rounded to two decimal places
                $returnhash{$cat}=int(($sum{$cat}/$count{$cat})*100.0+0.5)/100.0;
            } else {
                $returnhash{$cat}='NULL';
            }
        } elsif ($listitems{$cat} eq 'count') {
            $returnhash{$cat}=$count{$cat};
        } else {
            $returnhash{$cat}=$sum{$cat};
        }
        $returnhash{$cat.'_list'}=$concat{$cat};
    }
    #
    # get count
    # (this %evaldata is a new lexical that shadows the one untied above;
    # when the tie fails, 'count' is simply not set)
    if (tie(my %evaldata,'GDBM_File',
            $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
	my $escurl=&escape($url);
	if (! exists($evaldata{$escurl})) {
	    $returnhash{'count'}=0;
	} else {
	    $returnhash{'count'}=$evaldata{$escurl};
	}
	untie %evaldata;
    }
    return %returnhash;
}

########################################################
########################################################
###                                                  ###
###   Counts                                         ###
###                                                  ###
########################################################
########################################################
{

# Number of resources seen per (lower-cased) file extension
my %countext;

##
## &count_type($file)
##   Adds one to the tally for the extension of $file (the word characters
##   after the last '.' at the end of the name, lower-cased).  Names without
##   such an extension are tallied under the empty string.
##   Returns the tally as it was before this call.
sub count_type {
    my $file=shift;
    # Take the capture only from a match that succeeded: after a failed
    # match $1 still holds whatever an earlier match (possibly in the
    # caller) left behind, which would be counted as this file's extension.
    my $ext = '';
    if (defined($file) && $file=~/\.(\w+)$/) {
        $ext=lc($1);
    }
    return $countext{$ext}++;
}

##
## &write_type_count()
##   Writes the tallies to /home/httpd/html/lon-status/rescount.txt as a
##   single line of '<extension>=<count>&' pairs followed by 'time=<epoch>'.
##   Returns the result of closing the file, or 0 (after a warning on
##   STDERR) when the file cannot be opened.
sub write_type_count {
    my $outfile='/home/httpd/html/lon-status/rescount.txt';
    my $out;
    if (! open($out,'>',$outfile)) {
        warn "searchcat.pl: unable to write $outfile: $!\n";
        return 0;
    }
    while (my ($extension,$count) = each(%countext)) {
	print $out $extension.'='.$count.'&';
    }
    print $out 'time='.time."\n";
    return close($out);
}

} # end of scope for %countext

{

# Number of resources seen per copyright value
my %copyrights;

##
## &count_copyright($copyright)
##   Adds one to the tally for $copyright.  An undefined value is tallied
##   under the empty string.
##   Returns the tally as it was before this call.
sub count_copyright {
    # $_[0] is the single first argument (@_[0] would be a one-element slice)
    my $copyright = defined($_[0]) ? $_[0] : '';
    return $copyrights{$copyright}++;
}

##
## &write_copyright_count()
##   Writes the tallies to /home/httpd/html/lon-status/copyrightcount.txt as
##   a single line of '<copyright>=<count>&' pairs followed by
##   'time=<epoch>'.
##   Returns the result of closing the file, or 0 (after a warning on
##   STDERR) when the file cannot be opened.
sub write_copyright_count {
    my $outfile='/home/httpd/html/lon-status/copyrightcount.txt';
    my $out;
    if (! open($out,'>',$outfile)) {
        warn "searchcat.pl: unable to write $outfile: $!\n";
        return 0;
    }
    while (my ($copyright,$count) = each(%copyrights)) {
	print $out $copyright.'='.$count.'&';
    }
    print $out 'time='.time."\n";
    return close($out);
}

} # end of scope for %copyrights

########################################################
########################################################
###                                                  ###
###   Miscellaneous Utility Routines                 ###
###                                                  ###
########################################################
########################################################
##
## &ishome($path)
##   $path is the resource directory of an author,
##   <lonDocRoot>/res/<domain>/<username> (anything after the username is
##   ignored); a bare '<domain>/<username>' is accepted as well.
##   Returns 1 if the LON-CAPA directory that &propath gives for that
##   domain and username exists, 0 otherwise.
##   (copied from lond, modification of the return value)
sub ishome {
    my $author=shift;
    # Reduce the path to '<domain>/<username>'.  The configured document
    # root is tried first, since callers build their paths from it; the
    # stock /home/httpd/html location is still recognised as before.
    my $docroot=$perlvar{'lonDocRoot'};
    my $reduced=0;
    if (defined($docroot) && length($docroot)) {
        $reduced=($author=~s/\Q$docroot\E\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/);
    }
    if (! $reduced) {
        $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
    }
    my ($udom,$uname)=split(/\//,$author);
    my $proname=propath($udom,$uname);
    if (-e $proname) {
	return 1;
    } else {
        return 0;
    }
}

##
## &propath($udom,$uname)
##   Returns the path to the users LON-CAPA directory:
##     <lonUsersDir>/<domain>/<c1>/<c2>/<c3>/<username>
##   where c1..c3 are the first three characters of the username followed
##   by '__' (so short names are padded with underscores).  All non-word
##   characters are removed from both arguments first.
##   (copied from lond)
sub propath {
    my ($udom,$uname)=@_;
    foreach my $part ($udom,$uname) {
        $part=~s/\W//g;
    }
    my $padded=$uname.'__';
    # with fewer than three characters available the padded name is used as is
    my $subdir=(length($padded) >= 3)
              ? join('/',split(//,substr($padded,0,3)))
              : $padded;
    return join('/',$perlvar{'lonUsersDir'},$udom,$subdir,$uname);
} 

##
## &sqltime($timestamp)
##
## Convert $timestamp to MySQL time, YYYY-MM-DD HH:MM:SS.
##   - a string containing a date and time in that layout is returned
##     zero padded
##   - a string of digits is taken as seconds since the epoch and
##     converted with gmtime
##   - anything else is reported on LOG and 0 is returned
##
sub sqltime {
    my ($time) = @_;
    my $layout = '%04d-%02d-%02d %02d:%02d:%02d';
    # Some of the .meta files have the time in mysql format already
    my @fields = ($time =~ 
        /(\d+)-(\d+)-(\d+) # YYYY-MM-DD
        \s                 # a space
        (\d+):(\d+):(\d+)  # HH:MM::SS
        /x );
    if (@fields) {
        return sprintf($layout,@fields);
    }
    if ($time =~ /^\d+$/) {
        my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($time);
        # gmtime counts months from 0 and years from 1900
        return sprintf($layout,$year+1900,$mon+1,$mday,$hour,$min,$sec);
    }
    print LOG "    Unable to decode time ".$time."\n";
    return 0;
}

##
## &declutter($filename)
##   Given a filename, returns a url for the filename: a leading
##   $perlvar{'lonDocRoot'}, a leading '/' and a leading 'res/' are removed,
##   in that order.
sub declutter {
    my $thisfn=shift;
    # \Q..\E makes the document root match literally; unquoted, characters
    # such as '.' or '+' in the path would act as regex operators.
    $thisfn=~s/^\Q$perlvar{'lonDocRoot'}\E//;
    $thisfn=~s/^\///;
    $thisfn=~s/^res\///;
    return $thisfn;
}

##
## Escape / Unescape special characters
##
## &unescape($str)
##   Returns $str with every '%' followed by two hexadecimal digits
##   replaced by the character with that code.
sub unescape {
    my ($text) = @_;
    $text =~ s{%([a-fA-F0-9][a-fA-F0-9])}{
        my $code = hex($1);
        pack("C",$code);
    }eg;
    return $text;
}

## &escape($str)
##   Returns $str with every non-word character replaced by '%' and the
##   two (lower case) hexadecimal digits that unpack('H2') gives for it.
sub escape {
    my ($text) = @_;
    $text =~ s{(\W)}{
        my $hex = unpack('H2',$1);
        "%".$hex;
    }eg;
    return $text;
}