Annotation of loncom/metadata_database/searchcat.pl, revision 1.30

1.1       harris41    1: #!/usr/bin/perl
                      2: # The LearningOnline Network
                      3: # searchcat.pl "Search Catalog" batch script
1.16      harris41    4: #
1.30    ! www         5: # $Id: searchcat.pl,v 1.29 2003/02/03 13:42:16 albertel Exp $
1.16      harris41    6: #
                      7: # Copyright Michigan State University Board of Trustees
                      8: #
1.29      albertel    9: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
1.16      harris41   10: #
1.29      albertel   11: # LON-CAPA is free software; you can redistribute it and/or modify
1.16      harris41   12: # it under the terms of the GNU General Public License as published by
                     13: # the Free Software Foundation; either version 2 of the License, or
                     14: # (at your option) any later version.
                     15: #
1.29      albertel   16: # LON-CAPA is distributed in the hope that it will be useful,
1.16      harris41   17: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     19: # GNU General Public License for more details.
                     20: #
                     21: # You should have received a copy of the GNU General Public License
1.29      albertel   22: # along with LON-CAPA; if not, write to the Free Software
1.16      harris41   23: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     24: #
                     25: # /home/httpd/html/adm/gpl.txt
                     26: #
1.29      albertel   27: # http://www.lon-capa.org/
1.16      harris41   28: #
                     29: # YEAR=2001
1.17      harris41   30: # 04/14/2001, 04/16/2001 Scott Harrison
1.16      harris41   31: #
1.17      harris41   32: # YEAR=2002
                     33: # 05/11/2002 Scott Harrison
1.16      harris41   34: #
1.28      harris41   35: # YEAR=2003
                     36: # Scott Harrison
                     37: #
1.16      harris41   38: ###
1.1       harris41   39: 
1.28      harris41   40: =pod
                     41: 
                     42: =head1 NAME
                     43: 
                     44: B<searchcat.pl> - put authoritative filesystem data into sql database.
                     45: 
                     46: =head1 SYNOPSIS
                     47: 
                     48: Ordinarily this script is to be called from a loncapa cron job
                     49: (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
                     50: filesystem installation location: F</etc/cron.d/loncapa>).
                     51: 
                     52: Here is the cron job entry.
                     53: 
                     54: C<# Repopulate and refresh the metadata database used for the search catalog.>
                     55: 
                     56: C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
                     57: 
                     58: This script only allows itself to be run as the user C<www>.
                     59: 
                     60: =head1 DESCRIPTION
                     61: 
                     62: This script goes through a loncapa resource directory and gathers metadata.
                     63: The metadata is entered into a SQL database.
                     64: 
                     65: This script also does general database maintenance such as reformatting
                     66: the C<loncapa:metadata> table if it is deprecated.
                     67: 
                     68: This script also builds dynamic temporal metadata and stores this inside
                     69: a F<nohist_resevaldata.db> database file.
                     70: 
                     71: This script is playing an increasingly important role for a loncapa
                     72: library server.  The proper operation of this script is critical for a smooth
                     73: and correct user experience.
                     74: 
                     75: =cut
                     76: 
                     77: # ========================================================== Setting things up.
                     78: 
                     79: # ------------------------------------------------------  Use external modules.
1.1       harris41   80: 
1.17      harris41   81: use lib '/home/httpd/lib/perl/';
                     82: use LONCAPA::Configuration;
                     83: 
1.1       harris41   84: use IO::File;
                     85: use HTML::TokeParser;
1.6       harris41   86: use DBI;
1.21      www        87: use GDBM_File;
1.24      www        88: use POSIX qw(strftime mktime);
1.1       harris41   89: 
1.28      harris41   90: # ----------------- Code to enable 'find' subroutine listing of the .meta files
                     91: use File::Find;
                     92: 
                     93: # List of .meta files (used on a per-user basis).
1.1       harris41   94: my @metalist;
1.21      www        95: 
1.28      harris41   96: # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables.
                     97: my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
                     98: my %perlvar = %{$perlvarref};
                     99: undef($perlvarref); # Remove since sensitive and not needed.
                    100: delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.
                    101: 
                    102: # ------------------------------------- Only run if machine is a library server
                    103: if ($perlvar{'lonRole'} ne 'library')
                    104:   {
                    105:     exit(0);
                    106:   }
                    107: 
                    108: # ------------------------------ Make sure this process is running as user=www.
                    109: my $wwwid = getpwnam('www');
                    110: if ($wwwid != $<)
                    111:   {
                    112:     $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
                    113:     $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
                    114:     system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | ".
                    115: 	   "mailto $emailto -s '$subj' > /dev/null");
                    116:     exit(1);
                    117:   }
                    118: 
                    119: # ------------------------------------------------------ Initialize log output.
                    120: open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
                    121: print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n");
                    122: 
                    123: my $dbh; # Database object reference handle.
                    124: 
                    125: # ----------------------------- Verify connection to loncapa:metadata database.
                    126: unless (
                    127: 	$dbh = DBI->connect('DBI:mysql:loncapa','www',
                    128: 			    $perlvar{'lonSqlAccess'},
                    129: 			    { RaiseError => 0,PrintError => 0})
                    130: 	)
                    131:   { 
                    132:     print(LOG '**** ERROR **** Cannot connect to database!'."\n");
                    133:     exit(0);
                    134:   }
                    135: 
                    136: # ------------------------------ Create loncapa:metadata table if non-existent.
                    137: my $make_metadata_table = 'CREATE TABLE IF NOT EXISTS metadata ('.
                    138:     'title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, '.
                    139:     'version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, '.
                    140:     'creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, '.
                    141:     'copyright TEXT, utilitysemaphore BOOL, FULLTEXT idx_title (title), '.
                    142:     'FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), '.
                    143:     'FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), '.
                    144:     'FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), '.
                    145:     'FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), '.
                    146:     'FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), '.
                    147:     'FULLTEXT idx_copyright (copyright)) TYPE=MYISAM';
                    148: 
                    149: $dbh->do($make_metadata_table); # Generate the table.
                    150: 
                    151: # ----------------------------- Verify format of the loncapa:metadata database.
                    152: #                               (delete and recreate database if necessary).
                    153: 
                    154: # Make a positive control for verifying table structure.
                    155: my $make_metadata_table_CONTROL = $make_metadata_table;
                    156: $make_metadata_table_CONTROL =~
                    157:     s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;
                    158: 
                    159: $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
                    160: $dbh->do($make_metadata_table_CONTROL);
                    161: 
                    162: my $table_description; # selectall reference to the table description.
                    163: 
                    164: my $CONTROL_table_string; # What the table description should look like.
                    165: my $table_string; # What the table description does look like.
                    166: 
                    167: # Calculate the CONTROL table description (what it should be).
                    168: $table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
                    169: foreach my $table_row (@{$table_description})
                    170:   {
                    171:     $CONTROL_table_string .= join(',',@{$table_row})."\n";
                    172:   }
                    173: 
                    174: # Calculate the current table description (what it currently looks like).
                    175: $table_description = $dbh->selectall_arrayref('describe metadata');
                    176: foreach my $table_row (@{$table_description})
                    177:   {
                    178:     $table_string .= join(',',@{$table_row})."\n";
                    179:   }
                    180: 
                    181: if ($table_string ne $CONTROL_table_string)
                    182:   {
                    183:     # Log this incident.
                    184:     print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'.
                    185: 	  '.'."\n");
                    186:     # Delete the table.
                    187:     $dbh->do('DROP TABLE IF EXISTS metadata');
                    188:     # Generate the table.
                    189:     $dbh->do($make_metadata_table);
                    190:   }
1.21      www       191: 
1.28      harris41  192: $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay.  Done with control.
1.21      www       193: 
1.28      harris41  194: # ----------------------------------------------- Set utilitysemaphore to zero.
                    195: $dbh->do('UPDATE metadata SET utilitysemaphore = 0');
                    196: 
                    197: # ========================================================= Main functionality.
                    198: 
                    199: # - Determine home authors on this server based on resources dir and user tree.
                    200: 
                    201: # RESOURCES: the resources directory (subdirs correspond to author usernames).
                    202: opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or
                    203:     (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n")
                    204:      and exit(0));
                    205: 
                    206: # query_home_server_status will look for user home directories on this machine.
                    207: my @homeusers =
                    208:     grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
                    209: 				    $perlvar{'lonDefDomain'}.'/'.$_)
                    210: 	  } grep {!/^\.\.?$/} readdir(RESOURCES);
                    211: closedir(RESOURCES);
                    212: 
                    213: unless (@homeusers)
                    214:   {
                    215:     print(LOG '=== No home users found on this server.'."\n");
                    216:   }
                    217: 
                    218: # Consider each author individually.
                    219: foreach my $user (@homeusers)
                    220:   {
                    221:     # Make a log entry.
                    222:     print(LOG "\n".'=== User: '.$user."\n\n");
                    223: 
                    224:     # Get filesystem path to this user's directory.
                    225:     my $user_directory =
                    226: 	&construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);
                    227: 
                    228:     # Remove left-over db-files from a potentially crashed searchcat run.
                    229:     unlink($user_directory.'/nohist_new_resevaldata.db');
                    230: 
                    231:     # Cleanup the metalist array.
                    232:     undef(@metalist);
                    233:     @metalist = ();
                    234: 
                    235:     # This will add entries to the @metalist array.
                    236:     &File::Find::find(\&wanted,
                    237: 		      $perlvar{'lonDocRoot'}.'/res/'.
                    238: 		      $perlvar{'lonDefDomain'}.'/'.$user);
                    239: 
                    240:     # -- process file to get metadata and put into search catalog SQL database
                    241:     # Also, build and store dynamic metadata.
                    242:     # Also, delete record entries before refreshing.
                    243:     foreach my $m (@metalist)
                    244:       {
                    245: 	# Log this action.
                    246: 	print(LOG "- ".$m."\n");
                    247: 
                    248: 	# Get metadata from the file.
                    249: 	my $ref = get_metadata_from_file($m);
                    250: 
                    251: 	# Make a datarecord identifier for this resource.
                    252: 	my $m2 = '/res/'.declutter($m);
                    253: 	$m2 =~ s/\.meta$//;
                    254: 
                    255: 	# Build and store dynamic metadata inside nohist_resevaldata.db.
                    256: 	build_on_the_fly_dynamic_metadata($m2);
                    257: 
                    258: 	# Delete record if it already exists.
                    259: 	my $q2 = 'select * from metadata where url like binary '."'".$m2."'";
                    260: 	my $sth = $dbh->prepare($q2);
                    261: 	$sth->execute();
                    262: 	my $r1 = $sth->fetchall_arrayref;
                    263: 	if (@$r1)
                    264: 	  {
                    265: 	    $sth = 
                    266: 		$dbh->prepare('delete from metadata where url like binary '.
                    267: 			      "'".$m2."'");
                    268: 	    $sth->execute();
                    269: 	  }
                    270: 
                    271: 	# Add new/replacement record into the loncapa:metadata table.
                    272: 	$sth = $dbh->prepare('insert into metadata values ('.
                    273: 			     '"'.delete($ref->{'title'}).'"'.','.
                    274: 			     '"'.delete($ref->{'author'}).'"'.','.
                    275: 			     '"'.delete($ref->{'subject'}).'"'.','.
                    276: 			     '"'.$m2.'"'.','.
                    277: 			     '"'.delete($ref->{'keywords'}).'"'.','.
                    278: 			     '"'.'current'.'"'.','.
                    279: 			     '"'.delete($ref->{'notes'}).'"'.','.
                    280: 			     '"'.delete($ref->{'abstract'}).'"'.','.
                    281: 			     '"'.delete($ref->{'mime'}).'"'.','.
                    282: 			     '"'.delete($ref->{'language'}).'"'.','.
                    283: 			     '"'.sql_formatted_time(
                    284: 				       delete($ref->{'creationdate'})).'"'.','.
                    285: 			     '"'.sql_formatted_time(
                    286: 				   delete($ref->{'lastrevisiondate'})).'"'.','.
                    287: 			     '"'.delete($ref->{'owner'}).'"'.','.
                    288: 			     '"'.delete($ref->{'copyright'}).'"'.','.
                    289: 			     '1'.')');
                    290: 	$sth->execute();
                    291:       }
                    292: 
                    293: # ----------------------- Clean up database, remove stale SQL database records.
                    294:     $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');
                    295: 
                    296: # -------------------------------------------------- Copy over the new db-files
                    297:     system('mv '.$user_directory.'/nohist_new_resevaldata.db '.
                    298: 	         $user_directory.'/nohist_resevaldata.db');
                    299:   }
                    300: 
                    301: # --------------------------------------------------- Close database connection
                    302: $dbh->disconnect;
                    303: print LOG "\n==== Searchcat completed ".localtime()." ====\n";
                    304: close(LOG);
                    305: exit(0);
                    306: 
                    307: # ================================================================ Subroutines.
                    308: 
                    309: =pod
                    310: 
                    311: =head1 SUBROUTINES
                    312: 
                    313: =cut
                    314: 
                    315: =pod
                    316: 
                    317: B<unescape> - translate to unstrange escaped syntax to strange characters.
                    318: 
                    319: =over 4
                    320: 
                    321: Parameters:
                    322: 
                    323: =item I<$str> - string with unweird characters.
                    324: 
                    325: =back
                    326: 
                    327: =over 4
                    328: 
                    329: Returns:
                    330: 
                    331: =item C<string> - string with potentially weird characters.
                    332: 
                    333: =back
                    334: 
                    335: =cut
                    336: 
                    337: sub unescape ($)
                    338:   {
                    339:     my $str = shift(@_);
1.21      www       340:     $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
1.28      harris41  341:     return($str);
                    342:   }
                    343: 
                    344: =pod
                    345: 
                    346: B<escape> - translate strange characters to unstrange escaped syntax.
                    347: 
                    348: =over 4
                    349: 
                    350: Parameters:
1.21      www       351: 
1.28      harris41  352: =item I<$str> - string with potentially weird characters to unweird-ify.
1.22      www       353: 
1.28      harris41  354: =back
                    355: 
                    356: =over 4
                    357: 
                    358: Returns:
                    359: 
                    360: =item C<string> - unweird-ified string.
                    361: 
                    362: =back
                    363: 
                    364: =cut
                    365: 
                    366: sub escape ($)
                    367:   {
                    368:     my $str = shift(@_);
1.22      www       369:     $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
1.28      harris41  370:     return($str);
                    371:   }
                    372: 
                    373: =pod
                    374: 
                    375: B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata.
                    376: 
1.30    ! www       377: Returns the dynamic metadata for an author, which will later be added to the
        !           378: MySQL database (not yet implemented).
        !           379: 
        !           380: The vast majority of entries in F<nohist_resevaldata.db>, which contains
        !           381: the dynamic metadata for an author's resources, are "count", which make
        !           382: the file really large and evaluation really slow.
        !           383: 
        !           384: While computing the current value of all dynamic metadata
        !           385: for later insertion into the MySQL metadata cache (not yet implemented),
        !           386: this routine also simply adds up all "count" type fields and replaces them by
        !           387: one new field with the to-date count.
        !           388: 
        !           389: Only after successful completion of working with one author, copy new file to
        !           390: original file. Copy to tmp-"new"-db-file was necessary since db-file size 
        !           391: would not shrink after "delete" of key.
1.28      harris41  392: 
                    393: =over 4
                    394: 
                    395: Parameters:
                    396: 
                    397: =item I<$url> - the filesystem path (url may be a misnomer...)
                    398: 
                    399: =back
                    400: 
                    401: =over 4
                    402: 
                    403: Returns:
1.22      www       404: 
1.28      harris41  405: =item C<hash> - key-value table of dynamically evaluated metadata.
1.21      www       406: 
1.28      harris41  407: =back
1.21      www       408: 
1.28      harris41  409: =cut
1.25      www       410: 
1.30    ! www       411: sub build_on_the_fly_dynamic_metadata {
1.29      albertel  412: 
                    413:     # Need to compute the user's directory.
1.30    ! www       414:     my $url=&declutter(shift);
        !           415:     $url=~s/\.meta$//;
        !           416:     my %returnhash=();
        !           417:     my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
        !           418:     my $user_directory=&construct_path_to_user_directory($adomain,$aauthor);
1.28      harris41  419: 
                    420:     # Attempt a GDBM database instantiation inside users directory and proceed.
1.25      www       421:     if ((tie(%evaldata,'GDBM_File',
1.28      harris41  422:             $user_directory.
                    423: 	     '/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
1.25      www       424:         (tie(%newevaldata,'GDBM_File',
1.28      harris41  425: 	     $user_directory.
1.30    ! www       426: 	     '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) {
1.28      harris41  427: 	# For different variables, track the running sum and counts.
1.30    ! www       428: 	my %sum=();
        !           429: 	my %cnt=();
1.28      harris41  430: 
                    431: 	# Define computed items as a sum (add) or an average (avg) or a raw
1.30    ! www       432: 	# count (cnt) or append (app)?
1.28      harris41  433: 	my %listitems=('count'        => 'add',
                    434: 		       'course'       => 'add',
                    435: 		       'avetries'     => 'avg',
                    436: 		       'stdno'        => 'add',
                    437: 		       'difficulty'   => 'avg',
                    438: 		       'clear'        => 'avg',
                    439: 		       'technical'    => 'avg',
                    440: 		       'helpful'      => 'avg',
                    441: 		       'correct'      => 'avg',
                    442: 		       'depth'        => 'avg',
                    443: 		       'comments'     => 'app',
                    444: 		       'usage'        => 'cnt'
                    445: 		       );
                    446: 	
                    447: 	# Untaint the url and use as part of a regular expression.
1.30    ! www       448: 	my $regexp=$url;
        !           449: 	$regexp=~s/(\W)/\\$1/g;
        !           450: 	$regexp='___'.$regexp.'___([a-z]+)$'; #' emacs
        !           451: 
        !           452: 	# Check existing database for this author.
        !           453:         # this is modifying the 'count' entries
        !           454:         # and copying all other entries over
        !           455: 
        !           456: 	foreach (keys %evaldata) {
        !           457: 	    my $key=&unescape($_);
        !           458: 	    if ($key=~/$regexp/) { # If url-based entry exists.
        !           459: 		my $ctype=$1; # Set to specific category type.
1.28      harris41  460: 
                    461: 		# Do an increment for this category type.
1.30    ! www       462: 		if (defined($cnt{$ctype})) {
1.28      harris41  463: 		    $cnt{$ctype}++; 
1.30    ! www       464: 		} else {
        !           465: 		    $cnt{$ctype}=1; 
        !           466: 		}
        !           467:                 unless ($listitems{$ctype} eq 'app') { # append comments
1.28      harris41  468: 		    # Increment the sum based on the evaluated data in the db.
1.30    ! www       469: 		    if (defined($sum{$ctype})) {
        !           470: 			$sum{$ctype}+=$evaldata{$_};
        !           471: 		    } else {
        !           472: 			$sum{$ctype}=$evaldata{$_};
        !           473: 		    }
        !           474:  		} else { # 'app' mode, means to use '<hr />' as a separator
        !           475: 		    if (defined($sum{$ctype})) {
        !           476: 			if ($evaldata{$_}) {
        !           477: 			    $sum{$ctype}.='<hr />'.$evaldata{$_};
        !           478: 			}
        !           479: 		    } else {
        !           480: 			$sum{$ctype}=''.$evaldata{$_};
        !           481: 		    }
        !           482: 	        }
        !           483: 		if ($ctype ne 'count') {
1.29      albertel  484:                     # this is copying all data except 'count' attributes
1.30    ! www       485: 		    $newevaldata{$_}=$evaldata{$_};
        !           486: 	        }
        !           487: 	    }
        !           488: 	}
        !           489: 
        !           490:         # these values will be returned (currently still unused)
        !           491: 	foreach (keys %cnt) {
        !           492: 	    if ($listitems{$_} eq 'avg') {
        !           493: 		$returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
        !           494: 	    } elsif ($listitems{$_} eq 'cnt') {
        !           495: 		$returnhash{$_}=$cnt{$_};
        !           496: 	    } else {
        !           497: 		$returnhash{$_}=$sum{$_};
        !           498: 	    }
        !           499: 	}
        !           500: 
        !           501:         # generate new count key in resevaldata, insert sum
        !           502: 	if ($returnhash{'count'}) {
        !           503: 	    my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';
        !           504: 	    $newevaldata{$newkey}=$returnhash{'count'};
        !           505: 	}
1.28      harris41  506: 
                    507: 	untie(%evaldata); # Close/release the original nohist database.
                    508: 	untie(%newevaldata); # Close/release the new nohist database.
1.30    ! www       509:     }
        !           510:     return %returnhash;
        !           511: }
1.28      harris41  512: 
                    513: =pod
                    514: 
                    515: B<wanted> - used by B<File::Find::find> subroutine.
                    516: 
                    517: This evaluates whether a file is wanted, and pushes it onto the
                    518: I<@metalist> array.  This subroutine was, for the most part, auto-generated
                    519: by the B<find2perl> command.
                    520: 
                    521: =over 4
                    522: 
                    523: Parameters:
                    524: 
                    525: =item I<$file> - a path to the file.
                    526: 
                    527: =back
                    528: 
                    529: =over 4
                    530: 
                    531: Returns:
                    532: 
                    533: =item C<boolean> - true or false based on logical statement.
                    534: 
                    535: =back
                    536: 
                    537: =cut
                    538: 
                    539: sub wanted ($)
                    540:   {
1.1       harris41  541:     (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
1.28      harris41  542:     -f $_ &&
1.10      harris41  543:     /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
1.28      harris41  544:     push(@metalist,$File::Find::dir.'/'.$_);
                    545:   }
                    546: 
                    547: =pod
                    548: 
                    549: B<get_metadata_from_file> - read xml-tagged file and return parsed metadata.
1.1       harris41  550: 
1.28      harris41  551: I<Note that this is significantly altered from a subroutine present in lonnet.>
1.15      harris41  552: 
1.28      harris41  553: =over 4
1.1       harris41  554: 
1.28      harris41  555: Parameters:
1.27      www       556: 
1.28      harris41  557: =item I<$file> - a path.to the file.
1.27      www       558: 
1.28      harris41  559: =back
1.27      www       560: 
1.28      harris41  561: =over 4
1.25      www       562: 
1.28      harris41  563: Returns:
1.1       harris41  564: 
1.28      harris41  565: =item C<hash reference> - a hash array (keys and values).
1.1       harris41  566: 
1.28      harris41  567: =back
1.25      www       568: 
1.28      harris41  569: =cut
1.1       harris41  570: 
1.28      harris41  571: sub get_metadata_from_file ($)
                    572:   {
                    573:     my ($filename) = @_;
                    574:     my %metatable; # Used to store return value of hash-tabled metadata.
                    575:     $filename = &declutter($filename); # Remove non-identifying filesystem info
                    576:     my $uri = ''; # The URI is not relevant in this scenario.
                    577:     unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
                    578:       {
                    579: 	$filename .= '.meta'; # Append a .meta suffix.
                    580:       }
                    581:     # Get the file contents.
                    582:     my $metadata_string =
                    583: 	&get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);
                    584: 
                    585:     # Parse the file based on its XML tags.
                    586:     my $parser = HTML::TokeParser->new(\$metadata_string);
                    587:     my $token;
                    588:     while ($token = $parser->get_token) # Loop through tokens.
                    589:       {
                    590: 	if ($token->[0] eq 'S') # If it is a start token.
                    591: 	  {
                    592: 	    my $entry = $token->[1];
                    593: 	    my $unikey = $entry; # A unique identifier for this xml tag key.
                    594: 	    if (defined($token->[2]->{'part'}))
                    595: 	      { 
                    596: 		$unikey .= '_'.$token->[2]->{'part'}; 
                    597: 	      }
                    598: 	    if (defined($token->[2]->{'name'}))
                    599: 	      { 
                    600: 		$unikey .= '_'.$token->[2]->{'name'}; 
                    601: 	      }
                    602: 	    # Append $unikey to metatable's keys entry.
                    603: 	    if ($metatable{$uri.'keys'})
                    604: 	      {
                    605: 		$metatable{$uri.'keys'} .= ','.$unikey;
1.1       harris41  606: 	      }
1.28      harris41  607: 	    else
                    608: 	      {
                    609: 		$metatable{$uri.'keys'} = $unikey;
1.1       harris41  610: 	      }
1.28      harris41  611: 	    # Insert contents into metatable entry for the unikey.
                    612: 	    foreach my $t3 (@{$token->[3]})
                    613: 	      {
                    614: 		$metatable{$uri.''.$unikey.'.'.$_} = $token->[2]->{$t3};
1.1       harris41  615: 	      }
1.28      harris41  616: 	    # If there was no text contained inside the tags, set = default.
                    617: 	    unless
                    618: 	      (
                    619: 	        $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry)
                    620: 	      )
                    621: 	      {
                    622: 		$metatable{$uri.''.$unikey} =
                    623: 		    $metatable{$uri.''.$unikey.'.default'};
                    624: 	      }
                    625: 	  }
                    626:       }
                    627:     # Return with a key-value table of XML tags and their tag contents.
                    628:     return(\%metatable);
                    629:   }
                    630: 
                    631: =pod
                    632: 
                    633: B<get_file_contents> - returns either the contents of the file or a -1.
                    634: 
                    635: =over 4
                    636: 
                    637: Parameters:
                    638: 
                    639: =item I<$file> - a complete filesystem path.to the file.
                    640: 
                    641: =back
                    642: 
                    643: =over 4
                    644: 
                    645: Returns:
                    646: 
                    647: =item C<string> - file contents or a -1.
                    648: 
                    649: =back
                    650: 
                    651: =cut
                    652: 
                    653: sub get_file_contents ($)
                    654:   {
                    655:     my $file = shift(@_);
                    656: 
                    657:     # If file does not exist, then return a -1 value.
                    658:     unless (-e $file)
                    659:       {
                    660: 	return(-1);
                    661:       }
                    662: 
                    663:     # Read in file contents.
                    664:     my $file_handle = IO::File->new($file);
                    665:     my $file_contents = '';
                    666:     while (<$file_handle>)
                    667:       {
                    668: 	$file_contents .= $_;
                    669:       }
                    670: 
                    671:     # Return file contents.
                    672:     return($file_contents);
                    673:   }
                    674: 
                    675: =pod
                    676: 
                    677: B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path).
                    678: 
                    679: =over 4
                    680: 
                    681: Parameters:
                    682: 
                    683: =item I<$filesystem_path> - a complete filesystem path.
                    684: 
                    685: =back
                    686: 
                    687: =over 4
                    688: 
                    689: Returns:
                    690: 
                    691: =item C<string> - remnants of the filesystem path (beginning portion removed).
                    692: 
                    693: =back
                    694: 
                    695: =cut
                    696: 
                    697: sub declutter
                    698:   {
                    699:     my $filesystem_path = shift(@_);
                    700: 
                    701:     # Remove beginning portions of the filesystem path.
                    702:     $filesystem_path =~ s/^$perlvar{'lonDocRoot'}//;
                    703:     $filesystem_path =~ s!^/!!;
                    704:     $filesystem_path =~ s!^res/!!;
                    705: 
                    706:     # Return what is remaining for the filesystem path.
                    707:     return($filesystem_path);
                    708:   }
                    709: 
                    710: =pod
                    711: 
                    712: B<query_home_server_status> - Is this the home server of an author's directory?
                    713: 
                    714: =over 4
                    715: 
                    716: Parameters:
                    717: 
                    718: =item I<$author_filesystem_path> - directory path for a user.
                    719: 
                    720: =back
                    721: 
                    722: =over 4
                    723: 
                    724: Returns:
                    725: 
                    726: =item C<boolean> - 1 if true; 0 if false.
                    727: 
                    728: =back
                    729: 
                    730: =cut
                    731: 
                    732: sub query_home_server_status ($)
                    733:   {
                    734:     my $author_filesystem_path = shift(@_);
                    735: 
                    736:     # Remove beginning portion of this filesystem path.
                    737:     $author_filesystem_path =~ s!/home/httpd/html/res/([^/]*)/([^/]*).*!$1/$2!;
                    738: 
                    739:     # Construct path to the author's ordinary user directory.
                    740:     my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
                    741:     my $user_directory_path = construct_path_to_user_directory($user_domain,
                    742: 							       $username);
                    743: 
                    744:     # Return status of whether the user directory path is defined.
                    745:     if (-e $user_directory_path)
                    746:       {
                    747: 	return(1); # True.
                    748:       }
                    749:     else
                    750:       {
                    751:         return(0); # False.
                    752:       }
                    753:   }
                    754: 
                    755: =pod
                    756: 
                    757: B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir.
                    758: 
                    759: =over 4
                    760: 
                    761: Parameters:
                    762: 
                    763: =item I<$user_domain> - the loncapa domain of the user.
                    764: 
                    765: =item I<$username> - the unique username (user id) of the user.
                    766: 
                    767: =back
                    768: 
                    769: =over 4
                    770: 
                    771: Returns:
                    772: 
                    773: =item C<string> - representing the path on the filesystem.
                    774: 
                    775: =back
                    776: 
                    777: =cut
                    778: 
                    779: sub construct_path_to_user_directory ($$)
                    780:   {
                    781:     my ($user_domain,$username) = @_;
                    782: 
                    783:     # Untaint.
                    784:     $user_domain =~ s/\W//g;
                    785:     $username =~ s/\W//g;
                    786: 
                    787:     # Create three levels of sub-directoried filesystem path
                    788:     # based on the first three characters of the username.
                    789:     my $sub_filesystem_path = $username.'__';
                    790:     $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!;
                    791: 
                    792:     # Use the sub-directoried levels and other variables to generate
                    793:     # the complete filesystem path.
                    794:     my $complete_filesystem_path =
                    795: 	join('/',($perlvar{'lonUsersDir'},
                    796: 		  $user_domain,
                    797: 		  $sub_filesystem_path,
                    798: 		  $username));
                    799: 
                    800:     # Return the complete filesystem path.
                    801:     return($complete_filesystem_path);
                    802:   }
                    803: 
                    804: =pod
                    805: 
                    806: B<sql_formatted_time> (@) - turns seconds since epoch into datetime sql format.
                    807: 
                    808: =over 4
                    809: 
                    810: Parameters:
                    811: 
                    812: =item I<$epochtime> - time in seconds since epoch (may need to be sanitized).
                    813: 
                    814: =back
                    815: 
                    816: =over 4
                    817: 
                    818: Returns:
                    819: 
                    820: =item C<string> - datetime sql formatted string.
                    821: 
                    822: =back
                    823: 
                    824: =cut
1.13      harris41  825: 
1.28      harris41  826: sub sql_formatted_time ($)
                    827:   {
                    828:     # Sanitize the time argument and convert to localtime array.
1.13      harris41  829:     my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
1.28      harris41  830: 	localtime(&sanitize_time(shift(@_)));
                    831: 
                    832:     # Convert month from (0..11) to (1..12).
                    833:     $mon += 1;
                    834: 
                    835:     # Make the year compatible with A.D. specification.
                    836:     $year += 1900;
                    837: 
                    838:     # Return a date which is compatible with MySQL's "DATETIME" format.
                    839:     return(join('-',($year,$mon,$mday)).
                    840: 	   ' '.
                    841: 	   join(':',($hour,$min,$sec))
                    842: 	   );
                    843:   }
                    844: 
                    845: 
                    846: # ==================================== The following two subroutines are needed
                    847: #                 for accommodating incorrect time formats inside the metadata.
                    848: 
                    849: =pod
                    850: 
                    851: B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch.
                    852: 
                    853: =over 4
                    854: 
                    855: Parameters:
                    856: 
                    857: =item I<%time_metadata> - a key-value listing characterizing month, year, etc.
                    858: 
                    859: =back
                    860: 
                    861: =over 4
                    862: 
                    863: Returns:
                    864: 
                    865: =item C<integer> - seconds since epoch.
                    866: 
                    867: =back
                    868: 
                    869: =cut
                    870: 
                    871: sub make_seconds_since_epoch (@)
                    872:   {
                    873:     # Keytable of time metadata.
                    874:     my %time_metadata = @_;
                    875: 
                    876:     # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC).
                    877:     return(POSIX::mktime(
                    878: 			 ($time_metadata{'seconds'},
                    879: 			  $time_metadata{'minutes'},
                    880: 			  $time_metadata{'hours'},
                    881: 			  $time_metadata{'day'},
                    882: 			  $time_metadata{'month'}-1,
                    883: 			  $time_metadata{'year'}-1900,
                    884: 			  0,
                    885: 			  0,
                    886: 			  $time_metadata{'dlsav'})
                    887: 			 )
                    888: 	   );
                    889:   }
                    890: 
                    891: =pod
                    892: 
                    893: B<sanitize_time> - if time looks sql-formatted, make it seconds since epoch.
                    894: 
                    895: Somebody described this subroutine as
                    896: "retro-fixing of un-backward-compatible time format".
                    897: 
                    898: What this means, is that a part of this code expects to get UTC seconds
                    899: since the epoch (beginning of 1970).  Yet, some of the .meta files have
                    900: sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch
                    901: integers (e.g. 1044147435).  These time strings do not encode the timezone
                    902: and, in this sense, can be considered "un-backwards-compatible".
                    903: 
                    904: =over 4
                    905: 
                    906: Parameters:
                    907: 
                    908: =item I<$potentially_badformat_string> - string to "retro-fix".
                    909: 
                    910: =back
                    911: 
                    912: =over 4
                    913: 
                    914: Returns:
                    915: 
                    916: =item C<integer> - seconds since epoch.
                    917: 
                    918: =back
                    919: 
                    920: =cut
                    921: 
                    922: sub sanitize_time ($)
                    923:   {
                    924:     my $timestamp = shift(@_);
                    925:     # If timestamp is in this unexpected format....
                    926:     if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/)
                    927:       {
                    928: 	# then convert into seconds since epoch (the expected format).
                    929: 	$timestamp = &make_seconds_since_epoch(
                    930: 					       'year' => $1,
                    931: 					       'month' => $2,
                    932: 					       'day' => $3,
                    933: 					       'hours' => $4,
                    934: 					       'minutes' => $5,
                    935: 					       'seconds' => $6
                    936: 					       );
                    937:       }
                    938:     # Otherwise we assume timestamp to be as expected.
                    939:     return($timestamp);
                    940:   }
                    941: 
                    942: =pod
                    943: 
                    944: =head1 AUTHOR
                    945: 
                    946: Written to help the loncapa project.
                    947: 
                    948: Scott Harrison, sharrison@users.sourceforge.net
                    949: 
                    950: This is distributed under the same terms as loncapa (i.e. "freeware").
1.24      www       951: 
1.28      harris41  952: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>