Annotation of loncom/metadata_database/searchcat.pl, revision 1.29

1.1       harris41    1: #!/usr/bin/perl
                      2: # The LearningOnline Network
                      3: # searchcat.pl "Search Catalog" batch script
1.16      harris41    4: #
1.29    ! albertel    5: # $Id: searchcat.pl,v 1.28 2003/02/03 05:39:37 harris41 Exp $
1.16      harris41    6: #
                      7: # Copyright Michigan State University Board of Trustees
                      8: #
1.29    ! albertel    9: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
1.16      harris41   10: #
1.29    ! albertel   11: # LON-CAPA is free software; you can redistribute it and/or modify
1.16      harris41   12: # it under the terms of the GNU General Public License as published by
                     13: # the Free Software Foundation; either version 2 of the License, or
                     14: # (at your option) any later version.
                     15: #
1.29    ! albertel   16: # LON-CAPA is distributed in the hope that it will be useful,
1.16      harris41   17: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     19: # GNU General Public License for more details.
                     20: #
                     21: # You should have received a copy of the GNU General Public License
1.29    ! albertel   22: # along with LON-CAPA; if not, write to the Free Software
1.16      harris41   23: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     24: #
                     25: # /home/httpd/html/adm/gpl.txt
                     26: #
1.29    ! albertel   27: # http://www.lon-capa.org/
1.16      harris41   28: #
                     29: # YEAR=2001
1.17      harris41   30: # 04/14/2001, 04/16/2001 Scott Harrison
1.16      harris41   31: #
1.17      harris41   32: # YEAR=2002
                     33: # 05/11/2002 Scott Harrison
1.16      harris41   34: #
1.28      harris41   35: # YEAR=2003
                     36: # Scott Harrison
                     37: #
1.16      harris41   38: ###
1.1       harris41   39: 
1.28      harris41   40: =pod
                     41: 
                     42: =head1 NAME
                     43: 
                     44: B<searchcat.pl> - put authoritative filesystem data into sql database.
                     45: 
                     46: =head1 SYNOPSIS
                     47: 
                     48: Ordinarily this script is to be called from a loncapa cron job
                     49: (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
                     50: filesystem installation location: F</etc/cron.d/loncapa>).
                     51: 
                     52: Here is the cron job entry.
                     53: 
                     54: C<# Repopulate and refresh the metadata database used for the search catalog.>
                     55: 
                     56: C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
                     57: 
                     58: This script only allows itself to be run as the user C<www>.
                     59: 
                     60: =head1 DESCRIPTION
                     61: 
                     62: This script goes through a loncapa resource directory and gathers metadata.
                     63: The metadata is entered into a SQL database.
                     64: 
                     65: This script also does general database maintenance such as reformatting
                     66: the C<loncapa:metadata> table if it is deprecated.
                     67: 
                     68: This script also builds dynamic temporal metadata and stores this inside
                     69: a F<nohist_resevaldata.db> database file.
                     70: 
                     71: This script is playing an increasingly important role for a loncapa
                     72: library server.  The proper operation of this script is critical for a smooth
                     73: and correct user experience.
                     74: 
                     75: =cut
                     76: 
                     77: # ========================================================== Setting things up.
                     78: 
                     79: # ------------------------------------------------------  Use external modules.
1.1       harris41   80: 
1.17      harris41   81: use lib '/home/httpd/lib/perl/';
                     82: use LONCAPA::Configuration;
                     83: 
1.1       harris41   84: use IO::File;
                     85: use HTML::TokeParser;
1.6       harris41   86: use DBI;
1.21      www        87: use GDBM_File;
1.24      www        88: use POSIX qw(strftime mktime);
1.1       harris41   89: 
1.28      harris41   90: # ----------------- Code to enable 'find' subroutine listing of the .meta files
                     91: use File::Find;
                     92: 
                     93: # List of .meta files (used on a per-user basis).
1.1       harris41   94: my @metalist;
1.21      www        95: 
1.28      harris41   96: # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables.
                     97: my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
                     98: my %perlvar = %{$perlvarref};
                     99: undef($perlvarref); # Remove since sensitive and not needed.
                    100: delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.
                    101: 
                    102: # ------------------------------------- Only run if machine is a library server
                    103: if ($perlvar{'lonRole'} ne 'library')
                    104:   {
                    105:     exit(0);
                    106:   }
                    107: 
                    108: # ------------------------------ Make sure this process is running as user=www.
                    109: my $wwwid = getpwnam('www');
                    110: if ($wwwid != $<)
                    111:   {
                    112:     $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
                    113:     $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
                    114:     system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | ".
                    115: 	   "mailto $emailto -s '$subj' > /dev/null");
                    116:     exit(1);
                    117:   }
                    118: 
                    119: # ------------------------------------------------------ Initialize log output.
                    120: open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
                    121: print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n");
                    122: 
                    123: my $dbh; # Database object reference handle.
                    124: 
                    125: # ----------------------------- Verify connection to loncapa:metadata database.
                    126: unless (
                    127: 	$dbh = DBI->connect('DBI:mysql:loncapa','www',
                    128: 			    $perlvar{'lonSqlAccess'},
                    129: 			    { RaiseError => 0,PrintError => 0})
                    130: 	)
                    131:   { 
                    132:     print(LOG '**** ERROR **** Cannot connect to database!'."\n");
                    133:     exit(0);
                    134:   }
                    135: 
                    136: # ------------------------------ Create loncapa:metadata table if non-existent.
                    137: my $make_metadata_table = 'CREATE TABLE IF NOT EXISTS metadata ('.
                    138:     'title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, '.
                    139:     'version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, '.
                    140:     'creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, '.
                    141:     'copyright TEXT, utilitysemaphore BOOL, FULLTEXT idx_title (title), '.
                    142:     'FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), '.
                    143:     'FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), '.
                    144:     'FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), '.
                    145:     'FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), '.
                    146:     'FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), '.
                    147:     'FULLTEXT idx_copyright (copyright)) TYPE=MYISAM';
                    148: 
                    149: $dbh->do($make_metadata_table); # Generate the table.
                    150: 
                    151: # ----------------------------- Verify format of the loncapa:metadata database.
                    152: #                               (delete and recreate database if necessary).
                    153: 
                    154: # Make a positive control for verifying table structure.
                    155: my $make_metadata_table_CONTROL = $make_metadata_table;
                    156: $make_metadata_table_CONTROL =~
                    157:     s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;
                    158: 
                    159: $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
                    160: $dbh->do($make_metadata_table_CONTROL);
                    161: 
                    162: my $table_description; # selectall reference to the table description.
                    163: 
                    164: my $CONTROL_table_string; # What the table description should look like.
                    165: my $table_string; # What the table description does look like.
                    166: 
                    167: # Calculate the CONTROL table description (what it should be).
                    168: $table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
                    169: foreach my $table_row (@{$table_description})
                    170:   {
                    171:     $CONTROL_table_string .= join(',',@{$table_row})."\n";
                    172:   }
                    173: 
                    174: # Calculate the current table description (what it currently looks like).
                    175: $table_description = $dbh->selectall_arrayref('describe metadata');
                    176: foreach my $table_row (@{$table_description})
                    177:   {
                    178:     $table_string .= join(',',@{$table_row})."\n";
                    179:   }
                    180: 
                    181: if ($table_string ne $CONTROL_table_string)
                    182:   {
                    183:     # Log this incident.
                    184:     print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'.
                    185: 	  '.'."\n");
                    186:     # Delete the table.
                    187:     $dbh->do('DROP TABLE IF EXISTS metadata');
                    188:     # Generate the table.
                    189:     $dbh->do($make_metadata_table);
                    190:   }
1.21      www       191: 
1.28      harris41  192: $dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay.  Done with control.
1.21      www       193: 
1.28      harris41  194: # ----------------------------------------------- Set utilitysemaphore to zero.
                    195: $dbh->do('UPDATE metadata SET utilitysemaphore = 0');
                    196: 
                    197: # ========================================================= Main functionality.
                    198: 
                    199: # - Determine home authors on this server based on resources dir and user tree.
                    200: 
                    201: # RESOURCES: the resources directory (subdirs correspond to author usernames).
                    202: opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or
                    203:     (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n")
                    204:      and exit(0));
                    205: 
                    206: # query_home_server_status will look for user home directories on this machine.
                    207: my @homeusers =
                    208:     grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
                    209: 				    $perlvar{'lonDefDomain'}.'/'.$_)
                    210: 	  } grep {!/^\.\.?$/} readdir(RESOURCES);
                    211: closedir(RESOURCES);
                    212: 
                    213: unless (@homeusers)
                    214:   {
                    215:     print(LOG '=== No home users found on this server.'."\n");
                    216:   }
                    217: 
                    218: # Consider each author individually.
                    219: foreach my $user (@homeusers)
                    220:   {
                    221:     # Make a log entry.
                    222:     print(LOG "\n".'=== User: '.$user."\n\n");
                    223: 
                    224:     # Get filesystem path to this user's directory.
                    225:     my $user_directory =
                    226: 	&construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);
                    227: 
                    228:     # Remove left-over db-files from a potentially crashed searchcat run.
                    229:     unlink($user_directory.'/nohist_new_resevaldata.db');
                    230: 
                    231:     # Cleanup the metalist array.
                    232:     undef(@metalist);
                    233:     @metalist = ();
                    234: 
                    235:     # This will add entries to the @metalist array.
                    236:     &File::Find::find(\&wanted,
                    237: 		      $perlvar{'lonDocRoot'}.'/res/'.
                    238: 		      $perlvar{'lonDefDomain'}.'/'.$user);
                    239: 
                    240:     # -- process file to get metadata and put into search catalog SQL database
                    241:     # Also, build and store dynamic metadata.
                    242:     # Also, delete record entries before refreshing.
                    243:     foreach my $m (@metalist)
                    244:       {
                    245: 	# Log this action.
                    246: 	print(LOG "- ".$m."\n");
                    247: 
                    248: 	# Get metadata from the file.
                    249: 	my $ref = get_metadata_from_file($m);
                    250: 
                    251: 	# Make a datarecord identifier for this resource.
                    252: 	my $m2 = '/res/'.declutter($m);
                    253: 	$m2 =~ s/\.meta$//;
                    254: 
                    255: 	# Build and store dynamic metadata inside nohist_resevaldata.db.
                    256: 	build_on_the_fly_dynamic_metadata($m2);
                    257: 
                    258: 	# Delete record if it already exists.
                    259: 	my $q2 = 'select * from metadata where url like binary '."'".$m2."'";
                    260: 	my $sth = $dbh->prepare($q2);
                    261: 	$sth->execute();
                    262: 	my $r1 = $sth->fetchall_arrayref;
                    263: 	if (@$r1)
                    264: 	  {
                    265: 	    $sth = 
                    266: 		$dbh->prepare('delete from metadata where url like binary '.
                    267: 			      "'".$m2."'");
                    268: 	    $sth->execute();
                    269: 	  }
                    270: 
                    271: 	# Add new/replacement record into the loncapa:metadata table.
                    272: 	$sth = $dbh->prepare('insert into metadata values ('.
                    273: 			     '"'.delete($ref->{'title'}).'"'.','.
                    274: 			     '"'.delete($ref->{'author'}).'"'.','.
                    275: 			     '"'.delete($ref->{'subject'}).'"'.','.
                    276: 			     '"'.$m2.'"'.','.
                    277: 			     '"'.delete($ref->{'keywords'}).'"'.','.
                    278: 			     '"'.'current'.'"'.','.
                    279: 			     '"'.delete($ref->{'notes'}).'"'.','.
                    280: 			     '"'.delete($ref->{'abstract'}).'"'.','.
                    281: 			     '"'.delete($ref->{'mime'}).'"'.','.
                    282: 			     '"'.delete($ref->{'language'}).'"'.','.
                    283: 			     '"'.sql_formatted_time(
                    284: 				       delete($ref->{'creationdate'})).'"'.','.
                    285: 			     '"'.sql_formatted_time(
                    286: 				   delete($ref->{'lastrevisiondate'})).'"'.','.
                    287: 			     '"'.delete($ref->{'owner'}).'"'.','.
                    288: 			     '"'.delete($ref->{'copyright'}).'"'.','.
                    289: 			     '1'.')');
                    290: 	$sth->execute();
                    291:       }
                    292: 
                    293: # ----------------------- Clean up database, remove stale SQL database records.
                    294:     $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');
                    295: 
                    296: # -------------------------------------------------- Copy over the new db-files
                    297:     system('mv '.$user_directory.'/nohist_new_resevaldata.db '.
                    298: 	         $user_directory.'/nohist_resevaldata.db');
                    299:   }
                    300: 
                    301: # --------------------------------------------------- Close database connection
                    302: $dbh->disconnect;
                    303: print LOG "\n==== Searchcat completed ".localtime()." ====\n";
                    304: close(LOG);
                    305: exit(0);
                    306: 
                    307: # ================================================================ Subroutines.
                    308: 
                    309: =pod
                    310: 
                    311: =head1 SUBROUTINES
                    312: 
                    313: =cut
                    314: 
                    315: =pod
                    316: 
                    317: B<unescape> - translate to unstrange escaped syntax to strange characters.
                    318: 
                    319: =over 4
                    320: 
                    321: Parameters:
                    322: 
                    323: =item I<$str> - string with unweird characters.
                    324: 
                    325: =back
                    326: 
                    327: =over 4
                    328: 
                    329: Returns:
                    330: 
                    331: =item C<string> - string with potentially weird characters.
                    332: 
                    333: =back
                    334: 
                    335: =cut
                    336: 
                    337: sub unescape ($)
                    338:   {
                    339:     my $str = shift(@_);
1.21      www       340:     $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
1.28      harris41  341:     return($str);
                    342:   }
                    343: 
                    344: =pod
                    345: 
                    346: B<escape> - translate strange characters to unstrange escaped syntax.
                    347: 
                    348: =over 4
                    349: 
                    350: Parameters:
1.21      www       351: 
1.28      harris41  352: =item I<$str> - string with potentially weird characters to unweird-ify.
1.22      www       353: 
1.28      harris41  354: =back
                    355: 
                    356: =over 4
                    357: 
                    358: Returns:
                    359: 
                    360: =item C<string> - unweird-ified string.
                    361: 
                    362: =back
                    363: 
                    364: =cut
                    365: 
                    366: sub escape ($)
                    367:   {
                    368:     my $str = shift(@_);
1.22      www       369:     $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
1.28      harris41  370:     return($str);
                    371:   }
                    372: 
                    373: =pod
                    374: 
                    375: B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata.
                    376: 
                    377: Dynamic metadata is stored in a nohist_resevaldata GDBM database.
                    378: The only thing that this subroutine really makes happen is adjusting
                    379: a 'count' value inside the F<nohist_new_resevaldata.db> as well
                    380: as updating F<nohist_new_resevaldata.db> with information from
                    381: F<nohist_resevaldata.db>.
                    382: 
1.29    ! albertel  383: It may need optmization, but since it gets called once a week. . .
1.28      harris41  384: =over 4
                    385: 
                    386: Parameters:
                    387: 
                    388: =item I<$url> - the filesystem path (url may be a misnomer...)
                    389: 
                    390: =back
                    391: 
                    392: =over 4
                    393: 
                    394: Returns:
1.22      www       395: 
1.28      harris41  396: =item C<hash> - key-value table of dynamically evaluated metadata.
1.21      www       397: 
1.28      harris41  398: =back
1.21      www       399: 
1.28      harris41  400: =cut
1.25      www       401: 
1.28      harris41  402: sub build_on_the_fly_dynamic_metadata ($)
                    403:   {
1.29    ! albertel  404:     # some elements in here maybe non-obvious
        !           405: 
        !           406:     # Need to compute the user's directory.
1.28      harris41  407:     my $url = &declutter(shift(@_));
                    408:     $url =~ s/\.meta$//;
                    409:     my %returnhash = ();
                    410:     my ($adomain,$aauthor) = ($url =~ m!^(\w+)/(\w+)/!);
                    411:     my $user_directory = &construct_path_to_user_directory($adomain,$aauthor);
                    412: 
                    413:     # Attempt a GDBM database instantiation inside users directory and proceed.
1.25      www       414:     if ((tie(%evaldata,'GDBM_File',
1.28      harris41  415:             $user_directory.
                    416: 	     '/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
1.25      www       417:         (tie(%newevaldata,'GDBM_File',
1.28      harris41  418: 	     $user_directory.
                    419: 	     '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640)))
                    420:       {
                    421: 	# For different variables, track the running sum and counts.
                    422: 	my %sum = ();
                    423: 	my %cnt = ();
                    424: 
                    425: 	# Define computed items as a sum (add) or an average (avg) or a raw
                    426: 	# count (cnt) or 'app'?
                    427: 	my %listitems=('count'        => 'add',
                    428: 		       'course'       => 'add',
                    429: 		       'avetries'     => 'avg',
                    430: 		       'stdno'        => 'add',
                    431: 		       'difficulty'   => 'avg',
                    432: 		       'clear'        => 'avg',
                    433: 		       'technical'    => 'avg',
                    434: 		       'helpful'      => 'avg',
                    435: 		       'correct'      => 'avg',
                    436: 		       'depth'        => 'avg',
                    437: 		       'comments'     => 'app',
                    438: 		       'usage'        => 'cnt'
                    439: 		       );
                    440: 	
                    441: 	# Untaint the url and use as part of a regular expression.
                    442: 	my $regexp = $url;
                    443: 	$regexp =~ s/(\W)/\\$1/g;
1.29    ! albertel  444: 	$regexp = '___'.$regexp.'___([a-z]+)$'; #' emacs
1.28      harris41  445: 
                    446: 	# Check existing nohist database for this url.
1.29    ! albertel  447:         # this is modfying the 'count' entries
        !           448:         # and copying all othe entries over
1.28      harris41  449: 	foreach (keys %evaldata)
                    450: 	  {
                    451: 	    my $key = &unescape($_);
                    452: 	    if ($key =~ /$regexp/) # If url-based entry exists.
                    453: 	      {
                    454: 		my $ctype = $1; # Set to specific category type.
                    455: 
                    456: 		# Do an increment for this category type.
                    457: 		if (defined($cnt{$ctype}))
                    458: 		  {
                    459: 		    $cnt{$ctype}++; 
                    460: 		  }
                    461: 		else
                    462: 		  {
                    463: 		    $cnt{$ctype} = 1; 
                    464: 		  }
                    465:                 unless ($listitems{$ctype} eq 'app') # WHAT DOES 'app' MEAN?
                    466: 		  {
                    467: 		    # Increment the sum based on the evaluated data in the db.
                    468: 		    if (defined($sum{$ctype}))
                    469: 		      {
                    470: 			$sum{$ctype} += $evaldata{$_};
                    471: 		      }
                    472: 		    else
                    473: 		      {
                    474: 			$sum{$ctype} = $evaldata{$_};
                    475: 		      }
                    476:  		  }
                    477: 		else # 'app' mode, means to use '<hr />' as a separator
                    478: 		  {
                    479: 		    if (defined($sum{$ctype}))
                    480: 		      {
                    481: 			if ($evaldata{$_})
                    482: 			  {
                    483: 			    $sum{$ctype} .= '<hr />'.$evaldata{$_};
                    484: 			  }
                    485: 		      }
                    486: 		    else
                    487: 		      {
                    488: 			$sum{$ctype} = ''.$evaldata{$_};
                    489: 		      }
                    490: 		  }
                    491: 		if ($ctype ne 'count')
                    492: 		  {
1.29    ! albertel  493:                     # this is copying all data except 'count' attributes
1.28      harris41  494: 		    $newevaldata{$_} = $evaldata{$_};
                    495: 		  }
                    496: 	      }
                    497: 	  }
                    498: 
1.29    ! albertel  499:         # the only other time this loop is useful is for the 'count' hash
        !           500:         # element
1.28      harris41  501: 	foreach (keys %cnt)
                    502: 	  {
                    503: 	    if ($listitems{$_} eq 'avg')
                    504: 	      {
                    505: 		$returnhash{$_} = int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
                    506: 	      }
                    507: 	    elsif ($listitems{$_} eq 'cnt')
                    508: 	      {
                    509: 		$returnhash{$_} = $cnt{$_};
                    510: 	      }
                    511: 	    else
                    512: 	      {
                    513: 		$returnhash{$_} = $sum{$_};
                    514: 	      }
                    515: 	  }
                    516: 
1.29    ! albertel  517:         # seems to be doing something useful
1.28      harris41  518: 	if ($returnhash{'count'})
                    519: 	  {
                    520: 	    my $newkey = $$.'_'.time.'_searchcat___'.&escape($url).'___count';
                    521: 	    $newevaldata{$newkey} = $returnhash{'count'};
                    522: 	  }
                    523: 
                    524: 	untie(%evaldata); # Close/release the original nohist database.
                    525: 	untie(%newevaldata); # Close/release the new nohist database.
1.22      www       526:       }
1.28      harris41  527:     return(%returnhash);
                    528:   }
                    529: 
                    530: =pod
                    531: 
                    532: B<wanted> - used by B<File::Find::find> subroutine.
                    533: 
                    534: This evaluates whether a file is wanted, and pushes it onto the
                    535: I<@metalist> array.  This subroutine was, for the most part, auto-generated
                    536: by the B<find2perl> command.
                    537: 
                    538: =over 4
                    539: 
                    540: Parameters:
                    541: 
                    542: =item I<$file> - a path to the file.
                    543: 
                    544: =back
                    545: 
                    546: =over 4
                    547: 
                    548: Returns:
                    549: 
                    550: =item C<boolean> - true or false based on logical statement.
                    551: 
                    552: =back
                    553: 
                    554: =cut
                    555: 
                    556: sub wanted ($)
                    557:   {
1.1       harris41  558:     (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
1.28      harris41  559:     -f $_ &&
1.10      harris41  560:     /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
1.28      harris41  561:     push(@metalist,$File::Find::dir.'/'.$_);
                    562:   }
                    563: 
                    564: =pod
                    565: 
                    566: B<get_metadata_from_file> - read xml-tagged file and return parsed metadata.
1.1       harris41  567: 
1.28      harris41  568: I<Note that this is significantly altered from a subroutine present in lonnet.>
1.15      harris41  569: 
1.28      harris41  570: =over 4
1.1       harris41  571: 
1.28      harris41  572: Parameters:
1.27      www       573: 
1.28      harris41  574: =item I<$file> - a path.to the file.
1.27      www       575: 
1.28      harris41  576: =back
1.27      www       577: 
1.28      harris41  578: =over 4
1.25      www       579: 
1.28      harris41  580: Returns:
1.1       harris41  581: 
1.28      harris41  582: =item C<hash reference> - a hash array (keys and values).
1.1       harris41  583: 
1.28      harris41  584: =back
1.25      www       585: 
1.28      harris41  586: =cut
1.1       harris41  587: 
1.28      harris41  588: sub get_metadata_from_file ($)
                    589:   {
                    590:     my ($filename) = @_;
                    591:     my %metatable; # Used to store return value of hash-tabled metadata.
                    592:     $filename = &declutter($filename); # Remove non-identifying filesystem info
                    593:     my $uri = ''; # The URI is not relevant in this scenario.
                    594:     unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
                    595:       {
                    596: 	$filename .= '.meta'; # Append a .meta suffix.
                    597:       }
                    598:     # Get the file contents.
                    599:     my $metadata_string =
                    600: 	&get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);
                    601: 
                    602:     # Parse the file based on its XML tags.
                    603:     my $parser = HTML::TokeParser->new(\$metadata_string);
                    604:     my $token;
                    605:     while ($token = $parser->get_token) # Loop through tokens.
                    606:       {
                    607: 	if ($token->[0] eq 'S') # If it is a start token.
                    608: 	  {
                    609: 	    my $entry = $token->[1];
                    610: 	    my $unikey = $entry; # A unique identifier for this xml tag key.
                    611: 	    if (defined($token->[2]->{'part'}))
                    612: 	      { 
                    613: 		$unikey .= '_'.$token->[2]->{'part'}; 
                    614: 	      }
                    615: 	    if (defined($token->[2]->{'name'}))
                    616: 	      { 
                    617: 		$unikey .= '_'.$token->[2]->{'name'}; 
                    618: 	      }
                    619: 	    # Append $unikey to metatable's keys entry.
                    620: 	    if ($metatable{$uri.'keys'})
                    621: 	      {
                    622: 		$metatable{$uri.'keys'} .= ','.$unikey;
1.1       harris41  623: 	      }
1.28      harris41  624: 	    else
                    625: 	      {
                    626: 		$metatable{$uri.'keys'} = $unikey;
1.1       harris41  627: 	      }
1.28      harris41  628: 	    # Insert contents into metatable entry for the unikey.
                    629: 	    foreach my $t3 (@{$token->[3]})
                    630: 	      {
                    631: 		$metatable{$uri.''.$unikey.'.'.$_} = $token->[2]->{$t3};
1.1       harris41  632: 	      }
1.28      harris41  633: 	    # If there was no text contained inside the tags, set = default.
                    634: 	    unless
                    635: 	      (
                    636: 	        $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry)
                    637: 	      )
                    638: 	      {
                    639: 		$metatable{$uri.''.$unikey} =
                    640: 		    $metatable{$uri.''.$unikey.'.default'};
                    641: 	      }
                    642: 	  }
                    643:       }
                    644:     # Return with a key-value table of XML tags and their tag contents.
                    645:     return(\%metatable);
                    646:   }
                    647: 
                    648: =pod
                    649: 
                    650: B<get_file_contents> - returns either the contents of the file or a -1.
                    651: 
                    652: =over 4
                    653: 
                    654: Parameters:
                    655: 
                    656: =item I<$file> - a complete filesystem path.to the file.
                    657: 
                    658: =back
                    659: 
                    660: =over 4
                    661: 
                    662: Returns:
                    663: 
                    664: =item C<string> - file contents or a -1.
                    665: 
                    666: =back
                    667: 
                    668: =cut
                    669: 
                    670: sub get_file_contents ($)
                    671:   {
                    672:     my $file = shift(@_);
                    673: 
                    674:     # If file does not exist, then return a -1 value.
                    675:     unless (-e $file)
                    676:       {
                    677: 	return(-1);
                    678:       }
                    679: 
                    680:     # Read in file contents.
                    681:     my $file_handle = IO::File->new($file);
                    682:     my $file_contents = '';
                    683:     while (<$file_handle>)
                    684:       {
                    685: 	$file_contents .= $_;
                    686:       }
                    687: 
                    688:     # Return file contents.
                    689:     return($file_contents);
                    690:   }
                    691: 
                    692: =pod
                    693: 
                    694: B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path).
                    695: 
                    696: =over 4
                    697: 
                    698: Parameters:
                    699: 
                    700: =item I<$filesystem_path> - a complete filesystem path.
                    701: 
                    702: =back
                    703: 
                    704: =over 4
                    705: 
                    706: Returns:
                    707: 
                    708: =item C<string> - remnants of the filesystem path (beginning portion removed).
                    709: 
                    710: =back
                    711: 
                    712: =cut
                    713: 
                    714: sub declutter
                    715:   {
                    716:     my $filesystem_path = shift(@_);
                    717: 
                    718:     # Remove beginning portions of the filesystem path.
                    719:     $filesystem_path =~ s/^$perlvar{'lonDocRoot'}//;
                    720:     $filesystem_path =~ s!^/!!;
                    721:     $filesystem_path =~ s!^res/!!;
                    722: 
                    723:     # Return what is remaining for the filesystem path.
                    724:     return($filesystem_path);
                    725:   }
                    726: 
                    727: =pod
                    728: 
                    729: B<query_home_server_status> - Is this the home server of an author's directory?
                    730: 
                    731: =over 4
                    732: 
                    733: Parameters:
                    734: 
                    735: =item I<$author_filesystem_path> - directory path for a user.
                    736: 
                    737: =back
                    738: 
                    739: =over 4
                    740: 
                    741: Returns:
                    742: 
                    743: =item C<boolean> - 1 if true; 0 if false.
                    744: 
                    745: =back
                    746: 
                    747: =cut
                    748: 
                    749: sub query_home_server_status ($)
                    750:   {
                    751:     my $author_filesystem_path = shift(@_);
                    752: 
                    753:     # Remove beginning portion of this filesystem path.
                    754:     $author_filesystem_path =~ s!/home/httpd/html/res/([^/]*)/([^/]*).*!$1/$2!;
                    755: 
                    756:     # Construct path to the author's ordinary user directory.
                    757:     my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
                    758:     my $user_directory_path = construct_path_to_user_directory($user_domain,
                    759: 							       $username);
                    760: 
                    761:     # Return status of whether the user directory path is defined.
                    762:     if (-e $user_directory_path)
                    763:       {
                    764: 	return(1); # True.
                    765:       }
                    766:     else
                    767:       {
                    768:         return(0); # False.
                    769:       }
                    770:   }
                    771: 
                    772: =pod
                    773: 
                    774: B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir.
                    775: 
                    776: =over 4
                    777: 
                    778: Parameters:
                    779: 
                    780: =item I<$user_domain> - the loncapa domain of the user.
                    781: 
                    782: =item I<$username> - the unique username (user id) of the user.
                    783: 
                    784: =back
                    785: 
                    786: =over 4
                    787: 
                    788: Returns:
                    789: 
                    790: =item C<string> - representing the path on the filesystem.
                    791: 
                    792: =back
                    793: 
                    794: =cut
                    795: 
                    796: sub construct_path_to_user_directory ($$)
                    797:   {
                    798:     my ($user_domain,$username) = @_;
                    799: 
                    800:     # Untaint.
                    801:     $user_domain =~ s/\W//g;
                    802:     $username =~ s/\W//g;
                    803: 
                    804:     # Create three levels of sub-directoried filesystem path
                    805:     # based on the first three characters of the username.
                    806:     my $sub_filesystem_path = $username.'__';
                    807:     $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!;
                    808: 
                    809:     # Use the sub-directoried levels and other variables to generate
                    810:     # the complete filesystem path.
                    811:     my $complete_filesystem_path =
                    812: 	join('/',($perlvar{'lonUsersDir'},
                    813: 		  $user_domain,
                    814: 		  $sub_filesystem_path,
                    815: 		  $username));
                    816: 
                    817:     # Return the complete filesystem path.
                    818:     return($complete_filesystem_path);
                    819:   }
                    820: 
                    821: =pod
                    822: 
                    823: B<sql_formatted_time> (@) - turns seconds since epoch into datetime sql format.
                    824: 
                    825: =over 4
                    826: 
                    827: Parameters:
                    828: 
                    829: =item I<$epochtime> - time in seconds since epoch (may need to be sanitized).
                    830: 
                    831: =back
                    832: 
                    833: =over 4
                    834: 
                    835: Returns:
                    836: 
                    837: =item C<string> - datetime sql formatted string.
                    838: 
                    839: =back
                    840: 
                    841: =cut
1.13      harris41  842: 
1.28      harris41  843: sub sql_formatted_time ($)
                    844:   {
                    845:     # Sanitize the time argument and convert to localtime array.
1.13      harris41  846:     my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
1.28      harris41  847: 	localtime(&sanitize_time(shift(@_)));
                    848: 
                    849:     # Convert month from (0..11) to (1..12).
                    850:     $mon += 1;
                    851: 
                    852:     # Make the year compatible with A.D. specification.
                    853:     $year += 1900;
                    854: 
                    855:     # Return a date which is compatible with MySQL's "DATETIME" format.
                    856:     return(join('-',($year,$mon,$mday)).
                    857: 	   ' '.
                    858: 	   join(':',($hour,$min,$sec))
                    859: 	   );
                    860:   }
                    861: 
                    862: 
                    863: # ==================================== The following two subroutines are needed
                    864: #                 for accommodating incorrect time formats inside the metadata.
                    865: 
                    866: =pod
                    867: 
                    868: B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch.
                    869: 
                    870: =over 4
                    871: 
                    872: Parameters:
                    873: 
                    874: =item I<%time_metadata> - a key-value listing characterizing month, year, etc.
                    875: 
                    876: =back
                    877: 
                    878: =over 4
                    879: 
                    880: Returns:
                    881: 
                    882: =item C<integer> - seconds since epoch.
                    883: 
                    884: =back
                    885: 
                    886: =cut
                    887: 
                    888: sub make_seconds_since_epoch (@)
                    889:   {
                    890:     # Keytable of time metadata.
                    891:     my %time_metadata = @_;
                    892: 
                    893:     # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC).
                    894:     return(POSIX::mktime(
                    895: 			 ($time_metadata{'seconds'},
                    896: 			  $time_metadata{'minutes'},
                    897: 			  $time_metadata{'hours'},
                    898: 			  $time_metadata{'day'},
                    899: 			  $time_metadata{'month'}-1,
                    900: 			  $time_metadata{'year'}-1900,
                    901: 			  0,
                    902: 			  0,
                    903: 			  $time_metadata{'dlsav'})
                    904: 			 )
                    905: 	   );
                    906:   }
                    907: 
                    908: =pod
                    909: 
                    910: B<sanitize_time> - if time looks sql-formatted, make it seconds since epoch.
                    911: 
                    912: Somebody described this subroutine as
                    913: "retro-fixing of un-backward-compatible time format".
                    914: 
                    915: What this means, is that a part of this code expects to get UTC seconds
                    916: since the epoch (beginning of 1970).  Yet, some of the .meta files have
                    917: sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch
                    918: integers (e.g. 1044147435).  These time strings do not encode the timezone
                    919: and, in this sense, can be considered "un-backwards-compatible".
                    920: 
                    921: =over 4
                    922: 
                    923: Parameters:
                    924: 
                    925: =item I<$potentially_badformat_string> - string to "retro-fix".
                    926: 
                    927: =back
                    928: 
                    929: =over 4
                    930: 
                    931: Returns:
                    932: 
                    933: =item C<integer> - seconds since epoch.
                    934: 
                    935: =back
                    936: 
                    937: =cut
                    938: 
                    939: sub sanitize_time ($)
                    940:   {
                    941:     my $timestamp = shift(@_);
                    942:     # If timestamp is in this unexpected format....
                    943:     if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/)
                    944:       {
                    945: 	# then convert into seconds since epoch (the expected format).
                    946: 	$timestamp = &make_seconds_since_epoch(
                    947: 					       'year' => $1,
                    948: 					       'month' => $2,
                    949: 					       'day' => $3,
                    950: 					       'hours' => $4,
                    951: 					       'minutes' => $5,
                    952: 					       'seconds' => $6
                    953: 					       );
                    954:       }
                    955:     # Otherwise we assume timestamp to be as expected.
                    956:     return($timestamp);
                    957:   }
                    958: 
                    959: =pod
                    960: 
                    961: =head1 AUTHOR
                    962: 
                    963: Written to help the loncapa project.
                    964: 
                    965: Scott Harrison, sharrison@users.sourceforge.net
                    966: 
                    967: This is distributed under the same terms as loncapa (i.e. "freeware").
1.24      www       968: 
1.28      harris41  969: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>