File:
[LON-CAPA] /
loncom /
metadata_database /
searchcat.pl
Revision
1.62:
download - view:
text,
annotated -
select for diffs
Fri Mar 11 03:25:18 2005 UTC (19 years, 2 months ago) by
matthew
Branches:
MAIN
CVS tags:
HEAD
searchcat.pl:Bug 3961 - metadata only processed for default domain on
multi-domain servers. Now takes command line option -multi_domain.
This causes a hostname lookup and the hosts.tab file to be parsed for
matches of the hostname. Added 'domain' to the dynamic metadata.
lonmetadata.pm:Added 'domain' to the metadata table and to the dynamic
metadata;
1: #!/usr/bin/perl
2: # The LearningOnline Network
3: # searchcat.pl "Search Catalog" batch script
4: #
5: # $Id: searchcat.pl,v 1.62 2005/03/11 03:25:18 matthew Exp $
6: #
7: # Copyright Michigan State University Board of Trustees
8: #
9: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
10: #
11: # LON-CAPA is free software; you can redistribute it and/or modify
12: # it under the terms of the GNU General Public License as published by
13: # the Free Software Foundation; either version 2 of the License, or
14: # (at your option) any later version.
15: #
16: # LON-CAPA is distributed in the hope that it will be useful,
17: # but WITHOUT ANY WARRANTY; without even the implied warranty of
18: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: # GNU General Public License for more details.
20: #
21: # You should have received a copy of the GNU General Public License
22: # along with LON-CAPA; if not, write to the Free Software
23: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: #
25: # /home/httpd/html/adm/gpl.txt
26: #
27: # http://www.lon-capa.org/
28: #
29: ###
30:
31: =pod
32:
33: =head1 NAME
34:
35: B<searchcat.pl> - put authoritative filesystem data into sql database.
36:
37: =head1 SYNOPSIS
38:
39: Ordinarily this script is to be called from a loncapa cron job
40: (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
41: filesystem installation location: F</etc/cron.d/loncapa>).
42:
43: Here is the cron job entry.
44:
45: C<# Repopulate and refresh the metadata database used for the search catalog.>
46: C<10 1 * * 7 www /home/httpd/perl/searchcat.pl>
47:
48: This script only allows itself to be run as the user C<www>.
49:
50: =head1 DESCRIPTION
51:
52: This script goes through a loncapa resource directory and gathers metadata.
53: The metadata is entered into a SQL database.
54:
55: This script also does general database maintenance such as reformatting
56: the C<loncapa:metadata> table if it is deprecated.
57:
58: This script evaluates dynamic metadata from the authors'
59: F<nohist_resevaldata.db> database file in order to store it in MySQL.
60:
61: This script is playing an increasingly important role for a loncapa
62: library server. The proper operation of this script is critical for a smooth
63: and correct user experience.
64:
65: =cut
66:
67: use strict;
68:
69: use DBI;
70: use lib '/home/httpd/lib/perl/';
71: use LONCAPA::Configuration;
72: use LONCAPA::lonmetadata;
73:
74: use Getopt::Long;
75: use IO::File;
76: use HTML::TokeParser;
77: use GDBM_File;
78: use POSIX qw(strftime mktime);
79:
80: use Sys::Hostname;
81:
82: use File::Find;
83:
#
# Command line handling.
#
# The option variables are file-scoped lexicals because the subroutines
# defined further down in this file read them directly.
my ($simulate,$oneuser,$help,$verbose,$logfile,$debug,$multidom);
my %option_target = (
    'help'         => \$help,
    'simulate'     => \$simulate,
    'only=s'       => \$oneuser,
    'verbose=s'    => \$verbose,
    'debug'        => \$debug,
    'multi_domain' => \$multidom,
);
GetOptions(%option_target);

#
# -help: show the usage text and stop before anything else happens
if ($help) {
    my $usage = <<"ENDHELP";
$0
Rebuild and update the LON-CAPA metadata database.
Options:
-help Print this help
-simulate Do not modify the database.
-only=user Only compute for the given user. Implies -simulate
-verbose=val Sets logging level, val must be a number
-debug Turns on debugging output
-multi_domain Parse the hosts.tab file domain(s) to use.
ENDHELP
    print $usage;
    exit 0;
}

#
# Defaults for options that were not given
$debug   = 0 unless defined($debug);
$verbose = 0 unless defined($verbose);

#
# Restricting the run to one user implies a simulation
$simulate = 1 if defined($oneuser);
122:
##
## The table names live in variables so this routine is easier to test.
## $oldname is the table that gets replaced, $newname the one built by this run.
my $oldname = 'metadata';
my $newname = 'newmetadata'.$$; # the pid keeps the temporary table name unique

#
# Load the settings from loncapa_apache.conf and loncapa.conf into %perlvar
my %perlvar;
{
    my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
    %perlvar = %{$perlvarref};
}
delete $perlvar{'lonReceipt'}; # remove since sensitive (really?) & not needed

#
# Nothing to do unless this machine is a library server
if ($perlvar{'lonRole'} ne 'library') {
    exit;
}

#
# Refuse to run as anybody but user www; the administrators get a mail
# about the mismatch and the script exits with status 1.
my $wwwid = getpwnam('www');
unless ($wwwid == $<) {
    my $emailto = join(',',$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'});
    my $subj    = "LON: $perlvar{'lonHostID'} User ID mismatch";
    my $mailcmd =
        "echo 'User ID mismatch. searchcat.pl must be run as user www.' |\n".
        "mailto $emailto -s '$subj' > /dev/null";
    system($mailcmd);
    exit 1;
}
#
# Let people know we are running.
#
# LOG stays a bareword (package) filehandle because &log prints to it by
# name.  A failure to open the log is reported on STDERR instead of being
# silently ignored; the run itself continues as it did before.
my $searchcat_log = $perlvar{'lonDaemons'}.'/logs/searchcat.log';
if (! open(LOG,'>>',$searchcat_log)) {
    warn "searchcat.pl: unable to append to $searchcat_log: $!\n";
}
&log(0,'==== Searchcat Run '.localtime()."====");


if ($debug) {
    &log(0,'simulating') if ($simulate);
    &log(0,'only processing user '.$oneuser) if ($oneuser);
    &log(0,'verbosity level = '.$verbose);
}
#
# Connect to database
my $dbh;
if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},
                           { RaiseError =>0,PrintError=>0}))) {
    &log(0,"Cannot connect to database!");
    die "MySQL Error: Cannot connect to database!\n";
}
# This can return an error and still be okay, so we do not bother checking.
# (perhaps it should be more robust and check for specific errors)
$dbh->do('DROP TABLE IF EXISTS '.$newname);
#
# Create the new table
my $request = &LONCAPA::lonmetadata::create_metadata_storage($newname);
$dbh->do($request);
if ($dbh->err) {
    # Grab the message first: DBI resets err/errstr on the next method call,
    # so reading errstr after disconnect() would lose it.
    my $create_error = $dbh->errstr;
    &log(0,"MySQL Error Create: ".$create_error);
    $dbh->disconnect();
    die $create_error;
}
#
# Find out which domain(s) we need to examine.
#
# With -multi_domain every hosts.tab line that mentions our hostname
# contributes its second colon-separated field; otherwise only the
# configured default domain is used.
my @domains;
if (defined($multidom)) {
    &log(1,'====multi domain setup====');
    # Peek into the hosts.tab and look for matches of our hostname
    my $host = hostname();
    &log(9,'hostname = "'.$host.'"');
    my $hoststab = $perlvar{'lonTabDir'}.'/hosts.tab';
    my $hostfh;
    if (! open($hostfh,'<',$hoststab)) {
        my $problem = "Unable to determine domain(s) of multi-domain server".
            " (cannot read $hoststab: $!)";
        &log(0,$problem);
        die ($problem);
    }
    my %domains;
    while (my $line = <$hostfh>) {
        chomp($line);                        # keep the newline out of the log and out of the fields
        next if ($line =~ /^\#/);            # comment line
        next if ($line !~ /:\Q$host\E/);     # not about this machine
        &log(9,$line);
        $domains{(split(/:/,$line))[1]}++;
    }
    close($hostfh);
    @domains = sort(keys(%domains));
    &log(9,join(',',@domains));
    if (! scalar(@domains)) {
        my $problem =
            "Unable to find any domains in the hosts.tab that match ".$host;
        &log(0,$problem);
        die ($problem);
    }
} else {
    push(@domains,$perlvar{'lonDefDomain'});
}
204:
#
# Walk every domain, and inside it every author that has a home directory
# on this server.
foreach my $dom (@domains) {
    &log(9,'domain = '.$dom);
    my $resdir = "$perlvar{'lonDocRoot'}/res/$dom";
    my @homeusers;
    if (opendir(my $resdh,$resdir)) {
        @homeusers =
            grep { &ishome("$resdir/$_") }
            grep { !/^\.\.?$/ }
            readdir($resdh);
        closedir($resdh);
    } else {
        # Previously a missing/unreadable directory was skipped silently.
        &log(0,'Unable to read resource directory '.$resdir.': '.$!);
    }
    &log(5,'users = '.$dom.':'.join(',',@homeusers));
    #
    # -only=user replaces whatever was found in the directory
    if ($oneuser) {
        @homeusers=($oneuser);
    }
    #
    # Loop through the users
    foreach my $user (@homeusers) {
        &log(0,"=== User: ".$user);
        &process_dynamic_metadata($user,$dom);
        #
        # Use File::Find to get the files we need to read/modify.
        # For debugging, 'wanted' can be pointed at \&print_filename or
        # \&log_metadata instead.
        find(
             {preprocess => \&only_meta_files,
              wanted     => \&process_meta_file,
              }, join('/',($perlvar{'lonDocRoot'},'res',$dom,$user)) );
    }
}
#
# Put the freshly built table in place of the old one.
if (! $simulate) {
    $dbh->do('DROP TABLE IF EXISTS '.$oldname);
    if (! $dbh->do('RENAME TABLE '.$newname.' TO '.$oldname)) {
        my $rename_error = $dbh->errstr;
        &log(0,"MySQL Error Rename: ".$rename_error);
        die $rename_error;
    } else {
        &log(1,"MySQL table rename successful.");
    }
} else {
    # Nothing is stored while simulating, so the per-process table is empty;
    # drop it rather than leaving one stray table behind per simulated run.
    $dbh->do('DROP TABLE IF EXISTS '.$newname);
}
if (! $dbh->disconnect) {
    my $disconnect_error = $dbh->errstr;
    &log(0,"MySQL Error Disconnect: ".$disconnect_error);
    die $disconnect_error;
}
##
## Finished!
&log(0,"==== Searchcat completed ".localtime()." ====");
close(LOG);

#
# Publish the per-extension and per-copyright totals gathered during the run
&write_type_count();
&write_copyright_count();

exit 0;
259:
260: ##
261: ## Status logging routine. Inputs: $level, $message
262: ##
263: ## $level 0 should be used for normal output and error messages
264: ##
265: ## $message does not need to end with \n. In the case of errors
266: ## the message should contain as much information as possible to
267: ## help in diagnosing the problem.
268: ##
sub log {
    # Append $message, followed by the output record separator, to the LOG
    # filehandle when the file-level $verbose setting is at least $level.
    # An undefined $level is treated as 0.
    my ($level,$message)=@_;
    $level = 0 unless defined($level);
    return if ($verbose < $level);
    print LOG $message.$/;
}
276:
277: ########################################################
278: ########################################################
279: ### ###
280: ### File::Find support routines ###
281: ### ###
282: ########################################################
283: ########################################################
284: ##
285: ## &only_meta_files
286: ##
287: ## Called by File::Find.
288: ## Takes a list of files/directories in and returns a list of files/directories
289: ## to search.
sub only_meta_files {
    # File::Find 'preprocess' filter: from the names handed in, keep
    #   * names ending in .meta that do not look like the metadata of a
    #     prior version (NAME.<digits>.<ext>.meta), and
    #   * directories (so the search can descend into them).
    return grep {
        ( /\.meta$/ && !/\.\d+\.[^\.]+\.meta$/ ) || (-d $_)
    } @_;
}
303:
304: ##
305: ##
306: ## Debugging routines, use these for 'wanted' in the File::Find call
307: ##
sub print_filename {
    # Debugging replacement for 'wanted' in the File::Find call: when $debug
    # is set, logs (level 5) whether the current entry is a directory or a
    # file, using the full name File::Find provides.  $_ is put back to the
    # value it had on entry.
    my ($file) = $_;
    my $fullfilename = $File::Find::name;
    if ($debug) {
        my $kind = (-d $file) ? 'directory' : 'file';
        &log(5," Got ".$kind." ".$fullfilename);
    }
    $_=$file;
}
320:
sub log_metadata {
    # Debugging replacement for 'wanted' in the File::Find call: when $debug
    # is set, logs (level 6) every key/value pair &metadata returns for the
    # current file and counts its copyright.  Directories are ignored.
    my ($file) = $_;
    my $fullfilename = $File::Find::name;
    return if (-d $fullfilename); # nothing to report for directories
    if ($debug) {
        &log(6,$fullfilename);
        my $ref=&metadata($fullfilename);
        unless (defined($ref)) {
            &log(6," No data");
            return;
        }
        foreach my $key (keys(%$ref)) {
            &log(6," ".$key." => ".$ref->{$key});
        }
        &count_copyright($ref->{'copyright'});
    }
    $_=$file;
}
339:
340: ##
341: ## process_meta_file
342: ## Called by File::Find.
343: ## Only input is the filename in $_.
sub process_meta_file {
    # 'wanted' callback for File::Find; the only input is the file name in
    # $_ (and $File::Find::name).  Reads the .meta file, merges in the
    # dynamic metadata and, unless simulating, stores one record in the new
    # table.  Obsolete and private resources are not stored.
    my ($file) = $_;
    my $filename = $File::Find::name; # full filename
    return if (-d $filename);         # directories carry no metadata
    #
    &log(3,$filename) if ($debug);
    #
    my $ref=&metadata($filename);
    #
    # $url names the resource itself, not its metadata file
    (my $target = $filename) =~ s/\.meta$//;
    my $url='/res/'.&declutter($target);
    &log(3," ".$url) if ($debug);
    #
    # Some resources are skipped because of what their metadata says
    if ($ref->{'obsolete'}) {
        &log(3,"obsolete") if ($debug);
        return;
    }
    &count_copyright($ref->{'copyright'});
    if ($ref->{'copyright'} eq 'private') {
        &log(3,"private") if ($debug);
        return;
    }
    #
    # Dynamic metadata; a .../default entry stands for its directory and
    # gets none
    my %dyn;
    if ($url =~ s:/default$:/:) {
        &log(3,"Skipping dynamic data") if ($debug);
    } else {
        &log(3,"Retrieving dynamic data") if ($debug);
        %dyn=&get_dynamic_metadata($url);
        &count_type($url);
    }
    #
    # Missing or blank dates fall back to the modification time of the
    # resource; both dates are then converted to the MySQL format.
    foreach my $datefield ('creationdate','lastrevisiondate') {
        if (! defined($ref->{$datefield}) ||
            $ref->{$datefield} =~ /^\s*$/) {
            $ref->{$datefield} = (stat($target))[9];
        }
        $ref->{$datefield} = &sqltime($ref->{$datefield});
    }
    #
    # Later entries win: dynamic data overrides the .meta file, and url and
    # version override both.
    my %Data = (
                %$ref,
                %dyn,
                'url'=>$url,
                'version'=>'current');
    unless ($simulate) {
        my ($count,$err) =
            &LONCAPA::lonmetadata::store_metadata($dbh,$newname,\%Data);
        &log(0,"MySQL Error Insert: ".$err) if ($err);
        if ($count < 1) {
            &log(0,"Unable to insert record into MySQL database for $url");
        }
    }
    #
    # Reset $_ before leaving
    $_ = $file;
}
410:
411: ########################################################
412: ########################################################
413: ### ###
414: ### &metadata($uri) ###
415: ### Retrieve metadata for the given file ###
416: ### ###
417: ########################################################
418: ########################################################
sub metadata {
    # Parse the .meta file belonging to $uri and return a reference to a
    # hash of what was found, or undef when the file does not exist.
    #
    # For every start tag the hash gets
    #   'keys'           comma separated list of all entry keys seen
    #   "$key.$attr"     each attribute of the tag
    #   $key             the text up to the closing tag, or the tag's
    #                    'default' attribute when that text is false
    # where $key is the tag name, extended by _<part> and _<name> when the
    # tag carries those attributes.
    my ($uri)=@_;
    my %metacache=();
    my $filename=&declutter($uri);
    $filename.='.meta' if ($filename !~ /\.meta$/);
    my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
    return undef if (! defined($metastring));
    my $parser=HTML::TokeParser->new(\$metastring);
    while (my $token=$parser->get_token) {
        next if ($token->[0] ne 'S');    # only start tags are of interest
        my ($entry,$attr,$attrseq) = @{$token}[1,2,3];
        my $unikey=$entry;
        foreach my $qualifier ('part','name') {
            if (defined($attr->{$qualifier})) {
                $unikey.='_'.$attr->{$qualifier};
            }
        }
        if ($metacache{'keys'}) {
            $metacache{'keys'}.=','.$unikey;
        } else {
            $metacache{'keys'}=$unikey;
        }
        foreach my $attrname (@{$attrseq}) {
            $metacache{$unikey.'.'.$attrname}=$attr->{$attrname};
        }
        my $text=$parser->get_text('/'.$entry);
        $metacache{$unikey} =
            $text ? $text : $metacache{$unikey.'.default'};
    }
    return \%metacache;
}
459:
460: ##
461: ## &getfile($filename)
462: ## Slurps up an entire file into a scalar.
463: ## Returns undef if the file does not exist
sub getfile {
    # Slurp an entire file into a scalar.
    #
    # Input:   $file - name of the file to read
    # Returns: the file contents ('' for an empty file), or undef when the
    #          file does not exist or cannot be opened.
    #
    # The file is read in one go with a localized $/, so the caller's $_ is
    # left alone; this matters because the routine is reached from
    # File::Find callbacks that depend on $_.
    my $file = shift();
    if (! -e $file ) {
        return undef;
    }
    # Explicit read mode: the name is never interpreted as a pipe or mode.
    my $fh = IO::File->new($file,'r');
    if (! defined($fh)) {
        return undef;
    }
    my $contents = do { local $/; <$fh> };
    $fh->close();
    return defined($contents) ? $contents : '';
}
476:
477: ########################################################
478: ########################################################
479: ### ###
480: ### Dynamic Metadata ###
481: ### ###
482: ########################################################
483: ########################################################
484: ##
485: ## Dynamic metadata description (incomplete)
486: ##
487: ## For a full description of all fields,
488: ## see LONCAPA::lonmetadata
489: ##
490: ## Field Type
491: ##-----------------------------------------------------------
492: ## count integer
493: ## course integer
494: ## course_list comma separated list of course ids
495: ## avetries real
496: ## avetries_list comma separated list of real numbers
497: ## stdno real
498: ## stdno_list comma separated list of real numbers
499: ## usage integer
500: ## usage_list comma separated list of resources
501: ## goto scalar
502: ## goto_list comma separated list of resources
503: ## comefrom scalar
504: ## comefrom_list comma separated list of resources
505: ## difficulty real
506: ## difficulty_list comma separated list of real numbers
507: ## sequsage scalar
508: ## sequsage_list comma separated list of resources
509: ## clear real
510: ## technical real
511: ## correct real
512: ## helpful real
513: ## depth real
514: ## comments html of all the comments made
515: ##
{

# State shared by the two routines below; refilled for every author.
my %DynamicData;   # result of LONCAPA::lonmetadata::process_reseval_data
my %Counts;        # unescaped access-count key => access count

sub process_dynamic_metadata {
    # Load the dynamic metadata and the access counts of one author into
    # %DynamicData and %Counts, replacing whatever was there before.
    #
    # Inputs:  $user, $dom - the author and the author's domain
    # Returns: 1 when both nohist_resevaldata.db and nohist_accesscount.db
    #          could be read, 0 otherwise.
    #
    # The two databases are read independently, so a missing evaluation
    # database no longer prevents the access counts from being loaded.
    my ($user,$dom) = @_;
    %DynamicData = ();
    %Counts = ();
    #
    my $prodir = &propath($dom,$user);
    my $complete = 1;
    #
    # Read in the dynamic metadata
    my %evaldata;
    if (tie(%evaldata,'GDBM_File',
            $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
        %DynamicData =
            &LONCAPA::lonmetadata::process_reseval_data(\%evaldata);
        untie(%evaldata);
        # NOTE(review): as before, 'domain' is only recorded when the
        # evaluation database exists - confirm whether authors without one
        # should get it too.
        $DynamicData{'domain'} = $dom;
        # This used to be a bare print to STDOUT, which is noise when the
        # script runs from cron; it now goes to the log.
        &log(1,'user = '.$user.' domain = '.$dom);
    } else {
        $complete = 0;
    }
    #
    # Read in the access count data
    &log(7,'Reading access count data') if ($debug);
    my %countdata;
    if (tie(%countdata,'GDBM_File',
            $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
        while (my ($key,$count) = each(%countdata)) {
            # Only keys that start with the domain are kept.  \Q..\E keeps
            # characters in the domain from acting as regex syntax.
            next if ($key !~ /^\Q$dom\E/);
            $key = &unescape($key);
            &log(8,' Count '.$key.' = '.$count) if ($debug);
            $Counts{$key}=$count;
        }
        untie(%countdata);
    } else {
        $complete = 0;
    }
    if ($debug) {
        &log(7,scalar(keys(%Counts)).
             " Counts read for ".$user."@".$dom);
        &log(7,scalar(keys(%DynamicData)).
             " Dynamic metadata read for ".$user."@".$dom);
    }
    #
    return $complete;
}

sub get_dynamic_metadata {
    # Return (as a hash in list form) the dynamic metadata for $url, built
    # by LONCAPA::lonmetadata::process_dynamic_metadata from %DynamicData,
    # with 'count' set to the access count stored for the url.  A leading
    # /res/ is removed from $url before the lookups.
    my ($url) = @_;
    $url =~ s:^/res/::;
    my %data = &LONCAPA::lonmetadata::process_dynamic_metadata($url,
                                                               \%DynamicData);
    # find the count
    $data{'count'} = $Counts{$url};
    #
    # Log the dynamic metadata
    if ($debug) {
        while (my($k,$v)=each(%data)) {
            &log(8," ".$k." => ".$v);
        }
    }
    return %data;
}

} # End of %DynamicData and %Counts scope
582:
583: ########################################################
584: ########################################################
585: ### ###
586: ### Counts ###
587: ### ###
588: ########################################################
589: ########################################################
{

my %countext;   # lower-cased file extension => number of resources seen

sub count_type {
    # Count one resource under its lower-cased file extension.  A name
    # without an extension is counted under the empty string.
    my $file=shift;
    # Take the capture from the match itself: $1 would still hold the
    # value of some earlier match whenever this one fails.
    my ($ext) = ($file =~ /\.(\w+)$/);
    $ext = '' if (! defined($ext));
    $countext{lc($ext)}++;
}

sub write_type_count {
    # Write the extension counts as "ext=count&...&time=<epoch>\n".
    #
    # Input:   $outfile - optional; defaults to the lon-status rescount.txt
    # Returns: 1 on success, 0 when the file could not be written.
    my ($outfile) = @_;
    if (! defined($outfile)) {
        $outfile = '/home/httpd/html/lon-status/rescount.txt';
    }
    my $rescount;
    if (! open($rescount,'>',$outfile)) {
        warn "searchcat.pl: unable to write $outfile: $!\n";
        return 0;
    }
    # Sorted so the file content is stable from run to run.
    foreach my $extension (sort(keys(%countext))) {
        print $rescount $extension.'='.$countext{$extension}.'&';
    }
    print $rescount 'time='.time."\n";
    return close($rescount) ? 1 : 0;
}

} # end of scope for %countext
611:
{

my %copyrights;   # copyright value => number of resources seen

sub count_copyright {
    # Count one resource under its copyright value.  An undefined value is
    # counted under the empty string.
    my ($copyright) = @_;      # was @_[0], a one-element slice
    $copyright = '' if (! defined($copyright));
    $copyrights{$copyright}++;
}

sub write_copyright_count {
    # Write the copyright counts as "value=count&...&time=<epoch>\n".
    #
    # Input:   $outfile - optional; defaults to the lon-status
    #          copyrightcount.txt
    # Returns: 1 on success, 0 when the file could not be written.
    my ($outfile) = @_;
    if (! defined($outfile)) {
        $outfile = '/home/httpd/html/lon-status/copyrightcount.txt';
    }
    my $copycount;
    if (! open($copycount,'>',$outfile)) {
        warn "searchcat.pl: unable to write $outfile: $!\n";
        return 0;
    }
    # Sorted so the file content is stable from run to run.
    foreach my $copyright (sort(keys(%copyrights))) {
        print $copycount $copyright.'='.$copyrights{$copyright}.'&';
    }
    print $copycount 'time='.time."\n";
    return close($copycount) ? 1 : 0;
}

} # end of scope for %copyrights
630:
631: ########################################################
632: ########################################################
633: ### ###
634: ### Miscellaneous Utility Routines ###
635: ### ###
636: ########################################################
637: ########################################################
638: ##
639: ## &ishome($path)
640: ## $path is a resource path of the form .../res/DOMAIN/USER[/...]. Returns 1 if the home directory of USER in DOMAIN exists on this server, 0 otherwise
641: ## (copied from lond, modification of the return value)
sub ishome {
    # Input:   a resource path of the form <docroot>/res/DOMAIN/USER[/...]
    # Returns: 1 if the directory &propath gives for DOMAIN and USER exists,
    #          0 otherwise.
    #
    # The document root is taken from the configuration, because that is
    # what the caller builds the path from; the historical literal
    # /home/httpd/html is still recognised as a fallback.
    my $author=shift;
    my $docroot=$perlvar{'lonDocRoot'};
    my $reduced=0;
    if (defined($docroot) && $docroot ne '') {
        $reduced =
            ($author=~s/\Q$docroot\E\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/);
    }
    if (! $reduced) {
        $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
    }
    my ($udom,$uname)=split(/\//,$author);
    my $proname=propath($udom,$uname);
    return (-e $proname) ? 1 : 0;
}
653:
654: ##
655: ## &propath($udom,$uname)
656: ## Returns the path to the users LON-CAPA directory
657: ## (copied from lond)
sub propath {
    # Build the path of a user's LON-CAPA directory:
    #   <lonUsersDir>/<domain>/<c1>/<c2>/<c3>/<user>
    # where c1..c3 are the first three characters of the user name, padded
    # with underscores.  Non-word characters are removed from both the
    # domain and the user name first.
    my ($udom,$uname)=@_;
    foreach my $component ($udom,$uname) {
        $component=~s/\W//g;    # $component aliases the lexical copies
    }
    my $padded=$uname.'__';
    my $subdir=$padded;
    if (length($padded) >= 3) {
        $subdir=join('/',split(//,substr($padded,0,3)));
    }
    return join('/',$perlvar{'lonUsersDir'},$udom,$subdir,$uname);
}
667:
668: ##
669: ## &sqltime($timestamp)
670: ##
671: ## Convert perl $timestamp to MySQL time. MySQL expects YYYY-MM-DD HH:MM:SS
672: ##
sub sqltime {
    # Convert a time to the MySQL format YYYY-MM-DD HH:MM:SS.
    #
    # Input:   $time - either a string that already contains
    #          "Y-M-D H:M:S" (it is zero padded and passed back) or a
    #          number of seconds since the epoch (converted using GMT)
    # Returns: the formatted time, or 0 when $time is undefined, blank or
    #          not understood.  Only the last case is logged.
    #
    # The blank/undefined test comes first.  It used to be "$time == 0",
    # placed after the pattern matches; that is true for any non-numeric
    # text, so unparsable values were never reported.
    my ($time) = @_;
    if (! defined($time) || $time =~ /^\s*$/) {
        return 0;
    }
    if ($time =~
        /(\d+)-(\d+)-(\d+) # YYYY-MM-DD
        \s                 # a space
        (\d+):(\d+):(\d+)  # HH:MM:SS
        /x ) {
        # Some of the .meta files have the time in mysql
        # format already, so just make sure they are 0 padded and
        # pass them back.
        return sprintf('%04d-%02d-%02d %02d:%02d:%02d',
                       $1,$2,$3,$4,$5,$6);
    }
    if ($time =~ /^\d+$/) {
        my @TimeData = gmtime($time);
        # Alter the month to be 1-12 instead of 0-11
        $TimeData[4]++;
        # Alter the year to be from 0 instead of from 1900
        $TimeData[5]+=1900;
        return sprintf('%04d-%02d-%02d %02d:%02d:%02d',
                       @TimeData[5,4,3,2,1,0]);
    }
    &log(0," sqltime:Unable to decode time ".$time);
    return 0;
}
702:
703: ##
704: ## &declutter($filename)
705: ## Given a filename, returns a url for the filename.
sub declutter {
    # Turn a filename into the url part that follows /res/: a leading
    # document root, a leading slash and a leading "res/" are removed, in
    # that order.
    my $thisfn=shift;
    # \Q..\E: the document root is a literal path, not a pattern, so any
    # '.', '+' and the like in it must not act as regex syntax.
    $thisfn=~s/^\Q$perlvar{'lonDocRoot'}\E//;
    $thisfn=~s/^\///;
    $thisfn=~s/^res\///;
    return $thisfn;
}
713:
714: ##
715: ## Escape / Unescape special characters
sub unescape {
    # Replace every %XX (two hexadecimal digits, either case) in the
    # argument by the character with that code and return the result.
    my ($encoded) = @_;
    $encoded =~ s{%([a-fA-F0-9]{2})}{pack('C',hex($1))}ge;
    return $encoded;
}
721:
sub escape {
    # Replace every non-word character in the argument by % followed by
    # two lower-case hexadecimal digits and return the result.
    my ($plain) = @_;
    $plain =~ s{(\W)}{'%'.unpack('H2',$1)}ge;
    return $plain;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>