--- loncom/loncron 2004/05/11 20:19:46 1.48 +++ loncom/loncron 2006/11/01 21:08:06 1.69 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.48 2004/05/11 20:19:46 albertel Exp $ +# $Id: loncron,v 1.69 2006/11/01 21:08:06 www Exp $ # # Copyright Michigan State University Board of Trustees # @@ -36,6 +36,7 @@ use LONCAPA::Configuration; use IO::File; use IO::Socket; use HTML::Entities; +use Getopt::Long; #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); @@ -44,13 +45,13 @@ my $statusdir="/home/httpd/html/lon-stat # -------------------------------------------------- Non-critical communication sub reply { - my ($cmd,$server)=@_; - my $peerfile="$perlvar{'lonSockDir'}/$server"; + my ($cmd,$server,$hostname)=@_; + my $peerfile="$perlvar{'lonSockDir'}/".$hostname->{$server}; my $client=IO::Socket::UNIX->new(Peer =>"$peerfile", Type => SOCK_STREAM, Timeout => 10) or return "con_lost"; - print $client "$cmd\n"; + print $client "sethost:$server:$cmd\n"; my $answer=<$client>; chomp($answer); if (!$answer) { $answer="con_lost"; } @@ -76,21 +77,28 @@ ENDERROUT } sub start_daemon { - my ($fh,$daemon,$pidfile) = @_; + my ($fh,$daemon,$pidfile,$args) = @_; my $progname=$daemon; - if ($daemon eq 'lonc' && $ARGV[0] eq 'new') { + if ($daemon eq 'lonc' && $args eq 'new') { $progname='loncnew'; print "new "; } - system("$perlvar{'lonDaemons'}/$progname 2>>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); - sleep 2; + my $error_fname="$perlvar{'lonDaemons'}/logs/${daemon}_errors"; + my $size=(stat($error_fname))[7]; + if ($size>40000) { + &log($fh,"

Rotating error logs ...

"); + rename("$error_fname.2","$error_fname.3"); + rename("$error_fname.1","$error_fname.2"); + rename("$error_fname","$error_fname.1"); + } + system("$perlvar{'lonDaemons'}/$progname 2>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); + sleep 1; if (-e $pidfile) { &log($fh,"

Seems like it started ...

"); my $lfh=IO::File->new("$pidfile"); my $daemonpid=<$lfh>; chomp($daemonpid); - sleep 2; - if (kill 0 => $daemonpid) { + if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) { return 1; } else { return 0; @@ -102,10 +110,11 @@ sub start_daemon { } sub checkon_daemon { - my ($fh,$daemon,$maxsize,$sendusr1)=@_; + my ($fh,$daemon,$maxsize,$send,$args)=@_; + my $result; &log($fh,'

'.$daemon.'

Log

'); - printf("%-10s ",$daemon); + printf("%-15s ",$daemon); if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){ open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|"); while (my $line=) { @@ -126,13 +135,20 @@ sub checkon_daemon { my $lfh=IO::File->new("$pidfile"); $daemonpid=<$lfh>; chomp($daemonpid); - if (kill 0 => $daemonpid) { + if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) { &log($fh,"

$daemon at pid $daemonpid responding"); - if ($sendusr1) { &log($fh,", sending USR1"); } + if ($send) { &log($fh,", sending $send"); } &log($fh,"

"); - if ($sendusr1) { kill USR1 => $daemonpid; } + if ($send eq 'USR1') { kill USR1 => $daemonpid; } + if ($send eq 'USR2') { kill USR2 => $daemonpid; } $restartflag=0; - print "running\n"; + if ($send eq 'USR2') { + $result = 'reloaded'; + print "reloaded\n"; + } else { + $result = 'running'; + print "running\n"; + } } else { $errors++; &log($fh,"

$daemon at pid $daemonpid not responding

"); @@ -143,28 +159,33 @@ sub checkon_daemon { if ($restartflag==1) { $simplestatus{$daemon}='off'; $errors++; + my $kadaemon=$daemon; + if ($kadaemon eq 'lonmemcached') { $kadaemon='memcached'; } &log($fh,'
Killall '.$daemon.': '. - `killall $daemon 2>&1`.' - '); - sleep 2; + `killall $kadaemon 2>&1`.' - '); + sleep 1; &log($fh,unlink($pidfile).' - '. - `killall -9 $daemon 2>&1`. + `killall -9 $kadaemon 2>&1`. '
'); &log($fh,"

$daemon not running, trying to start

"); - if (&start_daemon($fh,$daemon,$pidfile)) { + if (&start_daemon($fh,$daemon,$pidfile,$args)) { &log($fh,"

$daemon at pid $daemonpid responding

"); $simplestatus{$daemon}='restarted'; + $result = 'started'; print "started\n"; } else { $errors++; &log($fh,"

$daemon at pid $daemonpid not responding

"); &log($fh,"

Give it one more try ...

"); print " "; - if (&start_daemon($fh,$daemon,$pidfile)) { + if (&start_daemon($fh,$daemon,$pidfile,$args)) { &log($fh,"

$daemon at pid $daemonpid responding

"); $simplestatus{$daemon}='restarted'; + $result = 'started'; print "started\n"; } else { + $result = 'failed'; print " failed\n"; $simplestatus{$daemon}='failed'; $errors++; $errors++; @@ -201,6 +222,7 @@ sub checkon_daemon { } &errout($fh); + return $result; } # --------------------------------------------------------------------- Machine @@ -251,7 +273,7 @@ sub log_machine_info { &log($fh,"
");
     my $psproc=0;
 
-    open (PSH,"ps -aux --cols 140 |");
+    open (PSH,"ps aux --cols 140 |");
     while (my $line=<PSH>) { 
 	&log($fh,&encode_entities($line,'<>&"')); 
 	$psproc++;
@@ -262,6 +284,16 @@ sub log_machine_info {
     if ($psproc>200) { $notices++; }
     if ($psproc>250) { $notices++; }
 
+    &log($fh,"

distprobe

"); + &log($fh,"
");
+    open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
+    while (my $line=<DSH>) { 
+	&log($fh,&encode_entities($line,'<>&"')); 
+	$psproc++;
+    }
+    close(DSH);
+    &log($fh,"
"); + &errout($fh); } @@ -450,29 +482,36 @@ sub test_connections { &log($fh,'

Connections

'); print "testing connections\n"; &log($fh,""); + my ($good,$bad)=(0,0); foreach my $tryserver (sort(keys(%{$hostname}))) { print("."); my $result; - my $answer=reply("pong",$tryserver); + my $answer=&reply("ping",$tryserver,$hostname); if ($answer eq "$tryserver:$perlvar{'lonHostID'}") { $result="ok"; + $good++; } else { $result=$answer; $warnings++; - if ($answer eq 'con_lost') { $warnings++; } + if ($answer eq 'con_lost') { + $bad++; + $warnings++; + } else { + $good++; #self connection + } } if ($answer =~ /con_lost/) { print(" $tryserver down\n"); } &log($fh,"\n"); } &log($fh,"
$tryserver$result
"); - + print "\n$good good, $bad bad connections\n"; &errout($fh); } # ------------------------------------------------------------ Delayed messages sub check_delayed_msg { - my ($fh)=@_; + my ($fh,$hostname)=@_; &log($fh,'

Delayed Messages

'); print "checking buffers\n"; @@ -497,13 +536,22 @@ sub check_delayed_msg { if ($unsend) { $simplestatus{'unsend'}=$unsend; } &log($fh,"

Outgoing Buffer

\n
");
-
+# list directory with delayed messages and remember offline servers
+    my %servers=();
     open (DFH,"ls -lF $perlvar{'lonSockDir'}/delayed|");
-    while (my $line=<DFH>) { 
+    while (my $line=<DFH>) {
+        my ($server)=($line=~/\.(\w+)$/);
+        if ($server) { $servers{$server}=1; }
 	&log($fh,&encode_entities($line,'<>&"'));
     }
     &log($fh,"
\n"); close (DFH); +# pong to all servers that have delayed messages +# this will trigger a reverse connection, which should flush the buffers + foreach my $tryserver (keys %servers) { + my $answer=&reply("pong",$tryserver,$hostname); + &log($fh,"Pong to $tryserver: $answer
"); + } } sub finish_logging { @@ -538,15 +586,53 @@ sub log_simplestatus { sub send_mail { print "sending mail\n"; my $emailto="$perlvar{'lonAdmEMail'}"; - if ($totalcount>1000) { + if ($totalcount>2500) { $emailto.=",$perlvar{'lonSysEMail'}"; } my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices"; - system("metasend -b -t $emailto -s '$subj' -f $statusdir/index.html -m text/html"); + + my $result=system("metasend -b -S 4000000 -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >& /dev/null"); + if ($result != 0) { + $result=system("mail -s '$subj' $emailto < $statusdir/index.html"); + } +} + +sub usage { + print(< \$help, + "oldlonc" => \$oldlonc, + "justcheckdaemons" => \$justcheckdaemons, + "noemail" => \$noemail, + "justcheckconnections" => \$justcheckconnections, + "justreload" => \$justreload + ); + if ($help) { &usage(); return; } # --------------------------------- Read loncapa_apache.conf and loncapa.conf my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); %perlvar=%{$perlvarref}; @@ -584,9 +670,10 @@ sub main () { my (%hostname,%hostdom,%hostrole,%spareid); while (my $configline=<$config>) { - next if ($configline =~ /^(\#|\s*$)/); - my ($id,$domain,$role,$name,$ip,$domdescr)=split(/:/,$configline); - if ($id && $domain && $role && $name && $ip) { + next if ($configline =~ /^(\#|\s*\$)/); + my ($id,$domain,$role,$name)=split(/:/,$configline); + if ($id && $domain && $role && $name) { + $name=~s/\s//g; $hostname{$id}=$name; $hostdom{$id}=$domain; $hostrole{$id}=$role; @@ -611,25 +698,44 @@ sub main () { $warnings=0; $notices=0; - my $fh=&start_logging(\%hostdom,\%hostrole,\%hostname,\%spareid); - - &log_machine_info($fh); - &clean_tmp($fh); - &clean_lonIDs($fh); - &check_httpd_logs($fh); - &rotate_lonnet_logs($fh); - &checkon_daemon($fh,'lonsql',200000); - &checkon_daemon($fh,'lond',40000,1); - &checkon_daemon($fh,'lonc',40000,1); - &checkon_daemon($fh,'lonhttpd',40000); - - &test_connections($fh,\%hostname); - 
&check_delayed_msg($fh); - - &finish_logging($fh); - &log_simplestatus(); - if ($totalcount>200) { &send_mail(); } + my $fh; + if (!$justcheckdaemons && !$justcheckconnections && !$justreload) { + $fh=&start_logging(\%hostdom,\%hostrole,\%hostname,\%spareid); + + &log_machine_info($fh); + &clean_tmp($fh); + &clean_lonIDs($fh); + &check_httpd_logs($fh); + &rotate_lonnet_logs($fh); + } + if (!$justcheckconnections && !$justreload) { + &checkon_daemon($fh,'lonsql',200000); + if ( &checkon_daemon($fh,'lond',40000,'USR1') eq 'running') { + &checkon_daemon($fh,'lond',40000,'USR2'); + } + my $args='new'; + if ($oldlonc) { $args = ''; } + &checkon_daemon($fh,'lonc',40000,'USR1',$args); + &checkon_daemon($fh,'lonhttpd',40000); + &checkon_daemon($fh,'lonmemcached',40000); + } + if ($justreload) { + &checkon_daemon($fh,'lond',40000,'USR2'); + my $args='new'; + if ($oldlonc) { $args = ''; } + &checkon_daemon($fh,'lonc',40000,'USR2',$args); + } + if ($justcheckconnections) { + &test_connections($fh,\%hostname); + } + if (!$justcheckdaemons && !$justcheckconnections && !$justreload) { + &check_delayed_msg($fh,\%hostname); + &finish_logging($fh); + &log_simplestatus(); + + if ($totalcount>200 && !$noemail) { &send_mail(); } + } } &main(); 500 Internal Server Error

Internal Server Error

The server encountered an internal error or misconfiguration and was unable to complete your request.

Please contact the server administrator at root@localhost to inform them of the time this error occurred, and the actions you performed just before this error.

More information about this error may be available in the server error log.