--- loncom/loncnew 2003/07/29 02:33:05 1.16 +++ loncom/loncnew 2003/09/02 10:34:47 1.22 @@ -2,13 +2,12 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.16 2003/07/29 02:33:05 foxr Exp $ +# $Id: loncnew,v 1.22 2003/09/02 10:34:47 foxr Exp $ # # Copyright Michigan State University Board of Trustees # # This file is part of the LearningOnline Network with CAPA (LON-CAPA). -# -# LON-CAPA is free software; you can redistribute it and/or modify +## LON-CAPA is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. @@ -46,6 +45,40 @@ # Change log: # $Log: loncnew,v $ +# Revision 1.22 2003/09/02 10:34:47 foxr +# - Fix errors in host dead detection logic (too many cases where the +# retries left were not getting incremented or just not checked). +# - Added some additional status to the ps axuww display: +# o Remaining retries on a host. +# o >>> DEAD <<< indicator if I've given up on a host. +# - Tested the SIGHUP will reset the retries remaining count (thanks to +# the above status stuff, and get allow the loncnew to re-try again +# on the host (thanks to the log). +# +# Revision 1.21 2003/08/26 09:19:51 foxr +# How embarrassing... put in the SocketTimeout function in loncnew and forgot +# to actually hook it into the LondTransaction. Added this to MakeLondConnection +# where it belongs... hopefully transactions (not just connection attempts) will +# timeout more speedily than the socket errors will catch it. +# +# Revision 1.20 2003/08/25 18:48:11 albertel +# - fixing a forgotten ; +# +# Revision 1.19 2003/08/19 09:31:46 foxr +# Get socket directory from configuration rather than the old hard coded test +# way that I forgot to un-hard code. 
+# +# Revision 1.18 2003/08/06 09:52:29 foxr +# Also needed to remember to fail in-flight transactions if their sends fail. +# +# Revision 1.17 2003/08/03 00:44:31 foxr +# 1. Correct handling of connection failure: Assume it means the host is +# unreachable and fail all of the queued transactions. Note that the +# inflight transactions should fail on their own time due either to timeout +# or send/receive failures. +# 2. Correct handling of logs for forced death signals. Pull the signal +# from the event watcher. +# # Revision 1.16 2003/07/29 02:33:05 foxr # Add SIGINT processing to child processes to toggle annoying trace mode # on/off.. will try to use this to isolate the compute boud process issue. @@ -67,7 +100,7 @@ # Revision 1.10 2003/06/24 02:46:04 foxr # Put a limit on the number of times we'll retry a connection. # Start getting the signal stuff put in as well...note that need to get signals -# going or else 6the client will permanently give up on dead servers. +# going or else the client will permanently give up on dead servers. # # Revision 1.9 2003/06/13 02:38:43 foxr # Add logging in 'expected format' @@ -136,7 +169,7 @@ my $IdleTimeout= 3600; # Wait an hour b # The variables below are only used by the child processes. # my $RemoteHost; # Name of host child is talking to. -my $UnixSocketDir= "/home/httpd/sockets"; +my $UnixSocketDir= $perlvar{'lonSockDir'}; my $IdleConnections = Stack->new(); # Set of idle connections my %ActiveConnections; # Connections to the remote lond. my %ActiveTransactions; # LondTransactions in flight. @@ -306,7 +339,9 @@ sub ShowStatus { sub SocketTimeout { my $Socket = shift; - KillSocket($Socket); + KillSocket($Socket); # A transaction timeout also counts as + # a connection failure: + $ConnectionRetriesLeft--; } =pod @@ -320,8 +355,12 @@ Invoked each timer tick. sub Tick { my $client; - ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount); - + if($ConnectionRetriesLeft > 0) { + ShowStatus(GetServerHost()." 
Connection count: ".$ConnectionCount + ." Retries remaining: ".$ConnectionRetriesLeft); + } else { + ShowStatus(GetServerHost()." >> DEAD <<"); + } # Is it time to prune connection count: @@ -352,10 +391,16 @@ sub Tick { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; Debug(1,"Work but no connections, start ".$Connections." of them"); + my $successCount = 0; for ($i =0; $i < $Connections; $i++) { - MakeLondConnection(); + $successCount += MakeLondConnection(); + } + if($successCount == 0) { # All connections failed: + Debug(1,"Work in queue failed to make any connectiouns\n"); + EmptyQueue(); # Fail pending transactions with con_lost. } } else { + ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); Debug(1,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. } @@ -602,12 +647,16 @@ Parameters: sub FailTransaction { my $transaction = shift; + Log("WARNING", "Failing transaction ".$transaction->getRequest()); Debug(1, "Failing transaction: ".$transaction->getRequest()); if (!$transaction->isDeferred()) { # If the transaction is deferred we'll get to it. my $client = $transaction->getClient(); Debug(1," Replying con_lost to ".$transaction->getRequest()); StartClientReply($transaction, "con_lost\n"); } + if($ConnectionRetriesLeft <= 0) { + Log("CRITICAL", "Host marked dead: ".GetServerHost()); + } } @@ -619,6 +668,7 @@ sub FailTransaction { =cut sub EmptyQueue { + $ConnectionRetriesLeft--; # Counts as connection failure too. while($WorkQueue->Count()) { my $request = $WorkQueue->dequeue(); # This is a transaction FailTransaction($request); @@ -664,6 +714,7 @@ nonzero if we are allowed to create a ne sub KillSocket { my $Socket = shift; + Log("WARNING", "Shutting down a socket"); $Socket->Shutdown(); # If the socket came from the active connection set, @@ -684,7 +735,7 @@ sub KillSocket { # work queue, the work all gets failed with con_lost. 
# if($ConnectionCount == 0) { - EmptyQueue; + EmptyQueue(); } } @@ -760,18 +811,21 @@ sub LondReadable { SocketDump(6, $Socket); my $status = $Socket->Readable(); + &Debug(2, "Socket->Readable returned: $status"); if($status != 0) { # bad return from socket read. Currently this means that # The socket has become disconnected. We fail the transaction. + Log("WARNING", + "Lond connection lost."); if(exists($ActiveTransactions{$Socket})) { - Debug(3,"Lond connection lost failing transaction"); FailTransaction($ActiveTransactions{$Socket}); } $Watcher->cancel(); KillSocket($Socket); + $ConnectionRetriesLeft--; # Counts as connection failure return; } SocketDump(6,$Socket); @@ -805,6 +859,10 @@ sub LondReadable { } elsif ($State eq "Idle") { # If necessary, complete a transaction and then go into the # idle queue. + # Note that a transition to idle indicates a live lond + # on the other end so reset the connection retries. + # + $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count $Watcher->cancel(); if(exists($ActiveTransactions{$Socket})) { Debug(8,"Completing transaction!!"); @@ -917,6 +975,7 @@ sub LondWritable { # We'll treat this as if the socket got disconnected: Log("WARNING", "Connection to ".$RemoteHost. " has been disconnected"); + FailTransaction($ActiveTransactions{$Socket}); $Watcher->cancel(); KillSocket($Socket); return; @@ -1059,7 +1118,7 @@ sub MakeLondConnection { $ConnectionRetriesLeft--; return 0; # Failure. 
} else { - $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count + # The connection needs to have writability # monitored in order to send the init sequence # that starts the whole authentication/key @@ -1072,7 +1131,8 @@ sub MakeLondConnection { &Debug(9,"MakeLondConnection got socket: ".$Socket); } - + $Connection->SetTimeoutCallback(\&SocketTimeout); + $event = Event->io(fd => $Socket, poll => 'w', cb => \&LondWritable, @@ -1171,8 +1231,15 @@ sub QueueTransaction { Debug(8,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { - Debug(4,"Starting additional lond connection"); - MakeLondConnection(); + if($ConnectionRetriesLeft > 0) { + Debug(4,"Starting additional lond connection"); + if(MakeLondConnection() == 0) { + EmptyQueue(); # Fail transactions, can't make connection. + } + } else { + ShowStatus(GetServerHost()." >>> DEAD !!!! <<<"); + EmptyQueue(); # It's worse than that ... he's dead Jim. + } } } else { # Can start the request: Debug(8,"Can start..."); @@ -1337,6 +1404,8 @@ sub SetupLoncListener { Child USR1 signal handler to report the most recent status into the status file. +We also use this to reset the retries count in order to allow the +client to retry connections with a previously dead server. =cut sub ChildStatus { my $event = shift; @@ -1347,6 +1416,7 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + $ConnectionRetriesLeft = $ConnectionRetries; } =pod @@ -1363,13 +1433,12 @@ sub SignalledToDeath { my $watcher= $event->w; Debug(2,"Signalled to death! via ".$watcher->data); - my ($signal) = @_; + my ($signal) = $watcher->data; chomp($signal); Log("CRITICAL", "Abnormal exit. 
Child $$ for $RemoteHost " ."died through "."\"$signal\""); LogPerm("F:lonc: $$ on $RemoteHost signalled to death: " ."\"$signal\""); - die("Signal abnormal end"); exit 0; } @@ -1445,6 +1514,7 @@ sub CreateChild { Log("CRITICAL", "Forking server for ".$host); $pid = fork; if($pid) { # Parent + $RemoteHost = "Parent"; $ChildHash{$pid} = $RemoteHost; sigprocmask(SIG_UNBLOCK, $sigset); @@ -1492,6 +1562,8 @@ open (PIDSAVE, ">$execdir/logs/lonc.pid" print PIDSAVE "$$\n"; close(PIDSAVE); + + if (POSIX::setsid() < 0) { print "Could not create new session\n"; exit -1; @@ -1586,7 +1658,7 @@ sub Restart { =head1 KillThemAll Signal handler that kills all children by sending them a -SIGINT. Responds to sigint and sigterm. +SIGQUIT. Responds to sigint and sigterm. =cut @@ -1598,13 +1670,12 @@ sub KillThemAll { Debug(2, "Killing lonc for $serving pid = $pid"); ShowStatus("Killing lonc for $serving pid = $pid"); Log("CRITICAL", "Killing lonc for $serving pid = $pid"); - kill('INT', $pid); - delete($ChildeHash{$pid}); + kill 'QUIT' => $pid; + delete($ChildHash{$pid}); } my $execdir = $perlvar{'lonDaemons'}; unlink("$execdir/logs/lonc.pid"); - ShowStatus("Killing the master process"); - Log("CRITICAL", "Killing the master process."); + } =pod @@ -1617,7 +1688,8 @@ Terminate the system. sub Terminate { KillThemAll; - exit; + Log("CRITICAL","Master process exiting"); + exit 0; } =pod