--- loncom/loncnew 2003/08/26 09:19:51 1.21 +++ loncom/loncnew 2003/09/02 10:34:47 1.22 @@ -2,7 +2,7 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.21 2003/08/26 09:19:51 foxr Exp $ +# $Id: loncnew,v 1.22 2003/09/02 10:34:47 foxr Exp $ # # Copyright Michigan State University Board of Trustees # @@ -45,6 +45,16 @@ # Change log: # $Log: loncnew,v $ +# Revision 1.22 2003/09/02 10:34:47 foxr +# - Fix errors in host dead detection logic (too many cases where the +# retries left were not getting incremented or just not checked). +# - Added some additional status to the ps axuww display: +# o Remaining retries on a host. +# o >>> DEAD <<< indicator if I've given up on a host. +# - Tested the SIGHUP will reset the retries remaining count (thanks to +# the above status stuff, and get allow the loncnew to re-try again +# on the host (thanks to the log). +# # Revision 1.21 2003/08/26 09:19:51 foxr # How embarrassing... put in the SocketTimeout function in loncnew and forgot # to actually hook it into the LondTransaction. Added this to MakeLondConnection @@ -90,7 +100,7 @@ # Revision 1.10 2003/06/24 02:46:04 foxr # Put a limit on the number of times we'll retry a connection. # Start getting the signal stuff put in as well...note that need to get signals -# going or else 6the client will permanently give up on dead servers. +# going or else the client will permanently give up on dead servers. # # Revision 1.9 2003/06/13 02:38:43 foxr # Add logging in 'expected format' @@ -329,7 +339,9 @@ sub ShowStatus { sub SocketTimeout { my $Socket = shift; - KillSocket($Socket); + KillSocket($Socket); # A transaction timeout also counts as + # a connection failure: + $ConnectionRetriesLeft--; } =pod @@ -343,8 +355,12 @@ Invoked each timer tick. sub Tick { my $client; - ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount); - + if($ConnectionRetriesLeft > 0) { + ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount + ." Retries remaining: ".$ConnectionRetriesLeft); + } else { + ShowStatus(GetServerHost()." >> DEAD <<"); + } # Is it time to prune connection count: @@ -375,10 +391,16 @@ sub Tick { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; Debug(1,"Work but no connections, start ".$Connections." of them"); + my $successCount = 0; for ($i =0; $i < $Connections; $i++) { - MakeLondConnection(); + $successCount += MakeLondConnection(); + } + if($successCount == 0) { # All connections failed: + Debug(1,"Work in queue failed to make any connectiouns\n"); + EmptyQueue(); # Fail pending transactions with con_lost. } } else { + ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); Debug(1,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. } @@ -632,6 +654,9 @@ sub FailTransaction { Debug(1," Replying con_lost to ".$transaction->getRequest()); StartClientReply($transaction, "con_lost\n"); } + if($ConnectionRetriesLeft <= 0) { + Log("CRITICAL", "Host marked dead: ".GetServerHost()); + } } @@ -643,6 +668,7 @@ sub FailTransaction { =cut sub EmptyQueue { + $ConnectionRetriesLeft--; # Counts as connection failure too. while($WorkQueue->Count()) { my $request = $WorkQueue->dequeue(); # This is a transaction FailTransaction($request); @@ -709,7 +735,7 @@ sub KillSocket { # work queue, the work all gets failed with con_lost. # if($ConnectionCount == 0) { - EmptyQueue; + EmptyQueue(); } } @@ -799,6 +825,7 @@ sub LondReadable { } $Watcher->cancel(); KillSocket($Socket); + $ConnectionRetriesLeft--; # Counts as connection failure return; } SocketDump(6,$Socket); @@ -832,6 +859,10 @@ sub LondReadable { } elsif ($State eq "Idle") { # If necessary, complete a transaction and then go into the # idle queue. + # Note that a trasition to idle indicates a live lond + # on the other end so reset the connection retries. + # + $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count $Watcher->cancel(); if(exists($ActiveTransactions{$Socket})) { Debug(8,"Completing transaction!!"); @@ -1087,7 +1118,7 @@ sub MakeLondConnection { $ConnectionRetriesLeft--; return 0; # Failure. } else { - $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count + # The connection needs to have writability # monitored in order to send the init sequence # that starts the whole authentication/key @@ -1200,9 +1231,14 @@ sub QueueTransaction { Debug(8,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { - Debug(4,"Starting additional lond connection"); - if(MakeLondConnection() == 0) { - EmptyQueue(); # Fail transactions, can't make connection. + if($ConnectionRetriesLeft > 0) { + Debug(4,"Starting additional lond connection"); + if(MakeLondConnection() == 0) { + EmptyQueue(); # Fail transactions, can't make connection. + } + } else { + ShowStatus(GetServerHost()." >>> DEAD !!!! <<<"); + EmptyQueue(); # It's worse than that ... he's dead Jim. } } } else { # Can start the request: @@ -1368,6 +1404,8 @@ sub SetupLoncListener { Child USR1 signal handler to report the most recent status into the status file. +We also use this to reset the retries count in order to allow the +client to retry connections with a previously dead server. =cut sub ChildStatus { my $event = shift; @@ -1378,6 +1416,7 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + $ConnectionRetriesLeft = $ConnectionRetries; } =pod