--- loncom/loncnew 2003/08/06 09:52:29 1.18 +++ loncom/loncnew 2003/09/15 09:24:49 1.23 @@ -2,7 +2,7 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.18 2003/08/06 09:52:29 foxr Exp $ +# $Id: loncnew,v 1.23 2003/09/15 09:24:49 foxr Exp $ # # Copyright Michigan State University Board of Trustees # @@ -45,6 +45,32 @@ # Change log: # $Log: loncnew,v $ +# Revision 1.23 2003/09/15 09:24:49 foxr +# Add use strict and fix all the fallout from that. +# +# Revision 1.22 2003/09/02 10:34:47 foxr +# - Fix errors in host dead detection logic (too many cases where the +# retries left were not getting incremented or just not checked). +# - Added some additional status to the ps axuww display: +# o Remaining retries on a host. +# o >>> DEAD <<< indicator if I've given up on a host. +# - Tested the SIGHUP will reset the retries remaining count (thanks to +# the above status stuff, and get allow the loncnew to re-try again +# on the host (thanks to the log). +# +# Revision 1.21 2003/08/26 09:19:51 foxr +# How embarrassing... put in the SocketTimeout function in loncnew and forgot +# to actually hook it into the LondTransaction. Added this to MakeLondConnection +# where it belongs... hopefully transactions (not just connection attempts) will +# timeout more speedily than the socket errors will catch it. +# +# Revision 1.20 2003/08/25 18:48:11 albertel +# - fixing a forgotten ; +# +# Revision 1.19 2003/08/19 09:31:46 foxr +# Get socket directory from configuration rather than the old hard coded test +# way that I forgot to un-hard code. +# # Revision 1.18 2003/08/06 09:52:29 foxr # Also needed to remember to fail in-flight transactions if their sends fail. # @@ -77,7 +103,7 @@ # Revision 1.10 2003/06/24 02:46:04 foxr # Put a limit on the number of times we'll retry a connection. # Start getting the signal stuff put in as well...note that need to get signals -# going or else 6the client will permanently give up on dead servers. +# going or else the client will permanently give up on dead servers. # # Revision 1.9 2003/06/13 02:38:43 foxr # Add logging in 'expected format' @@ -91,7 +117,7 @@ # complete coding to support deferred transactions. # # - +use strict; use lib "/home/httpd/lib/perl/"; use lib "/home/foxr/newloncapa/types"; use Event qw(:DEFAULT ); @@ -146,7 +172,7 @@ my $IdleTimeout= 3600; # Wait an hour b # The variables below are only used by the child processes. # my $RemoteHost; # Name of host child is talking to. -my $UnixSocketDir= "/home/httpd/sockets"; +my $UnixSocketDir= $perlvar{'lonSockDir'}; my $IdleConnections = Stack->new(); # Set of idle connections my %ActiveConnections; # Connections to the remote lond. my %ActiveTransactions; # LondTransactions in flight. @@ -255,7 +281,7 @@ sub GetPeername { my $peerip; if($AdrFamily == AF_INET) { ($peerport, $peerip) = sockaddr_in($peer); - my $peername = gethostbyaddr($iaddr, $AdrFamily); + my $peername = gethostbyaddr($peerip, $AdrFamily); return $peername; } elsif ($AdrFamily == AF_UNIX) { my $peerfile; @@ -276,7 +302,7 @@ sub Debug { my $level = shift; my $message = shift; if ($level <= $DebugLevel) { - Log("INFO", "-Debug- $message host = $RemotHost"); + Log("INFO", "-Debug- $message host = $RemoteHost"); } } @@ -316,7 +342,9 @@ sub ShowStatus { sub SocketTimeout { my $Socket = shift; - KillSocket($Socket); + KillSocket($Socket); # A transaction timeout also counts as + # a connection failure: + $ConnectionRetriesLeft--; } =pod @@ -330,8 +358,12 @@ Invoked each timer tick. sub Tick { my $client; - ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount); - + if($ConnectionRetriesLeft > 0) { + ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount + ." Retries remaining: ".$ConnectionRetriesLeft); + } else { + ShowStatus(GetServerHost()." >> DEAD <<"); + } # Is it time to prune connection count: @@ -339,7 +371,7 @@ sub Tick { ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do? $IdleSeconds++; if($IdleSeconds > $IdleTimeout) { # Prune a connection... - $Socket = $IdleConnections->pop(); + my $Socket = $IdleConnections->pop(); KillSocket($Socket); } } else { @@ -348,7 +380,7 @@ sub Tick { # # For each inflight transaction, tick down its timeout counter. # - foreach $item (keys %ActiveTransactions) { + foreach my $item (keys %ActiveTransactions) { my $Socket = $ActiveTransactions{$item}->getServer(); $Socket->Tick(); } @@ -362,10 +394,16 @@ sub Tick { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; Debug(1,"Work but no connections, start ".$Connections." of them"); - for ($i =0; $i < $Connections; $i++) { - MakeLondConnection(); + my $successCount = 0; + for (my $i =0; $i < $Connections; $i++) { + $successCount += MakeLondConnection(); + } + if($successCount == 0) { # All connections failed: + Debug(1,"Work in queue failed to make any connectiouns\n"); + EmptyQueue(); # Fail pending transactions with con_lost. } } else { + ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); Debug(1,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. } @@ -417,7 +455,7 @@ sub ServerToIdle { # If there's work to do, start the transaction: - $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction + my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction unless($reqdata eq undef) { Debug(9, "Queue gave request data: ".$reqdata->getRequest()); &StartRequest($Socket, $reqdata); @@ -551,7 +589,7 @@ sub CompleteTransaction { StartClientReply($Transaction, $data); } else { # Delete deferred transaction file. Log("SUCCESS", "A delayed transaction was completed"); - LogPerm("S:$Client:".$Transaction->getRequest()); + LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest()); unlink $Transaction->getFile(); } } @@ -619,6 +657,9 @@ sub FailTransaction { Debug(1," Replying con_lost to ".$transaction->getRequest()); StartClientReply($transaction, "con_lost\n"); } + if($ConnectionRetriesLeft <= 0) { + Log("CRITICAL", "Host marked dead: ".GetServerHost()); + } } @@ -630,6 +671,7 @@ sub FailTransaction { =cut sub EmptyQueue { + $ConnectionRetriesLeft--; # Counts as connection failure too. while($WorkQueue->Count()) { my $request = $WorkQueue->dequeue(); # This is a transaction FailTransaction($request); @@ -644,7 +686,7 @@ Close all connections open on lond prior =cut sub CloseAllLondConnections { - foreach $Socket (keys %ActiveConnections) { + foreach my $Socket (keys %ActiveConnections) { KillSocket($Socket); } } @@ -696,7 +738,7 @@ sub KillSocket { # work queue, the work all gets failed with con_lost. # if($ConnectionCount == 0) { - EmptyQueue; + EmptyQueue(); } } @@ -765,7 +807,7 @@ sub LondReadable { my $Socket = $Watcher->data; my $client = undef; - &Debug(6,"LondReadable called state = ".$State); + &Debug(6,"LondReadable called state = ".$Socket->GetState()); my $State = $Socket->GetState(); # All action depends on the state. @@ -786,6 +828,7 @@ sub LondReadable { } $Watcher->cancel(); KillSocket($Socket); + $ConnectionRetriesLeft--; # Counts as connection failure return; } SocketDump(6,$Socket); @@ -819,6 +862,10 @@ sub LondReadable { } elsif ($State eq "Idle") { # If necessary, complete a transaction and then go into the # idle queue. + # Note that a trasition to idle indicates a live lond + # on the other end so reset the connection retries. + # + $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count $Watcher->cancel(); if(exists($ActiveTransactions{$Socket})) { Debug(8,"Completing transaction!!"); @@ -1031,8 +1078,7 @@ sub QueueDelayed { Debug(4, "Delayed path: ".$path); opendir(DIRHANDLE, $path); - @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE; - Debug(4, "Got ".$alldelayed." delayed files"); + my @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE; closedir(DIRHANDLE); my $dfname; my $reqfile; @@ -1074,7 +1120,7 @@ sub MakeLondConnection { $ConnectionRetriesLeft--; return 0; # Failure. } else { - $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count + # The connection needs to have writability # monitored in order to send the init sequence # that starts the whole authentication/key @@ -1087,8 +1133,9 @@ sub MakeLondConnection { &Debug(9,"MakeLondConnection got socket: ".$Socket); } - - $event = Event->io(fd => $Socket, + $Connection->SetTimeoutCallback(\&SocketTimeout); + + my $event = Event->io(fd => $Socket, poll => 'w', cb => \&LondWritable, data => $Connection, @@ -1144,7 +1191,7 @@ sub StartRequest { $ActiveTransactions{$Lond} = $Request; $Lond->InitiateTransaction($Request->getRequest()); - $event = Event->io(fd => $Socket, + my $event = Event->io(fd => $Socket, poll => "w", cb => \&LondWritable, data => $Lond, @@ -1186,9 +1233,14 @@ sub QueueTransaction { Debug(8,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { - Debug(4,"Starting additional lond connection"); - if(MakeLondConnection() == 0) { - EmptyQueue(); # Fail transactions, can't make connection. + if($ConnectionRetriesLeft > 0) { + Debug(4,"Starting additional lond connection"); + if(MakeLondConnection() == 0) { + EmptyQueue(); # Fail transactions, can't make connection. + } + } else { + ShowStatus(GetServerHost()." >>> DEAD !!!! <<<"); + EmptyQueue(); # It's worse than that ... he's dead Jim. } } } else { # Can start the request: @@ -1354,6 +1406,8 @@ sub SetupLoncListener { Child USR1 signal handler to report the most recent status into the status file. +We also use this to reset the retries count in order to allow the +client to retry connections with a previously dead server. =cut sub ChildStatus { my $event = shift; @@ -1364,6 +1418,7 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + $ConnectionRetriesLeft = $ConnectionRetries; } =pod @@ -1459,7 +1514,7 @@ sub CreateChild { my $host = shift; $RemoteHost = $host; Log("CRITICAL", "Forking server for ".$host); - $pid = fork; + my $pid = fork; if($pid) { # Parent $RemoteHost = "Parent"; $ChildHash{$pid} = $RemoteHost; @@ -1467,7 +1522,7 @@ sub CreateChild { } else { # child. ShowStatus("Connected to ".$RemoteHost); - $SIG{INT} = DEFAULT; + $SIG{INT} = 'DEFAULT'; sigprocmask(SIG_UNBLOCK, $sigset); ChildProcess; # Does not return. } @@ -1504,7 +1559,7 @@ if ($childpid != 0) { # ShowStatus("Parent writing pid file:"); -$execdir = $perlvar{'lonDaemons'}; +my $execdir = $perlvar{'lonDaemons'}; open (PIDSAVE, ">$execdir/logs/lonc.pid"); print PIDSAVE "$$\n"; close(PIDSAVE); @@ -1523,7 +1578,7 @@ Log("CRITICAL", "--------------- Startin my $HostIterator = LondConnection::GetHostIterator; while (! $HostIterator->end()) { - $hostentryref = $HostIterator->get(); + my $hostentryref = $HostIterator->get(); CreateChild($hostentryref->[0]); $HostIterator->next(); } @@ -1543,9 +1598,9 @@ $SIG{HUP} = \&Restart; $SIG{USR1} = \&CheckKids; while(1) { - $deadchild = wait(); + my $deadchild = wait(); if(exists $ChildHash{$deadchild}) { # need to restart. - $deadhost = $ChildHash{$deadchild}; + my $deadhost = $ChildHash{$deadchild}; delete($ChildHash{$deadchild}); Log("WARNING","Lost child pid= ".$deadchild. "Connected to host ".$deadhost); @@ -1575,7 +1630,7 @@ sub CheckKids { my $now=time; my $local=localtime($now); print $fh "LONC status $local - parent $$ \n\n"; - foreach $pid (keys %ChildHash) { + foreach my $pid (keys %ChildHash) { Debug(2, "Sending USR1 -> $pid"); kill 'USR1' => $pid; # Tell Child to report status. sleep 1; # Wait so file doesn't intermix. @@ -1593,7 +1648,7 @@ the config file. =cut sub Restart { - KillThemAll; # First kill all the children. + &KillThemAll; # First kill all the children. Log("CRITICAL", "Restarting"); my $execdir = $perlvar{'lonDaemons'}; unlink("$execdir/logs/lonc.pid"); @@ -1612,7 +1667,7 @@ SIGHUP. Responds to sigint and sigterm. sub KillThemAll { Debug(2, "Kill them all!!"); local($SIG{CHLD}) = 'IGNORE'; # Our children >will< die. - foreach $pid (keys %ChildHash) { + foreach my $pid (keys %ChildHash) { my $serving = $ChildHash{$pid}; Debug(2, "Killing lonc for $serving pid = $pid"); ShowStatus("Killing lonc for $serving pid = $pid");