--- loncom/loncnew 2003/08/26 09:19:51 1.21 +++ loncom/loncnew 2004/02/17 09:43:21 1.42 @@ -2,7 +2,7 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.21 2003/08/26 09:19:51 foxr Exp $ +# $Id: loncnew,v 1.42 2004/02/17 09:43:21 foxr Exp $ # # Copyright Michigan State University Board of Trustees # @@ -35,78 +35,16 @@ # - Add ability to create/negotiate lond connections (done). # - Add general logic for dispatching requests and timeouts. (done). # - Add support for the lonc/lond requests. (done). -# - Add logging/status monitoring. -# - Add Signal handling - HUP restarts. USR1 status report. +# - Add logging/status monitoring. (done) +# - Add Signal handling - HUP restarts. USR1 status report. (done) # - Add Configuration file I/O (done). -# - Add management/status request interface. +# - Add management/status request interface. (done) # - Add deferred request capability. (done) -# - Detect transmission timeouts. -# - -# Change log: -# $Log: loncnew,v $ -# Revision 1.21 2003/08/26 09:19:51 foxr -# How embarrassing... put in the SocketTimeout function in loncnew and forgot -# to actually hook it into the LondTransaction. Added this to MakeLondConnection -# where it belongs... hopefully transactions (not just connection attempts) will -# timeout more speedily than the socket errors will catch it. -# -# Revision 1.20 2003/08/25 18:48:11 albertel -# - fixing a forgotten ; -# -# Revision 1.19 2003/08/19 09:31:46 foxr -# Get socket directory from configuration rather than the old hard coded test -# way that I forgot to un-hard code. -# -# Revision 1.18 2003/08/06 09:52:29 foxr -# Also needed to remember to fail in-flight transactions if their sends fail. -# -# Revision 1.17 2003/08/03 00:44:31 foxr -# 1. Correct handling of connection failure: Assume it means the host is -# unreachable and fail all of the queued transactions. Note that the -# inflight transactions should fail on their own time due either to timeout -# or send/receive failures. -# 2. Correct handling of logs for forced death signals. Pull the signal -# from the event watcher. -# -# Revision 1.16 2003/07/29 02:33:05 foxr -# Add SIGINT processing to child processes to toggle annoying trace mode -# on/off.. will try to use this to isolate the compute boud process issue. -# -# Revision 1.15 2003/07/15 02:07:05 foxr -# Added code for lonc/lond transaction timeouts. Who knows if it works right. -# The intent is for a timeout to fail any transaction in progress and kill -# off the sockt that timed out. -# -# Revision 1.14 2003/07/03 02:10:18 foxr -# Get all of the signals to work correctly. -# -# Revision 1.13 2003/07/02 01:31:55 foxr -# Added kill -HUP logic (restart). -# -# Revision 1.11 2003/06/25 01:54:44 foxr -# Fix more problems with transaction failure. -# -# Revision 1.10 2003/06/24 02:46:04 foxr -# Put a limit on the number of times we'll retry a connection. -# Start getting the signal stuff put in as well...note that need to get signals -# going or else 6the client will permanently give up on dead servers. -# -# Revision 1.9 2003/06/13 02:38:43 foxr -# Add logging in 'expected format' -# -# Revision 1.8 2003/06/11 02:04:35 foxr -# Support delayed transactions... this is done uniformly by encapsulating -# transactions in an object ... a LondTransaction that is implemented by -# LondTransaction.pm -# -# Revision 1.7 2003/06/03 01:59:39 foxr -# complete coding to support deferred transactions. -# +# - Detect transmission timeouts. (done) # +use strict; use lib "/home/httpd/lib/perl/"; -use lib "/home/foxr/newloncapa/types"; use Event qw(:DEFAULT ); use POSIX qw(:signal_h); use POSIX; @@ -125,17 +63,6 @@ use LONCAPA::Configuration; use LONCAPA::HashIterator; -# -# Disable all signals we might receive from outside for now. -# -#$SIG{QUIT} = IGNORE; -#$SIG{HUP} = IGNORE; -#$SIG{USR1} = IGNORE; -#$SIG{INT} = IGNORE; -#$SIG{CHLD} = IGNORE; -#$SIG{__DIE__} = IGNORE; - - # Read the httpd configuration file to get perl variables # normally set in apache modules: @@ -146,15 +73,19 @@ my %perlvar = %{$perlvarref}; # parent and shared variables. my %ChildHash; # by pid -> host. +my %HostToPid; # By host -> pid. +my %HostHash; # by loncapaname -> IP. my $MaxConnectionCount = 10; # Will get from config later. my $ClientConnection = 0; # Uniquifier for client events. my $DebugLevel = 0; -my $NextDebugLevel= 10; # So Sigint can toggle this. +my $NextDebugLevel= 2; # So Sigint can toggle this. my $IdleTimeout= 3600; # Wait an hour before pruning connections. +my $LogTransactions = 0; # When True, all transactions/replies get logged. + # # The variables below are only used by the child processes. # @@ -169,8 +100,9 @@ my $ConnectionCount = 0; my $IdleSeconds = 0; # Number of seconds idle. my $Status = ""; # Current status string. my $RecentLogEntry = ""; -my $ConnectionRetries=5; # Number of connection retries allowed. -my $ConnectionRetriesLeft=5; # Number of connection retries remaining. +my $ConnectionRetries=2; # Number of connection retries allowed. +my $ConnectionRetriesLeft=2; # Number of connection retries remaining. +my $LondVersion = "unknown"; # Version of lond we talk with. # # The hash below gives the HTML format for log messages @@ -268,7 +200,7 @@ sub GetPeername { my $peerip; if($AdrFamily == AF_INET) { ($peerport, $peerip) = sockaddr_in($peer); - my $peername = gethostbyaddr($iaddr, $AdrFamily); + my $peername = gethostbyaddr($peerip, $AdrFamily); return $peername; } elsif ($AdrFamily == AF_UNIX) { my $peerfile; @@ -276,7 +208,6 @@ sub GetPeername { return $peerfile; } } -#----------------------------- Timer management ------------------------ =pod =head2 Debug @@ -289,7 +220,7 @@ sub Debug { my $level = shift; my $message = shift; if ($level <= $DebugLevel) { - Log("INFO", "-Debug- $message host = $RemotHost"); + Log("INFO", "-Debug- $message host = $RemoteHost"); } } @@ -328,9 +259,21 @@ sub ShowStatus { =cut sub SocketTimeout { my $Socket = shift; - - KillSocket($Socket); + Log("WARNING", "A socket timeout was detected"); + Debug(0, " SocketTimeout called: "); + $Socket->Dump(); + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } + KillSocket($Socket); # A transaction timeout also counts as + # a connection failure: + $ConnectionRetriesLeft--; + if($ConnectionRetriesLeft <= 0) { + Log("CRITICAL", "Host marked dead: ".GetServerHost()); + } + } +#----------------------------- Timer management ------------------------ =pod @@ -343,8 +286,12 @@ Invoked each timer tick. sub Tick { my $client; - ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount); - + if($ConnectionRetriesLeft > 0) { + ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount + ." Retries remaining: ".$ConnectionRetriesLeft); + } else { + ShowStatus(GetServerHost()." >> DEAD <<"); + } # Is it time to prune connection count: @@ -352,7 +299,7 @@ sub Tick { ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do? $IdleSeconds++; if($IdleSeconds > $IdleTimeout) { # Prune a connection... - $Socket = $IdleConnections->pop(); + my $Socket = $IdleConnections->pop(); KillSocket($Socket); } } else { @@ -361,9 +308,13 @@ sub Tick { # # For each inflight transaction, tick down its timeout counter. # - foreach $item (keys %ActiveTransactions) { - my $Socket = $ActiveTransactions{$item}->getServer(); - $Socket->Tick(); + + foreach my $item (keys %ActiveConnections) { + my $State = $ActiveConnections{$item}->data->GetState(); + if ($State ne 'Idle') { + Debug(5,"Ticking Socket $State $item"); + $ActiveConnections{$item}->data->Tick(); + } } # Do we have work in the queue, but no connections to service them? # If so, try to make some new connections to get things going again. @@ -374,13 +325,21 @@ sub Tick { if ($ConnectionRetriesLeft > 0) { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; - Debug(1,"Work but no connections, start ".$Connections." of them"); - for ($i =0; $i < $Connections; $i++) { - MakeLondConnection(); + Debug(5,"Work but no connections, start ".$Connections." of them"); + my $successCount = 0; + for (my $i =0; $i < $Connections; $i++) { + $successCount += MakeLondConnection(); + } + if($successCount == 0) { # All connections failed: + Debug(5,"Work in queue failed to make any connectiouns\n"); + EmptyQueue(); # Fail pending transactions with con_lost. + CloseAllLondConnections(); # Should all be closed but.... } } else { - Debug(1,"Work in queue, but gave up on connections..flushing\n"); + ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); + Debug(5,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. + CloseAllLondConnections(); # Should all already be closed but... } } @@ -404,7 +363,7 @@ Trigger disconnections of idle sockets. sub SetupTimer { Debug(6, "SetupTimer"); - Event->timer(interval => 1, debug => 1, cb => \&Tick ); + Event->timer(interval => 1, cb => \&Tick ); } =pod @@ -426,19 +385,19 @@ sub ServerToIdle { my $Socket = shift; # Get the socket. delete($ActiveTransactions{$Socket}); # Server has no transaction - &Debug(6, "Server to idle"); + &Debug(5, "Server to idle"); # If there's work to do, start the transaction: - $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction - unless($reqdata eq undef) { - Debug(9, "Queue gave request data: ".$reqdata->getRequest()); + my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction + if ($reqdata ne undef) { + Debug(5, "Queue gave request data: ".$reqdata->getRequest()); &StartRequest($Socket, $reqdata); } else { # There's no work waiting, so push the server to idle list. - &Debug(8, "No new work requests, server connection going idle"); + &Debug(5, "No new work requests, server connection going idle"); $IdleConnections->push($Socket); } } @@ -484,7 +443,7 @@ sub ClientWritable { # request. &Debug(9,"Send result is ".$result." Defined: ".defined($result)); - if(defined($result)) { + if($result ne undef) { &Debug(9, "send result was defined"); if($result == length($Data)) { # Entire string sent. &Debug(9, "ClientWritable data all written"); @@ -555,20 +514,25 @@ The transaction that is being completed. =cut sub CompleteTransaction { - &Debug(6,"Complete transaction"); + &Debug(5,"Complete transaction"); my $Socket = shift; my $Transaction = shift; if (!$Transaction->isDeferred()) { # Normal transaction my $data = $Socket->GetReply(); # Data to send. + if($LogTransactions) { + Log("SUCCESS", "Reply from lond: '$data'"); + } StartClientReply($Transaction, $data); } else { # Delete deferred transaction file. Log("SUCCESS", "A delayed transaction was completed"); - LogPerm("S:$Client:".$Transaction->getRequest()); + LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest()); unlink $Transaction->getFile(); } } + =pod + =head1 StartClientReply Initiates a reply to a client where the reply data is a parameter. @@ -584,6 +548,7 @@ sub CompleteTransaction { The data to send to apached client. =cut + sub StartClientReply { my $Transaction = shift; my $data = shift; @@ -594,7 +559,6 @@ sub StartClientReply { &Debug(8," Reply was: ".$data); my $Serial = $ActiveClients{$Client}; my $desc = sprintf("Connection to lonc client %d", - $Serial); Event->io(fd => $Client, poll => "w", @@ -602,7 +566,9 @@ sub StartClientReply { cb => \&ClientWritable, data => $data); } + =pod + =head2 FailTransaction Finishes a transaction with failure because the associated lond socket @@ -612,8 +578,7 @@ sub StartClientReply { - The transaction is 'live' in which case we initiate the sending of "con_lost" to the client. -Deleting the transaction means killing it from the -%ActiveTransactions hash. +Deleting the transaction means killing it from the %ActiveTransactions hash. Parameters: @@ -621,6 +586,7 @@ Parameters: The LondTransaction we are failing. + =cut sub FailTransaction { @@ -643,6 +609,7 @@ sub FailTransaction { =cut sub EmptyQueue { + $ConnectionRetriesLeft--; # Counts as connection failure too. while($WorkQueue->Count()) { my $request = $WorkQueue->dequeue(); # This is a transaction FailTransaction($request); @@ -657,8 +624,11 @@ Close all connections open on lond prior =cut sub CloseAllLondConnections { - foreach $Socket (keys %ActiveConnections) { - KillSocket($Socket); + foreach my $Socket (keys %ActiveConnections) { + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } + KillSocket($Socket); } } =cut @@ -702,14 +672,15 @@ sub KillSocket { } if(exists($ActiveConnections{$Socket})) { delete($ActiveConnections{$Socket}); + $ConnectionCount--; + if ($ConnectionCount < 0) { $ConnectionCount = 0; } } - $ConnectionCount--; - # If the connection count has gone to zero and there is work in the # work queue, the work all gets failed with con_lost. # if($ConnectionCount == 0) { - EmptyQueue; + EmptyQueue(); + CloseAllLondConnections; # Should all already be closed but... } } @@ -735,6 +706,17 @@ The connection must echo the challenge b The challenge has been replied to. The we are receiveing the 'ok' from the partner. +=head3 State=ReadingVersionString + +We have requested the lond version and are reading the +version back. Upon completion, we'll store the version away +for future use(?). + +=head3 State=HostSet + +We have selected the domain name of our peer (multhomed hosts) +and are getting the reply (presumably ok) back. + =head3 State=RequestingKey The ok has been received and we need to send the request for @@ -778,7 +760,7 @@ sub LondReadable { my $Socket = $Watcher->data; my $client = undef; - &Debug(6,"LondReadable called state = ".$State); + &Debug(6,"LondReadable called state = ".$Socket->GetState()); my $State = $Socket->GetState(); # All action depends on the state. @@ -789,7 +771,7 @@ sub LondReadable { &Debug(2, "Socket->Readable returned: $status"); if($status != 0) { - # bad return from socket read. Currently this means that + # bad return from socket read. Currently this means that # The socket has become disconnected. We fail the transaction. Log("WARNING", @@ -799,6 +781,7 @@ sub LondReadable { } $Watcher->cancel(); KillSocket($Socket); + $ConnectionRetriesLeft--; # Counts as connection failure return; } SocketDump(6,$Socket); @@ -806,19 +789,34 @@ sub LondReadable { $State = $Socket->GetState(); # Update in case of transition. &Debug(6, "After read, state is ".$State); - if($State eq "Initialized") { + if($State eq "Initialized") { } elsif ($State eq "ChallengeReceived") { # The challenge must be echoed back; The state machine # in the connection takes care of setting that up. Just # need to transition to writable: - + $Watcher->cb(\&LondWritable); $Watcher->poll("w"); } elsif ($State eq "ChallengeReplied") { + } elsif ($State eq "RequestingVersion") { + # Need to ask for the version... that is writiability: + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + + } elsif ($State eq "ReadingVersionString") { + # Read the rest of the version string... + } elsif ($State eq "SetHost") { + # Need to request the actual domain get set... + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + } elsif ($State eq "HostSet") { + # Reading the 'ok' from the peer. } elsif ($State eq "RequestingKey") { # The ok was received. Now we need to request the key @@ -830,11 +828,23 @@ sub LondReadable { } elsif ($State eq "ReceivingKey") { } elsif ($State eq "Idle") { + + # This is as good a spot as any to get the peer version + # string: + + if($LondVersion eq "unknown") { + $LondVersion = $Socket->PeerVersion(); + Log("INFO", "Connected to lond version: $LondVersion"); + } # If necessary, complete a transaction and then go into the # idle queue. + # Note that a trasition to idle indicates a live lond + # on the other end so reset the connection retries. + # + $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count $Watcher->cancel(); if(exists($ActiveTransactions{$Socket})) { - Debug(8,"Completing transaction!!"); + Debug(5,"Completing transaction!!"); CompleteTransaction($Socket, $ActiveTransactions{$Socket}); } else { @@ -855,7 +865,7 @@ sub LondReadable { } else { - # Invalid state. + # Invalid state. Debug(4, "Invalid state in LondReadable"); } } @@ -937,21 +947,36 @@ sub LondWritable { SocketDump(6,$Socket); + # If the socket is writable, we must always write. + # Only by writing will we undergo state transitions. + # Old logic wrote in state specific code below, however + # That forces us at least through another invocation of + # this function after writability is possible again. + # This logic also factors out common code for handling + # write failures... in all cases, write failures + # Kill the socket. + # This logic makes the branches of the >big< if below + # so that the writing states are actually NO-OPs. + + if ($Socket->Writable() != 0) { + # The write resulted in an error. + # We'll treat this as if the socket got disconnected: + Log("WARNING", "Connection to ".$RemoteHost. + " has been disconnected"); + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } + $Watcher->cancel(); + KillSocket($Socket); + return; + } + + + if ($State eq "Connected") { - if ($Socket->Writable() != 0) { - # The write resulted in an error. - # We'll treat this as if the socket got disconnected: - Log("WARNING", "Connection to ".$RemoteHost. - " has been disconnected"); - FailTransaction($ActiveTransactions{$Socket}); - $Watcher->cancel(); - KillSocket($Socket); - return; - } # "init" is being sent... - - + } elsif ($State eq "Initialized") { # Now that init was sent, we switch @@ -959,40 +984,43 @@ sub LondWritable { $Watcher->cb(\&LondReadable); $Watcher->poll("r"); - + } elsif ($State eq "ChallengeReceived") { # We received the challenge, now we # are echoing it back. This is a no-op, # we're waiting for the state to change - if($Socket->Writable() != 0) { - - $Watcher->cancel(); - KillSocket($Socket); - return; - } - } elsif ($State eq "ChallengeReplied") { # The echo was sent back, so we switch # to watching readability. $Watcher->cb(\&LondReadable); $Watcher->poll("r"); + } elsif ($State eq "RequestingVersion") { + # Sending the peer a version request... + + } elsif ($State eq "ReadingVersionString") { + # Transition to read since we have sent the + # version command and now just need to read the + # version string from the peer: + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + + } elsif ($State eq "SetHost") { + # Setting the remote domain... + + } elsif ($State eq "HostSet") { + # Back to readable to get the ok. + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + } elsif ($State eq "RequestingKey") { # At this time we're requesting the key. # again, this is essentially a no-op. - # we'll write the next chunk until the - # state changes. - - if($Socket->Writable() != 0) { - # Write resulted in an error. - $Watcher->cancel(); - KillSocket($Socket); - return; - - } } elsif ($State eq "ReceivingKey") { # Now we need to wait for the key # to come back from the peer: @@ -1001,20 +1029,10 @@ sub LondWritable { $Watcher->poll("r"); } elsif ($State eq "SendingRequest") { + # At this time we are sending a request to the # peer... write the next chunk: - if($Socket->Writable() != 0) { - - if(exists($ActiveTransactions{$Socket})) { - Debug(3, "Lond connection lost, failing transactions"); - FailTransaction($ActiveTransactions{$Socket}); - } - $Watcher->cancel(); - KillSocket($Socket); - return; - - } } elsif ($State eq "ReceivingReply") { # The send has completed. Wait for the @@ -1044,8 +1062,7 @@ sub QueueDelayed { Debug(4, "Delayed path: ".$path); opendir(DIRHANDLE, $path); - @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE; - Debug(4, "Got ".$alldelayed." delayed files"); + my @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE; closedir(DIRHANDLE); my $dfname; my $reqfile; @@ -1082,19 +1099,19 @@ sub MakeLondConnection { my $Connection = LondConnection->new(&GetServerHost(), &GetServerPort()); - if($Connection == undef) { # Needs to be more robust later. + if($Connection eq undef) { # Needs to be more robust later. Log("CRITICAL","Failed to make a connection with lond."); $ConnectionRetriesLeft--; return 0; # Failure. } else { - $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count + # The connection needs to have writability # monitored in order to send the init sequence # that starts the whole authentication/key # exchange underway. # my $Socket = $Connection->GetSocket(); - if($Socket == undef) { + if($Socket eq undef) { die "did not get a socket from the connection"; } else { &Debug(9,"MakeLondConnection got socket: ".$Socket); @@ -1102,7 +1119,7 @@ sub MakeLondConnection { $Connection->SetTimeoutCallback(\&SocketTimeout); - $event = Event->io(fd => $Socket, + my $event = Event->io(fd => $Socket, poll => 'w', cb => \&LondWritable, data => $Connection, @@ -1158,7 +1175,7 @@ sub StartRequest { $ActiveTransactions{$Lond} = $Request; $Lond->InitiateTransaction($Request->getRequest()); - $event = Event->io(fd => $Socket, + my $event = Event->io(fd => $Socket, poll => "w", cb => \&LondWritable, data => $Lond, @@ -1197,12 +1214,19 @@ sub QueueTransaction { my $LondSocket = $IdleConnections->pop(); if(!defined $LondSocket) { # Need to queue request. - Debug(8,"Must queue..."); + Debug(5,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { - Debug(4,"Starting additional lond connection"); - if(MakeLondConnection() == 0) { - EmptyQueue(); # Fail transactions, can't make connection. + if($ConnectionRetriesLeft > 0) { + Debug(5,"Starting additional lond connection"); + if(MakeLondConnection() == 0) { + EmptyQueue(); # Fail transactions, can't make connection. + CloseAllLondConnections; # Should all be closed but... + } + } else { + ShowStatus(GetServerHost()." >>> DEAD !!!! <<<"); + EmptyQueue(); # It's worse than that ... he's dead Jim. + CloseAllLondConnections; # Should all be closed but.. } } } else { # Can start the request: @@ -1234,7 +1258,7 @@ sub ClientRequest { my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0); Debug(8, "rcv: data length = ".length($thisread) ." read =".$thisread); - unless (defined $rv && length($thisread)) { + unless (defined $rv && length($thisread)) { # Likely eof on socket. Debug(5,"Client Socket closed on lonc for ".$RemoteHost); close($socket); @@ -1253,6 +1277,9 @@ sub ClientRequest { exit; } Debug(8, "Complete transaction received: ".$data); + if($LogTransactions) { + Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n. + } my $Transaction = LondTransaction->new($data); $Transaction->SetClient($socket); QueueTransaction($Transaction); @@ -1361,6 +1388,24 @@ sub SetupLoncListener { fd => $socket); } +# +# Toggle transaction logging. +# Implicit inputs: +# LogTransactions +# Implicit Outputs: +# LogTransactions +sub ToggleTransactionLogging { + print STDERR "Toggle transaction logging...\n"; + if(!$LogTransactions) { + $LogTransactions = 1; + } else { + $LogTransactions = 0; + } + + + Log("SUCCESS", "Toggled transaction logging: $LogTransactions \n"); +} + =pod =head2 ChildStatus @@ -1368,6 +1413,8 @@ sub SetupLoncListener { Child USR1 signal handler to report the most recent status into the status file. +We also use this to reset the retries count in order to allow the +client to retry connections with a previously dead server. =cut sub ChildStatus { my $event = shift; @@ -1378,6 +1425,21 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + # + # Write out information about each of the connections: + # + print $fh "Active connection statuses: \n"; + my $i = 1; + print STDERR "================================= Socket Status Dump:\n"; + foreach my $item (keys %ActiveConnections) { + my $Socket = $ActiveConnections{$item}->data; + my $state = $Socket->GetState(); + print $fh "Connection $i State: $state\n"; + print STDERR "---------------------- Connection $i \n"; + $Socket->Dump(); + $i++; + } + $ConnectionRetriesLeft = $ConnectionRetries; } =pod @@ -1441,6 +1503,8 @@ sub ChildProcess { Event->signal(signal => "USR1", cb => \&ChildStatus, data => "USR1"); + Event->signal(signal => "USR2", + cb => \&ToggleTransactionLogging); Event->signal(signal => "INT", cb => \&ToggleDebug, data => "INT"); @@ -1473,15 +1537,16 @@ sub CreateChild { my $host = shift; $RemoteHost = $host; Log("CRITICAL", "Forking server for ".$host); - $pid = fork; + my $pid = fork; if($pid) { # Parent $RemoteHost = "Parent"; - $ChildHash{$pid} = $RemoteHost; + $ChildHash{$pid} = $host; + $HostToPid{$host}= $pid; sigprocmask(SIG_UNBLOCK, $sigset); } else { # child. ShowStatus("Connected to ".$RemoteHost); - $SIG{INT} = DEFAULT; + $SIG{INT} = 'DEFAULT'; sigprocmask(SIG_UNBLOCK, $sigset); ChildProcess; # Does not return. } @@ -1518,7 +1583,7 @@ if ($childpid != 0) { # ShowStatus("Parent writing pid file:"); -$execdir = $perlvar{'lonDaemons'}; +my $execdir = $perlvar{'lonDaemons'}; open (PIDSAVE, ">$execdir/logs/lonc.pid"); print PIDSAVE "$$\n"; close(PIDSAVE); @@ -1534,11 +1599,13 @@ ShowStatus("Forking node servers"); Log("CRITICAL", "--------------- Starting children ---------------"); +LondConnection::ReadConfig; # Read standard config files. my $HostIterator = LondConnection::GetHostIterator; while (! $HostIterator->end()) { - $hostentryref = $HostIterator->get(); + my $hostentryref = $HostIterator->get(); CreateChild($hostentryref->[0]); + $HostHash{$hostentryref->[0]} = $hostentryref->[4]; $HostIterator->next(); } $RemoteHost = "Parent Server"; @@ -1555,11 +1622,13 @@ $SIG{INT} = \&Terminate; $SIG{TERM} = \&Terminate; $SIG{HUP} = \&Restart; $SIG{USR1} = \&CheckKids; +$SIG{USR2} = \&UpdateKids; # LonManage update request. while(1) { - $deadchild = wait(); + my $deadchild = wait(); if(exists $ChildHash{$deadchild}) { # need to restart. - $deadhost = $ChildHash{$deadchild}; + my $deadhost = $ChildHash{$deadchild}; + delete($HostToPid{$deadhost}); delete($ChildHash{$deadchild}); Log("WARNING","Lost child pid= ".$deadchild. "Connected to host ".$deadhost); @@ -1589,7 +1658,7 @@ sub CheckKids { my $now=time; my $local=localtime($now); print $fh "LONC status $local - parent $$ \n\n"; - foreach $pid (keys %ChildHash) { + foreach my $pid (keys %ChildHash) { Debug(2, "Sending USR1 -> $pid"); kill 'USR1' => $pid; # Tell Child to report status. sleep 1; # Wait so file doesn't intermix. @@ -1598,6 +1667,114 @@ sub CheckKids { =pod +=head1 UpdateKids + +parent's SIGUSR2 handler. This handler: + +=item + +Rereads the hosts file. + +=item + +Kills off (via sigint) children for hosts that have disappeared. + +=item + +QUITs children for hosts that already exist (this just forces a status display +and resets the connection retry count for that host. + +=item + +Starts new children for hosts that have been added to the hosts.tab file since +the start of the master program and maintains them. + +=cut + +sub UpdateKids { + + Log("INFO", "Updating connections via SIGUSR2"); + + # Just in case we need to kill our own lonc, we wait a few seconds to + # give it a chance to receive and relay lond's response to the + # re-init command. + # + + sleep(2); # Wait a couple of seconds. + + my %hosts; # Indexed by loncapa hostname, value=ip. + + # Need to re-read the host table: + + + LondConnection::ReadConfig(); + my $I = LondConnection::GetHostIterator; + while (! $I->end()) { + my $item = $I->get(); + $hosts{$item->[0]} = $item->[4]; + $I->next(); + } + + # The logic below is written for clarity not for efficiency. + # Since I anticipate that this function is only rarely called, that's + # appropriate. There are certainly ways to combine the loops below, + # and anyone wishing to obscure the logic is welcome to go for it. + # Note that we don't re-direct sigchild. Instead we do what's needed + # to the data structures that keep track of children to ensure that + # when sigchild is honored, no new child is born. + # + + # For each existing child; if it's host doesn't exist, kill the child. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + if (!(exists $hosts{$oldhost})) { + Log("CRITICAL", "Killing child for $oldhost host no longer exists"); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + } + } + # For each remaining existing child; if it's host's ip has changed, + # Restart the child on the new IP. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + my $oldip = $HostHash{$oldhost}; + if ($hosts{$oldhost} ne $oldip) { + + # kill the old child. + + Log("CRITICAL", "Killing child for $oldhost host ip has changed..."); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + + # Do the book-keeping needed to start a new child on the + # new ip. + + $HostHash{$oldhost} = $hosts{$oldhost}; + CreateChild($oldhost); + } + } + # Finally, for each new host, not in the host hash, create a + # enter the host and create a new child. + # Force a status display of any existing process. + + foreach my $host (keys %hosts) { + if(!(exists $HostHash{$host})) { + Log("INFO", "New host $host discovered in hosts.tab..."); + $HostHash{$host} = $hosts{$host}; + CreateChild($host); + } else { + kill 'HUP' => $HostToPid{$host}; # status display. + } + } +} + + +=pod + =head1 Restart Signal handler for HUP... all children are killed and @@ -1607,11 +1784,11 @@ the config file. =cut sub Restart { - KillThemAll; # First kill all the children. + &KillThemAll; # First kill all the children. Log("CRITICAL", "Restarting"); my $execdir = $perlvar{'lonDaemons'}; unlink("$execdir/logs/lonc.pid"); - exec("$execdir/lonc"); + exec("$execdir/loncnew"); } =pod @@ -1626,7 +1803,7 @@ SIGHUP. Responds to sigint and sigterm. sub KillThemAll { Debug(2, "Kill them all!!"); local($SIG{CHLD}) = 'IGNORE'; # Our children >will< die. - foreach $pid (keys %ChildHash) { + foreach my $pid (keys %ChildHash) { my $serving = $ChildHash{$pid}; Debug(2, "Killing lonc for $serving pid = $pid"); ShowStatus("Killing lonc for $serving pid = $pid");