--- loncom/loncnew 2003/09/15 09:24:49 1.23 +++ loncom/loncnew 2004/02/09 10:58:03 1.40 @@ -2,7 +2,7 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.23 2003/09/15 09:24:49 foxr Exp $ +# $Id: loncnew,v 1.40 2004/02/09 10:58:03 foxr Exp $ # # Copyright Michigan State University Board of Trustees # @@ -35,91 +35,16 @@ # - Add ability to create/negotiate lond connections (done). # - Add general logic for dispatching requests and timeouts. (done). # - Add support for the lonc/lond requests. (done). -# - Add logging/status monitoring. -# - Add Signal handling - HUP restarts. USR1 status report. +# - Add logging/status monitoring. (done) +# - Add Signal handling - HUP restarts. USR1 status report. (done) # - Add Configuration file I/O (done). -# - Add management/status request interface. +# - Add management/status request interface. (done) # - Add deferred request capability. (done) -# - Detect transmission timeouts. +# - Detect transmission timeouts. (done) # -# Change log: -# $Log: loncnew,v $ -# Revision 1.23 2003/09/15 09:24:49 foxr -# Add use strict and fix all the fallout from that. -# -# Revision 1.22 2003/09/02 10:34:47 foxr -# - Fix errors in host dead detection logic (too many cases where the -# retries left were not getting incremented or just not checked). -# - Added some additional status to the ps axuww display: -# o Remaining retries on a host. -# o >>> DEAD <<< indicator if I've given up on a host. -# - Tested the SIGHUP will reset the retries remaining count (thanks to -# the above status stuff, and get allow the loncnew to re-try again -# on the host (thanks to the log). -# -# Revision 1.21 2003/08/26 09:19:51 foxr -# How embarrassing... put in the SocketTimeout function in loncnew and forgot -# to actually hook it into the LondTransaction. Added this to MakeLondConnection -# where it belongs... hopefully transactions (not just connection attempts) will -# timeout more speedily than the socket errors will catch it. -# -# Revision 1.20 2003/08/25 18:48:11 albertel -# - fixing a forgotten ; -# -# Revision 1.19 2003/08/19 09:31:46 foxr -# Get socket directory from configuration rather than the old hard coded test -# way that I forgot to un-hard code. -# -# Revision 1.18 2003/08/06 09:52:29 foxr -# Also needed to remember to fail in-flight transactions if their sends fail. -# -# Revision 1.17 2003/08/03 00:44:31 foxr -# 1. Correct handling of connection failure: Assume it means the host is -# unreachable and fail all of the queued transactions. Note that the -# inflight transactions should fail on their own time due either to timeout -# or send/receive failures. -# 2. Correct handling of logs for forced death signals. Pull the signal -# from the event watcher. -# -# Revision 1.16 2003/07/29 02:33:05 foxr -# Add SIGINT processing to child processes to toggle annoying trace mode -# on/off.. will try to use this to isolate the compute boud process issue. -# -# Revision 1.15 2003/07/15 02:07:05 foxr -# Added code for lonc/lond transaction timeouts. Who knows if it works right. -# The intent is for a timeout to fail any transaction in progress and kill -# off the sockt that timed out. -# -# Revision 1.14 2003/07/03 02:10:18 foxr -# Get all of the signals to work correctly. -# -# Revision 1.13 2003/07/02 01:31:55 foxr -# Added kill -HUP logic (restart). -# -# Revision 1.11 2003/06/25 01:54:44 foxr -# Fix more problems with transaction failure. -# -# Revision 1.10 2003/06/24 02:46:04 foxr -# Put a limit on the number of times we'll retry a connection. -# Start getting the signal stuff put in as well...note that need to get signals -# going or else the client will permanently give up on dead servers. -# -# Revision 1.9 2003/06/13 02:38:43 foxr -# Add logging in 'expected format' -# -# Revision 1.8 2003/06/11 02:04:35 foxr -# Support delayed transactions... this is done uniformly by encapsulating -# transactions in an object ... a LondTransaction that is implemented by -# LondTransaction.pm -# -# Revision 1.7 2003/06/03 01:59:39 foxr -# complete coding to support deferred transactions. -# -# use strict; use lib "/home/httpd/lib/perl/"; -use lib "/home/foxr/newloncapa/types"; use Event qw(:DEFAULT ); use POSIX qw(:signal_h); use POSIX; @@ -138,17 +63,6 @@ use LONCAPA::Configuration; use LONCAPA::HashIterator; -# -# Disable all signals we might receive from outside for now. -# -#$SIG{QUIT} = IGNORE; -#$SIG{HUP} = IGNORE; -#$SIG{USR1} = IGNORE; -#$SIG{INT} = IGNORE; -#$SIG{CHLD} = IGNORE; -#$SIG{__DIE__} = IGNORE; - - # Read the httpd configuration file to get perl variables # normally set in apache modules: @@ -159,15 +73,19 @@ my %perlvar = %{$perlvarref}; # parent and shared variables. my %ChildHash; # by pid -> host. +my %HostToPid; # By host -> pid. +my %HostHash; # by loncapaname -> IP. my $MaxConnectionCount = 10; # Will get from config later. my $ClientConnection = 0; # Uniquifier for client events. my $DebugLevel = 0; -my $NextDebugLevel= 10; # So Sigint can toggle this. +my $NextDebugLevel= 2; # So Sigint can toggle this. my $IdleTimeout= 3600; # Wait an hour before pruning connections. +my $LogTransactions = 0; # When True, all transactions/replies get logged. + # # The variables below are only used by the child processes. # @@ -182,8 +100,9 @@ my $ConnectionCount = 0; my $IdleSeconds = 0; # Number of seconds idle. my $Status = ""; # Current status string. my $RecentLogEntry = ""; -my $ConnectionRetries=5; # Number of connection retries allowed. -my $ConnectionRetriesLeft=5; # Number of connection retries remaining. +my $ConnectionRetries=2; # Number of connection retries allowed. +my $ConnectionRetriesLeft=2; # Number of connection retries remaining. +my $LondVersion = "unknown"; # Version of lond we talk with. # # The hash below gives the HTML format for log messages @@ -289,7 +208,6 @@ sub GetPeername { return $peerfile; } } -#----------------------------- Timer management ------------------------ =pod =head2 Debug @@ -341,11 +259,14 @@ sub ShowStatus { =cut sub SocketTimeout { my $Socket = shift; - + Log("WARNING", "A socket timeout was detected"); + Debug(0, " SocketTimeout called: "); + $Socket->Dump(); KillSocket($Socket); # A transaction timeout also counts as # a connection failure: $ConnectionRetriesLeft--; } +#----------------------------- Timer management ------------------------ =pod @@ -380,9 +301,13 @@ sub Tick { # # For each inflight transaction, tick down its timeout counter. # - foreach my $item (keys %ActiveTransactions) { - my $Socket = $ActiveTransactions{$item}->getServer(); - $Socket->Tick(); + + foreach my $item (keys %ActiveConnections) { + my $State = $ActiveConnections{$item}->data->GetState(); + if ($State ne 'Idle') { + Debug(5,"Ticking Socket $State $item"); + $ActiveConnections{$item}->data->Tick(); + } } # Do we have work in the queue, but no connections to service them? # If so, try to make some new connections to get things going again. @@ -393,18 +318,18 @@ sub Tick { if ($ConnectionRetriesLeft > 0) { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; - Debug(1,"Work but no connections, start ".$Connections." of them"); + Debug(5,"Work but no connections, start ".$Connections." of them"); my $successCount = 0; for (my $i =0; $i < $Connections; $i++) { $successCount += MakeLondConnection(); } if($successCount == 0) { # All connections failed: - Debug(1,"Work in queue failed to make any connectiouns\n"); + Debug(5,"Work in queue failed to make any connectiouns\n"); EmptyQueue(); # Fail pending transactions with con_lost. } } else { ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); - Debug(1,"Work in queue, but gave up on connections..flushing\n"); + Debug(5,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. } @@ -429,7 +354,7 @@ Trigger disconnections of idle sockets. sub SetupTimer { Debug(6, "SetupTimer"); - Event->timer(interval => 1, debug => 1, cb => \&Tick ); + Event->timer(interval => 1, cb => \&Tick ); } =pod @@ -451,19 +376,19 @@ sub ServerToIdle { my $Socket = shift; # Get the socket. delete($ActiveTransactions{$Socket}); # Server has no transaction - &Debug(6, "Server to idle"); + &Debug(5, "Server to idle"); # If there's work to do, start the transaction: my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction - unless($reqdata eq undef) { - Debug(9, "Queue gave request data: ".$reqdata->getRequest()); + if ($reqdata ne undef) { + Debug(5, "Queue gave request data: ".$reqdata->getRequest()); &StartRequest($Socket, $reqdata); } else { # There's no work waiting, so push the server to idle list. - &Debug(8, "No new work requests, server connection going idle"); + &Debug(5, "No new work requests, server connection going idle"); $IdleConnections->push($Socket); } } @@ -509,7 +434,7 @@ sub ClientWritable { # request. &Debug(9,"Send result is ".$result." Defined: ".defined($result)); - if(defined($result)) { + if($result ne undef) { &Debug(9, "send result was defined"); if($result == length($Data)) { # Entire string sent. &Debug(9, "ClientWritable data all written"); @@ -580,12 +505,15 @@ The transaction that is being completed. =cut sub CompleteTransaction { - &Debug(6,"Complete transaction"); + &Debug(5,"Complete transaction"); my $Socket = shift; my $Transaction = shift; if (!$Transaction->isDeferred()) { # Normal transaction my $data = $Socket->GetReply(); # Data to send. + if($LogTransactions) { + Log("SUCCESS", "Reply from lond: '$data'"); + } StartClientReply($Transaction, $data); } else { # Delete deferred transaction file. Log("SUCCESS", "A delayed transaction was completed"); @@ -619,7 +547,6 @@ sub StartClientReply { &Debug(8," Reply was: ".$data); my $Serial = $ActiveClients{$Client}; my $desc = sprintf("Connection to lonc client %d", - $Serial); Event->io(fd => $Client, poll => "w", @@ -731,9 +658,9 @@ sub KillSocket { } if(exists($ActiveConnections{$Socket})) { delete($ActiveConnections{$Socket}); + $ConnectionCount--; + if ($ConnectionCount < 0) { $ConnectionCount = 0; } } - $ConnectionCount--; - # If the connection count has gone to zero and there is work in the # work queue, the work all gets failed with con_lost. # @@ -764,6 +691,17 @@ The connection must echo the challenge b The challenge has been replied to. The we are receiveing the 'ok' from the partner. +=head3 State=ReadingVersionString + +We have requested the lond version and are reading the +version back. Upon completion, we'll store the version away +for future use(?). + +=head3 State=HostSet + +We have selected the domain name of our peer (multhomed hosts) +and are getting the reply (presumably ok) back. + =head3 State=RequestingKey The ok has been received and we need to send the request for @@ -802,96 +740,119 @@ transaction is in progress, the socket a sub LondReadable { - my $Event = shift; - my $Watcher = $Event->w; - my $Socket = $Watcher->data; - my $client = undef; - - &Debug(6,"LondReadable called state = ".$Socket->GetState()); - + my $Event = shift; + my $Watcher = $Event->w; + my $Socket = $Watcher->data; + my $client = undef; + + &Debug(6,"LondReadable called state = ".$Socket->GetState()); + + + my $State = $Socket->GetState(); # All action depends on the state. + + SocketDump(6, $Socket); + my $status = $Socket->Readable(); + + &Debug(2, "Socket->Readable returned: $status"); + + if($status != 0) { + # bad return from socket read. Currently this means that + # The socket has become disconnected. We fail the transaction. + + Log("WARNING", + "Lond connection lost."); + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } + $Watcher->cancel(); + KillSocket($Socket); + $ConnectionRetriesLeft--; # Counts as connection failure + return; + } + SocketDump(6,$Socket); - my $State = $Socket->GetState(); # All action depends on the state. - - SocketDump(6, $Socket); - my $status = $Socket->Readable(); - - &Debug(2, "Socket->Readable returned: $status"); - - if($status != 0) { - # bad return from socket read. Currently this means that - # The socket has become disconnected. We fail the transaction. - - Log("WARNING", - "Lond connection lost."); - if(exists($ActiveTransactions{$Socket})) { - FailTransaction($ActiveTransactions{$Socket}); - } - $Watcher->cancel(); - KillSocket($Socket); - $ConnectionRetriesLeft--; # Counts as connection failure - return; - } - SocketDump(6,$Socket); - - $State = $Socket->GetState(); # Update in case of transition. - &Debug(6, "After read, state is ".$State); + $State = $Socket->GetState(); # Update in case of transition. + &Debug(6, "After read, state is ".$State); if($State eq "Initialized") { - } elsif ($State eq "ChallengeReceived") { + } elsif ($State eq "ChallengeReceived") { # The challenge must be echoed back; The state machine # in the connection takes care of setting that up. Just # need to transition to writable: - $Watcher->cb(\&LondWritable); - $Watcher->poll("w"); - - } elsif ($State eq "ChallengeReplied") { + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + } elsif ($State eq "ChallengeReplied") { - } elsif ($State eq "RequestingKey") { + } elsif ($State eq "RequestingVersion") { + # Need to ask for the version... that is writiability: + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + + } elsif ($State eq "ReadingVersionString") { + # Read the rest of the version string... + } elsif ($State eq "SetHost") { + # Need to request the actual domain get set... + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + } elsif ($State eq "HostSet") { + # Reading the 'ok' from the peer. + + } elsif ($State eq "RequestingKey") { # The ok was received. Now we need to request the key # That requires us to be writable: - $Watcher->cb(\&LondWritable); - $Watcher->poll("w"); + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); - } elsif ($State eq "ReceivingKey") { + } elsif ($State eq "ReceivingKey") { - } elsif ($State eq "Idle") { + } elsif ($State eq "Idle") { + + # This is as good a spot as any to get the peer version + # string: + + if($LondVersion eq "unknown") { + $LondVersion = $Socket->PeerVersion(); + Log("INFO", "Connected to lond version: $LondVersion"); + } # If necessary, complete a transaction and then go into the # idle queue. # Note that a trasition to idle indicates a live lond # on the other end so reset the connection retries. # - $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count - $Watcher->cancel(); - if(exists($ActiveTransactions{$Socket})) { - Debug(8,"Completing transaction!!"); - CompleteTransaction($Socket, - $ActiveTransactions{$Socket}); - } else { - Log("SUCCESS", "Connection ".$ConnectionCount." to " - .$RemoteHost." now ready for action"); - } - ServerToIdle($Socket); # Next work unit or idle. + $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count + $Watcher->cancel(); + if(exists($ActiveTransactions{$Socket})) { + Debug(5,"Completing transaction!!"); + CompleteTransaction($Socket, + $ActiveTransactions{$Socket}); + } else { + Log("SUCCESS", "Connection ".$ConnectionCount." to " + .$RemoteHost." now ready for action"); + } + ServerToIdle($Socket); # Next work unit or idle. - } elsif ($State eq "SendingRequest") { + } elsif ($State eq "SendingRequest") { # We need to be writable for this and probably don't belong # here inthe first place. - Deubg(6, "SendingRequest state encountered in readable"); - $Watcher->poll("w"); - $Watcher->cb(\&LondWritable); + Deubg(6, "SendingRequest state encountered in readable"); + $Watcher->poll("w"); + $Watcher->cb(\&LondWritable); - } elsif ($State eq "ReceivingReply") { + } elsif ($State eq "ReceivingReply") { - } else { + } else { # Invalid state. - Debug(4, "Invalid state in LondReadable"); - } + Debug(4, "Invalid state in LondReadable"); + } } =pod @@ -971,100 +932,131 @@ sub LondWritable { SocketDump(6,$Socket); - if ($State eq "Connected") { + if ($State eq "Connected") { - if ($Socket->Writable() != 0) { - # The write resulted in an error. - # We'll treat this as if the socket got disconnected: - Log("WARNING", "Connection to ".$RemoteHost. - " has been disconnected"); - FailTransaction($ActiveTransactions{$Socket}); - $Watcher->cancel(); - KillSocket($Socket); - return; - } - # "init" is being sent... - - - } elsif ($State eq "Initialized") { + if ($Socket->Writable() != 0) { + # The write resulted in an error. + # We'll treat this as if the socket got disconnected: + Log("WARNING", "Connection to ".$RemoteHost. + " has been disconnected"); + FailTransaction($ActiveTransactions{$Socket}); + $Watcher->cancel(); + KillSocket($Socket); + return; + } + + # "init" is being sent... + + } elsif ($State eq "Initialized") { - # Now that init was sent, we switch - # to watching for readability: + # Now that init was sent, we switch + # to watching for readability: - $Watcher->cb(\&LondReadable); - $Watcher->poll("r"); + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); - } elsif ($State eq "ChallengeReceived") { - # We received the challenge, now we - # are echoing it back. This is a no-op, - # we're waiting for the state to change + } elsif ($State eq "ChallengeReceived") { + # We received the challenge, now we + # are echoing it back. This is a no-op, + # we're waiting for the state to change - if($Socket->Writable() != 0) { + if($Socket->Writable() != 0) { - $Watcher->cancel(); - KillSocket($Socket); - return; - } + $Watcher->cancel(); + KillSocket($Socket); + return; + } - } elsif ($State eq "ChallengeReplied") { - # The echo was sent back, so we switch - # to watching readability. - - $Watcher->cb(\&LondReadable); - $Watcher->poll("r"); - - } elsif ($State eq "RequestingKey") { - # At this time we're requesting the key. - # again, this is essentially a no-op. - # we'll write the next chunk until the - # state changes. - - if($Socket->Writable() != 0) { - # Write resulted in an error. - - $Watcher->cancel(); - KillSocket($Socket); - return; - - } - } elsif ($State eq "ReceivingKey") { - # Now we need to wait for the key - # to come back from the peer: + } elsif ($State eq "ChallengeReplied") { + # The echo was sent back, so we switch + # to watching readability. + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + } elsif ($State eq "RequestingVersion") { + # Sending the peer a version request... + + if($Socket->Writable() != 0) { + $Watcher->cancel(); + KillSocket($Socket); + return; + } + } elsif ($State eq "ReadingVersionString") { + # Transition to read since we have sent the + # version command and now just need to read the + # version string from the peer: + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + + } elsif ($State eq "SetHost") { + # Setting the remote domain... + + if($Socket->Writable() != 0) { + $Watcher->cancel(); + KillSocket($Socket); + return; + } + } elsif ($State eq "HostSet") { + # Back to readable to get the ok. + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + + + } elsif ($State eq "RequestingKey") { + # At this time we're requesting the key. + # again, this is essentially a no-op. + # we'll write the next chunk until the + # state changes. + + if($Socket->Writable() != 0) { + # Write resulted in an error. + + $Watcher->cancel(); + KillSocket($Socket); + return; + + } + } elsif ($State eq "ReceivingKey") { + # Now we need to wait for the key + # to come back from the peer: - $Watcher->cb(\&LondReadable); - $Watcher->poll("r"); + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); - } elsif ($State eq "SendingRequest") { - # At this time we are sending a request to the + } elsif ($State eq "SendingRequest") { + + # At this time we are sending a request to the # peer... write the next chunk: - if($Socket->Writable() != 0) { + if($Socket->Writable() != 0) { - if(exists($ActiveTransactions{$Socket})) { - Debug(3, "Lond connection lost, failing transactions"); - FailTransaction($ActiveTransactions{$Socket}); - } - $Watcher->cancel(); - KillSocket($Socket); - return; + if(exists($ActiveTransactions{$Socket})) { + Debug(3, "Lond connection lost, failing transactions"); + FailTransaction($ActiveTransactions{$Socket}); + } + $Watcher->cancel(); + KillSocket($Socket); + return; - } - - } elsif ($State eq "ReceivingReply") { - # The send has completed. Wait for the - # data to come in for a reply. - Debug(8,"Writable sent request/receiving reply"); - $Watcher->cb(\&LondReadable); - $Watcher->poll("r"); + } - } else { - # Control only passes here on an error: - # the socket state does not match any - # of the known states... so an error - # must be logged. + } elsif ($State eq "ReceivingReply") { + # The send has completed. Wait for the + # data to come in for a reply. + Debug(8,"Writable sent request/receiving reply"); + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + + } else { + # Control only passes here on an error: + # the socket state does not match any + # of the known states... so an error + # must be logged. - &Debug(4, "Invalid socket state ".$State."\n"); - } + &Debug(4, "Invalid socket state ".$State."\n"); + } } =pod @@ -1115,7 +1107,7 @@ sub MakeLondConnection { my $Connection = LondConnection->new(&GetServerHost(), &GetServerPort()); - if($Connection == undef) { # Needs to be more robust later. + if($Connection eq undef) { # Needs to be more robust later. Log("CRITICAL","Failed to make a connection with lond."); $ConnectionRetriesLeft--; return 0; # Failure. @@ -1127,7 +1119,7 @@ sub MakeLondConnection { # exchange underway. # my $Socket = $Connection->GetSocket(); - if($Socket == undef) { + if($Socket eq undef) { die "did not get a socket from the connection"; } else { &Debug(9,"MakeLondConnection got socket: ".$Socket); @@ -1230,11 +1222,11 @@ sub QueueTransaction { my $LondSocket = $IdleConnections->pop(); if(!defined $LondSocket) { # Need to queue request. - Debug(8,"Must queue..."); + Debug(5,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { if($ConnectionRetriesLeft > 0) { - Debug(4,"Starting additional lond connection"); + Debug(5,"Starting additional lond connection"); if(MakeLondConnection() == 0) { EmptyQueue(); # Fail transactions, can't make connection. } @@ -1272,7 +1264,7 @@ sub ClientRequest { my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0); Debug(8, "rcv: data length = ".length($thisread) ." read =".$thisread); - unless (defined $rv && length($thisread)) { + unless (defined $rv && length($thisread)) { # Likely eof on socket. Debug(5,"Client Socket closed on lonc for ".$RemoteHost); close($socket); @@ -1291,6 +1283,9 @@ sub ClientRequest { exit; } Debug(8, "Complete transaction received: ".$data); + if($LogTransactions) { + Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n. + } my $Transaction = LondTransaction->new($data); $Transaction->SetClient($socket); QueueTransaction($Transaction); @@ -1399,6 +1394,24 @@ sub SetupLoncListener { fd => $socket); } +# +# Toggle transaction logging. +# Implicit inputs: +# LogTransactions +# Implicit Outputs: +# LogTransactions +sub ToggleTransactionLogging { + print STDERR "Toggle transaction logging...\n"; + if(!$LogTransactions) { + $LogTransactions = 1; + } else { + $LogTransactions = 0; + } + + + Log("SUCCESS", "Toggled transaction logging: $LogTransactions \n"); +} + =pod =head2 ChildStatus @@ -1418,6 +1431,20 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + # + # Write out information about each of the connections: + # + print $fh "Active connection statuses: \n"; + my $i = 1; + print STDERR "================================= Socket Status Dump:\n"; + foreach my $item (keys %ActiveConnections) { + my $Socket = $ActiveConnections{$item}->data; + my $state = $Socket->GetState(); + print $fh "Connection $i State: $state\n"; + print STDERR "---------------------- Connection $i \n"; + $Socket->Dump(); + $i++; + } $ConnectionRetriesLeft = $ConnectionRetries; } @@ -1482,6 +1509,8 @@ sub ChildProcess { Event->signal(signal => "USR1", cb => \&ChildStatus, data => "USR1"); + Event->signal(signal => "USR2", + cb => \&ToggleTransactionLogging); Event->signal(signal => "INT", cb => \&ToggleDebug, data => "INT"); @@ -1517,7 +1546,8 @@ sub CreateChild { my $pid = fork; if($pid) { # Parent $RemoteHost = "Parent"; - $ChildHash{$pid} = $RemoteHost; + $ChildHash{$pid} = $host; + $HostToPid{$host}= $pid; sigprocmask(SIG_UNBLOCK, $sigset); } else { # child. @@ -1575,11 +1605,13 @@ ShowStatus("Forking node servers"); Log("CRITICAL", "--------------- Starting children ---------------"); +LondConnection::ReadConfig; # Read standard config files. my $HostIterator = LondConnection::GetHostIterator; while (! $HostIterator->end()) { my $hostentryref = $HostIterator->get(); CreateChild($hostentryref->[0]); + $HostHash{$hostentryref->[0]} = $hostentryref->[4]; $HostIterator->next(); } $RemoteHost = "Parent Server"; @@ -1596,11 +1628,13 @@ $SIG{INT} = \&Terminate; $SIG{TERM} = \&Terminate; $SIG{HUP} = \&Restart; $SIG{USR1} = \&CheckKids; +$SIG{USR2} = \&UpdateKids; # LonManage update request. while(1) { my $deadchild = wait(); if(exists $ChildHash{$deadchild}) { # need to restart. my $deadhost = $ChildHash{$deadchild}; + delete($HostToPid{$deadhost}); delete($ChildHash{$deadchild}); Log("WARNING","Lost child pid= ".$deadchild. "Connected to host ".$deadhost); @@ -1639,6 +1673,114 @@ sub CheckKids { =pod +=head1 UpdateKids + +parent's SIGUSR2 handler. This handler: + +=item + +Rereads the hosts file. + +=item + +Kills off (via sigint) children for hosts that have disappeared. + +=item + +QUITs children for hosts that already exist (this just forces a status display +and resets the connection retry count for that host. + +=item + +Starts new children for hosts that have been added to the hosts.tab file since +the start of the master program and maintains them. + +=cut + +sub UpdateKids { + + Log("INFO", "Updating connections via SIGUSR2"); + + # Just in case we need to kill our own lonc, we wait a few seconds to + # give it a chance to receive and relay lond's response to the + # re-init command. + # + + sleep(2); # Wait a couple of seconds. + + my %hosts; # Indexed by loncapa hostname, value=ip. + + # Need to re-read the host table: + + + LondConnection::ReadConfig(); + my $I = LondConnection::GetHostIterator; + while (! $I->end()) { + my $item = $I->get(); + $hosts{$item->[0]} = $item->[4]; + $I->next(); + } + + # The logic below is written for clarity not for efficiency. + # Since I anticipate that this function is only rarely called, that's + # appropriate. There are certainly ways to combine the loops below, + # and anyone wishing to obscure the logic is welcome to go for it. + # Note that we don't re-direct sigchild. Instead we do what's needed + # to the data structures that keep track of children to ensure that + # when sigchild is honored, no new child is born. + # + + # For each existing child; if it's host doesn't exist, kill the child. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + if (!(exists $hosts{$oldhost})) { + Log("CRITICAL", "Killing child for $oldhost host no longer exists"); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + } + } + # For each remaining existing child; if it's host's ip has changed, + # Restart the child on the new IP. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + my $oldip = $HostHash{$oldhost}; + if ($hosts{$oldhost} ne $oldip) { + + # kill the old child. + + Log("CRITICAL", "Killing child for $oldhost host ip has changed..."); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + + # Do the book-keeping needed to start a new child on the + # new ip. + + $HostHash{$oldhost} = $hosts{$oldhost}; + CreateChild($oldhost); + } + } + # Finally, for each new host, not in the host hash, create a + # enter the host and create a new child. + # Force a status display of any existing process. + + foreach my $host (keys %hosts) { + if(!(exists $HostHash{$host})) { + Log("INFO", "New host $host discovered in hosts.tab..."); + $HostHash{$host} = $hosts{$host}; + CreateChild($host); + } else { + kill 'HUP' => $HostToPid{$host}; # status display. + } + } +} + + +=pod + =head1 Restart Signal handler for HUP... all children are killed and @@ -1652,7 +1794,7 @@ sub Restart { Log("CRITICAL", "Restarting"); my $execdir = $perlvar{'lonDaemons'}; unlink("$execdir/logs/lonc.pid"); - exec("$execdir/lonc"); + exec("$execdir/loncnew"); } =pod