--- loncom/loncnew 2003/09/15 09:24:49 1.23 +++ loncom/loncnew 2004/08/26 12:35:10 1.51 @@ -2,7 +2,7 @@ # The LearningOnline Network with CAPA # lonc maintains the connections to remote computers # -# $Id: loncnew,v 1.23 2003/09/15 09:24:49 foxr Exp $ +# $Id: loncnew,v 1.51 2004/08/26 12:35:10 albertel Exp $ # # Copyright Michigan State University Board of Trustees # @@ -35,91 +35,16 @@ # - Add ability to create/negotiate lond connections (done). # - Add general logic for dispatching requests and timeouts. (done). # - Add support for the lonc/lond requests. (done). -# - Add logging/status monitoring. -# - Add Signal handling - HUP restarts. USR1 status report. +# - Add logging/status monitoring. (done) +# - Add Signal handling - HUP restarts. USR1 status report. (done) # - Add Configuration file I/O (done). -# - Add management/status request interface. +# - Add management/status request interface. (done) # - Add deferred request capability. (done) -# - Detect transmission timeouts. +# - Detect transmission timeouts. (done) # -# Change log: -# $Log: loncnew,v $ -# Revision 1.23 2003/09/15 09:24:49 foxr -# Add use strict and fix all the fallout from that. -# -# Revision 1.22 2003/09/02 10:34:47 foxr -# - Fix errors in host dead detection logic (too many cases where the -# retries left were not getting incremented or just not checked). -# - Added some additional status to the ps axuww display: -# o Remaining retries on a host. -# o >>> DEAD <<< indicator if I've given up on a host. -# - Tested the SIGHUP will reset the retries remaining count (thanks to -# the above status stuff, and get allow the loncnew to re-try again -# on the host (thanks to the log). -# -# Revision 1.21 2003/08/26 09:19:51 foxr -# How embarrassing... put in the SocketTimeout function in loncnew and forgot -# to actually hook it into the LondTransaction. Added this to MakeLondConnection -# where it belongs... hopefully transactions (not just connection attempts) will -# timeout more speedily than the socket errors will catch it. -# -# Revision 1.20 2003/08/25 18:48:11 albertel -# - fixing a forgotten ; -# -# Revision 1.19 2003/08/19 09:31:46 foxr -# Get socket directory from configuration rather than the old hard coded test -# way that I forgot to un-hard code. -# -# Revision 1.18 2003/08/06 09:52:29 foxr -# Also needed to remember to fail in-flight transactions if their sends fail. -# -# Revision 1.17 2003/08/03 00:44:31 foxr -# 1. Correct handling of connection failure: Assume it means the host is -# unreachable and fail all of the queued transactions. Note that the -# inflight transactions should fail on their own time due either to timeout -# or send/receive failures. -# 2. Correct handling of logs for forced death signals. Pull the signal -# from the event watcher. -# -# Revision 1.16 2003/07/29 02:33:05 foxr -# Add SIGINT processing to child processes to toggle annoying trace mode -# on/off.. will try to use this to isolate the compute boud process issue. -# -# Revision 1.15 2003/07/15 02:07:05 foxr -# Added code for lonc/lond transaction timeouts. Who knows if it works right. -# The intent is for a timeout to fail any transaction in progress and kill -# off the sockt that timed out. -# -# Revision 1.14 2003/07/03 02:10:18 foxr -# Get all of the signals to work correctly. -# -# Revision 1.13 2003/07/02 01:31:55 foxr -# Added kill -HUP logic (restart). -# -# Revision 1.11 2003/06/25 01:54:44 foxr -# Fix more problems with transaction failure. -# -# Revision 1.10 2003/06/24 02:46:04 foxr -# Put a limit on the number of times we'll retry a connection. -# Start getting the signal stuff put in as well...note that need to get signals -# going or else the client will permanently give up on dead servers. -# -# Revision 1.9 2003/06/13 02:38:43 foxr -# Add logging in 'expected format' -# -# Revision 1.8 2003/06/11 02:04:35 foxr -# Support delayed transactions... this is done uniformly by encapsulating -# transactions in an object ... a LondTransaction that is implemented by -# LondTransaction.pm -# -# Revision 1.7 2003/06/03 01:59:39 foxr -# complete coding to support deferred transactions. -# -# use strict; use lib "/home/httpd/lib/perl/"; -use lib "/home/foxr/newloncapa/types"; use Event qw(:DEFAULT ); use POSIX qw(:signal_h); use POSIX; @@ -138,17 +63,6 @@ use LONCAPA::Configuration; use LONCAPA::HashIterator; -# -# Disable all signals we might receive from outside for now. -# -#$SIG{QUIT} = IGNORE; -#$SIG{HUP} = IGNORE; -#$SIG{USR1} = IGNORE; -#$SIG{INT} = IGNORE; -#$SIG{CHLD} = IGNORE; -#$SIG{__DIE__} = IGNORE; - - # Read the httpd configuration file to get perl variables # normally set in apache modules: @@ -159,14 +73,18 @@ my %perlvar = %{$perlvarref}; # parent and shared variables. my %ChildHash; # by pid -> host. +my %HostToPid; # By host -> pid. +my %HostHash; # by loncapaname -> IP. my $MaxConnectionCount = 10; # Will get from config later. my $ClientConnection = 0; # Uniquifier for client events. my $DebugLevel = 0; -my $NextDebugLevel= 10; # So Sigint can toggle this. -my $IdleTimeout= 3600; # Wait an hour before pruning connections. +my $NextDebugLevel= 2; # So Sigint can toggle this. +my $IdleTimeout= 600; # Wait 10 minutes before pruning connections. + +my $LogTransactions = 0; # When True, all transactions/replies get logged. # # The variables below are only used by the child processes. @@ -182,19 +100,25 @@ my $ConnectionCount = 0; my $IdleSeconds = 0; # Number of seconds idle. my $Status = ""; # Current status string. my $RecentLogEntry = ""; -my $ConnectionRetries=5; # Number of connection retries allowed. -my $ConnectionRetriesLeft=5; # Number of connection retries remaining. - +my $ConnectionRetries=2; # Number of connection retries allowed. +my $ConnectionRetriesLeft=2; # Number of connection retries remaining. +my $LondVersion = "unknown"; # Version of lond we talk with. +my $KeyMode = ""; # e.g. ssl, local, insecure from last connect. + +my $LongTickLength = 10000000; #Tick Frequency when Idle +my $ShortTickLength = 1; #Tick Frequency when Active (many places in + # the code assume this is one) +my $TickLength = $ShortTickLength;#number of seconds to wait until ticking # # The hash below gives the HTML format for log messages # given a severity. # my %LogFormats; -$LogFormats{"CRITICAL"} = "CRITICAL: %s"; -$LogFormats{"SUCCESS"} = "SUCCESS: %s"; -$LogFormats{"INFO"} = "INFO: %s"; -$LogFormats{"WARNING"} = "WARNING: %s"; +$LogFormats{"CRITICAL"} = "CRITICAL: %s"; +$LogFormats{"SUCCESS"} = "SUCCESS: %s"; +$LogFormats{"INFO"} = "INFO: %s"; +$LogFormats{"WARNING"} = "WARNING: %s"; $LogFormats{"DEFAULT"} = " %s "; @@ -237,9 +161,9 @@ host and the time will be formatted into =cut sub Log { - my $severity = shift; - my $message = shift; - + + my ($severity, $message) = @_; + if(!$LogFormats{$severity}) { $severity = "DEFAULT"; } @@ -274,8 +198,10 @@ Returns the name of the host that a sock =cut sub GetPeername { - my $connection = shift; - my $AdrFamily = shift; + + + my ($connection, $AdrFamily) = @_; + my $peer = $connection->peername(); my $peerport; my $peerip; @@ -289,7 +215,6 @@ sub GetPeername { return $peerfile; } } -#----------------------------- Timer management ------------------------ =pod =head2 Debug @@ -299,18 +224,20 @@ Invoked to issue a debug message. =cut sub Debug { - my $level = shift; - my $message = shift; + + my ($level, $message) = @_; + if ($level <= $DebugLevel) { Log("INFO", "-Debug- $message host = $RemoteHost"); } } sub SocketDump { - my $level = shift; - my $socket= shift; + + my ($level, $socket) = @_; + if($level <= $DebugLevel) { - $socket->Dump(); + $socket->Dump(-1); # Ensure it will get dumped. } } @@ -341,11 +268,21 @@ sub ShowStatus { =cut sub SocketTimeout { my $Socket = shift; - + Log("WARNING", "A socket timeout was detected"); + Debug(0, " SocketTimeout called: "); + $Socket->Dump(0); + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } KillSocket($Socket); # A transaction timeout also counts as # a connection failure: $ConnectionRetriesLeft--; + if($ConnectionRetriesLeft <= 0) { + Log("CRITICAL", "Host marked dead: ".GetServerHost()); + } + } +#----------------------------- Timer management ------------------------ =pod @@ -360,7 +297,8 @@ sub Tick { my $client; if($ConnectionRetriesLeft > 0) { ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount - ." Retries remaining: ".$ConnectionRetriesLeft); + ." Retries remaining: ".$ConnectionRetriesLeft + ." ($KeyMode)"); } else { ShowStatus(GetServerHost()." >> DEAD <<"); } @@ -369,10 +307,13 @@ sub Tick { if($IdleConnections->Count() && ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do? - $IdleSeconds++; + $IdleSeconds+=$TickLength; if($IdleSeconds > $IdleTimeout) { # Prune a connection... my $Socket = $IdleConnections->pop(); KillSocket($Socket); + if ($IdleConnections->Count() == 0) { + &SetupTimer($LongTickLength); + } } } else { $IdleSeconds = 0; # Reset idle count if not idle. @@ -380,9 +321,13 @@ sub Tick { # # For each inflight transaction, tick down its timeout counter. # - foreach my $item (keys %ActiveTransactions) { - my $Socket = $ActiveTransactions{$item}->getServer(); - $Socket->Tick(); + + foreach my $item (keys %ActiveConnections) { + my $State = $ActiveConnections{$item}->data->GetState(); + if ($State ne 'Idle') { + Debug(5,"Ticking Socket $State $item"); + $ActiveConnections{$item}->data->Tick(); + } } # Do we have work in the queue, but no connections to service them? # If so, try to make some new connections to get things going again. @@ -393,22 +338,27 @@ sub Tick { if ($ConnectionRetriesLeft > 0) { my $Connections = ($Requests <= $MaxConnectionCount) ? $Requests : $MaxConnectionCount; - Debug(1,"Work but no connections, start ".$Connections." of them"); + Debug(5,"Work but no connections, start ".$Connections." of them"); my $successCount = 0; for (my $i =0; $i < $Connections; $i++) { $successCount += MakeLondConnection(); } if($successCount == 0) { # All connections failed: - Debug(1,"Work in queue failed to make any connectiouns\n"); + Debug(5,"Work in queue failed to make any connectiouns\n"); EmptyQueue(); # Fail pending transactions with con_lost. + CloseAllLondConnections(); # Should all be closed but.... } } else { ShowStatus(GetServerHost()." >>> DEAD!!! <<<"); - Debug(1,"Work in queue, but gave up on connections..flushing\n"); + Debug(5,"Work in queue, but gave up on connections..flushing\n"); EmptyQueue(); # Connections can't be established. + CloseAllLondConnections(); # Should all already be closed but... } } + if ($ConnectionCount == 0) { + $KeyMode = ""; + } } =pod @@ -427,9 +377,13 @@ Trigger disconnections of idle sockets. =cut +my $timer; sub SetupTimer { - Debug(6, "SetupTimer"); - Event->timer(interval => 1, debug => 1, cb => \&Tick ); + my ($newLength)=@_; + Debug(6, "SetupTimer $TickLength->$newLength"); + $TickLength=$newLength; + if ($timer) { $timer->cancel; } + $timer=Event->timer(interval => $TickLength, cb => \&Tick ); } =pod @@ -449,21 +403,22 @@ long enough, it will be shut down and re sub ServerToIdle { my $Socket = shift; # Get the socket. + $KeyMode = $Socket->{AuthenticationMode}; delete($ActiveTransactions{$Socket}); # Server has no transaction - &Debug(6, "Server to idle"); + &Debug(5, "Server to idle"); # If there's work to do, start the transaction: my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction - unless($reqdata eq undef) { - Debug(9, "Queue gave request data: ".$reqdata->getRequest()); + if ($reqdata ne undef) { + Debug(5, "Queue gave request data: ".$reqdata->getRequest()); &StartRequest($Socket, $reqdata); } else { # There's no work waiting, so push the server to idle list. - &Debug(8, "No new work requests, server connection going idle"); + &Debug(5, "No new work requests, server connection going idle"); $IdleConnections->push($Socket); } } @@ -509,7 +464,7 @@ sub ClientWritable { # request. &Debug(9,"Send result is ".$result." Defined: ".defined($result)); - if(defined($result)) { + if($result ne undef) { &Debug(9, "send result was defined"); if($result == length($Data)) { # Entire string sent. &Debug(9, "ClientWritable data all written"); @@ -580,12 +535,15 @@ The transaction that is being completed. =cut sub CompleteTransaction { - &Debug(6,"Complete transaction"); - my $Socket = shift; - my $Transaction = shift; + &Debug(5,"Complete transaction"); + + my ($Socket, $Transaction) = @_; if (!$Transaction->isDeferred()) { # Normal transaction my $data = $Socket->GetReply(); # Data to send. + if($LogTransactions) { + Log("SUCCESS", "Reply from lond: '$data'"); + } StartClientReply($Transaction, $data); } else { # Delete deferred transaction file. Log("SUCCESS", "A delayed transaction was completed"); @@ -593,7 +551,9 @@ sub CompleteTransaction { unlink $Transaction->getFile(); } } + =pod + =head1 StartClientReply Initiates a reply to a client where the reply data is a parameter. @@ -609,17 +569,16 @@ sub CompleteTransaction { The data to send to apached client. =cut + sub StartClientReply { - my $Transaction = shift; - my $data = shift; + my ($Transaction, $data) = @_; my $Client = $Transaction->getClient(); &Debug(8," Reply was: ".$data); my $Serial = $ActiveClients{$Client}; my $desc = sprintf("Connection to lonc client %d", - $Serial); Event->io(fd => $Client, poll => "w", @@ -627,7 +586,9 @@ sub StartClientReply { cb => \&ClientWritable, data => $data); } + =pod + =head2 FailTransaction Finishes a transaction with failure because the associated lond socket @@ -637,8 +598,7 @@ sub StartClientReply { - The transaction is 'live' in which case we initiate the sending of "con_lost" to the client. -Deleting the transaction means killing it from the -%ActiveTransactions hash. +Deleting the transaction means killing it from the %ActiveTransactions hash. Parameters: @@ -646,6 +606,7 @@ Parameters: The LondTransaction we are failing. + =cut sub FailTransaction { @@ -657,9 +618,6 @@ sub FailTransaction { Debug(1," Replying con_lost to ".$transaction->getRequest()); StartClientReply($transaction, "con_lost\n"); } - if($ConnectionRetriesLeft <= 0) { - Log("CRITICAL", "Host marked dead: ".GetServerHost()); - } } @@ -687,7 +645,10 @@ Close all connections open on lond prior =cut sub CloseAllLondConnections { foreach my $Socket (keys %ActiveConnections) { - KillSocket($Socket); + if(exists($ActiveTransactions{$Socket})) { + FailTransaction($ActiveTransactions{$Socket}); + } + KillSocket($Socket); } } =cut @@ -731,14 +692,15 @@ sub KillSocket { } if(exists($ActiveConnections{$Socket})) { delete($ActiveConnections{$Socket}); + $ConnectionCount--; + if ($ConnectionCount < 0) { $ConnectionCount = 0; } } - $ConnectionCount--; - # If the connection count has gone to zero and there is work in the # work queue, the work all gets failed with con_lost. # if($ConnectionCount == 0) { EmptyQueue(); + CloseAllLondConnections; # Should all already be closed but... } } @@ -764,6 +726,17 @@ The connection must echo the challenge b The challenge has been replied to. The we are receiveing the 'ok' from the partner. +=head3 State=ReadingVersionString + +We have requested the lond version and are reading the +version back. Upon completion, we'll store the version away +for future use(?). + +=head3 State=HostSet + +We have selected the domain name of our peer (multhomed hosts) +and are getting the reply (presumably ok) back. + =head3 State=RequestingKey The ok has been received and we need to send the request for @@ -818,7 +791,7 @@ sub LondReadable { &Debug(2, "Socket->Readable returned: $status"); if($status != 0) { - # bad return from socket read. Currently this means that + # bad return from socket read. Currently this means that # The socket has become disconnected. We fail the transaction. Log("WARNING", @@ -836,19 +809,34 @@ sub LondReadable { $State = $Socket->GetState(); # Update in case of transition. &Debug(6, "After read, state is ".$State); - if($State eq "Initialized") { + if($State eq "Initialized") { } elsif ($State eq "ChallengeReceived") { # The challenge must be echoed back; The state machine # in the connection takes care of setting that up. Just # need to transition to writable: - + $Watcher->cb(\&LondWritable); $Watcher->poll("w"); } elsif ($State eq "ChallengeReplied") { + } elsif ($State eq "RequestingVersion") { + # Need to ask for the version... that is writiability: + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + + } elsif ($State eq "ReadingVersionString") { + # Read the rest of the version string... + } elsif ($State eq "SetHost") { + # Need to request the actual domain get set... + + $Watcher->cb(\&LondWritable); + $Watcher->poll("w"); + } elsif ($State eq "HostSet") { + # Reading the 'ok' from the peer. } elsif ($State eq "RequestingKey") { # The ok was received. Now we need to request the key @@ -860,6 +848,14 @@ sub LondReadable { } elsif ($State eq "ReceivingKey") { } elsif ($State eq "Idle") { + + # This is as good a spot as any to get the peer version + # string: + + if($LondVersion eq "unknown") { + $LondVersion = $Socket->PeerVersion(); + Log("INFO", "Connected to lond version: $LondVersion"); + } # If necessary, complete a transaction and then go into the # idle queue. # Note that a trasition to idle indicates a live lond @@ -868,7 +864,7 @@ sub LondReadable { $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count $Watcher->cancel(); if(exists($ActiveTransactions{$Socket})) { - Debug(8,"Completing transaction!!"); + Debug(5,"Completing transaction!!"); CompleteTransaction($Socket, $ActiveTransactions{$Socket}); } else { @@ -889,7 +885,7 @@ sub LondReadable { } else { - # Invalid state. + # Invalid state. Debug(4, "Invalid state in LondReadable"); } } @@ -971,21 +967,36 @@ sub LondWritable { SocketDump(6,$Socket); - if ($State eq "Connected") { - - if ($Socket->Writable() != 0) { - # The write resulted in an error. - # We'll treat this as if the socket got disconnected: - Log("WARNING", "Connection to ".$RemoteHost. - " has been disconnected"); + # If the socket is writable, we must always write. + # Only by writing will we undergo state transitions. + # Old logic wrote in state specific code below, however + # That forces us at least through another invocation of + # this function after writability is possible again. + # This logic also factors out common code for handling + # write failures... in all cases, write failures + # Kill the socket. + # This logic makes the branches of the >big< if below + # so that the writing states are actually NO-OPs. + + if ($Socket->Writable() != 0) { + # The write resulted in an error. + # We'll treat this as if the socket got disconnected: + Log("WARNING", "Connection to ".$RemoteHost. + " has been disconnected"); + if(exists($ActiveTransactions{$Socket})) { FailTransaction($ActiveTransactions{$Socket}); - $Watcher->cancel(); - KillSocket($Socket); - return; } - # "init" is being sent... + $Watcher->cancel(); + KillSocket($Socket); + return; + } - + + + if ($State eq "Connected") { + + # "init" is being sent... + } elsif ($State eq "Initialized") { # Now that init was sent, we switch @@ -993,40 +1004,43 @@ sub LondWritable { $Watcher->cb(\&LondReadable); $Watcher->poll("r"); - + } elsif ($State eq "ChallengeReceived") { # We received the challenge, now we # are echoing it back. This is a no-op, # we're waiting for the state to change - if($Socket->Writable() != 0) { - - $Watcher->cancel(); - KillSocket($Socket); - return; - } - } elsif ($State eq "ChallengeReplied") { # The echo was sent back, so we switch # to watching readability. $Watcher->cb(\&LondReadable); $Watcher->poll("r"); + } elsif ($State eq "RequestingVersion") { + # Sending the peer a version request... + + } elsif ($State eq "ReadingVersionString") { + # Transition to read since we have sent the + # version command and now just need to read the + # version string from the peer: + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + + } elsif ($State eq "SetHost") { + # Setting the remote domain... + + } elsif ($State eq "HostSet") { + # Back to readable to get the ok. + + $Watcher->cb(\&LondReadable); + $Watcher->poll("r"); + } elsif ($State eq "RequestingKey") { # At this time we're requesting the key. # again, this is essentially a no-op. - # we'll write the next chunk until the - # state changes. - - if($Socket->Writable() != 0) { - # Write resulted in an error. - $Watcher->cancel(); - KillSocket($Socket); - return; - - } } elsif ($State eq "ReceivingKey") { # Now we need to wait for the key # to come back from the peer: @@ -1035,20 +1049,10 @@ sub LondWritable { $Watcher->poll("r"); } elsif ($State eq "SendingRequest") { + # At this time we are sending a request to the # peer... write the next chunk: - if($Socket->Writable() != 0) { - - if(exists($ActiveTransactions{$Socket})) { - Debug(3, "Lond connection lost, failing transactions"); - FailTransaction($ActiveTransactions{$Socket}); - } - $Watcher->cancel(); - KillSocket($Socket); - return; - - } } elsif ($State eq "ReceivingReply") { # The send has completed. Wait for the @@ -1115,7 +1119,7 @@ sub MakeLondConnection { my $Connection = LondConnection->new(&GetServerHost(), &GetServerPort()); - if($Connection == undef) { # Needs to be more robust later. + if($Connection eq undef) { # Needs to be more robust later. Log("CRITICAL","Failed to make a connection with lond."); $ConnectionRetriesLeft--; return 0; # Failure. @@ -1127,7 +1131,7 @@ sub MakeLondConnection { # exchange underway. # my $Socket = $Connection->GetSocket(); - if($Socket == undef) { + if($Socket eq undef) { die "did not get a socket from the connection"; } else { &Debug(9,"MakeLondConnection got socket: ".$Socket); @@ -1180,8 +1184,8 @@ The text of the request to send. =cut sub StartRequest { - my $Lond = shift; - my $Request = shift; # This is a LondTransaction. + + my ($Lond, $Request) = @_; Debug(6, "StartRequest: ".$Request->getRequest()); @@ -1230,17 +1234,20 @@ sub QueueTransaction { my $LondSocket = $IdleConnections->pop(); if(!defined $LondSocket) { # Need to queue request. - Debug(8,"Must queue..."); + Debug(5,"Must queue..."); $WorkQueue->enqueue($requestData); if($ConnectionCount < $MaxConnectionCount) { if($ConnectionRetriesLeft > 0) { - Debug(4,"Starting additional lond connection"); + Debug(5,"Starting additional lond connection"); if(MakeLondConnection() == 0) { EmptyQueue(); # Fail transactions, can't make connection. + CloseAllLondConnections; # Should all be closed but... } + &SetupTimer($ShortTickLength); } else { ShowStatus(GetServerHost()." >>> DEAD !!!! <<<"); EmptyQueue(); # It's worse than that ... he's dead Jim. + CloseAllLondConnections; # Should all be closed but.. } } } else { # Can start the request: @@ -1272,7 +1279,7 @@ sub ClientRequest { my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0); Debug(8, "rcv: data length = ".length($thisread) ." read =".$thisread); - unless (defined $rv && length($thisread)) { + unless (defined $rv && length($thisread)) { # Likely eof on socket. Debug(5,"Client Socket closed on lonc for ".$RemoteHost); close($socket); @@ -1283,7 +1290,7 @@ sub ClientRequest { Debug(8,"Data: ".$data." this read: ".$thisread); $data = $data.$thisread; # Append new data. $watcher->data($data); - if($data =~ /(.*\n)/) { # Request entirely read. + if($data =~ /\n$/) { # Request entirely read. if($data eq "close_connection_exit\n") { Log("CRITICAL", "Request Close Connection ... exiting"); @@ -1291,6 +1298,9 @@ sub ClientRequest { exit; } Debug(8, "Complete transaction received: ".$data); + if($LogTransactions) { + Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n. + } my $Transaction = LondTransaction->new($data); $Transaction->SetClient($socket); QueueTransaction($Transaction); @@ -1399,6 +1409,24 @@ sub SetupLoncListener { fd => $socket); } +# +# Toggle transaction logging. +# Implicit inputs: +# LogTransactions +# Implicit Outputs: +# LogTransactions +sub ToggleTransactionLogging { + print STDERR "Toggle transaction logging...\n"; + if(!$LogTransactions) { + $LogTransactions = 1; + } else { + $LogTransactions = 0; + } + + + Log("SUCCESS", "Toggled transaction logging: $LogTransactions \n"); +} + =pod =head2 ChildStatus @@ -1409,6 +1437,7 @@ into the status file. We also use this to reset the retries count in order to allow the client to retry connections with a previously dead server. =cut + sub ChildStatus { my $event = shift; my $watcher = $event->w; @@ -1418,6 +1447,22 @@ sub ChildStatus { my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt"); print $fh $$."\t".$RemoteHost."\t".$Status."\t". $RecentLogEntry."\n"; + # + # Write out information about each of the connections: + # + if ($DebugLevel > 2) { + print $fh "Active connection statuses: \n"; + my $i = 1; + print STDERR "================================= Socket Status Dump:\n"; + foreach my $item (keys %ActiveConnections) { + my $Socket = $ActiveConnections{$item}->data; + my $state = $Socket->GetState(); + print $fh "Connection $i State: $state\n"; + print STDERR "---------------------- Connection $i \n"; + $Socket->Dump(-1); # Ensure it gets dumped.. + $i++; + } + } $ConnectionRetriesLeft = $ConnectionRetries; } @@ -1482,11 +1527,13 @@ sub ChildProcess { Event->signal(signal => "USR1", cb => \&ChildStatus, data => "USR1"); + Event->signal(signal => "USR2", + cb => \&ToggleTransactionLogging); Event->signal(signal => "INT", cb => \&ToggleDebug, data => "INT"); - SetupTimer(); + SetupTimer($LongTickLength); SetupLoncListener(); @@ -1517,7 +1564,8 @@ sub CreateChild { my $pid = fork; if($pid) { # Parent $RemoteHost = "Parent"; - $ChildHash{$pid} = $RemoteHost; + $ChildHash{$pid} = $host; + $HostToPid{$host}= $pid; sigprocmask(SIG_UNBLOCK, $sigset); } else { # child. @@ -1575,11 +1623,13 @@ ShowStatus("Forking node servers"); Log("CRITICAL", "--------------- Starting children ---------------"); +LondConnection::ReadConfig; # Read standard config files. my $HostIterator = LondConnection::GetHostIterator; while (! $HostIterator->end()) { my $hostentryref = $HostIterator->get(); CreateChild($hostentryref->[0]); + $HostHash{$hostentryref->[0]} = $hostentryref->[4]; $HostIterator->next(); } $RemoteHost = "Parent Server"; @@ -1596,11 +1646,13 @@ $SIG{INT} = \&Terminate; $SIG{TERM} = \&Terminate; $SIG{HUP} = \&Restart; $SIG{USR1} = \&CheckKids; +$SIG{USR2} = \&UpdateKids; # LonManage update request. while(1) { my $deadchild = wait(); if(exists $ChildHash{$deadchild}) { # need to restart. my $deadhost = $ChildHash{$deadchild}; + delete($HostToPid{$deadhost}); delete($ChildHash{$deadchild}); Log("WARNING","Lost child pid= ".$deadchild. "Connected to host ".$deadhost); @@ -1639,6 +1691,114 @@ sub CheckKids { =pod +=head1 UpdateKids + +parent's SIGUSR2 handler. This handler: + +=item + +Rereads the hosts file. + +=item + +Kills off (via sigint) children for hosts that have disappeared. + +=item + +QUITs children for hosts that already exist (this just forces a status display +and resets the connection retry count for that host. + +=item + +Starts new children for hosts that have been added to the hosts.tab file since +the start of the master program and maintains them. + +=cut + +sub UpdateKids { + + Log("INFO", "Updating connections via SIGUSR2"); + + # Just in case we need to kill our own lonc, we wait a few seconds to + # give it a chance to receive and relay lond's response to the + # re-init command. + # + + sleep(2); # Wait a couple of seconds. + + my %hosts; # Indexed by loncapa hostname, value=ip. + + # Need to re-read the host table: + + + LondConnection::ReadConfig(); + my $I = LondConnection::GetHostIterator; + while (! $I->end()) { + my $item = $I->get(); + $hosts{$item->[0]} = $item->[4]; + $I->next(); + } + + # The logic below is written for clarity not for efficiency. + # Since I anticipate that this function is only rarely called, that's + # appropriate. There are certainly ways to combine the loops below, + # and anyone wishing to obscure the logic is welcome to go for it. + # Note that we don't re-direct sigchild. Instead we do what's needed + # to the data structures that keep track of children to ensure that + # when sigchild is honored, no new child is born. + # + + # For each existing child; if it's host doesn't exist, kill the child. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + if (!(exists $hosts{$oldhost})) { + Log("CRITICAL", "Killing child for $oldhost host no longer exists"); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + } + } + # For each remaining existing child; if it's host's ip has changed, + # Restart the child on the new IP. + + foreach my $child (keys %ChildHash) { + my $oldhost = $ChildHash{$child}; + my $oldip = $HostHash{$oldhost}; + if ($hosts{$oldhost} ne $oldip) { + + # kill the old child. + + Log("CRITICAL", "Killing child for $oldhost host ip has changed..."); + delete $ChildHash{$child}; + delete $HostToPid{$oldhost}; + kill 'QUIT' => $child; + + # Do the book-keeping needed to start a new child on the + # new ip. + + $HostHash{$oldhost} = $hosts{$oldhost}; + CreateChild($oldhost); + } + } + # Finally, for each new host, not in the host hash, create a + # enter the host and create a new child. + # Force a status display of any existing process. + + foreach my $host (keys %hosts) { + if(!(exists $HostHash{$host})) { + Log("INFO", "New host $host discovered in hosts.tab..."); + $HostHash{$host} = $hosts{$host}; + CreateChild($host); + } else { + kill 'HUP' => $HostToPid{$host}; # status display. + } + } +} + + +=pod + =head1 Restart Signal handler for HUP... all children are killed and @@ -1652,7 +1812,7 @@ sub Restart { Log("CRITICAL", "Restarting"); my $execdir = $perlvar{'lonDaemons'}; unlink("$execdir/logs/lonc.pid"); - exec("$execdir/lonc"); + exec("$execdir/loncnew"); } =pod