Diff for /loncom/loncnew between versions 1.18 and 1.25

version 1.18, 2003/08/06 09:52:29 version 1.25, 2003/09/23 11:22:14
Line 45 Line 45
   
 # Change log:  # Change log:
 #    $Log$  #    $Log$
   #    Revision 1.25  2003/09/23 11:22:14  foxr
   #    Tested ability to receive sigusr2.  This is now logged and must be
   #    properly implemented as a re-read of hosts and re-init of appropriate
   #    children.
   #
   #    Revision 1.24  2003/09/16 09:46:42  foxr
   #    Added skeletal infrastructure to support SIGUSR2 update hosts request.
   #
   #    Revision 1.23  2003/09/15 09:24:49  foxr
   #    Add use strict and fix all the fallout from that.
   #
   #    Revision 1.22  2003/09/02 10:34:47  foxr
   #    - Fix errors in host dead detection logic (too many cases where the
   #      retries left were not getting incremented or just not checked).
   #    - Added some additional status to the ps axuww display:
   #      o Remaining retries on a host.
   #      o >>> DEAD <<< indicator if I've given up on a host.
   #    - Tested that SIGHUP will reset the retries remaining count (thanks to
   #      the above status stuff), and allow loncnew to re-try again
   #      on the host (thanks to the log).
   #
   #    Revision 1.21  2003/08/26 09:19:51  foxr
   #    How embarrassing... put in the SocketTimeout function in loncnew and forgot
   #    to actually hook it into the LondTransaction.  Added this to MakeLondConnection
   #    where it belongs... hopefully transactions (not just connection attempts) will
   #    timeout more speedily than the socket errors will catch it.
   #
   #    Revision 1.20  2003/08/25 18:48:11  albertel
   #    - fixing a forgotten ;
   #
   #    Revision 1.19  2003/08/19 09:31:46  foxr
   #    Get socket directory from configuration rather than the old hard coded test
   #    way that I forgot to un-hard code.
   #
 #    Revision 1.18  2003/08/06 09:52:29  foxr  #    Revision 1.18  2003/08/06 09:52:29  foxr
 #    Also needed to remember to fail in-flight transactions if their sends fail.  #    Also needed to remember to fail in-flight transactions if their sends fail.
 #  #
Line 77 Line 111
 #    Revision 1.10  2003/06/24 02:46:04  foxr  #    Revision 1.10  2003/06/24 02:46:04  foxr
 #    Put a limit on  the number of times we'll retry a connection.  #    Put a limit on  the number of times we'll retry a connection.
 #    Start getting the signal stuff put in as well...note that need to get signals  #    Start getting the signal stuff put in as well...note that need to get signals
 #    going or else 6the client will permanently give up on dead servers.  #    going or else the client will permanently give up on dead servers.
 #  #
 #    Revision 1.9  2003/06/13 02:38:43  foxr  #    Revision 1.9  2003/06/13 02:38:43  foxr
 #    Add logging in 'expected format'  #    Add logging in 'expected format'
Line 91 Line 125
 #    complete coding to support deferred transactions.  #    complete coding to support deferred transactions.
 #  #
 #  #
   use strict;
 use lib "/home/httpd/lib/perl/";  use lib "/home/httpd/lib/perl/";
 use lib "/home/foxr/newloncapa/types";  use lib "/home/foxr/newloncapa/types";
 use Event qw(:DEFAULT );  use Event qw(:DEFAULT );
Line 115  use LONCAPA::HashIterator; Line 149  use LONCAPA::HashIterator;
 #  #
 #   Disable all signals we might receive from outside for now.  #   Disable all signals we might receive from outside for now.
 #  #
 #$SIG{QUIT}  = IGNORE;  
 #$SIG{HUP}   = IGNORE;  
 #$SIG{USR1}  = IGNORE;  
 #$SIG{INT}   = IGNORE;  
 #$SIG{CHLD}  = IGNORE;  
 #$SIG{__DIE__}  = IGNORE;  
   
   
 # Read the httpd configuration file to get perl variables  # Read the httpd configuration file to get perl variables
Line 146  my $IdleTimeout= 3600;  # Wait an hour b Line 174  my $IdleTimeout= 3600;  # Wait an hour b
 #  The variables below are only used by the child processes.  #  The variables below are only used by the child processes.
 #  #
 my $RemoteHost; # Name of host child is talking to.  my $RemoteHost; # Name of host child is talking to.
 my $UnixSocketDir= "/home/httpd/sockets";   my $UnixSocketDir= $perlvar{'lonSockDir'};
 my $IdleConnections = Stack->new(); # Set of idle connections  my $IdleConnections = Stack->new(); # Set of idle connections
 my %ActiveConnections; # Connections to the remote lond.  my %ActiveConnections; # Connections to the remote lond.
 my %ActiveTransactions; # LondTransactions in flight.  my %ActiveTransactions; # LondTransactions in flight.
Line 255  sub GetPeername { Line 283  sub GetPeername {
     my $peerip;      my $peerip;
     if($AdrFamily == AF_INET) {      if($AdrFamily == AF_INET) {
  ($peerport, $peerip) = sockaddr_in($peer);   ($peerport, $peerip) = sockaddr_in($peer);
  my $peername    = gethostbyaddr($iaddr, $AdrFamily);   my $peername    = gethostbyaddr($peerip, $AdrFamily);
  return $peername;   return $peername;
     } elsif ($AdrFamily == AF_UNIX) {      } elsif ($AdrFamily == AF_UNIX) {
  my $peerfile;   my $peerfile;
Line 276  sub Debug { Line 304  sub Debug {
     my $level   = shift;      my $level   = shift;
     my $message = shift;      my $message = shift;
     if ($level <= $DebugLevel) {      if ($level <= $DebugLevel) {
  Log("INFO", "-Debug- $message host = $RemotHost");   Log("INFO", "-Debug- $message host = $RemoteHost");
     }      }
 }  }
   
Line 316  sub ShowStatus { Line 344  sub ShowStatus {
 sub SocketTimeout {  sub SocketTimeout {
     my $Socket = shift;      my $Socket = shift;
           
     KillSocket($Socket);      KillSocket($Socket); # A transaction timeout also counts as
                                   # a connection failure:
       $ConnectionRetriesLeft--;
 }  }
   
 =pod  =pod
Line 330  Invoked  each timer tick. Line 360  Invoked  each timer tick.
   
 sub Tick {  sub Tick {
     my $client;      my $client;
     ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);      if($ConnectionRetriesLeft > 0) {
    ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount
      ." Retries remaining: ".$ConnectionRetriesLeft);
       } else {
    ShowStatus(GetServerHost()." >> DEAD <<");
       }
     # Is it time to prune connection count:      # Is it time to prune connection count:
   
   
Line 339  sub Tick { Line 373  sub Tick {
        ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do?         ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do?
  $IdleSeconds++;   $IdleSeconds++;
  if($IdleSeconds > $IdleTimeout) { # Prune a connection...   if($IdleSeconds > $IdleTimeout) { # Prune a connection...
     $Socket = $IdleConnections->pop();      my $Socket = $IdleConnections->pop();
     KillSocket($Socket);      KillSocket($Socket);
  }   }
     } else {      } else {
Line 348  sub Tick { Line 382  sub Tick {
     #      #
     #  For each inflight transaction, tick down its timeout counter.      #  For each inflight transaction, tick down its timeout counter.
     #      #
     foreach $item (keys %ActiveTransactions) {      foreach my $item (keys %ActiveTransactions) {
  my $Socket = $ActiveTransactions{$item}->getServer();   my $Socket = $ActiveTransactions{$item}->getServer();
  $Socket->Tick();   $Socket->Tick();
     }      }
Line 362  sub Tick { Line 396  sub Tick {
     my $Connections = ($Requests <= $MaxConnectionCount) ?      my $Connections = ($Requests <= $MaxConnectionCount) ?
  $Requests : $MaxConnectionCount;   $Requests : $MaxConnectionCount;
     Debug(1,"Work but no connections, start ".$Connections." of them");      Debug(1,"Work but no connections, start ".$Connections." of them");
     for ($i =0; $i < $Connections; $i++) {      my $successCount = 0;
  MakeLondConnection();      for (my $i =0; $i < $Connections; $i++) {
    $successCount += MakeLondConnection();
       }
       if($successCount == 0) { # All connections failed:
    Debug(1,"Work in queue failed to make any connectiouns\n");
    EmptyQueue(); # Fail pending transactions with con_lost.
     }      }
  } else {   } else {
       ShowStatus(GetServerHost()." >>> DEAD!!! <<<");
     Debug(1,"Work in queue, but gave up on connections..flushing\n");      Debug(1,"Work in queue, but gave up on connections..flushing\n");
     EmptyQueue(); # Connections can't be established.      EmptyQueue(); # Connections can't be established.
  }   }
Line 417  sub ServerToIdle { Line 457  sub ServerToIdle {
   
     #  If there's work to do, start the transaction:      #  If there's work to do, start the transaction:
   
     $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction      my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction
     unless($reqdata eq undef)  {      unless($reqdata eq undef)  {
  Debug(9, "Queue gave request data: ".$reqdata->getRequest());   Debug(9, "Queue gave request data: ".$reqdata->getRequest());
  &StartRequest($Socket,  $reqdata);   &StartRequest($Socket,  $reqdata);
Line 551  sub CompleteTransaction { Line 591  sub CompleteTransaction {
  StartClientReply($Transaction, $data);   StartClientReply($Transaction, $data);
     } else { # Delete deferred transaction file.      } else { # Delete deferred transaction file.
  Log("SUCCESS", "A delayed transaction was completed");   Log("SUCCESS", "A delayed transaction was completed");
  LogPerm("S:$Client:".$Transaction->getRequest());   LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest());
  unlink $Transaction->getFile();   unlink $Transaction->getFile();
     }      }
 }  }
Line 619  sub FailTransaction { Line 659  sub FailTransaction {
  Debug(1," Replying con_lost to ".$transaction->getRequest());   Debug(1," Replying con_lost to ".$transaction->getRequest());
  StartClientReply($transaction, "con_lost\n");   StartClientReply($transaction, "con_lost\n");
     }      }
       if($ConnectionRetriesLeft <= 0) {
    Log("CRITICAL", "Host marked dead: ".GetServerHost());
       }
   
 }  }
   
Line 630  sub FailTransaction { Line 673  sub FailTransaction {
   
 =cut  =cut
 sub EmptyQueue {  sub EmptyQueue {
       $ConnectionRetriesLeft--; # Counts as connection failure too.
     while($WorkQueue->Count()) {      while($WorkQueue->Count()) {
  my $request = $WorkQueue->dequeue(); # This is a transaction   my $request = $WorkQueue->dequeue(); # This is a transaction
  FailTransaction($request);   FailTransaction($request);
Line 644  Close all connections open on lond prior Line 688  Close all connections open on lond prior
   
 =cut  =cut
 sub CloseAllLondConnections {  sub CloseAllLondConnections {
     foreach $Socket (keys %ActiveConnections) {      foreach my $Socket (keys %ActiveConnections) {
  KillSocket($Socket);   KillSocket($Socket);
     }      }
 }  }
Line 696  sub KillSocket { Line 740  sub KillSocket {
     #  work queue, the work all gets failed with con_lost.      #  work queue, the work all gets failed with con_lost.
     #      #
     if($ConnectionCount == 0) {      if($ConnectionCount == 0) {
  EmptyQueue;   EmptyQueue();
     }      }
 }  }
   
Line 765  sub LondReadable { Line 809  sub LondReadable {
     my $Socket     = $Watcher->data;      my $Socket     = $Watcher->data;
     my $client     = undef;      my $client     = undef;
   
     &Debug(6,"LondReadable called state = ".$State);      &Debug(6,"LondReadable called state = ".$Socket->GetState());
   
   
     my $State = $Socket->GetState(); # All action depends on the state.      my $State = $Socket->GetState(); # All action depends on the state.
Line 786  sub LondReadable { Line 830  sub LondReadable {
  }   }
  $Watcher->cancel();   $Watcher->cancel();
  KillSocket($Socket);   KillSocket($Socket);
    $ConnectionRetriesLeft--;       # Counts as connection failure
  return;   return;
     }      }
     SocketDump(6,$Socket);      SocketDump(6,$Socket);
Line 819  sub LondReadable { Line 864  sub LondReadable {
     } elsif ($State eq "Idle") {      } elsif ($State eq "Idle") {
  # If necessary, complete a transaction and then go into the   # If necessary, complete a transaction and then go into the
  # idle queue.   # idle queue.
   #  Note that a transition to idle indicates a live lond
    # on the other end so reset the connection retries.
    #
    $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
  $Watcher->cancel();   $Watcher->cancel();
  if(exists($ActiveTransactions{$Socket})) {   if(exists($ActiveTransactions{$Socket})) {
     Debug(8,"Completing transaction!!");      Debug(8,"Completing transaction!!");
Line 1031  sub QueueDelayed { Line 1080  sub QueueDelayed {
     Debug(4, "Delayed path: ".$path);      Debug(4, "Delayed path: ".$path);
     opendir(DIRHANDLE, $path);      opendir(DIRHANDLE, $path);
           
     @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;      my @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;
     Debug(4, "Got ".$alldelayed." delayed files");  
     closedir(DIRHANDLE);      closedir(DIRHANDLE);
     my $dfname;      my $dfname;
     my $reqfile;      my $reqfile;
Line 1074  sub MakeLondConnection { Line 1122  sub MakeLondConnection {
  $ConnectionRetriesLeft--;   $ConnectionRetriesLeft--;
  return 0; # Failure.   return 0; # Failure.
     }  else {      }  else {
  $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count  
  # The connection needs to have writability    # The connection needs to have writability 
  # monitored in order to send the init sequence   # monitored in order to send the init sequence
  # that starts the whole authentication/key   # that starts the whole authentication/key
Line 1087  sub MakeLondConnection { Line 1135  sub MakeLondConnection {
     &Debug(9,"MakeLondConnection got socket: ".$Socket);      &Debug(9,"MakeLondConnection got socket: ".$Socket);
  }   }
   
    $Connection->SetTimeoutCallback(\&SocketTimeout);
  $event = Event->io(fd       => $Socket,  
    my $event = Event->io(fd       => $Socket,
    poll     => 'w',     poll     => 'w',
    cb       => \&LondWritable,     cb       => \&LondWritable,
    data     => $Connection,     data     => $Connection,
Line 1144  sub StartRequest { Line 1193  sub StartRequest {
     $ActiveTransactions{$Lond} = $Request;      $ActiveTransactions{$Lond} = $Request;
   
     $Lond->InitiateTransaction($Request->getRequest());      $Lond->InitiateTransaction($Request->getRequest());
     $event = Event->io(fd      => $Socket,      my $event = Event->io(fd      => $Socket,
        poll    => "w",         poll    => "w",
        cb      => \&LondWritable,         cb      => \&LondWritable,
        data    => $Lond,         data    => $Lond,
Line 1186  sub QueueTransaction { Line 1235  sub QueueTransaction {
  Debug(8,"Must queue...");   Debug(8,"Must queue...");
  $WorkQueue->enqueue($requestData);   $WorkQueue->enqueue($requestData);
  if($ConnectionCount < $MaxConnectionCount) {   if($ConnectionCount < $MaxConnectionCount) {
     Debug(4,"Starting additional lond connection");      if($ConnectionRetriesLeft > 0) {
     if(MakeLondConnection() == 0) {   Debug(4,"Starting additional lond connection");
  EmptyQueue(); # Fail transactions, can't make connection.   if(MakeLondConnection() == 0) {
       EmptyQueue(); # Fail transactions, can't make connection.
    }
       } else {
    ShowStatus(GetServerHost()." >>> DEAD !!!! <<<");
    EmptyQueue(); # It's worse than that ... he's dead Jim.
     }      }
  }   }
     } else { # Can start the request:      } else { # Can start the request:
Line 1354  sub SetupLoncListener { Line 1408  sub SetupLoncListener {
 Child USR1 signal handler to report the most recent status  Child USR1 signal handler to report the most recent status
 into the status file.  into the status file.
   
   We also use this to reset the retries count in order to allow the
   client to retry connections with a previously dead server.
 =cut  =cut
 sub ChildStatus {  sub ChildStatus {
     my $event = shift;      my $event = shift;
Line 1364  sub ChildStatus { Line 1420  sub ChildStatus {
     my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");      my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
     print $fh $$."\t".$RemoteHost."\t".$Status."\t".      print $fh $$."\t".$RemoteHost."\t".$Status."\t".
  $RecentLogEntry."\n";   $RecentLogEntry."\n";
       $ConnectionRetriesLeft = $ConnectionRetries;
 }  }
   
 =pod  =pod
Line 1459  sub CreateChild { Line 1516  sub CreateChild {
     my $host = shift;      my $host = shift;
     $RemoteHost = $host;      $RemoteHost = $host;
     Log("CRITICAL", "Forking server for ".$host);      Log("CRITICAL", "Forking server for ".$host);
     $pid          = fork;      my $pid          = fork;
     if($pid) { # Parent      if($pid) { # Parent
  $RemoteHost = "Parent";   $RemoteHost = "Parent";
  $ChildHash{$pid} = $RemoteHost;   $ChildHash{$pid} = $RemoteHost;
Line 1467  sub CreateChild { Line 1524  sub CreateChild {
   
     } else { # child.      } else { # child.
  ShowStatus("Connected to ".$RemoteHost);   ShowStatus("Connected to ".$RemoteHost);
  $SIG{INT} = DEFAULT;   $SIG{INT} = 'DEFAULT';
  sigprocmask(SIG_UNBLOCK, $sigset);   sigprocmask(SIG_UNBLOCK, $sigset);
  ChildProcess; # Does not return.   ChildProcess; # Does not return.
     }      }
Line 1504  if ($childpid != 0) { Line 1561  if ($childpid != 0) {
 #  #
   
 ShowStatus("Parent writing pid file:");  ShowStatus("Parent writing pid file:");
 $execdir = $perlvar{'lonDaemons'};  my $execdir = $perlvar{'lonDaemons'};
 open (PIDSAVE, ">$execdir/logs/lonc.pid");  open (PIDSAVE, ">$execdir/logs/lonc.pid");
 print PIDSAVE "$$\n";  print PIDSAVE "$$\n";
 close(PIDSAVE);  close(PIDSAVE);
Line 1523  Log("CRITICAL", "--------------- Startin Line 1580  Log("CRITICAL", "--------------- Startin
 my $HostIterator = LondConnection::GetHostIterator;  my $HostIterator = LondConnection::GetHostIterator;
 while (! $HostIterator->end()) {  while (! $HostIterator->end()) {
   
     $hostentryref = $HostIterator->get();      my $hostentryref = $HostIterator->get();
     CreateChild($hostentryref->[0]);      CreateChild($hostentryref->[0]);
     $HostIterator->next();      $HostIterator->next();
 }  }
Line 1541  $SIG{INT}  = \&Terminate; Line 1598  $SIG{INT}  = \&Terminate;
 $SIG{TERM} = \&Terminate;   $SIG{TERM} = \&Terminate; 
 $SIG{HUP}  = \&Restart;  $SIG{HUP}  = \&Restart;
 $SIG{USR1} = \&CheckKids;   $SIG{USR1} = \&CheckKids; 
   $SIG{USR2} = \&UpdateKids; # LonManage update request.
   
 while(1) {  while(1) {
     $deadchild = wait();      my $deadchild = wait();
     if(exists $ChildHash{$deadchild}) { # need to restart.      if(exists $ChildHash{$deadchild}) { # need to restart.
  $deadhost = $ChildHash{$deadchild};   my $deadhost = $ChildHash{$deadchild};
  delete($ChildHash{$deadchild});   delete($ChildHash{$deadchild});
  Log("WARNING","Lost child pid= ".$deadchild.   Log("WARNING","Lost child pid= ".$deadchild.
       "Connected to host ".$deadhost);        "Connected to host ".$deadhost);
Line 1575  sub CheckKids { Line 1633  sub CheckKids {
     my $now=time;      my $now=time;
     my $local=localtime($now);      my $local=localtime($now);
     print $fh "LONC status $local - parent $$ \n\n";      print $fh "LONC status $local - parent $$ \n\n";
     foreach $pid (keys %ChildHash) {      foreach my $pid (keys %ChildHash) {
  Debug(2, "Sending USR1 -> $pid");   Debug(2, "Sending USR1 -> $pid");
  kill 'USR1' => $pid; # Tell Child to report status.   kill 'USR1' => $pid; # Tell Child to report status.
  sleep 1; # Wait so file doesn't intermix.   sleep 1; # Wait so file doesn't intermix.
Line 1584  sub CheckKids { Line 1642  sub CheckKids {
   
 =pod  =pod
   
   =head1  UpdateKids
   
   parent's SIGUSR2 handler.  This handler:
   
   =item
   
   Rereads the hosts file.
   
   =item
    
   Kills off (via sigint) children for hosts that have disappeared.
   
   =item
   
   HUP's children for hosts that already exist (this just forces a status display
   and resets the connection retry count for that host).
   
   =item
   
   Starts new children for hosts that have been added to the hosts.tab file since
   the start of the master program and maintains them.
   
   =cut
   
   sub UpdateKids {
       Log("INFO", "Updating connections via SIGUSR2");
   }
   
   
   =pod
   
 =head1 Restart  =head1 Restart
   
 Signal handler for HUP... all children are killed and  Signal handler for HUP... all children are killed and
Line 1593  the config file. Line 1682  the config file.
 =cut  =cut
   
 sub Restart {  sub Restart {
     KillThemAll; # First kill all the children.      &KillThemAll; # First kill all the children.
     Log("CRITICAL", "Restarting");      Log("CRITICAL", "Restarting");
     my $execdir = $perlvar{'lonDaemons'};      my $execdir = $perlvar{'lonDaemons'};
     unlink("$execdir/logs/lonc.pid");      unlink("$execdir/logs/lonc.pid");
Line 1612  SIGHUP.  Responds to sigint and sigterm. Line 1701  SIGHUP.  Responds to sigint and sigterm.
 sub KillThemAll {  sub KillThemAll {
     Debug(2, "Kill them all!!");      Debug(2, "Kill them all!!");
     local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.      local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.
     foreach $pid (keys %ChildHash) {      foreach my $pid (keys %ChildHash) {
  my $serving = $ChildHash{$pid};   my $serving = $ChildHash{$pid};
  Debug(2, "Killing lonc for $serving pid = $pid");   Debug(2, "Killing lonc for $serving pid = $pid");
  ShowStatus("Killing lonc for $serving pid = $pid");   ShowStatus("Killing lonc for $serving pid = $pid");

Removed from v.1.18  
changed lines
  Added in v.1.25


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>