File:  [LON-CAPA] / loncom / Attic / lonc
Revision 1.51: download - view: text, annotated - select for diffs
Wed Jul 23 16:52:30 2003 UTC (20 years, 9 months ago) by bowersj2
Branches: MAIN
CVS tags: HEAD
Information from "SessionOne.html" of the Guts manual.

I'm not sure all of this is going to live in lonc; some parts may move
elsewhere (load balancing probably belongs in the login routines), and
it may all be picked up and dropped into lonnet.pm if that makes more
sense. The main point is that it is now in POD format, so any of the
above can be done with copy & paste.

    1: #!/usr/bin/perl
    2: 
    3: # The LearningOnline Network
    4: # lonc - LON TCP-Client Domain-Socket-Server
    5: # provides persistent TCP connections to the other servers in the network
    6: # through multiplexed domain sockets
    7: #
    8: # $Id: lonc,v 1.51 2003/07/23 16:52:30 bowersj2 Exp $
    9: #
   10: # Copyright Michigan State University Board of Trustees
   11: #
   12: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
   13: #
   14: # LON-CAPA is free software; you can redistribute it and/or modify
   15: # it under the terms of the GNU General Public License as published by
   16: # the Free Software Foundation; either version 2 of the License, or
   17: # (at your option) any later version.
   18: #
   19: # LON-CAPA is distributed in the hope that it will be useful,
   20: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   21: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   22: # GNU General Public License for more details.
   23: #
   24: # You should have received a copy of the GNU General Public License
   25: # along with LON-CAPA; if not, write to the Free Software
   26: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   27: #
   28: # /home/httpd/html/adm/gpl.txt
   29: #
   30: # http://www.lon-capa.org/
   31: #
   32: # PID in subdir logs/lonc.pid
   33: # kill kills
   34: # HUP restarts
   35: # USR1 tries to open connections again
   36: 
   37: # 6/4/99,6/5,6/7,6/8,6/9,6/10,6/11,6/12,7/14,7/19,
   38: # 10/8,10/9,10/15,11/18,12/22,
   39: # 2/8,7/25 Gerd Kortemeyer
   40: # 12/05 Gerd Kortemeyer
   41: # YEAR=2001
   42: # 03/14/01,03/15,06/12,11/26,11/27,11/28 Gerd Kortemeyer
   43: # YEAR=2002
   44: # 2/19/02,02/22/02,02/25/02 Gerd Kortemeyer
   45: # 3/07/02 Ron Fox 
   46: # based on nonforker from Perl Cookbook
   47: # - server who multiplexes without forking
   48: 
   49: use lib '/home/httpd/lib/perl/';
   50: use LONCAPA::Configuration;
   51: 
   52: use POSIX;
   53: use IO::Socket;
   54: use IO::Select;
   55: use IO::File;
   56: use Socket;
   57: use Fcntl;
   58: use Tie::RefHash;
   59: use Crypt::IDEA;
   60: #use Net::Ping;
   61: use LWP::UserAgent();
   62: 
   63: $status='';
   64: $lastlog='';
   65: $conserver='SHELL';
   66: $DEBUG = 0;			# Set to 1 for annoyingly complete logs.
   67: $VERSION='$Revison$'; #' stupid emacs
   68: $remoteVERSION;
   69: # -------------------------------- Set signal handlers to record abnormal exits
   70: 
   71: &status("Init exception handlers");
   72: $SIG{QUIT}=\&catchexception;
   73: $SIG{__DIE__}=\&catchexception;
   74: 
   75: # ---------------------------------- Read loncapa_apache.conf and loncapa.conf
   76: &status("Read loncapa.conf and loncapa_apache.conf");
   77: my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
   78: my %perlvar=%{$perlvarref};
   79: undef $perlvarref;
   80: 
   81: # ----------------------------- Make sure this process is running from user=www
   82: &status("Check user ID");
   83: my $wwwid=getpwnam('www');
   84: if ($wwwid!=$<) {
   85:    $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
   86:    $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
   87:    system("echo 'User ID mismatch.  lonc must be run as user www.' |\
   88:  mailto $emailto -s '$subj' > /dev/null");
   89:    exit 1;
   90: }
   91: 
   92: # --------------------------------------------- Check if other instance running
   93: 
   94: my $pidfile="$perlvar{'lonDaemons'}/logs/lonc.pid";
   95: 
   96: if (-e $pidfile) {
   97:    my $lfh=IO::File->new("$pidfile");
   98:    my $pide=<$lfh>;
   99:    chomp($pide);
  100:    if (kill 0 => $pide) { die "already running"; }
  101: }
  102: 
  103: # ------------------------------------------------------------- Read hosts file
  104: 
  105: open (CONFIG,"$perlvar{'lonTabDir'}/hosts.tab") || die "Can't read host file";
  106: 
  107: while ($configline=<CONFIG>) {
  108:     my ($id,$domain,$role,$name,$ip)=split(/:/,$configline);
  109:     chomp($ip);
  110:     if ($ip) {
  111:      $hostip{$id}=$ip;
  112:      $hostname{$id}=$name;
  113:     }
  114: }
  115: 
  116: close(CONFIG);
  117: 
  118: # -------------------------------------------------------- Routines for forking
  119: 
  120: %children               = ();       # keys are current child process IDs,
  121:                                     # values are hosts
  122: %childpid               = ();       # the other way around
  123: 
  124: %childatt               = ();       # number of attempts to start server
  125:                                     # for ID
  126: 
  127: $childmaxattempts=5;
  128: 
  129: # ---------------------------------------------------- Fork once and dissociate
  130: &status("Fork and dissociate");
  131: $fpid=fork;
  132: exit if $fpid;
  133: die "Couldn't fork: $!" unless defined ($fpid);
  134: 
  135: POSIX::setsid() or die "Can't start new session: $!";
  136: 
  137: $conserver='PARENT';
  138: 
  139: # ------------------------------------------------------- Write our PID on disk
  140: &status("Write PID");
  141: $execdir=$perlvar{'lonDaemons'};
  142: open (PIDSAVE,">$execdir/logs/lonc.pid");
  143: print PIDSAVE "$$\n";
  144: close(PIDSAVE);
  145: &logthis("<font color=red>CRITICAL: ---------- Starting ----------</font>");
  146: 
  147: # ----------------------------- Ignore signals generated during initial startup
  148: $SIG{HUP}=$SIG{USR1}='IGNORE';
  149: # ------------------------------------------------------- Now we are on our own
  150:     
  151: # Fork off our children, one for every server
  152: 
  153: &status("Forking ...");
  154: 
  155: foreach $thisserver (keys %hostip) {
  156:     #if (&online($hostname{$thisserver})) {
  157:        make_new_child($thisserver);
  158:     #}
  159: }
  160: 
  161: &logthis("Done starting initial servers");
  162: # ----------------------------------------------------- Install signal handlers
  163: 
  164: 
  165: $SIG{INT}  = $SIG{TERM} = \&HUNTSMAN;
  166: $SIG{HUP}  = \&HUPSMAN;
  167: $SIG{USR1} = \&USRMAN;
  168: 
  169: # And maintain the population.
  170: while (1) {
  171:     my $deadpid = wait;		# Wait for the next child to die.
  172:                                 # See who died and start new one
  173:                                 # or a signal (e.g. USR1 for restart).
  174:                                 # if a signal, the wait will fail
  175:                                 # This is ordinarily detected by
  176:                                 # checking for the existence of the
  177:                                 # pid index inthe children hash since
  178:                                 # the return value from a failed wait is -1
  179:                                 # which is an impossible PID.
  180:     &status("Woke up");
  181:     my $skipping='';
  182: 
  183:     if(exists($children{$deadpid})) {
  184: 
  185: 	$thisserver = $children{$deadpid}; # Look name of dead guy's peer.
  186: 
  187: 	delete($children{$deadpid}); # Get rid of dead hash entry.
  188: 
  189: 	if($childatt{$thisserver} < $childmaxattempts) {
  190: 	    $childatt{$thisserver}++;
  191: 	    &logthis(
  192: 	       "<font color=yellow>INFO: Trying to reconnect for $thisserver "
  193:             ."($childatt{$thisserver} of $childmaxattempts attempts)</font>"); 
  194: 	    make_new_child($thisserver);
  195: 	
  196: 	}
  197: 	else {
  198: 	    $skipping .= $thisserver.' ';
  199: 	}
  200: 	if($skipping) {
  201: 	    &logthis("<font color=blue>WARNING: Skipped $skipping</font>");
  202:   
  203: 	}
  204:     }
  205: 
  206: }
  207: 
  208: 
  209: 
  210: sub make_new_child {
  211:    
  212:     $newserver=shift;
  213:     my $pid;
  214:     my $sigset;
  215:     &logthis("Attempting to start child for server $newserver");
  216:     # block signal for fork
  217:     $sigset = POSIX::SigSet->new(SIGINT);
  218:     sigprocmask(SIG_BLOCK, $sigset)
  219:         or die "Can't block SIGINT for fork: $!\n";
  220:     
  221:     die "fork: $!" unless defined ($pid = fork);
  222:     
  223:     if ($pid) {
  224:         # Parent records the child's birth and returns.
  225:         sigprocmask(SIG_UNBLOCK, $sigset)
  226:             or die "Can't unblock SIGINT for fork: $!\n";
  227:         $children{$pid} = $newserver;
  228:         $childpid{$newserver} = $pid;
  229:         return;
  230:     } else {
  231:         $conserver=$newserver;
  232:         # Child can *not* return from this subroutine.
  233:         $SIG{INT} = 'DEFAULT';      # make SIGINT kill us as it did before
  234:         $SIG{USR1}= \&logstatus;
  235:    
  236:         # unblock signals
  237:         sigprocmask(SIG_UNBLOCK, $sigset)
  238:             or die "Can't unblock SIGINT for fork: $!\n";
  239: 
  240: # ----------------------------- This is the modified main program of non-forker
  241: 
  242: $port = "$perlvar{'lonSockDir'}/$conserver";
  243: 
  244: unlink($port);
  245: 
  246: # -------------------------------------------------------------- Open other end
  247: 
  248: &openremote($conserver);
  249: 	&logthis("<font color=green> Connection to $conserver open </font>");
  250: # ----------------------------------------- We're online, send delayed messages
  251:     &status("Checking for delayed messages");
  252: 
  253:     my @allbuffered;
  254:     my $path="$perlvar{'lonSockDir'}/delayed";
  255:     opendir(DIRHANDLE,$path);
  256:     @allbuffered=grep /\.$conserver$/, readdir DIRHANDLE;
  257:     closedir(DIRHANDLE);
  258:     my $dfname;
  259:     foreach (sort @allbuffered) {
  260:         &status("Sending delayed: $_");
  261:         $dfname="$path/$_";
  262:         if($DEBUG) { &logthis('Sending '.$dfname); }
  263:         my $wcmd;
  264:         {
  265:          my $dfh=IO::File->new($dfname);
  266:          $cmd=<$dfh>;
  267:         }
  268:         chomp($cmd);
  269:         my $bcmd=$cmd;
  270:         if ($cmd =~ /^encrypt\:/) {
  271: 	    my $rcmd=$cmd;
  272:             $rcmd =~ s/^encrypt\://;
  273:             chomp($rcmd);
  274:             my $cmdlength=length($rcmd);
  275:             $rcmd.="         ";
  276:             my $encrequest='';
  277:             for (my $encidx=0;$encidx<=$cmdlength;$encidx+=8) {
  278:                 $encrequest.=
  279:                     unpack("H16",$cipher->encrypt(substr($rcmd,$encidx,8)));
  280:             }
  281:             $cmd="enc:$cmdlength:$encrequest\n";
  282:         }
  283: 	$answer = londtransaction($remotesock, $cmd, 60);
  284: 	chomp($answer);
  285: 
  286:         if (($answer ne '') && ($@!~/timeout/)) {
  287: 	    unlink("$dfname");
  288:             &logthis("Delayed $cmd: >$answer<");
  289:             &logperm("S:$conserver:$bcmd");
  290:         }        
  291:     }
  292: 	if($DEBUG) { &logthis("<font color=green> Delayed transactions sent"); }
  293: 
  294: # ------------------------------------------------------- Listen to UNIX socket
  295: &status("Opening socket");
  296: unless (
  297:   $server = IO::Socket::UNIX->new(Local  => $port,
  298:                                   Type   => SOCK_STREAM,
  299:                                   Listen => 10 )
  300:    ) { 
  301:        my $st=120+int(rand(240));
  302:        &logthis(
  303:          "<font color=blue>WARNING: ".
  304:          "Can't make server socket ($st secs):  .. exiting</font>");
  305:        sleep($st);
  306:        exit; 
  307:      };
  308:    
  309: # -----------------------------------------------------------------------------
  310: 
  311: &logthis("<font color=green>$conserver online</font>");
  312: 
  313: # -----------------------------------------------------------------------------
  314: # begin with empty buffers
  315: %inbuffer  = ();
  316: %outbuffer = ();
  317: %ready     = ();
  318: %servers   = ();	# To be compatible with make filevector.  indexed by
  319: 			# File ids, values are sockets.
  320: 			# note that the accept socket is omitted.
  321: 
  322: tie %ready, 'Tie::RefHash';
  323: 
  324: # nonblock($server);
  325: # $select = IO::Select->new($server);
  326: 
  327: # Main loop: check reads/accepts, check writes, check ready to process
  328: 
  329: status("Main loop $conserver");
  330: while (1) {
  331:     my $client;
  332:     my $rv;
  333:     my $data;
  334: 
  335:     my $infdset;		# bit vec of fd's to select on input.
  336: 
  337:     my $outfdset;		# Bit vec of fd's to select on output.
  338: 
  339: 
  340:     $infdset = MakeFileVector(\%servers);
  341:     $outfdset= MakeFileVector(\%outbuffer);
  342:     vec($infdset, $server->fileno, 1) = 1;
  343:     if($DEBUG) {
  344: 	&logthis("Adding ".$server->fileno.
  345: 		 " to input select vector (listner)".
  346: 		 unpack("b*",$infdset)."\n");
  347:     }
  348:     DoSelect(\$infdset, \$outfdset); # Wait for input.
  349:     if($DEBUG) {
  350: 	&logthis("Doselect completed!");
  351: 	&logthis("ins = ".unpack("b*",$infdset)."\n");
  352: 	&logthis("outs= ".unpack("b*",$outfdset)."\n");
  353: 		 
  354:     }
  355: 
  356:     # Checkfor new connections:
  357:     if (vec($infdset, $server->fileno, 1)) {
  358: 	if($DEBUG) {
  359: 	    &logthis("New connection established");
  360: 	}
  361: 	# accept a new connection
  362: 	&status("Accept new connection: $conserver");
  363: 	$client = $server->accept();
  364: 	if($DEBUG) {
  365: 	    &logthis("New client fd = ".$client->fileno."\n");
  366: 	}
  367: 	$servers{$client->fileno} = $client;
  368: 	nonblock($client);
  369: 	$client->sockopt(SO_KEEPALIVE, 1);# Enable monitoring of
  370: 	                                  # connection liveness.
  371:     }
  372:     HandleInput($infdset, \%servers, \%inbuffer, \%outbuffer, \%ready);
  373:     HandleOutput($outfdset, \%servers, \%outbuffer, \%inbuffer,
  374: 		 \%ready);
  375: # -------------------------------------------------------- Wow, connection lost
  376: 
  377: }
  378:    
  379:     }
  380: }
  381: 
  382: # ------------------------------------------------------- End of make_new_child
  383: 
  384: 
  385: #
  386: #  Make a vector of file descriptors to wait for in a select.
  387: #  parameters:
  388: #     \%fdhash  -reference to a hash which has IO::Socket's as indices.  
  389: #                We only care about the indices, not the values.
  390: #  A select vector is created from all indices of the hash.
  391: 
  392: sub MakeFileVector
  393: {
  394:     my $fdhash = shift;
  395:     my $selvar = "";
  396: 
  397:     foreach $socket (keys %$fdhash) {
  398: 	if($DEBUG) {
  399: 	    &logthis("Adding  ".$socket.
  400: 		     "to select vector. (client)\n");
  401: 	}
  402: 	vec($selvar, $socket, 1) = 1;
  403:     }
  404:     return $selvar;
  405: }
  406: 
  407: 
  408: #
  409: #  HandleOutput:
  410: #    Processes output on a buffered set of file descriptors which are
  411: #    ready to be read.
  412: #  Parameters:
  413: #    $selvector - Vector of file descriptors which are writable.
  414: #    \%sockets  - Vector of socket references indexed by socket.
  415: #    \%buffers  - Reference to a hash containing output buffers.
  416: #                 Hashes are indexed by sockets.  The file descriptors of some
  417: #                 of those sockets will be present in $selvector.
  418: #                 For each one of those, we will attempt to write the output
  419: #                 buffer to the socket.  Note that we will assume that
  420: #                 the sockets are being run in non blocking mode.
  421: #   \%inbufs    - Reference to hash containing input buffers.
  422: #   \%readys    - Reference to hash containing flags for items with complete
  423: #                 requests.
  424: #
  425: sub HandleOutput
  426: {
  427:     my $selvector = shift;
  428:     my $sockets   = shift;
  429:     my $buffers   = shift;
  430:     my $inbufs    = shift;
  431:     my $readys    = shift;
  432:     my $sock;
  433: 
  434:     if($DEBUG) {
  435: 	&logthis("HandleOutput entered\n");
  436:     }
  437: 
  438:     foreach $sock (keys %$sockets) {
  439: 	my $socket = $sockets->{$sock};
  440: 	if(vec($selvector, $sock, 1)) { # $socket is writable.
  441: 	    if($DEBUG) {
  442: 		&logthis("Sending $buffers->{$sock} \n");
  443: 	    }
  444: 	    my $rv = $socket->send($buffers->{$sock}, 0);
  445: 	    $errno = $!;
  446: 	    unless ($buffers->{$sock} eq "con_lost\n") {
  447: 		unless (defined $rv) { # Write failed... could be EINTR
  448: 		    unless ($errno == POSIX::EINTR) {
  449: 			&logthis("Write failed on writable socket");
  450: 		    }		# EINTR is not an error .. just retry.
  451: 		    next;
  452: 		}
  453: 		if( ($rv == length $buffers->{$sock})    ||
  454: 		    ($errno == POSIX::EWOULDBLOCK)       ||
  455: 		    ($errno == POSIX::EAGAIN)            || # same as above.
  456: 		    ($errno == POSIX::EINTR)             || # signal during IO
  457: 		    ($errno == 0)) {
  458: 		    substr($buffers->{$sock}, 0, $rv)=""; # delete written part
  459: 		    delete $buffers->{$sock} unless length $buffers->{$sock};
  460: 		} else {
  461: 		    # For some reason the write failed with an error code
  462: 		    # we didn't look for.  Shutdown the socket.
  463: 		    &logthis("Unable to write data with ".$errno.": ".
  464: 			     "Dropping data: ".length($buffers->{$sock}).
  465: 			     ", $rv");
  466: 		    #
  467: 		    # kill off the buffers in the hash:
  468: 
  469: 		    delete $buffers->{$sock};
  470: 		    delete $inbufs->{$sock};
  471: 		    delete $readys->{$sock};
  472: 
  473: 		    close($socket); # Close the client socket.
  474: 		    next;
  475: 		}
  476: 	    } else {		# Kludgy way to mark lond connection lost.
  477: 		&logthis(
  478: 		 "<font color=red>CRITICAL lond connection lost</font>");
  479: 		status("Connection lost");
  480: 		$remotesock->shutdown(2);
  481: 		&logthis("Attempting to open a new connection");
  482: 		&openremote($conserver);
  483: 	    }
  484: 		   
  485: 	}
  486:     }
  487: 
  488: }
  489: #
  490: #   HandleInput - Deals with input on client sockets.
  491: #                 Each socket has an associated input buffer.
  492: #                 For each readable socket, the currently available
  493: #                 data is appended to this buffer.
  494: #                 If necessary, the buffer is created.
  495: #                 On various failures, we may shutdown the client.
  496: #  Parameters:
  497: #     $selvec   - Vector of readable sockets.
  498: #     \%sockets - Refers to the  Hash of sockets indexed by sockets.  
  499: #                 Each of these may or may not have it's fd bit set 
  500: #                 in the $selvec.
  501: #     \%ibufs   - Refers to the hash of input buffers indexed by socket.
  502: #     \%obufs   - Hash of output buffers indexed by socket. 
  503: #     \%ready   - Hash of ready flags indicating the existence of a completed
  504: #                 Request.
  505: sub HandleInput 
  506: {
  507: 
  508:     # Marshall the parameters.   Note that the hashes are actually
  509:     # references not values.
  510: 
  511:     my $selvec  = shift;
  512:     my $sockets = shift;
  513:     my $ibufs   = shift;
  514:     my $obufs   = shift;
  515:     my $ready   = shift;
  516:     my $sock;
  517: 
  518:     if($DEBUG) {
  519: 	&logthis("Entered HandleInput\n");
  520:     }
  521:     foreach $sock (keys %$sockets) {
  522: 	my $socket = $sockets->{$sock};
  523: 	if(vec($selvec, $sock, 1)) { # Socket which is readable.
  524: 
  525: 	    #  Attempt to read the data and do error management.
  526: 	    my $data = '';
  527: 	    my $rv = $socket->recv($data, POSIX::BUFSIZ, 0);
  528: 	    if($DEBUG) {
  529: 		&logthis("Received $data from socket");
  530: 	    }
  531: 	    unless (defined($rv) && length $data) {
  532: 
  533: 		# Read an end of file.. this is a disconnect from the peer.
  534: 
  535: 		delete $sockets->{$sock};
  536: 		delete $ibufs->{$sock};
  537: 		delete $obufs->{$sock};
  538: 		delete $ready->{$sock};
  539: 
  540: 		status("Idle");
  541: 		close $socket;
  542: 		next;
  543: 	    }
  544: 	    #  Append the read data to the input buffer. If the buffer
  545: 	    # now contains a \n the request is complete and we can 
  546: 	    # mark this in the $ready hash (one request for each \n.)
  547: 
  548: 	    $ibufs->{$sock} .= $data;
  549: 	    while($ibufs->{$sock} =~ s/(.*\n)//) {
  550: 		push(@{$ready->{$sock}}, $1);
  551: 	    }
  552: 	    
  553: 	}
  554:     }
  555:     #  Now handle any requests which are ready:
  556: 
  557:     foreach $client (keys %ready) {
  558: 	handle($client);
  559:     }
  560: }
  561: 
  562: # DoSelect:  does a select with no timeout.  On signal (errno == EINTR), 
  563: #            the select is retried until there are items in the returned
  564: #            vectors.  
  565: #
  566: # Parameters:
  567: #   \$readvec   - Reference to a vector of file descriptors to 
  568: #                 check for readability.
  569: #   \$writevec  - Reference to a vector of file descriptors to check for
  570: #                 writability.
  571: #  On exit, the referents are modified with vectors indicating which 
  572: #  file handles are readable/writable.
  573: #
  574: sub DoSelect {
  575:     my $readvec = shift;
  576:     my $writevec= shift;
  577:     my $outs;
  578:     my $ins;
  579: 
  580:     while (1) {
  581: 	my $nfds = select( $ins = $$readvec, $outs = $$writevec, undef, undef);
  582: 	if($nfds) {
  583: 	    if($DEBUG) {
  584: 		&logthis("select exited with ".$nfds." fds\n");
  585: 		&logthis("ins = ".unpack("b*",$ins).
  586: 			 " readvec = ".unpack("b*",$$readvec)."\n");
  587: 		&logthis("outs = ".unpack("b*",$outs).
  588: 			 " writevec = ".unpack("b*",$$writevec)."\n");
  589: 	    }
  590: 	    $$readvec  = $ins;
  591: 	    $$writevec = $outs;
  592: 	    return;
  593: 	} else {
  594: 	    if($DEBUG) {
  595: 		&logthis("Select exited with no bits set in mask\n");
  596: 	    }
  597: 	    die "Select failed" unless $! == EINTR;
  598: 	}
  599:     }
  600: }
  601: 
  602: # handle($socket) deals with all pending requests for $client
  603: #
  604: sub handle {
  605:     # requests are in $ready{$client}
  606:     # send output to $outbuffer{$client}
  607:     my $client = shift;
  608:     my $request;
  609:     foreach $request (@{$ready{$client}}) {
  610: # ============================================================= Process request
  611:         # $request is the text of the request
  612:         # put text of reply into $outbuffer{$client}
  613: # ------------------------------------------------------------ Is this the end?
  614: 	chomp($request);
  615: 	if($DEBUG) {
  616:      &logthis("<font color=green> Request $request processing starts</font>");
  617:         }
  618:         if ($request eq "close_connection_exit\n") {
  619: 	    &status("Request close connection");
  620:            &logthis(
  621:      "<font color=red>CRITICAL: Request Close Connection ... exiting</font>");
  622:            $remotesock->shutdown(2);
  623:            $server->close();
  624:            exit;
  625:         }
  626: # -----------------------------------------------------------------------------
  627:         if ($request =~ /^encrypt\:/) {
  628: 	    my $cmd=$request;
  629:             $cmd =~ s/^encrypt\://;
  630:             chomp($cmd);
  631:             my $cmdlength=length($cmd);
  632:             $cmd.="         ";
  633:             my $encrequest='';
  634:             for (my $encidx=0;$encidx<=$cmdlength;$encidx+=8) {
  635:                 $encrequest.=
  636:                     unpack("H16",$cipher->encrypt(substr($cmd,$encidx,8)));
  637:             }
  638:             $request="enc:$cmdlength:$encrequest";
  639:         }
  640: # --------------------------------------------------------------- Main exchange
  641: 	$answer = londtransaction($remotesock, $request, 300);
  642: 
  643: 	if($DEBUG) { 
  644: 	    &logthis("<font color=green> Request data exchange complete");
  645: 	}
  646: 	if ($@=~/timeout/) { 
  647: 	    $answer='';
  648: 	    &logthis(
  649: 		     "<font color=red>CRITICAL: Timeout: $request</font>");
  650: 	}  
  651: 
  652: 
  653:         if ($answer) {
  654: 	   if ($answer =~ /^enc/) {
  655:                my ($cmd,$cmdlength,$encinput)=split(/:/,$answer);
  656:                chomp($encinput);
  657: 	       $answer='';
  658:                for (my $encidx=0;$encidx<length($encinput);$encidx+=16) {
  659:                   $answer.=$cipher->decrypt(
  660:                    pack("H16",substr($encinput,$encidx,16))
  661:                   );
  662: 	       }
  663: 	      $answer=substr($answer,0,$cmdlength);
  664: 	      $answer.="\n";
  665: 	   }
  666: 	   if($DEBUG) {
  667: 	       &logthis("sending $answer to client\n");
  668: 	   }
  669:            $outbuffer{$client} .= $answer;
  670:         } else {
  671:            $outbuffer{$client} .= "con_lost\n";
  672:         }
  673: 
  674:      &status("Completed: $request");
  675: 	if($DEBUG) {
  676: 	    &logthis("<font color=green> Request processing complete</font>");
  677: 	}
  678: # ===================================================== Done processing request
  679:     }
  680:     delete $ready{$client};
  681: # -------------------------------------------------------------- End non-forker
  682:     if($DEBUG) {
  683: 	&logthis("<font color=green> requests for child handled</font>");
  684:     }
  685: }
  686: # ---------------------------------------------------------- End make_new_child
  687: 
  688: # nonblock($socket) puts socket into nonblocking mode
  689: sub nonblock {
  690:     my $socket = shift;
  691:     my $flags;
  692: 
  693:     
  694:     $flags = fcntl($socket, F_GETFL, 0)
  695:             or die "Can't get flags for socket: $!\n";
  696:     fcntl($socket, F_SETFL, $flags | O_NONBLOCK)
  697:             or die "Can't make socket nonblocking: $!\n";
  698: }
  699: 
  700: 
  701: sub openremote {
  702: # ---------------------------------------------------- Client to network server
  703: 
  704:     my $conserver=shift;
  705: 
  706:     &status("Opening TCP $conserver");
  707:     my $st=120+int(rand(240)); # Sleep before opening:
  708: 
  709:     unless (
  710: 	    $remotesock = IO::Socket::INET->new(PeerAddr => $hostip{$conserver},
  711: 						PeerPort => $perlvar{'londPort'},
  712: 						Proto    => "tcp",
  713: 						Type     => SOCK_STREAM)
  714: 	   ) {
  715: 
  716: 	&logthis(
  717: 		 "<font color=blue>WARNING: Couldn't connect to $conserver ($st secs): </font>");
  718: 	sleep($st);
  719: 	exit;
  720:     };
  721: # ----------------------------------------------------------------- Init dialog
  722: 
  723:     &logthis("<font color=green>INFO Connected to $conserver, initing</font>");
  724:     &status("Init dialogue: $conserver");
  725: 
  726:     $answer = londtransaction($remotesock, "init", 60);
  727:     chomp($answer);
  728:     $answer = londtransaction($remotesock, $answer, 60);
  729:     chomp($answer);
  730: 
  731:     if ($@=~/timeout/) {
  732: 	&logthis("Timed out during init.. exiting");
  733: 	exit;
  734:     }
  735: 
  736:     if ($answer ne 'ok') {
  737: 	&logthis("Init reply: >$answer<");
  738: 	my $st=120+int(rand(240));
  739: 	&logthis("<font color=blue>WARNING: Init failed ($st secs)</font>");
  740: 	sleep($st);
  741: 	exit;
  742:     }
  743: 
  744:     $answer = londtransaction($remotesock,"sethost:$conserver",60);
  745:     chomp($answer);
  746:     if ( $answer ne 'ok') {
  747: 	&logthis('<font color="blue">WARNING: unable to specify remote host'.
  748: 		 $answer.'</font>');
  749:     }
  750: 
  751:     $answer = londtransaction($remotesock,"version:$VERSION",60);
  752:     chomp($answer);
  753:     if ($answer =~ /^version:/) {
  754: 	$remoteVERSION=(split(/:/,$answer))[1];
  755:     } else {
  756: 	&logthis('<font color="blue">WARNING: request remote version failed :'.
  757: 		 $answer.': my version is :'.$VERSION.':</font>');
  758:     }
  759: 
  760:     sleep 5;
  761:     &status("Ponging $conserver");
  762:     print $remotesock "pong\n";
  763:     $answer=<$remotesock>;
  764:     chomp($answer);
  765:     if ($answer!~/^$conserver/) {
  766: 	&logthis("Pong reply: >$answer<");
  767:     }
  768: # ----------------------------------------------------------- Initialize cipher
  769: 
  770:     &status("Initialize cipher");
  771:     print $remotesock "ekey\n";
  772:     my $buildkey=<$remotesock>;
  773:     my $key=$conserver.$perlvar{'lonHostID'};
  774:     $key=~tr/a-z/A-Z/;
  775:     $key=~tr/G-P/0-9/;
  776:     $key=~tr/Q-Z/0-9/;
  777:     $key=$key.$buildkey.$key.$buildkey.$key.$buildkey;
  778:     $key=substr($key,0,32);
  779:     my $cipherkey=pack("H32",$key);
  780:     if ($cipher=new IDEA $cipherkey) {
  781: 	&logthis("Secure connection initialized");
  782:     } else {
  783: 	my $st=120+int(rand(240));
  784: 	&logthis("<font color=blue>WARNING: ".
  785: 		 "Could not establish secure connection ($st secs)!</font>");
  786: 	sleep($st);
  787: 	exit;
  788:     }
  789:     &logthis("<font color=green> Remote open success </font>");
  790: }
  791: 
  792: 
  793: 
  794: # grabs exception and records it to log before exiting
  795: sub catchexception {
  796:     my ($signal)=@_;
  797:     $SIG{QUIT}='DEFAULT';
  798:     $SIG{__DIE__}='DEFAULT';
  799:     chomp($signal);
  800:     &logthis("<font color=red>CRITICAL: "
  801:      ."ABNORMAL EXIT. Child $$ for server [$wasserver] died through "
  802:      ."\"$signal\" with parameter </font>");
  803:     die("Signal abend");
  804: }
  805: 
  806: # -------------------------------------- Routines to see if other box available
  807: 
  808: #sub online {
  809: #    my $host=shift;
  810: #    &status("Pinging ".$host);
  811: #    my $p=Net::Ping->new("tcp",20);
  812: #    my $online=$p->ping("$host");
  813: #    $p->close();
  814: #    undef ($p);
  815: #    return $online;
  816: #}
  817: 
  818: sub connected {
  819:     my ($local,$remote)=@_;
  820:     &status("Checking connection $local to $remote");
  821:     $local=~s/\W//g;
  822:     $remote=~s/\W//g;
  823: 
  824:     unless ($hostname{$local}) { return 'local_unknown'; }
  825:     unless ($hostname{$remote}) { return 'remote_unknown'; }
  826: 
  827:     #unless (&online($hostname{$local})) { return 'local_offline'; }
  828: 
  829:     my $ua=new LWP::UserAgent;
  830:     
  831:     my $request=new HTTP::Request('GET',
  832:       "http://".$hostname{$local}.'/cgi-bin/ping.pl?'.$remote);
  833: 
  834:     my $response=$ua->request($request);
  835: 
  836:     unless ($response->is_success) { return 'local_error'; }
  837: 
  838:     my $reply=$response->content;
  839:     $reply=(split("\n",$reply))[0];
  840:     $reply=~s/\W//g;
  841:     if ($reply ne $remote) { return $reply; }
  842:     return 'ok';
  843: }
  844: 
  845: 
  846: 
  # Shut down every child listed in %children (keys are pids, values are the
  # server ids they serve): send the 'exit' command for that server through
  # subreply, log the answer, then send the child SIGINT.
  # The global $wasserver is left set to the last server processed.
  sub hangup {
      for my $pid (keys %children) {
          $wasserver = $children{$pid};
          &status("Closing $wasserver");
          my $answer = &subreply('exit', $wasserver);
          &logthis('Closing '.$wasserver.': '.$answer);
          &status("Kill PID $pid for $wasserver");
          kill('INT', $pid);
      }
  }
  856: 
  # Signal handler for SIGINT: stop all children, delete the pid file, log the
  # shutdown and exit.
  sub HUNTSMAN {
      # The children are about to be killed on purpose, so SIGCHLD is ignored
      # for the remainder of this handler.
      local $SIG{CHLD} = 'IGNORE';
      &hangup();
      my $pidfile = "$perlvar{'lonDaemons'}/logs/lonc.pid";
      unlink($pidfile);
      &logthis("<font color=red>CRITICAL: Shutting down</font>");
      exit;                           # clean up with dignity
  }
  865: 
  # Signal handler for SIGHUP: restart lonc in place.  Stops all children,
  # logs the restart, removes the pid file and re-executes
  # $perlvar{'lonDaemons'}/lonc.  Only returns if the exec fails.
  sub HUPSMAN {
      local($SIG{CHLD}) = 'IGNORE';  # we're going to kill our children
      &hangup();
      &logthis("<font color=red>CRITICAL: Restarting</font>");
      my $execdir=$perlvar{'lonDaemons'};
      unlink("$execdir/logs/lonc.pid");
      # The signal being handled is blocked while its handler runs, and the
      # blocked-signal mask is inherited across exec().  Unblock SIGHUP first,
      # otherwise the restarted daemon never sees another SIGHUP.
      my $hup=POSIX::SigSet->new(POSIX::SIGHUP());
      unless (defined POSIX::sigprocmask(POSIX::SIG_UNBLOCK(),$hup)) {
          &logthis("<font color=red>CRITICAL: could not unblock SIGHUP "
                   ."before restart: $!</font>");
      }
      # exec only returns on failure; record that instead of carrying on
      # silently with no children and no pid file.
      exec("$execdir/lonc")          # here we go again
          or &logthis("<font color=red>CRITICAL: Restart failed, "
                      ."could not exec $execdir/lonc: $!</font>");
  }
  874: 
  # Start a fresh status file, record the parent's own status line, then probe
  # every child in %children with SIGUSR1 (in sorted pid order, pausing one
  # second before each) and log every child that could not be signalled.
  sub checkchildren {
      &initnewstatus();
      &logstatus();
      &logthis('Going to check on the children');
      for my $pid (sort keys %children) {
          sleep 1;
          # kill returns the number of processes signalled; non-zero = alive.
          next if kill('USR1', $pid);
          &logthis('<font color=red>CRITICAL: Child '.$pid.' is dead</font>');
          # NOTE(review): logstatus as defined in this file reads no arguments,
          # so this string (built from the parent's pid, not the child's) is
          # unused; the call just appends the parent's status line again.
          &logstatus($$.' is dead');
      }
  }
  887: 
  # Handler for SIGUSR1 in the parent: give every host a fresh set of
  # connection attempts, restart children for hosts that had used theirs up,
  # then check on all children.
  sub USRMAN {
      &logthis("USR1: Trying to establish connections again");
      #
      #  %childatt must never be cleared wholesale, or all memory of the
      #  children is lost.  Instead every counter is zeroed individually, and
      #  a host whose counter had reached $childmaxattempts also gets a new
      #  child via make_new_child.
      #
      for my $peer (keys %childatt) {
          my $gave_up = ($childatt{$peer} >= $childmaxattempts);
          $childatt{$peer} = 0;
          next unless $gave_up;
          &logthis("<font color=green>INFO: Restarting child for server: "
                   .$peer."</font>\n");
          make_new_child($peer);
      }
      &checkchildren();		# See if any children are still dead...
  }
  910: 
  911: # -------------------------------------------------- Non-critical communication
  # Send one command to another server through that server's UNIX domain
  # socket ($perlvar{'lonSockDir'}/<server>) and return the answer.
  #   $cmd    - command text
  #   $server - id of the target server
  # Returns 'self_reply' when $server is this host ($perlvar{'lonHostID'}),
  # 'con_lost' when the socket cannot be opened or no answer came back, and
  # otherwise whatever londtransaction returned.
  # After a transaction $SIG{ALRM} is 'DEFAULT' and $SIG{__DIE__} is
  # \&catchexception.
  sub subreply {
      my ($cmd,$server)=@_;

      # Commands addressed to ourselves are never put on a socket.
      return 'self_reply' if ($server eq $perlvar{'lonHostID'});

      my $sockpath="$perlvar{'lonSockDir'}/$server";
      my $sock=IO::Socket::UNIX->new(Peer    => $sockpath,
                                     Type    => SOCK_STREAM,
                                     Timeout => 10);
      return "con_lost" unless $sock;

      my $reply=londtransaction($sock, $cmd, 10);
      $reply="con_lost" if (!$reply || $@=~/timeout/);

      $SIG{ALRM}='DEFAULT';
      $SIG{__DIE__}=\&catchexception;
      return $reply;
  }
  931: 
  932: # --------------------------------------------------------------------- Logging
  933: 
  # Append one timestamped entry to $perlvar{'lonDaemons'}/logs/lonc.log and
  # remember it in the global $lastlog.
  #   $message - text to log
  # Entry format: "<localtime> (<pid>) [$conserver] [$status]: <message>".
  # Logging is best effort: when the log file cannot be opened the entry is
  # dropped with a warning on STDERR.  Printing to the undefined handle would
  # be a fatal error, and this routine is called from die/signal handlers.
  sub logthis {
      my $message=shift;
      my $execdir=$perlvar{'lonDaemons'};
      my $local=localtime(time);
      # Recorded even if the write below fails.
      $lastlog=$local.': '.$message;
      my $logfile="$execdir/logs/lonc.log";
      my $fh=IO::File->new($logfile,'>>');
      unless ($fh) {
          warn "lonc: cannot append to $logfile: $!\n";
          return;
      }
      print $fh "$local ($$) [$conserver] [$status]: $message\n";
  }
  943: 
  #--------------------------------------  londtransaction:
  #
  #  Performs one request/reply transaction with lond, with timeout support.
  #    result = londtransaction(socket,request,timeout)
  #
  #  socket  - connected handle; the request plus "\n" is written to it and
  #            one line is read back
  #  request - command text, without the trailing newline
  #  timeout - seconds allowed for the send and, separately, for the reply
  #
  #  Returns the reply line exactly as read (newline included); the value is
  #  undefined if nothing could be read.
  #  Dies if the send or the read times out, or if the signal mask cannot be
  #  changed.
  #
  #  Side effects: SIGQUIT, SIGUSR1, SIGHUP, SIGINT and SIGTERM are blocked
  #  while the transaction runs and the previous signal mask is put back
  #  afterwards; on normal return $SIG{ALRM} is 'DEFAULT' and $SIG{__DIE__}
  #  is \&catchexception.
  #
  sub londtransaction {
      my ($socket, $request, $tmo) = @_;

      if($DEBUG) {
          &logthis("londtransaction request: $request");
      }

      # Set the signal handlers: ALRM for timeout and disable the others.

      $SIG{ALRM} = sub { die "timeout" };
      $SIG{__DIE__} = 'DEFAULT';

      # Block all but alarm so that only that can interrupt the
      # send / receive.
      #
      # POSIX::SigSet takes signal *numbers*.  Bare names such as HUP are
      # plain strings that numify to 0, which leaves the set empty, so the
      # POSIX constants are used (fully qualified, so they resolve whatever
      # this file imports from POSIX).
      #
      my $sigset = POSIX::SigSet->new(POSIX::SIGQUIT(), POSIX::SIGUSR1(),
                                      POSIX::SIGHUP(),  POSIX::SIGINT(),
                                      POSIX::SIGTERM());
      my $priorsigs = POSIX::SigSet->new;
      unless (defined sigprocmask(SIG_BLOCK, $sigset, $priorsigs)) {
          &logthis("<font color=red> CRITICAL -- londtransaction ".
                   "failed to block signals </font>");
          die "could not block signals in londtransaction";
      }
      my $answer = '';
      my $timedout = '';          # '', 'send' or 'read'
      #
      #  Send request to lond.
      #
      eval {
          alarm($tmo);
          print $socket "$request\n";
          alarm(0);
      };
      $timedout = 'send' if ($@ =~ /timeout/);
      alarm(0);                   # never leave an alarm pending after a failure
      #
      #  If request didn't timeout, try for the response.
      #
      unless ($timedout) {
          eval {
              alarm($tmo);
              $answer = <$socket>;
              if($DEBUG) {
                  &logthis("Received $answer in londtransaction");
              }
              alarm(0);
          };
          $timedout = 'read' if ($@ =~ /timeout/);
          alarm(0);
      }
      #
      # Restore the initial sigmask set.  SIG_SETMASK reinstates exactly the
      # mask that was in force on entry; SIG_UNBLOCK with the prior set would
      # leave the signals blocked above blocked for good.  This is done before
      # any timeout exit so that a caller trapping the die does not inherit
      # blocked signals.
      #
      my $restored = defined sigprocmask(POSIX::SIG_SETMASK(), $priorsigs);

      if ($timedout eq 'send') {
          &logthis("lonc - suiciding on send Timeout");
          die("lonc - suiciding on send Timeout");
      }
      if ($timedout eq 'read') {
          &logthis("lonc - suiciding on read Timeout");
          die("lonc - suiciding on read Timeout");
      }
      unless ($restored) {
          &logthis("<font color=red> CRITICAL -- londtransaction ".
                   "failed to re-enable signal processing. </font>");
          die "londtransaction failed to re-enable signals";
      }
      #
      # go back to the prior handler set.
      #
      $SIG{ALRM} = 'DEFAULT';
      $SIG{__DIE__} = \&catchexception;

      #    chomp $answer;
      if ($DEBUG) {
          &logthis("Returning $answer in londtransaction");
      }
      return $answer;

  }
 1021: 
 # Append one record to $perlvar{'lonDaemons'}/logs/lonnet.perm.log.
 #   $message - text to record
 # Record format: "<epoch seconds>:<message>:<localtime>".
 # Best effort: when the file cannot be opened the record is dropped with a
 # warning on STDERR instead of dying on a print to an undefined handle.
 sub logperm {
     my $message=shift;
     my $execdir=$perlvar{'lonDaemons'};
     my $now=time;
     my $local=localtime($now);
     my $logfile="$execdir/logs/lonnet.perm.log";
     my $fh=IO::File->new($logfile,'>>');
     unless ($fh) {
         warn "lonc: cannot append to $logfile: $!\n";
         return;
     }
     print $fh "$now:$message:$local\n";
 }
 1030: # ------------------------------------------------------------------ Log status
 1031: 
 # Append this process's status line to
 # $perlvar{'lonDocRoot'}/lon-status/loncstatus.txt.
 # Line format (tab separated): pid, $conserver, $status, $lastlog.
 # Any arguments are ignored.
 # Best effort: when the file cannot be opened nothing is written and a
 # warning goes to STDERR, instead of dying on a print to an undefined handle.
 sub logstatus {
     my $docdir=$perlvar{'lonDocRoot'};
     my $statusfile="$docdir/lon-status/loncstatus.txt";
     my $fh=IO::File->new($statusfile,'>>');
     unless ($fh) {
         warn "lonc: cannot append to $statusfile: $!\n";
         return;
     }
     print $fh $$."\t".$conserver."\t".$status."\t".$lastlog."\n";
 }
 1037: 
 # Start a new status file: truncate
 # $perlvar{'lonDocRoot'}/lon-status/loncstatus.txt and write the header
 # "LONC status <localtime> - parent <pid>" followed by an empty line.
 # Best effort: when the file cannot be opened nothing is written and a
 # warning goes to STDERR, instead of dying on a print to an undefined handle.
 sub initnewstatus {
     my $docdir=$perlvar{'lonDocRoot'};
     my $statusfile="$docdir/lon-status/loncstatus.txt";
     my $fh=IO::File->new($statusfile,'>');
     unless ($fh) {
         warn "lonc: cannot create $statusfile: $!\n";
         return;
     }
     my $local=localtime(time);
     print $fh "LONC status $local - parent $$\n\n";
 }
 1045: 
 1046: # -------------------------------------------------------------- Status setting
 1047: 
 # Record what this process is currently doing.
 #   $what - short description of the current activity
 # Stores "<localtime>: <what>" in the global $status and sets the process
 # name ($0) to "lonc: <what> <localtime>".
 sub status {
     my $what=shift;
     my $stamp=localtime(time);
     $status="${stamp}: ${what}";
     $0="lonc: ${what} ${stamp}";
 }
 1055: 
 1056: 
 1057: 
 1058: # ----------------------------------- POD (plain old documentation, CPAN style)
 1059: 
 1060: =head1 NAME
 1061: 
 1062: lonc - LON TCP-Client Domain-Socket-Server providing persistent TCP connections to the other servers in the network.
 1063: 
 1064: =head1 SYNOPSIS
 1065: 
 1066: Usage: B<lonc>
 1067: 
 1068: Should only be run as user=www.  This is a command-line script which
 1069: is invoked by B<loncron>.  There is no expectation that a typical user
 1070: will manually start B<lonc> from the command-line.  (In other words,
 1071: DO NOT START B<lonc> YOURSELF.)
 1072: 
 1073: =head1 OVERVIEW
 1074: 
 1075: =head2 Physical Overview
 1076: 
 1077: =begin latex 
 1078: 
 1079: \begin{figure} 
 1080:   \begin{center}
 1081:     \includegraphics[width=0.65\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram}
 1082:   \end{center}
 1083:   \caption{\label{Overview_Of_Network}Overview of Network}
 1084: \end{figure}
 1085: 
 1086: =end latex
 1087: 
 1088: Physically, the Network consists of relatively inexpensive
 1089: upper-PC-class server machines which are linked through the commodity
 1090: internet in a load-balancing, dynamically content-replicating and
 1091: failover-secure way.
 1092: 
 1093: All machines in the Network are connected with each other through
 1094: two-way persistent TCP/IP connections. Clients (B<B>, B<F>, B<G> and
 1095: B<H> in Fig. Overview of Network) connect to the servers via standard
 1096: HTTP. There are two classes of servers, B<Library Servers> (B<A> and
 1097: B<E> in Fig. Overview of Network) and B<Access Servers> (B<C>, B<D>,
 1098: B<I> and B<J> in Fig. Overview of Network).
 1099: 
 1100: B<Library Servers> X<library server> X<server, library> are used to
 1101: store all personal records of a set of users, and are responsible for
 1102: their initial authentication when a session is opened on any server in
 1103: the Network. For Authors, Library Servers also host their
 1104: construction area and the authoritative copy of the current and
 1105: previous versions of every resource that was published by that
 1106: author. Library servers can be used as backups to host sessions when
 1107: all access servers in the Network are overloaded. Otherwise, for
 1108: learners, access servers are used to host the sessions. Library
 1109: servers need to have strong I/O capabilities.
 1110: 
 1111: B<Access Servers> X<access server> X<server, access> provide LON-CAPA
 1112: service to users, using the library servers as their data source. The
 1113: network is designed so that the number of concurrent sessions can be
 1114: increased over a wide range by simply adding additional access servers
 1115: before having to add additional library servers. Preliminary tests
 1116: showed that a library server could handle up to 10 access servers
 1117: fully parallel. Access servers can generally be cheaper hardware than
 1118: library servers require.
 1119: 
 1120: The Network is divided into B<domains> X<domain>, which are logical
 1121: boundaries between participating institutions. These domains can be
 1122: used to limit the flow of personal user information across the
 1123: network, set access privileges and enforce royalty schemes. LON-CAPA
 1124: domains bear no relationship to any other domain, including domains
 1125: used by the DNS system; LON-CAPA domains may be freely configured in
 1126: any manner that suits your use pattern.
 1127: 
 1128: =head2 Example Transactions
 1129: 
 1130: Fig. Overview of Network also depicts examples for several kinds of
 1131: transactions conducted across the Network.
 1132: 
 1133: An instructor at client B<B> modifies and publishes a resource on her
 1134: Home Server B<A>. Server B<A> has a record of all server machines
 1135: currently subscribed to this resource, and replicates it to servers
 1136: B<D> and B<I>. However, server B<D> is currently offline, so the
 1137: update notification gets buffered on B<A> until B<D> comes online
 1138: again. Servers B<C> and B<J> are currently not subscribed to this
 1139: resource.
 1140: 
 1141: Learners B<F> and B<G> have open sessions on server B<I>, and the new
 1142: resource is immediately available to them.
 1143: 
 1144: Learner B<H> tries to connect to server B<I> for a new session,
 1145: however, the machine is not reachable, so he connects to another
 1146: Access Server B<J> instead. This server currently does not have all
 1147: necessary resources locally present to host learner B<H>, but
 1148: subscribes to them and replicates them as they are accessed by B<H>.
 1149: 
 1150: Learner B<H> solves a problem on server B<J>. Library Server B<E> is
 1151: B<H>'s Home Server, so this information gets forwarded to B<E>, where
 1152: the records of H are updated.
 1153: 
 1154: =head2 lond, lonc, lonnet
 1155: 
 1156: =begin latex
 1157: 
 1158: \begin{figure}
 1159: \includegraphics[width=0.75\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram2}
 1160:   \caption{\label{Overview_Of_Network_Communication}Overview of
 1161: Network Communication} \end{figure}
 1162: 
 1163: =end latex
 1164: 
 1165: Fig. Overview of Network Communication elaborates on the details of
 1166: this network infrastructure. It depicts three servers (B<A>, B<B> and
 1167: B<C>) and a client who has a session on server B<C>.
 1168: 
 1169: As B<C> accesses different resources in the system, different
 1170: handlers, which are incorporated as modules into the child processes
 1171: of the web server software, process these requests.
 1172: 
 1173: Our current implementation uses C<mod_perl> inside of the Apache web
 1174: server software. As an example, server B<C> currently has four active
 1175: web server software child processes. The chain of handlers dealing
 1176: with a certain resource is determined by both the server content
 1177: resource area (see below) and the MIME type, which in turn is
 1178: determined by the URL extension. For most URL structures, both an
 1179: authentication handler and a content handler are registered.
 1180: 
 1181: Handlers use a common library C<lonnet> X<lonnet> to interact with
 1182: both locally present temporary session data and data across the server
 1183: network. For example, lonnet provides routines for finding the home
 1184: server of a user, finding the server with the lowest loadavg, sending
 1185: simple command-reply sequences, and sending critical messages such as
 1186: a homework completion, etc. For a non-critical message, the routines
 1187: reply with a simple "connection lost" if the message could not be
 1188: delivered. For critical messages, lonnet tries to re-establish
 1189: connections, re-send the command, etc. If no valid reply could be
 1190: received, it answers "connection deferred" and stores the message in
 1191: buffer space to be sent at a later point in time. Also, failed
 1192: critical messages are logged.
 1193: 
 1194: The interface between C<lonnet> and the Network is established by a
 1195: multiplexed UNIX domain socket, denoted B<DS> in Fig. Overview of
 1196: Network Communication. The rationale behind this rather involved
 1197: architecture is that httpd processes (Apache children) dynamically
 1198: come and go on the timescale of minutes, based on workload and number
 1199: of processed requests. Over the lifetime of an httpd child, however,
 1200: it has to establish several hundred connections to several different
 1201: servers in the Network.
 1202: 
 1203: On the other hand, establishing a TCP/IP connection is resource
 1204: consuming for both ends of the line, and to optimize this connectivity
 1205: between different servers, connections in the Network are designed to
 1206: be persistent on the timescale of months, until either end is
 1207: rebooted. This mechanism will be elaborated on below.
 1208: 
 1209: =begin latex
 1210: 
 1211: \begin{figure}
 1212: \begin{lyxcode}
 1213: msul1:msu:library:zaphod.lite.msu.edu:35.8.63.51
 1214: 
 1215: msua1:msu:access:agrajag.lite.msu.edu:35.8.63.68
 1216: 
 1217: msul2:msu:library:frootmig.lite.msu.edu:35.8.63.69
 1218: 
 1219: msua2:msu:access:bistromath.lite.msu.edu:35.8.63.67
 1220: 
 1221: hubl14:hub:library:hubs128-pc-14.cl.msu.edu:35.8.116.34
 1222: 
 1223: hubl15:hub:library:hubs128-pc-15.cl.msu.edu:35.8.116.35
 1224: 
 1225: hubl16:hub:library:hubs128-pc-16.cl.msu.edu:35.8.116.36
 1226: 
 1227: huba20:hub:access:hubs128-pc-20.cl.msu.edu:35.8.116.40
 1228: 
 1229: huba21:hub:access:hubs128-pc-21.cl.msu.edu:35.8.116.41
 1230: 
 1231: huba22:hub:access:hubs128-pc-22.cl.msu.edu:35.8.116.42
 1232: 
 1233: huba23:hub:access:hubs128-pc-23.cl.msu.edu:35.8.116.43
 1234: 
 1235: hubl25:other:library:hubs128-pc-25.cl.msu.edu:35.8.116.45
 1236: 
 1237: huba27:other:access:hubs128-pc-27.cl.msu.edu:35.8.116.47
 1238: \end{lyxcode}
 1239: 
 1240: \caption{\label{Example_Of_hosts.tab}Example of Hosts Lookup table\texttt{/home/httpd/lonTabs/hosts.tab}} 
 1241: \end{figure}
 1242: 
 1243: =end latex
 1244: 
 1245: Establishing a connection to a UNIX domain socket is far less resource
 1246: consuming than the establishing of a TCP/IP connection. C<lonc>
 1247: X<lonc> is a proxy daemon that forks off a child for every server in
 1248: the Network. Which servers are members of the Network is determined by
 1249: a lookup table, such as the one in Fig. Example of Hosts Lookup table. In order,
 1250: the entries denote an internal name for the server, the domain of the
 1251: server, the type of the server, the host name and the IP address.
 1252: 
 1253: The C<lonc> parent process maintains the population and listens for
 1254: signals to restart or shutdown, as well as I<USR1>. Every child
 1255: establishes a multiplexed UNIX domain socket for its server and opens
 1256: a TCP/IP connection to the lond daemon (discussed below) on the remote
 1257: machine, which it keeps alive. If the connection is interrupted, the
 1258: child dies, whereupon the parent makes several attempts to fork
 1259: another child for that server.
 1260: 
 1261: When starting a new child (a new connection), first an init-sequence
 1262: is carried out, which includes receiving the information from the
 1263: remote C<lond> which is needed to establish the 128-bit encryption key
 1264: - the key is different for every connection. Next, any buffered
 1265: (delayed) messages for the server are sent.
 1266: 
 1267: In normal operation, the child listens to the UNIX socket, forwards
 1268: requests to the TCP connection, gets the reply from C<lond>, and sends
 1269: it back to the UNIX socket. Also, C<lonc> takes care of the encryption
 1270: and decryption of messages.
 1271: 
 1272: C<lond> X<lond> is the remote end of the TCP/IP connection and acts as
 1273: a remote command processor. It receives commands, executes them, and
 1274: sends replies. In normal operation, a C<lonc> child is constantly
 1275: connected to a dedicated C<lond> child on the remote server, and the
 1276: same is true vice versa (two persistent connections per server
 1277: combination).
 1278: 
 1279: lond listens to a TCP/IP port (denoted B<P> in Fig. Overview of
 1280: Network Communication) and forks off enough child processes to have
 1281: one for each other server in the network plus two spare children. The
 1282: parent process maintains the population and listens for signals to
 1283: restart or shutdown. Client servers are authenticated by IP.
 1284: 
 1285: When a new client server comes online, C<lond> sends a signal I<USR1>
 1286: to lonc, whereupon C<lonc> tries again to reestablish all lost
 1287: connections, even if it had given up on them before - a new client
 1288: connecting could mean that that machine came online again after an
 1289: interruption.
 1290: 
 1291: The gray boxes in Fig. Overview of Network Communication denote the
 1292: entities involved in an example transaction of the Network. The Client
 1293: is logged into server B<C>, while server B<B> is her Home
 1294: Server. Server B<C> can be an access server or a library server, while
 1295: server B<B> is a library server. She submits a solution to a homework
 1296: problem, which is processed by the appropriate handler for the MIME
 1297: type "problem". Through C<lonnet>, the handler writes information
 1298: about this transaction to the local session data. To make a permanent
 1299: log entry, C<lonnet> establishes a connection to the UNIX domain
 1300: socket for server B<B>. C<lonc> receives this command, encrypts it,
 1301: and sends it through the persistent TCP/IP connection to the TCP/IP
 1302: port of the remote C<lond>. C<lond> decrypts the command, executes it
 1303: by writing to the permanent user data files of the client, and sends
 1304: back a reply regarding the success of the operation. If the operation
 1305: was unsuccessful, or the connection would have broken down, C<lonc>
 1306: would write the command into a FIFO buffer stack to be sent again
 1307: later. C<lonc> now sends a reply regarding the overall success of the
 1308: operation to C<lonnet> via the UNIX domain port, which is eventually
 1309: received back by the handler.
 1310: 
 1311: =head2 Dynamic Resource Replication
 1312: 
 1313: Since resources are assembled into higher order resources simply by
 1314: reference, in principle it would be sufficient to retrieve them from
 1315: the respective Home Servers of the authors. However, there are several
 1316: problems with this simple approach: since the resource assembly
 1317: mechanism is designed to facilitate content assembly from a large
 1318: number of widely distributed sources, individual sessions would depend
 1319: on a large number of machines and network connections to be available,
 1320: thus be rather fragile. Also, frequently accessed resources could
 1321: potentially drive individual machines in the network into overload
 1322: situations.
 1323: 
 1324: Finally, since most resources depend on content handlers on the Access
 1325: Servers to be served to a client within the session context, the raw
 1326: source would first have to be transferred across the Network from the
 1327: respective Library Server to the Access Server, processed there, and
 1328: then transferred on to the client.
 1329: 
 1330: =begin latex
 1331: 
 1332: \begin{figure}
 1333: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Request}
 1334:   \caption{\label{Dynamic_Replication}Dynamic Replication} 
 1335: \end{figure}
 1336: 
 1337: =end latex
 1338: 
 1339: To enable resource assembly in a reliable and scalable way, a dynamic
 1340: resource replication scheme was developed. Fig. "Dynamic Replication"
 1341: shows the details of this mechanism.
 1342: 
 1343: Anytime a resource out of the resource space is requested, a handler
 1344: routine is called which in turn calls the replication routine. As a
 1345: first step, this routine determines whether or not the resource is
 1346: currently in replication transfer (Step B<D1a>). During replication
 1347: transfer, the incoming data is stored in a temporary file, and Step
 1348: B<D1a> checks for the presence of that file. If transfer of a resource
 1349: is actively going on, the controlling handler receives an error
 1350: message, waits for a few seconds, and then calls the replication
 1351: routine again. If the resource is still in transfer, the client will
 1352: receive the message "Service currently not available".
 1353: 
 1354: In the next step (Step B<D1b>), the replication routine checks if the
 1355: URL is locally present. If it is, the replication routine returns OK
 1356: to the controlling handler, which in turn passes the request on to the
 1357: next handler in the chain.
 1358: 
 1359: If the resource is not locally present, the Home Server of the
 1360: resource author (as extracted from the URL) is determined (Step
 1361: B<D2>). This is done by contacting all library servers in the author's
 1362: domain (as determined from the lookup table, see Fig. Example of Hosts Lookup table). In Step
 1363: B<D2b> a query is sent to the remote server whether or not it is the
 1364: Home Server of the author (in our current implementation, an
 1365: additional cache is used to store already identified Home Servers (not
 1366: shown in the figure)). In Step B<D2c>, the remote server answers the
 1367: query with True or False. If the Home Server was found, the routine
 1368: continues, otherwise it contacts the next server (Step D2a). If no
 1369: server could be found, a "File not Found" error message is issued. In
 1370: our current implementation, in this step the Home Server is also
 1371: written into a cache for faster access if resources by the same author
 1372: are needed again (not shown in the figure).
 1373: 
 1374: =begin latex
 1375: 
 1376: \begin{figure}
 1377: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Change}
 1378:   \caption{\label{Dynamic_Replication_Change}Dynamic Replication: Change} \end{figure}
 1379: 
 1380: =end latex
 1381: 
 1382: In Step B<D3a>, the routine sends a subscribe command for the URL to
 1383: the Home Server of the author. The Home Server first determines if the
 1384: resource is present, and if the access privileges allow it to be
 1385: copied to the requesting server (B<D3b>). If this is true, the
 1386: requesting server is added to the list of subscribed servers for that
 1387: resource (Step B<D3c>). The Home Server will reply with either OK or
 1388: an error message, which is determined in Step D4. If the remote
 1389: resource was not present, the error message "File not Found" will be
 1390: passed on to the client, if the access was not allowed, the error
 1391: message "Access Denied" is passed on. If the operation succeeded, the
 1392: requesting server sends an HTTP request for the resource out of the
 1393: C</raw> server content resource area of the Home Server.
 1394: 
 1395: The Home Server will then check if the requesting server is part of
 1396: the network, and if it is subscribed to the resource (Step B<D5b>). If
 1397: it is, it will send the resource via HTTP to the requesting server
 1398: without any content handlers processing it (Step B<D5c>). The
 1399: requesting server will store the incoming data in a temporary data
 1400: file (Step B<D5a>) - this is the file that Step B<D1a> checks for. If
 1401: the transfer could not complete, an appropriate error message is sent
 1402: to the client (Step B<D6>). Otherwise, the transferred temporary file
 1403: is renamed as the actual resource, and the replication routine returns
 1404: OK to the controlling handler (Step B<D7>).
 1405: 
 1406: Fig. "Dynamic Replication: Change" depicts the process of modifying a
 1407: resource. When an author publishes a new version of a resource, the
 1408: Home Server will contact every server currently subscribed to the
 1409: resource (Step B<U1>), as determined from the list of subscribed
 1410: servers for the resource generated in Step B<D3c>. The subscribing
 1411: servers will receive and acknowledge the update message (Step
 1412: B<U1c>). The update mechanism finishes when the last subscribed server
 1413: has been contacted (messages to unreachable servers are buffered).
 1414: 
 1415: Each subscribing server will check if the resource in question had
 1416: been accessed recently, that is, within a configurable amount of time
 1417: (Step B<U2>).
 1418: 
 1419: If the resource had not been accessed recently, the local copy of the
 1420: resource is deleted (Step B<U3a>) and an unsubscribe command is sent
 1421: to the Home Server (Step B<U3b>). The Home Server will check if the
 1422: server had indeed originally subscribed to the resource (Step B<U3c>)
 1423: and then delete the server from the list of subscribed servers for the
 1424: resource (Step B<U3d>).
 1425: 
 1426: If the resource had been accessed recently, the modified resource will
 1427: be copied over using the same mechanism as in Step B<D5a> through
 1428: B<D7>, which represents Steps B<U4a> through B<U6> in the
 1429: replication figure.
 1430: 
 1431: =head2 Load Balancing X<load balancing>
 1432: 
 1433: C<lond> provides a function to query the server's current loadavg. As
 1434: a configuration parameter, one can determine the value of loadavg,
 1435: which is to be considered 100%, for example, 2.00.
 1436: 
 1437: Access servers can have a list of spare access servers,
 1438: C</home/httpd/lonTabs/spares.tab>, to offload sessions depending on
 1439: own workload. This check is done by the login handler. It
 1440: re-directs the login information and session to the least busy spare
 1441: server if it is itself overloaded. An additional round-robin IP scheme
 1442: is possible. See Fig. "Load Balancing Example" for an example of a
 1443: load-balancing scheme.
 1444: 
 1445: =begin latex
 1446: 
 1447: \begin{figure}
 1448: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Load_Balancing_Example}
 1449:   \caption{\label{Load_Balancing_Example}Load Balancing Example} \end{figure}
 1450: 
 1451: =end latex
 1452: 
 1453: =head1 DESCRIPTION
 1454: 
 1455: Provides persistent TCP connections to the other servers in the network
 1456: through multiplexed domain sockets.
 1457: 
 1458: B<lonc> forks off children processes that correspond to the other servers
 1459: in the network.  Management of these processes can be done at the
 1460: parent process level or the child process level.
 1461: 
 1462: After forking off the children, B<lonc> the B<parent> executes a main
 1463: loop which simply waits for processes to exit.  As a process exits, a
 1464: new process managing a link to the same peer as the exiting process is
 1465: created.
 1466: 
 1467: B<logs/lonc.log> is the location of log messages.
 1468: 
 1469: The process management is now explained in terms of linux shell commands,
 1470: subroutines internal to this code, and signal assignments:
 1471: 
 1472: =over 4
 1473: 
 1474: =item *
 1475: 
 1476: PID is stored in B<logs/lonc.pid>
 1477: 
 1478: This is the process id number of the parent B<lonc> process.
 1479: 
 1480: =item *
 1481: 
 1482: SIGTERM and SIGINT
 1483: 
 1484: Parent signal assignment:
 1485:  $SIG{INT}  = $SIG{TERM} = \&HUNTSMAN;
 1486: 
 1487: Child signal assignment:
 1488:  $SIG{INT}  = 'DEFAULT'; (and SIGTERM is DEFAULT also)
 1489: (The child dies and a SIGALRM is sent to parent, awaking parent from slumber
 1490:  to restart a new child.)
 1491: 
 1492: Command-line invocations:
 1493:  B<kill> B<-s> SIGTERM I<PID>
 1494:  B<kill> B<-s> SIGINT I<PID>
 1495: 
 1496: Subroutine B<HUNTSMAN>:
 1497:  This is only invoked for the B<lonc> parent I<PID>.
 1498: This kills all the children, and then the parent.
 1499: The B<lonc.pid> file is cleared.
 1500: 
 1501: =item *
 1502: 
 1503: SIGHUP
 1504: 
 1505: Current bug:
 1506:  This signal can only be processed the first time
 1507: on the parent process.  Subsequent SIGHUP signals
 1508: have no effect.
 1509: 
 1510: Parent signal assignment:
 1511:  $SIG{HUP}  = \&HUPSMAN;
 1512: 
 1513: Child signal assignment:
 1514:  none (nothing happens)
 1515: 
 1516: Command-line invocations:
 1517:  B<kill> B<-s> SIGHUP I<PID>
 1518: 
 1519: Subroutine B<HUPSMAN>:
 1520:  This is only invoked for the B<lonc> parent I<PID>.
 1521: This kills all the children, removes the B<lonc.pid> file,
 1522: and then restarts the parent by re-executing B<lonc>.
 1523: 
 1524: =item *
 1525: 
 1526: SIGUSR1
 1527: 
 1528: Parent signal assignment:
 1529:  $SIG{USR1} = \&USRMAN;
 1530: 
 1531: Child signal assignment:
 1532:  $SIG{USR1}= \&logstatus;
 1533: 
 1534: Command-line invocations:
 1535:  B<kill> B<-s> SIGUSR1 I<PID>
 1536: 
 1537: Subroutine B<USRMAN>:
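 1538:  When invoked for the B<lonc> parent I<PID>, the connection attempt
 1539: counters are reset, a new child is started for every server that had been
 1540: given up on, SIGUSR1 is sent to all the children, and the status of each connection is logged.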
 1541: 
 1542: 
 1543: =back
 1544: 
 1545: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>