Annotation of loncom/lonc, revision 1.52

1.1       albertel    1: #!/usr/bin/perl
                      2: 
                      3: # The LearningOnline Network
                      4: # lonc - LON TCP-Client Domain-Socket-Server
                      5: # provides persistent TCP connections to the other servers in the network
                      6: # through multiplexed domain sockets
                      7: #
1.51      bowersj2    8: # $Id: lonc,v 1.50 2003/07/02 01:28:12 foxr Exp $
1.22      www         9: #
                     10: # Copyright Michigan State University Board of Trustees
                     11: #
                     12: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                     13: #
                     14: # LON-CAPA is free software; you can redistribute it and/or modify
                     15: # it under the terms of the GNU General Public License as published by
                     16: # the Free Software Foundation; either version 2 of the License, or
                     17: # (at your option) any later version.
                     18: #
                     19: # LON-CAPA is distributed in the hope that it will be useful,
                     20: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     21: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     22: # GNU General Public License for more details.
                     23: #
                     24: # You should have received a copy of the GNU General Public License
                     25: # along with LON-CAPA; if not, write to the Free Software
                     26: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     27: #
                     28: # /home/httpd/html/adm/gpl.txt
                     29: #
                     30: # http://www.lon-capa.org/
                     31: #
1.1       albertel   32: # PID in subdir logs/lonc.pid
                     33: # kill kills
                     34: # HUP restarts
                     35: # USR1 tries to open connections again
                     36: 
1.2       www        37: # 6/4/99,6/5,6/7,6/8,6/9,6/10,6/11,6/12,7/14,7/19,
1.5       www        38: # 10/8,10/9,10/15,11/18,12/22,
1.10      www        39: # 2/8,7/25 Gerd Kortemeyer
                     40: # 12/05 Gerd Kortemeyer
1.23      harris41   41: # YEAR=2001
1.21      www        42: # 03/14/01,03/15,06/12,11/26,11/27,11/28 Gerd Kortemeyer
1.26      www        43: # YEAR=2002
1.29      www        44: # 2/19/02,02/22/02,02/25/02 Gerd Kortemeyer
1.33      foxr       45: # 3/07/02 Ron Fox 
1.1       albertel   46: # based on nonforker from Perl Cookbook
                     47: # - server who multiplexes without forking
1.40      harris41   48: 
                     49: use lib '/home/httpd/lib/perl/';
                     50: use LONCAPA::Configuration;
1.1       albertel   51: 
                     52: use POSIX;
                     53: use IO::Socket;
                     54: use IO::Select;
                     55: use IO::File;
                     56: use Socket;
                     57: use Fcntl;
                     58: use Tie::RefHash;
                     59: use Crypt::IDEA;
1.32      foxr       60: #use Net::Ping;
1.26      www        61: use LWP::UserAgent();
1.1       albertel   62: 
1.30      www        63: $status='';			# current activity text (presumably maintained by &status, defined elsewhere)
                     64: $lastlog='';
                     65: $conserver='SHELL';		# process role: 'SHELL' until daemonized, later 'PARENT' or a server ID
1.32      foxr       66: $DEBUG = 0;			# Set to 1 for annoyingly complete logs.
1.49      albertel   67: $VERSION='$Revision$'; #' stupid emacs
                     68: $remoteVERSION;			# bare mention only; no value is assigned here
1.8       harris41   69: # -------------------------------- Set signal handlers to record abnormal exits
                     70: # SIGQUIT and every die() are both routed to catchexception (defined elsewhere).
1.29      www        71: &status("Init exception handlers");
1.26      www        72: $SIG{QUIT}=\&catchexception;
1.8       harris41   73: $SIG{__DIE__}=\&catchexception;
                     74: 
1.41      matthew    75: # ---------------------------------- Read loncapa_apache.conf and loncapa.conf
1.42      harris41   76: &status("Read loncapa.conf and loncapa_apache.conf");
                     77: my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
1.40      harris41   78: my %perlvar=%{$perlvarref};	# keep a private copy of the settings hash
                     79: undef $perlvarref;
1.7       www        80: 
1.13      harris41   81: # ----------------------------- Make sure this process is running from user=www
1.29      www        82: &status("Check user ID");
1.13      harris41   83: my $wwwid=getpwnam('www');	# scalar context: numeric UID of user www
                     84: if ($wwwid!=$<) {		# $< is the real UID of this process
                     85:    $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
                     86:    $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
1.14      www        87:    system("echo 'User ID mismatch.  lonc must be run as user www.' |\
1.13      harris41   88:  mailto $emailto -s '$subj' > /dev/null");
                     89:    exit 1;			# refuse to run under any other account
                     90: }
                     91: 
1.7       www        92: # --------------------------------------------- Check if other instance running
                     93: # The PID file is written at startup with the daemon's PID on its first line.
                     94: my $pidfile="$perlvar{'lonDaemons'}/logs/lonc.pid";
                     95: 
                     96: if (-e $pidfile) {
                     97:    my $lfh=IO::File->new("$pidfile");
                     98:    my $pide=<$lfh>;
                     99:    chomp($pide);
1.11      harris41  100:    if (kill 0 => $pide) { die "already running"; } # signal 0 only probes the PID
1.7       www       101: }
1.1       albertel  102: 
                    103: # ------------------------------------------------------------- Read hosts file
                    104: # Line format: id:domain:role:name:ip -- only lines with a non-empty ip are kept.
1.11      harris41  105: open (CONFIG,"$perlvar{'lonTabDir'}/hosts.tab") || die "Can't read host file";
1.1       albertel  106: 
                    107: while ($configline=<CONFIG>) {
                    108:     my ($id,$domain,$role,$name,$ip)=split(/:/,$configline);
                    109:     chomp($ip);
1.28      www       110:     if ($ip) {
                    111:      $hostip{$id}=$ip;		# server ID -> IP address
                    112:      $hostname{$id}=$name;	# server ID -> host name
                    113:     }
1.1       albertel  114: }
1.27      www       115: 
1.1       albertel  116: close(CONFIG);
                    117: 
                    118: # ------------------------------------------- Bookkeeping for the forked children
                    119: 
                    120: %children               = ();       # keys are current child process IDs,
                    121:                                     # values are hosts
                    122: %childpid               = ();       # the other way around
                    123: 
                    124: %childatt               = ();       # restart attempts made so far,
                    125:                                     # keyed by server ID
                    126: 
1.30      www       127: $childmaxattempts=5;                # restarts allowed per server ID
1.3       www       128: 
1.1       albertel  129: # ---------------------------------------------------- Fork once and dissociate
1.29      www       130: &status("Fork and dissociate");
1.1       albertel  131: $fpid=fork;
                    132: exit if $fpid;                      # the original process exits; the forked copy carries on
1.11      harris41  133: die "Couldn't fork: $!" unless defined ($fpid);
1.1       albertel  134: 
1.11      harris41  135: POSIX::setsid() or die "Can't start new session: $!";
1.1       albertel  136: 
1.30      www       137: $conserver='PARENT';
                    138: 
1.1       albertel  139: # ------------------------------------------------------- Write our PID on disk
1.29      www       140: &status("Write PID");
1.1       albertel  141: $execdir=$perlvar{'lonDaemons'};
                    142: open (PIDSAVE,">$execdir/logs/lonc.pid");   # NOTE(review): result of open is not checked
                    143: print PIDSAVE "$$\n";
                    144: close(PIDSAVE);
1.5       www       145: &logthis("<font color=red>CRITICAL: ---------- Starting ----------</font>");
1.1       albertel  146: 
                    147: # ----------------------------- Ignore signals generated during initial startup
                    148: $SIG{HUP}=$SIG{USR1}='IGNORE';
                    149: # ------------------------------------------------------- Now we are on our own
                    150:     
                    151: # Fork off our children, one for every server ID found in the hosts file
                    152: 
1.18      www       153: &status("Forking ...");
                    154: 
1.1       albertel  155: foreach $thisserver (keys %hostip) {
1.32      foxr      156:     #if (&online($hostname{$thisserver})) {
1.26      www       157:        make_new_child($thisserver);
1.32      foxr      158:     #}
1.1       albertel  159: }
                    160: 
                    161: &logthis("Done starting initial servers");
                    162: # ----------------------------------------------------- Install signal handlers
                    163: # (HUNTSMAN, HUPSMAN and USRMAN are defined elsewhere in this file)
1.32      foxr      164: 
1.1       albertel  165: $SIG{INT}  = $SIG{TERM} = \&HUNTSMAN;
                    166: $SIG{HUP}  = \&HUPSMAN;
                    167: $SIG{USR1} = \&USRMAN;
                    168: 
                    169: # And maintain the population.
                    170: while (1) {
1.32      foxr      171:     my $deadpid = wait;		# Block until a child exits or a
1.39      foxr      172:                                 # signal (e.g. USR1) interrupts us.
                    173:                                 # An interrupted wait fails and
                    174:                                 # returns -1, which can never be a
                    175:                                 # real PID, so it is not found in
                    176:                                 # %children below and the loop just
                    177:                                 # waits again.  A real child PID is
                    178:                                 # looked up and its server is given
                    179:                                 # a new child, within the retry limit.
1.18      www       180:     &status("Woke up");
1.30      www       181:     my $skipping='';
1.32      foxr      182: 
                    183:     if(exists($children{$deadpid})) {
                    184: 
                    185: 	$thisserver = $children{$deadpid}; # Server ID the dead child was serving.
                    186: 
                    187: 	delete($children{$deadpid}); # Get rid of dead hash entry.
                    188: 
                    189: 	if($childatt{$thisserver} < $childmaxattempts) {
                    190: 	    $childatt{$thisserver}++;
                    191: 	    &logthis(
                    192: 	       "<font color=yellow>INFO: Trying to reconnect for $thisserver "
                    193:             ."($childatt{$thisserver} of $childmaxattempts attempts)</font>"); 
                    194: 	    make_new_child($thisserver);
                    195: 	
                    196: 	}
                    197: 	else {
                    198: 	    $skipping .= $thisserver.' ';	# retry limit reached: do not restart
                    199: 	}
                    200: 	if($skipping) {
                    201: 	    &logthis("<font color=blue>WARNING: Skipped $skipping</font>");
                    202:   
                    203: 	}
1.30      www       204:     }
1.32      foxr      205: 
1.1       albertel  206: }
                    207: 
                    208: 
1.32      foxr      209: 
1.1       albertel  210: sub make_new_child {
                    211:     # Fork a child for the given server ID; the parent returns, the child never does.
1.30      www       212:     $newserver=shift;
1.1       albertel  213:     my $pid;
                    214:     my $sigset;
1.30      www       215:     &logthis("Attempting to start child for server $newserver");
1.1       albertel  216:     # Block SIGINT across the fork; both sides unblock it again below.
                    217:     $sigset = POSIX::SigSet->new(SIGINT);
                    218:     sigprocmask(SIG_BLOCK, $sigset)
1.11      harris41  219:         or die "Can't block SIGINT for fork: $!\n";
1.1       albertel  220:     
1.11      harris41  221:     die "fork: $!" unless defined ($pid = fork);
1.1       albertel  222:     
                    223:     if ($pid) {
                    224:         # Parent records the child's birth and returns.
                    225:         sigprocmask(SIG_UNBLOCK, $sigset)
1.11      harris41  226:             or die "Can't unblock SIGINT for fork: $!\n";
1.30      www       227:         $children{$pid} = $newserver;
1.32      foxr      228:         $childpid{$newserver} = $pid;
1.1       albertel  229:         return;
                    230:     } else {
1.30      www       231:         $conserver=$newserver;
1.1       albertel  232:         # Child can *not* return from this subroutine.
                    233:         $SIG{INT} = 'DEFAULT';      # make SIGINT kill us as it did before
1.18      www       234:         $SIG{USR1}= \&logstatus;    # logstatus is defined elsewhere in this file
                    235:    
1.1       albertel  236:         # unblock signals
                    237:         sigprocmask(SIG_UNBLOCK, $sigset)
1.11      harris41  238:             or die "Can't unblock SIGINT for fork: $!\n";
1.1       albertel  239: 
                    240: # ----------------------------- This is the modified main program of non-forker
                    241: 
                    242: $port = "$perlvar{'lonSockDir'}/$conserver";
                    243: 
                    244: unlink($port);    # remove any socket file left at this path before listening on it
1.18      www       245: 
1.29      www       246: # -------------------------------------------------------------- Open other end
1.1       albertel  247: 
1.29      www       248: &openremote($conserver);
1.32      foxr      249: 	&logthis("<font color=green> Connection to $conserver open </font>");
1.3       www       250: # ----------------------------------------- We're online, send delayed messages
1.18      www       251:     &status("Checking for delayed messages");
1.32      foxr      252:     # Delayed messages are the files in lonSockDir/delayed whose names end in ".$conserver".
1.4       www       253:     my @allbuffered;
1.3       www       254:     my $path="$perlvar{'lonSockDir'}/delayed";
1.4       www       255:     opendir(DIRHANDLE,$path);
                    256:     @allbuffered=grep /\.$conserver$/, readdir DIRHANDLE;
                    257:     closedir(DIRHANDLE);
1.3       www       258:     my $dfname;
1.44      www       259:     foreach (sort @allbuffered) {
1.30      www       260:         &status("Sending delayed: $_");
1.4       www       261:         $dfname="$path/$_";
1.32      foxr      262:         if($DEBUG) { &logthis('Sending '.$dfname); }
1.3       www       263:         my $wcmd;
                    264:         {
                    265:          my $dfh=IO::File->new($dfname);
1.4       www       266:          $cmd=<$dfh>;    # only the first line of the file is used
1.3       www       267:         }
                    268:         chomp($cmd);
                    269:         my $bcmd=$cmd;    # unencrypted form, kept for the permanent log
                    270:         if ($cmd =~ /^encrypt\:/) {
                    271: 	    my $rcmd=$cmd;
                    272:             $rcmd =~ s/^encrypt\://;
                    273:             chomp($rcmd);
                    274:             my $cmdlength=length($rcmd);
                    275:             $rcmd.="         ";
                    276:             my $encrequest='';
                    277:             for (my $encidx=0;$encidx<=$cmdlength;$encidx+=8) {    # 8-character blocks, hex encoded
                    278:                 $encrequest.=    # NOTE(review): $cipher is not assigned in this sub -- presumably set up by openremote; confirm
                    279:                     unpack("H16",$cipher->encrypt(substr($rcmd,$encidx,8)));
                    280:             }
                    281:             $cmd="enc:$cmdlength:$encrequest\n";
                    282:         }
1.33      foxr      283: 	$answer = londtransaction($remotesock, $cmd, 60);
1.3       www       284: 	chomp($answer);
1.20      www       285:         # The delayed file is removed only after a non-empty reply with no timeout recorded in $@.
                    286:         if (($answer ne '') && ($@!~/timeout/)) {
1.3       www       287: 	    unlink("$dfname");
1.30      www       288:             &logthis("Delayed $cmd: >$answer<");
1.3       www       289:             &logperm("S:$conserver:$bcmd");
                    290:         }        
1.23      harris41  291:     }
1.32      foxr      292: 	if($DEBUG) { &logthis("<font color=green> Delayed transactions sent"); }
1.1       albertel  293: 
                    294: # ------------------------------------------------------- Listen to UNIX socket
1.30      www       295: &status("Opening socket");
1.1       albertel  296: unless (
                    297:   $server = IO::Socket::UNIX->new(Local  => $port,
                    298:                                   Type   => SOCK_STREAM,
                    299:                                   Listen => 10 )
1.5       www       300:    ) { 
                    301:        my $st=120+int(rand(240));    # random pause of 120..359 seconds before exiting
                    302:        &logthis(
                    303:          "<font color=blue>WARNING: ".
1.33      foxr      304:          "Can't make server socket ($st secs):  .. exiting</font>");
1.5       www       305:        sleep($st);
1.1       albertel  306:        exit; 
                    307:      };
1.32      foxr      308:    
1.1       albertel  309: # -----------------------------------------------------------------------------
                    310: 
1.5       www       311: &logthis("<font color=green>$conserver online</font>");
                    312: 
                    313: # -----------------------------------------------------------------------------
1.1       albertel  314: # begin with empty buffers
                    315: %inbuffer  = ();
                    316: %outbuffer = ();
                    317: %ready     = ();
1.35      foxr      318: %servers   = ();	# Client sockets keyed by file descriptor number
1.37      foxr      319: 			# (the form MakeFileVector expects).
1.35      foxr      320: 			# The listening socket is not stored here.
1.1       albertel  321: 
                    322: tie %ready, 'Tie::RefHash';
                    323: 
1.37      foxr      324: # nonblock($server);
                    325: # $select = IO::Select->new($server);
1.1       albertel  326: 
                    327: # Main loop: check reads/accepts, check writes, check ready to process
1.37      foxr      328: 
1.46      albertel  329: status("Main loop $conserver");
1.1       albertel  330: while (1) {
                    331:     my $client;
                    332:     my $rv;
                    333:     my $data;
                    334: 
1.35      foxr      335:     my $infdset;		# bit vec of fd's to select on input.
                    336: 
                    337:     my $outfdset;		# Bit vec of fd's to select on output.
                    338: 
                    339:     # Watch every client for input, and every fd with pending output for writability.
                    340:     $infdset = MakeFileVector(\%servers);
                    341:     $outfdset= MakeFileVector(\%outbuffer);
1.37      foxr      342:     vec($infdset, $server->fileno, 1) = 1;	# also watch the listening socket
                    343:     if($DEBUG) {
                    344: 	&logthis("Adding ".$server->fileno.
                    345: 		 " to input select vector (listner)".
                    346: 		 unpack("b*",$infdset)."\n");
1.1       albertel  347:     }
1.37      foxr      348:     DoSelect(\$infdset, \$outfdset); # Wait for input.
                    349:     if($DEBUG) {
                    350: 	&logthis("Doselect completed!");
                    351: 	&logthis("ins = ".unpack("b*",$infdset)."\n");
                    352: 	&logthis("outs= ".unpack("b*",$outfdset)."\n");
                    353: 		 
1.1       albertel  354:     }
1.15      www       355: 
1.37      foxr      356:     # Check for new connections:
                    357:     if (vec($infdset, $server->fileno, 1)) {
                    358: 	if($DEBUG) {
                    359: 	    &logthis("New connection established");
                    360: 	}
                    361: 	# accept a new connection
                    362: 	&status("Accept new connection: $conserver");
                    363: 	$client = $server->accept();
                    364: 	if($DEBUG) {
                    365: 	    &logthis("New client fd = ".$client->fileno."\n");
                    366: 	}
                    367: 	$servers{$client->fileno} = $client;
                    368: 	nonblock($client);
1.46      albertel  369: 	$client->sockopt(SO_KEEPALIVE, 1);# Enable monitoring of
                    370: 	                                  # connection liveness.
1.37      foxr      371:     }
                    372:     HandleInput($infdset, \%servers, \%inbuffer, \%outbuffer, \%ready);
                    373:     HandleOutput($outfdset, \%servers, \%outbuffer, \%inbuffer,
                    374: 		 \%ready);
                    375: # ------------- (a lost lond connection is detected and handled inside HandleOutput)
1.15      www       376: 
1.37      foxr      377: }
                    378:    
1.1       albertel  379:     }
                    380: }
1.25      albertel  381: 
1.1       albertel  382: # ------------------------------------------------------- End of make_new_child
                    383: 
1.35      foxr      384: 
                    385: #
                    386: #  Make a vector of file descriptors to wait for in a select.
                    387: #  parameters:
                    388: #     \%fdhash  -reference to a hash whose keys are file descriptor numbers.
                    389: #                We only care about the indices, not the values.
                    390: #  A select vector is created with one bit set per key of the hash.
                    391: #  Returns that bit vector (an empty string when the hash has no keys).
                    392: sub MakeFileVector
                    393: {
                    394:     my $fdhash = shift;
                    395:     my $selvar = "";
                    396: 
1.37      foxr      397:     foreach $socket (keys %$fdhash) {
                    398: 	if($DEBUG) {
                    399: 	    &logthis("Adding  ".$socket.
                    400: 		     "to select vector. (client)\n");
                    401: 	}
                    402: 	vec($selvar, $socket, 1) = 1;	# the key itself is the bit offset
1.35      foxr      403:     }
                    404:     return $selvar;
                    405: }
                    406: 
                    407: 
                    408: #
                    409: #  HandleOutput:
                    410: #    Sends pending output on the buffered sockets whose file descriptors
                    411: #    are ready to be written.
                    412: #  Parameters:
1.37      foxr      413: #    $selvector - Vector of file descriptors which are writable.
1.35      foxr      414: #    \%sockets  - Hash of client sockets indexed by file descriptor number.
                    415: #    \%buffers  - Reference to a hash containing output buffers.
                    416: #                 Indexed the same way as \%sockets.  The file descriptors of some
                    417: #                 of those sockets will be present in $selvector.
                    418: #                 For each one of those, we will attempt to write the output
                    419: #                 buffer to the socket.  Note that we will assume that
                    420: #                 the sockets are being run in non blocking mode.
                    421: #   \%inbufs    - Reference to hash containing input buffers.
                    422: #   \%readys    - Reference to hash of completed requests per socket; the
                    423: #                 entry is dropped here when a socket has to be closed.
                    424: #  A buffer equal to "con_lost\n" marks a lost lond connection and triggers a reopen.
                    425: sub HandleOutput
                    426: {
                    427:     my $selvector = shift;
                    428:     my $sockets   = shift;
                    429:     my $buffers   = shift;
                    430:     my $inbufs    = shift;
                    431:     my $readys    = shift;
1.37      foxr      432:     my $sock;
1.35      foxr      433: 
1.37      foxr      434:     if($DEBUG) {
                    435: 	&logthis("HandleOutput entered\n");
                    436:     }
                    437: 
                    438:     foreach $sock (keys %$sockets) {
1.35      foxr      439: 	my $socket = $sockets->{$sock};
1.37      foxr      440: 	if(vec($selvector, $sock, 1)) { # $socket is writable.
                    441: 	    if($DEBUG) {
                    442: 		&logthis("Sending $buffers->{$sock} \n");
                    443: 	    }
                    444: 	    my $rv = $socket->send($buffers->{$sock}, 0);
1.35      foxr      445: 	    $errno = $!;	# capture errno before anything else can change it
                    446: 	    unless ($buffers->{$sock} eq "con_lost\n") {
                    447: 		unless (defined $rv) { # Write failed... could be EINTR
                    448: 		    unless ($errno == POSIX::EINTR) {
                    449: 			&logthis("Write failed on writable socket");
                    450: 		    }		# EINTR is not an error .. just retry.
                    451: 		    next;
                    452: 		}
                    453: 		if( ($rv == length $buffers->{$sock})    ||
                    454: 		    ($errno == POSIX::EWOULDBLOCK)       ||
                    455: 		    ($errno == POSIX::EAGAIN)            || # same as above.
                    456: 		    ($errno == POSIX::EINTR)             || # signal during IO
                    457: 		    ($errno == 0)) {
                    458: 		    substr($buffers->{$sock}, 0, $rv)=""; # delete written part
                    459: 		    delete $buffers->{$sock} unless length $buffers->{$sock};
                    460: 		} else {
                    461: 		    # For some reason the write failed with an error code
                    462: 		    # we didn't look for.  Shutdown the socket.
                    463: 		    &logthis("Unable to write data with ".$errno.": ".
                    464: 			     "Dropping data: ".length($buffers->{$sock}).
                    465: 			     ", $rv");
                    466: 		    #
                    467: 		    # kill off the buffers in the hash, and forget the socket
                    468: 		    # itself so its closed descriptor is never selected on again:
                    469: 		    delete $buffers->{$sock};
                    470: 		    delete $inbufs->{$sock};
                    471: 		    delete $readys->{$sock};
                    472: 		    delete $sockets->{$sock}; # safe: keys() list was built before the loop
1.37      foxr      473: 		    close($socket); # Close the client socket.
1.35      foxr      474: 		    next;
                    475: 		}
                    476: 	    } else {		# Kludgy way to mark lond connection lost.
                    477: 		&logthis(
                    478: 		 "<font color=red>CRITICAL lond connection lost</font>");
                    479: 		status("Connection lost");
                    480: 		$remotesock->shutdown(2);
                    481: 		&logthis("Attempting to open a new connection");
1.37      foxr      482: 		&openremote($conserver);
1.35      foxr      483: 	    }
                    484: 		   
                    485: 	}
                    486:     }
                    487: 
                    488: }
                    489: #
                    490: #   HandleInput - Deals with input on client sockets.
                    491: #                 Each socket has an associated input buffer.
                    492: #                 For each readable socket, the currently available
                    493: #                 data is appended to this buffer.
                    494: #                 If necessary, the buffer is created.
                    495: #                 On a failed or empty read the client is closed and forgotten.
                    496: #  Parameters:
                    497: #     $selvec   - Vector of readable file descriptors.
                    498: #     \%sockets - Refers to the hash of sockets indexed by file descriptor number.
                    499: #                 Each of these may or may not have its fd bit set 
                    500: #                 in the $selvec.
                    501: #     \%ibufs   - Refers to the hash of input buffers, indexed like \%sockets.
                    502: #     \%obufs   - Hash of output buffers, indexed like \%sockets.
                    503: #     \%ready   - Hash holding, per socket, the list of complete
                    504: #                 (newline-terminated) requests read so far.
                    505: sub HandleInput 
                    506: {
                    507: 
                    508:     # Marshall the parameters.   Note that the hashes are actually
                    509:     # references not values.
                    510: 
                    511:     my $selvec  = shift;
                    512:     my $sockets = shift;
                    513:     my $ibufs   = shift;
                    514:     my $obufs   = shift;
                    515:     my $ready   = shift;
1.37      foxr      516:     my $sock;
1.35      foxr      517: 
1.38      foxr      518:     if($DEBUG) {
                    519: 	&logthis("Entered HandleInput\n");
                    520:     }
1.37      foxr      521:     foreach $sock (keys %$sockets) {
1.35      foxr      522: 	my $socket = $sockets->{$sock};
1.37      foxr      523: 	if(vec($selvec, $sock, 1)) { # Socket which is readable.
1.35      foxr      524: 
                    525: 	    #  Attempt to read the data and do error management.
                    526: 	    my $data = '';
1.37      foxr      527: 	    my $rv = $socket->recv($data, POSIX::BUFSIZ, 0);
                    528: 	    if($DEBUG) {
                    529: 		&logthis("Received $data from socket");
                    530: 	    }
1.35      foxr      531: 	    unless (defined($rv) && length $data) {
                    532: 
                    533: 		# recv failed or returned no data: treat it as a disconnect from the peer.
                    534: 
                    535: 		delete $sockets->{$sock};
                    536: 		delete $ibufs->{$sock};
                    537: 		delete $obufs->{$sock};
                    538: 		delete $ready->{$sock};
                    539: 
                    540: 		status("Idle");
1.37      foxr      541: 		close $socket;
1.35      foxr      542: 		next;
                    543: 	    }
                    544: 	    #  Append the read data to the input buffer. If the buffer
                    545: 	    # now contains a \n the request is complete and we can 
                    546: 	    # queue it in the $ready hash (one request for each \n.)
                    547: 
                    548: 	    $ibufs->{$sock} .= $data;
                    549: 	    while($ibufs->{$sock} =~ s/(.*\n)//) {
                    550: 		push(@{$ready->{$sock}}, $1);
                    551: 	    }
                    552: 	    
                    553: 	}
                    554:     }
                    555:     #  Now handle any requests which are ready:
                    556:     #  NOTE(review): this iterates the global %ready, not the $ready parameter.
                    557:     foreach $client (keys %ready) {
                    558: 	handle($client);
1.36      foxr      559:     }
                    560: }
                    561: 
                    562: # DoSelect:  does a select with no timeout.  On signal (errno == EINTR), 
                    563: #            the select is retried until there are items in the returned
                    564: #            vectors.  
                    565: #
                    566: # Parameters:
                    567: #   \$readvec   - Reference to a vector of file descriptors to 
                    568: #                 check for readability.
                    569: #   \$writevec  - Reference to a vector of file descriptors to check for
                    570: #                 writability.
                    571: #  On exit, the referents are modified with vectors indicating which 
                    572: #  file handles are readable/writable.
                    573: #  Dies with "Select failed" on any select error other than EINTR.
                    574: sub DoSelect {
                    575:     my $readvec = shift;
                    576:     my $writevec= shift;
                    577:     my $outs;
                    578:     my $ins;
                    579:     # select overwrites its arguments, so each pass works on fresh copies.
                    580:     while (1) {
1.37      foxr      581: 	my $nfds = select( $ins = $$readvec, $outs = $$writevec, undef, undef);
                    582: 	if($nfds > 0) {	# select returns -1 on error, which is true in Perl
                    583: 	    if($DEBUG) {
                    584: 		&logthis("select exited with ".$nfds." fds\n");
                    585: 		&logthis("ins = ".unpack("b*",$ins).
                    586: 			 " readvec = ".unpack("b*",$$readvec)."\n");
                    587: 		&logthis("outs = ".unpack("b*",$outs).
                    588: 			 " writevec = ".unpack("b*",$$writevec)."\n");
                    589: 	    }
1.36      foxr      590: 	    $$readvec  = $ins;
                    591: 	    $$writevec = $outs;
                    592: 	    return;
                    593: 	} else {
1.37      foxr      594: 	    if($DEBUG) {
                    595: 		&logthis("Select exited with no bits set in mask\n");
                    596: 	    }
1.36      foxr      597: 	    die "Select failed" unless $! == EINTR;	# EINTR: loop and select again
                    598: 	}
1.35      foxr      599:     }
                    600: }
                    601: 
# handle($client) deals with all pending requests for $client.
#
# Requests are taken from $ready{$client}.  For each one the reply read
# from lond (or "con_lost\n" when there was no reply) is appended to
# $outbuffer{$client}.  The $ready{$client} entry is deleted afterwards.
#
# A request of "close_connection_exit" shuts down the remote socket,
# closes the server socket and exits the process.
#
sub handle {
    my $client = shift;

    foreach my $request (@{$ready{$client}}) {
# ============================================================= Process request
        # $request is the text of the request
        # put text of reply into $outbuffer{$client}
        chomp($request);
        if ($DEBUG) {
            &logthis("<font color=green> Request $request processing starts</font>");
        }
# ------------------------------------------------------------ Is this the end?
        # The request was chomped above, so it must be compared without a
        # trailing newline; with "\n" in the literal this never matched.
        if ($request eq "close_connection_exit") {
            &status("Request close connection");
            &logthis(
     "<font color=red>CRITICAL: Request Close Connection ... exiting</font>");
            $remotesock->shutdown(2);
            $server->close();
            exit;
        }
# -----------------------------------------------------------------------------
        if ($request =~ /^encrypt\:/) {
            my $cmd = $request;
            $cmd =~ s/^encrypt\://;
            chomp($cmd);
            my $cmdlength = length($cmd);
            # Pad so that every 8-byte slice taken below is full length.
            $cmd .= "         ";
            my $encrequest = '';
            # NOTE(review): "<=" emits one extra all-padding block when the
            # length is a multiple of 8.  Left unchanged because the peer's
            # decoder is not visible from here -- confirm before altering.
            for (my $encidx = 0; $encidx <= $cmdlength; $encidx += 8) {
                $encrequest .=
                    unpack("H16", $cipher->encrypt(substr($cmd, $encidx, 8)));
            }
            $request = "enc:$cmdlength:$encrequest";
        }
# --------------------------------------------------------------- Main exchange
        # $answer is deliberately the package global that londtransaction
        # also writes, as in the original code.
        $answer = londtransaction($remotesock, $request, 300);

        if ($DEBUG) {
            &logthis("<font color=green> Request data exchange complete</font>");
        }
        if ($@ =~ /timeout/) {
            $answer = '';
            &logthis(
                     "<font color=red>CRITICAL: Timeout: $request</font>");
        }

        if ($answer) {
            if ($answer =~ /^enc/) {
                # Encrypted reply: enc:<plain length>:<hex ciphertext>
                my ($cmd, $cmdlength, $encinput) = split(/:/, $answer);
                chomp($encinput);
                $answer = '';
                for (my $encidx = 0; $encidx < length($encinput); $encidx += 16) {
                    $answer .= $cipher->decrypt(
                        pack("H16", substr($encinput, $encidx, 16))
                    );
                }
                # Drop the block padding and restore the line terminator.
                $answer = substr($answer, 0, $cmdlength);
                $answer .= "\n";
            }
            if ($DEBUG) {
                &logthis("sending $answer to client\n");
            }
            $outbuffer{$client} .= $answer;
        } else {
            $outbuffer{$client} .= "con_lost\n";
        }

        &status("Completed: $request");
        if ($DEBUG) {
            &logthis("<font color=green> Request processing complete</font>");
        }
# ===================================================== Done processing request
    }
    delete $ready{$client};
    if ($DEBUG) {
        &logthis("<font color=green> requests for child handled</font>");
    }
}
                    686: # ---------------------------------------------------------- End make_new_child
                    687: 
                    # nonblock($socket) switches $socket to non-blocking mode by adding
                    # O_NONBLOCK to its current file status flags.  Dies if the flags
                    # cannot be read or written.
                    sub nonblock {
                        my ($handle) = @_;

                        my $current = fcntl($handle, F_GETFL, 0)
                            or die "Can't get flags for socket: $!\n";

                        fcntl($handle, F_SETFL, $current | O_NONBLOCK)
                            or die "Can't make socket nonblocking: $!\n";
                    }
                    699: 
                    700: 
                    # openremote($conserver) opens the TCP connection to the lond on
                    # $conserver and runs the start-up dialogue on it: init handshake,
                    # sethost, version exchange, pong and cipher key set-up.
                    #
                    # On success the globals $remotesock and $cipher are set (and
                    # $remoteVERSION when the peer reports one).  When the connection,
                    # the init handshake or the cipher set-up fails, the process sleeps
                    # for a random 120-359 seconds and exits; an init timeout exits
                    # immediately.
                    sub openremote {
                    # ---------------------------------------------------- Client to network server

                        my $conserver = shift;

                        &status("Opening TCP $conserver");
                        my $st = 120+int(rand(240)); # Sleep before exiting if the connect fails

                        $remotesock = IO::Socket::INET->new(PeerAddr => $hostip{$conserver},
                                                            PeerPort => $perlvar{'londPort'},
                                                            Proto    => "tcp",
                                                            Type     => SOCK_STREAM);
                        unless ($remotesock) {
                            &logthis(
                                     "<font color=blue>WARNING: Couldn't connect to $conserver ($st secs): </font>");
                            sleep($st);
                            exit;
                        }
                    # ----------------------------------------------------------------- Init dialog

                        &logthis("<font color=green>INFO Connected to $conserver, initing</font>");
                        &status("Init dialogue: $conserver");

                        # The reply to "init" is sent straight back as the second request.
                        $answer = londtransaction($remotesock, "init", 60);
                        chomp($answer);
                        $answer = londtransaction($remotesock, $answer, 60);
                        chomp($answer);

                        if ($@ =~ /timeout/) {
                            &logthis("Timed out during init.. exiting");
                            exit;
                        }

                        if ($answer ne 'ok') {
                            &logthis("Init reply: >$answer<");
                            my $st = 120+int(rand(240));
                            &logthis("<font color=blue>WARNING: Init failed ($st secs)</font>");
                            sleep($st);
                            exit;
                        }

                        $answer = londtransaction($remotesock, "sethost:$conserver", 60);
                        chomp($answer);
                        if ($answer ne 'ok') {
                            &logthis('<font color="blue">WARNING: unable to specify remote host'.
                                     $answer.'</font>');
                        }

                        $answer = londtransaction($remotesock, "version:$VERSION", 60);
                        chomp($answer);
                        if ($answer =~ /^version:/) {
                            $remoteVERSION = (split(/:/, $answer))[1];
                        } else {
                            &logthis('<font color="blue">WARNING: request remote version failed :'.
                                     $answer.': my version is :'.$VERSION.':</font>');
                        }

                        sleep 5;
                        &status("Ponging $conserver");
                        print $remotesock "pong\n";
                        $answer = <$remotesock>;
                        # A closed connection yields undef; treat it as an empty reply so
                        # the mismatch is logged below instead of raising warnings.
                        $answer = '' unless defined $answer;
                        chomp($answer);
                        # \Q..\E: the host id is data, not a pattern.
                        if ($answer !~ /^\Q$conserver\E/) {
                            &logthis("Pong reply: >$answer<");
                        }
                    # ----------------------------------------------------------- Initialize cipher

                        &status("Initialize cipher");
                        print $remotesock "ekey\n";
                        my $buildkey = <$remotesock>;
                        unless (defined $buildkey) {
                            # Without the peer's key material the key would be derived
                            # from the two host ids alone; refuse to continue.
                            my $st = 120+int(rand(240));
                            &logthis("<font color=blue>WARNING: ".
                                     "No ekey reply from $conserver ($st secs)!</font>");
                            sleep($st);
                            exit;
                        }
                        # Key = host ids mapped onto hex digits, interleaved with the
                        # peer's key material, truncated to 32 hex characters.
                        my $key = $conserver.$perlvar{'lonHostID'};
                        $key =~ tr/a-z/A-Z/;
                        $key =~ tr/G-P/0-9/;
                        $key =~ tr/Q-Z/0-9/;
                        $key = $key.$buildkey.$key.$buildkey.$key.$buildkey;
                        $key = substr($key, 0, 32);
                        my $cipherkey = pack("H32", $key);
                        if ($cipher = IDEA->new($cipherkey)) {
                            &logthis("Secure connection initialized");
                        } else {
                            my $st = 120+int(rand(240));
                            &logthis("<font color=blue>WARNING: ".
                                     "Could not establish secure connection ($st secs)!</font>");
                            sleep($st);
                            exit;
                        }
                        &logthis("<font color=green> Remote open success </font>");
                    }
1.30      www       791: 
                    792: 
                    793: 
                    # Records the exception or signal that is ending this process in the
                    # log, then dies.  The QUIT and __DIE__ handlers are reset to their
                    # defaults first.
                    sub catchexception {
                        my $signal = shift;

                        @SIG{qw(QUIT __DIE__)} = ('DEFAULT', 'DEFAULT');
                        chomp($signal);

                        my $report = "<font color=red>CRITICAL: "
                                   . "ABNORMAL EXIT. Child $$ for server [$wasserver] died through "
                                   . "\"$signal\" with parameter </font>";
                        &logthis($report);

                        die("Signal abend");
                    }
                    805: 
                    806: # -------------------------------------- Routines to see if other box available
                    807: 
1.32      foxr      808: #sub online {
                    809: #    my $host=shift;
                    810: #    &status("Pinging ".$host);
                    811: #    my $p=Net::Ping->new("tcp",20);
                    812: #    my $online=$p->ping("$host");
                    813: #    $p->close();
                    814: #    undef ($p);
                    815: #    return $online;
                    816: #}
1.30      www       817: 
                    # connected($local, $remote) asks the web server of host $local, via
                    # its /cgi-bin/ping.pl script, about host $remote.
                    #
                    # Returns 'ok' when the first line of the reply (non-word characters
                    # removed) equals $remote, otherwise that reply text.  Returns
                    # 'local_unknown' / 'remote_unknown' when a host id has no entry in
                    # %hostname, and 'local_error' when the HTTP request fails.
                    sub connected {
                        my ($local, $remote) = @_;
                        &status("Checking connection $local to $remote");

                        # Host ids are reduced to word characters before use.
                        s/\W//g for ($local, $remote);

                        return 'local_unknown'  unless $hostname{$local};
                        return 'remote_unknown' unless $hostname{$remote};

                        # (the former &online() ping check of $local is disabled)

                        my $url      = "http://".$hostname{$local}.'/cgi-bin/ping.pl?'.$remote;
                        my $agent    = LWP::UserAgent->new;
                        my $response = $agent->request(HTTP::Request->new('GET', $url));

                        return 'local_error' unless $response->is_success;

                        # Only the first line of the body is significant.
                        my $reply = (split("\n", $response->content))[0];
                        $reply =~ s/\W//g;

                        return $reply eq $remote ? 'ok' : $reply;
                    }
                    844: 
                    845: 
                    846: 
                    # For every entry in %children: send 'exit' to that child's server
                    # through subreply, log the reply, then send SIGINT to the child.
                    # $wasserver is left set to the server handled last.
                    sub hangup {
                        for my $pid (keys %children) {
                            $wasserver = $children{$pid};
                            &status("Closing $wasserver");
                            my $reply = &subreply('exit', $wasserver);
                            &logthis('Closing '.$wasserver.': '.$reply);
                            &status("Kill PID $pid for $wasserver");
                            kill('INT', $pid);
                        }
                    }
                    856: 
                    # Signal handler for SIGINT: hang up all children, remove the pid
                    # file, log the shutdown and exit.
                    sub HUNTSMAN {
                        # We are about to kill the children ourselves, so their exit
                        # notifications are ignored for the duration of this handler.
                        local $SIG{CHLD} = 'IGNORE';

                        &hangup();

                        my $pidfile = $perlvar{'lonDaemons'}."/logs/lonc.pid";
                        unlink($pidfile);

                        &logthis("<font color=red>CRITICAL: Shutting down</font>");
                        exit;
                    }
                    865: 
                    # Signal handler for SIGHUP: hang up all children, log the restart,
                    # remove the pid file and re-exec lonc from the lonDaemons directory.
                    sub HUPSMAN {
                        # We are about to kill the children ourselves, so their exit
                        # notifications are ignored for the duration of this handler.
                        local $SIG{CHLD} = 'IGNORE';

                        &hangup();
                        &logthis("<font color=red>CRITICAL: Restarting</font>");

                        my $daemondir = $perlvar{'lonDaemons'};
                        unlink($daemondir."/logs/lonc.pid");
                        exec($daemondir."/lonc");
                    }
                    874: 
                    # Starts a fresh status file, then sends USR1 to every child in pid
                    # order, one per second.  A child that cannot be signalled is
                    # reported as dead in the log and the status file.
                    sub checkchildren {
                        &initnewstatus();
                        &logstatus();
                        &logthis('Going to check on the children');

                        for my $pid (sort keys %children) {
                            sleep 1;
                            next if kill('USR1', $pid);

                            &logthis('<font color=red>CRITICAL: Child '.$pid.' is dead</font>');
                            # logstatus takes no parameters; the argument is kept
                            # exactly as the original call passed it.
                            &logstatus($$.' is dead');
                        }
                    }
                    887: 
                    # Handler that re-establishes connections (logged as "USR1").
                    #
                    # %childatt must not simply be cleared, or all memory of the children
                    # would be lost.  Instead every counter is zeroed individually, and a
                    # new child is made for each host whose counter had reached
                    # $childmaxattempts.  Afterwards the children are checked.
                    sub USRMAN {
                        &logthis("USR1: Trying to establish connections again");

                        for my $host (keys %childatt) {
                            my $gave_up = ($childatt{$host} >= $childmaxattempts);

                            # Regardless of the outcome, the counter starts over.
                            $childatt{$host} = 0;
                            next unless $gave_up;

                            &logthis("<font color=green>INFO: Restarting child for server: "
                                     .$host."</font>\n");
                            make_new_child($host);
                        }

                        &checkchildren();           # See if any children are still dead...
                    }
                    910: 
                    911: # -------------------------------------------------- Non-critical communication
                    # subreply($cmd, $server) sends $cmd to $server through that
                    # server's domain socket in lonSockDir and returns the reply.
                    #
                    # Returns 'self_reply' when $server is this host, and 'con_lost' when
                    # the socket cannot be opened or no reply is received.
                    sub subreply {
                        my ($cmd, $server) = @_;

                        return 'self_reply' if $server eq $perlvar{'lonHostID'};

                        my $peerfile = "$perlvar{'lonSockDir'}/$server";
                        my $sclient  = IO::Socket::UNIX->new(Peer    => "$peerfile",
                                                             Type    => SOCK_STREAM,
                                                             Timeout => 10)
                            or return "con_lost";

                        my $answer = londtransaction($sclient, $cmd, 10);
                        if ((!$answer) || ($@ =~ /timeout/)) {
                            $answer = "con_lost";
                        }

                        # Put the handlers back the way the rest of the daemon expects.
                        $SIG{ALRM}    = 'DEFAULT';
                        $SIG{__DIE__} = \&catchexception;

                        return $answer;
                    }
                    931: 
                    932: # --------------------------------------------------------------------- Logging
                    933: 
                    # logthis($message) appends a time-stamped line containing the pid,
                    # $conserver, $status and $message to logs/lonc.log under lonDaemons,
                    # and remembers the stamped message in $lastlog.
                    #
                    # Logging is best effort: when the log file cannot be opened the
                    # message is dropped instead of dying on an undefined filehandle
                    # (this routine is called while handling other failures).
                    sub logthis {
                        my $message = shift;
                        my $execdir = $perlvar{'lonDaemons'};
                        my $local   = localtime(time);

                        $lastlog = $local.': '.$message;

                        my $fh = IO::File->new(">>$execdir/logs/lonc.log")
                            or return;
                        print $fh "$local ($$) [$conserver] [$status]: $message\n";
                    }
                    943: 
#--------------------------------------  londtransaction:
#
#  Performs a transaction with lond with timeout support.
#    result = londtransaction(socket,request,timeout)
#
#  Parameters:
#    $socket  - handle the request is written to and the reply read from.
#    $request - request text; a newline is appended when it is sent.
#    $tmo     - seconds allowed for the send and, separately, for the read.
#
#  Returns the reply line exactly as read (not chomped; undef at end of
#  file).  Dies when either phase times out or when the signal mask cannot
#  be changed.
#
#  Side effects: the reply is also left in the global $answer; on return
#  SIGALRM is reset to DEFAULT and $SIG{__DIE__} is set to catchexception.
#
sub londtransaction {
    my ($socket, $request, $tmo) = @_;

    if ($DEBUG) {
        &logthis("londtransaction request: $request");
    }

    # Set the signal handlers: ALRM for timeout and disable the others.

    $SIG{ALRM}    = sub { die "timeout" };
    $SIG{__DIE__} = 'DEFAULT';

    # Disable all but alarm so that only that can interrupt the
    # send / receive.
    #
    # The set must be built from signal numbers.  Bare words such as QUIT
    # evaluate to 0 and leave the set empty, so nothing was ever blocked.
    my $sigset = POSIX::SigSet->new(POSIX::SIGQUIT(), POSIX::SIGUSR1(),
                                    POSIX::SIGHUP(),  POSIX::SIGINT(),
                                    POSIX::SIGTERM());
    my $priorsigs = POSIX::SigSet->new;
    unless (defined sigprocmask(SIG_BLOCK, $sigset, $priorsigs)) {
        &logthis("<font color=red> CRITICAL -- londtransaction ".
                 "failed to block signals </font>");
        die "could not block signals in londtransaction";
    }
    $answer = '';
    #
    #  Send request to lond.
    #
    eval {
        alarm($tmo);
        print $socket "$request\n";
        alarm(0);
    };
    if ($@ =~ /timeout/) {
        # Put the caller's mask back before giving up, in case the die
        # below is trapped further up.
        sigprocmask(POSIX::SIG_SETMASK(), $priorsigs);
        &logthis("lonc - suiciding on send Timeout");
        die("lonc - suiciding on send Timeout");
    }
    #
    #  The request went out; try for the response.
    #
    eval {
        alarm($tmo);
        $answer = <$socket>;
        if ($DEBUG) {
            &logthis("Received $answer in londtransaction");
        }
        alarm(0);
    };
    if ($@ =~ /timeout/) {
        sigprocmask(POSIX::SIG_SETMASK(), $priorsigs);
        &logthis("lonc - suiciding on read Timeout");
        die("lonc - suiciding on read Timeout");
    }
    #
    # Restore the initial sigmask set.  SIG_SETMASK reinstates the saved
    # mask; SIG_UNBLOCK with the saved mask would only unblock the signals
    # that were blocked beforehand and leave ours blocked.
    #
    unless (defined sigprocmask(POSIX::SIG_SETMASK(), $priorsigs)) {
        &logthis("<font color=red> CRITICAL -- londtransaction ".
                 "failed to re-enable signal processing. </font>");
        die "londtransaction failed to re-enable signals";
    }
    #
    # go back to the prior handler set.
    #
    $SIG{ALRM}    = 'DEFAULT';
    $SIG{__DIE__} = \&catchexception;

    if ($DEBUG) {
        &logthis("Returning $answer in londtransaction");
    }
    return $answer;
}
1.30      www      1021: 
                   # logperm($message) appends "<epoch>:<message>:<local time>" to
                   # logs/lonnet.perm.log under lonDaemons.  Best effort: nothing is
                   # written, and nothing dies, when the file cannot be opened.
                   sub logperm {
                       my $message = shift;
                       my $execdir = $perlvar{'lonDaemons'};
                       my $now     = time;
                       my $local   = localtime($now);

                       my $fh = IO::File->new(">>$execdir/logs/lonnet.perm.log")
                           or return;
                       print $fh "$now:$message:$local\n";
                   }
                   1030: # ------------------------------------------------------------------ Log status
                   1031: 
                   # Appends this process's pid, $conserver, $status and $lastlog as one
                   # tab-separated line to lon-status/loncstatus.txt under lonDocRoot.
                   # Takes no parameters.  Best effort: nothing is written, and nothing
                   # dies, when the file cannot be opened.
                   sub logstatus {
                       my $docdir = $perlvar{'lonDocRoot'};

                       my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt")
                           or return;
                       print $fh $$."\t".$conserver."\t".$status."\t".$lastlog."\n";
                   }
                   1037: 
                   # Starts a new lon-status/loncstatus.txt under lonDocRoot, replacing
                   # any previous content with a header line naming the local time and
                   # this (parent) pid.  Best effort: nothing is written, and nothing
                   # dies, when the file cannot be opened.
                   sub initnewstatus {
                       my $docdir = $perlvar{'lonDocRoot'};
                       my $local  = localtime(time);

                       my $fh = IO::File->new(">$docdir/lon-status/loncstatus.txt")
                           or return;
                       print $fh "LONC status $local - parent $$\n\n";
                   }
                   1045: 
                   1046: # -------------------------------------------------------------- Status setting
                   1047: 
                   # status($what) records what this process is doing: the time-stamped
                   # text goes into $status, and the process title ($0) is set to
                   # "lonc: <what> <time>".
                   sub status {
                       my ($what) = @_;
                       my $stamp = localtime(time);

                       $status = "$stamp: $what";
                       $0      = "lonc: $what $stamp";
                   }
                   1055: 
                   1056: 
1.1       albertel 1057: 
1.23      harris41 1058: # ----------------------------------- POD (plain old documentation, CPAN style)
                   1059: 
                   1060: =head1 NAME
                   1061: 
                   1062: lonc - LON TCP-Client Domain-Socket-Server providing persistent TCP connections to the other servers in the network.
                   1063: 
                   1064: =head1 SYNOPSIS
                   1065: 
1.31      harris41 1066: Usage: B<lonc>
                   1067: 
1.23      harris41 1068: Should only be run as user=www.  This is a command-line script which
1.31      harris41 1069: is invoked by B<loncron>.  There is no expectation that a typical user
                   1070: will manually start B<lonc> from the command-line.  (In other words,
                   1071: DO NOT START B<lonc> YOURSELF.)
1.23      harris41 1072: 
1.51      bowersj2 1073: =head1 OVERVIEW
                   1074: 
                   1075: =head2 Physical Overview
                   1076: 
                   1077: =begin latex 
                   1078: 
                   1079: \begin{figure} 
                   1080:   \begin{center}
                   1081:     \includegraphics[width=0.65\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram}
                   1082:   \end{center}
                   1083:   \caption{\label{Overview_Of_Network}Overview of Network}
                   1084: \end{figure}
                   1085: 
                   1086: =end latex
                   1087: 
                   1088: Physically, the Network consists of relatively inexpensive
                   1089: upper-PC-class server machines which are linked through the commodity
                   1090: internet in a load-balancing, dynamically content-replicating and
                   1091: failover-secure way.
                   1092: 
                   1093: All machines in the Network are connected with each other through
                   1094: two-way persistent TCP/IP connections. Clients (B<B>, B<F>, B<G> and
                   1095: B<H> in Fig. Overview of Network) connect to the servers via standard
                   1096: HTTP. There are two classes of servers, B<Library Servers> (B<A> and
                   1097: B<E> in Fig. Overview of Network) and B<Access Servers> (B<C>, B<D>,
                   1098: B<I> and B<J> in Fig. Overview of Network).
                   1099: 
                   1100: B<Library Servers> X<library server> X<server, library> are used to
                   1101: store all personal records of a set of users, and are responsible for
                   1102: their initial authentication when a session is opened on any server in
                   1103: the Network. For Authors, Library Servers also host their
                   1104: construction area and the authoritative copy of the current and
                   1105: previous versions of every resource that was published by that
                   1106: author. Library servers can be used as backups to host sessions when
                   1107: all access servers in the Network are overloaded. Otherwise, for
                   1108: learners, access servers are used to host the sessions. Library
                   1109: servers need to have strong I/O capabilities.
                   1110: 
                   1111: B<Access Servers> X<access server> X<server, access> provide LON-CAPA
                   1112: service to users, using the library servers as their data source. The
                   1113: network is designed so that the number of concurrent sessions can be
                   1114: increased over a wide range by simply adding additional access servers
                   1115: before having to add additional library servers. Preliminary tests
                   1116: showed that a library server could handle up to 10 access servers
                   1117: fully parallel. Access servers can generally be cheaper hardware than
                   1118: library servers require.
                   1119: 
                   1120: The Network is divided into B<domains> X<domain>, which are logical
                   1121: boundaries between participating institutions. These domains can be
                   1122: used to limit the flow of personal user information across the
                   1123: network, set access privileges and enforce royalty schemes. LON-CAPA
                   1124: domains bear no relationship to any other domain, including domains
                   1125: used by the DNS system; LON-CAPA domains may be freely configured in
                   1126: any manner that suits your use pattern.
                   1127: 
                   1128: =head2 Example Transactions
                   1129: 
                   1130: Fig. Overview of Network also depicts examples for several kinds of
                   1131: transactions conducted across the Network.
                   1132: 
                   1133: An instructor at client B<B> modifies and publishes a resource on her
                   1134: Home Server B<A>. Server B<A> has a record of all server machines
                   1135: currently subscribed to this resource, and replicates it to servers
                   1136: B<D> and B<I>. However, server B<D> is currently offline, so the
                   1137: update notification gets buffered on B<A> until B<D> comes online
                   1138: again. Servers B<C> and B<J> are currently not subscribed to this
                   1139: resource.
                   1140: 
                   1141: Learners B<F> and B<G> have open sessions on server B<I>, and the new
                   1142: resource is immediately available to them.
                   1143: 
                   1144: Learner B<H> tries to connect to server B<I> for a new session,
                   1145: however, the machine is not reachable, so he connects to another
                   1146: Access Server B<J> instead. This server currently does not have all
                   1147: necessary resources locally present to host learner B<H>, but
                   1148: subscribes to them and replicates them as they are accessed by B<H>.
                   1149: 
                   1150: Learner B<H> solves a problem on server B<J>. Library Server B<E> is
                   1151: B<H>'s Home Server, so this information gets forwarded to B<E>, where
                   1152: the records of H are updated.
                   1153: 
1.52    ! bowersj2 1154: =head2 lond, lonc, and lonnet
1.51      bowersj2 1155: 
                   1156: =begin latex
                   1157: 
                   1158: \begin{figure}
1.52    ! bowersj2 1159: \includegraphics[width=0.65\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram2}
1.51      bowersj2 1160:   \caption{\label{Overview_Of_Network_Communication}Overview of
                   1161: Network Communication} \end{figure}
                   1162: 
                   1163: =end latex
                   1164: 
                   1165: Fig. Overview of Network Communication elaborates on the details of
                   1166: this network infrastructure. It depicts three servers (B<A>, B<B> and
                   1167: B<C>) and a client who has a session on server B<C>.
                   1168: 
                   1169: As B<C> accesses different resources in the system, different
                   1170: handlers, which are incorporated as modules into the child processes
                   1171: of the web server software, process these requests.
                   1172: 
                   1173: Our current implementation uses C<mod_perl> inside of the Apache web
                   1174: server software. As an example, server B<C> currently has four active
                   1175: web server software child processes. The chain of handlers dealing
                   1176: with a certain resource is determined by both the server content
                   1177: resource area (see below) and the MIME type, which in turn is
                   1178: determined by the URL extension. For most URL structures, both an
                   1179: authentication handler and a content handler are registered.
                   1180: 
                   1181: Handlers use a common library C<lonnet> X<lonnet> to interact with
                   1182: both locally present temporary session data and data across the server
                   1183: network. For example, lonnet provides routines for finding the home
                   1184: server of a user, finding the server with the lowest loadavg, sending
                   1185: simple command-reply sequences, and sending critical messages such as
                   1186: a homework completion, etc. For a non-critical message, the routines
                   1187: reply with a simple "connection lost" if the message could not be
                   1188: delivered. For critical messages, lonnet tries to re-establish
                   1189: connections, re-send the command, etc. If no valid reply could be
                   1190: received, it answers "connection deferred" and stores the message in
                   1191: buffer space to be sent at a later point in time. Also, failed
                   1192: critical messages are logged.
                   1193: 
                   1194: The interface between C<lonnet> and the Network is established by a
                   1195: multiplexed UNIX domain socket, denoted B<DS> in Fig. Overview of
                   1196: Network Communication. The rationale behind this rather involved
                   1197: architecture is that httpd processes (Apache children) dynamically
                   1198: come and go on the timescale of minutes, based on workload and number
                   1199: of processed requests. Over the lifetime of an httpd child, however,
                   1200: it has to establish several hundred connections to several different
                   1201: servers in the Network.
                   1202: 
                   1203: On the other hand, establishing a TCP/IP connection is resource
                   1204: consuming for both ends of the line, and to optimize this connectivity
                   1205: between different servers, connections in the Network are designed to
                   1206: be persistent on the timescale of months, until either end is
                   1207: rebooted. This mechanism will be elaborated on below.
                   1208: 
                   1209: =begin latex
                   1210: 
                   1211: \begin{figure}
                   1212: \begin{lyxcode}
                   1213: msul1:msu:library:zaphod.lite.msu.edu:35.8.63.51
                   1214: 
                   1215: msua1:msu:access:agrajag.lite.msu.edu:35.8.63.68
                   1216: 
                   1217: msul2:msu:library:frootmig.lite.msu.edu:35.8.63.69
                   1218: 
                   1219: msua2:msu:access:bistromath.lite.msu.edu:35.8.63.67
                   1220: 
                   1221: hubl14:hub:library:hubs128-pc-14.cl.msu.edu:35.8.116.34
                   1222: 
                   1223: hubl15:hub:library:hubs128-pc-15.cl.msu.edu:35.8.116.35
                   1224: 
                   1225: hubl16:hub:library:hubs128-pc-16.cl.msu.edu:35.8.116.36
                   1226: 
                   1227: huba20:hub:access:hubs128-pc-20.cl.msu.edu:35.8.116.40
                   1228: 
                   1229: huba21:hub:access:hubs128-pc-21.cl.msu.edu:35.8.116.41
                   1230: 
                   1231: huba22:hub:access:hubs128-pc-22.cl.msu.edu:35.8.116.42
                   1232: 
                   1233: huba23:hub:access:hubs128-pc-23.cl.msu.edu:35.8.116.43
                   1234: 
                   1235: hubl25:other:library:hubs128-pc-25.cl.msu.edu:35.8.116.45
                   1236: 
                   1237: huba27:other:access:hubs128-pc-27.cl.msu.edu:35.8.116.47
                   1238: \end{lyxcode}
                   1239: 
                   1240: \caption{\label{Example_Of_hosts.tab}Example of Hosts Lookup table \texttt{/home/httpd/lonTabs/hosts.tab}} 
                   1241: \end{figure}
                   1242: 
                   1243: =end latex
                   1244: 
                   1245: Establishing a connection to a UNIX domain socket is far less resource
                   1246: consuming than establishing a TCP/IP connection. C<lonc>
                   1247: X<lonc> is a proxy daemon that forks off a child for every server in
                   1248: the Network. Which servers are members of the Network is determined by
                   1249: a lookup table, such as the one in Fig. Example of Hosts Lookup table. In order,
                   1250: the entries denote an internal name for the server, the domain of the
                   1251: server, the type of the server, the host name and the IP address.
                   1252: 
                   1253: The C<lonc> parent process maintains the population and listens for
                   1254: signals to restart or shutdown, as well as I<USR1>. Every child
                   1255: establishes a multiplexed UNIX domain socket for its server and opens
                   1256: a TCP/IP connection to the lond daemon (discussed below) on the remote
                   1257: machine, which it keeps alive. If the connection is interrupted, the
                   1258: child dies, whereupon the parent makes several attempts to fork
                   1259: another child for that server.
                   1260: 
                   1261: When starting a new child (a new connection), first an init-sequence
                   1262: is carried out, which includes receiving the information from the
                   1263: remote C<lond> which is needed to establish the 128-bit encryption key
                   1264: - the key is different for every connection. Next, any buffered
                   1265: (delayed) messages for the server are sent.
                   1266: 
                   1267: In normal operation, the child listens to the UNIX socket, forwards
                   1268: requests to the TCP connection, gets the reply from C<lond>, and sends
                   1269: it back to the UNIX socket. Also, C<lonc> takes care of the encryption
                   1270: and decryption of messages.
                   1271: 
                   1272: C<lond> X<lond> is the remote end of the TCP/IP connection and acts as
                   1273: a remote command processor. It receives commands, executes them, and
                   1274: sends replies. In normal operation, a C<lonc> child is constantly
                   1275: connected to a dedicated C<lond> child on the remote server, and the
                   1276: same is true vice versa (two persistent connections per server
                   1277: combination).
                   1278: 
                   1279: lond listens to a TCP/IP port (denoted B<P> in Fig. Overview of
                   1280: Network Communication) and forks off enough child processes to have
                   1281: one for each other server in the network plus two spare children. The
                   1282: parent process maintains the population and listens for signals to
                   1283: restart or shutdown. Client servers are authenticated by IP.
                   1284: 
                   1285: When a new client server comes online, C<lond> sends a signal I<USR1>
                   1286: to lonc, whereupon C<lonc> tries again to reestablish all lost
                   1287: connections, even if it had given up on them before - a new client
                   1288: connecting could mean that that machine came online again after an
                   1289: interruption.
                   1290: 
                   1291: The gray boxes in Fig. Overview of Network Communication denote the
                   1292: entities involved in an example transaction of the Network. The Client
                   1293: is logged into server B<C>, while server B<B> is her Home
                   1294: Server. Server B<C> can be an access server or a library server, while
                   1295: server B<B> is a library server. She submits a solution to a homework
                   1296: problem, which is processed by the appropriate handler for the MIME
                   1297: type "problem". Through C<lonnet>, the handler writes information
                   1298: about this transaction to the local session data. To make a permanent
                   1299: log entry, C<lonnet> establishes a connection to the UNIX domain
                   1300: socket for server B<B>. C<lonc> receives this command, encrypts it,
                   1301: and sends it through the persistent TCP/IP connection to the TCP/IP
                   1302: port of the remote C<lond>. C<lond> decrypts the command, executes it
                   1303: by writing to the permanent user data files of the client, and sends
                   1304: back a reply regarding the success of the operation. If the operation
                   1305: was unsuccessful, or the connection had broken down, C<lonc>
                   1306: would write the command into a FIFO buffer stack to be sent again
                   1307: later. C<lonc> now sends a reply regarding the overall success of the
                   1308: operation to C<lonnet> via the UNIX domain port, which is eventually
                   1309: received back by the handler.
                   1310: 
                   1311: =head2 Dynamic Resource Replication
                   1312: 
                   1313: Since resources are assembled into higher order resources simply by
                   1314: reference, in principle it would be sufficient to retrieve them from
                   1315: the respective Home Servers of the authors. However, there are several
                   1316: problems with this simple approach: since the resource assembly
                   1317: mechanism is designed to facilitate content assembly from a large
                   1318: number of widely distributed sources, individual sessions would depend
                   1319: on a large number of machines and network connections to be available,
                   1320: and thus be rather fragile. Also, frequently accessed resources could
                   1321: potentially drive individual machines in the network into overload
                   1322: situations.
                   1323: 
                   1324: Finally, since most resources depend on content handlers on the Access
                   1325: Servers to be served to a client within the session context, the raw
                   1326: source would first have to be transferred across the Network from the
                   1327: respective Library Server to the Access Server, processed there, and
                   1328: then transferred on to the client.
                   1329: 
                   1330: =begin latex
                   1331: 
                   1332: \begin{figure}
                   1333: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Request}
                   1334:   \caption{\label{Dynamic_Replication}Dynamic Replication} 
                   1335: \end{figure}
                   1336: 
                   1337: =end latex
                   1338: 
                   1339: To enable resource assembly in a reliable and scalable way, a dynamic
                   1340: resource replication scheme was developed. Fig. "Dynamic Replication"
                   1341: shows the details of this mechanism.
                   1342: 
                   1343: Anytime a resource out of the resource space is requested, a handler
                   1344: routine is called which in turn calls the replication routine. As a
                   1345: first step, this routine determines whether or not the resource is
                   1346: currently in replication transfer (Step B<D1a>). During replication
                   1347: transfer, the incoming data is stored in a temporary file, and Step
                   1348: B<D1a> checks for the presence of that file. If transfer of a resource
                   1349: is actively going on, the controlling handler receives an error
                   1350: message, waits for a few seconds, and then calls the replication
                   1351: routine again. If the resource is still in transfer, the client will
                   1352: receive the message "Service currently not available".
                   1353: 
                   1354: In the next step (Step B<D1b>), the replication routine checks if the
                   1355: URL is locally present. If it is, the replication routine returns OK
                   1356: to the controlling handler, which in turn passes the request on to the
                   1357: next handler in the chain.
                   1358: 
                   1359: If the resource is not locally present, the Home Server of the
                   1360: resource author (as extracted from the URL) is determined (Step
                   1361: B<D2>). This is done by contacting all library servers in the author's
                   1362: domain (as determined from the lookup table, see Fig. 1.1.2B). In Step
                   1363: B<D2b> a query is sent to the remote server whether or not it is the
                   1364: Home Server of the author (in our current implementation, an
                   1365: additional cache is used to store already identified Home Servers (not
                   1366: shown in the figure)). In Step B<D2c>, the remote server answers the
                   1367: query with True or False. If the Home Server was found, the routine
                   1368: continues, otherwise it contacts the next server (Step D2a). If no
                   1369: server could be found, a "File not Found" error message is issued. In
                   1370: our current implementation, in this step the Home Server is also
                   1371: written into a cache for faster access if resources by the same author
                   1372: are needed again (not shown in the figure).
                   1373: 
                   1374: =begin latex
                   1375: 
                   1376: \begin{figure}
                   1377: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Change}
                   1378:   \caption{\label{Dynamic_Replication_Change}Dynamic Replication: Change} \end{figure}
                   1379: 
                   1380: =end latex
                   1381: 
                   1382: In Step B<D3a>, the routine sends a subscribe command for the URL to
                   1383: the Home Server of the author. The Home Server first determines if the
                   1384: resource is present, and if the access privileges allow it to be
                   1385: copied to the requesting server (B<D3b>). If this is true, the
                   1386: requesting server is added to the list of subscribed servers for that
                   1387: resource (Step B<D3c>). The Home Server will reply with either OK or
                   1388: an error message, which is determined in Step D4. If the remote
                   1389: resource was not present, the error message "File not Found" will be
                   1390: passed on to the client; if the access was not allowed, the error
                   1391: message "Access Denied" is passed on. If the operation succeeded, the
                   1392: requesting server sends an HTTP request for the resource out of the
                   1393: C</raw> server content resource area of the Home Server.
                   1394: 
                   1395: The Home Server will then check if the requesting server is part of
                   1396: the network, and if it is subscribed to the resource (Step B<D5b>). If
                   1397: it is, it will send the resource via HTTP to the requesting server
                   1398: without any content handlers processing it (Step B<D5c>). The
                   1399: requesting server will store the incoming data in a temporary data
                   1400: file (Step B<D5a>) - this is the file that Step B<D1a> checks for. If
                   1401: the transfer could not complete, an appropriate error message is sent
                   1402: to the client (Step B<D6>). Otherwise, the transferred temporary file
                   1403: is renamed as the actual resource, and the replication routine returns
                   1404: OK to the controlling handler (Step B<D7>).
                   1405: 
                   1406: Fig. "Dynamic Replication: Change" depicts the process of modifying a
                   1407: resource. When an author publishes a new version of a resource, the
                   1408: Home Server will contact every server currently subscribed to the
                   1409: resource (Step B<U1>), as determined from the list of subscribed
                   1410: servers for the resource generated in Step B<D3c>. The subscribing
                   1411: servers will receive and acknowledge the update message (Step
                   1412: B<U1c>). The update mechanism finishes when the last subscribed server
                   1413: has been contacted (messages to unreachable servers are buffered).
                   1414: 
                   1415: Each subscribing server will check if the resource in question had
                   1416: been accessed recently, that is, within a configurable amount of time
                   1417: (Step B<U2>).
                   1418: 
                   1419: If the resource had not been accessed recently, the local copy of the
                   1420: resource is deleted (Step B<U3a>) and an unsubscribe command is sent
                   1421: to the Home Server (Step B<U3b>). The Home Server will check if the
                   1422: server had indeed originally subscribed to the resource (Step B<U3c>)
                   1423: and then delete the server from the list of subscribed servers for the
                   1424: resource (Step B<U3d>).
                   1425: 
                   1426: If the resource had been accessed recently, the modified resource will
                   1427: be copied over using the same mechanism as in Step B<D5a> through
                   1428: B<D7>, which represents Steps B<U4a> through B<U6> in the
                   1429: replication figure.
                   1430: 
1.52    ! bowersj2 1431: =head2 Load Balancing 
1.51      bowersj2 1432: 
1.52    ! bowersj2 1433: X<load balancing>C<lond> provides a function to query the server's current loadavg. As
1.51      bowersj2 1434: a configuration parameter, one can determine the value of loadavg,
                   1435: which is to be considered 100%, for example, 2.00.
                   1436: 
                   1437: Access servers can have a list of spare access servers,
                   1438: C</home/httpd/lonTabs/spares.tab>, to offload sessions depending on
                   1439: their own workload. This check is done by the login handler. It
                   1440: re-directs the login information and session to the least busy spare
                   1441: server if it is itself overloaded. An additional round-robin IP scheme
                   1442: is possible. See Fig. "Load Balancing Example" for an example of a
                   1443: load-balancing scheme.
                   1444: 
                   1445: =begin latex
                   1446: 
                   1447: \begin{figure}
                   1448: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Load_Balancing_Example}
                   1449:   \caption{\label{Load_Balancing_Example}Load Balancing Example} \end{figure}
                   1450: 
                   1451: =end latex
                   1452: 
1.23      harris41 1453: =head1 DESCRIPTION
                   1454: 
                   1455: Provides persistent TCP connections to the other servers in the network
                   1456: through multiplexed domain sockets
                   1457: 
1.31      harris41 1458: B<lonc> forks off children processes that correspond to the other servers
                   1459: in the network.  Management of these processes can be done at the
                   1460: parent process level or the child process level.
                   1461: 
1.51      bowersj2 1462: After forking off the children, B<lonc> the B<parent> executes a main
                   1463: loop which simply waits for processes to exit.  As a process exits, a
                   1464: new process managing a link to the same peer as the exiting process is
                   1465: created.
1.33      foxr     1466: 
1.31      harris41 1467: B<logs/lonc.log> is the location of log messages.
                   1468: 
                   1469: The process management is now explained in terms of linux shell commands,
                   1470: subroutines internal to this code, and signal assignments:
                   1471: 
                   1472: =over 4
                   1473: 
                   1474: =item *
                   1475: 
                   1476: PID is stored in B<logs/lonc.pid>
                   1477: 
                   1478: This is the process id number of the parent B<lonc> process.
                   1479: 
                   1480: =item *
                   1481: 
                   1482: SIGTERM and SIGINT
                   1483: 
                   1484: Parent signal assignment:
                   1485:  $SIG{INT}  = $SIG{TERM} = \&HUNTSMAN;
                   1486: 
                   1487: Child signal assignment:
                   1488:  $SIG{INT}  = 'DEFAULT'; (and SIGTERM is DEFAULT also)
                   1489: (The child dies and a SIGALRM is sent to parent, awaking parent from slumber
                   1490:  to restart a new child.)
                   1491: 
                   1492: Command-line invocations:
                   1493:  B<kill> B<-s> SIGTERM I<PID>
                   1494:  B<kill> B<-s> SIGINT I<PID>
                   1495: 
                   1496: Subroutine B<HUNTSMAN>:
                   1497:  This is only invoked for the B<lonc> parent I<PID>.
                   1498: This kills all the children, and then the parent.
                   1499: The B<lonc.pid> file is cleared.
                   1500: 
                   1501: =item *
                   1502: 
                   1503: SIGHUP
                   1504: 
                   1505: Current bug:
                   1506:  This signal can only be processed the first time
                   1507: on the parent process.  Subsequent SIGHUP signals
                   1508: have no effect.
                   1509: 
                   1510: Parent signal assignment:
                   1511:  $SIG{HUP}  = \&HUPSMAN;
                   1512: 
                   1513: Child signal assignment:
                   1514:  none (nothing happens)
                   1515: 
                   1516: Command-line invocations:
                   1517:  B<kill> B<-s> SIGHUP I<PID>
                   1518: 
                   1519: Subroutine B<HUPSMAN>:
                   1520:  This is only invoked for the B<lonc> parent I<PID>.
                   1521: This kills all the children, and then the parent.
                   1522: The B<lonc.pid> file is cleared.
                   1523: 
                   1524: =item *
                   1525: 
                   1526: SIGUSR1
                   1527: 
                   1528: Parent signal assignment:
                   1529:  $SIG{USR1} = \&USRMAN;
                   1530: 
                   1531: Child signal assignment:
                   1532:  $SIG{USR1}= \&logstatus;
                   1533: 
                   1534: Command-line invocations:
                   1535:  B<kill> B<-s> SIGUSR1 I<PID>
                   1536: 
                   1537: Subroutine B<USRMAN>:
                   1538:  When invoked for the B<lonc> parent I<PID>,
                   1539: SIGUSR1 is sent to all the children, and the status of
                   1540: each connection is logged.
                   1541: 
1.23      harris41 1542: 
1.31      harris41 1543: =back
1.23      harris41 1544: 
                   1545: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>