--- loncom/loncron 2018/11/06 15:37:42 1.113 +++ loncom/loncron 2018/11/18 22:50:46 1.114 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.113 2018/11/06 15:37:42 raeburn Exp $ +# $Id: loncron,v 1.114 2018/11/18 22:50:46 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -43,6 +43,7 @@ use HTML::Entities; use Getopt::Long; use GDBM_File; use Storable qw(thaw); +use File::ReadBackwards; #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); @@ -646,35 +647,199 @@ sub test_connections { # ------------------------------------------------------------ Delayed messages sub check_delayed_msg { - my ($fh)=@_; + my ($fh,$weightsref,$exclusionsref)=@_; &log($fh,'

Delayed Messages

'); print "Checking buffers.\n"; &log($fh,'

Scanning Permanent Log

'); my $unsend=0; + my $ignored=0; my %hostname = &Apache::lonnet::all_hostnames(); my $numhosts = scalar(keys(%hostname)); - - my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log"); - while (my $line=<$dfh>) { - my ($time,$sdf,$dserv,$dcmd)=split(/:/,$line); - if ($numhosts) { - next unless ($hostname{$dserv}); - } - if ($sdf eq 'F') { - my $local=localtime($time); - &log($fh,"Failed: $time, $dserv, $dcmd
"); - $warnings++; - } - if ($sdf eq 'S') { $unsend--; } - if ($sdf eq 'D') { $unsend++; } + my $checkbackwards = 0; + my $checkfrom = 0; + my $checkexcluded = 0; + my (%bymachine,%weights,%exclusions,%serverhomes); + if (ref($weightsref) eq 'HASH') { + %weights = %{$weightsref}; + } + if (ref($exclusionsref) eq 'HASH') { + %exclusions = %{$exclusionsref}; + if (keys(%exclusions)) { + $checkexcluded = 1; + %serverhomes = &read_serverhomeIDs(); + } } - &log($fh,"

Total unsend messages: $unsend

\n"); - if ($unsend > 0) { - $warnings=$warnings+5*$unsend; +# +# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any +# entry logged in lonnet.perm.log for completion of a delayed (critical) +# transaction lacked the hostID for the remote node to which the command +# to be completed was sent. +# +# Because of this, exclusion of items in lonnet.perm.log for nodes which are +# no longer part of the cluster from adding to the overall "unsend" count +# needs additional effort besides the changes made in loncron rev. 1.105. +# +# For "S" (completion) events logging in LON-CAPA 1.2.0 through 2.1.3 included +# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd, where the hexadecimal +# is a memory location, and $cmd is the command sent to the remote node. +# +# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events +# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient() +# +# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the +# getClient() call with the result of the Transaction->getClient() call itself +# undef for completion of delivery of a delayed message. +# +# The net effect of these changes is that lonnet.perm.log is now accessed three +# times: (a) oldest record is checked, if earlier than release date for 2.5.0 +# then (b) file is read backwards, with timestamp recorded for most recent +# instance of logged "S" event for "update" command without "sethost:$host_id:" +# then (c) file is read forward with records ignored which predate the timestamp +# recorded in (b), if one was found. +# +# In (c), when calculating the unsend total, i.e., the difference between delayed +# transactions ("D") and sent transactions ("S"), transactions are ignored if the +# target node is no longer in the cluster, and also (for "update" commands), if +# the target node is in the list of nodes excluded from the count, in the domain +# configuration for this machine's default domain. The idea here is to remove +# delayed "update" commands for nodes for which inbound access to port 5663, +# is blocked, but are still part of the LON-CAPA network, (i.e., they can still +# replicate content from other nodes). +# + + my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r"); + if (defined($dfh)) { + while (my $line=<$dfh>) { + my ($time,$sdf,$rest)=split(/:/,$line,3); + if ($time < 1541185772) { + $checkbackwards = 1; + } + last; + } + undef $dfh; + } + + if ($checkbackwards) { + if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") { + while(my $line=) { + if ($line =~ /\QLondTransaction=HASH\E[^:]+:update:/) { + ($checkfrom) = split(/:/,$line,2); + last; + } + } + close(BW); + } + } + $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r"); + if (defined($dfh)) { + while (my $line=<$dfh>) { + my ($time,$sdf,$rest)=split(/:/,$line,3); + next unless (($sdf eq 'F') || ($sdf eq 'S') || ($sdf eq 'D')); + next if (($checkfrom) && ($time <= $checkfrom)); + my ($dserv,$dcmd); + if ($sdf eq 'S') { + my ($serva,$cmda,$servb,$cmdb) = split(/:/,$rest); + if ($cmda eq 'sethost') { + chomp($cmdb); + $dcmd = $cmdb; + } else { + $dcmd = $cmda; + } + if (($serva =~ /^LondTransaction/) || ($serva eq '')) { + unless (($servb eq '') || ($servb =~ m{^/})) { + $dserv = $servb; + } + } else { + $dserv = $serva; + } + } else { + ($dserv,$dcmd) = split(/:/,$rest); + } + if ($sdf eq 'F') { + my $local=localtime($time); + &log($fh,"Failed: $time, $dserv, $dcmd
"); + $warnings++; + } + next if ((($dserv eq '') || ($dcmd eq '')) && ($sdf ne 'F')); + if ($sdf eq 'S') { + if ($dcmd eq 'update') { + if ($hostname{$dserv}) { + if ($exclusions{$serverhomes{$hostname{$dserv}}}) { + $ignored --; + } else { + $unsend --; + } + } + if (exists($bymachine{$dserv})) { + $bymachine{$dserv} --; + } else { + $bymachine{$dserv} = -1; + } + } else { + if ($hostname{$dserv}) { + $unsend --; + } + } + } elsif ($sdf eq 'D') { + if ($dcmd eq 'update') { + if ($hostname{$dserv}) { + if ($exclusions{$serverhomes{$hostname{$dserv}}}) { + $ignored ++; + } else { + $unsend ++; + } + } + if (exists($bymachine{$dserv})) { + $bymachine{$dserv} ++; + } else { + $bymachine{$dserv} = 1; + } + } else { + if ($hostname{$dserv}) { + $unsend ++; + } + } + } + } + undef $dfh; + my $nodest = 0; + my $retired = 0; + my %active; + if (keys(%bymachine)) { + unless ($checkexcluded) { + %serverhomes = &read_serverhomeIDs(); + } + foreach my $key (keys(%bymachine)) { + if ($bymachine{$key} > 0) { + if ($hostname{$key}) { + $active{$serverhomes{$hostname{$key}}} += $bymachine{$key}; + } else { + $retired ++; + $nodest += $bymachine{$key}; + } + } + } + } + if (keys(%active)) { + &log($fh,"

Unsend messages by node, active (undegraded) nodes in cluster

\n"); + foreach my $key (sort(keys(%active))) { + &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n"); + } + } + &log($fh,"

Total unsend messages: $unsend for ".scalar(keys(%active))." active (undegraded) nodes in cluster.

\n"); + if (keys(%exclusions) > 0) { + &log($fh,"

Total incomplete updates $ignored for ".scalar(keys(%exclusions))." degraded nodes in cluster.

\n"); + } + if ($retired) { + &log($fh,"

Total unsent $nodest for $retired nodes no longer in cluster.

\n"); + } + if ($unsend > 0) { + $warnings=$warnings+$weights{'U'}*$unsend; + } } if ($unsend) { $simplestatus{'unsend'}=$unsend; } @@ -713,7 +878,11 @@ sub check_delayed_msg { } sub finish_logging { - my ($fh,%weights)=@_; + my ($fh,$weightsref)=@_; + my %weights; + if (ref($weightsref) eq 'HASH') { + %weights = %{$weightsref}; + } &log($fh,"
\n"); $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors); &errout($fh); @@ -1086,6 +1255,62 @@ sub reset_nosslverify_pids { return; } +sub get_permcount_settings { + my ($domconf) = @_; + my ($defaults,$names) = &Apache::loncommon::lon_status_items(); + my (%weights,$threshold,$sysmail,$reportstatus,%exclusions); + foreach my $type ('E','W','N','U') { + $weights{$type} = $defaults->{$type}; + } + $threshold = $defaults->{'threshold'}; + $sysmail = $defaults->{'sysmail'}; + $reportstatus = 1; + if (ref($domconf) eq 'HASH') { + if (ref($domconf->{'contacts'}) eq 'HASH') { + if ($domconf->{'contacts'}{'reportstatus'} == 0) { + $reportstatus = 0; + } + if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') { + if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') { + foreach my $type ('E','W','N','U') { + if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) { + $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type}; + } + } + } + if (ref($domconf->{'contacts'}{'lonstatus'}{'excluded'}) eq 'ARRAY') { + my @excluded = @{$domconf->{'contacts'}{'lonstatus'}{'excluded'}}; + if (@excluded) { + map { $exclusions{$_} = 1; } @excluded; + } + } + if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) { + $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'}; + } + if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) { + $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'}; + } + } + } + } + return ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions); +} + +sub read_serverhomeIDs { + my %server; + if (-e "$perlvar{'lonTabDir'}/serverhomeIDs.tab") { + if (open(my $fh,'<',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) { + while (<$fh>) { + my($host,$id) = split(/:/); + chomp($id); + $server{$host} = $id; + } + close($fh); + } + } + return %server; +} + sub send_mail { my ($sysmail,$reportstatus) = @_; my $defdom = $perlvar{'lonDefDomain'}; @@ -1273,8 +1498,10 @@ sub main () { &test_connections($fh); } if (!$justcheckdaemons && !$justcheckconnections && !$justreload) { - &check_delayed_msg($fh); - &log_simplestatus(); + my $domconf = &get_domain_config(); + my ($threshold,$sysmail,$reportstatus,$weightsref,$exclusionsref) = + &get_permcount_settings($domconf); + &check_delayed_msg($fh,$weightsref,$exclusionsref); &write_loncaparevs(); &write_serverhomeIDs(); &write_checksums(); @@ -1287,37 +1514,9 @@ sub main () { &checkon_daemon($fh,'lond',40000,'USR2'); &reset_nosslverify_pids($fh,%sslrem); } - my $domconf = &get_domain_config(); - my ($defaults,$names) = &Apache::loncommon::lon_status_items(); - my (%weights,$threshold); - foreach my $type ('E','W','N') { - $weights{$type} = $defaults->{$type}; - } - my $threshold = $defaults->{'threshold'}; - my $sysmail = $defaults->{'sysmail'}; - my $reportstatus = 1; - if (ref($domconf->{'contacts'}) eq 'HASH') { - if ($domconf->{'contacts'}{'reportstatus'} == 0) { - $reportstatus = 0; - } - if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') { - if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') { - foreach my $type ('E','W','N') { - if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) { - $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type}; - } - } - } - } - if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) { - $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'}; - } - if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) { - $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'}; - } - } - &finish_logging($fh,%weights); - if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); } + &finish_logging($fh,$weightsref); + &log_simplestatus(); + if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); } } }