#!/usr/bin/perl $|=1; # Generates a html page showing various status reports about the domain or cluster # $Id: clusterstatus.pl,v 1.26 2008/12/25 01:56:03 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # # This file is part of the LearningOnline Network with CAPA (LON-CAPA). # # LON-CAPA is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # LON-CAPA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with LON-CAPA; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # /home/httpd/html/adm/gpl.txt # # http://www.lon-capa.org/ # use strict; use lib '/home/httpd/lib/perl/'; use Apache::lonnet; use Apache::lonlocal; use LONCAPA::Configuration; use LONCAPA::loncgi; use LONCAPA::lonauthcgi; use LWP::UserAgent(); use HTTP::Headers; use IO::File; my $perlvar=&LONCAPA::Configuration::read_conf('loncapa.conf'); my %host=(); my $oneday=60*60*24; my %connectionstatus=(); my %perlvar=(); my $mode; my $concount=0; my $fromcache; my %domaininfo = &Apache::lonnet::domain_info(); my %allhostname = &Apache::lonnet::all_hostnames(); my (%hostname,%hostip); my %hostdom = &Apache::lonnet::all_host_domain(); my %iphost = &Apache::lonnet::get_iphost(); my %libserv= &Apache::lonnet::all_library(); foreach my $ip (keys(%iphost)) { $hostip{$iphost{$ip}} = $ip; } my $maxusers=0; my $maxload=0; my $totalusers=0; my %FORM=(); my $stat_total=0; my $stat_notyet=0; my $stat_fromcache=0; sub select_form { my ($def,$name,%hash) = @_; my $selectform = ""; return $selectform; } sub key { my ($local,$url)=@_; my $key=$local.'_'.$url; $key=~s/\W/\_/gs; return $key; } sub hidden { my ($name,$value)=@_; print("\n"); } sub request { my ($local,$url,$cachetime)=@_; $cachetime*=(0.5+rand); my $key=&key($local,$url); my $reply=''; $stat_total++; # if fromcache flag is set, only return cached values if ($fromcache) { if ($FORM{$key.'_time'}) { $stat_fromcache++; return $FORM{$key}; } else { $stat_notyet++; return 'not_yet'; } } # normal mode, refresh when expired or not yet present if ($FORM{$key.'_time'}) { if ((time-$FORM{$key.'_time'})<$cachetime) { $reply=$FORM{$key}; &hidden($key.'_time',$FORM{$key.'_time'}); $stat_fromcache++; } } unless ($reply) { if ($hostname{$local}) { my $ua=new LWP::UserAgent(timeout => 20); my $request=new HTTP::Request('GET', "http://".$hostname{$local}.$url); my $response=$ua->request($request); if ($response->is_success) { $reply=$response->content; chomp($reply); } else { $reply='local_error'; } } else { $reply='local_unknown'; } &hidden($key.'_time',time); } &hidden($key,$reply); return $reply; } # ============================================= Are local and remote connected? sub connected { my ($local,$remote)=@_; $local=~s/\W//g; $remote=~s/\W//g; unless ($hostname{$remote}) { return 'remote_unknown'; } my $url='/cgi-bin/ping.pl?'.$remote; # # Slowly phase this in: if not cached, only do 5 percent of the cases, # but always do the first five. # unless ($FORM{&key($local,$url)}) { unless (($concount<=5) || (rand>0.95)) { $stat_total++; $stat_notyet++; return 'not_yet'; } else { $concount++; } } # # Actually do the query # &statuslist($local,'connecting '.$remote); my $reply=&request($local,$url,3600); $reply=(split("\n",$reply))[0]; $reply=~s/\W//g; if ($reply ne $remote) { return $reply; } return 'ok'; } # ============================================================ Get a reply hash sub replyhash { my %returnhash=(); foreach (split(/\&/,&request(@_))) { my ($name,$value)=split(/\=/,$_); if ($name) { unless ($value) { $value=''; } $returnhash{$name}=$value; } } return %returnhash; } # ================================================================ Link to host sub otherwindow { my ($local,$url,$label)=@_; return " $label "; } sub login { my $local=shift; print(&otherwindow($local,'/adm/login?domain='.$perlvar{'lonDefDomain'}, 'Login')); } sub runloncron { my $local=shift; print(&otherwindow($local,'/cgi-bin/loncron.pl',&Apache::lonlocal::mt('Run loncron'))); } sub loncron { my $local=shift; print(&otherwindow($local,'/lon-status','loncron')); } sub lonc { my $local=shift; print(&otherwindow($local,'/lon-status/loncstatus.txt','lonc')); } sub lond { my $local=shift; print(&otherwindow($local,'/lon-status/londstatus.txt','lond')); } sub users { my $local=shift; print(&otherwindow($local,'/cgi-bin/userstatus.pl',&Apache::lonlocal::mt('Users'))); } sub versions { my $local=shift; print(&otherwindow($local,'/cgi-bin/lonversions.pl',&Apache::lonlocal::mt('Versions'))); } sub server { my $local=shift; print(&otherwindow($local,'/server-status',&Apache::lonlocal::mt('Server Status'))); } sub announcement { my $local=shift; print(&otherwindow($local,'/announcement.txt',&Apache::lonlocal::mt('Announcement'))); } sub takeonline { my $local=shift; print(&otherwindow($local,'/cgi-bin/takeonline.pl',&Apache::lonlocal::mt('Take online'))); } sub takeoffline { my $local=shift; print(&otherwindow($local,'/cgi-bin/takeoffline.pl',&Apache::lonlocal::mt('Take offline'))); } sub reroute { my ($local,$remote)=@_; print(&otherwindow($local,'/cgi-bin/takeoffline.pl?'. $hostname{$remote}.'&'.$hostdom{$local} ,$remote)."\n"); } sub allreroutes { my $local=shift; &takeoffline($local); print(&Apache::lonlocal::mt('Reroute to:').' '); foreach my $remote (sort keys %hostname) { unless ($local eq $remote) { &reroute($local,$remote); } } print(''); } # ========================================================= Produce a green bar sub bar { my $parm=shift; my $number=int($parm+0.5); print('
'); for (my $i=0;$i<$number;$i++) { print "+"; } print("
"); } # ========================================================== Show server status sub serverstatus { my ($local,$trouble)=@_; my $hostrole; if (exists($libserv{$local})) { $hostrole = 'library'; } else { $hostrole = 'access'; } my %lt = &Apache::lonlocal::texthash( rero => 'Reroute:', vers => 'Version:', load => 'Load:', acti => 'Active Users:', rpms => 'RPMs', mysq => 'MySQL Database:', notc => 'Not connected', lonc => 'loncron errors', ); print(< "); if ($trouble) { print(""); } print("
$local $hostdom{$local} ($hostname{$local}); $hostrole
$domaininfo{$hostdom{$local}}{'description'} $domaininfo{$hostdom{$local}}{'city'}
ENDHEADER &login($local);&server($local);&users($local);&versions($local); &announcement($local); &loncron($local);&lond($local);&lonc($local);&runloncron($local); print("
$trouble
"); # re-routing if ($host{$local.'_reroute'}) { print('
'.$lt{'rero'}.' '.$host{$local.'_reroute'}); &takeonline($local); } # version if ($host{$local.'_version'}) { print('
'.$lt{'vers'}.' '.$host{$local.'_version'}); } # load if (($host{$local.'_load_doomed'}>0.5) || ($mode eq 'load_doomed')) { print('
'.$lt{'load'}.' '.$host{$local.'_load'}); } # users if (($host{$local.'_users_doomed'}>10) || ($mode eq 'users_doomed')) { print('
'.$lt{'acti'}.' '.$host{$local.'_users'}); } # checkrpms if ($host{$local.'_checkrpms'}) { print('
'.$lt{'rpms'}.' '.$host{$local.'_checkrpms'}); } # mysql if ($host{$local.'_mysql'}) { print('
'.$lt{'mysq'}.' '.$host{$local.'_mysql'}); } # connections if ($host{$local.'_notconnected'}) { print('
'.$lt{'notc'}.' '); foreach my $item (split(/ /,$host{$local.'_notconnected'})) { if ($item) { print(' '.$item.''); } } } # errors if ($host{$local.'_errors'}) { print('
'.$lt{'lonc'}.' '.$host{$local.'_errors'}); } print "
"; &allreroutes($local); print "

"; } # =========================================================== Doomedness sorted sub doomedness { my $crit=shift; my %alldoomed=(); my @allhosts=(); foreach (keys %host) { if ($_=~/^(\w+)\_$crit$/) { if ($host{$_}) { push (@allhosts,$1); $alldoomed{$1}=$host{$_}; } } } return sort { $alldoomed{$b} <=> $alldoomed{$a} } @allhosts; } sub resetvars { $maxusers=0; $maxload=0; $totalusers=0; $stat_total=0; $stat_notyet=0; $stat_fromcache=0; $concount=0; undef %host; %host=(); } sub mainloop { &resetvars(); # ==================================================== Main Loop over all Hosts foreach my $local (sort(keys(%hostname))) { $host{$local.'_unresponsive_doomed'}=0; # -- Check general status &statuslist($local,'General'); my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200); if (defined($loncron{'local_error'})) { $host{$local.'_loncron'}='Could not determine.'; $host{$local.'_unresponsive_doomed'}++; } else { if ((time-$loncron{'time'})>$oneday) { $host{$local.'_loncron'}='Stale.'; $host{$local.'_unresponsive_doomed'}++; } else { $host{$local.'_loncron_doomed'}=$loncron{'notices'} +4*$loncron{'warnings'} +100*$loncron{'errors'}; $host{$local.'_errors'}=$loncron{'errors'}; } } # -- Check version &statuslist($local,'Version'); my $version=&request($local,'/lon-status/version.txt',7200); if ($version eq 'local_error') { $host{$local.'_version'}='Could not determine.'; $host{$local.'_unresponsive_doomed'}++; } else { $host{$local.'_version'}=$version; } # -- Check user status &statuslist($local,'Users'); my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600); if (defined($userstatus{'local_error'})) { $host{$local.'_userstatus'}='Could not determine.'; $host{$local.'_unresponsive_doomed'}++; } else { $host{$local.'_users_doomed'}=$userstatus{'Active'}; $host{$local.'_users'}=$userstatus{'Active'}; unless ($host{$local.'_users'}) { $host{$local.'_users'}=0; } if ($host{$local.'_users'}>$maxusers) { $maxusers=$host{$local.'_users'}; } $totalusers+=$host{$local.'_users'}; my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'}); $host{$local.'_load_doomed'}=$mload; if ($mload>$maxload) { $maxload=$mload; } $host{$local.'_load'}=$userstatus{'loadavg'}; } # -- Check reroute status &statuslist($local,'Reroute'); my %reroute=&replyhash($local,'/lon-status/reroute.txt',1800); if ($reroute{'status'} eq 'rerouting') { if ($reroute{'server'}) { $host{$local.'_reroute'}= 'Rerouting to '.$reroute{'server'}. ', domain: '.$reroute{'domain'}. ' (since '.localtime($reroute{'time'}).')'; } else { $host{$local.'_reroute'}='offline'; } } # -- Check mysql status &statuslist($local,'Database'); my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600); if (defined($mysql{'local_error'})) { $host{$local.'_mysql'}='Could not determine.'; $host{$local.'_unresponsive_doomed'}++; } else { if ((time-$mysql{'time'})>(7*$oneday)) { if (exists($libserv{$local})) { $host{$local.'_mysql'}='Stale.'; $host{$local.'_mysql_doomed'}=1; } if ($mysql{'mysql'} eq 'defunct') { $host{$local.'_mysql'}='Defunct (maybe stale).'; $host{$local.'_mysql_doomed'}=2; } } elsif ($mysql{'mysql'} eq 'defunct') { $host{$local.'_mysql'}='Defunct.'; $host{$local.'_mysql_doomed'}=3; } } # -- Check rpm status &statuslist($local,'RPMs'); my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200); if (defined($checkrpms{'local_error'})) { $host{$local.'_checkrpms'}='Could not determine.'; $host{$local.'_unresponsive_doomed'}++; } else { if ((time-$checkrpms{'time'})>(4*$oneday)) { $host{$local.'_checkrpms'}='Stale.'; $host{$local.'_checkrpms_doomed'}=50; $host{$local.'_unresponsive_doomed'}++; } elsif ($checkrpms{'status'} eq 'fail') { $host{$local.'_checkrpms'}='Could not checked RPMs.'; $host{$local.'_checkrpms_doomed'}=100; } elsif ($checkrpms{'rpmcount'}) { $host{$local.'_checkrpms'}='Outdated RPMs: '. $checkrpms{'rpmcount'}; $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'}; } } # -- Check connections &statuslist($local,'Connections'); $host{$local.'_notconnected'}=''; $host{$local.'_notconnected_doomed'}=0; foreach my $remote (sort keys %hostname) { my $status=&connected($local,$remote); $connectionstatus{$local.'_TO_'.$remote}=$status; unless (($status eq 'ok') || ($status eq 'not_yet')) { $host{$local.'_notconnected'}.=' '.$remote; $host{$local.'_notconnected_doomed'}++; } } # =============================================================== End Main Loop } } sub reports { # ====================================================================== Output if ($mode=~/\_doomed$/) { # Output by doomedness foreach (&doomedness($mode)) { &serverstatus($_); } } elsif ($mode eq 'connections') { print "". ""; foreach my $remote (sort keys %hostname) { print ''; } print "\n"; # connection matrix foreach my $local (sort keys %hostname) { print ''; foreach my $remote (sort keys %hostname) { if ($connectionstatus{$local.'_TO_'.$remote} eq 'not_yet') { my $cellcolor='#FFFFFF'; if ($local eq $remote) { $cellcolor='#DDDDDD'; } print ''; } elsif ($connectionstatus{$local.'_TO_'.$remote} eq 'ok') { my $cellcolor='#BBDDBB'; if ($local eq $remote) { $cellcolor='#99DD99'; } print ''; } else { my $cellcolor='#DDCCAA'; if ($connectionstatus{$local.'_TO_'.$remote} eq 'local_error') { if ($local eq $remote) { $cellcolor='#DD88AA'; } else { $cellcolor='#DDAACC'; } } else { if ($local eq $remote) { $cellcolor='#DDBB77'; } } print ''; } } print "\n"; } print "
 '.$remote.'
'.$local.'not yet tested'.&Apache::lonlocal::mt('ok').''. $connectionstatus{$local.'_TO_'.$remote}.'
'; &lonc($local); &lond($remote); print '
"; } elsif ($mode eq 'users') { # Users if ($maxusers) { my $factor=50/$maxusers; print '

'.&Apache::lonlocal::mt('Total active user(s)').': '.$totalusers.'

'. ''; foreach my $local (sort keys %hostname) { if (defined($host{$local.'_users'})) { print ''."\n"; } } print '
'.$local. '
'. $domaininfo{$hostdom{$local}}{'description'}. '
'; &users($local); print ''. $host{$local.'_users'}.'
'; } else { print &Apache::lonlocal::mt('No active users logged in.'); } } elsif ($mode eq 'load') { # Load if ($maxload) { my $factor=50/$maxload; print ""; foreach my $local (sort keys %hostname) { if (defined($host{$local.'_load_doomed'})) { print '\n"; } } print "
'. $local. '
'. $Apache::lonnet::domain{$hostdom{$local}}{'description'}. '
'; &server($local); print ''. $host{$local.'_load_doomed'}.'
"; } else { print &Apache::lonlocal::mt('No workload.'); } } elsif ($mode eq 'trouble') { my $count=0; foreach my $local (sort keys %hostname) { my $trouble=''; if ($host{$local.'_unresponsive_doomed'}>3) { $trouble=&Apache::lonlocal::mt('Does not respond to several queries.'). '
'; } if ($host{$local.'_errors'}) { $trouble=&Apache::lonlocal::mt('Has loncron errors').'
'; } elsif ($host{$local.'_loncron_doomed'}>2500) { $trouble=&Apache::lonlocal::mt('High loncron count.').'
'; } if ($host{$local.'_load_doomed'}>5) { $trouble=&Apache::lonlocal::mt('High load.').'
'; } if ($host{$local.'_users_doomed'}>200) { $trouble=&Apache::lonlocal::mt('High user volume.').'
'; } if ($host{$local.'_mysql_doomed'}>1) { $trouble=&Apache::lonlocal::mt('MySQL database apparently offline.').'
'; } if ($host{$local.'_checkrpms_doomed'}>100) { $trouble=&Apache::lonlocal::mt('RPMs outdated.').'
'; } if ($host{$local.'_reroute'}) { $trouble=&Apache::lonlocal::&mt('Rerouting').'
'; } if ($trouble) { $count++; &serverstatus($local,$trouble); } } unless ($count) { print &Apache::lonlocal::mt('No major trouble.'); } } } # ====================================================================== Status sub statuslist { my ($local,$what)=@_; my $displaylocal; if (defined($local)) { $displaylocal = " $local ($hostname{$local})"; } print ''."\n"; } # ============================================================================= # ============================================================================= # Main program # # ========================================================= Get form parameters my $buffer; read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'}); my @pairs=split(/&/,$buffer); my $pair; my $name; my $value; undef %FORM; %FORM=(); foreach $pair (@pairs) { ($name,$value) = split(/=/,$pair); $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; $FORM{$name}=$value; } $buffer=$ENV{'QUERY_STRING'}; @pairs=split(/&/,$buffer); foreach $pair (@pairs) { ($name,$value) = split(/=/,$pair); $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; $FORM{$name}=$value; } # ====================================================== Determine refresh rate my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:30); if ($refresh<30) { $refresh=30; } my $starttime=time; # ============================================================== Determine mode my %modes= &Apache::lonlocal::texthash ( 'trouble' => 'Trouble', 'users_doomed' => 'Doomed: Users', 'loncron_doomed' => 'Doomed: General (loncron)', 'mysql_doomed' => 'Doomed: Database (mysql)', 'notconnected_doomed' => 'Doomed: Connections', 'checkrpms_doomed' => 'Doomed: RPMs', 'load_doomed' => 'Doomed: Load', 'unresponsive_doomed' => 'Doomed: Status could not be determined', 'users' => 'User Report', 'load' => 'Load Report', 'connections' => 'Connections Matrix'); $mode=$FORM{'mode'}; unless ($modes{$mode}) { $mode='trouble'; } # ================================================================ Send Headers print("Content-type: text/html\n\n". ''."\n"); # -------------------- Read loncapa.conf (and by default, loncapa_apache.conf). my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); %perlvar=%{$perlvarref}; undef $perlvarref; # remove since sensitive and not needed delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed delete $perlvar{'lonSqlAccess'}; # remove since sensitive and not needed if (!&LONCAPA::loncgi::check_cookie_and_load_env()) { &Apache::lonlocal::get_language_handle(); print(&LONCAPA::loncgi::missing_cookie_msg()); exit; } if (!&LONCAPA::lonauthcgi::can_view('clusterstatus')) { &Apache::lonlocal::get_language_handle(); print &LONCAPA::lonauthcgi::unauthorized_msg('clusterstatus'); exit; } &Apache::lonlocal::get_language_handle(); my $scope = 'Domain'; if ($Apache::lonnet::env{'request.role'} =~ m{^su\./}) { %hostname = %allhostname; $scope = 'Cluster'; } else { undef(%hostname); my @poss_domains = &Apache::lonnet::current_machine_domains(); foreach my $host (keys(%allhostname)) { if (grep(/^\Q$hostdom{$host}\E$/,@poss_domains)) { $hostname{$host} = $allhostname{$host}; } } } print '

'.&Apache::lonlocal::mt("LON-CAPA $scope Status").' '.localtime()."

"; print "
\n". '
'. "
\n"; print "
\n"; print &Apache::lonlocal::mt('Choose next report:').' '.&select_form($mode,'mode',%modes).'
'; &hidden('refresh',$refresh); if (!$FORM{'runonetime'}) { my $lcscope = lc($scope); print '

'.&Apache::lonlocal::mt("Gathering initial $lcscope data").'

'. &Apache::lonlocal::mt('This may take some time ..').'
'; $fromcache=0; &mainloop(); &statuslist(undef,'Done initial run.'); &reports(); } else { $fromcache=1; &mainloop(); &statuslist(undef,'Done gathering cached data'); &reports(); $fromcache=0; &mainloop(); } &hidden('runonetime',1); print '
'.&Apache::lonlocal::mt('Total number of queries: [_1]',$stat_total); if ($stat_total != 0) { print '
'.&Apache::lonlocal::mt('Percent complete:').''. int(($stat_total-$stat_notyet)/$stat_total*100.). '
'.&Apache::lonlocal::mt('Percent from cache:').' '. int($stat_fromcache/$stat_total*100.).'
'; } # ============================================================== Close, refresh print "
"; exit 0;