[LON-CAPA-cvs] cvs: loncom(version_2_11_X) / loncron
raeburn
raeburn at source.lon-capa.org
Sat Jan 30 17:25:53 EST 2021
raeburn Sat Jan 30 22:25:53 2021 EDT
Modified files: (Branch: version_2_11_X)
/loncom loncron
Log:
- For 2.11
Backport 1.113, 1.114, 1.116 (modified), 1.125
-------------- next part --------------
Index: loncom/loncron
diff -u loncom/loncron:1.103.2.8 loncom/loncron:1.103.2.9
--- loncom/loncron:1.103.2.8 Sat May 9 19:24:25 2020
+++ loncom/loncron Sat Jan 30 22:25:53 2021
@@ -2,7 +2,7 @@
# Housekeeping program, started by cron, loncontrol and loncron.pl
#
-# $Id: loncron,v 1.103.2.8 2020/05/09 19:24:25 raeburn Exp $
+# $Id: loncron,v 1.103.2.9 2021/01/30 22:25:53 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -37,10 +37,13 @@
use Apache::lonnet;
use Apache::loncommon;
+use LWP::UserAgent();
+use HTTP::Request();
use IO::File;
use IO::Socket;
use HTML::Entities;
use Getopt::Long;
+use File::ReadBackwards;
use File::Copy;
use Sys::Hostname::FQDN();
@@ -76,14 +79,14 @@
rename("$file.2","$file.3");
rename("$file.1","$file.2");
rename("$file","$file.1");
- }
+ }
}
sub start_daemon {
my ($fh,$daemon,$pidfile,$args) = @_;
my $progname=$daemon;
if ($daemon eq 'lonc') {
- $progname='loncnew';
+ $progname='loncnew';
}
my $error_fname="$perlvar{'lonDaemons'}/logs/${daemon}_errors";
&rotate_logfile($error_fname,$fh,'error logs');
@@ -128,9 +131,9 @@
}
&log($fh,"</tt></p>");
}
-
+
my $pidfile="$perlvar{'lonDaemons'}/logs/$daemon.pid";
-
+
my $restartflag=1;
my $daemonpid;
if (-e $pidfile) {
@@ -210,7 +213,7 @@
}
}
}
-
+
my $fname="$perlvar{'lonDaemons'}/logs/$daemon.log";
&rotate_logfile($fname,$fh,'logs');
@@ -223,13 +226,13 @@
my ($fh)=@_;
&log($fh,'<hr /><a name="machine" /><h2>Machine Information</h2>');
&log($fh,"<h3>loadavg</h3>");
-
+
open (LOADAVGH,"/proc/loadavg");
my $loadavg=<LOADAVGH>;
close (LOADAVGH);
-
+
&log($fh,"<tt>$loadavg</tt>");
-
+
my @parts=split(/\s+/,$loadavg);
if ($parts[1]>4.0) {
$errors++;
@@ -243,14 +246,14 @@
&log($fh,"<pre>");
open (DFH,"df|");
- while (my $line=<DFH>) {
- &log($fh,&encode_entities($line,'<>&"'));
+ while (my $line=<DFH>) {
+ &log($fh,&encode_entities($line,'<>&"'));
@parts=split(/\s+/,$line);
my $usage=$parts[4];
$usage=~s/\W//g;
- if ($usage>90) {
+ if ($usage>90) {
$warnings++;
- $notices++;
+ $notices++;
} elsif ($usage>80) {
$warnings++;
} elsif ($usage>60) {
@@ -267,8 +270,8 @@
my $psproc=0;
open (PSH,"ps aux --cols 140 |");
- while (my $line=<PSH>) {
- &log($fh,&encode_entities($line,'<>&"'));
+ while (my $line=<PSH>) {
+ &log($fh,&encode_entities($line,'<>&"'));
$psproc++;
}
close (PSH);
@@ -290,7 +293,7 @@
my %simplestatus=();
my $now=time;
my $date=localtime($now);
-
+
&log($fh,(<<ENDHEADERS));
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -406,12 +409,12 @@
($cleaned,$old,$removed) =
&recursive_clean_tmp($innerdir,$cleaned,$old,$removed,$errors);
my @doms = &Apache::lonnet::current_machine_domains();
-
+
if (open(my $dirhandle,$fname)) {
unless (($innerdir eq 'helprequests') ||
(($innerdir =~ /^addcourse/) && ($innerdir !~ m{/\d+$}))) {
my @contents = grep {!/^\.\.?$/} readdir($dirhandle);
- join('&&', at contents)."\n";
+ join('&&', at contents)."\n";
if (scalar(grep {!/^\.\.?$/} readdir($dirhandle)) == 0) {
closedir($dirhandle);
if ($fname =~ m{^\Q$perlvar{'lonDaemons'}\E/tmp/}) {
@@ -464,7 +467,7 @@
}
}
} elsif (ref($errors->{failopen}) eq 'ARRAY') {
- push(@{$errors->{failopen}},$fname);
+ push(@{$errors->{failopen}},$fname);
}
} else {
if (unlink($fname)) {
@@ -618,16 +621,16 @@
print "Checking logs.\n";
if (-e "$perlvar{'lonDaemons'}/logs/lonnet.log"){
open (DFH,"tail -n50 $perlvar{'lonDaemons'}/logs/lonnet.log|");
- while (my $line=<DFH>) {
+ while (my $line=<DFH>) {
&log($fh,&encode_entities($line,'<>&"'));
}
close (DFH);
}
&log($fh,"</pre><h3>Perm Log</h3><pre>");
-
+
if (-e "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") {
open(DFH,"tail -n10 $perlvar{'lonDaemons'}/logs/lonnet.perm.log|");
- while (my $line=<DFH>) {
+ while (my $line=<DFH>) {
&log($fh,&encode_entities($line,'<>&"'));
}
close (DFH);
@@ -691,35 +694,199 @@
# ------------------------------------------------------------ Delayed messages
sub check_delayed_msg {
- my ($fh)=@_;
+ my ($fh,$weightsref,$exclusionsref)=@_;
&log($fh,'<hr /><a name="delayed" /><h2>Delayed Messages</h2>');
print "Checking buffers.\n";
&log($fh,'<h3>Scanning Permanent Log</h3>');
my $unsend=0;
+ my $ignored=0;
my %hostname = &Apache::lonnet::all_hostnames();
my $numhosts = scalar(keys(%hostname));
+ my $checkbackwards = 0;
+ my $checkfrom = 0;
+ my $checkexcluded = 0;
+ my (%bymachine,%weights,%exclusions,%serverhomes);
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
+ if (ref($exclusionsref) eq 'HASH') {
+ %exclusions = %{$exclusionsref};
+ if (keys(%exclusions)) {
+ $checkexcluded = 1;
+ %serverhomes = &read_serverhomeIDs();
+ }
+ }
- my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log");
- while (my $line=<$dfh>) {
- my ($time,$sdf,$dserv,$dcmd)=split(/:/,$line);
- if ($numhosts) {
- next unless ($hostname{$dserv});
- }
- if ($sdf eq 'F') {
- my $local=localtime($time);
- &log($fh,"<b>Failed: $time, $dserv, $dcmd</b><br />");
- $warnings++;
- }
- if ($sdf eq 'S') { $unsend--; }
- if ($sdf eq 'D') { $unsend++; }
+#
+# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any
+# entry logged in lonnet.perm.log for completion of a delayed (critical)
+# transaction lacked the hostID for the remote node to which the command
+# to be completed was sent.
+#
+# Because of this, exclusion of items in lonnet.perm.log for nodes which are
+# no longer part of the cluster from adding to the overall "unsend" count
+# needs additional effort besides the changes made in loncron rev. 1.105.
+#
+# For "S" (completion) events logging in LON-CAPA 1.2.0 through 2.1.3 included
+# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd, where the hexadecimal
+# is a memory location, and $cmd is the command sent to the remote node.
+#
+# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events
+# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient()
+#
+# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the
+# getClient() call with the result of the Transaction->getClient() call itself
+# undef for completion of delivery of a delayed message.
+#
+# The net effect of these changes is that lonnet.perm.log is now accessed three
+# times: (a) oldest record is checked, if earlier than release date for 2.5.0
+# then (b) file is read backwards, with timestamp recorded for most recent
+# instance of logged "S" event for "update" command without "sethost:$host_id:"
+# then (c) file is read forward with records ignored which predate the timestamp
+# recorded in (b), if one was found.
+#
+# In (c), when calculating the unsend total, i.e., the difference between delayed
+# transactions ("D") and sent transactions ("S"), transactions are ignored if the
+# target node is no longer in the cluster, and also (for "update" commands), if
+# the target node is in the list of nodes excluded from the count, in the domain
+# configuration for this machine's default domain. The idea here is to remove
+# delayed "update" commands for nodes for which inbound access to port 5663,
+# is blocked, but are still part of the LON-CAPA network, (i.e., they can still
+# replicate content from other nodes).
+#
+
+ my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ if ($time < 1541185772) {
+ $checkbackwards = 1;
+ }
+ last;
+ }
+ undef $dfh;
}
- &log($fh,"<p>Total unsend messages: <b>$unsend</b></p>\n");
- if ($unsend > 0) {
- $warnings=$warnings+5*$unsend;
+ if ($checkbackwards) {
+ if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") {
+ while(my $line=<BW>) {
+ if ($line =~ /\QLondTransaction=HASH\E[^:]+:update:/) {
+ ($checkfrom) = split(/:/,$line,2);
+ last;
+ }
+ }
+ close(BW);
+ }
+ }
+ $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ next unless (($sdf eq 'F') || ($sdf eq 'S') || ($sdf eq 'D'));
+ next if (($checkfrom) && ($time <= $checkfrom));
+ my ($dserv,$dcmd);
+ if ($sdf eq 'S') {
+ my ($serva,$cmda,$servb,$cmdb) = split(/:/,$rest);
+ if ($cmda eq 'sethost') {
+ chomp($cmdb);
+ $dcmd = $cmdb;
+ } else {
+ $dcmd = $cmda;
+ }
+ if (($serva =~ /^LondTransaction/) || ($serva eq '')) {
+ unless (($servb eq '') || ($servb =~ m{^/})) {
+ $dserv = $servb;
+ }
+ } else {
+ $dserv = $serva;
+ }
+ } else {
+ ($dserv,$dcmd) = split(/:/,$rest);
+ }
+ if ($sdf eq 'F') {
+ my $local=localtime($time);
+ &log($fh,"<b>Failed: $time, $dserv, $dcmd</b><br />");
+ $warnings++;
+ }
+ next if ((($dserv eq '') || ($dcmd eq '')) && ($sdf ne 'F'));
+ if ($sdf eq 'S') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored --;
+ } else {
+ $unsend --;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} --;
+ } else {
+ $bymachine{$dserv} = -1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend --;
+ }
+ }
+ } elsif ($sdf eq 'D') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored ++;
+ } else {
+ $unsend ++;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} ++;
+ } else {
+ $bymachine{$dserv} = 1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend ++;
+ }
+ }
+ }
+ }
+ undef $dfh;
+ my $nodest = 0;
+ my $retired = 0;
+ my %active;
+ if (keys(%bymachine)) {
+ unless ($checkexcluded) {
+ %serverhomes = &read_serverhomeIDs();
+ }
+ foreach my $key (keys(%bymachine)) {
+ if ($bymachine{$key} > 0) {
+ if ($hostname{$key}) {
+ $active{$serverhomes{$hostname{$key}}} += $bymachine{$key};
+ } else {
+ $retired ++;
+ $nodest += $bymachine{$key};
+ }
+ }
+ }
+ }
+ if (keys(%active)) {
+ &log($fh,"<p>Unsend messages by node, active (undegraded) nodes in cluster</p>\n");
+ foreach my $key (sort(keys(%active))) {
+ &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n");
+ }
+ }
+ &log($fh,"<p>Total unsend messages: <b>$unsend</b> for ".scalar(keys(%active))." active (undegraded) nodes in cluster.</p>\n");
+ if (keys(%exclusions) > 0) {
+ &log($fh,"<p>Total incomplete updates <b>$ignored</b> for ".scalar(keys(%exclusions))." degraded nodes in cluster.</p>\n");
+ }
+ if ($retired) {
+ &log($fh,"<p>Total unsent <b>$nodest</b> for $retired nodes no longer in cluster.</p>\n");
+ }
+ if ($unsend > 0) {
+ $warnings=$warnings+$weights{'U'}*$unsend;
+ }
}
if ($unsend) { $simplestatus{'unsend'}=$unsend; }
@@ -758,9 +925,13 @@
}
sub finish_logging {
- my ($fh)=@_;
+ my ($fh,$weightsref)=@_;
+ my %weights;
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
&log($fh,"<a name='errcount' />\n");
- $totalcount=$notices+4*$warnings+100*$errors;
+ $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors);
&errout($fh);
&log($fh,"<h1>Total Error Count: $totalcount</h1>");
my $now=time;
@@ -829,7 +1000,7 @@
eval {
local $SIG{ ALRM } = sub { die "TIMEOUT" };
alarm(10);
- $serverhomeID =
+ $serverhomeID =
&Apache::lonnet::get_server_homeID($name,1,'loncron');
alarm(0);
};
@@ -962,12 +1133,126 @@
return;
}
+sub get_domain_config {
+ my ($dom,$primlibserv,$isprimary,$url,%confhash);
+ $dom = $perlvar{'lonDefDomain'};
+ $primlibserv = &Apache::lonnet::domain($dom,'primary');
+ if ($primlibserv eq $perlvar{'lonHostID'}) {
+ $isprimary = 1;
+ } elsif ($primlibserv ne '') {
+ my $protocol = $Apache::lonnet::protocol{$primlibserv};
+ my $hostname = &Apache::lonnet::hostname($primlibserv);
+ unless ($protocol eq 'https') {
+ $protocol = 'http';
+ }
+ $url = $protocol.'://'.$hostname.'/cgi-bin/listdomconfig.pl?primary='.$primlibserv.'&format=raw';
+ }
+ if ($isprimary) {
+ my $lonusersdir = $perlvar{'lonUsersDir'};
+ my $fname = $lonusersdir.'/'.$dom.'/configuration.db';
+ if (-e $fname) {
+ my $dbref=&LONCAPA::locking_hash_tie($fname,&GDBM_READER());
+ if (ref($dbref) eq 'HASH') {
+ foreach my $key (sort(keys(%{$dbref}))) {
+ my $value = $dbref->{$key};
+ if ($value =~ s/^__FROZEN__//) {
+ $value = thaw(&LONCAPA::unescape($value));
+ } else {
+ $value = &LONCAPA::unescape($value);
+ }
+ $confhash{$key} = $value;
+ }
+ &LONCAPA::locking_hash_untie($dbref);
+ }
+ }
+ } else {
+ my $ua=new LWP::UserAgent;
+ $ua->timeout(5);
+ my $request=new HTTP::Request('GET',$url);
+ my $response=$ua->request($request);
+ unless ($response->is_error()) {
+ my $content = $response->content;
+ if ($content) {
+ my @pairs=split(/\&/,$content);
+ foreach my $item (@pairs) {
+ my ($key,$value)=split(/=/,$item,2);
+ my $what = &LONCAPA::unescape($key);
+ if ($value =~ s/^__FROZEN__//) {
+ $value = thaw(&LONCAPA::unescape($value));
+ } else {
+ $value = &LONCAPA::unescape($value);
+ }
+ $confhash{$what}=$value;
+ }
+ }
+ }
+ }
+ return \%confhash;
+}
+
+sub get_permcount_settings {
+ my ($domconf) = @_;
+ my ($defaults,$names) = &Apache::loncommon::lon_status_items();
+ my (%weights,$threshold,$sysmail,$reportstatus,%exclusions);
+ foreach my $type ('E','W','N','U') {
+ $weights{$type} = $defaults->{$type};
+ }
+ $threshold = $defaults->{'threshold'};
+ $sysmail = $defaults->{'sysmail'};
+ $reportstatus = 1;
+ if (ref($domconf) eq 'HASH') {
+ if (ref($domconf->{'contacts'}) eq 'HASH') {
+ if ($domconf->{'contacts'}{'reportstatus'} == 0) {
+ $reportstatus = 0;
+ }
+ if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') {
+ if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') {
+ foreach my $type ('E','W','N','U') {
+ if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) {
+ $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type};
+ }
+ }
+ }
+ if (ref($domconf->{'contacts'}{'lonstatus'}{'excluded'}) eq 'ARRAY') {
+ my @excluded = @{$domconf->{'contacts'}{'lonstatus'}{'excluded'}};
+ if (@excluded) {
+ map { $exclusions{$_} = 1; } @excluded;
+ }
+ }
+ if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) {
+ $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'};
+ }
+ if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) {
+ $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'};
+ }
+ }
+ }
+ }
+ return ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions);
+}
+
+sub read_serverhomeIDs {
+ my %server;
+ if (-e "$perlvar{'lonTabDir'}/serverhomeIDs.tab") {
+ if (open(my $fh,'<',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) {
+ while (<$fh>) {
+ my($host,$id) = split(/:/);
+ chomp($id);
+ $server{$host} = $id;
+ }
+ close($fh);
+ }
+ }
+ return %server;
+}
+
sub send_mail {
+ my ($sysmail,$reportstatus) = @_;
my $defdom = $perlvar{'lonDefDomain'};
my $origmail = $perlvar{'lonAdmEMail'};
my $emailto = &Apache::loncommon::build_recipient_list(undef,
'lonstatusmail',$defdom,$origmail);
- if ($totalcount>2500) {
+ if (($totalcount>$sysmail) && ($reportstatus)) {
$emailto.=",$perlvar{'lonSysEMail'}";
}
my $from;
@@ -1093,7 +1378,7 @@
&Apache::lonnet::get_iphost(1,$nomemcache);
}
-# ----------------------------------------- Force firewall update for lond port
+# ----------------------------------------- Force firewall update for lond port
if ((!$justcheckdaemons) && (!$justreload)) {
my $now = time;
@@ -1108,7 +1393,7 @@
if (&LONCAPA::try_to_lock('/tmp/lock_lciptables')) {
my $execpath = $perlvar{'lonDaemons'}.'/lciptables';
system("$execpath $tmpfile");
- unlink('/tmp/lock_lciptables'); # Remove the lock file.
+ unlink('/tmp/lock_lciptables'); # Remove the lock file.
}
unlink($tmpfile);
}
@@ -1120,7 +1405,7 @@
$warnings=0;
$notices=0;
-
+
my $fh;
if (!$justcheckdaemons && !$justcheckconnections && !$justreload && !$justiptables) {
$fh=&start_logging();
@@ -1152,14 +1437,17 @@
&test_connections($fh);
}
if (!$justcheckdaemons && !$justcheckconnections && !$justreload && !$justiptables) {
- &check_delayed_msg($fh);
- &finish_logging($fh);
+ my $domconf = &get_domain_config();
+ my ($threshold,$sysmail,$reportstatus,$weightsref,$exclusionsref) =
+ &get_permcount_settings($domconf);
+ &check_delayed_msg($fh,$weightsref,$exclusionsref);
+ &finish_logging($fh,$weightsref);
&log_simplestatus();
&write_loncaparevs();
&write_serverhomeIDs();
&write_checksums();
&write_hostips();
- if ($totalcount>200 && !$noemail) { &send_mail(); }
+ if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); }
}
}
More information about the LON-CAPA-cvs
mailing list