[LON-CAPA-cvs] cvs: loncom / loncnew
foxr
lon-capa-cvs@mail.lon-capa.org
Tue, 02 Sep 2003 10:34:47 -0000
foxr Tue Sep 2 06:34:47 2003 EDT
Modified files:
/loncom loncnew
Log:
- Fix errors in host dead detection logic (too many cases where the
retries left were not getting decremented or just not checked).
- Added some additional status to the ps axuww display:
o Remaining retries on a host.
o >>> DEAD <<< indicator if I've given up on a host.
- Tested that SIGHUP will reset the retries remaining count (thanks to
the above status stuff) and allow loncnew to retry connections
to the host (thanks to the log).
Index: loncom/loncnew
diff -u loncom/loncnew:1.21 loncom/loncnew:1.22
--- loncom/loncnew:1.21 Tue Aug 26 05:19:51 2003
+++ loncom/loncnew Tue Sep 2 06:34:47 2003
@@ -2,7 +2,7 @@
# The LearningOnline Network with CAPA
# lonc maintains the connections to remote computers
#
-# $Id: loncnew,v 1.21 2003/08/26 09:19:51 foxr Exp $
+# $Id: loncnew,v 1.22 2003/09/02 10:34:47 foxr Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -45,6 +45,16 @@
# Change log:
# $Log: loncnew,v $
+# Revision 1.22 2003/09/02 10:34:47 foxr
+# - Fix errors in host dead detection logic (too many cases where the
+# retries left were not getting incremented or just not checked).
+# - Added some additional status to the ps axuww display:
+# o Remaining retries on a host.
+# o >>> DEAD <<< indicator if I've given up on a host.
+# - Tested the SIGHUP will reset the retries remaining count (thanks to
+# the above status stuff, and get allow the loncnew to re-try again
+# on the host (thanks to the log).
+#
# Revision 1.21 2003/08/26 09:19:51 foxr
# How embarrassing... put in the SocketTimeout function in loncnew and forgot
# to actually hook it into the LondTransaction. Added this to MakeLondConnection
@@ -90,7 +100,7 @@
# Revision 1.10 2003/06/24 02:46:04 foxr
# Put a limit on the number of times we'll retry a connection.
# Start getting the signal stuff put in as well...note that need to get signals
-# going or else 6the client will permanently give up on dead servers.
+# going or else the client will permanently give up on dead servers.
#
# Revision 1.9 2003/06/13 02:38:43 foxr
# Add logging in 'expected format'
@@ -329,7 +339,9 @@
sub SocketTimeout {
my $Socket = shift;
- KillSocket($Socket);
+ KillSocket($Socket); # A transaction timeout also counts as
+ # a connection failure:
+ $ConnectionRetriesLeft--;
}
=pod
@@ -343,8 +355,12 @@
sub Tick {
my $client;
- ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);
-
+ if($ConnectionRetriesLeft > 0) {
+ ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount
+ ." Retries remaining: ".$ConnectionRetriesLeft);
+ } else {
+ ShowStatus(GetServerHost()." >> DEAD <<");
+ }
# Is it time to prune connection count:
@@ -375,10 +391,16 @@
my $Connections = ($Requests <= $MaxConnectionCount) ?
$Requests : $MaxConnectionCount;
Debug(1,"Work but no connections, start ".$Connections." of them");
+ my $successCount = 0;
for ($i =0; $i < $Connections; $i++) {
- MakeLondConnection();
+ $successCount += MakeLondConnection();
+ }
+ if($successCount == 0) { # All connections failed:
+ Debug(1,"Work in queue failed to make any connectiouns\n");
+ EmptyQueue(); # Fail pending transactions with con_lost.
}
} else {
+ ShowStatus(GetServerHost()." >>> DEAD!!! <<<");
Debug(1,"Work in queue, but gave up on connections..flushing\n");
EmptyQueue(); # Connections can't be established.
}
@@ -632,6 +654,9 @@
Debug(1," Replying con_lost to ".$transaction->getRequest());
StartClientReply($transaction, "con_lost\n");
}
+ if($ConnectionRetriesLeft <= 0) {
+ Log("CRITICAL", "Host marked dead: ".GetServerHost());
+ }
}
@@ -643,6 +668,7 @@
=cut
sub EmptyQueue {
+ $ConnectionRetriesLeft--; # Counts as connection failure too.
while($WorkQueue->Count()) {
my $request = $WorkQueue->dequeue(); # This is a transaction
FailTransaction($request);
@@ -709,7 +735,7 @@
# work queue, the work all gets failed with con_lost.
#
if($ConnectionCount == 0) {
- EmptyQueue;
+ EmptyQueue();
}
}
@@ -799,6 +825,7 @@
}
$Watcher->cancel();
KillSocket($Socket);
+ $ConnectionRetriesLeft--; # Counts as connection failure
return;
}
SocketDump(6,$Socket);
@@ -832,6 +859,10 @@
} elsif ($State eq "Idle") {
# If necessary, complete a transaction and then go into the
# idle queue.
+ # Note that a trasition to idle indicates a live lond
+ # on the other end so reset the connection retries.
+ #
+ $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
$Watcher->cancel();
if(exists($ActiveTransactions{$Socket})) {
Debug(8,"Completing transaction!!");
@@ -1087,7 +1118,7 @@
$ConnectionRetriesLeft--;
return 0; # Failure.
} else {
- $ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
+
# The connection needs to have writability
# monitored in order to send the init sequence
# that starts the whole authentication/key
@@ -1200,9 +1231,14 @@
Debug(8,"Must queue...");
$WorkQueue->enqueue($requestData);
if($ConnectionCount < $MaxConnectionCount) {
- Debug(4,"Starting additional lond connection");
- if(MakeLondConnection() == 0) {
- EmptyQueue(); # Fail transactions, can't make connection.
+ if($ConnectionRetriesLeft > 0) {
+ Debug(4,"Starting additional lond connection");
+ if(MakeLondConnection() == 0) {
+ EmptyQueue(); # Fail transactions, can't make connection.
+ }
+ } else {
+ ShowStatus(GetServerHost()." >>> DEAD !!!! <<<");
+ EmptyQueue(); # It's worse than that ... he's dead Jim.
}
}
} else { # Can start the request:
@@ -1368,6 +1404,8 @@
Child USR1 signal handler to report the most recent status
into the status file.
+We also use this to reset the retries count in order to allow the
+client to retry connections with a previously dead server.
=cut
sub ChildStatus {
my $event = shift;
@@ -1378,6 +1416,7 @@
my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
print $fh $$."\t".$RemoteHost."\t".$Status."\t".
$RecentLogEntry."\n";
+ $ConnectionRetriesLeft = $ConnectionRetries;
}
=pod