[LON-CAPA-cvs] cvs: loncom / loncnew

foxr lon-capa-cvs@mail.lon-capa.org
Tue, 02 Sep 2003 10:34:47 -0000


foxr		Tue Sep  2 06:34:47 2003 EDT

  Modified files:              
    /loncom	loncnew 
  Log:
  - Fix errors in host dead detection logic (too many cases where the
    retries left were not getting decremented or just not checked).
  - Added some additional status to the ps axuww display:
    o Remaining retries on a host.
    o >>> DEAD <<< indicator if I've given up on a host.
  - Tested that SIGHUP will reset the retries-remaining count (thanks to
    the above status stuff) and allow loncnew to retry connections
    to the host (thanks to the log).
  
  
  
Index: loncom/loncnew
diff -u loncom/loncnew:1.21 loncom/loncnew:1.22
--- loncom/loncnew:1.21	Tue Aug 26 05:19:51 2003
+++ loncom/loncnew	Tue Sep  2 06:34:47 2003
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.21 2003/08/26 09:19:51 foxr Exp $
+# $Id: loncnew,v 1.22 2003/09/02 10:34:47 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -45,6 +45,16 @@
 
 # Change log:
 #    $Log: loncnew,v $
+#    Revision 1.22  2003/09/02 10:34:47  foxr
+#    - Fix errors in host dead detection logic (too many cases where the
+#      retries left were not getting incremented or just not checked).
+#    - Added some additional status to the ps axuww display:
+#      o Remaining retries on a host.
+#      o >>> DEAD <<< indicator if I've given up on a host.
+#    - Tested the SIGHUP will reset the retries remaining count (thanks to
+#      the above status stuff, and get allow the loncnew to re-try again
+#      on the host (thanks to the log).
+#
 #    Revision 1.21  2003/08/26 09:19:51  foxr
 #    How embarrassing... put in the SocketTimeout function in loncnew and forgot
 #    to actually hook it into the LondTransaction.  Added this to MakeLondConnection
@@ -90,7 +100,7 @@
 #    Revision 1.10  2003/06/24 02:46:04  foxr
 #    Put a limit on  the number of times we'll retry a connection.
 #    Start getting the signal stuff put in as well...note that need to get signals
-#    going or else 6the client will permanently give up on dead servers.
+#    going or else the client will permanently give up on dead servers.
 #
 #    Revision 1.9  2003/06/13 02:38:43  foxr
 #    Add logging in 'expected format'
@@ -329,7 +339,9 @@
 sub SocketTimeout {
     my $Socket = shift;
     
-    KillSocket($Socket);
+    KillSocket($Socket);	# A transaction timeout also counts as
+                                # a connection failure:
+    $ConnectionRetriesLeft--;
 }
 
 =pod
@@ -343,8 +355,12 @@
 
 sub Tick {
     my $client;
-    ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);
-
+    if($ConnectionRetriesLeft > 0) {
+	ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount
+		   ." Retries remaining: ".$ConnectionRetriesLeft);
+    } else {
+	ShowStatus(GetServerHost()." >> DEAD <<");
+    }
     # Is it time to prune connection count:
 
 
@@ -375,10 +391,16 @@
 	    my $Connections = ($Requests <= $MaxConnectionCount) ?
 		$Requests : $MaxConnectionCount;
 	    Debug(1,"Work but no connections, start ".$Connections." of them");
+	    my $successCount = 0;
 	    for ($i =0; $i < $Connections; $i++) {
-		MakeLondConnection();
+		$successCount += MakeLondConnection();
+	    }
+	    if($successCount == 0) { # All connections failed:
+		Debug(1,"Work in queue failed to make any connectiouns\n");
+		EmptyQueue();	# Fail pending transactions with con_lost.
 	    }
 	} else {
+	    ShowStatus(GetServerHost()." >>> DEAD!!! <<<");
 	    Debug(1,"Work in queue, but gave up on connections..flushing\n");
 	    EmptyQueue();	# Connections can't be established.
 	}
@@ -632,6 +654,9 @@
 	Debug(1," Replying con_lost to ".$transaction->getRequest());
 	StartClientReply($transaction, "con_lost\n");
     }
+    if($ConnectionRetriesLeft <= 0) {
+	Log("CRITICAL", "Host marked dead: ".GetServerHost());
+    }
 
 }
 
@@ -643,6 +668,7 @@
 
 =cut
 sub EmptyQueue {
+    $ConnectionRetriesLeft--;	# Counts as connection failure too.
     while($WorkQueue->Count()) {
 	my $request = $WorkQueue->dequeue(); # This is a transaction
 	FailTransaction($request);
@@ -709,7 +735,7 @@
     #  work queue, the work all gets failed with con_lost.
     #
     if($ConnectionCount == 0) {
-	EmptyQueue;
+	EmptyQueue();
     }
 }
 
@@ -799,6 +825,7 @@
 	}
 	$Watcher->cancel();
 	KillSocket($Socket);
+	$ConnectionRetriesLeft--;       # Counts as connection failure
 	return;
     }
     SocketDump(6,$Socket);
@@ -832,6 +859,10 @@
     } elsif ($State eq "Idle") {
 	# If necessary, complete a transaction and then go into the
 	# idle queue.
+	#  Note that a trasition to idle indicates a live lond
+	# on the other end so reset the connection retries.
+	#
+	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
 	$Watcher->cancel();
 	if(exists($ActiveTransactions{$Socket})) {
 	    Debug(8,"Completing transaction!!");
@@ -1087,7 +1118,7 @@
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
     }  else {
-	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
+
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
 	# that starts the whole authentication/key
@@ -1200,9 +1231,14 @@
 	Debug(8,"Must queue...");
 	$WorkQueue->enqueue($requestData);
 	if($ConnectionCount < $MaxConnectionCount) {
-	    Debug(4,"Starting additional lond connection");
-	    if(MakeLondConnection() == 0) {
-		EmptyQueue();	# Fail transactions, can't make connection.
+	    if($ConnectionRetriesLeft > 0) {
+		Debug(4,"Starting additional lond connection");
+		if(MakeLondConnection() == 0) {
+		    EmptyQueue();	# Fail transactions, can't make connection.
+		}
+	    } else {
+		ShowStatus(GetServerHost()." >>> DEAD !!!! <<<");
+		EmptyQueue();	# It's worse than that ... he's dead Jim.
 	    }
 	}
     } else {			# Can start the request:
@@ -1368,6 +1404,8 @@
 Child USR1 signal handler to report the most recent status
 into the status file.
 
+We also use this to reset the retries count in order to allow the
+client to retry connections with a previously dead server.
 =cut
 sub ChildStatus {
     my $event = shift;
@@ -1378,6 +1416,7 @@
     my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
     print $fh $$."\t".$RemoteHost."\t".$Status."\t".
 	$RecentLogEntry."\n";
+    $ConnectionRetriesLeft = $ConnectionRetries;
 }
 
 =pod