[OpenAFS] OpenAFS 1.2.13: ever increasing number of fileserver connections - h_Hold leak

Rainer Toebbicke rtb@pclella.cern.ch
Fri, 07 Jan 2005 12:17:15 +0100


This is a multi-part message in MIME format.
--------------000202080608020503010204
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Rainer Toebbicke wrote:
> Rainer Toebbicke wrote:
> 
>> Something's wrong with OpenAFS 1.2.13 as on several busy servers we 
>> see an ever-increasing number of host/client connections.
>>

The attached patch (against 1.2.13, but modulo line numbers also applicable to 
1.3.73 and later) fixes an h_Hold table leak in GetSomeSpace_r:

lih_r now leaves the current lih_host held, and h_Releases those for 
which it changed its mind during h_Enumerate. It also closes the race window 
that ClearHostCallbacks_r opens (lih_host_held is now copied to a local 
before H_LOCK is given up).

The problem was that, because of the leak in the h_Hold table, rx connections 
and host and client structures stopped being garbage collected once the 
fileserver went through GetSomeSpace_r. This is only relevant for "busy" 
servers; many would never even invoke this routine. The problem did not 
appear before 1.2.11, as the delta that caused it was not yet in.

Tested on two servers so far.

-- 
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rainer Toebbicke
European Laboratory for Particle Physics(CERN) - Geneva, Switzerland
Phone: +41 22 767 8985       Fax: +41 22 767 7155

--------------000202080608020503010204
Content-Type: text/plain;
 name="patch_lih_leak"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="patch_lih_leak"

*** openafs/src/viced/callback.c.orig	2004-11-10 11:31:37.000000000 +0100
--- openafs/src/viced/callback.c	2005-01-05 16:42:05.000000000 +0100
***************
*** 1394,1415 ****
  
  
  static struct host *lih_host;
! static int lih_host_held = 0;
  
  static int lih_r(host, held, hostp)
      register struct host *host, *hostp;
      register int held;
  
  {
-     lih_host_held = 0;
      if (host->cblist
  	   && ((hostp && host != hostp) || (!held && !h_OtherHolds_r(host)))
             && (!lih_host || host->ActiveCall < lih_host->ActiveCall) ) {
  	lih_host = host;
!     }
!     if (!held) {
!         held = 1;
!         lih_host_held = 1;
      }
      return held;
  
--- 1394,1415 ----
  
  
  static struct host *lih_host;
! static int lih_host_held;
  
  static int lih_r(host, held, hostp)
      register struct host *host, *hostp;
      register int held;
  
  {
      if (host->cblist
  	   && ((hostp && host != hostp) || (!held && !h_OtherHolds_r(host)))
             && (!lih_host || host->ActiveCall < lih_host->ActiveCall) ) {
+ 	if (lih_host != NULL && lih_host_held) {
+ 	    h_Release_r(lih_host);
+ 	}
  	lih_host = host;
! 	lih_host_held = !held;
! 	held = 1;
      }
      return held;
  
***************
*** 1438,1450 ****
  	h_Enumerate_r(lih_r, hp2, (char *)hp1);
  	hp = lih_host;
  	if (hp) {
  	    cbstuff.GSS4++;
              if (!ClearHostCallbacks_r(hp, 0 /* not locked or held */ )) {
!                 if (lih_host_held) 
                      h_Release_r(hp);
                  return 0;
              }
!             if (lih_host_held) 
                  h_Release_r(hp);
  	    hp2 = hp->next;
  	} else {
--- 1438,1451 ----
  	h_Enumerate_r(lih_r, hp2, (char *)hp1);
  	hp = lih_host;
  	if (hp) {
+ 	    int lih_host_held2=lih_host_held;	/* set in lih_r! private copy before giving up H_LOCK */
  	    cbstuff.GSS4++;
              if (!ClearHostCallbacks_r(hp, 0 /* not locked or held */ )) {
!                 if (lih_host_held2) 
                      h_Release_r(hp);
                  return 0;
              }
!             if (lih_host_held2) 
                  h_Release_r(hp);
  	    hp2 = hp->next;
  	} else {

--------------000202080608020503010204--