[OpenAFS-devel] volserver hangs possible fix

Marcus Watts mdw@umich.edu
Wed, 06 Apr 2005 01:21:51 -0400


At umich.edu, we've apparently been having persistant problems
with volserver "hanging" for some period of time, then spontaneously
recovering.  Unfortunately, that was the most I knew for the longest
time.  The other day, however, I had a chance to use gdb on a volserver
while it was in this state, and managed to get this interesting
traceback:
#0  0x400fa1f8 in __libc_read () at __libc_read:-1
#1  0x40344dc3 in ?? ()
#2  0x0805e6ae in VDetachVolume_r (ec=0xfffffe00, vp=0x3) at volume.c:1220
#3  0x0804f87e in DeleteTrans (atrans=0x1) at voltrans.c:95
#4  0x0804d170 in VolEndTrans (acid=0xfffffe00, destTrans=-512, rcode=0xfffffe00)
    at volprocs.c:1357
#5  0x0804d0d8 in SAFSVolEndTrans (acid=0x80a8cf0, destTrans=3, rcode=0xfffffe00)
    at volprocs.c:1340
#6  0x08055d5f in _AFSVolEndTrans (z_call=0x80a8cf0, z_xdrs=0x6e767d77)
    at volint.ss.c:205
#7  0x0805920e in AFSVolExecuteRequest (z_call=0x6e767d77) at volint.ss.c:1338
#8  0x08076936 in rxi_ServerProc ()
#9  0x08075970 in rx_ServerProc ()
#10 0x08084193 in Create_Process_Part2 ()
#11 0x08084876 in savecontext ()
#12 0x08083f6f in LWP_WaitProcess ()
#13 0x08076b1b in rx_GetCall ()
#14 0x08076986 in rxi_ServerProc ()
#15 0x08075970 in rx_ServerProc ()
#16 0x08075f7a in rx_StartServer ()
#17 0x0804a6f0 in main (argc=6, argv=0xbfffe614) at volmain.c:391
#18 0x400418be in __libc_start_main (main=0x804a2f0 <main>, argc=6, ubp_av=0x8095f10, 
    init=0x8088290 <__libc_csu_init>, fini=0x80882c0 <__libc_csu_fini>, rtld_fini=0x1, 
    stack_end=0x1) at ../sysdeps/generic/libc-start.c:152

The call in VDetachVolume_r reads:
vol/volume.c:1220 reads:
            FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0);     /* fs can use it again */
so the '??' above means FSYNC_askfs:

Here's the top of the stack:
(gdb) x/8x $esp
0x40344d9c:     0x40344dc3      0x08060929      0x00000003      0x40344dc3
0x40344dac:     0x00000001      0x4009fccf      0x40159860      0x0828bd38

Here's what the return address points to,
(gdb) x/i 0x08060929
0x8060929 <FSYNC_askfs+153>:    cmp    $0x1,%eax

vol/fssync.c:210 reads:
    while ((n = read(FS_sd, &response, 1)) != 1) {

so that's where this is stuck.  *Nothing* is going to happen in
volserver land until the fileserver comes back with a response,
and that's not good for rx.

Basically, volserver was waiting for the fileserver to say it had succeeded
in putting a volume online again.  This (should) be a relatively fast process,
but there are several other operations that involve breaking callbacks
on some unrelated volume, which could be slow.  I think I remember people mumbling
stuff about callback logic, clients that had "gone away", & volumes
interferring with each other, looks like that's a probable explanation in this case.
Unfortunately, I don't have a corresponding core dump or traceback for
the fileserver, but I believe it's likely that the "FSYNC_sync" thread on
the fileserver was calling (*V_BreakVolumeCallbacks) as a consequence of
a previous FSYNC_MOVEVOLUME or FSYNC_RESTOREVOLUME call from volserver,
and that for some reason this was taking a long time.

Assuming that's the case, what follows is a possible fix for this.
I haven't tried this out yet, so don't know how well it will do.
All it does though is create a separate thread to handle the
actual callback logic, plus a bit of instrumentation to see
if there really are unreasonable big fat delays in this code.

			-Marcus Watts
			UM ITCS Umich Systems Group

diff -ru openafs-1.3.80.dist/src/vol/fssync.c openafs-1.3.80/src/vol/fssync.c
--- openafs-1.3.80.dist/src/vol/fssync.c	2004-08-25 03:14:19.000000000 -0400
+++ openafs-1.3.80/src/vol/fssync.c	2005-04-05 21:19:42.000000000 -0400
@@ -135,6 +135,7 @@
 
 /* Forward declarations */
 static void FSYNC_sync();
+static void FSYNC_cbthread();
 static void FSYNC_newconnection();
 static void FSYNC_com();
 static void FSYNC_Drop();
@@ -261,11 +262,15 @@
     assert(pthread_attr_init(&tattr) == 0);
     assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
     assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0);
+    assert(pthread_create(&tid, &tattr, FSYNC_cbthread, NULL) == 0);
 #else /* AFS_PTHREAD_ENV */
     PROCESS pid;
     assert(LWP_CreateProcess
 	   (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
 	    "FSYNC_sync", &pid) == LWP_SUCCESS);
+    assert(LWP_CreateProcess
+	   (FSYNC_cbthread, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
+	    "FSYNC_cbthread", &pid) == LWP_SUCCESS);
 #endif /* AFS_PTHREAD_ENV */
 }
 
@@ -286,6 +291,90 @@
     return sd;
 }
 
+/* handle callbacks in a separate thread so that
+ * volserver doesn't hang in a hard loop waiting for this
+ * to complete.  Might need to be smarter about scheduling
+ * this someday???
+ */
+
+static struct fscb {
+    struct fscb *next;
+    unsigned int queued;
+    VolumeId volume;
+} *fscblist, **fscbnext = &fscblist;
+
+#ifdef AFS_PTHREAD_ENV
+pthread_mutex_t fssync_cblist_mutex;
+pthread_cond_t fssync_cblist_cond;
+#endif /* AFS_PTHREAD_ENV */
+
+static void
+FSYNC_cbthread()
+{
+    struct fscb *fp;
+    unsigned int started, completed;
+#ifdef AFS_PTHREAD_ENV
+    int tid;
+#endif
+#ifdef AFS_PTHREAD_ENV
+    /* set our 'thread-id' so that the host hold table works */
+    MUTEX_ENTER(&rx_stats_mutex);	/* protects rxi_pthread_hinum */
+    tid = ++rxi_pthread_hinum;
+    MUTEX_EXIT(&rx_stats_mutex);
+    pthread_setspecific(rx_thread_id_key, (void *)tid);
+    Log("Set thread id %d for FSYNC_sync\n", tid);
+#endif /* AFS_PTHREAD_ENV */
+    for (;;) {
+	VATTACH_LOCK;
+	VOL_LOCK;
+	if (!(fp = fscblist))
+	    ;
+	else if ((fscblist = fp->next))
+	    ;
+	else fscbnext = &fscblist;
+	VOL_UNLOCK;
+	VATTACH_UNLOCK;
+	if (!fp) {
+#ifdef AFS_PTHREAD_ENV
+	    pthread_cond_wait(&fssync_cblist_cond, &fssync_cblist_mutex);
+#else /* AFS_PTHREAD_ENV */
+	    LWP_WaitProcess((void*) FSYNC_cbthread);
+#endif /* AFS_PTHREAD_ENV */
+	    continue;
+	}
+        started = FT_ApproxTime();
+	Log("fssync: breaking all callbacks on volume %u\n",
+	    fp->volume);
+	(*V_BreakVolumeCallbacks) (fp->volume);
+        completed = FT_ApproxTime();
+	if (fp->queued != started || started != completed) {
+	    Log("fssync: delay to break callbacks on volume %u; waited=%d took=%d now=%d\n",
+		fp->volume, started-fp->queued, completed-started, completed);
+	}
+	free(fp);
+    }
+}
+
+/* call with VATTACH_LOCK and VOL_LOCK held */
+static void FSYNC_add_cblist(VolumeId volume)
+{
+    struct fscb *fp, *first;
+    fp = (struct fscb *) malloc(sizeof *fp);
+    fp->next = 0;
+    fp->volume = volume;
+    fp->queued = FT_ApproxTime();
+    first = fscblist;
+    *fscbnext = fp;
+    fscbnext = &fp->next;
+    if (!first) {
+#ifdef AFS_PTHREAD_ENV
+	assert(pthread_cond_broadcast(&fssync_cblist_cond) == 0);
+#else /* AFS_PTHREAD_ENV */
+	LWP_NoYieldSignal((void*) FSYNC_cbthread);
+#endif /* AFS_PTHREAD_ENV */
+    }
+}
+
 static fd_set FSYNC_readfds;
 
 static void
@@ -567,25 +656,17 @@
 	}
 
 	if (V_BreakVolumeCallbacks) {
-	    Log("fssync: volume %u moved to %x; breaking all call backs\n",
+	    Log("fssync: volume %u moved to %x\n",
 		command.volume, command.reason);
-	    VOL_UNLOCK;
-	    VATTACH_UNLOCK;
-	    (*V_BreakVolumeCallbacks) (command.volume);
-	    VATTACH_LOCK;
-	    VOL_LOCK;
+	    FSYNC_add_cblist (command.volume);
 	}
 	break;
     case FSYNC_RESTOREVOLUME:
 	/* if the volume is being restored, break all callbacks on it */
 	if (V_BreakVolumeCallbacks) {
-	    Log("fssync: volume %u restored; breaking all call backs\n",
+	    Log("fssync: volume %u restored or deleted\n",
 		command.volume);
-	    VOL_UNLOCK;
-	    VATTACH_UNLOCK;
-	    (*V_BreakVolumeCallbacks) (command.volume);
-	    VATTACH_LOCK;
-	    VOL_LOCK;
+	    FSYNC_add_cblist (command.volume);
 	}
 	break;
     default: