[OpenAFS-devel] volserver hangs possible fix
Marcus Watts
mdw@umich.edu
Wed, 06 Apr 2005 01:21:51 -0400
At umich.edu, we've apparently been having persistent problems
with volserver "hanging" for some period of time, then spontaneously
recovering. Unfortunately, for a long time that was all I knew.
The other day, however, I had a chance to use gdb on a volserver
while it was in this state, and got this interesting traceback:
#0 0x400fa1f8 in __libc_read () at __libc_read:-1
#1 0x40344dc3 in ?? ()
#2 0x0805e6ae in VDetachVolume_r (ec=0xfffffe00, vp=0x3) at volume.c:1220
#3 0x0804f87e in DeleteTrans (atrans=0x1) at voltrans.c:95
#4 0x0804d170 in VolEndTrans (acid=0xfffffe00, destTrans=-512, rcode=0xfffffe00)
at volprocs.c:1357
#5 0x0804d0d8 in SAFSVolEndTrans (acid=0x80a8cf0, destTrans=3, rcode=0xfffffe00)
at volprocs.c:1340
#6 0x08055d5f in _AFSVolEndTrans (z_call=0x80a8cf0, z_xdrs=0x6e767d77)
at volint.ss.c:205
#7 0x0805920e in AFSVolExecuteRequest (z_call=0x6e767d77) at volint.ss.c:1338
#8 0x08076936 in rxi_ServerProc ()
#9 0x08075970 in rx_ServerProc ()
#10 0x08084193 in Create_Process_Part2 ()
#11 0x08084876 in savecontext ()
#12 0x08083f6f in LWP_WaitProcess ()
#13 0x08076b1b in rx_GetCall ()
#14 0x08076986 in rxi_ServerProc ()
#15 0x08075970 in rx_ServerProc ()
#16 0x08075f7a in rx_StartServer ()
#17 0x0804a6f0 in main (argc=6, argv=0xbfffe614) at volmain.c:391
#18 0x400418be in __libc_start_main (main=0x804a2f0 <main>, argc=6, ubp_av=0x8095f10,
init=0x8088290 <__libc_csu_init>, fini=0x80882c0 <__libc_csu_fini>, rtld_fini=0x1,
stack_end=0x1) at ../sysdeps/generic/libc-start.c:152
The call in VDetachVolume_r at vol/volume.c:1220 reads:
FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0); /* fs can use it again */
so the '??' frame above is FSYNC_askfs.
Here's the top of the stack:
(gdb) x/8x $esp
0x40344d9c: 0x40344dc3 0x08060929 0x00000003 0x40344dc3
0x40344dac: 0x00000001 0x4009fccf 0x40159860 0x0828bd38
Here's what the return address points to:
(gdb) x/i 0x08060929
0x8060929 <FSYNC_askfs+153>: cmp $0x1,%eax
vol/fssync.c:210 reads:
while ((n = read(FS_sd, &response, 1)) != 1) {
so that's where this is stuck. *Nothing* is going to happen in
volserver land until the fileserver comes back with a response,
and that's not good for rx.
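Just to make the failure mode concrete, the volserver side of that exchange
is essentially "write a command down the FSYNC socket, then sit in read()
until the fileserver answers". Here's a rough sketch of that shape -- the
helper name, command argument, and error handling are mine, not the real
FSYNC_askfs interface:

#include <errno.h>
#include <unistd.h>

static int FS_sd;	/* socket to the fileserver's FSYNC_sync thread */

/* sketch only -- illustrates the blocking request/response, nothing more */
static int
fsync_ask_sketch(const void *cmd, size_t cmdlen)
{
    char response;
    ssize_t n;

    if (write(FS_sd, cmd, cmdlen) != (ssize_t) cmdlen)
	return -1;

    /* this is the read() the traceback shows us stuck in; under LWP the
     * whole volserver sits here until the fileserver writes back its
     * one-byte response, so no other rx calls get serviced meanwhile */
    while ((n = read(FS_sd, &response, 1)) != 1) {
	if (n < 0 && errno != EINTR)
	    return -1;
    }
    return response;
}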
Basically, volserver was waiting for the fileserver to say it had succeeded
in putting a volume back online. This should be a relatively fast operation,
but there are several other operations that involve breaking callbacks
on some unrelated volume, and those can be slow. I think I remember people
mumbling about callback logic, clients that had "gone away", and volumes
interfering with each other; that looks like a probable explanation in
this case.
Unfortunately, I don't have a corresponding core dump or traceback for
the fileserver, but I believe it's likely that the "FSYNC_sync" thread on
the fileserver was calling (*V_BreakVolumeCallbacks) as a consequence of
a previous FSYNC_MOVEVOLUME or FSYNC_RESTOREVOLUME call from volserver,
and that for some reason this was taking a long time.
Assuming that's the case, what follows is a possible fix for this.
I haven't tried it out yet, so I don't know how well it will work.
All it does is create a separate thread to handle the actual
callback breaking, plus add a bit of instrumentation to see
if there really are unreasonably big delays in this code.
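To show the structure of the fix apart from the fssync.c specifics, here's a
tiny standalone sketch of the same producer/consumer idea (everything in it
is made up for illustration -- it's not OpenAFS code): one thread queues
volume ids and signals a condition variable, a worker thread drains the
queue and does the slow callback-breaking work, so the thread answering
FSYNC requests never blocks on it. The actual patch follows.

/* standalone illustration only -- names and the fake "work" are made up */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct work {
    struct work *next;
    unsigned int volume;
};

static struct work *head, **tail = &head;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qwait = PTHREAD_COND_INITIALIZER;

static void *
worker(void *arg)
{
    (void) arg;
    for (;;) {
	struct work *w;

	pthread_mutex_lock(&qlock);
	while (!(w = head))		/* sleep until something is queued */
	    pthread_cond_wait(&qwait, &qlock);
	if (!(head = w->next))		/* pop; reset tail if now empty */
	    tail = &head;
	pthread_mutex_unlock(&qlock);

	/* the slow part runs here, off the request path */
	printf("breaking callbacks on volume %u\n", w->volume);
	sleep(1);			/* stand-in for V_BreakVolumeCallbacks */
	free(w);
    }
    return NULL;
}

static void
queue_volume(unsigned int volume)
{
    struct work *w = malloc(sizeof *w);
    if (!w)
	abort();
    w->next = NULL;
    w->volume = volume;
    pthread_mutex_lock(&qlock);
    *tail = w;				/* append at the tail */
    tail = &w->next;
    pthread_cond_signal(&qwait);	/* wake the worker if it's idle */
    pthread_mutex_unlock(&qlock);
}

int
main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, worker, NULL);
    queue_volume(536870918);		/* made-up volume ids */
    queue_volume(536870921);
    sleep(3);				/* give the worker time to drain the queue */
    return 0;
}

Note the worker re-checks the queue while holding the same mutex the
producer signals under, so a wakeup can't slip by between the check and
the wait.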
-Marcus Watts
UM ITCS Umich Systems Group
diff -ru openafs-1.3.80.dist/src/vol/fssync.c openafs-1.3.80/src/vol/fssync.c
--- openafs-1.3.80.dist/src/vol/fssync.c 2004-08-25 03:14:19.000000000 -0400
+++ openafs-1.3.80/src/vol/fssync.c 2005-04-05 21:19:42.000000000 -0400
@@ -135,6 +135,7 @@
/* Forward declarations */
static void FSYNC_sync();
+static void FSYNC_cbthread();
static void FSYNC_newconnection();
static void FSYNC_com();
static void FSYNC_Drop();
@@ -261,11 +262,15 @@
assert(pthread_attr_init(&tattr) == 0);
assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0);
+ assert(pthread_create(&tid, &tattr, FSYNC_cbthread, NULL) == 0);
#else /* AFS_PTHREAD_ENV */
PROCESS pid;
assert(LWP_CreateProcess
(FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
"FSYNC_sync", &pid) == LWP_SUCCESS);
+ assert(LWP_CreateProcess
+ (FSYNC_cbthread, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
+ "FSYNC_cbthread", &pid) == LWP_SUCCESS);
#endif /* AFS_PTHREAD_ENV */
}
@@ -286,6 +291,90 @@
return sd;
}
+/* handle callbacks in a separate thread so that
+ * volserver doesn't hang in a hard loop waiting for this
+ * to complete. Might need to be smarter about scheduling
+ * this someday???
+ */
+
+static struct fscb {
+ struct fscb *next;
+ unsigned int queued;
+ VolumeId volume;
+} *fscblist, **fscbnext = &fscblist;
+
+#ifdef AFS_PTHREAD_ENV
+pthread_mutex_t fssync_cblist_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t fssync_cblist_cond = PTHREAD_COND_INITIALIZER;
+#endif /* AFS_PTHREAD_ENV */
+
+static void
+FSYNC_cbthread()
+{
+ struct fscb *fp;
+ unsigned int started, completed;
+#ifdef AFS_PTHREAD_ENV
+ int tid;
+#endif
+#ifdef AFS_PTHREAD_ENV
+ /* set our 'thread-id' so that the host hold table works */
+ MUTEX_ENTER(&rx_stats_mutex); /* protects rxi_pthread_hinum */
+ tid = ++rxi_pthread_hinum;
+ MUTEX_EXIT(&rx_stats_mutex);
+ pthread_setspecific(rx_thread_id_key, (void *)tid);
+ Log("Set thread id %d for FSYNC_sync\n", tid);
+#endif /* AFS_PTHREAD_ENV */
+ for (;;) {
+ VATTACH_LOCK;
+ VOL_LOCK;
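+ /* pop the head of the callback list; if the list is now empty, reset the tail pointer */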
+ if (!(fp = fscblist))
+ ;
+ else if ((fscblist = fp->next))
+ ;
+ else fscbnext = &fscblist;
+ VOL_UNLOCK;
+ VATTACH_UNLOCK;
+ if (!fp) {
+#ifdef AFS_PTHREAD_ENV
+ /* hold the mutex across the wait; re-check the list under it so a
+ * broadcast from FSYNC_add_cblist can't slip by unnoticed */
+ assert(pthread_mutex_lock(&fssync_cblist_mutex) == 0);
+ if (!fscblist)
+ pthread_cond_wait(&fssync_cblist_cond, &fssync_cblist_mutex);
+ assert(pthread_mutex_unlock(&fssync_cblist_mutex) == 0);
+#else /* AFS_PTHREAD_ENV */
+ LWP_WaitProcess((void*) FSYNC_cbthread);
+#endif /* AFS_PTHREAD_ENV */
+ continue;
+ }
+ started = FT_ApproxTime();
+ Log("fssync: breaking all callbacks on volume %u\n",
+ fp->volume);
+ (*V_BreakVolumeCallbacks) (fp->volume);
+ completed = FT_ApproxTime();
+ if (fp->queued != started || started != completed) {
+ Log("fssync: delay to break callbacks on volume %u; waited=%d took=%d now=%d\n",
+ fp->volume, started-fp->queued, completed-started, completed);
+ }
+ free(fp);
+ }
+}
+
+/* call with VATTACH_LOCK and VOL_LOCK held */
+static void FSYNC_add_cblist(VolumeId volume)
+{
+ struct fscb *fp, *first;
+ fp = (struct fscb *) malloc(sizeof *fp);
+ fp->next = 0;
+ fp->volume = volume;
+ fp->queued = FT_ApproxTime();
+ first = fscblist;
+ *fscbnext = fp;
+ fscbnext = &fp->next;
+ if (!first) {
+#ifdef AFS_PTHREAD_ENV
+ /* broadcast while holding the list mutex so the waiter in
+ * FSYNC_cbthread can't miss it */
+ assert(pthread_mutex_lock(&fssync_cblist_mutex) == 0);
+ assert(pthread_cond_broadcast(&fssync_cblist_cond) == 0);
+ assert(pthread_mutex_unlock(&fssync_cblist_mutex) == 0);
+#else /* AFS_PTHREAD_ENV */
+ LWP_NoYieldSignal((void*) FSYNC_cbthread);
+#endif /* AFS_PTHREAD_ENV */
+ }
+}
+
static fd_set FSYNC_readfds;
static void
@@ -567,25 +656,17 @@
}
if (V_BreakVolumeCallbacks) {
- Log("fssync: volume %u moved to %x; breaking all call backs\n",
+ Log("fssync: volume %u moved to %x\n",
command.volume, command.reason);
- VOL_UNLOCK;
- VATTACH_UNLOCK;
- (*V_BreakVolumeCallbacks) (command.volume);
- VATTACH_LOCK;
- VOL_LOCK;
+ FSYNC_add_cblist (command.volume);
}
break;
case FSYNC_RESTOREVOLUME:
/* if the volume is being restored, break all callbacks on it */
if (V_BreakVolumeCallbacks) {
- Log("fssync: volume %u restored; breaking all call backs\n",
+ Log("fssync: volume %u restored or deleted\n",
command.volume);
- VOL_UNLOCK;
- VATTACH_UNLOCK;
- (*V_BreakVolumeCallbacks) (command.volume);
- VATTACH_LOCK;
- VOL_LOCK;
+ FSYNC_add_cblist (command.volume);
}
break;
default: