[OpenAFS] server process hang on salvage attempt
Jeffrey Hutzelman
jhutz@cmu.edu
Fri, 02 Jul 2004 14:08:57 -0400
On Friday, July 02, 2004 13:44:52 -0400 Joseph H Vilas <jhv@oit.duke.edu>
wrote:
> Then it hangs. The volserver will answer a vos status request, but
> any request for real information (like vos listvol) will not return.
> It looks like the fssync thread in the fileserver is just not coming
> back.
Yup. It's trying to select on a connection from something (a salvager,
perhaps), only the file descriptor it wants to select on doesn't actually
fit in an fd_set because on Solaris 9, FD_SETSIZE is smaller than the
default hard file descriptor limit.
You need the following patch:
diff -ru openafs-1.2.10fs/src/lwp/iomgr.c openafs-1.2.10fs2/src/lwp/iomgr.c
--- openafs-1.2.10fs/src/lwp/iomgr.c Tue Aug 7 20:03:52 2001
+++ openafs-1.2.10fs2/src/lwp/iomgr.c Mon Jun 14 17:30:40 2004
@@ -21,6 +21,18 @@
IO Manager routines & server process for VICE server.
*/
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform. Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+/* We don't do this on Windows because on that platform there is code
+ * which allocates fd_set's on the stack (IOMGR_Sleep on Win9x, and
+ * FDSetAnd on WinNT) */
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
#include <afsconfig.h>
#include <afs/param.h>
@@ -177,7 +189,7 @@
/* fd_set pool managment.
* Use the pool instead of creating fd_set's on the stack. fd_set's can be
- * 2K in size, so making three could put 6K in the limited space of an LWP
+ * 8K in size, so making three could put 24K in the limited space of an LWP
* stack.
*/
struct IOMGR_fd_set {
diff -ru openafs-1.2.10fs/src/lwp/lwp.c openafs-1.2.10fs2/src/lwp/lwp.c
--- openafs-1.2.10fs/src/lwp/lwp.c Tue Aug 7 20:03:52 2001
+++ openafs-1.2.10fs2/src/lwp/lwp.c Fri Jun 18 13:48:10 2004
@@ -107,7 +107,7 @@
struct QUEUE {
PROCESS head;
int count;
-} runnable[MAX_PRIORITIES], blocked;
+} runnable[MAX_PRIORITIES], blocked, qwaiting;
/* Invariant for runnable queues: The head of each queue points to the
currently running process if it is in that queue, or it points to the next
process in that queue that should run. */
/* Offset of stack field within pcb -- used by stack checking stuff */
@@ -222,7 +222,7 @@
int LWP_QWait()
{register PROCESS tp;
(tp=lwp_cpptr) -> status = QWAITING;
- lwp_remove(tp, &runnable[tp->priority]);
+ move(tp, &runnable[tp->priority], &qwaiting);
Set_LWP_RC();
return LWP_SUCCESS;
}
@@ -231,7 +231,7 @@
register PROCESS pid; {
if (pid->status == QWAITING) {
pid->status = READY;
- insert(pid, &runnable[pid->priority]);
+ move(pid, &qwaiting, &runnable[pid->priority]);
return LWP_SUCCESS;
}
else return LWP_ENOWAIT;
@@ -537,6 +537,7 @@
Dump_One_Process(x);
})
for_all_elts(x, blocked, { Dump_One_Process(x); })
+ for_all_elts(x, qwaiting, { Dump_One_Process(x); })
} else
printf("***LWP: LWP support not initialized\n");
return 0;
@@ -577,6 +578,8 @@
}
blocked.head = NULL;
blocked.count = 0;
+ qwaiting.head = NULL;
+ qwaiting.count = 0;
lwp_init = (struct lwp_ctl *) malloc(sizeof(struct lwp_ctl));
temp = (PROCESS) malloc(sizeof(struct lwp_pcb));
if (lwp_init == NULL || temp == NULL)
@@ -628,6 +631,7 @@
for (i=0; i<MAX_PRIORITIES; i++)
for_all_elts(cur, runnable[i], { Free_PCB(cur); })
for_all_elts(cur, blocked, { Free_PCB(cur); })
+ for_all_elts(cur, qwaiting, { Free_PCB(cur); })
free(lwp_init);
lwp_init = NULL;
return LWP_SUCCESS;
@@ -748,6 +752,7 @@
Debug(4, ("Entered Delete_PCB"))
lwp_remove(pid, (pid->blockflag || pid->status==WAITING ||
pid->status==DESTROYED
? &blocked
+ : (pid->status == QWAITING) ? &qwaiting
: &runnable[pid->priority]));
LWPANCHOR.processcnt--;
return 0;
@@ -768,6 +773,7 @@
case READY: printf("READY"); break;
case WAITING: printf("WAITING"); break;
case DESTROYED: printf("DESTROYED"); break;
+ case QWAITING: printf("QWAITING"); break;
default: printf("unknown");
}
putchar('\n');
@@ -818,6 +824,11 @@
}
printf("[Blocked (%d):", blocked.count);
for_all_elts(p, blocked, {
+ printf(" \"%s\"", p->name);
+ })
+ puts("]");
+ printf("[QWaiting (%d):", qwaiting.count);
+ for_all_elts(p, qwaiting, {
printf(" \"%s\"", p->name);
})
puts("]");
diff -ru openafs-1.2.10fs/src/rx/rx_globals.c
openafs-1.2.10fs2/src/rx/rx_globals.c
--- openafs-1.2.10fs/src/rx/rx_globals.c Thu Jul 12 15:58:56 2001
+++ openafs-1.2.10fs2/src/rx/rx_globals.c Fri Jun 18 13:42:42 2004
@@ -9,6 +9,13 @@
/* RX: Globals for internal use, basically */
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform. Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#define FD_SETSIZE 65536
+
#include <afsconfig.h>
#ifdef KERNEL
#include "../afs/param.h"
diff -ru openafs-1.2.10fs/src/rx/rx_lwp.c openafs-1.2.10fs2/src/rx/rx_lwp.c
--- openafs-1.2.10fs/src/rx/rx_lwp.c Wed Jul 31 18:36:11 2002
+++ openafs-1.2.10fs2/src/rx/rx_lwp.c Mon Jun 14 11:53:47 2004
@@ -9,6 +9,13 @@
/* rx_user.c contains routines specific to the user space UNIX
implementation of rx */
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform. Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#define FD_SETSIZE 65536
+
#include <afsconfig.h>
#include <afs/param.h>
diff -ru openafs-1.2.10fs/src/vol/fssync.c
openafs-1.2.10fs2/src/vol/fssync.c
--- openafs-1.2.10fs/src/vol/fssync.c Mon Jan 5 15:05:48 2004
+++ openafs-1.2.10fs2/src/vol/fssync.c Mon Jun 14 11:54:01 2004
@@ -21,6 +21,7 @@
#endif
static int newVLDB = 1;
+
#ifndef AFS_PTHREAD_ENV
#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
@@ -35,6 +36,14 @@
fsync.c
File server synchronization with external volume utilities.
*/
+
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform. Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#define FD_SETSIZE 65536
+
#include <afsconfig.h>
#include <afs/param.h>
@@ -246,6 +255,7 @@
return sd;
}
+static fd_set FSYNC_readfds;
static void FSYNC_sync() {
struct sockaddr_in addr;
int on = 1;
@@ -293,18 +303,17 @@
InitHandler();
AcceptOn();
for(;;) {
- fd_set readfds;
int maxfd;
- GetHandler(&readfds, &maxfd);
+ GetHandler(&FSYNC_readfds, &maxfd);
/* Note: check for >= 1 below is essential since IOMGR_select
* doesn't have exactly same semantics as select.
*/
#ifdef AFS_PTHREAD_ENV
- if (select(maxfd+1, &readfds, NULL, NULL, NULL) >= 1)
+ if (select(maxfd+1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
#else /* AFS_PTHREAD_ENV */
- if (IOMGR_Select(maxfd+1, &readfds, NULL, NULL, NULL) >= 1)
+ if (IOMGR_Select(maxfd+1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
#endif /* AFS_PTHREAD_ENV */
- CallHandler(&readfds);
+ CallHandler(&FSYNC_readfds);
}
}