[OpenAFS-devel] [PATCH] fix openafs crashes on linux 2.6.10-2.6.12, and all RHEL4
kernels
Christopher Allen Wing
wingc@engin.umich.edu
Wed, 18 Apr 2007 11:07:45 -0400 (EDT)
(following up to my message from last month)
Background: OpenAFS is vulnerable to crashing in the linux kernel symlink
code when running on kernel versions between 2.6.10 to 2.6.12. This also
includes all RHEL4 kernels, because RHEL4 includes the code from 2.6.10.
The problem is that the symlink text caching API, page_follow_link() et
al, is unsuitable for network filesystems where the page cache may be
invalidated in parallel with a path lookup.
This crash can be triggered easily by doing a bunch of path lookups
involving symlinks (e.g., stat() on various files pointed to through
links), while simultaneously running 'fs flushvol' on the volume
containing the symlinks.
The simplest way to fix this problem is to disable the use of symlink text
caching when the kernel does not provide a usable symlink API.
----
My patch last month was buggy, it did not work under load because I copied
and pasted the following code from
src/afs/LINUX/osi_vnodeops.c::afs_linux_follow_link:
AFS_GLOCK();
name = osi_Alloc(PATH_MAX + 1);
...
...
This code (from the former afs_linux_follow_link()) appears to only be
used on linux 2.2 and older kernels. There are two problems here; first,
PATH_MAX is defined to be 4096 on Linux. Therefore on most machines (e.g.
i386 and amd64), osi_Alloc(PATH_MAX+1) wants to allocate more than a
single page of memory. This forces the use of vmalloc() which is
something that should be avoided.
Secondly, the allocation was wrapped within AFS_GLOCK()/AFS_GUNLOCK().
The OpenAFS implementation of osi_Alloc() for the Linux kernel will do one
of two things depending upon the requested allocation size:
if (size <= PAGE_SIZE) {
kmalloc(size, GFP_NOFS);
} else {
vmalloc(size);
}
GFP_NOFS tells the allocator not to recurse back into the filesystem if
it's necessary to free up memory. However, vmalloc() does not have such
an option. Therefore, calling osi_Alloc() to request more than a page of
memory may end up recursing back into AFS to try to free unused inodes or
dentries.
In this case, what happened was that osi_Alloc() is called within an
AFS_GLOCK(); osi_Alloc() calls vmalloc() which tries to free dentry
objects, which then calls back into the AFS module. Unfortunately,
AFS_GLOCK() is already held and we deadlock.
I solved this by:
- limiting symlink text to PATH_MAX instead of PATH_MAX+1 (this is
what OpenAFS already did for 2.4+ kernels anyway via
afs_symlink_filler()). This avoids calling vmalloc().
- removing the AFS_GLOCK()/AFS_GUNLOCK() wrappers around
osi_Alloc(). I don't see any reason for this.
An updated version of the patch (against openafs-stable-1_4_x) is
attached. It should fix the symlink crashes on 2.6.10-2.6.12, as well as
all RHEL4 kernels. No code changes are made unless compiled on one of
these buggy kernels.
You can also download this patch from:
http://www-personal.engin.umich.edu/~wingc/openafs/patches/openafs-stable-1_4_x-20070418-symlink26.patch
-Chris Wing
wingc@engin.umich.edu
diff -uNr openafs.orig/acinclude.m4 openafs/acinclude.m4
--- openafs.orig/acinclude.m4 2007-02-22 16:48:58.000000000 -0500
+++ openafs/acinclude.m4 2007-04-18 10:46:23.000000000 -0400
@@ -606,6 +606,7 @@
LINUX_IOP_I_CREATE_TAKES_NAMEIDATA
LINUX_IOP_I_LOOKUP_TAKES_NAMEIDATA
LINUX_IOP_I_PERMISSION_TAKES_NAMEIDATA
+ LINUX_IOP_I_PUT_LINK_TAKES_COOKIE
LINUX_DOP_D_REVALIDATE_TAKES_NAMEIDATA
LINUX_AOP_WRITEBACK_CONTROL
LINUX_FS_STRUCT_FOP_HAS_FLOCK
@@ -848,6 +849,11 @@
if test "x$ac_cv_linux_exports_tasklist_lock" = "xyes" ; then
AC_DEFINE(EXPORTED_TASKLIST_LOCK, 1, [define if tasklist_lock exported])
fi
+ if test "x$ac_cv_linux_kernel_page_follow_link" = "xyes" -o "x$ac_cv_linux_func_i_put_link_takes_cookie" = "xyes"; then
+ AC_DEFINE(USABLE_KERNEL_PAGE_SYMLINK_CACHE, 1, [define if your kernel has a usable symlink cache API])
+ else
+ AC_MSG_WARN([your kernel does not have a usable symlink cache API])
+ fi
:
fi
esac
diff -uNr openafs.orig/src/afs/LINUX/osi_vnodeops.c openafs/src/afs/LINUX/osi_vnodeops.c
--- openafs.orig/src/afs/LINUX/osi_vnodeops.c 2007-02-20 13:06:24.000000000 -0500
+++ openafs/src/afs/LINUX/osi_vnodeops.c 2007-04-18 10:46:23.000000000 -0400
@@ -1279,7 +1279,7 @@
return -code;
}
-#if !defined(AFS_LINUX24_ENV)
+#if !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
/* afs_linux_readlink
* Fill target (which is in user space) with contents of symlink.
*/
@@ -1299,6 +1299,36 @@
/* afs_linux_follow_link
* a file system dependent link following routine.
*/
+#if defined(AFS_LINUX24_ENV)
+static int afs_linux_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ int code;
+ char *name;
+
+ name = osi_Alloc(PATH_MAX);
+ if (!name) {
+ return -EIO;
+ }
+
+ AFS_GLOCK();
+ code = afs_linux_ireadlink(dentry->d_inode, name, PATH_MAX - 1, AFS_UIOSYS);
+ AFS_GUNLOCK();
+
+ if (code < 0) {
+ goto out;
+ }
+
+ name[code] = '\0';
+ code = vfs_follow_link(nd, name);
+
+out:
+ osi_Free(name, PATH_MAX);
+
+ return code;
+}
+
+#else /* !defined(AFS_LINUX24_ENV) */
+
static struct dentry *
afs_linux_follow_link(struct dentry *dp, struct dentry *basep,
unsigned int follow)
@@ -1332,7 +1362,8 @@
AFS_GUNLOCK();
return res;
}
-#endif
+#endif /* AFS_LINUX24_ENV */
+#endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
/* afs_linux_readpage
* all reads come through here. A strategy-like read call.
@@ -1697,7 +1728,7 @@
/* We really need a separate symlink set of ops, since do_follow_link()
* determines if it _is_ a link by checking if the follow_link op is set.
*/
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
static int
afs_symlink_filler(struct file *file, struct page *page)
{
@@ -1732,10 +1763,10 @@
static struct address_space_operations afs_symlink_aops = {
.readpage = afs_symlink_filler
};
-#endif
+#endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
static struct inode_operations afs_symlink_iops = {
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
.readlink = page_readlink,
#if defined(HAVE_KERNEL_PAGE_FOLLOW_LINK)
.follow_link = page_follow_link,
@@ -1743,13 +1774,17 @@
.follow_link = page_follow_link_light,
.put_link = page_put_link,
#endif
- .setattr = afs_notify_change,
-#else
+#else /* !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE) */
.readlink = afs_linux_readlink,
.follow_link = afs_linux_follow_link,
+#if !defined(AFS_LINUX24_ENV)
.permission = afs_linux_permission,
.revalidate = afs_linux_revalidate,
#endif
+#endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
+#if defined(AFS_LINUX24_ENV)
+ .setattr = afs_notify_change,
+#endif
};
void
@@ -1775,7 +1810,7 @@
} else if (S_ISLNK(ip->i_mode)) {
ip->i_op = &afs_symlink_iops;
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
ip->i_data.a_ops = &afs_symlink_aops;
ip->i_mapping = &ip->i_data;
#endif
diff -uNr openafs.orig/src/cf/linux-test4.m4 openafs/src/cf/linux-test4.m4
--- openafs.orig/src/cf/linux-test4.m4 2007-02-26 12:53:33.000000000 -0500
+++ openafs/src/cf/linux-test4.m4 2007-04-18 10:46:23.000000000 -0400
@@ -644,6 +644,22 @@
AC_MSG_RESULT($ac_cv_linux_func_i_permission_takes_nameidata)])
+AC_DEFUN([LINUX_IOP_I_PUT_LINK_TAKES_COOKIE], [
+ AC_MSG_CHECKING([whether inode_operations.put_link takes an opaque cookie])
+ AC_CACHE_VAL([ac_cv_linux_func_i_put_link_takes_cookie], [
+ AC_TRY_KBUILD(
+[#include <linux/fs.h>
+#include <linux/namei.h>],
+[struct inode _inode;
+struct dentry _dentry;
+struct nameidata _nameidata;
+void *cookie;
+(void)_inode.i_op->put_link(&_dentry, &_nameidata, cookie);],
+ ac_cv_linux_func_i_put_link_takes_cookie=yes,
+ ac_cv_linux_func_i_put_link_takes_cookie=no)])
+ AC_MSG_RESULT($ac_cv_linux_func_i_put_link_takes_cookie)])
+
+
AC_DEFUN([LINUX_DOP_D_REVALIDATE_TAKES_NAMEIDATA], [
AC_MSG_CHECKING([whether dentry_operations.d_revalidate takes a nameidata])
AC_CACHE_VAL([ac_cv_linux_func_d_revalidate_takes_nameidata], [