[OpenAFS-devel] [PATCH] fix openafs crashes on linux 2.6.10-2.6.12, and all RHEL4 kernels

Christopher Allen Wing wingc@engin.umich.edu
Wed, 18 Apr 2007 11:07:45 -0400 (EDT)


(following up to my message from last month)

Background: OpenAFS is vulnerable to crashing in the linux kernel symlink 
code when running on kernel versions between 2.6.10 to 2.6.12.  This also 
includes all RHEL4 kernels, because RHEL4 includes the code from 2.6.10. 
The problem is that the symlink text caching API, page_follow_link() et 
al, is unsuitable for network filesystems where the page cache may be 
invalidated in parallel with a path lookup.

This crash can be triggered easily by doing a bunch of path lookups 
involving symlinks (e.g., stat() on various files pointed to through 
links), while simultaneously running 'fs flushvol' on the volume 
containing the symlinks.

The simplest way to fix this problem is to disable the use of symlink text 
caching when the kernel does not provide a usable symlink API.

----


My patch last month was buggy, it did not work under load because I copied 
and pasted the following code from
src/afs/LINUX/osi_vnodeops.c::afs_linux_follow_link:

 	AFS_GLOCK();
 	name = osi_Alloc(PATH_MAX + 1);
 	...
 	...


This code (from the former afs_linux_follow_link()) appears to only be 
used on linux 2.2 and older kernels.  There are two problems here; first, 
PATH_MAX is defined to be 4096 on Linux.  Therefore on most machines (e.g. 
i386 and amd64), osi_Alloc(PATH_MAX+1) wants to allocate more than a 
single page of memory.  This forces the use of vmalloc() which is 
something that should be avoided.

Secondly, the allocation was wrapped within AFS_GLOCK()/AFS_GUNLOCK(). 
The OpenAFS implementation of osi_Alloc() for the Linux kernel will do one 
of two things depending upon the requested allocation size:

 	if (size <= PAGE_SIZE) {
 		kmalloc(size, GFP_NOFS);
 	} else {
 		vmalloc(size);
 	}

GFP_NOFS tells the allocator not to recurse back into the filesystem if 
it's necessary to free up memory.  However, vmalloc() does not have such 
an option.  Therefore, calling osi_Alloc() to request more than a page of 
memory may end up recursing back into AFS to try to free unused inodes or 
dentries.

In this case, what happened was that osi_Alloc() is called within an 
AFS_GLOCK(); osi_Alloc() calls vmalloc() which tries to free dentry 
objects, which then calls back into the AFS module.  Unfortunately, 
AFS_GLOCK() is already held and we deadlock.



I solved this by:

-	limiting symlink text to PATH_MAX instead of PATH_MAX+1 (this is
 	what OpenAFS already did for 2.4+ kernels anyway via
 	afs_symlink_filler()).  This avoids calling vmalloc().

-	removing the AFS_GLOCK()/AFS_GUNLOCK() wrappers around
 	osi_Alloc().  I don't see any reason for this.



An updated version of the patch (against openafs-stable-1_4_x) is 
attached.  It should fix the symlink crashes on 2.6.10-2.6.12, as well as 
all RHEL4 kernels.  No code changes are made unless compiled on one of 
these buggy kernels.

You can also download this patch from:

 	http://www-personal.engin.umich.edu/~wingc/openafs/patches/openafs-stable-1_4_x-20070418-symlink26.patch


-Chris Wing
wingc@engin.umich.edu



diff -uNr openafs.orig/acinclude.m4 openafs/acinclude.m4
--- openafs.orig/acinclude.m4	2007-02-22 16:48:58.000000000 -0500
+++ openafs/acinclude.m4	2007-04-18 10:46:23.000000000 -0400
@@ -606,6 +606,7 @@
  	  	 LINUX_IOP_I_CREATE_TAKES_NAMEIDATA
  	  	 LINUX_IOP_I_LOOKUP_TAKES_NAMEIDATA
  	  	 LINUX_IOP_I_PERMISSION_TAKES_NAMEIDATA
+	  	 LINUX_IOP_I_PUT_LINK_TAKES_COOKIE
  	  	 LINUX_DOP_D_REVALIDATE_TAKES_NAMEIDATA
  	  	 LINUX_AOP_WRITEBACK_CONTROL
  		 LINUX_FS_STRUCT_FOP_HAS_FLOCK
@@ -848,6 +849,11 @@
  		 if test "x$ac_cv_linux_exports_tasklist_lock" = "xyes" ; then
  		  AC_DEFINE(EXPORTED_TASKLIST_LOCK, 1, [define if tasklist_lock exported])
  		 fi
+		 if test "x$ac_cv_linux_kernel_page_follow_link" = "xyes" -o "x$ac_cv_linux_func_i_put_link_takes_cookie" = "xyes"; then
+		  AC_DEFINE(USABLE_KERNEL_PAGE_SYMLINK_CACHE, 1, [define if your kernel has a usable symlink cache API])
+		 else
+		  AC_MSG_WARN([your kernel does not have a usable symlink cache API])
+		 fi
                  :
  		fi
  esac
diff -uNr openafs.orig/src/afs/LINUX/osi_vnodeops.c openafs/src/afs/LINUX/osi_vnodeops.c
--- openafs.orig/src/afs/LINUX/osi_vnodeops.c	2007-02-20 13:06:24.000000000 -0500
+++ openafs/src/afs/LINUX/osi_vnodeops.c	2007-04-18 10:46:23.000000000 -0400
@@ -1279,7 +1279,7 @@
  	return -code;
  }

-#if !defined(AFS_LINUX24_ENV)
+#if !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
  /* afs_linux_readlink
   * Fill target (which is in user space) with contents of symlink.
   */
@@ -1299,6 +1299,36 @@
  /* afs_linux_follow_link
   * a file system dependent link following routine.
   */
+#if defined(AFS_LINUX24_ENV)
+static int afs_linux_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+    int code;
+    char *name;
+
+    name = osi_Alloc(PATH_MAX);
+    if (!name) {
+	return -EIO;
+    }
+
+    AFS_GLOCK();
+    code = afs_linux_ireadlink(dentry->d_inode, name, PATH_MAX - 1, AFS_UIOSYS);
+    AFS_GUNLOCK();
+
+    if (code < 0) {
+	goto out;
+    }
+
+    name[code] = '\0';
+    code = vfs_follow_link(nd, name);
+
+out:
+    osi_Free(name, PATH_MAX);
+
+    return code;
+}
+
+#else /* !defined(AFS_LINUX24_ENV) */
+
  static struct dentry *
  afs_linux_follow_link(struct dentry *dp, struct dentry *basep,
  		      unsigned int follow)
@@ -1332,7 +1362,8 @@
      AFS_GUNLOCK();
      return res;
  }
-#endif
+#endif /* AFS_LINUX24_ENV */
+#endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */

  /* afs_linux_readpage
   * all reads come through here. A strategy-like read call.
@@ -1697,7 +1728,7 @@
  /* We really need a separate symlink set of ops, since do_follow_link()
   * determines if it _is_ a link by checking if the follow_link op is set.
   */
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
  static int
  afs_symlink_filler(struct file *file, struct page *page)
  {
@@ -1732,10 +1763,10 @@
  static struct address_space_operations afs_symlink_aops = {
    .readpage =	afs_symlink_filler
  };
-#endif
+#endif	/* USABLE_KERNEL_PAGE_SYMLINK_CACHE */

  static struct inode_operations afs_symlink_iops = {
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
    .readlink = 		page_readlink,
  #if defined(HAVE_KERNEL_PAGE_FOLLOW_LINK)
    .follow_link =	page_follow_link,
@@ -1743,13 +1774,17 @@
    .follow_link =	page_follow_link_light,
    .put_link =           page_put_link,
  #endif
-  .setattr =		afs_notify_change,
-#else
+#else /* !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE) */
    .readlink = 		afs_linux_readlink,
    .follow_link =	afs_linux_follow_link,
+#if !defined(AFS_LINUX24_ENV)
    .permission =		afs_linux_permission,
    .revalidate =		afs_linux_revalidate,
  #endif
+#endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
+#if defined(AFS_LINUX24_ENV)
+  .setattr =		afs_notify_change,
+#endif
  };

  void
@@ -1775,7 +1810,7 @@

      } else if (S_ISLNK(ip->i_mode)) {
  	ip->i_op = &afs_symlink_iops;
-#if defined(AFS_LINUX24_ENV)
+#if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
  	ip->i_data.a_ops = &afs_symlink_aops;
  	ip->i_mapping = &ip->i_data;
  #endif
diff -uNr openafs.orig/src/cf/linux-test4.m4 openafs/src/cf/linux-test4.m4
--- openafs.orig/src/cf/linux-test4.m4	2007-02-26 12:53:33.000000000 -0500
+++ openafs/src/cf/linux-test4.m4	2007-04-18 10:46:23.000000000 -0400
@@ -644,6 +644,22 @@
    AC_MSG_RESULT($ac_cv_linux_func_i_permission_takes_nameidata)])


+AC_DEFUN([LINUX_IOP_I_PUT_LINK_TAKES_COOKIE], [
+  AC_MSG_CHECKING([whether inode_operations.put_link takes an opaque cookie])
+  AC_CACHE_VAL([ac_cv_linux_func_i_put_link_takes_cookie], [
+    AC_TRY_KBUILD(
+[#include <linux/fs.h>
+#include <linux/namei.h>],
+[struct inode _inode;
+struct dentry _dentry;
+struct nameidata _nameidata;
+void *cookie;
+(void)_inode.i_op->put_link(&_dentry, &_nameidata, cookie);],
+      ac_cv_linux_func_i_put_link_takes_cookie=yes,
+      ac_cv_linux_func_i_put_link_takes_cookie=no)])
+  AC_MSG_RESULT($ac_cv_linux_func_i_put_link_takes_cookie)])
+
+
  AC_DEFUN([LINUX_DOP_D_REVALIDATE_TAKES_NAMEIDATA], [
    AC_MSG_CHECKING([whether dentry_operations.d_revalidate takes a nameidata])
    AC_CACHE_VAL([ac_cv_linux_func_d_revalidate_takes_nameidata], [