[OpenAFS-devel] fileserver patch to optimize CopyOnWrite: copy and re-write only relevant data

Rainer Toebbicke rtb@pclella.cern.ch
Mon, 23 Feb 2009 16:30:28 +0100


--------------010703070204090502000606
Content-Type: text/plain; charset="ISO-8859-15"; format=flowed
Content-Transfer-Encoding: 7bit

Limit fileserver CopyOnWrite to copy only the relevant data.

When a file is modified that is shared between volume clones, CopyOnWrite 
creates a new copy of the file in the R/W volume prior to applying the 
modification. In the process, the complete file's data is read and re-written 
to disk, including the part that is guaranteed to be overwritten by the 
current RPC and including the part of the file that is beyond the new file size.

The attached patch optimizes this, copying only data outside the current rpc 
and within the new extent of the file.

This should relieve file servers a bit, in particular for applications which 
truncate or rewrite files from the beginning instead of deleting/re-creating 
them. The gain is most remarkable on small files which fit into one RPC and 
big files which are merely truncated. In those cases, the new CopyOnWrite does 
not need to copy anything.

As a bonus, some statistics about CopyOnWrite activities are kept and sent to 
the FileLog on shutdown or XCPU signal handling.

Context diff against 1.4.8. Bcc'ed to openafs-bugs.


-- 
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rainer Toebbicke
European Laboratory for Particle Physics(CERN) - Geneva, Switzerland
Phone: +41 22 767 8985       Fax: +41 22 767 7155

--------------010703070204090502000606
Content-Type: text/plain; name="p_copyOnWrite"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="p_copyOnWrite"

--- openafs/src/viced/viced.c.o2	2007-08-09 11:50:13.000000000 +0200
+++ openafs/src/viced/viced.c	2007-08-30 13:26:08.000000000 +0200
@@ -591,6 +591,8 @@
 
 }				/*ClearXStatValues */
 
+int CopyOnWrite_calls = 0, CopyOnWrite_off0 = 0, CopyOnWrite_size0 = 0;
+afs_fsize_t CopyOnWrite_maxsize = 0;
 
 static void
 PrintCounters()
@@ -629,6 +631,9 @@
     ViceLog(0,
 	    ("There are %d workstations, %d are active (req in < 15 mins), %d marked \"down\"\n",
 	     workstations, activeworkstations, delworkstations));
+    ViceLog(0, ("CopyOnWrite: calls %d off0 %d size0 %d maxsize 0x%llx\n",
+		CopyOnWrite_calls, CopyOnWrite_off0, CopyOnWrite_size0, CopyOnWrite_maxsize));
+
     Statistics = 0;
 
 }				/*PrintCounters */
--- openafs/src/viced/afsfileprocs.c.o2	2007-06-06 16:46:31.000000000 +0200
+++ openafs/src/viced/afsfileprocs.c   	2007-08-30 14:01:14.000000000 +0200
@@ -203,6 +203,8 @@
 afs_int32 PctSpare;
 extern afs_int32 implicitAdminRights;
 extern afs_int32 readonlyServer;
+extern int CopyOnWrite_calls, CopyOnWrite_off0, CopyOnWrite_size0;
+extern afs_fsize_t CopyOnWrite_maxsize;
 
 /*
  * Externals used by the xstat code.
@@ -1305,8 +1307,9 @@
  * disk.inodeNumber and cloned)
  */
 #define	COPYBUFFSIZE	8192
+#define MAXFSIZE (~(afs_fsize_t) 0)
 static int
-CopyOnWrite(Vnode * targetptr, Volume * volptr)
+CopyOnWrite(Vnode * targetptr, Volume * volptr, afs_fsize_t off, afs_fsize_t len)
 {
     Inode ino, nearInode;
     int rdlen;
@@ -1323,6 +1326,13 @@
 	DFlush();		/* just in case? */
 
     VN_GET_LEN(size, targetptr);
+    if (size > off) 
+	size -= off;
+    else 
+	size = 0;
+    if (size > len)
+	size = len;
+
     buff = (char *)malloc(COPYBUFFSIZE);
     if (buff == NULL) {
 	return EIO;
@@ -1360,6 +1370,8 @@
     newFdP = IH_OPEN(newH);
     assert(newFdP != NULL);
 
+    FDH_SEEK(targFdP, off, SEEK_SET);
+    FDH_SEEK(newFdP, off, SEEK_SET);
     while (size > 0) {
 	if (size > COPYBUFFSIZE) {	/* more than a buffer */
 	    length = COPYBUFFSIZE;
@@ -1437,6 +1449,41 @@
     return 0;			/* success */
 }				/*CopyOnWrite */
 
+/* Copy `size' bytes starting at offset `off' from targFdP to newFdP.
+ * Returns 0 when everything was copied, EIO if the copy buffer cannot be
+ * allocated, and 1 after a short read or write (the copy stops there). */
+static int
+CopyOnWrite2(FdHandle_t *targFdP, FdHandle_t *newFdP, afs_fsize_t off, afs_fsize_t size) {
+    char *buff = (char *)malloc(COPYBUFFSIZE);
+    register int length;
+    int rdlen, wrlen;
+    int rc = 0;			/* must start at 0: the success path never assigns it */
+
+    if (buff == NULL)		/* same convention as CopyOnWrite */
+	return EIO;
+    FDH_SEEK(targFdP, off, SEEK_SET);
+    FDH_SEEK(newFdP, off, SEEK_SET);
+
+    while (size > 0) {
+	if (size > COPYBUFFSIZE) {	/* more than a buffer */
+	    length = COPYBUFFSIZE;
+	    size -= COPYBUFFSIZE;
+	} else {
+	    length = (int)size;
+	    size = 0;
+	}
+	rdlen = FDH_READ(targFdP, buff, length);
+	wrlen = (rdlen == length) ? FDH_WRITE(newFdP, buff, length) : 0;
+	if ((rdlen != length) || (wrlen != length)) {
+	    /* no error recovery, at the worst we'll have a "hole" in the file */
+	    rc = 1;
+	    break;
+	}
+    }
+    free(buff);
+    return rc;
+}
+
 
 /*
  * Common code to handle with removing the Name (file when it's called from
@@ -1457,7 +1504,7 @@
 	return (EINVAL);
     if (parentptr->disk.cloned) {
 	ViceLog(25, ("DeleteTarget : CopyOnWrite called\n"));
-	if ((errorCode = CopyOnWrite(parentptr, volptr))) {
+	if ((errorCode = CopyOnWrite(parentptr, volptr, 0, MAXFSIZE))) {
 	    ViceLog(20,
 		    ("DeleteTarget %s: CopyOnWrite failed %d\n", Name,
 		     errorCode));
@@ -1897,7 +1944,7 @@
 
     if (parentptr->disk.cloned) {
 	ViceLog(25, ("Alloc_NewVnode : CopyOnWrite called\n"));
-	if ((errorCode = CopyOnWrite(parentptr, volptr))) {	/* disk full */
+	if ((errorCode = CopyOnWrite(parentptr, volptr, 0, MAXFSIZE))) {	/* disk full */
 	    ViceLog(25, ("Alloc_NewVnode : CopyOnWrite failed\n"));
 	    /* delete the vnode previously allocated */
 	    (*targetptr)->delete = 1;
@@ -4210,13 +4257,13 @@
      */
     if (oldvptr->disk.cloned) {
 	ViceLog(25, ("Rename : calling CopyOnWrite on  old dir\n"));
-	if ((errorCode = CopyOnWrite(oldvptr, volptr)))
+	if ((errorCode = CopyOnWrite(oldvptr, volptr, 0, MAXFSIZE)))
 	    goto Bad_Rename;
     }
     SetDirHandle(&olddir, oldvptr);
     if (newvptr->disk.cloned) {
 	ViceLog(25, ("Rename : calling CopyOnWrite on  new dir\n"));
-	if ((errorCode = CopyOnWrite(newvptr, volptr)))
+	if ((errorCode = CopyOnWrite(newvptr, volptr, 0, MAXFSIZE)))
 	    goto Bad_Rename;
     }
 
@@ -4363,7 +4410,7 @@
      */
     if ((fileptr->disk.type == vDirectory) && (fileptr->disk.cloned)) {
 	ViceLog(25, ("Rename : calling CopyOnWrite on  target dir\n"));
-	if ((errorCode = CopyOnWrite(fileptr, volptr)))
+	if ((errorCode = CopyOnWrite(fileptr, volptr, 0, MAXFSIZE)))
 	    goto Bad_Rename;
     }
 
@@ -4867,7 +4914,7 @@
     }
     if (parentptr->disk.cloned) {
 	ViceLog(25, ("Link : calling CopyOnWrite on  target dir\n"));
-	if ((errorCode = CopyOnWrite(parentptr, volptr)))
+	if ((errorCode = CopyOnWrite(parentptr, volptr, 0, MAXFSIZE)))
 	    goto Bad_Link;	/* disk full error */
     }
 
@@ -7359,7 +7406,8 @@
     afs_fsize_t NewLength;	/* size after this store completes */
     afs_sfsize_t adjustSize;	/* bytes to call VAdjust... with */
     int linkCount;		/* link count on inode */
-    FdHandle_t *fdP;
+    afs_fsize_t CoW_off = 0, CoW_len = 0;
+    FdHandle_t *fdP, *origfdP = NULL;
     struct in_addr logHostAddr;	/* host ip holder for inet_ntoa */
 
 #if FS_STATS_DETAILED
@@ -7416,20 +7464,32 @@
 	     * mechanisms (i.e. copy on write overhead.) Also the right size
 	     * of the disk will be recorded...
 	     */
-	    FDH_CLOSE(fdP);
+	    origfdP = fdP;
 	    VN_GET_LEN(size, targetptr);
 	    volptr->partition->flags &= ~PART_DONTUPDATE;
 	    VSetPartitionDiskUsage(volptr->partition);
 	    volptr->partition->flags |= PART_DONTUPDATE;
 	    if ((errorCode = VDiskUsage(volptr, nBlocks(size)))) {
 		volptr->partition->flags &= ~PART_DONTUPDATE;
+		FDH_CLOSE(origfdP);
 		return (errorCode);
 	    }
 
-	    ViceLog(25, ("StoreData : calling CopyOnWrite on  target dir\n"));
-	    if ((errorCode = CopyOnWrite(targetptr, volptr))) {
+	    if (Pos == 0) 
+		CoW_off = Length;	/* only copy remaining parts of file */
+	    if (Length <= FileLength)
+		CoW_len = FileLength - Length;
+	    CopyOnWrite_calls++;
+	    if (CoW_len == 0) CopyOnWrite_size0++;
+	    if (CoW_off == 0) CopyOnWrite_off0++;
+	    if (CoW_len > CopyOnWrite_maxsize) CopyOnWrite_maxsize = CoW_len;
+
+	    ViceLog(1, ("StoreData : calling CopyOnWrite on vnode %lu.%lu (%s) off 0x%llx size 0x%llx\n",
+			V_id(volptr), targetptr->vnodeNumber, V_name(volptr), CoW_off, CoW_len));
+	    if ((errorCode = CopyOnWrite(targetptr, volptr, CoW_off, CoW_len))) {
 		ViceLog(25, ("StoreData : CopyOnWrite failed\n"));
 		volptr->partition->flags &= ~PART_DONTUPDATE;
+		FDH_CLOSE(origfdP);
 		return (errorCode);
 	    }
 	    volptr->partition->flags &= ~PART_DONTUPDATE;
@@ -7438,6 +7498,7 @@
 	    if (fdP == NULL) {
 		ViceLog(25,
 			("StoreData : Reopen after CopyOnWrite failed\n"));
+		FDH_CLOSE(origfdP);
 		return ENOENT;
 	    }
 	}
@@ -7464,6 +7525,7 @@
 	 AdjustDiskUsage(volptr, adjustSize,
 			 adjustSize - SpareComp(volptr)))) {
 	FDH_CLOSE(fdP);
+	if (origfdP) FDH_CLOSE(origfdP);
 	return (errorCode);
     }
 
@@ -7559,6 +7621,9 @@
 	 * need to update the target vnode.
 	 */
 	targetptr->changed_newTime = 1;
+	if (origfdP && (bytesTransfered < Length))	/* Need to "finish" CopyOnWrite still */
+	    CopyOnWrite2(origfdP, fdP, bytesTransfered, Length - bytesTransfered);
+	if (origfdP) FDH_CLOSE(origfdP);
 	FDH_CLOSE(fdP);
 	/* set disk usage to be correct */
 	VAdjustDiskUsage(&errorCode, volptr,
@@ -7566,6 +7631,7 @@
 					 nBlocks(NewLength)), 0);
 	return errorCode;
     }
+    if (origfdP) FDH_CLOSE(origfdP);
     FDH_CLOSE(fdP);
 
     TM_GetTimeOfDay(&StopTime, 0);

--------------010703070204090502000606--