[OpenAFS-devel] Re: [OpenAFS] 'replication'

Hartmut Reuter reuter@rzg.mpg.de
Thu, 29 Aug 2002 13:03:33 +0200


Rubino Geiß wrote:
> Hi,
>=20
> this is exactly what we want. Doing something like "vos dump | vos
> restore" costs far too much time, I guess.  My concern is to get our
> systems productive again in case a major failure makes one of the main
> servers unusable. dump / restore is usable in case one volume gets
> accidentally killed / deleted, but not for restoring >> 100 GB and
> hundreds of volumes...
>=20
> Hartmut, can you publish the changes you made to MR-AFS so we can
> integrate them into OpenAFS? Or has someone already done this? I'm sure
> a lot of people would like to get this working.
>=20
>=20
> Bye, ruby
>=20

Ok, here are the changes, stripped of MR-AFS-specific code. I admit
that this was quick and dirty programming, but whoever wants to may
improve it.

In vos.c:

#########################################################################=
###########

static ConvertRO(as)
register struct cmd_syndesc *as;

{
     afs_int32 partition =3D -1;
     afs_int32 server, volid, code, i, same;
     struct nvldbentry entry, storeEntry;
     afs_int32 vcode;
     afs_int32 rwindex;
     afs_int32 rwserver =3D 0;
     afs_int32 rwpartition;
     afs_int32 roindex;
     afs_int32 roserver =3D 0;
     afs_int32 ropartition;
     int force =3D 0;
     struct rx_connection *aconn;
     char c, dc;

     server =3D GetServer(as->parms[0].items->data);
     if (!server) {
         fprintf(STDERR,"vos: host '%s' not found in host=20
table\n",as->parms[0].items->data );
         return ENOENT;
     }
     partition =3D volutil_GetPartitionID(as->parms[1].items->data);
     if (partition < 0) {
         fprintf(STDERR,"vos: could not interpret partition name=20
'%s'\n",as->parms[1].items->data );
         return ENOENT;
     }
     if (!IsPartValid(partition, server, &code)) {
         if(code) PrintError("",code);
         else fprintf(STDERR,"vos : partition %s does not exist on the=20
server\n",as->parms[1].items->data);
         return ENOENT;
     }
     volid =3D vsu_GetVolumeID(as->parms[2].items->data, cstruct, &code);=

     if (volid =3D=3D 0) {
         if (code) PrintError("", code);
         else fprintf(STDERR, "Unknown volume ID or name '%s'\n",=20
as->parms[0].items->data);
         return -1;
     }
     if (as->parms[3].items)
         force =3D 1;

     vcode =3D VLDB_GetEntryByID (volid, -1, &entry);
     if(vcode) {
         fprintf(STDERR,"Could not fetch the entry for volume %u from=20
VLDB\n",
                 volid);
         PrintError("convertROtoRW", code);
         return vcode;
     }

     /* use RO volid even if user specified RW or BK volid */

     if (volid !=3D entry.volumeId[ROVOL])
         volid =3D entry.volumeId[ROVOL];

     MapHostToNetwork(&entry);
     for (i=3D0; i< entry.nServers; i++) {
         if (entry.serverFlags[i] & ITSRWVOL) {
             rwindex =3D i;
             rwserver =3D entry.serverNumber[i];
             rwpartition =3D entry.serverPartition[i];
         }
         if (entry.serverFlags[i] & ITSROVOL) {
             same =3D VLDB_IsSameAddrs(server, entry.serverNumber[i], &co=
de);
             if (code) {
                 fprintf(STDERR, "Failed to get info about server's %d=20
address(es) from vlserver (err=3D%d); aborting call!\n",
                                 server, code);
                 return ENOENT;
             }
             if (same) {
                 roindex =3D i;
                 roserver =3D entry.serverNumber[i];
                 ropartition =3D entry.serverPartition[i];
                 break;
             }
         }
     }
     if (!roserver) {
         fprintf(STDERR, "Warning: RO volume didn't exist in vldb!\n");
     }
     if (ropartition !=3D partition) {
         fprintf(STDERR, "Warning: RO volume should be in partition %d=20
instead of %d (vldb)\n", ropartition, partition);
     }

     if (rwserver) {
         fprintf(STDERR,
                 "VLDB indicates that a RW volume exists already on %s=20
in partition %s.\n");
         if (!force) {
             fprintf(STDERR, "Overwrite this VLDB entry? [y|n] (n)\n");
             dc =3D c =3D getchar();
             while (!(dc=3D=3DEOF || dc=3D=3D'\n')) dc=3Dgetchar(); /* go=
to end of=20
line */
             if      ((c !=3D 'y') && (c !=3D 'Y')) {
                 fprintf(STDERR, "aborted.\n");
                 return -1;
             }
         }
     }

     vcode =3D ubik_Call(VL_SetLock, cstruct, 0, entry.volumeId[RWVOL], R=
WVOL,
                                                         VLOP_MOVE);
     aconn =3D UV_Bind(server, AFSCONF_VOLUMEPORT);
     code =3D AFSVolConvertROtoRWvolume(aconn, partition, volid);
     if (code) {
         fprintf(STDERR,"Converting RO volume %u to RW volume failed=20
with code %d\n", volid, code);
         PrintError("convertROtoRW", code);
         return -1;
     }

     entry.serverFlags[roindex] =3D ITSRWVOL;
     entry.flags |=3D RW_EXISTS;
     entry.flags &=3D ~BACK_EXISTS;
     if (rwserver) {
         (entry.nServers)--;
         if (rwindex !=3D entry.nServers) {
             entry.serverNumber[rwindex] =3D=20
entry.serverNumber[entry.nServers];
             entry.serverPartition[rwindex] =3D
=20
entry.serverPartition[entry.nServers];
             entry.serverFlags[rwindex] =3D entry.serverFlags[entry.nServ=
ers];
             entry.serverNumber[entry.nServers] =3D 0;
             entry.serverPartition[entry.nServers] =3D 0;
             entry.serverFlags[entry.nServers] =3D 0;
         }
     }
     entry.flags &=3D ~RO_EXISTS;
     for (i=3D0; i<entry.nServers; i++) {
         if (entry.serverFlags[i] & ITSROVOL) {
             if (!(entry.serverFlags[i] & (RO_DONTUSE | NEW_REPSITE)))
                 entry.flags |=3D RO_EXISTS;
         }
     }
     MapNetworkToHost(&entry, &storeEntry);
     code =3D VLDB_ReplaceEntry(entry.volumeId[RWVOL], RWVOL, &storeEntry=
,
                        (LOCKREL_OPCODE | LOCKREL_AFSID |=20
LOCKREL_TIMESTAMP));
     if (code)  {
         fprintf(STDERR, "Warning: volume converted, but vldb update=20
failed with
code %d!\n", code);
     }
     vcode =3D UV_LockRelease(entry.volumeId[RWVOL]);
     if (vcode) {
         PrintDiagnostics("unlock", vcode);
     }
     return code;
}

#########################################################################=
#########

and later:

#########################################################################=
#########
     ts =3D cmd_CreateSyntax("convertROtoRW", ConvertRO, 0, "convert a RO=
=20
volume into a RW volume (after loss of old RW volume)");
     cmd_AddParm(ts, "-server", CMD_SINGLE,CMD_OPTIONAL,  "machine name")=
;
     cmd_AddParm(ts, "-partition", CMD_SINGLE,CMD_OPTIONAL, "partition=20
name");
     cmd_AddParm(ts, "-id", CMD_SINGLE, 0, "volume name or ID");
     cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "don't ask");
     COMMONPARMS;
#########################################################################=
##########

in volprocs.c

/*
 * Volserver side of "vos convertROtoRW".
 *
 * Checks that the caller is a super user, looks for volume volumeId in
 * partition partId, reads its volume disk header, lets
 * namei_ConvertROtoRWvolume() rework the volume's special files, and
 * finally replaces the RO header file by a header for the RW volume
 * (id = parent id of the former RO volume).
 *
 * Only implemented for AFS_NAMEI_ENV; otherwise EINVAL is returned.
 *
 * Returns 0 on success, VOLSERBAD_ACCESS / VOLSERILLEGAL_PARTITION,
 * an errno value (ENOENT, EIO), or the code of
 * namei_ConvertROtoRWvolume().
 */
afs_int32 SAFSVolConvertROtoRWvolume(acid, partId, volumeId)
     struct rx_call *acid;
     afs_int32 partId;
     afs_int32 volumeId;
{
#ifdef AFS_NAMEI_ENV
     DIR *dirp;
     char pname[16];
     char volname[20];
     afs_int32 code;
     afs_int32 volid;
     int found = 0;
     char caller[MAXKTCNAMELEN];
     char headername[16];
     char tpath[256];
     struct VolumeDiskHeader h;
     int fd;
     IHandle_t *ih;
     Inode ino;

     if (!afsconf_SuperUser(tdir, acid, caller))
         return VOLSERBAD_ACCESS;       /* not a super user */
     if (GetPartName(partId, pname))
         return VOLSERILLEGAL_PARTITION;
     dirp = opendir(pname);
     if (dirp == NULL)
         return VOLSERILLEGAL_PARTITION;

     /*
      * Walk the partition until the volume is found or GetNextVol signals
      * the end with "EOD".  An empty volname means "not a volume file".
      * GetNextVol is called on every iteration; fetching only while
      * volname is empty loops forever on the first non-matching volume.
      */
     strcpy(volname, "");
     while (strcmp(volname, "EOD") && !found) {
         GetNextVol(dirp, pname, volname, &volid);
         if (strcmp(volname, "") && strcmp(volname, "EOD")
             && volid == volumeId)
             found = 1;
     }
     closedir(dirp);
     if (!found) {
         return ENOENT;
     }

     sprintf(headername, VFORMAT, volumeId);
     sprintf(tpath, "%s/%s", pname, headername);
     fd = open(tpath, O_RDONLY);
     if (fd < 0) {
         LogErrors(0,
                   "SAFS_VolConvertROtoRWvolume: Couldn't open header for RO-volume %lu.\n",
                   (unsigned long) volumeId);
         return ENOENT;
     }
     if (read(fd, &h, sizeof(h)) != sizeof(h)) {
         LogErrors(0,
                   "SAFS_VolConvertROtoRWvolume: Couldn't read header for RO-volume %lu.\n",
                   (unsigned long) volumeId);
         close(fd);
         return EIO;
     }
     close(fd);

     FSYNC_askfs(volumeId, partId, FSYNC_RESTOREVOLUME, 0);

     /* build the inode number of the link table of the parent (RW) volume */
     ino = NAMEI_INODESPECIAL;
     ino |= ((Inode) VI_LINKTABLE) << NAMEI_TAGSHIFT;
     ino |= ((Inode) h.parent) << NAMEI_UNIQSHIFT;
     /*
      * NOTE(review): the device argument below is left over from MR-AFS;
      * neither FSList nor the index "i" is declared in this function.  It
      * has to be replaced by the device of partition partId before this
      * compiles - "i" is intentionally NOT declared here so the problem
      * cannot be missed.
      */
     IH_INIT(ih,
             FSList.FileSystemsList_val[i].FileSystems_u.AfsInodeInterface.DeviceTagNumber,
             h.parent, ino);

     /* NOTE(review): ih is not released on any path - confirm whether an
      * IH_RELEASE-style call is required here. */
     code = namei_ConvertROtoRWvolume(ih, volumeId, &convertVolumeInfo);
     if (code)
         return code;
     if (unlink(tpath) < 0) {
         LogErrors(0,
                   "SAFS_VolConvertROtoRWvolume: Couldn't unlink RO header, error = %d\n",
                   errno);
     }

     /* the header now describes the RW volume */
     h.id = h.parent;
     h.volumeInfoFileTag2 = h.id;
     h.smallVnodeFileTag2 = h.id;
     h.largeVnodeFileTag2 = h.id;
     h.linkTableFileTag2 = h.id;
     sprintf(headername, VFORMAT, h.id);
     sprintf(tpath, "%s/%s", pname, headername);
     fd = open(tpath, O_CREAT | O_EXCL | O_RDWR, 0644);
     if (fd < 0) {
         LogErrors(0,
                   "SAFS_VolConvertROtoRWvolume: Couldn't create header for RW-volume %lu.\n",
                   (unsigned long) h.id);
         return EIO;
     }
     if (write(fd, &h, sizeof(h)) != sizeof(h)) {
         LogErrors(0,
                   "SAFS_VolConvertROtoRWvolume: Couldn't write header for RW-volume %lu.\n",
                   (unsigned long) h.id);
         close(fd);
         return EIO;
     }
     close(fd);
     FSYNC_askfs(volumeId, partId, FSYNC_DONE, 0);
     FSYNC_askfs(h.id, partId, FSYNC_ON, 0);
     return 0;
#else /* AFS_NAMEI_ENV */
     return EINVAL;
#endif /* AFS_NAMEI_ENV */
}

#########################################################################=
############

in volint.xg:

#########################################################################=
############

/* Opcode of the volserver RPC declared below. */
#define     VOLCONVERTRO        141

/*
 * Convert the RO volume volid, located in partition partid of the called
 * volserver, into a RW volume.  Served by SAFSVolConvertROtoRWvolume().
 */
proc ConvertROtoRWvolume(
   IN afs_int32 partid,
   IN afs_int32 volid
) = VOLCONVERTRO;

#########################################################################=
############

in namei_ops.c:

#########################################################################=
############

/*
 * Give one RO special file (small or large vnode file) its RW identity.
 *
 * roName is the file's current name inside dir_name; tag is the special
 * file type (2 = small vnodes, 3 = large vnodes).  The file is re-stamped
 * with SetOGM(), hard-linked under the name derived from the RW volume id
 * h->ih_vid, and only after the link succeeded is the old name removed -
 * a failed link must never cost us the only copy of the file.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
namei_MoveSpecialToRW(IHandle_t * h, char *dir_name, char *roName, int tag)
{
     namei_t n;
     IHandle_t t_ih;
     char oldpath[512];
     int fd;

     bzero(&t_ih, sizeof(t_ih));
     t_ih.ih_dev = h->ih_dev;
     t_ih.ih_vid = h->ih_vid;
     t_ih.ih_ino = namei_MakeSpecIno(h->ih_vid, tag);
     namei_HandleToName(&n, &t_ih);

     sprintf(oldpath, "%s/%s", dir_name, roName);
     fd = open(oldpath, O_RDWR, 0);
     if (fd < 0) {
         LogErrors(0, "namei_ConvertROtoRWvolume: could not open %s\n",
                   oldpath);
         return -1;
     }
     SetOGM(fd, h->ih_vid, tag);
     close(fd);
     if (link(oldpath, n.n_path) < 0) {
         LogErrors(0, "namei_ConvertROtoRWvolume: could not link %s to %s\n",
                   oldpath, n.n_path);
         return -1;
     }
     unlink(oldpath);
     return 0;
}

/*
 * Convert a RO-volume into a RW-volume
 *
 * This function allows very fast recovery from the loss of a partition,
 * provided RO-copies of all its volumes exist on another partition.
 * These RO-volumes can then be turned into the new RW-volumes.
 * Backup of RW-volumes then consists of "vos release".
 *
 * We must make sure that only the RO-volume exists in this partition,
 * which is typical for remote replicas.
 *
 * Then the linktable is already ok,
 *      the vnode files need to be renamed
 *      the volinfo file needs to be replaced by another one with
 *                      slightly different contents and new name.
 * The volume header file of the RO-volume in the /vicep<x> directory
 * is destroyed by this call. A new header file for the RW-volume must
 * be created after return from this routine.
 *
 * h              handle of the link table; h->ih_vid is used as the id
 *                of the RW volume to be created
 * vid            id of the RO volume to convert
 * convertVolInfo callback that reads the RO volume info file from fd and
 *                writes the RW version to fd2; non-zero return = failure
 *
 * Returns 0 on success, EIO if the volume directory cannot be opened and
 * -1 on any other failure.  A failure after the RW info file has been
 * created can leave the volume partially converted.
 */

int namei_ConvertROtoRWvolume(IHandle_t * h, afs_uint32 vid,
                    int (*convertVolInfo)(int fd, int fd2, int vid))
{
     namei_t n;
     char dir_name[512], oldpath[512];
     char smallName[64];
     char largeName[64];
     char infoName[64];
     IHandle_t t_ih;
     char infoSeen = 0;
     char smallSeen = 0;
     char largeSeen = 0;
     char linkSeen = 0;

     int code, fd, fd2;
     char *p;
     DIR *dirp;
     struct dirent *dp;

     /* the directory holding the special files is the dirname of the
      * link table's path */
     namei_HandleToName(&n, h);
     strcpy(dir_name, n.n_path);
     p = rindex(dir_name, '/');
     if (!p) {
         LogErrors(0, "namei_ConvertROtoRWvolume: no directory in path %s\n",
                   dir_name);
         return -1;
     }
     *p = 0;
     dirp = opendir(dir_name);
     if (!dirp) {
         LogErrors(0, "namei_ConvertROtoRWvolume: Could not opendir(%s)\n",
                   dir_name);
         return EIO;
     }

     /*
      * Pass 1: make sure the directory holds exactly the special files of
      * the RO volume plus the link table of the parent, and remember the
      * names of the RO files.
      */
     while ((dp = readdir(dirp)) != NULL) {
         struct ViceInodeInfo info;

         if (*dp->d_name == '.')
             continue;
         if (DecodeInode(dir_name, dp->d_name, &info, h->ih_vid) < 0) {
             LogErrors(0,
                       "namei_ConvertROtoRWvolume: DecodeInode failed for %s/%s\n",
                       dir_name, dp->d_name);
             closedir(dirp);
             return -1;
         }
         if (info.u.param[1] != -1) {
             LogErrors(0,
                       "namei_ConvertROtoRWvolume: found other than volume special file %s/%s\n",
                       dir_name, dp->d_name);
             closedir(dirp);
             return -1;
         }
         if (info.u.param[0] != vid) {
             if (info.u.param[0] == h->ih_vid) {
                 if (info.u.param[2] == 6) {    /* link table */
                     linkSeen = 1;
                     continue;
                 }
             }
             LogErrors(0,
                       "namei_ConvertROtoRWvolume: found special file %s/%s for volume %lu\n",
                       dir_name, dp->d_name,
                       (unsigned long) info.u.param[0]);
             closedir(dirp);
             return -1;
         }
         /* the three name buffers share one size; never overrun them */
         if (strlen(dp->d_name) >= sizeof(infoName)) {
             LogErrors(0,
                       "namei_ConvertROtoRWvolume: name too long: %s/%s\n",
                       dir_name, dp->d_name);
             closedir(dirp);
             return -1;
         }
         if (info.u.param[2] == 1) {            /* volume info file */
             strcpy(infoName, dp->d_name);
             infoSeen = 1;
         }
         else if (info.u.param[2] == 2) {       /* small vnodes file */
             strcpy(smallName, dp->d_name);
             smallSeen = 1;
         }
         else if (info.u.param[2] == 3) {       /* large vnodes file */
             strcpy(largeName, dp->d_name);
             largeSeen = 1;
         }
         else {
             /* log first: dp is no longer valid once dirp is closed */
             LogErrors(0,
                       "namei_ConvertROtoRWvolume: unknown type %d of special file found : %s/%s\n",
                       (int) info.u.param[2], dir_name, dp->d_name);
             closedir(dirp);
             return -1;
         }
     }
     closedir(dirp);

     if (!infoSeen || !smallSeen || !largeSeen || !linkSeen) {
         LogErrors(0,
                   "namei_ConvertROtoRWvolume: not all special files found in %s\n",
                   dir_name);
         return -1;
     }

     /*
      * If we come here then there was only a RO-volume and we can safely
      * proceed.
      */

     bzero(&t_ih, sizeof(t_ih));
     t_ih.ih_dev = h->ih_dev;
     t_ih.ih_vid = h->ih_vid;

     /* Pass 2a: write a fresh RW volume info file from the RO one */
     sprintf(oldpath, "%s/%s", dir_name, infoName);
     fd = open(oldpath, O_RDWR, 0);
     if (fd < 0) {
         LogErrors(0,
                   "namei_ConvertROtoRWvolume: could not open RO info file: %s\n",
                   oldpath);
         return -1;
     }
     t_ih.ih_ino = namei_MakeSpecIno(h->ih_vid, 1);
     namei_HandleToName(&n, &t_ih);
     fd2 = open(n.n_path, O_CREAT | O_EXCL | O_TRUNC | O_RDWR, 0);
     if (fd2 < 0) {
         LogErrors(0,
                   "namei_ConvertROtoRWvolume: could not create RW info file: %s\n",
                   n.n_path);
         close(fd);
         return -1;
     }
     code = (*convertVolInfo)(fd, fd2, h->ih_vid);
     close(fd);
     if (code) {
         close(fd2);
         unlink(n.n_path);
         return -1;
     }
     SetOGM(fd2, h->ih_vid, 1);
     close(fd2);

     /* Pass 2b: rename the vnode files to their RW names */
     if (namei_MoveSpecialToRW(h, dir_name, smallName, 2) < 0)
         return -1;
     if (namei_MoveSpecialToRW(h, dir_name, largeName, 3) < 0)
         return -1;

     /* Pass 2c: the RO info file is no longer needed */
     sprintf(oldpath, "%s/%s", dir_name, infoName);
     unlink(oldpath);
     return 0;
}

#########################################################################=
###########

I hope I didn't forget too much. If so, let me know.

Good luck!

Hartmut

-----------------------------------------------------------------
Hartmut Reuter                           e-mail reuter@rzg.mpg.de
					   phone +49-89-3299-1328
RZG (Rechenzentrum Garching)               fax   +49-89-3299-1301
Computing Center of the Max-Planck-Gesellschaft (MPG) and the
Institut fuer Plasmaphysik (IPP)
-----------------------------------------------------------------