/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)24 1.157.1.8 src/avs/fs/mmfs/ts/kernext/gpl-linux/super.c, mmfs, avs_rgpfs24, rgpfs24s011a 4/5/07 11:15:55 */ /* * Superblock operations * * Contents: * TraceBKL * gpfs_s_read_inode2 * gpfs_s_read_inode * gpfs_s_delete_inode * gpfs_s_notify_change * gpfs_s_put_super * gpfs_s_statfs * gpfs_s_umount_begin * gpfs_s_remount * gpfs_s_write_inode * gpfs_s_clear_inode * gpfs_s_write_super * gpfs_s_fs_locations * gpfs_fill_super * gpfs_reg_fs * gpfs_unreg_fs * kill_mmfsd * get_myinode * exec_mmfs * fork_mount_helper * vfsUserCleanup * cxiSetMountInfo * cxiUnmount * cxiReactivateOSNode * cxiNewOSNode * cxiFreeOSNode * cxiDeleteMmap * cxiReinitOSNode * cxiFindOSNode * cxiDumpOSNode * cxiRefOSNode * cxiInactiveOSNode * cxiPutOSNode * cxiDestroyOSNode * cxiSetOSNodeType * cxiUpdateInode * cxiCanUncacheOSNode * cxiAddOSNode * */ #include #include #include #include #include #include #ifndef GPFS_ARCH_X86_64 #define __KERNEL_SYSCALLS__ #endif #include #include /* KERNEL_DS */ #define FOOBAR #error Do not do this /* GPFS headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_KERNEL_VERSION > 2060000 #include #endif /* forward declaration */ int vfsUserCleanup(struct super_block *sbP, struct gpfsVfsData_t *privVfsP, Boolean force); extern struct file_system_type gpfs_fs_type; static DECLARE_WAIT_QUEUE_HEAD(pwq); int mmfsd_module_active = 0; static int mmfsd_id = -1; static int mount_id = -1; char mountCmd[CXI_MAXPATHLEN+1] = "M "; char mmfs_path[CXI_MAXPATHLEN+1] = ""; char bin_path[CXI_MAXPATHLEN+1]; static char mount_opt[CXI_MAXPATHLEN+1]; static unsigned int unusedInodeNum = 1; static struct inode *unusedInodeP = NULL; static struct super_block *unusedSuperP = NULL; struct super_block *shutdownSuperP = NULL; static spinlock_t inode_lock; /* Routine to trace whether kernel lock is held */ #ifdef VERBOSETRACE void TraceBKL() { TRACE2(TRACE_VNODE, 10, TRCID_VNODE_BKL, "BKL %d lock_depth %d\n", kernel_locked(), current->lock_depth); } #endif #include #if HAS_SOP_ALLOC_INODE static struct kmem_cache * gpfsInodeCacheP; struct gpfs_bloated_inode { struct inode inode; char cxiNode[CXINODE_SIZE]; }; static void gpfs_init_once(void * iP, struct kmem_cache * cacheP, unsigned long flags) { if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) inode_init_once((struct inode *)iP); } int gpfs_init_inodecache(void) { gpfsInodeCacheP = kmem_cache_create("gpfsInodeCache", sizeof(struct gpfs_bloated_inode), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, gpfs_init_once, NULL); if (gpfsInodeCacheP == NULL) return -ENOMEM; return 0; } struct inode * gpfs_alloc_inode(struct super_block *sbP) { struct inode * iP; iP = (struct inode *)kmem_cache_alloc(gpfsInodeCacheP, GFP_KERNEL); TRACE1N(TRACE_VNODE, 1, TRCID_LINUXOPS_GPFS_ALLOC_INODE_EXIT, "gpfs_alloc_inode: inode 0x%lX\n", iP); return iP; } void gpfs_destroy_inode(struct inode *iP) { TRACE1N(TRACE_VNODE, 1, TRCID_LINUXOPS_GPFS_DESTROY_INODE, "gpfs_destroy_inode: inode 0x%lX\n", (void *)iP); kmem_cache_free(gpfsInodeCacheP, (void *)iP); } void gpfs_destroy_inodecache(void) { while (kmem_cache_shrink(gpfsInodeCacheP) != 0) cxiSleep(40); kmem_cache_destroy(gpfsInodeCacheP); } #endif /* HAS_SOP_ALLOC_INODE */ /* This routine is called from iget() just after allocating a new inode. This is a variant of the normal read_inode operation that allows passing an opaque parameter through iget4 into read_inode2. We need the parameter to know whether read_inode2 is being called from a normal lookup opration, where we are already holding a distributed lock on the file, or from nfs calling iget, where we need to get the lock inside of read_inode2. Note: In the Linux source the call to read_inode2 is labelled a "reiserfs specific hack" with the additional warning "We don't want this to last, and are looking for VFS changes that will allow us to get rid of it." If and when such a change is made, we will hopefully be able to adapt our code accordingly. Otherwise, if read_inode2 goes away without a suitable replacement, we will have to use a more expensive approach, e.g., a global table where lookup would leave some state before calling iget. */ void gpfs_s_read_inode2(struct inode *iP, void *opaque) { struct gpfsVfsData_t *privVfsP; ino_t inum = iP->i_ino; cxiNode_t *cnP; int rc; ENTER(0); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_ENTER, "gpfs_s_read_inode2 enter: inode 0x%lX inode %d\n", iP, inum); /* BKL is sometimes held at entry */ #if HAS_SOP_ALLOC_INODE cnP = (cxiNode_t *)&((struct gpfs_bloated_inode *)iP)->cxiNode; #else /* allocate cxiNode_t */ if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE)) { /* need to allocate separate storage for the cxiNode_t */ cnP = (cxiNode_t *)cxiMallocUnpinned(CXINODE_SIZE); if (cnP == NULL) goto exit_bad; } else { /* we can store the cxiNode_t in the part of the iP->u * union after the PRVINODE field */ cnP = (cxiNode_t *)(&iP->PRVINODE + 1); } #endif memset(cnP, 0, CXINODE_SIZE); /*TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_1, "gpfs_s_read_inode2: iP 0x%lX cnP 0x%lX uSize-void* %d nodeSize %d", iP, cnP, sizeof(iP->PRVINODE) - sizeof(void *), CXINODE_SIZE); */ /* connect cxiNode_t to struct inode */ cnP->osNodeP = iP; iP->PRVINODE = cnP; /* get inode attributes */ privVfsP = VP_TO_PVP(iP); rc = gpfs_ops.gpfsInodeRead(privVfsP, cnP, inum, opaque); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_EXIT, "gpfs_s_read_inode2 exit: inode 0x%lX rc %d", iP, rc); if (rc == 0) { EXIT(0); return; // success! } /* undo cxiNode_t allocation */ cnP->osNodeP = NULL; iP->PRVINODE = NULL; #if !HAS_SOP_ALLOC_INODE if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE)) cxiFreeUnpinned(cnP); #endif exit_bad: /* make_bad_inode will initialize iP so that all operations return EIO; also set i_nlink to zero so that the bad inode will be thrown out of the cache at the next opportunity */ make_bad_inode(iP); iP->i_nlink = 0; TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_EXIT_BAD, "gpfs_s_read_inode2 exit: inode 0x%lX rc BADINODE", iP); if (rc) cxiErrorNFS(rc); EXIT(0); } /* The following routine should never be called, since we have a read_inode2 operation. However, knfsd checks the operation table and refuses to export a file system if its read_inode operation ptr is NULL. Hence, we need to have one, even if it never gets called. */ void gpfs_s_read_inode(struct inode *iP) { /* only iget will use read_inode; this shouldn't happen as long as gpfs_nfsd_iget is being invoked via fh_to_dentry/gpfs_fh_to_dentry */ ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_HUH, "gpfs_s_read_inode: ? calling make_bad_inode"); make_bad_inode(iP); EXIT(0); } /* The following routine is called from iput when the i_count goes to zero and the link count in the inode is zero, which presumably means that the file was deleted. If so, we should free the disk space occupied by the file. */ void gpfs_s_delete_inode(struct inode *iP) { cxiNode_t *cnP; ext_cred_t eCred; Boolean isGPFS = cxiIsGPFSThread(); struct gpfsVfsData_t *privVfsP; ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_DELETE_INODE, "gpfs_s_delete_inode enter: inode 0x%lX inode %d gpfsThread %d\n", iP, iP->i_ino, isGPFS); TraceBKL(); #if MUST_TRUNCATE_INODE_PAGES truncate_inode_pages(&iP->i_data, 0); #endif cnP = VP_TO_CNP(iP); if (!cnP) { /* The cxiNode_t is allocated in gpfs_s_read_inode2, so if cnP is NULL, this means gpfs_s_read_inode2 failed and has marked this as a bad inode. No further actions necessary in this case. */ goto xerror; } if (TestCtFlag(cnP, destroyIfDelInode)) { privVfsP = VP_TO_PVP(iP); DBGASSERT(privVfsP != NULL); /* ?? "eCred is passed all the way to the daemon, and then is ignored there," FBS 5/24/01 */ setCred(&eCred); gpfs_ops.gpfsInodeDelete(privVfsP, cnP, isGPFS, &eCred); iP->PRVINODE = NULL; cnP->osNodeP = NULL; #if !HAS_SOP_ALLOC_INODE /* If necessary, free the cxiNode_t structure which was allocated * in gpfs_s_read_inode2. */ if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE)) cxiFreeUnpinned(cnP); #endif } xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_DELETE_INODE_1, "gpfs_s_delete_inode exit: inode 0x%lX cnP 0x%lX\n", iP, cnP); clear_inode(iP); EXIT(0); } int gpfs_s_notify_change(struct dentry *dentryP, struct iattr *attrP) { int rc; ENTER(0); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_NOTIFY_ENTER, "gpfs_s_notify_change enter: inode 0x%lX attr 0x%lX\n", dentryP->d_inode, attrP); TraceBKL(); rc = gpfs_i_setattr_internal(dentryP->d_inode, attrP); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_NOTIFY_EXIT, "gpfs_s_notify_change exit: inode 0x%lX rc %d\n", dentryP->d_inode, rc); EXIT(0); if (rc) return (-rc); return rc; } /* put_super is called just before the super_block is freed in do_unmount */ void gpfs_s_put_super(struct super_block *sbP) { int rc = 0; struct gpfsVfsData_t *privVfsP; ENTER(0); LOGASSERT(sbP != NULL); LOGASSERT(sbP->s_magic == GPFS_SUPER_MAGIC); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_PUTSUPER_ENTER, "gpfs_s_put_super enter: sbP 0x%lX sbP->s_dev 0x%X\n", sbP, sbP->s_dev); TraceBKL(); rc = cxiUnmount(sbP, false, true); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_PUTSUPER_EXIT, "gpfs_s_put_super exit: rc %d\n", rc); EXIT(0); } int gpfs_s_statfs(struct dentry *den, struct KSTATFS *bufP) { struct super_block *sbP = den->d_sb; int rc; int code = 0; int len = sizeof(struct KSTATFS); struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); cxiStatfs_t statfs; VFS_STAT_START(statfsCall); ENTER(0); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_STATFS_ENTER, "gpfs_s_statfs enter: sbP 0x%lX len %d\n", sbP, len); memset(bufP, 0, len); /* BKL is held at entry */ LOGASSERT(sbP->s_magic == GPFS_SUPER_MAGIC); LOGASSERT(privVfsP != NULL); rc = gpfs_ops.gpfsStatfs(privVfsP, &statfs); if (rc) { rc = -rc; code = 1; goto xerror; } bufP->f_type = GPFS_SUPER_MAGIC; bufP->f_bsize = statfs.f_bsize; bufP->f_blocks = statfs.f_blocks; bufP->f_bfree = statfs.f_bfree; bufP->f_bavail = statfs.f_bavail; bufP->f_files = statfs.f_files; bufP->f_ffree = statfs.f_ffree; bufP->f_namelen = statfs.f_name_max; bufP->f_fsid.val[0] = statfs.f_fsid.val[0]; bufP->f_fsid.val[1] = statfs.f_fsid.val[1]; /* If filesystem size cannot be represented by the OS statfs structure, increase the "block size" and reduce the numbers */ if (sizeof(bufP->f_blocks) < sizeof(statfs.f_blocks)) { while (bufP->f_blocks != statfs.f_blocks) { statfs.f_bsize <<= 1; // double f_bsize statfs.f_blocks >>= 1; // halve the rest statfs.f_bfree >>= 1; statfs.f_bavail >>= 1; bufP->f_bsize = statfs.f_bsize; bufP->f_blocks = statfs.f_blocks; bufP->f_bfree = statfs.f_bfree; bufP->f_bavail = statfs.f_bavail; } } xerror: TRACE7(TRACE_VNODE, 1, TRCID_LINUXOPS_STATFS_EXIT, "gpfs_s_statfs exit: f_blocks %lld f_bfree %lld f_files %d f_free %d " "f_bsize %d code %d rc %d\n", statfs.f_blocks, statfs.f_bfree, bufP->f_files, bufP->f_ffree, bufP->f_bsize, code, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return rc; } /* umount_begin is called only when the force option is used */ void #if LINUX_KERNEL_VERSION >= 2061700 gpfs_s_umount_begin(struct vfsmount *vfs, int flags) #else gpfs_s_umount_begin(struct super_block * sbP) #endif { int dmrc = 0; struct gpfsVfsData_t *privVfsP; #if LINUX_KERNEL_VERSION >= 2061700 struct super_block * sbP; LOGASSERT(vfs != NULL); LOGASSERT(vfs->mnt_sb != NULL); sbP = vfs->mnt_sb; #endif ENTER(0); LOGASSERT(sbP != NULL); LOGASSERT(sbP->s_magic == GPFS_SUPER_MAGIC); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_UMOUNT_ENTER, "gpfs_s_umount_begin enter: sbP 0x%lX sbP->s_dev 0x%X " "root vfsmount 0x%X pwd vfsmount 0x%X\n", sbP, sbP->s_dev, current->fs ? current->fs->rootmnt : NULL, current->fs ? current->fs->pwdmnt : NULL); TraceBKL(); privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); /* We may need to generate a preunmount DMAPI event, since this * is a user initiated force unmount and we need to inform any * DM application before we start flushing out VFS users. */ if (privVfsP) { #ifdef DMAPI Boolean doDMEvents = false; struct dentry *dP = NULL; struct inode *iP = NULL; cxiNode_t *cnP = NULL; dP = sbP->s_root; if (dP != NULL) iP = dP->d_inode; if (iP != NULL) cnP = VP_TO_CNP(iP); /* Generate preunmount event. We have to present this because * vfsUserCleanup() may potentially kill processes on forced unmount. * Since the DM application may have an open file in this file system * we have to warn him. The DM application may not however receive * the final unmount event if we can't get everything released. If * VFS users still exist after this, then no mntput() and subsequent * gpfs_s_put_super() will occur. */ dmrc = gpfs_ops.gpfsDmUnmountEvent(true, true, privVfsP, cnP, &doDMEvents, NULL, NULL, NULL, 0); #endif /* Force unmount */ vfsUserCleanup(sbP, privVfsP, true); if (sbP->s_root) printDentryTree(sbP->s_root, 10); } exit: TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_UMOUNT_EXIT, "gpfs_s_umount_begin exit: sbP 0x%lX privVfsP 0x%lX dmrc %d " "s_active %d s_count 0x%X active files %d\n", sbP, privVfsP, dmrc, atomic_read(&sbP->s_active), sbP->s_count, !list_empty(&sbP->s_files)); /* Module count is decremented later on in do_unmount via gpfs_s_put_super */ EXIT(0); } int gpfs_s_remount(struct super_block *sbP, int *flags, char *data) { ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOUNT, "gpfs_s_remount: called\n"); TraceBKL(); EXIT(0); return 0; } void gpfs_s_write_inode(struct inode *inode) { ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITEINODE, "gpfs_s_write_inode: called\n"); TraceBKL(); EXIT(0); } /* This routine is called from iput() just before the storage of the Linux inode is freed */ void gpfs_s_clear_inode(struct inode *iP) { int code = 0; struct gpfsVfsData_t *privVfsP; cxiNode_t *cnP; ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_CLEARINODE, "gpfs_s_clear_inode enter: inode 0x%lX inode %d generic_ip 0x%lX\n", iP, iP->i_ino, iP->PRVINODE); TRACE3(TRACE_VNODE, 5, TRCID_LINUXOPS_CLEARINODE_DETAILS, "gpfs_s_clear_inode: cnP 0x%lX privVfsP 0x%lX tooBig %d\n", VP_TO_CNP(iP), VP_TO_PVP(iP), NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE)); DBGASSERT(atomic_read((atomic_t *)&iP->i_count) == 0); cnP = VP_TO_CNP(iP); privVfsP = VP_TO_PVP(iP); if (cnP) { if (privVfsP) gpfs_ops.gpfsRele(privVfsP, cnP, (void *)iP, vnOp); /* if necessary, free the cxiNode_t storage that we allocated in gpfs_s_read_inode2 */ if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE)) { iP->PRVINODE = NULL; cxiFreeUnpinned(cnP); } } xerror: TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_CLEARINODE_EXIT, "gpfs_s_clear_inode exit: inode 0x%lX generic_ip 0x%lX code %d\n", iP, iP->PRVINODE, code); EXIT(0); } void gpfs_s_write_super(struct super_block * sbP) { int rc = 0; struct gpfsVfsData_t *privVfsP; ENTER(0); LOGASSERT(sbP != NULL); LOGASSERT(sbP->s_magic == GPFS_SUPER_MAGIC); privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); LOGASSERT(privVfsP != NULL); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITESUPER, "gpfs_s_write_super enter: sbP 0x%lX\n", sbP); /* We have to either adhere to the s_dirt semantics or * ignore all syncs. Once a file systems write_super gets * called, sync_supers() restarts the super block scan. If * we don't turn off s_dirt then sync_supers() will be caught * in a loop. Alternatively if we only ignored kupdated then * * 1) a person could write to a file (which turns on s_dirt) * 2) kupdated could run (and be ignored) but the s_dirt is turned off * 3) the user attempts a sync from the command line sync, but that * does nothing since s_dirt was off * 4) the user expected the sync to have done something before he * halts the machine. */ sbP->s_dirt = 0; /* * jcw: Another way to handle this would be never turn on the s_dirt flag, * and not to even have a write_super callback. Then neither kupdated nor * sync would do anything. The sync watchdog in the GPFS daemon would * substitute for kupdated. To regain the semantics of sync, we would * create dummy inodes that would have I_DIRTY set, and link one such inode * onto each GPFS superblock. Then sync would notice the dirty inodes * and call back through their write_inode callbacks. This would be * the only use of I_DIRTY by GPFS, so it could be reinterpreted to mean * "sync this file system". For now, s_dirt is still set and reset, but * s_dirt gets reset for all file systems before they have all been synced, * so the race described above can occur. The permanently-dirty inode * needs to be implemented to fix this. */ /* goto xerror; */ /* BKL is held at entry */ TRACE0(TRACE_VNODE, 3, TRCID_LINUXOPS_WRITESUPER_3, "gpfs_s_write_super: performing sync"); rc = gpfs_ops.gpfsSyncfs(privVfsP); if (rc) { cxiErrorNFS(rc); rc = -rc; } xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITESUPER_5, "gpfs_s_write_super exit: sbP 0x%lX rc %d\n", sbP, rc); EXIT(0); } #if LINUX_KERNEL_VERSION >= 2060000 int gpfs_get_sb(struct file_system_type *fsTypeP, int flags, const char *devNameP, void *dataP, struct vfsmount *mnt) { struct super_block *sbP; int sb_ret = 0; ENTER(0); sb_ret = get_sb_nodev(fsTypeP, flags, dataP, gpfs_fill_super, mnt); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_GET_SB, "gpfs_get_sb: flags 0x%X dataP 0x%X sbP %d\n", flags, dataP, sb_ret); EXIT(0); return sb_ret; } int gpfs_fill_super(struct super_block *sbP, void *dataP, int silent) #else struct super_block * gpfs_fill_super(struct super_block *sbP, void *dataP, int silent) #endif { int kernel_unlock = 0; struct inode *rootIP = NULL; struct dentry *rootDP = NULL; char *myBufP = NULL; char *sgNameP; char *strP; char *mountpointP; char *optionsP; int rc = 0; int mountHelperID = -1; int code = 0; int namelen; struct gpfsVfsData_t *privVfsP; cxiNode_t *cnRootP; cxiIno_t rootINum; char bname[BDEVNAME_SIZE]; Boolean restricted = false; ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_1, "gpfs_fill_super enter: sbP 0x%lX dev 0x%X silent %d data '%s'\n", sbP, sbP->s_dev, silent, ((char *)dataP == NULL) ? "" : dataP); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_2, "gpfs_fill_super: dev name '%s'\n", (sbP->s_bdev == NULL) ? "" : SBLOCK_BDEVNAME(sbP,bname)); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_3, "gpfs_fill_super: s_flags 0x%x\n", sbP->s_flags); /* A mount increases reference count on module */ #if LINUX_KERNEL_VERSION < 2060000 MY_MODULE_INCREMENT(); #endif if (dataP == NULL || *(char *)dataP == '\0') { rc = EINVAL; code = 1; goto xerror; } if (strlen((char *)dataP) > CXI_MAXPATHLEN) { rc = ENAMETOOLONG; code = 2; goto xerror; } sbP->s_magic = GPFS_SUPER_MAGIC; sbP->s_op = &gpfs_sops; #if LINUX_KERNEL_VERSION > 2060000 sbP->s_export_op = &gpfs_export_ops; #endif SBLOCK_PRIVATE(sbP) = NULL; sbP->s_root = NULL; sbP->s_blocksize = 0; sbP->s_blocksize_bits = 0; /* maximum filesize (avoid sign bit due to use with loff_t) */ sbP->s_maxbytes = 0x7FFFFFFFFFFFFFFFULL; myBufP = (char *)cxiMallocPinned(strlen((char *)dataP) + 1); if (myBufP == NULL) { code = 3; rc = ENOMEM; goto xerror; } strcpy(myBufP, (char *)dataP); optionsP = myBufP; /* This is the syntax parser for the options field. At * least one option must be "dev=". */ sgNameP = NULL; strP = myBufP; while(strP) { if (!strncmp(strP, "dev=", 4)) { sgNameP = (char *)strchr(strP, '=') + 1; strP = (char *)strchr(strP, ','); /* more options */ if (strP) namelen = strP - sgNameP; else namelen = strlen(sgNameP); /* Copy the sgName into the first part of the * buffer, null terminate it, then append the * full option list. */ strncpy(myBufP, sgNameP, namelen); sgNameP = myBufP; sgNameP[namelen] = '\0'; optionsP = myBufP + namelen + 1; /* Move the options next (if there are any) */ strcpy(optionsP, strP?(char *)strP:""); break; } else { strP = (char *)strchr(strP, ','); if (strP) strP++; } } TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_OPTIONS, "gpfs_fill_super: optionsP \"%s\"\n", strP ? (char *) strP:""); while (strP) { /* look for rs option */ strP = (char *)strchr(strP, ','); if (strP) strP++; if (strP) { if (!strncmp(strP, "rs", 2)) { restricted = true; break; } } } if (sgNameP == NULL || *sgNameP == '\0') { code = 4; rc = EINVAL; goto xerror; } mountpointP = sgNameP; /* ??? */ if (restricted) { /* restricted mount - make it readonly */ sbP->s_flags |= MS_RDONLY; } strcpy(mmfs_path, bin_path); strcat(mmfs_path, "/mmfsmount"); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_STARTHELPER, "gpfs_fill_super: start mount helper '%s'\n", mmfs_path); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_STARTHELPER1, "gpfs_fill_super: s_flags 0x%x (rs %d), mountpointP %s\n", sbP->s_flags, restricted, mountpointP); if (strlen(sgNameP) > CXI_MAXPATHLEN) { rc = ENAMETOOLONG; code = 5; goto xerror; } rc = gpfs_ops.gpfsReady(); if (rc != 0) { rc = EAGAIN; code = 6; goto xerror; } /* Start a new process that will receive and forward all messages during the * mount process to the mount invoker. The current process will wait for * this new process (in HandleMBUnmount()) and the daemon to be connected with * a socket and only than call SFSMountFS() that does the real mount work. */ strcpy(&mountCmd[2], sgNameP); // "M /dev/gpfs1" if (cxiHasMountHelper()) mountHelperID = fork_mount_helper(mountCmd); else { /* Use special pid (-1) when not using mount helper */ mountHelperID = -1; } TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_HELPERRC, "gpfs_fill_super: mount helper mountHelperID %d\n", mountHelperID); #if LINUX_KERNEL_VERSION < 2060000 /* BKL is not held during get_sb in 2.6 */ if (kernel_locked()) { unlock_kernel(); kernel_unlock = 1; } #else /* * In 2.5, a bunch of calls originating from sys_sync will try to down * s_umount and block, because it's already downed in get_sb_nodev, and won't * be upped until get_sb returns (in do_kern_mount). During gpfsMount, we'll * call mmcommon getEFOption, and that will at some point try to do a sync * (e.g. in gpfsClusterInit, two times), and mount will deadlock. One way * to fix this is to take out relevant sync's in the shell scripts, but this * is dodgy because we might end up pulling a new sdr from another node, and * that's a long and compelex path, I don't think one can guarantee there * won't be any syscalls that desire s_umount along the way. Need to think * how to fix this right. For now, up the semaphore for the duration of * the gpfsMount (possibly opening up a window for other races e.g. with * unmount). */ up_write(&sbP->s_umount); #endif rc = gpfs_ops.gpfsMount((void *)sbP, PAGE_SIZE, sgNameP, mountpointP, optionsP, (struct gpfsVfsData_t **)&(SBLOCK_PRIVATE(sbP)), &cnRootP, /* returned root cxiNode_t */ &rootINum, /* returned root inode number */ NULL, /* not a soft mount */ mountHelperID /* mount helper id */, -1U, /* no unique mount ID specified */ (sbP->s_flags & MS_RDONLY), /* is it readonly */ true); /* allocate pinned memory */ #if LINUX_KERNEL_VERSION < 2060000 /* BKL is not held during get_sb in 2.5 */ if (kernel_unlock) lock_kernel(); #else down_write(&sbP->s_umount); #endif if (rc) { code = 7; goto xerror; } privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); DBGASSERT(cnRootP != NULL); rootIP = (struct inode *)cnRootP->osNodeP; DBGASSERT(rootIP != NULL); DBGASSERT(rootIP->PRVINODE == cnRootP); DBGASSERT(cnRootP->osNodeP == rootIP); /* Successful mount in daemon. Allocate root directory cache entry */ rootDP = d_alloc_root(rootIP); if (!rootDP) { rc = gpfs_ops.gpfsUnmount(privVfsP, true); if (rc == 0 || rc == ENOSYS) gpfs_ops.gpfsFinishUnmount(privVfsP); code = 8; goto xerror; } rootDP->d_op = &gpfs_dops_valid; sbP->s_root = rootDP; sbP->s_dirt = 1; /* keep it on for sync to work */ if (myBufP != NULL) cxiFreePinned(myBufP); #if (LINUX_KERNEL_VERSION < 2060000) unlock_super(sbP); #endif TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_SUCCEED, "gpfs_fill_super exit: success sbP 0x%lX\n", sbP); EXIT(0); #if LINUX_KERNEL_VERSION >= 2060000 return 0; #else return sbP; #endif xerror: if (rootDP) dput(rootDP); if (rootIP) iput(rootIP); if (myBufP != NULL) cxiFreePinned(myBufP); #if LINUX_KERNEL_VERSION < 2060000 unlock_super(sbP); sbP->s_dev = 0; #endif /* An unmount decrements module use count */ #if LINUX_KERNEL_VERSION < 2060000 MY_MODULE_DECREMENT(); #endif TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_FAILED, "gpfs_fill_super: failed code %d rc %d\n", code, rc); EXIT(0); #if LINUX_KERNEL_VERSION >= 2060000 return -rc; #else return NULL; #endif } int gpfs_reg_fs() { int rc; ENTER(0); spin_lock_init(&inode_lock); rc = register_filesystem(&gpfs_fs_type); if (rc) goto xerror; /* We create a dummy super block for purposes of instantiating * a shutdown file descriptor. When the daemon dies this file * will be closed and its special ops will be called. * See cxiRegisterCleanup() */ shutdownSuperP = cxiMallocPinned(sizeof(struct super_block)); if (!shutdownSuperP) { unregister_filesystem(&gpfs_fs_type); rc = -ENOMEM; goto xerror; } SET_SUPER_BLOCK(shutdownSuperP, &null_sops); xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REGFS, "gpfs_reg_fs shutdownSuperP 0x%lX rc %d\n", shutdownSuperP, rc); EXIT(0); return rc; } void gpfs_unreg_fs() { int rc; ENTER(0); rc = unregister_filesystem(&gpfs_fs_type); if (shutdownSuperP) { UNSET_SUPER_BLOCK(shutdownSuperP); cxiFreePinned(shutdownSuperP); shutdownSuperP = NULL; } TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_UNREGFS, "gpfs_unreg_fs rc %d\n", rc); EXIT(0); } void kill_mmfsd(void) { ENTER(0); if (mmfsd_id != -1) { TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_KILLMMFSD, "kill_mmfsd: pid %X\n", mmfsd_id); kill_proc(mmfsd_id, SIGTERM, 1); if (mmfsd_id != -1) #if LINUX_KERNEL_VERSION > 2060000 wait_event(pwq,0); #else sleep_on(&pwq); #endif } EXIT(0); } /* * Note: since this function is executed as kernel_thread "main" routine, * it may not be safe to use stack at all, e.g. call non-inlined functions, * at least in the success path. See comments e.g. in asm-i386/unistd.h */ int exec_mmfs(void *nothing) { static char *argv[] = { mmfs_path, mount_opt, NULL }; static char *envp[] = { "HOME=/", NULL }; int rc; ENTER(0); set_fs(KERNEL_DS); rc = EXEC_HELPER(mmfs_path, argv, envp, 1 /* wait if possible */); xerror: if(rc) TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_EXECMMFS_EXIT, "exec_mmfs: exit rc -1 errno %d path %s\n", errno, mmfs_path); EXIT(0); return rc; } int fork_mount_helper(char *data) { ENTER(0); strcpy(mount_opt, data); mount_id = kernel_thread(exec_mmfs, 0, 0); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_FORK_MOUNTHELPER, "fork_mount_helper: new pid %d\n", mount_id); EXIT(0); return mount_id; } /* Set device id and other information for a file system being mounted */ int cxiSetMountInfo(void *osVfsP, cxiDev_t sgDevID, int bsize, void *osRootNodeP, cxiNode_t *cnRootP, Boolean *releRootP, void *gnRootP, fsid_t fsid)/* (out) maintain hold on root */ { struct super_block *sbP = (struct super_block *)osVfsP; struct inode *rootIP = (struct inode *)osRootNodeP; // root dir inode int i; ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_SET_MOUNT_INFO, "cxiSetMountInfo: sbP 0x%lX rootIP 0x%lX cnRootP 0x%lX " "gnRootP 0x%lX\n", sbP, rootIP, cnRootP, gnRootP); DBGASSERT(sbP != NULL); /* This is the auto remount case where mmfsd died/killed and restarted. */ if (gnRootP == cnRootP) { /* Since the OS independent layer looked up and held the * root vnode, we've got too many hold counts for a reconnect. * Tell upper layer that we must release. */ *releRootP = true; } else { /* Don't attempt to release the root VFS node */ *releRootP = false; sbP->s_blocksize = bsize; for (i = sbP->s_blocksize, sbP->s_blocksize_bits = 0; i != 1; i >>= 1) sbP->s_blocksize_bits++; } if (rootIP != NULL) { DBGASSERT(rootIP->i_ino == INODENUM_ROOTDIR_FILE); DBGASSERT(rootIP->PRVINODE == cnRootP); } EXIT(0); return 0; } /* Attempt whatever we can to get holders of VFS elements * (dcache entries, etc) to leave. */ int vfsUserCleanup(struct super_block *sbP, struct gpfsVfsData_t *privVfsP, Boolean force) { struct siginfo sinfo; struct task_struct *g, *tsP; Boolean killit; int rc; ENTER(0); #ifndef GPFS_ARCH_POWER /* Forced unmount doesn't really work very well on Linux since * the VFS layer is very stateful. If a process is sitting in * the file system, its vmount count will not go to zero and a * proper unmount can occur. We're experimenting with the * semantics (akin to umount -k on other OSes) where processes * are killed if they are within a forced unmounted file system. * * Note that this doesn't get everyone. If you have a file open * in GPFS but don't have your current working directory in GPFS * then you're not killed. To kill those user (or close their * files) you'd have to traipse thru the file table. There's * a lot of OS specific code there that we wouldn't want to get * into. */ if (force) { sinfo.si_signo = SIGKILL; sinfo.si_errno = 0; sinfo.si_code = SI_KERNEL; sinfo.si_addr = vfsUserCleanup; sinfo.si_pid = current->pid; sinfo.si_uid = current->uid; // read_lock(&tasklist_lock); rcu_read_lock(); DO_EACH_THREAD(g,tsP) { task_lock(tsP); if (tsP->fs && tsP->fs->pwdmnt && tsP->fs->pwdmnt->mnt_sb == sbP) killit = true; else killit = false; task_unlock(tsP); if (killit) send_sig_info(SIGKILL, &sinfo, tsP); } WHILE_EACH_THREAD(g,tsP); // read_unlock(&tasklist_lock); rcu_read_unlock(); } #endif /* Purge cached OS VFS nodes/cxiNodes. */ rc = gpfs_ops.gpfsUncache(privVfsP); EXIT(0); return rc; } /* Called by gpfs_s_put_super() when the last holder of the superblock * is gone. We should be able to successfully clean up and become * unmounted. */ int cxiUnmount(void *osVfsP, Boolean force, Boolean doDMEvents) { int rc = 0; int dmrc = 0; struct super_block *sbP = (struct super_block *)osVfsP; struct gpfsVfsData_t *privVfsP; #ifdef DMAPI Boolean dmDoUnmountEvent = false; void *sgUidP = NULL; void *eventlistP = NULL; void *sessLocP = NULL; struct dentry *dP = NULL; struct inode *iP = NULL; cxiNode_t *cnP = NULL; #endif ENTER(0); LOGASSERT(sbP != NULL); privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CXIUNMOUNT_ENTER, "cxiUnmount: enter privVfsP 0x%lX sbP 0x%lX force %d doDM %d\n", privVfsP, sbP, force, doDMEvents); if (privVfsP == NULL) goto exit; #ifdef DMAPI dP = sbP->s_root; if (dP != NULL) iP = dP->d_inode; if (iP != NULL) cnP = VP_TO_CNP(iP); /* Generate preunmount event */ if (doDMEvents) { rc = gpfs_ops.gpfsDmUnmountEvent(true, force, privVfsP, cnP, &dmDoUnmountEvent, &sgUidP, &eventlistP, &sessLocP, 0); /* We should continue unmount even if it fails. Otherwise, linux screwup and cannot remount unless we shutdown the daemon */ } #endif /* The superblock is unallocated by the kernel after gpfs_s_put_super / cxiUnmount, regardless of any errors here because it doesn't check a return code from the filesystem specific put_super call, so we need to proceed through these calls even if an error occurs; not cleaning up things in gpfsFinishUnmount (ie, the gpfs mount list) after an error with unmount causes havoc when the daemon later restarts. */ rc = vfsUserCleanup(sbP, privVfsP, force); if (rc == ENOSYS) rc = 0; rc = gpfs_ops.gpfsUnmount(privVfsP, force); if (rc == ENOSYS) rc = 0; gpfs_ops.gpfsFinishUnmount(privVfsP); SBLOCK_PRIVATE(sbP) = NULL; #ifdef DMAPI if (dmDoUnmountEvent) dmrc = gpfs_ops.gpfsDmUnmountEvent(false, force, NULL, NULL, &dmDoUnmountEvent, &sgUidP, &eventlistP, &sessLocP, rc); #endif sbP->s_dirt = 0; printSuperList(sbP); /* An unmount decrements module use count */ #if LINUX_KERNEL_VERSION < 2060000 MY_MODULE_DECREMENT(); #endif exit: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_CXIUNMOUNT_EXIT, "cxiUnmount: exit rc %d dmrc %d\n", rc, dmrc); EXIT(0); return rc; } int cxiReactivateOSNode(void *osVfsP, cxiNode_t *cnP, void **osNodePP) { TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REACTIVATE_VNODE, "cxiReactivateOSNode: sbP 0x%lX cxiNodeP 0x%lX osNodePP 0x%lX\n", osVfsP, cnP, osNodePP); LOGASSERT(0); // not implemented on linux return 0; } #if LINUX_KERNEL_VERSION >= 2060000 static int inodeFindActor(struct inode *iP, void *opaqueP) { /* iget4 can be called on one thread which goes to create a new * inode (get_new_inode, gpfs_s_read_inode2, gpfsInodeRead, readOSNode) * but before that thread completes initializing the cxiNode_t, another * thread calls iget4 and gets here (find_inode, inodeFindActor). * Similar races exist when an inode is being deleted. * * Ideally, we'd like to spin_unlock() on the inode_lock and call * wait_on_inode() but we cannot release the inode_lock here (find_inode * is depending on it to protect its list_entry() calls). Fortunately, * iget4 does exactly this wait for the inode upon return from * find_inode. Returning zero here would cause get_new_inode to be * called (which would assert when it found the first thread had * already allocated the gnode). Return 1 and iget4 will do the * necessary wait. * * We can't call anything here that could sleep because we are holding * the inode_lock and sleeping can result in a hang * TRACE4N does not block and is ok here. */ TRACE4N(TRACE_VNODE, 2, TRCID_LINUXOPS_INODEFINDACTOR, "inodeFindActor: iP 0x%lX i_state 0x%x cxiNodeP 0x%lX isBad %d\n", iP, iP->i_state, VP_TO_CNP(iP), is_bad_inode(iP)); if (iP->i_state & INODE_IN_CACHE) return 1; if (VP_TO_CNP(iP) == NULL) { if (iP->i_state == 0) return 0; else return 1; } return gpfs_ops.gpfsInodeFindActor(VP_TO_CNP(iP), iP->i_ino, opaqueP); } static int inodeInitLocked(struct inode *iP, void *opaqueP) { cxiIGetArg_t *argsP = (cxiIGetArg_t *)opaqueP; iP->i_ino = argsP->extInodeNum; return 0; } #else static int inodeFindActor(struct inode *iP, unsigned long inodeNum, void *opaqueP) { /* iget4 can be called on one thread which goes to create a new * inode (get_new_inode, gpfs_s_read_inode2, gpfsInodeRead, readOSNode) * but before that thread completes initializing the cxiNode_t, another * thread calls iget4 and gets here (find_inode, inodeFindActor). * Similar races exist when an inode is being deleted. * * Ideally, we'd like to spin_unlock() on the inode_lock and call * wait_on_inode() but we cannot release the inode_lock here (find_inode * is depending on it to protect its list_entry() calls). Fortunately, * iget4 does exactly this wait for the inode upon return from * find_inode. Returning zero here would cause get_new_inode to be * called (which would assert when it found the first thread had * already allocated the gnode). Return 1 and iget4 will do the * necessary wait. * * We can't call anything here that could sleep because we are holding * the inode_lock and sleeping can result in a hang * TRACE3N does not block and is ok here. */ TRACE3N(TRACE_VNODE, 2, TRCID_LINUXOPS_INODEFINDACTOR2, "inodeFindActor: iP 0x%lX i_state 0x%x cxiNodeP 0x%lX\n", iP, iP->i_state, VP_TO_CNP(iP)); if (iP->i_state & INODE_IN_CACHE) return 1; if (VP_TO_CNP(iP) == NULL) { if (iP->i_state == 0) return 0; else return 1; } return gpfs_ops.gpfsInodeFindActor(VP_TO_CNP(iP), inodeNum, opaqueP); } #endif int cxiNewOSNode(void *osVfsP, cxiNode_t **cnPP, void **osNodePP, cxiIno_t inum, int nodeSize, void *opaqueP) { struct super_block *sbP = (struct super_block *)osVfsP; struct inode *iP; int rc; int loop_count = 0; int sleep_count = 0; ENTER(0); TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE, "cxiNewOSNode: sbP 0x%lX inum %d size %d", sbP, inum, nodeSize); /* The requested nodeSize must match CXINODE_SIZE */ if (nodeSize != CXINODE_SIZE) goto bad_node_size; repeat: #if LINUX_KERNEL_VERSION >= 2060000 iP = iget5_locked(sbP, inum, inodeFindActor, inodeInitLocked, opaqueP); #else iP = iget4(sbP, inum, inodeFindActor, opaqueP); #endif if (iP == NULL) { *cnPP = NULL; *osNodePP = NULL; rc = ENOMEM; goto xerror; } #if !HAS_SOP_READ_INODE2 /* We fill in the inode as opposed to a read_inode * operation executed with iget() */ if (iP->i_state & I_NEW) { gpfs_s_read_inode2(iP, opaqueP); unlock_new_inode(iP); } #endif if (is_bad_inode(iP)) { TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_NEW_BAD, "cxiNewOSNode: BAD INODE 0x%X\n", iP); *cnPP = NULL; *osNodePP = NULL; iput(iP); rc = EIO; goto xerror; } /* Did we get the right inode ? * When inodeFindActor is called from find_inode() and the inode * is in transition it might return found without checking sanpId * so go check again. */ #if LINUX_KERNEL_VERSION >= 2060000 if (!inodeFindActor(iP, opaqueP)) #else if (!inodeFindActor(iP, iP->i_ino, opaqueP)) #endif { if (sleep_count > 10) { TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_NEW_VNODE_2, "cxiNewOSNode: rc ESTALE inode 0x%lX ino %d i_state 0x%x " "cxiNodeP 0x%lX isBad %d\n", iP, iP->i_ino, iP->i_state, VP_TO_CNP(iP), is_bad_inode(iP)); *cnPP = NULL; *osNodePP = NULL; iput(iP); rc = EIO; goto xerror; } if (loop_count > 1000) { cxiSleep(10); sleep_count++; loop_count = 0; } loop_count++; iput(iP); goto repeat; } DBGASSERT(iP->PRVINODE != NULL); *cnPP = (cxiNode_t *)iP->PRVINODE; *osNodePP = iP; rc = 0; xerror: TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_EXIT, "cxiNewOSNode: exit osNodeP 0x%lX cnP 0x%lX rc %d\n", *osNodePP, *cnPP, rc); EXIT(0); return rc; bad_node_size: /* The requested nodeSize does not match CXINODE_SIZE. Whoever called us is an incompitble version of the code or was somehow not compiled correctly. */ TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_BAD, "cxiNewOSNode: requested nodeSize %d does not match CXINODE_SIZE %d", nodeSize, CXINODE_SIZE); printk("mmfs: module inconsistency detected in cxiNewOSNode:\n" " requested nodeSize %d does not match CXINODE_SIZE %d\n", nodeSize, CXINODE_SIZE); LOGASSERT(!"nodeSize != CXINODE_SIZE"); EXIT(0); return ELIBBAD; } /* The linux kernel decrements the inode count and deallocates the * inode after gpfs_s_put_inode() is called therefore this routine * doesn't perform a delete. */ void cxiFreeOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP) { struct super_block *sbP = (struct super_block *)osVfsP; struct inode *iP = (struct inode *)osNodeP; ENTER(0); TRACE5(TRACE_VNODE, 2, TRCID_LINUXOPS_DELETE_VNODE, "cxiFreeOSNode enter: sbP 0x%lX cxiNodeP 0x%lX " "iP 0x%lX inode %d i_count %d\n", sbP, cnP, iP, iP ? iP->i_ino : -1, iP ? atomic_read((atomic_t *)&iP->i_count) : 0); DBGASSERT(cnP->osNodeP == iP); cnP->osNodeP = NULL; if (iP) { DBGASSERT(atomic_read((atomic_t *)&iP->i_count) == 0); iP->i_op = NULL; iP->i_fop = NULL; if (iP->i_mapping) iP->i_mapping->a_ops = &gpfs_aops_after_inode_delete; iP->i_size = 0; iP->i_nlink = 0; } EXIT(0); } void cxiDeleteMmap(cxiVmid_t segid) { TRACE1(TRACE_VNODE, 2, TRCID_LINUXOPS_DELETE_MMAP, "cxiDeleteMmap: segid 0x%X\n", segid); } void cxiReinitOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP) { struct super_block *sbP = (struct super_block *)osVfsP; struct inode *iP = (struct inode *)osNodeP; TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REINIT_VNODE, "cxiReinitOSNode: sbP 0x%lX cnP 0x%lX iP 0x%lX\n", sbP, cnP, iP); LOGASSERT(0); // not implemented on linux } void cxiDumpOSNode(cxiNode_t *cnP) { struct inode *iP = (struct inode *)cnP->osNodeP; struct list_head *dListP, *dHeadP; struct dentry *dentry; ENTER(0); TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_DUMP_VNODE, "cxiDumpOSNode: cxiNodeP 0x%lX iP 0x%lX\n", cnP, iP); if (iP) { printInode(iP); dHeadP = &iP->i_dentry; spin_lock(&dcache_lock); for (dListP = dHeadP->next; dListP != dHeadP; dListP = dListP->next) { dentry = list_entry(dListP, struct dentry, d_alias); printDentry(dentry); } spin_unlock(&dcache_lock); } EXIT(0); } #if LINUX_KERNEL_VERSION >= 2060000 static int igrabInodeFindActor(struct inode *iP, void *opaqueP) { /* igrab can be called while another thread is doing a finial iput * so instead we are call ilookup5. ilookup5 processes stuff under * the inode_lock so if we are in here and find the inode then * ilookup5 will increase i_count * * We can't call anything here that could sleep because we are holding * the inode_lock and sleeping can result in a hang * TRACE4N does not block and is ok here. */ TRACE3N(TRACE_VNODE, 2, TRCID_LINUXOPS_IGRABINODEFINDACTOR, "igrabInodeFindActor: iP 0x%lX i_state 0x%x inode 0x%lX \n", iP, iP->i_state, (struct inode *) opaqueP); if (iP->i_state & INODE_BEING_RELEASED) return 0; if (iP != (struct inode *) opaqueP) return 0; return 1; } #endif /* On linux we can't just decrement the i_count * thus this routine will only accept a positive * increment. If you want to put a reference then * call cxiPutOSNode() which calls back thru the VFS * layer. */ int cxiRefOSNode(void *osVfsP, cxiNode_t *cnP, void *osNodeP, int inc) { return cxiRefOsNode(osVfsP,cnP,osNodeP,inc,false); } int cxiRefOsNode(void *osVfsP, cxiNode_t *cnP, void *osNodeP, int inc, Boolean calledFromRevoke) { struct inode *iP = (struct inode *)osNodeP; struct inode *riP = NULL; int holdCount; int ino; ENTER(0); DBGASSERT(iP != NULL); DBGASSERT(inc == 1); #if LINUX_KERNEL_VERSION >= 2060000 /* The igrab() may fail if this inode is actively going * thru a release. */ if(osVfsP) { /* we already have a hold */ riP = igrab(iP); } /* we may not currently have a hold so use ilookup5 */ else if(GPFS_TYPE(iP)) { riP = ilookup5(iP->i_sb, iP->i_ino, igrabInodeFindActor, (void*)iP); } #else /* The igrab() may fail if this inode is actively going * thru a release. */ riP = igrab(iP); #endif if (riP) { DBGASSERT(!(iP->i_state & INODE_BEING_RELEASED)); holdCount = atomic_read((atomic_t *)&riP->i_count); ino = riP->i_ino; } else { holdCount = 0; ino = -1; /* If this function is called from revoke handler check of this inode is being released */ if (calledFromRevoke && (iP->i_state & INODE_BEING_RELEASED) ) holdCount = -1; } TRACE5(TRACE_VNODE, 2, TRCID_LINUXOPS_REF_VNODE, "cxiRefOSNode exit: sbP 0x%lX cxiNodeP 0x%lX iP 0x%lX inode %d " "i_count to %d", osVfsP, cnP, iP, ino, holdCount); EXIT(0); return holdCount; } /* Determines if OS node is inactive */ int cxiInactiveOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP, Boolean *canCacheP, Boolean *hasReferencesP) { struct inode *iP = (struct inode *)osNodeP; struct super_block *sbP = (struct super_block *)osVfsP; int holdCount; ENTER(0); DBGASSERT(cnP->osNodeP == iP); *canCacheP = false; *hasReferencesP = false; holdCount = atomic_read((atomic_t *)&iP->i_count); if (holdCount > 0) *hasReferencesP = true; TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_INACTIVE_VNODE, "cxiInactiveOSNode: sbP 0x%lX cxiNodeP 0x%lX iP 0x%lX " "i_count %d canCache %d hasReferences %d\n", sbP, cnP, iP, holdCount, *canCacheP, *hasReferencesP); EXIT(0); return holdCount; } void cxiPutOSNode(void *vP) { struct inode *iP = (struct inode *)vP; int holdCount; ENTER(0); DBGASSERT(iP != NULL); DBGASSERT(!(iP->i_state & INODE_BEING_RELEASED)); holdCount = atomic_read((atomic_t *)&iP->i_count); DBGASSERT(holdCount > 0); TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_PUT_OSNODE, "cxiPutOSNode enter: iP 0x%lX inode %d i_count to %d\n", iP, iP->i_ino, holdCount-1); iput(iP); EXIT(0); return; } void cxiDestroyOSNode(void *vP) { struct inode *iP = (struct inode *)vP; int holdCount; ENTER(0); DBGASSERT(iP != NULL); holdCount = atomic_read((atomic_t *)&iP->i_count); DBGASSERT(holdCount > 0); TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_DESTROY_OSNODE, "cxiDestroyOSNode enter: iP 0x%lX inode %d i_count %d i_nlink %d\n", iP, iP->i_ino, holdCount, iP->i_nlink); iP->i_nlink = 0; EXIT(0); return; } void cxiSetOSNodeType(struct cxiNode_t *cnP, cxiMode_t mode, cxiDev_t dev) { ENTER(0); if (S_ISDIR(mode)) cnP->nType = cxiVDIR; else if (S_ISREG(mode)) cnP->nType = cxiVREG; else if (S_ISLNK(mode)) cnP->nType = cxiVLNK; else if (S_ISCHR(mode)) cnP->nType = cxiVCHR; else if (S_ISBLK(mode)) cnP->nType = cxiVBLK; else if (S_ISFIFO(mode)) cnP->nType = cxiVFIFO; else if (S_ISSOCK(mode)) cnP->nType = cxiVSOCK; else DBGASSERT(0); EXIT(0); } void cxiUpdateInode(cxiNode_t *cnP, cxiVattr_t *attrP, int what) { struct inode *iP = (struct inode *)cnP->osNodeP; ENTER(0); if (iP != NULL) { if (what & CXIUP_ATIME) { CXITIME_TO_INODETIME(attrP->va_atime, iP->i_atime); EXIT(0); return; } if (what & CXIUP_MODE) { iP->i_mode = attrP->va_mode; CXITIME_TO_INODETIME(attrP->va_ctime, iP->i_ctime); } if (what & CXIUP_OWN) { iP->i_mode = attrP->va_mode; iP->i_uid = attrP->va_uid; iP->i_gid = attrP->va_gid; CXITIME_TO_INODETIME(attrP->va_ctime, iP->i_ctime); } if (what & CXIUP_NLINK) { iP->i_nlink = attrP->va_nlink; } if (what & CXIUP_SIZE) { iP->i_size = attrP->va_size; iP->i_blocks = attrP->va_blocks; } if (what & CXIUP_SIZE_BIG) { spin_lock(&inode_lock); if (attrP->va_size > iP->i_size) { iP->i_size = attrP->va_size; iP->i_blocks = attrP->va_blocks; } spin_unlock(&inode_lock); } if (what & CXIUP_TIMES) { CXITIME_TO_INODETIME(attrP->va_atime, iP->i_atime); CXITIME_TO_INODETIME(attrP->va_mtime, iP->i_mtime); CXITIME_TO_INODETIME(attrP->va_ctime, iP->i_ctime); } if (what & CXIUP_PERM) { iP->i_mode = attrP->va_mode; iP->i_uid = attrP->va_uid; iP->i_gid = attrP->va_gid; cnP->xinfo = attrP->va_xinfo; setIopTable(iP, (attrP->va_xinfo & VA_XPERM) != 0); cnP->icValid |= CXI_IC_PERM; } if ((what & CXIUP_NLINK) && TestCtFlag(cnP,destroyIfDelInode)) { cxiDropInvalidDCacheEntry(cnP); /* swapd must be notified to prune dcache entries */ if (TestCtFlag(cnP, pruneDCacheNeeded)) gpfs_ops.gpfsSwapdEnqueue(cnP); } } TRACE4(TRACE_VNODE, 3, TRCID_CXIUPDATE_INODE_3, "cxiUpdateInode: iP 0x%X atime 0x%X mtime 0x%X ctime 0x%X\n", iP, GET_INODETIME_SEC(iP->i_atime), GET_INODETIME_SEC(iP->i_mtime), GET_INODETIME_SEC(iP->i_ctime)); TRACE7(TRACE_VNODE, 3, TRCID_CXIUPDATE_INODE_1, "cxiUpdateInode: what %d mode 0x%X uid %d gid %d nlink %d size %lld" " blocks %d\n", what, iP->i_mode, iP->i_uid, iP->i_gid, iP->i_nlink, iP->i_size, iP->i_blocks); EXIT(0); } /* Determine if operating system specific node belongs to a particular VFS and can be uncached. Returns OS node if it exists, the determination of whether it can be uncached or not. */ Boolean cxiCanUncacheOSNode(void *osVfsP, struct cxiNode_t *cnP, void **vP) { struct inode *iP = (struct inode *)cnP->osNodeP; int count = 0; ENTER(0); if (iP != NULL && iP->i_sb == osVfsP) { count = atomic_read((atomic_t *)&iP->i_count); *vP = (void *)iP; } else *vP = NULL; TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_CANUNCACHE_OSNODE, "cxiCanUncacheOSNode: cxiNode 0x%lx vP 0x%lX osVfsP 0x%lX " "i_sb 0x%lX inode %d i_count %d\n", cnP, vP, osVfsP, (iP ? iP->i_sb : 0), (iP ? iP->i_ino : 0), count); EXIT(0); return (count == 0); } /* Add operating system specific node to the lookup cache. This routine is called with the necessary distributed lock held to guarantee that the lookup cache entry is valid. */ #ifdef CCL void * cxiAddOSNode(void *dentryP, void *vP, DentryOpTableTypes dopTabType, int lookup) #else void * cxiAddOSNode(void *dentryP, void *vP, int lookup) #endif { struct inode *iP = (struct inode *)vP; struct dentry *dP = (struct dentry *)dentryP; ENTER(0); TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_ADD_OSNODE, "cxiAddOSNode: dentry 0x%lX vP 0x%lX unhashed %d", dentryP, vP, d_unhashed(dP)); /* mark dentry valid */ #ifdef CCL switch(dopTabType) { /* Positive dcache entry for inexact file name match for Samba user. Only valid for other Samba users. Not valid for local/NFS users. Forces lookup for local/NFS users. */ case DOpOnlyValidIfSamba: dP->d_op = &gpfs_dops_valid_if_Samba; break; /* Negative dcache entry for exact file name match for local/NFS user. Only valid for other local/NFS users. Not valid for Samba users. Forces lookup for Samba users. */ case DOpInvalidIfSamba: dP->d_op = &gpfs_dops_invalid_if_Samba; break; default: dP->d_op = &gpfs_dops_valid; break; } #else dP->d_op = &gpfs_dops_valid; #endif if (!d_unhashed(dP)) { /* hook up dentry and inode */ d_instantiate(dP, iP); dP = NULL; } else { #if LINUX_KERNEL_VERSION >= 2060000 if (lookup) { dP = d_splice_alias(iP, dP); goto exit; } #endif /* hook up dentry and inode */ d_instantiate(dP, iP); /* if not yet done so, add to hash list */ d_rehash(dP); dP = NULL; } exit: EXIT(0); return dP; } #ifdef NFS4_CLUSTER /* get list of fs locations, return number of locations */ int gpfs_s_fs_locations(struct super_block *sbP, char **bufP) { int rc; int code = 0; int loc_count; struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); LOGASSERT(privVfsP != NULL); ENTER(0); VFS_STAT_START(fsLocationCall); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_LOCFS_ENTER, "gpfs_s_fs_locations enter: sbP 0x%lX\n", sbP); rc = gpfs_ops.gpfsFsLocations(privVfsP, bufP, &loc_count); if (rc) { rc = -rc; code = 1; goto xerror; } rc = loc_count; xerror: TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LOCFS_EXIT, "gpfs_s_fs_locations exit: sbP 0x%lX code %d rc %d\n", sbP, code, rc); VFS_STAT_STOP; EXIT(0); return rc; } #endif /* Functions for converting between an NFS file handle and a dentry. We define our own functions rather than using the generic ones in fs/nfsd/nfsfh.c so we can revalidate the file inode, since it could have been changed by another node. */ static struct dentry * gpfs_nfsd_iget_dentry(struct inode *inode, __u32 generation) { struct list_head *lp; struct dentry *result; ENTER(0); TRACE2(TRACE_VNODE, 3, TRCID_NFSD_IGET_DENTRY_1, "gpfs_nfsd_iget_dentry: inode %d generation %d", inode->i_ino, generation); /* Now find a dentry. If possible, get a well-connected one. */ spin_lock(&dcache_lock); for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) { result = list_entry(lp, struct dentry, d_alias); if (!(result->d_flags & DCACHE_DFLAGS_DISCONNECTED)) { dget_locked(result); #if LINUX_KERNEL_VERSION >= 2060000 result->d_flags |= DCACHE_REFERENCED; #else result->d_vfs_flags |= DCACHE_REFERENCED; #endif spin_unlock(&dcache_lock); if (result->d_inode != inode) { TRACE4(TRACE_VNODE, 11, TRCID_NFSD_IGET_31, "gpfs_nfsd_iget_dentry:0 dentry flags 0x%x count %d inode 0x%lX " "time %lu", result->d_flags, atomic_read(&result->d_count), result->d_inode, result->d_time); TRACE7(TRACE_VNODE, 11, TRCID_NFSD_IGET_41, "gpfs_nfsd_iget_dentry:0 Inode %lu nlink %d count %d gen %u %u " "state %lu flags 0x%x", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation, inode->i_state, inode->i_flags); dput(result); goto build_dentry; } if (gpfs_i_revalidate(result)) { TRACE4(TRACE_VNODE, 11, TRCID_NFSD_IGET_3, "gpfs_nfsd_iget_dentry:1 dentry flags 0x%x count %d inode 0x%lX " "time %lu", result->d_flags, atomic_read(&result->d_count), result->d_inode, result->d_time); TRACE7(TRACE_VNODE, 1, TRCID_NFSD_IGET_4, "gpfs_nfsd_iget_dentry:1 Inode %lu nlink %d count %d gen %u %u " "state %lu flags 0x%x", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation, inode->i_state, inode->i_flags); iput(inode); dput(result); EXIT(0); return ERR_PTR(-ESTALE); } if (generation && generation != 0xffffffff && /* GENNUM_UNKNOWN */ inode->i_generation != generation) { /* we didn't find the right inode.. */ TRACE4(TRACE_VNODE, 11, TRCID_NFSD_IGET_5, "gpfs_nfsd_iget_dentry:2 dentry flags 0x%x count %d inode 0x%lX " "time %lu", result->d_flags, atomic_read(&result->d_count), result->d_inode, result->d_time); TRACE7(TRACE_VNODE, 11, TRCID_NFSD_IGET_6, "gpfs_nfsd_iget_dentry:2 Inode %lu nlink %d count %d gen %u %u " "state %lu flags 0x%x", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation, inode->i_state, inode->i_flags); iput(inode); dput(result); EXIT(0); return ERR_PTR(-ESTALE); } iput(inode); EXIT(0); return result; } } spin_unlock(&dcache_lock); build_dentry: #if LINUX_KERNEL_VERSION < 2060000 result = d_alloc_root(inode); #else result = d_alloc_anon(inode); #endif if (result == NULL) { iput(inode); EXIT(0); return ERR_PTR(-ENOMEM); } #if LINUX_KERNEL_VERSION < 2060000 result->d_flags |= DCACHE_DFLAGS_DISCONNECTED; #endif if (gpfs_i_revalidate(result)) { TRACE4(TRACE_VNODE, 11, TRCID_NFSD_IGET_7, "gpfs_nfsd_iget:3 dentry flags 0x%x count %d inode 0x%lX time %lu", result->d_flags, atomic_read(&result->d_count), result->d_inode, result->d_time); TRACE7(TRACE_VNODE, 11, TRCID_NFSD_IGET_8, "gpfs_nfsd_iget:3 Inode %lu nlink %d count %d gen %u %u " "state %lu flags 0x%x", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation, inode->i_state, inode->i_flags); /* The dput call here is releases the dcache entry that was * allocated by to d_alloc_root. It also results in an iput effectively * removing the hold we place by our iget call above. */ dput(result); EXIT(0); return ERR_PTR(-ESTALE); } if (generation && generation != 0xffffffff && /* GENNUM_UNKNOWN */ inode->i_generation != generation) { /* we didn't find the right inode.. */ TRACE4(TRACE_VNODE, 11, TRCID_NFSD_IGET_9, "gpfs_nfsd_iget:4 dentry flags 0x%x count %d inode 0x%lX time %lu", result->d_flags, atomic_read(&result->d_count), result->d_inode, result->d_time); TRACE7(TRACE_VNODE, 11, TRCID_NFSD_IGET_10, "gpfs_nfsd_iget:4 Inode %lu nlink %d count %d gen %u %u " "state %lu flags 0x%x", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation, inode->i_state, inode->i_flags); /* Release the dcache entry. This also does an iput. */ dput(result); EXIT(0); return ERR_PTR(-ESTALE); } EXIT(0); return result; } static struct dentry * gpfs_nfsd_iget(struct super_block *sbP, unsigned long ino, cxiIGetArg_t *argP, __u32 generation) { int rc; struct inode *inode; struct gpfsVfsData_t *privVfsP; ENTER(0); TRACE6(TRACE_VNODE, 3, TRCID_NFSD_IGET_1, "gpfs_nfsd_iget: sbP 0x%lX extino %d inode %d snapid %d " "fileset %d generation %d", sbP, ino, argP->inodeNum, argP->snapId, argP->filesetId, generation); /* get the inode */ if (ino == 0) { EXIT(0); return ERR_PTR(-ESTALE); } /* Callers have set inodeNum/snapId in argP. vattrP is NULL and * readInodeCalled is false, but these will be set appropriately in * gpfsNFSIget after it obtains the attributes. */ privVfsP = (struct gpfsVfsData_t *)cxiGetPrivVfsP(sbP); rc = gpfs_ops.gpfsNFSIget(privVfsP, argP, generation, (void **)&inode); if (rc) { cxiErrorNFS(rc); EXIT(0); return ERR_PTR(-rc); } if (inode == NULL) { EXIT(0); return ERR_PTR(-ENOMEM); } if (is_bad_inode(inode)) { EXIT(0); return ERR_PTR(-ESTALE); } /* gpfsNFSIget will have called findOrCreateLinux/cxiNewOSNode which * makes the iget call along with the inodeFindActor validation. */ EXIT(0); return(gpfs_nfsd_iget_dentry(inode,generation)); } #if LINUX_KERNEL_VERSION >= 2060000 /* export_operations for nfsd communication with our file system * via gpfs_export_ops */ /* * gpfs_get_dparent: (get_parent) find the parent dentry for a given dentry */ struct dentry *gpfs_get_dparent(struct dentry * child) { int rc = 0; struct dentry *result = NULL; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; cxiNode_t *dcnP; cxiIno_t iNum = (cxiIno_t)-1; cxiNode_t *cnP = NULL; struct inode *newInodeP = NULL; struct dentry *retP; ENTER(0); VFS_INC(get_parentCall); TRACE2(TRACE_VNODE, 3, TRCID_GET_DPARENT_ENTER, "gpfs_get_dparent: dentry 0x%lX inode 0x%d", child, child->d_inode->i_ino); dcnP = VP_TO_CNP(child->d_inode); privVfsP = VP_TO_PVP(child->d_inode); DBGASSERT(privVfsP != NULL); setCred(&eCred); if (!dcnP) { /* This can happen due to a bug in linux/fs/dcache.c (prune_dcache) where "count" entries are to be pruned, but the last one is found to be recently referenced. When this happens, count is decremented, but the loop is not terminated. The result is that it continues to prune entries past where it should (prunes everything). If our patch for this is not applied, the result is a kernel failure as the cxiNode is referenced. Checking here (and revalidate) allows us to reject the call instead. */ PRINTINODE(child->d_inode); result = (struct dentry *)ERR_PTR(-ESTALE); goto xerror; } rc = gpfs_ops.gpfsLookup(privVfsP, (void *)child->d_inode, dcnP, NULL, (char *)"..", (void **)&newInodeP, &cnP, &iNum, NULL, NULL, &eCred, (void **)&retP); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); result = gpfs_nfsd_iget_dentry(newInodeP, (__u32)newInodeP->i_generation); } else { cxiErrorNFS(rc); result = (struct dentry *)ERR_PTR(-rc); iNum = -1; } xerror: TRACE4(TRACE_VNODE, 3, TRCID_GET_DPARENT_EXIT, "gpfs_get_dparent dentry 0x%lX inode %d result %lX err%d \n", child, iNum, result, IS_ERR(result)? PTR_ERR(result): 0); EXIT(0); return result; } /* * gpfs_get_dentry: (get_dentry) find dentry for the inode given a file handle */ struct dentry *gpfs_get_dentry(struct super_block *sbP, void * vdata) { __u32 *data=vdata; unsigned long ino; cxiIGetArg_t arg; __u32 generation; struct dentry *result; ENTER(0); VFS_INC(get_dentryCall); ino = data[0]; if (IS_SNAPROOTDIR_EXT_INO(ino)) arg.inodeNum = SNAPROOTDIR_INT_INO; else if (IS_SNAPLINKDIR_EXT_INO(ino)) arg.inodeNum = data[3]; else arg.inodeNum = ino; arg.snapId = data[1]; generation = data[2]; arg.extInodeNum = ino; arg.filesetId = (unsigned)-1; //FIXME arg.vattrP = NULL; arg.readInodeCalled = false; result = gpfs_nfsd_iget(sbP, ino, &arg, generation); EXIT(0); return result; } /* It is acceptable to create a disconnected dentry for pNFS since it is used only for read/write. The check if it was exported is not required since the call to the MDS will verify that the file is open. */ static int gpfs_acceptable(void *expv, struct dentry *dentry) { if (dentry && dentry->d_inode) { #ifdef GPFS_PRINTK printk("gpfs_acceptable ino %d\n", dentry->d_inode->i_ino); #endif return 1; } return 0; } /* * gpfs_decode_fh: (decode_fh) decode a file handle returning ptr to it's dentry */ struct dentry * gpfs_decode_fh(struct super_block *sbP, __u32 *fh, int len, int fhtype, int (*acceptable)(void *context, struct dentry *de), void *context) { #if LINUX_KERNEL_VERSION == 2060800 int len = *lenP; #endif struct dentry *result; __u32 parent[4]={0}; ENTER(0); VFS_INC(decode_fhCall); #ifdef GPFS_PRINTK printk("gpfs_decode_fh %08x %08x %08x %08x %08x %08x %08x\n", fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6]); #endif TRACE4(TRACE_VNODE, 3, TRCID_DECODE_FH_1, "gpfs_decode_fh: sbP 0x%lX fh 0x%lX, len %d type %d", sbP, fh, len, fhtype); if (fhtype > 4 && fhtype < 8 && len >= 5) { parent[0]=fh[3]; /* ino */ parent[1]=fh[4]; /* p_sid */ if (len>5) { parent[2]=fh[5]; /* generation */ parent[3]=fh[3]; /* ino */ } if (cxiIsLockdThread() // check for lockd thread #ifdef P_NFS4 || fhtype == 7 // it is a pNFS fh, disconnected fh is acceptable. #endif ) result = sbP->s_export_op->find_exported_dentry(sbP, fh, parent, gpfs_acceptable, context); else result = sbP->s_export_op->find_exported_dentry(sbP, fh, parent, acceptable, context); TRACE4(TRACE_VNODE, 3, TRCID_DECODE_FH_2, "gpfs_decode_fh: sbP 0x%lX fh 0x%lX result %lX err %d", sbP, fh, result, IS_ERR(result)? PTR_ERR(result): 0); #if LINUX_KERNEL_VERSION == 2060800 *lenP = 0; #endif if (IS_ERR(result)) cxiErrorNFS(PTR_ERR(result)); EXIT(0); return result; } TRACE2(TRACE_VNODE, 3, TRCID_DECODE_FH_3, "gpfs_decode_fh: sbP 0x%lX fh 0x%lX -EINVAL", sbP, fh); EXIT(0); return ERR_PTR(-EINVAL); } /* * gpfs_encode_fh: (encode_fh) encode a file handle from the given dentry */ int gpfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, int need_parent) { UInt32 d_sid, p_sid; ENTER(0); VFS_INC(encode_fhCall); if (*lenp < 5) { EXIT(0); return 255; } if (gpfs_ops.gpfsGetSnapIdPair(VP_TO_CNP(dentry->d_inode), &d_sid, &p_sid) != 0) { EXIT(0); return 255; } fh[0] = (__u32) dentry->d_inode->i_ino; fh[1] = d_sid; fh[2] = (__u32) dentry->d_inode->i_generation; fh[3] = (__u32) dentry->d_parent->d_inode->i_ino; fh[4] = p_sid; if (*lenp > 5) { /* There was enough room to compelete parent */ fh[5] = (__u32) dentry->d_parent->d_inode->i_generation; *lenp = 6; } else *lenp = 5; EXIT(0); return *lenp; } #else struct dentry *gpfs_fh_to_dentry(struct super_block *sbP, __u32 *fh, int len, int fhtype, int parent) { unsigned long ino; cxiIGetArg_t arg; __u32 generation; struct dentry *result; ENTER(0); TRACE5(TRACE_VNODE, 3, TRCID_FH_TO_DENTRY_1, "gpfs_fh_to_dentry: sbP 0x%lX fh 0x%lX, len %d type %d parent %d", sbP, fh, len, fhtype, parent); if (fhtype == 3 && len >= 5) { if (parent) { ino = fh[3]; if (IS_SNAPROOTDIR_EXT_INO(ino)) arg.inodeNum = SNAPROOTDIR_INT_INO; else if (IS_SNAPLINKDIR_EXT_INO(ino)) arg.inodeNum = IS_SNAPROOTDIR_EXT_INO(fh[0]) ? SNAPROOTDIR_INT_INO : fh[0]; else arg.inodeNum = ino; arg.snapId = fh[4]; generation = 0xffffffff; /* GENNUM_UNKNOWN */ } else { ino = fh[0]; if (IS_SNAPROOTDIR_EXT_INO(ino)) arg.inodeNum = SNAPROOTDIR_INT_INO; else if (IS_SNAPLINKDIR_EXT_INO(ino)) arg.inodeNum = fh[3]; else arg.inodeNum = ino; arg.snapId = fh[1]; generation = fh[2]; } arg.filesetId = (unsigned)-1; // FIXME arg.vattrP = NULL; arg.readInodeCalled = false; result = gpfs_nfsd_iget(sbP, ino, &arg, generation); TRACE4(TRACE_VNODE, 3, TRCID_FH_TO_DENTRY_2, "gpfs_fh_to_dentry: sbP 0x%lX fh 0x%lX result %lX err %d", sbP, fh, result, IS_ERR(result)? PTR_ERR(result): 0); EXIT(0); return result; } TRACE2(TRACE_VNODE, 3, TRCID_FH_TO_DENTRY_3, "gpfs_fh_to_dentry: sbP 0x%lX fh 0x%lX -EINVAL", sbP, fh); EXIT(0); return ERR_PTR(-EINVAL); } int gpfs_dentry_to_fh(struct dentry *dentry, __u32 *fh, int *lenp, int need_parent) { UInt32 d_sid, p_sid; if (*lenp < 5) return 255; ENTER(0); if (gpfs_ops.gpfsGetSnapIdPair(VP_TO_CNP(dentry->d_inode), &d_sid, &p_sid) != 0) { EXIT(0); return 255; } fh[0] = (__u32) dentry->d_inode->i_ino; fh[1] = d_sid; fh[2] = (__u32) dentry->d_inode->i_generation; fh[3] = (__u32) dentry->d_parent->d_inode->i_ino; fh[4] = p_sid; *lenp = 5; EXIT(0); return 3; } #endif void printSuper(struct super_block *sbP) { if (!_TRACE_IS_ON(TRACE_VNODE, 3)) return; /* private field won't make much sense for non-GPFS file systems */ TRACE4N(TRACE_VNODE, 3, TRCID_PRINTSUPER_1, "printSuper: sbP 0x%lX magic 0x%lX type 0x%lX private 0x%lX\n", sbP, sbP->s_magic, sbP->s_type, SBLOCK_PRIVATE(sbP)); TRACE3N(TRACE_VNODE, 3, TRCID_PRINTSUPER_3, "printSuper: s_dev 0x%X count 0x%X active %d\n", sbP->s_dev, sbP->s_count, atomic_read(&sbP->s_active)); } void printSuperList(struct super_block *sbP) { struct list_head *lP; struct super_block *sP; if (!_TRACE_IS_ON(TRACE_VNODE, 5)) return; /* Run through all super blocks starting from provided GPFS super block. */ /* Ideally we would lock sb_lock, but we can't access it, so small probability of this breaking, which is why it is at a higher trace level (vnode 5). */ TRACE0N(TRACE_VNODE, 5, TRCID_PRINTALLSUPER_1, "printSuperList:\n"); printSuper(sbP); list_for_each(lP, &sbP->s_list) { sP = sb_entry(lP); printSuper(sP); } }