/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)01 1.90.1.4 src/avs/fs/mmfs/ts/kernext/gpl-linux/inode.c, mmfs, avs_rgpfs24, rgpfs24s012a 4/17/07 15:54:47 */ /* * Inode operations * * Contents: * printInode * printDentry * cxiSetOSNode * cxiInvalidatePerm * getIattr * get_umask * setCred * gpfs_i_create * gpfs_i_lookup * gpfs_i_link * gpfs_i_unlink * gpfs_i_symlink * gpfs_i_mkdir * gpfs_i_rmdir * gpfs_i_mknod * gpfs_i_rename * gpfs_i_readlink * gpfs_i_follow_link * gpfs_i_readpage (in mmap.c) * gpfs_i_writepage (in mmap.c) * gpfs_i_bmap * gpfs_i_truncate * gpfs_i_permission * gpfs_i_smap * gpfs_i_updatepage * gpfs_i_revalidate * gpfs_i_setattr * gpfs_i_setattr_internal * gpfs_i_getattr * gpfs_i_getattr_internal * gpfs_i_lock * gpfs_i_getxattr * gpfs_i_setxattr * gpfs_i_listxattr * gpfs_i_removexattr */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_KERNEL_VERSION > 2060000 #include #endif #ifdef MODULE #include #endif /* MODULE */ void printInode(struct inode *iP) { TRACE7(TRACE_VNODE, 3, TRCID_PRINTINODE_1, "printInode: iP 0x%lX inode %d (0x%X) i_count %d dev 0x%X " "mode 0x%X nlink %d\n", iP, iP->i_ino, iP->i_ino, atomic_read((atomic_t *)&iP->i_count), KDEV_INT(iP->i_rdev), iP->i_mode, iP->i_nlink); TRACE6(TRACE_VNODE, 3, TRCID_PRINTINODE_2, "printInode: uid %d gid %d rdev 0x%X atime 0x%X mtime 0x%X " "ctime 0x%X\n", iP->i_uid, iP->i_gid, KDEV_INT(iP->i_rdev), GET_INODETIME_SEC(iP->i_atime), GET_INODETIME_SEC(iP->i_mtime), GET_INODETIME_SEC(iP->i_ctime)); TRACE5(TRACE_VNODE, 3, TRCID_PRINTINODE_4, "printInode: size %lld blksize 0x%X blocks %d ver 0x%X op 0x%lX\n", iP->i_size, iP->i_blocks, iP->i_blocks, iP->i_version, iP->i_op); TRACE6(TRACE_VNODE, 3, TRCID_PRINTINODE_5, "printInode: fop 0x%lX sb 0x%lX flags 0x%X state 0x%X gen %d " "generic 0x%lX\n", iP->i_fop, iP->i_sb, iP->i_flags, iP->i_state, iP->i_generation, iP->PRVINODE); TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_6, "printInode: list 0x%lX next 0x%lX prev 0x%lX\n", &(iP->i_list), iP->i_list.next, iP->i_list.prev); TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_7, "printInode: dentry 0x%lX next 0x%lX prev 0x%lX\n", &(iP->i_dentry), iP->i_dentry.next, iP->i_dentry.prev); #if LINUX_KERNEL_VERSION < 2050000 TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_8, "printInode: hash 0x%lX next 0x%lX prev 0x%lX\n", &(iP->i_hash), iP->i_hash.next, iP->i_hash.prev); #else TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_9, "printInode: hash 0x%lX next 0x%lX prev 0x%lX\n", &(iP->i_hash), iP->i_hash.next, *iP->i_hash.pprev); #endif } void printDentry(struct dentry *dP) { struct inode *iP = dP->d_inode; if (!_TRACE_IS_ON(TRACE_VNODE, 3)) return; TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_1, "printDentry: dentry 0x%lX count %d name '%s'\n", dP, atomic_read((atomic_t *)&dP->d_count), dP->d_name.name); TRACE5N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_2, "printDentry: time 0x%X op 0x%lX flags 0x%X parent 0x%lX " "inode 0x%X\n", dP->d_time, dP->d_op, dP->d_flags, dP->d_parent, iP); if (iP) { if (!list_empty(&iP->i_dentry)) TRACE4N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3A, "printDentry: i_ino %d i_count %d " "i_dentry next 0x%lX i_dentry prev 0x%lX\n", iP->i_ino, atomic_read((atomic_t *)&iP->i_count), list_entry(iP->i_dentry.next, struct dentry, d_alias), list_entry(iP->i_dentry.prev, struct dentry, d_alias)); else TRACE2N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3B, "printDentry: i_ino %d i_count %d\n", iP->i_ino, atomic_read((atomic_t *)&iP->i_count)); } TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3C, "printDentry: &d_hash 0x%lX d_hash.next 0x%lX d_hash.prev 0x%lX\n", &dP->d_child, dP->d_child.next, dP->d_child.prev); TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_4, "printDentry: &child 0x%lX child.next 0x%lX child.prev 0x%lX\n", &dP->d_child, dP->d_child.next, dP->d_child.prev); if (!list_empty(&dP->d_subdirs)) TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_5, "printDentry: &subdirs 0x%lX subdir next 0x%lX " "subdir prev 0x%lX\n", &dP->d_subdirs, list_entry(dP->d_subdirs.next, struct dentry, d_child), list_entry(dP->d_subdirs.prev, struct dentry, d_child)); } /* Print directory entry tree up to maxPrint elements. * If maxPrint is 0 then there is no upper limit. */ void printDentryTree(struct dentry *entryDP, int maxPrint) { int count = 0; struct list_head *lhP; struct dentry *siblingDP; struct dentry *parentDP; /* Check trace level required by printDentry() */ if (!_TRACE_IS_ON(TRACE_VNODE, 3)) return; spin_lock(&dcache_lock); parentDP = entryDP; lhP = parentDP->d_subdirs.next; printDentry(parentDP); if (maxPrint > 0 && ++count >= maxPrint) goto xerror; if (list_empty(&parentDP->d_subdirs)) goto xerror; do { while (lhP != &parentDP->d_subdirs) { siblingDP = list_entry(lhP, struct dentry, d_child); printDentry(siblingDP); if (maxPrint > 0 && ++count >= maxPrint) goto xerror; if (!list_empty(&siblingDP->d_subdirs)) { parentDP = siblingDP; lhP = siblingDP->d_subdirs.next; continue; } lhP = siblingDP->d_child.next; parentDP = siblingDP->d_parent; } siblingDP = siblingDP->d_parent; parentDP = siblingDP->d_parent; lhP = siblingDP->d_child.next; } while (lhP != entryDP->d_child.next); xerror: spin_unlock(&dcache_lock); return; } /* Set the inode operations table for a regular file or directory. Call with xperm set to true if the file has extended permission attributes (i.e. an ACL). This routine is a no-op if the inode is not a regular file or directory. If the file does not have extended attributes, the table that is used will have a null value for the permission routine pointer. This will cause Linux to perform access checks directly instead of acquiring the kernel lock and calling GPFS, giving better performance. */ void setIopTable(struct inode *iP, Boolean xperm) { struct inode_operations *newopP, *stdopP, *xopP; struct list_head *lp; int count = 0; /* Choose the correct inode operations table based on whether this is a directory or a regular file. Assume that the file has extended attributes so that GPFS permission checking will be required. */ ENTER(0); if (S_ISDIR(iP->i_mode)) xopP = &gpfs_dir_iops_xperm; else if (S_ISREG(iP->i_mode)) xopP = &gpfs_iops_xperm; else { EXIT(0); return; } /* If the file really does have extended attributes (or if the token has been lost so that we do not know the status), set extended permission table and exit. */ if (xperm) { iP->i_op = xopP; EXIT(0); return; } /* Get address of an inode operations table that has a generic permission routine pointer. */ iP->i_op = S_ISDIR(iP->i_mode) ? &gpfs_dir_iops_stdperm : &gpfs_iops_stdperm; EXIT(0); } void cxiSetOSNode(void *osVfsP, cxiNode_t *cnP, cxiVattr_t *attrP) { struct super_block *sbP = (struct super_block *)osVfsP; struct inode *inodeP = (struct inode *)cnP->osNodeP; ENTER(0); DBGASSERT(inodeP != NULL); DBGASSERT(inodeP->PRVINODE == cnP); DBGASSERT(inodeP->i_sb == sbP); inodeP->i_mode = attrP->va_mode; inodeP->i_nlink = attrP->va_nlink; inodeP->i_uid = attrP->va_uid; inodeP->i_gid = attrP->va_gid; inodeP->i_rdev = cxiDevToKernelDev(cxiDev32ToDev(attrP->va_rdev)); CXITIME_TO_INODETIME(attrP->va_atime, inodeP->i_atime); CXITIME_TO_INODETIME(attrP->va_mtime, inodeP->i_mtime); CXITIME_TO_INODETIME(attrP->va_ctime, inodeP->i_ctime); inodeP->i_size = attrP->va_size; inodeP->i_blocks = attrP->va_blocksize; inodeP->i_blocks = attrP->va_blocks; inodeP->i_generation = attrP->va_gen; inodeP->i_flags = 0; cnP->xinfo = attrP->va_xinfo; switch (inodeP->i_mode & S_IFMT) { case S_IFREG: setIopTable(inodeP, (attrP->va_xinfo & VA_XPERM) != 0); if (cxiIsNFSThread()) inodeP->i_fop = &gpfs_fops_no_sendfile; else inodeP->i_fop = &gpfs_fops; break; case S_IFDIR: setIopTable(inodeP, (attrP->va_xinfo & VA_XPERM) != 0); inodeP->i_fop = &gpfs_dir_fops; break; case S_IFLNK: inodeP->i_op = &gpfs_link_iops; inodeP->i_fop = &gpfs_fops; break; case S_IFBLK: case S_IFCHR: case S_IFIFO: case S_IFSOCK: /* Set vector table for special files, gpfs will not get * these operations. */ #if LINUX_KERNEL_VERSION >= 2060000 init_special_inode(inodeP, inodeP->i_mode, inodeP->i_rdev); #else init_special_inode(inodeP, inodeP->i_mode, kdev_t_to_nr(inodeP->i_rdev)); #endif break; } if (inodeP->i_mapping) inodeP->i_mapping->a_ops = &gpfs_aops; cnP->icValid = CXI_IC_ALL; TRACE7(TRACE_VNODE, 2, TRCID_LINUXOPS_SETINODE, "cxiSetOSNode: inodeP 0x%lX inode %d i_count %d i_mode 0x%X " "i_xinfo 0x%X i_nlink %d i_size %lld\n", inodeP, inodeP->i_ino, atomic_read((atomic_t *)&inodeP->i_count), inodeP->i_mode, attrP->va_xinfo, inodeP->i_nlink, inodeP->i_size); EXIT(0); return; } /* The following function is called from cxiInvalidateAttr when the CXI_IC_PERM option was specified, which indicates that permission related attributes cached in the struct inode (owner, mode, etc.) are no longer known to be valid. */ void cxiInvalidatePerm(cxiNode_t *cnP) { struct inode *inodeP = (struct inode *)cnP->osNodeP; ENTER(0); TRACE3(TRACE_VNODE, 2, TRCID_CXIINVA_PERM, "cxiInvalidatePerm: cnP 0x%lX std %d dir std %d", cnP, inodeP->i_op == &gpfs_iops_stdperm, inodeP->i_op == &gpfs_dir_iops_stdperm); /* Set the inode operation table to gpfs_..._xperm; the next permission check will then go through our gpfs_i_permission function, which will revalidate permission attributes and set the inode operation table back to gpfs_..._stdperm, if appropriate. Note: since symlinks always have permission iop set, setIopTable is a noop for symlinks. */ setIopTable(inodeP, true); EXIT(0); } static void getIattr(struct inode *inodeP, struct iattr *attrP) { ENTER(0); // attrP->ia_valid = ??? ; attrP->ia_mode = inodeP->i_mode; attrP->ia_uid = inodeP->i_uid; attrP->ia_gid = inodeP->i_gid; attrP->ia_size = inodeP->i_size; attrP->ia_atime = inodeP->i_atime; attrP->ia_mtime = inodeP->i_mtime; attrP->ia_ctime = inodeP->i_ctime; EXIT(0); return; } static inline int get_umask() { return (current->fs->umask); } /* Record credentials of current thread */ void setCred(ext_cred_t *credP) { int nGroups; ENTER(0); credP->principal = current->fsuid; /* user id */ credP->group = current->fsgid; /* primary group id */ #if LINUX_KERNEL_VERSION > 2060300 nGroups = MIN(current->group_info->ngroups, MIN(ECRED_NGROUPS, NGROUPS_SMALL)); #else nGroups = MIN(current->ngroups, ECRED_NGROUPS); #endif credP->num_groups = nGroups; if (nGroups > 0) #if LINUX_KERNEL_VERSION > 2060300 memcpy(credP->eGroups, current->group_info->blocks[0], nGroups*sizeof(gid_t)); /* ?? This is incorrect. Linux 2.6 supports a very large list of groups by allocating a page for each bunch of groups. Only if there are <= NGROUPS_SMALL groups is the space in group_info->small_block used. GPFS will only see the prefix of the group set. */ /* To save kernel stack space, the GPFS ext_cred_t should keep a pointer to the array of groups. The group set cannot change during a GPFS system call since the caller can only make one system call at a time. */ #else memcpy(credP->eGroups, current->groups, nGroups*sizeof(gid_t)); #endif EXIT(0); } /* inode_operations */ /* Called with a negative (no inode) dir cache entry. * If this call succeeds, we fill in with d_instantiate(). */ int gpfs_i_create(struct inode *diP, struct dentry *dentryP, int mode #if LINUX_KERNEL_VERSION >= 2060000 , struct nameidata *ni #endif ) { int rc; struct gpfsVfsData_t *privVfsP; cxiNode_t *dcnP; cxiNode_t *cnP = NULL; cxiIno_t iNum = (cxiIno_t)-1; struct inode *newInodeP = NULL; int flags = FWRITE | FCREAT | FEXCL; cxiMode_t umask = get_umask(); ext_cred_t eCred; struct dentry *retP; VFS_STAT_START(createCall); ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CREATE_ENTER, "gpfs_i_create enter: iP 0x%lX dentryP 0x%lX mode 0x%X name '%s'\n", diP, dentryP, mode, dentryP->d_name.name); /* BKL is held at entry */ dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); retry: setCred(&eCred); rc = gpfs_ops.gpfsCreate(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum, 0, flags, dentryP, (char *)dentryP->d_name.name, mode, umask, NULL, &eCred); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); cnP->createRaceLoserThreadId = 0; } /* linux would normally serialize the creates on a directory (via the * parent directory semaphore) to ensure that a create didn't fail with * EEXIST. However in a multinode environment we may perform a lookup * on one node (thinking the file doesn't exist) yet a create is * performed on a different node before linux can call the physical * file systems create. We attempt to reconcile this case by marking * the fact that this happened and checking the FEXCL flag at gpfs_f_open() * to see if we should have failed this with EEXIST. */ if (rc == EEXIST) { /* Make sure that this create call is part of the linux open call. NFS and mknod calls create without an open, so check that this is not one of those calls. On the open call the open flags are available and if the FEXCL was on fail it with EEXIST. */ int mode1; /* Skip if NFS create call. */ if (cxiIsNFSThread()) goto retExist; /* ??? if (sys_mknod call) goto xerror; */ /* Do it only if trying to create a regular file. */ if (((mode & S_IFMT) != 0) && !(mode & S_IFREG)) goto retExist; setCred(&eCred); // rebuild since gpfsCreate may remap ids rc = gpfs_ops.gpfsLookup(privVfsP, (void *)diP, dcnP, dentryP, (char *)dentryP->d_name.name, (void **)&newInodeP, &cnP, &iNum, NULL, &mode1, &eCred, (void **)&retP); if (rc == ENOENT) goto retry; if (!rc) { /* If the file that was found was a directory than return the return code that linux would have returned. */ if (S_ISDIR(newInodeP->i_mode)) { rc = EISDIR; goto retExist; } cnP->createRaceLoserThreadId = cxiGetThreadId(); } } retExist: if (rc) { d_drop(dentryP); goto xerror; } diP->i_sb->s_dirt = 1; xerror: TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CREATE_EXIT, "gpfs_i_create exit: new inode 0x%lX iNum %d (0x%X) rc %d\n", newInodeP, iNum, iNum, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } /* If this routine successfully finds the file, it should * add the dentry to the hash list with d_add() and return * null. If a failure occurs then return non null and the * dentry will be dput() by the linux lfs layer */ struct dentry * gpfs_i_lookup(struct inode *diP, struct dentry *dentryP #if LINUX_KERNEL_VERSION >= 2060000 , struct nameidata *ni #endif ) { int code = 0; int rc = 0; struct dentry *retP = NULL; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; cxiNode_t *dcnP; cxiMode_t mode = 0; cxiIno_t iNum = (cxiIno_t)-1; cxiNode_t *cnP = NULL; struct inode *newInodeP = NULL; VFS_STAT_START(lookupCall); ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_ENTER, "gpfs_i_lookup enter: diP 0x%lX dentryP 0x%lX name '%s'\n", diP, dentryP, dentryP->d_name.name); /* BKL is held at entry */ dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); setCred(&eCred); if (!dcnP) { /* This can happen due to a bug in linux/fs/dcache.c (prune_dcache) where "count" entries are to be pruned, but the last one is found to be recently referenced. When this happens, count is decremented, but the loop is not terminated. The result is that it continues to prune entries past where it should (prunes everything). If our patch for this is not applied, the result is a kernel failure as the cxiNode is referenced. Checking here (and revalidate) allows us to reject the call instead. */ TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_STALE, "cxiNode for inode 0x%lX (ino 0x%X) was FREED!\n", diP, diP->i_ino); /* Although we may like to know more about this inode, it is not * ok to call PRINTINODE(iP) here. */ rc = ESTALE; code = 1; retP = (struct dentry *)ERR_PTR(-rc); goto xerror; } rc = gpfs_ops.gpfsLookup(privVfsP, (void *)diP, dcnP, dentryP, (char *)dentryP->d_name.name, (void **)&newInodeP, &cnP, &iNum, NULL, &mode, &eCred, (void **)&retP); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); } else if (rc != ENOENT) // internal failure { cxiErrorNFS(rc); code = 2; retP = (struct dentry *)ERR_PTR(-rc); goto xerror; } else if (diP->i_nlink == 0) // ENOENT but unlinked parent { /* This odd code is here because this function would normally * exit with a negative dcache entry on ENOENT. However if * we allow a negative dcache entry in a directory thats been * deleted (but we're still sitting in it) then the d_count * will never go to zero and we'll strand any open file that * is associated with the parent directory. If we drop the * dentry and return the ENOENT then the VFS will dput the * dentry. The scenario that gave us trouble was: * * NODE 1 NODE 2 * `rm -rf dirA` `rm -rf dirA` * ========================================================== * gpfs_f_open("dirA", ...) * gpfs_f_readdir(...) * [read "fileA", "fileB"] gpfs_f_open("dirA", ...) * gpfs_f_readdir(...) * [read "fileA", "fileB"] * * gpfs_i_lookup("fileA") * gpfs_i_unlink("fileA") * gpfs_s_delete_inode(fileA's inode) * gpfs_i_lookup("fileB") * gpfs_i_unlink("fileB") * gpfs_s_delete_inode(fileB's inode) * ... * gpfs_i_rmdir("dirA", ...) * gpfs_s_delete_inode(dirA's inode) * destroyOnLastClose=1 for dirA <====== * * gpfs_i_lookup("fileA") * [creates a negative dentry for fileA, * increments dirA's reference count] * gpfs_i_lookup("fileB") * [creates a negative dentry for fileB, * increments dirA's reference count] */ DBGASSERT(dentryP->d_inode == NULL); dentryP->d_op = NULL; d_drop(dentryP); code = 3; retP = (struct dentry *)ERR_PTR(-rc); goto xerror; } PRINTDENTRY(dentryP); xerror: TRACE7(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_EXIT, "gpfs_i_lookup exit: new inode 0x%lX iNum %d (0x%X) cnP 0x%lX retP 0x%lX " "code %d rc %d\n", newInodeP, iNum, iNum, cnP, retP, code, rc); VFS_STAT_STOP; EXIT(0); return retP; } int gpfs_i_link(struct dentry *oldDentryP, struct inode *diP, struct dentry *dentryP) { int rc = 0; struct inode *iP = oldDentryP->d_inode; cxiNode_t *dcnP; cxiNode_t *cnP = NULL; struct gpfsVfsData_t *privVfsP; char *tnameP; ext_cred_t eCred; VFS_STAT_START(linkCall); ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_LINK_ENTER, "gpfs_i_link enter: diP 0x%lX dentryP 0x%lX " "dentryP 0x%lX name '%s'\n", diP, oldDentryP, dentryP, dentryP->d_name.name); /* BKL is held at entry */ cnP = VP_TO_CNP(iP); dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); setCred(&eCred); rc = gpfs_ops.gpfsLink(privVfsP, cnP, dcnP, dentryP, (char *)dentryP->d_name.name, &eCred); if (rc) { d_drop(dentryP); goto xerror; } iP->i_sb->s_dirt = 1; xerror: PRINTINODE(iP); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LINK_EXIT, "gpfs_i_link exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_unlink(struct inode *diP, struct dentry *dentryP) { int rc = 0; struct gpfsVfsData_t *privVfsP; struct inode *iP = dentryP->d_inode; cxiNode_t *dcnP; cxiNode_t *cnP; ext_cred_t eCred; struct dentry_operations *orig_d_opP; VFS_STAT_START(removeCall); ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_UNLINK_ENTER, "gpfs_i_unlink enter: diP 0x%lX iP 0x%lX dentryP 0x%lX name '%s'\n", diP, iP, dentryP, dentryP->d_name.name); /* BKL is held at entry */ cnP = VP_TO_CNP(iP); dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); /* Regarding dcache entry update: upon returning from gpfs_i_unlink, the VFS layer will turn the dentry into a valid, negative dcache entry by calling d_delete(). If another node then creates a new file with the same name, the BR token revoke for the directory block will invalidate the negative dcache entry. However, there is a window between the gpfsRemove() and the d_delete(), where a BR token revoke would not recognize that it should invalidate the dcache entry, because d_delete() has not yet turned it into a negative dcache entry. To fix this, we mark the dentry as "valid with d_delete pending"; the meaning of this state is "the dentry is still valid, but a BR token revoke should mark it as 'needing revalidation', even if it does not (yet) look like a negative dcache entry". Note that we don't want to mark "valid with d_delete pending" entries as invalid in the BR revoke handler, because we don't know for sure that the file is in fact going to be deleted. The unlink operation may fail, for any number of reasons, and the dentry should not be marked as invalid prematurely. It's safe to mark a dentry as 'needing revalidation', however. Ideally, we should swap d_op inside gpfsRemove while we are holding the BR lock on the directory. However, (1) there is local synchronization in the VFS (our caller is holding the i_sem semaphore on the directory) that will prevent other threads from doing a lookup or create that might change the state back to just plain "valid" before the gpfsRemove has happened, and (2) a BR revoke that happens before the gpfsRemove might unnecessarily mark the dentry as 'needing revalidation'; this is sub-optimal, but it doesn't hurt. Also see comment in gpfs_i_rmdir. */ orig_d_opP = dentryP->d_op; dentryP->d_op = &gpfs_dops_ddeletepending; setCred(&eCred); rc = gpfs_ops.gpfsRemove(privVfsP, cnP, dcnP, (char *)dentryP->d_name.name, &eCred); if (rc) { d_drop(dentryP); if (dentryP->d_op == &gpfs_dops_ddeletepending) dentryP->d_op = orig_d_opP; goto xerror; } diP->i_sb->s_dirt = 1; /* d_delete will be called at VFS layer if rc == 0 */ xerror: PRINTINODE(iP); PRINTDENTRY(dentryP); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_UNLINK_EXIT, "gpfs_i_unlink exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_symlink(struct inode *diP, struct dentry *dentryP, const char *symlinkTargetP) { int rc = 0; cxiNode_t *dcnP; cxiNode_t *cnP; cxiIno_t iNum = (cxiIno_t)-1; struct inode *newInodeP = NULL; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; VFS_STAT_START(symlinkCall); ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK1, "gpfs_i_symlink enter: iP 0x%lX dentryP 0x%lX symlinkTargetP '%s'\n", diP, dentryP, symlinkTargetP); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK2, "gpfs_i_symlink: newLinkName '%s'\n", dentryP->d_name.name); /* BKL is held at entry */ dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); setCred(&eCred); rc = gpfs_ops.gpfsSymlink(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum, dentryP, (char *)dentryP->d_name.name, (char *)symlinkTargetP, &eCred); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); } else { d_drop(dentryP); goto xerror; } diP->i_sb->s_dirt = 1; xerror: TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK_EXIT, "gpfs_i_symlink exit: new inode 0x%lX iNum %d (0x%X) rc %d\n", newInodeP, iNum, iNum, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_mkdir(struct inode *diP, struct dentry *dentryP, int mode) { int rc = 0; struct gpfsVfsData_t *privVfsP; cxiNode_t *dcnP; cxiNode_t *cnP; cxiMode_t umask; ext_cred_t eCred; cxiIno_t iNum = (cxiIno_t)-1; struct inode *newInodeP = NULL; VFS_STAT_START(mkdirCall); ENTER(0); umask = get_umask(); /* LFS should not apply umask and we may not */ dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_MKDIR_ENTER, "gpfs_i_mkdir enter: diP 0x%lX mode 0x%X name '%s'\n", diP, mode, dentryP->d_name.name); /* BKL is held at entry */ setCred(&eCred); rc = gpfs_ops.gpfsMkdir(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum, dentryP, (char *)dentryP->d_name.name, mode, umask, &eCred); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); } else { d_drop(dentryP); goto xerror; } diP->i_sb->s_dirt = 1; xerror: TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKDIR_EXIT, "gpfs_i_mkdir exit: new inode 0x%lX iNum %d (0x%X) rc %d\n", newInodeP, iNum, iNum, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_rmdir(struct inode *diP, struct dentry *dentryP) { int rc; struct inode *iP = dentryP->d_inode; cxiNode_t *dcnP; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; struct dentry_operations *orig_d_opP; VFS_STAT_START(rmdirCall); ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_RMDIR_ENTER, "gpfs_i_rmdir enter: diP 0x%lX iP 0x%lX name '%s'\n", diP, iP, dentryP->d_name.name); /* BKL is held at entry */ cnP = VP_TO_CNP(iP); dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); /* See comment in gpfs_i_unlink. Note that Linux kernel processes directory dentries a little differently from regular file dentries. In particular, it doesn't appear that a successful rmdir call results in the removed directory dentry being turned into a valid negative dentry; the dentry just gets unhashed and recycled if it had no references at the time of rmdir. If the dentry did have extra references, e.g. due to a process using the directory in question as cwd, the dentry is unhashed, but it remains a positive dentry pointing to the deleted inode, and will remain as such until the dentry ref count goes to zero, at which point the dentry is recycled. So there's no apparent need to mark directory dentries as 'needing revalidation' during BR token revoke (we do know that we need to do this for regular files). However, this particular aspect of Linux kernel operation is not guaranteed to always work in this fashion, so we might as well try to stay on the safe side of things, and treat directories the same way as regular files. It doesn't appear that marking a dentry as 'needing revalidation' has any ill effects besides extra cycles required for revalidation, and BR token revoke handler racing with an unsuccessful gpfsRmdir is a rare enough event to tolerate this extra performance hit. */ orig_d_opP = dentryP->d_op; dentryP->d_op = &gpfs_dops_ddeletepending; setCred(&eCred); rc = gpfs_ops.gpfsRmdir(privVfsP, cnP, dcnP, (char *)dentryP->d_name.name, &eCred); if (rc) { if (rc == EEXIST) rc = ENOTEMPTY; if (dentryP->d_op == &gpfs_dops_ddeletepending) dentryP->d_op = orig_d_opP; /* d_drop(dentryP); */ goto xerror; } diP->i_sb->s_dirt = 1; /* d_delete will be called at VFS layer if rc == 0 */ xerror: PRINTINODE(iP); PRINTDENTRY(dentryP); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_RMDIR_EXIT, "gpfs_i_rmdir exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int #if LINUX_KERNEL_VERSION >= 2050000 gpfs_i_mknod(struct inode *diP, struct dentry *dentryP, int mode, dev_t rdev) #else gpfs_i_mknod(struct inode *diP, struct dentry *dentryP, int mode, int rdev) #endif { int rc = 0; struct gpfsVfsData_t *privVfsP; cxiNode_t *dcnP; cxiNode_t *cnP; cxiIno_t iNum = (cxiIno_t)-1; struct inode *newInodeP = NULL; cxiMode_t umask = get_umask(); ext_cred_t eCred; cxiDev32_t rdev32; VFS_STAT_START(mknodCall); ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKNOD_ENTER, "gpfs_i_mknod enter: diP 0x%lX mode 0x%X rdev 0x%X name '%s'\n", diP, mode, (int)rdev, dentryP->d_name.name); /* BKL is held at entry */ dcnP = VP_TO_CNP(diP); privVfsP = VP_TO_PVP(diP); LOGASSERT(privVfsP != NULL); setCred(&eCred); rdev32 = cxiDevToDev32(rdev); rc = gpfs_ops.gpfsMknod(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum, dentryP, (char *)dentryP->d_name.name, mode, umask, (cxiDev_t)rdev32, &eCred); if (rc == 0) { DBGASSERT(cnP != NULL); DBGASSERT(iNum != -1); DBGASSERT(newInodeP != NULL); DBGASSERT(newInodeP->PRVINODE == cnP); DBGASSERT(cnP->osNodeP == (void *)newInodeP); } else { d_drop(dentryP); goto xerror; } diP->i_sb->s_dirt = 1; /* Set vector table for special files, gpfs will not get these operations.*/ #if LINUX_KERNEL_VERSION >= 2060000 init_special_inode(newInodeP, newInodeP->i_mode, newInodeP->i_rdev); #else init_special_inode(newInodeP, newInodeP->i_mode, kdev_t_to_nr(newInodeP->i_rdev)); #endif xerror: TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKNOD_EXIT, "gpfs_i_mknod exit: new inode 0x%lX iNum %d (0x%X) rc %d\n", newInodeP, iNum, iNum, rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_rename(struct inode *diP, struct dentry *dentryP, struct inode *tdiP, struct dentry *tDentryP) { int rc = 0; struct inode *iP = dentryP->d_inode; struct inode *tiP = tDentryP->d_inode; struct gpfsVfsData_t *privVfsP; cxiNode_t *sourceCNP, *sourceDirCNP, *targetCNP, *targetDirCNP; ext_cred_t eCred; VFS_STAT_START(renameCall); ENTER(0); TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_RENAME_1, "gpfs_i_rename enter: iP 0x%lX dvP 0x%lX name '%s'" " tiP 0x%lX tdiP 0x%lX new name '%s'\n", iP, diP, dentryP->d_name.name, tiP, tdiP, tDentryP->d_name.name); /* BKL is held at entry */ /* Do not allow simple rename across mount points */ if (diP->i_sb != tdiP->i_sb) { rc = EXDEV; goto xerror; } sourceCNP = VP_TO_CNP(iP); sourceDirCNP = VP_TO_CNP(diP); targetCNP = (tiP != NULL) ? VP_TO_CNP(tiP) : NULL; targetDirCNP = VP_TO_CNP(tdiP); privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); setCred(&eCred); rc = gpfs_ops.gpfsRename(privVfsP, sourceCNP, sourceDirCNP, (char *)dentryP->d_name.name, targetCNP, targetDirCNP, (char *)tDentryP->d_name.name, &eCred); if (rc == 0) { gpfs_i_getattr_internal(iP); gpfs_i_getattr_internal(diP); if (tiP) gpfs_i_getattr_internal(tiP); if (tdiP != diP) gpfs_i_getattr_internal(tdiP); diP->i_sb->s_dirt = 1; } xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_RENAME_EXIT, "gpfs_i_rename exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_readlink(struct dentry *dentryP, char *bufP, int buflen) { int rc = 0; Boolean gotBKL = false; struct cxiUio_t tmpUio; cxiIovec_t tmpIovec; struct inode *iP = dentryP->d_inode; struct gpfsVfsData_t *privVfsP; cxiNode_t *cnP; VFS_STAT_START(readlinkCall); ENTER(0); TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_READLINK_ENTER, "gpfs_i_readlink enter: dentryP 0x%lX bufP 0x%lX len %d " "iP 0x%lX name '%s'\n", dentryP, bufP, buflen, iP, dentryP->d_name.name); /* BKL is not held at entry, except for NFS calls */ TraceBKL(); if (current->lock_depth >= 0) /* kernel lock is held by me */ { gotBKL = true; unlock_kernel(); } cnP = VP_TO_CNP(iP); privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); tmpIovec.iov_base = bufP; /* base memory address */ tmpIovec.iov_len = buflen; /* length of transfer for this area */ tmpUio.uio_iov = &tmpIovec; /* ptr to array of iovec structs */ tmpUio.uio_iovcnt = 1; /* #iovec elements left to be processed */ tmpUio.uio_iovdcnt = 0; /* #iovec elements already processed */ tmpUio.uio_offset = 0; /* byte offset in file/dev to read/write */ tmpUio.uio_resid = buflen; /* #bytes left in data area */ tmpUio.uio_segflg = UIO_USERSPACE; /* copy to user space buffer */ tmpUio.uio_fmode = 0; /* file modes from open file struct */ rc = gpfs_ops.gpfsReadlink(privVfsP, cnP, &tmpUio); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READLINK_EXIT, "gpfs_i_readlink exit: iP 0x%lX uio_resid %ld offset %d rc %d\n", iP, tmpUio.uio_resid, tmpUio.uio_offset, rc); VFS_STAT_STOP; if (gotBKL) /* If held kernel lock on entry then reacquire it */ lock_kernel(); if (rc) cxiErrorNFS(rc); EXIT(0); if (rc) return (-rc); return (buflen - tmpUio.uio_resid); } #if LINUX_KERNEL_VERSION >= 2061600 void* gpfs_i_follow_link(struct dentry *dentry, struct nameidata *nd) #else int gpfs_i_follow_link(struct dentry *dentry, struct nameidata *nd) #endif { int rc; Boolean gotBKL = false; struct cxiUio_t tmpUio; cxiIovec_t tmpIovec; struct inode *iP = dentry->d_inode; struct gpfsVfsData_t *privVfsP; cxiNode_t *cnP; char *buf = NULL; ENTER(0); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FOLLOW_LINK_ENTER, "gpfs_i_follow_link enter: inode 0x%lX name '%s'\n", dentry->d_inode, dentry->d_name.name); /* BKL is not held at entry, except for NFS calls */ TraceBKL(); if (current->lock_depth >= 0) /* kernel lock is held by me */ { gotBKL = true; unlock_kernel(); } /* Allocate a temporary buffer to hold the symlink contents */ buf = cxiMallocPinned(CXI_PATH_MAX+1); if (buf == NULL) { rc = -ENOMEM; goto xerror; } cnP = VP_TO_CNP(iP); privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); tmpIovec.iov_base = buf; /* base memory address */ tmpIovec.iov_len = PATH_MAX; /* length of transfer for this area */ tmpUio.uio_iov = &tmpIovec; /* ptr to array of iovec structs */ tmpUio.uio_iovcnt = 1; /* #iovec elements left to be processed */ tmpUio.uio_iovdcnt = 0; /* #iovec elements already processed */ tmpUio.uio_offset = 0; /* byte offset in file/dev to read/write */ tmpUio.uio_resid = PATH_MAX; /* #bytes left in data area */ tmpUio.uio_segflg = UIO_SYSSPACE; /* copy to kernel space buffer */ tmpUio.uio_fmode = 0; /* file modes from open file struct */ /* Read symlink contents */ rc = gpfs_ops.gpfsReadlink(privVfsP, cnP, &tmpUio); if (rc) { cxiErrorNFS(rc); rc = -rc; goto xerror; } /* set end of string */ buf[PATH_MAX - tmpUio.uio_resid] = 0; TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_FOLLOW_LINK_1, "gpfs_i_follow_link readlink rc %d data '%s'\n", rc, buf); VFS_FOLLOW_LINK(rc, nd, buf); exit: if (buf) cxiFreePinned(buf); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FOLLOW_LINK_2, "gpfs_i_follow_link exit: inode 0x%lX rc %d\n", dentry->d_inode, rc); if (gotBKL) /* If held kernel lock on entry then reacquire it */ lock_kernel(); EXIT(0); #if LINUX_KERNEL_VERSION >= 2061600 return NULL; /* no cookie */ #else return rc; #endif xerror: path_release(nd); goto exit; } #ifdef HAS_IOP_PUT_LINK #if LINUX_KERNEL_VERSION >= 2061600 void gpfs_i_put_link(struct dentry *dentry, struct nameidata *nd, void* cookie) #else void gpfs_i_put_link(struct dentry *dentry, struct nameidata *nd) #endif { char *buf = nd_get_link(nd); TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_PUTLINK, "gpfs_i_put_link dentry 0x%lX nd 0x%lX buf 0x%lX\n", dentry, nd, !IS_ERR(buf)? buf : NULL); if (!IS_ERR(buf)) cxiFreePinned(buf); } #endif /* HAS_IOP_PUT_LINK */ int gpfs_i_bmap(struct inode *iP, int fragment) { ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_BMAP, "gpfs_i_bmap: rc ENOSYS\n"); TraceBKL(); EXIT(0); return -ENOSYS; } void gpfs_i_truncate(struct inode *iP) { ENTER(0); /* Nothing to do since the file size was updated on the notify_change * call which preceeded this call */ TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_TRUNCATE, "gpfs_i_truncate: inode 0x%lX\n", iP); TraceBKL(); EXIT(0); } int gpfs_i_permission(struct inode *iP, int mode #if LINUX_KERNEL_VERSION >= 2060000 , struct nameidata *ni #endif ) { cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; int rc = 0; VFS_STAT_START(accessCall); ENTER(0); /* BKL is held at entry */ cnP = VP_TO_CNP(iP); TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_ACCESS_ENTER, "gpfs_i_permission enter: iP 0x%lX mode 0x%X uid %d gid %d " "i_mode 0x%X i_xinfo 0x%X", iP, mode, current->fsuid, current->fsgid, iP->i_mode, cnP->xinfo); privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); if (mode) /* call permission check only if got access mode */ { setCred(&eCred); rc = gpfs_ops.gpfsAccess(privVfsP, cnP, mode, ACC_SELF, &eCred); } xerror: TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_ACCESS_EXIT, "gpfs_i_permission exit: iP 0x%lX std %d dir std %d rc %d", iP, iP->i_op == &gpfs_iops_stdperm, iP->i_op == &gpfs_dir_iops_stdperm, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_smap(struct inode *iP, int sector) { ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_SMAP, "gpfs_i_smap: rc ENOSYS\n"); TraceBKL(); EXIT(0); return -ENOSYS; } int gpfs_i_updatepage(struct file *fP, struct page *pageP, const char *bufP, unsigned long offset, uint count, int sync) { ENTER(0); TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_UPDATEPAGE, "gpfs_i_updatepage: rc ENOSYS\n"); TraceBKL(); EXIT(0); return -ENOSYS; } int gpfs_i_revalidate(struct dentry *dentryP) { int rc; int code = 0; struct inode *iP = dentryP->d_inode; cxiNode_t *cnP; cxiVattr_t vattr; struct gpfsVfsData_t *privVfsP; ENTER(0); VFS_INC(revalidateCount); TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_REVALIDATE_ENTER, "gpfs_i_revalidate enter: dentryP 0x%lX iP 0x%lX ino 0x%X name '%s'\n", dentryP, dentryP->d_inode, (iP) ? iP->i_ino : -1, dentryP->d_name.name); /* BKL is usually not held, but seems to be held when coming here as part of setting an ACL */ if (iP == NULL) { code = 1; rc = ENOENT; goto xerror; } cnP = VP_TO_CNP(iP); if (!cnP) { /* This can happen due to a bug in linux/fs/dcache.c (prune_dcache) where "count" entries are to be pruned, but the last one is found to be recently referenced. When this happens, count is decremented, but the loop is not terminated. The result is that it continues to prune entries past where it should (prunes everything). If our patch for this is not applied, the result is a kernel failure as the cxiNode is referenced. Checking here (and lookup) allows us to reject the call instead. */ TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REVALIDATE_STALE, "gpfs_i_revalidate: cxiNode for iP 0x%lX (ino %d) was FREED!\n", iP, iP->i_ino); /* Although we may like to know more about this inode, it is not * ok to call PRINTINODE(iP) here. */ rc = ESTALE; code = 2; goto xerror; } if ((cnP->icValid & CXI_IC_STAT) == CXI_IC_STAT) { rc = 0; code = 3; goto xerror; } privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); /* This has the effect of calling us back under a lock and * setting the inode attributes at the OS level (since this * operating system caches this info in the vfs layer) */ rc = gpfs_ops.gpfsGetattr(privVfsP, cnP, &vattr, false); PRINTINODE(iP); #if 0 /* Delay briefly to give token revoke races a chance to happen, if there are any. Time delay is in jiffies (10ms). */ # define howLong 5 TRACE1(TRACE_VNODE, 4, TRCID_REVAL_DELAY, "gpfs_i_revalidate: begin delay %d\n", howLong); current->state = TASK_INTERRUPTIBLE; schedule_timeout(howLong); TRACE1(TRACE_VNODE, 14, TRCID_REVAL_DELAY_END, "gpfs_i_revalidate: end delay %d\n", howLong); #endif xerror: TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REVALIDATE_EXIT, "gpfs_i_revalidate exit: dentry 0x%lX code %d rc %d\n", dentryP, code, rc); if (rc) cxiErrorNFS(rc); EXIT(0); return -rc; } int gpfs_i_setattr(struct dentry *dentryP, struct iattr *iattrP) { int rc; VFS_STAT_START(setattrCall); ENTER(0); rc = gpfs_i_setattr_internal(dentryP->d_inode, iattrP); VFS_STAT_STOP; EXIT(0); return -rc; } int gpfs_i_setattr_internal(struct inode *iP, struct iattr *aP) { int rc = 0; int code = 0; long arg1; /* must be large enough on 64bit to contain */ long arg2; /* either a pointer or integer */ long arg3; cxiTimeStruc_t atime, mtime, ctime; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; ext_cred_t eCred; unsigned int ia_valid; ENTER(0); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_SETATTR_ENTER, "gpfs_i_setattr enter: iP 0x%lX ia_valid 0x%X\n", iP, aP->ia_valid); /* ?? Callers of this are inconsistent about whether the BKL is held */ cnP = VP_TO_CNP(iP); privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); ia_valid = aP->ia_valid; /* Change file size */ if (ia_valid & ATTR_SIZE) { arg1 = (long)&aP->ia_size; arg2 = 0; arg3 = 0; /* call gpfsSetattr, unless we know that new size is the same */ if (!(cnP->icValid & CXI_IC_ATTR) || ((struct inode *)cnP->osNodeP)->i_size != aP->ia_size) { setCred(&eCred); // rebuild since gpfsSetattr may remap ids rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_SIZE, arg1, arg2, arg3, &eCred); if (rc != 0) { code = 1; goto xerror; } /* gpfsSetattr(... V_SIZE ...) will have updated ctime and mtime. No need to do this again. */ ia_valid &= ~(ATTR_MTIME | ATTR_CTIME); } } /* Change file mode */ if (ia_valid & ATTR_MODE) { arg1 = (long)aP->ia_mode; arg2 = 0; arg3 = 0; setCred(&eCred); // rebuild since gpfsSetattr may remap ids rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_MODE, arg1, arg2, arg3, &eCred); if (rc != 0) { code = 2; goto xerror; } } /* Change uid or gid */ if (ia_valid & (ATTR_UID | ATTR_GID)) { arg1 = 0; arg2 = 0; arg3 = 0; if (ia_valid & ATTR_UID) arg2 = (long)aP->ia_uid; else arg1 |= T_OWNER_AS_IS; if (ia_valid & ATTR_GID) arg3 = (long)aP->ia_gid; else arg1 |= T_GROUP_AS_IS; setCred(&eCred); // rebuild since gpfsSetattr may remap ids rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_OWN, arg1, arg2, arg3, &eCred); if (rc != 0) { code = 3; goto xerror; } } /* Change access, modification, or change time */ if (ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) { arg1 = 0; arg2 = 0; arg3 = 0; if (ia_valid & ATTR_ATIME) { CXITIME_FROM_INODETIME(atime, aP->ia_atime); arg1 = (long)&atime; } if (ia_valid & ATTR_MTIME) { CXITIME_FROM_INODETIME(mtime, aP->ia_mtime); arg2 = (long)&mtime; } if (ia_valid & ATTR_CTIME) { CXITIME_FROM_INODETIME(ctime, aP->ia_ctime); arg3 = (long)&ctime; } setCred(&eCred); // rebuild since gpfsSetattr may remap ids rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_STIME, arg1, arg2, arg3, &eCred); if (rc != 0) { code = 4; goto xerror; } } xerror: if (rc == 0) { /* For NFS we might need to write the inode but the check will be done * in gpfsSyncNFS(). */ if (cxiAllowNFSFsync()) { setCred(&eCred); // rebuild since gpfsSetattr may remap ids rc = gpfs_ops.gpfsSyncNFS(privVfsP, cnP, 0, &eCred); } iP->i_sb->s_dirt = 1; } TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_SETATTR_EXIT, "gpfs_i_setattr exit: iP 0x%lX code %d rc %d\n", iP, code, rc); if (rc) cxiErrorNFS(rc); EXIT(0); return rc; } #if LINUX_KERNEL_VERSION >= 2050000 int gpfs_i_getattr(struct vfsmount *mntP, struct dentry *dentryP, struct kstat *kstatP) #else int gpfs_i_getattr(struct dentry *dentryP, struct iattr *iattrP) #endif { int rc; struct inode *iP = dentryP->d_inode; cxiNode_t *cnP; VFS_STAT_START(getattrCall); ENTER(0); cnP = VP_TO_CNP(iP); if (cnP && ((cnP->icValid & CXI_IC_STAT) == CXI_IC_STAT)) /* attr are vaild */ rc = 0; else rc = gpfs_i_getattr_internal(iP); if (!rc) #if LINUX_KERNEL_VERSION >= 2050000 generic_fillattr(iP, kstatP); #else getIattr(iP, iattrP); #endif else rc = -rc; VFS_STAT_STOP; EXIT(0); return rc; } int gpfs_i_getattr_internal(struct inode *iP) { int rc = 0; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; cxiVattr_t vattr; ENTER(0); TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_GETATTR_ENTER, "gpfs_i_getattr enter: iP 0x%lX\n", iP); /* BKL is held at entry */ privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); cnP = VP_TO_CNP(iP); /* This has the effect of calling us back under a lock and * setting the inode attributes at the OS level (since this * operating system caches this info in the vfs layer) */ rc = gpfs_ops.gpfsGetattr(privVfsP, cnP, &vattr, false); PRINTINODE(iP); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETATTR_EXIT, "gpfs_i_getattr exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); EXIT(0); return rc; } #if LINUX_KERNEL_VERSION > 2060000 #include #define XATTR_SECURITY_PREFIX "security." #define XATTR_TRUSTED_PREFIX "trusted." #define XATTR_USER_PREFIX "user." #define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" #define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" static const char * test_prefix(const char *name, const char *prefix) { while (*prefix && *name == *prefix) { name++; prefix++; } return *prefix ? NULL : name; } /* * Inode operation getxattr() * */ ssize_t gpfs_i_getxattr(struct dentry *dentry, const char *name, void *buf, size_t buf_size) { int rc; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; struct tsxattr xattr; struct tsxattrs xattrs; ext_cred_t eCred; void *argP = &xattrs; int flags = 0; struct inode *iP = dentry->d_inode; mm_segment_t oldfs; const char *n; ENTER(0); VFS_STAT_START(getxattrCall); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_ENTER, "gpfs_i_getxattr enter: iP 0x%lX name %s buf 0x%lX size %d\n", iP, (name) ? name : "NULL", buf, buf_size); if (iP == NULL) { rc = ENOENT; goto xerror; } #ifdef CONFIG_FS_POSIX_ACL if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } rc = gpfs_get_posix_acl(dentry, ACL_TYPE_ACCESS, buf, buf_size); goto xerror2; } if (S_ISDIR(iP->i_mode)) { if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } rc = gpfs_get_posix_acl(dentry, ACL_TYPE_DEFAULT, buf, buf_size); goto xerror2; } } #endif if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } if (!capable(CAP_SYS_ADMIN)) { rc = EPERM; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_USER_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } rc = EOPNOTSUPP; goto xerror; xattr: setCred(&eCred); xattrs.appId = 3; // application id GPFS_ATTR_INTERNAL_APPL_ID xattrs.nattrs = 1; // no of attributes to get or set xattrs.attrs = &xattr; // attributes to get or set xattr.keyP = (char*) name; // attribute key xattr.keyLen = strlen(name) + 1; // key length xattr.valueP = buf; // attribute value xattr.valueLen = buf_size; // length of attribute value privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); cnP = VP_TO_CNP(iP); oldfs = get_fs(); set_fs(get_ds()); rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, GET_XATTR, argP, NULL, &eCred); set_fs(oldfs); if (!rc) { TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT0, "gpfs_i_getxattr exit: iP 0x%lX len %d\n", iP, xattr.valueLen); VFS_STAT_STOP; EXIT(0); if (xattr.valueLen < 0) rc = ENODATA; else return (xattr.valueLen); } xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT, "gpfs_i_getxattr exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return (-rc); xerror2: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT2, "gpfs_i_getxattr exit2: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return (rc); } /* * Inode operation setxattr() * */ int gpfs_i_setxattr(struct dentry *dentry, const char *name, const void *buf, size_t buf_size, int ext_flags) { int rc; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; struct tsxattr xattr; struct tsxattrs xattrs; ext_cred_t eCred; void *argP = &xattrs; int flags = 0; struct inode *iP = dentry->d_inode; mm_segment_t oldfs; const char *n; ENTER(0); VFS_STAT_START(setxattrCall); TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_SETEXTATTR_ENTER, "gpfs_i_setxattr enter: iP 0x%lX name %s buf 0x%lX size %d flags 0x%X\n", iP, (name) ? name : "NULL", buf, buf_size, ext_flags); if (iP == NULL) { rc = ENOENT; goto xerror; } #ifdef CONFIG_FS_POSIX_ACL if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER)) return EPERM; rc = gpfs_set_posix_acl(dentry, ACL_TYPE_ACCESS, buf, buf_size); goto xerror; } if (S_ISDIR(iP->i_mode)) { if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER)) return EPERM; rc = gpfs_set_posix_acl(dentry, ACL_TYPE_DEFAULT, buf, buf_size); goto xerror; } } #endif if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } if (!capable(CAP_SYS_ADMIN)) { rc = EPERM; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_USER_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } rc = EOPNOTSUPP; goto xerror; xattr: setCred(&eCred); xattrs.appId = 3; // application id GPFS_ATTR_INTERNAL_APPL_ID xattrs.nattrs = 1; // no of attributes to get or set xattrs.attrs = &xattr; // attributes to get or set xattr.keyP = (char*) name; // attribute key xattr.keyLen = strlen(name) + 1; // key length xattr.valueP = (char *)buf; // attribute value xattr.valueLen = buf_size; // length of attribute value privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); cnP = VP_TO_CNP(iP); oldfs = get_fs(); set_fs(get_ds()); rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, SET_XATTR, argP, NULL, &eCred); set_fs(oldfs); xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_SETEXTATTR_EXIT, "gpfs_i_setxattr exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return (-rc); } /* * Inode operation listxattr() * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ ssize_t gpfs_i_listxattr(struct dentry *dentry, char *buf, size_t buf_size) { int rc; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; struct tsxattr xattr; struct tsxattrs xattrs; ext_cred_t eCred; void *argP = &xattrs; int flags = 0; struct inode *iP = dentry->d_inode; mm_segment_t oldfs; ENTER(0); VFS_STAT_START(listxattrCall); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXATTR_IN, "gpfs_i_listxattr enter: iP 0x%lX buf 0x%lX buf_size %d\n", iP, buf, buf_size); if (iP == NULL) { rc = ENOENT; goto xerror; } setCred(&eCred); xattrs.appId = 3; // application id GPFS_ATTR_INTERNAL_APPL_ID xattrs.nattrs = 0; // get all attribute name xattrs.attrs = &xattr; // attributes to get or set xattr.keyP = NULL; // attribute key xattr.keyLen = 0; // key length xattr.valueP = buf; // attribute value xattr.valueLen = buf_size; // length of attribute value privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); cnP = VP_TO_CNP(iP); oldfs = get_fs(); set_fs(get_ds()); /* which names can we show ??? */ rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, LIST_XATTR, argP, NULL, &eCred); set_fs(oldfs); if (!rc) { TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXTATTR_EXIT0, "gpfs_i_listxattr exit: iP 0x%lX len %d\n", iP, xattr.valueLen); VFS_STAT_STOP; EXIT(0); return (xattr.valueLen); } xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXTATTR_EXIT, "gpfs_i_listxattr exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return (-rc); } /* * Inode operation removexattr() * */ int gpfs_i_removexattr(struct dentry *dentry, const char *name) { int rc; cxiNode_t *cnP; struct gpfsVfsData_t *privVfsP; struct tsxattr xattr; struct tsxattrs xattrs; ext_cred_t eCred; void *argP = &xattrs; int flags = 0; struct inode *iP = dentry->d_inode; mm_segment_t oldfs; const char *n; ENTER(0); VFS_STAT_START(removexattrCall); TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOVEXATTR_IN, "gpfs_i_removexattr enter: iP 0x%lX name %s\n", iP, (name) ? name : "NULL"); if (iP == NULL) { rc = ENOENT; goto xerror; } #ifdef CONFIG_FS_POSIX_ACL if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER)) return EPERM; rc = gpfs_set_posix_acl(dentry, ACL_TYPE_ACCESS, NULL, -1); goto xerror; } if (S_ISDIR(iP->i_mode)) { if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) { if (n && (strcmp(n, "") != 0)) { rc = EINVAL; goto xerror; } if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER)) return EPERM; rc = gpfs_set_posix_acl(dentry, ACL_TYPE_DEFAULT, NULL, -1); goto xerror; } } #endif if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } if (!capable(CAP_SYS_ADMIN)) { rc = EPERM; goto xerror; } goto xattr; } if (n = test_prefix(name, XATTR_USER_PREFIX)) { if (n && (strcmp(n, "") == 0)) { rc = EINVAL; goto xerror; } goto xattr; } rc = EOPNOTSUPP; goto xerror; xattr: setCred(&eCred); xattrs.appId = 3; // application id GPFS_ATTR_INTERNAL_APPL_ID xattrs.nattrs = 1; // no of attributes to get or set xattrs.attrs = &xattr; // attributes to delete xattr.keyP = (char*) name; // attribute key xattr.keyLen = strlen(name) + 1; // key length xattr.valueP = NULL; // attribute value xattr.valueLen = -1; // length < zero means delete privVfsP = VP_TO_PVP(iP); LOGASSERT(privVfsP != NULL); cnP = VP_TO_CNP(iP); oldfs = get_fs(); set_fs(get_ds()); rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, SET_XATTR, argP, NULL, &eCred); set_fs(oldfs); xerror: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOVEXATTR_EXIT, "gpfs_i_removexattr exit: iP 0x%lX rc %d\n", iP, rc); if (rc) cxiErrorNFS(rc); VFS_STAT_STOP; EXIT(0); return (-rc); } #endif