/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)94 1.48 src/avs/fs/mmfs/ts/kernext/gpl-linux/dir.c, mmfs, avs_rgpfs24, rgpfs240610b 11/8/05 10:20:56 */ #define __NO_VERSION__ #ifndef __KERNEL__ #define __KERNEL__ #endif #include #include #include #include #include #include #include #include #include #include #include #include /* About dcache revalidation: The Linux directory cache (dcache) is used to cache the result of name lookups. Linux caches positive as well as negative lookup results in its dcache entries (struct dentry): if the file existed at the time the last lookup was done (positive lookup), dentry->d_inode will point to the struct inode of the file; if the file did not exist (negative lookup), dentry->d_inode will be null. When a directory is modified on the local node, Linux will update its dcache entries accordingly. When the directory is modified on another node, however, we need to invalidate local dcache entries: - A negative dcache entry becomes invalid when a file by the same name is created on another node. This requires an exclusive byte-range token on the directory block in which the lookup was done that resulted in the dcache entry. Hence, when we lose a byte-range token on a directory, we invalidate all negative dcache entries for lookups that were done in that directory. This is done by a call to kxinvalidateOSNode with KXIVO_NEGDCACHE, which will result in a call to cxiInvalidateNegDCacheEntry() implemented here. - A positive dcache entry becomes invalid when the file it refers to is deleted, moved, or renamed on another node. All of these operations require an exclusive inode lock. Hence we invalidate a positive dcache entry when we lose the inode token for the file. This more selective invalidation of positive dcache entries is more efficient than simply invalidating all dcache entries when we lose a byte-range token on the directory. The invalidation is done by a call to kxinvalidateOSNode with CXI_IC_DCACHE, which will result in a call to cxiInvalidateDCacheEntry() implemented here. To invalidate a dcache entry Linux defines a d_revalidate function in the dentry_operations table. This function is supposed to check whether the dcache entry is still valid and return 'true' or 'false' accordingly. If no d_revalidate function is given in the dentry_operations table, Linux assumes the dentry is valid. Hence the most efficient way of marking a dentry as valid or invalid is to have the d_ops field in the dentry point to one of two different dentry_operations tables: one where the d_revalidate field is NULL (means the dentry is valid), and one where d_revalidate points at a function that always returns false (means the dentry is invalid). */ /* This call handles pruning off all unheld dentries pointing at an * inode. Normally pruning is not done by any daemon thread directly * (ie. token revoke) because d_prune_aliases may initiate a string of * callbacks due to iput. These callbacks may need to communicate back * to the daemon which can be problematic if there is a mailbox shortage. * Hence most dentry invalidation marks the cxiNode as needing a dentry * prune and the GPFS swapd is notified to call cxiPruneDCacheEntry in a * separate thread. * * Caller must be prepared to receive iput() callback into GPFS. * Caller must have a reference on cxiNode_t to ensure it doesn't * go away during processing. */ int cxiPruneDCacheEntry(cxiNode_t *cnP) { struct inode *iP = (struct inode *)cnP->osNodeP; struct list_head *dListP, *dHeadP; struct dentry *dentry; Boolean hasSubdirs = false; int refCount = 0; ENTER(0); TRACE2(TRACE_VNODE, 4, TRCID_PRUNE_DCACHE, "cxiPruneDCacheEntry: iP 0x%lX inode %d", iP, iP->i_ino); /* About to prune it so flag is no longer needed */ ClearCtFlag(cnP, pruneDCacheNeeded); /* This call prunes any unheld dentries pointing at the inode */ d_prune_aliases(iP); /* Traverse the list of all dentries that still refer to this file. */ dHeadP = &iP->i_dentry; spin_lock(&dcache_lock); for (dListP = dHeadP->next; dListP != dHeadP; dListP = dListP->next) { /* count dentries that still refer to this file */ refCount++; dentry = list_entry(dListP, struct dentry, d_alias); hasSubdirs = !list_empty(&dentry->d_subdirs); TRACE5N(TRACE_VNODE, 4, TRCID_PRUNE_DCACHE_ALIAS, "cxiPruneDCacheEntry: ip 0x%lX ino %d alias dentry 0x%lX " "hasSubdirs %d name '%s'", iP, iP->i_ino, dentry, hasSubdirs, dentry->d_name.name); /* Attempt to prune unused children. Helps keep stat cache manageable */ if (hasSubdirs) { dget_locked(dentry); spin_unlock(&dcache_lock); /* This call walks the tree starting at this parent dentry and * will successfully uncache child dentries that aren't held by * user programs and iput their associated inodes (resulting in * many cases of the inode i_count going to 0. iput() may however * just put these inodes on the unused list if they are still * valid (i_nlink > 0) and linked on i_hash. Thus in many cases while * the dentries immediately disappear their associated inode don't * have an immediate clear_inode() called on them. Subsequent * pruning (by kswapd) should shrink the icache for unused inodes * resulting in the gpfs_s_clear_inode callback for these inodes. */ shrink_dcache_parent(dentry); dput(dentry); /* For directories we don't support hard links so we shouldn't * have multiple dentries that need to be pruned. Hence * after having dropped the dcache lock we break out of this * for loop. */ break; } } if (!hasSubdirs) spin_unlock(&dcache_lock); EXIT(0); return refCount; } /* Mark the dentry as needing a revalidate. Called after losing * a token protecting the attributes of this dcache entry. */ int cxiInvalidateDCacheEntry(cxiNode_t *cnP) { struct inode *iP = (struct inode *)cnP->osNodeP; struct list_head *dListP, *dHeadP; struct dentry *dentry; int refCount = 0; /* Traverse the list of all dentries that refer to this file. */ ENTER(0); TRACE2(TRACE_VNODE, 4, TRCID_INVAL_DCACHE, "cxiInvalidateDCacheEntry: ip 0x%lX inode %d", iP, iP->i_ino); dHeadP = &iP->i_dentry; spin_lock(&dcache_lock); for (dListP = dHeadP->next; dListP != dHeadP; dListP = dListP->next) { refCount++; /* Mark the entry as needing revalidation by setting the d_op * function table to gpfs_dops_revalidate. Since this dentry * is staying in the vfs we can't declare it invalid, or a legitimate * stat on it may return ESTALE. * Scenario: node a) mkdir foo; cd foo * node b) chmod 500 foo * node a) ls -al * must succeed. */ dentry = list_entry(dListP, struct dentry, d_alias); dentry->d_op = &gpfs_dops_revalidate; TRACE4N(TRACE_VNODE, 4, TRCID_INVAL_DCACHE_ALIAS, "cxiInvalidateDCacheEntry: ip 0x%lX ino %d " "alias dentry 0x%lX name '%s'", iP, iP->i_ino, dentry, dentry->d_name.name); if (TestCtFlag(cnP, destroyIfDelInode)) { /* If the file was deleted, marking the dentry invalid is not * sufficient. If we leave the dentry in the cache marked as * invalid, it will remain in the cache until: * * a) if it has a zero d_count then the scheduled GPFS swapd * d_prune_aliases will get rid of it * b) if it has a nonzero d_count (it's open) then d_prune_aliases * would not prune it and it would stay in the cache until the * next lookup finds it and calls d_invalidate, which might not * ever happen. * * Thus we drop the dentry and the final close or schedule * d_prune_aliases will remove it. */ DENTRY_DROP(dentry); } /* Dentries for this cxiNode_t should be pruned by GPFS swapd thread * which will be signalled by the caller of this routine. */ SetCtFlag(cnP, pruneDCacheNeeded); } spin_unlock(&dcache_lock); EXIT(0); return refCount; } /* The following function is called to remove invalid dcache entries for a file when the file is deleted on this node. Such invalid dcache entries occur when a file is renamed on another node before it is deleted here. The rename revokes the inode token, which marks the dcache entry invalid, but does not remove it from the cache on this node. When the file is deleted, the delete operation on this node will look up the file under its new name and turn the (new) dcache entry into a negative dcache entry, but since the file was renamed, it will not find or process the old, invalid dcache entry (the one referring to the old file name). This function is called during delete (when the link count goes to zero) to remove old, invalid dcache entries, so the file can be destroyed. The function is similar to cxiInvalidateDCacheEntry, with the following differences: (1) it is only called on files that are being deleted (link count zero and destroyIfDelInode flag already set), (2) it does not mark any dcache entries as invalid; instead, it (3) only drops dcache entries that are already marked as invalid. In particular, we do not want to invalidate the dcache entry referring to the current name being unlinked, because unlink will turn this into a valid, negative dcache entry. */ void cxiDropInvalidDCacheEntry(cxiNode_t *cnP) { struct inode *iP = (struct inode *)cnP->osNodeP; struct list_head *dListP, *dHeadP; struct dentry *dentry; int holdCount; ENTER(0); TRACE2(TRACE_VNODE, 4, TRCID_DROP_INVAL_DCACHE, "cxiDropInvalidDCacheEntry: iP 0x%lX i_ino %d", iP, iP->i_ino); DBGASSERT(TestCtFlag(cnP, destroyIfDelInode)); /* Traverse the list of all dentries that still refer to this file. */ dHeadP = &iP->i_dentry; spin_lock(&dcache_lock); for (dListP = dHeadP->next; dListP != dHeadP; dListP = dListP->next) { /* Check whether this dentry mas been marked invalid */ dentry = list_entry(dListP, struct dentry, d_alias); if (dentry->d_op == &gpfs_dops_invalid || dentry->d_op == &gpfs_dops_revalidate) { TRACE4N(TRACE_VNODE, 4, TRCID_DROP_INVAL_DCACHE_ALIAS, "cxiDropInvalidDCacheEntry: ip 0x%lX ino %d " "removing dentry 0x%lX name '%s'\n", iP, iP->i_ino, dentry, dentry->d_name.name); /* Drop the dcache entry. See details in cxiInvalidateDCacheEntry */ DENTRY_DROP(dentry); /* Dentries for this cxiNode_t should be pruned */ SetCtFlag(cnP, pruneDCacheNeeded); } } spin_unlock(&dcache_lock); EXIT(0); } /* The following function is called to invalidate negative dcache entries for all files in a directory when we lose the BR token for the directory. */ int cxiInvalidateNegDCacheEntry(cxiNode_t *cnP) { struct inode *iP = (struct inode *)cnP->osNodeP; struct list_head *dListP, *dHeadP; struct list_head *cListP, *cHeadP; struct dentry *dentry, *child; int refCount = 0; ENTER(0); TRACE2(TRACE_VNODE, 4, TRCID_INVAL_NEG_DCACHE, "cxiInvalidateNegDCacheEntry: iP 0x%lX inode %d", iP, iP->i_ino); /* Traverse the list of all dentries that refer to this directory. Note: since we don't support hard links to directories, we expect there to be exactly one dentry on this list. */ dHeadP = &iP->i_dentry; spin_lock(&dcache_lock); for (dListP = dHeadP->next; dListP != dHeadP; dListP = dListP->next) { refCount++; /* traverse the list of all children of this dentry */ dentry = list_entry(dListP, struct dentry, d_alias); cHeadP = &dentry->d_subdirs; for (cListP = cHeadP->next; cListP != cHeadP; cListP = cListP->next) { /* If this child is a negative dentry (d_inode pointer is NULL), mark the entry invalid by setting the dop function table to gpfs_dops_invalid, which contains a d_revalidate function that always returns false. Also handle dcache entries that are about to be deleted (unlink operation pending but not yet complete). These entries still have a non-null d_inode pointer, but are marked as "delete pending" by having a different d_op table. We should not mark the latter as invalid, because we don't know yet whether the delete operation is going to succeed, so we mark those dentries as "needing revalidation". (see also comments in gpfs_i_unlink and gpfs_i_rmdir). */ child = list_entry(cListP, struct dentry, d_child); if (!child->d_inode || child->d_op == &gpfs_dops_ddeletepending) { child->d_op = !child->d_inode ? &gpfs_dops_invalid : &gpfs_dops_revalidate; TRACE5N(TRACE_VNODE, 4, TRCID_INVAL_NEG_DUNCACHE, "cxiInvalidateNegDCacheEntry: ip 0x%lX ino %d " "%s dentry 0x%lX name '%s'", iP, iP->i_ino, !child->d_inode ? "inval" : "reval", child, child->d_name.name); } } } spin_unlock(&dcache_lock); EXIT(0); return refCount; } /* dentry_operations */ /* The d_revalidate function is expected to check whether the directory entry * cached in the given dentry struct is still valid. */ int #if LINUX_KERNEL_VERSION >= 2060000 gpfs_d_invalid(struct dentry *dentry, struct nameidata *ni) #else gpfs_d_invalid(struct dentry *dentry, int flags) #endif { TRACE3(TRACE_VNODE, 4, TRCID_DIR_001, "gpfs_d_invalid: dentry 0x%lX d_inode 0x%lX name '%s' is invalid", dentry, dentry->d_inode, dentry->d_name.name); return false; } int #if LINUX_KERNEL_VERSION >= 2060000 gpfs_d_revalidate(struct dentry *dentry, struct nameidata *ni) #else gpfs_d_revalidate(struct dentry *dentry, int flags) #endif { int rc; cxiNode_t *dcnP; cxiNode_t *cnP = NULL; struct inode *diP; struct inode *newInodeP; struct gpfsVfsData_t *privVfsP; cxiIno_t iNum = (cxiIno_t)-1; ext_cred_t eCred; struct dentry *retP; ENTER(0); TRACE6(TRACE_VNODE, 4, TRCID_DIR_REVALIDATE, "gpfs_d_revalidate enter: dentry 0x%lX " "d_inode 0x%lX inum %d parent 0x%lX cwd 0x%lX d_name '%s'", dentry, dentry->d_inode, dentry->d_inode ? dentry->d_inode->i_ino : -1, dentry->d_parent, current->fs->pwd, dentry->d_name.name); rc = gpfs_i_revalidate(dentry); /* We're going to need to revalidate this according to its name. * The scenario that caused us problems is: * * Node a) mkdir dir1; touch dir1/file1 * Node b) mv dir1 dir2 * Node a) ls -al dir1 * * This code used to just revalidate the inode (gpfs_i_revalidate) * which would succeed since the dir1 inode is indeed still valid. * However its name has now changed to dir2 and thus this lookup * with its last known name is performed. We don't perform this * lookup for the root inode. We didn't have to do this before * RH 2.4.18-5 (unusual fix for NFS is in that kernel) but now * we have to go thru these machinations. Most of this is a * tradeoff and doesn't give exactly correct semantics. * * For instance normally on a local node directory rename the dentry * gets moved over to its new position via d_move. However in our * case we don't know what the new name is since we've just lost * the token and have no other info. If a process is sitting in this * renamed directory structure then it has to remain valid for that * process but none other. We unhash the directory so no other * process can step into that subtree but continue to say its valid * if (d_count > 1). At that point the only process calling d_revalidate * would be a process with it's current working directory in that * subtree. However this breaks down if the process needs to back * up into a parent directory, since d_revalidate starts from outside * the renamed subtree and can't proceed into the unhashed directory. * Thus you get an odd * getcwd: cannot access parent directories: No such file or directory * ancillary message but you can cd backwards correctly. * * Another idea attempted was looking at the process' cwd in the * task struct and answering whether the dentry was valid on a * per process basis. This gave odd semantics because a process * could list the parent directory and not see the renamed child * but could still cd into it (because it was still hashed). That * breaks down completely if another node makes a directory of the * old name in the parent. * * So if we can't use d_move...which it doesn't appear possible to * do, at a minimum you have to unhash the directory if it no * longer has the correct name or inode. * * Note that once a process steps out of the renamed dentry then * the final dput will kill the dentry. */ if (rc == 0 && dentry->d_inode->i_ino != INODENUM_ROOTDIR_FILE) { setCred(&eCred); privVfsP = VP_TO_PVP(dentry->d_inode); DBGASSERT(privVfsP != NULL); LOGASSERT(dentry->d_parent != NULL); diP = dentry->d_parent->d_inode; dcnP = VP_TO_CNP(diP); rc = gpfs_ops.gpfsLookup(privVfsP, (void *)diP, dcnP, NULL, (char *)dentry->d_name.name, (void **)&newInodeP, &cnP, &iNum, NULL, NULL, &eCred, (void **)&retP); if (rc == 0) { iput(newInodeP); if (iNum != dentry->d_inode->i_ino) rc = ESTALE; } /* The name is either no longer valid or has been renamed * and recreated with a different inode. We need to drop * the dentry from the hash list so another process can't * proceed into that tree. */ if (rc) { DENTRY_D_DROP(dentry); d_prune_aliases(dentry->d_inode); /* If the dentry still has processes sitting underneath * it we'll still claim its valid. */ if (atomic_read(&dentry->d_count) > 1) rc = 0; } } xerror: TRACE2(TRACE_VNODE, 4, TRCID_DIR_REVALIDATE_EX, "gpfs_d_revalidate exit: dentry 0x%lX rc %d\n", dentry, rc); EXIT(0); if (rc) return false; else return true; } #ifdef CCL /* The d_revalidate function checks whether the directory entry cached in the given dentry struct is still valid. Any dentry referencing this operation is a positive dentry that was created for an inexact caseless file name match for a Samba client. The d_revalidate returns "true" for subsequent Samba clients indicating that the positive dcache entry is still valid. It returns "false" for local or NFS clients indicating that the dcache entry is no longer valid which forces a new lookup. */ int #if LINUX_KERNEL_VERSION >= 2060000 gpfs_d_valid_if_Samba(struct dentry *dentry, struct nameidata *ni) #else gpfs_d_valid_if_Samba(struct dentry *dentry, int flags) #endif { TRACE4(TRACE_VNODE, 4, TRCID_DIR_VALID_IF_SAMBA, "gpfs_d_valid_if_Samba: dentry 0x%lX " "d_inode 0x%lX (name '%s') returns %s\n", dentry, dentry->d_inode, dentry->d_name.name, (cxiIsSambaThread() ? "true" : "false")); return cxiIsSambaThread(); } /* The d_revalidate function checks whether the directory entry cached in the given dentry struct is still valid. Any dentry referencing this operation is a negative dentry that was created for an exact file name match which failed for a local or NFS client. The d_revalidate returns "true" for subsequent local or NFS clients indicating that the negative dcache entry is still valid. It returns "false" for Samba clients indicating that the dcache entry is no longer valid which forces a new lookup. */ int #if LINUX_KERNEL_VERSION >= 2060000 gpfs_d_invalid_if_Samba(struct dentry *dentry, struct nameidata *ni) #else gpfs_d_invalid_if_Samba(struct dentry *dentry, int flags) #endif { TRACE4(TRACE_VNODE, 4, TRCID_DIR_INVALID_IF_SAMBA, "gpfs_d_invalid_if_Samba: dentry 0x%lX " "d_inode 0x%lX (name '%s') returns %s\n", dentry, dentry->d_inode, dentry->d_name.name, (cxiIsSambaThread() ? "false" : "true")); return !cxiIsSambaThread(); } #endif