/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)16 1.158.1.9 src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiSystem.c, mmfs, avs_rgpfs24, rgpfs24s007a 10/24/06 19:12:27 */ /* * Linux implementation of basic common services * * Contents: * cxiGetThreadId * getpid * cxiIsSuperUser * DoPanic * logAssertFailed * Kernel memory allocation services: * cxiMallocPinned * cxiFreePinned * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef memcmp #define DEFINE_TRACE_GBL_VARS #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_KERNEL_VERSION >= 2060000 #include #include #endif #if LINUX_KERNEL_VERSION >= 2040900 /* This is in the Redhat kernel series */ extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); #endif #ifdef INSTRUMENT_LOCKS struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES]; #endif /* INSTRUMENT_LOCKS */ /* We record the daemon's process group since it can uniquely identify * a thread as being part of the GPFS daemon. pid is unique per thread * on linux due to their clone implementation. */ static pid_t DaemonPGrp = -1; /* Get the kernel thread ID. */ cxiThreadId cxiGetThreadId() { /* ENTER(1); */ return current->pid; } /* Get the kernel process ID. */ pid_t getpid() { /* ENTER(1); */ return current->pid; } /* bufP is caller's ext_cred_t buffer * uCredPP is the ucred struct (NULL on Linux) * eCredPP is the ext_cred_t struct * (if successful) * * cxiPutCred should be called to release when operation has been completed. 
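 *
 * A minimal calling sketch (illustrative only; credBuf, uCredP and eCredP
 * are hypothetical local names, not used elsewhere in this file):
 *
 *   ext_cred_t credBuf;
 *   void *uCredP = NULL;
 *   void *eCredP = NULL;
 *   int rc = cxiGetCred(&credBuf, &uCredP, &eCredP);
 *   if (rc == 0)
 *     cxiPutCred(uCredP, eCredP);   // uCredP is always NULL on Linux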
*/ int cxiGetCred(void *bufP, void **uCredPP, void **eCredPP) { ext_cred_t *eCredP = (ext_cred_t *)bufP; ENTER(0); *uCredPP = NULL; *eCredPP = NULL; if (!bufP) { EXIT_RC(0, EINVAL); return EINVAL; } setCred(eCredP); *eCredPP = (void *)eCredP; xerror: EXIT(0); return 0; } /* Release of cxiGetCred() structures (nothing to do on Linux) */ int cxiPutCred(void *userCredP, void *extCredP) { if (userCredP || !extCredP) return EINVAL; return 0; } /* Convert a kernel stack address to the thread ID of the thread that * uses that stack */ int cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP) { struct task_struct * tP; #if LINUX_KERNEL_VERSION >= 2060000 /* the kernel stack is base off the thread_info struct in the 2.6 kernel * will get the task pointer out of thread_info struct. */ struct thread_info * iP; ENTER(0); iP = (struct thread_info *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1))); tP = iP->task; #else /* the kernel stack is base off the task_struct struct in the 2.4 kernel */ tP = (struct task_struct *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1))); #endif ENTER(0); *tidP = tP->pid; EXIT(0); return 0; } /* Convert a kernel thread pointer to the corresponding thread ID */ int cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP) { struct task_struct * tP; ENTER(0); tP = (struct task_struct *) threadP; *tidP = tP->pid; EXIT(0); return 0; } /* Return true if caller has has maximum authorization (is root) */ Boolean cxiIsSuperUser() { return (current->euid == 0); } /* Get the process max filesize limit (ulimit -f) */ Int64 cxiGetMaxFileSize() { if ((signed long)MY_RLIM_CUR(RLIMIT_FSIZE) == -1L) return MAX_INT64; else return (MY_RLIM_CUR(RLIMIT_FSIZE)); } /* Routine to send a signal to the current thread/process */ void cxiSendSigThread(int sig) { ENTER(0); send_sig(sig, current, 0); EXIT(0); } #ifdef MALLOC_DEBUG /* This tracks mallocs and frees on a limited basis. * Implemented originally to determine if we were leaking * any memory after an unload. This is not really thread * safe for multiple processors unless they're automatically * cache coherent without memory barriers (i386). Its useful * for detecting memory leaks on a single processor system. 
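 *
 * Illustrative pairing when built with MALLOC_DEBUG (a sketch only; "p" is
 * a hypothetical pointer):
 *
 *   MallocDebugStart();                // allocate the tracking table
 *   void *p = cxiMallocPinned(256);    // recorded via MallocDebugNew
 *   cxiFreePinned(p);                  // record cleared via MallocDebugDelete
 *   MallocDebugEnd();                  // reports any blocks still allocated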
*/ #define MALLOC_RECORDS 5000 /* max mallocs to track */ struct mallocStat { void *beginP; unsigned short size; unsigned short type; }; static struct mallocStat *mstatP = NULL; unsigned int nextMalloc = 0; void MallocDebugStart() { int i; ENTER(0); if (mstatP == NULL) mstatP = vmalloc(MALLOC_RECORDS * sizeof(struct mallocStat)); if (mstatP == NULL) { EXIT(0); return; } for (i = 0; i < MALLOC_RECORDS; i++) { mstatP[i].beginP = NULL; mstatP[i].size = 0; mstatP[i].type = 0; } printk("MallocDebugStart 0x%X\n", mstatP); EXIT(0); } void MallocDebugEnd() { int i; ENTER(0); if (mstatP != NULL) { for (i = 0; i < MALLOC_RECORDS; i++) { if (mstatP[i].beginP != NULL) printk("MallocDebug: beginP 0x%X size %d type %d STILL ALLOCATED!\n", mstatP[i].beginP, mstatP[i].size, mstatP[i].type); } } vfree(mstatP); mstatP = NULL; EXIT(0); } void MallocDebugNew(void *ptr, unsigned short size, unsigned short type) { void *bP; int i; int j; int swrc; int oldval; int where = nextMalloc; ENTER(0); if (mstatP == NULL) { EXIT(0); return; } for (i = where; i < MALLOC_RECORDS + where; i++) { if (i >= MALLOC_RECORDS) j = i - MALLOC_RECORDS; else j = i; bP = mstatP[j].beginP; if (bP == NULL) { swrc = ATOMIC_SWAP(&mstatP[j].beginP, &bP, ptr); if (swrc) { mstatP[j].size = size; mstatP[j].type = type; break; } } } EXIT(0); } void MallocDebugDelete(void *ptr) { void *bP; int i; int swrc; int next; int found = 0; ENTER(0); if (mstatP == NULL) { EXIT(0); return; } for (i = 0; i < MALLOC_RECORDS; i++) { bP = mstatP[i].beginP; if (bP == ptr) { next = nextMalloc; ATOMIC_SWAP(&nextMalloc, &next, i); swrc = ATOMIC_SWAP(&mstatP[i].beginP, &bP, NULL); DBGASSERT(swrc); found = 1; break; } } if (!found) printk("MallocDebug: 0x%X not found!\n", ptr); EXIT(0); } #endif /* MALLOC_DEBUG */ /* Allocate pinned kernel memory */ void* cxiMallocPinned(int nBytes) { void *ptr; /* kmalloc only supports requests for up to 131027 bytes. Anything larger than this results in a BUG() call. */ ENTER(0); if (nBytes > 131072) { EXIT(0); return NULL; } ptr = kmalloc(nBytes, GFP_KERNEL); #ifdef MALLOC_DEBUG MallocDebugNew(ptr, nBytes, 1); #endif EXIT(0); return ptr; } /* Free pinned kernel memory that was allocated with cxiMallocPinned */ /* Must not block on lack of memory resourses */ void cxiFreePinned(void* p) { ENTER(0); #ifdef MALLOC_DEBUG MallocDebugDelete(p); #endif kfree(p); EXIT(0); } /* Get the kernel thread ID. */ void* cxiGetFcntlOwner(eflock_t *flP) { return flP? flP->l_owner: current->files; } #if LINUX_KERNEL_VERSION > 2060900 struct lock_manager_operations lm_operations = { }; #endif /* Perform local advisory locking. */ int cxiFcntlLock(void *advObjP, int cmd, void *lockStructP, cxiFlock_t *flockP, int (*retryCB)(), cxiOff64_t size, cxiOff64_t offset, ulong *retry_idP) { int len, rc = 0; // struct file *fP; struct file_lock fl, *flP, *gflP, *cflP; Boolean keepLockElement = false; /* cast platform independent arguments as appropriate for linux */ void (*RetryFcn)(struct file_lock*) = (void (*)(struct file_lock*))retryCB; // fP = (struct file *)advObjP; struct file localFile, *filp = &localFile; struct dentry localDEntry, *dp = &localDEntry; ENTER(0); flP = (struct file_lock *) lockStructP; localFile.f_dentry = &localDEntry; localDEntry.d_inode = (struct inode *)advObjP; /* Lock commands can have two different values. Convert them at * entry to the portability layer so that we only have to check * for one of them. 
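 *
 * Equivalent view of the conversion performed just below for 32-bit builds
 * (shown only as a sketch of the mapping):
 *
 *   switch (cmd)
 *   {
 *     case F_GETLK64:  cmd = F_GETLK;  break;
 *     case F_SETLK64:  cmd = F_SETLK;  break;
 *     case F_SETLKW64: cmd = F_SETLKW; break;
 *   }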
*/ #if !defined(__64BIT__) if (cmd == F_GETLK64) cmd = F_GETLK; if (cmd == F_SETLK64) cmd = F_SETLK; if (cmd == F_SETLKW64) cmd = F_SETLKW; #endif /* Callers have the option of passing a platform dependent lock structure (struct file_lock *lockSructP) or the generic (cxiFlock_t *flockP). */ if (flockP) { flP = &fl; /* Use a local file_lock structure */ /* If there is a potential for blocking, must malloc the locking structure so it can persist until the lock becomes available (in Retry()). */ if (cmd == F_SETLKW) { #ifdef NFS_CLUSTER_LOCKS len = sizeof(struct file_lock) + sizeof(struct file) + sizeof(struct dentry); #else len = sizeof(struct file_lock); #endif flP = (struct file_lock*)cxiMallocUnpinned(len); if (flP == NULL) { rc = ENOMEM; goto exit; } cxiMemset(flP, 0, len); #ifdef NFS_CLUSTER_LOCKS filp = (struct file*)((char *)flP + sizeof(struct file_lock)); dp = (struct dentry *)((char *)filp + sizeof(struct file)); filp->f_dentry = dp; dp->d_inode = (struct inode *)advObjP; #endif } else cxiMemset(flP, 0, sizeof(*flP)); locks_init_lock(flP); /* Initialize list_head structs */ if (flockP->l_file == NULL) flockP->l_file = filp; /* fl_wait needs to be initialized because when unlock happens, the linux routine locks_wake_up_blocks invokes our retry routine via fl_notify and then calls wake_up(fl_wait) on the assumption that the waiter is local. */ cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait); cxiFlockToVFS(flockP, flP); } /* daemon didn't know the owner and required kernel code to fill it in. */ if (!flP->fl_owner) flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL); #if 0 /* Validate the file pointer. Kernel locking routines are going to use these without verifying them. If any of them are NULL, find out now before they generate a segment violation. */ if ((!fP) || (!fP->f_dentry) || (!fP->f_dentry->d_inode)) { if (cmd == F_GETLK) flP->fl_type = F_UNLCK; else rc = EINVAL; goto exit; } #endif /* Note that this all depends on us having serialized such locking for this file during from before the posix_test_lock() until after the posix_block_lock(). The revoke lock that we hold here provides us the necessary serilization. */ TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER, "cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X " "range 0x%lX-%lX cmd %s type %s\n", flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end, (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW", (flP->fl_type == F_RDLCK) ? "RDLCK" : (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK"); if (cmd == F_GETLK) { /* Check for conflicts. If found, return the information. If there are NO conflicts, return F_UNLCK in fl_type. */ #if LINUX_KERNEL_VERSION >= 2061700 struct file_lock conf; gflP = &conf; rc = posix_test_lock(filp, flP, gflP); if (rc) { rc = 0; #else if (NULL != (gflP = posix_test_lock(&localFile, flP))) { #endif flP->fl_start = gflP->fl_start; flP->fl_end = gflP->fl_end; flP->fl_type = gflP->fl_type; flP->fl_pid = gflP->fl_pid; flP->fl_owner = gflP->fl_owner; } else flP->fl_type = F_UNLCK; TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK, "cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X " "range 0x%lX-%lX type %s\n", flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end, (flP->fl_type == F_RDLCK) ? "RDLCK" : (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK"); } else { /* Begin: do the locking, but handle the blocking via our retry routine. */ /* Test the lock. What this really does for us is return the blocker if one exists. 
This is needed to queue up the request if a conflicting lock is already held. */ #ifdef NFS_CLUSTER_LOCKS if (cmd == F_SETLKW) { flP->fl_flags |= FL_SLEEP; if (!flP->fl_lmops) { flP->fl_lmops = &lm_operations; flP->fl_lmops->fl_notify = (void *)RetryFcn; } } rc = POSIX_LOCK_FILE(filp, flP); if (rc == -EAGAIN && (cmd == F_SETLKW) && flP->fl_lmops == &lm_operations) { /* Queue the blocker structures */ keepLockElement = true; if (retry_idP) *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement } #else #if LINUX_KERNEL_VERSION >= 2061700 if ((flP->fl_type == F_UNLCK) || !(posix_test_lock(&localFile, flP, cflP))) #else if ((flP->fl_type == F_UNLCK) || !(cflP = posix_test_lock(&localFile, flP))) #endif { /* No conflicting lock: get the lock for the caller. */ rc = POSIX_LOCK_FILE(&localFile, flP); } else { /* Conflicting lock: ..... */ rc = EAGAIN; if (cmd == F_SETLKW) { /*if (posix_locks_deadlock(flP, cflP)) { rc = EDEADLK; } else*/ { /* Queue the blocker structures */ keepLockElement = true; if (retry_idP) *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement #if LINUX_KERNEL_VERSION > 2060900 flP->fl_lmops = &lm_operations; flP->fl_lmops->fl_notify = RetryFcn; #else flP->fl_notify = RetryFcn; #endif #if LINUX_KERNEL_VERSION < 2061700 posix_block_lock(cflP, flP); #endif } } } #endif TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT, "cxiFcntlLock posix_lock_file: rc %d retry_id 0x%lX\n", rc, cflP); } /* End: do the locking, but handle the blocking via our retry routine. */ exit: if (flockP) { /* Caller wanted results in flockP */ cxiVFSToFlock((void *)flP, flockP); /* If we allocated the locking structure and then didn't need to use it (the lock request didn't block), free it. */ if ((flP!=&fl) && (!keepLockElement)) { cxiFreeUnpinned(flP); } } #ifdef NFS_CLUSTER_LOCKS if (rc < 0) rc = -rc; /* make it positive */ #endif EXIT_RC(0, rc); return rc; } void cxiFcntlUnblock(void *retry_idP) { struct file_lock *flP = (struct file_lock *)retry_idP; ENTER(0); /* Include some sanity checks on the retry id (file_lock) before passing it into the routine that does the work. It should be properly linked (via its list_head structures) in a file_lock_list that has blocked waiters. Also, we would only be backing this out by the process that has originally blocked, so verify the pid. */ if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link) && flP->fl_next && flP->fl_pid == getpid()) { POSIX_UNBLOCK_LOCK(flP); } EXIT(0); } int cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid) { int rc = 0; struct super_block *sbP = (struct super_block *)vfsP; struct list_head *fllP; struct file_lock *fl; struct dentry *dentryP; ENTER(0); lock_kernel(); restart: #if LINUX_KERNEL_VERSION >= 2061600 //??? find a different way to clear locks file_lock_list is not exported anymore #else fllP = file_lock_list.next; while(fllP != &file_lock_list) { fl = list_entry(fllP, struct file_lock, fl_link); fllP = fllP->next; /* If there are mmfs lock structures, release them. */ if (fl && fl->fl_file && fl->fl_file->f_dentry && fl->fl_file->f_dentry->d_inode) { dentryP = fl->fl_file->f_dentry; /* If this lock belongs to the specified vfs, release advisory locks. 
*/ if (dentryP->d_sb == sbP) { /* remove all our locks */ rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid); if (rc == ENOSYS) goto xerror; /* After freeing unknown numbers of locks in gpfsFcntlReset (all locks for the inode), restart from the top of the lock list */ goto restart; } } } #endif xerror: unlock_kernel(); EXIT_RC(0, rc); return rc; } void * cxiGetPrivVfsP(void *vfsP) { struct super_block *sbP = (struct super_block *)vfsP; /* Do some sanity checking */ if ( (sbP->s_magic != GPFS_SUPER_MAGIC) || ((UIntPtr) SBLOCK_PRIVATE(sbP) < GPFS_KERNEL_OFFSET) ) printSuperList(sbP); LOGASSERT( sbP->s_magic == GPFS_SUPER_MAGIC ); LOGASSERT( (UIntPtr) SBLOCK_PRIVATE(sbP) >= GPFS_KERNEL_OFFSET ); return (SBLOCK_PRIVATE(sbP)); } #ifdef NFS_DEBUG /* These flags are defined in the kernel and control various cprintk calls. This provides us a way to easily turn these on/off for debugging our NFS support. */ extern unsigned int nlm_debug; extern unsigned int nfsd_debug; extern unsigned int nfs_debug; extern unsigned int rpc_debug; #endif int cxiTrace(cxiTrace_t trace) { #ifdef NFS_DEBUG int rc = 0; ENTER(0); switch (trace) { case cxiTraceNFS: nlm_debug = nfsd_debug = nfs_debug = rpc_debug = ~0; break; case cxiTraceNFSoff: nlm_debug = nfsd_debug = nfs_debug = rpc_debug = 0; break; default: rc = EINVAL; break; } EXIT_RC(0, rc); return rc; #else return ENOSYS; #endif } void cxiFlockToVFS(eflock_t* lckdatP, void* vP) { struct file_lock* flP = (struct file_lock *)vP; ENTER(0); if ((flP) && (lckdatP)) { flP->fl_pid = lckdatP->l_pid; flP->fl_owner = lckdatP->l_owner; flP->fl_type = lckdatP->l_type; flP->fl_start = lckdatP->l_start; flP->fl_flags = FL_POSIX; #ifdef NFS_CLUSTER_LOCKS flP->fl_lmops = lckdatP->l_lmops; flP->fl_file = lckdatP->l_file; flP->fl_ops = NULL; #else #if LINUX_KERNEL_VERSION < 2061700 if (lckdatP->l_caller == L_CALLER_LOCKD) flP->fl_flags |= FL_LOCKD; #endif #endif if (lckdatP->l_len == 0) flP->fl_end = FL_OFFSET_MAX; else flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1; } EXIT(0); return; } #ifdef NFS_CLUSTER_LOCKS int cxiVFSCallback(eflock_t* lckreqP, eflock_t* lckdatP, int(* callback)(void *, void *, int), int result) { struct file_lock fl; struct file *fileP; struct file_lock conf, *confP = NULL; int rc; ENTER(0); cxiFlockToVFS(lckreqP, &fl); fileP = fl.fl_file; if (!fileP) { return -1; } if (lckdatP) { cxiFlockToVFS(lckdatP, &conf); confP = &conf; } if (!result) { /* try to get the posix lock */ rc = POSIX_LOCK_FILE(fileP, &fl); if (rc) callback(&fl, NULL, EBUSY); else { /* got the posix lock */ rc = callback(&fl, confP, result); if (rc) { /* too late, free the lock */ fl.fl_type = F_UNLCK; rc = POSIX_LOCK_FILE(fileP, &fl); } } } else rc = callback(&fl, confP, result); #ifdef NFS_CLUSTER_LOCKS if (rc < 0) rc = -rc; /* make it positive */ #endif EXIT_RC(0, rc); return rc; } #endif void cxiVFSToFlock(void *vP, eflock_t *lckdatP) { struct file_lock* flP = (struct file_lock *)vP; ENTER(0); if ((flP) && (lckdatP)) { lckdatP->l_pid = flP->fl_pid; lckdatP->l_owner = flP->fl_owner; lckdatP->l_type = flP->fl_type; lckdatP->l_start = flP->fl_start; lckdatP->l_flags = flP->fl_flags; #ifdef NFS_CLUSTER_LOCKS lckdatP->l_lmops = flP->fl_lmops; lckdatP->l_file = flP->fl_file; if (lckdatP->l_lmops) /* must be lockd or nfsd */ #else #if LINUX_KERNEL_VERSION >= 2061700 if (lckdatP->l_lmops) /* must be lockd or nfsd */ #else if (flP->fl_flags & FL_LOCKD) #endif #endif lckdatP->l_caller = L_CALLER_LOCKD; else lckdatP->l_caller = L_CALLER_NULL; if (flP->fl_end == FL_OFFSET_MAX) 
lckdatP->l_len = 0; else lckdatP->l_len = flP->fl_end - flP->fl_start + 1; } EXIT(0); return; } /* Sleep for the indicated number of milliseconds */ void cxiSleep(int ms) { ENTER(0); TRACE1(TRACE_VNODE, 9, TRCID_SLEEP, "cxiSleep: begin delay %d\n", ms); current->state = TASK_INTERRUPTIBLE; /* For large HZ rearrange jiffies calculation and use presumably larger word size to minimize overflow risk */ if (unlikely(HZ > 1000)) schedule_timeout(((long)ms)*HZ/1000); else schedule_timeout(ms/(1000/HZ)); TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END, "cxiSleep: end delay %d HZ %d\n", ms, HZ); EXIT(0); } void cxiOpenNFS(void *iP) { struct inode *inodeP = (struct inode *)iP; int refcount; /* A reference is placed on the cxiNode here when the first NFS reference is added */ ENTER(0); refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1); TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS, "openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX " "refcount %d\n", inodeP, (inodeP) ? inodeP->i_ino : -1, (inodeP) ? inodeP->i_ino : -1, (inodeP) ? inodeP->i_mode : -1, (inodeP) ? inodeP->i_nlink : -1, (inodeP) ? inodeP->PRVINODE : NULL, refcount); DBGASSERT(refcount != 0); EXIT(0); } int cxiCloseNFS(void *vP, void *viP) { int rc; struct inode *iP = (struct inode *)vP; /* If viP is NULL, the file was never actually opened. If viP is not NULL, close it. */ ENTER(0); if (viP == NULL) rc = 0; else { if (VP_TO_PVP(iP) != NULL && VP_TO_CNP(iP) != NULL) { rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD|FWRITE, (struct MMFSVInfo *)viP, true); cxiPutOSNode((void *)iP); } } EXIT_RC(0, rc); return rc; } static int cxiNFSCluster = 0; void cxiSetNFSCluster(int set) { cxiNFSCluster = set; } /* To avoid failing the NFS client the NFSD thread is put to sleep. Another node will takeover this client and the operation will continue without any errors to the application. */ void cxiNFSError(int rc, const char *str) { TRACE2(TRACE_VNODE, 9, TRCID_NFS_ERROR, "cxiNFSError: %s got rc %d\n", str, rc); if (cxiNFSCluster && cxiIsNFSThread() && (rc == ESTALE || rc == -ESTALE)) { TRACE2(TRACE_VNODE, 1, TRCID_NFS_ERROR_1, "cxiNFSError: NFS got error %d from %s sleep\n", rc, str); cxiSleep(120000); // wait 120 seconds } } void * cxiGetNfsP(void *vP) { if (vP && VP_TO_CNP((struct inode *)vP)) return VP_TO_NFSP((struct inode *)vP); else return NULL; } void cxiSetNfsP(void *vP, void *newP) { if (VP_TO_CNP((struct inode *)vP)) VP_TO_NFSP((struct inode *)vP) = newP; } void * cxiGetCnP(void *vP) { return (void *)VP_TO_CNP((struct inode *)vP); } void * cxiGetPvP(void *vP) { return (void *)VP_TO_PVP((struct inode *)vP); } void * cxiGNPtoVP(void *vP) { return (void *)GNP_TO_VP((struct cxiNode_t *)vP); } /* Main routine of kproc */ static int kprocMain(void *argP) { cxiKProcData_t *kpdP = (cxiKProcData_t *)argP; /* Change our process name */ ENTER(0); current->comm[sizeof(current->comm) - 1] = '\0'; strncpy(current->comm, kpdP->nameP, sizeof(current->comm) - 1); /* Change parent of a kernel process so that when it exits, it won't * send a SIGCHLD signal to the process that created it, and it won't * be left as a zombie. */ DAEMONIZE(kpdP->nameP); /* Call the function specified by startKProc */ kpdP->func(kpdP); EXIT(0); return 0; } /* Create a new kernel process */ cxiPid_t cxiStartKProc(struct cxiKProcData_t *kpdP) { cxiPid_t pid = kernel_thread(kprocMain, kpdP, kpdP->kprocFlags); ENTER(0); kpdP->pid = pid > 0 ? 
pid : KPROC_FAILED_PID; TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX, "cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid); EXIT(0); return kpdP->pid; } void cxiStopKProc(struct cxiKProcData_t *kpdP) { cxiPid_t pid; ENTER(0); cxiBlockingMutexAcquire(&kpdP->lock); TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX, "cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid); if (!KPROC_RUNNING(kpdP)) { cxiBlockingMutexRelease(&kpdP->lock); EXIT(0); return; } pid = kpdP->pid; // Cache pid before signal/wait kpdP->terminate = true; cxiWaitEventSignal(&kpdP->kprocEvent); while (kpdP->pid != KPROC_UNASSIGNED_PID) cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0); cxiBlockingMutexRelease(&kpdP->lock); EXIT(0); } /*------------------------------------------------------------------- * logAssertFailed - Subroutine consolidating logGenIF() and * DoPanic() calls. *------------------------------------------------------------------*/ static char PanicMsgBuf[2048]; void cxiPanic(const char* panicStrP) { printk( GPFS_NOTICE "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP); TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP); #ifndef DISABLE_KERNEL_PANIC BUG(); #endif } static void DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode, Int32 reasonCode, char *dataStr) { const char *p; int bytesLeft; p = cxiStrrchr(filenameP, '/'); if (p == NULL) p = filenameP; else p += 1; sprintf(PanicMsgBuf, "%s:%d:%d:%d:", p, lineNum, retCode, reasonCode); bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf); if (dataStr) { strncat(PanicMsgBuf, dataStr, bytesLeft-1); bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf); } strncat(PanicMsgBuf, ":", bytesLeft-1); bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf); if (condP) strncat(PanicMsgBuf, condP, bytesLeft-1); cxiPanic(PanicMsgBuf); } #ifdef MODULE void logAssertFailed(UInt32 flags, /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */ char *srcFileName, /* __FILE__ */ UInt32 srcLineNumber, /* __LINE__ */ Int32 retCode, /* return code value */ Int32 reasonCode, /* normally errno */ UInt32 logRecTag, /* tag if have associated error log rec */ char *dataStr, /* assert data string */ char *failingExpr) /* expression that evaluated to false */ { int i; printk("GPFS logAssertFailed: %s file %s line %d\n", failingExpr, srcFileName, srcLineNumber); ENTER(0); TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1, "logAssertFailed: %s retCode %d reasonCode %d\n", failingExpr, retCode, reasonCode); TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2, "logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber); #ifndef GPFS_PRINTF /* fsync buffered lxtrace records */ trc_fsync(); #ifdef STOP_TRACE_ON_FAILURE /* Turn off tracing right after the failure occurs. This may only turn off tracing in the kernel. 
*/ for (i=0 ; iprevP->nextP = (elementP); \ (elementP)->prevP = (headP)->prevP; \ (headP)->prevP = (elementP); \ (elementP)->nextP = (headP); #define CXI_WAIT_LIST_REMOVE(elementP) \ (elementP)->prevP->nextP = (elementP)->nextP; \ (elementP)->nextP->prevP = (elementP)->prevP; /* Initialize abstract wait event with OS specific * initialization function */ void cxiWaitEventInit(cxiWaitEvent_t *weP) { spinlock_t *lockP = (spinlock_t *)&weP->lword; spin_lock_init(lockP); weP->waitList.nextP = weP->waitList.prevP = &weP->waitList; } Boolean cxiWaitEventHasWaiters(cxiWaitEvent_t *weP) { unsigned long flags; spinlock_t *lockP = (spinlock_t *)(weP->lword); Boolean rc; SPIN_LOCK_IRQ(lockP, flags); rc = (weP->waitList.nextP != &weP->waitList); SPIN_UNLOCK_IRQ(lockP, flags); return rc; } /* Do not add trace records. Some callers depend on not being * interrupted by the trace daemon. */ enum WakeType { wBroadcast, wSignal, wWakeOne }; static inline void doWakeup(cxiWaitEvent_t *wEventP, enum WakeType wtype, int wakeupRC) { unsigned long flags; spinlock_t *lockP = (spinlock_t *)(wEventP->lword); cxiWaitList_t *headP; cxiWaitList_t *tmpP; cxiWaitElement_t *wP; SPIN_LOCK_IRQ(lockP, flags); /* We wake up from the front back (FIFO semantics). * There's only one wait element per wake_queue_head_t so * record the return code and wake up the one element. */ headP = &wEventP->waitList; for (tmpP = headP->nextP; tmpP != headP; tmpP = tmpP->nextP) { wP = list_entry(tmpP, cxiWaitElement_t, waitList); wP->wakeupRC = wakeupRC; wake_up(&wP->qhead); if (wtype != wBroadcast) { /* The difference between wSignal and wWakeOne is that the latter guarantees that multiple wake up calls will each pick a different thread if more than one is waiting. With wSignal, if a thread is awakened but hasn't had a chance to run, then subsequent wake up calls might all wake the same thread. On AIX, the calling routine (e_wakeup_one) removes the waiter from the queue, unlike Linux where removal is done by the waiting thread when it wakes up. Nothing special has to be done on AIX to get the nWakeOne style of wakeup. Note: This is an inline routine and the wType argument is a compile-time constant, so the "if" tests in this routine are done by the compiler and do not generate any code. */ if (wtype == wWakeOne) { /* Move this entry to tail of list so that the next wakeup call will pick somebody else. */ CXI_WAIT_LIST_REMOVE(tmpP); CXI_WAIT_LIST_ADD(headP, tmpP); } break; } } SPIN_UNLOCK_IRQ(lockP, flags); } int cxiCopyIn(char *from, char *to, unsigned long size) { /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. */ ENTER(0); if (PROCESS_GROUP(current) == DaemonPGrp) __copy_from_user(to, from, size); else if (copy_from_user(to, from, size)) { EXIT_RC(0, EFAULT); return EFAULT; } EXIT(0); return 0; } int cxiCopyOut(char *from, char *to, unsigned long size) { int ignore; /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. 
*/ ENTER(0); if (PROCESS_GROUP(current) == DaemonPGrp) ignore = __copy_to_user(to, from, size); else if (copy_to_user(to, from, size)) { EXIT_RC(0, EFAULT); return EFAULT; } EXIT(0); return 0; } int cxiCopyInstr(char *from, char *to, unsigned long size, unsigned long *len) { long retval; ENTER(0); retval = strncpy_from_user(to, from, size); if ((retval > 0) && (retval <= size)) { *len = retval; EXIT(0); return 0; } *len = 0; if (retval < 0) retval = EFAULT; else retval = E2BIG; EXIT_RC(0, retval); return (int)retval; } long cxiSafeGetLong(long* from) { #if LINUX_KERNEL_VERSION >= 2060000 long tmp; (void)__get_user_nocheck(tmp, from, sizeof(long)); return tmp; #else return *from; #endif } int cxiSafeGetInt(int* from) { #if LINUX_KERNEL_VERSION >= 2060000 int tmp; __get_user_nocheck(tmp, from, sizeof(int)); return tmp; #else return *from; #endif } void cxiSafePutLong(long val, long* to) { #if LINUX_KERNEL_VERSION >= 2060000 __put_user_nocheck(val, to, sizeof(long)); #else *to = val; #endif } void cxiSafePutInt(int val, int* to) { #if LINUX_KERNEL_VERSION >= 2060000 __put_user_nocheck(val, to, sizeof(int)); #else *to = val; #endif } #ifdef GPFS_ARCH_X86_64 /* Check if 64-bit user process */ int cxiIS64U(char *addr) { #if LINUX_KERNEL_VERSION > 2060500 return !(test_thread_flag(TIF_IA32)); #else return !(current->thread.flags & THREAD_IA32); #endif } #endif int socket_aio_dequeue() { return -1; } /* Transfer data from buffer(s) in user space to or from a buffer in the kernel. */ int cxiUiomove(register char* kBufP, /* address of kernel buffer */ register unsigned long nBytes, /* #bytes to transfer */ Boolean toKernel, /* direction of xfer(read/write)*/ register struct cxiUio_t* uioP) /* user area description */ { register struct cxiIovec_t * iovP; unsigned long cnt; int rc; #ifdef TRACE_IO_DATA char* origKBufP = kBufP; int trcdata[4]; #endif int ignore; ENTER(0); TRACE4(TRACE_FOPS, 6, TRCID_CXISYSTEM_037, "cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n", kBufP, uioP, nBytes, toKernel); if (uioP->uio_resid <= 0) { EXIT_RC(0, ENOMEM); return ENOMEM; } rc = 0; if (uioP->uio_iovcnt == 1) { /* * Fastpath for most common case of iovcnt == 1. Saves a * few instructions. */ iovP = uioP->uio_iov; cnt = iovP->iov_len; if (cnt <= 0) { uioP->uio_iovcnt--; uioP->uio_iov++; uioP->uio_iovdcnt++; EXIT(0); return 0; } if (cnt > nBytes) cnt = nBytes; if (toKernel) { /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. Copies to * kernel address space also perform no validity check. */ if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) __copy_from_user(kBufP, (char *)iovP->iov_base, cnt); else if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt)) { EXIT_RC(0, EFAULT); return EFAULT; } } else { int spam; /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. Copies to * kernel address space also perform no validity check. 
*/ if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt); else if (copy_to_user((char *)iovP->iov_base, kBufP, cnt)) { EXIT_RC(0, EFAULT); return EFAULT; } } iovP->iov_base = (char *)iovP->iov_base + cnt; iovP->iov_len -= cnt; uioP->uio_resid -= cnt; uioP->uio_offset += cnt; #ifdef TRACE_IO_DATA if (cnt >= sizeof(trcdata)) memcpy(trcdata, origKBufP, sizeof(trcdata)); else { memset(trcdata, 0xAA, sizeof(trcdata)); memcpy(trcdata, origKBufP, cnt); } TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_039a, "uiomove exit 1: rc %d data %08X %08X %08X %08X\n", rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]); #else TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_039, "uiomove exit 1: rc %d\n", rc); #endif EXIT_RC(0, rc); return rc; } while (nBytes > 0 && uioP->uio_resid && rc == 0) { if (uioP->uio_iovcnt <= 0) { EXIT_RC(0, ENOMEM); return ENOMEM; } iovP = uioP->uio_iov; cnt = iovP->iov_len; if (cnt <= 0) { uioP->uio_iovcnt--; uioP->uio_iov++; uioP->uio_iovdcnt++; continue; } if (cnt > nBytes) cnt = nBytes; if (toKernel) { /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. Copies to * kernel address space also perform no validity check. */ if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) __copy_from_user(kBufP, (char *)iovP->iov_base, cnt); else if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt)) { EXIT_RC(0, EFAULT); return EFAULT; } } else { /* The daemon needs to bypass access checks since copy to * shared segment would inadvertantly fail. Copies to * kernel address space also perform no validity check. */ if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt); else if (copy_to_user((char *)iovP->iov_base, kBufP, cnt)) { EXIT_RC(0, EFAULT); return EFAULT; } } iovP->iov_base = (char *)iovP->iov_base + cnt; iovP->iov_len -= cnt; uioP->uio_resid -= cnt; uioP->uio_offset += cnt; kBufP += cnt; nBytes -= cnt; } #ifdef TRACE_IO_DATA cnt = kBufP - origKBufP; if (cnt >= sizeof(trcdata)) memcpy(trcdata, origKBufP, sizeof(trcdata)); else { memset(trcdata, 0xAA, sizeof(trcdata)); memcpy(trcdata, origKBufP, cnt); } TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_041a, "uiomove exit 2: rc %d data %08X %08X %08X %08X\n", rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]); #else TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_041, "uiomove exit 2: rc %d\n", rc); #endif EXIT_RC(0, rc); return rc; } /* Try to force some sanity checks at compile type */ /* TO DO: revise this to handle comparisons beyond equality/inequality */ /* STATIC_DBGASSERT(sizeof(spinlock_t), SPINLOCK_T_SIZE); */ /* A routine to check that the definitions in our cxiTypes.h * files are equivalent to the system definitions. The module * should not load if it receives an error from this routine. */ int cxiCheckTypes() { int rc = 0; ENTER(0); /* Make sure cxiBlockingMutex_t fits in the space provided. If not, the implementation of the cxiBlockingMutex... routines needs to use the embedded space to record a pointer to kmalloc'ed space holding the semaphore. */ if (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE) { printk("cxiCheckTypes: semaphore %ld > GPFS_LINUX_SEM_SIZE %ld\n", sizeof(struct semaphore), GPFS_LINUX_SEM_SIZE); rc = 1; } /* Size of spinlock_t is smaller for UP case with gcc 3.x, so just insure SPINLOCK_T_SIZE is large enough for both the UP and SMP case. 
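
   A compile-time form of the same size check could look like the sketch
   below (illustrative only; the shipped check stays at run time, and
   spinlockSizeCheck is a hypothetical name):

     typedef char spinlockSizeCheck[(sizeof(spinlock_t) <= SPINLOCK_T_SIZE) ? 1 : -1];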
*/
  if (sizeof(spinlock_t) > SPINLOCK_T_SIZE)
  {
    printk("cxiCheckTypes: spinlock_t %ld > SPINLOCK_T_SIZE %ld\n",
           sizeof(spinlock_t), SPINLOCK_T_SIZE);
    rc = 2;
  }

  /* Ensure that size of pid_t matches cxiThreadId (32-bits) */
  if (sizeof(pid_t) != sizeof(cxiThreadId))
  {
    printk("cxiCheckTypes: pid_t %ld != cxiThreadId %ld\n",
           sizeof(pid_t), sizeof(cxiThreadId));
    rc = 3;
  }

  if (rc > 0)
    TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES,
           "cxiCheckTypes: system type mismatch on type number %d!\n", rc);
  EXIT_RC(0, rc);
  return rc;
}

/* Routine to get current time of day in nanosecond format. */
int cxiGetTOD(cxiTimeStruc_t *tsP)
{
#if LINUX_KERNEL_VERSION >= 2060000
  struct timespec ts;
#else
  struct timeval tv;
#endif

  ENTER(0);
#if LINUX_KERNEL_VERSION >= 2060000
  ts = CURRENT_TIME;
  tsP->tv_sec = ts.tv_sec;
  tsP->tv_nsec = ts.tv_nsec;
#else
  /* This call returns microseconds so we fudge it to nanoseconds */
  do_gettimeofday(&tv);
  tsP->tv_sec = tv.tv_sec;
  tsP->tv_nsec = tv.tv_usec * 1000;
#endif

  EXIT(0);
  return 0;
}

Boolean cxiIsNFSThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Note comparison against a multibyte character constant (not a string
     constant).  Order of characters in word is reversed due to little-endian
     representation of integers. */
  if (* ((int*)&current->comm[0]) != 0x6473666e)  // 'dsfn'
    return false;
  if (* ((char*)&current->comm[4]) == '\0')
    return true;
  return (* ((int*)&current->comm[2]) == 0x00346473); // '4ds'
# else
  if ((strcmp(current->comm, "nfsd") == 0) ||
      (strcmp(current->comm, "nfsd4") == 0))
    return true;
  return false;
# endif
}

Boolean cxiIsLockdThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Note comparison against a multibyte character constant (not a string
     constant).  Order of characters in word is reversed due to little-endian
     representation of integers. */
  if ((* ((int*)&current->comm[0]) != 0x6b636f6c) |  // 'kcol'
      (* ((int*)&current->comm[2]) != 0x00646b63))   // ' dkc'
    return false;
  return * ((char*)&current->comm[5]) == '\0';
# else
  return (strcmp(current->comm, "lockd") == 0);
# endif
}

Boolean cxiIsNFS4Thread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Note comparison against a multibyte character constant (not a string
     constant).  Order of characters in word is reversed due to little-endian
     representation of integers. */
  if ((* ((int*)&current->comm[0]) != 0x6473666e) |  // 'dsfn'
      (* ((int*)&current->comm[2]) != 0x00346473))   // '4ds'
    return false;
  return * ((char*)&current->comm[5]) == '\0';
# else
  return (strcmp(current->comm, "nfsd4") == 0);
# endif
}

Boolean cxiIsKupdateThread()
{
#if LINUX_KERNEL_VERSION >= 2060000
  /* In 2.6 pdflush replaced kupdated and bdflush from 2.4 */
  return current_is_pdflush();
#else
  return (strcmp(current->comm, "kupdated") == 0);
#endif
}

#ifdef SMB_LOCKS
Boolean cxiIsSambaOrLockdThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Note comparison against a multibyte character constant (not a string
     constant).  Order of characters in word is reversed due to little-endian
     representation of integers. */
  Boolean rc = (((* ((int*)&current->comm[0]) == 0x64626d73) &  // 'dbms'
                 (* ((char*)&current->comm[4]) == '\0')) |
                ((* ((int*)&current->comm[0]) == 0x6b636f6c) &  // 'kcol'
                 (* ((int*)&current->comm[2]) == 0x00646b63))); // 'dkc'
  return rc;
# else
  return ((strcmp(current->comm, "smbd") == 0) |
          (strcmp(current->comm, "lockd") == 0));
# endif
}

Boolean cxiIsSambaThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Note comparison against a multibyte character constant (not a string
     constant).  Order of characters in word is reversed due to little-endian
     representation of integers. */
  Boolean rc = ((* ((int*)&current->comm[0]) == 0x64626d73) &  // 'dbms'
                (* ((char*)&current->comm[4]) == '\0'));
  return rc;
# else
  return (strcmp(current->comm, "smbd") == 0);
# endif
}
#endif

Boolean cxiIsGPFSThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  return (((* ((int*)&current->comm[0]) == 0x73666d6d) &   // 'sfmm'
           (* ((int*)&current->comm[2]) == 0x00647366)));  // 'dsf'
# else
  return (strcmp(current->comm, "mmfsd") == 0);
# endif
}

Boolean cxiIsKswapdThread()
{
#if LINUX_KERNEL_VERSION > 2060000
  /* On 2.6, there may be multiple kswapd processes, named kswapd0, kswapd1,
   * etc.  We don't have to depend on the process name to identify kswapd
   * processes on 2.6 though, there's a better way. */
  return current_is_kswapd();
#else
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  return ((* ((int*)&current->comm[0]) == 0x6177736b) &   // 'awsk'
          (* ((int*)&current->comm[3]) == 0x00647061));   // ' dpa'
# else
  return (strcmp(current->comm, "kswapd") == 0);
# endif
#endif
}

#ifdef INSTRUMENT_LOCKS
void InitBlockingMutexStats()
{
  memset(BlockingMutexStatsTable, 0, sizeof(BlockingMutexStatsTable));
}
#endif

/* Initialize a cxiBlockingMutex_t.  Instead of the DBGASSERT, this routine
   should kmalloc a struct semaphore if bmSem is too small. */
void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx)
{
  ENTER(0);
  DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE);
#ifdef INSTRUMENT_LOCKS
  DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES);
#endif  /* INSTRUMENT_LOCKS */

  TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT,
         "cxiBlockingMutexInit: mP 0x%lX idx %d\n", mP, bmNameIdx);
  init_MUTEX((struct semaphore *)mP->bmSem);
  mP->bmOwnerP = NULL;
  mP->lockNameIndex = bmNameIdx;
  EXIT(0);
}

/* Enter critical section, blocking this thread if necessary.  Mark this
   thread as the owner of the mutex before returning.
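
   Typical calling pattern (a sketch only; myMutex is a hypothetical
   cxiBlockingMutex_t initialized elsewhere with cxiBlockingMutexInit):

     cxiBlockingMutexAcquire(&myMutex);
     DBGASSERT(cxiBlockingMutexHeldByCaller(&myMutex));
     cxiBlockingMutexRelease(&myMutex);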
*/ void REGPARMS cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP) { ENTER(1); TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ, "cxiBlockingMutexAcquire: about to acquire 0x%lX type %d " "current 0x%lX currentOwner 0x%lX\n", mP, mP->lockNameIndex, current, mP->bmOwnerP); DBGASSERTRC(mP->bmOwnerP != (char *)current, PTR_TO_INT32(mP->bmOwnerP), PTR_TO_INT32(mP), 0); #ifdef INSTRUMENT_LOCKS BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires += 1; if (mP->bmOwnerP != NULL) BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts += 1; #endif down((struct semaphore *)mP->bmSem); mP->bmOwnerP = (char *)current; TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT, "cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP); EXIT(1); } /* Leave critical section and awaken waiting threads */ void REGPARMS cxiBlockingMutexRelease(cxiBlockingMutex_t* mP) { ENTER(1); TRACE4(TRACE_KLOCKL, 9, TRCID_BM_REL, "cxiBlockingMutexRelease: about to release 0x%lX type %d " "current 0x%lX currentOwner 0x%lX\n", mP, mP->lockNameIndex,current, mP->bmOwnerP); if (mP->bmOwnerP == (char *)current) { mP->bmOwnerP = NULL; up((struct semaphore *)mP->bmSem); } EXIT(1); } /* Free resources associated with this cxiBlockingMutex_t in preparation for freeing the storage it occupies */ void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP) { ENTER(0); TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM, "cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex); /* Verify that mutex is not held */ DBGASSERT(mP->bmOwnerP == NULL); DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1); EXIT(0); } /* Return true if a cxiBlockingMutex_t is held by the calling process */ Boolean cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP) { Boolean result; char* ownerP; cxiPid_t ownerPid; /* Cache bmOwnerP is case it changes to NULL */ ENTER(0); ownerP = mP->bmOwnerP; if (ownerP == NULL) result = false; else { cxiThreadPtrToThreadId(ownerP, &ownerPid); result = (current->pid == ownerPid); } TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017, "cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n", ownerP, result); EXIT_RC(0, result); return result; } /* Return true if a cxiBlockingMutex_t has one or more processes waiting on it */ Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP) { struct semaphore * semP = (struct semaphore *)mP->bmSem; Boolean result; ENTER(0); if ((void*)semP->wait.task_list.next != (void*)&semP->wait.task_list.next) result = true; else result = false; TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018, "cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n", mP, result); EXIT_RC(0, result); return result; } /* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or cxiWaitEventBroadcastRC. Drop the associated cxiBlockingMutex_t *mutexP while waiting, and reacquire it before returning. If INTERRUPTIBLE is set in waitFlags, waits interruptibly; otherwise, waits uninterruptibly. Returns THREAD_INTERRUPTED if interrupted before being woken up, THREAD_AWAKENED, if woken up by cxiWaitEventSignal or cxiWaitEventBroadcast, or the result value passed to cxiWaitEventWakeupResult, if woken up by cxiWaitEventWakeupResult. 
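
   Illustrative waiter loop (a sketch; stateMutex, stateEvent and
   conditionSatisfied() are hypothetical names, not part of this module):

     cxiBlockingMutexAcquire(&stateMutex);
     while (!conditionSatisfied())
     {
       int rc = cxiWaitEventWait(&stateEvent, &stateMutex, INTERRUPTIBLE);
       if (rc == THREAD_INTERRUPTED)
         break;
     }
     cxiBlockingMutexRelease(&stateMutex);

   The waking thread updates the shared state while holding the same mutex
   and then calls cxiWaitEventSignal(&stateEvent), or one of the broadcast
   variants to wake every waiter.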
*/ int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP, int waitFlags) { spinlock_t *lockP = (spinlock_t *)(weP->lword); unsigned long flags; cxiWaitElement_t waitElement; int count = 0; Boolean done; ENTER(0); TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER, "cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release " "mutex 0x%lX \n", weP, waitFlags, mutexP); /* Verify that caller is holding the mutex */ DBGASSERTRC(mutexP->bmOwnerP == (char *)current, PTR_TO_INT32(mutexP->bmOwnerP), PTR_TO_INT32(mutexP), 0); /* initialize our wait element */ init_waitqueue_head(&waitElement.qhead); init_waitqueue_entry(&waitElement.qwaiter, current); __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter); waitElement.wakeupRC = 0; /* update our task state to not running any more */ if (waitFlags & INTERRUPTIBLE) current->state = TASK_INTERRUPTIBLE; else current->state = TASK_UNINTERRUPTIBLE; /* add our wait element to the end of the wait list */ SPIN_LOCK_IRQ(lockP, flags); CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList); SPIN_UNLOCK_IRQ(lockP, flags); /* Release the mutex. Note: calling cxiBlockingMutexRelease here is problematic, because it makes trace calls, which may block the current process, which would overwrite the task state (current->state) we just updated. A way around this would be to move out task state update to after the call to cxiBlockingMutexRelease, but then, before calling schedule(), we would have to re-acquire the wait-list lock and check wakeupRC to see whether somebody has already woken us up since we released the mutex. Since there is a trace at the top of this routine, we don't need the one in cxiBlockingMutexRelease; hence, just do the release right here. */ mutexP->bmOwnerP = NULL; up((struct semaphore *)mutexP->bmSem); again: /* call the scheduler */ schedule(); /* Remove ourself from the wait list ... except: Even though we may enter uninterrubtible sleep, this sleep can in fact be interrupted in at least two scenarios: 1) page_alloc code may call wakeup_kswapd(). This should be a very rare event with the current code, since we make an effort to avoid blocking kswapd. 2) While signals are supposed to be ignored during uninterruptible sleep, it turns out that some signals, e.g. SIGSEGV and SIGBUS, cause us to wake up. It doesn't look like the signal has been delivered yet, but sleep is interrupted. The signal will be delivered later (probably when exiting kernel). Our callers can't handle unexpected return from uninterruptible sleep. In either of the two cases above, it should be safe to go back to sleep and wait to be woken up properly. */ SPIN_LOCK_IRQ(lockP, flags); if (waitElement.wakeupRC == 0 && !(waitFlags & INTERRUPTIBLE)) { TRACE3N(TRACE_KLOCKL, 1, TRCID_CXISYSTEM_EVENT_WAIT_INTERRUPTED, "cxiWaitEventWait: interrupted weP 0x%lX mutexP 0x%lX rc %d\n", weP, mutexP, waitElement.wakeupRC); current->state = TASK_UNINTERRUPTIBLE; done = false; } else { CXI_WAIT_LIST_REMOVE(&waitElement.waitList); done = true; } SPIN_UNLOCK_IRQ(lockP, flags); if (!done) goto again; /* re-acquire the mutex */ cxiBlockingMutexAcquire(mutexP); TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT, "cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n", weP, mutexP, waitElement.wakeupRC); /* A zero wakeup code means we were interrupted rather than woken up */ EXIT(0); if (waitElement.wakeupRC != 0) return waitElement.wakeupRC; else return THREAD_INTERRUPTED; } /* Wake up one thread waiting on this cxiWaitEvent_t. 
   Must not sleep */
void cxiWaitEventSignal(cxiWaitEvent_t* weP)
{
  /* ENTER(0); */
  TRACE1N(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL,
          "cxiWaitEventSignal: weP 0x%lX\n", weP);

  doWakeup(weP, wSignal, THREAD_AWAKENED);  /* wake up one */
  /* EXIT(0); */
}

/* Wake up one thread waiting on this cxiWaitEvent_t.  This is the same as
   cxiWaitEventSignal(), except this routine guarantees that multiple wake
   up calls will each pick a different thread if more than one is waiting. */
void cxiWaitEventWakeupOne(cxiWaitEvent_t* weP)
{
  ENTER(0);
  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE,
         "cxiWaitEventWakeupOne: weP 0x%lX\n", weP);

  doWakeup(weP, wWakeOne, THREAD_AWAKENED);  /* wake up one */
  EXIT(0);
}

/* Wake up all threads waiting on this cxiWaitEvent_t */
void cxiWaitEventBroadcast(cxiWaitEvent_t* weP)
{
  ENTER(0);
  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST,
         "cxiWaitEventBroadcast: weP 0x%lX\n", weP);

  doWakeup(weP, wBroadcast, THREAD_AWAKENED);  /* wake up all */
  EXIT(0);
}

/* Wake up all threads waiting on this cxiWaitEvent_t and cause them to
   return rc from their cxiWaitEventWait calls. */
void cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc)
{
  ENTER(0);
  TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC,
         "cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc);

  doWakeup(weP, wBroadcast, rc);  /* wake up all */
  EXIT_RC(0, rc);
}

/* alloc big memory area */
void *
cxiBigMalloc(int size)
{
  void *ptr;

  ENTER(0);
  ptr = vmalloc(size);
#ifdef MALLOC_DEBUG
  MallocDebugNew(ptr, size, 2);
#endif

  EXIT(0);
  return ptr;
}

/* free big memory area */
void
cxiBigFree(char *ptr)
{
  ENTER(0);
#ifdef MALLOC_DEBUG
  MallocDebugDelete(ptr);
#endif

  EXIT(0);
  vfree(ptr);
}

#ifdef SMB_LOCKS
/* Determine if current process has this file open */
void *
cxiCheckOpen(struct cxiNode_t* cnP)
{
  int count;
  int i;
  struct file** fdList;
  struct file* fileP;
  struct inode* inodeP;

  ENTER(0);
#if LINUX_KERNEL_VERSION >= 2061300
  count = current->files->fdt->max_fds;
  fdList = current->files->fdt->fd;
#else
  count = current->files->max_fds;
  fdList = current->files->fd;
#endif

  inodeP = GNP_TO_VP(cnP);
  TRACE3(TRACE_VNODE, 9, TRCID_CXICHECKOPEN_ENTRY,
         "cxiCheckOpen: entry. %d files in fd list. Checking for inode %d "
         "at 0x%x", count, inodeP->i_ino, inodeP);

  for (i = 0; i < count; i++)
  {
    fileP = fdList[i];
    if (fileP)
    {
      if (fileP->f_dentry->d_inode == inodeP)
      {
        TRACE1(TRACE_VNODE, 9, TRCID_CXICHECKOPEN_FOUND,
               "cxiCheckOpen: found open file. vinfoP 0x%x",
               fileP->private_data);
        EXIT(0);
        return fileP->private_data;
      }
    }
  }

  EXIT(0);
  return NULL;
}

int cxiBreakOplock(void *breakArgP, int oplockNew)
{
  /* On Linux, we use its kernel oplock support.  The get_lease()
   * call is the operation to revoke conflicting leases.
   */
  int rc;
  ENTER(0);

  /* O_NONBLOCK: prevents the thread from waiting for the lease return.
   * In the case of a Samba thread, we only want to get EWOULDBLOCK
   * back if the conflict is held within Samba itself.  If a wait is
   * needed, breakSMBOplock will invoke cxiWaitForBreak.
   */

  /* Linux op to revoke conflicting leases */
  rc = abs(REVOKE_LEASE((struct inode *)breakArgP,
                        (cxiIsSambaThread()? 0: O_NONBLOCK) |
                        ((oplockNew==smbOplockShared)?
FMODE_READ: FMODE_WRITE))); TRACE3(TRACE_VNODE, 4,TRCID_CXIBREAKOPLOCK, "cxiBreakOplock: exit rc %d inode 0x%lX oplock %d\n", rc, breakArgP, oplockNew); EXIT(0); return rc; } DECLARE_WAIT_QUEUE_HEAD(oplock_break_queue); /* No initialization required on Linux */ int cxiInitBreakQ() { return 0; } /* No initialization required on Linux */ int cxiTermBreakQ() { return 0; } /* Send the notification that the oplock break completed */ int cxiSendBreakMsg(void *ofP) { ENTER(0); /* There is only one oplock_break_queue, and no means to pass the ofP back to * the waiters. This will wake all of them up and they will recheck their * oplock states and wait again if necessary (with a timeout). */ wake_up_interruptible(&oplock_break_queue); TRACE1(TRACE_SMB, 3, TRCID_SEND_BREAK, "cxiSendBreakMsg: ofP 0x%lX\n", ofP); EXIT(0); return 0; } /* Suspend the caller until either the oplock break completes, or the timeout * is reached. */ int cxiWaitForBreak(void *fileArgP, int oplockCurrent, int timeoutSeconds) { DECLARE_WAITQUEUE(wait, current); signed long timeout; ENTER(0); TRACE3(TRACE_SMB, 5, TRCID_BREAKWAIT, "cxiWaitForBreak: file 0x%lX, oplockCurrent %d timeoutSeconds %d\n", fileArgP, oplockCurrent, timeoutSeconds); add_wait_queue(&oplock_break_queue, &wait); timeout = timeoutSeconds * HZ; while (timeout > 0) { set_current_state(TASK_INTERRUPTIBLE); /* Check whether the oplock has been released or downgraded */ if (gpfs_ops.SMBGetOplockState(fileArgP) < oplockCurrent) break; timeout = schedule_timeout(timeout); } set_current_state(TASK_RUNNING); remove_wait_queue(&oplock_break_queue, &wait); TRACE0(TRACE_SMB, 5, TRCID_BREAKWAIT_EXIT, "cxiWaitForBreak exit\n"); EXIT(0); return 0; } #endif /* Get the address of the first byte not addressible by processes */ UIntPtr cxiGetKernelBoundary() { return GPFS_KERNEL_OFFSET; } /* Return true if this process holds the big kernel lock (BKL) */ Boolean cxiHoldsBKL() { return current->lock_depth >= 0; } /* Tell the OS that this thread is involved in handling VM page-out requests and should not be blocked waiting for page allocation. Return true if successful. */ Boolean cxiSetPageoutThread() { if (current->flags & PF_MEMALLOC) return false; current->flags |= PF_MEMALLOC; return true; } /* Tell the OS that this thread is no longer involved in handling VM page-out requests. */ void cxiClearPageoutThread() { current->flags &= ~PF_MEMALLOC; } /* Yield the CPU to allow other processes to run */ void cxiYield() { ENTER(0); schedule(); EXIT(0); } /* Linux filldir has changed signatures depending on kernel level. * We always pass a 64bit offset from the GPFS layer. 
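 *
 * Sketch of how a readdir path could package the kernel-supplied callback
 * for this shim (illustrative only; kernelFilldir and kernelCookie are
 * hypothetical names):
 *
 *   cxiFillDirArg_t fillArg;
 *   fillArg.fnP  = (void *)kernelFilldir;   // the kernel's filldir_t
 *   fillArg.argP = kernelCookie;            // the kernel's opaque cookie
 *   int rc = cxiFillDir(&fillArg, nameP, namelen, dirOffset, ino);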
*/ int cxiFillDir(void *vargP, const char *nameP, int namelen, offset_t offset, ino_t ino) { int result; cxiFillDirArg_t *fillDirArgP = (cxiFillDirArg_t *)vargP; filldir_t fnP = (filldir_t)fillDirArgP->fnP; ENTER(0); result = (*fnP)(fillDirArgP->argP, nameP, namelen, (loff_t)offset, ino, 0 /* DT_UNKNOWN */); EXIT_RC(0, result); return result; } #ifdef DISK_LEASE_DMS static struct timer_list DMSTimer[MAX_DMS_INDEX]; static int (*DMSgetNIOsInProgressP)(int); #define PANIC_FOR_REAL 1 static void cxiDMSExpired(unsigned long data) { int idx = data; int nIOs = DMSgetNIOsInProgressP(idx); /* ENTER(0); */ /* This code is executed on the interrupt level -- can't use tracing */ printk("GPFS Deadman Switch timer [%d] has expired; IOs in progress: %d\n", idx, nIOs); #ifdef PANIC_FOR_REAL if (nIOs != 0) panic("GPFS Deadman Switch timer has expired, and there are still" " %d outstanding I/O requests\n", nIOs); #endif } /* Start dead man switch, with the timeout specified by the delay argument (in seconds). */ void cxiStartDMS(int idx, int delay, int (*funcP)(int)) { unsigned long njiffies = delay * HZ; /* Only allow the daemon or other root users to make this kernel call */ if (!cxiIsSuperUser()) return; ENTER(0); /* There can be only one timer active at any given moment */ if (timer_pending(&DMSTimer[idx])) del_timer(&DMSTimer[idx]); init_timer(&DMSTimer[idx]); DMSTimer[idx].expires = jiffies + njiffies; DMSTimer[idx].function = cxiDMSExpired; DMSTimer[idx].data = idx; /* save the pointer to nIOsInProgress to a static var */ DMSgetNIOsInProgressP = funcP; add_timer(&DMSTimer[idx]); TRACE3(TRACE_DLEASE, 2, TRCID_DMS_STARTED, "DMS timer [%d] started, delay %d, time %d\n", idx, delay, jiffies/HZ); EXIT(0); } void cxiStopDMS(int idx) { /* Only allow the daemon or other root users to make this kernel call */ if (!cxiIsSuperUser()) return; ENTER(0); if (timer_pending(&DMSTimer[idx])) del_timer(&DMSTimer[idx]); TRACE2(TRACE_DLEASE, 2, TRCID_DMS_STOPPED, "DMS timer [%d] stopped, time %d\n", idx, jiffies/HZ); EXIT(0); } /* dummy init routine. Since on Linux the timer is stored in a static memory, there's nothing to be done */ int cxiInitDMS(void) { return 0; } void cxiShutdownDMS(void) { int i; ENTER(0); for (i = 0; i < MAX_DMS_INDEX; i++) cxiStopDMS(i); EXIT(0); } #endif /* DISK_LEASE_DMS */ void cxiSetBit(unsigned long *flagP, int flag_bit) { set_bit(flag_bit,flagP); } void cxiClearBit(unsigned long *flagP, int flag_bit) { clear_bit(flag_bit,flagP); } Boolean cxiTestBit(unsigned long *flagP, int flag_bit) { return test_bit(flag_bit,flagP); } /* In order to setup our termination callback routine (gpfs_f_cleanup) * we create a dummy file and add it to our file table. Then, upon * process termination, the release file operation will be called in * order to close the file. The only operation we define for this * dummy file is release (gpfs_f_cleanup). */ int cxiRegisterCleanup() { int code = 0, rc = 0; struct inode *iP = NULL; struct file *fileP = NULL; struct dentry *dentryP = NULL; extern int cleanupFD; extern struct super_block *shutdownSuperP; /* We record the daemon's process group because certain * checks on cxiCopyIn/cxiCopyOut are bypassed for the daemon. 
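 *
 * For reference, the bypass in cxiCopyIn/cxiCopyOut reduces to the pattern
 * below (a sketch of the existing logic, not new behavior):
 *
 *   if (PROCESS_GROUP(current) == DaemonPGrp)
 *     __copy_from_user(to, from, size);       // trusted daemon path
 *   else if (copy_from_user(to, from, size))
 *     return EFAULT;                          // normal checked path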
*/ ENTER(0); DaemonPGrp = PROCESS_GROUP(current); /* Make sure we only create one file */ if (cleanupFD) { EXIT_RC(0, EEXIST); return EEXIST; } DBGASSERT(shutdownSuperP != NULL); /* Allocate an inode struct */ iP = NEW_INODE(shutdownSuperP); if (!iP) { code = 1; rc = ENOMEM; goto xerror; } iP->i_mode = S_IFREG; /* Allocate an available file descriptor */ cleanupFD = get_unused_fd(); if (cleanupFD < 0) { code = 2; rc = ENFILE; goto xerror; } /* Allocate a file struct */ fileP = get_empty_filp(); if (!fileP) { code = 3; rc = ENFILE; goto xerror; } /* Allocate a dentry sruct */ dentryP = dget(d_alloc_root(iP)); if (!dentryP) { code = 4; rc = ENOMEM; goto xerror; } /* Initialize and chain our file sructure */ fileP->f_dentry = dentryP; fileP->f_op = &gpfs_cleanup_fops; fileP->f_flags = O_RDONLY; atomic_set(&fileP->f_count, 1); /* Just chain it on the current root mount. When * the file is closed its fput() will decrement * the mount count (hence the mntget here) */ fileP->f_vfsmnt = mntget(current->fs->rootmnt); /* Install the descriptor so it gets "closed" upon our termination */ fd_install(cleanupFD, fileP); /* Set FD_CLOEXEC so that forked processes (like mmfsup.scr) do not * inherrit this descriptor. We want the cleanup routine to be run * when the last mmfsd process terminates. */ #if LINUX_KERNEL_VERSION >= 2061300 FD_SET(cleanupFD, current->files->fdt->close_on_exec); #else FD_SET(cleanupFD, current->files->close_on_exec); #endif /* Once the descriptor for this dummy file is added to our file table, * it is inherrited by all the processes of the daemon. As each * terminates, the files->count is decremented and on the last process * termination all the descriptors will be closed by filp_close. * * The one catch here is that our file table is inherrited by the * kernel threads we start as well as user processes. This would * cause a problem in that daemon termination does not include these * kernel threads which aren't killed until restart (and therefore * the file is never closed). In order for our operation to be * driven at daemon termiation, we must remove the file table from * these kernel threads. This is done in via cxiReparent() by * the mmap pager kproc. */ xerror: TRACE4(TRACE_VNODE, 1, TRCID_CXIREGISTERCLEANUP_EXIT, "cxiRegisterCleanup: fd %d iP %X rc %d code %d\n", cleanupFD, iP, rc, code); if (rc) { if (dentryP); dput(dentryP); if (cleanupFD) put_unused_fd(cleanupFD); if (fileP) #if LINUX_KERNEL_VERSION > 2060900 fput(fileP); #else put_filp(fileP); #endif if (iP) iput(iP); cleanupFD = 0; } EXIT_RC(0, rc); return rc; } #ifdef NFS4_ACL /* Linux routines to be called when processing NFSv4 audit/alarm ACL entries */ int cxiAuditWrite(int numargs, ...) 
{ return ENOSYS; } #endif /* NFS4_ACL */ /* Currently no OS specific VFS initialization for Linux */ int cxiInitVFS(int vfsType) { return 0; } UIntPtr cxiGetKernelStackSize() { return (UIntPtr)THREAD_SIZE; } #if defined(DMAPI) || (SANERGY) void cxiPathRel(void *ndP) { DBGASSERT( ndP != NULL); path_release( (struct nameidata *) ndP); cxiFreeUnpinned(ndP); } int cxiPathToVfsP(void **privVfsPP, char *kpathname, void **ndPP, void **cnPP, Boolean traverseLink) { struct gpfsVfsData_t *privVfsP = NULL; struct nameidata *ndP; struct inode * iP; cxiNode_t *cnP; int rc = 0; Boolean rel = false; int code = 0; *ndPP = NULL; *privVfsPP = NULL; ENTER(0); if (kpathname == NULL) { code = 1; rc = EINVAL; goto xerror; } ndP = (struct nameidata *)cxiMallocUnpinned(sizeof(struct nameidata)); if (ndP == NULL) { code = 2; rc = ENOMEM; goto xerror; } /* For DMAPI, this is called by dm_path_to_handle or dm_path_to_fshandle, * According to dmapi documentation, we should return the symbolic link * itself instead of the object that link references. * so here we need to use the function which does not traverse the link */ if (!traverseLink) rc = user_path_walk_link(kpathname, ndP); else rc = user_path_walk(kpathname, ndP); if (rc) { rc = -rc; code = 3; goto xerror; } rel = true; iP = ndP->dentry->d_inode; DBGASSERT(iP != NULL); if (!GPFS_TYPE(iP)) { code = 4; rc = EINVAL; goto xerror; } privVfsP = VP_TO_PVP(iP); if (privVfsP == NULL) { code = 5; rc = ENOENT; } cnP = VP_TO_CNP(iP); *privVfsPP = (void *)privVfsP; *ndPP = (void *)ndP; if (cnPP != NULL) *cnPP = (void *)cnP; xerror: if (rc && ndP) { if (rel) cxiPathRel(ndP); else cxiFreeUnpinned(ndP); } EXIT_RC(0, rc); return rc; } void cxiSetCred(void *eCredPP) { ext_cred_t *eCredP = (ext_cred_t *)eCredPP; setCred(eCredP); } #endif /* DMAPI or SANERGY */ #ifdef KSTACK_CHECK /* Kernel stack checking: for each active thread that is making subroutine calls in the kernel, allocate a stack_history_t. Within each stack_history_t, create a frame_desc_t for each level of subroutine call. Two lists of frame_desc_t's are maintained: one for the current call stack, and one for the deepest call stack seen so far for this thread. Upon exit from the lowest-level routine, check whether the maximum stack depth threshhold has been exceeded. If it has, print the traceback of the maximum stack usage. Keep hashes of the tracebacks printed to avoid printing the same traceback more than once. Since cxiTraceExit is not called for every routine exit, maintenance of call chains is not exact; a routine entry with stackUsed less than the current entry implies return of the previous routine. Note that these routines cannot call any other routine that has ENTER/EXIT macros inside of it, to avoid recursion. */ /* Maximum size of of a stack frame before it is considered large enough to complain about */ #define STACK_LIMIT_WARNING (THREAD_SIZE - (THREAD_SIZE/3) ) /* Description of one level of a call stack */ typedef struct frame_desc { /* Function name and file name containing the function */ const char * fdFuncNameP; const char * fdFileNameP; /* Pointer to frame_desc of caller, or NULL if this is the first frame. Also used to link free frame descriptors together on the shFreeHeadP free list. */ struct frame_desc * fdCallerP; /* Line number near the beginning of fdFuncNameP */ int fdLineNum; /* Total stack usage up to and including this routine */ int fdStackUsed; /* Reference count for this frame_desc_t. Can be 2 if this descriptor is reachable from both shCurrentP and shMaxP. 
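     For example (explanatory note added here): a frame that sits on both
     the current chain and the max-depth chain carries fdRef == 2 and must
     be fdDiscard()ed twice before it returns to the shFreeHeadP free list;
     a frame reachable from only one of the chains carries fdRef == 1.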
*/ int fdRef; } frame_desc_t; /* Each stack_history is only used by one thread, so no locking is needed within a stack_history. This is allocated as a single page. */ typedef struct stack_history { /* ID of thread to which this stack_history_t belongs */ cxiThreadId shThreadId; /* Bucket index in historyHash that points to this stack_history_t, or -1 if this stack_history_t is on an overflow list */ int shBucketNum; /* Next stack_history_t in same hash overflow list or on free list */ struct stack_history * shNextP; /* Pointer to the frame descriptor for the routine that most recently called fdEnter without a matching fdExit. Following the fdCallerP pointers through these frame descriptors gives the current callback chain. */ frame_desc_t * shCurrentP; /* Pointer to the frame descriptor that had the maximum stack usage seen thus far for this thread. Following the fdCallerP pointers through these frame descriptors gives the callback chain with maximal stack usage. */ frame_desc_t * shMaxP; /* Head of list of free frame_desc_t's */ frame_desc_t * shFreeHeadP; /* Area that holds frame_desc_t's. These will be linked together and put on the list shFreeHeadP. */ #define SH_PREFIX_LEN (sizeof(cxiThreadId) + \ sizeof(int) + \ sizeof(struct stack_history *) + \ 3*sizeof(frame_desc_t *)) #define SH_NFRAMES ((PAGE_SIZE-SH_PREFIX_LEN)/sizeof(frame_desc_t)) frame_desc_t shFrames[SH_NFRAMES]; } stack_history_t; /* Global structures */ struct { /* Global flag controlling whether kernel stack checking is enabled. Initially false; set true during kernel module initialization, then set false again during kernel module termination. */ Boolean shActive; /* Mutex protecting updates to the variables that follow. This cannot be a cxiBlockMutex_t because then the stack checking routines would get called recursively. */ struct semaphore shMutex; /* List of free stack_history_t's and count of how many free entries there are. Excess stack_history_t's beyond a threshhold are freed back to the operating system. */ stack_history_t * freeHeadP; int nFree; #define MAX_FREE_STACK_HISTORIES 16 /* Hash table of active stack_history_t's. To find the entry for a particular thread, hash its thread id to a bucket. If any of the entries in bucket[] match the desired thread id, the pointer to the stack_history_t can be returned without acquiring any locks. If the bucket does not contain the desired thread id, look for it on the overflow list under protection of shMutex. */ #define HISTORY_HASH_SIZE 64 #define HISTS_PER_BUCKET 3 struct { struct { stack_history_t * historyP; cxiThreadId threadId; } bucket[HISTS_PER_BUCKET]; stack_history_t * overflowP; } historyHash[HISTORY_HASH_SIZE]; /* List of hash values for tracebacks that have already been printed. Used to avoid printing the same traceback more than once. Nothing is ever deleted from this table, so to find an entry start searching at its hash value and continue until the entry is found or an empty slot is encountered. The total occupancy of the table is limited to MAX_TRACEBACKS to restrict the amount of searching that will be required, and to guarantee that searches will terminate. */ #define TB_HASH_SIZE 64 #define MAX_TRACEBACKS 32 unsigned int tracebackHash[TB_HASH_SIZE]; int nTracebackHashEntries; } SHG; /* Private version of DBGASSERT used only within stack checking code. Cannot use DBGASSERT without risking recursion. 
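   (Presumably because the regular assertion path itself runs through the
   ENTER/EXIT tracing machinery, a failure inside this code could otherwise
   re-enter the stack checker; SH_ASSERT below therefore calls printk and
   DoPanic directly.)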
 */
#ifdef DBGASSERTS
#define SH_ASSERT(_ex) \
  if (!(_ex)) { \
    printk("GPFS stack checking assert failed: " # _ex " file %s line %d\n", \
           __FILE__, __LINE__); \
    DoPanic(# _ex, __FILE__, __LINE__, 0, 0, ""); \
  } else ((void)0)
#else
#define SH_ASSERT(_ex) ((void)0)
#endif

/* Initialize and enable stack depth checking */
void shInit()
{
  /* Clear stack checking globals */
  cxiMemset(&SHG, 0, sizeof(SHG));

  /* Init mutex */
  init_MUTEX(&SHG.shMutex);

  /* Turn on stack depth checking and make sure the change is visible */
  SHG.shActive = true;
  wmb();
}

/* Turn off stack depth checking and free all allocated memory.  This does
   not have to return the global state to what it was when the module was
   first loaded, since it will not be used again. */
void shTerm()
{
  int h;
  int b;
  stack_history_t * shP;
  stack_history_t * shNextP;

  /* Turn off stack depth checking and make sure the change is visible */
  SHG.shActive = false;
  wmb();

  /* Get and then release mutex.  This ensures that a thread that is in
     the middle of writing a traceback finishes writing it before we free
     the data structures it was using. */
  /* ?? although there could be another thread waiting for the mutex ... */
  down(&SHG.shMutex);
  up(&SHG.shMutex);

  /* Wait briefly to allow threads in the middle of the stack checking
     code to finish what they are doing */
  /* ?? Of course, this is not really safe, but this is debugging code,
     right? */
  schedule_timeout(HZ/2);

  /* Terminate mutex */
  // nothing to do

  /* Free all stack_history_t's on the free list */
  shP = SHG.freeHeadP;
  while (shP != NULL)
  {
    shNextP = shP->shNextP;
    kfree(shP);
    shP = shNextP;
  }

  /* Free all stack_history_t's in the hash table */
  for (h=0 ; h<HISTORY_HASH_SIZE ; h++)
  {
    for (b=0 ; b<HISTS_PER_BUCKET ; b++)
      if (SHG.historyHash[h].bucket[b].historyP != NULL)
        kfree(SHG.historyHash[h].bucket[b].historyP);
    shP = SHG.historyHash[h].overflowP;
    while (shP != NULL)
    {
      shNextP = shP->shNextP;
      kfree(shP);
      shP = shNextP;
    }
  }
}

/* Allocate and initialize a new stack_history_t */
static stack_history_t * shAllocInit()
{
  stack_history_t * shP;
  int f;

  up(&SHG.shMutex);
  shP = (stack_history_t *) kmalloc(sizeof(stack_history_t), GFP_KERNEL);
  SH_ASSERT(shP != NULL);
  down(&SHG.shMutex);

  cxiMemset(shP, 0, sizeof(stack_history_t));
  for (f=0 ; f<=SH_NFRAMES-2 ; f++)
    shP->shFrames[f].fdCallerP = &shP->shFrames[f+1];
  shP->shFreeHeadP = &shP->shFrames[0];
  return shP;
}

/* Get a stack_history_t off the free list or build a new one */
static stack_history_t * shGet()
{
  stack_history_t * shP;

  /* Use free list if one is available there */
  shP = SHG.freeHeadP;
  if (shP != NULL)
  {
    SHG.freeHeadP = shP->shNextP;
    SHG.nFree -= 1;
    return shP;
  }

  /* Make a new one if necessary */
  return shAllocInit();
}

/* Free a stack_history_t.  Put it on the free list if there are not
   already too many free, or else free it back to the operating system.
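   ("Too many" means SHG.nFree has already reached MAX_FREE_STACK_HISTORIES;
   past that point the structure is handed back to the kernel with kfree()
   rather than being cached on SHG.freeHeadP.)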
 */
static void shPut(stack_history_t * shP)
{
  int h;
  int b;
  stack_history_t ** shPrevPP;
  stack_history_t * p;

  /* Both call stacks should be empty */
  SH_ASSERT(shP->shCurrentP == NULL);
  SH_ASSERT(shP->shMaxP == NULL);

  /* Must hold mutex while changing the hash table */
  down(&SHG.shMutex);

  /* Clear pointer to this stack_history_t from the hash table */
  h = ((int)shP->shThreadId) & (HISTORY_HASH_SIZE-1);
  b = shP->shBucketNum;
  if (b != -1)
  {
    SH_ASSERT(SHG.historyHash[h].bucket[b].historyP == shP);
    SHG.historyHash[h].bucket[b].historyP = NULL;
    SHG.historyHash[h].bucket[b].threadId = 0;
  }
  else
  {
    shPrevPP = &SHG.historyHash[h].overflowP;
    p = *shPrevPP;
    while (p != NULL)
    {
      if (p == shP)
      {
        *shPrevPP = shP->shNextP;
        break;
      }
      shPrevPP = &p->shNextP;
      p = *shPrevPP;
    }
  }

  /* If not too many already free, add to free list */
  if (SHG.nFree < MAX_FREE_STACK_HISTORIES)
  {
    shP->shNextP = SHG.freeHeadP;
    SHG.freeHeadP = shP;
    SHG.nFree += 1;
    up(&SHG.shMutex);
    return;
  }

  /* Otherwise, really free it */
  up(&SHG.shMutex);
  kfree(shP);
}

/* Find the stack_history_t for the current thread, or allocate one if one
   does not already exist */
static stack_history_t * shFind()
{
  stack_history_t * shP;
  cxiThreadId id = current->pid;
  int h = ((int)id) & (HISTORY_HASH_SIZE-1);
  int b;

  /* Look at all entries within the bucket given by the hash of the
     thread ID.  No locking needs to be done for this search. */
  for (b=0 ; b<HISTS_PER_BUCKET ; b++)
    if (SHG.historyHash[h].bucket[b].threadId == id)
      return SHG.historyHash[h].bucket[b].historyP;

  /* Not found in the bucket; search the overflow list under protection
     of the mutex */
  down(&SHG.shMutex);
  shP = SHG.historyHash[h].overflowP;
  while (shP != NULL)
  {
    if (shP->shThreadId == id)
      goto exit;
    shP = shP->shNextP;
  }

  /* No stack_history_t for this thread yet.  Get one off the free list
     or build one. */
  shP = shGet();
  shP->shThreadId = id;
  shP->shNextP = NULL;

  /* Find a slot for the new stack_history_t in the hash table */
  for (b=0 ; b<HISTS_PER_BUCKET ; b++)
    if (SHG.historyHash[h].bucket[b].historyP == NULL)
    {
      SHG.historyHash[h].bucket[b].historyP = shP;
      SHG.historyHash[h].bucket[b].threadId = id;
      shP->shBucketNum = b;
      goto exit;
    }

  /* No slots available; add new stack_history_t to overflow list */
  shP->shBucketNum = -1;
  shP->shNextP = SHG.historyHash[h].overflowP;
  SHG.historyHash[h].overflowP = shP;

exit:
  /* Release mutex before returning */
  up(&SHG.shMutex);
  return shP;
}

/* Allocate a frame descriptor within the given stack_history_t.  This
   cannot be allowed to fail, so if there are no more free descriptors,
   throw away the bottom frame descriptor and return that.  The reference
   count of the frame descriptor that is returned is undefined. */
static frame_desc_t * fdGet(stack_history_t * shP)
{
  frame_desc_t * fdP;
  frame_desc_t ** fdPrevPP;
  int prevRef;

  /* Look on the free list within the stack_history_t */
  fdP = shP->shFreeHeadP;
  if (fdP != NULL)
  {
    shP->shFreeHeadP = fdP->fdCallerP;
    return fdP;
  }

  /* No free descriptors; first try stealing one off the bottom of the
     current call stack */
  fdP = shP->shCurrentP;
  if (fdP != NULL)
  {
    /* Find the bottom entry of the current call stack */
    fdPrevPP = &shP->shCurrentP;
    prevRef = 1;
    while (fdP->fdCallerP != NULL)
    {
      fdPrevPP = &fdP->fdCallerP;
      prevRef = fdP->fdRef;
      fdP = *fdPrevPP;
    }

    /* Remove the bottom entry of the current call stack */
    *fdPrevPP = NULL;

    /* Reduce the reference count on the entry just removed.  The
       reference count decreases by the reference count of the frame that
       used to point to *fdP.  If *fdP is no longer referenced, no further
       work is needed.  If *fdP is still referenced from the max depth
       stack (it must be the bottom entry), we will eventually return it,
       but only after removing it from the bottom of the max depth stack.
       We know that fdP will be returned, but we have to search through
       the max depth stack to find the pointer to *fdP.
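       Worked example (added for clarity): if the frame that pointed to
       *fdP had fdRef == 2, then fdP->fdRef drops by 2 to 0 and the frame
       can be handed out immediately; if prevRef was 1 and *fdP was also on
       the max depth chain, fdP->fdRef drops from 2 to 1 and the code below
       steals it from the bottom of the max depth stack instead.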
*/ fdP->fdRef -= prevRef; if (fdP->fdRef == 0) return fdP; } /* Still no free descriptors; steal the frame descriptor off the bottom of the maximum depth call stack */ fdP = shP->shMaxP; if (fdP != NULL) { /* Find the bottom entry of the max depth call stack */ fdPrevPP = &shP->shMaxP; while (fdP->fdCallerP != NULL) { fdPrevPP = &fdP->fdCallerP; fdP = *fdPrevPP; } /* Remove the bottom entry of the max depth call stack */ *fdPrevPP = NULL; /* The bottom entry of the max depth call stack that was just removed must have a reference count of one; otherwise it would still be on the current call stack and removing the bottom entry of that stack would have reduced the reference count of some frame descriptor from 2 to 0. */ SH_ASSERT(fdP->fdRef == 1); return fdP; } SH_ASSERT(!"cannot alloc frame_desc_t"); return NULL; } /* Decrease the reference count on a frame descriptor. If it becomes zero, return it to the free list */ static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) //inline static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) { if (fdP->fdRef > 1) { fdP->fdRef -= 1; TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD1, "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 1\n", fdP, shP, fdP->fdFuncNameP); return; } fdP->fdCallerP = shP->shFreeHeadP; shP->shFreeHeadP = fdP; TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD2, "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 0\n", fdP, shP, fdP->fdFuncNameP); } /* If the maximum stack depth exceeds the threshhold, print its traceback if it has not already been printed. Reset the maximum depth stack to empty. Only called when the current stack is already empty. */ static void shDisplay(stack_history_t * shP) { frame_desc_t * fdP; unsigned int tbHash; frame_desc_t * fdNextP; int slot; SH_ASSERT(shP->shCurrentP == NULL); /* If the maximum stack depth is less than the threshhold, just free the call chain and return */ fdP = shP->shMaxP; if (fdP == NULL || fdP->fdStackUsed < STACK_LIMIT_WARNING) goto exit; /* Compute a hash of the traceback call chain */ tbHash = 0; while (fdP != NULL) { tbHash <<= 1; tbHash ^= (((unsigned int)fdP->fdStackUsed) << 15) ^ fdP->fdLineNum; fdP = fdP->fdCallerP; } /* Search for the hash of the call chain in the table of tracebacks that have already been printed. Searching the hash table can be done without any locks, since entries are never deleted. The loop must eventually terminate, since the table will not be allowed to fill up. */ search: slot = tbHash % TB_HASH_SIZE; while (SHG.tracebackHash[slot] != 0) { if (SHG.tracebackHash[slot] == tbHash) /* This traceback has already been printed */ goto exit; slot = (slot+1) % TB_HASH_SIZE; } /* The hash of the current max depth traceback was not found in the table and should be inserted at position 'slot'. Do this under protection of the mutex. If 'slot' has been used by the time we get the mutex, drop the mutex and repeat the search. 
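     (The repeated search is guaranteed to terminate: the table is never
     allowed to hold more than MAX_TRACEBACKS entries, well below
     TB_HASH_SIZE, so an empty slot always exists.)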
*/ down(&SHG.shMutex); if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) goto exitMutexHeld; if (SHG.tracebackHash[slot] != 0) { up(&SHG.shMutex); goto search; } SHG.tracebackHash[slot] = tbHash; SHG.nTracebackHashEntries += 1; /* Print the traceback */ fdP = shP->shMaxP; printk("\nGPFS kernel stack for process %d(%s) used %d bytes\n", current->pid, current->comm, fdP->fdStackUsed); printk(" stack function\n"); printk(" used\n"); printk(" ----- -----------------------------------------------------\n"); while (fdP != NULL) { printk(" %5d %s at %s:%d\n", fdP->fdStackUsed, fdP->fdFuncNameP, fdP->fdFileNameP, fdP->fdLineNum); fdP = fdP->fdCallerP; } printk(" traceback signature %08X\n", tbHash); /* If the maximum number of allowed tracebacks has been reached, turn off further stack checking. */ if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) { printk("Maximum number of GPFS deep stack tracebacks reached\n"); printk("GPFS stack checking disabled\n"); SHG.shActive = false; wmb(); } exitMutexHeld: up(&SHG.shMutex); exit: /* Free all stack frame descriptors for the max depth call chain back to the internal free list. */ fdP = shP->shMaxP; while (fdP != NULL) { SH_ASSERT(fdP->fdRef == 1); fdNextP = fdP->fdCallerP; fdP->fdCallerP = shP->shFreeHeadP; shP->shFreeHeadP = fdP; fdP = fdNextP; } shP->shMaxP = NULL; } /* Process routine entry */ static void fdEntry(frame_desc_t * fdP, stack_history_t * shP) { frame_desc_t * popP; frame_desc_t * p; TRACE5(TRACE_ENTRYEXIT, 11, TRCID_FDENTRY, "fdEntry: fdP 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX used %d\n", fdP, shP, fdP->fdFuncNameP, shP->shCurrentP, fdP->fdStackUsed); /* If this is the first call by this thread, set up the two call chains */ if (shP->shCurrentP == NULL) { SH_ASSERT(shP->shMaxP == NULL); shP->shCurrentP = fdP; shP->shMaxP = fdP; fdP->fdCallerP = NULL; fdP->fdRef = 2; return; } else SH_ASSERT(shP->shMaxP != NULL); /* Process routine exits implied by the number of bytes of stack that are currently in use. The test needs to be for strict less than because inlined routines share the same stack frame as their caller, but both routines will do entry/exit processing. */ popP = shP->shCurrentP; while (fdP->fdStackUsed < popP->fdStackUsed) { p = popP->fdCallerP; shP->shCurrentP = p; TRACE1(TRACE_ENTRYEXIT, 11, TRCID_IMPLIED_EXIT, "fdEntry: implied exit from rtn %s\n", popP->fdFuncNameP); fdDiscard(popP, shP); if (p == NULL) { /* The outermost routine returned before this call without calling fdExit. Test for a large maximum stack, then reset the maximum. */ shDisplay(shP); /* The current routine is the one and only */ shP->shCurrentP = fdP; shP->shMaxP = fdP; fdP->fdCallerP = NULL; fdP->fdRef = 2; return; } popP = p; } /* If this is an extension of the current max depth stack, just add this routine to the top of both stacks */ if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed && shP->shCurrentP == shP->shMaxP) { fdP->fdCallerP = shP->shCurrentP; shP->shCurrentP = fdP; shP->shMaxP = fdP; fdP->fdRef = 2; TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX_EXTEND, "fdEntry: extending new max stack %d fdP 0x%lX\n", fdP->fdStackUsed, fdP); return; } /* Make this new routine be the top of the stack */ fdP->fdCallerP = shP->shCurrentP; shP->shCurrentP = fdP; fdP->fdRef = 1; /* If this new routine has a greater stack depth than the previous max, unreference the previous max depth call chain and add additional references to the current one. 
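     (After the swap every frame on the new max chain is reachable from
     both shCurrentP and shMaxP, so each one is marked with fdRef = 2.)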
*/ if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed) { popP = shP->shMaxP; do { p = popP->fdCallerP; fdDiscard(popP, shP); popP = p; } while (popP != NULL); p = fdP; do { p->fdRef = 2; p = p->fdCallerP; } while (p != NULL); TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX, "fdEntry: new max stack %d fdP 0x%lX\n", fdP->fdStackUsed, fdP); shP->shMaxP = fdP; } } /* Process routine exit */ static void fdExit(const char * funcnameP) { stack_history_t * shP; frame_desc_t * lastPopP; frame_desc_t * popP; frame_desc_t * p; /* Locate or create stack_history_t for this thread */ shP = shFind(); /* If call stack is already empty, there is nothing to do except free the stack_history_t */ if (shP->shCurrentP == NULL) { SH_ASSERT(shP->shMaxP == NULL); shPut(shP); return; } /* Search backward on the call stack for a routine name that matches the one being exitted. In C++, the ENTER/EXIT macros will pass the same string constant (same address) to fdEntry and fdExit. The C versions of the macros may pass two different copies of the same string. This loop cannot pop routines it skips off the stack, since the routine might never be found. */ p = shP->shCurrentP; for (;;) { if (p->fdFuncNameP == funcnameP || cxiStrcmp(p->fdFuncNameP, funcnameP) == 0) { TRACE4(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT, "fdExit: p 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX\n", p, shP, p->fdFuncNameP, shP->shCurrentP); lastPopP = p; break; } p = p->fdCallerP; if (p == NULL) { /* Routine name not found. Do not pop stack. */ /* printk("No entry found when exitting %s\n", funcnameP); */ TRACE1(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT_NOTFOUND, "No entry found when exitting %s\n", funcnameP); return; } } /* Pop all routines up to and including lastPopP */ p = shP->shCurrentP; do { popP = p; p = popP->fdCallerP; fdDiscard(popP, shP); } while (popP != lastPopP); shP->shCurrentP = p; /* If this was the return of the outermost routine, print new maximum stack depth traceback and discard the stack_history_t */ if (shP->shCurrentP == NULL) { shDisplay(shP); shPut(shP); } } #endif /* KSTACK_CHECK */ #if defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) void cxiTraceEntry(int level, const char * funcnameP, const char * filenameP, int lineNum) { int stackUsed = THREAD_SIZE - (((unsigned long)&stackUsed) & (THREAD_SIZE-1)); #ifdef KSTACK_CHECK stack_history_t * shP; frame_desc_t * fdP; #endif /* KSTACK_CHECK */ #ifdef ENTRYEXIT_TRACE /* Need to use a constant trace level in the TRACE macro call to get the .trclst file (and later the .trcfmt file) built correctly */ if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) { TRACE5(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_ENTER, "-->K %s (%s:%d) level %d stackUsed %d\n", funcnameP, filenameP, lineNum, level, stackUsed); } #endif /* ENTRYEXIT_TRACE */ #ifdef KSTACK_CHECK /* Nothing to do if kernel stack checking is disabled */ if (!SHG.shActive) return; /* Do not attempt to keep track of stack usage in interrupt handlers */ if (in_interrupt()) return; /* Locate or create stack_history_t for this thread */ shP = shFind(); /* Get a new frame descriptor and fill it in */ fdP = fdGet(shP); fdP->fdFuncNameP = funcnameP; fdP->fdFileNameP = filenameP; fdP->fdLineNum = lineNum; fdP->fdStackUsed = stackUsed; /* Perform stack checking for this routine entry */ fdEntry(fdP, shP); #endif /* KSTACK_CHECK */ } void cxiTraceExit(int level, const char * funcnameP) { #ifdef ENTRYEXIT_TRACE /* Need to use a constant trace level in the TRACE macro call to get the .trclst file (and later the .trcfmt file) built correctly */ if 
(_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) TRACE1(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT, "<--K %s\n", funcnameP); #endif /* ENTRYEXIT_TRACE */ #ifdef KSTACK_CHECK /* Nothing to do if kernel stack checking is disabled */ if (!SHG.shActive) return; /* Do not attempt to keep track of stack usage in interrupt handlers */ if (in_interrupt()) return; /* Process routine exit */ fdExit(funcnameP); #endif /* KSTACK_CHECK */ } void cxiTraceExitRC(int level, const char * funcnameP, int rc) { #ifdef ENTRYEXIT_TRACE /* Need to use a constant trace level in the TRACE macro call to get the .trclst file (and later the .trcfmt file) built correctly */ if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) TRACE2(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT_RC, "<--K %s rc %d\n", funcnameP, rc); #endif /* ENTRYEXIT_TRACE */ #ifdef KSTACK_CHECK /* Nothing to do if kernel stack checking is disabled */ if (!SHG.shActive) return; /* Do not attempt to keep track of stack usage in interrupt handlers */ if (in_interrupt()) return; /* Process routine exit */ fdExit(funcnameP); #endif /* KSTACK_CHECK */ } #endif /* defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) */ #ifdef UIDREMAP size_t cxiGetUserEnvironmentSize(void) { return (current->mm->env_end - current->mm->env_start); } int cxiGetUserEnvironment(char* buf, size_t len) { return cxiCopyIn((char*)current->mm->env_start, buf, len); } #endif Boolean cxiHasMountHelper() { return USING_MOUNT_HELPER(); } #ifdef P_NFS4 #include /* convert ip address to string */ char *IPtoString(int ip, char *buf) { unsigned char *a = (char *)&ip; sprintf(buf, "%u.%u.%u.%u", a[0], a[1], a[2], a[3]); return buf; } static void printfh(char *s, int *fh) { #ifdef GPFS_PRINTK printk("%s: %d: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", s, fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7],fh[8],fh[9]); #endif } int cxiSetFH(int *fhP, int sid) { struct knfsd_fh *fh = (struct knfsd_fh *)fhP; printfh("cxiSetFH-1", fhP); if (fh->fh_size > 8) { fh->fh_size += 4; // fh_size + 4 for sid fh->fh_fsid_type += max_fsid_type; fhP[(fh->fh_size >> 2)] = sid; fh->fh_fileid_type = 7; // see code in gpfs_decode_fh() #ifdef GPFS_PRINTK printk("cxiSetFH size %d fsid_type %d fileid %d\n", fh->fh_size, fh->fh_fsid_type, fh->fh_fileid_type); #endif printfh("cxiSetFH-2", fhP); return 0; } return ENOENT; } /* Call to NFS server on MDS to get open state */ int cxiOpenState(void *vfsP, void *p) { int rc = ENOENT; struct super_block *sbP = (struct super_block *)vfsP; struct pnfs_get_state *osP = p; struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); #ifdef GPFS_PRINTK printk("cxiOpenState1 sb %p p %p \n", sbP, p); printk("cxiOpenState cb_get_state %p\n", sbP->s_export_op->cb_get_state); #endif if (sbP->s_export_op->cb_get_state) rc = sbP->s_export_op->cb_get_state(osP); gpfs_ops.gpfsGetVerifier(privVfsP, osP->verifier); #ifdef GPFS_PRINTK printk("cxiOpenState rc %d devid %x verifier %x:%x\n", rc, osP->devid, osP->verifier[0], osP->verifier[1]); #endif return rc; } /* Call to NFS server on DS to get change open state or close the file */ int cxiChangeState(void *vfsP, void *p) { int rc = ENOENT; struct super_block *sbP = (struct super_block *)vfsP; struct pnfs_get_state *osP = p; if (sbP->s_export_op->cb_change_state) rc = sbP->s_export_op->cb_change_state(osP); #ifdef GPFS_PRINTK printk("cxiChangeState2 sb %p p %p access %d\n", sbP, p, osP->access); #endif return rc; } /* Call to NFS server on MDS to recall layout */ int cxiRecallLayout(void *vfsP, void 
*vP, void *p) { int rc = ENOENT; struct super_block *sbP = (struct super_block *)vfsP; struct inode *iP = (struct inode *)vP; struct layout_recall lr; lr.fsid = sbP; lr.offset = 0; lr.length = -1; if (iP == NULL) // recall all layouts for this fs lr.layout_type = RECALL_FSID; #ifdef GPFS_PRINTK printk("cxiRecallLayout sbP %p type %d\n", sbP, lr.layout_type); #endif if (sbP->s_export_op->cb_layout_recall) { rc = sbP->s_export_op->cb_layout_recall(sbP, iP, &lr); } else { lr.layout_type = RECALL_FILE; #ifdef GPFS_PRINTK printk("cxiRecallLayout sbP %p iP %p type %d\n", sbP, iP, lr.layout_type); #endif } #ifdef GPFS_PRINTK printk("cxiRecallLayout sbP %p iP %p rc %d\n", sbP, iP, rc); #endif return rc; } /* Get device list gd_type in: requested layout type. out: available lauout type. gd_cookie in: cookie returned on the last operation. out: none zero cookie if some devices did not fit in the buffer. gd_maxcount in: buffer size in bytes. gd_buffer in: pointer to buffer. gd_devlist_len out: number of items returned in the buffer. error: Use the same retrun codes as used for GTEDEVLIST */ int cxiGetDeviceList(int nDests, int *idList, void *P) { ENTER(0); int rc = 0; int i, len, left; int j = 0; char *p, *tp; char tmp[32]; struct nfsd4_pnfs_getdevlist *dl = (struct nfsd4_pnfs_getdevlist *)P; struct nfsd4_pnfs_devlist *gd_buf = NULL; struct pnfs_filelayout_devaddr *dev; #ifdef GPFS_PRINTK printk("xxx cxiGetDeviceList enter nDests %d idList %p \n", nDests, idList); #endif dl->gd_type = LAYOUT_NFSV4_FILES; dl->gd_cookie = 0; dl->gd_devlist_len = 0; left = dl->gd_maxcount; tp = &tmp[0]; len = sizeof(struct nfsd4_pnfs_devlist) * nDests; #ifdef GPFS_PRINTK printk("xxx cxiGetDeviceList len %d left %d\n", len, left); #endif if (nDests > left) { rc = ENOMEM; //??? NFS4ERR_TOOSMALL goto xerror; } gd_buf = (struct nfsd4_pnfs_devlist *)cxiMallocUnpinned(len); if (gd_buf == NULL) { rc = ENOMEM; goto xerror; } memset(gd_buf, 0, len); dl->gd_devlist = gd_buf; #ifdef GPFS_PRINTK printk("xxx cxiGetDeviceList gd_buf %p count %d\n", gd_buf, nDests); #endif for (i = 0; i < nDests; i++) { /* make both device id and device address be the same for now */ gd_buf[j].dev_id = idList[i]; gd_buf[j].dev_lotype = LAYOUT_NFSV4_FILES; if (gd_buf[j].dev_id == INADDR_NONE) continue; IPtoString(gd_buf[j].dev_id, tp); len = (cxiStrlen(tp)); p = (char *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr)); if (p == NULL) { rc = ENOMEM; goto xerror; } memset(p, 0, sizeof(struct pnfs_filelayout_devaddr)); gd_buf[j].dev_addr = p; dev = (struct pnfs_filelayout_devaddr *)p; dev->r_addr.len = len + 4; /* for ".8.1" */ p = (char *)cxiMallocUnpinned(dev->r_addr.len+1); if (p == NULL) { rc = ENOMEM; goto xerror; } dev->r_addr.data = p; cxiMemcpy(p, tp, len); p = p + len; cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */ dev->r_netid.len = 3; /*'tcp'*/ p = (char *)cxiMallocUnpinned(dev->r_netid.len+1); if (p == NULL) { rc = ENOMEM; goto xerror; } cxiStrcpy(p, "tcp"); dev->r_netid.data = p; left = left - 1; dl->gd_devlist_len++; TRACE4(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_P1, "gpfsGetDeviceList index %d len %d ip %s left %d\n", i, dev->r_addr.len, dev->r_addr.data, left); #ifdef GPFS_PRINTK printk("xxx cxiGetDeviceList index %d id %d len %d ip %s left %d ops %p %p\n", i, gd_buf[j].dev_id, dev->r_addr.len, dev->r_addr.data, left, dl->gd_ops, dl->gd_ops->devaddr_encode); #endif j++; } exit: TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_EXIT, "cxiGetDeviceList exit: rc %d len %d", rc, len); return rc; xerror: if (gd_buf != NULL) { for 
(i = 0; i < j; i++)
    {
      dev = gd_buf[i].dev_addr;
      if (dev)
      {
        cxiFreeUnpinned(dev->r_addr.data);
        cxiFreeUnpinned(dev->r_netid.data);
        cxiFreeUnpinned(dev);
      }
    }
    cxiFreeUnpinned(gd_buf);
  }
  goto exit;
}

int cxiGetDeviceInfo(void *P)
{
  ENTER(0);
  int rc = 0;
  int len;
  char *p, *tp;
  char tmp[32];
  struct nfsd4_pnfs_getdevinfo *da = (struct nfsd4_pnfs_getdevinfo *)P;
  tp = &tmp[0];
  struct pnfs_filelayout_devaddr *dev;

  IPtoString(da->gd_dev_id, tp);
  dev = (struct pnfs_filelayout_devaddr *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr));
  if (dev == NULL)
  {
    rc = ENOMEM;
    goto xerror;
  }
  da->gd_devaddr = dev;

  len = (cxiStrlen(tp));
  dev->r_addr.len = len + 4; /* for ".8.1" */
  p = (char *)cxiMallocUnpinned(dev->r_addr.len+1);
  if (p == NULL)
  {
    cxiFreeUnpinned(dev);
    rc = ENOMEM;
    goto xerror;
  }
  dev->r_addr.data = p;
  cxiMemcpy(p, tp, len);
  p = p + len;
  cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */

  dev->r_netid.len = 3; /* 'tcp' */
  p = (char *)cxiMallocUnpinned(dev->r_netid.len+1);
  if (p == NULL)
  {
    cxiFreeUnpinned(dev->r_addr.data);
    cxiFreeUnpinned(dev);
    rc = ENOMEM;
    goto xerror;
  }
  cxiStrcpy(p, "tcp");
  dev->r_netid.data = p;

  TRACE2(TRACE_VNODE, 2, TRCID_GPFSOPS_GET_DEVICELINFO_P1,
         "gpfsGetDeviceInfo len %d ip %s\n", dev->r_addr.len, dev->r_addr.data);
#ifdef GPFS_PRINTK
  printk("xxx cxiGetDeviceInfo id %d len %d ip %s\n",
         da->gd_dev_id, dev->r_addr.len, dev->r_addr.data);
#endif

xerror:
  TRACE1(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELINFO_EXIT,
         "cxiGetDeviceInfo exit: rc %d\n", rc);
  return rc;
}

/* get layout
   lg_type    in: requested layout type.
             out: available layout type.
   lg_offset  in: requested offset.
             out: returned offset.
   lg_length  in: requested length.
             out: returned length.
   lg_mxcnt   in: buffer size in bytes.
   lg_llist   in: pointer to buffer.
   lg_layout out: number of items returned in the buffer.

   if the file is big(?) return all nodes in layout
   if the file is small return no layout or just one node, choose one node
   at random but make sure it is the same node for the same file.
*/
int cxiGetLayout(int nDests, int *idList, cxiVattr_t *vattr, int myAddr, void *P)
{
  ENTER(0);
  char *p, *n;
  int i, rc, left, len;
  struct nfsd4_pnfs_layoutget *gl = (struct nfsd4_pnfs_layoutget *)P;
  struct nfsd4_pnfs_layoutlist *lg_buf = NULL;
  struct nfsd4_pnfs_filelayout *layout = NULL;

  TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_ENTER,
         "cxiGetLayout: nDests %d myAddr %x\n", nDests, myAddr);

  /* set node id in fh and increase fh size by 4 */
  rc = cxiSetFH((int *)&gl->lg_fh, myAddr);
  if (rc != 0)
    goto xerror;

  gl->lg_type = LAYOUT_NFSV4_FILES;
  gl->lg_offset = 0;
  gl->lg_length = MAX_UINT64; /* The maximum file size */

  layout = (struct nfsd4_pnfs_filelayout *)cxiMallocUnpinned(sizeof(struct nfsd4_pnfs_filelayout));
  if (layout == NULL)
  {
    rc = ENOMEM;
    goto xerror;
  }
  gl->lg_layout = layout;
  layout->lg_stripe_type = STRIPE_DENSE;
  layout->lg_commit_through_mds = true;
  layout->lg_stripe_unit = vattr->va_blocksize; /* preferred blocksize */
  layout->lg_file_size = vattr->va_size;        /* file size in bytes */
  layout->lg_llistlen = 0;

  left = gl->lg_mxcnt;
  len = sizeof(struct nfsd4_pnfs_layoutlist) * nDests;
  if (len > left)
  {
    rc = ENOMEM; // NFS4ERR_TOOSMALL
    goto xerror;
  }
  lg_buf = (struct nfsd4_pnfs_layoutlist *)cxiMallocUnpinned(len);
  if (lg_buf == NULL)
  {
    rc = ENOMEM;
    goto xerror;
  }
  memset(lg_buf, 0, len);
  layout->lg_llist = lg_buf;
  left = left - len;

  for (i = 0; i < nDests; i++)
  {
    /* make both device id and device address be the same for now */
    lg_buf[i].dev_ids.len = 1; //??? can return a list of dev ids ????
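    /* Note (added for clarity): each layout list entry currently
       advertises exactly one device id, copied one-for-one from idList[];
       dev_ids is allocated below as a one-element array. */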
lg_buf[i].dev_ids.list = (u32 *)cxiMallocUnpinned(sizeof(u32)*lg_buf[i].dev_ids.len); if (lg_buf[i].dev_ids.list == NULL) { rc = ENOMEM; goto xerror; } lg_buf[i].dev_ids.list[0] = idList[i]; layout->lg_llistlen++; lg_buf[i].fhp = (struct knfsd_fh *)&gl->lg_fh; #ifdef GPFS_PRINTK printk("cxiGetLayout index %d id %d xid 0x%lX len %d\n", i, idList[i], idList[i], len); #endif TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_P1, "cxiGetLayout index %d id 0x%lX len %d\n", i, idList[i], len); } if (i == 0) { layout->lg_llistlen = 0; cxiFreeUnpinned(lg_buf); } #ifdef GPFS_PRINTK printk("cxiGetLayout: type %d iomode %d offset %lld length %lld minlength %lld mxcnt %d ops %p layouts %p\n", gl->lg_type, gl->lg_iomode, gl->lg_offset, gl->lg_length, gl->lg_minlength, gl->lg_mxcnt, gl->lg_ops, gl->lg_layout); printfh("cxiGetLayout:", gl->lg_fh); printk("cxiGetLayout: layout stripe_type %d stripe_unit %lld file_size %lld llistlen %d llist %p\n", layout->lg_stripe_type, layout->lg_stripe_unit,layout->lg_file_size, layout->lg_llistlen,layout->lg_llist); #endif exit: TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_EXIT, "cxiGetLayout exit: rc %d len %d p 0x%lX", rc, len, p); return rc; xerror: if (lg_buf) { gl->lg_length = 0; for (i = 0; i < nDests; i++) { cxiFreeUnpinned(lg_buf[i].dev_ids.list); } cxiFreeUnpinned(lg_buf); } if (layout) cxiFreeUnpinned(layout); goto exit; } #endif int cxiCheckThreadState(cxiThreadId tid) { struct task_struct *t, *g; int rc = ENOENT; // read_lock(&tasklist_lock); rcu_read_lock(); DO_EACH_THREAD(g,t) { /* We are looking for a thread with a given tid and the same parent as the caller (the caller must be another mmfsd thread */ if (t->pid == tid && cxiStrcmp(t->comm, current->comm) == 0) { rc = 0; break; } } WHILE_EACH_THREAD(g,t); // read_unlock(&tasklist_lock); rcu_read_unlock(); return rc; }
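
/* Illustrative usage sketch (not part of the original module, and compiled
 * out): a hypothetical caller could use cxiCheckThreadState() above to
 * verify that a sibling mmfsd thread it handed work to still exists.  The
 * wrapper name below is an assumption made purely for illustration.
 */
#if 0
static Boolean exampleThreadStillExists(cxiThreadId tid)
{
  /* cxiCheckThreadState() returns 0 when a thread with the given tid and
     the caller's command name is found on the task list, ENOENT otherwise */
  return (cxiCheckThreadState(tid) == 0);
}
#endif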