Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

cxiSystem.c @ 145

Last change on this file since 145 was 16, checked in by rock, 17 years ago

File size: 104.9 KB

Rev	Line
[16]	1	/***************************************************************************
	2	*
	3	* Copyright (C) 2001 International Business Machines
	4	* All rights reserved.
	5	*
	6	* This file is part of the GPFS mmfslinux kernel module.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	*
	12	* 1. Redistributions of source code must retain the above copyright notice,
	13	* this list of conditions and the following disclaimer.
	14	* 2. Redistributions in binary form must reproduce the above copyright
	15	* notice, this list of conditions and the following disclaimer in the
	16	* documentation and/or other materials provided with the distribution.
	17	* 3. The name of the author may not be used to endorse or promote products
	18	* derived from this software without specific prior written
	19	* permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	25	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	26	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
	27	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
	28	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
	29	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
	30	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	31	*
	32	*************************************************************************** */
	33	/* @(#)16 1.158.1.9 src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiSystem.c, mmfs, avs_rgpfs24, rgpfs24s007a 10/24/06 19:12:27 */
	34	/*
	35	* Linux implementation of basic common services
	36	*
	37	* Contents:
	38	* cxiGetThreadId
	39	* getpid
	40	* cxiIsSuperUser
	41	* DoPanic
	42	* logAssertFailed
	43	* Kernel memory allocation services:
	44	* cxiMallocPinned
	45	* cxiFreePinned
	46	*
	47	*/
	48
	49	#include <Shark-gpl.h>
	50
	51	#include <linux/kernel.h>
	52	#include <linux/module.h>
	53	#include <linux/sched.h>
	54	#include <linux/slab.h>
	55	#include <linux/wait.h>
	56	#include <linux/time.h>
	57	#include <linux/file.h>
	58	#include <linux/string.h>
	59	#include <asm/uaccess.h>
	60	#include <linux/smp_lock.h>
	61	#include <linux/vmalloc.h>
	62	#include <linux/fs.h>
	63	#include <linux/interrupt.h>
	64	#undef memcmp
	65
	66	#define DEFINE_TRACE_GBL_VARS
	67	#include <Logger-gpl.h>
	68	#include <verdep.h>
	69	#include <linux2gpfs.h>
	70	#include <cxiSystem.h>
	71	#include <cxiAtomic.h>
	72	#include <cxi2gpfs.h>
	73	#include <cxiIOBuffer.h>
	74	#include <cxiSharedSeg.h>
	75	#include <cxiCred.h>
	76
	77	#include <Trace.h>
	78	#include <lxtrace.h>
	79	#include <cxiMode.h>
	80	#if LINUX_KERNEL_VERSION >= 2060000
	81	#include <linux/swap.h>
	82	#include <linux/writeback.h>
	83	#endif
	84
	85	#if LINUX_KERNEL_VERSION >= 2040900
	86	/* This is in the Redhat kernel series */
	87	extern int posix_locks_deadlock(struct file_lock , struct file_lock );
	88	#endif
	89
	90	#ifdef INSTRUMENT_LOCKS
	91	struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES];
	92	#endif /* INSTRUMENT_LOCKS */
	93
	94	/* We record the daemon's process group since it can uniquely identify
	95	* a thread as being part of the GPFS daemon. pid is unique per thread
	96	* on linux due to their clone implementation.
	97	*/
	98	static pid_t DaemonPGrp = -1;
	99
	100	/* Get the kernel thread ID. */
	101	cxiThreadId cxiGetThreadId()
	102	{
	103	/* ENTER(1); */
	104	return current->pid;
	105	}
	106
	107	/* Get the kernel process ID. */
	108	pid_t getpid()
	109	{
	110	/* ENTER(1); */
	111	return current->pid;
	112	}
	113
	114	/* bufP is caller's ext_cred_t buffer
	115	* uCredPP is the ucred struct (NULL on Linux)
	116	* eCredPP is the ext_cred_t struct * (if successful)
	117	*
	118	* cxiPutCred should be called to release when operation has been completed.
	119	*/
	120	int cxiGetCred(void bufP, void uCredPP, void *eCredPP)
	121	{
	122	ext_cred_t eCredP = (ext_cred_t )bufP;
	123
	124	ENTER(0);
	125	*uCredPP = NULL;
	126	*eCredPP = NULL;
	127
	128	if (!bufP)
	129	{
	130	EXIT_RC(0, EINVAL);
	131	return EINVAL;
	132	}
	133
	134	setCred(eCredP);
	135	eCredPP = (void )eCredP;
	136
	137	xerror:
	138	EXIT(0);
	139	return 0;
	140	}
	141
	142	/* Release of cxiGetCred() structures (nothing to do on Linux) */
	143	int cxiPutCred(void userCredP, void extCredP)
	144	{
	145	if (userCredP \|\| !extCredP)
	146	return EINVAL;
	147
	148	return 0;
	149	}
	150
	151	/* Convert a kernel stack address to the thread ID of the thread that
	152	* uses that stack
	153	*/
	154	int
	155	cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP)
	156	{
	157	struct task_struct * tP;
	158	#if LINUX_KERNEL_VERSION >= 2060000
	159	/* the kernel stack is base off the thread_info struct in the 2.6 kernel
	160	* will get the task pointer out of thread_info struct.
	161	*/
	162	struct thread_info * iP;
	163	ENTER(0);
	164	iP = (struct thread_info *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
	165	tP = iP->task;
	166	#else
	167	/* the kernel stack is base off the task_struct struct in the 2.4 kernel */
	168	tP = (struct task_struct *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
	169	#endif
	170	ENTER(0);
	171	*tidP = tP->pid;
	172	EXIT(0);
	173	return 0;
	174	}
	175
	176	/* Convert a kernel thread pointer to the corresponding thread ID */
	177	int
	178	cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP)
	179	{
	180	struct task_struct * tP;
	181
	182	ENTER(0);
	183	tP = (struct task_struct *) threadP;
	184	*tidP = tP->pid;
	185
	186	EXIT(0);
	187	return 0;
	188	}
	189
	190
	191	/* Return true if caller has has maximum authorization (is root) */
	192	Boolean cxiIsSuperUser()
	193	{
	194	return (current->euid == 0);
	195	}
	196
	197
	198	/* Get the process max filesize limit (ulimit -f) */
	199	Int64 cxiGetMaxFileSize()
	200	{
	201	if ((signed long)MY_RLIM_CUR(RLIMIT_FSIZE) == -1L)
	202	return MAX_INT64;
	203	else
	204	return (MY_RLIM_CUR(RLIMIT_FSIZE));
	205	}
	206
	207	/* Routine to send a signal to the current thread/process */
	208	void cxiSendSigThread(int sig)
	209	{
	210	ENTER(0);
	211	send_sig(sig, current, 0);
	212	EXIT(0);
	213	}
	214
	215
	216	#ifdef MALLOC_DEBUG
	217	/* This tracks mallocs and frees on a limited basis.
	218	* Implemented originally to determine if we were leaking
	219	* any memory after an unload. This is not really thread
	220	* safe for multiple processors unless they're automatically
	221	* cache coherent without memory barriers (i386). Its useful
	222	* for detecting memory leaks on a single processor system.
	223	*/
	224	#define MALLOC_RECORDS 5000 /* max mallocs to track */
	225	struct mallocStat
	226	{
	227	void *beginP;
	228	unsigned short size;
	229	unsigned short type;
	230	};
	231	static struct mallocStat *mstatP = NULL;
	232	unsigned int nextMalloc = 0;
	233
	234	void
	235	MallocDebugStart()
	236	{
	237	int i;
	238
	239	ENTER(0);
	240	if (mstatP == NULL)
	241	mstatP = vmalloc(MALLOC_RECORDS * sizeof(struct mallocStat));
	242
	243	if (mstatP == NULL)
	244	{
	245	EXIT(0);
	246	return;
	247	}
	248
	249	for (i = 0; i < MALLOC_RECORDS; i++)
	250	{
	251	mstatP[i].beginP = NULL;
	252	mstatP[i].size = 0;
	253	mstatP[i].type = 0;
	254	}
	255	printk("MallocDebugStart 0x%X\n", mstatP);
	256	EXIT(0);
	257	}
	258
	259	void
	260	MallocDebugEnd()
	261	{
	262	int i;
	263
	264	ENTER(0);
	265	if (mstatP != NULL)
	266	{
	267	for (i = 0; i < MALLOC_RECORDS; i++)
	268	{
	269	if (mstatP[i].beginP != NULL)
	270	printk("MallocDebug: beginP 0x%X size %d type %d STILL ALLOCATED!\n",
	271	mstatP[i].beginP, mstatP[i].size, mstatP[i].type);
	272	}
	273	}
	274
	275	vfree(mstatP);
	276	mstatP = NULL;
	277	EXIT(0);
	278	}
	279
	280	void
	281	MallocDebugNew(void *ptr, unsigned short size, unsigned short type)
	282	{
	283	void *bP;
	284	int i;
	285	int j;
	286	int swrc;
	287	int oldval;
	288	int where = nextMalloc;
	289
	290	ENTER(0);
	291
	292	if (mstatP == NULL)
	293	{
	294	EXIT(0);
	295	return;
	296	}
	297
	298	for (i = where; i < MALLOC_RECORDS + where; i++)
	299	{
	300	if (i >= MALLOC_RECORDS)
	301	j = i - MALLOC_RECORDS;
	302	else
	303	j = i;
	304
	305	bP = mstatP[j].beginP;
	306	if (bP == NULL)
	307	{
	308	swrc = ATOMIC_SWAP(&mstatP[j].beginP, &bP, ptr);
	309	if (swrc)
	310	{
	311	mstatP[j].size = size;
	312	mstatP[j].type = type;
	313	break;
	314	}
	315	}
	316	}
	317
	318	EXIT(0);
	319	}
	320
	321	void
	322	MallocDebugDelete(void *ptr)
	323	{
	324	void *bP;
	325	int i;
	326	int swrc;
	327	int next;
	328	int found = 0;
	329
	330	ENTER(0);
	331	if (mstatP == NULL)
	332	{
	333	EXIT(0);
	334	return;
	335	}
	336
	337	for (i = 0; i < MALLOC_RECORDS; i++)
	338	{
	339	bP = mstatP[i].beginP;
	340	if (bP == ptr)
	341	{
	342	next = nextMalloc;
	343	ATOMIC_SWAP(&nextMalloc, &next, i);
	344
	345	swrc = ATOMIC_SWAP(&mstatP[i].beginP, &bP, NULL);
	346	DBGASSERT(swrc);
	347	found = 1;
	348	break;
	349	}
	350	}
	351
	352	if (!found)
	353	printk("MallocDebug: 0x%X not found!\n", ptr);
	354	EXIT(0);
	355	}
	356	#endif /* MALLOC_DEBUG */
	357
	358	/* Allocate pinned kernel memory */
	359	void* cxiMallocPinned(int nBytes)
	360	{
	361	void *ptr;
	362
	363	/* kmalloc only supports requests for up to 131027 bytes. Anything
	364	larger than this results in a BUG() call. */
	365	ENTER(0);
	366	if (nBytes > 131072)
	367	{
	368	EXIT(0);
	369	return NULL;
	370	}
	371
	372	ptr = kmalloc(nBytes, GFP_KERNEL);
	373
	374	#ifdef MALLOC_DEBUG
	375	MallocDebugNew(ptr, nBytes, 1);
	376	#endif
	377
	378	EXIT(0);
	379	return ptr;
	380	}
	381
	382	/* Free pinned kernel memory that was allocated with cxiMallocPinned */
	383	/* Must not block on lack of memory resourses */
	384	void cxiFreePinned(void* p)
	385	{
	386	ENTER(0);
	387	#ifdef MALLOC_DEBUG
	388	MallocDebugDelete(p);
	389	#endif
	390
	391	kfree(p);
	392	EXIT(0);
	393	}
	394
	395	/* Get the kernel thread ID. */
	396	void* cxiGetFcntlOwner(eflock_t *flP)
	397	{
	398	return flP? flP->l_owner: current->files;
	399	}
	400
	401	#if LINUX_KERNEL_VERSION > 2060900
	402	struct lock_manager_operations lm_operations = {
	403	};
	404	#endif
	405
	406	/* Perform local advisory locking. */
	407	int cxiFcntlLock(void *advObjP,
	408	int cmd,
	409	void *lockStructP,
	410	cxiFlock_t *flockP,
	411	int (*retryCB)(),
	412	cxiOff64_t size,
	413	cxiOff64_t offset,
	414	ulong *retry_idP)
	415	{
	416	int len, rc = 0;
	417	// struct file *fP;
	418	struct file_lock fl, flP, gflP, *cflP;
	419	Boolean keepLockElement = false;
	420
	421	/* cast platform independent arguments as appropriate for linux */
	422	void (RetryFcn)(struct file_lock) = (void ()(struct file_lock))retryCB;
	423	// fP = (struct file *)advObjP;
	424	struct file localFile, *filp = &localFile;
	425	struct dentry localDEntry, *dp = &localDEntry;
	426	ENTER(0);
	427	flP = (struct file_lock *) lockStructP;
	428
	429	localFile.f_dentry = &localDEntry;
	430	localDEntry.d_inode = (struct inode *)advObjP;
	431
	432	/* Lock commands can have two different values. Convert them at
	433	* entry to the portability layer so that we only have to check
	434	* for one of them.
	435	*/
	436	#if !defined(__64BIT__)
	437	if (cmd == F_GETLK64) cmd = F_GETLK;
	438	if (cmd == F_SETLK64) cmd = F_SETLK;
	439	if (cmd == F_SETLKW64) cmd = F_SETLKW;
	440	#endif
	441
	442	/* Callers have the option of passing a platform dependent lock structure
	443	(struct file_lock lockSructP) or the generic (cxiFlock_t flockP). */
	444	if (flockP)
	445	{
	446	flP = &fl; /* Use a local file_lock structure */
	447
	448	/* If there is a potential for blocking, must malloc the locking structure
	449	so it can persist until the lock becomes available (in Retry()). */
	450
	451	if (cmd == F_SETLKW)
	452	{
	453	#ifdef NFS_CLUSTER_LOCKS
	454	len = sizeof(struct file_lock) +
	455	sizeof(struct file) +
	456	sizeof(struct dentry);
	457	#else
	458	len = sizeof(struct file_lock);
	459	#endif
	460	flP = (struct file_lock*)cxiMallocUnpinned(len);
	461	if (flP == NULL)
	462	{
	463	rc = ENOMEM;
	464	goto exit;
	465	}
	466	cxiMemset(flP, 0, len);
	467	#ifdef NFS_CLUSTER_LOCKS
	468	filp = (struct file)((char )flP + sizeof(struct file_lock));
	469	dp = (struct dentry )((char )filp + sizeof(struct file));
	470	filp->f_dentry = dp;
	471	dp->d_inode = (struct inode *)advObjP;
	472	#endif
	473	}
	474	else
	475	cxiMemset(flP, 0, sizeof(*flP));
	476
	477	locks_init_lock(flP); /* Initialize list_head structs */
	478	if (flockP->l_file == NULL)
	479	flockP->l_file = filp;
	480
	481	/* fl_wait needs to be initialized because when unlock happens, the
	482	linux routine locks_wake_up_blocks invokes our retry routine via
	483	fl_notify and then calls wake_up(fl_wait) on the assumption that
	484	the waiter is local. */
	485
	486	cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait);
	487
	488	cxiFlockToVFS(flockP, flP);
	489	}
	490
	491	/* daemon didn't know the owner and required kernel code to fill it in. */
	492	if (!flP->fl_owner)
	493	flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL);
	494
	495	#if 0
	496	/* Validate the file pointer. Kernel locking routines are going to
	497	use these without verifying them. If any of them are NULL, find
	498	out now before they generate a segment violation. */
	499	if ((!fP) \|\| (!fP->f_dentry) \|\| (!fP->f_dentry->d_inode))
	500	{
	501	if (cmd == F_GETLK)
	502	flP->fl_type = F_UNLCK;
	503	else
	504	rc = EINVAL;
	505	goto exit;
	506	}
	507	#endif
	508
	509	/* Note that this all depends on us having serialized such locking for
	510	this file during from before the posix_test_lock() until after the
	511	posix_block_lock(). The revoke lock that we hold here provides us
	512	the necessary serilization. */
	513
	514	TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER,
	515	"cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X "
	516	"range 0x%lX-%lX cmd %s type %s\n",
	517	flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
	518	(cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
	519	(flP->fl_type == F_RDLCK) ? "RDLCK" :
	520	(flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
	521
	522	if (cmd == F_GETLK)
	523	{
	524	/* Check for conflicts. If found, return the information.
	525	If there are NO conflicts, return F_UNLCK in fl_type. */
	526	#if LINUX_KERNEL_VERSION >= 2061700
	527	struct file_lock conf;
	528	gflP = &conf;
	529	rc = posix_test_lock(filp, flP, gflP);
	530	if (rc) {
	531	rc = 0;
	532	#else
	533	if (NULL != (gflP = posix_test_lock(&localFile, flP))) {
	534	#endif
	535	flP->fl_start = gflP->fl_start;
	536	flP->fl_end = gflP->fl_end;
	537	flP->fl_type = gflP->fl_type;
	538	flP->fl_pid = gflP->fl_pid;
	539	flP->fl_owner = gflP->fl_owner;
	540	}
	541	else
	542	flP->fl_type = F_UNLCK;
	543
	544	TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK,
	545	"cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X "
	546	"range 0x%lX-%lX type %s\n",
	547	flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
	548	(flP->fl_type == F_RDLCK) ? "RDLCK" :
	549	(flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
	550	}
	551	else
	552	{ /* Begin: do the locking, but handle the blocking via our retry routine. */
	553	/* Test the lock. What this really does for us is return the blocker
	554	if one exists. This is needed to queue up the request if a conflicting
	555	lock is already held. */
	556
	557	#ifdef NFS_CLUSTER_LOCKS
	558	if (cmd == F_SETLKW) {
	559	flP->fl_flags \|= FL_SLEEP;
	560	if (!flP->fl_lmops) {
	561	flP->fl_lmops = &lm_operations;
	562	flP->fl_lmops->fl_notify = (void *)RetryFcn;
	563	}
	564	}
	565	rc = POSIX_LOCK_FILE(filp, flP);
	566	if (rc == -EAGAIN && (cmd == F_SETLKW) &&
	567	flP->fl_lmops == &lm_operations)
	568	{
	569	/* Queue the blocker structures */
	570	keepLockElement = true;
	571	if (retry_idP)
	572	*retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
	573	}
	574	#else
	575	#if LINUX_KERNEL_VERSION >= 2061700
	576	if ((flP->fl_type == F_UNLCK) \|\| !(posix_test_lock(&localFile, flP, cflP)))
	577	#else
	578	if ((flP->fl_type == F_UNLCK) \|\| !(cflP = posix_test_lock(&localFile, flP)))
	579	#endif
	580	{
	581	/* No conflicting lock: get the lock for the caller. */
	582	rc = POSIX_LOCK_FILE(&localFile, flP);
	583	}
	584	else
	585	{ /* Conflicting lock: ..... */
	586	rc = EAGAIN;
	587
	588	if (cmd == F_SETLKW)
	589	{
	590	/*if (posix_locks_deadlock(flP, cflP))
	591	{
	592	rc = EDEADLK;
	593	}
	594	else*/
	595	{
	596	/* Queue the blocker structures */
	597	keepLockElement = true;
	598	if (retry_idP)
	599	*retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
	600	#if LINUX_KERNEL_VERSION > 2060900
	601	flP->fl_lmops = &lm_operations;
	602	flP->fl_lmops->fl_notify = RetryFcn;
	603	#else
	604	flP->fl_notify = RetryFcn;
	605	#endif
	606	#if LINUX_KERNEL_VERSION < 2061700
	607	posix_block_lock(cflP, flP);
	608	#endif
	609	}
	610	}
	611	}
	612	#endif
	613
	614	TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT,
	615	"cxiFcntlLock posix_lock_file: rc %d retry_id 0x%lX\n", rc, cflP);
	616	} /* End: do the locking, but handle the blocking via our retry routine. */
	617
	618	exit:
	619
	620	if (flockP)
	621	{
	622	/* Caller wanted results in flockP */
	623	cxiVFSToFlock((void *)flP, flockP);
	624
	625	/* If we allocated the locking structure and then didn't need to use
	626	it (the lock request didn't block), free it. */
	627
	628	if ((flP!=&fl) && (!keepLockElement)) {
	629	cxiFreeUnpinned(flP);
	630	}
	631	}
	632
	633	#ifdef NFS_CLUSTER_LOCKS
	634	if (rc < 0)
	635	rc = -rc; /* make it positive */
	636	#endif
	637	EXIT_RC(0, rc);
	638	return rc;
	639	}
	640
	641	void cxiFcntlUnblock(void *retry_idP)
	642	{
	643	struct file_lock flP = (struct file_lock )retry_idP;
	644
	645	ENTER(0);
	646	/* Include some sanity checks on the retry id (file_lock)
	647	before passing it into the routine that does the work.
	648	It should be properly linked (via its list_head structures)
	649	in a file_lock_list that has blocked waiters. Also,
	650	we would only be backing this out by the process that
	651	has originally blocked, so verify the pid. */
	652
	653	if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link) &&
	654	flP->fl_next && flP->fl_pid == getpid())
	655	{
	656	POSIX_UNBLOCK_LOCK(flP);
	657	}
	658	EXIT(0);
	659	}
	660
	661	int
	662	cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid)
	663	{
	664	int rc = 0;
	665	struct super_block sbP = (struct super_block )vfsP;
	666	struct list_head *fllP;
	667	struct file_lock *fl;
	668	struct dentry *dentryP;
	669
	670	ENTER(0);
	671	lock_kernel();
	672
	673	restart:
	674
	675	#if LINUX_KERNEL_VERSION >= 2061600
	676	//??? find a different way to clear locks file_lock_list is not exported anymore
	677	#else
	678	fllP = file_lock_list.next;
	679
	680	while(fllP != &file_lock_list)
	681	{
	682	fl = list_entry(fllP, struct file_lock, fl_link);
	683	fllP = fllP->next;
	684
	685	/* If there are mmfs lock structures, release them. */
	686
	687	if (fl &&
	688	fl->fl_file &&
	689	fl->fl_file->f_dentry &&
	690	fl->fl_file->f_dentry->d_inode)
	691	{
	692	dentryP = fl->fl_file->f_dentry;
	693
	694	/* If this lock belongs to the specified vfs, release advisory locks. */
	695	if (dentryP->d_sb == sbP)
	696	{
	697	/* remove all our locks */
	698	rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid);
	699	if (rc == ENOSYS)
	700	goto xerror;
	701
	702	/* After freeing unknown numbers of locks in gpfsFcntlReset (all
	703	locks for the inode), restart from the top of the lock list */
	704	goto restart;
	705	}
	706	}
	707	}
	708	#endif
	709
	710	xerror:
	711	unlock_kernel();
	712	EXIT_RC(0, rc);
	713	return rc;
	714	}
	715
	716	void *
	717	cxiGetPrivVfsP(void *vfsP)
	718	{
	719	struct super_block sbP = (struct super_block )vfsP;
	720
	721	/* Do some sanity checking */
	722	if ( (sbP->s_magic != GPFS_SUPER_MAGIC) \|\|
	723	((UIntPtr) SBLOCK_PRIVATE(sbP) < GPFS_KERNEL_OFFSET) )
	724	printSuperList(sbP);
	725	LOGASSERT( sbP->s_magic == GPFS_SUPER_MAGIC );
	726	LOGASSERT( (UIntPtr) SBLOCK_PRIVATE(sbP) >= GPFS_KERNEL_OFFSET );
	727
	728	return (SBLOCK_PRIVATE(sbP));
	729	}
	730
	731
	732	#ifdef NFS_DEBUG
	733	/* These flags are defined in the kernel and control various cprintk
	734	calls. This provides us a way to easily turn these on/off for
	735	debugging our NFS support. */
	736	extern unsigned int nlm_debug;
	737	extern unsigned int nfsd_debug;
	738	extern unsigned int nfs_debug;
	739	extern unsigned int rpc_debug;
	740	#endif
	741
	742	int cxiTrace(cxiTrace_t trace)
	743	{
	744	#ifdef NFS_DEBUG
	745	int rc = 0;
	746
	747	ENTER(0);
	748	switch (trace)
	749	{
	750	case cxiTraceNFS:
	751	nlm_debug = nfsd_debug = nfs_debug = rpc_debug = ~0;
	752	break;
	753	case cxiTraceNFSoff:
	754	nlm_debug = nfsd_debug = nfs_debug = rpc_debug = 0;
	755	break;
	756	default:
	757	rc = EINVAL;
	758	break;
	759	}
	760	EXIT_RC(0, rc);
	761	return rc;
	762	#else
	763	return ENOSYS;
	764	#endif
	765	}
	766
	767	void cxiFlockToVFS(eflock_t* lckdatP, void* vP)
	768	{
	769	struct file_lock* flP = (struct file_lock *)vP;
	770
	771	ENTER(0);
	772	if ((flP) && (lckdatP))
	773	{
	774	flP->fl_pid = lckdatP->l_pid;
	775	flP->fl_owner = lckdatP->l_owner;
	776	flP->fl_type = lckdatP->l_type;
	777	flP->fl_start = lckdatP->l_start;
	778	flP->fl_flags = FL_POSIX;
	779	#ifdef NFS_CLUSTER_LOCKS
	780	flP->fl_lmops = lckdatP->l_lmops;
	781	flP->fl_file = lckdatP->l_file;
	782	flP->fl_ops = NULL;
	783	#else
	784	#if LINUX_KERNEL_VERSION < 2061700
	785	if (lckdatP->l_caller == L_CALLER_LOCKD)
	786	flP->fl_flags \|= FL_LOCKD;
	787	#endif
	788	#endif
	789	if (lckdatP->l_len == 0)
	790	flP->fl_end = FL_OFFSET_MAX;
	791	else
	792	flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1;
	793	}
	794	EXIT(0);
	795	return;
	796	}
	797
	798	#ifdef NFS_CLUSTER_LOCKS
	799	int cxiVFSCallback(eflock_t* lckreqP, eflock_t* lckdatP,
	800	int(* callback)(void , void , int), int result)
	801	{
	802	struct file_lock fl;
	803	struct file *fileP;
	804	struct file_lock conf, *confP = NULL;
	805	int rc;
	806
	807	ENTER(0);
	808
	809	cxiFlockToVFS(lckreqP, &fl);
	810	fileP = fl.fl_file;
	811	if (!fileP) {
	812	return -1;
	813	}
	814	if (lckdatP) {
	815	cxiFlockToVFS(lckdatP, &conf);
	816	confP = &conf;
	817	}
	818	if (!result) { /* try to get the posix lock */
	819	rc = POSIX_LOCK_FILE(fileP, &fl);
	820	if (rc)
	821	callback(&fl, NULL, EBUSY);
	822	else { /* got the posix lock */
	823	rc = callback(&fl, confP, result);
	824	if (rc) { /* too late, free the lock */
	825	fl.fl_type = F_UNLCK;
	826	rc = POSIX_LOCK_FILE(fileP, &fl);
	827	}
	828	}
	829	}
	830	else
	831	rc = callback(&fl, confP, result);
	832
	833	#ifdef NFS_CLUSTER_LOCKS
	834	if (rc < 0)
	835	rc = -rc; /* make it positive */
	836	#endif
	837	EXIT_RC(0, rc);
	838	return rc;
	839	}
	840	#endif
	841
	842	void cxiVFSToFlock(void vP, eflock_t lckdatP)
	843	{
	844	struct file_lock* flP = (struct file_lock *)vP;
	845
	846	ENTER(0);
	847	if ((flP) && (lckdatP))
	848	{
	849	lckdatP->l_pid = flP->fl_pid;
	850	lckdatP->l_owner = flP->fl_owner;
	851	lckdatP->l_type = flP->fl_type;
	852	lckdatP->l_start = flP->fl_start;
	853	lckdatP->l_flags = flP->fl_flags;
	854	#ifdef NFS_CLUSTER_LOCKS
	855	lckdatP->l_lmops = flP->fl_lmops;
	856	lckdatP->l_file = flP->fl_file;
	857	if (lckdatP->l_lmops) /* must be lockd or nfsd */
	858	#else
	859	#if LINUX_KERNEL_VERSION >= 2061700
	860	if (lckdatP->l_lmops) /* must be lockd or nfsd */
	861	#else
	862	if (flP->fl_flags & FL_LOCKD)
	863	#endif
	864	#endif
	865	lckdatP->l_caller = L_CALLER_LOCKD;
	866	else
	867	lckdatP->l_caller = L_CALLER_NULL;
	868	if (flP->fl_end == FL_OFFSET_MAX)
	869	lckdatP->l_len = 0;
	870	else
	871	lckdatP->l_len = flP->fl_end - flP->fl_start + 1;
	872	}
	873	EXIT(0);
	874	return;
	875	}
	876
	877
	878	/* Sleep for the indicated number of milliseconds */
	879	void cxiSleep(int ms)
	880	{
	881	ENTER(0);
	882	TRACE1(TRACE_VNODE, 9, TRCID_SLEEP,
	883	"cxiSleep: begin delay %d\n", ms);
	884	current->state = TASK_INTERRUPTIBLE;
	885	/* For large HZ rearrange jiffies calculation and
	886	use presumably larger word size to minimize overflow risk */
	887	if (unlikely(HZ > 1000))
	888	schedule_timeout(((long)ms)*HZ/1000);
	889	else
	890	schedule_timeout(ms/(1000/HZ));
	891	TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END,
	892	"cxiSleep: end delay %d HZ %d\n", ms, HZ);
	893	EXIT(0);
	894	}
	895
	896
	897	void cxiOpenNFS(void *iP)
	898	{
	899	struct inode inodeP = (struct inode )iP;
	900	int refcount;
	901
	902	/* A reference is placed on the cxiNode here when the first NFS reference
	903	is added */
	904	ENTER(0);
	905	refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1);
	906
	907	TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS,
	908	"openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX "
	909	"refcount %d\n",
	910	inodeP, (inodeP) ? inodeP->i_ino : -1,
	911	(inodeP) ? inodeP->i_ino : -1,
	912	(inodeP) ? inodeP->i_mode : -1,
	913	(inodeP) ? inodeP->i_nlink : -1,
	914	(inodeP) ? inodeP->PRVINODE : NULL,
	915	refcount);
	916
	917	DBGASSERT(refcount != 0);
	918	EXIT(0);
	919	}
	920
	921
	922	int cxiCloseNFS(void vP, void viP)
	923	{
	924	int rc;
	925	struct inode iP = (struct inode )vP;
	926
	927	/* If viP is NULL, the file was never actually opened.
	928	If viP is not NULL, close it. */
	929	ENTER(0);
	930	if (viP == NULL)
	931	rc = 0;
	932	else {
	933	if (VP_TO_PVP(iP) != NULL && VP_TO_CNP(iP) != NULL) {
	934	rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD\|FWRITE,
	935	(struct MMFSVInfo *)viP, true);
	936	cxiPutOSNode((void *)iP);
	937	}
	938	}
	939
	940	EXIT_RC(0, rc);
	941	return rc;
	942	}
	943
	944	static int cxiNFSCluster = 0;
	945
	946	void cxiSetNFSCluster(int set)
	947	{
	948	cxiNFSCluster = set;
	949	}
	950
	951	/* To avoid failing the NFS client the NFSD thread is put to sleep. Another
	952	node will takeover this client and the operation will continue without any
	953	errors to the application.
	954	*/
	955	void cxiNFSError(int rc, const char *str)
	956	{
	957	TRACE2(TRACE_VNODE, 9, TRCID_NFS_ERROR,
	958	"cxiNFSError: %s got rc %d\n", str, rc);
	959	if (cxiNFSCluster && cxiIsNFSThread() && (rc == ESTALE \|\| rc == -ESTALE))
	960	{
	961	TRACE2(TRACE_VNODE, 1, TRCID_NFS_ERROR_1,
	962	"cxiNFSError: NFS got error %d from %s sleep\n", rc, str);
	963	cxiSleep(120000); // wait 120 seconds
	964	}
	965	}
	966
	967	void * cxiGetNfsP(void *vP)
	968	{
	969	if (vP && VP_TO_CNP((struct inode *)vP))
	970	return VP_TO_NFSP((struct inode *)vP);
	971	else
	972	return NULL;
	973	}
	974
	975	void cxiSetNfsP(void vP, void newP)
	976	{
	977	if (VP_TO_CNP((struct inode *)vP))
	978	VP_TO_NFSP((struct inode *)vP) = newP;
	979	}
	980
	981	void * cxiGetCnP(void *vP)
	982	{ return (void )VP_TO_CNP((struct inode )vP); }
	983
	984	void * cxiGetPvP(void *vP)
	985	{ return (void )VP_TO_PVP((struct inode )vP); }
	986
	987	void * cxiGNPtoVP(void *vP)
	988	{ return (void )GNP_TO_VP((struct cxiNode_t )vP); }
	989
	990	/* Main routine of kproc */
	991	static int kprocMain(void *argP)
	992	{
	993	cxiKProcData_t kpdP = (cxiKProcData_t )argP;
	994
	995	/* Change our process name */
	996	ENTER(0);
	997	current->comm[sizeof(current->comm) - 1] = '\0';
	998	strncpy(current->comm, kpdP->nameP, sizeof(current->comm) - 1);
	999
	1000	/* Change parent of a kernel process so that when it exits, it won't
	1001	* send a SIGCHLD signal to the process that created it, and it won't
	1002	* be left as a zombie.
	1003	*/
	1004	DAEMONIZE(kpdP->nameP);
	1005
	1006	/* Call the function specified by startKProc */
	1007	kpdP->func(kpdP);
	1008	EXIT(0);
	1009	return 0;
	1010	}
	1011
	1012	/* Create a new kernel process */
	1013	cxiPid_t
	1014	cxiStartKProc(struct cxiKProcData_t *kpdP)
	1015	{
	1016	cxiPid_t pid = kernel_thread(kprocMain, kpdP, kpdP->kprocFlags);
	1017	ENTER(0);
	1018	kpdP->pid = pid > 0 ? pid : KPROC_FAILED_PID;
	1019
	1020	TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX,
	1021	"cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid);
	1022	EXIT(0);
	1023	return kpdP->pid;
	1024	}
	1025
	1026	void
	1027	cxiStopKProc(struct cxiKProcData_t *kpdP)
	1028	{
	1029	cxiPid_t pid;
	1030
	1031	ENTER(0);
	1032	cxiBlockingMutexAcquire(&kpdP->lock);
	1033
	1034	TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX,
	1035	"cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid);
	1036
	1037	if (!KPROC_RUNNING(kpdP))
	1038	{
	1039	cxiBlockingMutexRelease(&kpdP->lock);
	1040	EXIT(0);
	1041	return;
	1042	}
	1043
	1044	pid = kpdP->pid; // Cache pid before signal/wait
	1045	kpdP->terminate = true;
	1046	cxiWaitEventSignal(&kpdP->kprocEvent);
	1047
	1048	while (kpdP->pid != KPROC_UNASSIGNED_PID)
	1049	cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);
	1050
	1051	cxiBlockingMutexRelease(&kpdP->lock);
	1052	EXIT(0);
	1053	}
	1054
	1055	/*-------------------------------------------------------------------
	1056	* logAssertFailed - Subroutine consolidating logGenIF() and
	1057	* DoPanic() calls.
	1058	------------------------------------------------------------------/
	1059
	1060	static char PanicMsgBuf[2048];
	1061
	1062	void cxiPanic(const char* panicStrP)
	1063	{
	1064	printk( GPFS_NOTICE "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP);
	1065	TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP);
	1066	#ifndef DISABLE_KERNEL_PANIC
	1067	BUG();
	1068	#endif
	1069	}
	1070
	1071	static void
	1072	DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode,
	1073	Int32 reasonCode, char *dataStr)
	1074	{
	1075	const char *p;
	1076	int bytesLeft;
	1077
	1078	p = cxiStrrchr(filenameP, '/');
	1079	if (p == NULL)
	1080	p = filenameP;
	1081	else
	1082	p += 1;
	1083
	1084	sprintf(PanicMsgBuf, "%s:%d:%d:%d:", p, lineNum, retCode, reasonCode);
	1085	bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
	1086	if (dataStr)
	1087	{
	1088	strncat(PanicMsgBuf, dataStr, bytesLeft-1);
	1089	bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
	1090	}
	1091	strncat(PanicMsgBuf, ":", bytesLeft-1);
	1092	bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
	1093	if (condP)
	1094	strncat(PanicMsgBuf, condP, bytesLeft-1);
	1095	cxiPanic(PanicMsgBuf);
	1096	}
	1097
	1098	#ifdef MODULE
	1099	void
	1100	logAssertFailed(UInt32 flags, /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */
	1101	char srcFileName, / __FILE__ */
	1102	UInt32 srcLineNumber, /* __LINE__ */
	1103	Int32 retCode, /* return code value */
	1104	Int32 reasonCode, /* normally errno */
	1105	UInt32 logRecTag, /* tag if have associated error log rec */
	1106	char dataStr, / assert data string */
	1107	char failingExpr) / expression that evaluated to false */
	1108	{
	1109	int i;
	1110
	1111	printk("GPFS logAssertFailed: %s file %s line %d\n",
	1112	failingExpr, srcFileName, srcLineNumber);
	1113	ENTER(0);
	1114	TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1,
	1115	"logAssertFailed: %s retCode %d reasonCode %d\n",
	1116	failingExpr, retCode, reasonCode);
	1117	TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2,
	1118	"logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber);
	1119	#ifndef GPFS_PRINTF
	1120	/* fsync buffered lxtrace records */
	1121	trc_fsync();
	1122
	1123	#ifdef STOP_TRACE_ON_FAILURE
	1124	/* Turn off tracing right after the failure occurs. This may only turn
	1125	off tracing in the kernel. */
	1126	for (i=0 ; i<MAX_TRACE_CLASSES ; i++)
	1127	TraceFlagsP[i] = 0;
	1128	#endif
	1129
	1130	/* Wait 10 seconds to allow the lxtrace daemon to complete the sync. */
	1131	cxiSleep(10000);
	1132	#endif
	1133	gpfs_ops.gpfsDaemonToDie(srcFileName, srcLineNumber, retCode, reasonCode,
	1134	dataStr, failingExpr);
	1135
	1136	DoPanic(failingExpr, srcFileName, srcLineNumber, retCode, reasonCode,
	1137	dataStr);
	1138	}
	1139	#else /* !MODULE */
	1140	void
	1141	logAssertFailed(UInt32 flags,
	1142	char *srcFileName,
	1143	UInt32 srcLineNumber,
	1144	Int32 retCode,
	1145	Int32 reasonCode,
	1146	UInt32 logRecTag,
	1147	char *dataStr,
	1148	char *failingExpr);
	1149	#endif /* MODULE */
	1150
	1151
	1152	typedef struct cxiWaitElement_t
	1153	{
	1154	cxiWaitList_t waitList; /* previous and next element in chain */
	1155
	1156	/* Linux would normally organize a wait_queue_head_t with any number
	1157	* of wait_queue_t elements. However since we're implementing "wakeup
	1158	* with return code" we have to ensure the OS wakes up the exact sleeper
	1159	* we want. Thus we have only a one to one relationship to ensure the
	1160	* OS can only pick our favorite.
	1161	*/
	1162	wait_queue_head_t qhead;
	1163	wait_queue_t qwaiter;
	1164	int wakeupRC; /* wakeup return code */
	1165
	1166	} cxiWaitElement_t;
	1167
	1168
	1169	#define CXI_WAIT_LIST_ADD(headP, elementP) \
	1170	(headP)->prevP->nextP = (elementP); \
	1171	(elementP)->prevP = (headP)->prevP; \
	1172	(headP)->prevP = (elementP); \
	1173	(elementP)->nextP = (headP);
	1174
	1175	#define CXI_WAIT_LIST_REMOVE(elementP) \
	1176	(elementP)->prevP->nextP = (elementP)->nextP; \
	1177	(elementP)->nextP->prevP = (elementP)->prevP;
	1178
	1179
	1180	/* Initialize abstract wait event with OS specific
	1181	* initialization function
	1182	*/
	1183	void
	1184	cxiWaitEventInit(cxiWaitEvent_t *weP)
	1185	{
	1186	spinlock_t lockP = (spinlock_t )&weP->lword;
	1187
	1188	spin_lock_init(lockP);
	1189	weP->waitList.nextP = weP->waitList.prevP = &weP->waitList;
	1190	}
	1191
	1192	Boolean
	1193	cxiWaitEventHasWaiters(cxiWaitEvent_t *weP)
	1194	{
	1195	unsigned long flags;
	1196	spinlock_t lockP = (spinlock_t )(weP->lword);
	1197	Boolean rc;
	1198
	1199	SPIN_LOCK_IRQ(lockP, flags);
	1200	rc = (weP->waitList.nextP != &weP->waitList);
	1201	SPIN_UNLOCK_IRQ(lockP, flags);
	1202	return rc;
	1203	}
	1204
	1205	/* Do not add trace records. Some callers depend on not being
	1206	* interrupted by the trace daemon.
	1207	*/
	1208	enum WakeType { wBroadcast, wSignal, wWakeOne };
	1209	static inline void
	1210	doWakeup(cxiWaitEvent_t *wEventP, enum WakeType wtype, int wakeupRC)
	1211	{
	1212	unsigned long flags;
	1213	spinlock_t lockP = (spinlock_t )(wEventP->lword);
	1214	cxiWaitList_t *headP;
	1215	cxiWaitList_t *tmpP;
	1216	cxiWaitElement_t *wP;
	1217
	1218	SPIN_LOCK_IRQ(lockP, flags);
	1219
	1220	/* We wake up from the front back (FIFO semantics).
	1221	* There's only one wait element per wake_queue_head_t so
	1222	* record the return code and wake up the one element.
	1223	*/
	1224	headP = &wEventP->waitList;
	1225
	1226	for (tmpP = headP->nextP; tmpP != headP; tmpP = tmpP->nextP)
	1227	{
	1228	wP = list_entry(tmpP, cxiWaitElement_t, waitList);
	1229	wP->wakeupRC = wakeupRC;
	1230
	1231	wake_up(&wP->qhead);
	1232	if (wtype != wBroadcast)
	1233	{
	1234	/* The difference between wSignal and wWakeOne is that the latter
	1235	guarantees that multiple wake up calls will each pick a different
	1236	thread if more than one is waiting. With wSignal, if a thread is
	1237	awakened but hasn't had a chance to run, then subsequent wake up
	1238	calls might all wake the same thread.
	1239
	1240	On AIX, the calling routine (e_wakeup_one) removes the waiter from
	1241	the queue, unlike Linux where removal is done by the waiting
	1242	thread when it wakes up. Nothing special has to be done on AIX to
	1243	get the nWakeOne style of wakeup.
	1244
	1245	Note: This is an inline routine and the wType argument is a
	1246	compile-time constant, so the "if" tests in this routine are done
	1247	by the compiler and do not generate any code. */
	1248
	1249	if (wtype == wWakeOne)
	1250	{
	1251	/* Move this entry to tail of list so that the next wakeup call will
	1252	pick somebody else. */
	1253	CXI_WAIT_LIST_REMOVE(tmpP);
	1254	CXI_WAIT_LIST_ADD(headP, tmpP);
	1255	}
	1256	break;
	1257	}
	1258	}
	1259	SPIN_UNLOCK_IRQ(lockP, flags);
	1260	}
	1261
	1262	int
	1263	cxiCopyIn(char from, char to, unsigned long size)
	1264	{
	1265	/* The daemon needs to bypass access checks since copy to
	1266	* shared segment would inadvertantly fail.
	1267	*/
	1268	ENTER(0);
	1269	if (PROCESS_GROUP(current) == DaemonPGrp)
	1270	__copy_from_user(to, from, size);
	1271	else
	1272	if (copy_from_user(to, from, size))
	1273	{
	1274	EXIT_RC(0, EFAULT);
	1275	return EFAULT;
	1276	}
	1277	EXIT(0);
	1278	return 0;
	1279	}
	1280
	1281	int
	1282	cxiCopyOut(char from, char to, unsigned long size)
	1283	{
	1284	int ignore;
	1285	/* The daemon needs to bypass access checks since copy to
	1286	* shared segment would inadvertantly fail.
	1287	*/
	1288	ENTER(0);
	1289	if (PROCESS_GROUP(current) == DaemonPGrp)
	1290	ignore = __copy_to_user(to, from, size);
	1291	else
	1292	if (copy_to_user(to, from, size))
	1293	{
	1294	EXIT_RC(0, EFAULT);
	1295	return EFAULT;
	1296	}
	1297	EXIT(0);
	1298	return 0;
	1299	}
	1300
	1301	int
	1302	cxiCopyInstr(char from, char to, unsigned long size, unsigned long *len)
	1303	{
	1304	long retval;
	1305
	1306	ENTER(0);
	1307	retval = strncpy_from_user(to, from, size);
	1308	if ((retval > 0) && (retval <= size))
	1309	{
	1310	*len = retval;
	1311	EXIT(0);
	1312	return 0;
	1313	}
	1314	*len = 0;
	1315	if (retval < 0)
	1316	retval = EFAULT;
	1317	else
	1318	retval = E2BIG;
	1319	EXIT_RC(0, retval);
	1320	return (int)retval;
	1321	}
	1322
	1323	long cxiSafeGetLong(long* from)
	1324	{
	1325	#if LINUX_KERNEL_VERSION >= 2060000
	1326	long tmp;
	1327	(void)__get_user_nocheck(tmp, from, sizeof(long));
	1328	return tmp;
	1329	#else
	1330	return *from;
	1331	#endif
	1332	}
	1333
	1334	int cxiSafeGetInt(int* from)
	1335	{
	1336	#if LINUX_KERNEL_VERSION >= 2060000
	1337	int tmp;
	1338	__get_user_nocheck(tmp, from, sizeof(int));
	1339	return tmp;
	1340	#else
	1341	return *from;
	1342	#endif
	1343	}
	1344
	1345	void cxiSafePutLong(long val, long* to)
	1346	{
	1347	#if LINUX_KERNEL_VERSION >= 2060000
	1348	__put_user_nocheck(val, to, sizeof(long));
	1349	#else
	1350	*to = val;
	1351	#endif
	1352	}
	1353
	1354	void cxiSafePutInt(int val, int* to)
	1355	{
	1356	#if LINUX_KERNEL_VERSION >= 2060000
	1357	__put_user_nocheck(val, to, sizeof(int));
	1358	#else
	1359	*to = val;
	1360	#endif
	1361	}
	1362
	1363	#ifdef GPFS_ARCH_X86_64
	1364	/* Check if 64-bit user process */
	1365	int
	1366	cxiIS64U(char *addr)
	1367	{
	1368	#if LINUX_KERNEL_VERSION > 2060500
	1369	return !(test_thread_flag(TIF_IA32));
	1370	#else
	1371	return !(current->thread.flags & THREAD_IA32);
	1372	#endif
	1373	}
	1374	#endif
	1375
	1376	int
	1377	socket_aio_dequeue()
	1378	{
	1379	return -1;
	1380	}
	1381
	1382	/* Transfer data from buffer(s) in user space to or from a buffer in the
	1383	kernel. */
	1384	int
	1385	cxiUiomove(register char* kBufP, /* address of kernel buffer */
	1386	register unsigned long nBytes, /* #bytes to transfer */
	1387	Boolean toKernel, /* direction of xfer(read/write)*/
	1388	register struct cxiUio_t* uioP) /* user area description */
	1389	{
	1390	register struct cxiIovec_t * iovP;
	1391	unsigned long cnt;
	1392	int rc;
	1393	#ifdef TRACE_IO_DATA
	1394	char* origKBufP = kBufP;
	1395	int trcdata[4];
	1396	#endif
	1397	int ignore;
	1398
	1399	ENTER(0);
	1400	TRACE4(TRACE_FOPS, 6, TRCID_CXISYSTEM_037,
	1401	"cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n",
	1402	kBufP, uioP, nBytes, toKernel);
	1403	if (uioP->uio_resid <= 0)
	1404	{
	1405	EXIT_RC(0, ENOMEM);
	1406	return ENOMEM;
	1407	}
	1408	rc = 0;
	1409	if (uioP->uio_iovcnt == 1)
	1410	{
	1411	/*
	1412	* Fastpath for most common case of iovcnt == 1. Saves a
	1413	* few instructions.
	1414	*/
	1415	iovP = uioP->uio_iov;
	1416	cnt = iovP->iov_len;
	1417	if (cnt <= 0)
	1418	{
	1419	uioP->uio_iovcnt--;
	1420	uioP->uio_iov++;
	1421	uioP->uio_iovdcnt++;
	1422	EXIT(0);
	1423	return 0;
	1424	}
	1425	if (cnt > nBytes)
	1426	cnt = nBytes;
	1427
	1428	if (toKernel)
	1429	{
	1430	/* The daemon needs to bypass access checks since copy to
	1431	* shared segment would inadvertantly fail. Copies to
	1432	* kernel address space also perform no validity check.
	1433	*/
	1434	if (PROCESS_GROUP(current) == DaemonPGrp \|\| uioP->uio_segflg == UIO_SYSSPACE)
	1435	__copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
	1436	else
	1437	if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
	1438	{
	1439	EXIT_RC(0, EFAULT);
	1440	return EFAULT;
	1441	}
	1442	}
	1443	else
	1444	{
	1445	int spam;
	1446	/* The daemon needs to bypass access checks since copy to
	1447	* shared segment would inadvertantly fail. Copies to
	1448	* kernel address space also perform no validity check.
	1449	*/
	1450	if (PROCESS_GROUP(current) == DaemonPGrp \|\| uioP->uio_segflg == UIO_SYSSPACE)
	1451	ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
	1452	else
	1453	if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
	1454	{
	1455	EXIT_RC(0, EFAULT);
	1456	return EFAULT;
	1457	}
	1458	}
	1459
	1460	iovP->iov_base = (char *)iovP->iov_base + cnt;
	1461	iovP->iov_len -= cnt;
	1462	uioP->uio_resid -= cnt;
	1463	uioP->uio_offset += cnt;
	1464	#ifdef TRACE_IO_DATA
	1465	if (cnt >= sizeof(trcdata))
	1466	memcpy(trcdata, origKBufP, sizeof(trcdata));
	1467	else
	1468	{
	1469	memset(trcdata, 0xAA, sizeof(trcdata));
	1470	memcpy(trcdata, origKBufP, cnt);
	1471	}
	1472	TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_039a,
	1473	"uiomove exit 1: rc %d data %08X %08X %08X %08X\n",
	1474	rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
	1475	#else
	1476	TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_039,
	1477	"uiomove exit 1: rc %d\n",
	1478	rc);
	1479	#endif
	1480	EXIT_RC(0, rc);
	1481	return rc;
	1482	}
	1483	while (nBytes > 0 && uioP->uio_resid && rc == 0)
	1484	{
	1485	if (uioP->uio_iovcnt <= 0)
	1486	{
	1487	EXIT_RC(0, ENOMEM);
	1488	return ENOMEM;
	1489	}
	1490	iovP = uioP->uio_iov;
	1491	cnt = iovP->iov_len;
	1492	if (cnt <= 0)
	1493	{
	1494	uioP->uio_iovcnt--;
	1495	uioP->uio_iov++;
	1496	uioP->uio_iovdcnt++;
	1497	continue;
	1498	}
	1499	if (cnt > nBytes)
	1500	cnt = nBytes;
	1501
	1502	if (toKernel)
	1503	{
	1504	/* The daemon needs to bypass access checks since copy to
	1505	* shared segment would inadvertantly fail. Copies to
	1506	* kernel address space also perform no validity check.
	1507	*/
	1508	if (PROCESS_GROUP(current) == DaemonPGrp \|\| uioP->uio_segflg == UIO_SYSSPACE)
	1509	__copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
	1510	else
	1511	if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
	1512	{
	1513	EXIT_RC(0, EFAULT);
	1514	return EFAULT;
	1515	}
	1516	}
	1517	else
	1518	{
	1519	/* The daemon needs to bypass access checks since copy to
	1520	* shared segment would inadvertantly fail. Copies to
	1521	* kernel address space also perform no validity check.
	1522	*/
	1523	if (PROCESS_GROUP(current) == DaemonPGrp \|\| uioP->uio_segflg == UIO_SYSSPACE)
	1524	ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
	1525	else
	1526	if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
	1527	{
	1528	EXIT_RC(0, EFAULT);
	1529	return EFAULT;
	1530	}
	1531	}
	1532	iovP->iov_base = (char *)iovP->iov_base + cnt;
	1533	iovP->iov_len -= cnt;
	1534	uioP->uio_resid -= cnt;
	1535	uioP->uio_offset += cnt;
	1536	kBufP += cnt;
	1537	nBytes -= cnt;
	1538	}
	1539	#ifdef TRACE_IO_DATA
	1540	cnt = kBufP - origKBufP;
	1541	if (cnt >= sizeof(trcdata))
	1542	memcpy(trcdata, origKBufP, sizeof(trcdata));
	1543	else
	1544	{
	1545	memset(trcdata, 0xAA, sizeof(trcdata));
	1546	memcpy(trcdata, origKBufP, cnt);
	1547	}
	1548	TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_041a,
	1549	"uiomove exit 2: rc %d data %08X %08X %08X %08X\n",
	1550	rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
	1551	#else
	1552	TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_041,
	1553	"uiomove exit 2: rc %d\n",
	1554	rc);
	1555	#endif
	1556	EXIT_RC(0, rc);
	1557	return rc;
	1558	}
	1559
	1560	/*
	1561	Try to force some sanity checks at compile type
	1562	*/
	1563	/* TO DO: revise this to handle comparisons beyond equality/inequality */
	1564	/* STATIC_DBGASSERT(sizeof(spinlock_t), SPINLOCK_T_SIZE); */
	1565
	1566	/* A routine to check that the definitions in our cxiTypes.h
	1567	* files are equivalent to the system definitions. The module
	1568	* should not load if it receives an error from this routine.
	1569	*/
	1570	int
	1571	cxiCheckTypes()
	1572	{
	1573	int rc = 0;
	1574	ENTER(0);
	1575
	1576	/* Make sure cxiBlockingMutex_t fits in the space provided. If not,
	1577	the implementation of the cxiBlockingMutex... routines needs to
	1578	use the embedded space to record a pointer to kmalloc'ed space holding
	1579	the semaphore. */
	1580	if (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE)
	1581	{
	1582	printk("cxiCheckTypes: semaphore %ld > GPFS_LINUX_SEM_SIZE %ld\n",
	1583	sizeof(struct semaphore), GPFS_LINUX_SEM_SIZE);
	1584	rc = 1;
	1585	}
	1586
	1587	/* Size of spinlock_t is smaller for UP case with gcc 3.x, so just
	1588	insure SPINLOCK_T_SIZE is large enough for both the UP and SMP case. */
	1589	if (sizeof(spinlock_t) > SPINLOCK_T_SIZE)
	1590	{
	1591	printk("cxiCheckTypes: spinlock_t %ld > SPINLOCK_T__SIZE %ld\n",
	1592	sizeof(spinlock_t), SPINLOCK_T_SIZE);
	1593	rc = 2;
	1594	}
	1595
	1596	/* Ensure that size of pid_t matches cxiThreadId (32-bits) */
	1597	if (sizeof(pid_t) != sizeof(cxiThreadId))
	1598	{
	1599	printk("cxiCheckTypes: pid_t %ld != cxiThreadId %ld\n",
	1600	sizeof(pid_t), sizeof(cxiThreadId));
	1601	rc = 3;
	1602	}
	1603
	1604	if (rc > 0)
	1605	TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES,
	1606	"cxiCheckTypes: system type mismatch on type number %d!\n", rc);
	1607	EXIT_RC(0, rc);
	1608	return rc;
	1609	}
	1610
	1611	/* Routine to get current time of day in nanosecond format.
	1612	*/
	1613	int
	1614	cxiGetTOD(cxiTimeStruc_t *tsP)
	1615	{
	1616	#if LINUX_KERNEL_VERSION >= 2060000
	1617	struct timespec ts;
	1618	#else
	1619	struct timeval tv;
	1620	#endif
	1621
	1622	ENTER(0);
	1623	#if LINUX_KERNEL_VERSION >= 2060000
	1624	ts = CURRENT_TIME;
	1625	tsP->tv_sec = ts.tv_sec;
	1626	tsP->tv_nsec = ts.tv_nsec;
	1627	#else
	1628	/* This call returns microseconds so we fudge it to nanoseconds */
	1629	do_gettimeofday(&tv);
	1630	tsP->tv_sec = tv.tv_sec;
	1631	tsP->tv_nsec = tv.tv_usec * 1000;
	1632	#endif
	1633
	1634	EXIT(0);
	1635	return 0;
	1636	}
	1637
	1638	Boolean
	1639	cxiIsNFSThread()
	1640	{
	1641	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1642	/* Note comparison against a multibyte character constant (not a string
	1643	constant). Order of characters in word is reversed due to little-
	1644	endian representation of integers. */
	1645	if (* ((int*)&current->comm[0]) != 0x6473666e) // 'dsfn'
	1646	return false;
	1647	if (* ((char*)&current->comm[4]) == '\0')
	1648	return true;
	1649	return (* ((int*)&current->comm[2]) == 0x00346473); // '4ds'
	1650	# else
	1651	if ((strcmp(current->comm, "nfsd") == 0) \|\|
	1652	(strcmp(current->comm, "nfsd4") == 0))
	1653	return true;
	1654	return false;
	1655	# endif
	1656	}
	1657
	1658	Boolean
	1659	cxiIsLockdThread()
	1660	{
	1661	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1662	/* Note comparison against a multibyte character constant (not a string
	1663	constant). Order of characters in word is reversed due to little-
	1664	endian representation of integers. */
	1665	if ((* ((int*)&current->comm[0]) != 0x6b636f6c) \| // 'kcol'
	1666	(* ((int*)&current->comm[2]) != 0x00646b63)); // ' dkc'
	1667	return false;
	1668	return * ((char*)&current->comm[5]) == '\0';
	1669	# else
	1670	return (strcmp(current->comm, "lockd") == 0);
	1671	# endif
	1672	}
	1673
	1674	Boolean
	1675	cxiIsNFS4Thread()
	1676	{
	1677	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1678	/* Note comparison against a multibyte character constant (not a string
	1679	constant). Order of characters in word is reversed due to little-
	1680	endian representation of integers. */
	1681	if ((* ((int*)&current->comm[0]) != 0x6473666e) \| // 'dsfn'
	1682	(* ((int*)&current->comm[2]) != 0x00346473)); // '4ds'
	1683	return false;
	1684	return * ((char*)&current->comm[5]) == '\0';
	1685	# else
	1686	return (strcmp(current->comm, "nfsd4") == 0);
	1687	# endif
	1688	}
	1689
	1690	Boolean
	1691	cxiIsKupdateThread()
	1692	{
	1693	#if LINUX_KERNEL_VERSION >= 2060000
	1694	/* In 2.6 pdflush replaced kupdated and bdflush from 2.4 */
	1695	return current_is_pdflush();
	1696	#else
	1697	return (strcmp(current->comm, "kupdated") == 0);
	1698	#endif
	1699	}
	1700
	1701	#ifdef SMB_LOCKS
	1702	Boolean
	1703	cxiIsSambaOrLockdThread()
	1704	{
	1705	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1706	/* Note comparison against a multibyte character constant (not a string
	1707	constant). Order of characters in word is reversed due to little-
	1708	endian representation of integers. */
	1709	Boolean rc = (((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms'
	1710	(* ((char*)&current->comm[4]) == '\0')) \|
	1711	((* ((int*)&current->comm[0]) == 0x6b636f6c) & // 'kcol'
	1712	(* ((int*)&current->comm[2]) == 0x00646b63))); // 'dkc'
	1713	return rc;
	1714	# else
	1715	return ((strcmp(current->comm, "smbd") == 0) \|
	1716	(strcmp(current->comm, "lockd") == 0));
	1717	# endif
	1718	}
	1719
	1720	Boolean
	1721	cxiIsSambaThread()
	1722	{
	1723	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1724	/* Note comparison against a multibyte character constant (not a string
	1725	constant). Order of characters in word is reversed due to little-
	1726	endian representation of integers. */
	1727	Boolean rc = ((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms'
	1728	(* ((char*)&current->comm[4]) == '\0'));
	1729	return rc;
	1730	# else
	1731	return (strcmp(current->comm, "smbd") == 0);
	1732	# endif
	1733	}
	1734	#endif
	1735
	1736	Boolean
	1737	cxiIsGPFSThread()
	1738	{
	1739	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1740	return (((* ((int*)&current->comm[0]) == 0x73666d6d) & // 'sfmm'
	1741	(* ((int*)&current->comm[2]) == 0x00647366))); // 'dsf'
	1742	# else
	1743	return (strcmp(current->comm, "mmfsd") == 0);
	1744	# endif
	1745	}
	1746
	1747	Boolean
	1748	cxiIsKswapdThread()
	1749	{
	1750	#if LINUX_KERNEL_VERSION > 2060000
	1751	/* On 2.6, there may be multiple kswapd processes, named kswapd0, kswapd1,
	1752	* etc. We don't have to depend on the process name to identify kswapd
	1753	* processes on 2.6 though, there's a better way. */
	1754	return current_is_kswapd();
	1755	#else
	1756	# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
	1757	return ((* ((int*)&current->comm[0]) == 0x6177736b) & // 'awsk'
	1758	(* ((int*)&current->comm[3]) == 0x00647061)); // ' dpa'
	1759	# else
	1760	return (strcmp(current->comm, "kswapd") == 0);
	1761	# endif
	1762	#endif
	1763	}
	1764
	1765	#ifdef INSTRUMENT_LOCKS
	1766	void InitBlockingMutexStats()
	1767	{
	1768	memset(BlockingMutexStatsTable, 0, sizeof(BlockingMutexStatsTable));
	1769	}
	1770	#endif
	1771
	1772	/* Initialize a cxiBlockingMutex_t. Instead of the DBGASSERT, this routine
	1773	should kmalloc a struct semaphore if bmSem is too small. */
	1774	void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx)
	1775	{
	1776	ENTER(0);
	1777	DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE);
	1778	#ifdef INSTRUMENT_LOCKS
	1779	DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES);
	1780	#endif /* INSTRUMENT_LOCKS */
	1781
	1782	TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT,
	1783	"cxiBlockingMutexInit: mP 0x%lX idx %d\n",
	1784	mP, bmNameIdx);
	1785	init_MUTEX((struct semaphore *)mP->bmSem);
	1786	mP->bmOwnerP = NULL;
	1787	mP->lockNameIndex = bmNameIdx;
	1788	EXIT(0);
	1789	}
	1790
	1791
	1792	/* Enter critical section, blocking this thread if necessary. Mark this
	1793	thread as the owner of the mutex before returning. */
	1794	void
	1795	REGPARMS cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP)
	1796	{
	1797	ENTER(1);
	1798	TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ,
	1799	"cxiBlockingMutexAcquire: about to acquire 0x%lX type %d "
	1800	"current 0x%lX currentOwner 0x%lX\n",
	1801	mP, mP->lockNameIndex, current, mP->bmOwnerP);
	1802
	1803	DBGASSERTRC(mP->bmOwnerP != (char *)current,
	1804	PTR_TO_INT32(mP->bmOwnerP), PTR_TO_INT32(mP), 0);
	1805
	1806	#ifdef INSTRUMENT_LOCKS
	1807	BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires += 1;
	1808	if (mP->bmOwnerP != NULL)
	1809	BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts += 1;
	1810	#endif
	1811
	1812	down((struct semaphore *)mP->bmSem);
	1813	mP->bmOwnerP = (char *)current;
	1814
	1815	TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT,
	1816	"cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP);
	1817	EXIT(1);
	1818	}
	1819
	1820
	1821	/* Leave critical section and awaken waiting threads */
	1822	void
	1823	REGPARMS cxiBlockingMutexRelease(cxiBlockingMutex_t* mP)
	1824	{
	1825	ENTER(1);
	1826	TRACE4(TRACE_KLOCKL, 9, TRCID_BM_REL,
	1827	"cxiBlockingMutexRelease: about to release 0x%lX type %d "
	1828	"current 0x%lX currentOwner 0x%lX\n",
	1829	mP, mP->lockNameIndex,current, mP->bmOwnerP);
	1830
	1831	if (mP->bmOwnerP == (char *)current)
	1832	{
	1833	mP->bmOwnerP = NULL;
	1834	up((struct semaphore *)mP->bmSem);
	1835	}
	1836	EXIT(1);
	1837	}
	1838
	1839	/* Free resources associated with this cxiBlockingMutex_t in preparation
	1840	for freeing the storage it occupies */
	1841	void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP)
	1842	{
	1843	ENTER(0);
	1844	TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM,
	1845	"cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex);
	1846
	1847	/* Verify that mutex is not held */
	1848	DBGASSERT(mP->bmOwnerP == NULL);
	1849	DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1);
	1850	EXIT(0);
	1851	}
	1852
	1853
	1854	/* Return true if a cxiBlockingMutex_t is held by the calling process */
	1855	Boolean
	1856	cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP)
	1857	{
	1858	Boolean result;
	1859	char* ownerP;
	1860	cxiPid_t ownerPid;
	1861
	1862	/* Cache bmOwnerP is case it changes to NULL */
	1863	ENTER(0);
	1864	ownerP = mP->bmOwnerP;
	1865	if (ownerP == NULL)
	1866	result = false;
	1867	else
	1868	{
	1869	cxiThreadPtrToThreadId(ownerP, &ownerPid);
	1870	result = (current->pid == ownerPid);
	1871	}
	1872	TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017,
	1873	"cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n",
	1874	ownerP, result);
	1875	EXIT_RC(0, result);
	1876	return result;
	1877	}
	1878
	1879
	1880	/* Return true if a cxiBlockingMutex_t has one or more processes waiting
	1881	on it */
	1882	Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP)
	1883	{
	1884	struct semaphore * semP = (struct semaphore *)mP->bmSem;
	1885	Boolean result;
	1886
	1887	ENTER(0);
	1888	if ((void)semP->wait.task_list.next != (void)&semP->wait.task_list.next)
	1889	result = true;
	1890	else
	1891	result = false;
	1892	TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018,
	1893	"cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n",
	1894	mP, result);
	1895	EXIT_RC(0, result);
	1896	return result;
	1897	}
	1898
	1899
	1900	/* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or
	1901	cxiWaitEventBroadcastRC. Drop the associated cxiBlockingMutex_t
	1902	*mutexP while waiting, and reacquire it before returning.
	1903	If INTERRUPTIBLE is set in waitFlags, waits interruptibly;
	1904	otherwise, waits uninterruptibly.
	1905	Returns THREAD_INTERRUPTED if interrupted before being woken up,
	1906	THREAD_AWAKENED, if woken up by cxiWaitEventSignal or
	1907	cxiWaitEventBroadcast, or the result value passed to
	1908	cxiWaitEventWakeupResult, if woken up by cxiWaitEventWakeupResult. */
	1909	int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP,
	1910	int waitFlags)
	1911	{
	1912	spinlock_t lockP = (spinlock_t )(weP->lword);
	1913	unsigned long flags;
	1914	cxiWaitElement_t waitElement;
	1915	int count = 0;
	1916	Boolean done;
	1917
	1918	ENTER(0);
	1919	TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER,
	1920	"cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release "
	1921	"mutex 0x%lX \n", weP, waitFlags, mutexP);
	1922
	1923	/* Verify that caller is holding the mutex */
	1924	DBGASSERTRC(mutexP->bmOwnerP == (char *)current,
	1925	PTR_TO_INT32(mutexP->bmOwnerP), PTR_TO_INT32(mutexP), 0);
	1926
	1927	/* initialize our wait element */
	1928	init_waitqueue_head(&waitElement.qhead);
	1929	init_waitqueue_entry(&waitElement.qwaiter, current);
	1930	__add_wait_queue(&waitElement.qhead, &waitElement.qwaiter);
	1931	waitElement.wakeupRC = 0;
	1932
	1933	/* update our task state to not running any more */
	1934	if (waitFlags & INTERRUPTIBLE)
	1935	current->state = TASK_INTERRUPTIBLE;
	1936	else
	1937	current->state = TASK_UNINTERRUPTIBLE;
	1938
	1939	/* add our wait element to the end of the wait list */
	1940	SPIN_LOCK_IRQ(lockP, flags);
	1941
	1942	CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList);
	1943
	1944	SPIN_UNLOCK_IRQ(lockP, flags);
	1945
	1946	/* Release the mutex. Note: calling cxiBlockingMutexRelease here is
	1947	problematic, because it makes trace calls, which may block the current
	1948	process, which would overwrite the task state (current->state) we just
	1949	updated. A way around this would be to move out task state update to
	1950	after the call to cxiBlockingMutexRelease, but then, before calling
	1951	schedule(), we would have to re-acquire the wait-list lock and check
	1952	wakeupRC to see whether somebody has already woken us up since we
	1953	released the mutex. Since there is a trace at the top of this routine,
	1954	we don't need the one in cxiBlockingMutexRelease; hence, just do the
	1955	release right here. */
	1956	mutexP->bmOwnerP = NULL;
	1957	up((struct semaphore *)mutexP->bmSem);
	1958
	1959	again:
	1960	/* call the scheduler */
	1961	schedule();
	1962
	1963	/* Remove ourself from the wait list ... except:
	1964	Even though we may enter uninterrubtible sleep, this sleep can in
	1965	fact be interrupted in at least two scenarios:
	1966	1) page_alloc code may call wakeup_kswapd(). This should be
	1967	a very rare event with the current code, since we make an effort
	1968	to avoid blocking kswapd.
	1969	2) While signals are supposed to be ignored during uninterruptible
	1970	sleep, it turns out that some signals, e.g. SIGSEGV and SIGBUS,
	1971	cause us to wake up. It doesn't look like the signal has been
	1972	delivered yet, but sleep is interrupted. The signal will be
	1973	delivered later (probably when exiting kernel).
	1974	Our callers can't handle unexpected return from uninterruptible
	1975	sleep. In either of the two cases above, it should be safe to go
	1976	back to sleep and wait to be woken up properly.
	1977	*/
	1978	SPIN_LOCK_IRQ(lockP, flags);
	1979
	1980	if (waitElement.wakeupRC == 0 &&
	1981	!(waitFlags & INTERRUPTIBLE))
	1982	{
	1983	TRACE3N(TRACE_KLOCKL, 1, TRCID_CXISYSTEM_EVENT_WAIT_INTERRUPTED,
	1984	"cxiWaitEventWait: interrupted weP 0x%lX mutexP 0x%lX rc %d\n",
	1985	weP, mutexP, waitElement.wakeupRC);
	1986	current->state = TASK_UNINTERRUPTIBLE;
	1987	done = false;
	1988	}
	1989	else
	1990	{
	1991	CXI_WAIT_LIST_REMOVE(&waitElement.waitList);
	1992	done = true;
	1993	}
	1994
	1995	SPIN_UNLOCK_IRQ(lockP, flags);
	1996
	1997	if (!done)
	1998	goto again;
	1999
	2000	/* re-acquire the mutex */
	2001	cxiBlockingMutexAcquire(mutexP);
	2002
	2003	TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT,
	2004	"cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n",
	2005	weP, mutexP, waitElement.wakeupRC);
	2006
	2007	/* A zero wakeup code means we were interrupted rather than woken up */
	2008	EXIT(0);
	2009	if (waitElement.wakeupRC != 0)
	2010	return waitElement.wakeupRC;
	2011	else
	2012	return THREAD_INTERRUPTED;
	2013	}
	2014
	2015	/* Wake up one thread waiting on this cxiWaitEvent_t. Must not sleep */
	2016	void
	2017	cxiWaitEventSignal(cxiWaitEvent_t* weP)
	2018	{
	2019	/* ENTER(0); */
	2020	TRACE1N(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL,
	2021	"cxiWaitEventSignal: weP 0x%lX\n", weP);
	2022
	2023	doWakeup(weP, wSignal, THREAD_AWAKENED); /* wake up one */
	2024	/* EXIT(0); */
	2025	}
	2026
	2027
	2028	/* Wake up one thread waiting on this cxiWaitEvent_t. This is the same as
	2029	cxiWaitEventSignal(), except this routine guarantees that multiple wake
	2030	up calls will each pick a different thread if more than one is waiting. */
	2031	void
	2032	cxiWaitEventWakeupOne(cxiWaitEvent_t* weP)
	2033	{
	2034	ENTER(0);
	2035	TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE,
	2036	"cxiWaitEventWakeupOne: weP 0x%lX\n", weP);
	2037
	2038	doWakeup(weP, wWakeOne, THREAD_AWAKENED); /* wake up one */
	2039	EXIT(0);
	2040	}
	2041
	2042
	2043	/* Wake up all threads waiting on this cxiWaitEvent_t */
	2044	void
	2045	cxiWaitEventBroadcast(cxiWaitEvent_t* weP)
	2046	{
	2047	ENTER(0);
	2048	TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST,
	2049	"cxiWaitEventBroadcastRC: weP 0x%lX\n", weP);
	2050
	2051	doWakeup(weP, wBroadcast, THREAD_AWAKENED); /* wake up all */
	2052	EXIT(0);
	2053	}
	2054
	2055
	2056	/* Wake up all threads waiting on this cxiWaitEvent_t and cause them to
	2057	return rc from their cxiWaitEventWait calls. */
	2058	void
	2059	cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc)
	2060	{
	2061	ENTER(0);
	2062	TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC,
	2063	"cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc);
	2064
	2065	doWakeup(weP, wBroadcast, rc); /* wake up all */
	2066	EXIT_RC(0, rc);
	2067	}
	2068
	2069	/* alloc big memory area */
	2070	void *
	2071	cxiBigMalloc(int size)
	2072	{
	2073	void *ptr;
	2074
	2075	ENTER(0);
	2076	ptr = vmalloc(size);
	2077
	2078	#ifdef MALLOC_DEBUG
	2079	MallocDebugNew(ptr, size, 2);
	2080	#endif
	2081
	2082	EXIT(0);
	2083	return ptr;
	2084	}
	2085
	2086	/* free big memory area */
	2087	void
	2088	cxiBigFree(char *ptr)
	2089	{
	2090	ENTER(0);
	2091	#ifdef MALLOC_DEBUG
	2092	MallocDebugDelete(ptr);
	2093	#endif
	2094
	2095	EXIT(0);
	2096	vfree(ptr);
	2097	}
	2098
	2099	#ifdef SMB_LOCKS
	2100	/* Determine if current process has this file open */
	2101	void *
	2102	cxiCheckOpen(struct cxiNode_t* cnP)
	2103	{
	2104	int count;
	2105	int i;
	2106	struct file** fdList;
	2107	struct file* fileP;
	2108	struct inode* inodeP;
	2109
	2110	ENTER(0);
	2111	#if LINUX_KERNEL_VERSION >= 2061300
	2112	count = current->files->fdt->max_fds;
	2113	fdList = current->files->fdt->fd;
	2114	#else
	2115	count = current->files->max_fds;
	2116	fdList = current->files->fd;
	2117	#endif
	2118	inodeP = GNP_TO_VP(cnP);
	2119
	2120	TRACE3(TRACE_VNODE,9,TRCID_CXICHECKOPEN_ENTRY,
	2121	"cxiCheckOpen: entry. %d files in fd list. Checking for inode %d "
	2122	"at 0x%x", count, inodeP->i_ino, inodeP);
	2123
	2124	for (i=0; i<count; i++)
	2125	{
	2126	fileP = fdList[i];
	2127
	2128	if (fileP)
	2129	{
	2130	if (fdList[i]->f_dentry->d_inode == inodeP)
	2131	{
	2132	TRACE1(TRACE_VNODE, 9,TRCID_CXICHECKOPEN_FOUND,
	2133	"cxiCheckOpen: found open file. vinfoP 0x%x",
	2134	fileP->private_data);
	2135	EXIT(0);
	2136	return fileP->private_data;
	2137	}
	2138	}
	2139	}
	2140
	2141	EXIT(0);
	2142	return NULL;
	2143	}
	2144
	2145	int cxiBreakOplock(void *breakArgP, int oplockNew)
	2146	{
	2147	/* On Linux, we use its kernel oplock support. The get_lease()
	2148	* call is the operation to revoke conflicting leases.
	2149	*/
	2150	int rc;
	2151	ENTER(0);
	2152
	2153	/* O_NONBLOCK: prevents the thread from waiting for the lease return.
	2154	* In the case of a Samba thread, we only want to get EWOULDBLOCK
	2155	* back if the conflict is held within Samba iteself. If a wait is
	2156	* needed, breakSMBOplock will invoke cxiWaitForBreak.
	2157	*/
	2158
	2159	/* Linux op to revoke conflicting leases */
	2160	rc = abs(REVOKE_LEASE((struct inode *)breakArgP,
	2161	(cxiIsSambaThread()? 0: O_NONBLOCK) \|
	2162	((oplockNew==smbOplockShared)? FMODE_READ: FMODE_WRITE)));
	2163
	2164	TRACE3(TRACE_VNODE, 4,TRCID_CXIBREAKOPLOCK,
	2165	"cxiBreakOplock: exit rc %d inode 0x%lX oplock %d\n",
	2166	rc, breakArgP, oplockNew);
	2167
	2168	EXIT(0);
	2169	return rc;
	2170	}
	2171
	2172	DECLARE_WAIT_QUEUE_HEAD(oplock_break_queue);
	2173
	2174	/* No initialization required on Linux */
	2175	int cxiInitBreakQ() { return 0; }
	2176
	2177	/* No initialization required on Linux */
	2178	int cxiTermBreakQ() { return 0; }
	2179
	2180	/* Send the notification that the oplock break completed */
	2181	int cxiSendBreakMsg(void *ofP)
	2182	{
	2183	ENTER(0);
	2184	/* There is only one oplock_break_queue, and no means to pass the ofP back to
	2185	* the waiters. This will wake all of them up and they will recheck their
	2186	* oplock states and wait again if necessary (with a timeout).
	2187	*/
	2188	wake_up_interruptible(&oplock_break_queue);
	2189
	2190	TRACE1(TRACE_SMB, 3, TRCID_SEND_BREAK, "cxiSendBreakMsg: ofP 0x%lX\n", ofP);
	2191	EXIT(0);
	2192	return 0;
	2193	}
	2194
	2195	/* Suspend the caller until either the oplock break completes, or the timeout
	2196	* is reached.
	2197	*/
	2198	int cxiWaitForBreak(void *fileArgP, int oplockCurrent, int timeoutSeconds)
	2199	{
	2200	DECLARE_WAITQUEUE(wait, current);
	2201	signed long timeout;
	2202
	2203	ENTER(0);
	2204	TRACE3(TRACE_SMB, 5, TRCID_BREAKWAIT,
	2205	"cxiWaitForBreak: file 0x%lX, oplockCurrent %d timeoutSeconds %d\n",
	2206	fileArgP, oplockCurrent, timeoutSeconds);
	2207
	2208	add_wait_queue(&oplock_break_queue, &wait);
	2209	timeout = timeoutSeconds * HZ;
	2210	while (timeout > 0) {
	2211	set_current_state(TASK_INTERRUPTIBLE);
	2212	/* Check whether the oplock has been released or downgraded */
	2213	if (gpfs_ops.SMBGetOplockState(fileArgP) < oplockCurrent)
	2214	break;
	2215	timeout = schedule_timeout(timeout);
	2216	}
	2217	set_current_state(TASK_RUNNING);
	2218	remove_wait_queue(&oplock_break_queue, &wait);
	2219
	2220	TRACE0(TRACE_SMB, 5, TRCID_BREAKWAIT_EXIT,
	2221	"cxiWaitForBreak exit\n");
	2222
	2223	EXIT(0);
	2224	return 0;
	2225	}
	2226	#endif
	2227
	2228
	2229	/* Get the address of the first byte not addressible by processes */
	2230	UIntPtr cxiGetKernelBoundary()
	2231	{
	2232	return GPFS_KERNEL_OFFSET;
	2233	}
	2234
	2235
	2236	/* Return true if this process holds the big kernel lock (BKL) */
	2237	Boolean cxiHoldsBKL()
	2238	{
	2239	return current->lock_depth >= 0;
	2240	}
	2241
	2242
	2243	/* Tell the OS that this thread is involved in handling VM page-out
	2244	requests and should not be blocked waiting for page allocation.
	2245	Return true if successful. */
	2246	Boolean cxiSetPageoutThread()
	2247	{
	2248	if (current->flags & PF_MEMALLOC)
	2249	return false;
	2250	current->flags \|= PF_MEMALLOC;
	2251	return true;
	2252	}
	2253
	2254
	2255	/* Tell the OS that this thread is no longer involved in handling VM
	2256	page-out requests. */
	2257	void cxiClearPageoutThread()
	2258	{
	2259	current->flags &= ~PF_MEMALLOC;
	2260	}
	2261
	2262
	2263	/* Yield the CPU to allow other processes to run */
	2264	void
	2265	cxiYield()
	2266	{
	2267	ENTER(0);
	2268	schedule();
	2269	EXIT(0);
	2270	}
	2271
	2272	/* Linux filldir has changed signatures depending on kernel level.
	2273	* We always pass a 64bit offset from the GPFS layer.
	2274	*/
	2275	int
	2276	cxiFillDir(void vargP, const char nameP, int namelen,
	2277	offset_t offset, ino_t ino)
	2278	{
	2279	int result;
	2280	cxiFillDirArg_t fillDirArgP = (cxiFillDirArg_t )vargP;
	2281	filldir_t fnP = (filldir_t)fillDirArgP->fnP;
	2282	ENTER(0);
	2283
	2284	result = (*fnP)(fillDirArgP->argP, nameP, namelen,
	2285	(loff_t)offset, ino, 0 /* DT_UNKNOWN */);
	2286	EXIT_RC(0, result);
	2287	return result;
	2288	}
	2289
	2290	#ifdef DISK_LEASE_DMS
	2291
	2292	static struct timer_list DMSTimer[MAX_DMS_INDEX];
	2293	static int (*DMSgetNIOsInProgressP)(int);
	2294
	2295	#define PANIC_FOR_REAL 1
	2296
	2297	static void cxiDMSExpired(unsigned long data)
	2298	{
	2299	int idx = data;
	2300	int nIOs = DMSgetNIOsInProgressP(idx);
	2301	/* ENTER(0); */
	2302	/* This code is executed on the interrupt level -- can't use tracing */
	2303	printk("GPFS Deadman Switch timer [%d] has expired; IOs in progress: %d\n",
	2304	idx, nIOs);
	2305	#ifdef PANIC_FOR_REAL
	2306	if (nIOs != 0)
	2307	panic("GPFS Deadman Switch timer has expired, and there are still"
	2308	" %d outstanding I/O requests\n", nIOs);
	2309	#endif
	2310	}
	2311
	2312	/*
	2313	Start dead man switch, with the timeout specified by the delay
	2314	argument (in seconds).
	2315	*/
	2316	void cxiStartDMS(int idx, int delay, int (*funcP)(int))
	2317	{
	2318	unsigned long njiffies = delay * HZ;
	2319
	2320	/* Only allow the daemon or other root users to make this kernel call */
	2321	if (!cxiIsSuperUser())
	2322	return;
	2323	ENTER(0);
	2324
	2325	/* There can be only one timer active at any given moment */
	2326	if (timer_pending(&DMSTimer[idx]))
	2327	del_timer(&DMSTimer[idx]);
	2328
	2329	init_timer(&DMSTimer[idx]);
	2330	DMSTimer[idx].expires = jiffies + njiffies;
	2331	DMSTimer[idx].function = cxiDMSExpired;
	2332	DMSTimer[idx].data = idx;
	2333	/* save the pointer to nIOsInProgress to a static var */
	2334	DMSgetNIOsInProgressP = funcP;
	2335	add_timer(&DMSTimer[idx]);
	2336	TRACE3(TRACE_DLEASE, 2, TRCID_DMS_STARTED,
	2337	"DMS timer [%d] started, delay %d, time %d\n",
	2338	idx, delay, jiffies/HZ);
	2339	EXIT(0);
	2340	}
	2341
	2342	void cxiStopDMS(int idx)
	2343	{
	2344	/* Only allow the daemon or other root users to make this kernel call */
	2345	if (!cxiIsSuperUser())
	2346	return;
	2347	ENTER(0);
	2348
	2349	if (timer_pending(&DMSTimer[idx]))
	2350	del_timer(&DMSTimer[idx]);
	2351	TRACE2(TRACE_DLEASE, 2, TRCID_DMS_STOPPED,
	2352	"DMS timer [%d] stopped, time %d\n", idx, jiffies/HZ);
	2353	EXIT(0);
	2354	}
	2355
	2356	/* dummy init routine. Since on Linux the timer is
	2357	stored in a static memory, there's nothing to be done
	2358	*/
	2359	int cxiInitDMS(void)
	2360	{
	2361	return 0;
	2362	}
	2363
	2364	void cxiShutdownDMS(void)
	2365	{
	2366	int i;
	2367
	2368	ENTER(0);
	2369	for (i = 0; i < MAX_DMS_INDEX; i++)
	2370	cxiStopDMS(i);
	2371	EXIT(0);
	2372	}
	2373
	2374	#endif /* DISK_LEASE_DMS */
	2375
	2376	void cxiSetBit(unsigned long *flagP, int flag_bit)
	2377	{
	2378	set_bit(flag_bit,flagP);
	2379	}
	2380	void cxiClearBit(unsigned long *flagP, int flag_bit)
	2381	{
	2382	clear_bit(flag_bit,flagP);
	2383	}
	2384	Boolean cxiTestBit(unsigned long *flagP, int flag_bit)
	2385	{
	2386	return test_bit(flag_bit,flagP);
	2387	}
	2388
	2389	/* In order to setup our termination callback routine (gpfs_f_cleanup)
	2390	* we create a dummy file and add it to our file table. Then, upon
	2391	* process termination, the release file operation will be called in
	2392	* order to close the file. The only operation we define for this
	2393	* dummy file is release (gpfs_f_cleanup).
	2394	*/
	2395	int
	2396	cxiRegisterCleanup()
	2397	{
	2398	int code = 0, rc = 0;
	2399	struct inode *iP = NULL;
	2400	struct file *fileP = NULL;
	2401	struct dentry *dentryP = NULL;
	2402	extern int cleanupFD;
	2403	extern struct super_block *shutdownSuperP;
	2404
	2405	/* We record the daemon's process group because certain
	2406	* checks on cxiCopyIn/cxiCopyOut are bypassed for the daemon.
	2407	*/
	2408	ENTER(0);
	2409	DaemonPGrp = PROCESS_GROUP(current);
	2410
	2411	/* Make sure we only create one file */
	2412	if (cleanupFD)
	2413	{
	2414	EXIT_RC(0, EEXIST);
	2415	return EEXIST;
	2416	}
	2417
	2418	DBGASSERT(shutdownSuperP != NULL);
	2419
	2420	/* Allocate an inode struct */
	2421	iP = NEW_INODE(shutdownSuperP);
	2422	if (!iP)
	2423	{
	2424	code = 1;
	2425	rc = ENOMEM;
	2426	goto xerror;
	2427	}
	2428	iP->i_mode = S_IFREG;
	2429
	2430	/* Allocate an available file descriptor */
	2431	cleanupFD = get_unused_fd();
	2432	if (cleanupFD < 0)
	2433	{
	2434	code = 2;
	2435	rc = ENFILE;
	2436	goto xerror;
	2437	}
	2438
	2439	/* Allocate a file struct */
	2440	fileP = get_empty_filp();
	2441	if (!fileP)
	2442	{
	2443	code = 3;
	2444	rc = ENFILE;
	2445	goto xerror;
	2446	}
	2447
	2448	/* Allocate a dentry sruct */
	2449	dentryP = dget(d_alloc_root(iP));
	2450	if (!dentryP)
	2451	{
	2452	code = 4;
	2453	rc = ENOMEM;
	2454	goto xerror;
	2455	}
	2456
	2457	/* Initialize and chain our file sructure */
	2458	fileP->f_dentry = dentryP;
	2459	fileP->f_op = &gpfs_cleanup_fops;
	2460	fileP->f_flags = O_RDONLY;
	2461	atomic_set(&fileP->f_count, 1);
	2462
	2463	/* Just chain it on the current root mount. When
	2464	* the file is closed its fput() will decrement
	2465	* the mount count (hence the mntget here)
	2466	*/
	2467	fileP->f_vfsmnt = mntget(current->fs->rootmnt);
	2468
	2469	/* Install the descriptor so it gets "closed" upon our termination */
	2470	fd_install(cleanupFD, fileP);
	2471
	2472	/* Set FD_CLOEXEC so that forked processes (like mmfsup.scr) do not
	2473	* inherrit this descriptor. We want the cleanup routine to be run
	2474	* when the last mmfsd process terminates.
	2475	*/
	2476	#if LINUX_KERNEL_VERSION >= 2061300
	2477	FD_SET(cleanupFD, current->files->fdt->close_on_exec);
	2478	#else
	2479	FD_SET(cleanupFD, current->files->close_on_exec);
	2480	#endif
	2481	/* Once the descriptor for this dummy file is added to our file table,
	2482	* it is inherrited by all the processes of the daemon. As each
	2483	* terminates, the files->count is decremented and on the last process
	2484	* termination all the descriptors will be closed by filp_close.
	2485	*
	2486	* The one catch here is that our file table is inherrited by the
	2487	* kernel threads we start as well as user processes. This would
	2488	* cause a problem in that daemon termination does not include these
	2489	* kernel threads which aren't killed until restart (and therefore
	2490	* the file is never closed). In order for our operation to be
	2491	* driven at daemon termiation, we must remove the file table from
	2492	* these kernel threads. This is done in via cxiReparent() by
	2493	* the mmap pager kproc.
	2494	*/
	2495
	2496	xerror:
	2497	TRACE4(TRACE_VNODE, 1, TRCID_CXIREGISTERCLEANUP_EXIT,
	2498	"cxiRegisterCleanup: fd %d iP %X rc %d code %d\n",
	2499	cleanupFD, iP, rc, code);
	2500
	2501	if (rc)
	2502	{
	2503	if (dentryP);
	2504	dput(dentryP);
	2505
	2506	if (cleanupFD)
	2507	put_unused_fd(cleanupFD);
	2508
	2509	if (fileP)
	2510	#if LINUX_KERNEL_VERSION > 2060900
	2511	fput(fileP);
	2512	#else
	2513	put_filp(fileP);
	2514	#endif
	2515
	2516	if (iP)
	2517	iput(iP);
	2518
	2519	cleanupFD = 0;
	2520	}
	2521
	2522	EXIT_RC(0, rc);
	2523	return rc;
	2524	}
	2525
	2526	#ifdef NFS4_ACL
	2527	/* Linux routines to be called when processing NFSv4 audit/alarm ACL entries */
	2528	int cxiAuditWrite(int numargs, ...) { return ENOSYS; }
	2529	#endif /* NFS4_ACL */
	2530
	2531	/* Currently no OS specific VFS initialization for Linux */
	2532	int
	2533	cxiInitVFS(int vfsType)
	2534	{
	2535	return 0;
	2536	}
	2537
	2538	UIntPtr
	2539	cxiGetKernelStackSize()
	2540	{
	2541	return (UIntPtr)THREAD_SIZE;
	2542	}
	2543
	2544	#if defined(DMAPI) \|\| (SANERGY)
	2545
	2546	void cxiPathRel(void *ndP)
	2547	{
	2548	DBGASSERT( ndP != NULL);
	2549	path_release( (struct nameidata *) ndP);
	2550	cxiFreeUnpinned(ndP);
	2551	}
	2552
	2553	int
	2554	cxiPathToVfsP(void *privVfsPP, char kpathname, void ndPP, void cnPP,
	2555	Boolean traverseLink)
	2556	{
	2557	struct gpfsVfsData_t *privVfsP = NULL;
	2558	struct nameidata *ndP;
	2559	struct inode * iP;
	2560	cxiNode_t *cnP;
	2561	int rc = 0;
	2562	Boolean rel = false;
	2563	int code = 0;
	2564	*ndPP = NULL;
	2565	*privVfsPP = NULL;
	2566
	2567	ENTER(0);
	2568	if (kpathname == NULL)
	2569	{
	2570	code = 1;
	2571	rc = EINVAL;
	2572	goto xerror;
	2573	}
	2574
	2575	ndP = (struct nameidata *)cxiMallocUnpinned(sizeof(struct nameidata));
	2576	if (ndP == NULL)
	2577	{
	2578	code = 2;
	2579	rc = ENOMEM;
	2580	goto xerror;
	2581	}
	2582
	2583	/* For DMAPI, this is called by dm_path_to_handle or dm_path_to_fshandle,
	2584	* According to dmapi documentation, we should return the symbolic link
	2585	* itself instead of the object that link references.
	2586	* so here we need to use the function which does not traverse the link */
	2587	if (!traverseLink)
	2588	rc = user_path_walk_link(kpathname, ndP);
	2589	else
	2590	rc = user_path_walk(kpathname, ndP);
	2591
	2592	if (rc)
	2593	{
	2594	rc = -rc;
	2595	code = 3;
	2596	goto xerror;
	2597	}
	2598
	2599	rel = true;
	2600	iP = ndP->dentry->d_inode;
	2601	DBGASSERT(iP != NULL);
	2602	if (!GPFS_TYPE(iP))
	2603	{
	2604	code = 4;
	2605	rc = EINVAL;
	2606	goto xerror;
	2607	}
	2608
	2609	privVfsP = VP_TO_PVP(iP);
	2610
	2611	if (privVfsP == NULL)
	2612	{
	2613	code = 5;
	2614	rc = ENOENT;
	2615	}
	2616	cnP = VP_TO_CNP(iP);
	2617	privVfsPP = (void )privVfsP;
	2618	ndPP = (void )ndP;
	2619	if (cnPP != NULL)
	2620	cnPP = (void )cnP;
	2621
	2622	xerror:
	2623	if (rc && ndP)
	2624	{
	2625	if (rel)
	2626	cxiPathRel(ndP);
	2627	else
	2628	cxiFreeUnpinned(ndP);
	2629	}
	2630	EXIT_RC(0, rc);
	2631	return rc;
	2632	}
	2633
	2634	void
	2635	cxiSetCred(void *eCredPP)
	2636	{
	2637	ext_cred_t eCredP = (ext_cred_t )eCredPP;
	2638	setCred(eCredP);
	2639	}
	2640
	2641	#endif /* DMAPI or SANERGY */
	2642
	2643
	2644	#ifdef KSTACK_CHECK
	2645	/* Kernel stack checking: for each active thread that is making
	2646	subroutine calls in the kernel, allocate a stack_history_t. Within
	2647	each stack_history_t, create a frame_desc_t for each level of
	2648	subroutine call. Two lists of frame_desc_t's are maintained: one for
	2649	the current call stack, and one for the deepest call stack seen so
	2650	far for this thread. Upon exit from the lowest-level routine, check
	2651	whether the maximum stack depth threshhold has been exceeded. If it
	2652	has, print the traceback of the maximum stack usage. Keep hashes of
	2653	the tracebacks printed to avoid printing the same traceback more than
	2654	once. Since cxiTraceExit is not called for every routine exit,
	2655	maintenance of call chains is not exact; a routine entry with
	2656	stackUsed less than the current entry implies return of the previous
	2657	routine.
	2658
	2659	Note that these routines cannot call any other routine that has
	2660	ENTER/EXIT macros inside of it, to avoid recursion. */
	2661
	2662	/* Maximum size of of a stack frame before it is considered large enough
	2663	to complain about */
	2664	#define STACK_LIMIT_WARNING (THREAD_SIZE - (THREAD_SIZE/3) )
	2665
	2666	/* Description of one level of a call stack */
	2667	typedef struct frame_desc
	2668	{
	2669	/* Function name and file name containing the function */
	2670	const char * fdFuncNameP;
	2671	const char * fdFileNameP;
	2672
	2673	/* Pointer to frame_desc of caller, or NULL if this is the first
	2674	frame. Also used to link free frame descriptors together on the
	2675	shFreeHeadP free list. */
	2676	struct frame_desc * fdCallerP;
	2677
	2678	/* Line number near the beginning of fdFuncNameP */
	2679	int fdLineNum;
	2680
	2681	/* Total stack usage up to and including this routine */
	2682	int fdStackUsed;
	2683
	2684	/* Reference count for this frame_desc_t. Can be 2 if this descriptor
	2685	is reachable from both shCurrentP and shMaxP. */
	2686	int fdRef;
	2687	} frame_desc_t;
	2688
	2689
	2690	/* Each stack_history is only used by one thread, so no locking is
	2691	needed within a stack_history. This is allocated as a single page.
	2692	*/
	2693	typedef struct stack_history
	2694	{
	2695	/* ID of thread to which this stack_history_t belongs */
	2696	cxiThreadId shThreadId;
	2697
	2698	/* Bucket index in historyHash that points to this stack_history_t,
	2699	or -1 if this stack_history_t is on an overflow list */
	2700	int shBucketNum;
	2701
	2702	/* Next stack_history_t in same hash overflow list or on free list */
	2703	struct stack_history * shNextP;
	2704
	2705	/* Pointer to the frame descriptor for the routine that most recently
	2706	called fdEnter without a matching fdExit. Following the fdCallerP
	2707	pointers through these frame descriptors gives the current callback
	2708	chain. */
	2709	frame_desc_t * shCurrentP;
	2710
	2711	/* Pointer to the frame descriptor that had the maximum stack usage
	2712	seen thus far for this thread. Following the fdCallerP pointers
	2713	through these frame descriptors gives the callback chain with
	2714	maximal stack usage. */
	2715	frame_desc_t * shMaxP;
	2716
	2717	/* Head of list of free frame_desc_t's */
	2718	frame_desc_t * shFreeHeadP;
	2719
	2720	/* Area that holds frame_desc_t's. These will be linked together and
	2721	put on the list shFreeHeadP. */
	2722	#define SH_PREFIX_LEN (sizeof(cxiThreadId) + \
	2723	sizeof(int) + \
	2724	sizeof(struct stack_history *) + \
	2725	3sizeof(frame_desc_t ))
	2726	#define SH_NFRAMES ((PAGE_SIZE-SH_PREFIX_LEN)/sizeof(frame_desc_t))
	2727	frame_desc_t shFrames[SH_NFRAMES];
	2728	} stack_history_t;
	2729
	2730	/* Global structures */
	2731	struct
	2732	{
	2733	/* Global flag controlling whether kernel stack checking is enabled.
	2734	Initially false; set true during kernel module initialization,
	2735	then set false again during kernel module termination. */
	2736	Boolean shActive;
	2737
	2738	/* Mutex protecting updates to the variables that follow. This cannot
	2739	be a cxiBlockMutex_t because then the stack checking routines would
	2740	get called recursively. */
	2741	struct semaphore shMutex;
	2742
	2743	/* List of free stack_history_t's and count of how many free entries
	2744	there are. Excess stack_history_t's beyond a threshhold are freed
	2745	back to the operating system. */
	2746	stack_history_t * freeHeadP;
	2747	int nFree;
	2748	#define MAX_FREE_STACK_HISTORIES 16
	2749
	2750	/* Hash table of active stack_history_t's. To find the entry for a
	2751	particular thread, hash its thread id to a bucket. If any of the
	2752	entries in bucket[] match the desired thread id, the pointer to
	2753	the stack_history_t can be returned without acquiring any locks. If
	2754	the bucket does not contain the desired thread id, look for it on
	2755	the overflow list under protection of shMutex. */
	2756	#define HISTORY_HASH_SIZE 64
	2757	#define HISTS_PER_BUCKET 3
	2758	struct
	2759	{
	2760	struct
	2761	{
	2762	stack_history_t * historyP;
	2763	cxiThreadId threadId;
	2764	} bucket[HISTS_PER_BUCKET];
	2765	stack_history_t * overflowP;
	2766	} historyHash[HISTORY_HASH_SIZE];
	2767
	2768	/* List of hash values for tracebacks that have already been printed.
	2769	Used to avoid printing the same traceback more than once. Nothing
	2770	is ever deleted from this table, so to find an entry start
	2771	searching at its hash value and continue until the entry is found
	2772	or an empty slot is encountered. The total occupancy of the table
	2773	is limited to MAX_TRACEBACKS to restrict the amount of searching
	2774	that will be required, and to guarantee that searches will
	2775	terminate. */
	2776	#define TB_HASH_SIZE 64
	2777	#define MAX_TRACEBACKS 32
	2778	unsigned int tracebackHash[TB_HASH_SIZE];
	2779	int nTracebackHashEntries;
	2780	} SHG;
	2781
	2782
	2783	/* Private version of DBGASSERT used only within stack checking code.
	2784	Cannot use DBGASSERT without risking recursion. */
	2785	#ifdef DBGASSERTS
	2786	#define SH_ASSERT(_ex) \
	2787	if (!(_ex)) { \
	2788	printk("GPFS stack checking assert failed: " # _ex " file %s line %d\n", \
	2789	__FILE__, __LINE__); \
	2790	DoPanic(# _ex, __FILE__, __LINE__, 0, 0, ""); \
	2791	} else ((void)0)
	2792	#else
	2793	#define SH_ASSERT(_ex) ((void)0)
	2794	#endif
	2795
	2796
	2797	/* Initialize and enable stack depth checking */
	2798	void shInit()
	2799	{
	2800	/* Clear stack checking globals */
	2801	cxiMemset(&SHG, 0, sizeof(SHG));
	2802
	2803	/* Init mutex */
	2804	init_MUTEX(&SHG.shMutex);
	2805
	2806	/* Turn on stack depth checking and make sure the change is visible */
	2807	SHG.shActive = true;
	2808	wmb();
	2809	}
	2810
	2811
	2812	/* Turn off stack depth checking and free all allocated memory. This does
	2813	not have to return the global state to what it was when the module was
	2814	first loaded, since it will not be used again. */
	2815	void shTerm()
	2816	{
	2817	int h;
	2818	int b;
	2819	stack_history_t * shP;
	2820	stack_history_t * shNextP;
	2821
	2822	/* Turn off stack depth checking and make sure the chenge is visible */
	2823	SHG.shActive = false;
	2824	wmb();
	2825
	2826	/* Get and then release mutex. This insures that a thread that is
	2827	in the middle of writing a traceback finishes writing it before
	2828	we free the data structures it was using. */
	2829	/* ?? although there could be another thread waiting for the mutex ... */
	2830	down(&SHG.shMutex);
	2831	up(&SHG.shMutex);
	2832
	2833	/* Wait briefly to allow threads in the middle of the stack checking
	2834	code to finish what they are doing */
	2835	/* ?? Of course, this is not really safe, but this is debugging code,
	2836	right? */
	2837	schedule_timeout(HZ/2);
	2838
	2839	/* Terminate mutex */
	2840	// nothing to do
	2841
	2842	/* Free all stack_history_t's on the free list */
	2843	shP = SHG.freeHeadP;
	2844	while (shP != NULL)
	2845	{
	2846	shNextP = shP->shNextP;
	2847	kfree(shP);
	2848	shP = shNextP;
	2849	}
	2850
	2851	/* Free all stack_history_t's in the hash table */
	2852	for (h=0 ; h<HISTORY_HASH_SIZE ; h++)
	2853	{
	2854	for (b=0 ; b<HISTS_PER_BUCKET ; b++)
	2855	if (SHG.historyHash[h].bucket[b].historyP != NULL)
	2856	kfree(SHG.historyHash[h].bucket[b].historyP);
	2857	shP = SHG.historyHash[h].overflowP;
	2858	while (shP != NULL)
	2859	{
	2860	shNextP = shP->shNextP;
	2861	kfree(shP);
	2862	shP = shNextP;
	2863	}
	2864	}
	2865	}
	2866
	2867
	2868	/* Allocate and initialize a new stack_history_t */
	2869	static stack_history_t * shAllocInit()
	2870	{
	2871	stack_history_t * shP;
	2872	int f;
	2873
	2874	up(&SHG.shMutex);
	2875	shP = (stack_history_t *) kmalloc(sizeof(stack_history_t), GFP_KERNEL);
	2876	SH_ASSERT(shP != NULL);
	2877	down(&SHG.shMutex);
	2878	cxiMemset(shP, 0, sizeof(stack_history_t));
	2879	for (f=0 ; f<=SH_NFRAMES-2 ; f++)
	2880	shP->shFrames[f].fdCallerP = &shP->shFrames[f+1];
	2881	shP->shFreeHeadP = &shP->shFrames[0];
	2882	return shP;
	2883	}
	2884
	2885
	2886	/* Get a stack_history_t off the free list or build a new one */
	2887	static stack_history_t * shGet()
	2888	{
	2889	stack_history_t * shP;
	2890
	2891	/* Use free list if one is available there */
	2892	shP = SHG.freeHeadP;
	2893	if (shP != NULL)
	2894	{
	2895	SHG.freeHeadP = shP->shNextP;
	2896	SHG.nFree -= 1;
	2897	return shP;
	2898	}
	2899
	2900	/* Make a new one if necessary */
	2901	return shAllocInit();
	2902	}
	2903
	2904
	2905	/* Free a stack_history_t. Put it on the free list if there are not
	2906	already too many free, or else free it back to the operating system.
	2907	*/
	2908	static void shPut(stack_history_t * shP)
	2909	{
	2910	int h;
	2911	int b;
	2912	stack_history_t ** shPrevPP;
	2913	stack_history_t * p;
	2914
	2915	/* Both call stacks should be empty */
	2916	SH_ASSERT(shP->shCurrentP == NULL);
	2917	SH_ASSERT(shP->shMaxP == NULL);
	2918
	2919	/* Must hold mutex while changing the hash table */
	2920	down(&SHG.shMutex);
	2921
	2922	/* Clear pointer to this stack_history_t from the hash table */
	2923	h = ((int)shP->shThreadId) & (HISTORY_HASH_SIZE-1);
	2924	b = shP->shBucketNum;
	2925	if (b != -1)
	2926	{
	2927	SH_ASSERT(SHG.historyHash[h].bucket[b].historyP == shP);
	2928	SHG.historyHash[h].bucket[b].historyP = NULL;
	2929	SHG.historyHash[h].bucket[b].threadId = 0;
	2930	}
	2931	else
	2932	{
	2933	shPrevPP = &SHG.historyHash[h].overflowP;
	2934	p = *shPrevPP;
	2935	while (p != NULL)
	2936	{
	2937	if (p == shP)
	2938	{
	2939	*shPrevPP = shP->shNextP;
	2940	break;
	2941	}
	2942	shPrevPP = &p->shNextP;
	2943	p = *shPrevPP;
	2944	}
	2945	}
	2946
	2947	/* If not too many already free, add to free list */
	2948	if (SHG.nFree < MAX_FREE_STACK_HISTORIES)
	2949	{
	2950	shP->shNextP = SHG.freeHeadP;
	2951	SHG.freeHeadP = shP;
	2952	SHG.nFree += 1;
	2953	up(&SHG.shMutex);
	2954	return;
	2955	}
	2956
	2957	/* Otherwise, really free it */
	2958	up(&SHG.shMutex);
	2959	kfree(shP);
	2960	}
	2961
	2962
	2963	/* Find the stack_history_t for the current thread, or allocate one if
	2964	one does not already exist */
	2965	static stack_history_t * shFind()
	2966	{
	2967	stack_history_t * shP;
	2968	cxiThreadId id = current->pid;
	2969	int h = ((int)id) & (HISTORY_HASH_SIZE-1);
	2970	int b;
	2971
	2972	/* Look at all entries within the bucket given by the hash of the
	2973	thread ID. No locking needs to be done for this search. */
	2974	for (b=0 ; b<HISTS_PER_BUCKET ; b++)
	2975	if (SHG.historyHash[h].bucket[b].threadId == id)
	2976	return SHG.historyHash[h].bucket[b].historyP;
	2977
	2978	/* Must hold mutex while changing the hash table */
	2979	down(&SHG.shMutex);
	2980
	2981	/* Search the overflow list */
	2982	shP = SHG.historyHash[h].overflowP;
	2983	while (shP != NULL)
	2984	{
	2985	if (shP->shThreadId == id)
	2986	goto exit;
	2987	shP = shP->shNextP;
	2988	}
	2989
	2990	/* No stack_history_t for this thread yet. Get one off the free list
	2991	or build one. */
	2992	shP = shGet();
	2993	shP->shThreadId = id;
	2994	shP->shNextP = NULL;
	2995
	2996	/* Find a slot for the new stack_history_t in the hash table */
	2997	for (b=0 ; b<HISTS_PER_BUCKET ; b++)
	2998	if (SHG.historyHash[h].bucket[b].historyP == NULL)
	2999	{
	3000	SHG.historyHash[h].bucket[b].historyP = shP;
	3001	SHG.historyHash[h].bucket[b].threadId = id;
	3002	shP->shBucketNum = b;
	3003	goto exit;
	3004	}
	3005
	3006	/* No slots available; add new stack_history_t to overflow list */
	3007	shP->shBucketNum = -1;
	3008	shP->shNextP = SHG.historyHash[h].overflowP;
	3009	SHG.historyHash[h].overflowP = shP;
	3010
	3011	exit:
	3012	/* Release mutex before returning */
	3013	up(&SHG.shMutex);
	3014	return shP;
	3015	}
	3016
	3017
	3018	/* Allocate a frame descriptor within the given stack_history_t. This
	3019	cannot be allowed to fail, so if there are no more free descriptors,
	3020	throw away the bottom frame descriptor and return that. The reference
	3021	count of the frame descriptor that is returned is undefined. */
	3022	static frame_desc_t * fdGet(stack_history_t * shP)
	3023	{
	3024	frame_desc_t * fdP;
	3025	frame_desc_t ** fdPrevPP;
	3026	int prevRef;
	3027
	3028	/* Look on the free list within the stack_history_t */
	3029	fdP = shP->shFreeHeadP;
	3030	if (fdP != NULL)
	3031	{
	3032	shP->shFreeHeadP = fdP->fdCallerP;
	3033	return fdP;
	3034	}
	3035
	3036	/* No free descriptors; first try stealing one off the bottom of the
	3037	current call stack */
	3038	fdP = shP->shCurrentP;
	3039	if (fdP != NULL)
	3040	{
	3041	/* Find the bottom entry of the current call stack */
	3042	fdPrevPP = &shP->shCurrentP;
	3043	prevRef = 1;
	3044	while (fdP->fdCallerP != NULL)
	3045	{
	3046	fdPrevPP = &fdP->fdCallerP;
	3047	prevRef = fdP->fdRef;
	3048	fdP = *fdPrevPP;
	3049	}
	3050
	3051	/* Remove the bottom entry of the current call stack */
	3052	*fdPrevPP = NULL;
	3053
	3054	/* Reduce the reference count on the entry just removed. The
	3055	reference count decreases by the reference count of the frame
	3056	that used to point to fdP. If fdP is no longer referenced, no
	3057	further work is needed. If *fdP is still referenced from the max
	3058	depth stack (it must be the bottom entry), we will eventually
	3059	return it, but only after removing it from the bottom of the max
	3060	depth stack. We know that fdP will be returned, but we have to
	3061	search through the max depth stack to find the pointer to *fdP.
	3062	*/
	3063	fdP->fdRef -= prevRef;
	3064	if (fdP->fdRef == 0)
	3065	return fdP;
	3066	}
	3067
	3068	/* Still no free descriptors; steal the frame descriptor off the
	3069	bottom of the maximum depth call stack */
	3070	fdP = shP->shMaxP;
	3071	if (fdP != NULL)
	3072	{
	3073	/* Find the bottom entry of the max depth call stack */
	3074	fdPrevPP = &shP->shMaxP;
	3075	while (fdP->fdCallerP != NULL)
	3076	{
	3077	fdPrevPP = &fdP->fdCallerP;
	3078	fdP = *fdPrevPP;
	3079	}
	3080
	3081	/* Remove the bottom entry of the max depth call stack */
	3082	*fdPrevPP = NULL;
	3083
	3084	/* The bottom entry of the max depth call stack that was just
	3085	removed must have a reference count of one; otherwise it would
	3086	still be on the current call stack and removing the bottom entry
	3087	of that stack would have reduced the reference count of some
	3088	frame descriptor from 2 to 0. */
	3089	SH_ASSERT(fdP->fdRef == 1);
	3090	return fdP;
	3091	}
	3092	SH_ASSERT(!"cannot alloc frame_desc_t");
	3093	return NULL;
	3094	}
	3095
	3096
	3097	/* Decrease the reference count on a frame descriptor. If it becomes
	3098	zero, return it to the free list */
	3099	static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP)
	3100	//inline static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP)
	3101	{
	3102	if (fdP->fdRef > 1)
	3103	{
	3104	fdP->fdRef -= 1;
	3105	TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD1,
	3106	"fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 1\n",
	3107	fdP, shP, fdP->fdFuncNameP);
	3108	return;
	3109	}
	3110
	3111	fdP->fdCallerP = shP->shFreeHeadP;
	3112	shP->shFreeHeadP = fdP;
	3113	TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD2,
	3114	"fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 0\n",
	3115	fdP, shP, fdP->fdFuncNameP);
	3116	}
	3117
	3118
	3119	/* If the maximum stack depth exceeds the threshhold, print its
	3120	traceback if it has not already been printed. Reset the maximum
	3121	depth stack to empty. Only called when the current stack is already
	3122	empty. */
	3123	static void shDisplay(stack_history_t * shP)
	3124	{
	3125	frame_desc_t * fdP;
	3126	unsigned int tbHash;
	3127	frame_desc_t * fdNextP;
	3128	int slot;
	3129
	3130	SH_ASSERT(shP->shCurrentP == NULL);
	3131
	3132	/* If the maximum stack depth is less than the threshhold, just free
	3133	the call chain and return */
	3134	fdP = shP->shMaxP;
	3135	if (fdP == NULL \|\|
	3136	fdP->fdStackUsed < STACK_LIMIT_WARNING)
	3137	goto exit;
	3138
	3139	/* Compute a hash of the traceback call chain */
	3140	tbHash = 0;
	3141	while (fdP != NULL)
	3142	{
	3143	tbHash <<= 1;
	3144	tbHash ^= (((unsigned int)fdP->fdStackUsed) << 15) ^ fdP->fdLineNum;
	3145	fdP = fdP->fdCallerP;
	3146	}
	3147
	3148	/* Search for the hash of the call chain in the table of tracebacks that
	3149	have already been printed. Searching the hash table can be done without
	3150	any locks, since entries are never deleted. The loop must eventually
	3151	terminate, since the table will not be allowed to fill up. */
	3152	search:
	3153	slot = tbHash % TB_HASH_SIZE;
	3154	while (SHG.tracebackHash[slot] != 0)
	3155	{
	3156	if (SHG.tracebackHash[slot] == tbHash)
	3157	/* This traceback has already been printed */
	3158	goto exit;
	3159	slot = (slot+1) % TB_HASH_SIZE;
	3160	}
	3161
	3162	/* The hash of the current max depth traceback was not found in the
	3163	table and should be inserted at position 'slot'. Do this under
	3164	protection of the mutex. If 'slot' has been used by the time we
	3165	get the mutex, drop the mutex and repeat the search. */
	3166	down(&SHG.shMutex);
	3167	if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS)
	3168	goto exitMutexHeld;
	3169	if (SHG.tracebackHash[slot] != 0)
	3170	{
	3171	up(&SHG.shMutex);
	3172	goto search;
	3173	}
	3174	SHG.tracebackHash[slot] = tbHash;
	3175	SHG.nTracebackHashEntries += 1;
	3176
	3177	/* Print the traceback */
	3178	fdP = shP->shMaxP;
	3179	printk("\nGPFS kernel stack for process %d(%s) used %d bytes\n",
	3180	current->pid, current->comm, fdP->fdStackUsed);
	3181	printk(" stack function\n");
	3182	printk(" used\n");
	3183	printk(" ----- -----------------------------------------------------\n");
	3184	while (fdP != NULL)
	3185	{
	3186	printk(" %5d %s at %s:%d\n",
	3187	fdP->fdStackUsed, fdP->fdFuncNameP, fdP->fdFileNameP, fdP->fdLineNum);
	3188	fdP = fdP->fdCallerP;
	3189	}
	3190	printk(" traceback signature %08X\n", tbHash);
	3191
	3192	/* If the maximum number of allowed tracebacks has been reached, turn
	3193	off further stack checking. */
	3194	if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS)
	3195	{
	3196	printk("Maximum number of GPFS deep stack tracebacks reached\n");
	3197	printk("GPFS stack checking disabled\n");
	3198	SHG.shActive = false;
	3199	wmb();
	3200	}
	3201
	3202	exitMutexHeld:
	3203	up(&SHG.shMutex);
	3204
	3205	exit:
	3206	/* Free all stack frame descriptors for the max depth call chain back
	3207	to the internal free list. */
	3208	fdP = shP->shMaxP;
	3209	while (fdP != NULL)
	3210	{
	3211	SH_ASSERT(fdP->fdRef == 1);
	3212	fdNextP = fdP->fdCallerP;
	3213	fdP->fdCallerP = shP->shFreeHeadP;
	3214	shP->shFreeHeadP = fdP;
	3215	fdP = fdNextP;
	3216	}
	3217	shP->shMaxP = NULL;
	3218	}
	3219
	3220
	3221	/* Process routine entry */
	3222	static void fdEntry(frame_desc_t * fdP, stack_history_t * shP)
	3223	{
	3224	frame_desc_t * popP;
	3225	frame_desc_t * p;
	3226
	3227	TRACE5(TRACE_ENTRYEXIT, 11, TRCID_FDENTRY,
	3228	"fdEntry: fdP 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX used %d\n",
	3229	fdP, shP, fdP->fdFuncNameP, shP->shCurrentP, fdP->fdStackUsed);
	3230
	3231	/* If this is the first call by this thread, set up the two call chains */
	3232	if (shP->shCurrentP == NULL)
	3233	{
	3234	SH_ASSERT(shP->shMaxP == NULL);
	3235	shP->shCurrentP = fdP;
	3236	shP->shMaxP = fdP;
	3237	fdP->fdCallerP = NULL;
	3238	fdP->fdRef = 2;
	3239	return;
	3240	}
	3241	else
	3242	SH_ASSERT(shP->shMaxP != NULL);
	3243
	3244	/* Process routine exits implied by the number of bytes of stack that
	3245	are currently in use. The test needs to be for strict less than
	3246	because inlined routines share the same stack frame as their
	3247	caller, but both routines will do entry/exit processing. */
	3248	popP = shP->shCurrentP;
	3249	while (fdP->fdStackUsed < popP->fdStackUsed)
	3250	{
	3251	p = popP->fdCallerP;
	3252	shP->shCurrentP = p;
	3253	TRACE1(TRACE_ENTRYEXIT, 11, TRCID_IMPLIED_EXIT,
	3254	"fdEntry: implied exit from rtn %s\n",
	3255	popP->fdFuncNameP);
	3256	fdDiscard(popP, shP);
	3257	if (p == NULL)
	3258	{
	3259	/* The outermost routine returned before this call without calling
	3260	fdExit. Test for a large maximum stack, then reset the
	3261	maximum. */
	3262	shDisplay(shP);
	3263
	3264	/* The current routine is the one and only */
	3265	shP->shCurrentP = fdP;
	3266	shP->shMaxP = fdP;
	3267	fdP->fdCallerP = NULL;
	3268	fdP->fdRef = 2;
	3269	return;
	3270	}
	3271	popP = p;
	3272	}
	3273
	3274	/* If this is an extension of the current max depth stack, just add
	3275	this routine to the top of both stacks */
	3276	if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed &&
	3277	shP->shCurrentP == shP->shMaxP)
	3278	{
	3279	fdP->fdCallerP = shP->shCurrentP;
	3280	shP->shCurrentP = fdP;
	3281	shP->shMaxP = fdP;
	3282	fdP->fdRef = 2;
	3283	TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX_EXTEND,
	3284	"fdEntry: extending new max stack %d fdP 0x%lX\n",
	3285	fdP->fdStackUsed, fdP);
	3286	return;
	3287	}
	3288
	3289	/* Make this new routine be the top of the stack */
	3290	fdP->fdCallerP = shP->shCurrentP;
	3291	shP->shCurrentP = fdP;
	3292	fdP->fdRef = 1;
	3293
	3294	/* If this new routine has a greater stack depth than the previous max,
	3295	unreference the previous max depth call chain and add additional
	3296	references to the current one. */
	3297	if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed)
	3298	{
	3299	popP = shP->shMaxP;
	3300	do
	3301	{
	3302	p = popP->fdCallerP;
	3303	fdDiscard(popP, shP);
	3304	popP = p;
	3305	} while (popP != NULL);
	3306	p = fdP;
	3307	do
	3308	{
	3309	p->fdRef = 2;
	3310	p = p->fdCallerP;
	3311	} while (p != NULL);
	3312	TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX,
	3313	"fdEntry: new max stack %d fdP 0x%lX\n",
	3314	fdP->fdStackUsed, fdP);
	3315	shP->shMaxP = fdP;
	3316	}
	3317	}
	3318
	3319
	3320	/* Process routine exit */
	3321	static void fdExit(const char * funcnameP)
	3322	{
	3323	stack_history_t * shP;
	3324	frame_desc_t * lastPopP;
	3325	frame_desc_t * popP;
	3326	frame_desc_t * p;
	3327
	3328	/* Locate or create stack_history_t for this thread */
	3329	shP = shFind();
	3330
	3331	/* If call stack is already empty, there is nothing to do except free
	3332	the stack_history_t */
	3333	if (shP->shCurrentP == NULL)
	3334	{
	3335	SH_ASSERT(shP->shMaxP == NULL);
	3336	shPut(shP);
	3337	return;
	3338	}
	3339
	3340	/* Search backward on the call stack for a routine name that matches
	3341	the one being exitted. In C++, the ENTER/EXIT macros will pass the
	3342	same string constant (same address) to fdEntry and fdExit. The C
	3343	versions of the macros may pass two different copies of the same
	3344	string. This loop cannot pop routines it skips off the stack, since
	3345	the routine might never be found. */
	3346	p = shP->shCurrentP;
	3347	for (;;)
	3348	{
	3349	if (p->fdFuncNameP == funcnameP \|\|
	3350	cxiStrcmp(p->fdFuncNameP, funcnameP) == 0)
	3351	{
	3352	TRACE4(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT,
	3353	"fdExit: p 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX\n",
	3354	p, shP, p->fdFuncNameP, shP->shCurrentP);
	3355	lastPopP = p;
	3356	break;
	3357	}
	3358	p = p->fdCallerP;
	3359	if (p == NULL)
	3360	{
	3361	/* Routine name not found. Do not pop stack. */
	3362	/* printk("No entry found when exitting %s\n", funcnameP); */
	3363	TRACE1(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT_NOTFOUND,
	3364	"No entry found when exitting %s\n", funcnameP);
	3365	return;
	3366	}
	3367	}
	3368
	3369	/* Pop all routines up to and including lastPopP */
	3370	p = shP->shCurrentP;
	3371	do
	3372	{
	3373	popP = p;
	3374	p = popP->fdCallerP;
	3375	fdDiscard(popP, shP);
	3376	} while (popP != lastPopP);
	3377	shP->shCurrentP = p;
	3378
	3379	/* If this was the return of the outermost routine, print new maximum
	3380	stack depth traceback and discard the stack_history_t */
	3381	if (shP->shCurrentP == NULL)
	3382	{
	3383	shDisplay(shP);
	3384	shPut(shP);
	3385	}
	3386	}
	3387
	3388	#endif /* KSTACK_CHECK */
	3389
	3390
	3391	#if defined(ENTRYEXIT_TRACE) \|\| defined(KSTACK_CHECK)
	3392	void cxiTraceEntry(int level, const char * funcnameP,
	3393	const char * filenameP, int lineNum)
	3394	{
	3395	int stackUsed = THREAD_SIZE - (((unsigned long)&stackUsed) & (THREAD_SIZE-1));
	3396	#ifdef KSTACK_CHECK
	3397	stack_history_t * shP;
	3398	frame_desc_t * fdP;
	3399	#endif /* KSTACK_CHECK */
	3400
	3401	#ifdef ENTRYEXIT_TRACE
	3402	/* Need to use a constant trace level in the TRACE macro call to get
	3403	the .trclst file (and later the .trcfmt file) built correctly */
	3404	if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
	3405	{
	3406	TRACE5(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_ENTER,
	3407	"-->K %s (%s:%d) level %d stackUsed %d\n",
	3408	funcnameP, filenameP, lineNum, level, stackUsed);
	3409	}
	3410	#endif /* ENTRYEXIT_TRACE */
	3411
	3412	#ifdef KSTACK_CHECK
	3413	/* Nothing to do if kernel stack checking is disabled */
	3414	if (!SHG.shActive)
	3415	return;
	3416
	3417	/* Do not attempt to keep track of stack usage in interrupt handlers */
	3418	if (in_interrupt())
	3419	return;
	3420
	3421	/* Locate or create stack_history_t for this thread */
	3422	shP = shFind();
	3423
	3424	/* Get a new frame descriptor and fill it in */
	3425	fdP = fdGet(shP);
	3426	fdP->fdFuncNameP = funcnameP;
	3427	fdP->fdFileNameP = filenameP;
	3428	fdP->fdLineNum = lineNum;
	3429	fdP->fdStackUsed = stackUsed;
	3430
	3431	/* Perform stack checking for this routine entry */
	3432	fdEntry(fdP, shP);
	3433	#endif /* KSTACK_CHECK */
	3434	}
	3435
	3436
	3437	void cxiTraceExit(int level, const char * funcnameP)
	3438	{
	3439	#ifdef ENTRYEXIT_TRACE
	3440	/* Need to use a constant trace level in the TRACE macro call to get
	3441	the .trclst file (and later the .trcfmt file) built correctly */
	3442	if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
	3443	TRACE1(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT,
	3444	"<--K %s\n", funcnameP);
	3445	#endif /* ENTRYEXIT_TRACE */
	3446
	3447	#ifdef KSTACK_CHECK
	3448	/* Nothing to do if kernel stack checking is disabled */
	3449	if (!SHG.shActive)
	3450	return;
	3451
	3452	/* Do not attempt to keep track of stack usage in interrupt handlers */
	3453	if (in_interrupt())
	3454	return;
	3455
	3456	/* Process routine exit */
	3457	fdExit(funcnameP);
	3458	#endif /* KSTACK_CHECK */
	3459	}
	3460	void cxiTraceExitRC(int level, const char * funcnameP, int rc)
	3461	{
	3462	#ifdef ENTRYEXIT_TRACE
	3463	/* Need to use a constant trace level in the TRACE macro call to get
	3464	the .trclst file (and later the .trcfmt file) built correctly */
	3465	if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
	3466	TRACE2(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT_RC,
	3467	"<--K %s rc %d\n", funcnameP, rc);
	3468	#endif /* ENTRYEXIT_TRACE */
	3469
	3470	#ifdef KSTACK_CHECK
	3471	/* Nothing to do if kernel stack checking is disabled */
	3472	if (!SHG.shActive)
	3473	return;
	3474
	3475	/* Do not attempt to keep track of stack usage in interrupt handlers */
	3476	if (in_interrupt())
	3477	return;
	3478
	3479	/* Process routine exit */
	3480	fdExit(funcnameP);
	3481	#endif /* KSTACK_CHECK */
	3482	}
	3483	#endif /* defined(ENTRYEXIT_TRACE) \|\| defined(KSTACK_CHECK) */
	3484
	3485
	3486	#ifdef UIDREMAP
	3487	size_t cxiGetUserEnvironmentSize(void)
	3488	{
	3489	return (current->mm->env_end - current->mm->env_start);
	3490	}
	3491
	3492	int cxiGetUserEnvironment(char* buf, size_t len)
	3493	{
	3494	return cxiCopyIn((char*)current->mm->env_start, buf, len);
	3495	}
	3496	#endif
	3497
	3498	Boolean cxiHasMountHelper()
	3499	{
	3500	return USING_MOUNT_HELPER();
	3501	}
	3502
	3503	#ifdef P_NFS4
	3504
	3505	#include <linux/nfsd/nfs4layoutxdr.h>
	3506
	3507	/* convert ip address to string */
	3508	char IPtoString(int ip, char buf)
	3509	{
	3510	unsigned char a = (char )&ip;
	3511
	3512	sprintf(buf, "%u.%u.%u.%u", a[0], a[1], a[2], a[3]);
	3513
	3514	return buf;
	3515	}
	3516
	3517	static void printfh(char s, int fh)
	3518	{
	3519	#ifdef GPFS_PRINTK
	3520	printk("%s: %d: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
	3521	s, fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7],fh[8],fh[9]);
	3522	#endif
	3523	}
	3524
	3525	int cxiSetFH(int *fhP, int sid)
	3526	{
	3527	struct knfsd_fh fh = (struct knfsd_fh )fhP;
	3528
	3529	printfh("cxiSetFH-1", fhP);
	3530	if (fh->fh_size > 8) {
	3531	fh->fh_size += 4; // fh_size + 4 for sid
	3532	fh->fh_fsid_type += max_fsid_type;
	3533	fhP[(fh->fh_size >> 2)] = sid;
	3534	fh->fh_fileid_type = 7; // see code in gpfs_decode_fh()
	3535	#ifdef GPFS_PRINTK
	3536	printk("cxiSetFH size %d fsid_type %d fileid %d\n",
	3537	fh->fh_size, fh->fh_fsid_type, fh->fh_fileid_type);
	3538	#endif
	3539	printfh("cxiSetFH-2", fhP);
	3540	return 0;
	3541	}
	3542	return ENOENT;
	3543	}
	3544
	3545	/* Call to NFS server on MDS to get open state */
	3546	int cxiOpenState(void vfsP, void p)
	3547	{
	3548	int rc = ENOENT;
	3549	struct super_block sbP = (struct super_block )vfsP;
	3550	struct pnfs_get_state *osP = p;
	3551	struct gpfsVfsData_t privVfsP = (struct gpfsVfsData_t )SBLOCK_PRIVATE(sbP);
	3552
	3553	#ifdef GPFS_PRINTK
	3554	printk("cxiOpenState1 sb %p p %p \n", sbP, p);
	3555	printk("cxiOpenState cb_get_state %p\n",
	3556	sbP->s_export_op->cb_get_state);
	3557	#endif
	3558	if (sbP->s_export_op->cb_get_state)
	3559	rc = sbP->s_export_op->cb_get_state(osP);
	3560
	3561	gpfs_ops.gpfsGetVerifier(privVfsP, osP->verifier);
	3562	#ifdef GPFS_PRINTK
	3563	printk("cxiOpenState rc %d devid %x verifier %x:%x\n",
	3564	rc, osP->devid, osP->verifier[0], osP->verifier[1]);
	3565	#endif
	3566
	3567	return rc;
	3568	}
	3569	/* Call to NFS server on DS to get change open state or close the file */
	3570	int cxiChangeState(void vfsP, void p)
	3571	{
	3572	int rc = ENOENT;
	3573	struct super_block sbP = (struct super_block )vfsP;
	3574	struct pnfs_get_state *osP = p;
	3575
	3576	if (sbP->s_export_op->cb_change_state)
	3577	rc = sbP->s_export_op->cb_change_state(osP);
	3578	#ifdef GPFS_PRINTK
	3579	printk("cxiChangeState2 sb %p p %p access %d\n", sbP, p, osP->access);
	3580	#endif
	3581
	3582	return rc;
	3583	}
	3584	/* Call to NFS server on MDS to recall layout */
	3585	int cxiRecallLayout(void vfsP, void vP, void *p)
	3586	{
	3587	int rc = ENOENT;
	3588	struct super_block sbP = (struct super_block )vfsP;
	3589	struct inode iP = (struct inode )vP;
	3590	struct layout_recall lr;
	3591
	3592	lr.fsid = sbP;
	3593	lr.offset = 0;
	3594	lr.length = -1;
	3595
	3596	if (iP == NULL) // recall all layouts for this fs
	3597	lr.layout_type = RECALL_FSID;
	3598
	3599	#ifdef GPFS_PRINTK
	3600	printk("cxiRecallLayout sbP %p type %d\n", sbP, lr.layout_type);
	3601	#endif
	3602	if (sbP->s_export_op->cb_layout_recall) {
	3603	rc = sbP->s_export_op->cb_layout_recall(sbP, iP, &lr);
	3604	}
	3605	else {
	3606	lr.layout_type = RECALL_FILE;
	3607	#ifdef GPFS_PRINTK
	3608	printk("cxiRecallLayout sbP %p iP %p type %d\n", sbP, iP, lr.layout_type);
	3609	#endif
	3610	}
	3611
	3612	#ifdef GPFS_PRINTK
	3613	printk("cxiRecallLayout sbP %p iP %p rc %d\n", sbP, iP, rc);
	3614	#endif
	3615	return rc;
	3616	}
	3617
	3618	/* Get device list
	3619
	3620	gd_type
	3621	in: requested layout type.
	3622	out: available lauout type.
	3623	gd_cookie
	3624	in: cookie returned on the last operation.
	3625	out: none zero cookie if some devices did not fit in the buffer.
	3626	gd_maxcount
	3627	in: buffer size in bytes.
	3628	gd_buffer
	3629	in: pointer to buffer.
	3630	gd_devlist_len
	3631	out: number of items returned in the buffer.
	3632
	3633	error:
	3634	Use the same retrun codes as used for GTEDEVLIST
	3635	*/
	3636	int
	3637	cxiGetDeviceList(int nDests, int idList, void P)
	3638	{
	3639	ENTER(0);
	3640	int rc = 0;
	3641	int i, len, left;
	3642	int j = 0;
	3643	char p, tp;
	3644	char tmp[32];
	3645	struct nfsd4_pnfs_getdevlist dl = (struct nfsd4_pnfs_getdevlist )P;
	3646	struct nfsd4_pnfs_devlist *gd_buf = NULL;
	3647	struct pnfs_filelayout_devaddr *dev;
	3648
	3649	#ifdef GPFS_PRINTK
	3650	printk("xxx cxiGetDeviceList enter nDests %d idList %p \n", nDests, idList);
	3651	#endif
	3652
	3653	dl->gd_type = LAYOUT_NFSV4_FILES;
	3654	dl->gd_cookie = 0;
	3655	dl->gd_devlist_len = 0;
	3656	left = dl->gd_maxcount;
	3657	tp = &tmp[0];
	3658
	3659	len = sizeof(struct nfsd4_pnfs_devlist) * nDests;
	3660	#ifdef GPFS_PRINTK
	3661	printk("xxx cxiGetDeviceList len %d left %d\n", len, left);
	3662	#endif
	3663	if (nDests > left) {
	3664	rc = ENOMEM; //??? NFS4ERR_TOOSMALL
	3665	goto xerror;
	3666	}
	3667	gd_buf = (struct nfsd4_pnfs_devlist *)cxiMallocUnpinned(len);
	3668	if (gd_buf == NULL) {
	3669	rc = ENOMEM;
	3670	goto xerror;
	3671	}
	3672	memset(gd_buf, 0, len);
	3673	dl->gd_devlist = gd_buf;
	3674
	3675	#ifdef GPFS_PRINTK
	3676	printk("xxx cxiGetDeviceList gd_buf %p count %d\n", gd_buf, nDests);
	3677	#endif
	3678	for (i = 0; i < nDests; i++)
	3679	{
	3680	/* make both device id and device address be the same for now */
	3681	gd_buf[j].dev_id = idList[i];
	3682	gd_buf[j].dev_lotype = LAYOUT_NFSV4_FILES;
	3683	if (gd_buf[j].dev_id == INADDR_NONE)
	3684	continue;
	3685
	3686	IPtoString(gd_buf[j].dev_id, tp);
	3687	len = (cxiStrlen(tp));
	3688
	3689	p = (char *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr));
	3690	if (p == NULL) {
	3691	rc = ENOMEM;
	3692	goto xerror;
	3693	}
	3694	memset(p, 0, sizeof(struct pnfs_filelayout_devaddr));
	3695	gd_buf[j].dev_addr = p;
	3696
	3697	dev = (struct pnfs_filelayout_devaddr *)p;
	3698	dev->r_addr.len = len + 4; /* for ".8.1" */
	3699
	3700	p = (char *)cxiMallocUnpinned(dev->r_addr.len+1);
	3701	if (p == NULL) {
	3702	rc = ENOMEM;
	3703	goto xerror;
	3704	}
	3705	dev->r_addr.data = p;
	3706	cxiMemcpy(p, tp, len);
	3707	p = p + len;
	3708	cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */
	3709
	3710	dev->r_netid.len = 3; /'tcp'/
	3711	p = (char *)cxiMallocUnpinned(dev->r_netid.len+1);
	3712	if (p == NULL) {
	3713	rc = ENOMEM;
	3714	goto xerror;
	3715	}
	3716	cxiStrcpy(p, "tcp");
	3717	dev->r_netid.data = p;
	3718
	3719	left = left - 1;
	3720	dl->gd_devlist_len++;
	3721
	3722	TRACE4(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_P1,
	3723	"gpfsGetDeviceList index %d len %d ip %s left %d\n",
	3724	i, dev->r_addr.len, dev->r_addr.data, left);
	3725	#ifdef GPFS_PRINTK
	3726	printk("xxx cxiGetDeviceList index %d id %d len %d ip %s left %d ops %p %p\n",
	3727	i, gd_buf[j].dev_id, dev->r_addr.len,
	3728	dev->r_addr.data, left, dl->gd_ops, dl->gd_ops->devaddr_encode);
	3729	#endif
	3730
	3731	j++;
	3732	}
	3733
	3734	exit:
	3735
	3736	TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_EXIT,
	3737	"cxiGetDeviceList exit: rc %d len %d", rc, len);
	3738	return rc;
	3739
	3740	xerror:
	3741
	3742	if (gd_buf != NULL) {
	3743	for (i = 0; i < j; i++)
	3744	{
	3745	dev = gd_buf[i].dev_addr;
	3746	if (dev) {
	3747	cxiFreeUnpinned(dev->r_addr.data);
	3748	cxiFreeUnpinned(dev->r_netid.data);
	3749	cxiFreeUnpinned(dev);
	3750	}
	3751	}
	3752	cxiFreeUnpinned(gd_buf);
	3753	}
	3754	goto exit;
	3755	}
	3756
	3757	int
	3758	cxiGetDeviceInfo(void *P)
	3759	{
	3760	ENTER(0);
	3761	int rc;
	3762	int len;
	3763	char p, tp;
	3764	char tmp[32];
	3765	struct nfsd4_pnfs_getdevinfo da = (struct nfsd4_pnfs_getdevinfo )P;
	3766	tp = &tmp[0];
	3767	struct pnfs_filelayout_devaddr *dev;
	3768
	3769	IPtoString(da->gd_dev_id, tp);
	3770
	3771	dev = (struct pnfs_filelayout_devaddr *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr));
	3772	if (dev == NULL) {
	3773	rc = ENOMEM;
	3774	goto xerror;
	3775	}
	3776	da->gd_devaddr = dev;
	3777
	3778	len = (cxiStrlen(tp));
	3779	dev->r_addr.len = len + 4; /* for ".8.1" */
	3780
	3781	p = (char *)cxiMallocUnpinned(dev->r_addr.len+1);
	3782	if (p == NULL) {
	3783	cxiFreeUnpinned(dev);
	3784	rc = ENOMEM;
	3785	goto xerror;
	3786	}
	3787	dev->r_addr.data = p;
	3788	cxiMemcpy(p, tp, len);
	3789	p = p + len;
	3790	cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */
	3791
	3792	dev->r_netid.len = 3; /'tcp'/
	3793	p = (char *)cxiMallocUnpinned(dev->r_netid.len+1);
	3794	if (p == NULL) {
	3795	cxiFreeUnpinned(dev->r_addr.data);
	3796	cxiFreeUnpinned(dev);
	3797	rc = ENOMEM;
	3798	goto xerror;
	3799	}
	3800	cxiStrcpy(p, "tcp");
	3801	dev->r_netid.data = p;
	3802
	3803	TRACE2(TRACE_VNODE, 2, TRCID_GPFSOPS_GET_DEVICELINFO_P1,
	3804	"gpfsGetDeviceInfo len %d ip %s\n",
	3805	dev->r_addr.len, dev->r_addr.data);
	3806
	3807	#ifdef GPFS_PRINTK
	3808	printk("xxx cxiGetDeviceInfo id %d len %d ip %s\n",
	3809	da->gd_dev_id, dev->r_addr.len, dev->r_addr.data);
	3810	#endif
	3811
	3812	xerror:
	3813
	3814	TRACE1(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELINFO_EXIT,
	3815	"cxiGetDeviceInfo exit: rc %d\n", rc);
	3816
	3817	return rc;
	3818	}
	3819	/* get layout
	3820	lg_type
	3821	in: requested layout type.
	3822	out: available lauout type.
	3823	lg_offset
	3824	in: requested offset.
	3825	out: returned offset.
	3826	lg_length
	3827	in: requested length.
	3828	out: returned length.
	3829	lg_mxcnt
	3830	in: buffer size in bytes.
	3831	lg_llist
	3832	in: pointer to buffer.
	3833	lg_layout
	3834	out: number of items returned in the buffer.
	3835
	3836	if the file is big(?) return all nodes in layout
	3837	if the file is small return no layout or just one node, choose one node in
	3838	random but make sure it is the same node for the same file.
	3839	*/
	3840	int
	3841	cxiGetLayout(int nDests, int idList, cxiVattr_t vattr, int myAddr, void *P)
	3842	{
	3843	ENTER(0);
	3844	char p, n;
	3845	int i, rc, left, len;
	3846	struct nfsd4_pnfs_layoutget gl = (struct nfsd4_pnfs_layoutget )P;
	3847	struct nfsd4_pnfs_layoutlist *lg_buf = NULL;
	3848	struct nfsd4_pnfs_filelayout *layout = NULL;
	3849
	3850	TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_ENTER,
	3851	"cxiGetLayout: nDests %d myAddr %x\n", nDests,myAddr);
	3852
	3853	/* set node id in fh and increase fh size by 4 */
	3854	rc = cxiSetFH((int *)&gl->lg_fh, myAddr);
	3855	if (rc != 0)
	3856	goto xerror;
	3857
	3858	gl->lg_type = LAYOUT_NFSV4_FILES;
	3859	gl->lg_offset = 0;
	3860	gl->lg_length = MAX_UINT64; /* The maximum file size */
	3861
	3862	layout = (struct nfsd4_pnfs_filelayout *)cxiMallocUnpinned(sizeof(struct nfsd4_pnfs_filelayout));
	3863	if (layout == NULL) {
	3864	rc = ENOMEM;
	3865	goto xerror;
	3866	}
	3867	gl->lg_layout = layout;
	3868	layout->lg_stripe_type = STRIPE_DENSE;
	3869	layout->lg_commit_through_mds = true;
	3870	layout->lg_stripe_unit = vattr->va_blocksize; /* preferred blocksize */
	3871	layout->lg_file_size = vattr->va_size; /* file size in bytes */
	3872	layout->lg_llistlen = 0;
	3873
	3874	left = gl->lg_mxcnt;
	3875
	3876	len = sizeof(struct nfsd4_pnfs_layoutlist) * nDests;
	3877	if (len > left) {
	3878	rc = ENOMEM; // NFS4ERR_TOOSMALL
	3879	goto xerror;
	3880	}
	3881	lg_buf = (struct nfsd4_pnfs_layoutlist *)cxiMallocUnpinned(len);
	3882	if (lg_buf == NULL) {
	3883	rc = ENOMEM;
	3884	goto xerror;
	3885	}
	3886	memset(lg_buf, 0, len);
	3887	layout->lg_llist = lg_buf;
	3888	left = left - len;
	3889
	3890	for (i = 0; i < nDests; i++)
	3891	{
	3892	/* make both device id and device address be the same for now */
	3893	lg_buf[i].dev_ids.len = 1; //??? can return a list of dev ids ????
	3894	lg_buf[i].dev_ids.list = (u32 )cxiMallocUnpinned(sizeof(u32)lg_buf[i].dev_ids.len);
	3895	if (lg_buf[i].dev_ids.list == NULL) {
	3896	rc = ENOMEM;
	3897	goto xerror;
	3898	}
	3899	lg_buf[i].dev_ids.list[0] = idList[i];
	3900	layout->lg_llistlen++;
	3901	lg_buf[i].fhp = (struct knfsd_fh *)&gl->lg_fh;
	3902
	3903	#ifdef GPFS_PRINTK
	3904	printk("cxiGetLayout index %d id %d xid 0x%lX len %d\n",
	3905	i, idList[i], idList[i], len);
	3906	#endif
	3907	TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_P1,
	3908	"cxiGetLayout index %d id 0x%lX len %d\n",
	3909	i, idList[i], len);
	3910
	3911	}
	3912	if (i == 0) {
	3913	layout->lg_llistlen = 0;
	3914	cxiFreeUnpinned(lg_buf);
	3915	}
	3916
	3917	#ifdef GPFS_PRINTK
	3918	printk("cxiGetLayout: type %d iomode %d offset %lld length %lld minlength %lld mxcnt %d ops %p layouts %p\n",
	3919	gl->lg_type, gl->lg_iomode, gl->lg_offset, gl->lg_length, gl->lg_minlength,
	3920	gl->lg_mxcnt, gl->lg_ops, gl->lg_layout);
	3921
	3922	printfh("cxiGetLayout:", gl->lg_fh);
	3923
	3924	printk("cxiGetLayout: layout stripe_type %d stripe_unit %lld file_size %lld llistlen %d llist %p\n",
	3925	layout->lg_stripe_type, layout->lg_stripe_unit,layout->lg_file_size,
	3926	layout->lg_llistlen,layout->lg_llist);
	3927	#endif
	3928
	3929	exit:
	3930
	3931	TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_EXIT,
	3932	"cxiGetLayout exit: rc %d len %d p 0x%lX", rc, len, p);
	3933
	3934	return rc;
	3935
	3936	xerror:
	3937
	3938	if (lg_buf) {
	3939	gl->lg_length = 0;
	3940	for (i = 0; i < nDests; i++)
	3941	{
	3942	cxiFreeUnpinned(lg_buf[i].dev_ids.list);
	3943	}
	3944	cxiFreeUnpinned(lg_buf);
	3945	}
	3946	if (layout)
	3947	cxiFreeUnpinned(layout);
	3948
	3949	goto exit;
	3950	}
	3951	#endif
	3952
	3953	int cxiCheckThreadState(cxiThreadId tid)
	3954	{
	3955	struct task_struct t, g;
	3956	int rc = ENOENT;
	3957
	3958	// read_lock(&tasklist_lock);
	3959	rcu_read_lock();
	3960
	3961	DO_EACH_THREAD(g,t)
	3962	{
	3963	/* We are looking for a thread with a given tid and the same parent as
	3964	the caller (the caller must be another mmfsd thread */
	3965	if (t->pid == tid &&
	3966	cxiStrcmp(t->comm, current->comm) == 0)
	3967	{
	3968	rc = 0;
	3969	break;
	3970	}
	3971	} WHILE_EACH_THREAD(g,t);
	3972	// read_unlock(&tasklist_lock);
	3973	rcu_read_unlock();
	3974
	3975	return rc;
	3976	}
	3977

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gpfs_3.1_ker2.6.20/lpp/mmfs/src/gpl-linux/cxiSystem.c @ 145

Download in other formats: