/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/* @(#)16 1.158.1.9 src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiSystem.c, mmfs, avs_rgpfs24, rgpfs24s007a 10/24/06 19:12:27 */
/*
 * Linux implementation of basic common services
 *
 * Contents:
 *   cxiGetThreadId
 *   getpid
 *   cxiIsSuperUser
 *   DoPanic
 *   logAssertFailed
 *   Kernel memory allocation services:
 *     cxiMallocPinned
 *     cxiFreePinned
 *
 */

#include <Shark-gpl.h>

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/file.h>
#include <linux/string.h>
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/interrupt.h>
#undef memcmp

#define DEFINE_TRACE_GBL_VARS
#include <Logger-gpl.h>
#include <verdep.h>
#include <linux2gpfs.h>
#include <cxiSystem.h>
#include <cxiAtomic.h>
#include <cxi2gpfs.h>
#include <cxiIOBuffer.h>
#include <cxiSharedSeg.h>
#include <cxiCred.h>

#include <Trace.h>
#include <lxtrace.h>
#include <cxiMode.h>
#if LINUX_KERNEL_VERSION >= 2060000
#include <linux/swap.h>
#include <linux/writeback.h>
#endif

#if LINUX_KERNEL_VERSION >= 2040900
/* This is in the Redhat kernel series */
extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
#endif

#ifdef INSTRUMENT_LOCKS
struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES];
#endif  /* INSTRUMENT_LOCKS */

/* We record the daemon's process group since it can uniquely identify
 * a thread as being part of the GPFS daemon.  The pid is unique per thread
 * on Linux due to its clone implementation.
 */
static pid_t DaemonPGrp = -1;

/* Get the kernel thread ID. */
cxiThreadId cxiGetThreadId()
{
  /* ENTER(1); */
  return current->pid;
}

/* Get the kernel process ID. */
pid_t getpid()
{
  /* ENTER(1); */
  return current->pid;
}

/* bufP is the caller's ext_cred_t buffer
 * uCredPP is the ucred struct (NULL on Linux)
 * eCredPP is the ext_cred_t struct * (if successful)
 *
 * cxiPutCred should be called to release these when the operation has completed.
 */
int cxiGetCred(void *bufP, void **uCredPP, void **eCredPP)
{
  ext_cred_t *eCredP = (ext_cred_t *)bufP;

  ENTER(0);
  *uCredPP = NULL;
  *eCredPP = NULL;

  if (!bufP)
  {
    EXIT_RC(0, EINVAL);
    return EINVAL;
  }

  setCred(eCredP);
  *eCredPP = (void *)eCredP;

xerror:
  EXIT(0);
  return 0;
}

/* Release of cxiGetCred() structures (nothing to do on Linux) */
int cxiPutCred(void *userCredP, void *extCredP)
{
  if (userCredP || !extCredP)
    return EINVAL;

  return 0;
}

/* Convert a kernel stack address to the thread ID of the thread that
 * uses that stack
 */
int
cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP)
{
  struct task_struct * tP;
#if LINUX_KERNEL_VERSION >= 2060000
  /* The kernel stack is based off the thread_info struct in the 2.6 kernel;
   * get the task pointer out of the thread_info struct.
   */
  struct thread_info * iP;
  iP = (struct thread_info *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
  tP = iP->task;
#else
  /* The kernel stack is based off the task_struct struct in the 2.4 kernel */
  tP = (struct task_struct *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
#endif
  ENTER(0);
  *tidP = tP->pid;
  EXIT(0);
  return 0;
}

/* Convert a kernel thread pointer to the corresponding thread ID */
int
cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP)
{
  struct task_struct * tP;

  ENTER(0);
  tP = (struct task_struct *) threadP;
  *tidP = tP->pid;

  EXIT(0);
  return 0;
}


/* Return true if the caller has maximum authorization (is root) */
Boolean cxiIsSuperUser()
{
  return (current->euid == 0);
}


/* Get the process max filesize limit (ulimit -f) */
Int64 cxiGetMaxFileSize()
{
  if ((signed long)MY_RLIM_CUR(RLIMIT_FSIZE) == -1L)
    return MAX_INT64;
  else
    return (MY_RLIM_CUR(RLIMIT_FSIZE));
}
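
/* A minimal user-space sketch (not compiled into this module) of the same
 * RLIMIT_FSIZE convention used above: an "unlimited" soft limit is reported
 * as RLIM_INFINITY, which cxiGetMaxFileSize maps to MAX_INT64.  The function
 * name and constant below are illustrative only.
 */
#if 0
#include <sys/resource.h>

static long long maxFileSizeSketch(void)
{
  struct rlimit rl;

  if (getrlimit(RLIMIT_FSIZE, &rl) != 0)
    return -1;                        /* error path, unlike the kernel code */
  if (rl.rlim_cur == RLIM_INFINITY)   /* analogous to the -1L check above */
    return 0x7fffffffffffffffLL;      /* stands in for MAX_INT64 */
  return (long long)rl.rlim_cur;
}
#endif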

/* Routine to send a signal to the current thread/process */
void cxiSendSigThread(int sig)
{
  ENTER(0);
  send_sig(sig, current, 0);
  EXIT(0);
}


#ifdef MALLOC_DEBUG
/* This tracks mallocs and frees on a limited basis.
 * Implemented originally to determine whether we were leaking
 * any memory after an unload.  This is not really thread
 * safe for multiple processors unless they're automatically
 * cache coherent without memory barriers (i386).  It is useful
 * for detecting memory leaks on a single processor system.
 */
#define MALLOC_RECORDS 5000  /* max mallocs to track */
struct mallocStat
{
  void *beginP;
  unsigned short size;
  unsigned short type;
};
static struct mallocStat *mstatP = NULL;
unsigned int nextMalloc = 0;

void
MallocDebugStart()
{
  int i;

  ENTER(0);
  if (mstatP == NULL)
    mstatP = vmalloc(MALLOC_RECORDS * sizeof(struct mallocStat));

  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = 0; i < MALLOC_RECORDS; i++)
  {
    mstatP[i].beginP = NULL;
    mstatP[i].size = 0;
    mstatP[i].type = 0;
  }
  printk("MallocDebugStart 0x%X\n", mstatP);
  EXIT(0);
}

void
MallocDebugEnd()
{
  int i;

  ENTER(0);
  if (mstatP != NULL)
  {
    for (i = 0; i < MALLOC_RECORDS; i++)
    {
      if (mstatP[i].beginP != NULL)
        printk("MallocDebug: beginP 0x%X size %d type %d STILL ALLOCATED!\n",
               mstatP[i].beginP, mstatP[i].size, mstatP[i].type);
    }
  }

  vfree(mstatP);
  mstatP = NULL;
  EXIT(0);
}

void
MallocDebugNew(void *ptr, unsigned short size, unsigned short type)
{
  void *bP;
  int i;
  int j;
  int swrc;
  int oldval;
  int where = nextMalloc;

  ENTER(0);

  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = where; i < MALLOC_RECORDS + where; i++)
  {
    if (i >= MALLOC_RECORDS)
      j = i - MALLOC_RECORDS;
    else
      j = i;

    bP = mstatP[j].beginP;
    if (bP == NULL)
    {
      swrc = ATOMIC_SWAP(&mstatP[j].beginP, &bP, ptr);
      if (swrc)
      {
        mstatP[j].size = size;
        mstatP[j].type = type;
        break;
      }
    }
  }

  EXIT(0);
}

void
MallocDebugDelete(void *ptr)
{
  void *bP;
  int i;
  int swrc;
  int next;
  int found = 0;

  ENTER(0);
  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = 0; i < MALLOC_RECORDS; i++)
  {
    bP = mstatP[i].beginP;
    if (bP == ptr)
    {
      next = nextMalloc;
      ATOMIC_SWAP(&nextMalloc, &next, i);

      swrc = ATOMIC_SWAP(&mstatP[i].beginP, &bP, NULL);
      DBGASSERT(swrc);
      found = 1;
      break;
    }
  }

  if (!found)
    printk("MallocDebug: 0x%X not found!\n", ptr);
  EXIT(0);
}
#endif /* MALLOC_DEBUG */

/* Allocate pinned kernel memory */
void* cxiMallocPinned(int nBytes)
{
  void *ptr;

  /* kmalloc only supports requests for up to 131072 bytes.  Anything
     larger than this results in a BUG() call. */
  ENTER(0);
  if (nBytes > 131072)
  {
    EXIT(0);
    return NULL;
  }

  ptr = kmalloc(nBytes, GFP_KERNEL);

#ifdef MALLOC_DEBUG
  MallocDebugNew(ptr, nBytes, 1);
#endif

  EXIT(0);
  return ptr;
}

/* Free pinned kernel memory that was allocated with cxiMallocPinned.
   Must not block on lack of memory resources. */
void cxiFreePinned(void* p)
{
  ENTER(0);
#ifdef MALLOC_DEBUG
  MallocDebugDelete(p);
#endif

  kfree(p);
  EXIT(0);
}

/* Return the fcntl lock owner: the one recorded in the eflock if supplied,
   otherwise the current process's file table. */
void* cxiGetFcntlOwner(eflock_t *flP)
{
  return flP? flP->l_owner: current->files;
}

#if LINUX_KERNEL_VERSION > 2060900
struct lock_manager_operations lm_operations = {
};
#endif

/* Perform local advisory locking. */
int cxiFcntlLock(void *advObjP,
                 int cmd,
                 void *lockStructP,
                 cxiFlock_t *flockP,
                 int (*retryCB)(),
                 cxiOff64_t size,
                 cxiOff64_t offset,
                 ulong *retry_idP)
{
  int len, rc = 0;
  // struct file *fP;
  struct file_lock fl, *flP, *gflP, *cflP;
  Boolean keepLockElement = false;

  /* cast platform independent arguments as appropriate for linux */
  void (*RetryFcn)(struct file_lock*) = (void (*)(struct file_lock*))retryCB;
  // fP = (struct file *)advObjP;
  struct file localFile, *filp = &localFile;
  struct dentry localDEntry, *dp = &localDEntry;
  ENTER(0);
  flP = (struct file_lock *) lockStructP;

  localFile.f_dentry = &localDEntry;
  localDEntry.d_inode = (struct inode *)advObjP;

  /* Lock commands can have two different values.  Convert them at
   * entry to the portability layer so that we only have to check
   * for one of them.
   */
#if !defined(__64BIT__)
  if (cmd == F_GETLK64) cmd = F_GETLK;
  if (cmd == F_SETLK64) cmd = F_SETLK;
  if (cmd == F_SETLKW64) cmd = F_SETLKW;
#endif

  /* Callers have the option of passing a platform dependent lock structure
     (struct file_lock *lockStructP) or the generic (cxiFlock_t *flockP). */
  if (flockP)
  {
    flP = &fl;  /* Use a local file_lock structure */

    /* If there is a potential for blocking, must malloc the locking structure
       so it can persist until the lock becomes available (in Retry()). */

    if (cmd == F_SETLKW)
    {
#ifdef NFS_CLUSTER_LOCKS
      len = sizeof(struct file_lock) +
            sizeof(struct file) +
            sizeof(struct dentry);
#else
      len = sizeof(struct file_lock);
#endif
      flP = (struct file_lock*)cxiMallocUnpinned(len);
      if (flP == NULL)
      {
        rc = ENOMEM;
        goto exit;
      }
      cxiMemset(flP, 0, len);
#ifdef NFS_CLUSTER_LOCKS
      filp = (struct file*)((char *)flP + sizeof(struct file_lock));
      dp = (struct dentry *)((char *)filp + sizeof(struct file));
      filp->f_dentry = dp;
      dp->d_inode = (struct inode *)advObjP;
#endif
    }
    else
      cxiMemset(flP, 0, sizeof(*flP));

    locks_init_lock(flP);  /* Initialize list_head structs */
    if (flockP->l_file == NULL)
      flockP->l_file = filp;

    /* fl_wait needs to be initialized because when unlock happens, the
       linux routine locks_wake_up_blocks invokes our retry routine via
       fl_notify and then calls wake_up(fl_wait) on the assumption that
       the waiter is local. */

    cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait);

    cxiFlockToVFS(flockP, flP);
  }

  /* daemon didn't know the owner and required kernel code to fill it in. */
  if (!flP->fl_owner)
    flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL);

#if 0
  /* Validate the file pointer.  Kernel locking routines are going to
     use these without verifying them.  If any of them are NULL, find
     out now before they generate a segment violation. */
  if ((!fP) || (!fP->f_dentry) || (!fP->f_dentry->d_inode))
  {
    if (cmd == F_GETLK)
      flP->fl_type = F_UNLCK;
    else
      rc = EINVAL;
    goto exit;
  }
#endif

  /* Note that this all depends on us having serialized such locking for
     this file from before the posix_test_lock() until after the
     posix_block_lock().  The revoke lock that we hold here provides us
     the necessary serialization. */

  TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER,
         "cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X "
         "range 0x%lX-%lX cmd %s type %s\n",
         flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
         (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
         (flP->fl_type == F_RDLCK) ? "RDLCK" :
         (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");

  if (cmd == F_GETLK)
  {
    /* Check for conflicts.  If found, return the information.
       If there are NO conflicts, return F_UNLCK in fl_type. */
#if LINUX_KERNEL_VERSION >= 2061700
    struct file_lock conf;
    gflP = &conf;
    rc = posix_test_lock(filp, flP, gflP);
    if (rc) {
      rc = 0;
#else
    if (NULL != (gflP = posix_test_lock(&localFile, flP))) {
#endif
      flP->fl_start = gflP->fl_start;
      flP->fl_end = gflP->fl_end;
      flP->fl_type = gflP->fl_type;
      flP->fl_pid = gflP->fl_pid;
      flP->fl_owner = gflP->fl_owner;
    }
    else
      flP->fl_type = F_UNLCK;

    TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK,
           "cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X "
           "range 0x%lX-%lX type %s\n",
           flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
           (flP->fl_type == F_RDLCK) ? "RDLCK" :
           (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
  }
  else
  { /* Begin: do the locking, but handle the blocking via our retry routine. */
    /* Test the lock.  What this really does for us is return the blocker
       if one exists.  This is needed to queue up the request if a conflicting
       lock is already held. */

#ifdef NFS_CLUSTER_LOCKS
    if (cmd == F_SETLKW) {
      flP->fl_flags |= FL_SLEEP;
      if (!flP->fl_lmops) {
        flP->fl_lmops = &lm_operations;
        flP->fl_lmops->fl_notify = (void *)RetryFcn;
      }
    }
    rc = POSIX_LOCK_FILE(filp, flP);
    if (rc == -EAGAIN && (cmd == F_SETLKW) &&
        flP->fl_lmops == &lm_operations)
    {
      /* Queue the blocker structures */
      keepLockElement = true;
      if (retry_idP)
        *retry_idP = (ulong)flP;  // returned to caller and saved in sleepElement
    }
#else
#if LINUX_KERNEL_VERSION >= 2061700
    if ((flP->fl_type == F_UNLCK) || !(posix_test_lock(&localFile, flP, cflP)))
#else
    if ((flP->fl_type == F_UNLCK) || !(cflP = posix_test_lock(&localFile, flP)))
#endif
    {
      /* No conflicting lock: get the lock for the caller. */
      rc = POSIX_LOCK_FILE(&localFile, flP);
    }
    else
    { /* Conflicting lock: ..... */
      rc = EAGAIN;

      if (cmd == F_SETLKW)
      {
        /*if (posix_locks_deadlock(flP, cflP))
        {
          rc = EDEADLK;
        }
        else*/
        {
          /* Queue the blocker structures */
          keepLockElement = true;
          if (retry_idP)
            *retry_idP = (ulong)flP;  // returned to caller and saved in sleepElement
#if LINUX_KERNEL_VERSION > 2060900
          flP->fl_lmops = &lm_operations;
          flP->fl_lmops->fl_notify = RetryFcn;
#else
          flP->fl_notify = RetryFcn;
#endif
#if LINUX_KERNEL_VERSION < 2061700
          posix_block_lock(cflP, flP);
#endif
        }
      }
    }
#endif

    TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT,
           "cxiFcntlLock posix_lock_file: rc %d retry_id 0x%lX\n", rc, cflP);
  } /* End: do the locking, but handle the blocking via our retry routine. */

exit:

  if (flockP)
  {
    /* Caller wanted results in flockP */
    cxiVFSToFlock((void *)flP, flockP);

    /* If we allocated the locking structure and then didn't need to use
       it (the lock request didn't block), free it. */

    if ((flP != &fl) && (!keepLockElement)) {
      cxiFreeUnpinned(flP);
    }
  }

#ifdef NFS_CLUSTER_LOCKS
  if (rc < 0)
    rc = -rc;  /* make it positive */
#endif
  EXIT_RC(0, rc);
  return rc;
}

void cxiFcntlUnblock(void *retry_idP)
{
  struct file_lock *flP = (struct file_lock *)retry_idP;

  ENTER(0);
  /* Include some sanity checks on the retry id (file_lock)
     before passing it into the routine that does the work.
     It should be properly linked (via its list_head structures)
     in a file_lock_list that has blocked waiters.  Also,
     we would only be backing this out on behalf of the process that
     originally blocked, so verify the pid. */

  if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link) &&
      flP->fl_next && flP->fl_pid == getpid())
  {
    POSIX_UNBLOCK_LOCK(flP);
  }
  EXIT(0);
}

int
cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid)
{
  int rc = 0;
  struct super_block *sbP = (struct super_block *)vfsP;
  struct list_head *fllP;
  struct file_lock *fl;
  struct dentry *dentryP;

  ENTER(0);
  lock_kernel();

restart:

#if LINUX_KERNEL_VERSION >= 2061600
  //??? find a different way to clear locks; file_lock_list is not exported anymore
#else
  fllP = file_lock_list.next;

  while(fllP != &file_lock_list)
  {
    fl = list_entry(fllP, struct file_lock, fl_link);
    fllP = fllP->next;

    /* If there are mmfs lock structures, release them. */

    if (fl &&
        fl->fl_file &&
        fl->fl_file->f_dentry &&
        fl->fl_file->f_dentry->d_inode)
    {
      dentryP = fl->fl_file->f_dentry;

      /* If this lock belongs to the specified vfs, release advisory locks. */
      if (dentryP->d_sb == sbP)
      {
        /* remove all our locks */
        rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid);
        if (rc == ENOSYS)
          goto xerror;

        /* After freeing unknown numbers of locks in gpfsFcntlReset (all
           locks for the inode), restart from the top of the lock list */
        goto restart;
      }
    }
  }
#endif

xerror:
  unlock_kernel();
  EXIT_RC(0, rc);
  return rc;
}

void *
cxiGetPrivVfsP(void *vfsP)
{
  struct super_block *sbP = (struct super_block *)vfsP;

  /* Do some sanity checking */
  if ( (sbP->s_magic != GPFS_SUPER_MAGIC) ||
       ((UIntPtr) SBLOCK_PRIVATE(sbP) < GPFS_KERNEL_OFFSET) )
    printSuperList(sbP);
  LOGASSERT( sbP->s_magic == GPFS_SUPER_MAGIC );
  LOGASSERT( (UIntPtr) SBLOCK_PRIVATE(sbP) >= GPFS_KERNEL_OFFSET );

  return (SBLOCK_PRIVATE(sbP));
}


#ifdef NFS_DEBUG
/* These flags are defined in the kernel and control various debug printk
   calls.  This provides us a way to easily turn these on/off for
   debugging our NFS support. */
extern unsigned int nlm_debug;
extern unsigned int nfsd_debug;
extern unsigned int nfs_debug;
extern unsigned int rpc_debug;
#endif

int cxiTrace(cxiTrace_t trace)
{
#ifdef NFS_DEBUG
  int rc = 0;

  ENTER(0);
  switch (trace)
  {
    case cxiTraceNFS:
      nlm_debug = nfsd_debug = nfs_debug = rpc_debug = ~0;
      break;
    case cxiTraceNFSoff:
      nlm_debug = nfsd_debug = nfs_debug = rpc_debug = 0;
      break;
    default:
      rc = EINVAL;
      break;
  }
  EXIT_RC(0, rc);
  return rc;
#else
  return ENOSYS;
#endif
}

void cxiFlockToVFS(eflock_t* lckdatP, void* vP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  ENTER(0);
  if ((flP) && (lckdatP))
  {
    flP->fl_pid   = lckdatP->l_pid;
    flP->fl_owner = lckdatP->l_owner;
    flP->fl_type  = lckdatP->l_type;
    flP->fl_start = lckdatP->l_start;
    flP->fl_flags = FL_POSIX;
#ifdef NFS_CLUSTER_LOCKS
    flP->fl_lmops = lckdatP->l_lmops;
    flP->fl_file  = lckdatP->l_file;
    flP->fl_ops   = NULL;
#else
#if LINUX_KERNEL_VERSION < 2061700
    if (lckdatP->l_caller == L_CALLER_LOCKD)
      flP->fl_flags |= FL_LOCKD;
#endif
#endif
    if (lckdatP->l_len == 0)
      flP->fl_end = FL_OFFSET_MAX;
    else
      flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1;
  }
  EXIT(0);
  return;
}

#ifdef NFS_CLUSTER_LOCKS
int cxiVFSCallback(eflock_t* lckreqP, eflock_t* lckdatP,
                   int(* callback)(void *, void *, int), int result)
{
  struct file_lock fl;
  struct file *fileP;
  struct file_lock conf, *confP = NULL;
  int rc;

  ENTER(0);

  cxiFlockToVFS(lckreqP, &fl);
  fileP = fl.fl_file;
  if (!fileP) {
    return -1;
  }
  if (lckdatP) {
    cxiFlockToVFS(lckdatP, &conf);
    confP = &conf;
  }
  if (!result) {  /* try to get the posix lock */
    rc = POSIX_LOCK_FILE(fileP, &fl);
    if (rc)
      callback(&fl, NULL, EBUSY);
    else {  /* got the posix lock */
      rc = callback(&fl, confP, result);
      if (rc) {  /* too late, free the lock */
        fl.fl_type = F_UNLCK;
        rc = POSIX_LOCK_FILE(fileP, &fl);
      }
    }
  }
  else
    rc = callback(&fl, confP, result);

#ifdef NFS_CLUSTER_LOCKS
  if (rc < 0)
    rc = -rc;  /* make it positive */
#endif
  EXIT_RC(0, rc);
  return rc;
}
#endif

void cxiVFSToFlock(void *vP, eflock_t *lckdatP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  ENTER(0);
  if ((flP) && (lckdatP))
  {
    lckdatP->l_pid   = flP->fl_pid;
    lckdatP->l_owner = flP->fl_owner;
    lckdatP->l_type  = flP->fl_type;
    lckdatP->l_start = flP->fl_start;
    lckdatP->l_flags = flP->fl_flags;
#ifdef NFS_CLUSTER_LOCKS
    lckdatP->l_lmops = flP->fl_lmops;
    lckdatP->l_file  = flP->fl_file;
    if (lckdatP->l_lmops)  /* must be lockd or nfsd */
#else
#if LINUX_KERNEL_VERSION >= 2061700
    if (lckdatP->l_lmops)  /* must be lockd or nfsd */
#else
    if (flP->fl_flags & FL_LOCKD)
#endif
#endif
      lckdatP->l_caller = L_CALLER_LOCKD;
    else
      lckdatP->l_caller = L_CALLER_NULL;
    if (flP->fl_end == FL_OFFSET_MAX)
      lckdatP->l_len = 0;
    else
      lckdatP->l_len = flP->fl_end - flP->fl_start + 1;
  }
  EXIT(0);
  return;
}
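
/* A minimal user-space sketch (not compiled into this module) of the
 * byte-range conversion used by cxiFlockToVFS/cxiVFSToFlock above: a
 * (start, len) pair with len == 0 means "to end of file" and maps to
 * fl_end = FL_OFFSET_MAX; otherwise fl_end = start + len - 1 (inclusive).
 * OFFSET_MAX_SKETCH and the function names below are illustrative only.
 */
#if 0
#define OFFSET_MAX_SKETCH 0x7fffffffffffffffLL

static long long lenToEndSketch(long long start, long long len)
{
  if (len == 0)
    return OFFSET_MAX_SKETCH;   /* lock extends to end of file */
  return start + len - 1;       /* e.g. start 100, len 50 -> end 149 */
}

static long long endToLenSketch(long long start, long long end)
{
  if (end == OFFSET_MAX_SKETCH)
    return 0;                   /* 0 means "to end of file" again */
  return end - start + 1;       /* e.g. start 100, end 149 -> len 50 */
}
#endif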


/* Sleep for the indicated number of milliseconds */
void cxiSleep(int ms)
{
  ENTER(0);
  TRACE1(TRACE_VNODE, 9, TRCID_SLEEP,
         "cxiSleep: begin delay %d\n", ms);
  current->state = TASK_INTERRUPTIBLE;
  /* For large HZ, rearrange the jiffies calculation and
     use a presumably larger word size to minimize overflow risk */
  if (unlikely(HZ > 1000))
    schedule_timeout(((long)ms)*HZ/1000);
  else
    schedule_timeout(ms/(1000/HZ));
  TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END,
         "cxiSleep: end delay %d HZ %d\n", ms, HZ);
  EXIT(0);
}
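
/* A minimal user-space sketch (not compiled into this module) of the
 * millisecond-to-jiffies conversion above.  For HZ <= 1000 the code divides
 * ms by (1000/HZ); for larger HZ it computes ms*HZ/1000 in a long so that
 * 1000/HZ is never truncated to zero.  hzSketch is illustrative only.
 */
#if 0
static long msToJiffiesSketch(int ms, long hzSketch)
{
  if (hzSketch > 1000)
    return ((long)ms) * hzSketch / 1000;  /* e.g. 100 ms at HZ 1024 -> 102 jiffies */
  return ms / (1000 / hzSketch);          /* e.g. 100 ms at HZ 250  -> 25 jiffies  */
}
#endif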


void cxiOpenNFS(void *iP)
{
  struct inode *inodeP = (struct inode *)iP;
  int refcount;

  /* A reference is placed on the cxiNode here when the first NFS reference
     is added */
  ENTER(0);
  refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1);

  TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS,
         "openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX "
         "refcount %d\n",
         inodeP, (inodeP) ? inodeP->i_ino : -1,
         (inodeP) ? inodeP->i_ino : -1,
         (inodeP) ? inodeP->i_mode : -1,
         (inodeP) ? inodeP->i_nlink : -1,
         (inodeP) ? inodeP->PRVINODE : NULL,
         refcount);

  DBGASSERT(refcount != 0);
  EXIT(0);
}


int cxiCloseNFS(void *vP, void *viP)
{
  int rc = 0;
  struct inode *iP = (struct inode *)vP;

  /* If viP is NULL, the file was never actually opened.
     If viP is not NULL, close it. */
  ENTER(0);
  if (viP == NULL)
    rc = 0;
  else {
    if (VP_TO_PVP(iP) != NULL && VP_TO_CNP(iP) != NULL) {
      rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD|FWRITE,
                              (struct MMFSVInfo *)viP, true);
      cxiPutOSNode((void *)iP);
    }
  }

  EXIT_RC(0, rc);
  return rc;
}

static int cxiNFSCluster = 0;

void cxiSetNFSCluster(int set)
{
  cxiNFSCluster = set;
}

/* To avoid failing the NFS client, the NFSD thread is put to sleep.  Another
   node will take over this client and the operation will continue without any
   errors to the application.
*/
void cxiNFSError(int rc, const char *str)
{
  TRACE2(TRACE_VNODE, 9, TRCID_NFS_ERROR,
         "cxiNFSError: %s got rc %d\n", str, rc);
  if (cxiNFSCluster && cxiIsNFSThread() && (rc == ESTALE || rc == -ESTALE))
  {
    TRACE2(TRACE_VNODE, 1, TRCID_NFS_ERROR_1,
           "cxiNFSError: NFS got error %d from %s sleep\n", rc, str);
    cxiSleep(120000);  // wait 120 seconds
  }
}

void * cxiGetNfsP(void *vP)
{
  if (vP && VP_TO_CNP((struct inode *)vP))
    return VP_TO_NFSP((struct inode *)vP);
  else
    return NULL;
}

void cxiSetNfsP(void *vP, void *newP)
{
  if (VP_TO_CNP((struct inode *)vP))
    VP_TO_NFSP((struct inode *)vP) = newP;
}

void * cxiGetCnP(void *vP)
{ return (void *)VP_TO_CNP((struct inode *)vP); }

void * cxiGetPvP(void *vP)
{ return (void *)VP_TO_PVP((struct inode *)vP); }

void * cxiGNPtoVP(void *vP)
{ return (void *)GNP_TO_VP((struct cxiNode_t *)vP); }

/* Main routine of kproc */
static int kprocMain(void *argP)
{
  cxiKProcData_t *kpdP = (cxiKProcData_t *)argP;

  /* Change our process name */
  ENTER(0);
  current->comm[sizeof(current->comm) - 1] = '\0';
  strncpy(current->comm, kpdP->nameP, sizeof(current->comm) - 1);

  /* Change parent of a kernel process so that when it exits, it won't
   * send a SIGCHLD signal to the process that created it, and it won't
   * be left as a zombie.
   */
  DAEMONIZE(kpdP->nameP);

  /* Call the function specified by startKProc */
  kpdP->func(kpdP);
  EXIT(0);
  return 0;
}

/* Create a new kernel process */
cxiPid_t
cxiStartKProc(struct cxiKProcData_t *kpdP)
{
  cxiPid_t pid = kernel_thread(kprocMain, kpdP, kpdP->kprocFlags);
  ENTER(0);
  kpdP->pid = pid > 0 ? pid : KPROC_FAILED_PID;

  TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX,
         "cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid);
  EXIT(0);
  return kpdP->pid;
}

void
cxiStopKProc(struct cxiKProcData_t *kpdP)
{
  cxiPid_t pid;

  ENTER(0);
  cxiBlockingMutexAcquire(&kpdP->lock);

  TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX,
         "cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid);

  if (!KPROC_RUNNING(kpdP))
  {
    cxiBlockingMutexRelease(&kpdP->lock);
    EXIT(0);
    return;
  }

  pid = kpdP->pid;              // Cache pid before signal/wait
  kpdP->terminate = true;
  cxiWaitEventSignal(&kpdP->kprocEvent);

  while (kpdP->pid != KPROC_UNASSIGNED_PID)
    cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);

  cxiBlockingMutexRelease(&kpdP->lock);
  EXIT(0);
}

/*-------------------------------------------------------------------
 * logAssertFailed - Subroutine consolidating logGenIF() and
 *                   DoPanic() calls.
 *------------------------------------------------------------------*/

static char PanicMsgBuf[2048];

void cxiPanic(const char* panicStrP)
{
  printk(GPFS_NOTICE "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP);
  TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP);
#ifndef DISABLE_KERNEL_PANIC
  BUG();
#endif
}

static void
DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode,
        Int32 reasonCode, char *dataStr)
{
  const char *p;
  int bytesLeft;

  p = cxiStrrchr(filenameP, '/');
  if (p == NULL)
    p = filenameP;
  else
    p += 1;

  sprintf(PanicMsgBuf, "%s:%d:%d:%d:", p, lineNum, retCode, reasonCode);
  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  if (dataStr)
  {
    strncat(PanicMsgBuf, dataStr, bytesLeft-1);
    bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  }
  strncat(PanicMsgBuf, ":", bytesLeft-1);
  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  if (condP)
    strncat(PanicMsgBuf, condP, bytesLeft-1);
  cxiPanic(PanicMsgBuf);
}

#ifdef MODULE
void
logAssertFailed(UInt32 flags,         /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */
                char  *srcFileName,   /* __FILE__ */
                UInt32 srcLineNumber, /* __LINE__ */
                Int32  retCode,       /* return code value */
                Int32  reasonCode,    /* normally errno */
                UInt32 logRecTag,     /* tag if have associated error log rec */
                char  *dataStr,       /* assert data string */
                char  *failingExpr)   /* expression that evaluated to false */
{
  int i;

  printk("GPFS logAssertFailed: %s file %s line %d\n",
         failingExpr, srcFileName, srcLineNumber);
  ENTER(0);
  TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1,
         "logAssertFailed: %s retCode %d reasonCode %d\n",
         failingExpr, retCode, reasonCode);
  TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2,
         "logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber);
#ifndef GPFS_PRINTF
  /* fsync buffered lxtrace records */
  trc_fsync();

#ifdef STOP_TRACE_ON_FAILURE
  /* Turn off tracing right after the failure occurs.  This may only turn
     off tracing in the kernel. */
  for (i=0 ; i<MAX_TRACE_CLASSES ; i++)
    TraceFlagsP[i] = 0;
#endif

  /* Wait 10 seconds to allow the lxtrace daemon to complete the sync. */
  cxiSleep(10000);
#endif
  gpfs_ops.gpfsDaemonToDie(srcFileName, srcLineNumber, retCode, reasonCode,
                           dataStr, failingExpr);

  DoPanic(failingExpr, srcFileName, srcLineNumber, retCode, reasonCode,
          dataStr);
}
#else /* !MODULE */
void
logAssertFailed(UInt32 flags,
                char  *srcFileName,
                UInt32 srcLineNumber,
                Int32  retCode,
                Int32  reasonCode,
                UInt32 logRecTag,
                char  *dataStr,
                char  *failingExpr);
#endif /* MODULE */
1151 | |
---|
1152 | typedef struct cxiWaitElement_t |
---|
1153 | { |
---|
1154 | cxiWaitList_t waitList; /* previous and next element in chain */ |
---|
1155 | |
---|
1156 | /* Linux would normally organize a wait_queue_head_t with any number |
---|
1157 | * of wait_queue_t elements. However since we're implementing "wakeup |
---|
1158 | * with return code" we have to ensure the OS wakes up the exact sleeper |
---|
1159 | * we want. Thus we have only a one to one relationship to ensure the |
---|
1160 | * OS can only pick our favorite. |
---|
1161 | */ |
---|
1162 | wait_queue_head_t qhead; |
---|
1163 | wait_queue_t qwaiter; |
---|
1164 | int wakeupRC; /* wakeup return code */ |
---|
1165 | |
---|
1166 | } cxiWaitElement_t; |
---|
1167 | |
---|
1168 | |
---|
1169 | #define CXI_WAIT_LIST_ADD(headP, elementP) \ |
---|
1170 | (headP)->prevP->nextP = (elementP); \ |
---|
1171 | (elementP)->prevP = (headP)->prevP; \ |
---|
1172 | (headP)->prevP = (elementP); \ |
---|
1173 | (elementP)->nextP = (headP); |
---|
1174 | |
---|
1175 | #define CXI_WAIT_LIST_REMOVE(elementP) \ |
---|
1176 | (elementP)->prevP->nextP = (elementP)->nextP; \ |
---|
1177 | (elementP)->nextP->prevP = (elementP)->prevP; |
---|
1178 | |
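/* A minimal user-space sketch (not compiled into this module) showing how
 * the CXI_WAIT_LIST_ADD/REMOVE macros above maintain a circular
 * doubly-linked list anchored at a dummy head.  struct sketchNode is a
 * stand-in for cxiWaitList_t; the macros only rely on nextP/prevP members.
 */
#if 0
#include <assert.h>

struct sketchNode { struct sketchNode *nextP, *prevP; };

static void waitListSketch(void)
{
  struct sketchNode head, a, b;

  /* An empty list is a head that points at itself in both directions. */
  head.nextP = head.prevP = &head;

  CXI_WAIT_LIST_ADD(&head, &a);    /* appends at the tail: head <-> a */
  CXI_WAIT_LIST_ADD(&head, &b);    /* head <-> a <-> b */
  assert(head.nextP == &a && a.nextP == &b && b.nextP == &head);

  CXI_WAIT_LIST_REMOVE(&a);        /* unlink a: head <-> b */
  assert(head.nextP == &b && b.prevP == &head);
}
#endif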

/* Initialize abstract wait event with OS specific
 * initialization function
 */
void
cxiWaitEventInit(cxiWaitEvent_t *weP)
{
  spinlock_t *lockP = (spinlock_t *)&weP->lword;

  spin_lock_init(lockP);
  weP->waitList.nextP = weP->waitList.prevP = &weP->waitList;
}

Boolean
cxiWaitEventHasWaiters(cxiWaitEvent_t *weP)
{
  unsigned long flags;
  spinlock_t *lockP = (spinlock_t *)(weP->lword);
  Boolean rc;

  SPIN_LOCK_IRQ(lockP, flags);
  rc = (weP->waitList.nextP != &weP->waitList);
  SPIN_UNLOCK_IRQ(lockP, flags);
  return rc;
}

/* Do not add trace records.  Some callers depend on not being
 * interrupted by the trace daemon.
 */
enum WakeType { wBroadcast, wSignal, wWakeOne };
static inline void
doWakeup(cxiWaitEvent_t *wEventP, enum WakeType wtype, int wakeupRC)
{
  unsigned long flags;
  spinlock_t *lockP = (spinlock_t *)(wEventP->lword);
  cxiWaitList_t *headP;
  cxiWaitList_t *tmpP;
  cxiWaitElement_t *wP;

  SPIN_LOCK_IRQ(lockP, flags);

  /* We wake up from the front back (FIFO semantics).
   * There's only one wait element per wait_queue_head_t so
   * record the return code and wake up the one element.
   */
  headP = &wEventP->waitList;

  for (tmpP = headP->nextP; tmpP != headP; tmpP = tmpP->nextP)
  {
    wP = list_entry(tmpP, cxiWaitElement_t, waitList);
    wP->wakeupRC = wakeupRC;

    wake_up(&wP->qhead);
    if (wtype != wBroadcast)
    {
      /* The difference between wSignal and wWakeOne is that the latter
         guarantees that multiple wake up calls will each pick a different
         thread if more than one is waiting.  With wSignal, if a thread is
         awakened but hasn't had a chance to run, then subsequent wake up
         calls might all wake the same thread.

         On AIX, the calling routine (e_wakeup_one) removes the waiter from
         the queue, unlike Linux where removal is done by the waiting
         thread when it wakes up.  Nothing special has to be done on AIX to
         get the wWakeOne style of wakeup.

         Note: This is an inline routine and the wtype argument is a
         compile-time constant, so the "if" tests in this routine are done
         by the compiler and do not generate any code. */

      if (wtype == wWakeOne)
      {
        /* Move this entry to the tail of the list so that the next wakeup
           call will pick somebody else. */
        CXI_WAIT_LIST_REMOVE(tmpP);
        CXI_WAIT_LIST_ADD(headP, tmpP);
      }
      break;
    }
  }
  SPIN_UNLOCK_IRQ(lockP, flags);
}

int
cxiCopyIn(char *from, char *to, unsigned long size)
{
  /* The daemon needs to bypass access checks since a copy to the
   * shared segment would inadvertently fail.
   */
  ENTER(0);
  if (PROCESS_GROUP(current) == DaemonPGrp)
    __copy_from_user(to, from, size);
  else
    if (copy_from_user(to, from, size))
    {
      EXIT_RC(0, EFAULT);
      return EFAULT;
    }
  EXIT(0);
  return 0;
}

int
cxiCopyOut(char *from, char *to, unsigned long size)
{
  int ignore;
  /* The daemon needs to bypass access checks since a copy to the
   * shared segment would inadvertently fail.
   */
  ENTER(0);
  if (PROCESS_GROUP(current) == DaemonPGrp)
    ignore = __copy_to_user(to, from, size);
  else
    if (copy_to_user(to, from, size))
    {
      EXIT_RC(0, EFAULT);
      return EFAULT;
    }
  EXIT(0);
  return 0;
}

int
cxiCopyInstr(char *from, char *to, unsigned long size, unsigned long *len)
{
  long retval;

  ENTER(0);
  retval = strncpy_from_user(to, from, size);
  if ((retval > 0) && (retval <= size))
  {
    *len = retval;
    EXIT(0);
    return 0;
  }
  *len = 0;
  if (retval < 0)
    retval = EFAULT;
  else
    retval = E2BIG;
  EXIT_RC(0, retval);
  return (int)retval;
}

long cxiSafeGetLong(long* from)
{
#if LINUX_KERNEL_VERSION >= 2060000
  long tmp;
  (void)__get_user_nocheck(tmp, from, sizeof(long));
  return tmp;
#else
  return *from;
#endif
}

int cxiSafeGetInt(int* from)
{
#if LINUX_KERNEL_VERSION >= 2060000
  int tmp;
  __get_user_nocheck(tmp, from, sizeof(int));
  return tmp;
#else
  return *from;
#endif
}

void cxiSafePutLong(long val, long* to)
{
#if LINUX_KERNEL_VERSION >= 2060000
  __put_user_nocheck(val, to, sizeof(long));
#else
  *to = val;
#endif
}

void cxiSafePutInt(int val, int* to)
{
#if LINUX_KERNEL_VERSION >= 2060000
  __put_user_nocheck(val, to, sizeof(int));
#else
  *to = val;
#endif
}
---|

#ifdef GPFS_ARCH_X86_64
/* Check if 64-bit user process */
int
cxiIS64U(char *addr)
{
#if LINUX_KERNEL_VERSION > 2060500
  return !(test_thread_flag(TIF_IA32));
#else
  return !(current->thread.flags & THREAD_IA32);
#endif
}
#endif

int
socket_aio_dequeue()
{
  return -1;
}

/* Transfer data from buffer(s) in user space to or from a buffer in the
   kernel. */
int
cxiUiomove(register char* kBufP,           /* address of kernel buffer */
           register unsigned long nBytes,  /* #bytes to transfer */
           Boolean toKernel,               /* direction of xfer(read/write)*/
           register struct cxiUio_t* uioP) /* user area description */
{
  register struct cxiIovec_t * iovP;
  unsigned long cnt;
  int rc;
#ifdef TRACE_IO_DATA
  char* origKBufP = kBufP;
  int trcdata[4];
#endif
  int ignore;

  ENTER(0);
  TRACE4(TRACE_FOPS, 6, TRCID_CXISYSTEM_037,
         "cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n",
         kBufP, uioP, nBytes, toKernel);
  if (uioP->uio_resid <= 0)
  {
    EXIT_RC(0, ENOMEM);
    return ENOMEM;
  }
  rc = 0;
  if (uioP->uio_iovcnt == 1)
  {
    /*
     * Fastpath for most common case of iovcnt == 1.  Saves a
     * few instructions.
     */
    iovP = uioP->uio_iov;
    cnt = iovP->iov_len;
    if (cnt <= 0)
    {
      uioP->uio_iovcnt--;
      uioP->uio_iov++;
      uioP->uio_iovdcnt++;
      EXIT(0);
      return 0;
    }
    if (cnt > nBytes)
      cnt = nBytes;

    if (toKernel)
    {
      /* The daemon needs to bypass access checks since a copy to the
       * shared segment would inadvertently fail.  Copies to
       * kernel address space also perform no validity check.
       */
      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
        __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
      else
        if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
        {
          EXIT_RC(0, EFAULT);
          return EFAULT;
        }
    }
    else
    {
      /* The daemon needs to bypass access checks since a copy to the
       * shared segment would inadvertently fail.  Copies to
       * kernel address space also perform no validity check.
       */
      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
        ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
      else
        if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
        {
          EXIT_RC(0, EFAULT);
          return EFAULT;
        }
    }

    iovP->iov_base = (char *)iovP->iov_base + cnt;
    iovP->iov_len -= cnt;
    uioP->uio_resid -= cnt;
    uioP->uio_offset += cnt;
#ifdef TRACE_IO_DATA
    if (cnt >= sizeof(trcdata))
      memcpy(trcdata, origKBufP, sizeof(trcdata));
    else
    {
      memset(trcdata, 0xAA, sizeof(trcdata));
      memcpy(trcdata, origKBufP, cnt);
    }
    TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_039a,
           "uiomove exit 1: rc %d data %08X %08X %08X %08X\n",
           rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
#else
    TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_039,
           "uiomove exit 1: rc %d\n",
           rc);
#endif
    EXIT_RC(0, rc);
    return rc;
  }
  while (nBytes > 0 && uioP->uio_resid && rc == 0)
  {
    if (uioP->uio_iovcnt <= 0)
    {
      EXIT_RC(0, ENOMEM);
      return ENOMEM;
    }
    iovP = uioP->uio_iov;
    cnt = iovP->iov_len;
    if (cnt <= 0)
    {
      uioP->uio_iovcnt--;
      uioP->uio_iov++;
      uioP->uio_iovdcnt++;
      continue;
    }
    if (cnt > nBytes)
      cnt = nBytes;

    if (toKernel)
    {
      /* The daemon needs to bypass access checks since a copy to the
       * shared segment would inadvertently fail.  Copies to
       * kernel address space also perform no validity check.
       */
      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
        __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
      else
        if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
        {
          EXIT_RC(0, EFAULT);
          return EFAULT;
        }
    }
    else
    {
      /* The daemon needs to bypass access checks since a copy to the
       * shared segment would inadvertently fail.  Copies to
       * kernel address space also perform no validity check.
       */
      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
        ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
      else
        if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
        {
          EXIT_RC(0, EFAULT);
          return EFAULT;
        }
    }
    iovP->iov_base = (char *)iovP->iov_base + cnt;
    iovP->iov_len -= cnt;
    uioP->uio_resid -= cnt;
    uioP->uio_offset += cnt;
    kBufP += cnt;
    nBytes -= cnt;
  }
#ifdef TRACE_IO_DATA
  cnt = kBufP - origKBufP;
  if (cnt >= sizeof(trcdata))
    memcpy(trcdata, origKBufP, sizeof(trcdata));
  else
  {
    memset(trcdata, 0xAA, sizeof(trcdata));
    memcpy(trcdata, origKBufP, cnt);
  }
  TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_041a,
         "uiomove exit 2: rc %d data %08X %08X %08X %08X\n",
         rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
#else
  TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_041,
         "uiomove exit 2: rc %d\n",
         rc);
#endif
  EXIT_RC(0, rc);
  return rc;
}

/*
  Try to force some sanity checks at compile time
*/
/* TO DO: revise this to handle comparisons beyond equality/inequality */
/* STATIC_DBGASSERT(sizeof(spinlock_t), SPINLOCK_T_SIZE); */

/* A routine to check that the definitions in our cxiTypes.h
 * files are equivalent to the system definitions.  The module
 * should not load if it receives an error from this routine.
 */
1570 | int |
---|
1571 | cxiCheckTypes() |
---|
1572 | { |
---|
1573 | int rc = 0; |
---|
1574 | ENTER(0); |
---|
1575 | |
---|
1576 | /* Make sure cxiBlockingMutex_t fits in the space provided. If not, |
---|
1577 | the implementation of the cxiBlockingMutex... routines needs to |
---|
1578 | use the embedded space to record a pointer to kmalloc'ed space holding |
---|
1579 | the semaphore. */ |
---|
1580 | if (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE) |
---|
1581 | { |
---|
1582 | printk("cxiCheckTypes: semaphore %ld > GPFS_LINUX_SEM_SIZE %ld\n", |
---|
1583 | sizeof(struct semaphore), GPFS_LINUX_SEM_SIZE); |
---|
1584 | rc = 1; |
---|
1585 | } |
---|
1586 | |
---|
1587 | /* Size of spinlock_t is smaller for UP case with gcc 3.x, so just |
---|
1588 | ensure SPINLOCK_T_SIZE is large enough for both the UP and SMP case. */ |
---|
1589 | if (sizeof(spinlock_t) > SPINLOCK_T_SIZE) |
---|
1590 | { |
---|
1591 | printk("cxiCheckTypes: spinlock_t %ld > SPINLOCK_T_SIZE %ld\n", |
---|
1592 | sizeof(spinlock_t), SPINLOCK_T_SIZE); |
---|
1593 | rc = 2; |
---|
1594 | } |
---|
1595 | |
---|
1596 | /* Ensure that size of pid_t matches cxiThreadId (32-bits) */ |
---|
1597 | if (sizeof(pid_t) != sizeof(cxiThreadId)) |
---|
1598 | { |
---|
1599 | printk("cxiCheckTypes: pid_t %ld != cxiThreadId %ld\n", |
---|
1600 | sizeof(pid_t), sizeof(cxiThreadId)); |
---|
1601 | rc = 3; |
---|
1602 | } |
---|
1603 | |
---|
1604 | if (rc > 0) |
---|
1605 | TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES, |
---|
1606 | "cxiCheckTypes: system type mismatch on type number %d!\n", rc); |
---|
1607 | EXIT_RC(0, rc); |
---|
1608 | return rc; |
---|
1609 | } |
---|
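/* Illustrative sketch (not part of the original module): the TO DO above
 * asks for compile-time checks beyond simple equality.  A common C idiom
 * is a macro that declares an array whose size becomes negative when the
 * condition is false, so the build fails instead of waiting for
 * cxiCheckTypes() to run at module load.  The macro name CXI_STATIC_ASSERT
 * and the tags in the commented examples are hypothetical. */
#define CXI_STATIC_ASSERT(cond, tag) \
  typedef char cxiStaticAssert_##tag[(cond) ? 1 : -1]

/* Example uses, mirroring the run-time checks in cxiCheckTypes():
   CXI_STATIC_ASSERT(sizeof(pid_t) == sizeof(cxiThreadId), pidMatchesThreadId);
   CXI_STATIC_ASSERT(sizeof(spinlock_t) <= SPINLOCK_T_SIZE, spinlockFits);    */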
1610 | |
---|
1611 | /* Routine to get current time of day in nanosecond format. |
---|
1612 | */ |
---|
1613 | int |
---|
1614 | cxiGetTOD(cxiTimeStruc_t *tsP) |
---|
1615 | { |
---|
1616 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
1617 | struct timespec ts; |
---|
1618 | #else |
---|
1619 | struct timeval tv; |
---|
1620 | #endif |
---|
1621 | |
---|
1622 | ENTER(0); |
---|
1623 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
1624 | ts = CURRENT_TIME; |
---|
1625 | tsP->tv_sec = ts.tv_sec; |
---|
1626 | tsP->tv_nsec = ts.tv_nsec; |
---|
1627 | #else |
---|
1628 | /* This call returns microseconds so we fudge it to nanoseconds */ |
---|
1629 | do_gettimeofday(&tv); |
---|
1630 | tsP->tv_sec = tv.tv_sec; |
---|
1631 | tsP->tv_nsec = tv.tv_usec * 1000; |
---|
1632 | #endif |
---|
1633 | |
---|
1634 | EXIT(0); |
---|
1635 | return 0; |
---|
1636 | } |
---|
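/* Illustrative sketch (not part of the original module): a typical caller
 * measures an interval with two cxiGetTOD() calls and subtracts the results.
 * The helper name cxiTimeDiffNsec is hypothetical; tv_sec and tv_nsec are
 * the fields filled in by cxiGetTOD() above. */
static inline long long cxiTimeDiffNsec(const cxiTimeStruc_t *startP,
                                        const cxiTimeStruc_t *endP)
{
  /* Assumes *endP was sampled after *startP */
  return ((long long)(endP->tv_sec - startP->tv_sec)) * 1000000000LL +
         ((long long)endP->tv_nsec - (long long)startP->tv_nsec);
}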
1637 | |
---|
1638 | Boolean |
---|
1639 | cxiIsNFSThread() |
---|
1640 | { |
---|
1641 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1642 | /* Note comparison against a multibyte character constant (not a string |
---|
1643 | constant). Order of characters in word is reversed due to little- |
---|
1644 | endian representation of integers. */ |
---|
1645 | if (* ((int*)&current->comm[0]) != 0x6473666e) // 'dsfn' |
---|
1646 | return false; |
---|
1647 | if (* ((char*)&current->comm[4]) == '\0') |
---|
1648 | return true; |
---|
1649 | return (* ((int*)&current->comm[2]) == 0x00346473); // '4ds' |
---|
1650 | # else |
---|
1651 | if ((strcmp(current->comm, "nfsd") == 0) || |
---|
1652 | (strcmp(current->comm, "nfsd4") == 0)) |
---|
1653 | return true; |
---|
1654 | return false; |
---|
1655 | # endif |
---|
1656 | } |
---|
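/* Illustrative sketch (not part of the original module): the multibyte
 * constants used above are just the first four bytes of current->comm
 * loaded as a little-endian integer, e.g. "nfsd" becomes 0x6473666e with
 * 'n' in the low-order byte.  The helper name cxiNameWord is hypothetical
 * and is shown only to document the encoding. */
static inline unsigned int cxiNameWord(const char *nameP)
{
  /* Build the word byte by byte to make the little-endian layout explicit */
  return ((unsigned int)(unsigned char)nameP[0])       |
         ((unsigned int)(unsigned char)nameP[1] <<  8) |
         ((unsigned int)(unsigned char)nameP[2] << 16) |
         ((unsigned int)(unsigned char)nameP[3] << 24);
}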
1657 | |
---|
1658 | Boolean |
---|
1659 | cxiIsLockdThread() |
---|
1660 | { |
---|
1661 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1662 | /* Note comparison against a multibyte character constant (not a string |
---|
1663 | constant). Order of characters in word is reversed due to little- |
---|
1664 | endian representation of integers. */ |
---|
1665 | if ((* ((int*)&current->comm[0]) != 0x6b636f6c) | // 'kcol' |
---|
1666 | (* ((int*)&current->comm[2]) != 0x00646b63)) // 'dkc' |
---|
1667 | return false; |
---|
1668 | return * ((char*)&current->comm[5]) == '\0'; |
---|
1669 | # else |
---|
1670 | return (strcmp(current->comm, "lockd") == 0); |
---|
1671 | # endif |
---|
1672 | } |
---|
1673 | |
---|
1674 | Boolean |
---|
1675 | cxiIsNFS4Thread() |
---|
1676 | { |
---|
1677 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1678 | /* Note comparison against a multibyte character constant (not a string |
---|
1679 | constant). Order of characters in word is reversed due to little- |
---|
1680 | endian representation of integers. */ |
---|
1681 | if ((* ((int*)&current->comm[0]) != 0x6473666e) | // 'dsfn' |
---|
1682 | (* ((int*)&current->comm[2]) != 0x00346473)) // '4ds' |
---|
1683 | return false; |
---|
1684 | return * ((char*)&current->comm[5]) == '\0'; |
---|
1685 | # else |
---|
1686 | return (strcmp(current->comm, "nfsd4") == 0); |
---|
1687 | # endif |
---|
1688 | } |
---|
1689 | |
---|
1690 | Boolean |
---|
1691 | cxiIsKupdateThread() |
---|
1692 | { |
---|
1693 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
1694 | /* In 2.6 pdflush replaced kupdated and bdflush from 2.4 */ |
---|
1695 | return current_is_pdflush(); |
---|
1696 | #else |
---|
1697 | return (strcmp(current->comm, "kupdated") == 0); |
---|
1698 | #endif |
---|
1699 | } |
---|
1700 | |
---|
1701 | #ifdef SMB_LOCKS |
---|
1702 | Boolean |
---|
1703 | cxiIsSambaOrLockdThread() |
---|
1704 | { |
---|
1705 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1706 | /* Note comparison against a multibyte character constant (not a string |
---|
1707 | constant). Order of characters in word is reversed due to little- |
---|
1708 | endian representation of integers. */ |
---|
1709 | Boolean rc = (((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms' |
---|
1710 | (* ((char*)&current->comm[4]) == '\0')) | |
---|
1711 | ((* ((int*)&current->comm[0]) == 0x6b636f6c) & // 'kcol' |
---|
1712 | (* ((int*)&current->comm[2]) == 0x00646b63))); // 'dkc' |
---|
1713 | return rc; |
---|
1714 | # else |
---|
1715 | return ((strcmp(current->comm, "smbd") == 0) | |
---|
1716 | (strcmp(current->comm, "lockd") == 0)); |
---|
1717 | # endif |
---|
1718 | } |
---|
1719 | |
---|
1720 | Boolean |
---|
1721 | cxiIsSambaThread() |
---|
1722 | { |
---|
1723 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1724 | /* Note comparison against a multibyte character constant (not a string |
---|
1725 | constant). Order of characters in word is reversed due to little- |
---|
1726 | endian representation of integers. */ |
---|
1727 | Boolean rc = ((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms' |
---|
1728 | (* ((char*)&current->comm[4]) == '\0')); |
---|
1729 | return rc; |
---|
1730 | # else |
---|
1731 | return (strcmp(current->comm, "smbd") == 0); |
---|
1732 | # endif |
---|
1733 | } |
---|
1734 | #endif |
---|
1735 | |
---|
1736 | Boolean |
---|
1737 | cxiIsGPFSThread() |
---|
1738 | { |
---|
1739 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1740 | return (((* ((int*)&current->comm[0]) == 0x73666d6d) & // 'sfmm' |
---|
1741 | (* ((int*)&current->comm[2]) == 0x00647366))); // 'dsf' |
---|
1742 | # else |
---|
1743 | return (strcmp(current->comm, "mmfsd") == 0); |
---|
1744 | # endif |
---|
1745 | } |
---|
1746 | |
---|
1747 | Boolean |
---|
1748 | cxiIsKswapdThread() |
---|
1749 | { |
---|
1750 | #if LINUX_KERNEL_VERSION > 2060000 |
---|
1751 | /* On 2.6, there may be multiple kswapd processes, named kswapd0, kswapd1, |
---|
1752 | * etc. We don't have to depend on the process name to identify kswapd |
---|
1753 | * processes on 2.6 though, there's a better way. */ |
---|
1754 | return current_is_kswapd(); |
---|
1755 | #else |
---|
1756 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
1757 | return ((* ((int*)&current->comm[0]) == 0x6177736b) & // 'awsk' |
---|
1758 | (* ((int*)&current->comm[3]) == 0x00647061)); // ' dpa' |
---|
1759 | # else |
---|
1760 | return (strcmp(current->comm, "kswapd") == 0); |
---|
1761 | # endif |
---|
1762 | #endif |
---|
1763 | } |
---|
1764 | |
---|
1765 | #ifdef INSTRUMENT_LOCKS |
---|
1766 | void InitBlockingMutexStats() |
---|
1767 | { |
---|
1768 | memset(BlockingMutexStatsTable, 0, sizeof(BlockingMutexStatsTable)); |
---|
1769 | } |
---|
1770 | #endif |
---|
1771 | |
---|
1772 | /* Initialize a cxiBlockingMutex_t. Instead of the DBGASSERT, this routine |
---|
1773 | should kmalloc a struct semaphore if bmSem is too small. */ |
---|
1774 | void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx) |
---|
1775 | { |
---|
1776 | ENTER(0); |
---|
1777 | DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE); |
---|
1778 | #ifdef INSTRUMENT_LOCKS |
---|
1779 | DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES); |
---|
1780 | #endif /* INSTRUMENT_LOCKS */ |
---|
1781 | |
---|
1782 | TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT, |
---|
1783 | "cxiBlockingMutexInit: mP 0x%lX idx %d\n", |
---|
1784 | mP, bmNameIdx); |
---|
1785 | init_MUTEX((struct semaphore *)mP->bmSem); |
---|
1786 | mP->bmOwnerP = NULL; |
---|
1787 | mP->lockNameIndex = bmNameIdx; |
---|
1788 | EXIT(0); |
---|
1789 | } |
---|
1790 | |
---|
1791 | |
---|
1792 | /* Enter critical section, blocking this thread if necessary. Mark this |
---|
1793 | thread as the owner of the mutex before returning. */ |
---|
1794 | void |
---|
1795 | REGPARMS cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP) |
---|
1796 | { |
---|
1797 | ENTER(1); |
---|
1798 | TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ, |
---|
1799 | "cxiBlockingMutexAcquire: about to acquire 0x%lX type %d " |
---|
1800 | "current 0x%lX currentOwner 0x%lX\n", |
---|
1801 | mP, mP->lockNameIndex, current, mP->bmOwnerP); |
---|
1802 | |
---|
1803 | DBGASSERTRC(mP->bmOwnerP != (char *)current, |
---|
1804 | PTR_TO_INT32(mP->bmOwnerP), PTR_TO_INT32(mP), 0); |
---|
1805 | |
---|
1806 | #ifdef INSTRUMENT_LOCKS |
---|
1807 | BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires += 1; |
---|
1808 | if (mP->bmOwnerP != NULL) |
---|
1809 | BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts += 1; |
---|
1810 | #endif |
---|
1811 | |
---|
1812 | down((struct semaphore *)mP->bmSem); |
---|
1813 | mP->bmOwnerP = (char *)current; |
---|
1814 | |
---|
1815 | TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT, |
---|
1816 | "cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP); |
---|
1817 | EXIT(1); |
---|
1818 | } |
---|
1819 | |
---|
1820 | |
---|
1821 | /* Leave critical section and awaken waiting threads */ |
---|
1822 | void |
---|
1823 | REGPARMS cxiBlockingMutexRelease(cxiBlockingMutex_t* mP) |
---|
1824 | { |
---|
1825 | ENTER(1); |
---|
1826 | TRACE4(TRACE_KLOCKL, 9, TRCID_BM_REL, |
---|
1827 | "cxiBlockingMutexRelease: about to release 0x%lX type %d " |
---|
1828 | "current 0x%lX currentOwner 0x%lX\n", |
---|
1829 | mP, mP->lockNameIndex,current, mP->bmOwnerP); |
---|
1830 | |
---|
1831 | if (mP->bmOwnerP == (char *)current) |
---|
1832 | { |
---|
1833 | mP->bmOwnerP = NULL; |
---|
1834 | up((struct semaphore *)mP->bmSem); |
---|
1835 | } |
---|
1836 | EXIT(1); |
---|
1837 | } |
---|
1838 | |
---|
1839 | /* Free resources associated with this cxiBlockingMutex_t in preparation |
---|
1840 | for freeing the storage it occupies */ |
---|
1841 | void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP) |
---|
1842 | { |
---|
1843 | ENTER(0); |
---|
1844 | TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM, |
---|
1845 | "cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex); |
---|
1846 | |
---|
1847 | /* Verify that mutex is not held */ |
---|
1848 | DBGASSERT(mP->bmOwnerP == NULL); |
---|
1849 | DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1); |
---|
1850 | EXIT(0); |
---|
1851 | } |
---|
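/* Illustrative sketch (not part of the original module): the expected life
 * cycle of a cxiBlockingMutex_t using the routines above.  Init once before
 * first use, Acquire/Release around each critical section, Term before the
 * storage is freed.  The example function and its caller are hypothetical. */
#if 0   /* usage illustration only -- never compiled */
static void exampleCriticalSection(cxiBlockingMutex_t *mP)
{
  cxiBlockingMutexAcquire(mP);      /* enter critical section, may block */
  /* ... manipulate the shared state protected by *mP ... */
  cxiBlockingMutexRelease(mP);      /* leave critical section, wake waiters */
}
#endif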
1852 | |
---|
1853 | |
---|
1854 | /* Return true if a cxiBlockingMutex_t is held by the calling process */ |
---|
1855 | Boolean |
---|
1856 | cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP) |
---|
1857 | { |
---|
1858 | Boolean result; |
---|
1859 | char* ownerP; |
---|
1860 | cxiPid_t ownerPid; |
---|
1861 | |
---|
1862 | /* Cache bmOwnerP in case it changes to NULL */ |
---|
1863 | ENTER(0); |
---|
1864 | ownerP = mP->bmOwnerP; |
---|
1865 | if (ownerP == NULL) |
---|
1866 | result = false; |
---|
1867 | else |
---|
1868 | { |
---|
1869 | cxiThreadPtrToThreadId(ownerP, &ownerPid); |
---|
1870 | result = (current->pid == ownerPid); |
---|
1871 | } |
---|
1872 | TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017, |
---|
1873 | "cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n", |
---|
1874 | ownerP, result); |
---|
1875 | EXIT_RC(0, result); |
---|
1876 | return result; |
---|
1877 | } |
---|
1878 | |
---|
1879 | |
---|
1880 | /* Return true if a cxiBlockingMutex_t has one or more processes waiting |
---|
1881 | on it */ |
---|
1882 | Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP) |
---|
1883 | { |
---|
1884 | struct semaphore * semP = (struct semaphore *)mP->bmSem; |
---|
1885 | Boolean result; |
---|
1886 | |
---|
1887 | ENTER(0); |
---|
1888 | if ((void*)semP->wait.task_list.next != (void*)&semP->wait.task_list.next) |
---|
1889 | result = true; |
---|
1890 | else |
---|
1891 | result = false; |
---|
1892 | TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018, |
---|
1893 | "cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n", |
---|
1894 | mP, result); |
---|
1895 | EXIT_RC(0, result); |
---|
1896 | return result; |
---|
1897 | } |
---|
1898 | |
---|
1899 | |
---|
1900 | /* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or |
---|
1901 | cxiWaitEventBroadcastRC. Drop the associated cxiBlockingMutex_t |
---|
1902 | *mutexP while waiting, and reacquire it before returning. |
---|
1903 | If INTERRUPTIBLE is set in waitFlags, waits interruptibly; |
---|
1904 | otherwise, waits uninterruptibly. |
---|
1905 | Returns THREAD_INTERRUPTED if interrupted before being woken up, |
---|
1906 | THREAD_AWAKENED, if woken up by cxiWaitEventSignal or |
---|
1907 | cxiWaitEventBroadcast, or the result value passed to |
---|
1908 | cxiWaitEventWakeupResult, if woken up by cxiWaitEventWakeupResult. */ |
---|
1909 | int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP, |
---|
1910 | int waitFlags) |
---|
1911 | { |
---|
1912 | spinlock_t *lockP = (spinlock_t *)(weP->lword); |
---|
1913 | unsigned long flags; |
---|
1914 | cxiWaitElement_t waitElement; |
---|
1915 | int count = 0; |
---|
1916 | Boolean done; |
---|
1917 | |
---|
1918 | ENTER(0); |
---|
1919 | TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER, |
---|
1920 | "cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release " |
---|
1921 | "mutex 0x%lX \n", weP, waitFlags, mutexP); |
---|
1922 | |
---|
1923 | /* Verify that caller is holding the mutex */ |
---|
1924 | DBGASSERTRC(mutexP->bmOwnerP == (char *)current, |
---|
1925 | PTR_TO_INT32(mutexP->bmOwnerP), PTR_TO_INT32(mutexP), 0); |
---|
1926 | |
---|
1927 | /* initialize our wait element */ |
---|
1928 | init_waitqueue_head(&waitElement.qhead); |
---|
1929 | init_waitqueue_entry(&waitElement.qwaiter, current); |
---|
1930 | __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter); |
---|
1931 | waitElement.wakeupRC = 0; |
---|
1932 | |
---|
1933 | /* update our task state to not running any more */ |
---|
1934 | if (waitFlags & INTERRUPTIBLE) |
---|
1935 | current->state = TASK_INTERRUPTIBLE; |
---|
1936 | else |
---|
1937 | current->state = TASK_UNINTERRUPTIBLE; |
---|
1938 | |
---|
1939 | /* add our wait element to the end of the wait list */ |
---|
1940 | SPIN_LOCK_IRQ(lockP, flags); |
---|
1941 | |
---|
1942 | CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList); |
---|
1943 | |
---|
1944 | SPIN_UNLOCK_IRQ(lockP, flags); |
---|
1945 | |
---|
1946 | /* Release the mutex. Note: calling cxiBlockingMutexRelease here is |
---|
1947 | problematic, because it makes trace calls, which may block the current |
---|
1948 | process, which would overwrite the task state (current->state) we just |
---|
1949 | updated. A way around this would be to move our task state update to |
---|
1950 | after the call to cxiBlockingMutexRelease, but then, before calling |
---|
1951 | schedule(), we would have to re-acquire the wait-list lock and check |
---|
1952 | wakeupRC to see whether somebody has already woken us up since we |
---|
1953 | released the mutex. Since there is a trace at the top of this routine, |
---|
1954 | we don't need the one in cxiBlockingMutexRelease; hence, just do the |
---|
1955 | release right here. */ |
---|
1956 | mutexP->bmOwnerP = NULL; |
---|
1957 | up((struct semaphore *)mutexP->bmSem); |
---|
1958 | |
---|
1959 | again: |
---|
1960 | /* call the scheduler */ |
---|
1961 | schedule(); |
---|
1962 | |
---|
1963 | /* Remove ourself from the wait list ... except: |
---|
1964 | Even though we may enter uninterruptible sleep, this sleep can in |
---|
1965 | fact be interrupted in at least two scenarios: |
---|
1966 | 1) page_alloc code may call wakeup_kswapd(). This should be |
---|
1967 | a very rare event with the current code, since we make an effort |
---|
1968 | to avoid blocking kswapd. |
---|
1969 | 2) While signals are supposed to be ignored during uninterruptible |
---|
1970 | sleep, it turns out that some signals, e.g. SIGSEGV and SIGBUS, |
---|
1971 | cause us to wake up. It doesn't look like the signal has been |
---|
1972 | delivered yet, but sleep is interrupted. The signal will be |
---|
1973 | delivered later (probably when exiting kernel). |
---|
1974 | Our callers can't handle unexpected return from uninterruptible |
---|
1975 | sleep. In either of the two cases above, it should be safe to go |
---|
1976 | back to sleep and wait to be woken up properly. |
---|
1977 | */ |
---|
1978 | SPIN_LOCK_IRQ(lockP, flags); |
---|
1979 | |
---|
1980 | if (waitElement.wakeupRC == 0 && |
---|
1981 | !(waitFlags & INTERRUPTIBLE)) |
---|
1982 | { |
---|
1983 | TRACE3N(TRACE_KLOCKL, 1, TRCID_CXISYSTEM_EVENT_WAIT_INTERRUPTED, |
---|
1984 | "cxiWaitEventWait: interrupted weP 0x%lX mutexP 0x%lX rc %d\n", |
---|
1985 | weP, mutexP, waitElement.wakeupRC); |
---|
1986 | current->state = TASK_UNINTERRUPTIBLE; |
---|
1987 | done = false; |
---|
1988 | } |
---|
1989 | else |
---|
1990 | { |
---|
1991 | CXI_WAIT_LIST_REMOVE(&waitElement.waitList); |
---|
1992 | done = true; |
---|
1993 | } |
---|
1994 | |
---|
1995 | SPIN_UNLOCK_IRQ(lockP, flags); |
---|
1996 | |
---|
1997 | if (!done) |
---|
1998 | goto again; |
---|
1999 | |
---|
2000 | /* re-acquire the mutex */ |
---|
2001 | cxiBlockingMutexAcquire(mutexP); |
---|
2002 | |
---|
2003 | TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT, |
---|
2004 | "cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n", |
---|
2005 | weP, mutexP, waitElement.wakeupRC); |
---|
2006 | |
---|
2007 | /* A zero wakeup code means we were interrupted rather than woken up */ |
---|
2008 | EXIT(0); |
---|
2009 | if (waitElement.wakeupRC != 0) |
---|
2010 | return waitElement.wakeupRC; |
---|
2011 | else |
---|
2012 | return THREAD_INTERRUPTED; |
---|
2013 | } |
---|
2014 | |
---|
2015 | /* Wake up one thread waiting on this cxiWaitEvent_t. Must not sleep */ |
---|
2016 | void |
---|
2017 | cxiWaitEventSignal(cxiWaitEvent_t* weP) |
---|
2018 | { |
---|
2019 | /* ENTER(0); */ |
---|
2020 | TRACE1N(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL, |
---|
2021 | "cxiWaitEventSignal: weP 0x%lX\n", weP); |
---|
2022 | |
---|
2023 | doWakeup(weP, wSignal, THREAD_AWAKENED); /* wake up one */ |
---|
2024 | /* EXIT(0); */ |
---|
2025 | } |
---|
2026 | |
---|
2027 | |
---|
2028 | /* Wake up one thread waiting on this cxiWaitEvent_t. This is the same as |
---|
2029 | cxiWaitEventSignal(), except this routine guarantees that multiple wake |
---|
2030 | up calls will each pick a different thread if more than one is waiting. */ |
---|
2031 | void |
---|
2032 | cxiWaitEventWakeupOne(cxiWaitEvent_t* weP) |
---|
2033 | { |
---|
2034 | ENTER(0); |
---|
2035 | TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE, |
---|
2036 | "cxiWaitEventWakeupOne: weP 0x%lX\n", weP); |
---|
2037 | |
---|
2038 | doWakeup(weP, wWakeOne, THREAD_AWAKENED); /* wake up one */ |
---|
2039 | EXIT(0); |
---|
2040 | } |
---|
2041 | |
---|
2042 | |
---|
2043 | /* Wake up all threads waiting on this cxiWaitEvent_t */ |
---|
2044 | void |
---|
2045 | cxiWaitEventBroadcast(cxiWaitEvent_t* weP) |
---|
2046 | { |
---|
2047 | ENTER(0); |
---|
2048 | TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST, |
---|
2049 | "cxiWaitEventBroadcast: weP 0x%lX\n", weP); |
---|
2050 | |
---|
2051 | doWakeup(weP, wBroadcast, THREAD_AWAKENED); /* wake up all */ |
---|
2052 | EXIT(0); |
---|
2053 | } |
---|
2054 | |
---|
2055 | |
---|
2056 | /* Wake up all threads waiting on this cxiWaitEvent_t and cause them to |
---|
2057 | return rc from their cxiWaitEventWait calls. */ |
---|
2058 | void |
---|
2059 | cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc) |
---|
2060 | { |
---|
2061 | ENTER(0); |
---|
2062 | TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC, |
---|
2063 | "cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc); |
---|
2064 | |
---|
2065 | doWakeup(weP, wBroadcast, rc); /* wake up all */ |
---|
2066 | EXIT_RC(0, rc); |
---|
2067 | } |
---|
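/* Illustrative sketch (not part of the original module): typical use of
 * cxiWaitEventWait() with the wakeup routines above.  The waiter holds the
 * associated mutex, re-tests its predicate in a loop (a wakeup or an
 * interruption does not by itself guarantee the predicate is true), and the
 * waker updates the predicate and signals while holding the same mutex.
 * The names readyFlag, stateMutex and stateEvent are hypothetical. */
#if 0   /* usage illustration only -- never compiled */
  /* Waiting side */
  cxiBlockingMutexAcquire(&stateMutex);
  while (!readyFlag)
    (void)cxiWaitEventWait(&stateEvent, &stateMutex, INTERRUPTIBLE);
  cxiBlockingMutexRelease(&stateMutex);

  /* Waking side */
  cxiBlockingMutexAcquire(&stateMutex);
  readyFlag = true;
  cxiWaitEventSignal(&stateEvent);   /* or cxiWaitEventBroadcast(&stateEvent) */
  cxiBlockingMutexRelease(&stateMutex);
#endif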
2068 | |
---|
2069 | /* alloc big memory area */ |
---|
2070 | void * |
---|
2071 | cxiBigMalloc(int size) |
---|
2072 | { |
---|
2073 | void *ptr; |
---|
2074 | |
---|
2075 | ENTER(0); |
---|
2076 | ptr = vmalloc(size); |
---|
2077 | |
---|
2078 | #ifdef MALLOC_DEBUG |
---|
2079 | MallocDebugNew(ptr, size, 2); |
---|
2080 | #endif |
---|
2081 | |
---|
2082 | EXIT(0); |
---|
2083 | return ptr; |
---|
2084 | } |
---|
2085 | |
---|
2086 | /* free big memory area */ |
---|
2087 | void |
---|
2088 | cxiBigFree(char *ptr) |
---|
2089 | { |
---|
2090 | ENTER(0); |
---|
2091 | #ifdef MALLOC_DEBUG |
---|
2092 | MallocDebugDelete(ptr); |
---|
2093 | #endif |
---|
2094 | |
---|
2095 | EXIT(0); |
---|
2096 | vfree(ptr); |
---|
2097 | } |
---|
2098 | |
---|
2099 | #ifdef SMB_LOCKS |
---|
2100 | /* Determine if current process has this file open */ |
---|
2101 | void * |
---|
2102 | cxiCheckOpen(struct cxiNode_t* cnP) |
---|
2103 | { |
---|
2104 | int count; |
---|
2105 | int i; |
---|
2106 | struct file** fdList; |
---|
2107 | struct file* fileP; |
---|
2108 | struct inode* inodeP; |
---|
2109 | |
---|
2110 | ENTER(0); |
---|
2111 | #if LINUX_KERNEL_VERSION >= 2061300 |
---|
2112 | count = current->files->fdt->max_fds; |
---|
2113 | fdList = current->files->fdt->fd; |
---|
2114 | #else |
---|
2115 | count = current->files->max_fds; |
---|
2116 | fdList = current->files->fd; |
---|
2117 | #endif |
---|
2118 | inodeP = GNP_TO_VP(cnP); |
---|
2119 | |
---|
2120 | TRACE3(TRACE_VNODE,9,TRCID_CXICHECKOPEN_ENTRY, |
---|
2121 | "cxiCheckOpen: entry. %d files in fd list. Checking for inode %d " |
---|
2122 | "at 0x%x", count, inodeP->i_ino, inodeP); |
---|
2123 | |
---|
2124 | for (i=0; i<count; i++) |
---|
2125 | { |
---|
2126 | fileP = fdList[i]; |
---|
2127 | |
---|
2128 | if (fileP) |
---|
2129 | { |
---|
2130 | if (fdList[i]->f_dentry->d_inode == inodeP) |
---|
2131 | { |
---|
2132 | TRACE1(TRACE_VNODE, 9,TRCID_CXICHECKOPEN_FOUND, |
---|
2133 | "cxiCheckOpen: found open file. vinfoP 0x%x", |
---|
2134 | fileP->private_data); |
---|
2135 | EXIT(0); |
---|
2136 | return fileP->private_data; |
---|
2137 | } |
---|
2138 | } |
---|
2139 | } |
---|
2140 | |
---|
2141 | EXIT(0); |
---|
2142 | return NULL; |
---|
2143 | } |
---|
2144 | |
---|
2145 | int cxiBreakOplock(void *breakArgP, int oplockNew) |
---|
2146 | { |
---|
2147 | /* On Linux, we use its kernel oplock support. The get_lease() |
---|
2148 | * call is the operation to revoke conflicting leases. |
---|
2149 | */ |
---|
2150 | int rc; |
---|
2151 | ENTER(0); |
---|
2152 | |
---|
2153 | /* O_NONBLOCK: prevents the thread from waiting for the lease return. |
---|
2154 | * In the case of a Samba thread, we only want to get EWOULDBLOCK |
---|
2155 | * back if the conflict is held within Samba itself. If a wait is |
---|
2156 | * needed, breakSMBOplock will invoke cxiWaitForBreak. |
---|
2157 | */ |
---|
2158 | |
---|
2159 | /* Linux op to revoke conflicting leases */ |
---|
2160 | rc = abs(REVOKE_LEASE((struct inode *)breakArgP, |
---|
2161 | (cxiIsSambaThread()? 0: O_NONBLOCK) | |
---|
2162 | ((oplockNew==smbOplockShared)? FMODE_READ: FMODE_WRITE))); |
---|
2163 | |
---|
2164 | TRACE3(TRACE_VNODE, 4,TRCID_CXIBREAKOPLOCK, |
---|
2165 | "cxiBreakOplock: exit rc %d inode 0x%lX oplock %d\n", |
---|
2166 | rc, breakArgP, oplockNew); |
---|
2167 | |
---|
2168 | EXIT(0); |
---|
2169 | return rc; |
---|
2170 | } |
---|
2171 | |
---|
2172 | DECLARE_WAIT_QUEUE_HEAD(oplock_break_queue); |
---|
2173 | |
---|
2174 | /* No initialization required on Linux */ |
---|
2175 | int cxiInitBreakQ() { return 0; } |
---|
2176 | |
---|
2177 | /* No initialization required on Linux */ |
---|
2178 | int cxiTermBreakQ() { return 0; } |
---|
2179 | |
---|
2180 | /* Send the notification that the oplock break completed */ |
---|
2181 | int cxiSendBreakMsg(void *ofP) |
---|
2182 | { |
---|
2183 | ENTER(0); |
---|
2184 | /* There is only one oplock_break_queue, and no means to pass the ofP back to |
---|
2185 | * the waiters. This will wake all of them up and they will recheck their |
---|
2186 | * oplock states and wait again if necessary (with a timeout). |
---|
2187 | */ |
---|
2188 | wake_up_interruptible(&oplock_break_queue); |
---|
2189 | |
---|
2190 | TRACE1(TRACE_SMB, 3, TRCID_SEND_BREAK, "cxiSendBreakMsg: ofP 0x%lX\n", ofP); |
---|
2191 | EXIT(0); |
---|
2192 | return 0; |
---|
2193 | } |
---|
2194 | |
---|
2195 | /* Suspend the caller until either the oplock break completes, or the timeout |
---|
2196 | * is reached. |
---|
2197 | */ |
---|
2198 | int cxiWaitForBreak(void *fileArgP, int oplockCurrent, int timeoutSeconds) |
---|
2199 | { |
---|
2200 | DECLARE_WAITQUEUE(wait, current); |
---|
2201 | signed long timeout; |
---|
2202 | |
---|
2203 | ENTER(0); |
---|
2204 | TRACE3(TRACE_SMB, 5, TRCID_BREAKWAIT, |
---|
2205 | "cxiWaitForBreak: file 0x%lX, oplockCurrent %d timeoutSeconds %d\n", |
---|
2206 | fileArgP, oplockCurrent, timeoutSeconds); |
---|
2207 | |
---|
2208 | add_wait_queue(&oplock_break_queue, &wait); |
---|
2209 | timeout = timeoutSeconds * HZ; |
---|
2210 | while (timeout > 0) { |
---|
2211 | set_current_state(TASK_INTERRUPTIBLE); |
---|
2212 | /* Check whether the oplock has been released or downgraded */ |
---|
2213 | if (gpfs_ops.SMBGetOplockState(fileArgP) < oplockCurrent) |
---|
2214 | break; |
---|
2215 | timeout = schedule_timeout(timeout); |
---|
2216 | } |
---|
2217 | set_current_state(TASK_RUNNING); |
---|
2218 | remove_wait_queue(&oplock_break_queue, &wait); |
---|
2219 | |
---|
2220 | TRACE0(TRACE_SMB, 5, TRCID_BREAKWAIT_EXIT, |
---|
2221 | "cxiWaitForBreak exit\n"); |
---|
2222 | |
---|
2223 | EXIT(0); |
---|
2224 | return 0; |
---|
2225 | } |
---|
2226 | #endif |
---|
2227 | |
---|
2228 | |
---|
2229 | /* Get the address of the first byte not addressible by processes */ |
---|
2230 | UIntPtr cxiGetKernelBoundary() |
---|
2231 | { |
---|
2232 | return GPFS_KERNEL_OFFSET; |
---|
2233 | } |
---|
2234 | |
---|
2235 | |
---|
2236 | /* Return true if this process holds the big kernel lock (BKL) */ |
---|
2237 | Boolean cxiHoldsBKL() |
---|
2238 | { |
---|
2239 | return current->lock_depth >= 0; |
---|
2240 | } |
---|
2241 | |
---|
2242 | |
---|
2243 | /* Tell the OS that this thread is involved in handling VM page-out |
---|
2244 | requests and should not be blocked waiting for page allocation. |
---|
2245 | Return true if successful. */ |
---|
2246 | Boolean cxiSetPageoutThread() |
---|
2247 | { |
---|
2248 | if (current->flags & PF_MEMALLOC) |
---|
2249 | return false; |
---|
2250 | current->flags |= PF_MEMALLOC; |
---|
2251 | return true; |
---|
2252 | } |
---|
2253 | |
---|
2254 | |
---|
2255 | /* Tell the OS that this thread is no longer involved in handling VM |
---|
2256 | page-out requests. */ |
---|
2257 | void cxiClearPageoutThread() |
---|
2258 | { |
---|
2259 | current->flags &= ~PF_MEMALLOC; |
---|
2260 | } |
---|
2261 | |
---|
2262 | |
---|
2263 | /* Yield the CPU to allow other processes to run */ |
---|
2264 | void |
---|
2265 | cxiYield() |
---|
2266 | { |
---|
2267 | ENTER(0); |
---|
2268 | schedule(); |
---|
2269 | EXIT(0); |
---|
2270 | } |
---|
2271 | |
---|
2272 | /* Linux filldir has changed signatures depending on kernel level. |
---|
2273 | * We always pass a 64bit offset from the GPFS layer. |
---|
2274 | */ |
---|
2275 | int |
---|
2276 | cxiFillDir(void *vargP, const char *nameP, int namelen, |
---|
2277 | offset_t offset, ino_t ino) |
---|
2278 | { |
---|
2279 | int result; |
---|
2280 | cxiFillDirArg_t *fillDirArgP = (cxiFillDirArg_t *)vargP; |
---|
2281 | filldir_t fnP = (filldir_t)fillDirArgP->fnP; |
---|
2282 | ENTER(0); |
---|
2283 | |
---|
2284 | result = (*fnP)(fillDirArgP->argP, nameP, namelen, |
---|
2285 | (loff_t)offset, ino, 0 /* DT_UNKNOWN */); |
---|
2286 | EXIT_RC(0, result); |
---|
2287 | return result; |
---|
2288 | } |
---|
2289 | |
---|
2290 | #ifdef DISK_LEASE_DMS |
---|
2291 | |
---|
2292 | static struct timer_list DMSTimer[MAX_DMS_INDEX]; |
---|
2293 | static int (*DMSgetNIOsInProgressP)(int); |
---|
2294 | |
---|
2295 | #define PANIC_FOR_REAL 1 |
---|
2296 | |
---|
2297 | static void cxiDMSExpired(unsigned long data) |
---|
2298 | { |
---|
2299 | int idx = data; |
---|
2300 | int nIOs = DMSgetNIOsInProgressP(idx); |
---|
2301 | /* ENTER(0); */ |
---|
2302 | /* This code is executed on the interrupt level -- can't use tracing */ |
---|
2303 | printk("GPFS Deadman Switch timer [%d] has expired; IOs in progress: %d\n", |
---|
2304 | idx, nIOs); |
---|
2305 | #ifdef PANIC_FOR_REAL |
---|
2306 | if (nIOs != 0) |
---|
2307 | panic("GPFS Deadman Switch timer has expired, and there are still" |
---|
2308 | " %d outstanding I/O requests\n", nIOs); |
---|
2309 | #endif |
---|
2310 | } |
---|
2311 | |
---|
2312 | /* |
---|
2313 | Start dead man switch, with the timeout specified by the delay |
---|
2314 | argument (in seconds). |
---|
2315 | */ |
---|
2316 | void cxiStartDMS(int idx, int delay, int (*funcP)(int)) |
---|
2317 | { |
---|
2318 | unsigned long njiffies = delay * HZ; |
---|
2319 | |
---|
2320 | /* Only allow the daemon or other root users to make this kernel call */ |
---|
2321 | if (!cxiIsSuperUser()) |
---|
2322 | return; |
---|
2323 | ENTER(0); |
---|
2324 | |
---|
2325 | /* There can be only one timer active at any given moment */ |
---|
2326 | if (timer_pending(&DMSTimer[idx])) |
---|
2327 | del_timer(&DMSTimer[idx]); |
---|
2328 | |
---|
2329 | init_timer(&DMSTimer[idx]); |
---|
2330 | DMSTimer[idx].expires = jiffies + njiffies; |
---|
2331 | DMSTimer[idx].function = cxiDMSExpired; |
---|
2332 | DMSTimer[idx].data = idx; |
---|
2333 | /* save the pointer to nIOsInProgress to a static var */ |
---|
2334 | DMSgetNIOsInProgressP = funcP; |
---|
2335 | add_timer(&DMSTimer[idx]); |
---|
2336 | TRACE3(TRACE_DLEASE, 2, TRCID_DMS_STARTED, |
---|
2337 | "DMS timer [%d] started, delay %d, time %d\n", |
---|
2338 | idx, delay, jiffies/HZ); |
---|
2339 | EXIT(0); |
---|
2340 | } |
---|
2341 | |
---|
2342 | void cxiStopDMS(int idx) |
---|
2343 | { |
---|
2344 | /* Only allow the daemon or other root users to make this kernel call */ |
---|
2345 | if (!cxiIsSuperUser()) |
---|
2346 | return; |
---|
2347 | ENTER(0); |
---|
2348 | |
---|
2349 | if (timer_pending(&DMSTimer[idx])) |
---|
2350 | del_timer(&DMSTimer[idx]); |
---|
2351 | TRACE2(TRACE_DLEASE, 2, TRCID_DMS_STOPPED, |
---|
2352 | "DMS timer [%d] stopped, time %d\n", idx, jiffies/HZ); |
---|
2353 | EXIT(0); |
---|
2354 | } |
---|
2355 | |
---|
2356 | /* dummy init routine. Since on Linux the timer is |
---|
2357 | stored in a static memory, there's nothing to be done |
---|
2358 | */ |
---|
2359 | int cxiInitDMS(void) |
---|
2360 | { |
---|
2361 | return 0; |
---|
2362 | } |
---|
2363 | |
---|
2364 | void cxiShutdownDMS(void) |
---|
2365 | { |
---|
2366 | int i; |
---|
2367 | |
---|
2368 | ENTER(0); |
---|
2369 | for (i = 0; i < MAX_DMS_INDEX; i++) |
---|
2370 | cxiStopDMS(i); |
---|
2371 | EXIT(0); |
---|
2372 | } |
---|
2373 | |
---|
2374 | #endif /* DISK_LEASE_DMS */ |
---|
2375 | |
---|
2376 | void cxiSetBit(unsigned long *flagP, int flag_bit) |
---|
2377 | { |
---|
2378 | set_bit(flag_bit,flagP); |
---|
2379 | } |
---|
2380 | void cxiClearBit(unsigned long *flagP, int flag_bit) |
---|
2381 | { |
---|
2382 | clear_bit(flag_bit,flagP); |
---|
2383 | } |
---|
2384 | Boolean cxiTestBit(unsigned long *flagP, int flag_bit) |
---|
2385 | { |
---|
2386 | return test_bit(flag_bit,flagP); |
---|
2387 | } |
---|
2388 | |
---|
2389 | /* In order to setup our termination callback routine (gpfs_f_cleanup) |
---|
2390 | * we create a dummy file and add it to our file table. Then, upon |
---|
2391 | * process termination, the release file operation will be called in |
---|
2392 | * order to close the file. The only operation we define for this |
---|
2393 | * dummy file is release (gpfs_f_cleanup). |
---|
2394 | */ |
---|
2395 | int |
---|
2396 | cxiRegisterCleanup() |
---|
2397 | { |
---|
2398 | int code = 0, rc = 0; |
---|
2399 | struct inode *iP = NULL; |
---|
2400 | struct file *fileP = NULL; |
---|
2401 | struct dentry *dentryP = NULL; |
---|
2402 | extern int cleanupFD; |
---|
2403 | extern struct super_block *shutdownSuperP; |
---|
2404 | |
---|
2405 | /* We record the daemon's process group because certain |
---|
2406 | * checks on cxiCopyIn/cxiCopyOut are bypassed for the daemon. |
---|
2407 | */ |
---|
2408 | ENTER(0); |
---|
2409 | DaemonPGrp = PROCESS_GROUP(current); |
---|
2410 | |
---|
2411 | /* Make sure we only create one file */ |
---|
2412 | if (cleanupFD) |
---|
2413 | { |
---|
2414 | EXIT_RC(0, EEXIST); |
---|
2415 | return EEXIST; |
---|
2416 | } |
---|
2417 | |
---|
2418 | DBGASSERT(shutdownSuperP != NULL); |
---|
2419 | |
---|
2420 | /* Allocate an inode struct */ |
---|
2421 | iP = NEW_INODE(shutdownSuperP); |
---|
2422 | if (!iP) |
---|
2423 | { |
---|
2424 | code = 1; |
---|
2425 | rc = ENOMEM; |
---|
2426 | goto xerror; |
---|
2427 | } |
---|
2428 | iP->i_mode = S_IFREG; |
---|
2429 | |
---|
2430 | /* Allocate an available file descriptor */ |
---|
2431 | cleanupFD = get_unused_fd(); |
---|
2432 | if (cleanupFD < 0) |
---|
2433 | { |
---|
2434 | code = 2; |
---|
2435 | rc = ENFILE; |
---|
2436 | goto xerror; |
---|
2437 | } |
---|
2438 | |
---|
2439 | /* Allocate a file struct */ |
---|
2440 | fileP = get_empty_filp(); |
---|
2441 | if (!fileP) |
---|
2442 | { |
---|
2443 | code = 3; |
---|
2444 | rc = ENFILE; |
---|
2445 | goto xerror; |
---|
2446 | } |
---|
2447 | |
---|
2448 | /* Allocate a dentry struct */ |
---|
2449 | dentryP = dget(d_alloc_root(iP)); |
---|
2450 | if (!dentryP) |
---|
2451 | { |
---|
2452 | code = 4; |
---|
2453 | rc = ENOMEM; |
---|
2454 | goto xerror; |
---|
2455 | } |
---|
2456 | |
---|
2457 | /* Initialize and chain our file structure */ |
---|
2458 | fileP->f_dentry = dentryP; |
---|
2459 | fileP->f_op = &gpfs_cleanup_fops; |
---|
2460 | fileP->f_flags = O_RDONLY; |
---|
2461 | atomic_set(&fileP->f_count, 1); |
---|
2462 | |
---|
2463 | /* Just chain it on the current root mount. When |
---|
2464 | * the file is closed its fput() will decrement |
---|
2465 | * the mount count (hence the mntget here) |
---|
2466 | */ |
---|
2467 | fileP->f_vfsmnt = mntget(current->fs->rootmnt); |
---|
2468 | |
---|
2469 | /* Install the descriptor so it gets "closed" upon our termination */ |
---|
2470 | fd_install(cleanupFD, fileP); |
---|
2471 | |
---|
2472 | /* Set FD_CLOEXEC so that forked processes (like mmfsup.scr) do not |
---|
2474 | * inherit this descriptor. We want the cleanup routine to be run |
---|
2474 | * when the last mmfsd process terminates. |
---|
2475 | */ |
---|
2476 | #if LINUX_KERNEL_VERSION >= 2061300 |
---|
2477 | FD_SET(cleanupFD, current->files->fdt->close_on_exec); |
---|
2478 | #else |
---|
2479 | FD_SET(cleanupFD, current->files->close_on_exec); |
---|
2480 | #endif |
---|
2481 | /* Once the descriptor for this dummy file is added to our file table, |
---|
2483 | * it is inherited by all the processes of the daemon. As each |
---|
2483 | * terminates, the files->count is decremented and on the last process |
---|
2484 | * termination all the descriptors will be closed by filp_close. |
---|
2485 | * |
---|
2486 | * The one catch here is that our file table is inherrited by the |
---|
2487 | * kernel threads we start as well as user processes. This would |
---|
2488 | * cause a problem in that daemon termination does not include these |
---|
2489 | * kernel threads which aren't killed until restart (and therefore |
---|
2490 | * the file is never closed). In order for our operation to be |
---|
2491 | * driven at daemon termination, we must remove the file table from |
---|
2492 | * these kernel threads. This is done via cxiReparent() by |
---|
2493 | * the mmap pager kproc. |
---|
2494 | */ |
---|
2495 | |
---|
2496 | xerror: |
---|
2497 | TRACE4(TRACE_VNODE, 1, TRCID_CXIREGISTERCLEANUP_EXIT, |
---|
2498 | "cxiRegisterCleanup: fd %d iP %X rc %d code %d\n", |
---|
2499 | cleanupFD, iP, rc, code); |
---|
2500 | |
---|
2501 | if (rc) |
---|
2502 | { |
---|
2503 | if (dentryP) |
---|
2504 | dput(dentryP); |
---|
2505 | |
---|
2506 | if (cleanupFD) |
---|
2507 | put_unused_fd(cleanupFD); |
---|
2508 | |
---|
2509 | if (fileP) |
---|
2510 | #if LINUX_KERNEL_VERSION > 2060900 |
---|
2511 | fput(fileP); |
---|
2512 | #else |
---|
2513 | put_filp(fileP); |
---|
2514 | #endif |
---|
2515 | |
---|
2516 | if (iP) |
---|
2517 | iput(iP); |
---|
2518 | |
---|
2519 | cleanupFD = 0; |
---|
2520 | } |
---|
2521 | |
---|
2522 | EXIT_RC(0, rc); |
---|
2523 | return rc; |
---|
2524 | } |
---|
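/* Illustrative sketch (not part of the original module): the comments above
 * say the only operation defined for the dummy cleanup file is release
 * (gpfs_f_cleanup), so the gpfs_cleanup_fops table referenced above would
 * have roughly this shape.  The real definition lives elsewhere in the
 * module; this is an assumption shown for orientation only. */
#if 0   /* illustration only -- the real table is defined elsewhere */
struct file_operations gpfs_cleanup_fops =
{
  .release = gpfs_f_cleanup   /* runs when the last daemon process closes it */
};
#endif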
2525 | |
---|
2526 | #ifdef NFS4_ACL |
---|
2527 | /* Linux routines to be called when processing NFSv4 audit/alarm ACL entries */ |
---|
2528 | int cxiAuditWrite(int numargs, ...) { return ENOSYS; } |
---|
2529 | #endif /* NFS4_ACL */ |
---|
2530 | |
---|
2531 | /* Currently no OS specific VFS initialization for Linux */ |
---|
2532 | int |
---|
2533 | cxiInitVFS(int vfsType) |
---|
2534 | { |
---|
2535 | return 0; |
---|
2536 | } |
---|
2537 | |
---|
2538 | UIntPtr |
---|
2539 | cxiGetKernelStackSize() |
---|
2540 | { |
---|
2541 | return (UIntPtr)THREAD_SIZE; |
---|
2542 | } |
---|
2543 | |
---|
2544 | #if defined(DMAPI) || defined(SANERGY) |
---|
2545 | |
---|
2546 | void cxiPathRel(void *ndP) |
---|
2547 | { |
---|
2548 | DBGASSERT( ndP != NULL); |
---|
2549 | path_release( (struct nameidata *) ndP); |
---|
2550 | cxiFreeUnpinned(ndP); |
---|
2551 | } |
---|
2552 | |
---|
2553 | int |
---|
2554 | cxiPathToVfsP(void **privVfsPP, char *kpathname, void **ndPP, void **cnPP, |
---|
2555 | Boolean traverseLink) |
---|
2556 | { |
---|
2557 | struct gpfsVfsData_t *privVfsP = NULL; |
---|
2558 | struct nameidata *ndP; |
---|
2559 | struct inode * iP; |
---|
2560 | cxiNode_t *cnP; |
---|
2561 | int rc = 0; |
---|
2562 | Boolean rel = false; |
---|
2563 | int code = 0; |
---|
2564 | *ndPP = NULL; |
---|
2565 | *privVfsPP = NULL; |
---|
2566 | |
---|
2567 | ENTER(0); |
---|
2568 | if (kpathname == NULL) |
---|
2569 | { |
---|
2570 | code = 1; |
---|
2571 | rc = EINVAL; |
---|
2572 | goto xerror; |
---|
2573 | } |
---|
2574 | |
---|
2575 | ndP = (struct nameidata *)cxiMallocUnpinned(sizeof(struct nameidata)); |
---|
2576 | if (ndP == NULL) |
---|
2577 | { |
---|
2578 | code = 2; |
---|
2579 | rc = ENOMEM; |
---|
2580 | goto xerror; |
---|
2581 | } |
---|
2582 | |
---|
2583 | /* For DMAPI, this is called by dm_path_to_handle or dm_path_to_fshandle, |
---|
2584 | * According to dmapi documentation, we should return the symbolic link |
---|
2585 | * itself instead of the object that link references. |
---|
2586 | * So here we need to use the function that does not traverse the link. */ |
---|
2587 | if (!traverseLink) |
---|
2588 | rc = user_path_walk_link(kpathname, ndP); |
---|
2589 | else |
---|
2590 | rc = user_path_walk(kpathname, ndP); |
---|
2591 | |
---|
2592 | if (rc) |
---|
2593 | { |
---|
2594 | rc = -rc; |
---|
2595 | code = 3; |
---|
2596 | goto xerror; |
---|
2597 | } |
---|
2598 | |
---|
2599 | rel = true; |
---|
2600 | iP = ndP->dentry->d_inode; |
---|
2601 | DBGASSERT(iP != NULL); |
---|
2602 | if (!GPFS_TYPE(iP)) |
---|
2603 | { |
---|
2604 | code = 4; |
---|
2605 | rc = EINVAL; |
---|
2606 | goto xerror; |
---|
2607 | } |
---|
2608 | |
---|
2609 | privVfsP = VP_TO_PVP(iP); |
---|
2610 | |
---|
2611 | if (privVfsP == NULL) |
---|
2612 | { |
---|
2613 | code = 5; |
---|
2614 | rc = ENOENT; goto xerror; |
---|
2615 | } |
---|
2616 | cnP = VP_TO_CNP(iP); |
---|
2617 | *privVfsPP = (void *)privVfsP; |
---|
2618 | *ndPP = (void *)ndP; |
---|
2619 | if (cnPP != NULL) |
---|
2620 | *cnPP = (void *)cnP; |
---|
2621 | |
---|
2622 | xerror: |
---|
2623 | if (rc && ndP) |
---|
2624 | { |
---|
2625 | if (rel) |
---|
2626 | cxiPathRel(ndP); |
---|
2627 | else |
---|
2628 | cxiFreeUnpinned(ndP); |
---|
2629 | } |
---|
2630 | EXIT_RC(0, rc); |
---|
2631 | return rc; |
---|
2632 | } |
---|
2633 | |
---|
2634 | void |
---|
2635 | cxiSetCred(void *eCredPP) |
---|
2636 | { |
---|
2637 | ext_cred_t *eCredP = (ext_cred_t *)eCredPP; |
---|
2638 | setCred(eCredP); |
---|
2639 | } |
---|
2640 | |
---|
2641 | #endif /* DMAPI or SANERGY */ |
---|
2642 | |
---|
2643 | |
---|
2644 | #ifdef KSTACK_CHECK |
---|
2645 | /* Kernel stack checking: for each active thread that is making |
---|
2646 | subroutine calls in the kernel, allocate a stack_history_t. Within |
---|
2647 | each stack_history_t, create a frame_desc_t for each level of |
---|
2648 | subroutine call. Two lists of frame_desc_t's are maintained: one for |
---|
2649 | the current call stack, and one for the deepest call stack seen so |
---|
2650 | far for this thread. Upon exit from the lowest-level routine, check |
---|
2651 | whether the maximum stack depth threshold has been exceeded. If it |
---|
2652 | has, print the traceback of the maximum stack usage. Keep hashes of |
---|
2653 | the tracebacks printed to avoid printing the same traceback more than |
---|
2654 | once. Since cxiTraceExit is not called for every routine exit, |
---|
2655 | maintenance of call chains is not exact; a routine entry with |
---|
2656 | stackUsed less than the current entry implies return of the previous |
---|
2657 | routine. |
---|
2658 | |
---|
2659 | Note that these routines cannot call any other routine that has |
---|
2660 | ENTER/EXIT macros inside of it, to avoid recursion. */ |
---|
2661 | |
---|
2662 | /* Maximum size of a stack frame before it is considered large enough |
---|
2663 | to complain about */ |
---|
2664 | #define STACK_LIMIT_WARNING (THREAD_SIZE - (THREAD_SIZE/3) ) |
---|
2665 | |
---|
2666 | /* Description of one level of a call stack */ |
---|
2667 | typedef struct frame_desc |
---|
2668 | { |
---|
2669 | /* Function name and file name containing the function */ |
---|
2670 | const char * fdFuncNameP; |
---|
2671 | const char * fdFileNameP; |
---|
2672 | |
---|
2673 | /* Pointer to frame_desc of caller, or NULL if this is the first |
---|
2674 | frame. Also used to link free frame descriptors together on the |
---|
2675 | shFreeHeadP free list. */ |
---|
2676 | struct frame_desc * fdCallerP; |
---|
2677 | |
---|
2678 | /* Line number near the beginning of fdFuncNameP */ |
---|
2679 | int fdLineNum; |
---|
2680 | |
---|
2681 | /* Total stack usage up to and including this routine */ |
---|
2682 | int fdStackUsed; |
---|
2683 | |
---|
2684 | /* Reference count for this frame_desc_t. Can be 2 if this descriptor |
---|
2685 | is reachable from both shCurrentP and shMaxP. */ |
---|
2686 | int fdRef; |
---|
2687 | } frame_desc_t; |
---|
2688 | |
---|
2689 | |
---|
2690 | /* Each stack_history is only used by one thread, so no locking is |
---|
2691 | needed within a stack_history. This is allocated as a single page. |
---|
2692 | */ |
---|
2693 | typedef struct stack_history |
---|
2694 | { |
---|
2695 | /* ID of thread to which this stack_history_t belongs */ |
---|
2696 | cxiThreadId shThreadId; |
---|
2697 | |
---|
2698 | /* Bucket index in historyHash that points to this stack_history_t, |
---|
2699 | or -1 if this stack_history_t is on an overflow list */ |
---|
2700 | int shBucketNum; |
---|
2701 | |
---|
2702 | /* Next stack_history_t in same hash overflow list or on free list */ |
---|
2703 | struct stack_history * shNextP; |
---|
2704 | |
---|
2705 | /* Pointer to the frame descriptor for the routine that most recently |
---|
2706 | called fdEnter without a matching fdExit. Following the fdCallerP |
---|
2707 | pointers through these frame descriptors gives the current callback |
---|
2708 | chain. */ |
---|
2709 | frame_desc_t * shCurrentP; |
---|
2710 | |
---|
2711 | /* Pointer to the frame descriptor that had the maximum stack usage |
---|
2712 | seen thus far for this thread. Following the fdCallerP pointers |
---|
2713 | through these frame descriptors gives the callback chain with |
---|
2714 | maximal stack usage. */ |
---|
2715 | frame_desc_t * shMaxP; |
---|
2716 | |
---|
2717 | /* Head of list of free frame_desc_t's */ |
---|
2718 | frame_desc_t * shFreeHeadP; |
---|
2719 | |
---|
2720 | /* Area that holds frame_desc_t's. These will be linked together and |
---|
2721 | put on the list shFreeHeadP. */ |
---|
2722 | #define SH_PREFIX_LEN (sizeof(cxiThreadId) + \ |
---|
2723 | sizeof(int) + \ |
---|
2724 | sizeof(struct stack_history *) + \ |
---|
2725 | 3*sizeof(frame_desc_t *)) |
---|
2726 | #define SH_NFRAMES ((PAGE_SIZE-SH_PREFIX_LEN)/sizeof(frame_desc_t)) |
---|
2727 | frame_desc_t shFrames[SH_NFRAMES]; |
---|
2728 | } stack_history_t; |
---|
2729 | |
---|
2730 | /* Global structures */ |
---|
2731 | struct |
---|
2732 | { |
---|
2733 | /* Global flag controlling whether kernel stack checking is enabled. |
---|
2734 | Initially false; set true during kernel module initialization, |
---|
2735 | then set false again during kernel module termination. */ |
---|
2736 | Boolean shActive; |
---|
2737 | |
---|
2738 | /* Mutex protecting updates to the variables that follow. This cannot |
---|
2739 | be a cxiBlockMutex_t because then the stack checking routines would |
---|
2740 | get called recursively. */ |
---|
2741 | struct semaphore shMutex; |
---|
2742 | |
---|
2743 | /* List of free stack_history_t's and count of how many free entries |
---|
2744 | there are. Excess stack_history_t's beyond a threshold are freed |
---|
2745 | back to the operating system. */ |
---|
2746 | stack_history_t * freeHeadP; |
---|
2747 | int nFree; |
---|
2748 | #define MAX_FREE_STACK_HISTORIES 16 |
---|
2749 | |
---|
2750 | /* Hash table of active stack_history_t's. To find the entry for a |
---|
2751 | particular thread, hash its thread id to a bucket. If any of the |
---|
2752 | entries in bucket[] match the desired thread id, the pointer to |
---|
2753 | the stack_history_t can be returned without acquiring any locks. If |
---|
2754 | the bucket does not contain the desired thread id, look for it on |
---|
2755 | the overflow list under protection of shMutex. */ |
---|
2756 | #define HISTORY_HASH_SIZE 64 |
---|
2757 | #define HISTS_PER_BUCKET 3 |
---|
2758 | struct |
---|
2759 | { |
---|
2760 | struct |
---|
2761 | { |
---|
2762 | stack_history_t * historyP; |
---|
2763 | cxiThreadId threadId; |
---|
2764 | } bucket[HISTS_PER_BUCKET]; |
---|
2765 | stack_history_t * overflowP; |
---|
2766 | } historyHash[HISTORY_HASH_SIZE]; |
---|
2767 | |
---|
2768 | /* List of hash values for tracebacks that have already been printed. |
---|
2769 | Used to avoid printing the same traceback more than once. Nothing |
---|
2770 | is ever deleted from this table, so to find an entry start |
---|
2771 | searching at its hash value and continue until the entry is found |
---|
2772 | or an empty slot is encountered. The total occupancy of the table |
---|
2773 | is limited to MAX_TRACEBACKS to restrict the amount of searching |
---|
2774 | that will be required, and to guarantee that searches will |
---|
2775 | terminate. */ |
---|
2776 | #define TB_HASH_SIZE 64 |
---|
2777 | #define MAX_TRACEBACKS 32 |
---|
2778 | unsigned int tracebackHash[TB_HASH_SIZE]; |
---|
2779 | int nTracebackHashEntries; |
---|
2780 | } SHG; |
---|
2781 | |
---|
2782 | |
---|
2783 | /* Private version of DBGASSERT used only within stack checking code. |
---|
2784 | Cannot use DBGASSERT without risking recursion. */ |
---|
2785 | #ifdef DBGASSERTS |
---|
2786 | #define SH_ASSERT(_ex) \ |
---|
2787 | if (!(_ex)) { \ |
---|
2788 | printk("GPFS stack checking assert failed: " # _ex " file %s line %d\n", \ |
---|
2789 | __FILE__, __LINE__); \ |
---|
2790 | DoPanic(# _ex, __FILE__, __LINE__, 0, 0, ""); \ |
---|
2791 | } else ((void)0) |
---|
2792 | #else |
---|
2793 | #define SH_ASSERT(_ex) ((void)0) |
---|
2794 | #endif |
---|
2795 | |
---|
2796 | |
---|
2797 | /* Initialize and enable stack depth checking */ |
---|
2798 | void shInit() |
---|
2799 | { |
---|
2800 | /* Clear stack checking globals */ |
---|
2801 | cxiMemset(&SHG, 0, sizeof(SHG)); |
---|
2802 | |
---|
2803 | /* Init mutex */ |
---|
2804 | init_MUTEX(&SHG.shMutex); |
---|
2805 | |
---|
2806 | /* Turn on stack depth checking and make sure the change is visible */ |
---|
2807 | SHG.shActive = true; |
---|
2808 | wmb(); |
---|
2809 | } |
---|
2810 | |
---|
2811 | |
---|
2812 | /* Turn off stack depth checking and free all allocated memory. This does |
---|
2813 | not have to return the global state to what it was when the module was |
---|
2814 | first loaded, since it will not be used again. */ |
---|
2815 | void shTerm() |
---|
2816 | { |
---|
2817 | int h; |
---|
2818 | int b; |
---|
2819 | stack_history_t * shP; |
---|
2820 | stack_history_t * shNextP; |
---|
2821 | |
---|
2822 | /* Turn off stack depth checking and make sure the change is visible */ |
---|
2823 | SHG.shActive = false; |
---|
2824 | wmb(); |
---|
2825 | |
---|
2826 | /* Get and then release mutex. This ensures that a thread that is |
---|
2827 | in the middle of writing a traceback finishes writing it before |
---|
2828 | we free the data structures it was using. */ |
---|
2829 | /* ?? although there could be another thread waiting for the mutex ... */ |
---|
2830 | down(&SHG.shMutex); |
---|
2831 | up(&SHG.shMutex); |
---|
2832 | |
---|
2833 | /* Wait briefly to allow threads in the middle of the stack checking |
---|
2834 | code to finish what they are doing */ |
---|
2835 | /* ?? Of course, this is not really safe, but this is debugging code, |
---|
2836 | right? */ |
---|
2837 | schedule_timeout(HZ/2); |
---|
2838 | |
---|
2839 | /* Terminate mutex */ |
---|
2840 | // nothing to do |
---|
2841 | |
---|
2842 | /* Free all stack_history_t's on the free list */ |
---|
2843 | shP = SHG.freeHeadP; |
---|
2844 | while (shP != NULL) |
---|
2845 | { |
---|
2846 | shNextP = shP->shNextP; |
---|
2847 | kfree(shP); |
---|
2848 | shP = shNextP; |
---|
2849 | } |
---|
2850 | |
---|
2851 | /* Free all stack_history_t's in the hash table */ |
---|
2852 | for (h=0 ; h<HISTORY_HASH_SIZE ; h++) |
---|
2853 | { |
---|
2854 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
2855 | if (SHG.historyHash[h].bucket[b].historyP != NULL) |
---|
2856 | kfree(SHG.historyHash[h].bucket[b].historyP); |
---|
2857 | shP = SHG.historyHash[h].overflowP; |
---|
2858 | while (shP != NULL) |
---|
2859 | { |
---|
2860 | shNextP = shP->shNextP; |
---|
2861 | kfree(shP); |
---|
2862 | shP = shNextP; |
---|
2863 | } |
---|
2864 | } |
---|
2865 | } |
---|
2866 | |
---|
2867 | |
---|
2868 | /* Allocate and initialize a new stack_history_t */ |
---|
2869 | static stack_history_t * shAllocInit() |
---|
2870 | { |
---|
2871 | stack_history_t * shP; |
---|
2872 | int f; |
---|
2873 | |
---|
2874 | up(&SHG.shMutex); |
---|
2875 | shP = (stack_history_t *) kmalloc(sizeof(stack_history_t), GFP_KERNEL); |
---|
2876 | SH_ASSERT(shP != NULL); |
---|
2877 | down(&SHG.shMutex); |
---|
2878 | cxiMemset(shP, 0, sizeof(stack_history_t)); |
---|
2879 | for (f=0 ; f<=SH_NFRAMES-2 ; f++) |
---|
2880 | shP->shFrames[f].fdCallerP = &shP->shFrames[f+1]; |
---|
2881 | shP->shFreeHeadP = &shP->shFrames[0]; |
---|
2882 | return shP; |
---|
2883 | } |
---|
2884 | |
---|
2885 | |
---|
2886 | /* Get a stack_history_t off the free list or build a new one */ |
---|
2887 | static stack_history_t * shGet() |
---|
2888 | { |
---|
2889 | stack_history_t * shP; |
---|
2890 | |
---|
2891 | /* Use free list if one is available there */ |
---|
2892 | shP = SHG.freeHeadP; |
---|
2893 | if (shP != NULL) |
---|
2894 | { |
---|
2895 | SHG.freeHeadP = shP->shNextP; |
---|
2896 | SHG.nFree -= 1; |
---|
2897 | return shP; |
---|
2898 | } |
---|
2899 | |
---|
2900 | /* Make a new one if necessary */ |
---|
2901 | return shAllocInit(); |
---|
2902 | } |
---|
2903 | |
---|
2904 | |
---|
2905 | /* Free a stack_history_t. Put it on the free list if there are not |
---|
2906 | already too many free, or else free it back to the operating system. |
---|
2907 | */ |
---|
2908 | static void shPut(stack_history_t * shP) |
---|
2909 | { |
---|
2910 | int h; |
---|
2911 | int b; |
---|
2912 | stack_history_t ** shPrevPP; |
---|
2913 | stack_history_t * p; |
---|
2914 | |
---|
2915 | /* Both call stacks should be empty */ |
---|
2916 | SH_ASSERT(shP->shCurrentP == NULL); |
---|
2917 | SH_ASSERT(shP->shMaxP == NULL); |
---|
2918 | |
---|
2919 | /* Must hold mutex while changing the hash table */ |
---|
2920 | down(&SHG.shMutex); |
---|
2921 | |
---|
2922 | /* Clear pointer to this stack_history_t from the hash table */ |
---|
2923 | h = ((int)shP->shThreadId) & (HISTORY_HASH_SIZE-1); |
---|
2924 | b = shP->shBucketNum; |
---|
2925 | if (b != -1) |
---|
2926 | { |
---|
2927 | SH_ASSERT(SHG.historyHash[h].bucket[b].historyP == shP); |
---|
2928 | SHG.historyHash[h].bucket[b].historyP = NULL; |
---|
2929 | SHG.historyHash[h].bucket[b].threadId = 0; |
---|
2930 | } |
---|
2931 | else |
---|
2932 | { |
---|
2933 | shPrevPP = &SHG.historyHash[h].overflowP; |
---|
2934 | p = *shPrevPP; |
---|
2935 | while (p != NULL) |
---|
2936 | { |
---|
2937 | if (p == shP) |
---|
2938 | { |
---|
2939 | *shPrevPP = shP->shNextP; |
---|
2940 | break; |
---|
2941 | } |
---|
2942 | shPrevPP = &p->shNextP; |
---|
2943 | p = *shPrevPP; |
---|
2944 | } |
---|
2945 | } |
---|
2946 | |
---|
2947 | /* If not too many already free, add to free list */ |
---|
2948 | if (SHG.nFree < MAX_FREE_STACK_HISTORIES) |
---|
2949 | { |
---|
2950 | shP->shNextP = SHG.freeHeadP; |
---|
2951 | SHG.freeHeadP = shP; |
---|
2952 | SHG.nFree += 1; |
---|
2953 | up(&SHG.shMutex); |
---|
2954 | return; |
---|
2955 | } |
---|
2956 | |
---|
2957 | /* Otherwise, really free it */ |
---|
2958 | up(&SHG.shMutex); |
---|
2959 | kfree(shP); |
---|
2960 | } |
---|
2961 | |
---|
2962 | |
---|
2963 | /* Find the stack_history_t for the current thread, or allocate one if |
---|
2964 | one does not already exist */ |
---|
2965 | static stack_history_t * shFind() |
---|
2966 | { |
---|
2967 | stack_history_t * shP; |
---|
2968 | cxiThreadId id = current->pid; |
---|
2969 | int h = ((int)id) & (HISTORY_HASH_SIZE-1); |
---|
2970 | int b; |
---|
2971 | |
---|
2972 | /* Look at all entries within the bucket given by the hash of the |
---|
2973 | thread ID. No locking needs to be done for this search. */ |
---|
2974 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
2975 | if (SHG.historyHash[h].bucket[b].threadId == id) |
---|
2976 | return SHG.historyHash[h].bucket[b].historyP; |
---|
2977 | |
---|
2978 | /* Must hold mutex while changing the hash table */ |
---|
2979 | down(&SHG.shMutex); |
---|
2980 | |
---|
2981 | /* Search the overflow list */ |
---|
2982 | shP = SHG.historyHash[h].overflowP; |
---|
2983 | while (shP != NULL) |
---|
2984 | { |
---|
2985 | if (shP->shThreadId == id) |
---|
2986 | goto exit; |
---|
2987 | shP = shP->shNextP; |
---|
2988 | } |
---|
2989 | |
---|
2990 | /* No stack_history_t for this thread yet. Get one off the free list |
---|
2991 | or build one. */ |
---|
2992 | shP = shGet(); |
---|
2993 | shP->shThreadId = id; |
---|
2994 | shP->shNextP = NULL; |
---|
2995 | |
---|
2996 | /* Find a slot for the new stack_history_t in the hash table */ |
---|
2997 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
2998 | if (SHG.historyHash[h].bucket[b].historyP == NULL) |
---|
2999 | { |
---|
3000 | SHG.historyHash[h].bucket[b].historyP = shP; |
---|
3001 | SHG.historyHash[h].bucket[b].threadId = id; |
---|
3002 | shP->shBucketNum = b; |
---|
3003 | goto exit; |
---|
3004 | } |
---|
3005 | |
---|
3006 | /* No slots available; add new stack_history_t to overflow list */ |
---|
3007 | shP->shBucketNum = -1; |
---|
3008 | shP->shNextP = SHG.historyHash[h].overflowP; |
---|
3009 | SHG.historyHash[h].overflowP = shP; |
---|
3010 | |
---|
3011 | exit: |
---|
3012 | /* Release mutex before returning */ |
---|
3013 | up(&SHG.shMutex); |
---|
3014 | return shP; |
---|
3015 | } |
---|
3016 | |
---|
3017 | |
---|
3018 | /* Allocate a frame descriptor within the given stack_history_t. This |
---|
3019 | cannot be allowed to fail, so if there are no more free descriptors, |
---|
3020 | throw away the bottom frame descriptor and return that. The reference |
---|
3021 | count of the frame descriptor that is returned is undefined. */ |
---|
3022 | static frame_desc_t * fdGet(stack_history_t * shP) |
---|
3023 | { |
---|
3024 | frame_desc_t * fdP; |
---|
3025 | frame_desc_t ** fdPrevPP; |
---|
3026 | int prevRef; |
---|
3027 | |
---|
3028 | /* Look on the free list within the stack_history_t */ |
---|
3029 | fdP = shP->shFreeHeadP; |
---|
3030 | if (fdP != NULL) |
---|
3031 | { |
---|
3032 | shP->shFreeHeadP = fdP->fdCallerP; |
---|
3033 | return fdP; |
---|
3034 | } |
---|
3035 | |
---|
3036 | /* No free descriptors; first try stealing one off the bottom of the |
---|
3037 | current call stack */ |
---|
3038 | fdP = shP->shCurrentP; |
---|
3039 | if (fdP != NULL) |
---|
3040 | { |
---|
3041 | /* Find the bottom entry of the current call stack */ |
---|
3042 | fdPrevPP = &shP->shCurrentP; |
---|
3043 | prevRef = 1; |
---|
3044 | while (fdP->fdCallerP != NULL) |
---|
3045 | { |
---|
3046 | fdPrevPP = &fdP->fdCallerP; |
---|
3047 | prevRef = fdP->fdRef; |
---|
3048 | fdP = *fdPrevPP; |
---|
3049 | } |
---|
3050 | |
---|
3051 | /* Remove the bottom entry of the current call stack */ |
---|
3052 | *fdPrevPP = NULL; |
---|
3053 | |
---|
3054 | /* Reduce the reference count on the entry just removed. The |
---|
3055 | reference count decreases by the reference count of the frame |
---|
3056 | that used to point to *fdP. If *fdP is no longer referenced, no |
---|
3057 | further work is needed. If *fdP is still referenced from the max |
---|
3058 | depth stack (it must be the bottom entry), we will eventually |
---|
3059 | return it, but only after removing it from the bottom of the max |
---|
3060 | depth stack. We know that fdP will be returned, but we have to |
---|
3061 | search through the max depth stack to find the pointer to *fdP. |
---|
3062 | */ |
---|
3063 | fdP->fdRef -= prevRef; |
---|
3064 | if (fdP->fdRef == 0) |
---|
3065 | return fdP; |
---|
3066 | } |
---|
3067 | |
---|
3068 | /* Still no free descriptors; steal the frame descriptor off the |
---|
3069 | bottom of the maximum depth call stack */ |
---|
3070 | fdP = shP->shMaxP; |
---|
3071 | if (fdP != NULL) |
---|
3072 | { |
---|
3073 | /* Find the bottom entry of the max depth call stack */ |
---|
3074 | fdPrevPP = &shP->shMaxP; |
---|
3075 | while (fdP->fdCallerP != NULL) |
---|
3076 | { |
---|
3077 | fdPrevPP = &fdP->fdCallerP; |
---|
3078 | fdP = *fdPrevPP; |
---|
3079 | } |
---|
3080 | |
---|
3081 | /* Remove the bottom entry of the max depth call stack */ |
---|
3082 | *fdPrevPP = NULL; |
---|
3083 | |
---|
3084 | /* The bottom entry of the max depth call stack that was just |
---|
3085 | removed must have a reference count of one; otherwise it would |
---|
3086 | still be on the current call stack and removing the bottom entry |
---|
3087 | of that stack would have reduced the reference count of some |
---|
3088 | frame descriptor from 2 to 0. */ |
---|
3089 | SH_ASSERT(fdP->fdRef == 1); |
---|
3090 | return fdP; |
---|
3091 | } |
---|
3092 | SH_ASSERT(!"cannot alloc frame_desc_t"); |
---|
3093 | return NULL; |
---|
3094 | } |
---|
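/* A minimal sketch (not part of the module build) of the reference-count
   arithmetic behind fdGet() above: a frame descriptor on both the current
   and the max-depth chain carries a count of 2, a frame on only one chain
   carries 1.  Unhooking the bottom frame from the current chain subtracts
   the count held by the frame that pointed at it, so a result of 0 means the
   descriptor is free and a result of 1 means it is still the bottom of the
   max-depth chain.  The values below are hypothetical. */
#if 0
#include <stdio.h>

int main(void)
{
  int fdRef = 2;      /* bottom frame shared by both chains         */
  int prevRef = 1;    /* the frame above it held a single reference */

  fdRef -= prevRef;   /* remove the frame from the current chain    */
  if (fdRef == 0)
    printf("descriptor can be reused immediately\n");
  else
    printf("descriptor still referenced by the max-depth chain (%d)\n", fdRef);
  return 0;
}
#endif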
3095 | |
---|
3096 | |
---|
3097 | /* Decrease the reference count on a frame descriptor. If it becomes |
---|
3098 | zero, return it to the free list */ |
---|
3099 | static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) |
---|
3100 | //inline static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) |
---|
3101 | { |
---|
3102 | if (fdP->fdRef > 1) |
---|
3103 | { |
---|
3104 | fdP->fdRef -= 1; |
---|
3105 | TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD1, |
---|
3106 | "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 1\n", |
---|
3107 | fdP, shP, fdP->fdFuncNameP); |
---|
3108 | return; |
---|
3109 | } |
---|
3110 | |
---|
3111 | fdP->fdCallerP = shP->shFreeHeadP; |
---|
3112 | shP->shFreeHeadP = fdP; |
---|
3113 | TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD2, |
---|
3114 | "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 0\n", |
---|
3115 | fdP, shP, fdP->fdFuncNameP); |
---|
3116 | } |
---|
3117 | |
---|
3118 | |
---|
3119 | /* If the maximum stack depth exceeds the threshold, print its |
---|
3120 | traceback if it has not already been printed. Reset the maximum |
---|
3121 | depth stack to empty. Only called when the current stack is already |
---|
3122 | empty. */ |
---|
3123 | static void shDisplay(stack_history_t * shP) |
---|
3124 | { |
---|
3125 | frame_desc_t * fdP; |
---|
3126 | unsigned int tbHash; |
---|
3127 | frame_desc_t * fdNextP; |
---|
3128 | int slot; |
---|
3129 | |
---|
3130 | SH_ASSERT(shP->shCurrentP == NULL); |
---|
3131 | |
---|
3132 | /* If the maximum stack depth is less than the threshold, just free |
---|
3133 | the call chain and return */ |
---|
3134 | fdP = shP->shMaxP; |
---|
3135 | if (fdP == NULL || |
---|
3136 | fdP->fdStackUsed < STACK_LIMIT_WARNING) |
---|
3137 | goto exit; |
---|
3138 | |
---|
3139 | /* Compute a hash of the traceback call chain */ |
---|
3140 | tbHash = 0; |
---|
3141 | while (fdP != NULL) |
---|
3142 | { |
---|
3143 | tbHash <<= 1; |
---|
3144 | tbHash ^= (((unsigned int)fdP->fdStackUsed) << 15) ^ fdP->fdLineNum; |
---|
3145 | fdP = fdP->fdCallerP; |
---|
3146 | } |
---|
3147 | |
---|
3148 | /* Search for the hash of the call chain in the table of tracebacks that |
---|
3149 | have already been printed. Searching the hash table can be done without |
---|
3150 | any locks, since entries are never deleted. The loop must eventually |
---|
3151 | terminate, since the table will not be allowed to fill up. */ |
---|
3152 | search: |
---|
3153 | slot = tbHash % TB_HASH_SIZE; |
---|
3154 | while (SHG.tracebackHash[slot] != 0) |
---|
3155 | { |
---|
3156 | if (SHG.tracebackHash[slot] == tbHash) |
---|
3157 | /* This traceback has already been printed */ |
---|
3158 | goto exit; |
---|
3159 | slot = (slot+1) % TB_HASH_SIZE; |
---|
3160 | } |
---|
3161 | |
---|
3162 | /* The hash of the current max depth traceback was not found in the |
---|
3163 | table and should be inserted at position 'slot'. Do this under |
---|
3164 | protection of the mutex. If 'slot' has been used by the time we |
---|
3165 | get the mutex, drop the mutex and repeat the search. */ |
---|
3166 | down(&SHG.shMutex); |
---|
3167 | if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) |
---|
3168 | goto exitMutexHeld; |
---|
3169 | if (SHG.tracebackHash[slot] != 0) |
---|
3170 | { |
---|
3171 | up(&SHG.shMutex); |
---|
3172 | goto search; |
---|
3173 | } |
---|
3174 | SHG.tracebackHash[slot] = tbHash; |
---|
3175 | SHG.nTracebackHashEntries += 1; |
---|
3176 | |
---|
3177 | /* Print the traceback */ |
---|
3178 | fdP = shP->shMaxP; |
---|
3179 | printk("\nGPFS kernel stack for process %d(%s) used %d bytes\n", |
---|
3180 | current->pid, current->comm, fdP->fdStackUsed); |
---|
3181 | printk(" stack function\n"); |
---|
3182 | printk(" used\n"); |
---|
3183 | printk(" ----- -----------------------------------------------------\n"); |
---|
3184 | while (fdP != NULL) |
---|
3185 | { |
---|
3186 | printk(" %5d %s at %s:%d\n", |
---|
3187 | fdP->fdStackUsed, fdP->fdFuncNameP, fdP->fdFileNameP, fdP->fdLineNum); |
---|
3188 | fdP = fdP->fdCallerP; |
---|
3189 | } |
---|
3190 | printk(" traceback signature %08X\n", tbHash); |
---|
3191 | |
---|
3192 | /* If the maximum number of allowed tracebacks has been reached, turn |
---|
3193 | off further stack checking. */ |
---|
3194 | if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) |
---|
3195 | { |
---|
3196 | printk("Maximum number of GPFS deep stack tracebacks reached\n"); |
---|
3197 | printk("GPFS stack checking disabled\n"); |
---|
3198 | SHG.shActive = false; |
---|
3199 | wmb(); |
---|
3200 | } |
---|
3201 | |
---|
3202 | exitMutexHeld: |
---|
3203 | up(&SHG.shMutex); |
---|
3204 | |
---|
3205 | exit: |
---|
3206 | /* Free all stack frame descriptors for the max depth call chain back |
---|
3207 | to the internal free list. */ |
---|
3208 | fdP = shP->shMaxP; |
---|
3209 | while (fdP != NULL) |
---|
3210 | { |
---|
3211 | SH_ASSERT(fdP->fdRef == 1); |
---|
3212 | fdNextP = fdP->fdCallerP; |
---|
3213 | fdP->fdCallerP = shP->shFreeHeadP; |
---|
3214 | shP->shFreeHeadP = fdP; |
---|
3215 | fdP = fdNextP; |
---|
3216 | } |
---|
3217 | shP->shMaxP = NULL; |
---|
3218 | } |
---|
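/* A minimal sketch (not part of the module build) of how shDisplay() above
   derives a traceback signature and checks whether it was already printed:
   a shift-and-xor over each frame's (stackUsed, lineNum) pair, then linear
   probing in a table whose empty slots hold 0.  The table size and frame
   values are hypothetical. */
#if 0
#include <stdio.h>

#define EXAMPLE_TB_HASH_SIZE 8             /* stand-in for TB_HASH_SIZE */

static unsigned int exTable[EXAMPLE_TB_HASH_SIZE];   /* 0 == empty slot */

int main(void)
{
  unsigned int frames[2][2] = { { 5000, 3123 }, { 4200, 2965 } };  /* {used, line} */
  unsigned int tbHash = 0;
  int i, slot;

  for (i = 0; i < 2; i++)
  {
    tbHash <<= 1;
    tbHash ^= (frames[i][0] << 15) ^ frames[i][1];
  }

  slot = tbHash % EXAMPLE_TB_HASH_SIZE;
  while (exTable[slot] != 0 && exTable[slot] != tbHash)
    slot = (slot + 1) % EXAMPLE_TB_HASH_SIZE;        /* linear probe */

  if (exTable[slot] != tbHash)
    exTable[slot] = tbHash;                          /* first time this chain is seen */

  printf("traceback signature %08X stored in slot %d\n", tbHash, slot);
  return 0;
}
#endif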
3219 | |
---|
3220 | |
---|
3221 | /* Process routine entry */ |
---|
3222 | static void fdEntry(frame_desc_t * fdP, stack_history_t * shP) |
---|
3223 | { |
---|
3224 | frame_desc_t * popP; |
---|
3225 | frame_desc_t * p; |
---|
3226 | |
---|
3227 | TRACE5(TRACE_ENTRYEXIT, 11, TRCID_FDENTRY, |
---|
3228 | "fdEntry: fdP 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX used %d\n", |
---|
3229 | fdP, shP, fdP->fdFuncNameP, shP->shCurrentP, fdP->fdStackUsed); |
---|
3230 | |
---|
3231 | /* If this is the first call by this thread, set up the two call chains */ |
---|
3232 | if (shP->shCurrentP == NULL) |
---|
3233 | { |
---|
3234 | SH_ASSERT(shP->shMaxP == NULL); |
---|
3235 | shP->shCurrentP = fdP; |
---|
3236 | shP->shMaxP = fdP; |
---|
3237 | fdP->fdCallerP = NULL; |
---|
3238 | fdP->fdRef = 2; |
---|
3239 | return; |
---|
3240 | } |
---|
3241 | else |
---|
3242 | SH_ASSERT(shP->shMaxP != NULL); |
---|
3243 | |
---|
3244 | /* Process routine exits implied by the number of bytes of stack that |
---|
3245 | are currently in use. The test needs to be for strict less than |
---|
3246 | because inlined routines share the same stack frame as their |
---|
3247 | caller, but both routines will do entry/exit processing. */ |
---|
3248 | popP = shP->shCurrentP; |
---|
3249 | while (fdP->fdStackUsed < popP->fdStackUsed) |
---|
3250 | { |
---|
3251 | p = popP->fdCallerP; |
---|
3252 | shP->shCurrentP = p; |
---|
3253 | TRACE1(TRACE_ENTRYEXIT, 11, TRCID_IMPLIED_EXIT, |
---|
3254 | "fdEntry: implied exit from rtn %s\n", |
---|
3255 | popP->fdFuncNameP); |
---|
3256 | fdDiscard(popP, shP); |
---|
3257 | if (p == NULL) |
---|
3258 | { |
---|
3259 | /* The outermost routine returned before this call without calling |
---|
3260 | fdExit. Test for a large maximum stack, then reset the |
---|
3261 | maximum. */ |
---|
3262 | shDisplay(shP); |
---|
3263 | |
---|
3264 | /* The current routine is the one and only */ |
---|
3265 | shP->shCurrentP = fdP; |
---|
3266 | shP->shMaxP = fdP; |
---|
3267 | fdP->fdCallerP = NULL; |
---|
3268 | fdP->fdRef = 2; |
---|
3269 | return; |
---|
3270 | } |
---|
3271 | popP = p; |
---|
3272 | } |
---|
3273 | |
---|
3274 | /* If this is an extension of the current max depth stack, just add |
---|
3275 | this routine to the top of both stacks */ |
---|
3276 | if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed && |
---|
3277 | shP->shCurrentP == shP->shMaxP) |
---|
3278 | { |
---|
3279 | fdP->fdCallerP = shP->shCurrentP; |
---|
3280 | shP->shCurrentP = fdP; |
---|
3281 | shP->shMaxP = fdP; |
---|
3282 | fdP->fdRef = 2; |
---|
3283 | TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX_EXTEND, |
---|
3284 | "fdEntry: extending new max stack %d fdP 0x%lX\n", |
---|
3285 | fdP->fdStackUsed, fdP); |
---|
3286 | return; |
---|
3287 | } |
---|
3288 | |
---|
3289 | /* Make this new routine be the top of the stack */ |
---|
3290 | fdP->fdCallerP = shP->shCurrentP; |
---|
3291 | shP->shCurrentP = fdP; |
---|
3292 | fdP->fdRef = 1; |
---|
3293 | |
---|
3294 | /* If this new routine has a greater stack depth than the previous max, |
---|
3295 | unreference the previous max depth call chain and add additional |
---|
3296 | references to the current one. */ |
---|
3297 | if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed) |
---|
3298 | { |
---|
3299 | popP = shP->shMaxP; |
---|
3300 | do |
---|
3301 | { |
---|
3302 | p = popP->fdCallerP; |
---|
3303 | fdDiscard(popP, shP); |
---|
3304 | popP = p; |
---|
3305 | } while (popP != NULL); |
---|
3306 | p = fdP; |
---|
3307 | do |
---|
3308 | { |
---|
3309 | p->fdRef = 2; |
---|
3310 | p = p->fdCallerP; |
---|
3311 | } while (p != NULL); |
---|
3312 | TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX, |
---|
3313 | "fdEntry: new max stack %d fdP 0x%lX\n", |
---|
3314 | fdP->fdStackUsed, fdP); |
---|
3315 | shP->shMaxP = fdP; |
---|
3316 | } |
---|
3317 | } |
---|
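/* A minimal sketch (not part of the module build) of the implied-exit rule
   in fdEntry() above: any frame on the current chain using strictly more
   stack than the routine now being entered must already have returned, so it
   is popped; the strict '<' keeps an inlined routine, which reports the same
   stack usage as its caller, on the chain.  The values are hypothetical. */
#if 0
#include <stdio.h>

int main(void)
{
  int chain[3] = { 5200, 4600, 4000 };     /* bytes used, innermost first     */
  int newFrame = 4600;                     /* usage reported by the new entry */
  int i = 0;

  while (i < 3 && newFrame < chain[i])
  {
    printf("implied exit of frame using %d bytes\n", chain[i]);
    i++;
  }
  printf("new frame (%d bytes) becomes the top above the frame using %d bytes\n",
         newFrame, chain[i]);
  return 0;
}
#endif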
3318 | |
---|
3319 | |
---|
3320 | /* Process routine exit */ |
---|
3321 | static void fdExit(const char * funcnameP) |
---|
3322 | { |
---|
3323 | stack_history_t * shP; |
---|
3324 | frame_desc_t * lastPopP; |
---|
3325 | frame_desc_t * popP; |
---|
3326 | frame_desc_t * p; |
---|
3327 | |
---|
3328 | /* Locate or create stack_history_t for this thread */ |
---|
3329 | shP = shFind(); |
---|
3330 | |
---|
3331 | /* If call stack is already empty, there is nothing to do except free |
---|
3332 | the stack_history_t */ |
---|
3333 | if (shP->shCurrentP == NULL) |
---|
3334 | { |
---|
3335 | SH_ASSERT(shP->shMaxP == NULL); |
---|
3336 | shPut(shP); |
---|
3337 | return; |
---|
3338 | } |
---|
3339 | |
---|
3340 | /* Search backward on the call stack for a routine name that matches |
---|
3341 | the one being exited. In C++, the ENTER/EXIT macros will pass the |
---|
3342 | same string constant (same address) to fdEntry and fdExit. The C |
---|
3343 | versions of the macros may pass two different copies of the same |
---|
3344 | string. This loop cannot pop routines it skips off the stack, since |
---|
3345 | the routine might never be found. */ |
---|
3346 | p = shP->shCurrentP; |
---|
3347 | for (;;) |
---|
3348 | { |
---|
3349 | if (p->fdFuncNameP == funcnameP || |
---|
3350 | cxiStrcmp(p->fdFuncNameP, funcnameP) == 0) |
---|
3351 | { |
---|
3352 | TRACE4(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT, |
---|
3353 | "fdExit: p 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX\n", |
---|
3354 | p, shP, p->fdFuncNameP, shP->shCurrentP); |
---|
3355 | lastPopP = p; |
---|
3356 | break; |
---|
3357 | } |
---|
3358 | p = p->fdCallerP; |
---|
3359 | if (p == NULL) |
---|
3360 | { |
---|
3361 | /* Routine name not found. Do not pop stack. */ |
---|
3362 | /* printk("No entry found when exitting %s\n", funcnameP); */ |
---|
3363 | TRACE1(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT_NOTFOUND, |
---|
3364 | "No entry found when exitting %s\n", funcnameP); |
---|
3365 | return; |
---|
3366 | } |
---|
3367 | } |
---|
3368 | |
---|
3369 | /* Pop all routines up to and including lastPopP */ |
---|
3370 | p = shP->shCurrentP; |
---|
3371 | do |
---|
3372 | { |
---|
3373 | popP = p; |
---|
3374 | p = popP->fdCallerP; |
---|
3375 | fdDiscard(popP, shP); |
---|
3376 | } while (popP != lastPopP); |
---|
3377 | shP->shCurrentP = p; |
---|
3378 | |
---|
3379 | /* If this was the return of the outermost routine, print new maximum |
---|
3380 | stack depth traceback and discard the stack_history_t */ |
---|
3381 | if (shP->shCurrentP == NULL) |
---|
3382 | { |
---|
3383 | shDisplay(shP); |
---|
3384 | shPut(shP); |
---|
3385 | } |
---|
3386 | } |
---|
3387 | |
---|
3388 | #endif /* KSTACK_CHECK */ |
---|
3389 | |
---|
3390 | |
---|
3391 | #if defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) |
---|
3392 | void cxiTraceEntry(int level, const char * funcnameP, |
---|
3393 | const char * filenameP, int lineNum) |
---|
3394 | { |
---|
3395 | int stackUsed = THREAD_SIZE - (((unsigned long)&stackUsed) & (THREAD_SIZE-1)); |
---|
3396 | #ifdef KSTACK_CHECK |
---|
3397 | stack_history_t * shP; |
---|
3398 | frame_desc_t * fdP; |
---|
3399 | #endif /* KSTACK_CHECK */ |
---|
3400 | |
---|
3401 | #ifdef ENTRYEXIT_TRACE |
---|
3402 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
3403 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
3404 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
3405 | { |
---|
3406 | TRACE5(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_ENTER, |
---|
3407 | "-->K %s (%s:%d) level %d stackUsed %d\n", |
---|
3408 | funcnameP, filenameP, lineNum, level, stackUsed); |
---|
3409 | } |
---|
3410 | #endif /* ENTRYEXIT_TRACE */ |
---|
3411 | |
---|
3412 | #ifdef KSTACK_CHECK |
---|
3413 | /* Nothing to do if kernel stack checking is disabled */ |
---|
3414 | if (!SHG.shActive) |
---|
3415 | return; |
---|
3416 | |
---|
3417 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
3418 | if (in_interrupt()) |
---|
3419 | return; |
---|
3420 | |
---|
3421 | /* Locate or create stack_history_t for this thread */ |
---|
3422 | shP = shFind(); |
---|
3423 | |
---|
3424 | /* Get a new frame descriptor and fill it in */ |
---|
3425 | fdP = fdGet(shP); |
---|
3426 | fdP->fdFuncNameP = funcnameP; |
---|
3427 | fdP->fdFileNameP = filenameP; |
---|
3428 | fdP->fdLineNum = lineNum; |
---|
3429 | fdP->fdStackUsed = stackUsed; |
---|
3430 | |
---|
3431 | /* Perform stack checking for this routine entry */ |
---|
3432 | fdEntry(fdP, shP); |
---|
3433 | #endif /* KSTACK_CHECK */ |
---|
3434 | } |
---|
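/* A minimal sketch (not part of the module build) of the stack-usage
   estimate in cxiTraceEntry() above: the kernel stack occupies a
   THREAD_SIZE-aligned region that grows downward, so the address of a local
   variable modulo THREAD_SIZE is its offset from the bottom of the region,
   and THREAD_SIZE minus that offset is the number of bytes already in use.
   The size and address below are hypothetical. */
#if 0
#include <stdio.h>

#define EXAMPLE_THREAD_SIZE 8192UL         /* stand-in for THREAD_SIZE */

int main(void)
{
  unsigned long localAddr = 0xC5001E40UL;  /* hypothetical address of a local  */
  unsigned long offset = localAddr & (EXAMPLE_THREAD_SIZE - 1);   /* 0x1E40 = 7744 */
  unsigned long used = EXAMPLE_THREAD_SIZE - offset;              /* 8192 - 7744 = 448 */

  printf("offset %lu bytes, stack used %lu bytes\n", offset, used);
  return 0;
}
#endif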
3435 | |
---|
3436 | |
---|
3437 | void cxiTraceExit(int level, const char * funcnameP) |
---|
3438 | { |
---|
3439 | #ifdef ENTRYEXIT_TRACE |
---|
3440 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
3441 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
3442 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
3443 | TRACE1(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT, |
---|
3444 | "<--K %s\n", funcnameP); |
---|
3445 | #endif /* ENTRYEXIT_TRACE */ |
---|
3446 | |
---|
3447 | #ifdef KSTACK_CHECK |
---|
3448 | /* Nothing to do if kernel stack checking is disabled */ |
---|
3449 | if (!SHG.shActive) |
---|
3450 | return; |
---|
3451 | |
---|
3452 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
3453 | if (in_interrupt()) |
---|
3454 | return; |
---|
3455 | |
---|
3456 | /* Process routine exit */ |
---|
3457 | fdExit(funcnameP); |
---|
3458 | #endif /* KSTACK_CHECK */ |
---|
3459 | } |
---|
3460 | void cxiTraceExitRC(int level, const char * funcnameP, int rc) |
---|
3461 | { |
---|
3462 | #ifdef ENTRYEXIT_TRACE |
---|
3463 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
3464 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
3465 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
3466 | TRACE2(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT_RC, |
---|
3467 | "<--K %s rc %d\n", funcnameP, rc); |
---|
3468 | #endif /* ENTRYEXIT_TRACE */ |
---|
3469 | |
---|
3470 | #ifdef KSTACK_CHECK |
---|
3471 | /* Nothing to do if kernel stack checking is disabled */ |
---|
3472 | if (!SHG.shActive) |
---|
3473 | return; |
---|
3474 | |
---|
3475 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
3476 | if (in_interrupt()) |
---|
3477 | return; |
---|
3478 | |
---|
3479 | /* Process routine exit */ |
---|
3480 | fdExit(funcnameP); |
---|
3481 | #endif /* KSTACK_CHECK */ |
---|
3482 | } |
---|
3483 | #endif /* defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) */ |
---|
3484 | |
---|
3485 | |
---|
3486 | #ifdef UIDREMAP |
---|
3487 | size_t cxiGetUserEnvironmentSize(void) |
---|
3488 | { |
---|
3489 | return (current->mm->env_end - current->mm->env_start); |
---|
3490 | } |
---|
3491 | |
---|
3492 | int cxiGetUserEnvironment(char* buf, size_t len) |
---|
3493 | { |
---|
3494 | return cxiCopyIn((char*)current->mm->env_start, buf, len); |
---|
3495 | } |
---|
3496 | #endif |
---|
3497 | |
---|
3498 | Boolean cxiHasMountHelper() |
---|
3499 | { |
---|
3500 | return USING_MOUNT_HELPER(); |
---|
3501 | } |
---|
3502 | |
---|
3503 | #ifdef P_NFS4 |
---|
3504 | |
---|
3505 | #include <linux/nfsd/nfs4layoutxdr.h> |
---|
3506 | |
---|
3507 | /* convert ip address to string */ |
---|
3508 | char *IPtoString(int ip, char *buf) |
---|
3509 | { |
---|
3510 | unsigned char *a = (unsigned char *)&ip; |
---|
3511 | |
---|
3512 | sprintf(buf, "%u.%u.%u.%u", a[0], a[1], a[2], a[3]); |
---|
3513 | |
---|
3514 | return buf; |
---|
3515 | } |
---|
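/* A minimal sketch (not part of the module build) of the address string the
   pNFS code below builds from IPtoString(): an NFSv4 universal address has
   the form "a.b.c.d.p1.p2", where p1 and p2 are the high and low bytes of
   the TCP port.  Port 2049 is 0x0801, which is why ".8.1" is appended.  The
   IP address here is hypothetical. */
#if 0
#include <stdio.h>

int main(void)
{
  unsigned short port = 2049;
  char uaddr[32];

  sprintf(uaddr, "%s.%u.%u", "192.168.1.7", port >> 8, port & 0xFF);
  printf("%s\n", uaddr);                   /* prints 192.168.1.7.8.1 */
  return 0;
}
#endif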
3516 | |
---|
3517 | static void printfh(char *s, int *fh) |
---|
3518 | { |
---|
3519 | #ifdef GPFS_PRINTK |
---|
3520 | printk("%s: %d: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", |
---|
3521 | s, fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7],fh[8],fh[9]); |
---|
3522 | #endif |
---|
3523 | } |
---|
3524 | |
---|
3525 | int cxiSetFH(int *fhP, int sid) |
---|
3526 | { |
---|
3527 | struct knfsd_fh *fh = (struct knfsd_fh *)fhP; |
---|
3528 | |
---|
3529 | printfh("cxiSetFH-1", fhP); |
---|
3530 | if (fh->fh_size > 8) { |
---|
3531 | fh->fh_size += 4; // fh_size + 4 for sid |
---|
3532 | fh->fh_fsid_type += max_fsid_type; |
---|
3533 | fhP[(fh->fh_size >> 2)] = sid; |
---|
3534 | fh->fh_fileid_type = 7; // see code in gpfs_decode_fh() |
---|
3535 | #ifdef GPFS_PRINTK |
---|
3536 | printk("cxiSetFH size %d fsid_type %d fileid %d\n", |
---|
3537 | fh->fh_size, fh->fh_fsid_type, fh->fh_fileid_type); |
---|
3538 | #endif |
---|
3539 | printfh("cxiSetFH-2", fhP); |
---|
3540 | return 0; |
---|
3541 | } |
---|
3542 | return ENOENT; |
---|
3543 | } |
---|
3544 | |
---|
3545 | /* Call to NFS server on MDS to get open state */ |
---|
3546 | int cxiOpenState(void *vfsP, void *p) |
---|
3547 | { |
---|
3548 | int rc = ENOENT; |
---|
3549 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
3550 | struct pnfs_get_state *osP = p; |
---|
3551 | struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); |
---|
3552 | |
---|
3553 | #ifdef GPFS_PRINTK |
---|
3554 | printk("cxiOpenState1 sb %p p %p \n", sbP, p); |
---|
3555 | printk("cxiOpenState cb_get_state %p\n", |
---|
3556 | sbP->s_export_op->cb_get_state); |
---|
3557 | #endif |
---|
3558 | if (sbP->s_export_op->cb_get_state) |
---|
3559 | rc = sbP->s_export_op->cb_get_state(osP); |
---|
3560 | |
---|
3561 | gpfs_ops.gpfsGetVerifier(privVfsP, osP->verifier); |
---|
3562 | #ifdef GPFS_PRINTK |
---|
3563 | printk("cxiOpenState rc %d devid %x verifier %x:%x\n", |
---|
3564 | rc, osP->devid, osP->verifier[0], osP->verifier[1]); |
---|
3565 | #endif |
---|
3566 | |
---|
3567 | return rc; |
---|
3568 | } |
---|
3569 | /* Call to NFS server on DS to get change open state or close the file */ |
---|
3570 | int cxiChangeState(void *vfsP, void *p) |
---|
3571 | { |
---|
3572 | int rc = ENOENT; |
---|
3573 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
3574 | struct pnfs_get_state *osP = p; |
---|
3575 | |
---|
3576 | if (sbP->s_export_op->cb_change_state) |
---|
3577 | rc = sbP->s_export_op->cb_change_state(osP); |
---|
3578 | #ifdef GPFS_PRINTK |
---|
3579 | printk("cxiChangeState2 sb %p p %p access %d\n", sbP, p, osP->access); |
---|
3580 | #endif |
---|
3581 | |
---|
3582 | return rc; |
---|
3583 | } |
---|
3584 | /* Call to NFS server on MDS to recall layout */ |
---|
3585 | int cxiRecallLayout(void *vfsP, void *vP, void *p) |
---|
3586 | { |
---|
3587 | int rc = ENOENT; |
---|
3588 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
3589 | struct inode *iP = (struct inode *)vP; |
---|
3590 | struct layout_recall lr; |
---|
3591 | |
---|
3592 | lr.fsid = sbP; |
---|
3593 | lr.offset = 0; |
---|
3594 | lr.length = -1; |
---|
3595 | |
---|
3596 | if (iP == NULL) // recall all layouts for this fs |
---|
3597 | lr.layout_type = RECALL_FSID; |
---|
3598 | |
---|
3599 | #ifdef GPFS_PRINTK |
---|
3600 | printk("cxiRecallLayout sbP %p type %d\n", sbP, lr.layout_type); |
---|
3601 | #endif |
---|
3602 | if (sbP->s_export_op->cb_layout_recall) { |
---|
3603 | rc = sbP->s_export_op->cb_layout_recall(sbP, iP, &lr); |
---|
3604 | } |
---|
3605 | else { |
---|
3606 | lr.layout_type = RECALL_FILE; |
---|
3607 | #ifdef GPFS_PRINTK |
---|
3608 | printk("cxiRecallLayout sbP %p iP %p type %d\n", sbP, iP, lr.layout_type); |
---|
3609 | #endif |
---|
3610 | } |
---|
3611 | |
---|
3612 | #ifdef GPFS_PRINTK |
---|
3613 | printk("cxiRecallLayout sbP %p iP %p rc %d\n", sbP, iP, rc); |
---|
3614 | #endif |
---|
3615 | return rc; |
---|
3616 | } |
---|
3617 | |
---|
3618 | /* Get device list |
---|
3619 | |
---|
3620 | gd_type |
---|
3621 | in: requested layout type. |
---|
3622 | out: available layout type. |
---|
3623 | gd_cookie |
---|
3624 | in: cookie returned on the last operation. |
---|
3625 | out: non-zero cookie if some devices did not fit in the buffer. |
---|
3626 | gd_maxcount |
---|
3627 | in: buffer size in bytes. |
---|
3628 | gd_buffer |
---|
3629 | in: pointer to buffer. |
---|
3630 | gd_devlist_len |
---|
3631 | out: number of items returned in the buffer. |
---|
3632 | |
---|
3633 | error: |
---|
3634 | Use the same return codes as used for GETDEVICELIST |
---|
3635 | */ |
---|
3636 | int |
---|
3637 | cxiGetDeviceList(int nDests, int *idList, void *P) |
---|
3638 | { |
---|
3639 | ENTER(0); |
---|
3640 | int rc = 0; |
---|
3641 | int i, len, left; |
---|
3642 | int j = 0; |
---|
3643 | char *p, *tp; |
---|
3644 | char tmp[32]; |
---|
3645 | struct nfsd4_pnfs_getdevlist *dl = (struct nfsd4_pnfs_getdevlist *)P; |
---|
3646 | struct nfsd4_pnfs_devlist *gd_buf = NULL; |
---|
3647 | struct pnfs_filelayout_devaddr *dev; |
---|
3648 | |
---|
3649 | #ifdef GPFS_PRINTK |
---|
3650 | printk("xxx cxiGetDeviceList enter nDests %d idList %p \n", nDests, idList); |
---|
3651 | #endif |
---|
3652 | |
---|
3653 | dl->gd_type = LAYOUT_NFSV4_FILES; |
---|
3654 | dl->gd_cookie = 0; |
---|
3655 | dl->gd_devlist_len = 0; |
---|
3656 | left = dl->gd_maxcount; |
---|
3657 | tp = &tmp[0]; |
---|
3658 | |
---|
3659 | len = sizeof(struct nfsd4_pnfs_devlist) * nDests; |
---|
3660 | #ifdef GPFS_PRINTK |
---|
3661 | printk("xxx cxiGetDeviceList len %d left %d\n", len, left); |
---|
3662 | #endif |
---|
3663 | if (len > left) { /* compare byte lengths, not the device count */ |
---|
3664 | rc = ENOMEM; //??? NFS4ERR_TOOSMALL |
---|
3665 | goto xerror; |
---|
3666 | } |
---|
3667 | gd_buf = (struct nfsd4_pnfs_devlist *)cxiMallocUnpinned(len); |
---|
3668 | if (gd_buf == NULL) { |
---|
3669 | rc = ENOMEM; |
---|
3670 | goto xerror; |
---|
3671 | } |
---|
3672 | memset(gd_buf, 0, len); |
---|
3673 | dl->gd_devlist = gd_buf; |
---|
3674 | |
---|
3675 | #ifdef GPFS_PRINTK |
---|
3676 | printk("xxx cxiGetDeviceList gd_buf %p count %d\n", gd_buf, nDests); |
---|
3677 | #endif |
---|
3678 | for (i = 0; i < nDests; i++) |
---|
3679 | { |
---|
3680 | /* make both device id and device address be the same for now */ |
---|
3681 | gd_buf[j].dev_id = idList[i]; |
---|
3682 | gd_buf[j].dev_lotype = LAYOUT_NFSV4_FILES; |
---|
3683 | if (gd_buf[j].dev_id == INADDR_NONE) |
---|
3684 | continue; |
---|
3685 | |
---|
3686 | IPtoString(gd_buf[j].dev_id, tp); |
---|
3687 | len = (cxiStrlen(tp)); |
---|
3688 | |
---|
3689 | p = (char *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr)); |
---|
3690 | if (p == NULL) { |
---|
3691 | rc = ENOMEM; |
---|
3692 | goto xerror; |
---|
3693 | } |
---|
3694 | memset(p, 0, sizeof(struct pnfs_filelayout_devaddr)); |
---|
3695 | gd_buf[j].dev_addr = p; |
---|
3696 | |
---|
3697 | dev = (struct pnfs_filelayout_devaddr *)p; |
---|
3698 | dev->r_addr.len = len + 4; /* for ".8.1" */ |
---|
3699 | |
---|
3700 | p = (char *)cxiMallocUnpinned(dev->r_addr.len+1); |
---|
3701 | if (p == NULL) { |
---|
3702 | rc = ENOMEM; |
---|
3703 | goto xerror; |
---|
3704 | } |
---|
3705 | dev->r_addr.data = p; |
---|
3706 | cxiMemcpy(p, tp, len); |
---|
3707 | p = p + len; |
---|
3708 | cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */ |
---|
3709 | |
---|
3710 | dev->r_netid.len = 3; /*'tcp'*/ |
---|
3711 | p = (char *)cxiMallocUnpinned(dev->r_netid.len+1); |
---|
3712 | if (p == NULL) { |
---|
3713 | rc = ENOMEM; |
---|
3714 | goto xerror; |
---|
3715 | } |
---|
3716 | cxiStrcpy(p, "tcp"); |
---|
3717 | dev->r_netid.data = p; |
---|
3718 | |
---|
3719 | left = left - 1; |
---|
3720 | dl->gd_devlist_len++; |
---|
3721 | |
---|
3722 | TRACE4(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_P1, |
---|
3723 | "gpfsGetDeviceList index %d len %d ip %s left %d\n", |
---|
3724 | i, dev->r_addr.len, dev->r_addr.data, left); |
---|
3725 | #ifdef GPFS_PRINTK |
---|
3726 | printk("xxx cxiGetDeviceList index %d id %d len %d ip %s left %d ops %p %p\n", |
---|
3727 | i, gd_buf[j].dev_id, dev->r_addr.len, |
---|
3728 | dev->r_addr.data, left, dl->gd_ops, dl->gd_ops->devaddr_encode); |
---|
3729 | #endif |
---|
3730 | |
---|
3731 | j++; |
---|
3732 | } |
---|
3733 | |
---|
3734 | exit: |
---|
3735 | |
---|
3736 | TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_EXIT, |
---|
3737 | "cxiGetDeviceList exit: rc %d len %d", rc, len); |
---|
3738 | return rc; |
---|
3739 | |
---|
3740 | xerror: |
---|
3741 | |
---|
3742 | if (gd_buf != NULL) { |
---|
3743 | for (i = 0; i < j; i++) |
---|
3744 | { |
---|
3745 | dev = gd_buf[i].dev_addr; |
---|
3746 | if (dev) { |
---|
3747 | cxiFreeUnpinned(dev->r_addr.data); |
---|
3748 | cxiFreeUnpinned(dev->r_netid.data); |
---|
3749 | cxiFreeUnpinned(dev); |
---|
3750 | } |
---|
3751 | } |
---|
3752 | cxiFreeUnpinned(gd_buf); |
---|
3753 | } |
---|
3754 | goto exit; |
---|
3755 | } |
---|
3756 | |
---|
3757 | int |
---|
3758 | cxiGetDeviceInfo(void *P) |
---|
3759 | { |
---|
3760 | ENTER(0); |
---|
3761 | int rc = 0; /* success path falls through to the exit trace */ |
---|
3762 | int len; |
---|
3763 | char *p, *tp; |
---|
3764 | char tmp[32]; |
---|
3765 | struct nfsd4_pnfs_getdevinfo *da = (struct nfsd4_pnfs_getdevinfo *)P; |
---|
3766 | tp = &tmp[0]; |
---|
3767 | struct pnfs_filelayout_devaddr *dev; |
---|
3768 | |
---|
3769 | IPtoString(da->gd_dev_id, tp); |
---|
3770 | |
---|
3771 | dev = (struct pnfs_filelayout_devaddr *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr)); |
---|
3772 | if (dev == NULL) { |
---|
3773 | rc = ENOMEM; |
---|
3774 | goto xerror; |
---|
3775 | } |
---|
3776 | da->gd_devaddr = dev; |
---|
3777 | |
---|
3778 | len = (cxiStrlen(tp)); |
---|
3779 | dev->r_addr.len = len + 4; /* for ".8.1" */ |
---|
3780 | |
---|
3781 | p = (char *)cxiMallocUnpinned(dev->r_addr.len+1); |
---|
3782 | if (p == NULL) { |
---|
3783 | cxiFreeUnpinned(dev); |
---|
3784 | rc = ENOMEM; |
---|
3785 | goto xerror; |
---|
3786 | } |
---|
3787 | dev->r_addr.data = p; |
---|
3788 | cxiMemcpy(p, tp, len); |
---|
3789 | p = p + len; |
---|
3790 | cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */ |
---|
3791 | |
---|
3792 | dev->r_netid.len = 3; /*'tcp'*/ |
---|
3793 | p = (char *)cxiMallocUnpinned(dev->r_netid.len+1); |
---|
3794 | if (p == NULL) { |
---|
3795 | cxiFreeUnpinned(dev->r_addr.data); |
---|
3796 | cxiFreeUnpinned(dev); |
---|
3797 | rc = ENOMEM; |
---|
3798 | goto xerror; |
---|
3799 | } |
---|
3800 | cxiStrcpy(p, "tcp"); |
---|
3801 | dev->r_netid.data = p; |
---|
3802 | |
---|
3803 | TRACE2(TRACE_VNODE, 2, TRCID_GPFSOPS_GET_DEVICELINFO_P1, |
---|
3804 | "gpfsGetDeviceInfo len %d ip %s\n", |
---|
3805 | dev->r_addr.len, dev->r_addr.data); |
---|
3806 | |
---|
3807 | #ifdef GPFS_PRINTK |
---|
3808 | printk("xxx cxiGetDeviceInfo id %d len %d ip %s\n", |
---|
3809 | da->gd_dev_id, dev->r_addr.len, dev->r_addr.data); |
---|
3810 | #endif |
---|
3811 | |
---|
3812 | xerror: |
---|
3813 | |
---|
3814 | TRACE1(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELINFO_EXIT, |
---|
3815 | "cxiGetDeviceInfo exit: rc %d\n", rc); |
---|
3816 | |
---|
3817 | return rc; |
---|
3818 | } |
---|
3819 | /* get layout |
---|
3820 | lg_type |
---|
3821 | in: requested layout type. |
---|
3822 | out: available layout type. |
---|
3823 | lg_offset |
---|
3824 | in: requested offset. |
---|
3825 | out: returned offset. |
---|
3826 | lg_length |
---|
3827 | in: requested length. |
---|
3828 | out: returned length. |
---|
3829 | lg_mxcnt |
---|
3830 | in: buffer size in bytes. |
---|
3831 | lg_llist |
---|
3832 | in: pointer to buffer. |
---|
3833 | lg_layout |
---|
3834 | out: number of items returned in the buffer. |
---|
3835 | |
---|
3836 | If the file is big(?) return all nodes in the layout. |
---|
3837 | If the file is small return no layout or just one node, chosen at |
---|
3838 | random, but make sure it is the same node for the same file. |
---|
3839 | */ |
---|
3840 | int |
---|
3841 | cxiGetLayout(int nDests, int *idList, cxiVattr_t *vattr, int myAddr, void *P) |
---|
3842 | { |
---|
3843 | ENTER(0); |
---|
3844 | char *p = NULL, *n = NULL; |
---|
3845 | int i, rc, left, len = 0; |
---|
3846 | struct nfsd4_pnfs_layoutget *gl = (struct nfsd4_pnfs_layoutget *)P; |
---|
3847 | struct nfsd4_pnfs_layoutlist *lg_buf = NULL; |
---|
3848 | struct nfsd4_pnfs_filelayout *layout = NULL; |
---|
3849 | |
---|
3850 | TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_ENTER, |
---|
3851 | "cxiGetLayout: nDests %d myAddr %x\n", nDests,myAddr); |
---|
3852 | |
---|
3853 | /* set node id in fh and increase fh size by 4 */ |
---|
3854 | rc = cxiSetFH((int *)&gl->lg_fh, myAddr); |
---|
3855 | if (rc != 0) |
---|
3856 | goto xerror; |
---|
3857 | |
---|
3858 | gl->lg_type = LAYOUT_NFSV4_FILES; |
---|
3859 | gl->lg_offset = 0; |
---|
3860 | gl->lg_length = MAX_UINT64; /* The maximum file size */ |
---|
3861 | |
---|
3862 | layout = (struct nfsd4_pnfs_filelayout *)cxiMallocUnpinned(sizeof(struct nfsd4_pnfs_filelayout)); |
---|
3863 | if (layout == NULL) { |
---|
3864 | rc = ENOMEM; |
---|
3865 | goto xerror; |
---|
3866 | } |
---|
3867 | gl->lg_layout = layout; |
---|
3868 | layout->lg_stripe_type = STRIPE_DENSE; |
---|
3869 | layout->lg_commit_through_mds = true; |
---|
3870 | layout->lg_stripe_unit = vattr->va_blocksize; /* preferred blocksize */ |
---|
3871 | layout->lg_file_size = vattr->va_size; /* file size in bytes */ |
---|
3872 | layout->lg_llistlen = 0; |
---|
3873 | |
---|
3874 | left = gl->lg_mxcnt; |
---|
3875 | |
---|
3876 | len = sizeof(struct nfsd4_pnfs_layoutlist) * nDests; |
---|
3877 | if (len > left) { |
---|
3878 | rc = ENOMEM; // NFS4ERR_TOOSMALL |
---|
3879 | goto xerror; |
---|
3880 | } |
---|
3881 | lg_buf = (struct nfsd4_pnfs_layoutlist *)cxiMallocUnpinned(len); |
---|
3882 | if (lg_buf == NULL) { |
---|
3883 | rc = ENOMEM; |
---|
3884 | goto xerror; |
---|
3885 | } |
---|
3886 | memset(lg_buf, 0, len); |
---|
3887 | layout->lg_llist = lg_buf; |
---|
3888 | left = left - len; |
---|
3889 | |
---|
3890 | for (i = 0; i < nDests; i++) |
---|
3891 | { |
---|
3892 | /* make both device id and device address be the same for now */ |
---|
3893 | lg_buf[i].dev_ids.len = 1; //??? can return a list of dev ids ???? |
---|
3894 | lg_buf[i].dev_ids.list = (u32 *)cxiMallocUnpinned(sizeof(u32)*lg_buf[i].dev_ids.len); |
---|
3895 | if (lg_buf[i].dev_ids.list == NULL) { |
---|
3896 | rc = ENOMEM; |
---|
3897 | goto xerror; |
---|
3898 | } |
---|
3899 | lg_buf[i].dev_ids.list[0] = idList[i]; |
---|
3900 | layout->lg_llistlen++; |
---|
3901 | lg_buf[i].fhp = (struct knfsd_fh *)&gl->lg_fh; |
---|
3902 | |
---|
3903 | #ifdef GPFS_PRINTK |
---|
3904 | printk("cxiGetLayout index %d id %d xid 0x%lX len %d\n", |
---|
3905 | i, idList[i], idList[i], len); |
---|
3906 | #endif |
---|
3907 | TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_P1, |
---|
3908 | "cxiGetLayout index %d id 0x%lX len %d\n", |
---|
3909 | i, idList[i], len); |
---|
3910 | |
---|
3911 | } |
---|
3912 | if (i == 0) { |
---|
3913 | layout->lg_llistlen = 0; |
---|
3914 | cxiFreeUnpinned(lg_buf); |
---|
3915 | } |
---|
3916 | |
---|
3917 | #ifdef GPFS_PRINTK |
---|
3918 | printk("cxiGetLayout: type %d iomode %d offset %lld length %lld minlength %lld mxcnt %d ops %p layouts %p\n", |
---|
3919 | gl->lg_type, gl->lg_iomode, gl->lg_offset, gl->lg_length, gl->lg_minlength, |
---|
3920 | gl->lg_mxcnt, gl->lg_ops, gl->lg_layout); |
---|
3921 | |
---|
3922 | printfh("cxiGetLayout:", gl->lg_fh); |
---|
3923 | |
---|
3924 | printk("cxiGetLayout: layout stripe_type %d stripe_unit %lld file_size %lld llistlen %d llist %p\n", |
---|
3925 | layout->lg_stripe_type, layout->lg_stripe_unit,layout->lg_file_size, |
---|
3926 | layout->lg_llistlen,layout->lg_llist); |
---|
3927 | #endif |
---|
3928 | |
---|
3929 | exit: |
---|
3930 | |
---|
3931 | TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_EXIT, |
---|
3932 | "cxiGetLayout exit: rc %d len %d p 0x%lX", rc, len, p); |
---|
3933 | |
---|
3934 | return rc; |
---|
3935 | |
---|
3936 | xerror: |
---|
3937 | |
---|
3938 | if (lg_buf) { |
---|
3939 | gl->lg_length = 0; |
---|
3940 | for (i = 0; i < nDests; i++) |
---|
3941 | { |
---|
3942 | cxiFreeUnpinned(lg_buf[i].dev_ids.list); |
---|
3943 | } |
---|
3944 | cxiFreeUnpinned(lg_buf); |
---|
3945 | } |
---|
3946 | if (layout) |
---|
3947 | cxiFreeUnpinned(layout); |
---|
3948 | |
---|
3949 | goto exit; |
---|
3950 | } |
---|
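/* A minimal sketch (not part of the module build) of what the STRIPE_DENSE
   layout returned by cxiGetLayout() above implies for a pNFS client: with a
   dense file layout the data server index for a byte offset is
   (offset / stripe_unit) modulo the number of devices in the layout list.
   The sizes below are hypothetical. */
#if 0
#include <stdio.h>

int main(void)
{
  unsigned long long offset = 3ULL * 1048576 + 4096;   /* byte 3 MiB + 4 KiB */
  unsigned long long stripeUnit = 1048576ULL;          /* 1 MiB stripe unit  */
  int nDevices = 2;

  int idx = (int)((offset / stripeUnit) % nDevices);
  printf("offset %llu is served by device index %d\n", offset, idx);   /* 1 */
  return 0;
}
#endif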
3951 | #endif |
---|
3952 | |
---|
3953 | int cxiCheckThreadState(cxiThreadId tid) |
---|
3954 | { |
---|
3955 | struct task_struct *t, *g; |
---|
3956 | int rc = ENOENT; |
---|
3957 | |
---|
3958 | // read_lock(&tasklist_lock); |
---|
3959 | rcu_read_lock(); |
---|
3960 | |
---|
3961 | DO_EACH_THREAD(g,t) |
---|
3962 | { |
---|
3963 | /* We are looking for a thread with the given tid and the same command |
---|
3964 | name as the caller (the caller must be another mmfsd thread). */ |
---|
3965 | if (t->pid == tid && |
---|
3966 | cxiStrcmp(t->comm, current->comm) == 0) |
---|
3967 | { |
---|
3968 | rc = 0; |
---|
3969 | break; |
---|
3970 | } |
---|
3971 | } WHILE_EACH_THREAD(g,t); |
---|
3972 | // read_unlock(&tasklist_lock); |
---|
3973 | rcu_read_unlock(); |
---|
3974 | |
---|
3975 | return rc; |
---|
3976 | } |
---|
3977 | |
---|