source: gpfs_3.1_ker2.6.20/lpp/mmfs/src/gpl-linux/cxiSystem.c @ 145

Last change on this file since 145 was 16, checked in by rock, 17 years ago
File size: 104.9 KB
1/***************************************************************************
2 *
3 * Copyright (C) 2001 International Business Machines
4 * All rights reserved.
5 *
6 * This file is part of the GPFS mmfslinux kernel module.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 *  1. Redistributions of source code must retain the above copyright notice,
13 *     this list of conditions and the following disclaimer.
14 *  2. Redistributions in binary form must reproduce the above copyright
15 *     notice, this list of conditions and the following disclaimer in the
16 *     documentation and/or other materials provided with the distribution.
17 *  3. The name of the author may not be used to endorse or promote products
18 *     derived from this software without specific prior written
19 *     permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
27 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 *
32 *************************************************************************** */
33/* @(#)16       1.158.1.9  src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiSystem.c, mmfs, avs_rgpfs24, rgpfs24s007a 10/24/06 19:12:27 */
34/*
35 * Linux implementation of basic common services
36 *
37 * Contents:
38 *     cxiGetThreadId
39 *     getpid
40 *     cxiIsSuperUser
41 *     DoPanic
42 *     logAssertFailed
43 *   Kernel memory allocation services:
44 *     cxiMallocPinned
45 *     cxiFreePinned
46 *
47 */
48
49#include <Shark-gpl.h>
50
51#include <linux/kernel.h>
52#include <linux/module.h>
53#include <linux/sched.h>
54#include <linux/slab.h>
55#include <linux/wait.h>
56#include <linux/time.h>
57#include <linux/file.h>
58#include <linux/string.h>
59#include <asm/uaccess.h>
60#include <linux/smp_lock.h>
61#include <linux/vmalloc.h>
62#include <linux/fs.h>
63#include <linux/interrupt.h>
64#undef memcmp
65
66#define DEFINE_TRACE_GBL_VARS
67#include <Logger-gpl.h>
68#include <verdep.h>
69#include <linux2gpfs.h>
70#include <cxiSystem.h>
71#include <cxiAtomic.h>
72#include <cxi2gpfs.h>
73#include <cxiIOBuffer.h>
74#include <cxiSharedSeg.h>
75#include <cxiCred.h>
76
77#include <Trace.h>
78#include <lxtrace.h>
79#include <cxiMode.h>
80#if LINUX_KERNEL_VERSION >= 2060000
81#include <linux/swap.h>
82#include <linux/writeback.h>
83#endif
84
85#if LINUX_KERNEL_VERSION >= 2040900
86/* This is in the Redhat kernel series */
87extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
88#endif
89
90#ifdef INSTRUMENT_LOCKS
91struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES];
92#endif  /* INSTRUMENT_LOCKS */
93
94/* We record the daemon's process group since it can uniquely identify
 95 * a thread as being part of the GPFS daemon.  The pid is unique per thread
 96 * on Linux due to its clone implementation.
97 */
98static pid_t DaemonPGrp = -1;
99
100/* Get the kernel thread ID. */
101cxiThreadId cxiGetThreadId()
102{
103  /* ENTER(1); */
104  return current->pid;
105}
106
107/* Get the kernel process ID. */
108pid_t getpid()
109{
110  /* ENTER(1); */
111  return current->pid;
112}
113
114/* bufP is caller's ext_cred_t buffer
115 * uCredPP is the ucred struct (NULL on Linux)
116 * eCredPP is the ext_cred_t struct * (if successful)
117 *
118 * cxiPutCred should be called to release them when the operation has completed.
119 */
120int cxiGetCred(void *bufP, void **uCredPP, void **eCredPP)
121{
122  ext_cred_t *eCredP = (ext_cred_t *)bufP;
123
124  ENTER(0);
125  *uCredPP = NULL;
126  *eCredPP = NULL;
127
128  if (!bufP)
129  {
130    EXIT_RC(0, EINVAL);
131    return EINVAL;
132  }
133
134  setCred(eCredP);
135  *eCredPP = (void *)eCredP;
136
137xerror:
138  EXIT(0);
139  return 0;
140}
141
142/* Release of cxiGetCred() structures  (nothing to do on Linux) */
143int cxiPutCred(void *userCredP, void *extCredP)
144{
145  if (userCredP || !extCredP)
146    return EINVAL;
147
148  return 0;
149}
150
151/* Convert a kernel stack address to the thread ID of the thread that
152 * uses that stack
153 */
154int 
155cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP)
156{
157  struct task_struct * tP;
158#if LINUX_KERNEL_VERSION >= 2060000
159  /* the kernel stack is based off the thread_info struct in the 2.6 kernel;
160   * get the task pointer out of the thread_info struct.
161   */
162  struct thread_info * iP;
164  iP = (struct thread_info *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
165  tP = iP->task;
166#else
167  /* the kernel stack is based off the task_struct in the 2.4 kernel */
168  tP = (struct task_struct *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
169#endif
170  ENTER(0);
171  *tidP = tP->pid;
172  EXIT(0);
173  return 0;
174}
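
#if 0
/* Illustrative sketch only (not part of the module): the masking used in
 * cxiStackAddrToThreadId relies on the kernel stack being THREAD_SIZE
 * aligned, so clearing the low bits of any address within the stack yields
 * the stack base (thread_info on 2.6, task_struct on 2.4).  Assuming
 * THREAD_SIZE == 8192 (0x2000):
 *   stackP        = 0xC0123ABC
 *   mask          = ~(0x2000 - 1) = 0xFFFFE000
 *   stackP & mask = 0xC0122000   <- base of this thread's kernel stack
 */
static inline void *exampleStackBase(char *stackP)
{
  return (void *)((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE - 1)));
}
#endif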
175
176/* Convert a kernel thread pointer to the corresponding thread ID */
177int 
178cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP)
179{
180  struct task_struct * tP;
181
182  ENTER(0);
183  tP = (struct task_struct *) threadP;
184  *tidP = tP->pid;
185
186  EXIT(0);
187  return 0;
188}
189
190
191/* Return true if caller has maximum authorization (is root) */
192Boolean cxiIsSuperUser()
193{
194  return (current->euid == 0);
195}
196
197
198/* Get the process max filesize limit (ulimit -f) */
199Int64 cxiGetMaxFileSize()
200{
201  if ((signed long)MY_RLIM_CUR(RLIMIT_FSIZE) == -1L)
202    return MAX_INT64;
203  else
204    return (MY_RLIM_CUR(RLIMIT_FSIZE));
205}
206
207/* Routine to send a signal to the current thread/process */
208void cxiSendSigThread(int sig)
209{
210  ENTER(0);
211  send_sig(sig, current, 0);
212  EXIT(0);
213}
214
215
216#ifdef MALLOC_DEBUG
217/* This tracks mallocs and frees on a limited basis.
218 * Implemented originally to determine if we were leaking
219 * any memory after an unload.  This is not really thread
220 * safe for multiple processors unless they're automatically
221 * cache coherent without memory barriers (i386).   It's useful
222 * for detecting memory leaks on a single processor system.
223 */
224#define MALLOC_RECORDS 5000 /* max mallocs to track */
225struct mallocStat
226{
227  void *beginP;
228  unsigned short size;
229  unsigned short type;
230};
231static struct mallocStat *mstatP = NULL;
232unsigned int nextMalloc = 0;
233
234void 
235MallocDebugStart()
236{
237  int i;
238
239  ENTER(0);
240  if (mstatP == NULL)
241    mstatP = vmalloc(MALLOC_RECORDS * sizeof(struct mallocStat));
242 
243  if (mstatP == NULL)
244  {
245    EXIT(0);
246    return;
247  }
248
249  for (i = 0; i < MALLOC_RECORDS; i++)
250  {
251    mstatP[i].beginP = NULL; 
252    mstatP[i].size = 0;
253    mstatP[i].type = 0;
254  }
255  printk("MallocDebugStart 0x%X\n", mstatP);
256  EXIT(0);
257}
258
259void
260MallocDebugEnd()
261{
262  int i;
263
264  ENTER(0);
265  if (mstatP != NULL)
266  {
267    for (i = 0; i < MALLOC_RECORDS; i++)
268    {
269      if (mstatP[i].beginP != NULL)
270        printk("MallocDebug: beginP 0x%X size %d type %d STILL ALLOCATED!\n",
271               mstatP[i].beginP, mstatP[i].size, mstatP[i].type);
272    }
273  }
274
275  vfree(mstatP);
276  mstatP = NULL;
277  EXIT(0);
278}
279
280void
281MallocDebugNew(void *ptr, unsigned short size, unsigned short type)
282{
283  void *bP;
284  int i;
285  int j;
286  int swrc;
287  int oldval;
288  int where = nextMalloc;
289
290  ENTER(0);
291
292  if (mstatP == NULL)
293  {
294    EXIT(0);
295    return;
296  }
297
298  for (i = where; i < MALLOC_RECORDS + where; i++)
299  {
300    if (i >= MALLOC_RECORDS)
301      j = i - MALLOC_RECORDS;
302    else
303      j = i;
304
305    bP = mstatP[j].beginP;
306    if (bP == NULL) 
307    {
308      swrc = ATOMIC_SWAP(&mstatP[j].beginP, &bP, ptr);
309      if (swrc)
310      {
311        mstatP[j].size = size;
312        mstatP[j].type = type;
313        break;
314      }
315    }
316  }
317
318  EXIT(0);
319}
320
321void 
322MallocDebugDelete(void *ptr)
323{
324  void *bP;
325  int i;
326  int swrc;
327  int next;
328  int found = 0;
329 
330  ENTER(0);
331  if (mstatP == NULL)
332  {
333    EXIT(0);
334    return;
335  }
336
337  for (i = 0; i < MALLOC_RECORDS; i++)
338  {
339    bP = mstatP[i].beginP;
340    if (bP == ptr)
341    {
342      next = nextMalloc;
343      ATOMIC_SWAP(&nextMalloc, &next, i);
344
345      swrc = ATOMIC_SWAP(&mstatP[i].beginP, &bP, NULL);
346      DBGASSERT(swrc);
347      found = 1;
348      break;
349    }
350  }
351
352  if (!found)
353    printk("MallocDebug: 0x%X not found!\n", ptr);
354  EXIT(0);
355}
356#endif /* MALLOC_DEBUG */
357
358/* Allocate pinned kernel memory */
359void* cxiMallocPinned(int nBytes)
360{
361  void *ptr;
362
363  /* kmalloc only supports requests for up to 131072 bytes.  Anything
364     larger than this results in a BUG() call. */
365  ENTER(0);
366  if (nBytes > 131072)
367  {
368    EXIT(0);
369    return NULL;
370  }
371
372  ptr = kmalloc(nBytes, GFP_KERNEL);
373
374#ifdef MALLOC_DEBUG
375  MallocDebugNew(ptr, nBytes, 1);
376#endif
377
378  EXIT(0);
379  return ptr;
380}
381
382/* Free pinned kernel memory that was allocated with cxiMallocPinned */
383/* Must not block on lack of memory resources */
384void cxiFreePinned(void* p)
385{
386  ENTER(0);
387#ifdef MALLOC_DEBUG
388  MallocDebugDelete(p);
389#endif
390
391  kfree(p);
392  EXIT(0);
393}
394
395/* Get the fcntl lock owner: from the eflock if given, else the current process's open file table. */
396void* cxiGetFcntlOwner(eflock_t *flP)
397{
398  return flP? flP->l_owner: current->files;
399}
400
401#if LINUX_KERNEL_VERSION > 2060900
402struct lock_manager_operations lm_operations = {
403};
404#endif
405
406/* Perform local advisory locking. */
407int cxiFcntlLock(void *advObjP,
408                 int cmd,
409                 void *lockStructP,
410                 cxiFlock_t *flockP,
411                 int (*retryCB)(),
412                 cxiOff64_t size,
413                 cxiOff64_t offset,
414                 ulong *retry_idP)
415{
416  int len, rc = 0;
417  // struct file *fP;
418  struct file_lock fl, *flP, *gflP, *cflP;
419  Boolean keepLockElement = false;
420
421  /* cast platform independent arguments as appropriate for linux */
422  void (*RetryFcn)(struct file_lock*) = (void (*)(struct file_lock*))retryCB;
423  // fP = (struct file *)advObjP;
424  struct file localFile, *filp = &localFile;
425  struct dentry localDEntry, *dp = &localDEntry;
426  ENTER(0);
427  flP = (struct file_lock *) lockStructP;
428
429  localFile.f_dentry = &localDEntry;
430  localDEntry.d_inode = (struct inode *)advObjP;
431
432  /* Lock commands can have two different values.  Convert them at
433   * entry to the portability layer so that we only have to check
434   * for one of them.
435   */
436#if !defined(__64BIT__)
437  if (cmd == F_GETLK64) cmd = F_GETLK;
438  if (cmd == F_SETLK64) cmd = F_SETLK;
439  if (cmd == F_SETLKW64) cmd = F_SETLKW;
440#endif
441
442  /* Callers have the option of passing a platform dependent lock structure
443     (struct file_lock *lockStructP) or the generic (cxiFlock_t *flockP). */
444  if (flockP)
445  {
446    flP = &fl; /* Use a local file_lock structure */
447
448    /* If there is a potential for blocking, must malloc the locking structure
449       so it can persist until the lock becomes available (in Retry()). */
450
451    if (cmd == F_SETLKW)
452    {
453#ifdef NFS_CLUSTER_LOCKS
454      len = sizeof(struct file_lock) +
455            sizeof(struct file) +
456            sizeof(struct dentry);
457#else
458      len = sizeof(struct file_lock);
459#endif
460      flP = (struct file_lock*)cxiMallocUnpinned(len);
461      if (flP == NULL)
462      {
463        rc = ENOMEM;
464        goto exit;
465      }
466      cxiMemset(flP, 0, len);
467#ifdef NFS_CLUSTER_LOCKS
468      filp = (struct file*)((char *)flP + sizeof(struct file_lock));
469      dp = (struct dentry *)((char *)filp + sizeof(struct file));
470      filp->f_dentry = dp;
471      dp->d_inode = (struct inode *)advObjP;
472#endif
473    }
474    else
475      cxiMemset(flP, 0, sizeof(*flP));
476
477    locks_init_lock(flP); /* Initialize list_head structs */
478    if (flockP->l_file == NULL)
479      flockP->l_file = filp;
480
481    /* fl_wait needs to be initialized because when unlock happens, the
482       linux routine locks_wake_up_blocks invokes our retry routine via
483       fl_notify and then calls wake_up(fl_wait) on the assumption that
484       the waiter is local. */
485
486    cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait);
487
488    cxiFlockToVFS(flockP, flP);
489  }
490
491  /* daemon didn't know the owner and required kernel code to fill it in. */
492  if (!flP->fl_owner)
493    flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL);
494
495#if 0
496  /* Validate the file pointer.  Kernel locking routines are going to
497     use these without verifying them.  If any of them are NULL, find
498     out now before they generate a segment violation. */
499  if ((!fP) || (!fP->f_dentry) || (!fP->f_dentry->d_inode))
500  {
501    if (cmd == F_GETLK)
502      flP->fl_type = F_UNLCK;
503    else
504      rc = EINVAL;
505    goto exit;
506  }
507#endif
508
509  /* Note that this all depends on us having serialized such locking for
510     this file from before the posix_test_lock() until after the
511     posix_block_lock().  The revoke lock that we hold here provides us
512     the necessary serialization. */
513
514  TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER,
515         "cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X "
516         "range 0x%lX-%lX cmd %s type %s\n",
517         flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
518         (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
519         (flP->fl_type == F_RDLCK) ? "RDLCK" :
520         (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
521
522  if (cmd == F_GETLK)
523  {
524    /* Check for conflicts.  If found, return the information.
525       If there are NO conflicts, return F_UNLCK in fl_type. */
526#if LINUX_KERNEL_VERSION >= 2061700
527    struct file_lock conf;
528    gflP = &conf;
529    rc = posix_test_lock(filp, flP, gflP);
530    if (rc) {
531      rc = 0;
532#else
533    if (NULL != (gflP = posix_test_lock(&localFile, flP))) {
534#endif
535      flP->fl_start = gflP->fl_start;
536      flP->fl_end = gflP->fl_end;
537      flP->fl_type = gflP->fl_type;
538      flP->fl_pid = gflP->fl_pid;
539      flP->fl_owner = gflP->fl_owner;
540    }
541    else
542      flP->fl_type = F_UNLCK;
543
544    TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK,
545           "cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X "
546           "range 0x%lX-%lX type %s\n",
547           flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
548           (flP->fl_type == F_RDLCK) ? "RDLCK" :
549           (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
550  }
551  else
552  { /* Begin: do the locking, but handle the blocking via our retry routine. */
553    /* Test the lock.   What this really does for us is return the blocker
554       if one exists.  This is needed to queue up the request if a conflicting
555       lock is already held. */
556
557#ifdef NFS_CLUSTER_LOCKS
558    if (cmd == F_SETLKW) {
559      flP->fl_flags |= FL_SLEEP;
560      if (!flP->fl_lmops) {
561        flP->fl_lmops = &lm_operations;
562        flP->fl_lmops->fl_notify = (void *)RetryFcn;
563      }
564    }
565    rc = POSIX_LOCK_FILE(filp, flP);
566    if (rc == -EAGAIN && (cmd == F_SETLKW) &&
567        flP->fl_lmops == &lm_operations)
568    {
569      /* Queue the blocker structures */
570      keepLockElement = true;
571      if (retry_idP)
572        *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
573    }
574#else
575#if LINUX_KERNEL_VERSION >= 2061700
576    if ((flP->fl_type == F_UNLCK) || !(posix_test_lock(&localFile, flP, cflP)))
577#else
578    if ((flP->fl_type == F_UNLCK) || !(cflP = posix_test_lock(&localFile, flP)))
579#endif
580    {
581      /* No conflicting lock:  get the lock for the caller. */
582      rc = POSIX_LOCK_FILE(&localFile, flP);
583    }
584    else
585    { /* Conflicting lock:  ..... */
586      rc = EAGAIN;
587
588      if (cmd == F_SETLKW)
589      {
590        /*if (posix_locks_deadlock(flP, cflP))
591        {
592          rc = EDEADLK;
593        }
594        else*/
595        {
596          /* Queue the blocker structures */
597          keepLockElement = true;
598          if (retry_idP)
599            *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
600#if LINUX_KERNEL_VERSION > 2060900
601          flP->fl_lmops = &lm_operations;
602          flP->fl_lmops->fl_notify = RetryFcn;
603#else
604          flP->fl_notify = RetryFcn;
605#endif
606#if LINUX_KERNEL_VERSION < 2061700
607          posix_block_lock(cflP, flP);
608#endif
609        }
610      }
611    }
612#endif
613
614    TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT,
615           "cxiFcntlLock posix_lock_file: rc %d retry_id 0x%lX\n", rc, cflP);
616  } /* End: do the locking, but handle the blocking via our retry routine. */
617
618exit:
619
620  if (flockP)
621  {
622    /* Caller wanted results in flockP */
623    cxiVFSToFlock((void *)flP, flockP);
624
625    /* If we allocated the locking structure and then didn't need to use
626       it (the lock request didn't block), free it. */
627
628    if ((flP!=&fl) && (!keepLockElement)) {
629      cxiFreeUnpinned(flP);
630    }
631  }
632
633#ifdef NFS_CLUSTER_LOCKS
634  if (rc < 0)
635    rc = -rc;  /* make it positive */
636#endif
637  EXIT_RC(0, rc);
638  return rc;
639}
640
641void cxiFcntlUnblock(void *retry_idP)
642{
643  struct file_lock *flP = (struct file_lock *)retry_idP;
644
645  ENTER(0);
646  /* Include some sanity checks on the retry id (file_lock)
647     before passing it into the routine that does the work.
648     It should be properly linked (via its list_head structures)
649     in a file_lock_list that has blocked waiters.  Also,
650     we would only be backing this out from the process that
651     originally blocked, so verify the pid. */
652
653  if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link)  &&
654       flP->fl_next && flP->fl_pid == getpid())
655  {
656    POSIX_UNBLOCK_LOCK(flP);
657  }
658  EXIT(0);
659}
660
661int
662cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid)
663{
664  int rc = 0;
665  struct super_block *sbP = (struct super_block *)vfsP;
666  struct list_head *fllP;
667  struct file_lock *fl;
668  struct dentry *dentryP;
669
670  ENTER(0);
671  lock_kernel();
672
673restart:
674
675#if LINUX_KERNEL_VERSION >= 2061600
676//??? find a different way to clear locks  file_lock_list is not exported anymore
677#else
678  fllP = file_lock_list.next;
679
680  while(fllP != &file_lock_list)
681  {
682    fl = list_entry(fllP, struct file_lock, fl_link);
683    fllP = fllP->next;
684
685    /* If there are mmfs lock structures, release them. */
686
687    if (fl &&
688        fl->fl_file &&
689        fl->fl_file->f_dentry &&
690        fl->fl_file->f_dentry->d_inode)
691    {
692      dentryP = fl->fl_file->f_dentry;
693
694      /* If this lock belongs to the specified vfs, release advisory locks. */
695      if (dentryP->d_sb == sbP)
696      {
697        /* remove all our locks */
698        rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid);
699        if (rc == ENOSYS)
700          goto xerror;
701
702        /* After freeing unknown numbers of locks in gpfsFcntlReset (all
703           locks for the inode), restart from the top of the lock list */
704        goto restart;
705      }
706    }
707  }
708#endif
709
710xerror:
711  unlock_kernel();
712  EXIT_RC(0, rc);
713  return rc;
714}
715
716void *
717cxiGetPrivVfsP(void *vfsP)
718{
719  struct super_block *sbP = (struct super_block *)vfsP;
720
721  /* Do some sanity checking */
722  if ( (sbP->s_magic != GPFS_SUPER_MAGIC) ||
723       ((UIntPtr) SBLOCK_PRIVATE(sbP) < GPFS_KERNEL_OFFSET) )
724    printSuperList(sbP);
725  LOGASSERT( sbP->s_magic == GPFS_SUPER_MAGIC );
726  LOGASSERT( (UIntPtr) SBLOCK_PRIVATE(sbP) >= GPFS_KERNEL_OFFSET );
727
728  return (SBLOCK_PRIVATE(sbP));
729}
730
731
732#ifdef NFS_DEBUG
733/* These flags are defined in the kernel and control various dprintk
734   calls.  This provides us a way to easily turn these on/off for
735   debugging our NFS support. */
736extern unsigned int nlm_debug;
737extern unsigned int nfsd_debug;
738extern unsigned int nfs_debug;
739extern unsigned int rpc_debug;
740#endif
741
742int cxiTrace(cxiTrace_t trace)
743{
744#ifdef NFS_DEBUG
745  int rc = 0;
746
747  ENTER(0);
748  switch (trace)
749  {
750    case cxiTraceNFS:
751      nlm_debug = nfsd_debug = nfs_debug = rpc_debug = ~0;
752      break;
753    case cxiTraceNFSoff:
754      nlm_debug = nfsd_debug = nfs_debug = rpc_debug =  0;
755      break;
756    default:
757      rc = EINVAL;
758      break;
759  }
760  EXIT_RC(0, rc);
761  return rc;
762#else
763  return ENOSYS;
764#endif
765}
766
767void cxiFlockToVFS(eflock_t* lckdatP, void* vP)
768{
769  struct file_lock* flP = (struct file_lock *)vP;
770
771  ENTER(0);
772  if ((flP) && (lckdatP))
773  {
774    flP->fl_pid   = lckdatP->l_pid;
775    flP->fl_owner = lckdatP->l_owner;
776    flP->fl_type  = lckdatP->l_type;
777    flP->fl_start = lckdatP->l_start;
778    flP->fl_flags = FL_POSIX;
779#ifdef NFS_CLUSTER_LOCKS
780    flP->fl_lmops = lckdatP->l_lmops;
781    flP->fl_file  = lckdatP->l_file;
782    flP->fl_ops   = NULL;
783#else
784#if LINUX_KERNEL_VERSION < 2061700
785    if (lckdatP->l_caller == L_CALLER_LOCKD)
786      flP->fl_flags |= FL_LOCKD;
787#endif
788#endif
789    if (lckdatP->l_len == 0)
790      flP->fl_end = FL_OFFSET_MAX;
791    else
792      flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1;
793  }
794  EXIT(0);
795  return;
796}
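
#if 0
/* Illustrative sketch only (not part of the module): the length/end
 * conversion done by cxiFlockToVFS above.  POSIX l_len == 0 means "lock to
 * end of file", which the VFS represents as fl_end == FL_OFFSET_MAX;
 * otherwise fl_end is the last byte covered.  Example: l_start 100,
 * l_len 50 gives fl_end 149.
 */
static Int64 exampleFlockEnd(Int64 start, Int64 len)
{
  return (len == 0) ? FL_OFFSET_MAX : start + len - 1;
}
#endif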
797
798#ifdef NFS_CLUSTER_LOCKS
799int cxiVFSCallback(eflock_t* lckreqP, eflock_t* lckdatP,
800      int(* callback)(void *, void *, int), int result)
801{
802  struct file_lock fl;
803  struct file *fileP;
804  struct file_lock conf, *confP = NULL;
805  int rc;
806
807  ENTER(0);
808
809  cxiFlockToVFS(lckreqP, &fl);
810  fileP = fl.fl_file;
811  if (!fileP) {
812     return -1;
813  }
814  if (lckdatP) {
815     cxiFlockToVFS(lckdatP, &conf);
816     confP = &conf;
817  }
818  if (!result) { /* try to get the posix lock */
819     rc = POSIX_LOCK_FILE(fileP, &fl);
820     if (rc)
821        callback(&fl, NULL, EBUSY);
822     else {      /* got the posix lock */
823        rc = callback(&fl, confP, result);
824        if (rc) {  /* too late, free the lock */
825           fl.fl_type = F_UNLCK;
826           rc = POSIX_LOCK_FILE(fileP, &fl);
827        }
828     }
829  }
830  else
831     rc = callback(&fl, confP, result);
832
833#ifdef NFS_CLUSTER_LOCKS
834  if (rc < 0)
835    rc = -rc;  /* make it positive */
836#endif
837  EXIT_RC(0, rc);
838  return rc;
839}
840#endif
841
842void cxiVFSToFlock(void *vP, eflock_t *lckdatP)
843{
844  struct file_lock* flP = (struct file_lock *)vP;
845
846  ENTER(0);
847  if ((flP) && (lckdatP))
848  {
849    lckdatP->l_pid    = flP->fl_pid;
850    lckdatP->l_owner  = flP->fl_owner;
851    lckdatP->l_type   = flP->fl_type;
852    lckdatP->l_start  = flP->fl_start;
853    lckdatP->l_flags  = flP->fl_flags;
854#ifdef NFS_CLUSTER_LOCKS
855    lckdatP->l_lmops  = flP->fl_lmops;
856    lckdatP->l_file   = flP->fl_file;
857    if (lckdatP->l_lmops) /* must be lockd or nfsd */
858#else
859#if LINUX_KERNEL_VERSION >= 2061700
860    if (lckdatP->l_lmops) /* must be lockd or nfsd */
861#else
862    if (flP->fl_flags & FL_LOCKD)
863#endif
864#endif
865      lckdatP->l_caller = L_CALLER_LOCKD;
866    else
867      lckdatP->l_caller = L_CALLER_NULL;
868    if (flP->fl_end == FL_OFFSET_MAX)
869      lckdatP->l_len = 0;
870    else
871      lckdatP->l_len    = flP->fl_end - flP->fl_start + 1;
872  }
873  EXIT(0);
874  return;
875}
876
877
878/* Sleep for the indicated number of milliseconds */
879void cxiSleep(int ms)
880{
881  ENTER(0);
882  TRACE1(TRACE_VNODE, 9, TRCID_SLEEP,
883         "cxiSleep: begin delay %d\n", ms);
884  current->state = TASK_INTERRUPTIBLE;
885  /* For large HZ rearrange jiffies calculation and
886     use presumably larger word size to minimize overflow risk */
887  if (unlikely(HZ > 1000))
888    schedule_timeout(((long)ms)*HZ/1000);
889  else
890    schedule_timeout(ms/(1000/HZ));
891  TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END,
892         "cxiSleep: end delay %d HZ %d\n", ms, HZ);
893  EXIT(0);
894}
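
#if 0
/* Illustrative sketch only (not part of the module): the HZ arithmetic in
 * cxiSleep above is a milliseconds-to-jiffies conversion.  On kernels that
 * provide msecs_to_jiffies() (2.6), an equivalent body could look like the
 * sketch below; this is an assumption about what is available, not the
 * implementation used here.
 */
static void exampleSleepMs(int ms)
{
  set_current_state(TASK_INTERRUPTIBLE);
  schedule_timeout(msecs_to_jiffies(ms));
}
#endif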
895
896
897void cxiOpenNFS(void *iP)
898{
899  struct inode *inodeP = (struct inode *)iP;
900  int refcount;
901
902  /* A reference is placed on the cxiNode here when the first NFS reference
903     is added */
904  ENTER(0);
905  refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1);
906
907  TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS,
908        "openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX "
909        "refcount %d\n",
910        inodeP, (inodeP) ? inodeP->i_ino : -1,
911        (inodeP) ? inodeP->i_ino : -1,
912        (inodeP) ? inodeP->i_mode : -1,
913        (inodeP) ? inodeP->i_nlink : -1,
914        (inodeP) ? inodeP->PRVINODE : NULL,
915        refcount);
916
917  DBGASSERT(refcount != 0);
918  EXIT(0);
919}
920
921
922int cxiCloseNFS(void *vP, void *viP)
923{
924  int rc = 0;
925  struct inode *iP = (struct inode *)vP;
926
927  /* If viP is NULL, the file was never actually opened.
928     If viP is not NULL, close it. */
929  ENTER(0);
930  if (viP == NULL)
931    rc = 0;
932  else {
933    if (VP_TO_PVP(iP) != NULL && VP_TO_CNP(iP) != NULL) {
934      rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD|FWRITE,
935                                          (struct MMFSVInfo *)viP, true);
936      cxiPutOSNode((void *)iP);
937    }
938  }
939
940  EXIT_RC(0, rc);
941  return rc;
942}
943
944static int cxiNFSCluster = 0;
945
946void cxiSetNFSCluster(int set)
947{
948  cxiNFSCluster = set;
949}
950
951/* To avoid failing the NFS client, the NFSD thread is put to sleep. Another
952   node will take over this client and the operation will continue without any
953   errors to the application.
954*/
955void cxiNFSError(int rc, const char *str)
956{
957  TRACE2(TRACE_VNODE, 9, TRCID_NFS_ERROR,
958         "cxiNFSError: %s got rc %d\n", str, rc);
959  if (cxiNFSCluster && cxiIsNFSThread() && (rc == ESTALE || rc == -ESTALE))
960  {
961    TRACE2(TRACE_VNODE, 1, TRCID_NFS_ERROR_1,
962         "cxiNFSError: NFS got error %d from %s sleep\n", rc, str);
963    cxiSleep(120000); // wait 120 seconds
964  }
965}
966
967void * cxiGetNfsP(void *vP)
968{
969  if (vP && VP_TO_CNP((struct inode *)vP))
970    return VP_TO_NFSP((struct inode *)vP);
971  else
972    return NULL;
973}
974
975void cxiSetNfsP(void *vP, void *newP)
976{
977  if (VP_TO_CNP((struct inode *)vP))
978    VP_TO_NFSP((struct inode *)vP) = newP;
979}
980
981void * cxiGetCnP(void *vP)
982{ return (void *)VP_TO_CNP((struct inode *)vP); }
983
984void * cxiGetPvP(void *vP)
985{ return (void *)VP_TO_PVP((struct inode *)vP); }
986
987void * cxiGNPtoVP(void *vP)
988{ return (void *)GNP_TO_VP((struct cxiNode_t *)vP); }
989
990/* Main routine of kproc */
991static int kprocMain(void *argP)
992{
993  cxiKProcData_t *kpdP = (cxiKProcData_t *)argP;
994
995  /* Change our process name */
996  ENTER(0);
997  current->comm[sizeof(current->comm) - 1] = '\0';
998  strncpy(current->comm, kpdP->nameP, sizeof(current->comm) - 1);
999
1000  /* Change parent of a kernel process so that when it exits, it won't
1001   * send a SIGCHLD signal to the process that created it, and it won't
1002   * be left as a zombie.
1003   */
1004  DAEMONIZE(kpdP->nameP);
1005
1006  /* Call the function specified by startKProc */
1007  kpdP->func(kpdP);
1008  EXIT(0);
1009  return 0;
1010}
1011
1012/* Create a new kernel process */
1013cxiPid_t
1014cxiStartKProc(struct cxiKProcData_t *kpdP)
1015{
1016  cxiPid_t pid = kernel_thread(kprocMain, kpdP, kpdP->kprocFlags);
1017  ENTER(0);
1018  kpdP->pid = pid > 0 ? pid : KPROC_FAILED_PID;
1019
1020  TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX,
1021         "cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid);
1022  EXIT(0);
1023  return kpdP->pid;
1024}
1025
1026void 
1027cxiStopKProc(struct cxiKProcData_t *kpdP)
1028{
1029  cxiPid_t pid;
1030
1031  ENTER(0);
1032  cxiBlockingMutexAcquire(&kpdP->lock);
1033 
1034  TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX,
1035         "cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid);
1036
1037  if (!KPROC_RUNNING(kpdP))
1038  {
1039    cxiBlockingMutexRelease(&kpdP->lock);
1040    EXIT(0);
1041    return;
1042  }
1043
1044  pid = kpdP->pid;              // Cache pid before signal/wait
1045  kpdP->terminate = true;
1046  cxiWaitEventSignal(&kpdP->kprocEvent);
1047
1048  while (kpdP->pid != KPROC_UNASSIGNED_PID)
1049    cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);
1050
1051  cxiBlockingMutexRelease(&kpdP->lock);
1052  EXIT(0);
1053}
1054
1055/*-------------------------------------------------------------------
1056 * logAssertFailed  - Subroutine consolidating logGenIF() and
1057 *                    DoPanic() calls.
1058 *------------------------------------------------------------------*/
1059
1060static char PanicMsgBuf[2048];
1061
1062void cxiPanic(const char* panicStrP)
1063{
1064  printk( GPFS_NOTICE  "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP);
1065  TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP);
1066#ifndef DISABLE_KERNEL_PANIC
1067  BUG();
1068#endif
1069}
1070
1071static void
1072DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode,
1073        Int32 reasonCode, char *dataStr)
1074{
1075  const char *p;
1076  int bytesLeft;
1077
1078  p = cxiStrrchr(filenameP, '/');
1079  if (p == NULL)
1080    p = filenameP;
1081  else
1082    p += 1;
1083
1084  sprintf(PanicMsgBuf, "%s:%d:%d:%d:", p, lineNum, retCode, reasonCode);
1085  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
1086  if (dataStr)
1087  {
1088    strncat(PanicMsgBuf, dataStr, bytesLeft-1);
1089    bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
1090  }
1091  strncat(PanicMsgBuf, ":", bytesLeft-1);
1092  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
1093  if (condP)
1094    strncat(PanicMsgBuf, condP, bytesLeft-1);
1095  cxiPanic(PanicMsgBuf);
1096}
1097
1098#ifdef MODULE
1099void
1100logAssertFailed(UInt32 flags,   /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */
1101                char  *srcFileName,   /* __FILE__ */
1102                UInt32 srcLineNumber, /* __LINE__ */
1103                Int32  retCode,       /* return code value */
1104                Int32  reasonCode,    /* normally errno */
1105                UInt32 logRecTag,     /* tag if have associated error log rec */
1106                char  *dataStr,       /* assert data string */
1107                char  *failingExpr)   /* expression that evaluated to false */
1108{
1109  int i;
1110
1111  printk("GPFS logAssertFailed: %s file %s line %d\n",
1112         failingExpr, srcFileName, srcLineNumber);
1113  ENTER(0);
1114  TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1,
1115         "logAssertFailed: %s retCode %d reasonCode %d\n",
1116         failingExpr, retCode, reasonCode);
1117  TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2,
1118         "logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber);
1119#ifndef GPFS_PRINTF
1120  /* fsync buffered lxtrace records */
1121  trc_fsync();
1122
1123#ifdef STOP_TRACE_ON_FAILURE
1124  /* Turn off tracing right after the failure occurs.  This may only turn
1125     off tracing in the kernel. */
1126  for (i=0 ; i<MAX_TRACE_CLASSES ; i++)
1127    TraceFlagsP[i] = 0;
1128#endif
1129
1130  /* Wait 10 seconds to allow the lxtrace daemon to complete the sync. */
1131  cxiSleep(10000);
1132#endif
1133  gpfs_ops.gpfsDaemonToDie(srcFileName, srcLineNumber, retCode, reasonCode,
1134                           dataStr, failingExpr);
1135
1136  DoPanic(failingExpr, srcFileName, srcLineNumber, retCode, reasonCode,
1137          dataStr);
1138}
1139#else /* !MODULE */
1140void
1141logAssertFailed(UInt32 flags,
1142                char  *srcFileName,
1143                UInt32 srcLineNumber,
1144                Int32  retCode,
1145                Int32  reasonCode,
1146                UInt32 logRecTag,
1147                char  *dataStr,
1148                char  *failingExpr);
1149#endif /* MODULE */
1150
1151
1152typedef struct cxiWaitElement_t
1153{
1154  cxiWaitList_t waitList;  /* previous and next element in chain */
1155
1156  /* Linux would normally organize a wait_queue_head_t with any number
1157   * of wait_queue_t elements.  However, since we're implementing "wakeup
1158   * with return code" we have to ensure the OS wakes up the exact sleeper
1159   * we want.  Thus we have only a one-to-one relationship to ensure the
1160   * OS can only pick our favorite.
1161   */
1162  wait_queue_head_t qhead;
1163  wait_queue_t qwaiter;
1164  int wakeupRC;            /* wakeup return code */
1165
1166} cxiWaitElement_t;
1167
1168
1169#define CXI_WAIT_LIST_ADD(headP, elementP) \
1170   (headP)->prevP->nextP = (elementP); \
1171   (elementP)->prevP = (headP)->prevP; \
1172   (headP)->prevP = (elementP);        \
1173   (elementP)->nextP = (headP);
1174
1175#define CXI_WAIT_LIST_REMOVE(elementP) \
1176   (elementP)->prevP->nextP = (elementP)->nextP; \
1177   (elementP)->nextP->prevP = (elementP)->prevP;
1178
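#if 0
/* Illustrative sketch only (not part of the module): the wait list managed
 * by the macros above is a circular doubly-linked list whose head points at
 * itself when empty (as cxiWaitEventInit below sets it up).  The names
 * exampleHead and exampleElem are invented for illustration.
 */
static void exampleWaitListUsage(void)
{
  cxiWaitList_t exampleHead;
  cxiWaitElement_t exampleElem;

  /* An empty list points back at itself */
  exampleHead.nextP = exampleHead.prevP = &exampleHead;

  CXI_WAIT_LIST_ADD(&exampleHead, &exampleElem.waitList);  /* FIFO append */
  CXI_WAIT_LIST_REMOVE(&exampleElem.waitList);             /* unlink */
}
#endif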
1179
1180/* Initialize abstract wait event with OS specific
1181 * initialization function
1182 */
1183void
1184cxiWaitEventInit(cxiWaitEvent_t *weP)
1185{
1186  spinlock_t *lockP = (spinlock_t *)&weP->lword;
1187
1188  spin_lock_init(lockP);
1189  weP->waitList.nextP = weP->waitList.prevP = &weP->waitList;
1190}
1191
1192Boolean
1193cxiWaitEventHasWaiters(cxiWaitEvent_t *weP)
1194{
1195  unsigned long flags;
1196  spinlock_t *lockP = (spinlock_t *)(weP->lword);
1197  Boolean rc;
1198
1199  SPIN_LOCK_IRQ(lockP, flags);
1200  rc = (weP->waitList.nextP != &weP->waitList);
1201  SPIN_UNLOCK_IRQ(lockP, flags);
1202  return rc;
1203}
1204
1205/* Do not add trace records.  Some callers depend on not being
1206 * interrupted by the trace daemon.
1207 */
1208enum WakeType { wBroadcast, wSignal, wWakeOne };
1209static inline void
1210doWakeup(cxiWaitEvent_t *wEventP, enum WakeType wtype, int wakeupRC)
1211{
1212  unsigned long flags;
1213  spinlock_t *lockP = (spinlock_t *)(wEventP->lword);
1214  cxiWaitList_t *headP;
1215  cxiWaitList_t *tmpP;
1216  cxiWaitElement_t *wP;
1217
1218  SPIN_LOCK_IRQ(lockP, flags);
1219
1220  /* We wake up from the front to the back (FIFO semantics).
1221   * There's only one wait element per wait_queue_head_t so
1222   * record the return code and wake up the one element.
1223   */
1224  headP = &wEventP->waitList;
1225
1226  for (tmpP = headP->nextP; tmpP != headP; tmpP = tmpP->nextP)
1227  {
1228    wP = list_entry(tmpP, cxiWaitElement_t, waitList);
1229    wP->wakeupRC = wakeupRC;
1230
1231    wake_up(&wP->qhead);
1232    if (wtype != wBroadcast)
1233    {
1234      /* The difference between wSignal and wWakeOne is that the latter
1235         guarantees that multiple wake up calls will each pick a different
1236         thread if more than one is waiting.  With wSignal, if a thread is
1237         awakened but hasn't had a chance to run, then subsequent wake up
1238         calls might all wake the same thread.
1239
1240         On AIX, the calling routine (e_wakeup_one) removes the waiter from
1241         the queue, unlike Linux where removal is done by the waiting
1242         thread when it wakes up.  Nothing special has to be done on AIX to
1243         get the nWakeOne style of wakeup.
1244
1245         Note:  This is an inline routine and the wtype argument is a
1246         compile-time constant, so the "if" tests in this routine are done
1247         by the compiler and do not generate any code. */
1248
1249      if (wtype == wWakeOne)
1250      {
1251        /* Move this entry to tail of list so that the next wakeup call will
1252           pick somebody else. */
1253        CXI_WAIT_LIST_REMOVE(tmpP);
1254        CXI_WAIT_LIST_ADD(headP, tmpP);
1255      }
1256      break;
1257    }
1258  }
1259  SPIN_UNLOCK_IRQ(lockP, flags);
1260}
1261
1262int
1263cxiCopyIn(char *from, char *to, unsigned long size)
1264{
1265  /* The daemon needs to bypass access checks since copy to
1266   * shared segment would inadvertently fail.
1267   */
1268  ENTER(0);
1269  if (PROCESS_GROUP(current) == DaemonPGrp)
1270    __copy_from_user(to, from, size);
1271  else 
1272    if (copy_from_user(to, from, size))
1273    {
1274      EXIT_RC(0, EFAULT);
1275      return EFAULT;
1276    }
1277  EXIT(0);
1278  return 0;
1279}
1280
1281int
1282cxiCopyOut(char *from, char *to, unsigned long size)
1283{
1284  int ignore;
1285  /* The daemon needs to bypass access checks since copy to
1286   * shared segment would inadvertently fail.
1287   */
1288  ENTER(0);
1289  if (PROCESS_GROUP(current) == DaemonPGrp)
1290    ignore = __copy_to_user(to, from, size);
1291  else
1292    if (copy_to_user(to, from, size))
1293    {
1294      EXIT_RC(0, EFAULT);
1295      return EFAULT;
1296    }
1297  EXIT(0);
1298  return 0;
1299}
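
#if 0
/* Illustrative sketch only (not part of the module): a hypothetical caller
 * of cxiCopyIn/cxiCopyOut.  It copies a user-space argument block into the
 * kernel, updates it, and copies it back, propagating EFAULT.  The struct
 * and function names are invented for illustration.
 */
struct exampleArgs { int op; long value; };

static int exampleCopyRoundTrip(char *userArgP)
{
  struct exampleArgs args;
  int rc;

  rc = cxiCopyIn(userArgP, (char *)&args, sizeof(args));
  if (rc != 0)
    return rc;                /* EFAULT on a bad user address */

  args.value += 1;            /* some kernel-side processing */

  return cxiCopyOut((char *)&args, userArgP, sizeof(args));
}
#endif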
1300
1301int
1302cxiCopyInstr(char *from, char *to, unsigned long size, unsigned long *len)
1303{
1304  long retval;
1305
1306  ENTER(0);
1307  retval = strncpy_from_user(to, from, size);
1308  if ((retval > 0) && (retval <= size))
1309  {
1310    *len = retval;
1311    EXIT(0);
1312    return 0;
1313  }
1314  *len = 0;
1315  if (retval < 0)
1316    retval = EFAULT;
1317  else
1318    retval = E2BIG;
1319  EXIT_RC(0, retval);
1320  return (int)retval;
1321}
1322
1323long cxiSafeGetLong(long* from)
1324{
1325#if LINUX_KERNEL_VERSION >= 2060000
1326  long tmp;
1327  (void)__get_user_nocheck(tmp, from, sizeof(long));
1328  return tmp;
1329#else
1330  return *from;
1331#endif
1332}
1333
1334int cxiSafeGetInt(int* from)
1335{
1336#if LINUX_KERNEL_VERSION >= 2060000
1337  int tmp;
1338  __get_user_nocheck(tmp, from, sizeof(int));
1339  return tmp;
1340#else
1341  return *from;
1342#endif
1343}
1344
1345void cxiSafePutLong(long val, long* to)
1346{
1347#if LINUX_KERNEL_VERSION >= 2060000
1348  __put_user_nocheck(val, to, sizeof(long));
1349#else
1350  *to = val;
1351#endif
1352}
1353
1354void cxiSafePutInt(int val, int* to)
1355{
1356#if LINUX_KERNEL_VERSION >= 2060000
1357  __put_user_nocheck(val, to, sizeof(int));
1358#else
1359  *to = val;
1360#endif
1361}
1362
1363#ifdef GPFS_ARCH_X86_64
1364/* Check if 64-bit user process */
1365int
1366cxiIS64U(char *addr)
1367{
1368#if LINUX_KERNEL_VERSION > 2060500
1369  return !(test_thread_flag(TIF_IA32));
1370#else
1371  return !(current->thread.flags & THREAD_IA32);
1372#endif
1373}
1374#endif
1375
1376int 
1377socket_aio_dequeue()
1378{
1379  return -1;
1380}
1381
1382/* Transfer data from buffer(s) in user space to or from a buffer in the
1383   kernel. */
1384int
1385cxiUiomove(register char* kBufP,          /* address of kernel buffer */
1386           register unsigned long nBytes, /* #bytes to transfer */
1387           Boolean toKernel,              /* direction of xfer(read/write)*/
1388           register struct cxiUio_t* uioP) /* user area description */
1389{
1390  register struct cxiIovec_t * iovP;
1391  unsigned long cnt;
1392  int rc;
1393#ifdef TRACE_IO_DATA
1394  char* origKBufP = kBufP;
1395  int trcdata[4];
1396#endif
1397  int ignore;
1398
1399  ENTER(0);
1400  TRACE4(TRACE_FOPS, 6, TRCID_CXISYSTEM_037,
1401         "cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n",
1402         kBufP, uioP, nBytes, toKernel);
1403  if (uioP->uio_resid <= 0)
1404  {
1405    EXIT_RC(0, ENOMEM);
1406    return ENOMEM;
1407  }
1408  rc = 0;
1409  if (uioP->uio_iovcnt == 1)
1410  {
1411    /*
1412     * Fastpath for most common case of iovcnt == 1.  Saves a
1413     * few instructions.
1414     */
1415    iovP = uioP->uio_iov;
1416    cnt = iovP->iov_len;
1417    if (cnt <= 0)
1418    {
1419      uioP->uio_iovcnt--;
1420      uioP->uio_iov++;
1421      uioP->uio_iovdcnt++;
1422      EXIT(0);
1423      return 0;
1424    }
1425    if (cnt > nBytes)
1426      cnt = nBytes;
1427
1428    if (toKernel)
1429    {
1430      /* The daemon needs to bypass access checks since copy to
1431       * shared segment would inadvertently fail.  Copies to
1432       * kernel address space also perform no validity check.
1433       */
1434      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
1435        __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
1436      else
1437        if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
1438        {
1439          EXIT_RC(0, EFAULT);
1440          return EFAULT;
1441        }
1442    }
1443    else
1444    {
1446      /* The daemon needs to bypass access checks since copy to
1447       * shared segment would inadvertently fail.  Copies to
1448       * kernel address space also perform no validity check.
1449       */
1450      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
1451        ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
1452      else
1453        if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
1454        {
1455          EXIT_RC(0, EFAULT);
1456          return EFAULT;
1457        }
1458    }
1459
1460    iovP->iov_base = (char *)iovP->iov_base + cnt;
1461    iovP->iov_len -= cnt;
1462    uioP->uio_resid -= cnt;
1463    uioP->uio_offset += cnt;
1464#ifdef TRACE_IO_DATA
1465    if (cnt >= sizeof(trcdata))
1466      memcpy(trcdata, origKBufP, sizeof(trcdata));
1467    else
1468    {
1469      memset(trcdata, 0xAA, sizeof(trcdata));
1470      memcpy(trcdata, origKBufP, cnt);
1471    }
1472    TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_039a,
1473           "uiomove exit 1: rc %d data %08X %08X %08X %08X\n",
1474           rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
1475#else
1476    TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_039,
1477           "uiomove exit 1: rc %d\n",
1478           rc);
1479#endif
1480    EXIT_RC(0, rc);
1481    return rc;
1482  }
1483  while (nBytes > 0 && uioP->uio_resid && rc == 0)
1484  {
1485    if (uioP->uio_iovcnt <= 0)
1486    {
1487      EXIT_RC(0, ENOMEM);
1488      return ENOMEM;
1489    }
1490    iovP = uioP->uio_iov;
1491    cnt = iovP->iov_len;
1492    if (cnt <= 0)
1493    {
1494      uioP->uio_iovcnt--;
1495      uioP->uio_iov++;
1496      uioP->uio_iovdcnt++;
1497      continue;
1498    }
1499    if (cnt > nBytes)
1500      cnt = nBytes;
1501
1502    if (toKernel)
1503    {
1504      /* The daemon needs to bypass access checks since copy to
1505       * shared segment would inadvertently fail.  Copies to
1506       * kernel address space also perform no validity check.
1507       */
1508      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
1509        __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
1510      else 
1511        if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt))
1512        {
1513          EXIT_RC(0, EFAULT);
1514          return EFAULT;
1515        }
1516    }
1517    else
1518    {
1519      /* The daemon needs to bypass access checks since copy to
1520       * shared segment would inadvertently fail.  Copies to
1521       * kernel address space also perform no validity check.
1522       */
1523      if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE)
1524        ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
1525      else 
1526        if (copy_to_user((char *)iovP->iov_base, kBufP, cnt))
1527        {
1528          EXIT_RC(0, EFAULT);
1529          return EFAULT;
1530        }
1531    }
1532    iovP->iov_base = (char *)iovP->iov_base + cnt;
1533    iovP->iov_len -= cnt;
1534    uioP->uio_resid -= cnt;
1535    uioP->uio_offset += cnt;
1536    kBufP += cnt;
1537    nBytes -= cnt;
1538  }
1539#ifdef TRACE_IO_DATA
1540  cnt = kBufP - origKBufP;
1541  if (cnt >= sizeof(trcdata))
1542    memcpy(trcdata, origKBufP, sizeof(trcdata));
1543  else
1544  {
1545    memset(trcdata, 0xAA, sizeof(trcdata));
1546    memcpy(trcdata, origKBufP, cnt);
1547  }
1548  TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_041a,
1549         "uiomove exit 2: rc %d data %08X %08X %08X %08X\n",
1550         rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
1551#else
1552  TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_041,
1553         "uiomove exit 2: rc %d\n",
1554         rc);
1555#endif
1556  EXIT_RC(0, rc);
1557  return rc;
1558}
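
#if 0
/* Illustrative sketch only (not part of the module): how a caller might
 * drive cxiUiomove with a single-iovec description of a user buffer.  Only
 * the cxiUio_t/cxiIovec_t fields referenced by cxiUiomove above are filled
 * in; any other fields of those structures, and the proper uio_segflg value
 * for a user-space buffer, are assumptions.
 */
static int exampleCopyOutToUser(char *kBufP, char *userBufP, unsigned long n)
{
  struct cxiIovec_t iov;
  struct cxiUio_t uio;

  cxiMemset(&iov, 0, sizeof(iov));
  cxiMemset(&uio, 0, sizeof(uio));

  iov.iov_base   = userBufP;
  iov.iov_len    = n;
  uio.uio_iov    = &iov;
  uio.uio_iovcnt = 1;
  uio.uio_resid  = n;
  uio.uio_offset = 0;

  /* toKernel == false: copy from the kernel buffer out to the user iovec */
  return cxiUiomove(kBufP, n, false, &uio);
}
#endif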
1559
1560/*
1561  Try to force some sanity checks at compile time
1562 */
1563/* TO DO: revise this to handle comparisons beyond equality/inequality */
1564/* STATIC_DBGASSERT(sizeof(spinlock_t), SPINLOCK_T_SIZE); */
1565
1566/* A routine to check that the definitions in our cxiTypes.h
1567 * files are equivalent to the system definitions.  The module
1568 * should not load if it receives an error from this routine.
1569 */
1570int
1571cxiCheckTypes()
1572{
1573  int rc = 0;
1574  ENTER(0);
1575
1576  /* Make sure cxiBlockingMutex_t fits in the space provided.  If not,
1577     the implementation of the cxiBlockingMutex... routines needs to
1578     use the embedded space to record a pointer to kmalloc'ed space holding
1579     the semaphore. */
1580  if (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE)
1581  {
1582    printk("cxiCheckTypes: semaphore %ld > GPFS_LINUX_SEM_SIZE %ld\n",
1583           sizeof(struct semaphore), GPFS_LINUX_SEM_SIZE);
1584    rc = 1;
1585  }
1586
1587  /* Size of spinlock_t is smaller for UP case with gcc 3.x, so just
1588     ensure SPINLOCK_T_SIZE is large enough for both the UP and SMP case. */
1589  if (sizeof(spinlock_t) > SPINLOCK_T_SIZE)
1590  {
1591    printk("cxiCheckTypes: spinlock_t %ld > SPINLOCK_T__SIZE %ld\n",
1592           sizeof(spinlock_t), SPINLOCK_T_SIZE);
1593    rc = 2;
1594  }
1595
1596  /* Ensure that size of pid_t matches cxiThreadId (32-bits) */
1597  if (sizeof(pid_t) != sizeof(cxiThreadId))
1598  {
1599    printk("cxiCheckTypes: pid_t %ld != cxiThreadId %ld\n",
1600           sizeof(pid_t), sizeof(cxiThreadId));
1601    rc = 3;
1602  }
1603
1604  if (rc > 0)
1605    TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES,
1606           "cxiCheckTypes: system type mismatch on type number %d!\n", rc);
1607  EXIT_RC(0, rc);
1608  return rc;
1609}
1610
1611/* Routine to get current time of day in nanosecond format.
1612 */
1613int
1614cxiGetTOD(cxiTimeStruc_t *tsP)
1615{
1616#if LINUX_KERNEL_VERSION >= 2060000
1617  struct timespec ts;
1618#else
1619  struct timeval tv;
1620#endif
1621
1622  ENTER(0);
1623#if LINUX_KERNEL_VERSION >= 2060000
1624  ts = CURRENT_TIME;
1625  tsP->tv_sec = ts.tv_sec;
1626  tsP->tv_nsec = ts.tv_nsec;
1627#else
1628  /* This call returns microseconds so we fudge it to nanoseconds */
1629  do_gettimeofday(&tv);
1630  tsP->tv_sec = tv.tv_sec;
1631  tsP->tv_nsec = tv.tv_usec * 1000;
1632#endif
1633
1634  EXIT(0);
1635  return 0;
1636}
1637
1638Boolean
1639cxiIsNFSThread()
1640{
1641# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1642    /* Note comparison against a multibyte character constant (not a string
1643      constant).  Order of characters in word is reversed due to little-
1644      endian representation of integers. */
1645    if (* ((int*)&current->comm[0]) != 0x6473666e) // 'dsfn'
1646       return false;
1647    if (* ((char*)&current->comm[4]) == '\0')
1648       return true;
1649    return (* ((int*)&current->comm[2]) == 0x00346473);  // '4ds'
1650# else
1651    if ((strcmp(current->comm, "nfsd") == 0) ||
1652        (strcmp(current->comm, "nfsd4") == 0))
1653      return true;
1654    return false;
1655# endif
1656}
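
#if 0
/* Illustrative sketch only (not part of the module): how the multibyte
 * character constants above are derived.  On a little-endian machine the
 * first four bytes of current->comm, read as an int, place comm[0] in the
 * low-order byte.  For the string "nfsd":
 *   comm[0]='n'=0x6e  comm[1]='f'=0x66  comm[2]='s'=0x73  comm[3]='d'=0x64
 *   => the int value 0x6473666e checked by cxiIsNFSThread.
 * The function below is the equivalent byte-wise comparison.
 */
static int exampleIsNfsdPrefix(const char *comm)
{
  return comm[0] == 'n' && comm[1] == 'f' && comm[2] == 's' && comm[3] == 'd';
}
#endif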
1657
1658Boolean
1659cxiIsLockdThread()
1660{
1661# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1662    /* Note comparison against a multibyte character constant (not a string
1663      constant).  Order of characters in word is reversed due to little-
1664      endian representation of integers. */
1665    if ((* ((int*)&current->comm[0]) != 0x6b636f6c) |  // 'kcol'
1666        (* ((int*)&current->comm[2]) != 0x00646b63))   // ' dkc'
1667       return false;
1668    return * ((char*)&current->comm[5]) == '\0';
1669# else
1670    return (strcmp(current->comm, "lockd") == 0);
1671# endif
1672}
1673
1674Boolean
1675cxiIsNFS4Thread()
1676{
1677# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1678    /* Note comparison against a multibyte character constant (not a string
1679      constant).  Order of characters in word is reversed due to little-
1680      endian representation of integers. */
1681    if ((* ((int*)&current->comm[0]) != 0x6473666e) |  // 'dsfn'
1682        (* ((int*)&current->comm[2]) != 0x00346473))   // '4ds'
1683       return false;
1684    return * ((char*)&current->comm[5]) == '\0';
1685# else
1686    return (strcmp(current->comm, "nfsd4") == 0);
1687# endif
1688}
1689
1690Boolean
1691cxiIsKupdateThread()
1692{
1693#if LINUX_KERNEL_VERSION >= 2060000
1694  /* In 2.6 pdflush replaced kupdated and bdflush from 2.4 */
1695  return current_is_pdflush();
1696#else
1697  return (strcmp(current->comm, "kupdated") == 0);
1698#endif
1699}
1700
1701#ifdef SMB_LOCKS
1702Boolean
1703cxiIsSambaOrLockdThread()
1704{
1705# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1706    /* Note comparison against a multibyte character constant (not a string
1707      constant).  Order of characters in word is reversed due to little-
1708      endian representation of integers. */
1709    Boolean rc = (((* ((int*)&current->comm[0]) == 0x64626d73) &   // 'dbms'
1710                   (* ((char*)&current->comm[4]) == '\0'))    |
1711                  ((* ((int*)&current->comm[0]) == 0x6b636f6c) &   // 'kcol'
1712                   (* ((int*)&current->comm[2]) == 0x00646b63)));  // 'dkc'
1713       return rc;
1714# else
1715    return ((strcmp(current->comm, "smbd") == 0) |
1716            (strcmp(current->comm, "lockd") == 0));
1717# endif
1718}
1719
1720Boolean
1721cxiIsSambaThread()
1722{
1723# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1724    /* Note comparison against a multibyte character constant (not a string
1725      constant).  Order of characters in word is reversed due to little-
1726      endian representation of integers. */
1727    Boolean rc = ((* ((int*)&current->comm[0]) == 0x64626d73) &  // 'dbms'
1728                  (* ((char*)&current->comm[4]) == '\0'));
1729       return rc;
1730# else
1731    return (strcmp(current->comm, "smbd") == 0);
1732# endif
1733}
1734#endif
1735
1736Boolean
1737cxiIsGPFSThread()
1738{
1739# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1740  return (((* ((int*)&current->comm[0]) == 0x73666d6d) &  // 'sfmm'
1741           (* ((int*)&current->comm[2]) == 0x00647366))); // 'dsf'
1742# else
1743  return (strcmp(current->comm, "mmfsd") == 0);
1744# endif
1745}
1746
1747Boolean
1748cxiIsKswapdThread()
1749{
1750#if LINUX_KERNEL_VERSION > 2060000
1751  /* On 2.6, there may be multiple kswapd processes, named kswapd0, kswapd1,
1752   * etc.  We don't have to depend on the process name to identify kswapd
1753   * processes on 2.6 though, there's a better way. */
1754  return current_is_kswapd();
1755#else
1756# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
1757  return ((* ((int*)&current->comm[0]) == 0x6177736b) &  // 'awsk'
1758          (* ((int*)&current->comm[3]) == 0x00647061));  // ' dpa'
1759# else
1760  return (strcmp(current->comm, "kswapd") == 0);
1761# endif
1762#endif
1763}
1764
1765#ifdef INSTRUMENT_LOCKS
1766void InitBlockingMutexStats()
1767{
1768  memset(BlockingMutexStatsTable, 0, sizeof(BlockingMutexStatsTable));
1769}
1770#endif
1771
1772/* Initialize a cxiBlockingMutex_t.  Instead of the DBGASSERT, this routine
1773   should kmalloc a struct semaphore if bmSem is too small.  */
1774void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx)
1775{
1776  ENTER(0);
1777  DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE);
1778#ifdef INSTRUMENT_LOCKS
1779  DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES);
1780#endif  /* INSTRUMENT_LOCKS */
1781
1782  TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT,
1783         "cxiBlockingMutexInit: mP 0x%lX idx %d\n",
1784         mP, bmNameIdx);
1785  init_MUTEX((struct semaphore *)mP->bmSem);
1786  mP->bmOwnerP = NULL;
1787  mP->lockNameIndex = bmNameIdx;
1788  EXIT(0);
1789}
1790
1791
1792/* Enter critical section, blocking this thread if necessary.  Mark this
1793   thread as the owner of the mutex before returning. */
1794void 
1795REGPARMS cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP)
1796{
1797  ENTER(1);
1798  TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ,
1799         "cxiBlockingMutexAcquire: about to acquire 0x%lX type %d "
1800         "current 0x%lX currentOwner 0x%lX\n",
1801         mP, mP->lockNameIndex, current, mP->bmOwnerP);
1802
1803  DBGASSERTRC(mP->bmOwnerP != (char *)current, 
1804              PTR_TO_INT32(mP->bmOwnerP), PTR_TO_INT32(mP), 0);
1805
1806#ifdef INSTRUMENT_LOCKS
1807  BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires += 1;
1808  if (mP->bmOwnerP != NULL)
1809    BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts += 1;
1810#endif
1811
1812  down((struct semaphore *)mP->bmSem);
1813  mP->bmOwnerP = (char *)current;
1814
1815  TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT,
1816         "cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP);
1817  EXIT(1);
1818}
1819
1820
1821/* Leave critical section and awaken waiting threads */
1822void 
1823REGPARMS cxiBlockingMutexRelease(cxiBlockingMutex_t* mP)
1824{
1825  ENTER(1);
1826  TRACE4(TRACE_KLOCKL, 9, TRCID_BM_REL,
1827         "cxiBlockingMutexRelease: about to release 0x%lX type %d "
1828         "current 0x%lX currentOwner 0x%lX\n",
1829         mP, mP->lockNameIndex,current, mP->bmOwnerP);
1830
1831  if (mP->bmOwnerP == (char *)current)
1832  {
1833     mP->bmOwnerP = NULL;
1834     up((struct semaphore *)mP->bmSem);
1835  }
1836  EXIT(1);
1837}
1838
1839/* Free resources associated with this cxiBlockingMutex_t in preparation
1840   for freeing the storage it occupies */
1841void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP)
1842{
1843  ENTER(0);
1844  TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM,
1845         "cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex);
1846
1847  /* Verify that mutex is not held */
1848  DBGASSERT(mP->bmOwnerP == NULL);
1849  DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1);
1850  EXIT(0);
1851}
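
#if 0
/* Illustrative sketch only (not part of the module): the typical life cycle
 * of a cxiBlockingMutex_t using the routines above.  The name index 0 is a
 * placeholder; real callers pass an index below MAX_GPFS_LOCK_NAMES.
 */
static void exampleMutexLifeCycle(void)
{
  cxiBlockingMutex_t exampleMutex;

  cxiBlockingMutexInit(&exampleMutex, 0 /* placeholder name index */);

  cxiBlockingMutexAcquire(&exampleMutex);
  /* ... critical section protected by exampleMutex ... */
  cxiBlockingMutexRelease(&exampleMutex);

  cxiBlockingMutexTerm(&exampleMutex);
}
#endif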
1852
1853
1854/* Return true if a cxiBlockingMutex_t is held by the calling process */
1855Boolean
1856cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP)
1857{
1858  Boolean result;
1859  char* ownerP;
1860  cxiPid_t ownerPid;
1861
1862  /* Cache bmOwnerP in case it changes to NULL */
1863  ENTER(0);
1864  ownerP = mP->bmOwnerP;
1865  if (ownerP == NULL)
1866    result = false;
1867  else
1868  {
1869    cxiThreadPtrToThreadId(ownerP, &ownerPid);
1870    result = (current->pid == ownerPid);
1871  }
1872  TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017,
1873         "cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n",
1874         ownerP, result);
1875  EXIT_RC(0, result);
1876  return result;
1877}
1878
1879
1880/* Return true if a cxiBlockingMutex_t has one or more processes waiting
1881   on it */
1882Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP)
1883{
1884  struct semaphore * semP = (struct semaphore *)mP->bmSem;
1885  Boolean result;
1886
1887  ENTER(0);
1888  if ((void*)semP->wait.task_list.next != (void*)&semP->wait.task_list.next)
1889    result = true;
1890  else
1891    result = false;
1892  TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018,
1893         "cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n",
1894         mP, result);
1895  EXIT_RC(0, result);
1896  return result;
1897}
1898
1899
1900/* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or
1901   cxiWaitEventBroadcastRC.  Drop the associated cxiBlockingMutex_t
1902   *mutexP while waiting, and reacquire it before returning.
1903   If INTERRUPTIBLE is set in waitFlags, waits interruptibly;
1904   otherwise, waits uninterruptibly.
1905     Returns THREAD_INTERRUPTED if interrupted before being woken up,
1906   THREAD_AWAKENED, if woken up by cxiWaitEventSignal or
1907   cxiWaitEventBroadcast, or the result value passed to
1908   cxiWaitEventBroadcastRC, if woken up by cxiWaitEventBroadcastRC. */
1909int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP,
1910                     int waitFlags)
1911{
1912  spinlock_t *lockP = (spinlock_t *)(weP->lword);
1913  unsigned long flags;
1914  cxiWaitElement_t waitElement;
1915  int count = 0;
1916  Boolean done;
1917
1918  ENTER(0);
1919  TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER,
1920         "cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release "
1921         "mutex 0x%lX \n", weP, waitFlags, mutexP);
1922
1923  /* Verify that caller is holding the mutex */
1924  DBGASSERTRC(mutexP->bmOwnerP == (char *)current, 
1925              PTR_TO_INT32(mutexP->bmOwnerP), PTR_TO_INT32(mutexP), 0);
1926
1927  /* initialize our wait element */
1928  init_waitqueue_head(&waitElement.qhead);
1929  init_waitqueue_entry(&waitElement.qwaiter, current);
1930  __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter);
1931  waitElement.wakeupRC = 0;
1932
1933  /* update our task state to not running any more */
1934  if (waitFlags & INTERRUPTIBLE)
1935    current->state = TASK_INTERRUPTIBLE;
1936  else
1937    current->state = TASK_UNINTERRUPTIBLE;
1938
1939  /* add our wait element to the end of the wait list */
1940  SPIN_LOCK_IRQ(lockP, flags);
1941
1942  CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList);
1943
1944  SPIN_UNLOCK_IRQ(lockP, flags);
1945
1946  /* Release the mutex.  Note: calling cxiBlockingMutexRelease here is
1947     problematic, because it makes trace calls, which may block the current
1948     process, which would overwrite the task state (current->state) we just
1949     updated.  A way around this would be to move our task state update to
1950     after the call to cxiBlockingMutexRelease, but then, before calling
1951     schedule(), we would have to re-acquire the wait-list lock and check
1952     wakeupRC to see whether somebody has already woken us up since we
1953     released the mutex.  Since there is a trace at the top of this routine,
1954     we don't need the one in cxiBlockingMutexRelease; hence, just do the
1955     release right here. */
1956  mutexP->bmOwnerP = NULL;
1957  up((struct semaphore *)mutexP->bmSem);
1958
1959again:
1960  /* call the scheduler */
1961  schedule();
1962
1963  /* Remove ourselves from the wait list ... except:
1964     Even though we may enter uninterruptible sleep, this sleep can in
1965     fact be interrupted in at least two scenarios:
1966     1) page_alloc code may call wakeup_kswapd().  This should be
1967        a very rare event with the current code, since we make an effort
1968        to avoid blocking kswapd.
1969     2) While signals are supposed to be ignored during uninterruptible
1970        sleep, it turns out that some signals, e.g. SIGSEGV and SIGBUS,
1971        cause us to wake up.  It doesn't look like the signal has been
1972        delivered yet, but sleep is interrupted.  The signal will be
1973        delivered later (probably when exiting kernel).
1974     Our callers can't handle unexpected return from uninterruptible
1975     sleep.  In either of the two cases above, it should be safe to go
1976     back to sleep and wait to be woken up properly.
1977   */
1978  SPIN_LOCK_IRQ(lockP, flags);
1979
1980  if (waitElement.wakeupRC == 0 &&
1981      !(waitFlags & INTERRUPTIBLE))
1982  {
1983    TRACE3N(TRACE_KLOCKL, 1, TRCID_CXISYSTEM_EVENT_WAIT_INTERRUPTED,
1984            "cxiWaitEventWait: interrupted weP 0x%lX mutexP 0x%lX rc %d\n",
1985            weP, mutexP, waitElement.wakeupRC);
1986    current->state = TASK_UNINTERRUPTIBLE;
1987    done = false;
1988  }
1989  else
1990  {
1991    CXI_WAIT_LIST_REMOVE(&waitElement.waitList);
1992    done = true;
1993  }
1994
1995  SPIN_UNLOCK_IRQ(lockP, flags);
1996
1997  if (!done)
1998    goto again;
1999
2000  /* re-acquire the mutex */
2001  cxiBlockingMutexAcquire(mutexP);
2002
2003  TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT,
2004         "cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n",
2005         weP, mutexP, waitElement.wakeupRC);
2006
2007  /* A zero wakeup code means we were interrupted rather than woken up */
2008  EXIT(0);
2009  if (waitElement.wakeupRC != 0)
2010    return waitElement.wakeupRC;
2011  else
2012    return THREAD_INTERRUPTED;
2013}
2014
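/* Illustrative sketch (not part of the original source): the usual pairing
   of a cxiBlockingMutex_t with a cxiWaitEvent_t implied by the
   cxiWaitEventWait contract above, in the style of a condition variable.
   The names exampleMutex, exampleEvent, exampleReady and both routines are
   hypothetical and shown only to document the calling pattern; assume the
   mutex and event have already been initialized elsewhere. */
#if 0   /* documentation-only example */
static cxiBlockingMutex_t exampleMutex;
static cxiWaitEvent_t exampleEvent;
static Boolean exampleReady = false;

static int exampleWaiter(void)
{
  int rc = THREAD_AWAKENED;

  cxiBlockingMutexAcquire(&exampleMutex);
  while (!exampleReady)
  {
    /* Drops exampleMutex while sleeping and reacquires it before returning */
    rc = cxiWaitEventWait(&exampleEvent, &exampleMutex, INTERRUPTIBLE);
    if (rc == THREAD_INTERRUPTED)
      break;               /* interrupted by a signal, not by a poster */
  }
  cxiBlockingMutexRelease(&exampleMutex);
  return rc;
}

static void examplePoster(void)
{
  cxiBlockingMutexAcquire(&exampleMutex);
  exampleReady = true;
  cxiWaitEventSignal(&exampleEvent);   /* wake at most one waiter */
  cxiBlockingMutexRelease(&exampleMutex);
}
#endif
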
2015/* Wake up one thread waiting on this cxiWaitEvent_t.  Must not sleep */
2016void
2017cxiWaitEventSignal(cxiWaitEvent_t* weP)
2018{
2019  /* ENTER(0); */
2020  TRACE1N(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL,
2021         "cxiWaitEventSignal: weP 0x%lX\n", weP);
2022
2023  doWakeup(weP, wSignal, THREAD_AWAKENED); /* wake up one */
2024  /* EXIT(0); */
2025}
2026
2027
2028/* Wake up one thread waiting on this cxiWaitEvent_t.  This is the same as
2029   cxiWaitEventSignal(), except this routine guarantees that multiple wake
2030   up calls will each pick a different thread if more than one is waiting. */
2031void
2032cxiWaitEventWakeupOne(cxiWaitEvent_t* weP)
2033{
2034  ENTER(0);
2035  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE,
2036         "cxiWaitEventWakeupOne: weP 0x%lX\n", weP);
2037
2038  doWakeup(weP, wWakeOne, THREAD_AWAKENED); /* wake up one */
2039  EXIT(0);
2040}
2041
2042
2043/* Wake up all threads waiting on this cxiWaitEvent_t */
2044void
2045cxiWaitEventBroadcast(cxiWaitEvent_t* weP)
2046{
2047  ENTER(0);
2048  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST,
2049         "cxiWaitEventBroadcast: weP 0x%lX\n", weP);
2050
2051  doWakeup(weP, wBroadcast, THREAD_AWAKENED); /* wake up all */
2052  EXIT(0);
2053}
2054
2055
2056/* Wake up all threads waiting on this cxiWaitEvent_t and cause them to
2057   return rc from their cxiWaitEventWait calls. */
2058void
2059cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc)
2060{
2061  ENTER(0);
2062  TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC,
2063         "cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc);
2064
2065  doWakeup(weP, wBroadcast, rc);  /* wake up all */
2066  EXIT_RC(0, rc);
2067}
2068
2069/* alloc big memory area */
2070void *
2071cxiBigMalloc(int size)
2072{
2073  void *ptr;
2074
2075  ENTER(0);
2076  ptr = vmalloc(size);
2077
2078#ifdef MALLOC_DEBUG
2079  MallocDebugNew(ptr, size, 2);
2080#endif
2081
2082  EXIT(0);
2083  return ptr;
2084}
2085
2086/* free big memory area */
2087void
2088cxiBigFree(char *ptr)
2089{
2090  ENTER(0);
2091#ifdef MALLOC_DEBUG
2092  MallocDebugDelete(ptr);
2093#endif
2094
2095  EXIT(0);
2096  vfree(ptr);
2097}
2098
2099#ifdef SMB_LOCKS
2100/* Determine if current process has this file open */
2101void *
2102cxiCheckOpen(struct cxiNode_t* cnP)
2103{
2104  int count;
2105  int i;
2106  struct file** fdList;
2107  struct file*  fileP;
2108  struct inode* inodeP;
2109
2110  ENTER(0);
2111#if LINUX_KERNEL_VERSION >= 2061300
2112  count = current->files->fdt->max_fds;
2113  fdList = current->files->fdt->fd;
2114#else
2115  count = current->files->max_fds;
2116  fdList = current->files->fd;
2117#endif
2118  inodeP = GNP_TO_VP(cnP);
2119
2120  TRACE3(TRACE_VNODE,9,TRCID_CXICHECKOPEN_ENTRY,
2121         "cxiCheckOpen: entry.  %d files in fd list. Checking for inode %d "
2122         "at 0x%x", count, inodeP->i_ino, inodeP);
2123
2124  for (i=0; i<count; i++)
2125  {
2126    fileP = fdList[i];
2127
2128    if (fileP)
2129    {
2130      if (fdList[i]->f_dentry->d_inode == inodeP)
2131      {
2132        TRACE1(TRACE_VNODE, 9,TRCID_CXICHECKOPEN_FOUND,
2133               "cxiCheckOpen: found open file. vinfoP 0x%x",
2134               fileP->private_data);
2135        EXIT(0);
2136        return fileP->private_data;
2137      }
2138    }
2139  }
2140
2141  EXIT(0);
2142  return NULL;
2143}
2144
2145int cxiBreakOplock(void *breakArgP, int oplockNew)
2146{
2147  /* On Linux, we use its kernel oplock support.  The get_lease()
2148   * call is the operation to revoke conflicting leases.
2149   */
2150  int rc;
2151  ENTER(0);
2152
2153  /* O_NONBLOCK: prevents the thread from waiting for the lease return.
2154   * In the case of a Samba thread, we only want to get EWOULDBLOCK
2155   * back if the conflict is held within Samba itself. If a wait is
2156   * needed, breakSMBOplock will invoke cxiWaitForBreak.
2157   */
2158
2159  /* Linux op to revoke conflicting leases */
2160  rc = abs(REVOKE_LEASE((struct inode *)breakArgP, 
2161                       (cxiIsSambaThread()? 0: O_NONBLOCK) |
2162                       ((oplockNew==smbOplockShared)? FMODE_READ: FMODE_WRITE)));
2163
2164  TRACE3(TRACE_VNODE, 4,TRCID_CXIBREAKOPLOCK,
2165         "cxiBreakOplock: exit rc %d inode 0x%lX oplock %d\n",
2166          rc, breakArgP, oplockNew);
2167
2168  EXIT(0);
2169  return rc;
2170}
2171
2172DECLARE_WAIT_QUEUE_HEAD(oplock_break_queue);
2173
2174/* No initialization required on Linux */
2175int cxiInitBreakQ() { return 0; }
2176
2177/* No termination processing required on Linux */
2178int cxiTermBreakQ() { return 0; }
2179
2180/* Send the notification that the oplock break completed */
2181int cxiSendBreakMsg(void *ofP)
2182{
2183  ENTER(0);
2184  /* There is only one oplock_break_queue, and no means to pass the ofP back to
2185   * the waiters.  This will wake all of them up and they will recheck their
2186   * oplock states and wait again if necessary (with a timeout).
2187   */
2188   wake_up_interruptible(&oplock_break_queue);
2189
2190  TRACE1(TRACE_SMB, 3, TRCID_SEND_BREAK, "cxiSendBreakMsg: ofP 0x%lX\n", ofP);
2191  EXIT(0);
2192  return 0;
2193}
2194
2195/* Suspend the caller until either the oplock break completes, or the timeout
2196 * is reached.
2197 */
2198int cxiWaitForBreak(void *fileArgP, int oplockCurrent, int timeoutSeconds)
2199{
2200  DECLARE_WAITQUEUE(wait, current);
2201  signed long timeout;
2202
2203  ENTER(0);
2204  TRACE3(TRACE_SMB, 5, TRCID_BREAKWAIT,
2205         "cxiWaitForBreak: file 0x%lX, oplockCurrent %d timeoutSeconds %d\n",
2206         fileArgP, oplockCurrent, timeoutSeconds);
2207
2208  add_wait_queue(&oplock_break_queue, &wait);
2209  timeout = timeoutSeconds * HZ;
2210  while (timeout > 0) {
2211    set_current_state(TASK_INTERRUPTIBLE);
2212    /* Check whether the oplock has been released or downgraded */
2213    if (gpfs_ops.SMBGetOplockState(fileArgP) < oplockCurrent)
2214      break;
2215    timeout = schedule_timeout(timeout);
2216  }
2217  set_current_state(TASK_RUNNING);
2218  remove_wait_queue(&oplock_break_queue, &wait);
2219
2220  TRACE0(TRACE_SMB, 5, TRCID_BREAKWAIT_EXIT, 
2221         "cxiWaitForBreak exit\n");
2222
2223  EXIT(0);
2224  return 0;
2225}
2226#endif
2227
2228
2229/* Get the address of the first byte not addressable by user processes */
2230UIntPtr cxiGetKernelBoundary()
2231{
2232  return GPFS_KERNEL_OFFSET;
2233}
2234
2235
2236/* Return true if this process holds the big kernel lock (BKL) */
2237Boolean cxiHoldsBKL()
2238{
2239  return current->lock_depth >= 0;
2240}
2241
2242
2243/* Tell the OS that this thread is involved in handling VM page-out
2244   requests and should not be blocked waiting for page allocation.
2245   Return true if successful. */
2246Boolean cxiSetPageoutThread()
2247{
2248  if (current->flags & PF_MEMALLOC)
2249    return false;
2250  current->flags |= PF_MEMALLOC;
2251  return true;
2252}
2253
2254
2255/* Tell the OS that this thread is no longer involved in handling VM
2256   page-out requests. */
2257void cxiClearPageoutThread()
2258{
2259  current->flags &= ~PF_MEMALLOC;
2260}
2261
2262
2263/* Yield the CPU to allow other processes to run */
2264void
2265cxiYield()
2266{
2267  ENTER(0);
2268  schedule();
2269  EXIT(0);
2270}
2271
2272/* Linux filldir has changed signatures depending on kernel level.
2273 * We always pass a 64bit offset from the GPFS layer.
2274 */
2275int
2276cxiFillDir(void *vargP, const char *nameP, int namelen, 
2277           offset_t offset, ino_t ino)
2278{
2279  int result;
2280  cxiFillDirArg_t *fillDirArgP = (cxiFillDirArg_t *)vargP;
2281  filldir_t fnP = (filldir_t)fillDirArgP->fnP;
2282  ENTER(0);
2283
2284  result = (*fnP)(fillDirArgP->argP, nameP, namelen,
2285                  (loff_t)offset, ino, 0 /* DT_UNKNOWN */);
2286  EXIT_RC(0, result);
2287  return result;
2288}
2289
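/* Illustrative sketch (not part of the original source): how a readdir
   wrapper might package the kernel's filldir callback into the
   cxiFillDirArg_t consumed by cxiFillDir above.  Only the fnP/argP fields
   are taken from the code above; the routine name, the void* storage of
   fnP, and the way the argument is later consumed are assumptions. */
#if 0   /* documentation-only example */
static void exampleReaddirSetup(filldir_t filldir, void *dirent,
                                cxiFillDirArg_t *fillArgP)
{
  fillArgP->fnP  = (void *)filldir;   /* kernel callback invoked by cxiFillDir */
  fillArgP->argP = dirent;            /* opaque cookie handed back to filldir */
  /* fillArgP is then passed to the platform-independent directory-scan code,
     which calls cxiFillDir(fillArgP, name, namelen, offset, ino) once per
     directory entry. */
}
#endif
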
2290#ifdef DISK_LEASE_DMS
2291
2292static struct timer_list DMSTimer[MAX_DMS_INDEX];
2293static int (*DMSgetNIOsInProgressP)(int);
2294
2295#define PANIC_FOR_REAL 1
2296
2297static void cxiDMSExpired(unsigned long data)
2298{
2299  int idx = data;
2300  int nIOs = DMSgetNIOsInProgressP(idx);
2301  /* ENTER(0); */
2302  /* This code is executed on the interrupt level -- can't use tracing */
2303  printk("GPFS Deadman Switch timer [%d] has expired; IOs in progress: %d\n",
2304         idx, nIOs);
2305#ifdef PANIC_FOR_REAL
2306  if (nIOs != 0)
2307    panic("GPFS Deadman Switch timer has expired, and there are still"
2308          " %d outstanding I/O requests\n", nIOs);
2309#endif
2310}
2311
2312/*
2313  Start dead man switch, with the timeout specified by the delay
2314  argument (in seconds).
2315*/
2316void cxiStartDMS(int idx, int delay, int (*funcP)(int))
2317{
2318  unsigned long njiffies = delay * HZ;
2319
2320  /* Only allow the daemon or other root users to make this kernel call */
2321  if (!cxiIsSuperUser())
2322    return;
2323  ENTER(0);
2324
2325  /* There can be only one timer active at any given moment */
2326  if (timer_pending(&DMSTimer[idx]))
2327    del_timer(&DMSTimer[idx]);
2328
2329  init_timer(&DMSTimer[idx]);
2330  DMSTimer[idx].expires = jiffies + njiffies;
2331  DMSTimer[idx].function = cxiDMSExpired;
2332  DMSTimer[idx].data = idx;
2333  /* save the pointer to nIOsInProgress to a static var */
2334  DMSgetNIOsInProgressP = funcP;
2335  add_timer(&DMSTimer[idx]);
2336  TRACE3(TRACE_DLEASE, 2, TRCID_DMS_STARTED,
2337         "DMS timer [%d] started, delay %d, time %d\n",
2338         idx, delay, jiffies/HZ);
2339  EXIT(0);
2340}
2341
2342void cxiStopDMS(int idx)
2343{
2344  /* Only allow the daemon or other root users to make this kernel call */
2345  if (!cxiIsSuperUser())
2346    return;
2347  ENTER(0);
2348
2349  if (timer_pending(&DMSTimer[idx]))
2350    del_timer(&DMSTimer[idx]);
2351  TRACE2(TRACE_DLEASE, 2, TRCID_DMS_STOPPED,
2352         "DMS timer [%d] stopped, time %d\n", idx, jiffies/HZ);
2353  EXIT(0);
2354}
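
/* Illustrative sketch (not part of the original source): the intended
   arm/re-arm/disarm pattern for the dead man switch above.  The index,
   the 60-second lease period and exampleIOsInProgress() are hypothetical;
   only the cxiStartDMS/cxiStopDMS signatures come from the code above. */
#if 0   /* documentation-only example */
static int exampleIOsInProgress(int idx)
{
  return 0;   /* would report how many disk I/Os are still outstanding */
}

static void exampleLeaseLoop(void)
{
  int idx = 0;

  /* Panic if the lease is not renewed within 60 seconds while I/O is
     still in flight; each successful renewal simply re-arms the timer. */
  cxiStartDMS(idx, 60, exampleIOsInProgress);
  /* ... renew the disk lease; call cxiStartDMS again after each renewal ... */
  cxiStopDMS(idx);   /* clean shutdown: disarm the timer */
}
#endif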
2355
2356/* dummy init routine.  Since on Linux the timer is
2357   stored in static memory, there's nothing to be done
2358*/
2359int cxiInitDMS(void)
2360{
2361  return 0;
2362}
2363
2364void cxiShutdownDMS(void)
2365{
2366  int i;
2367
2368  ENTER(0);
2369  for (i = 0; i < MAX_DMS_INDEX; i++)
2370    cxiStopDMS(i);
2371  EXIT(0);
2372}
2373
2374#endif /* DISK_LEASE_DMS */
2375
2376void cxiSetBit(unsigned long *flagP, int flag_bit)
2377{
2378   set_bit(flag_bit,flagP);
2379}
2380void cxiClearBit(unsigned long *flagP, int flag_bit)
2381{
2382   clear_bit(flag_bit,flagP);
2383}
2384Boolean cxiTestBit(unsigned long *flagP, int flag_bit)
2385{
2386   return test_bit(flag_bit,flagP);
2387}
2388
2389/* In order to set up our termination callback routine (gpfs_f_cleanup)
2390 * we create a dummy file and add it to our file table.  Then, upon
2391 * process termination, the release file operation will be called in
2392 * order to close the file.  The only operation we define for this
2393 * dummy file is release (gpfs_f_cleanup).
2394 */
2395int
2396cxiRegisterCleanup()
2397{
2398  int code = 0, rc = 0;
2399  struct inode *iP = NULL;
2400  struct file *fileP = NULL;
2401  struct dentry *dentryP = NULL;
2402  extern int cleanupFD;
2403  extern struct super_block *shutdownSuperP;
2404
2405  /* We record the daemon's process group because certain
2406   * checks on cxiCopyIn/cxiCopyOut are bypassed for the daemon.
2407   */
2408  ENTER(0);
2409  DaemonPGrp = PROCESS_GROUP(current);
2410
2411  /* Make sure we only create one file */
2412  if (cleanupFD)
2413  {
2414    EXIT_RC(0, EEXIST);
2415    return EEXIST;
2416  }
2417
2418  DBGASSERT(shutdownSuperP != NULL);
2419
2420  /* Allocate an inode struct */
2421  iP = NEW_INODE(shutdownSuperP);
2422  if (!iP)
2423  {
2424    code = 1;
2425    rc = ENOMEM;
2426    goto xerror;
2427  }
2428  iP->i_mode = S_IFREG;
2429
2430  /* Allocate an available file descriptor */
2431  cleanupFD = get_unused_fd();
2432  if (cleanupFD < 0)
2433  {
2434    code = 2;
2435    rc = ENFILE;
2436    goto xerror;
2437  }
2438
2439  /* Allocate a file struct */
2440  fileP = get_empty_filp();
2441  if (!fileP)
2442  {
2443    code = 3;
2444    rc = ENFILE;
2445    goto xerror;
2446  }
2447
2448  /* Allocate a dentry struct */
2449  dentryP = dget(d_alloc_root(iP));
2450  if (!dentryP)
2451  {
2452    code = 4;
2453    rc = ENOMEM;
2454    goto xerror;
2455  }
2456
2457  /* Initialize and chain our file structure */
2458  fileP->f_dentry = dentryP;
2459  fileP->f_op     = &gpfs_cleanup_fops;
2460  fileP->f_flags  = O_RDONLY;
2461  atomic_set(&fileP->f_count, 1);
2462
2463  /* Just chain it on the current root mount.  When
2464   * the file is closed its fput() will decrement
2465   * the mount count (hence the mntget here)
2466   */
2467  fileP->f_vfsmnt = mntget(current->fs->rootmnt);
2468
2469  /* Install the descriptor so it gets "closed" upon our termination */
2470  fd_install(cleanupFD, fileP);
2471
2472  /* Set FD_CLOEXEC so that forked processes (like mmfsup.scr) do not
2473   * inherit this descriptor.  We want the cleanup routine to be run
2474   * when the last mmfsd process terminates.
2475   */
2476#if LINUX_KERNEL_VERSION >= 2061300
2477  FD_SET(cleanupFD, current->files->fdt->close_on_exec);
2478#else
2479  FD_SET(cleanupFD, current->files->close_on_exec);
2480#endif
2481  /* Once the descriptor for this dummy file is added to our file table,
2482   * it is inherited by all the processes of the daemon.  As each
2483   * terminates, the files->count is decremented and on the last process
2484   * termination all the descriptors will be closed by filp_close.
2485   *
2486   * The one catch here is that our file table is inherited by the
2487   * kernel threads we start as well as user processes.  This would
2488   * cause a problem in that daemon termination does not include these
2489   * kernel threads which aren't killed until restart (and therefore
2490   * the file is never closed).  In order for our operation to be
2491   * driven at daemon termination, we must remove the file table from
2492   * these kernel threads.  This is done in via cxiReparent() by
2493   * the mmap pager kproc.
2494   */
2495
2496xerror:
2497  TRACE4(TRACE_VNODE, 1, TRCID_CXIREGISTERCLEANUP_EXIT,
2498         "cxiRegisterCleanup: fd %d iP %X rc %d code %d\n", 
2499         cleanupFD, iP, rc, code);
2500
2501  if (rc)
2502  {
2503    if (dentryP)
2504      dput(dentryP);
2505
2506    if (cleanupFD)
2507      put_unused_fd(cleanupFD);
2508
2509    if (fileP)
2510#if LINUX_KERNEL_VERSION > 2060900
2511      fput(fileP);
2512#else
2513      put_filp(fileP);
2514#endif
2515
2516    if (iP)
2517      iput(iP);
2518
2519    cleanupFD = 0;
2520  }
2521
2522  EXIT_RC(0, rc);
2523  return rc;
2524}
2525
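/* Illustrative sketch (not part of the original source): given the comment
   above cxiRegisterCleanup, the gpfs_cleanup_fops referenced there
   presumably defines only the release operation, so that gpfs_f_cleanup
   runs when the daemon's last process closes the dummy descriptor.  The
   exact declaration and the gpfs_f_cleanup prototype (the standard release
   signature) are assumptions. */
#if 0   /* documentation-only example */
extern int gpfs_f_cleanup(struct inode *iP, struct file *fP);

struct file_operations gpfs_cleanup_fops =
{
  .release = gpfs_f_cleanup,   /* invoked on the final close at daemon exit */
};
#endif
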
2526#ifdef NFS4_ACL
2527/* Linux routines to be called when processing NFSv4 audit/alarm ACL entries */
2528int cxiAuditWrite(int numargs, ...) { return ENOSYS; }
2529#endif /* NFS4_ACL */
2530
2531/* Currently no OS specific VFS initialization for Linux */
2532int 
2533cxiInitVFS(int vfsType)
2534{
2535  return 0;
2536}
2537
2538UIntPtr
2539cxiGetKernelStackSize()
2540{
2541  return (UIntPtr)THREAD_SIZE;
2542}
2543
2544#if defined(DMAPI) || (SANERGY)
2545
2546void cxiPathRel(void *ndP)
2547{
2548  DBGASSERT( ndP != NULL); 
2549  path_release( (struct nameidata *) ndP);
2550  cxiFreeUnpinned(ndP);
2551}
2552
2553int
2554cxiPathToVfsP(void **privVfsPP, char *kpathname, void **ndPP, void **cnPP, 
2555              Boolean traverseLink)
2556{
2557   struct gpfsVfsData_t *privVfsP = NULL;
2558   struct nameidata *ndP;
2559   struct inode * iP;
2560   cxiNode_t *cnP;
2561   int rc = 0;
2562   Boolean rel = false;
2563   int code = 0;
2564   *ndPP = NULL;
2565   *privVfsPP = NULL;
2566
2567   ENTER(0);
2568   if (kpathname == NULL)
2569   {
2570     code = 1;
2571     rc = EINVAL;
2572     goto xerror;
2573   }
2574
2575   ndP = (struct nameidata *)cxiMallocUnpinned(sizeof(struct nameidata));
2576   if (ndP == NULL)
2577   {
2578     code = 2;
2579     rc = ENOMEM;
2580     goto xerror;
2581   }
2582
2583   /* For DMAPI, this is called by dm_path_to_handle or dm_path_to_fshandle.
2584    * According to the DMAPI documentation, we should return the symbolic link
2585    * itself rather than the object that the link references,
2586    * so here we use the function that does not traverse the link. */
2587   if (!traverseLink)
2588     rc = user_path_walk_link(kpathname, ndP);
2589   else
2590     rc = user_path_walk(kpathname, ndP);
2591
2592   if (rc)
2593   {
2594     rc = -rc;
2595     code = 3;
2596     goto xerror;
2597   }
2598 
2599   rel = true;
2600   iP = ndP->dentry->d_inode;
2601   DBGASSERT(iP != NULL);
2602   if (!GPFS_TYPE(iP))
2603   {
2604     code = 4;
2605     rc = EINVAL;
2606     goto xerror;
2607   }
2608   
2609   privVfsP = VP_TO_PVP(iP);
2610
2611   if (privVfsP == NULL)
2612   {
2613     code = 5;
2614     rc = ENOENT;
2615   }
2616  cnP = VP_TO_CNP(iP);
2617  *privVfsPP = (void *)privVfsP;
2618  *ndPP = (void *)ndP;
2619  if (cnPP != NULL)
2620    *cnPP = (void *)cnP;
2621
2622xerror:
2623  if (rc && ndP)
2624  {
2625    if (rel)
2626     cxiPathRel(ndP);
2627    else
2628     cxiFreeUnpinned(ndP);
2629  }
2630  EXIT_RC(0, rc);
2631  return rc;
2632}
2633
2634void
2635cxiSetCred(void *eCredPP)
2636{
2637  ext_cred_t *eCredP = (ext_cred_t *)eCredPP;
2638  setCred(eCredP);
2639}
2640
2641#endif /* DMAPI or SANERGY */
2642
2643
2644#ifdef KSTACK_CHECK
2645/* Kernel stack checking: for each active thread that is making
2646   subroutine calls in the kernel, allocate a stack_history_t.  Within
2647   each stack_history_t, create a frame_desc_t for each level of
2648   subroutine call.  Two lists of frame_desc_t's are maintained: one for
2649   the current call stack, and one for the deepest call stack seen so
2650   far for this thread.  Upon exit from the lowest-level routine, check
2651   whether the maximum stack depth threshhold has been exceeded.  If it
2652   has, print the traceback of the maximum stack usage.  Keep hashes of
2653   the tracebacks printed to avoid printing the same traceback more than
2654   once.  Since cxiTraceExit is not called for every routine exit,
2655   maintenance of call chains is not exact; a routine entry with
2656   stackUsed less than the current entry implies return of the previous
2657   routine.
2658
2659   Note that these routines cannot call any other routine that has
2660   ENTER/EXIT macros inside of it, to avoid recursion. */
2661
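/* Illustrative sketch (not part of the original source): the ENTER/EXIT
   macros used throughout this file are defined elsewhere; judging from the
   cxiTraceEntry/cxiTraceExit/cxiTraceExitRC signatures below they
   presumably expand along these lines when entry/exit tracing or stack
   checking is enabled.  The exact definitions are assumptions, shown only
   to make the entry/exit bookkeeping easier to follow. */
#if 0   /* documentation-only example */
#define ENTER(level) \
  cxiTraceEntry((level), __FUNCTION__, __FILE__, __LINE__)
#define EXIT(level) \
  cxiTraceExit((level), __FUNCTION__)
#define EXIT_RC(level, rc) \
  cxiTraceExitRC((level), __FUNCTION__, (rc))
#endif
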
2662/* Maximum total kernel stack usage before it is considered large enough
2663   to complain about */
2664#define STACK_LIMIT_WARNING (THREAD_SIZE - (THREAD_SIZE/3) )
2665
2666/* Description of one level of a call stack */
2667typedef struct frame_desc
2668{
2669  /* Function name and file name containing the function */
2670  const char * fdFuncNameP;
2671  const char * fdFileNameP;
2672
2673  /* Pointer to frame_desc of caller, or NULL if this is the first
2674     frame.  Also used to link free frame descriptors together on the
2675     shFreeHeadP free list. */
2676  struct frame_desc * fdCallerP;
2677
2678  /* Line number near the beginning of fdFuncNameP */
2679  int fdLineNum;
2680
2681  /* Total stack usage up to and including this routine */
2682  int fdStackUsed;
2683
2684  /* Reference count for this frame_desc_t.  Can be 2 if this descriptor
2685     is reachable from both shCurrentP and shMaxP. */
2686  int fdRef;
2687} frame_desc_t;
2688
2689
2690/* Each stack_history is only used by one thread, so no locking is
2691   needed within a stack_history.  This is allocated as a single page.
2692 */
2693typedef struct stack_history
2694{
2695  /* ID of thread to which this stack_history_t belongs */
2696  cxiThreadId shThreadId;
2697
2698  /* Bucket index in historyHash that points to this stack_history_t,
2699     or -1 if this stack_history_t is on an overflow list */
2700  int shBucketNum;
2701
2702  /* Next stack_history_t in same hash overflow list or on free list */
2703  struct stack_history * shNextP;
2704
2705  /* Pointer to the frame descriptor for the routine that most recently
2706     called fdEnter without a matching fdExit.  Following the fdCallerP
2707     pointers through these frame descriptors gives the current callback
2708     chain. */
2709  frame_desc_t * shCurrentP;
2710
2711  /* Pointer to the frame descriptor that had the maximum stack usage
2712     seen thus far for this thread.  Following the fdCallerP pointers
2713     through these frame descriptors gives the callback chain with
2714     maximal stack usage. */
2715  frame_desc_t * shMaxP;
2716
2717  /* Head of list of free frame_desc_t's */
2718  frame_desc_t * shFreeHeadP;
2719
2720  /* Area that holds frame_desc_t's.  These will be linked together and
2721     put on the list shFreeHeadP. */
2722#define SH_PREFIX_LEN (sizeof(cxiThreadId) +                            \
2723                       sizeof(int) +                                    \
2724                       sizeof(struct stack_history *) +                 \
2725                       3*sizeof(frame_desc_t *))
2726#define SH_NFRAMES ((PAGE_SIZE-SH_PREFIX_LEN)/sizeof(frame_desc_t))
2727  frame_desc_t shFrames[SH_NFRAMES];
2728} stack_history_t;
2729
2730/* Global structures */
2731struct
2732{
2733  /* Global flag controlling whether kernel stack checking is enabled.
2734     Initially false; set true during kernel module initialization,
2735     then set false again during kernel module termination. */
2736  Boolean shActive;
2737
2738  /* Mutex protecting updates to the variables that follow.  This cannot
2739     be a cxiBlockMutex_t because then the stack checking routines would
2740     get called recursively. */
2741  struct semaphore shMutex;
2742
2743  /* List of free stack_history_t's and count of how many free entries
2744     there are.  Excess stack_history_t's beyond a threshold are freed
2745     back to the operating system. */
2746  stack_history_t * freeHeadP;
2747  int nFree;
2748#define MAX_FREE_STACK_HISTORIES 16
2749
2750  /* Hash table of active stack_history_t's.  To find the entry for a
2751     particular thread, hash its thread id to a bucket.  If any of the
2752     entries in bucket[] match the desired thread id, the pointer to
2753     the stack_history_t can be returned without acquiring any locks.  If
2754     the bucket does not contain the desired thread id, look for it on
2755     the overflow list under protection of shMutex. */
2756#define HISTORY_HASH_SIZE 64
2757#define HISTS_PER_BUCKET 3
2758  struct
2759  {
2760    struct
2761    {
2762      stack_history_t * historyP;
2763      cxiThreadId threadId;
2764    } bucket[HISTS_PER_BUCKET];
2765    stack_history_t * overflowP;
2766  } historyHash[HISTORY_HASH_SIZE];
2767
2768  /* List of hash values for tracebacks that have already been printed.
2769     Used to avoid printing the same traceback more than once.  Nothing
2770     is ever deleted from this table, so to find an entry start
2771     searching at its hash value and continue until the entry is found
2772     or an empty slot is encountered.  The total occupancy of the table
2773     is limited to MAX_TRACEBACKS to restrict the amount of searching
2774     that will be required, and to guarantee that searches will
2775     terminate. */
2776#define TB_HASH_SIZE 64
2777#define MAX_TRACEBACKS 32
2778  unsigned int tracebackHash[TB_HASH_SIZE];
2779  int nTracebackHashEntries;
2780} SHG;
2781
2782
2783/* Private version of DBGASSERT used only within stack checking code.
2784   Cannot use DBGASSERT without risking recursion. */
2785#ifdef DBGASSERTS
2786#define SH_ASSERT(_ex)                                                       \
2787  if (!(_ex)) {                                                              \
2788    printk("GPFS stack checking assert failed: " # _ex " file %s line %d\n", \
2789           __FILE__, __LINE__);                                              \
2790    DoPanic(# _ex, __FILE__, __LINE__, 0, 0, "");                            \
2791  } else ((void)0)
2792#else
2793#define SH_ASSERT(_ex) ((void)0)
2794#endif
2795
2796
2797/* Initialize and enable stack depth checking */
2798void shInit()
2799{
2800  /* Clear stack checking globals */
2801  cxiMemset(&SHG, 0, sizeof(SHG));
2802
2803  /* Init mutex */
2804  init_MUTEX(&SHG.shMutex);
2805
2806  /* Turn on stack depth checking and make sure the change is visible */
2807  SHG.shActive = true;
2808  wmb();
2809}
2810
2811
2812/* Turn off stack depth checking and free all allocated memory.  This does
2813   not have to return the global state to what it was when the module was
2814   first loaded, since it will not be used again. */
2815void shTerm()
2816{
2817  int h;
2818  int b;
2819  stack_history_t * shP;
2820  stack_history_t * shNextP;
2821
2822  /* Turn off stack depth checking and make sure the change is visible */
2823  SHG.shActive = false;
2824  wmb();
2825
2826  /* Get and then release mutex.  This insures that a thread that is
2827     in the middle of writing a traceback finishes writing it before
2828     we free the data structures it was using. */
2829  /* ?? although there could be another thread waiting for the mutex ... */
2830  down(&SHG.shMutex);
2831  up(&SHG.shMutex);
2832
2833  /* Wait briefly to allow threads in the middle of the stack checking
2834     code to finish what they are doing */
2835  /* ?? Of course, this is not really safe, but this is debugging code,
2836     right? */
2837  schedule_timeout(HZ/2);
2838
2839  /* Terminate mutex */
2840  // nothing to do
2841
2842  /* Free all stack_history_t's on the free list */
2843  shP = SHG.freeHeadP;
2844  while (shP != NULL)
2845  {
2846    shNextP = shP->shNextP;
2847    kfree(shP);
2848    shP = shNextP;
2849  }
2850
2851  /* Free all stack_history_t's in the hash table */
2852  for (h=0 ; h<HISTORY_HASH_SIZE ; h++)
2853  {
2854    for (b=0 ; b<HISTS_PER_BUCKET ; b++)
2855      if (SHG.historyHash[h].bucket[b].historyP != NULL)
2856        kfree(SHG.historyHash[h].bucket[b].historyP);
2857    shP = SHG.historyHash[h].overflowP;
2858    while (shP != NULL)
2859    {
2860      shNextP = shP->shNextP;
2861      kfree(shP);
2862      shP = shNextP;
2863    }
2864  }
2865}
2866
2867
2868/* Allocate and initialize a new stack_history_t */
2869static stack_history_t * shAllocInit()
2870{
2871  stack_history_t * shP;
2872  int f;
2873
2874  up(&SHG.shMutex);
2875  shP = (stack_history_t *) kmalloc(sizeof(stack_history_t), GFP_KERNEL);
2876  SH_ASSERT(shP != NULL);
2877  down(&SHG.shMutex);
2878  cxiMemset(shP, 0, sizeof(stack_history_t));
2879  for (f=0 ; f<=SH_NFRAMES-2 ; f++)
2880    shP->shFrames[f].fdCallerP = &shP->shFrames[f+1];
2881  shP->shFreeHeadP = &shP->shFrames[0];
2882  return shP;
2883}
2884
2885
2886/* Get a stack_history_t off the free list or build a new one */
2887static stack_history_t * shGet()
2888{
2889  stack_history_t * shP;
2890
2891  /* Use free list if one is available there */
2892  shP = SHG.freeHeadP;
2893  if (shP != NULL)
2894  {
2895    SHG.freeHeadP = shP->shNextP;
2896    SHG.nFree -= 1;
2897    return shP;
2898  }
2899
2900  /* Make a new one if necessary */
2901  return shAllocInit();
2902}
2903
2904
2905/* Free a stack_history_t.  Put it on the free list if there are not
2906   already too many free, or else free it back to the operating system.
2907 */
2908static void shPut(stack_history_t * shP)
2909{
2910  int h;
2911  int b;
2912  stack_history_t ** shPrevPP;
2913  stack_history_t * p;
2914
2915  /* Both call stacks should be empty */
2916  SH_ASSERT(shP->shCurrentP == NULL);
2917  SH_ASSERT(shP->shMaxP == NULL);
2918
2919  /* Must hold mutex while changing the hash table */
2920  down(&SHG.shMutex);
2921
2922  /* Clear pointer to this stack_history_t from the hash table */
2923  h = ((int)shP->shThreadId) & (HISTORY_HASH_SIZE-1);
2924  b = shP->shBucketNum;
2925  if (b != -1)
2926  {
2927    SH_ASSERT(SHG.historyHash[h].bucket[b].historyP == shP);
2928    SHG.historyHash[h].bucket[b].historyP = NULL;
2929    SHG.historyHash[h].bucket[b].threadId = 0;
2930  }
2931  else
2932  {
2933    shPrevPP = &SHG.historyHash[h].overflowP;
2934    p = *shPrevPP;
2935    while (p != NULL)
2936    {
2937      if (p == shP)
2938      {
2939        *shPrevPP = shP->shNextP;
2940        break;
2941      }
2942      shPrevPP = &p->shNextP;
2943      p = *shPrevPP;
2944    }
2945  }
2946
2947  /* If not too many already free, add to free list */
2948  if (SHG.nFree < MAX_FREE_STACK_HISTORIES)
2949  {
2950    shP->shNextP = SHG.freeHeadP;
2951    SHG.freeHeadP = shP;
2952    SHG.nFree += 1;
2953    up(&SHG.shMutex);
2954    return;
2955  }
2956
2957  /* Otherwise, really free it */
2958  up(&SHG.shMutex);
2959  kfree(shP);
2960}
2961
2962
2963/* Find the stack_history_t for the current thread, or allocate one if
2964   one does not already exist */
2965static stack_history_t * shFind()
2966{
2967  stack_history_t * shP;
2968  cxiThreadId id = current->pid;
2969  int h = ((int)id) & (HISTORY_HASH_SIZE-1);
2970  int b;
2971
2972  /* Look at all entries within the bucket given by the hash of the
2973     thread ID.  No locking needs to be done for this search. */
2974  for (b=0 ; b<HISTS_PER_BUCKET ; b++)
2975    if (SHG.historyHash[h].bucket[b].threadId == id)
2976      return SHG.historyHash[h].bucket[b].historyP;
2977
2978  /* Must hold mutex while changing the hash table */
2979  down(&SHG.shMutex);
2980
2981  /* Search the overflow list */
2982  shP = SHG.historyHash[h].overflowP;
2983  while (shP != NULL)
2984  {
2985    if (shP->shThreadId == id)
2986      goto exit;
2987    shP = shP->shNextP;
2988  }
2989
2990  /* No stack_history_t for this thread yet.  Get one off the free list
2991     or build one. */
2992  shP = shGet();
2993  shP->shThreadId = id;
2994  shP->shNextP = NULL;
2995
2996  /* Find a slot for the new stack_history_t in the hash table */
2997  for (b=0 ; b<HISTS_PER_BUCKET ; b++)
2998    if (SHG.historyHash[h].bucket[b].historyP == NULL)
2999    {
3000      SHG.historyHash[h].bucket[b].historyP = shP;
3001      SHG.historyHash[h].bucket[b].threadId = id;
3002      shP->shBucketNum = b;
3003      goto exit;
3004    }
3005
3006  /* No slots available; add new stack_history_t to overflow list */
3007  shP->shBucketNum = -1;
3008  shP->shNextP = SHG.historyHash[h].overflowP;
3009  SHG.historyHash[h].overflowP = shP;
3010
3011exit:
3012  /* Release mutex before returning */
3013  up(&SHG.shMutex);
3014  return shP;
3015}
3016
3017
3018/* Allocate a frame descriptor within the given stack_history_t.  This
3019   cannot be allowed to fail, so if there are no more free descriptors,
3020   throw away the bottom frame descriptor and return that.  The reference
3021   count of the frame descriptor that is returned is undefined. */
3022static frame_desc_t * fdGet(stack_history_t * shP)
3023{
3024  frame_desc_t * fdP;
3025  frame_desc_t ** fdPrevPP;
3026  int prevRef;
3027
3028  /* Look on the free list within the stack_history_t */
3029  fdP = shP->shFreeHeadP;
3030  if (fdP != NULL)
3031  {
3032    shP->shFreeHeadP = fdP->fdCallerP;
3033    return fdP;
3034  }
3035
3036  /* No free descriptors; first try stealing one off the bottom of the
3037     current call stack */
3038  fdP = shP->shCurrentP;
3039  if (fdP != NULL)
3040  {
3041    /* Find the bottom entry of the current call stack */
3042    fdPrevPP = &shP->shCurrentP;
3043    prevRef = 1;
3044    while (fdP->fdCallerP != NULL)
3045    {
3046      fdPrevPP = &fdP->fdCallerP;
3047      prevRef = fdP->fdRef;
3048      fdP = *fdPrevPP;
3049    }
3050
3051    /* Remove the bottom entry of the current call stack */
3052    *fdPrevPP = NULL;
3053
3054    /* Reduce the reference count on the entry just removed.  The
3055       reference count decreases by the reference count of the frame
3056       that used to point to *fdP.  If *fdP is no longer referenced, no
3057       further work is needed.  If *fdP is still referenced from the max
3058       depth stack (it must be the bottom entry), we will eventually
3059       return it, but only after removing it from the bottom of the max
3060       depth stack.  We know that fdP will be returned, but we have to
3061       search through the max depth stack to find the pointer to *fdP.
3062     */
3063    fdP->fdRef -= prevRef;
3064    if (fdP->fdRef == 0)
3065      return fdP;
3066  }
3067
3068  /* Still no free descriptors; steal the frame descriptor off the
3069     bottom of the maximum depth call stack */
3070  fdP = shP->shMaxP;
3071  if (fdP != NULL)
3072  {
3073    /* Find the bottom entry of the max depth call stack */
3074    fdPrevPP = &shP->shMaxP;
3075    while (fdP->fdCallerP != NULL)
3076    {
3077      fdPrevPP = &fdP->fdCallerP;
3078      fdP = *fdPrevPP;
3079    }
3080
3081    /* Remove the bottom entry of the max depth call stack */
3082    *fdPrevPP = NULL;
3083
3084    /* The bottom entry of the max depth call stack that was just
3085       removed must have a reference count of one; otherwise it would
3086       still be on the current call stack and removing the bottom entry
3087       of that stack would have reduced the reference count of some
3088       frame descriptor from 2 to 0. */
3089    SH_ASSERT(fdP->fdRef == 1);
3090    return fdP;
3091  }
3092  SH_ASSERT(!"cannot alloc frame_desc_t");
3093  return NULL;
3094}
3095
3096
3097/* Decrease the reference count on a frame descriptor.  If it becomes
3098   zero, return it to the free list */
3099static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP)
3100//inline static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP)
3101{
3102  if (fdP->fdRef > 1)
3103  {
3104    fdP->fdRef -= 1;
3105    TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD1,
3106           "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 1\n",
3107           fdP, shP, fdP->fdFuncNameP);
3108    return;
3109  }
3110
3111  fdP->fdCallerP = shP->shFreeHeadP;
3112  shP->shFreeHeadP = fdP;
3113  TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD2,
3114         "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 0\n",
3115         fdP, shP, fdP->fdFuncNameP);
3116}
3117
3118
3119/* If the maximum stack depth exceeds the threshold, print its
3120   traceback if it has not already been printed.  Reset the maximum
3121   depth stack to empty.  Only called when the current stack is already
3122   empty. */
3123static void shDisplay(stack_history_t * shP)
3124{
3125  frame_desc_t * fdP;
3126  unsigned int tbHash;
3127  frame_desc_t * fdNextP;
3128  int slot;
3129
3130  SH_ASSERT(shP->shCurrentP == NULL);
3131
3132  /* If the maximum stack depth is less than the threshold, just free
3133     the call chain and return */
3134  fdP = shP->shMaxP;
3135  if (fdP == NULL  ||
3136      fdP->fdStackUsed < STACK_LIMIT_WARNING)
3137    goto exit;
3138
3139  /* Compute a hash of the traceback call chain */
3140  tbHash = 0;
3141  while (fdP != NULL)
3142  {
3143    tbHash <<= 1;
3144    tbHash ^= (((unsigned int)fdP->fdStackUsed) << 15) ^ fdP->fdLineNum;
3145    fdP = fdP->fdCallerP;
3146  }
3147
3148  /* Search for the hash of the call chain in the table of tracebacks that
3149     have already been printed.  Searching the hash table can be done without
3150     any locks, since entries are never deleted.  The loop must eventually
3151     terminate, since the table will not be allowed to fill up. */
3152search:
3153  slot = tbHash % TB_HASH_SIZE;
3154  while (SHG.tracebackHash[slot] != 0)
3155  {
3156    if (SHG.tracebackHash[slot] == tbHash)
3157      /* This traceback has already been printed */
3158      goto exit;
3159    slot = (slot+1) % TB_HASH_SIZE;
3160  }
3161
3162  /* The hash of the current max depth traceback was not found in the
3163     table and should be inserted at position 'slot'.  Do this under
3164     protection of the mutex.  If 'slot' has been used by the time we
3165     get the mutex, drop the mutex and repeat the search. */
3166  down(&SHG.shMutex);
3167  if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS)
3168    goto exitMutexHeld;
3169  if (SHG.tracebackHash[slot] != 0)
3170  {
3171    up(&SHG.shMutex);
3172    goto search;
3173  }
3174  SHG.tracebackHash[slot] = tbHash;
3175  SHG.nTracebackHashEntries += 1;
3176
3177  /* Print the traceback */
3178  fdP = shP->shMaxP;
3179  printk("\nGPFS kernel stack for process %d(%s) used %d bytes\n",
3180         current->pid, current->comm, fdP->fdStackUsed);
3181  printk("  stack function\n");
3182  printk("   used\n");
3183  printk("  ----- -----------------------------------------------------\n");
3184  while (fdP != NULL)
3185  {
3186    printk("  %5d %s at %s:%d\n",
3187           fdP->fdStackUsed, fdP->fdFuncNameP, fdP->fdFileNameP, fdP->fdLineNum);
3188    fdP = fdP->fdCallerP;
3189  }
3190  printk("  traceback signature %08X\n", tbHash);
3191
3192  /* If the maximum number of allowed tracebacks has been reached, turn
3193     off further stack checking. */
3194  if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS)
3195  {
3196    printk("Maximum number of GPFS deep stack tracebacks reached\n");
3197    printk("GPFS stack checking disabled\n");
3198    SHG.shActive = false;
3199    wmb();
3200  }
3201
3202exitMutexHeld:
3203  up(&SHG.shMutex);
3204
3205exit:
3206  /* Free all stack frame descriptors for the max depth call chain back
3207     to the internal free list. */
3208  fdP = shP->shMaxP;
3209  while (fdP != NULL)
3210  {
3211    SH_ASSERT(fdP->fdRef == 1);
3212    fdNextP = fdP->fdCallerP;
3213    fdP->fdCallerP = shP->shFreeHeadP;
3214    shP->shFreeHeadP = fdP;
3215    fdP = fdNextP;
3216  }
3217  shP->shMaxP = NULL;
3218}
3219
3220
3221/* Process routine entry */
3222static void fdEntry(frame_desc_t * fdP, stack_history_t * shP)
3223{
3224  frame_desc_t * popP;
3225  frame_desc_t * p;
3226
3227  TRACE5(TRACE_ENTRYEXIT, 11, TRCID_FDENTRY,
3228         "fdEntry: fdP 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX used %d\n",
3229         fdP, shP, fdP->fdFuncNameP, shP->shCurrentP, fdP->fdStackUsed);
3230
3231  /* If this is the first call by this thread, set up the two call chains */
3232  if (shP->shCurrentP == NULL)
3233  {
3234    SH_ASSERT(shP->shMaxP == NULL);
3235    shP->shCurrentP = fdP;
3236    shP->shMaxP = fdP;
3237    fdP->fdCallerP = NULL;
3238    fdP->fdRef = 2;
3239    return;
3240  }
3241  else
3242    SH_ASSERT(shP->shMaxP != NULL);
3243
3244  /* Process routine exits implied by the number of bytes of stack that
3245     are currently in use.  The test needs to be for strict less than
3246     because inlined routines share the same stack frame as their
3247     caller, but both routines will do entry/exit processing. */
3248  popP = shP->shCurrentP;
3249  while (fdP->fdStackUsed < popP->fdStackUsed)
3250  {
3251    p = popP->fdCallerP;
3252    shP->shCurrentP = p;
3253    TRACE1(TRACE_ENTRYEXIT, 11, TRCID_IMPLIED_EXIT,
3254           "fdEntry: implied exit from rtn %s\n",
3255           popP->fdFuncNameP);
3256    fdDiscard(popP, shP);
3257    if (p == NULL)
3258    {
3259      /* The outermost routine returned before this call without calling
3260         fdExit.  Test for a large maximum stack, then reset the
3261         maximum. */
3262      shDisplay(shP);
3263
3264      /* The current routine is the one and only */
3265      shP->shCurrentP = fdP;
3266      shP->shMaxP = fdP;
3267      fdP->fdCallerP = NULL;
3268      fdP->fdRef = 2;
3269      return;
3270    }
3271    popP = p;
3272  }
3273
3274  /* If this is an extension of the current max depth stack, just add
3275     this routine to the top of both stacks */
3276  if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed  &&
3277      shP->shCurrentP == shP->shMaxP)
3278  {
3279    fdP->fdCallerP = shP->shCurrentP;
3280    shP->shCurrentP = fdP;
3281    shP->shMaxP = fdP;
3282    fdP->fdRef = 2;
3283    TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX_EXTEND,
3284           "fdEntry: extending new max stack %d fdP 0x%lX\n",
3285           fdP->fdStackUsed, fdP);
3286    return;
3287  }
3288
3289  /* Make this new routine be the top of the stack */
3290  fdP->fdCallerP = shP->shCurrentP;
3291  shP->shCurrentP = fdP;
3292  fdP->fdRef = 1;
3293
3294  /* If this new routine has a greater stack depth than the previous max,
3295     unreference the previous max depth call chain and add additional
3296     references to the current one. */
3297  if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed)
3298  {
3299    popP = shP->shMaxP;
3300    do
3301    {
3302      p = popP->fdCallerP;
3303      fdDiscard(popP, shP);
3304      popP = p;
3305    } while (popP != NULL);
3306    p = fdP;
3307    do
3308    {
3309      p->fdRef = 2;
3310      p = p->fdCallerP;
3311    } while (p != NULL);
3312    TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX,
3313           "fdEntry: new max stack %d fdP 0x%lX\n",
3314           fdP->fdStackUsed, fdP);
3315    shP->shMaxP = fdP;
3316  }
3317}
3318
3319
3320/* Process routine exit */
3321static void fdExit(const char * funcnameP)
3322{
3323  stack_history_t * shP;
3324  frame_desc_t * lastPopP;
3325  frame_desc_t * popP;
3326  frame_desc_t * p;
3327
3328  /* Locate or create stack_history_t for this thread */
3329  shP = shFind();
3330
3331  /* If call stack is already empty, there is nothing to do except free
3332     the stack_history_t */
3333  if (shP->shCurrentP == NULL)
3334  {
3335    SH_ASSERT(shP->shMaxP == NULL);
3336    shPut(shP);
3337    return;
3338  }
3339
3340  /* Search backward on the call stack for a routine name that matches
3341     the one being exited.  In C++, the ENTER/EXIT macros will pass the
3342     same string constant (same address) to fdEntry and fdExit.  The C
3343     versions of the macros may pass two different copies of the same
3344     string.  This loop cannot pop routines it skips off the stack, since
3345     the routine might never be found. */
3346  p = shP->shCurrentP;
3347  for (;;)
3348  {
3349    if (p->fdFuncNameP == funcnameP  ||
3350        cxiStrcmp(p->fdFuncNameP, funcnameP) == 0)
3351    {
3352      TRACE4(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT,
3353             "fdExit: p 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX\n",
3354             p, shP, p->fdFuncNameP, shP->shCurrentP);
3355      lastPopP = p;
3356      break;
3357    }
3358    p = p->fdCallerP;
3359    if (p == NULL)
3360    {
3361      /* Routine name not found.  Do not pop stack. */
3362      /* printk("No entry found when exiting %s\n", funcnameP); */
3363      TRACE1(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT_NOTFOUND,
3364             "No entry found when exiting %s\n", funcnameP);
3365      return;
3366    }
3367  }
3368
3369  /* Pop all routines up to and including lastPopP */
3370  p = shP->shCurrentP;
3371  do
3372  {
3373    popP = p;
3374    p = popP->fdCallerP;
3375    fdDiscard(popP, shP);
3376  } while (popP != lastPopP);
3377  shP->shCurrentP = p;
3378
3379  /* If this was the return of the outermost routine, print new maximum
3380     stack depth traceback and discard the stack_history_t */
3381  if (shP->shCurrentP == NULL)
3382  {
3383    shDisplay(shP);
3384    shPut(shP);
3385  }
3386}
3387
3388#endif  /* KSTACK_CHECK */
3389
3390
3391#if defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK)
3392void cxiTraceEntry(int level, const char * funcnameP,
3393                   const char * filenameP, int lineNum)
3394{
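  /* The kernel stack area is THREAD_SIZE-aligned, so the offset of a local
     variable within that area tells how far the (downward-growing) stack
     has advanced; THREAD_SIZE minus that offset is the number of bytes
     already in use. */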
3395  int stackUsed = THREAD_SIZE - (((unsigned long)&stackUsed) & (THREAD_SIZE-1));
3396#ifdef KSTACK_CHECK
3397  stack_history_t * shP;
3398  frame_desc_t * fdP;
3399#endif  /* KSTACK_CHECK */
3400
3401#ifdef ENTRYEXIT_TRACE
3402  /* Need to use a constant trace level in the TRACE macro call to get
3403     the .trclst file (and later the .trcfmt file) built correctly */
3404  if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
3405  {
3406    TRACE5(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_ENTER,
3407           "-->K %s (%s:%d) level %d stackUsed %d\n",
3408           funcnameP, filenameP, lineNum, level, stackUsed);
3409  }
3410#endif  /* ENTRYEXIT_TRACE */
3411
3412#ifdef KSTACK_CHECK
3413  /* Nothing to do if kernel stack checking is disabled */
3414  if (!SHG.shActive)
3415    return;
3416
3417  /* Do not attempt to keep track of stack usage in interrupt handlers */
3418  if (in_interrupt())
3419    return;
3420
3421  /* Locate or create stack_history_t for this thread */
3422  shP = shFind();
3423
3424  /* Get a new frame descriptor and fill it in */
3425  fdP = fdGet(shP);
3426  fdP->fdFuncNameP = funcnameP;
3427  fdP->fdFileNameP = filenameP;
3428  fdP->fdLineNum = lineNum;
3429  fdP->fdStackUsed = stackUsed;
3430
3431  /* Perform stack checking for this routine entry */
3432  fdEntry(fdP, shP);
3433#endif  /* KSTACK_CHECK */
3434}
3435
3436
3437void cxiTraceExit(int level, const char * funcnameP)
3438{
3439#ifdef ENTRYEXIT_TRACE
3440  /* Need to use a constant trace level in the TRACE macro call to get
3441     the .trclst file (and later the .trcfmt file) built correctly */
3442  if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
3443    TRACE1(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT,
3444           "<--K %s\n", funcnameP);
3445#endif  /* ENTRYEXIT_TRACE */
3446
3447#ifdef KSTACK_CHECK
3448  /* Nothing to do if kernel stack checking is disabled */
3449  if (!SHG.shActive)
3450    return;
3451
3452  /* Do not attempt to keep track of stack usage in interrupt handlers */
3453  if (in_interrupt())
3454    return;
3455
3456  /* Process routine exit */
3457  fdExit(funcnameP);
3458#endif  /* KSTACK_CHECK */
3459}
3460void cxiTraceExitRC(int level, const char * funcnameP, int rc)
3461{
3462#ifdef ENTRYEXIT_TRACE
3463  /* Need to use a constant trace level in the TRACE macro call to get
3464     the .trclst file (and later the .trcfmt file) built correctly */
3465  if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level))
3466    TRACE2(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT_RC,
3467           "<--K %s rc %d\n", funcnameP, rc);
3468#endif  /* ENTRYEXIT_TRACE */
3469
3470#ifdef KSTACK_CHECK
3471  /* Nothing to do if kernel stack checking is disabled */
3472  if (!SHG.shActive)
3473    return;
3474
3475  /* Do not attempt to keep track of stack usage in interrupt handlers */
3476  if (in_interrupt())
3477    return;
3478
3479  /* Process routine exit */
3480  fdExit(funcnameP);
3481#endif  /* KSTACK_CHECK */
3482}
3483#endif  /* defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) */
3484
3485
3486#ifdef UIDREMAP
3487size_t cxiGetUserEnvironmentSize(void)
3488{
3489  return (current->mm->env_end - current->mm->env_start);
3490}
3491
3492int cxiGetUserEnvironment(char* buf, size_t len)
3493{
3494  return cxiCopyIn((char*)current->mm->env_start, buf, len);
3495}
3496#endif
3497
3498Boolean cxiHasMountHelper()
3499{
3500  return USING_MOUNT_HELPER();
3501}
3502
3503#ifdef P_NFS4
3504
3505#include <linux/nfsd/nfs4layoutxdr.h>
3506
3507/* convert ip address to string */
3508char *IPtoString(int ip, char *buf)
3509{
3510  unsigned char *a = (unsigned char *)&ip;
3511
3512  sprintf(buf, "%u.%u.%u.%u", a[0], a[1], a[2], a[3]);
3513
3514  return buf;
3515}
3516
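/* Illustrative note (not part of the original source): the device address
   strings built below follow the NFSv4 universal-address form
   "a.b.c.d.p1.p2", where p1/p2 are the high and low bytes of the TCP port.
   The helper here is hypothetical; the code in cxiGetDeviceList and
   cxiGetDeviceInfo simply hard-codes ".8.1" for port 2049 (0x0801). */
#if 0   /* documentation-only example */
static void examplePortSuffix(int port, char *buf)
{
  sprintf(buf, "%u.%u", (port >> 8) & 0xFF, port & 0xFF);   /* 2049 -> "8.1" */
}
#endif
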
3517static void printfh(char *s, int *fh)
3518{
3519#ifdef GPFS_PRINTK
3520  printk("%s: %d: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3521           s, fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7],fh[8],fh[9]);
3522#endif
3523}
3524
3525int cxiSetFH(int *fhP, int sid)
3526{
3527  struct knfsd_fh *fh = (struct knfsd_fh *)fhP;
3528
3529printfh("cxiSetFH-1", fhP);
3530  if (fh->fh_size > 8) {
3531    fh->fh_size += 4; // fh_size + 4 for sid
3532    fh->fh_fsid_type += max_fsid_type;
3533    fhP[(fh->fh_size >> 2)] = sid;
3534    fh->fh_fileid_type = 7; // see code in gpfs_decode_fh()
3535#ifdef GPFS_PRINTK
3536  printk("cxiSetFH size %d fsid_type %d fileid %d\n",
3537         fh->fh_size, fh->fh_fsid_type, fh->fh_fileid_type);
3538#endif
3539printfh("cxiSetFH-2", fhP);
3540    return 0;
3541  }
3542  return ENOENT;
3543}
3544
3545/* Call to NFS server on MDS to get open state */
3546int cxiOpenState(void *vfsP, void *p)
3547{
3548  int rc = ENOENT;
3549  struct super_block *sbP = (struct super_block *)vfsP;
3550  struct pnfs_get_state *osP = p;
3551  struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP);
3552
3553#ifdef GPFS_PRINTK
3554  printk("cxiOpenState1 sb %p p %p \n", sbP, p);
3555  printk("cxiOpenState cb_get_state %p\n",
3556                 sbP->s_export_op->cb_get_state);
3557#endif
3558  if (sbP->s_export_op->cb_get_state)
3559    rc = sbP->s_export_op->cb_get_state(osP);
3560
3561  gpfs_ops.gpfsGetVerifier(privVfsP, osP->verifier);
3562#ifdef GPFS_PRINTK
3563  printk("cxiOpenState rc %d devid %x verifier %x:%x\n",
3564                        rc, osP->devid, osP->verifier[0], osP->verifier[1]);
3565#endif
3566
3567  return rc;
3568}
3569/* Call to NFS server on DS to get change open state or close the file */
3570int cxiChangeState(void *vfsP, void *p)
3571{
3572  int rc = ENOENT;
3573  struct super_block *sbP = (struct super_block *)vfsP;
3574  struct pnfs_get_state *osP = p;
3575
3576  if (sbP->s_export_op->cb_change_state)
3577    rc = sbP->s_export_op->cb_change_state(osP);
3578#ifdef GPFS_PRINTK
3579  printk("cxiChangeState2 sb %p p %p access %d\n", sbP, p, osP->access);
3580#endif
3581
3582  return rc;
3583}
3584/* Call to NFS server on MDS to recall layout */
3585int cxiRecallLayout(void *vfsP, void *vP, void *p)
3586{
3587  int rc = ENOENT;
3588  struct super_block *sbP = (struct super_block *)vfsP;
3589  struct inode *iP = (struct inode *)vP;
3590  struct layout_recall lr;
3591
3592  lr.fsid = sbP;
3593  lr.offset = 0;
3594  lr.length = -1;
3595
3596  if (iP == NULL)   // recall all layouts for this fs
3597    lr.layout_type = RECALL_FSID;
3598
3599#ifdef GPFS_PRINTK
3600  printk("cxiRecallLayout sbP %p type %d\n", sbP, lr.layout_type);
3601#endif
3602  if (sbP->s_export_op->cb_layout_recall) {
3603    rc = sbP->s_export_op->cb_layout_recall(sbP, iP, &lr);
3604  }
3605  else {
3606    lr.layout_type = RECALL_FILE;
3607#ifdef GPFS_PRINTK
3608    printk("cxiRecallLayout sbP %p iP %p type %d\n", sbP, iP, lr.layout_type);
3609#endif
3610  }
3611
3612#ifdef GPFS_PRINTK
3613  printk("cxiRecallLayout sbP %p iP %p rc %d\n", sbP, iP, rc);
3614#endif
3615  return rc;
3616}
3617
3618/* Get device list
3619
3620  gd_type
3621    in: requested layout type.
3622    out: available layout type.
3623  gd_cookie
3624    in: cookie returned on the last operation.
3625    out: non-zero cookie if some devices did not fit in the buffer.
3626  gd_maxcount
3627    in: buffer size in bytes.
3628  gd_buffer
3629    in: pointer to buffer.
3630  gd_devlist_len
3631    out: number of items returned in the buffer.
3632   
3633error: 
3634  Use the same return codes as used for GTEDEVLIST
3635*/
3636int
3637cxiGetDeviceList(int nDests, int *idList, void *P)
3638{
3639  int rc = 0;
3640  int i, len, left;
3641  int j = 0;
3642  char *p, *tp;
3643  char tmp[32];
3644  struct nfsd4_pnfs_getdevlist *dl = (struct nfsd4_pnfs_getdevlist *)P;
3645  struct nfsd4_pnfs_devlist *gd_buf = NULL;
3646  struct pnfs_filelayout_devaddr *dev;
3647  ENTER(0);
3648
3649#ifdef GPFS_PRINTK
3650printk("xxx cxiGetDeviceList enter nDests %d idList %p \n", nDests, idList);
3651#endif
3652
3653  dl->gd_type = LAYOUT_NFSV4_FILES;
3654  dl->gd_cookie = 0;
3655  dl->gd_devlist_len = 0;
3656  left = dl->gd_maxcount;
3657  tp = &tmp[0];
3658
3659  len = sizeof(struct nfsd4_pnfs_devlist) * nDests;
3660#ifdef GPFS_PRINTK
3661  printk("xxx cxiGetDeviceList len %d left %d\n", len, left);
3662#endif
3663  if (nDests > left) {
3664    rc = ENOMEM;  //??? NFS4ERR_TOOSMALL
3665    goto xerror;
3666  }
3667  gd_buf = (struct nfsd4_pnfs_devlist *)cxiMallocUnpinned(len);
3668  if (gd_buf == NULL) {
3669    rc = ENOMEM;
3670    goto xerror;
3671  }
3672  memset(gd_buf, 0, len);
3673  dl->gd_devlist = gd_buf;
3674
3675#ifdef GPFS_PRINTK
3676  printk("xxx cxiGetDeviceList gd_buf %p count %d\n", gd_buf, nDests);
3677#endif
3678  for (i = 0; i < nDests; i++)
3679  {
3680    /* make both device id and device address be the same for now */
3681    gd_buf[j].dev_id = idList[i];
3682    gd_buf[j].dev_lotype = LAYOUT_NFSV4_FILES;
3683    if (gd_buf[j].dev_id == INADDR_NONE)
3684      continue;
3685
3686    IPtoString(gd_buf[j].dev_id, tp);
3687    len = (cxiStrlen(tp));
3688
3689    p = (char *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr));
3690    if (p == NULL) {
3691      rc = ENOMEM;
3692      goto xerror;
3693    }
3694    memset(p, 0, sizeof(struct pnfs_filelayout_devaddr));
3695    gd_buf[j].dev_addr = p;
3696
3697    dev = (struct pnfs_filelayout_devaddr *)p;
3698    dev->r_addr.len = len + 4; /* for ".8.1" */
3699
3700    p = (char *)cxiMallocUnpinned(dev->r_addr.len+1);
3701    if (p == NULL) {
3702      rc = ENOMEM;
3703      goto xerror;
3704    }
3705    dev->r_addr.data = p;
3706    cxiMemcpy(p, tp, len);
3707    p = p + len;
3708    cxiStrcpy(p, ".8.1");  /* port 2049 = 0x801 = "8.1" */
3709
3710    dev->r_netid.len = 3; /*'tcp'*/
3711    p = (char *)cxiMallocUnpinned(dev->r_netid.len+1);
3712    if (p == NULL) {
3713      rc = ENOMEM;
3714      goto xerror;
3715    }
3716    cxiStrcpy(p, "tcp");
3717    dev->r_netid.data = p;
3718
3719    left = left - 1;
3720    dl->gd_devlist_len++;
3721
3722    TRACE4(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_P1,
3723           "gpfsGetDeviceList index %d len %d ip %s left %d\n",
3724           i, dev->r_addr.len, dev->r_addr.data, left);
3725#ifdef GPFS_PRINTK
3726printk("xxx cxiGetDeviceList index %d id %d len %d ip %s left %d ops %p %p\n",
3727        i, gd_buf[j].dev_id, dev->r_addr.len,
3728        dev->r_addr.data, left, dl->gd_ops, dl->gd_ops->devaddr_encode);
3729#endif
3730
3731    j++;
3732  }
3733
3734exit:
3735
3736  TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_EXIT,
3737         "cxiGetDeviceList exit: rc %d len %d", rc, len);
3738  return rc;
3739
3740xerror:
3741
3742  if (gd_buf != NULL) {
3743    for (i = 0; i < j; i++)
3744    {
3745      dev = gd_buf[i].dev_addr;
3746      if (dev) {
3747        cxiFreeUnpinned(dev->r_addr.data);
3748        cxiFreeUnpinned(dev->r_netid.data);
3749        cxiFreeUnpinned(dev);
3750      }
3751    }
3752    cxiFreeUnpinned(gd_buf);
3753  }
3754  goto exit;
3755}
3756
3757int
3758cxiGetDeviceInfo(void *P)
3759{
3760  ENTER(0);
3761  int rc = 0;
3762  int len;
3763  char *p, *tp;
3764  char tmp[32];
3765  struct nfsd4_pnfs_getdevinfo *da = (struct nfsd4_pnfs_getdevinfo *)P;
3766  struct pnfs_filelayout_devaddr *dev;
3767  tp = &tmp[0];
3768
3769  IPtoString(da->gd_dev_id, tp);
3770
3771  dev = (struct pnfs_filelayout_devaddr *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr));
3772  if (dev == NULL) {
3773    rc = ENOMEM;
3774    goto xerror;
3775  }
3776  da->gd_devaddr = dev;
3777
3778  len = (cxiStrlen(tp));
3779  dev->r_addr.len = len + 4; /* for ".8.1" */
3780
3781  p = (char *)cxiMallocUnpinned(dev->r_addr.len+1);
3782  if (p == NULL) {
3783    cxiFreeUnpinned(dev);
3784    rc = ENOMEM;
3785    goto xerror;
3786  }
3787  dev->r_addr.data = p;
3788  cxiMemcpy(p, tp, len);
3789  p = p + len;
3790  cxiStrcpy(p, ".8.1");  /* port 2049 = 0x801 = "8.1" */
3791
3792  dev->r_netid.len = 3; /*'tcp'*/
3793  p = (char *)cxiMallocUnpinned(dev->r_netid.len+1);
3794  if (p == NULL) {
3795    cxiFreeUnpinned(dev->r_addr.data);
3796    cxiFreeUnpinned(dev);
3797    rc = ENOMEM;
3798    goto xerror;
3799  }
3800  cxiStrcpy(p, "tcp");
3801  dev->r_netid.data = p;
3802
3803  TRACE2(TRACE_VNODE, 2, TRCID_GPFSOPS_GET_DEVICELINFO_P1,
3804         "gpfsGetDeviceInfo len %d ip %s\n",
3805          dev->r_addr.len, dev->r_addr.data);
3806
3807#ifdef GPFS_PRINTK
3808  printk("xxx cxiGetDeviceInfo id %d len %d ip %s\n",
3809       da->gd_dev_id, dev->r_addr.len, dev->r_addr.data);
3810#endif
3811
3812xerror:
3813
3814  TRACE1(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELINFO_EXIT,
3815         "cxiGetDeviceInfo exit: rc %d\n", rc);
3816
3817  return rc;
3818}
3819/* get layout
3820  lg_type
3821    in: requested layout type.
3822    out: available layout type.
3823  lg_offset
3824    in: requested offset.
3825    out: returned offset.
3826  lg_length
3827    in: requested length.
3828    out: returned length.
3829  lg_mxcnt
3830    in: buffer size in bytes.
3831  lg_llist
3832    in: pointer to buffer.
3833  lg_layout
3834    out: number of items returned in the buffer.
3835
3836   If the file is big (?), return all nodes in the layout.
3837   If the file is small, return no layout or just one node; choose the node
3838   at random, but make sure it is always the same node for the same file.
3839*/
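/*
 * Note on the layout returned below (hedged, documentation only): with
 * lg_stripe_type = STRIPE_DENSE an NFSv4.1 file-layout client maps a file
 * offset to one of the data servers listed in lg_llist by dividing the
 * offset into stripe units of lg_stripe_unit bytes and taking the stripe
 * number modulo the number of devices.  The helper below only illustrates
 * that arithmetic; it is hypothetical and compiled out.
 */
#if 0   /* documentation sketch only */
static int exampleDenseStripeIndex(unsigned long long offset,
                                   unsigned long long stripeUnit, int nDevs)
{
  return (int)((offset / stripeUnit) % nDevs);
}
#endif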
3840int
3841cxiGetLayout(int nDests, int *idList, cxiVattr_t *vattr, int myAddr, void *P)
3842{
3843  ENTER(0);
3844  char *p = NULL;
3845  int i, rc, left, len = 0;
3846  struct nfsd4_pnfs_layoutget *gl = (struct nfsd4_pnfs_layoutget *)P;
3847  struct nfsd4_pnfs_layoutlist *lg_buf = NULL;
3848  struct nfsd4_pnfs_filelayout *layout = NULL;
3849
3850  TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_ENTER,
3851         "cxiGetLayout: nDests %d myAddr %x\n", nDests,myAddr);
3852
3853  /* set node id in fh and increase fh size by 4 */
3854  rc = cxiSetFH((int *)&gl->lg_fh, myAddr);
3855  if (rc != 0)
3856    goto xerror;
3857
3858  gl->lg_type = LAYOUT_NFSV4_FILES;
3859  gl->lg_offset = 0;
3860  gl->lg_length = MAX_UINT64;               /* The maximum file size */
3861
3862  layout = (struct nfsd4_pnfs_filelayout *)cxiMallocUnpinned(sizeof(struct nfsd4_pnfs_filelayout));
3863  if (layout == NULL) {
3864    rc = ENOMEM;
3865    goto xerror;
3866  }
3867  gl->lg_layout = layout;
3868  layout->lg_stripe_type = STRIPE_DENSE;
3869  layout->lg_commit_through_mds = true;
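  /* the flag above tells clients to send their COMMITs to the MDS rather
     than to the individual data servers (hedged note on the pNFS file layout) */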
3870  layout->lg_stripe_unit = vattr->va_blocksize; /* preferred blocksize */
3871  layout->lg_file_size = vattr->va_size;        /* file size in bytes  */
3872  layout->lg_llistlen = 0;
3873
3874  left = gl->lg_mxcnt;
3875
3876  len = sizeof(struct nfsd4_pnfs_layoutlist) * nDests;
3877  if (len > left) {
3878    rc = ENOMEM;  // NFS4ERR_TOOSMALL
3879    goto xerror;
3880  }
3881  lg_buf = (struct nfsd4_pnfs_layoutlist *)cxiMallocUnpinned(len);
3882  if (lg_buf == NULL) {
3883    rc = ENOMEM;
3884    goto xerror;
3885  }
3886  memset(lg_buf, 0, len);
3887  layout->lg_llist = lg_buf;
3888  left = left - len;
3889
3890  for (i = 0; i < nDests; i++)
3891  {
3892    /* make both device id and device address be the same for now */
3893    lg_buf[i].dev_ids.len = 1;  //??? can return a list of dev ids ????
3894    lg_buf[i].dev_ids.list = (u32 *)cxiMallocUnpinned(sizeof(u32)*lg_buf[i].dev_ids.len);
3895    if (lg_buf[i].dev_ids.list == NULL) {
3896      rc = ENOMEM;
3897      goto xerror;
3898    }
3899    lg_buf[i].dev_ids.list[0] = idList[i];
3900    layout->lg_llistlen++;
3901    lg_buf[i].fhp = (struct knfsd_fh *)&gl->lg_fh;
3902
3903#ifdef GPFS_PRINTK
3904    printk("cxiGetLayout index %d id %d xid 0x%lX len %d\n",
3905           i, idList[i], idList[i], len);
3906#endif
3907    TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_P1,
3908           "cxiGetLayout index %d id 0x%lX len %d\n",
3909           i, idList[i], len);
3910
3911  }
3912  if (i == 0) {
3913    layout->lg_llistlen = 0;
3914    cxiFreeUnpinned(lg_buf);
3915  }
3916
3917#ifdef GPFS_PRINTK
3918  printk("cxiGetLayout: type %d iomode %d offset %lld length %lld minlength %lld mxcnt %d ops %p layouts %p\n",
3919  gl->lg_type, gl->lg_iomode, gl->lg_offset, gl->lg_length, gl->lg_minlength,
3920  gl->lg_mxcnt, gl->lg_ops, gl->lg_layout);
3921
3922  printfh("cxiGetLayout:", gl->lg_fh);
3923
3924  printk("cxiGetLayout: layout stripe_type %d stripe_unit %lld file_size %lld llistlen %d llist %p\n",
3925  layout->lg_stripe_type, layout->lg_stripe_unit,layout->lg_file_size,
3926  layout->lg_llistlen,layout->lg_llist);
3927#endif
3928
3929exit:
3930
3931  TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_EXIT,
3932         "cxiGetLayout exit: rc %d len %d p 0x%lX", rc, len, p);
3933
3934  return rc;
3935
3936xerror:
3937
3938  if (lg_buf) {
3939    gl->lg_length = 0;
3940    for (i = 0; i < nDests; i++)
3941    {
3942      cxiFreeUnpinned(lg_buf[i].dev_ids.list);
3943    }
3944    cxiFreeUnpinned(lg_buf);
3945  }
3946  if (layout)
3947    cxiFreeUnpinned(layout);
3948
3949  goto exit;
3950}
3951#endif
3952
3953int cxiCheckThreadState(cxiThreadId tid)
3954{
3955  struct task_struct *t, *g;
3956  int rc = ENOENT;
3957
3958  // read_lock(&tasklist_lock);
3959  rcu_read_lock(); 
3960
3961  DO_EACH_THREAD(g,t)
3962  {
3963    /* We are looking for a thread with the given tid and the same command
3964       name (comm) as the caller (the caller must be another mmfsd thread). */
3965    if (t->pid == tid &&
3966        cxiStrcmp(t->comm, current->comm) == 0)
3967    {
3968      rc = 0;
3969      break;
3970    }
3971  } WHILE_EACH_THREAD(g,t);
3972  // read_unlock(&tasklist_lock);
3973  rcu_read_unlock(); 
3974
3975  return rc;
3976}
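/*
 * Usage illustration (hedged sketch, not called from this file): another
 * mmfsd thread can poll cxiCheckThreadState() to see whether a peer thread
 * is still running; 0 means it was found, ENOENT means it is gone.  The
 * helper name below is hypothetical and compiled out.
 */
#if 0   /* documentation sketch only */
static int exampleThreadStillAlive(cxiThreadId tid)
{
  return cxiCheckThreadState(tid) == 0;
}
#endif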
3977