source: gpfs_3.1_ker2.6.20/lpp/mmfs/src/gpl-linux/inode.c @ 223

Last change on this file since 223 was 16, checked in by rock, 17 years ago
File size: 61.1 KB
Line 
1/***************************************************************************
2 *
3 * Copyright (C) 2001 International Business Machines
4 * All rights reserved.
5 *
6 * This file is part of the GPFS mmfslinux kernel module.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 *  1. Redistributions of source code must retain the above copyright notice,
13 *     this list of conditions and the following disclaimer.
14 *  2. Redistributions in binary form must reproduce the above copyright
15 *     notice, this list of conditions and the following disclaimer in the
16 *     documentation and/or other materials provided with the distribution.
17 *  3. The name of the author may not be used to endorse or promote products
18 *     derived from this software without specific prior written
19 *     permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
27 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 *
32 *************************************************************************** */
33/* @(#)01       1.90.1.4  src/avs/fs/mmfs/ts/kernext/gpl-linux/inode.c, mmfs, avs_rgpfs24, rgpfs24s012a 4/17/07 15:54:47 */
34/*
35 * Inode operations
36 *
37 * Contents:
38 *   printInode
39 *   printDentry
40 *   cxiSetOSNode
41 *   cxiInvalidatePerm
42 *   getIattr
43 *   get_umask
44 *   setCred
45 *   gpfs_i_create
46 *   gpfs_i_lookup
47 *   gpfs_i_link
48 *   gpfs_i_unlink
49 *   gpfs_i_symlink
50 *   gpfs_i_mkdir
51 *   gpfs_i_rmdir
52 *   gpfs_i_mknod
53 *   gpfs_i_rename
54 *   gpfs_i_readlink
55 *   gpfs_i_follow_link
56 *   gpfs_i_readpage        (in mmap.c)
57 *   gpfs_i_writepage       (in mmap.c)
58 *   gpfs_i_bmap
59 *   gpfs_i_truncate
60 *   gpfs_i_permission
61 *   gpfs_i_smap
62 *   gpfs_i_updatepage
63 *   gpfs_i_revalidate
64 *   gpfs_i_setattr
65 *   gpfs_i_setattr_internal
66 *   gpfs_i_getattr
67 *   gpfs_i_getattr_internal
68 *   gpfs_i_lock
69 *   gpfs_i_getxattr
70 *   gpfs_i_setxattr
71 *   gpfs_i_listxattr
72 *   gpfs_i_removexattr
73 */
74
75#include <Shark-gpl.h>
76
77#include <linux/fs.h>
78#include <linux/sched.h>
79#include <linux/slab.h>
80#include <linux/errno.h>
81#include <linux/smp_lock.h>
82#include <linux/mm.h>
83#include <linux/highmem.h>
84#include <linux/kdev_t.h>
85
86#include <verdep.h>
87#include <cxiMode.h>
88#include <cxiSystem.h>
89#include <cxi2gpfs.h>
90#include <cxiVFSStats.h>
91#include <cxiCred.h>
92
93#include <linux2gpfs.h>
94#include <Trace.h>
95
96#if LINUX_KERNEL_VERSION > 2060000
97#include <cxiTSFattr.h>
98#endif
99
100#ifdef MODULE
101#include <linux/module.h>
102#endif /* MODULE */
103 
104void
105printInode(struct inode *iP)
106{
107  TRACE7(TRACE_VNODE, 3, TRCID_PRINTINODE_1,
108         "printInode: iP 0x%lX inode %d (0x%X) i_count %d dev 0x%X "
109         "mode 0x%X nlink %d\n",
110         iP, iP->i_ino, iP->i_ino, atomic_read((atomic_t *)&iP->i_count),
111         KDEV_INT(iP->i_rdev), iP->i_mode, iP->i_nlink);
112
113  TRACE6(TRACE_VNODE, 3, TRCID_PRINTINODE_2,
114         "printInode: uid %d gid %d rdev 0x%X atime 0x%X mtime 0x%X "
115         "ctime 0x%X\n", iP->i_uid, iP->i_gid, KDEV_INT(iP->i_rdev), 
116         GET_INODETIME_SEC(iP->i_atime), GET_INODETIME_SEC(iP->i_mtime), 
117         GET_INODETIME_SEC(iP->i_ctime));
118
119  TRACE5(TRACE_VNODE, 3, TRCID_PRINTINODE_4,
120         "printInode: size %lld blksize 0x%X blocks %d ver 0x%X op 0x%lX\n",
121         iP->i_size, iP->i_blocks, iP->i_blocks, iP->i_version,
122         iP->i_op);
123
124  TRACE6(TRACE_VNODE, 3, TRCID_PRINTINODE_5,
125         "printInode: fop 0x%lX sb 0x%lX flags 0x%X state 0x%X gen %d "
126         "generic 0x%lX\n", iP->i_fop, iP->i_sb, iP->i_flags, iP->i_state,
127         iP->i_generation, iP->PRVINODE);
128
129  TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_6,
130         "printInode: list 0x%lX next 0x%lX prev 0x%lX\n",
131         &(iP->i_list), iP->i_list.next, iP->i_list.prev);
132
133  TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_7,
134         "printInode: dentry 0x%lX next 0x%lX prev 0x%lX\n",
135         &(iP->i_dentry), iP->i_dentry.next, iP->i_dentry.prev);
136
137#if LINUX_KERNEL_VERSION < 2050000
138  TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_8,
139         "printInode: hash 0x%lX next 0x%lX prev 0x%lX\n",
140         &(iP->i_hash), iP->i_hash.next, iP->i_hash.prev);
141#else
142  TRACE3(TRACE_VNODE, 3, TRCID_PRINTINODE_9,
143         "printInode: hash 0x%lX next 0x%lX prev 0x%lX\n",
144         &(iP->i_hash), iP->i_hash.next, *iP->i_hash.pprev);
145#endif
146}
147
148void
149printDentry(struct dentry *dP)
150{
151  struct inode *iP = dP->d_inode;
152
153  if (!_TRACE_IS_ON(TRACE_VNODE, 3))
154    return;
155
156  TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_1,
157          "printDentry: dentry 0x%lX count %d name '%s'\n",
158          dP, atomic_read((atomic_t *)&dP->d_count), dP->d_name.name);
159
160  TRACE5N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_2,
161          "printDentry: time 0x%X op 0x%lX flags 0x%X parent 0x%lX "
162          "inode 0x%X\n", dP->d_time, dP->d_op, dP->d_flags, 
163          dP->d_parent, iP);
164
165  if (iP)   
166  {
167    if (!list_empty(&iP->i_dentry))
168      TRACE4N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3A,
169              "printDentry: i_ino %d i_count %d "
170              "i_dentry next 0x%lX i_dentry prev 0x%lX\n",
171              iP->i_ino, atomic_read((atomic_t *)&iP->i_count), 
172              list_entry(iP->i_dentry.next, struct dentry, d_alias),
173              list_entry(iP->i_dentry.prev, struct dentry, d_alias));
174    else
175      TRACE2N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3B,
176              "printDentry: i_ino %d i_count %d\n", 
177              iP->i_ino, atomic_read((atomic_t *)&iP->i_count));
178  }
179
180  TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_3C,
181          "printDentry: &d_hash 0x%lX d_hash.next 0x%lX d_hash.prev 0x%lX\n", 
182          &dP->d_child, dP->d_child.next, dP->d_child.prev);
183
184  TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_4,
185          "printDentry: &child 0x%lX child.next 0x%lX child.prev 0x%lX\n", 
186          &dP->d_child, dP->d_child.next, dP->d_child.prev);
187
188  if (!list_empty(&dP->d_subdirs))
189    TRACE3N(TRACE_VNODE, 3, TRCID_PRINTDENTRY_5,
190            "printDentry: &subdirs 0x%lX subdir next 0x%lX "
191            "subdir prev 0x%lX\n", &dP->d_subdirs,
192            list_entry(dP->d_subdirs.next, struct dentry, d_child),
193            list_entry(dP->d_subdirs.prev, struct dentry, d_child));
194}
195
196/* Print directory entry tree up to maxPrint elements.
197 * If maxPrint is 0 then there is no upper limit.
198 */
199void
200printDentryTree(struct dentry *entryDP, int maxPrint)
201{
202  int count = 0;
203  struct list_head *lhP;
204  struct dentry *siblingDP;
205  struct dentry *parentDP;
206
207  /* Check trace level required by printDentry() */
208  if (!_TRACE_IS_ON(TRACE_VNODE, 3))
209    return;
210
211  spin_lock(&dcache_lock);
212
213  parentDP = entryDP;
214  lhP = parentDP->d_subdirs.next;
215
216  printDentry(parentDP);
217  if (maxPrint > 0 && ++count >= maxPrint)
218    goto xerror;
219
220  if (list_empty(&parentDP->d_subdirs))
221    goto xerror;
222
223  do
224  {
225    while (lhP != &parentDP->d_subdirs)
226    {
227      siblingDP = list_entry(lhP, struct dentry, d_child);
228
229      printDentry(siblingDP);
230      if (maxPrint > 0 && ++count >= maxPrint)
231        goto xerror;
232
233      if (!list_empty(&siblingDP->d_subdirs))
234      {
235        parentDP = siblingDP;
236        lhP = siblingDP->d_subdirs.next;
237        continue;
238      }
239
240      lhP = siblingDP->d_child.next;
241      parentDP = siblingDP->d_parent;
242    }
243 
244    siblingDP = siblingDP->d_parent;
245    parentDP = siblingDP->d_parent;
246    lhP = siblingDP->d_child.next;
247  } 
248  while (lhP != entryDP->d_child.next);
249
250xerror:
251  spin_unlock(&dcache_lock);
252
253  return;
254}
255
256/* Set the inode operations table for a regular file or directory.  Call
257   with xperm set to true if the file has extended permission attributes
258   (i.e. an ACL).  This routine is a no-op if the inode is not a regular
259   file or directory.
260
261   If the file does not have extended attributes, the table that is used
262   will have a null value for the permission routine pointer.  This will
263   cause Linux to perform access checks directly instead of acquiring the
264   kernel lock and calling GPFS, giving better performance. */
265void setIopTable(struct inode *iP, Boolean xperm)
266{
267  struct inode_operations *newopP, *stdopP, *xopP;
268  struct list_head *lp;
269  int count = 0;
270
271  /* Choose the correct inode operations table based on whether this is a
272     directory or a regular file.  Assume that the file has extended
273     attributes so that GPFS permission checking will be required. */
274  ENTER(0);
275  if (S_ISDIR(iP->i_mode))
276    xopP = &gpfs_dir_iops_xperm;
277  else if (S_ISREG(iP->i_mode))
278    xopP = &gpfs_iops_xperm;
279  else
280  {
281    EXIT(0);
282    return;
283  }
284
285  /* If the file really does have extended attributes (or if the token has
286     been lost so that we do not know the status), set extended permission
287     table and exit. */
288  if (xperm)
289  {
290    iP->i_op = xopP;
291    EXIT(0);
292    return;
293  }
294
295  /* Get address of an inode operations table that has a generic permission
296     routine pointer. */
297  iP->i_op = S_ISDIR(iP->i_mode) ? &gpfs_dir_iops_stdperm : &gpfs_iops_stdperm;
298  EXIT(0);
299}
300
301
302void
303cxiSetOSNode(void *osVfsP, cxiNode_t *cnP, cxiVattr_t *attrP)
304{
305  struct super_block *sbP = (struct super_block *)osVfsP;
306  struct inode *inodeP = (struct inode *)cnP->osNodeP;
307
308  ENTER(0);
309  DBGASSERT(inodeP != NULL);
310  DBGASSERT(inodeP->PRVINODE == cnP);
311  DBGASSERT(inodeP->i_sb == sbP);
312
313  inodeP->i_mode = attrP->va_mode;
314  inodeP->i_nlink = attrP->va_nlink;
315  inodeP->i_uid  = attrP->va_uid;
316  inodeP->i_gid  = attrP->va_gid;
317  inodeP->i_rdev = cxiDevToKernelDev(cxiDev32ToDev(attrP->va_rdev));
318
319  CXITIME_TO_INODETIME(attrP->va_atime, inodeP->i_atime);
320  CXITIME_TO_INODETIME(attrP->va_mtime, inodeP->i_mtime);
321  CXITIME_TO_INODETIME(attrP->va_ctime, inodeP->i_ctime);
322
323  inodeP->i_size = attrP->va_size;
324  inodeP->i_blocks = attrP->va_blocksize;
325  inodeP->i_blocks = attrP->va_blocks;
326  inodeP->i_generation = attrP->va_gen;
327  inodeP->i_flags = 0;
328
329  cnP->xinfo = attrP->va_xinfo;
330
331  switch (inodeP->i_mode & S_IFMT)
332  {
333    case S_IFREG:
334      setIopTable(inodeP, (attrP->va_xinfo & VA_XPERM) != 0);
335      if (cxiIsNFSThread())
336        inodeP->i_fop = &gpfs_fops_no_sendfile;
337      else
338        inodeP->i_fop = &gpfs_fops;
339      break;
340
341    case S_IFDIR:
342      setIopTable(inodeP, (attrP->va_xinfo & VA_XPERM) != 0);
343      inodeP->i_fop = &gpfs_dir_fops;
344      break;
345
346    case S_IFLNK:
347      inodeP->i_op = &gpfs_link_iops;
348      inodeP->i_fop = &gpfs_fops;
349      break;
350
351    case S_IFBLK:
352    case S_IFCHR:
353    case S_IFIFO:
354    case S_IFSOCK:
355      /* Set vector table for special files, gpfs will not get
356       * these operations.
357       */
358#if LINUX_KERNEL_VERSION >= 2060000
359      init_special_inode(inodeP, inodeP->i_mode, inodeP->i_rdev);
360#else
361      init_special_inode(inodeP, inodeP->i_mode,
362                         kdev_t_to_nr(inodeP->i_rdev));
363#endif
364      break;
365  }
366  if (inodeP->i_mapping)
367    inodeP->i_mapping->a_ops = &gpfs_aops;
368
369  cnP->icValid = CXI_IC_ALL;
370
371  TRACE7(TRACE_VNODE, 2, TRCID_LINUXOPS_SETINODE,
372         "cxiSetOSNode: inodeP 0x%lX inode %d i_count %d i_mode 0x%X "
373         "i_xinfo 0x%X i_nlink %d i_size %lld\n",
374         inodeP, inodeP->i_ino, atomic_read((atomic_t *)&inodeP->i_count),
375         inodeP->i_mode, attrP->va_xinfo, inodeP->i_nlink, inodeP->i_size);
376  EXIT(0);
377  return;
378}
379
380
381/* The following function is called from cxiInvalidateAttr when the
382   CXI_IC_PERM option was specified, which indicates that permission related
383   attributes cached in the struct inode (owner, mode, etc.) are no longer
384   known to be valid. */
385void
386cxiInvalidatePerm(cxiNode_t *cnP)
387{
388  struct inode *inodeP = (struct inode *)cnP->osNodeP;
389
390  ENTER(0);
391  TRACE3(TRACE_VNODE, 2, TRCID_CXIINVA_PERM,
392         "cxiInvalidatePerm: cnP 0x%lX std %d dir std %d",
393         cnP, inodeP->i_op == &gpfs_iops_stdperm,
394         inodeP->i_op == &gpfs_dir_iops_stdperm);
395
396  /* Set the inode operation table to gpfs_..._xperm; the next permission
397     check will then go through our gpfs_i_permission function, which will
398     revalidate permission attributes and set the inode operation table
399     back to gpfs_..._stdperm, if appropriate. Note: since symlinks always
400     have permission iop set, setIopTable is a noop for symlinks. */
401  setIopTable(inodeP, true);
402  EXIT(0);
403}
404
405static void
406getIattr(struct inode *inodeP, struct iattr *attrP)
407{
408  ENTER(0);
409  // attrP->ia_valid = ??? ;
410  attrP->ia_mode = inodeP->i_mode;
411  attrP->ia_uid = inodeP->i_uid;
412  attrP->ia_gid = inodeP->i_gid;
413  attrP->ia_size = inodeP->i_size;
414  attrP->ia_atime = inodeP->i_atime;
415  attrP->ia_mtime = inodeP->i_mtime;
416  attrP->ia_ctime = inodeP->i_ctime;
417  EXIT(0);
418  return;
419}
420
421static inline int
422get_umask()
423{
424  return (current->fs->umask);
425}
426
427
428/* Record credentials of current thread */
429void
430setCred(ext_cred_t *credP)
431{
432  int nGroups;
433
434  ENTER(0);
435  credP->principal = current->fsuid; /* user id */
436  credP->group = current->fsgid;     /* primary group id */
437
438#if LINUX_KERNEL_VERSION > 2060300
439  nGroups = MIN(current->group_info->ngroups, MIN(ECRED_NGROUPS, NGROUPS_SMALL));
440#else
441  nGroups = MIN(current->ngroups, ECRED_NGROUPS);
442#endif
443  credP->num_groups = nGroups;
444  if (nGroups > 0)
445#if LINUX_KERNEL_VERSION > 2060300
446    memcpy(credP->eGroups, current->group_info->blocks[0], nGroups*sizeof(gid_t));
447    /* ?? This is incorrect.  Linux 2.6 supports a very large list of
448       groups by allocating a page for each bunch of groups.  Only if
449       there are <= NGROUPS_SMALL groups is the space in
450       group_info->small_block used.  GPFS will only see the prefix of
451       the group set. */
452    /* To save kernel stack space, the GPFS ext_cred_t should keep a
453       pointer to the array of groups.  The group set cannot change
454       during a GPFS system call since the caller can only make one
455       system call at a time. */
456#else
457    memcpy(credP->eGroups, current->groups, nGroups*sizeof(gid_t));
458#endif
459  EXIT(0);
460}
461
462/* inode_operations */
463
464/* Called with a negative (no inode) dir cache entry.
465 * If this call succeeds, we fill in with d_instantiate().
466 */
467
468int
469gpfs_i_create(struct inode *diP, struct dentry *dentryP, int mode
470#if LINUX_KERNEL_VERSION >= 2060000
471              , struct nameidata *ni
472#endif
473              )
474{
475  int rc;
476  struct gpfsVfsData_t *privVfsP;
477  cxiNode_t *dcnP;
478  cxiNode_t *cnP = NULL;
479  cxiIno_t iNum = (cxiIno_t)-1;
480  struct inode *newInodeP = NULL;
481  int flags = FWRITE | FCREAT | FEXCL;
482  cxiMode_t umask = get_umask();
483  ext_cred_t eCred;
484  struct dentry *retP;
485
486  VFS_STAT_START(createCall);
487  ENTER(0);
488  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CREATE_ENTER,
489         "gpfs_i_create enter: iP 0x%lX dentryP 0x%lX mode 0x%X name '%s'\n",
490         diP, dentryP, mode, dentryP->d_name.name);
491  /* BKL is held at entry */
492
493  dcnP = VP_TO_CNP(diP);
494  privVfsP = VP_TO_PVP(diP);
495  LOGASSERT(privVfsP != NULL);
496
497retry:
498
499  setCred(&eCred);
500  rc = gpfs_ops.gpfsCreate(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum, 0,
501                           flags, dentryP, (char *)dentryP->d_name.name,
502                           mode, umask, NULL, &eCred);
503  if (rc == 0)
504  {
505    DBGASSERT(cnP != NULL);
506    DBGASSERT(iNum != -1);
507    DBGASSERT(newInodeP != NULL);
508    DBGASSERT(newInodeP->PRVINODE == cnP);
509    DBGASSERT(cnP->osNodeP == (void *)newInodeP);
510    cnP->createRaceLoserThreadId = 0;
511  }
512
513  /* linux would normally serialize the creates on a directory (via the
514   * parent directory semaphore) to ensure that a create didn't fail with
515   * EEXIST.  However in a multinode environment we may perform a lookup
516   * on one node (thinking the file doesn't exist) yet a create is
517   * performed on a different node before linux can call the physical
518   * file systems create.  We attempt to reconcile this case by marking
519   * the fact that this happened and checking the FEXCL flag at gpfs_f_open()
520   * to see if we should have failed this with EEXIST.
521   */
522  if (rc == EEXIST)
523  {
524    /* Make sure that this create call is part of the linux open call.  NFS
525       and mknod calls create without an open, so check that this is not one
526       of those calls. On the open call the open flags are available and if
527       the FEXCL was on fail it with EEXIST. */
528    int mode1;
529
530    /* Skip if NFS create call. */
531    if (cxiIsNFSThread())
532      goto retExist;
533
534    /* ??? if (sys_mknod call) goto xerror; */
535
536    /* Do it only if trying to create a regular file. */
537    if (((mode & S_IFMT) != 0) && !(mode & S_IFREG))
538      goto retExist;
539
540    setCred(&eCred); // rebuild since gpfsCreate may remap ids
541    rc = gpfs_ops.gpfsLookup(privVfsP, (void *)diP, dcnP,
542                             dentryP, (char *)dentryP->d_name.name,
543                             (void **)&newInodeP, &cnP, &iNum, NULL,
544                             &mode1, &eCred, (void **)&retP);
545    if (rc == ENOENT)
546      goto retry;
547    if (!rc)
548    {
549      /* If the file that was found was a directory than return the
550         return code that linux would have returned. */
551      if (S_ISDIR(newInodeP->i_mode))
552      {
553        rc = EISDIR;
554        goto retExist;
555      }
556      cnP->createRaceLoserThreadId = cxiGetThreadId();
557    }
558  }
559
560retExist:
561  if (rc)
562  {
563    d_drop(dentryP);
564    goto xerror;
565  }
566  diP->i_sb->s_dirt = 1;
567
568xerror:
569  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CREATE_EXIT,
570         "gpfs_i_create exit: new inode 0x%lX iNum %d (0x%X) rc %d\n",
571         newInodeP, iNum, iNum, rc);
572
573  if (rc)
574    cxiErrorNFS(rc);
575
576  VFS_STAT_STOP;
577  EXIT(0);
578  return -rc;
579}
580
581/* If this routine successfully finds the file, it should
582 * add the dentry to the hash list with d_add() and return
583 * null.  If a failure occurs then return non null and the
584 * dentry will be dput() by the linux lfs layer
585 */
586struct dentry *
587gpfs_i_lookup(struct inode *diP, struct dentry *dentryP
588#if LINUX_KERNEL_VERSION >= 2060000
589              , struct nameidata *ni
590#endif
591              )
592{
593  int code = 0;
594  int rc = 0;
595  struct dentry *retP = NULL;
596  struct gpfsVfsData_t *privVfsP;
597  ext_cred_t eCred;
598  cxiNode_t *dcnP;
599  cxiMode_t mode = 0;
600  cxiIno_t iNum = (cxiIno_t)-1;
601  cxiNode_t *cnP = NULL;
602  struct inode *newInodeP = NULL;
603
604  VFS_STAT_START(lookupCall);
605  ENTER(0);
606  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_ENTER,
607         "gpfs_i_lookup enter: diP 0x%lX dentryP 0x%lX name '%s'\n",
608         diP, dentryP, dentryP->d_name.name);
609  /* BKL is held at entry */
610
611  dcnP = VP_TO_CNP(diP);
612  privVfsP = VP_TO_PVP(diP);
613  LOGASSERT(privVfsP != NULL);
614
615  setCred(&eCred);
616
617  if (!dcnP)
618  {
619    /* This can happen due to a bug in linux/fs/dcache.c (prune_dcache)
620       where "count" entries are to be pruned, but the last one is
621       found to be recently referenced.  When this happens, count is
622       decremented, but the loop is not terminated.  The result is that
623       it continues to prune entries past where it should (prunes
624       everything).  If our patch for this is not applied, the result
625       is a kernel failure as the cxiNode is referenced.  Checking
626       here (and revalidate) allows us to reject the call instead. */
627
628    TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_STALE,
629           "cxiNode for inode 0x%lX (ino 0x%X) was FREED!\n",
630           diP, diP->i_ino);
631
632    /* Although we may like to know more about this inode, it is not
633     * ok to call PRINTINODE(iP) here.
634     */
635    rc = ESTALE;
636    code = 1;
637    retP = (struct dentry *)ERR_PTR(-rc);
638    goto xerror;
639  }
640
641  rc = gpfs_ops.gpfsLookup(privVfsP, (void *)diP, dcnP,
642                           dentryP, (char *)dentryP->d_name.name,
643                           (void **)&newInodeP, &cnP, &iNum, NULL,
644                           &mode, &eCred, (void **)&retP);
645
646  if (rc == 0)
647  {
648    DBGASSERT(cnP != NULL);
649    DBGASSERT(iNum != -1);
650    DBGASSERT(newInodeP != NULL);
651    DBGASSERT(newInodeP->PRVINODE == cnP);
652    DBGASSERT(cnP->osNodeP == (void *)newInodeP);
653  }
654  else if (rc != ENOENT) // internal failure
655  {
656    cxiErrorNFS(rc);
657    code = 2;
658    retP = (struct dentry *)ERR_PTR(-rc);
659    goto xerror;
660  }
661  else if (diP->i_nlink == 0) // ENOENT but unlinked parent
662  {
663    /* This odd code is here because this function would normally
664     * exit with a negative dcache entry on ENOENT.  However if
665     * we allow a negative dcache entry in a directory thats been
666     * deleted (but we're still sitting in it) then the d_count
667     * will never go to zero and we'll strand any open file that
668     * is associated with the parent directory.  If we drop the
669     * dentry and return the ENOENT then the VFS will dput the
670     * dentry.  The scenario that gave us trouble was:
671     *
672     * NODE 1                               NODE 2
673     * `rm -rf dirA`                        `rm -rf dirA`
674     * ==========================================================
675     * gpfs_f_open("dirA", ...)
676     * gpfs_f_readdir(...)
677     * [read "fileA", "fileB"]              gpfs_f_open("dirA", ...)
678     *                                      gpfs_f_readdir(...)
679     *                                      [read "fileA", "fileB"]
680     *
681     *                                      gpfs_i_lookup("fileA")
682     *                                      gpfs_i_unlink("fileA")
683     *                                      gpfs_s_delete_inode(fileA's inode)
684     *                                      gpfs_i_lookup("fileB")
685     *                                      gpfs_i_unlink("fileB")
686     *                                      gpfs_s_delete_inode(fileB's inode)
687     *                                      ...
688     *                                      gpfs_i_rmdir("dirA", ...)
689     *                                      gpfs_s_delete_inode(dirA's inode)
690     * destroyOnLastClose=1 for dirA        <======
691     *
692     * gpfs_i_lookup("fileA")
693     *  [creates a negative dentry for fileA,
694     *   increments dirA's reference count]
695     * gpfs_i_lookup("fileB")
696     *  [creates a negative dentry for fileB,
697     *   increments dirA's reference count]
698     */
699    DBGASSERT(dentryP->d_inode == NULL);
700    dentryP->d_op = NULL;
701    d_drop(dentryP);
702
703    code = 3;
704    retP = (struct dentry *)ERR_PTR(-rc);
705    goto xerror;
706  }
707
708  PRINTDENTRY(dentryP);
709
710xerror:
711  TRACE7(TRACE_VNODE, 1, TRCID_LINUXOPS_LOOKUP_EXIT,
712         "gpfs_i_lookup exit: new inode 0x%lX iNum %d (0x%X) cnP 0x%lX retP 0x%lX "
713         "code %d rc %d\n", newInodeP, iNum, iNum, cnP, retP, code, rc);
714
715  VFS_STAT_STOP;
716  EXIT(0);
717  return retP;
718}
719
720int
721gpfs_i_link(struct dentry *oldDentryP, struct inode *diP,
722            struct dentry *dentryP)
723{
724  int rc = 0;
725  struct inode *iP = oldDentryP->d_inode;
726  cxiNode_t *dcnP;
727  cxiNode_t *cnP = NULL;
728  struct gpfsVfsData_t *privVfsP;
729  char *tnameP;
730  ext_cred_t eCred;
731
732  VFS_STAT_START(linkCall);
733  ENTER(0);
734  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_LINK_ENTER,
735         "gpfs_i_link enter: diP 0x%lX dentryP 0x%lX "
736         "dentryP 0x%lX name '%s'\n", diP, oldDentryP, dentryP,
737         dentryP->d_name.name);
738  /* BKL is held at entry */
739
740  cnP = VP_TO_CNP(iP);
741  dcnP = VP_TO_CNP(diP);
742  privVfsP = VP_TO_PVP(diP);
743  LOGASSERT(privVfsP != NULL);
744
745  setCred(&eCred);
746  rc = gpfs_ops.gpfsLink(privVfsP, cnP, dcnP,
747                         dentryP, (char *)dentryP->d_name.name, &eCred);
748  if (rc)
749  {
750    d_drop(dentryP);
751    goto xerror;
752  }
753  iP->i_sb->s_dirt = 1;
754
755xerror:
756  PRINTINODE(iP);
757  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LINK_EXIT,
758         "gpfs_i_link exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc);
759
760  if (rc)
761    cxiErrorNFS(rc);
762
763  VFS_STAT_STOP;
764  EXIT(0);
765  return -rc;
766}
767
768int
769gpfs_i_unlink(struct inode *diP, struct dentry *dentryP)
770{
771  int rc = 0;
772  struct gpfsVfsData_t *privVfsP;
773  struct inode *iP = dentryP->d_inode;
774  cxiNode_t *dcnP;
775  cxiNode_t *cnP;
776  ext_cred_t eCred;
777  struct dentry_operations *orig_d_opP;
778
779  VFS_STAT_START(removeCall);
780  ENTER(0);
781  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_UNLINK_ENTER,
782         "gpfs_i_unlink enter: diP 0x%lX iP 0x%lX dentryP 0x%lX name '%s'\n",
783         diP, iP, dentryP, dentryP->d_name.name);
784  /* BKL is held at entry */
785
786  cnP = VP_TO_CNP(iP);
787
788  dcnP = VP_TO_CNP(diP);
789  privVfsP = VP_TO_PVP(diP);
790  LOGASSERT(privVfsP != NULL);
791
792  /* Regarding dcache entry update: upon returning from gpfs_i_unlink, the VFS
793     layer will turn the dentry into a valid, negative dcache entry by calling
794     d_delete().  If another node then creates a new file with the same name,
795     the BR token revoke for the directory block will invalidate the negative
796     dcache entry.  However, there is a window between the gpfsRemove() and
797     the d_delete(), where a BR token revoke would not recognize that it
798     should invalidate the dcache entry, because d_delete() has not yet turned
799     it into a negative dcache entry.  To fix this, we mark the dentry as
800     "valid with d_delete pending"; the meaning of this state is "the dentry
801     is still valid, but a BR token revoke should mark it as 'needing
802     revalidation', even if it does not (yet) look like a negative dcache
803     entry".  Note that we don't want to mark "valid with d_delete pending"
804     entries as invalid in the BR revoke handler, because we don't know for
805     sure that the file is in fact going to be deleted.  The unlink operation
806     may fail, for any number of reasons, and the dentry should not be marked
807     as invalid prematurely.  It's safe to mark a dentry as 'needing
808     revalidation', however.  Ideally, we should swap d_op inside gpfsRemove
809     while we are holding the BR lock on the directory.  However, (1) there is
810     local synchronization in the VFS (our caller is holding the i_sem
811     semaphore on the directory) that will prevent other threads from doing a
812     lookup or create that might change the state back to just plain "valid"
813     before the gpfsRemove has happened, and (2) a BR revoke that happens
814     before the gpfsRemove might unnecessarily mark the dentry as 'needing
815     revalidation'; this is sub-optimal, but it doesn't hurt.  Also see
816     comment in gpfs_i_rmdir. */
817  orig_d_opP = dentryP->d_op;
818  dentryP->d_op = &gpfs_dops_ddeletepending;
819
820  setCred(&eCred);
821  rc = gpfs_ops.gpfsRemove(privVfsP, cnP, dcnP, (char *)dentryP->d_name.name,
822                           &eCred);
823  if (rc)
824  {
825    d_drop(dentryP);
826    if (dentryP->d_op == &gpfs_dops_ddeletepending)
827      dentryP->d_op = orig_d_opP;
828    goto xerror;
829  }
830  diP->i_sb->s_dirt = 1;
831
832  /* d_delete will be called at VFS layer if rc == 0 */
833
834xerror:
835  PRINTINODE(iP);
836  PRINTDENTRY(dentryP);
837  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_UNLINK_EXIT,
838         "gpfs_i_unlink exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc);
839
840  if (rc)
841    cxiErrorNFS(rc);
842
843  VFS_STAT_STOP;
844  EXIT(0);
845  return -rc;
846}
847
848int
849gpfs_i_symlink(struct inode *diP, struct dentry *dentryP,
850               const char *symlinkTargetP)
851{
852  int rc = 0;
853  cxiNode_t *dcnP;
854  cxiNode_t *cnP;
855  cxiIno_t iNum = (cxiIno_t)-1;
856  struct inode *newInodeP = NULL;
857  struct gpfsVfsData_t *privVfsP;
858  ext_cred_t eCred;
859
860  VFS_STAT_START(symlinkCall);
861  ENTER(0);
862  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK1,
863         "gpfs_i_symlink enter: iP 0x%lX dentryP 0x%lX symlinkTargetP '%s'\n",
864         diP, dentryP, symlinkTargetP);
865  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK2,
866         "gpfs_i_symlink: newLinkName '%s'\n", dentryP->d_name.name);
867  /* BKL is held at entry */
868
869  dcnP = VP_TO_CNP(diP);
870  privVfsP = VP_TO_PVP(diP);
871  LOGASSERT(privVfsP != NULL);
872
873  setCred(&eCred);
874  rc = gpfs_ops.gpfsSymlink(privVfsP, dcnP, (void **)&newInodeP, &cnP,
875                            &iNum, dentryP, (char *)dentryP->d_name.name,
876                            (char *)symlinkTargetP, &eCred);
877  if (rc == 0)
878  {
879    DBGASSERT(cnP != NULL);
880    DBGASSERT(iNum != -1);
881    DBGASSERT(newInodeP != NULL);
882    DBGASSERT(newInodeP->PRVINODE == cnP);
883    DBGASSERT(cnP->osNodeP == (void *)newInodeP);
884  }
885  else
886  {
887    d_drop(dentryP);
888    goto xerror;
889  }
890  diP->i_sb->s_dirt = 1;
891
892xerror:
893  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_SYMLINK_EXIT,
894         "gpfs_i_symlink exit: new inode 0x%lX iNum %d (0x%X) rc %d\n",
895         newInodeP, iNum, iNum, rc);
896
897  if (rc)
898    cxiErrorNFS(rc);
899
900  VFS_STAT_STOP;
901  EXIT(0);
902  return -rc;
903}
904
905int
906gpfs_i_mkdir(struct inode *diP, struct dentry *dentryP, int mode)
907{
908  int rc = 0;
909  struct gpfsVfsData_t *privVfsP;
910  cxiNode_t *dcnP;
911  cxiNode_t *cnP;
912  cxiMode_t umask;
913  ext_cred_t eCred;
914  cxiIno_t iNum = (cxiIno_t)-1;
915  struct inode *newInodeP = NULL;
916 
917  VFS_STAT_START(mkdirCall);
918  ENTER(0);
919  umask = get_umask();  /* LFS should not apply umask and we may not */
920
921  dcnP = VP_TO_CNP(diP);
922  privVfsP = VP_TO_PVP(diP);
923  LOGASSERT(privVfsP != NULL);
924
925  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_MKDIR_ENTER,
926         "gpfs_i_mkdir enter: diP 0x%lX mode 0x%X name '%s'\n",
927         diP, mode, dentryP->d_name.name);
928  /* BKL is held at entry */
929
930  setCred(&eCred);
931  rc = gpfs_ops.gpfsMkdir(privVfsP, dcnP, (void **)&newInodeP, &cnP, &iNum,
932                          dentryP, (char *)dentryP->d_name.name, mode, umask,
933                          &eCred);
934
935  if (rc == 0)
936  {
937    DBGASSERT(cnP != NULL);
938    DBGASSERT(iNum != -1);
939    DBGASSERT(newInodeP != NULL);
940    DBGASSERT(newInodeP->PRVINODE == cnP);
941    DBGASSERT(cnP->osNodeP == (void *)newInodeP);
942  }
943  else
944  {
945    d_drop(dentryP);
946    goto xerror;
947  }
948  diP->i_sb->s_dirt = 1;
949
950xerror:
951  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKDIR_EXIT,
952         "gpfs_i_mkdir exit: new inode 0x%lX iNum %d (0x%X) rc %d\n",
953         newInodeP, iNum, iNum, rc);
954
955  if (rc)
956    cxiErrorNFS(rc);
957
958  VFS_STAT_STOP;
959  EXIT(0);
960  return -rc;
961}
962
963int
964gpfs_i_rmdir(struct inode *diP, struct dentry *dentryP)
965{
966  int rc;
967  struct inode *iP = dentryP->d_inode;
968  cxiNode_t *dcnP;
969  cxiNode_t *cnP;
970  struct gpfsVfsData_t *privVfsP;
971  ext_cred_t eCred;
972  struct dentry_operations *orig_d_opP;
973
974  VFS_STAT_START(rmdirCall);
975  ENTER(0);
976  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_RMDIR_ENTER,
977         "gpfs_i_rmdir enter: diP 0x%lX iP 0x%lX name '%s'\n",
978         diP, iP, dentryP->d_name.name);
979  /* BKL is held at entry */
980
981  cnP = VP_TO_CNP(iP);
982  dcnP = VP_TO_CNP(diP);
983  privVfsP = VP_TO_PVP(diP);
984  LOGASSERT(privVfsP != NULL);
985
986  /* See comment in gpfs_i_unlink.  Note that Linux kernel processes
987     directory dentries a little differently from regular file
988     dentries.  In particular, it doesn't appear that a successful
989     rmdir call results in the removed directory dentry being turned
990     into a valid negative dentry; the dentry just gets unhashed and
991     recycled if it had no references at the time of rmdir.  If the
992     dentry did have extra references, e.g. due to a process using the
993     directory in question as cwd, the dentry is unhashed, but it
994     remains a positive dentry pointing to the deleted inode, and will
995     remain as such until the dentry ref count goes to zero, at which
996     point the dentry is recycled.  So there's no apparent need to
997     mark directory dentries as 'needing revalidation' during BR token
998     revoke (we do know that we need to do this for regular files).
999     However, this particular aspect of Linux kernel operation is not
1000     guaranteed to always work in this fashion, so we might as well
1001     try to stay on the safe side of things, and treat directories the
1002     same way as regular files.  It doesn't appear that marking a
1003     dentry as 'needing revalidation' has any ill effects besides extra
1004     cycles required for revalidation, and BR token revoke handler
1005     racing with an unsuccessful gpfsRmdir is a rare enough event to
1006     tolerate this extra performance hit. */
1007  orig_d_opP = dentryP->d_op;
1008  dentryP->d_op = &gpfs_dops_ddeletepending;
1009
1010  setCred(&eCred);
1011  rc = gpfs_ops.gpfsRmdir(privVfsP, cnP, dcnP, (char *)dentryP->d_name.name,
1012                          &eCred);
1013  if (rc)
1014  {
1015    if (rc == EEXIST)
1016      rc = ENOTEMPTY;
1017    if (dentryP->d_op == &gpfs_dops_ddeletepending)
1018      dentryP->d_op = orig_d_opP;
1019    /* d_drop(dentryP); */
1020    goto xerror;
1021  }
1022  diP->i_sb->s_dirt = 1;
1023
1024  /* d_delete will be called at VFS layer if rc == 0 */
1025xerror:
1026  PRINTINODE(iP);
1027  PRINTDENTRY(dentryP);
1028  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_RMDIR_EXIT,
1029         "gpfs_i_rmdir exit: diP 0x%lX iP 0x%lX rc %d\n", diP, iP, rc);
1030
1031  if (rc)
1032    cxiErrorNFS(rc);
1033
1034  VFS_STAT_STOP;
1035  EXIT(0);
1036  return -rc;
1037}
1038
1039int
1040#if LINUX_KERNEL_VERSION >= 2050000
1041gpfs_i_mknod(struct inode *diP, struct dentry *dentryP, int mode, dev_t rdev)
1042#else
1043gpfs_i_mknod(struct inode *diP, struct dentry *dentryP, int mode, int rdev)
1044#endif
1045{
1046  int rc = 0;
1047  struct gpfsVfsData_t *privVfsP;
1048  cxiNode_t *dcnP;
1049  cxiNode_t *cnP;
1050  cxiIno_t iNum = (cxiIno_t)-1;
1051  struct inode *newInodeP = NULL;
1052  cxiMode_t umask = get_umask();
1053  ext_cred_t eCred;
1054  cxiDev32_t rdev32;
1055
1056  VFS_STAT_START(mknodCall);
1057  ENTER(0);
1058  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKNOD_ENTER,
1059         "gpfs_i_mknod enter: diP 0x%lX mode 0x%X rdev 0x%X name '%s'\n",
1060         diP, mode, (int)rdev, dentryP->d_name.name);
1061  /* BKL is held at entry */
1062
1063  dcnP = VP_TO_CNP(diP);
1064  privVfsP = VP_TO_PVP(diP);
1065  LOGASSERT(privVfsP != NULL);
1066
1067  setCred(&eCred);
1068  rdev32 = cxiDevToDev32(rdev);
1069  rc = gpfs_ops.gpfsMknod(privVfsP, dcnP, (void **)&newInodeP, &cnP,
1070                          &iNum, dentryP, (char *)dentryP->d_name.name,
1071                          mode, umask, (cxiDev_t)rdev32, &eCred);
1072  if (rc == 0)
1073  {
1074    DBGASSERT(cnP != NULL);
1075    DBGASSERT(iNum != -1);
1076    DBGASSERT(newInodeP != NULL);
1077    DBGASSERT(newInodeP->PRVINODE == cnP);
1078    DBGASSERT(cnP->osNodeP == (void *)newInodeP);
1079  }
1080  else
1081  {
1082    d_drop(dentryP);
1083    goto xerror;
1084  }
1085  diP->i_sb->s_dirt = 1;
1086
1087  /* Set vector table for special files, gpfs will not get these operations.*/
1088#if LINUX_KERNEL_VERSION >= 2060000
1089  init_special_inode(newInodeP, newInodeP->i_mode, newInodeP->i_rdev);
1090#else
1091  init_special_inode(newInodeP, newInodeP->i_mode,
1092                     kdev_t_to_nr(newInodeP->i_rdev));
1093#endif
1094
1095xerror:
1096  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MKNOD_EXIT,
1097         "gpfs_i_mknod exit: new inode 0x%lX iNum %d (0x%X) rc %d\n",
1098         newInodeP, iNum, iNum, rc);
1099 
1100  VFS_STAT_STOP;
1101  EXIT(0);
1102  return -rc;
1103}
1104
1105int
1106gpfs_i_rename(struct inode *diP, struct dentry *dentryP,
1107              struct inode *tdiP, struct dentry *tDentryP)
1108{
1109  int rc = 0;
1110  struct inode *iP = dentryP->d_inode;
1111  struct inode *tiP = tDentryP->d_inode;
1112  struct gpfsVfsData_t *privVfsP;
1113  cxiNode_t *sourceCNP, *sourceDirCNP, *targetCNP, *targetDirCNP;
1114  ext_cred_t eCred;
1115 
1116  VFS_STAT_START(renameCall);
1117  ENTER(0);
1118  TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_RENAME_1,
1119         "gpfs_i_rename enter: iP 0x%lX dvP 0x%lX name '%s'"
1120         " tiP 0x%lX tdiP 0x%lX new name '%s'\n",
1121         iP, diP, dentryP->d_name.name, tiP, tdiP, tDentryP->d_name.name);
1122  /* BKL is held at entry */
1123
1124  /* Do not allow simple rename across mount points */
1125  if (diP->i_sb != tdiP->i_sb)
1126  {
1127    rc = EXDEV;
1128    goto xerror;
1129  }
1130
1131  sourceCNP = VP_TO_CNP(iP);
1132  sourceDirCNP = VP_TO_CNP(diP);
1133
1134  targetCNP = (tiP != NULL) ? VP_TO_CNP(tiP) : NULL;
1135  targetDirCNP = VP_TO_CNP(tdiP);
1136
1137  privVfsP = VP_TO_PVP(iP);
1138  LOGASSERT(privVfsP != NULL);
1139
1140  setCred(&eCred);
1141  rc = gpfs_ops.gpfsRename(privVfsP, sourceCNP, sourceDirCNP,
1142                           (char *)dentryP->d_name.name, targetCNP,
1143                           targetDirCNP, (char *)tDentryP->d_name.name,
1144                           &eCred);
1145  if (rc == 0)
1146  {
1147    gpfs_i_getattr_internal(iP);
1148    gpfs_i_getattr_internal(diP);
1149
1150    if (tiP)
1151      gpfs_i_getattr_internal(tiP);
1152
1153    if (tdiP != diP)
1154      gpfs_i_getattr_internal(tdiP);
1155
1156    diP->i_sb->s_dirt = 1;
1157  }
1158
1159xerror:
1160  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_RENAME_EXIT,
1161         "gpfs_i_rename exit: iP 0x%lX rc %d\n", iP, rc);
1162
1163  if (rc)
1164    cxiErrorNFS(rc);
1165
1166  VFS_STAT_STOP;
1167  EXIT(0);
1168  return -rc;
1169}
1170
1171int
1172gpfs_i_readlink(struct dentry *dentryP, char *bufP, int buflen)
1173{
1174  int rc = 0;
1175  Boolean gotBKL = false;
1176  struct cxiUio_t tmpUio;
1177  cxiIovec_t tmpIovec;
1178  struct inode *iP = dentryP->d_inode;
1179  struct gpfsVfsData_t *privVfsP;
1180  cxiNode_t *cnP;
1181 
1182  VFS_STAT_START(readlinkCall);
1183  ENTER(0);
1184  TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_READLINK_ENTER,
1185         "gpfs_i_readlink enter: dentryP 0x%lX bufP 0x%lX len %d "
1186           "iP 0x%lX name '%s'\n",
1187         dentryP, bufP, buflen, iP, dentryP->d_name.name);
1188
1189  /* BKL is not held at entry, except for NFS calls */
1190  TraceBKL();
1191  if (current->lock_depth >= 0)  /* kernel lock is held by me */
1192  {
1193    gotBKL = true;
1194    unlock_kernel();
1195  }
1196
1197  cnP = VP_TO_CNP(iP);
1198  privVfsP = VP_TO_PVP(iP);
1199  LOGASSERT(privVfsP != NULL);
1200
1201  tmpIovec.iov_base = bufP;          /* base memory address                   */
1202  tmpIovec.iov_len = buflen;         /* length of transfer for this area      */
1203
1204  tmpUio.uio_iov = &tmpIovec;        /* ptr to array of iovec structs         */
1205  tmpUio.uio_iovcnt = 1;             /* #iovec elements left to be processed  */
1206  tmpUio.uio_iovdcnt = 0;            /* #iovec elements already processed     */
1207  tmpUio.uio_offset = 0;             /* byte offset in file/dev to read/write */
1208  tmpUio.uio_resid = buflen;         /* #bytes left in data area              */
1209  tmpUio.uio_segflg = UIO_USERSPACE; /* copy to user space buffer             */
1210  tmpUio.uio_fmode = 0;              /* file modes from open file struct      */
1211
1212  rc = gpfs_ops.gpfsReadlink(privVfsP, cnP, &tmpUio);
1213
1214  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READLINK_EXIT,
1215        "gpfs_i_readlink exit: iP 0x%lX uio_resid %ld offset %d rc %d\n",
1216         iP, tmpUio.uio_resid, tmpUio.uio_offset, rc);
1217
1218  VFS_STAT_STOP;
1219
1220  if (gotBKL)        /* If held kernel lock on entry then reacquire it */
1221    lock_kernel();
1222
1223  if (rc)
1224    cxiErrorNFS(rc);
1225
1226  EXIT(0);
1227  if (rc)
1228    return (-rc);
1229
1230  return (buflen - tmpUio.uio_resid);
1231}
1232
1233#if LINUX_KERNEL_VERSION >= 2061600
1234void* gpfs_i_follow_link(struct dentry *dentry, struct nameidata *nd)
1235#else
1236int gpfs_i_follow_link(struct dentry *dentry, struct nameidata *nd)
1237#endif
1238{
1239  int rc;
1240  Boolean gotBKL = false;
1241  struct cxiUio_t tmpUio;
1242  cxiIovec_t tmpIovec;
1243  struct inode *iP = dentry->d_inode;
1244  struct gpfsVfsData_t *privVfsP;
1245  cxiNode_t *cnP;
1246  char *buf = NULL;
1247
1248  ENTER(0);
1249  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FOLLOW_LINK_ENTER,
1250         "gpfs_i_follow_link enter: inode 0x%lX name '%s'\n",
1251         dentry->d_inode, dentry->d_name.name);
1252
1253  /* BKL is not held at entry, except for NFS calls */
1254  TraceBKL();
1255  if (current->lock_depth >= 0)  /* kernel lock is held by me */
1256  {
1257    gotBKL = true;
1258    unlock_kernel();
1259  }
1260
1261  /* Allocate a temporary buffer to hold the symlink contents */
1262  buf = cxiMallocPinned(CXI_PATH_MAX+1);
1263  if (buf == NULL)
1264  {
1265    rc = -ENOMEM;
1266    goto xerror;
1267  }
1268
1269  cnP = VP_TO_CNP(iP);
1270  privVfsP = VP_TO_PVP(iP);
1271  LOGASSERT(privVfsP != NULL);
1272
1273  tmpIovec.iov_base = buf;          /* base memory address                   */
1274  tmpIovec.iov_len = PATH_MAX;      /* length of transfer for this area      */
1275
1276  tmpUio.uio_iov = &tmpIovec;       /* ptr to array of iovec structs         */
1277  tmpUio.uio_iovcnt = 1;            /* #iovec elements left to be processed  */
1278  tmpUio.uio_iovdcnt = 0;           /* #iovec elements already processed     */
1279  tmpUio.uio_offset = 0;            /* byte offset in file/dev to read/write */
1280  tmpUio.uio_resid = PATH_MAX;      /* #bytes left in data area              */
1281  tmpUio.uio_segflg = UIO_SYSSPACE; /* copy to kernel space buffer           */
1282  tmpUio.uio_fmode = 0;             /* file modes from open file struct      */
1283
1284  /* Read symlink contents */
1285  rc = gpfs_ops.gpfsReadlink(privVfsP, cnP, &tmpUio);
1286  if (rc)
1287  {
1288    cxiErrorNFS(rc);
1289    rc = -rc;
1290    goto xerror;
1291  }
1292 
1293  /* set end of string */
1294  buf[PATH_MAX - tmpUio.uio_resid] = 0;
1295
1296  TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_FOLLOW_LINK_1,
1297         "gpfs_i_follow_link readlink rc %d data '%s'\n", rc, buf);
1298
1299  VFS_FOLLOW_LINK(rc, nd, buf);
1300
1301exit:
1302  if (buf)
1303    cxiFreePinned(buf);
1304
1305  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FOLLOW_LINK_2,
1306         "gpfs_i_follow_link exit: inode 0x%lX rc %d\n",
1307         dentry->d_inode, rc);
1308
1309  if (gotBKL)        /* If held kernel lock on entry then reacquire it */
1310    lock_kernel();
1311
1312  EXIT(0);
1313
1314#if LINUX_KERNEL_VERSION >= 2061600
1315  return NULL;  /* no cookie */
1316#else
1317  return rc;
1318#endif
1319
1320xerror:
1321  path_release(nd);
1322  goto exit;
1323
1324}
1325
1326#ifdef HAS_IOP_PUT_LINK
1327
1328#if LINUX_KERNEL_VERSION >= 2061600
1329void gpfs_i_put_link(struct dentry *dentry, struct nameidata *nd, void* cookie)
1330#else
1331void gpfs_i_put_link(struct dentry *dentry, struct nameidata *nd)
1332#endif
1333{
1334  char *buf = nd_get_link(nd);
1335  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_PUTLINK,
1336        "gpfs_i_put_link dentry 0x%lX nd 0x%lX buf 0x%lX\n", dentry, nd, 
1337        !IS_ERR(buf)? buf : NULL);
1338  if (!IS_ERR(buf))
1339     cxiFreePinned(buf);
1340}
1341
1342#endif /* HAS_IOP_PUT_LINK */
1343
1344int
1345gpfs_i_bmap(struct inode *iP, int fragment)
1346{
1347  ENTER(0);
1348  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_BMAP,
1349         "gpfs_i_bmap: rc ENOSYS\n");
1350  TraceBKL();
1351  EXIT(0);
1352  return -ENOSYS;
1353}
1354
1355void
1356gpfs_i_truncate(struct inode *iP)
1357{
1358  ENTER(0);
1359  /* Nothing to do since the file size was updated on the notify_change
1360   * call which preceeded this call
1361   */
1362  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_TRUNCATE,
1363         "gpfs_i_truncate: inode 0x%lX\n", iP);
1364  TraceBKL();
1365  EXIT(0);
1366}
1367
1368int
1369gpfs_i_permission(struct inode *iP, int mode
1370#if LINUX_KERNEL_VERSION >= 2060000
1371                  , struct nameidata *ni
1372#endif
1373                  )
1374{
1375  cxiNode_t *cnP;
1376  struct gpfsVfsData_t *privVfsP;
1377  ext_cred_t eCred;
1378  int rc = 0;
1379
1380  VFS_STAT_START(accessCall);
1381  ENTER(0);
1382
1383  /* BKL is held at entry */
1384
1385  cnP = VP_TO_CNP(iP);
1386
1387  TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_ACCESS_ENTER,
1388         "gpfs_i_permission enter: iP 0x%lX mode 0x%X uid %d gid %d "
1389         "i_mode 0x%X i_xinfo 0x%X", iP, mode, current->fsuid, 
1390         current->fsgid, iP->i_mode, cnP->xinfo);
1391
1392  privVfsP = VP_TO_PVP(iP);
1393  LOGASSERT(privVfsP != NULL);
1394
1395  if (mode)        /* call permission check only if got access mode */
1396  {
1397    setCred(&eCred);
1398    rc = gpfs_ops.gpfsAccess(privVfsP, cnP, mode, ACC_SELF, &eCred);
1399  }
1400
1401xerror:
1402  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_ACCESS_EXIT,
1403         "gpfs_i_permission exit: iP 0x%lX std %d dir std %d rc %d",
1404         iP, iP->i_op == &gpfs_iops_stdperm, iP->i_op == &gpfs_dir_iops_stdperm,
1405         rc);
1406
1407  if (rc)
1408    cxiErrorNFS(rc);
1409
1410  VFS_STAT_STOP;
1411  EXIT(0);
1412  return -rc;
1413}
1414
1415int
1416gpfs_i_smap(struct inode *iP, int sector)
1417{
1418  ENTER(0);
1419  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_SMAP,
1420         "gpfs_i_smap: rc ENOSYS\n");
1421  TraceBKL();
1422  EXIT(0);
1423  return -ENOSYS;
1424}
1425
1426int
1427gpfs_i_updatepage(struct file *fP, struct page *pageP, const char *bufP,
1428                  unsigned long offset, uint count, int sync)
1429{
1430  ENTER(0);
1431  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_UPDATEPAGE,
1432         "gpfs_i_updatepage: rc ENOSYS\n");
1433  TraceBKL();
1434  EXIT(0);
1435  return -ENOSYS;
1436}
1437
1438int
1439gpfs_i_revalidate(struct dentry *dentryP)
1440{
1441  int rc;
1442  int code = 0;
1443  struct inode *iP = dentryP->d_inode;
1444  cxiNode_t *cnP;
1445  cxiVattr_t vattr;
1446  struct gpfsVfsData_t *privVfsP;
1447
1448  ENTER(0);
1449  VFS_INC(revalidateCount);
1450  TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_REVALIDATE_ENTER,
1451         "gpfs_i_revalidate enter: dentryP 0x%lX iP 0x%lX ino 0x%X name '%s'\n",
1452         dentryP, dentryP->d_inode, 
1453         (iP) ? iP->i_ino : -1,  dentryP->d_name.name);
1454  /* BKL is usually not held, but seems to be held when coming here as
1455     part of setting an ACL */
1456
1457  if (iP == NULL)
1458  {
1459    code = 1;
1460    rc = ENOENT;
1461    goto xerror;
1462  }
1463  cnP = VP_TO_CNP(iP);
1464
1465  if (!cnP)
1466  {
1467    /* This can happen due to a bug in linux/fs/dcache.c (prune_dcache)
1468       where "count" entries are to be pruned, but the last one is
1469       found to be recently referenced.  When this happens, count is
1470       decremented, but the loop is not terminated.  The result is that
1471       it continues to prune entries past where it should (prunes
1472       everything).  If our patch for this is not applied, the result
1473       is a kernel failure as the cxiNode is referenced.  Checking
1474       here (and lookup) allows us to reject the call instead. */
1475     
1476    TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REVALIDATE_STALE,
1477           "gpfs_i_revalidate: cxiNode for iP 0x%lX (ino %d) was FREED!\n",
1478           iP, iP->i_ino);
1479
1480    /* Although we may like to know more about this inode, it is not
1481     * ok to call PRINTINODE(iP) here.
1482     */
1483
1484    rc = ESTALE;
1485    code = 2;
1486    goto xerror;
1487  }
1488
1489  if ((cnP->icValid & CXI_IC_STAT) == CXI_IC_STAT)
1490  {
1491    rc = 0;
1492    code = 3;
1493    goto xerror;
1494  }
1495
1496  privVfsP = VP_TO_PVP(iP);
1497  LOGASSERT(privVfsP != NULL);
1498
1499  /* This has the effect of calling us back under a lock and
1500   * setting the inode attributes at the OS level (since this
1501   * operating system caches this info in the vfs layer)
1502   */
1503  rc = gpfs_ops.gpfsGetattr(privVfsP, cnP, &vattr, false);
1504  PRINTINODE(iP);
1505
1506#if 0
1507  /* Delay briefly to give token revoke races a chance to happen, if there
1508     are any.  Time delay is in jiffies (10ms). */
1509#  define howLong 5
1510  TRACE1(TRACE_VNODE, 4, TRCID_REVAL_DELAY,
1511         "gpfs_i_revalidate: begin delay %d\n", howLong);
1512  current->state = TASK_INTERRUPTIBLE;
1513  schedule_timeout(howLong);
1514  TRACE1(TRACE_VNODE, 14, TRCID_REVAL_DELAY_END,
1515         "gpfs_i_revalidate: end delay %d\n", howLong);
1516#endif
1517
1518xerror:
1519  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REVALIDATE_EXIT,
1520         "gpfs_i_revalidate exit: dentry 0x%lX code %d rc %d\n",
1521         dentryP, code, rc);
1522
1523  if (rc)
1524    cxiErrorNFS(rc);
1525
1526  EXIT(0);
1527  return -rc;
1528}
1529
1530int
1531gpfs_i_setattr(struct dentry *dentryP, struct iattr *iattrP)
1532{
1533  int rc;
1534
1535  VFS_STAT_START(setattrCall);
1536  ENTER(0);
1537  rc = gpfs_i_setattr_internal(dentryP->d_inode, iattrP);
1538
1539  VFS_STAT_STOP;
1540  EXIT(0);
1541  return -rc;
1542}
1543
1544int
1545gpfs_i_setattr_internal(struct inode *iP, struct iattr *aP)
1546{
1547  int rc = 0;
1548  int code = 0;
1549  long arg1;      /* must be large enough on 64bit to contain */
1550  long arg2;      /*   either a pointer or integer            */
1551  long arg3;
1552  cxiTimeStruc_t atime, mtime, ctime;
1553  cxiNode_t *cnP;
1554  struct gpfsVfsData_t *privVfsP;
1555  ext_cred_t eCred;
1556  unsigned int ia_valid;
1557
1558  ENTER(0);
1559  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_SETATTR_ENTER,
1560         "gpfs_i_setattr enter: iP 0x%lX ia_valid 0x%X\n", iP, aP->ia_valid);
1561  /* ?? Callers of this are inconsistent about whether the BKL is held */
1562
1563  cnP = VP_TO_CNP(iP);
1564  privVfsP = VP_TO_PVP(iP);
1565  LOGASSERT(privVfsP != NULL);
1566
1567  ia_valid = aP->ia_valid;
1568
1569  /* Change file size */
1570  if (ia_valid & ATTR_SIZE)
1571  {
1572    arg1 = (long)&aP->ia_size;
1573    arg2 = 0;
1574    arg3 = 0;
1575
1576    /* call gpfsSetattr, unless we know that new size is the same */
1577    if (!(cnP->icValid & CXI_IC_ATTR) ||
1578        ((struct inode *)cnP->osNodeP)->i_size != aP->ia_size)
1579    {
1580      setCred(&eCred); // rebuild since gpfsSetattr may remap ids
1581      rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_SIZE, arg1, arg2, arg3,
1582                                &eCred);
1583      if (rc != 0)
1584      {
1585        code = 1;
1586        goto xerror;
1587      }
1588
1589      /* gpfsSetattr(... V_SIZE ...) will have updated ctime and mtime.
1590         No need to do this again. */
1591      ia_valid &= ~(ATTR_MTIME | ATTR_CTIME);
1592    }
1593  }
1594
1595  /* Change file mode */
1596  if (ia_valid & ATTR_MODE)
1597  {
1598    arg1 = (long)aP->ia_mode;
1599    arg2 = 0;
1600    arg3 = 0;
1601
1602    setCred(&eCred); // rebuild since gpfsSetattr may remap ids
1603    rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_MODE, arg1, arg2, arg3, &eCred);
1604    if (rc != 0)
1605    {
1606      code = 2;
1607      goto xerror;
1608    }
1609  }
1610
1611  /* Change uid or gid */
1612  if (ia_valid & (ATTR_UID | ATTR_GID))
1613  {
1614    arg1 = 0;
1615    arg2 = 0;
1616    arg3 = 0;
1617
1618    if (ia_valid & ATTR_UID)
1619      arg2 = (long)aP->ia_uid;
1620    else
1621      arg1 |= T_OWNER_AS_IS;
1622
1623    if (ia_valid & ATTR_GID)
1624      arg3 = (long)aP->ia_gid;
1625    else
1626      arg1 |= T_GROUP_AS_IS;
1627
1628    setCred(&eCred); // rebuild since gpfsSetattr may remap ids
1629    rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_OWN, arg1, arg2, arg3, &eCred);
1630    if (rc != 0)
1631    {
1632      code = 3;
1633      goto xerror;
1634    }
1635  }
1636
1637  /* Change access, modification, or change time */
1638  if (ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
1639  {
1640    arg1 = 0;
1641    arg2 = 0;
1642    arg3 = 0;
1643   
1644    if (ia_valid & ATTR_ATIME)
1645    {
1646      CXITIME_FROM_INODETIME(atime, aP->ia_atime);
1647      arg1 = (long)&atime;
1648    }
1649    if (ia_valid & ATTR_MTIME)
1650    {
1651      CXITIME_FROM_INODETIME(mtime, aP->ia_mtime);
1652      arg2 = (long)&mtime;
1653    }
1654    if (ia_valid & ATTR_CTIME)
1655    {
1656      CXITIME_FROM_INODETIME(ctime, aP->ia_ctime);
1657      arg3 = (long)&ctime;
1658    }
1659    setCred(&eCred); // rebuild since gpfsSetattr may remap ids
1660    rc = gpfs_ops.gpfsSetattr(privVfsP, cnP, V_STIME, arg1, arg2, arg3, &eCred);
1661    if (rc != 0)
1662    {
1663      code = 4;
1664      goto xerror;
1665    }
1666  }
1667
1668xerror:
1669
1670  if (rc == 0)
1671  {
1672    /* For NFS we might need to write the inode but the check will be done
1673     * in gpfsSyncNFS().
1674     */
1675    if (cxiAllowNFSFsync())
1676    {
1677      setCred(&eCred); // rebuild since gpfsSetattr may remap ids
1678      rc = gpfs_ops.gpfsSyncNFS(privVfsP, cnP, 0, &eCred);
1679    }
1680
1681    iP->i_sb->s_dirt = 1;
1682  }
1683  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_SETATTR_EXIT,
1684         "gpfs_i_setattr exit: iP 0x%lX code %d rc %d\n", iP, code, rc);
1685
1686  if (rc)
1687    cxiErrorNFS(rc);
1688
1689  EXIT(0);
1690  return rc;
1691}
1692
1693#if LINUX_KERNEL_VERSION >= 2050000
1694int
1695gpfs_i_getattr(struct vfsmount *mntP, struct dentry *dentryP, 
1696               struct kstat *kstatP)
1697#else
1698int
1699gpfs_i_getattr(struct dentry *dentryP, struct iattr *iattrP)
1700#endif
1701{
1702  int rc;
1703  struct inode *iP = dentryP->d_inode;
1704  cxiNode_t *cnP;
1705
1706  VFS_STAT_START(getattrCall);
1707  ENTER(0);
1708
1709  cnP = VP_TO_CNP(iP);
1710
1711  if (cnP && ((cnP->icValid & CXI_IC_STAT) == CXI_IC_STAT)) /* attr are vaild */
1712    rc = 0;
1713  else
1714    rc = gpfs_i_getattr_internal(iP);
1715
1716  if (!rc)
1717#if LINUX_KERNEL_VERSION >= 2050000
1718    generic_fillattr(iP, kstatP);
1719#else
1720    getIattr(iP, iattrP);
1721#endif
1722  else
1723    rc = -rc;
1724
1725  VFS_STAT_STOP;
1726  EXIT(0);
1727  return rc;
1728}
1729
1730int
1731gpfs_i_getattr_internal(struct inode *iP)
1732{
1733  int rc = 0;
1734  cxiNode_t *cnP;
1735  struct gpfsVfsData_t *privVfsP;
1736  cxiVattr_t vattr;
1737
1738  ENTER(0);
1739  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_GETATTR_ENTER,
1740         "gpfs_i_getattr enter: iP 0x%lX\n", iP);
1741  /* BKL is held at entry */
1742
1743  privVfsP = VP_TO_PVP(iP);
1744  LOGASSERT(privVfsP != NULL);
1745  cnP = VP_TO_CNP(iP);
1746
1747  /* This has the effect of calling us back under a lock and
1748   * setting the inode attributes at the OS level (since this
1749   * operating system caches this info in the vfs layer)
1750   */
1751  rc = gpfs_ops.gpfsGetattr(privVfsP, cnP, &vattr, false);
1752  PRINTINODE(iP);
1753
1754  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETATTR_EXIT,
1755         "gpfs_i_getattr exit: iP 0x%lX rc %d\n", iP, rc);
1756
1757  if (rc)
1758    cxiErrorNFS(rc);
1759
1760  EXIT(0);
1761  return rc;
1762}
1763
1764#if LINUX_KERNEL_VERSION > 2060000
1765#include <cxiAclUser.h>
1766
1767#define XATTR_SECURITY_PREFIX "security."
1768#define XATTR_TRUSTED_PREFIX "trusted."
1769#define XATTR_USER_PREFIX "user."
1770#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access"
1771#define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
1772
/*
 * Check whether name starts with prefix.
 *
 * Returns a pointer to the first character of name following the prefix
 * (which may be the terminating NUL), or NULL if name does not begin
 * with prefix.  An empty prefix matches and returns name itself.
 */
static const char *
test_prefix(const char *name, const char *prefix)
{
  size_t i;

  for (i = 0; prefix[i] != '\0'; i++)
  {
    /* Also catches name ending early: its NUL differs from prefix[i] */
    if (name[i] != prefix[i])
      return NULL;
  }
  return name + i;
}
1782
1783/*
1784 * Inode operation getxattr()
1785 *
1786 */
1787ssize_t
1788gpfs_i_getxattr(struct dentry *dentry, const char *name, void *buf,
1789                size_t buf_size)
1790{
1791  int rc;
1792  cxiNode_t *cnP;
1793  struct gpfsVfsData_t *privVfsP;
1794  struct tsxattr xattr;
1795  struct tsxattrs xattrs;
1796  ext_cred_t eCred;
1797  void *argP = &xattrs;
1798  int flags = 0;
1799  struct inode *iP = dentry->d_inode;
1800  mm_segment_t oldfs;
1801  const char *n;
1802
1803  ENTER(0);
1804  VFS_STAT_START(getxattrCall);
1805
1806  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_ENTER,
1807         "gpfs_i_getxattr enter: iP 0x%lX name %s buf 0x%lX size %d\n",
1808         iP, (name) ? name : "NULL", buf, buf_size);
1809
1810  if (iP == NULL)
1811  {
1812    rc = ENOENT;
1813    goto xerror;
1814  }
1815
1816#ifdef CONFIG_FS_POSIX_ACL
1817  if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) {
1818    if (n && (strcmp(n, "") != 0)) {
1819      rc = EINVAL;
1820      goto xerror;
1821    }
1822    rc = gpfs_get_posix_acl(dentry, ACL_TYPE_ACCESS, buf, buf_size);
1823    goto xerror2;
1824  }
1825  if (S_ISDIR(iP->i_mode))
1826  {
1827    if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) {
1828      if (n && (strcmp(n, "") != 0)) {
1829        rc = EINVAL;
1830        goto xerror;
1831      }
1832      rc = gpfs_get_posix_acl(dentry, ACL_TYPE_DEFAULT, buf, buf_size);
1833      goto xerror2;
1834    }
1835  }
1836#endif
1837  if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) {
1838    if (n && (strcmp(n, "") == 0)) {
1839      rc = EINVAL;
1840      goto xerror;
1841    }
1842    goto xattr;
1843  }
1844  if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) {
1845    if (n && (strcmp(n, "") == 0)) {
1846      rc = EINVAL;
1847      goto xerror;
1848    }
1849    if (!capable(CAP_SYS_ADMIN)) {
1850      rc = EPERM;
1851      goto xerror;
1852    }
1853    goto xattr;
1854  }
1855  if (n = test_prefix(name, XATTR_USER_PREFIX)) {
1856    if (n && (strcmp(n, "") == 0)) {
1857      rc = EINVAL;
1858      goto xerror;
1859    }
1860    goto xattr;
1861  }
1862  rc = EOPNOTSUPP;
1863  goto xerror;
1864
1865xattr:
1866  setCred(&eCred);
1867  xattrs.appId = 3;       // application id GPFS_ATTR_INTERNAL_APPL_ID
1868  xattrs.nattrs = 1;      // no of attributes to get or set
1869  xattrs.attrs = &xattr;  // attributes to get or set
1870
1871  xattr.keyP = (char*) name;        // attribute key
1872  xattr.keyLen = strlen(name) + 1;  // key length
1873  xattr.valueP = buf;               // attribute value
1874  xattr.valueLen = buf_size;        // length of attribute value
1875
1876  privVfsP = VP_TO_PVP(iP);
1877  LOGASSERT(privVfsP != NULL);
1878  cnP = VP_TO_CNP(iP);
1879
1880  oldfs = get_fs();
1881  set_fs(get_ds());
1882
1883  rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, GET_XATTR, argP,
1884                          NULL, &eCred);
1885
1886  set_fs(oldfs);
1887  if (!rc)
1888  {
1889    TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT0,
1890           "gpfs_i_getxattr exit: iP 0x%lX len %d\n", iP, xattr.valueLen);
1891    VFS_STAT_STOP;
1892    EXIT(0);
1893    if (xattr.valueLen < 0)
1894      rc = ENODATA;
1895    else
1896      return (xattr.valueLen);
1897  }
1898
1899xerror:
1900  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT,
1901         "gpfs_i_getxattr exit: iP 0x%lX rc %d\n", iP, rc);
1902
1903  if (rc)
1904    cxiErrorNFS(rc);
1905
1906  VFS_STAT_STOP;
1907  EXIT(0);
1908  return (-rc);
1909
1910xerror2:
1911  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_GETEXTATTR_EXIT2,
1912         "gpfs_i_getxattr exit2: iP 0x%lX rc %d\n", iP, rc);
1913
1914  if (rc)
1915    cxiErrorNFS(rc);
1916
1917  VFS_STAT_STOP;
1918  EXIT(0);
1919  return (rc);
1920}
1921
1922/*
1923 * Inode operation setxattr()
1924 *
1925 */
1926int
1927gpfs_i_setxattr(struct dentry *dentry, const char *name, const void *buf,
1928                size_t buf_size, int ext_flags)
1929{
1930  int rc;
1931  cxiNode_t *cnP;
1932  struct gpfsVfsData_t *privVfsP;
1933  struct tsxattr xattr;
1934  struct tsxattrs xattrs;
1935  ext_cred_t eCred;
1936  void *argP = &xattrs;
1937  int flags = 0;
1938  struct inode *iP = dentry->d_inode;
1939  mm_segment_t oldfs;
1940  const char *n;
1941
1942  ENTER(0);
1943  VFS_STAT_START(setxattrCall);
1944
1945  TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_SETEXTATTR_ENTER,
1946         "gpfs_i_setxattr enter: iP 0x%lX name %s buf 0x%lX size %d flags 0x%X\n",
1947         iP, (name) ? name : "NULL", buf, buf_size, ext_flags);
1948
1949  if (iP == NULL)
1950  {
1951    rc = ENOENT;
1952    goto xerror;
1953  }
1954
1955#ifdef CONFIG_FS_POSIX_ACL
1956  if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) {
1957    if (n && (strcmp(n, "") != 0)) {
1958      rc = EINVAL;
1959      goto xerror;
1960    }
1961    if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER))
1962      return EPERM;
1963    rc = gpfs_set_posix_acl(dentry, ACL_TYPE_ACCESS, buf, buf_size);
1964    goto xerror;
1965  }
1966  if (S_ISDIR(iP->i_mode))
1967  {
1968    if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) {
1969      if (n && (strcmp(n, "") != 0)) {
1970        rc = EINVAL;
1971        goto xerror;
1972      }
1973      if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER))
1974        return EPERM;
1975      rc = gpfs_set_posix_acl(dentry, ACL_TYPE_DEFAULT, buf, buf_size);
1976      goto xerror;
1977    }
1978  }
1979#endif
1980  if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) {
1981    if (n && (strcmp(n, "") == 0)) {
1982      rc = EINVAL;
1983      goto xerror;
1984    }
1985    goto xattr;
1986  }
1987  if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) {
1988    if (n && (strcmp(n, "") == 0)) {
1989      rc = EINVAL;
1990      goto xerror;
1991    }
1992    if (!capable(CAP_SYS_ADMIN)) {
1993      rc = EPERM;
1994      goto xerror;
1995    }
1996    goto xattr;
1997  }
1998  if (n = test_prefix(name, XATTR_USER_PREFIX)) {
1999    if (n && (strcmp(n, "") == 0)) {
2000      rc = EINVAL;
2001      goto xerror;
2002    }
2003    goto xattr;
2004  }
2005  rc = EOPNOTSUPP;
2006  goto xerror;
2007
2008xattr:
2009  setCred(&eCred);
2010  xattrs.appId = 3;       // application id GPFS_ATTR_INTERNAL_APPL_ID
2011  xattrs.nattrs = 1;      // no of attributes to get or set
2012  xattrs.attrs = &xattr;  // attributes to get or set
2013
2014  xattr.keyP = (char*) name;            // attribute key
2015  xattr.keyLen = strlen(name) + 1;      // key length
2016  xattr.valueP = (char *)buf;           // attribute value
2017  xattr.valueLen = buf_size;            // length of attribute value
2018
2019  privVfsP = VP_TO_PVP(iP);
2020  LOGASSERT(privVfsP != NULL);
2021  cnP = VP_TO_CNP(iP);
2022
2023  oldfs = get_fs();
2024  set_fs(get_ds());
2025
2026  rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, SET_XATTR, argP,
2027                          NULL, &eCred);
2028  set_fs(oldfs);
2029xerror:
2030  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_SETEXTATTR_EXIT,
2031         "gpfs_i_setxattr exit: iP 0x%lX rc %d\n", iP, rc);
2032
2033  if (rc)
2034    cxiErrorNFS(rc);
2035
2036  VFS_STAT_STOP;
2037  EXIT(0);
2038  return (-rc);
2039}
2040
2041/*
2042 * Inode operation listxattr()
2043 *
2044 * Copy a list of attribute names into the buffer
2045 * provided, or compute the buffer size required.
2046 * Buffer is NULL to compute the size of the buffer required.
2047 *
2048 * Returns a negative error number on failure, or the number of bytes
2049 * used / required on success.
2050 */
2051ssize_t
2052gpfs_i_listxattr(struct dentry *dentry, char *buf, size_t buf_size)
2053{
2054  int rc;
2055  cxiNode_t *cnP;
2056  struct gpfsVfsData_t *privVfsP;
2057  struct tsxattr xattr;
2058  struct tsxattrs xattrs;
2059  ext_cred_t eCred;
2060  void *argP = &xattrs;
2061  int flags = 0;
2062  struct inode *iP = dentry->d_inode;
2063  mm_segment_t oldfs;
2064
2065  ENTER(0);
2066  VFS_STAT_START(listxattrCall);
2067
2068  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXATTR_IN,
2069         "gpfs_i_listxattr enter: iP 0x%lX buf 0x%lX buf_size %d\n",
2070          iP, buf, buf_size);
2071
2072
2073  if (iP == NULL)
2074  {
2075    rc = ENOENT;
2076    goto xerror;
2077  }
2078  setCred(&eCred);
2079  xattrs.appId = 3;       // application id GPFS_ATTR_INTERNAL_APPL_ID
2080  xattrs.nattrs = 0;      // get all attribute name
2081  xattrs.attrs = &xattr;  // attributes to get or set
2082
2083  xattr.keyP = NULL;            // attribute key
2084  xattr.keyLen = 0;             // key length
2085  xattr.valueP = buf;           // attribute value
2086  xattr.valueLen = buf_size;    // length of attribute value
2087
2088  privVfsP = VP_TO_PVP(iP);
2089  LOGASSERT(privVfsP != NULL);
2090  cnP = VP_TO_CNP(iP);
2091
2092  oldfs = get_fs();
2093  set_fs(get_ds());
2094
2095  /* which names can we show ??? */
2096  rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, LIST_XATTR, argP,
2097                          NULL, &eCred);
2098
2099  set_fs(oldfs);
2100  if (!rc)
2101  {
2102    TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXTATTR_EXIT0,
2103           "gpfs_i_listxattr exit: iP 0x%lX len %d\n", iP, xattr.valueLen);
2104    VFS_STAT_STOP;
2105    EXIT(0);
2106    return (xattr.valueLen);
2107  }
2108
2109xerror:
2110  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_LISTXTATTR_EXIT,
2111         "gpfs_i_listxattr exit: iP 0x%lX rc %d\n", iP, rc);
2112
2113  if (rc)
2114    cxiErrorNFS(rc);
2115
2116  VFS_STAT_STOP;
2117  EXIT(0);
2118  return (-rc);
2119}
2120
2121/*
2122 * Inode operation removexattr()
2123 *
2124 */
2125int
2126gpfs_i_removexattr(struct dentry *dentry, const char *name)
2127{
2128  int rc;
2129  cxiNode_t *cnP;
2130  struct gpfsVfsData_t *privVfsP;
2131  struct tsxattr xattr;
2132  struct tsxattrs xattrs;
2133  ext_cred_t eCred;
2134  void *argP = &xattrs;
2135  int flags = 0;
2136  struct inode *iP = dentry->d_inode;
2137  mm_segment_t oldfs;
2138  const char *n;
2139
2140  ENTER(0);
2141  VFS_STAT_START(removexattrCall);
2142
2143  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOVEXATTR_IN,
2144         "gpfs_i_removexattr enter: iP 0x%lX name %s\n", iP, (name) ? name : "NULL");
2145
2146  if (iP == NULL)
2147  {
2148    rc = ENOENT;
2149    goto xerror;
2150  }
2151#ifdef CONFIG_FS_POSIX_ACL
2152  if (n = test_prefix(name, XATTR_NAME_ACL_ACCESS)) {
2153    if (n && (strcmp(n, "") != 0)) {
2154      rc = EINVAL;
2155      goto xerror;
2156    }
2157    if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER))
2158      return EPERM;
2159    rc = gpfs_set_posix_acl(dentry, ACL_TYPE_ACCESS, NULL, -1);
2160    goto xerror;
2161  }
2162  if (S_ISDIR(iP->i_mode))
2163  {
2164    if (n = test_prefix(name, XATTR_NAME_ACL_DEFAULT)) {
2165      if (n && (strcmp(n, "") != 0)) {
2166        rc = EINVAL;
2167        goto xerror;
2168      }
2169      if ((current->fsuid != iP->i_uid) && !capable(CAP_FOWNER))
2170        return EPERM;
2171      rc = gpfs_set_posix_acl(dentry, ACL_TYPE_DEFAULT, NULL, -1);
2172      goto xerror;
2173    }
2174  }
2175#endif
2176  if (n = test_prefix(name, XATTR_SECURITY_PREFIX)) {
2177    if (n && (strcmp(n, "") == 0)) {
2178      rc = EINVAL;
2179      goto xerror;
2180    }
2181    goto xattr;
2182  }
2183  if (n = test_prefix(name, XATTR_TRUSTED_PREFIX)) {
2184    if (n && (strcmp(n, "") == 0)) {
2185      rc = EINVAL;
2186      goto xerror;
2187    }
2188    if (!capable(CAP_SYS_ADMIN)) {
2189      rc = EPERM;
2190      goto xerror;
2191    }
2192    goto xattr;
2193  }
2194  if (n = test_prefix(name, XATTR_USER_PREFIX)) {
2195    if (n && (strcmp(n, "") == 0)) {
2196      rc = EINVAL;
2197      goto xerror;
2198    }
2199    goto xattr;
2200  }
2201  rc = EOPNOTSUPP;
2202  goto xerror;
2203
2204xattr:
2205  setCred(&eCred);
2206  xattrs.appId = 3;       // application id GPFS_ATTR_INTERNAL_APPL_ID
2207  xattrs.nattrs = 1;      // no of attributes to get or set
2208  xattrs.attrs = &xattr;  // attributes to delete
2209
2210  xattr.keyP = (char*) name;            // attribute key
2211  xattr.keyLen = strlen(name) + 1;      // key length
2212  xattr.valueP = NULL;                  // attribute value
2213  xattr.valueLen = -1;                  // length < zero means delete
2214
2215  privVfsP = VP_TO_PVP(iP);
2216  LOGASSERT(privVfsP != NULL);
2217  cnP = VP_TO_CNP(iP);
2218
2219  oldfs = get_fs();
2220  set_fs(get_ds());
2221
2222  rc = gpfs_ops.gpfsFattr(privVfsP, cnP, NULL, flags, SET_XATTR, argP,
2223                          NULL, &eCred);
2224  set_fs(oldfs);
2225
2226xerror:
2227  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOVEXATTR_EXIT,
2228         "gpfs_i_removexattr exit: iP 0x%lX rc %d\n", iP, rc);
2229
2230  if (rc)
2231    cxiErrorNFS(rc);
2232
2233  VFS_STAT_STOP;
2234  EXIT(0);
2235  return (-rc);
2236}
2237#endif
Note: See TracBrowser for help on using the repository browser.