source: gpfs_3.1_ker2.6.20/lpp/mmfs/src/gpl-linux/ss.c @ 223

Last change on this file since 223 was 16, checked in by rock, 17 years ago
File size: 46.2 KB
1/***************************************************************************
2 *
3 * Copyright (C) 2001 International Business Machines
4 * All rights reserved.
5 *
6 * This file is part of the GPFS mmfslinux kernel module.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 *  1. Redistributions of source code must retain the above copyright notice,
13 *     this list of conditions and the following disclaimer.
14 *  2. Redistributions in binary form must reproduce the above copyright
15 *     notice, this list of conditions and the following disclaimer in the
16 *     documentation and/or other materials provided with the distribution.
17 *  3. The name of the author may not be used to endorse or promote products
18 *     derived from this software without specific prior written
19 *     permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
27 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 *
32 *************************************************************************** */
33/* @(#)22       1.109.1.3  src/avs/fs/mmfs/ts/kernext/gpl-linux/ss.c, mmfs, avs_rgpfs24, rgpfs24s008a 11/30/06 16:55:18 */
34/*
35 * Implementation of shared segment for GPFS daemon and GPFS kernel code.
36 *
37 * Contents:
38 *   exp_procfs_version
39 *   gpfs_proc_export_init
40 *   gpfs_proc_export_term
41 *   ss_open
42 *   ss_release
43 *   ss_fs_read
44 *   ss_fs_write
45 *   ss_fs_ioctl
46 *   ss_init
47 *   kxSaveThreadInfo
48 *
49 *   struct ShMemChunkDesc
50 *   unprotectKernelMemory
51 *   reprotectKernelMemory
52 *   InitSharedMemory
53 *   TermSharedMemory
54 *   cxiCalcMaxSharedSegment
55 *   cxiAllocSharedMemory
56 *   cxiFreeSharedMemory
57 *   cxiAttachSharedMemory
58 *   cxiDetachSharedMemory
59 *
60 */
61
62#include <Shark-gpl.h>
63
64#include <linux/types.h>
65#include <linux/version.h>
66#ifndef UTS_RELEASE
67#include <linux/utsrelease.h>
68#endif
69#include <linux/kernel.h>
70#include <linux/module.h>
71#include <linux/errno.h>
72#include <linux/slab.h>
73#include <linux/smp_lock.h>
74#include <linux/proc_fs.h>
75#include <linux/mm.h>
76#include <linux/fs.h>
77#include <linux/file.h>
78#include <linux/binfmts.h>
79#include <linux/signal.h>
80#include <linux/vmalloc.h>
81
82#include <asm/pgtable.h>
83#include <asm/pgalloc.h>
84#include <asm/io.h>
85#include <asm/uaccess.h>
86#include <asm/user.h>
87#include <asm/mman.h>
88#include <asm/atomic.h>
89#include <asm/ptrace.h>
90#include <asm/ucontext.h>
91#include <asm/elf.h>
92
93#include <Logger-gpl.h>
94#include <linux2gpfs.h>
95#include <verdep.h>
96#include <arch-gpl.h>
97
98#include <cxiSystem.h>
99#include <cxiIOBuffer.h>
100#include <cxiSharedSeg.h>
101#include <Trace.h>
102
103#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
104#include <asm/ioctl32.h>
105#if LINUX_KERNEL_VERSION >= 2060507
106long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
107#endif
108#endif
109
110int 
111cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment);
112
113#ifdef GPFS_ARCH_POWER
114#define PKMAP_BASE (0xfe000000UL)
115#define VMALLOC_END ioremap_bot
116#endif
117
118const char *gpfs_banner = "GPFS Linux kernel version " UTS_RELEASE "\n";
119
120SETUP_MODULE_PATH_PARMS;
121
122#ifdef PERF_STATS
123int ioctl_count[MAX_SS_IOCTL_OPS];
124#endif
125
126
127/* Dynamically assigned major device number for the ioctl interfaces to the
128   GPFS kernel modules.  This is the /dev/ss0 device. */
129int GPFSIoctlMajorNumber;
130
131  /* Only allow users with write access or root */
132#define CHECK_PERM if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser()) \
133                   {                                                       \
134                      EXIT(0);                                             \
135                      return -EPERM;                                       \
136                   }
137
138/* Vector table for all routines that can be called with the ss_fs_ioctl. */
139int (*ss_ioctl_op[MAX_SS_IOCTL_OPS+1])();
140
141#ifdef SSEG_SWIZZLE_PTRS
142/* virtual MM handlers for vm areas */
143void ss_vm_open(struct vm_area_struct *area);
144void ss_vm_close(struct vm_area_struct *area);
145#if LINUX_KERNEL_VERSION < 2060000
146struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address, int unused);
147#else
148struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address, int *type);
149#endif /* LINUX_KERNEL_VERSION < 2060000 */
150
151static struct vm_operations_struct ss_vm_ops = {
152  open:   ss_vm_open,
153  close:  ss_vm_close,
154  nopage: ss_vm_nopage,
155};
156#endif /* SSEG_SWIZZLE_PTRS */
157
158/* Add GPFS information to the /proc file system. */
159int
160exp_procfs_version(char *buffer, char **start, off_t offset,
161                   int length, int *eof, void *data)
162{
163  off_t pos = 0;
164  off_t begin = 0;
165  int   len = 0;
166
167  len += sprintf(buffer+len, gpfs_banner);
168  *eof = 1;
169
170  *start = buffer + (offset - begin);
171  len -= (offset - begin);
172  if ( len > length )
173    len = length;
174
175  return len;
176}
177
178void
179gpfs_proc_export_init(void)
180{
181  if (!proc_mkdir("fs/gpfs", 0))
182    return;
183  create_proc_read_entry("fs/gpfs/version", 0, 0, exp_procfs_version, NULL);
184}
185
186void
187gpfs_proc_export_term(void)
188{
189  remove_proc_entry("fs/gpfs/version", NULL);
190  remove_proc_entry("fs/gpfs", NULL);
191
192}
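/* Usage sketch (illustrative, not part of the original ss.c): once the
 * module is loaded, the banner registered above is readable from user
 * space at /proc/fs/gpfs/version, e.g.
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *     char line[128];
 *     FILE *f = fopen("/proc/fs/gpfs/version", "r");
 *     if (f != NULL && fgets(line, sizeof(line), f) != NULL)
 *       fputs(line, stdout);            -- prints the gpfs_banner string
 *     if (f != NULL)
 *       fclose(f);
 *     return 0;
 *   }
 */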
193
194/* Open the character device used for the shared segment. */
195int 
196ss_open(struct inode *inode, struct file *filp)
197{
198
199  TRACE2(TRACE_SHARED, 2, TRCID_SS_019,
200         "ss_open: file 0x%lX inode 0x%lX\n",
201         filp, inode);
202
203  MY_MODULE_INCREMENT();
204
205  return 0;          /* success */
206}
207
208
209/* Release/Close the character device used for the shared segment. */
210int 
211ss_release(struct inode *inode, struct file *filp)
212{
213  TRACE1(TRACE_SHARED, 2, TRCID_SS_023,
214         "ss_release: file 0x%lX\n", filp);
215
216  MY_MODULE_DECREMENT();
217
218  return 0;          /* success */
219}
220
221/* Map the shared segment and return the address of the first chunk allocated
222   (if buffer is big enough to hold it). */
223ssize_t 
224ss_fs_read(struct file *file, char *buf, size_t nbytes, loff_t *ppos)
225{
226  struct inode *inode = file->f_dentry->d_inode;
227  unsigned int minor = MINOR(inode->i_rdev);
228  cxiMemoryMapping_t mapping;
229  int rc;
230
231  TRACE1(TRACE_SHARED, 2, TRCID_SS_059, "ss_fs_read: called 0x%lX\n", nbytes);
232  /* BKL is not held at entry */
233
234  if (minor != 0)
235    return -ENODEV;
236
237  /* Only allow users with write access or root */
238  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
239    return -EPERM;
240
241  InitMemoryMapping(&mapping);
242
243  /* Map the shared memory */
244  rc = cxiAttachSharedMemory(&mapping, true);
245  if (rc)
246    return -rc;
247
248  /* If user buffer is big enough, copy base address of segment there */
249  if (nbytes >= sizeof(mapping.vaddr))
250  {
251    rc = cxiCopyOut((char *)&mapping.vaddr, buf, sizeof(mapping.vaddr));
252    if (rc)
253      return -EFAULT;
254  }
255  return 0;
256}
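/* User-space sketch (illustrative; the /dev/ss0 node name follows the
 * comment above for GPFSIoctlMajorNumber and is an assumption here): the
 * daemon obtains the base address of the shared segment by reading a
 * single pointer-sized value from the character device.  ss_fs_read
 * returns 0 on success, so a zero return with 'base' filled in means the
 * copy-out took place.
 *
 *   #include <fcntl.h>
 *   #include <unistd.h>
 *
 *   void *getSharedSegmentBase(void)
 *   {
 *     void *base = NULL;
 *     int fd = open("/dev/ss0", O_RDWR);   -- write access satisfies the
 *                                             FMODE_WRITE check above
 *     if (fd >= 0)
 *     {
 *       if (read(fd, &base, sizeof(base)) != 0)
 *         base = NULL;                     -- nonzero means failure
 *       close(fd);
 *     }
 *     return base;
 *   }
 */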
257
258/* Was used for debugging. */
259ssize_t 
260ss_fs_write(struct file *file, const char *buf, size_t nbytes, loff_t *ppos)
261{
262  /* Only allow users with write access or root */
263  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
264    return -EPERM;
265
266  TRACE1(TRACE_SHARED, 0, TRCID_SS_065, "ss_fs_write: called 0x%lX\n", nbytes);
267  /* BKL is not held at entry */
268
269  return -EINVAL;
270}
271
272#ifdef PERF_STATS
273int kxNoOp(int op1, int op2)
274{
275  int i;
276
277  if (op1 == 1)  // reset all counters
278  {
279    for (i = 0; i < MAX_SS_IOCTL_OPS; i++)
280       ioctl_count[i] = 0;
281  }
282  if (op2 > 0 && op2 < MAX_SS_IOCTL_OPS)
283    return ioctl_count[op2];     // return the requested counter
284
285  return 0;
286}
287#endif
288
289#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
290long ss_fs_compat_ioctl(struct file *file, unsigned int op, unsigned long kx_args)
291{
292  int rc;
293  TRACE2(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_ENTER,
294             "Entering ss_fs_compat_ioctl: called me with op = %d (%s)", op, kxOp_tostring(op));
295
296  if (ss_ioctl_op[0] != 0)
297  {
298    /*         unlock_kernel();*/
299     rc = ss_ioctl_op[0](op, kx_args);
300     /*lock_kernel();*/
301  }
302  else
303    rc = -1;
304
305  TRACE1(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_EXIT,
306         "Leaving ss_fs_compat_ioctl with rc = %d.", rc);
307 
308  return rc; 
309
310}
311#endif
312
313/* Shared segment and other ioctl calls to the kernel code. */
314int 
315ss_fs_ioctl(struct inode *inode, struct file *file,
316            unsigned int op, unsigned long kx_args)
317{
318  int len, rc;
319  char buf[512];
320  struct kxArgs args_cp;
321  struct kxArgs *args = (struct kxArgs *)kx_args;
322
323  ENTER(0);
324  if (op == kxtraceit)
325  {
326    CHECK_PERM;
327
328    rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
329    if (rc != 0)
330      goto minus1;
331
332    len = 3;
333    strncpy(buf, KERN_NOTICE, len);            // KERN_NOTICE = "<5>"
334    len += sprintf(buf+len, "dp %X:%d:", cxiGetThreadId(), args_cp.arg3);
335
336    rc = cxiCopyIn((char*)args_cp.arg2, buf+len, args_cp.arg1+1);
337    if (rc != 0)
338      goto minus1;
339
340    printk(buf);
341    EXIT(0);
342    return 0;
343  }
344
345  TRACE5(TRACE_KSVFS, 15, TRCID_SS_075,
346         "ss_fs_ioctl: op %d opAddr 0x%lX args 0x%lX inode 0x%lX file 0x%lX\n",
347         op, ss_ioctl_op[op], kx_args, inode, file);
348  /* BKL is held at entry */
349
350#ifdef PERF_STATS
351  if (op > 0 && op < MAX_SS_IOCTL_OPS)
352    ioctl_count[op]++;
353#endif
354
355  switch (op)
356  {
357#ifdef GPFS_ARCH_POWER
358    case CoreDump:
359      CHECK_PERM;
360      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
361      if (rc != 0)
362        goto minus1;
363      rc = kxCoreDump((long)args_cp.arg1, (void *)args_cp.arg2,
364                      (struct ucontext *)args_cp.arg3, (char *)args_cp.arg4);
365      break;
366#endif
367    case saveThreadInfo:
368      CHECK_PERM;
369      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
370      if (rc != 0)
371        goto minus1;
372      rc = kxSaveThreadInfo(args_cp.arg1, (void *)args_cp.arg2);
373      break;
374
375    case GetPrivLevel:
376      CHECK_PERM;
377      rc = get_privilege_level();
378      break;
379
380    case SetPrivLevel:
381      CHECK_PERM;
382      rc = set_privilege_level(kx_args);
383      break;
384
385    case MapPrivate:
386      { 
387        char *outAddr;
388
389        CHECK_PERM;
390        rc = cxiCopyIn((char*)args, (char *)&args_cp, sizeof(args_cp));
391        if (rc != 0)
392          goto minus1;
393
394        rc = kxMapPrivate((char *)args_cp.arg1, (unsigned long)args_cp.arg2,
395                          (unsigned long)args_cp.arg3, &outAddr);
396        if (rc == 0)
397          rc = cxiCopyOut((char*)&outAddr, (char*)args_cp.arg4, sizeof(char*));
398 
399        if (rc != 0)
400          rc = -EFAULT;
401        break;
402      }
403
404    case GetTimeOfDay:
405      {
406        cxiTimeStruc_t ts;
407
408        rc = cxiGetTOD(&ts);
409        if (rc == 0)
410          rc = cxiCopyOut((char*)&ts, (char*)kx_args, sizeof(cxiTimeStruc_t));
411
412        if (rc != 0)
413          rc = -EFAULT;
414        break;
415      }
416
417#ifdef PERF_STATS
418    case noOp:
419      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
420      if (rc != 0)
421        break;
422      if (args_cp.arg1 == 0 && args_cp.arg2 == 0)
423      { /* continue to the real noop kxNoOp in ssioctl.C */ }
424      else
425      {
426        rc = kxNoOp((int)args_cp.arg1, (int)args_cp.arg2);
427        break;
428      }
429#endif
430
431    default:
432      TRACE1(TRACE_KSVFS, 9, TRCID_SS_077,
433             "ss_fs_ioctl: invoking ss_ioctl_op %d\n", op);
434      if (ss_ioctl_op[0] != 0)
435      {
436        unlock_kernel();
437        rc = ss_ioctl_op[0](op, kx_args);
438        lock_kernel();
439      }
440      else
441        goto minus1;
442      break;
443  }
444  EXIT(0);
445  return rc;
446
447minus1:
448  EXIT(0);
449  return -1;
450}
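/* User-space sketch of the ioctl path (illustrative; GetTimeOfDay and
 * cxiTimeStruc_t come from GPFS headers that are not part of this file,
 * and 'fd' is assumed to be an open /dev/ss0 descriptor).  For the
 * GetTimeOfDay case above the kernel copies a cxiTimeStruc_t directly to
 * the address passed as kx_args, so the caller just passes a pointer to
 * its own structure:
 *
 *   #include <sys/ioctl.h>
 *
 *   cxiTimeStruc_t ts;
 *   int rc = ioctl(fd, GetTimeOfDay, &ts);   -- rc == 0 means ts is valid
 */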
451
452#ifdef SSEG_SWIZZLE_PTRS
453extern int ss_fs_mmap(struct file *file, struct vm_area_struct *vma);
454#endif
455
456/* Any operations on the device that are not listed here fall through to the
457   bare device. */
458struct file_operations ss_fops =
459{
460  read:    ss_fs_read,
461  write:   ss_fs_write,
462  ioctl:   ss_fs_ioctl,
463#ifdef SSEG_SWIZZLE_PTRS
464  mmap:    ss_fs_mmap,
465#endif
466  open:    ss_open,
467  release: ss_release,
468#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
469  compat_ioctl: ss_fs_compat_ioctl,
470#endif
471};
472
473#ifdef API_32BIT
474#ifdef GPFS_ARCH_X86_64
475
476/* Note that these 32-bit ioctl functions are not needed for ia64; these
477   routines just call the standard 64-bit ioctl. */
478static int tsstat32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
479{ 
480  DBGASSERT(cmd == Stat);
481  return sys_ioctl(fd,cmd,ptr);
482}
483static int tsfstat32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
484{ 
485  DBGASSERT(cmd == Fstat);
486  return sys_ioctl(fd,cmd,ptr);
487}
488static int tsfattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
489{ 
490  DBGASSERT(cmd == Fattr);
491  return sys_ioctl(fd,cmd,ptr);
492}
493static int tsfsattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
494{ 
495  DBGASSERT(cmd == FsAttr);
496  return sys_ioctl(fd,cmd,ptr);
497}
498static int tsattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
499{ 
500  DBGASSERT(cmd == Attr);
501  return sys_ioctl(fd,cmd,ptr);
502}
503static int tsgetacl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
504{ 
505  DBGASSERT(cmd == GetACL);
506  return sys_ioctl(fd,cmd,ptr);
507}
508static int tsputacl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
509{ 
510  DBGASSERT(cmd == PutACL);
511  return sys_ioctl(fd,cmd,ptr);
512}
513#ifdef DMAPI
514static int kxDmApiCall32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
515{
516  DBGASSERT(cmd == DmApiCall);
517  return sys_ioctl(fd,cmd,ptr);
518}
519#endif /* DMAPI */
520
521#ifdef GPFS_QUOTACTL
522static int kxQuotactl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file * filp)
523{
524  DBGASSERT(cmd == Quotactl);
525  return sys_ioctl(fd,cmd,ptr);
526}
527#endif
528#endif /* GPFS_ARCH_X86_64 */
529
530/* Most 64-bit architectures have a separate interface where 32-bit ioctl
531   command numbers / routines must be registered (not necessary for ia64).
532   At some point we may need to modify our command numbers (currently
533   use kxOps for number field) to use both the type / magic number
534   and number field (ie, _IOWR('G', ) instead of current implicit _IORW(0, ))
535   and number field (ie, _IOWR('G', ) instead of the current implicit _IOWR(0, ))
536   device driver. The 32-bit ioctl implementation only
537   uses a hash table (and not a driver specific function pointer like ioctl
538   from file_operations ... something like ioctl32 would be ideal or just
539   passing this to sys_ioctl like is done on ia64 platform),
540   passing this to sys_ioctl as is done on the ia64 platform),
541   so a collision may occur here someday. Currently not very many drivers
542   provide 32-bit ioctl calls and only the entries from 0x0 to 0x1F are used
543   with magic number 0, ie  _IOWR(0,0) to _IOWR(0,0x1F), while our external API
544   ioctl32 hash table size actually makes collisions much more likely.
545   Note that /usr/src/linux/Documentation/ioctl-number.txt keeps track of
546   the registered blocks used by drivers. */
547void
548gpfs_reg_ioctl32()
549{
550  int rc = 0;
551/* TO DO: eventually add 32-bit API for PPC64? */
552#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
553  rc = register_ioctl32_conversion(Stat, tsstat32);
554  rc |= register_ioctl32_conversion(Fstat, tsfstat32);
555  rc |= register_ioctl32_conversion(Fattr, tsfattr32);
556  rc |= register_ioctl32_conversion(FsAttr, tsfsattr32);
557  rc |= register_ioctl32_conversion(Attr, tsattr32);
558  rc |= register_ioctl32_conversion(GetACL, tsgetacl32);
559  rc |= register_ioctl32_conversion(PutACL, tsputacl32);
560#ifdef DMAPI
561  rc |= register_ioctl32_conversion(DmApiCall, kxDmApiCall32);
562#endif /* DMAPI */
563#ifdef GPFS_QUOTACTL
564  rc  |= register_ioctl32_conversion(Quotactl, kxQuotactl32);
565#endif /* GPFS_QUOTACTL */
566 
567  if (rc)
568   printk("gpfs_reg_ioctl32: Error in registering ioctl32\n");
569
570#endif /* GPFS_ARCH_X86_64 */
571}
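/* Illustrative only: if a command-number collision with another driver
 * ever forced a move off the implicit magic number 0 (see the comment
 * before gpfs_reg_ioctl32), the commands could carry an explicit type
 * field, for example:
 *
 *   #define GPFS_IOC_MAGIC 'G'                          -- hypothetical
 *   #define GPFS_IOC_STAT  _IOWR(GPFS_IOC_MAGIC, 0x35, struct kxArgs)
 *
 * (the 0x35 and the kxArgs payload are illustrative); the registrations
 * above would then use these encoded numbers instead of the bare kxOps
 * values.
 */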
572
573void
574gpfs_unreg_ioctl32()
575{
576  int rc = 0;
577/* TO DO: eventually add 32-bit API for PPC64? */
578#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
579  rc = unregister_ioctl32_conversion(Stat);
580  rc |= unregister_ioctl32_conversion(Fstat);
581  rc |= unregister_ioctl32_conversion(Fattr);
582  rc |= unregister_ioctl32_conversion(FsAttr);
583  rc |= unregister_ioctl32_conversion(Attr);
584  rc |= unregister_ioctl32_conversion(GetACL);
585  rc |= unregister_ioctl32_conversion(PutACL);
586#ifdef DMAPI
587  rc |= unregister_ioctl32_conversion(DmApiCall);
588#endif /* DMAPI */
589#ifdef GPFS_QUOTACTL
590  rc |= unregister_ioctl32_conversion(Quotactl);
591#endif /* GPFS_QUOTACTL */
592
593  if (rc)
594   printk("unregister_ioctl32_conversion: Error in unregistering ioctl32\n");
595
596#endif /* GPFS_ARCH_X86_64 */
597}
598
599#endif /* API_32BIT */
600
601/* Initialization of the character device used for the shared segment
602   interfaces and other ioctl calls to the kernel code. */
603int 
604ss_init()
605{
606  int major;
607
608  GPFSIoctlMajorNumber = 0;
609  major = register_chrdev(0, "ss", &ss_fops);
610
611  if (major < 0)
612  {
613    TRACE1(TRACE_SHARED, 2, TRCID_SS_081,
614           "ss_init: unable to get ss0 major rc %d\n", major);
615    return -1;
616  }
617
618  GPFSIoctlMajorNumber = major;
619  TRACE1(TRACE_SHARED, 2, TRCID_SS_083,
620         "ss_init: module loaded ss0 major %d\n", GPFSIoctlMajorNumber);
621
622  return 0;
623}
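/* Illustrative only: since register_chrdev is called with major 0, the
 * major number is assigned dynamically and no device node is created by
 * this module.  A user-space helper would typically look up the "ss"
 * entry in /proc/devices and create the node itself, e.g.
 *
 *   #include <sys/stat.h>
 *   #include <sys/sysmacros.h>
 *
 *   -- 'major' as parsed from /proc/devices; the /dev/ss0 path matches
 *   -- the comment near GPFSIoctlMajorNumber above
 *   mknod("/dev/ss0", S_IFCHR | 0600, makedev(major, 0));
 */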
624
625/* Management of storage shared between the GPFS daemon and the mmfslinux
626   kernel module.  Chunks of memory are allocated on demand by the
627   kxAllocSharedKernelMemory call, and are then suballocated by GPFS.  To
628   allow free use of pointers, all of this memory is addressed using the
629   same virtual addresses whether it is being accessed from the daemon
630   process or from a process in kernel mode.  Setting up this addressability
631   requires modifying the protection bits in the Linux page table.  For
632   historical reasons dating to the implementation of GPFS on AIX, the
633   storage shared between the GPFS daemon process and the kernel is
634   frequently referred to collectively as "the shared segment".
635   Note that when pointer swizzling is utilized (via SSEG_SWIZZLE_PTRS), the
636   virtual address for the daemon process and kernel is no longer common;
637   the page tables are not fiddled with in this situation and a page fault
638   handler is utilized instead. */
639
640/* Description of each allocated chunk.  Allocated chunks are linked
641   together from ChunkListHead. */
642struct ShMemChunkDesc
643{
644  struct list_head chunkList;  /* list linkage */
645  char* vaddrP;                /* virtual address of beginning of chunk */
646  int len;                     /* length of chunk */
647#ifdef SSEG_SWIZZLE_PTRS
648  char* usrvaddrP;             /* corresponding user address from mmap */
649#endif
650};
651struct list_head ChunkListHead;
652
653/* Number of chunks and total size of all chunks */
654int NVMallocChunks;
655int TotalVMallocBytes;
656
657/* Address of the first chunk allocated.  This value gets returned by
658   cxiMapAllSharedKernelMemory as the base of the GPFS shared segment. */
659char* FirstVMallocChunkP;
660
661/* Maximum total bytes to allocate, as computed by cxiCalcMaxSharedSegment */
662int MaxTotalVMallocBytes;
663
664/* Beginning and end of the area of kernel virtual memory used by
665   vmalloc/vfree */
666UIntPtr VMallocStart;
667UIntPtr VMallocEnd;
668
669/* Minimum size of an allocated chunk */
670#define MIN_VMALLOC_CHUNK PAGE_SIZE
671
672/* Lock guarding the chunk list */
673spinlock_t ChunkListLock;
674
675/* Pointer to slab allocator for ShMemChunkDesc's */
676struct kmem_cache* ChunkCacheP = NULL;
677
678/* Make a range of kernel memory addressable by the current process while
679   in user mode */
680#ifndef SSEG_SWIZZLE_PTRS
681static void 
682unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
683{
684  struct mm_struct *mm = current->mm;
685  unsigned long vaddr = (unsigned long) vaddrP;
686  unsigned long vaddr_start = vaddr;
687  pgd_t *pgdP;
688  pmd_t *pmdP;
689  pte_t *pteP;
690
691  /* Change protection for each page in the range */
692  TRACE3N(TRACE_SHARED, 9, TRCID_UNPROT_ENTER,
693         "unprotectKernelMemory: vaddr 0x%lX len %d allocating %d\n",
694         vaddr, len, allocating);
695  while (len > 0)
696  {
697    /* Access the page to make sure all levels of the page table have been
698       created.  This is a kernel address, so page table entries will
699       persist once they have been created, since the Linux kernel is not
700       pageable. */
701    atomic_read((atomic_t*) vaddrP);
702
703    /* Find page table entries for this page */
704    pgdP = PGD_OFFSET(mm, vaddr);
705    pmdP = pmd_offset(pgdP, vaddr);
706    pteP = PTE_OFFSET(pmdP, vaddr);
707
708#ifdef GPFS_ARCH_I386
709    /* On IA32, set both the pte, and pmd/pgd to allow mmfsd process-level
710     * access to the area.  Since each process has its own page directory
711     * (pgd), an attempt to access one of these unprotected pages will be
712     * blocked by the protection bit in that process' pgd.  If another process
713     * requires access to shared kernel pages, only its pgd need be updated.
714     * pmd_t and pte_t are same size and definition. Thus pte_rdprotect()
715     * (only available macro that hides differences between Suse/Redhat)
716     * is used.
717     */
718    DBGASSERT(sizeof(pte_t) == sizeof(pmd_t));
719    set_pte((pte_t *)pmdP, pte_mkread((*(pte_t *)pmdP)));
720    if (allocating)
721      set_pte(pteP, pte_mkread(*pteP));
722
723    PTE_UNMAP(pteP);
724
725#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
726    // XXX Not implemented
727    //      pmd_val(*pmdP) = pmd_val(*pmdP) | _PAGE_USER;
728    //      if (allocating)
729    //        set_pte(pteP, pte_mkread(*pteP));
730#elif defined(GPFS_ARCH_IA64)
731    /* On IA64, set the protection level of the page when it is created.
732     * Nothing to do when allowing access from another process except to
733     * set the privilege level of the process.
734     */
735    if (allocating)
736      pte_val(*pteP) = pte_val(*pteP) | PRIVILEGE_FLAGS;
737#endif
738
739    /* Advance to the next page */
740    vaddr += PAGE_SIZE;
741    vaddrP += PAGE_SIZE;
742    len -= PAGE_SIZE;
743  }
744
745  /* It is necessary to flush the TLB entries for IA64 to propagate the
746   * pte privilege level change.
747   */
748  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
749}
750#else
751static void 
752unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
753{
754  /* do nothing when pointer swizzling */
755  return;
756}
757#endif /* !SSEG_SWIZZLE_PTRS */
758
759/* Make a range of kernel memory no longer addressable by user processes
760   while in user mode.  Called just before freeing the memory. */
761#ifndef SSEG_SWIZZLE_PTRS
762static void 
763reprotectKernelMemory(char* vaddrP, int len)
764{
765  struct mm_struct *mm = current->mm;
766  unsigned long vaddr = (unsigned long) vaddrP;
767  unsigned long vaddr_start = vaddr;
768  pgd_t *pgdP;
769  pmd_t *pmdP;
770  pte_t *pteP;
771
772  /* Change protection for each page in the range */
773  ENTER(0);
774  TRACE2(TRACE_SHARED, 4, TRCID_REPROT_ENTER,
775         "reprotectKernelMemory: vaddr 0x%lX len %d\n",
776         vaddr, len);
777  while (len > 0)
778  {
779    /* Access the page to make sure all levels of the page table have been
780       created.  This is a kernel address, so page table entries will
781       persist once they have been created, since the Linux kernel is not
782       pageable. */
783    atomic_read((atomic_t*) vaddrP);
784
785    /* Find page table entries for this page */
786    pgdP = PGD_OFFSET(mm, vaddr);
787    pmdP = pmd_offset(pgdP, vaddr);
788    pteP = PTE_OFFSET(pmdP, vaddr);
789
790#ifdef GPFS_ARCH_I386
791    /* On IA32, reset the pte and pmd to disallow process-level access.*/
792    set_pte((pte_t *)pmdP, pte_rdprotect((*(pte_t *)pmdP))); // see unprotect
793    set_pte(pteP, pte_rdprotect(*pteP));
794
795#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
796    // XXX??? not implemented
797
798#elif defined(GPFS_ARCH_IA64)
799    /* On IA64, reset the protection level of the page. */
800    pte_val(*pteP) = (pte_val(*pteP) & ~_PAGE_PL_MASK) | _PAGE_PL_0;
801#endif
802
803    PTE_UNMAP(pteP);
804
805    /* Advance to the next page */
806    vaddr += PAGE_SIZE;
807    vaddrP += PAGE_SIZE;
808    len -= PAGE_SIZE;
809  }
810
811  /* It is necessary to flush the TLB entries for IA64 to propagate the
812   * pte privilege level change.
813   */
814  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
815  EXIT(0);
816}
817#else
818static void 
819reprotectKernelMemory(char* vaddrP, int len)
820{
821  /* do nothing when pointer swizzling */
822  return;
823}
824#endif /* !SSEG_SWIZZLE_PTRS */
825
826
827/* Initialize the code that manages shared memory */
828void 
829InitSharedMemory()
830{
831  ENTER(0);
832  TRACE2(TRACE_SHARED, 1, TRCID_SHKERN_INIT,
833         "InitSharedMemory called.  VMALLOC_START 0x%lX VMALLOC_END 0x%lX\n",
834         VMALLOC_START, VMALLOC_END);
835
836  VMallocStart = (UIntPtr)VMALLOC_START;
837  VMallocEnd = (UIntPtr)VMALLOC_END;
838
839  spin_lock_init(&ChunkListLock);
840
841  /* Create a slab allocator for ShMemChunkDesc objects */
842  ChunkCacheP = kmem_cache_create("ShMemChunkDesc",
843                                  sizeof(struct ShMemChunkDesc),
844                                  0 /* offset */,
845                                  0 /* flags */,
846                                  NULL /* ctor */,
847                                  NULL /* dtor */);
848  if (ChunkCacheP == NULL)
849    cxiPanic("Cannot create ShMemChunkDesc cache\n");
850
851  /* Empty the chunk list */
852  INIT_LIST_HEAD(&ChunkListHead);
853  EXIT(0);
854}
855
856
857/* Compute how large the total shared segment
858   is allowed to grow, based on a desired size.  A value of 0 for
859   desiredBytes means to compute the default maximum size. */
860int 
861cxiCalcMaxSharedSegment(int desiredBytes, int* actualBytesP)
862{
863  Int64 physMemSize;
864  Int64 effPhysMemSize;
865  UIntPtr minAllowedSize = 16*1024*1024;
866  UIntPtr maxAllowedSize = MAX_SSEG_MAPPINGS*1024*1024;
867  UIntPtr actualBytes;
868  char* p;
869  UIntPtr vmUsed;
870  UIntPtr vmRegionReserved;
871  UIntPtr maxBytes;
872
873  /* If an explicit number of desired bytes was given, use that value.
874     Otherwise, if no number of desired bytes was given (or a value
875     smaller than the minimum possible was specified) compute the size based
876     on the size of real memory.  The size computed is a fixed fraction of
877     real memory (only the first 2G on i386). */
878  ENTER(0);
879  physMemSize = (Int64)num_physpages * PAGE_SIZE;
880#ifdef GPFS_ARCH_I386
881  effPhysMemSize = MIN(physMemSize, (Int64)0x80000000);
882#else
883  effPhysMemSize = physMemSize;
884#endif
885
886  if (desiredBytes > 0)
887    actualBytes = desiredBytes;
888  else
889    actualBytes = effPhysMemSize/16;
890
891  actualBytes = MAX(actualBytes, minAllowedSize);
892
893  /* Compute an approximation of how many bytes are already used in the
894     vmalloc region.  The variables needed to compute this exactly are not
895     exported from the kernel.  If we vmalloc a single page area and see how
896     far the allocated area is from the beginning of the vmalloc region, we
897     have at least a lower bound on the amount of vmalloc storage already
898     used.  If there have been no vfrees, this will yield an accurate
899     answer. */
900  p = vmalloc(PAGE_SIZE);
901  if (p == NULL)
902    vmUsed = VMallocEnd - VMallocStart;
903  else
904  {
905    vmUsed = (UIntPtr)p - VMallocStart;
906    vfree(p);
907  }
908
909  /* Make sure the actual maximum fits within the vmalloc region, taking
910     into account memory already used and leaving a reserved area for other
911     vmallocs. */
912  vmRegionReserved = 16*1024*1024;
913  maxBytes = (VMallocEnd-VMallocStart) - (vmUsed+vmRegionReserved);
914  actualBytes = MIN(actualBytes, maxBytes);
915
916  /* Make sure the actual maximum does not exceed the maximum possible */
917  actualBytes = MIN(actualBytes, maxAllowedSize);
918
919  /* Make sure the actual maximum is less than half of real memory */
920  actualBytes = MIN(actualBytes, effPhysMemSize/2);
921
922  /* Round actual maximum down to a multiple of the page size */
923  actualBytes = (actualBytes/PAGE_SIZE) * PAGE_SIZE;
924
925  /* If actual maximum is less than the minimum allowed, return 0 */
926  if (actualBytes < minAllowedSize)
927    actualBytes = 0;
928
929  /* Return result */
930  TRACE5(TRACE_SHARED, 1, TRCID_CALC_MAX_SHARED,
931         "cxiCalcMaxSharedSegment: actualBytes 0x%lX desiredBytes %d "
932         "physMemSize 0x%lX vmUsed 0x%lX maxBytes 0x%lX\n",
933         actualBytes, desiredBytes, physMemSize, vmUsed, maxBytes);
934
935  *actualBytesP = (int)actualBytes;
936  MaxTotalVMallocBytes = (int)actualBytes;
937
938  EXIT(0);
939  return 0;
940}
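/* Worked example (numbers are illustrative only): on a 32-bit node with
 * 4 GB of RAM and a 128 MB vmalloc arena of which 8 MB is already in use:
 *
 *   effPhysMemSize = MIN(4 GB, 2 GB)            = 2 GB
 *   actualBytes    = 2 GB / 16                  = 128 MB   (default request)
 *   maxBytes       = 128 MB - (8 MB + 16 MB)    = 104 MB   (vmalloc headroom)
 *   actualBytes    = MIN(128 MB, 104 MB)        = 104 MB
 *
 * followed by the clamps to MAX_SSEG_MAPPINGS MB and to half of real
 * memory, and a final round-down to a PAGE_SIZE multiple.
 */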
941
942/* Acquire additional kernel memory that is mapped to user space when
943 * using SSEG_SWIZZLE_PTRS (different virtual address between kernel and
944 * daemon); otherwise allocated memory uses the same virtual address
945 * for both kernel code and the GPFS daemon.  Will get at least minBytes.
946 * Returns the starting virtual address of the area and its actual length.
947 */
948int 
949cxiAllocSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
950{
951  int rc = 0;
952  int code = 0;
953  char *vaddrP;
954  struct ShMemChunkDesc* chunkP = NULL;
955  int minBytes = mappingP->kBytes * 1024;
956  int actualBytes;
957  pgprot_t prot;
958#if defined(GPFS_ARCH_X86_64) && !defined(SSEG_SWIZZLE_PTRS)
959  pml4_t* pml4P;
960#endif
961
962  /* On Linux we only allocate the shared segment in this manner */
963  ENTER(0);
964  LOGASSERT(isSharedSegment == true);
965
966  /* Compute actual number of bytes to allocate */
967  if (minBytes <= MIN_VMALLOC_CHUNK)
968    actualBytes = MIN_VMALLOC_CHUNK;
969  else
970    actualBytes = ((minBytes + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
971
972  TRACE2(TRACE_SHARED, 5, TRCID_ALLOC_SHARED_VMALLOC,
973         "cxiAllocSharedMemory: vmalloc %d minBytes %d\n",
974         actualBytes, minBytes);
975
976  /* Return failure if this allocation would put us over the limit */
977  if (TotalVMallocBytes + actualBytes > MaxTotalVMallocBytes)
978  {
979    code = 1;
980    rc = -ENOMEM;
981    goto xerror;
982  }
983
984  /* Get a descriptor for the memory to be allocated */
985  chunkP = (struct ShMemChunkDesc*) kmem_cache_alloc(ChunkCacheP, GFP_KERNEL);
986  if (chunkP == NULL)
987  {
988    code = 2;
989    rc = -ENOMEM;
990    goto xerror;
991  }
992   
993  /* Allocate memory
994   * ?? Instead of calling vmalloc here, we could also do something like:
995   *   pgprot_t prot;
996   *   prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
997   *   vaddrP = __vmalloc(actualBytes, GFP_KERNEL | __GFP_HIGHMEM, prot);
998   *
999   * This is an expansion of the vmalloc inline function, with _PAGE_USER
1000   * added to the protection bits so that the PTE entries will already be set
1001   * correctly.  However, a call to unprotectKernelMemory would still be
1002   * needed to set the protection bits in the PMD entries.
1003   *
1004   * There is also the possibility here of using __GFP_HIGHMEM instead of
1005   * GFP_KERNEL on machines with sufficient high memory.  The storage
1006   * allocated here will never be used as I/O buffers, so high memory would
1007   * be a good place to put it.  This would give I/O buffers a greater chance
1008   * of being allocated below 1G, reducing the need for bounce buffers to do
1009   * I/O. 
1010   */
1011#ifndef SSEG_SWIZZLE_PTRS
1012
1013#if defined(GPFS_ARCH_POWER)
1014  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
1015  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
1016#elif defined(GPFS_ARCH_X86_64)
1017#define __pml4(x) ((pml4_t) { (x) } )
1018  pml4P = pml4_offset_k(VMALLOC_START);
1019  set_pml4(pml4P, __pml4(pml4_val(*pml4P) | _PAGE_USER));
1020#undef __pml4
1021  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER | _PAGE_GLOBAL);
1022  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
1023#elif defined(GPFS_ARCH_PPC64)
1024  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
1025  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
1026#else
1027  vaddrP = vmalloc(actualBytes);
1028#endif
1029
1030#else
1031  vaddrP = vmalloc(actualBytes);
1032#endif /* !SSEG_SWIZZLE_PTRS */
1033  if (vaddrP == NULL)
1034  {
1035    code = 3;
1036    rc = -ENOMEM;
1037    goto xerror;
1038  }
1039
1040#ifdef MALLOC_DEBUG
1041  MallocDebugNew(vaddrP, actualBytes, 3);
1042#endif
1043
1044  spin_lock(&ChunkListLock);
1045
1046  NVMallocChunks += 1;
1047  TotalVMallocBytes += actualBytes;
1048
1049  /* Remember address of first chunk allocated */
1050  if (NVMallocChunks == 1)
1051    FirstVMallocChunkP = vaddrP;
1052
1053  /* Fill in chunk descriptor and add it to the proper list */
1054  chunkP->vaddrP = vaddrP;
1055  chunkP->len = actualBytes;
1056#ifdef SSEG_SWIZZLE_PTRS
1057  chunkP->usrvaddrP = 0;
1058#endif
1059  list_add(&chunkP->chunkList, &ChunkListHead);
1060
1061  spin_unlock(&ChunkListLock);
1062
1063  /* Make memory just allocated addressable by the current process */
1064  unprotectKernelMemory(vaddrP, actualBytes, true);
1065
1066  /* Return results */
1067  mappingP->vaddr = vaddrP;
1068  mappingP->kBytes = actualBytes / 1024;
1069#ifdef SSEG_SWIZZLE_PTRS
1070  mappingP->kvaddr = vaddrP;
1071  /* mappingP->vaddr is reset to proper user va in kxAllocSharedMemory */
1072#endif
1073
1074xerror:
1075  if (rc)
1076  {
1077    InitMemoryMapping(mappingP);
1078
1079    if (chunkP)
1080      kmem_cache_free(ChunkCacheP, (void*)chunkP);
1081  }
1082
1083  TRACE4(TRACE_SHARED, 1, TRCID_ALLOC_SHARED_EXIT,
1084         "cxiAllocSharedMemory: vaddr 0x%lX kBytes %d rc %d code %d\n",
1085         mappingP->vaddr, mappingP->kBytes, rc, code);
1086  EXIT(0);
1087  return rc;
1088}
1089
1090#ifdef SSEG_SWIZZLE_PTRS
1091/* Record the user address that is associated with the kernel vmalloc
1092   address (vmalloc chunk for shared segment). This is needed later on
1093   by the page fault handler.
1094   This routine is called after allocating the chunk and determining the
1095   corresponding user address (used by all user processes mmap'ing
1096   this specific shared segment chunk).
1097*/
1098int
1099cxiRecordSharedMemory(cxiMemoryMapping_t *mappingP)
1100{
1101  int found = 0;
1102  struct ShMemChunkDesc* chunkP = NULL;
1103  struct list_head* p;
1104
1105  ENTER(0);
1106  spin_lock(&ChunkListLock);
1107  list_for_each(p, &ChunkListHead)
1108  {
1109    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
1110    if (chunkP->vaddrP == mappingP->kvaddr)
1111    {
1112      chunkP->usrvaddrP = mappingP->vaddr;
1113      found = 1;
1114      break;
1115    }
1116  }
1117  spin_unlock(&ChunkListLock);
1118
1119  EXIT(0);
1120  if (!found)
1121    return -1;
1122  else
1123    return 0;
1124}
1125
1126/* Obtain any necessary kernel information for initializing
1127   pointer swizzling; currently just grabs vmalloc range info. */
1128int
1129cxiInitPtrSwizzling(UIntPtr *vmallocStartP, UIntPtr *vmallocEndP)
1130{
1131  ENTER(0);
1132
1133  *vmallocStartP = (UIntPtr)VMALLOC_START;
1134  *vmallocEndP = (UIntPtr)VMALLOC_END;
1135
1136  EXIT(0);
1137  return 0;
1138}
1139#endif
1140
1141/* Unmap and deallocate all shared segment memory */
1142int 
1143cxiFreeSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
1144{
1145  struct list_head* firstP;
1146  struct ShMemChunkDesc* chunkP;
1147
1148  ENTER(0);
1149  LOGASSERT(isSharedSegment == true);
1150
1151  /* Walk down the list of multi page chunks.  Free each one and its
1152   * associated chunk descriptor.  Drop the list lock while freeing
1153   * storage.
1154   */
1155  spin_lock(&ChunkListLock);
1156
1157  while (!list_empty(&ChunkListHead))
1158  {
1159    firstP = ChunkListHead.next;
1160    list_del(firstP);
1161
1162    chunkP = list_entry(firstP, struct ShMemChunkDesc, chunkList);
1163    NVMallocChunks -= 1;
1164    TotalVMallocBytes -= chunkP->len;
1165
1166    spin_unlock(&ChunkListLock);
1167    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);
1168
1169    TRACE2(TRACE_SHARED, 4, TRCID_FREEALL_VFREE,
1170           "cxiFreeSharedMemory: vaddrP 0x%lX chunkP 0x%lX\n",
1171           chunkP->vaddrP, chunkP);
1172
1173    vfree(chunkP->vaddrP);
1174#ifdef MALLOC_DEBUG
1175    MallocDebugDelete(chunkP->vaddrP);
1176#endif
1177
1178    kmem_cache_free(ChunkCacheP, (void*)chunkP);
1179    spin_lock(&ChunkListLock);
1180  }
1181  FirstVMallocChunkP = NULL;
1182  spin_unlock(&ChunkListLock);
1183
1184  InitMemoryMapping(mappingP);
1185
1186  EXIT(0);
1187  return 0;
1188}
1189
1190/* Map the shared segment memory into the address
1191 * space of the calling process
1192 */
1193int 
1194cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
1195{
1196  struct list_head* p;
1197  struct ShMemChunkDesc* chunkP;
1198
1199  ENTER(0);
1200  LOGASSERT(isSharedSegment == true);
1201
1202  /* Walk down the list of allocated chunks.  Map each one so that
1203   * this process can access it from user space.
1204   */
1205  spin_lock(&ChunkListLock);
1206  list_for_each(p, &ChunkListHead)
1207  {
1208    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
1209    TRACE1N(TRACE_SHARED, 11, TRCID_MAPALL_MULTI,
1210            "cxiAttachSharedMemory: chunkP 0x%lX\n", chunkP);
1211
1212    /* unprotectKernelMemory has to be called here with 'allocating'
1213     * set to 'true', so that mmfsadm can map and access the shared segment
1214     * even when the daemon has died and called reprotectKernelMemory
1215     */
1216    unprotectKernelMemory(chunkP->vaddrP, chunkP->len, true);
1217  }
1218  spin_unlock(&ChunkListLock);
1219
1220  /* Return address of first chunk allocated; this will be the
1221   * base of the GPFS shared segment
1222   */
1223  mappingP->vaddr = FirstVMallocChunkP;
1224#ifdef SSEG_SWIZZLE_PTRS
1225  mappingP->kvaddr = FirstVMallocChunkP;
1226  /* mappingP->vaddr is reset to proper user va in kxAttachSharedMemory */
1227#endif
1228
1229  /* If there were no chunks, return ENOENT */
1230  EXIT(0);
1231  return (NVMallocChunks > 0) ? 0 : -ENOENT;
1232}
1233
1234int
1235cxiDetachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
1236{
1237  struct list_head* p;
1238  struct ShMemChunkDesc* chunkP;
1239
1240  ENTER(0);
1241  LOGASSERT(isSharedSegment == true);
1242
1243  /* Walk down the list of allocated chunks.  Reprotect each one so that
1244   * this process can no longer access it from user space.
1245   */
1246  spin_lock(&ChunkListLock);
1247
1248  list_for_each(p, &ChunkListHead)
1249  {
1250    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
1251    TRACE1N(TRACE_SHARED, 11, TRCID_UNMAPALL_MULTI,
1252            "cxiDetachSharedMemory: chunkP 0x%lX\n", chunkP);
1253
1254    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);
1255  }
1256  spin_unlock(&ChunkListLock);
1257
1258  EXIT(0);
1259  return 0;
1260}
1261
1262/* Clean up the code that manages shared kernel memory,
1263 * including freeing all allocated chunks.
1264 */
1265void 
1266TermSharedMemory()
1267{
1268  cxiMemoryMapping_t mapping;
1269
1270  ENTER(0);
1271  InitMemoryMapping(&mapping);
1272
1273  /* Delete shared segment */
1274  cxiFreeSharedMemory(&mapping, true);
1275
1276  /* Destroy slab allocator for ShMemChunkDesc objects */
1277  (void)kmem_cache_destroy(ChunkCacheP);
1278
1279  /* Unregister the shared segment device driver */
1280  unregister_chrdev(GPFSIoctlMajorNumber, "ss");
1281
1282  TRACE1(TRACE_SHARED, 2, TRCID_SSINIT_003,
1283         "module unloaded major %d\n", GPFSIoctlMajorNumber);
1284  GPFSIoctlMajorNumber = 0;
1285  EXIT(0);
1286}
1287
1288/* Clean up slab for ShMemChunkDesc (for early termination) */
1289void 
1290CleanUpSharedMemory()
1291{
1292  /* Destroy slab allocator for ShMemChunkDesc objects */
1293  (void)kmem_cache_destroy(ChunkCacheP);
1294}
1295
1296int
1297kxCoreDump(long sig, void *info,
1298           struct ucontext *sc, char *filenameP)
1299{
1300  struct pt_regs regs;
1301  static int getDump = 0;
1302  struct linux_binfmt * binfmt;
1303  char *tmp = NULL;
1304  int rc = -1;
1305  int code = 0;
1306  struct file *file = NULL;
1307  Boolean klock = false;
1308  struct sigcontext_struct *uc_mcontext;
1309  unsigned long len;
1310
1311  printk("kxCoreDump sig: %d fn: %s\n", sig, filenameP);
1312
1313  if (getDump == 0)
1314    getDump = 1;   // don't create more than one core dump at the same time
1315  else
1316    return 1;
1317
1318  memset((char *)&regs, 0, sizeof(struct pt_regs));
1319
1320  if (sig) /* Build pt_regs from sigcontext struct */
1321  {
1322    code = 11;
1323    goto xerror;
1324  }
1325  tmp = cxiMallocPinned(CXI_PATH_MAX+1);
1326  if (!tmp)
1327  {
1328    code = 1;
1329    tmp = NULL;
1330    goto xerror;
1331  }
1332  if(cxiCopyInstr(filenameP, tmp, CXI_PATH_MAX, &len) != 0)
1333  {
1334    code = 12;
1335    goto xerror;
1336  }
1337
1338  lock_kernel();
1339  klock = true;
1340
1341  binfmt = current->binfmt;
1342  if (!binfmt || !binfmt->core_dump)
1343  {
1344    code = 2;
1345    goto xerror;
1346  }
1347
1348  if (MY_RLIM_CUR(RLIMIT_CORE) > 0x01000000)
1349    MY_RLIM_CUR(RLIMIT_CORE) = 0x10000000;
1350
1351  file = filp_open(tmp, O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
1352  if (IS_ERR(file))
1353  {
1354    code = 4;
1355    file = NULL;
1356    goto xerror;
1357  }
1358  if (!file->f_op || !file->f_op->write)
1359  {
1360    code = 5;
1361    goto xerror;
1362  }
1363  rc = binfmt->core_dump(sig, &regs, file);
1364  if (!rc)
1365  {
1366    code = 6;
1367    goto xerror;
1368  }
1369
1370xerror:
1371  if (file)
1372    filp_close(file, NULL);
1373
1374  if (klock)
1375    unlock_kernel();
1376
1377  if (tmp)
1378    cxiFreePinned(tmp);
1379
1380  getDump = 0;
1381  return rc;
1382}
1383
1384/* This call looks very similar to a MAP_ANONYMOUS mmap() call.  That's
1385 * because we used to do mmap() for this region.  Unfortunately when we
1386 * want MAP_PRIVATE semantics we don't get the results on Linux that we
1387 * expect.  The trouble starts when the pages of this memory
1388 * area are marked copy-on-write.  Since this is our buffer pool, when
1389 * I/O gets done, the old page goes to the child process and the new page goes
1390 * to the parent (mmfsd).  Unfortunately, the I/O gets done to the old page
1391 * since its physical address was cached in the kiobuf. 
1392 *
1393 * One attempt at fixing this was by making the area shared between parent
1394 * and child via MAP_SHARED. However, it opens the possibility of a child
1395 * process run from system() or popen() being able to stomp on the GPFS buffer
1396 * pool.  Additionally putting MAP_SHARED on the region causes it
1397 * to be internally mapped to /dev/zero (apparently it needs some file mapping
1398 * on this MAP_ANONYMOUS region).  Subsequent madvise() calls saying that
1399 * we don't need the pages (MADV_DONTNEED) doesn't really free the
1400 * pages since there is still a hold count due to the kernel /dev/zero
1401 * mapping.  Thus the free pages reported by vmstat don't go down even
1402 * though we're freeing them from the mmap'd region.
1403 *
1404 * This all boils down to a workaround where we MAP_PRIVATE as we
1405 * wanted but set the VM_DONTCOPY flag so these mmap pages don't
1406 * get inherited by child processes.
1407 *
1408 * GPFS also needs to make sure that pages of its buffer pool are pinned in
1409 * memory.  This is necessary because GPFS caches the pointers to the struct
1410 * page objects returned by map_user_kiobuf.  Linux might steal pages in
1411 * one of two ways: reclaim_page will steal pages with count <= 1, and
1412 * swap_out_vma will clear the page table mapping of pages belonging to
1413 * vm_area_structs that do not have the VM_LOCKED bit set.
1414 * GPFS prevents the first case because map_user_kiobuf increases page
1415 * reference counts to 2.  We used to turn on the VM_LOCKED bit here,
1416 * but now we mlock() the memory to ensure it isn't swapped out.
1417 */
1418int
1419kxMapPrivate(char *inAddr, unsigned long len, unsigned long prot,
1420             char **outAddr)
1421{
1422  struct mm_struct *mmP;
1423  struct vm_area_struct *vmaP = NULL;
1424
1425  mmP = current->mm;
1426 
1427  ACQUIRE_MMAP_SEM(&mmP->mmap_sem);
1428
1429  *outAddr = (char *)do_mmap(NULL, (unsigned long)inAddr, len, prot, 
1430                             MAP_PRIVATE | MAP_ANONYMOUS, 0);
1431  /* Only look for address in vma list if do_mmap matches what we asked for;
1432     otherwise it may be an unexpected address or an error code and
1433     both are a problem. Any issues should be handled in the daemon
1434     if possible (eg, -ENOMEM). */
1435  if (*outAddr == inAddr)
1436  {
1437    for (vmaP = mmP->mmap; vmaP != NULL; vmaP = vmaP->vm_next)
1438      if (vmaP->vm_start == (unsigned long)*outAddr)
1439      {
1440        /* We don't want our vm_area_structs merged since we are
1441         * about to set a flag that would cross into an area where
1442         * it might not be good.  For instance if we get merged with
1443         * the stack vm area then we won't be able to fork since the
1444         * stack wouldn't be copied.
1445         */
1446        LOGASSERT(vmaP->vm_end == vmaP->vm_start + len);
1447        vmaP->vm_flags |= VM_DONTCOPY;
1448        break;
1449      }
1450
1451    DBGASSERT(vmaP != NULL);
1452  }
1453
1454  RELEASE_MMAP_SEM(&mmP->mmap_sem);
1455
1456  TRACE5(TRACE_SHARED, 1, TRCID_CXI_MAP_PRIVATE,
1457         "kxMapPrivate: inAddr 0x%lX len %d prot 0x%X outAddr 0x%lX vmaP 0x%lX\n",
1458         inAddr, len, prot, *outAddr, vmaP);
1459
1460  if (*outAddr == inAddr)
1461    return 0;
1462 
1463  return -EFAULT;
1464}
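/* Daemon-side sketch (illustrative; the real caller reaches kxMapPrivate
 * through the MapPrivate ioctl case above, and the variable names below
 * are hypothetical).  Once the call succeeds and outAddr equals the
 * requested address, the region is private and marked VM_DONTCOPY, but
 * nothing pins it here any more, so the caller is expected to lock the
 * pages itself:
 *
 *   #include <sys/mman.h>
 *
 *   if (mlock(bufPoolAddr, bufPoolLen) != 0)
 *     -- fail or retry buffer pool setup; otherwise pages could be
 *     -- swapped out from under the cached physical addresses
 */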
1465
1466#ifdef SSEG_SWIZZLE_PTRS
1467/* mmap handler for shared segment */
1468int ss_fs_mmap(struct file *file, struct vm_area_struct *vma)
1469{
1470  UIntPtr offset = vma->vm_pgoff<<PAGE_SHIFT;
1471  UIntPtr size = vma->vm_end - vma->vm_start;
1472
1473  if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
1474  {
1475    printk("ss_fs_mmap: invalid mmap flags\n");
1476    return -EINVAL;
1477  }
1478
1479  if (offset != 0)
1480  {
1481    printk("ss_fs_mmap: page offset should be zero (%ld)\n", offset);
1482    return -EINVAL;
1483  }
1484
1485  /* add page fault handler for vm area */
1486  vma->vm_ops = &ss_vm_ops;
1487
1488#if LINUX_KERNEL_VERSION >= 2060000
1489  /* 2.6 kernel appears to want the pages marked as unswappable,
1490     otherwise gobs of messages about "Badness in do_nopage/copy_page_range"
1491     occur in the system log. Still looking at this, but it appears that the
1492     kernel expects these pages to be "device" reserved pages versus typical
1493     anonymous pages (assumes a device intends to use the pages for DMA?)
1494     and doesn't want them tracked by VMM. */
1495  vma->vm_flags |= VM_RESERVED;
1496#endif
1497 
1498  /* perform open on vm area */
1499  ss_vm_open(vma);
1500 
1501  return 0;
1502} 
1503
1504/* vm area handlers for shared segment */
1505
1506void ss_vm_open(struct vm_area_struct *vma)
1507{   
1508        MY_MODULE_INCREMENT();
1509}
1510 
1511void ss_vm_close(struct vm_area_struct *vma)
1512{
1513        MY_MODULE_DECREMENT();
1514}
1515
1516/* Page fault handler
1517   Called by do_no_page with address of faulting page (ie, on page boundary) */
1518#if LINUX_KERNEL_VERSION < 2060000
1519struct page *
1520ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
1521#else
1522struct page *
1523ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1524#endif /* LINUX_KERNEL_VERSION < 2060000 */
1525{
1526  UIntPtr offset;
1527  UIntPtr va;
1528  struct page *ret_page = NOPAGE_SIGBUS;
1529  int found = 0;
1530  struct list_head* p;
1531  struct ShMemChunkDesc* chunkP;
1532
1533  if ((address < vma->vm_start) || (address >= vma->vm_end))
1534  {
1535    printk("ss_vm_nopage: address 0x%lx out of vma range [%lx,%lx)\n",
1536           address, vma->vm_start, vma->vm_end);
1537    return ret_page;
1538  }
1539
1540  /* Make sure that the user address from a page fault is backed by
1541     kernel memory (find a containing memory chunk).
1542     The most recently allocated block will be at the head of
1543     the list, so generally we only check the first list entry. */
1544  /* May want to cache last list entry where a "hit" occurs if needed
1545     for performance at some point, eg, non-daemon attach. */
1546  spin_lock(&ChunkListLock);
1547  list_for_each(p, &ChunkListHead)
1548  {
1549    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
1550    if ((address >= (UIntPtr)chunkP->usrvaddrP) &&
1551        (address < (UIntPtr)chunkP->usrvaddrP + chunkP->len))
1552    {
1553      found = 1;
1554      break;
1555    }
1556  }
1557  spin_unlock(&ChunkListLock);
1558  if (!found)
1559  {
1560    /* We have a problem; unable to find backing kernel memory */
1561    printk("ss_vm_nopage: unable to find kernel chunk backing user address 0x%lx\n", address);
1562    return ret_page;
1563  }
1564
1565  /* calculate the kernel virtual address */
1566  offset = address - (IntPtr)chunkP->usrvaddrP;
1567  va = (UIntPtr)(chunkP->vaddrP + offset);
1568
1569  /* Grab kernel page table lock before traversing kernel page table.
1570     I believe this is necessary in order to avoid having another processor
1571     change the page table on us while we are traversing.
1572     Normally only the process page table lock is grabbed when a
1573     page fault occurs (to protect against kswapd). */
1574  spin_lock(&init_mm.page_table_lock);
1575
1576  /* traverse kernel page table */
1577  ret_page = vmalloc_to_page((void *)va);
1578
1579  spin_unlock(&init_mm.page_table_lock);
1580  if (ret_page == NULL)
1581  {
1582    printk("ss_vm_nopage: vmalloc_to_page returned NULL\n");
1583    return ret_page;
1584  }
1585
1586  /* bump up page use count */
1587  get_page(ret_page);
1588
1589#ifdef SWIZ_BIG_DEBUG
1590  printk("ss_vm_nopage: page fault for offset 0x%lx uva 0x%lx va 0x%lx (kva x%lx)\n",
1591         offset, address, va, page_address(ret_page));
1592#endif
1593
1594#if LINUX_KERNEL_VERSION >= 2060000
1595  if (type)
1596    *type = VM_FAULT_MINOR;
1597#endif
1598 
1599  /* return page */
1600  return ret_page;
1601}
1602#endif /* SSEG_SWIZZLE_PTRS */