/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
/* @(#)22 1.109.1.3 src/avs/fs/mmfs/ts/kernext/gpl-linux/ss.c, mmfs, avs_rgpfs24, rgpfs24s008a 11/30/06 16:55:18 */

/*
 * Implementation of the shared segment for GPFS daemon and GPFS kernel code.
 *
 * Contents:
 *   exp_procfs_version
 *   gpfs_proc_export_init
 *   gpfs_proc_export_term
 *   ss_open
 *   ss_release
 *   ss_fs_read
 *   ss_fs_write
 *   ss_fs_ioctl
 *   ss_init
 *   kxSaveThreadInfo
 *
 *   struct ShMemChunkDesc
 *   unprotectKernelMemory
 *   reprotectKernelMemory
 *   InitSharedMemory
 *   TermSharedMemory
 *   cxiCalcMaxSharedSegment
 *   cxiAllocSharedMemory
 *   cxiFreeSharedMemory
 *   cxiAttachSharedMemory
 *   cxiDetachSharedMemory
 */

#include
#include
#include
#ifndef UTS_RELEASE
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
#include
#if LINUX_KERNEL_VERSION >= 2060507
long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
#endif
#endif

int cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP,
                          Boolean isSharedSegment);

#ifdef GPFS_ARCH_POWER
#define PKMAP_BASE (0xfe000000UL)
#define VMALLOC_END ioremap_bot
#endif

const char *gpfs_banner = "GPFS Linux kernel version " UTS_RELEASE "\n";

SETUP_MODULE_PATH_PARMS;

#ifdef PERF_STATS
int ioctl_count[MAX_SS_IOCTL_OPS];
#endif

/* Dynamically assigned major device number for the ioctl interfaces to the
   GPFS kernel modules.  This is the /dev/ss0 device. */
int GPFSIoctlMajorNumber;

/* Only allow users with write access or root users */
#define CHECK_PERM if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser()) \
                   {                                                       \
                     EXIT(0);                                              \
                     return -EPERM;                                        \
                   }
/* Vector table for all routines that can be called with ss_fs_ioctl. */
int (*ss_ioctl_op[MAX_SS_IOCTL_OPS+1])();

#ifdef SSEG_SWIZZLE_PTRS
/* virtual MM handlers for vm areas */
void ss_vm_open(struct vm_area_struct *area);
void ss_vm_close(struct vm_area_struct *area);
#if LINUX_KERNEL_VERSION < 2060000
struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address,
                          int unused);
#else
struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address,
                          int *type);
#endif /* LINUX_KERNEL_VERSION < 2060000 */

static struct vm_operations_struct ss_vm_ops = {
  open:   ss_vm_open,
  close:  ss_vm_close,
  nopage: ss_vm_nopage,
};
#endif /* SSEG_SWIZZLE_PTRS */

/* Add GPFS information to the /proc file system. */
int exp_procfs_version(char *buffer, char **start, off_t offset, int length,
                       int *eof, void *data)
{
  off_t pos = 0;
  off_t begin = 0;
  int len = 0;

  len += sprintf(buffer+len, gpfs_banner);
  *eof = 1;

  *start = buffer + (offset - begin);
  len -= (offset - begin);
  if ( len > length )
    len = length;

  return len;
}

void gpfs_proc_export_init(void)
{
  if (!proc_mkdir("fs/gpfs", 0))
    return;
  create_proc_read_entry("fs/gpfs/version", 0, 0, exp_procfs_version, NULL);
}

void gpfs_proc_export_term(void)
{
  remove_proc_entry("fs/gpfs/version", NULL);
  remove_proc_entry("fs/gpfs", NULL);
}

/* Open the character device used for the shared segment. */
int ss_open(struct inode *inode, struct file *filp)
{
  TRACE2(TRACE_SHARED, 2, TRCID_SS_019,
         "ss_open: file 0x%lX inode 0x%lX\n", filp, inode);

  MY_MODULE_INCREMENT();

  return 0;          /* success */
}

/* Release/Close the character device used for the shared segment. */
int ss_release(struct inode *inode, struct file *filp)
{
  TRACE1(TRACE_SHARED, 2, TRCID_SS_023,
         "ss_release: file 0x%lX\n", filp);

  MY_MODULE_DECREMENT();

  return 0;          /* success */
}

/* Map the shared segment and return the address of the first chunk
   allocated (if buffer is big enough to hold it). */
ssize_t ss_fs_read(struct file *file, char *buf, size_t nbytes, loff_t *ppos)
{
  struct inode *inode = file->f_dentry->d_inode;
  unsigned int minor = MINOR(inode->i_rdev);
  cxiMemoryMapping_t mapping;
  int rc;

  TRACE1(TRACE_SHARED, 2, TRCID_SS_059,
         "ss_fs_read: called 0x%lX\n", nbytes);
  /* BKL is not held at entry */

  if (minor != 0)
    return -ENODEV;

  /* Only allow users with write access or root users */
  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
    return -EPERM;

  InitMemoryMapping(&mapping);

  /* Map the shared memory */
  rc = cxiAttachSharedMemory(&mapping, true);
  if (rc)
    return -rc;

  /* If user buffer is big enough, copy base address of segment there */
  if (nbytes >= sizeof(mapping.vaddr))
  {
    rc = cxiCopyOut((char *)&mapping.vaddr, buf, sizeof(mapping.vaddr));
    if (rc)
      return -EFAULT;
  }

  return 0;
}
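/* A minimal user-space sketch (an illustration, not part of this module) of
   how a process such as the GPFS daemon could obtain the shared segment base
   address through the read interface above.  The "/dev/ss0" node name follows
   the comment near GPFSIoctlMajorNumber; error handling and how the node is
   created are assumptions.  Note that ss_fs_read returns 0 on success and
   fills in the caller's buffer rather than returning a byte count:

     #include <fcntl.h>
     #include <unistd.h>

     static void *get_sseg_base(void)
     {
       void *base = NULL;
       int fd = open("/dev/ss0", O_RDWR);       // needs write access or root
       if (fd < 0)
         return NULL;
       if (read(fd, &base, sizeof(base)) != 0)  // 0 means success here
         base = NULL;
       close(fd);
       return base;
     }
*/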
/* Was used for debugging. */
ssize_t ss_fs_write(struct file *file, const char *buf, size_t nbytes,
                    loff_t *ppos)
{
  /* Only allow users with write access or root users */
  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
    return -EPERM;

  TRACE1(TRACE_SHARED, 0, TRCID_SS_065,
         "ss_fs_write: called 0x%lX\n", nbytes);
  /* BKL is not held at entry */

  return -EINVAL;
}

#ifdef PERF_STATS
int kxNoOp(int op1, int op2)
{
  int i;

  if (op1 == 1) // reset all counters
  {
    for (i = 0; i < MAX_SS_IOCTL_OPS; i++)
      ioctl_count[i] = 0;
  }
  if (op2 > 0 && op2 < MAX_SS_IOCTL_OPS)
    return ioctl_count[op2]; // return the requested counter

  return 0;
}
#endif

#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
long ss_fs_compat_ioctl(struct file *file, unsigned int op, unsigned long kx_args)
{
  int rc;

  TRACE2(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_ENTER,
         "Entering ss_fs_compat_ioctl: called me with op = %d (%s)",
         op, kxOp_tostring(op));

  if (ss_ioctl_op[0] != 0)
  {
    /* unlock_kernel();*/
    rc = ss_ioctl_op[0](op, kx_args);
    /*lock_kernel();*/
  }
  else
    rc = -1;

  TRACE1(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_EXIT,
         "Leaving ss_fs_compat_ioctl with rc = %d.", rc);

  return rc;
}
#endif

/* Shared segment and other ioctl calls to the kernel code. */
int ss_fs_ioctl(struct inode *inode, struct file *file,
                unsigned int op, unsigned long kx_args)
{
  int len, rc;
  char buf[512];
  struct kxArgs args_cp;
  struct kxArgs *args = (struct kxArgs *)kx_args;

  ENTER(0);
  if (op == kxtraceit)
  {
    CHECK_PERM;

    rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
    if (rc != 0)
      goto minus1;

    len = 3;
    strncpy(buf, KERN_NOTICE, len); // KERN_NOTICE = "<5>"
    len += sprintf(buf+len, "dp %X:%d:", cxiGetThreadId(), args_cp.arg3);
    rc = cxiCopyIn((char*)args_cp.arg2, buf+len, args_cp.arg1+1);
    if (rc != 0)
      goto minus1;

    printk(buf);
    EXIT(0);
    return 0;
  }

  TRACE5(TRACE_KSVFS, 15, TRCID_SS_075,
         "ss_fs_ioctl: op %d opAddr 0x%lX args 0x%lX inode 0x%lX file 0x%lX\n",
         op, ss_ioctl_op[op], kx_args, inode, file);
  /* BKL is held at entry */

#ifdef PERF_STATS
  if (op > 0 && op < MAX_SS_IOCTL_OPS)
    ioctl_count[op]++;
#endif

  switch (op)
  {
#ifdef GPFS_ARCH_POWER
    case CoreDump:
      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;
      rc = kxCoreDump((long)args_cp.arg1, (void *)args_cp.arg2,
                      (struct ucontext *)args_cp.arg3, (char *)args_cp.arg4);
      break;
#endif

    case saveThreadInfo:
      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;
      rc = kxSaveThreadInfo(args_cp.arg1, (void *)args_cp.arg2);
      break;

    case GetPrivLevel:
      CHECK_PERM;
      rc = get_privilege_level();
      break;

    case SetPrivLevel:
      CHECK_PERM;
      rc = set_privilege_level(kx_args);
      break;

    case MapPrivate:
    {
      char *outAddr;

      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char *)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;
      rc = kxMapPrivate((char *)args_cp.arg1, (unsigned long)args_cp.arg2,
                        (unsigned long)args_cp.arg3, &outAddr);
      if (rc == 0)
        rc = cxiCopyOut((char*)&outAddr, (char*)args_cp.arg4, sizeof(char*));
      if (rc != 0)
        rc = -EFAULT;
      break;
    }

    case GetTimeOfDay:
    {
      cxiTimeStruc_t ts;

      rc = cxiGetTOD(&ts);
      if (rc == 0)
        rc = cxiCopyOut((char*)&ts, (char*)kx_args, sizeof(cxiTimeStruc_t));
      if (rc != 0)
        rc = -EFAULT;
      break;
    }

#ifdef PERF_STATS
    case noOp:
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        break;

      if (args_cp.arg1 == 0 && args_cp.arg2 == 0)
      {
        /* continue to the real noop kxNoOp in ssioctl.C */
      }
      else
      {
        rc = kxNoOp((int)args_cp.arg1, (int)args_cp.arg2);
        break;
      }
#endif

    default:
      TRACE1(TRACE_KSVFS, 9, TRCID_SS_077,
             "ss_fs_ioctl: invoking ss_ioctl_op %d\n", op);
      if (ss_ioctl_op[0] != 0)
      {
        unlock_kernel();
        rc = ss_ioctl_op[0](op, kx_args);
        lock_kernel();
      }
      else
        goto minus1;
      break;
  }

  EXIT(0);
  return rc;

minus1:
  EXIT(0);
  return -1;
}
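/* A user-space sketch (illustrative only) of how the daemon side drives the
   ioctl path above.  Most operations pass a struct kxArgs whose fields are
   copied in with cxiCopyIn; the real kxArgs layout and the op codes (kxOps)
   live in the common GPFS headers, so the four-long-word struct below is only
   an assumption made for the example.  The MapPrivate packing mirrors the
   case statement above (arg1 = address hint, arg2 = length, arg3 = protection,
   arg4 = address of the output pointer):

     #include <sys/ioctl.h>

     struct kxArgsSketch { long arg1, arg2, arg3, arg4; };  // assumed layout

     static int map_private_via_ioctl(int ssfd, char *hint, unsigned long len,
                                      unsigned long prot, char **outP)
     {
       struct kxArgsSketch a;
       a.arg1 = (long)hint;     // requested address
       a.arg2 = (long)len;      // length in bytes
       a.arg3 = (long)prot;     // PROT_READ | PROT_WRITE, etc.
       a.arg4 = (long)outP;     // kernel copies the resulting address here
       return ioctl(ssfd, MapPrivate, (unsigned long)&a);
     }
*/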
#ifdef SSEG_SWIZZLE_PTRS
extern int ss_fs_mmap(struct file *file, struct vm_area_struct *vma);
#endif

/* The other operations, not in the following list, for the device come from
   the bare device. */
struct file_operations ss_fops =
{
  read:    ss_fs_read,
  write:   ss_fs_write,
  ioctl:   ss_fs_ioctl,
#ifdef SSEG_SWIZZLE_PTRS
  mmap:    ss_fs_mmap,
#endif
  open:    ss_open,
  release: ss_release,
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
  compat_ioctl: ss_fs_compat_ioctl,
#endif
};

#ifdef API_32BIT
#ifdef GPFS_ARCH_X86_64
/* Note that these 32-bit ioctl functions are not needed for ia64;
   these routines just call the standard 64-bit ioctl. */
static int tsstat32(unsigned fd, unsigned cmd, unsigned long ptr,
                    struct file *filp)
{
  DBGASSERT(cmd == Stat);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsfstat32(unsigned fd, unsigned cmd, unsigned long ptr,
                     struct file *filp)
{
  DBGASSERT(cmd == Fstat);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsfattr32(unsigned fd, unsigned cmd, unsigned long ptr,
                     struct file *filp)
{
  DBGASSERT(cmd == Fattr);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsfsattr32(unsigned fd, unsigned cmd, unsigned long ptr,
                      struct file *filp)
{
  DBGASSERT(cmd == FsAttr);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsattr32(unsigned fd, unsigned cmd, unsigned long ptr,
                    struct file *filp)
{
  DBGASSERT(cmd == Attr);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsgetacl32(unsigned fd, unsigned cmd, unsigned long ptr,
                      struct file *filp)
{
  DBGASSERT(cmd == GetACL);
  return sys_ioctl(fd, cmd, ptr);
}

static int tsputacl32(unsigned fd, unsigned cmd, unsigned long ptr,
                      struct file *filp)
{
  DBGASSERT(cmd == PutACL);
  return sys_ioctl(fd, cmd, ptr);
}

#ifdef DMAPI
static int kxDmApiCall32(unsigned fd, unsigned cmd, unsigned long ptr,
                         struct file *filp)
{
  DBGASSERT(cmd == DmApiCall);
  return sys_ioctl(fd, cmd, ptr);
}
#endif /* DMAPI */

#ifdef GPFS_QUOTACTL
static int kxQuotactl32(unsigned fd, unsigned cmd, unsigned long ptr,
                        struct file *filp)
{
  DBGASSERT(cmd == Quotactl);
  return sys_ioctl(fd, cmd, ptr);
}
#endif
#endif /* GPFS_ARCH_X86_64 */

/* Most 64-bit architectures have a separate interface where 32-bit ioctl
   command numbers / routines must be registered (not necessary for ia64).
   At some point we may need to modify our command numbers (currently use
   kxOps for the number field) to use both the type / magic number and the
   number field (ie, _IOWR('G', ) instead of the current implicit _IOWR(0, ))
   if a command number collision occurs between gpfs and a new device driver.

   The 32-bit ioctl implementation only uses a hash table (and not a driver
   specific function pointer like ioctl from file_operations ... something
   like ioctl32 would be ideal or just passing this to sys_ioctl like is done
   on the ia64 platform), so a collision may occur here someday.  Currently
   not very many drivers provide 32-bit ioctl calls and only the entries from
   0x0 to 0x1F are used with magic number 0, ie _IOWR(0,0) to _IOWR(0,1F),
   while our external API commands are in the range of 53-59 (0x35-0x3b) ...
   although the limited ioctl32 hash table size actually makes collisions
   much more likely.  Note that /usr/src/linux/Documentation/ioctl-number.txt
   keeps track of the registered blocks used by drivers. */
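/* To illustrate the collision-avoidance idea mentioned above: if the command
   numbers were ever rebuilt around a private magic number, they could be
   encoded with the standard _IOWR macro instead of the implicit magic 0.
   The 'G' magic and the kxArgs payload below are hypothetical, not the
   encoding this module actually registers:

     #include <linux/ioctl.h>

     #define GPFS_IOC_MAGIC 'G'                       // hypothetical magic
     #define GPFS_IOC_STAT  _IOWR(GPFS_IOC_MAGIC, 0x35, struct kxArgs)
     #define GPFS_IOC_FSTAT _IOWR(GPFS_IOC_MAGIC, 0x36, struct kxArgs)
*/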
void gpfs_reg_ioctl32()
{
  int rc = 0;

  /* TO DO: eventually add 32-bit API for PPC64? */
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
  rc = register_ioctl32_conversion(Stat, tsstat32);
  rc |= register_ioctl32_conversion(Fstat, tsfstat32);
  rc |= register_ioctl32_conversion(Fattr, tsfattr32);
  rc |= register_ioctl32_conversion(FsAttr, tsfsattr32);
  rc |= register_ioctl32_conversion(Attr, tsattr32);
  rc |= register_ioctl32_conversion(GetACL, tsgetacl32);
  rc |= register_ioctl32_conversion(PutACL, tsputacl32);
#ifdef DMAPI
  rc |= register_ioctl32_conversion(DmApiCall, kxDmApiCall32);
#endif /* DMAPI */
#ifdef GPFS_QUOTACTL
  rc |= register_ioctl32_conversion(Quotactl, kxQuotactl32);
#endif /* GPFS_QUOTACTL */

  if (rc)
    printk("gpfs_reg_ioctl32: Error in registering ioctl32\n");
#endif /* GPFS_ARCH_X86_64 */
}

void gpfs_unreg_ioctl32()
{
  int rc = 0;

  /* TO DO: eventually add 32-bit API for PPC64? */
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
  rc = unregister_ioctl32_conversion(Stat);
  rc |= unregister_ioctl32_conversion(Fstat);
  rc |= unregister_ioctl32_conversion(Fattr);
  rc |= unregister_ioctl32_conversion(FsAttr);
  rc |= unregister_ioctl32_conversion(Attr);
  rc |= unregister_ioctl32_conversion(GetACL);
  rc |= unregister_ioctl32_conversion(PutACL);
#ifdef DMAPI
  rc |= unregister_ioctl32_conversion(DmApiCall);
#endif /* DMAPI */
#ifdef GPFS_QUOTACTL
  rc |= unregister_ioctl32_conversion(Quotactl);
#endif /* GPFS_QUOTACTL */

  if (rc)
    printk("unregister_ioctl32_conversion: Error in unregistering ioctl32\n");
#endif /* GPFS_ARCH_X86_64 */
}
#endif /* API_32BIT */

/* Initialization of the character device used for the shared segment
   interfaces and other ioctl calls to the kernel code. */
int ss_init()
{
  int major;

  GPFSIoctlMajorNumber = 0;
  major = register_chrdev(0, "ss", &ss_fops);
  if (major < 0)
  {
    TRACE1(TRACE_SHARED, 2, TRCID_SS_081,
           "ss_init: unable to get ss0 major rc %d\n", major);
    return -1;
  }

  GPFSIoctlMajorNumber = major;
  TRACE1(TRACE_SHARED, 2, TRCID_SS_083,
         "ss_init: module loaded ss0 major %d\n", GPFSIoctlMajorNumber);

  return 0;
}
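/* ss_init registers the "ss" device with a dynamically assigned major number,
   so the /dev/ss0 node referred to above has to be created after the module
   is loaded (the GPFS startup scripts normally take care of this).  A minimal
   user-space sketch; looking the major number up in /proc/devices is left to
   the caller, and the 0600 mode is an assumption:

     #include <sys/types.h>
     #include <sys/stat.h>
     #include <sys/sysmacros.h>

     static int make_ss_node(unsigned int ssMajor)   // major from /proc/devices
     {
       // minor 0 matches the check in ss_fs_read
       return mknod("/dev/ss0", S_IFCHR | 0600, makedev(ssMajor, 0));
     }
*/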
/* Management of storage shared between the GPFS daemon and the mmfslinux
   kernel module.  Chunks of memory are allocated on demand by the
   kxAllocSharedKernelMemory call, and are then suballocated by GPFS.  To
   allow free use of pointers, all of this memory is addressed using the same
   virtual addresses whether it is being accessed from the daemon process or
   from a process in kernel mode.  Setting up this addressability requires
   modifying the protection bits in the Linux page table.

   For historical reasons dating to the implementation of GPFS on AIX, the
   storage shared between the GPFS daemon process and the kernel is
   frequently referred to collectively as "the shared segment".

   Note that when pointer swizzling is utilized (via SSEG_SWIZZLE_PTRS), the
   virtual address for the daemon process and kernel is no longer common;
   the page tables are not fiddled with in this situation and a page fault
   handler is utilized instead. */

/* Description of each allocated chunk.  Allocated chunks are linked
   together from ChunkListHead. */
struct ShMemChunkDesc
{
  struct list_head chunkList;  /* list linkage */
  char* vaddrP;                /* virtual address of beginning of chunk */
  int len;                     /* length of chunk */
#ifdef SSEG_SWIZZLE_PTRS
  char* usrvaddrP;             /* corresponding user address from mmap */
#endif
};
struct list_head ChunkListHead;

/* Number of chunks and total size of all chunks */
int NVMallocChunks;
int TotalVMallocBytes;

/* Address of the first chunk allocated.  This value gets returned by
   cxiMapAllSharedKernelMemory as the base of the GPFS shared segment. */
char* FirstVMallocChunkP;

/* Maximum total bytes to allocate, as computed by cxiCalcMaxSharedSegment */
int MaxTotalVMallocBytes;

/* Beginning and end of the area of kernel virtual memory used by
   vmalloc/vfree */
UIntPtr VMallocStart;
UIntPtr VMallocEnd;

/* Minimum size of an allocated chunk */
#define MIN_VMALLOC_CHUNK PAGE_SIZE

/* Lock guarding the chunk list */
spinlock_t ChunkListLock;

/* Pointer to slab allocator for ShMemChunkDesc's */
struct kmem_cache* ChunkCacheP = NULL;

/* Make a range of kernel memory addressable by the current process while
   in user mode */
#ifndef SSEG_SWIZZLE_PTRS
static void unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  TRACE3N(TRACE_SHARED, 9, TRCID_UNPROT_ENTER,
          "unprotectKernelMemory: vaddr 0x%lX len %d allocating %d\n",
          vaddr, len, allocating);

  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  This is a kernel address, so page table entries will persist
       once they have been created, since the Linux kernel is not pageable. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = PTE_OFFSET(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, set both the pte, and pmd/pgd to allow mmfsd process-level
     * access to the area.  Since each process has its own page directory
     * (pgd), an attempt to access one of these unprotected pages will be
     * blocked by the protection bit in that process' pgd.  If another process
     * requires access to shared kernel pages, only its pgd need be updated.
     * pmd_t and pte_t are same size and definition.  Thus pte_rdprotect()
     * (only available macro that hides differences between Suse/Redhat)
     * is used. */
    DBGASSERT(sizeof(pte_t) == sizeof(pmd_t));
    set_pte((pte_t *)pmdP, pte_mkread((*(pte_t *)pmdP)));
    if (allocating)
      set_pte(pteP, pte_mkread(*pteP));
    PTE_UNMAP(pteP);
#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
    // XXX Not implemented
    // pmd_val(*pmdP) = pmd_val(*pmdP) | _PAGE_USER;
    // if (allocating)
    //   set_pte(pteP, pte_mkread(*pteP));
#elif defined(GPFS_ARCH_IA64)
    /* On IA64, set the protection level of the page when it is created.
     * Nothing to do when allowing access from another process except to
     * set the privilege level of the process. */
    if (allocating)
      pte_val(*pteP) = pte_val(*pteP) | PRIVILEGE_FLAGS;
#endif

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change. */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
}
#else
static void unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
{
  /* do nothing when pointer swizzling */
  return;
}
#endif /* !SSEG_SWIZZLE_PTRS */
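/* The PGD_OFFSET/PTE_OFFSET/PTE_UNMAP names used above are GPFS portability
   macros defined elsewhere in the gpl-linux layer, not kernel symbols.  On
   the three-level page tables these kernels use, the walk they are assumed
   to wrap looks roughly like the following for a kernel virtual address;
   this is a sketch for orientation, not the actual macro definitions:

     pgd_t *pgdP = pgd_offset(mm, vaddr);       // page directory slot for vaddr
     pmd_t *pmdP = pmd_offset(pgdP, vaddr);     // middle directory entry
     pte_t *pteP = pte_offset_map(pmdP, vaddr); // map the PTE page if highmem
     ...                                        // examine or update the pte
     pte_unmap(pteP);                           // release the PTE mapping
*/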
/* Make a range of kernel memory no longer addressable by user processes
   while in user mode.  Called just before freeing the memory. */
#ifndef SSEG_SWIZZLE_PTRS
static void reprotectKernelMemory(char* vaddrP, int len)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  ENTER(0);
  TRACE2(TRACE_SHARED, 4, TRCID_REPROT_ENTER,
         "reprotectKernelMemory: vaddr 0x%lX len %d\n", vaddr, len);

  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  This is a kernel address, so page table entries will persist
       once they have been created, since the Linux kernel is not pageable. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = PTE_OFFSET(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, reset the pte and pmd to disallow process-level access. */
    set_pte((pte_t *)pmdP, pte_rdprotect((*(pte_t *)pmdP))); // see unprotect
    set_pte(pteP, pte_rdprotect(*pteP));
#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
    // XXX??? not implemented
#elif defined(GPFS_ARCH_IA64)
    /* On IA64, reset the protection level of the page. */
    pte_val(*pteP) = (pte_val(*pteP) & ~_PAGE_PL_MASK) | _PAGE_PL_0;
#endif
    PTE_UNMAP(pteP);

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change. */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
  EXIT(0);
}
#else
static void reprotectKernelMemory(char* vaddrP, int len)
{
  /* do nothing when pointer swizzling */
  return;
}
#endif /* !SSEG_SWIZZLE_PTRS */

/* Initialize the code that manages shared memory */
void InitSharedMemory()
{
  ENTER(0);
  TRACE2(TRACE_SHARED, 1, TRCID_SHKERN_INIT,
         "InitSharedMemory called.  VMALLOC_START 0x%lX VMALLOC_END 0x%lX\n",
         VMALLOC_START, VMALLOC_END);

  VMallocStart = (UIntPtr)VMALLOC_START;
  VMallocEnd = (UIntPtr)VMALLOC_END;

  spin_lock_init(&ChunkListLock);

  /* Create a slab allocator for ShMemChunkDesc objects */
  ChunkCacheP = kmem_cache_create("ShMemChunkDesc",
                                  sizeof(struct ShMemChunkDesc),
                                  0 /* offset */, 0 /* flags */,
                                  NULL /* ctor */, NULL /* dtor */);
  if (ChunkCacheP == NULL)
    cxiPanic("Cannot create ShMemChunkDesc cache\n");

  /* Empty the chunk list */
  INIT_LIST_HEAD(&ChunkListHead);
  EXIT(0);
}

/* Compute how large the total size of the shared segment is allowed to grow,
   based on a desired size.  A value of 0 for desiredBytes means to compute
   the default maximum size. */
int cxiCalcMaxSharedSegment(int desiredBytes, int* actualBytesP)
{
  Int64 physMemSize;
  Int64 effPhysMemSize;
  UIntPtr minAllowedSize = 16*1024*1024;
  UIntPtr maxAllowedSize = MAX_SSEG_MAPPINGS*1024*1024;
  UIntPtr actualBytes;
  char* p;
  UIntPtr vmUsed;
  UIntPtr vmRegionReserved;
  UIntPtr maxBytes;

  /* If an explicit number of desired bytes was given, use that value.
     Otherwise, if no number of desired bytes was given (or a value smaller
     than the minimum possible was specified) compute the size based on the
     size of real memory.  The size computed is a fixed fraction of real
     memory (only the first 2G on i386). */
  ENTER(0);
  physMemSize = (Int64)num_physpages * PAGE_SIZE;
#ifdef GPFS_ARCH_I386
  effPhysMemSize = MIN(physMemSize, (Int64)0x80000000);
#else
  effPhysMemSize = physMemSize;
#endif

  if (desiredBytes > 0)
    actualBytes = desiredBytes;
  else
    actualBytes = effPhysMemSize/16;
  actualBytes = MAX(actualBytes, minAllowedSize);

  /* Compute an approximation of how many bytes are already used in the
     vmalloc region.  The variables needed to compute this exactly are not
     exported from the kernel.  If we vmalloc a single page area and see how
     far the allocated area is from the beginning of the vmalloc region, we
     have at least a lower bound on the amount of vmalloc storage already
     used.  If there have been no vfrees, this will yield an accurate
     answer. */
  p = vmalloc(PAGE_SIZE);
  if (p == NULL)
    vmUsed = VMallocEnd - VMallocStart;
  else
  {
    vmUsed = (UIntPtr)p - VMallocStart;
    vfree(p);
  }

  /* Make sure the actual maximum fits within the vmalloc region, taking
     into account memory already used and leaving a reserved area for other
     vmallocs. */
  vmRegionReserved = 16*1024*1024;
  maxBytes = (VMallocEnd-VMallocStart) - (vmUsed+vmRegionReserved);
  actualBytes = MIN(actualBytes, maxBytes);

  /* Make sure the actual maximum does not exceed the maximum possible */
  actualBytes = MIN(actualBytes, maxAllowedSize);

  /* Make sure the actual maximum is less than half of real memory */
  actualBytes = MIN(actualBytes, effPhysMemSize/2);

  /* Round actual maximum down to a multiple of the page size */
  actualBytes = (actualBytes/PAGE_SIZE) * PAGE_SIZE;

  /* If actual maximum is less than the minimum allowed, return 0 */
  if (actualBytes < minAllowedSize)
    actualBytes = 0;

  /* Return result */
  TRACE5(TRACE_SHARED, 1, TRCID_CALC_MAX_SHARED,
         "cxiCalcMaxSharedSegment: actualBytes 0x%lX desiredBytes %d "
         "physMemSize 0x%lX vmUsed 0x%lX maxBytes 0x%lX\n",
         actualBytes, desiredBytes, physMemSize, vmUsed, maxBytes);

  *actualBytesP = (int)actualBytes;
  MaxTotalVMallocBytes = (int)actualBytes;
  EXIT(0);
  return 0;
}
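/* A worked example of the sizing logic above, assuming an i386 node with
   4 GB of RAM and the default request (desiredBytes == 0).  The vmalloc
   numbers depend on the kernel configuration, so they are illustrative only:

     physMemSize    = 4 GB
     effPhysMemSize = 2 GB                       // i386 cap at 0x80000000
     actualBytes    = 2 GB / 16 = 128 MB         // default fraction
     maxBytes       = (VMallocEnd - VMallocStart) - (vmUsed + 16 MB)
     actualBytes    = MIN(actualBytes, maxBytes,
                          MAX_SSEG_MAPPINGS MB,  // absolute cap
                          1 GB)                  // effPhysMemSize / 2
     actualBytes    = rounded down to a PAGE_SIZE multiple
*/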
/* Acquire additional kernel memory that is mapped to user space when
 * using SSEG_SWIZZLE_PTRS (different virtual address between kernel and
 * daemon); otherwise allocated memory uses the same virtual address
 * for both kernel code and the GPFS daemon.  Will get at least minBytes.
 * Returns the starting virtual address of the area and its actual length. */
int cxiAllocSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  int rc = 0;
  int code = 0;
  char *vaddrP;
  struct ShMemChunkDesc* chunkP = NULL;
  int minBytes = mappingP->kBytes * 1024;
  int actualBytes;
  pgprot_t prot;
#if defined(GPFS_ARCH_X86_64) && !defined(SSEG_SWIZZLE_PTRS)
  pml4_t* pml4P;
#endif

  /* On linux we only allocate the shared segment in this manner */
  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Compute actual number of bytes to allocate */
  if (minBytes <= MIN_VMALLOC_CHUNK)
    actualBytes = MIN_VMALLOC_CHUNK;
  else
    actualBytes = ((minBytes + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;

  TRACE2(TRACE_SHARED, 5, TRCID_ALLOC_SHARED_VMALLOC,
         "cxiAllocSharedMemory: vmalloc %d minBytes %d\n",
         actualBytes, minBytes);

  /* Return failure if this allocation would put us over the limit */
  if (TotalVMallocBytes + actualBytes > MaxTotalVMallocBytes)
  {
    code = 1;
    rc = -ENOMEM;
    goto xerror;
  }

  /* Get a descriptor for the memory to be allocated */
  chunkP = (struct ShMemChunkDesc*) kmem_cache_alloc(ChunkCacheP, GFP_KERNEL);
  if (chunkP == NULL)
  {
    code = 2;
    rc = -ENOMEM;
    goto xerror;
  }

  /* Allocate memory
   * ?? Instead of calling vmalloc here, we could also do something like:
   *   pgprot_t prot;
   *   prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
   *   vaddrP = __vmalloc(actualBytes, GFP_KERNEL | __GFP_HIGHMEM, prot);
   *
   * This is an expansion of the vmalloc inline function, with _PAGE_USER
   * added to the protection bits so that the PTE entries will already be set
   * correctly.  However, a call to unprotectKernelMemory would still be
   * needed to set the protection bits in the PMD entries.
   *
   * There is also the possibility here of using __GFP_HIGHMEM instead of
   * GFP_KERNEL on machines with sufficient high memory.  The storage
   * allocated here will never be used as I/O buffers, so high memory would
   * be a good place to put it.  This would give I/O buffers a greater chance
   * of being allocated below 1G, reducing the need for bounce buffers to do
   * I/O. */
#ifndef SSEG_SWIZZLE_PTRS
#if defined(GPFS_ARCH_POWER)
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#elif defined(GPFS_ARCH_X86_64)
#define __pml4(x) ((pml4_t) { (x) } )
  pml4P = pml4_offset_k(VMALLOC_START);
  set_pml4(pml4P, __pml4(pml4_val(*pml4P) | _PAGE_USER));
#undef __pml4
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER | _PAGE_GLOBAL);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#elif defined(GPFS_ARCH_PPC64)
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#else
  vaddrP = vmalloc(actualBytes);
#endif
#else
  vaddrP = vmalloc(actualBytes);
#endif /* !SSEG_SWIZZLE_PTRS */
  if (vaddrP == NULL)
  {
    code = 3;
    rc = -ENOMEM;
    goto xerror;
  }
#ifdef MALLOC_DEBUG
  MallocDebugNew(vaddrP, actualBytes, 3);
#endif

  spin_lock(&ChunkListLock);
  NVMallocChunks += 1;
  TotalVMallocBytes += actualBytes;

  /* Remember address of first chunk allocated */
  if (NVMallocChunks == 1)
    FirstVMallocChunkP = vaddrP;

  /* Fill in chunk descriptor and add it to the proper list */
  chunkP->vaddrP = vaddrP;
  chunkP->len = actualBytes;
#ifdef SSEG_SWIZZLE_PTRS
  chunkP->usrvaddrP = 0;
#endif
  list_add(&chunkP->chunkList, &ChunkListHead);
  spin_unlock(&ChunkListLock);

  /* Make memory just allocated addressable by the current process */
  unprotectKernelMemory(vaddrP, actualBytes, true);

  /* Return results */
  mappingP->vaddr = vaddrP;
  mappingP->kBytes = actualBytes / 1024;
#ifdef SSEG_SWIZZLE_PTRS
  mappingP->kvaddr = vaddrP;
  /* mappingP->vaddr is reset to proper user va in kxAllocSharedMemory */
#endif

xerror:
  if (rc)
  {
    InitMemoryMapping(mappingP);
    if (chunkP)
      kmem_cache_free(ChunkCacheP, (void*)chunkP);
  }

  TRACE4(TRACE_SHARED, 1, TRCID_ALLOC_SHARED_EXIT,
         "cxiAllocSharedMemory: vaddr 0x%lX kBytes %d rc %d code %d\n",
         mappingP->vaddr, mappingP->kBytes, rc, code);
  EXIT(0);
  return rc;
}
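/* For orientation, a rough sketch of how the routines in this area are
   expected to be driven when the daemon grows the shared segment.  The kx*
   wrappers that actually make these calls live outside this file, so the
   ordering below is an assumption based on the comments here, not a
   definitive call sequence:

     int maxBytes;
     cxiCalcMaxSharedSegment(0, &maxBytes);      // establish size limit at startup
     ...
     mapping.kBytes = wantedKBytes;              // requested chunk size in KB
     rc = cxiAllocSharedMemory(&mapping, true);  // vmalloc + unprotect a chunk
     // with SSEG_SWIZZLE_PTRS, after the daemon mmaps the chunk the user
     // address is recorded (cxiRecordSharedMemory below) so that ss_vm_nopage
     // can translate faults back to the backing kernel pages
*/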
#ifdef SSEG_SWIZZLE_PTRS
/* Record the user address that is associated with the kernel vmalloc
   address (vmalloc chunk for shared segment).  This is needed later on by
   the page fault handler.  This routine is called after allocating the chunk
   and determining the corresponding user address (used by all user processes
   mmap'ing this specific shared segment chunk). */
int cxiRecordSharedMemory(cxiMemoryMapping_t *mappingP)
{
  int found = 0;
  struct ShMemChunkDesc* chunkP = NULL;
  struct list_head* p;

  ENTER(0);
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    if (chunkP->vaddrP == mappingP->kvaddr)
    {
      chunkP->usrvaddrP = mappingP->vaddr;
      found = 1;
      break;
    }
  }
  spin_unlock(&ChunkListLock);

  EXIT(0);
  if (!found)
    return -1;
  else
    return 0;
}

/* Obtain any necessary kernel information for initializing pointer
   swizzling; currently just grabs vmalloc range info. */
int cxiInitPtrSwizzling(UIntPtr *vmallocStartP, UIntPtr *vmallocEndP)
{
  ENTER(0);
  *vmallocStartP = (UIntPtr)VMALLOC_START;
  *vmallocEndP = (UIntPtr)VMALLOC_END;
  EXIT(0);
  return 0;
}
#endif
/* Unmap and deallocate all shared segment memory */
int cxiFreeSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* firstP;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of multi page chunks.  Free each one and its
   * associated chunk descriptor.  Drop the list lock while freeing
   * storage. */
  spin_lock(&ChunkListLock);
  while (!list_empty(&ChunkListHead))
  {
    firstP = ChunkListHead.next;
    list_del(firstP);
    chunkP = list_entry(firstP, struct ShMemChunkDesc, chunkList);
    NVMallocChunks -= 1;
    TotalVMallocBytes -= chunkP->len;
    spin_unlock(&ChunkListLock);

    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);

    TRACE2(TRACE_SHARED, 4, TRCID_FREEALL_VFREE,
           "cxiFreeSharedMemory: vaddrP 0x%lX chunkP 0x%lX\n",
           chunkP->vaddrP, chunkP);
    vfree(chunkP->vaddrP);
#ifdef MALLOC_DEBUG
    MallocDebugDelete(chunkP->vaddrP);
#endif
    kmem_cache_free(ChunkCacheP, (void*)chunkP);
    spin_lock(&ChunkListLock);
  }
  FirstVMallocChunkP = NULL;
  spin_unlock(&ChunkListLock);

  InitMemoryMapping(mappingP);
  EXIT(0);
  return 0;
}

/* Map the shared segment memory into the address
 * space of the calling process */
int cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* p;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of allocated chunks.  Map each one so that
   * this process can access it from user space. */
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    TRACE1N(TRACE_SHARED, 11, TRCID_MAPALL_MULTI,
            "cxiAttachSharedMemory: chunkP 0x%lX\n", chunkP);

    /* unprotectKernelMemory has to be called here with 'allocating'
     * set to 'true', so that mmfsadm can map and access the shared segment
     * even when the daemon has died and called reprotectKernelMemory */
    unprotectKernelMemory(chunkP->vaddrP, chunkP->len, true);
  }
  spin_unlock(&ChunkListLock);

  /* Return address of first chunk allocated; this will be the
   * base of the GPFS shared segment */
  mappingP->vaddr = FirstVMallocChunkP;
#ifdef SSEG_SWIZZLE_PTRS
  mappingP->kvaddr = FirstVMallocChunkP;
  /* mappingP->vaddr is reset to proper user va in kxAttachSharedMemory */
#endif

  /* If there were no chunks, return ENOENT */
  EXIT(0);
  return (NVMallocChunks > 0) ? 0 : -ENOENT;
}

int cxiDetachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* p;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of allocated chunks.  Reprotect each one so that
   * it can no longer be accessed from user space. */
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    TRACE1N(TRACE_SHARED, 11, TRCID_UNMAPALL_MULTI,
            "cxiDetachSharedMemory: chunkP 0x%lX\n", chunkP);

    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);
  }
  spin_unlock(&ChunkListLock);

  EXIT(0);
  return 0;
}
/* Clean up the code that manages shared kernel memory,
 * including freeing all allocated chunks. */
void TermSharedMemory()
{
  cxiMemoryMapping_t mapping;

  ENTER(0);
  InitMemoryMapping(&mapping);

  /* Delete shared segment */
  cxiFreeSharedMemory(&mapping, true);

  /* Destroy slab allocator for ShMemChunkDesc objects */
  (void)kmem_cache_destroy(ChunkCacheP);

  /* Unregister the shared segment device driver */
  unregister_chrdev(GPFSIoctlMajorNumber, "ss");
  TRACE1(TRACE_SHARED, 2, TRCID_SSINIT_003,
         "module unloaded major %d\n", GPFSIoctlMajorNumber);
  GPFSIoctlMajorNumber = 0;
  EXIT(0);
}

/* Clean up slab for ShMemChunkDesc (for early termination) */
void CleanUpSharedMemory()
{
  /* Destroy slab allocator for ShMemChunkDesc objects */
  (void)kmem_cache_destroy(ChunkCacheP);
}

int kxCoreDump(long sig, void *info, struct ucontext *sc, char *filenameP)
{
  struct pt_regs regs;
  static int getDump = 0;
  struct linux_binfmt * binfmt;
  char *tmp = NULL;
  int rc = -1;
  int code = 0;
  struct file *file = NULL;
  Boolean klock = false;
  struct sigcontext_struct *uc_mcontext;
  unsigned long len;

  printk("kxCoreDump sig: %d fn: %s\n", sig, filenameP);

  if (getDump == 0)
    getDump = 1; // don't create more than one core dump at the same time
  else
    return 1;

  memset((char *)&regs, 0, sizeof(struct pt_regs));

  if (sig) /* Build pt_regs from sigcontext struct */
  {
    code = 11;
    goto xerror;
  }
  tmp = cxiMallocPinned(CXI_PATH_MAX+1);
  if (!tmp)
  {
    code = 1;
    tmp = NULL;
    goto xerror;
  }
  if (cxiCopyInstr(filenameP, tmp, CXI_PATH_MAX, &len) != 0)
  {
    code = 12;
    goto xerror;
  }

  lock_kernel();
  klock = true;

  binfmt = current->binfmt;
  if (!binfmt || !binfmt->core_dump)
  {
    code = 2;
    goto xerror;
  }

  if (MY_RLIM_CUR(RLIMIT_CORE) > 0x01000000)
    MY_RLIM_CUR(RLIMIT_CORE) = 0x10000000;

  file = filp_open(tmp, O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
  if (IS_ERR(file))
  {
    code = 4;
    file = NULL;
    goto xerror;
  }
  if (!file->f_op || !file->f_op->write)
  {
    code = 5;
    goto xerror;
  }

  rc = binfmt->core_dump(sig, &regs, file);
  if (!rc)
  {
    code = 6;
    goto xerror;
  }

xerror:
  if (file)
    filp_close(file, NULL);

  if (klock)
    unlock_kernel();

  if (tmp)
    cxiFreePinned(tmp);

  getDump = 0;
  return rc;
}
/* This call looks very similar to a MAP_ANONYMOUS mmap() call.  That's
 * because we used to do mmap() for this region.  Unfortunately when we
 * want MAP_PRIVATE semantics we don't get the results on Linux that we
 * expect.  The trouble starts when the pages of this memory
 * area are marked copy-on-write.  Since this is our buffer pool, when
 * I/O gets done, the old page goes to the child process and the new page goes
 * to the parent (mmfsd).  Unfortunately, the I/O gets done to the old page
 * since its physical address was cached in the kiobuf.
 *
 * One attempt at fixing this was by making the area shared between parent
 * and child via MAP_SHARED.  However, it opens the possibility of a child
 * process run from system() or popen() being able to stomp on the GPFS buffer
 * pool.  Additionally, putting MAP_SHARED on the region causes it
 * to be internally mapped to /dev/zero (apparently it needs some file mapping
 * on this MAP_ANONYMOUS region).  Subsequent madvise() calls saying that
 * we don't need the pages (MADV_DONTNEED) don't really free the
 * pages since there is still a hold count due to the kernel /dev/zero
 * mapping.  Thus the free pages reported by vmstat don't go down even
 * though we're freeing them from the mmap'd region.
 *
 * This all boils down to a workaround where we MAP_PRIVATE as we
 * wanted but set the VM_DONTCOPY flag so these mmap pages don't
 * get inherited by child processes.
 *
 * GPFS also needs to make sure that pages of its buffer pool are pinned in
 * memory.  This is necessary because GPFS caches the pointers to the struct
 * page objects returned by map_user_kiobuf.  Linux might steal pages in
 * one of two ways: reclaim_page will steal pages with count <= 1, and
 * swap_out_vma will clear the page table mapping of pages belonging to
 * vm_area_structs that do not have the VM_LOCKED bit set.
 * GPFS prevents the first case because map_user_kiobuf increases page
 * reference counts to 2.  We used to turn on the VM_LOCKED bit here,
 * but now we mlock() the memory to ensure it isn't swapped out. */
int kxMapPrivate(char *inAddr, unsigned long len, unsigned long prot,
                 char **outAddr)
{
  struct mm_struct *mmP;
  struct vm_area_struct *vmaP = NULL;

  mmP = current->mm;

  ACQUIRE_MMAP_SEM(&mmP->mmap_sem);

  *outAddr = (char *)do_mmap(NULL, (unsigned long)inAddr, len, prot,
                             MAP_PRIVATE | MAP_ANONYMOUS, 0);

  /* Only look for the address in the vma list if do_mmap matches what we
     asked for; otherwise it may be an unexpected address or an error code
     and both are a problem.  Any issues should be handled in the daemon if
     possible (eg, -ENOMEM). */
  if (*outAddr == inAddr)
  {
    for (vmaP = mmP->mmap; vmaP != NULL; vmaP = vmaP->vm_next)
      if (vmaP->vm_start == (unsigned long)*outAddr)
      {
        /* We don't want our vm_area_structs merged since we are
         * about to set a flag that would cross into an area where
         * it might not be good.  For instance if we get merged with
         * the stack vm area then we won't be able to fork since the
         * stack wouldn't be copied. */
        LOGASSERT(vmaP->vm_end == vmaP->vm_start + len);
        vmaP->vm_flags |= VM_DONTCOPY;
        break;
      }

    DBGASSERT(vmaP != NULL);
  }

  RELEASE_MMAP_SEM(&mmP->mmap_sem);

  TRACE5(TRACE_SHARED, 1, TRCID_CXI_MAP_PRIVATE,
         "kxMapPrivate: inAddr 0x%lX len %d prot 0x%X outAddr 0x%lX vmaP 0x%lX\n",
         inAddr, len, prot, *outAddr, vmaP);

  if (*outAddr == inAddr)
    return 0;

  return -EFAULT;
}
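/* A user-space sketch of the buffer pool setup that the comment above
   describes: the daemon asks the kernel to create the MAP_PRIVATE region
   through the MapPrivate ioctl (handled in ss_fs_ioctl), then mlocks it so
   the pages stay resident.  The kxArgs packing matches the MapPrivate case
   in ss_fs_ioctl; the struct layout and descriptor handling are assumptions
   made for illustration (see the earlier kxArgsSketch):

     #include <sys/ioctl.h>
     #include <sys/mman.h>

     static char *setup_buffer_pool(int ssfd, char *hint, unsigned long len)
     {
       char *poolP = NULL;
       struct kxArgsSketch a;              // assumed layout, see earlier sketch
       a.arg1 = (long)hint;                // desired address of the region
       a.arg2 = (long)len;                 // region length
       a.arg3 = PROT_READ | PROT_WRITE;    // protection passed to do_mmap
       a.arg4 = (long)&poolP;              // kxMapPrivate returns the address here
       if (ioctl(ssfd, MapPrivate, (unsigned long)&a) != 0)
         return NULL;
       if (mlock(poolP, len) != 0)         // pin pages; VM_DONTCOPY is already set
         return NULL;
       return poolP;
     }
*/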
#ifdef SSEG_SWIZZLE_PTRS
/* mmap handler for shared segment */
int ss_fs_mmap(struct file *file, struct vm_area_struct *vma)
{
  UIntPtr offset = vma->vm_pgoff << PAGE_SHIFT;
  UIntPtr size = vma->vm_end - vma->vm_start;

  if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
  {
    printk("ss_fs_mmap: invalid mmap flags\n");
    return -EINVAL;
  }

  if (offset != 0)
  {
    printk("ss_fs_mmap: page offset should be zero (%ld)\n", offset);
    return -EINVAL;
  }

  /* add page fault handler for vm area */
  vma->vm_ops = &ss_vm_ops;

#if LINUX_KERNEL_VERSION >= 2060000
  /* 2.6 kernel appears to want the pages marked as unswappable, otherwise
     gobs of messages about "Badness in do_nopage/copy_page_range" occur in
     the system log.  Still looking at this, but it appears that the kernel
     expects these pages to be "device" reserved pages versus typical
     anonymous pages (assumes a device intends to use the pages for DMA?)
     and doesn't want them tracked by VMM. */
  vma->vm_flags |= VM_RESERVED;
#endif

  /* perform open on vm area */
  ss_vm_open(vma);

  return 0;
}

/* vm area handlers for shared segment */
void ss_vm_open(struct vm_area_struct *vma)
{
  MY_MODULE_INCREMENT();
}

void ss_vm_close(struct vm_area_struct *vma)
{
  MY_MODULE_DECREMENT();
}

/* Page fault handler.
   Called by do_no_page with the address of the faulting page (ie, on a page
   boundary). */
#if LINUX_KERNEL_VERSION < 2060000
struct page *
ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
#else
struct page *
ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
#endif /* LINUX_KERNEL_VERSION < 2060000 */
{
  UIntPtr offset;
  UIntPtr va;
  struct page *ret_page = NOPAGE_SIGBUS;
  int found = 0;
  struct list_head* p;
  struct ShMemChunkDesc* chunkP;

  if ((address < vma->vm_start) || (address >= vma->vm_end))
  {
    printk("ss_vm_nopage: address 0x%lx out of vma range [%lx,%lx)\n",
           address, vma->vm_start, vma->vm_end);
    return ret_page;
  }

  /* Make sure that the user address from a page fault is backed by kernel
     memory (find a containing memory chunk).  The most recently allocated
     block will be at the head of the list, so generally we only check the
     first list entry. */
  /* May want to cache the last list entry where a "hit" occurs if needed
     for performance at some point, eg, non-daemon attach. */
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    if ((address >= (UIntPtr)chunkP->usrvaddrP) &&
        (address < (UIntPtr)chunkP->usrvaddrP + chunkP->len))
    {
      found = 1;
      break;
    }
  }
  spin_unlock(&ChunkListLock);

  if (!found)
  {
    /* We have a problem; unable to find backing kernel memory */
    printk("ss_vm_nopage: unable to find kernel chunk backing user address 0x%lx\n",
           address);
    return ret_page;
  }

  /* calculate the kernel virtual address */
  offset = address - (IntPtr)chunkP->usrvaddrP;
  va = (UIntPtr)(chunkP->vaddrP + offset);

  /* Grab the kernel page table lock before traversing the kernel page table.
     I believe this is necessary in order to avoid having another processor
     change the page table on us while we are traversing.  Normally only the
     process page table lock is grabbed when a page fault occurs (to protect
     against kswapd). */
  spin_lock(&init_mm.page_table_lock);

  /* traverse kernel page table */
  ret_page = vmalloc_to_page((void *)va);

  spin_unlock(&init_mm.page_table_lock);
  if (ret_page == NULL)
  {
    printk("ss_vm_nopage: vmalloc_to_page returned NULL\n");
    return ret_page;
  }

  /* bump up page use count */
  get_page(ret_page);

#ifdef SWIZ_BIG_DEBUG
  printk("ss_vm_nopage: page fault for offset 0x%lx uva 0x%lx va 0x%lx (kva x%lx)\n",
         offset, address, va, page_address(ret_page));
#endif

#if LINUX_KERNEL_VERSION >= 2060000
  if (type)
    *type = VM_FAULT_MINOR;
#endif

  /* return page */
  return ret_page;
}
#endif /* SSEG_SWIZZLE_PTRS */
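/* When SSEG_SWIZZLE_PTRS is enabled the daemon maps each shared segment
   chunk through the mmap handler above instead of sharing the kernel virtual
   address.  A user-space sketch of what ss_fs_mmap expects from the caller:
   the mapping must be MAP_SHARED (a writable private mapping is rejected),
   the file offset must be zero, and the length would be the chunk size
   reported by the kernel.  How the chunk sizes and ordering are learned is
   outside this file and therefore an assumption here:

     #include <sys/mman.h>

     static void *map_sseg_chunk(int ssfd, size_t chunkBytes)
     {
       void *p = mmap(NULL, chunkBytes, PROT_READ | PROT_WRITE,
                      MAP_SHARED, ssfd, 0);   // offset must be zero
       return (p == MAP_FAILED) ? NULL : p;
     }
*/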