/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)26 1.86 src/avs/fs/mmfs/ts/kernext/gpl-linux/mmap.c, mmfs, avs_rgpfs24, rgpfs24s003a 5/8/06 11:04:56 */ #include #include #include #if defined(REDHAT_AS_LINUX) && LINUX_KERNEL_VERSION >= 2042101 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* True if the paging operations are enabled. Serialized using PQLockWord. */ static Boolean mmapEnabled = false; /* Storage for page queue entries */ #define MAX_PAGEQUE_ENTRIES 500 static cxibuf_t Page_queue[MAX_PAGEQUE_ENTRIES]; /* Head of list of free page queue entries, protected by PQLockWord */ static cxibuf_t *PageQueueFreeP; static cxiBlockingMutex_t PQLockWord; /* dump page contents * flag = 0 ==> after read from disk * 1 ==> write */ static void dump_page(struct vm_area_struct *vma, struct page *page, int flag) { #ifdef TRACE_IO_DATA int trcbuf[12]; char *what = (flag == 1) ? "write" : "read"; char *kaddr = kmap(page); ENTER(0); memcpy(trcbuf, kaddr, sizeof(trcbuf)); kunmap(page); TRACE8(TRACE_VNODE, 6, TRCID_MMAP_DIRTY_PAGE_DUMP, "dump 0 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n", what, page, vma, page_count(page), CPUToBigEnd32(trcbuf[0]), CPUToBigEnd32(trcbuf[1]), CPUToBigEnd32(trcbuf[2]), CPUToBigEnd32(trcbuf[3])); TRACE8(TRACE_VNODE, 9, TRCID_MMAP_DIRTY_PAGE_DUMP_A, "dump 1 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n", what, page, vma, page_count(page), CPUToBigEnd32(trcbuf[4]), CPUToBigEnd32(trcbuf[5]), CPUToBigEnd32(trcbuf[6]), CPUToBigEnd32(trcbuf[7])); TRACE8(TRACE_VNODE, 9, TRCID_MMAP_DIRTY_PAGE_DUMP_B, "dump 2 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n", what, page, vma, page_count(page), CPUToBigEnd32(trcbuf[8]), CPUToBigEnd32(trcbuf[9]), CPUToBigEnd32(trcbuf[10]), CPUToBigEnd32(trcbuf[11])); EXIT(0); #endif } /* Disable paging operations */ void mmapKill() { ENTER(0); cxiBlockingMutexAcquire(&PQLockWord); mmapEnabled = false; cxiBlockingMutexRelease(&PQLockWord); EXIT(0); } void EnableMmap() { /* It is ok to change without holding PQLockWord since it is * called from initialization */ mmapEnabled = true; } int cxiMmapRegister(void *dummy) { int i; ENTER(0); TRACE0(TRACE_VNODE, 2, TRCID_MMAP_REG_ENTER, "cxiMmapRegister enter\n"); cxiBlockingMutexInit(&PQLockWord, GPFS_LOCK_MMAP_FREEQ_IDX); TRACE2(TRACE_VNODE, 2, TRCID_MMAP_REG_5, "cxiMmapRegister: Page_queue addr range [0x%lX - 0x%lX]\n", &Page_queue[0], &Page_queue[MAX_PAGEQUE_ENTRIES - 1] ); /* Initialize page queue entries. When a page arrives for read or write (by readpage or writepage functions), the page information will be copied to a free queue entry and that entry will be added to the end of the pager kproc queue. */ PageQueueFreeP = NULL; for (i = 0; i < MAX_PAGEQUE_ENTRIES; i++) { Page_queue[i].av_forw = PageQueueFreeP; PageQueueFreeP = &Page_queue[i]; Page_queue[i].pageP = NULL; Page_queue[i].b_vp = NULL; Page_queue[i].vinfoP = NULL; Page_queue[i].b_baddr = NULL; Page_queue[i].b_flags = 0; Page_queue[i].b_blkno = 0; } mmapEnabled = true; EXIT(0); return 0; } /* Module termination */ int cxiMmapUnregister(void *dummy) { ENTER(0); TRACE0(TRACE_VNODE, 2, TRCID_MMAP_UNREG_ENTER, "cxiMmapUnregister enter\n"); PageQueueFreeP = NULL; mmapEnabled = false; EXIT(0); return 0; } Int64 getFilePos(cxibuf_t *bufP) { Int64 pos = (Int64) bufP->b_blkno << PAGE_SHIFT; ENTER(0); TRACE1(TRACE_VNODE, 5, TRCID_MMAP_FILEPOS_ENTER, "getFilePos: pos 0x%llX\n", pos); EXIT(0); return pos; } char *VM_Attach(cxibuf_t *bufP) { DBGASSERT(bufP->pageP != NULL); return kmap(bufP->pageP); } void VM_Detach(cxibuf_t *bufP, char *baddrP) { kunmap(bufP->pageP); } void IoDone(cxibuf_t *bufP) { struct page *pageP = bufP->pageP; if (pageP != NULL) { TRACE5(TRACE_VNODE, 2, TRCID_MMAP_IO_ENTER, "IoDone enter: b_flags 0x%lX pageP 0x%lX index %d count %d flags 0x%lX\n", bufP->b_flags, pageP, pageP->index, page_count(pageP), pageP->flags); /* error in read or write operation */ if ((bufP->b_flags & B_ERROR) != 0) SetPageError(pageP); else if ((bufP->b_flags & B_READ) != 0) SetPageUptodate(pageP); TRACE2(TRACE_VNODE, 2, TRCID_MMAP_IO_EXIT, "IoDone exit: pageP 0x%lX flags 0x%lX\n", pageP, pageP->flags); #if LINUX_KERNEL_VERSION >= 2050000 if ((bufP->b_flags & B_READ) == 0) /* This was a writeback request. Signal its completion by clearing the writeback flag. */ end_page_writeback(pageP); else #endif PAGE_UNLOCK(pageP); } /* If this was an asynchronous request, free the buf struct. For synchronous requests, the buf is a stack variable. */ if ((bufP->b_flags & B_ASYNC) != 0) { cxiBlockingMutexAcquire(&PQLockWord); bufP->av_forw = PageQueueFreeP; PageQueueFreeP = bufP; cxiBlockingMutexRelease(&PQLockWord); } } void getVp(void *gnP, void **vP, struct gpfsVfsData_t **privVfsP) { cxiNode_t *cP = (cxiNode_t *)gnP; struct inode *iP = (struct inode *)cP->osNodeP; *privVfsP = VP_TO_PVP(iP); *vP = cP->osNodeP; } /* Flush/invalidate a mapped range: CmfProtect - Remove pages from address space so that new references will cause a page fault or protection fault CmfFlush - Write dirty pages CmfInval - Prevent cached page from being re-used */ int cxiMmapFlush(cxiNode_t *cnP, UInt64 start, UInt64 end, enum CmflushOption cmopt) { int rc = 0; struct inode *inodeP = cnP->osNodeP; ENTER(0); TRACE5(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_ENTER, "cxiMmapFlush: cnP 0x%lX inodeNum %d opt %d range 0x%llX-0x%llX\n", cnP, inodeP->i_ino, cmopt, start, end); switch (cmopt) { case CmfProtect: /* Block new modifications to page. This clears PTEs, which will force them to page fault. It also transfers the dirty bit from the PTE to the page struct. */ UNMAP_MAPPING_RANGE(inodeP->i_mapping, start, 0); break; case CmfFlush: FILEMAP_FDATASYNC(rc, inodeP->i_mapping); if (rc == 0) FILEMAP_FDATAWAIT(rc, inodeP->i_mapping); break; case CmfInval: truncate_inode_pages(inodeP->i_mapping, (start & PAGE_CACHE_MASK)); break; } TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_EXIT, "cxiMmapFlush exit: rc %d\n", rc); EXIT(0); return rc; } /* Lock a cache page for inode bufP->inodeP at index bufP->b_blkno, creating if necessary. Save pointer to page in bufP->pageP. On error, return with bufP->pageP NULL. Page will be locked and a reference will be added. Return non-zero if page is already up to date. */ int cxiMmapGetPage(cxibuf_t *bufP) { int rc = 0; struct inode *inodeP = (struct inode *)bufP->b_inodeP; struct page *pageP = grab_cache_page(inodeP->i_mapping, bufP->b_blkno); ENTER(0); if (pageP != NULL) { if (PAGE_UP_TO_DATE(pageP)) rc = EEXIST; else ClearPageError(pageP); TRACE6(TRACE_VNODE, 1, TRCID_CXIGETPAGE, "cxiMmapGetPage: page 0x%lX index %d count %d flags 0x%lX mapping 0x%lX uptodate %d\n", pageP, pageP->index, page_count(pageP), pageP->flags, pageP->mapping, (rc != 0)); } bufP->pageP = pageP; EXIT(0); return rc; } /* Release/unlock page */ void cxiMmapReleasePage(struct page *pageP) { ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_CXIRELPAGE, "cxiMmapReleasePage: released page 0x%lX index %d count %d flags 0x%lX\n", pageP, pageP->index, page_count(pageP), pageP->flags); PAGE_UNLOCK(pageP); page_cache_release(pageP); EXIT(0); } /* Called from do_no_page() to handle page fault. Add page to cache if not already there and add a reference. If contents are not already up to date, then read new contents from disk. Return NULL if failure. */ struct page * #if LINUX_KERNEL_VERSION > 2060300 gpfs_filemap_nopage(struct vm_area_struct *area, unsigned long address, int * noShare) #else gpfs_filemap_nopage(struct vm_area_struct *area, unsigned long address, int noShare) #endif { unsigned long index; struct page *pageP = NULL; struct page **hashP; struct file *fileP = area->vm_file; struct inode *inodeP; struct MMFSVInfo *vinfoP; Boolean haveFlushLock = false; cxiNode_t *cnP; cxibuf_t buf; VFS_STAT_START(readpageCall); ENTER(0); TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE, "gpfs_filemap_nopage enter: area 0x%lX address 0x%lX vm_file 0x%lX " "vm_mm 0x%lX mm_users %d noShare %d\n", area, address, fileP, area->vm_mm, atomic_read(&area->vm_mm->mm_users), noShare); index = area->vm_pgoff + ((address - area->vm_start) >> PAGE_CACHE_SHIFT); TRACE4(TRACE_VNODE, 3, TRCID_LINUXOPS_NOPAGE_1, "gpfs_filemap_nopage: vm_start 0x%lX vm_end 0x%lX vm_flags 0x%lX " "index %d\n", area->vm_start, area->vm_end, area->vm_flags, index); /* Check that paging operations are still enabled */ if (!mmapEnabled) goto exit; LOGASSERT(fileP != NULL); inodeP = fileP->f_dentry->d_inode; LOGASSERT(inodeP != NULL); cnP = VP_TO_CNP(inodeP); /* Remember that there were paging requests under the given instance */ vinfoP = (struct MMFSVInfo *)fileP->private_data; if (vinfoP != NULL) ((struct cxiVinfo_t*)vinfoP)->rwPageDone = true; /* See if this page is already in the cache, and add a reference if so */ #if LINUX_KERNEL_VERSION >= 2057200 pageP = find_get_page(inodeP->i_mapping, index); #else hashP = page_hash(inodeP->i_mapping, index); pageP = __find_get_page(inodeP->i_mapping, index, hashP); #endif if (pageP) { /* Page is already cached. If it is up to date, then we do not need to read it. Hold mmap flush lock until after making pte valid. */ gpfs_ops.gpfsMmapFlushLock(cnP); haveFlushLock = true; if (PAGE_UP_TO_DATE(pageP)) goto exit; /* Not up to date. Release page and go through processRead to fetch the data. */ gpfs_ops.gpfsMmapFlushUnlock(cnP); haveFlushLock = false; page_cache_release(pageP); } /* Initialize buf struct for mmap read. We don't have to fill in a data address since the page won't be allocated until after all the necessary locks have been obtained in kSFSRead. */ buf.av_forw = NULL; buf.pageP = NULL; buf.b_vp = cnP; buf.vinfoP = vinfoP; buf.privVfsP = VP_TO_PVP(inodeP); buf.b_baddr = NULL; buf.b_flags = B_READ | B_PFEOF; buf.b_blkno = index; buf.b_bcount = PAGE_SIZE; buf.b_error = 0; buf.b_inodeP = inodeP; /* Read the page. If successful, this returns with mmap flush lock held and a reference added to page. */ gpfs_ops.gpfsQueueBufs(&buf); pageP = buf.pageP; if (pageP) haveFlushLock = true; exit: #if defined(REDHAT_AS_LINUX) && LINUX_KERNEL_VERSION < 2042100 /* The noShare flag is only used on earlier kernels (of which Redhat * Advanced Server is one). This code is pretty much common to all * the nopage functions and thus was put in the common do_no_page() * function. It's present here for RHAS. */ if (noShare && pageP) { struct page *newPageP = alloc_page(GFP_HIGHUSER); if (newPageP) { copy_user_highpage(newPageP, pageP, address); flush_page_to_ram(newPageP); } page_cache_release(pageP); pageP = newPageP; } #endif /* If we return non-NULL, then nopagedone routine will be called. */ if (pageP) { TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE_2, "gpfs_filemap_nopage: return page 0x%lX count %d flags 0x%lX " "mm_users %d\n", pageP, page_count(pageP), pageP->flags, atomic_read(&area->vm_mm->mm_users)); dump_page(area, pageP, 0); } else TRACE0(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE_3, "gpfs_filemap_nopage: return page NULL"); #if !defined(MMAP_LINUX_PATCH) || LINUX_KERNEL_VERSION >= 2060000 /* If we don't have the nopagedone patch, release mmap flush lock here. * If flush/invalidate runs before do_no_page can make the PTE valid, * the application might see stale data and updates could be lost. */ if (haveFlushLock) gpfs_ops.gpfsMmapFlushUnlock(cnP); #endif VFS_STAT_STOP; EXIT(0); return pageP; } /* Called from do_no_page() after making PTE valid */ void gpfs_filemap_nopagedone(struct vm_area_struct *area, unsigned long address, int status) { struct inode *inodeP = area->vm_file->f_dentry->d_inode; cxiNode_t *cnP = VP_TO_CNP(inodeP); ENTER(0); TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGEDONE, "gpfs_filemap_nopagedone: cnP 0x%lX area 0x%lX address 0x%lX status %d\n", cnP, area, address, status); gpfs_ops.gpfsMmapFlushUnlock(cnP); EXIT(0); } /* Address space operation to read a page from a file. On entry, the page is locked and it is in the page cache. If this routine is successful, it marks the page up to date and unlocks it. Page faulting of a mapped file will call gpfs_filemap_nopage, not this routine. The main user of this routine is the sendfile() system call. */ int gpfs_i_readpage(struct file *fileP, struct page *pageP) { int rc = 0, rc1 = 0, code = 0; struct dentry *dentryP = fileP->f_dentry; struct inode *inodeP = dentryP->d_inode; cxiNode_t *cnP = VP_TO_CNP(inodeP); struct gpfsVfsData_t *privVfsP; int index = pageP->index; cxibuf_t buf; struct page *bufPageP; char *kaddr1; char *kaddr2; ext_cred_t eCred; ENTER(0); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_ENTER, "gpfs_i_readpage enter: fileP 0x%lX cnP 0x%lX inodeP 0x%lX inode %d\n", fileP, cnP, inodeP, inodeP->i_ino); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_ENTER_A, "gpfs_i_readpage: page 0x%lX index %d count %d flags 0x%lX\n", pageP, index, page_count(pageP), pageP->flags); /* Unlock the page. In order to read the page, we will have to obtain a file lock and byte range lock, and we can't do that while holding a page lock. The page is not yet marked up to date, so it won't hurt if another process attempts to read this page. We don't have to add a reference to the page since our caller is expecting us to return with the page unlocked, so he must already have taken care of that. */ PAGE_UNLOCK(pageP); /* Make sure file is open if called from NFS */ if (cxiIsNFSThread()) { int NFSflags = FREAD; BEGIN_FAR_CODE; DBGASSERT(GNP_IS_FILE(cnP)); rc = gpfs_ops.gpfsGetNFS((void *)inodeP, (struct MMFSVInfo **)&fileP->private_data, &NFSflags); if (rc != 0) { code = 1; goto xerror; } DBGASSERT((struct MMFSVInfo *)fileP->private_data != NULL); setCred(&eCred); privVfsP = VP_TO_PVP(inodeP); DBGASSERT(privVfsP != NULL); rc = gpfs_ops.gpfsOpenNFS(privVfsP, cnP, FREAD, (struct MMFSVInfo *)fileP->private_data, &eCred); if (rc != 0) { code = 2; goto xerror; } END_FAR_CODE; } buf.av_forw = NULL; buf.pageP = NULL; buf.b_vp = cnP; buf.vinfoP = (struct MMFSVInfo *)fileP->private_data; buf.privVfsP = VP_TO_PVP(inodeP); buf.b_baddr = NULL; buf.b_flags = B_READ | B_PFEOF | B_SENDFILE; buf.b_blkno = index; buf.b_bcount = PAGE_SIZE; buf.b_error = 0; buf.b_inodeP = inodeP; /* Read the page. If successful, this returns with mmap flush lock held and a reference added to page. */ gpfs_ops.gpfsQueueBufs(&buf); if (buf.pageP != NULL) { bufPageP = buf.pageP; TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_READPAGE1, "gpfs_i_readpage: return page 0x%lX index %d count %d flags 0x%lX\n", bufPageP, bufPageP->index, page_count(bufPageP), bufPageP->flags); dump_page(NULL, bufPageP, 0); if (buf.pageP != pageP) { /* may be pageP has been removed from the page cache by truncate_inode_pages. Since caller has reference, when removed by truncate_inode_pages from page cache, it is orphaned and will be deleted as soon as the count goes to zero. Therefore, grab_cache_page doesn't find it and creates a new page instead. Just copy the new page into pageP so that sendfile can use it and decrement the count, which will delete the page */ kaddr1 = kmap(pageP); kaddr2 = kmap(bufPageP); memcpy(kaddr1,kaddr2,PAGE_SIZE); kunmap(pageP); kunmap(bufPageP); SetPageUptodate(pageP); } /* Release reference that was added by gpfsReadpage */ page_cache_release(bufPageP); /* Release mmap flush lock. This lock is used to block invalidate until after PTE is made valid, but we aren't making any PTEs valid here. */ gpfs_ops.gpfsMmapFlushUnlock(cnP); } else { rc = EFAULT; code = 3; } /* Perform release on file if called from NFS */ if (cxiIsNFSThread()) { DBGASSERT(GNP_IS_FILE(cnP)); /* On the last NFS release, a watchdog will be set to close the file after a delay. */ rc1 = gpfs_ops.gpfsReleaseNFS(inodeP); if ((rc1 != 0) && (rc == 0)) { code = 4; rc = rc1; } } xerror: TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_EXIT, "gpfs_i_readpage exit: inodeP 0x%lX rc %d code %d\n", inodeP, rc, code); EXIT(0); return -rc; } /* Address space operation to asynchronously write a page to a file. On entry, the page is locked. This routine queues a write request to a pager kproc and returns. The kproc will unlock the page when write is complete, and that will wake up any waiters. */ int #if LINUX_KERNEL_VERSION >= 2050000 gpfs_i_writepage(struct page *pageP, struct writeback_control *wbcP) #else gpfs_i_writepage(struct page *pageP) #endif { int rc = 0; struct inode *inodeP = (struct inode *) pageP->mapping->host; cxiNode_t *cnP = VP_TO_CNP(inodeP); cxibuf_t *bufP, buf; VFS_STAT_START(writepageCall); ENTER(0); TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_ENTER, "gpfs_i_writepage enter: cnP 0x%lX inodeP 0x%lX inode %d\n", cnP, inodeP, inodeP->i_ino); TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_ENTER_A, "gpfs_i_writepage: page 0x%lX index %d count %d flags 0x%lX\n", pageP, pageP->index, page_count(pageP), pageP->flags); dump_page(NULL, pageP, 1); /* Get a request buffer. If none are available, allocate one on stack and do the write synchronously. */ cxiBlockingMutexAcquire(&PQLockWord); if (PageQueueFreeP == NULL) { bufP = &buf; bufP->b_flags = B_WRITE; } else { bufP = PageQueueFreeP; PageQueueFreeP = bufP->av_forw; bufP->b_flags = B_WRITE | B_ASYNC; } cxiBlockingMutexRelease(&PQLockWord); /* Initialize buffer */ bufP->av_forw = NULL; bufP->pageP = pageP; bufP->b_vp = cnP; bufP->vinfoP = NULL; bufP->privVfsP = VP_TO_PVP(inodeP); bufP->b_baddr = NULL; bufP->b_blkno = pageP->index; bufP->b_bcount = PAGE_SIZE; bufP->b_error = 0; bufP->b_inodeP = NULL; #if LINUX_KERNEL_VERSION >= 2050000 /* Set the page writeback flag and unlock the page. When write is complete, the pager kproc will call IoDone to clear this flag and wake up any threads waiting for this write to complete. */ set_page_writeback(pageP); PAGE_UNLOCK(pageP); #endif /* Queue the buffer to a pager kproc and return. */ gpfs_ops.gpfsQueueBufs(bufP); exit: TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_EXIT, "gpfs_i_writepage exit: inodeP 0x%lX rc %d\n", inodeP, rc); VFS_STAT_STOP; EXIT(0); return -rc; }