/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
/* @(#)26 1.86 src/avs/fs/mmfs/ts/kernext/gpl-linux/mmap.c, mmfs, avs_rgpfs24, rgpfs24s003a 5/8/06 11:04:56 */

#include <Shark-gpl.h>
#include <arch-gpl.h>

#include <linux/mm.h>
#if defined(REDHAT_AS_LINUX) && LINUX_KERNEL_VERSION >= 2042101
#include <linux/mm_inline.h>
#endif

#include <linux/pagemap.h>
#include <linux/module.h>
#include <asm/pgalloc.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/delay.h>

#include <verdep.h>
#include <cxiSystem.h>
#include <cxi2gpfs.h>
#include <cxiMmap.h>
#include <linux2gpfs.h>
#include <Trace.h>
#include <LockNames.h>


/* True if paging operations are enabled. Serialized using PQLockWord. */
static Boolean mmapEnabled = false;

/* Storage for page queue entries */
#define MAX_PAGEQUE_ENTRIES 500
static cxibuf_t Page_queue[MAX_PAGEQUE_ENTRIES];

/* Head of the list of free page queue entries, protected by PQLockWord */
static cxibuf_t *PageQueueFreeP;
static cxiBlockingMutex_t PQLockWord;

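/* Illustrative helpers (a sketch, not part of the original module) showing
   how the free list above is used elsewhere in this file: entries are
   popped and pushed only while holding PQLockWord, and a NULL pop means the
   caller falls back to a stack-allocated cxibuf_t and a synchronous write.

     static cxibuf_t *pageQueueGet(void)
     {
       cxibuf_t *bufP;
       cxiBlockingMutexAcquire(&PQLockWord);
       bufP = PageQueueFreeP;
       if (bufP != NULL)
         PageQueueFreeP = bufP->av_forw;  // pop from singly linked free list
       cxiBlockingMutexRelease(&PQLockWord);
       return bufP;
     }

     static void pageQueuePut(cxibuf_t *bufP)
     {
       cxiBlockingMutexAcquire(&PQLockWord);
       bufP->av_forw = PageQueueFreeP;    // push back on the free list
       PageQueueFreeP = bufP;
       cxiBlockingMutexRelease(&PQLockWord);
     }
*/
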
/* Dump page contents
 *   flag = 0 ==> after read from disk
 *          1 ==> before write to disk
 */
static void dump_page(struct vm_area_struct *vma, struct page *page, int flag)
{
#ifdef TRACE_IO_DATA
  int trcbuf[12];
  char *what = (flag == 1) ? "write" : "read";
  char *kaddr = kmap(page);
  ENTER(0);
  memcpy(trcbuf, kaddr, sizeof(trcbuf));
  kunmap(page);

  TRACE8(TRACE_VNODE, 6, TRCID_MMAP_DIRTY_PAGE_DUMP,
         "dump 0 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n",
         what, page, vma, page_count(page),
         CPUToBigEnd32(trcbuf[0]),
         CPUToBigEnd32(trcbuf[1]),
         CPUToBigEnd32(trcbuf[2]),
         CPUToBigEnd32(trcbuf[3]));
  TRACE8(TRACE_VNODE, 9, TRCID_MMAP_DIRTY_PAGE_DUMP_A,
         "dump 1 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n",
         what, page, vma, page_count(page),
         CPUToBigEnd32(trcbuf[4]),
         CPUToBigEnd32(trcbuf[5]),
         CPUToBigEnd32(trcbuf[6]),
         CPUToBigEnd32(trcbuf[7]));
  TRACE8(TRACE_VNODE, 9, TRCID_MMAP_DIRTY_PAGE_DUMP_B,
         "dump 2 %s page 0x%lX: vma 0x%08X count %d data %08X %08X %08X %08X\n",
         what, page, vma, page_count(page),
         CPUToBigEnd32(trcbuf[8]),
         CPUToBigEnd32(trcbuf[9]),
         CPUToBigEnd32(trcbuf[10]),
         CPUToBigEnd32(trcbuf[11]));
  EXIT(0);
#endif
}


/* Disable paging operations */
void mmapKill()
{
  ENTER(0);
  cxiBlockingMutexAcquire(&PQLockWord);
  mmapEnabled = false;
  cxiBlockingMutexRelease(&PQLockWord);
  EXIT(0);
}

/* Enable paging operations. It is OK to set mmapEnabled without holding
   PQLockWord here, since this is only called during initialization. */
void EnableMmap()
{
  mmapEnabled = true;
}


int cxiMmapRegister(void *dummy)
{
  int i;

  ENTER(0);
  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_REG_ENTER,
         "cxiMmapRegister enter\n");

  cxiBlockingMutexInit(&PQLockWord, GPFS_LOCK_MMAP_FREEQ_IDX);

  TRACE2(TRACE_VNODE, 2, TRCID_MMAP_REG_5,
         "cxiMmapRegister: Page_queue addr range [0x%lX - 0x%lX]\n",
         &Page_queue[0], &Page_queue[MAX_PAGEQUE_ENTRIES - 1]);

  /* Initialize page queue entries. When a page arrives for read or write
     (by the readpage or writepage functions), the page information will be
     copied to a free queue entry and that entry will be added to the end
     of the pager kproc queue. */
  PageQueueFreeP = NULL;
  for (i = 0; i < MAX_PAGEQUE_ENTRIES; i++)
  {
    Page_queue[i].av_forw = PageQueueFreeP;
    PageQueueFreeP = &Page_queue[i];
    Page_queue[i].pageP = NULL;
    Page_queue[i].b_vp = NULL;
    Page_queue[i].vinfoP = NULL;
    Page_queue[i].b_baddr = NULL;
    Page_queue[i].b_flags = 0;
    Page_queue[i].b_blkno = 0;
  }

  mmapEnabled = true;
  EXIT(0);
  return 0;
}

/* Module termination */
int cxiMmapUnregister(void *dummy)
{
  ENTER(0);
  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_UNREG_ENTER,
         "cxiMmapUnregister enter\n");
  PageQueueFreeP = NULL;
  mmapEnabled = false;
  EXIT(0);
  return 0;
}
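
/* A worked example for getFilePos below: b_blkno is a page index, so the
   byte offset is the index shifted left by PAGE_SHIFT. With 4 KB pages
   (PAGE_SHIFT == 12), b_blkno 5 gives pos 0x5000, i.e. the page covering
   byte offsets 0x5000 through 0x5FFF. */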

Int64 getFilePos(cxibuf_t *bufP)
{
  Int64 pos = (Int64)bufP->b_blkno << PAGE_SHIFT;
  ENTER(0);
  TRACE1(TRACE_VNODE, 5, TRCID_MMAP_FILEPOS_ENTER,
         "getFilePos: pos 0x%llX\n", pos);
  EXIT(0);
  return pos;
}

char *VM_Attach(cxibuf_t *bufP)
{
  DBGASSERT(bufP->pageP != NULL);
  return kmap(bufP->pageP);
}

void VM_Detach(cxibuf_t *bufP, char *baddrP)
{
  kunmap(bufP->pageP);
}

void IoDone(cxibuf_t *bufP)
{
  struct page *pageP = bufP->pageP;

  if (pageP != NULL)
  {
    TRACE5(TRACE_VNODE, 2, TRCID_MMAP_IO_ENTER,
           "IoDone enter: b_flags 0x%lX pageP 0x%lX index %d count %d flags 0x%lX\n",
           bufP->b_flags, pageP, pageP->index, page_count(pageP), pageP->flags);

    /* Error in the read or write operation */
    if ((bufP->b_flags & B_ERROR) != 0)
      SetPageError(pageP);
    else if ((bufP->b_flags & B_READ) != 0)
      SetPageUptodate(pageP);

    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_IO_EXIT,
           "IoDone exit: pageP 0x%lX flags 0x%lX\n",
           pageP, pageP->flags);

#if LINUX_KERNEL_VERSION >= 2050000
    if ((bufP->b_flags & B_READ) == 0)
      /* This was a writeback request. Signal its completion by clearing
         the writeback flag. */
      end_page_writeback(pageP);
    else
#endif
      PAGE_UNLOCK(pageP);
  }

  /* If this was an asynchronous request, free the buf struct. For
     synchronous requests, the buf is a stack variable. */
  if ((bufP->b_flags & B_ASYNC) != 0)
  {
    cxiBlockingMutexAcquire(&PQLockWord);
    bufP->av_forw = PageQueueFreeP;
    PageQueueFreeP = bufP;
    cxiBlockingMutexRelease(&PQLockWord);
  }
}

void getVp(void *gnP, void **vP, struct gpfsVfsData_t **privVfsP)
{
  cxiNode_t *cP = (cxiNode_t *)gnP;
  struct inode *iP = (struct inode *)cP->osNodeP;
  *privVfsP = VP_TO_PVP(iP);
  *vP = cP->osNodeP;
}


/* Flush/invalidate a mapped range:
     CmfProtect - Remove pages from the address space so that new
                  references will cause a page fault or protection fault
     CmfFlush   - Write dirty pages
     CmfInval   - Prevent cached pages from being re-used
*/
int cxiMmapFlush(cxiNode_t *cnP, UInt64 start, UInt64 end,
                 enum CmflushOption cmopt)
{
  int rc = 0;
  struct inode *inodeP = cnP->osNodeP;

  ENTER(0);
  TRACE5(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_ENTER,
         "cxiMmapFlush: cnP 0x%lX inodeNum %d opt %d range 0x%llX-0x%llX\n",
         cnP, inodeP->i_ino, cmopt, start, end);

  switch (cmopt)
  {
    case CmfProtect:
      /* Block new modifications to pages. This clears PTEs, which will
         force new references to page fault. It also transfers the dirty
         bit from the PTE to the page struct. */
      UNMAP_MAPPING_RANGE(inodeP->i_mapping, start, 0);
      break;

    case CmfFlush:
      FILEMAP_FDATASYNC(rc, inodeP->i_mapping);
      if (rc == 0)
        FILEMAP_FDATAWAIT(rc, inodeP->i_mapping);
      break;

    case CmfInval:
      truncate_inode_pages(inodeP->i_mapping, (start & PAGE_CACHE_MASK));
      break;
  }

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_EXIT,
         "cxiMmapFlush exit: rc %d\n", rc);
  EXIT(0);
  return rc;
}
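
/* A sketch of how a caller might combine the options above (illustrative
   only; cnP and fileSize are hypothetical values):

     rc = cxiMmapFlush(cnP, 0, fileSize, CmfProtect); // force new faults
     if (rc == 0)
       rc = cxiMmapFlush(cnP, 0, fileSize, CmfFlush); // write dirty pages
     if (rc == 0)
       rc = cxiMmapFlush(cnP, 0, fileSize, CmfInval); // discard cached pages
*/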


/* Lock a cache page for inode bufP->b_inodeP at index bufP->b_blkno,
   creating it if necessary. Save a pointer to the page in bufP->pageP. On
   error, return with bufP->pageP NULL. The page will be locked and a
   reference will be added. Return non-zero if the page is already up to
   date. */
int cxiMmapGetPage(cxibuf_t *bufP)
{
  int rc = 0;
  struct inode *inodeP = (struct inode *)bufP->b_inodeP;
  struct page *pageP = grab_cache_page(inodeP->i_mapping, bufP->b_blkno);

  ENTER(0);
  if (pageP != NULL)
  {
    if (PAGE_UP_TO_DATE(pageP))
      rc = EEXIST;
    else
      ClearPageError(pageP);

    TRACE6(TRACE_VNODE, 1, TRCID_CXIGETPAGE,
           "cxiMmapGetPage: page 0x%lX index %d count %d flags 0x%lX mapping 0x%lX uptodate %d\n",
           pageP, pageP->index, page_count(pageP), pageP->flags,
           pageP->mapping, (rc != 0));
  }
  bufP->pageP = pageP;
  EXIT(0);
  return rc;
}


/* Release/unlock page */
void cxiMmapReleasePage(struct page *pageP)
{
  ENTER(0);
  TRACE4(TRACE_VNODE, 1, TRCID_CXIRELPAGE,
         "cxiMmapReleasePage: released page 0x%lX index %d count %d flags 0x%lX\n",
         pageP, pageP->index, page_count(pageP), pageP->flags);

  PAGE_UNLOCK(pageP);
  page_cache_release(pageP);
  EXIT(0);
}
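
/* Illustrative pairing of the two helpers above (a sketch; the cxibuf_t
   setup is assumed to follow the pattern used elsewhere in this file):

     rc = cxiMmapGetPage(&buf);
     if (buf.pageP != NULL)
     {
       if (rc == 0)               // page newly created or not yet valid
       {
         // ... fill the page with data, then publish it ...
         SetPageUptodate(buf.pageP);
       }
       cxiMmapReleasePage(buf.pageP);  // drops the page lock and reference
     }
*/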


/* Called from do_no_page() to handle a page fault. Add the page to the
   cache if it is not already there and add a reference. If the contents
   are not already up to date, read new contents from disk. Return NULL on
   failure. */
struct page *
#if LINUX_KERNEL_VERSION > 2060300
gpfs_filemap_nopage(struct vm_area_struct *area, unsigned long address,
                    int *noShare)
#else
gpfs_filemap_nopage(struct vm_area_struct *area, unsigned long address,
                    int noShare)
#endif
{
  unsigned long index;
  struct page *pageP = NULL;
  struct page **hashP;
  struct file *fileP = area->vm_file;
  struct inode *inodeP;
  struct MMFSVInfo *vinfoP;
  Boolean haveFlushLock = false;
  cxiNode_t *cnP;
  cxibuf_t buf;

  VFS_STAT_START(readpageCall);
  ENTER(0);

  TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE,
         "gpfs_filemap_nopage enter: area 0x%lX address 0x%lX vm_file 0x%lX "
         "vm_mm 0x%lX mm_users %d noShare %d\n", area, address, fileP,
         area->vm_mm, atomic_read(&area->vm_mm->mm_users), noShare);

  index = area->vm_pgoff + ((address - area->vm_start) >> PAGE_CACHE_SHIFT);

  TRACE4(TRACE_VNODE, 3, TRCID_LINUXOPS_NOPAGE_1,
         "gpfs_filemap_nopage: vm_start 0x%lX vm_end 0x%lX vm_flags 0x%lX "
         "index %d\n", area->vm_start, area->vm_end, area->vm_flags, index);

  /* Check that paging operations are still enabled */
  if (!mmapEnabled)
    goto exit;

  LOGASSERT(fileP != NULL);
  inodeP = fileP->f_dentry->d_inode;
  LOGASSERT(inodeP != NULL);
  cnP = VP_TO_CNP(inodeP);

  /* Remember that there were paging requests under the given instance */
  vinfoP = (struct MMFSVInfo *)fileP->private_data;
  if (vinfoP != NULL)
    ((struct cxiVinfo_t *)vinfoP)->rwPageDone = true;

  /* See if this page is already in the cache, and add a reference if so */
#if LINUX_KERNEL_VERSION >= 2057200
  pageP = find_get_page(inodeP->i_mapping, index);
#else
  hashP = page_hash(inodeP->i_mapping, index);
  pageP = __find_get_page(inodeP->i_mapping, index, hashP);
#endif
  if (pageP)
  {
    /* The page is already cached. If it is up to date, we do not need to
       read it. Hold the mmap flush lock until after making the PTE valid. */
    gpfs_ops.gpfsMmapFlushLock(cnP);
    haveFlushLock = true;

    if (PAGE_UP_TO_DATE(pageP))
      goto exit;

    /* Not up to date. Release the page and go through processRead to fetch
       the data. */
    gpfs_ops.gpfsMmapFlushUnlock(cnP);
    haveFlushLock = false;

    page_cache_release(pageP);
  }

  /* Initialize the buf struct for an mmap read. We don't have to fill in a
     data address, since the page won't be allocated until after all the
     necessary locks have been obtained in kSFSRead. */
  buf.av_forw = NULL;
  buf.pageP = NULL;
  buf.b_vp = cnP;
  buf.vinfoP = vinfoP;
  buf.privVfsP = VP_TO_PVP(inodeP);
  buf.b_baddr = NULL;
  buf.b_flags = B_READ | B_PFEOF;
  buf.b_blkno = index;
  buf.b_bcount = PAGE_SIZE;
  buf.b_error = 0;
  buf.b_inodeP = inodeP;

  /* Read the page. If successful, this returns with the mmap flush lock
     held and a reference added to the page. */
  gpfs_ops.gpfsQueueBufs(&buf);

  pageP = buf.pageP;
  if (pageP)
    haveFlushLock = true;

exit:
#if defined(REDHAT_AS_LINUX) && LINUX_KERNEL_VERSION < 2042100
  /* The noShare flag is only used on earlier kernels (of which Red Hat
   * Advanced Server is one). This code is pretty much common to all
   * the nopage functions and thus was put in the common do_no_page()
   * function. It's present here for RHAS.
   */
  if (noShare && pageP)
  {
    struct page *newPageP = alloc_page(GFP_HIGHUSER);
    if (newPageP)
    {
      copy_user_highpage(newPageP, pageP, address);
      flush_page_to_ram(newPageP);
    }

    page_cache_release(pageP);
    pageP = newPageP;
  }
#endif

  /* If we return non-NULL, the nopagedone routine will be called. */
  if (pageP)
  {
    TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE_2,
           "gpfs_filemap_nopage: return page 0x%lX count %d flags 0x%lX "
           "mm_users %d\n", pageP, page_count(pageP), pageP->flags,
           atomic_read(&area->vm_mm->mm_users));

    dump_page(area, pageP, 0);
  }
  else
    TRACE0(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE_3,
           "gpfs_filemap_nopage: return page NULL");

#if !defined(MMAP_LINUX_PATCH) || LINUX_KERNEL_VERSION >= 2060000
  /* If we don't have the nopagedone patch, release the mmap flush lock
   * here. If flush/invalidate runs before do_no_page can make the PTE
   * valid, the application might see stale data and updates could be lost.
   */
  if (haveFlushLock)
    gpfs_ops.gpfsMmapFlushUnlock(cnP);
#endif

  VFS_STAT_STOP;
  EXIT(0);
  return pageP;
}


/* Called from do_no_page() after making the PTE valid */
void
gpfs_filemap_nopagedone(struct vm_area_struct *area, unsigned long address,
                        int status)
{
  struct inode *inodeP = area->vm_file->f_dentry->d_inode;
  cxiNode_t *cnP = VP_TO_CNP(inodeP);

  ENTER(0);
  TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGEDONE,
         "gpfs_filemap_nopagedone: cnP 0x%lX area 0x%lX address 0x%lX status %d\n",
         cnP, area, address, status);

  gpfs_ops.gpfsMmapFlushUnlock(cnP);
  EXIT(0);
}
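
/* How these fault handlers would typically be hooked up (a sketch;
   gpfs_filemap_vm_ops is a hypothetical name, and the nopagedone member is
   assumed to exist only on kernels carrying the MMAP_LINUX_PATCH hook, so
   that the mmap flush lock taken in nopage is released only after
   do_no_page() has made the PTE valid):

     static struct vm_operations_struct gpfs_filemap_vm_ops =
     {
       .nopage     = gpfs_filemap_nopage,
     #ifdef MMAP_LINUX_PATCH
       .nopagedone = gpfs_filemap_nopagedone,
     #endif
     };
*/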


/* Address space operation to read a page from a file. On entry, the page
   is locked and is in the page cache. If this routine is successful, it
   marks the page up to date and unlocks it. Page faulting of a mapped
   file will call gpfs_filemap_nopage, not this routine. The main user of
   this routine is the sendfile() system call. */
int
gpfs_i_readpage(struct file *fileP, struct page *pageP)
{
  int rc = 0, rc1 = 0, code = 0;
  struct dentry *dentryP = fileP->f_dentry;
  struct inode *inodeP = dentryP->d_inode;
  cxiNode_t *cnP = VP_TO_CNP(inodeP);
  struct gpfsVfsData_t *privVfsP;
  int index = pageP->index;
  cxibuf_t buf;
  struct page *bufPageP;
  char *kaddr1;
  char *kaddr2;
  ext_cred_t eCred;

  ENTER(0);
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_ENTER,
         "gpfs_i_readpage enter: fileP 0x%lX cnP 0x%lX inodeP 0x%lX inode %d\n",
         fileP, cnP, inodeP, inodeP->i_ino);
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_ENTER_A,
         "gpfs_i_readpage: page 0x%lX index %d count %d flags 0x%lX\n",
         pageP, index, page_count(pageP), pageP->flags);

  /* Unlock the page. In order to read the page, we will have to obtain a
     file lock and a byte range lock, and we can't do that while holding a
     page lock. The page is not yet marked up to date, so it won't hurt if
     another process attempts to read this page. We don't have to add a
     reference to the page, since our caller expects us to return with the
     page unlocked and must therefore already have taken care of that. */
  PAGE_UNLOCK(pageP);

  /* Make sure the file is open if called from NFS */
  if (cxiIsNFSThread())
  {
    int NFSflags = FREAD;

    BEGIN_FAR_CODE;
    DBGASSERT(GNP_IS_FILE(cnP));
    rc = gpfs_ops.gpfsGetNFS((void *)inodeP,
                             (struct MMFSVInfo **)&fileP->private_data,
                             &NFSflags);
    if (rc != 0)
    {
      code = 1;
      goto xerror;
    }

    DBGASSERT((struct MMFSVInfo *)fileP->private_data != NULL);

    setCred(&eCred);
    privVfsP = VP_TO_PVP(inodeP);
    DBGASSERT(privVfsP != NULL);
    rc = gpfs_ops.gpfsOpenNFS(privVfsP, cnP, FREAD,
                              (struct MMFSVInfo *)fileP->private_data, &eCred);
    if (rc != 0)
    {
      code = 2;
      goto xerror;
    }
    END_FAR_CODE;
  }

  buf.av_forw = NULL;
  buf.pageP = NULL;
  buf.b_vp = cnP;
  buf.vinfoP = (struct MMFSVInfo *)fileP->private_data;
  buf.privVfsP = VP_TO_PVP(inodeP);
  buf.b_baddr = NULL;
  buf.b_flags = B_READ | B_PFEOF | B_SENDFILE;
  buf.b_blkno = index;
  buf.b_bcount = PAGE_SIZE;
  buf.b_error = 0;
  buf.b_inodeP = inodeP;

  /* Read the page. If successful, this returns with the mmap flush lock
     held and a reference added to the page. */
  gpfs_ops.gpfsQueueBufs(&buf);

  if (buf.pageP != NULL)
  {
    bufPageP = buf.pageP;
    TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_READPAGE1,
           "gpfs_i_readpage: return page 0x%lX index %d count %d flags 0x%lX\n",
           bufPageP, bufPageP->index, page_count(bufPageP), bufPageP->flags);

    dump_page(NULL, bufPageP, 0);
    if (buf.pageP != pageP)
    {
      /* pageP may have been removed from the page cache by
         truncate_inode_pages. Since the caller holds a reference, a page
         removed from the page cache is orphaned and will be freed as soon
         as its count drops to zero, so grab_cache_page does not find it
         and creates a new page instead. Copy the new page into pageP so
         that sendfile can use it; dropping our reference below then frees
         the new page. */
      kaddr1 = kmap(pageP);
      kaddr2 = kmap(bufPageP);
      memcpy(kaddr1, kaddr2, PAGE_SIZE);
      kunmap(pageP);
      kunmap(bufPageP);
      SetPageUptodate(pageP);
    }

    /* Release the reference that was added by gpfsReadpage */
    page_cache_release(bufPageP);

    /* Release the mmap flush lock. This lock is used to block invalidate
       until after a PTE is made valid, but we aren't making any PTEs valid
       here. */
    gpfs_ops.gpfsMmapFlushUnlock(cnP);
  }
  else
  {
    rc = EFAULT;
    code = 3;
  }

  /* Perform a release on the file if called from NFS */
  if (cxiIsNFSThread())
  {
    DBGASSERT(GNP_IS_FILE(cnP));

    /* On the last NFS release, a watchdog will be set to close the file
       after a delay. */
    rc1 = gpfs_ops.gpfsReleaseNFS(inodeP);
    if ((rc1 != 0) && (rc == 0))
    {
      code = 4;
      rc = rc1;
    }
  }

xerror:
  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_READPAGE_EXIT,
         "gpfs_i_readpage exit: inodeP 0x%lX rc %d code %d\n", inodeP, rc,
         code);
  EXIT(0);
  return -rc;
}


/* Address space operation to asynchronously write a page to a file. On
   entry, the page is locked. This routine queues a write request to a
   pager kproc and returns. The kproc will unlock the page when the write
   is complete, and that will wake up any waiters. */
int
#if LINUX_KERNEL_VERSION >= 2050000
gpfs_i_writepage(struct page *pageP, struct writeback_control *wbcP)
#else
gpfs_i_writepage(struct page *pageP)
#endif
{
  int rc = 0;
  struct inode *inodeP = (struct inode *)pageP->mapping->host;
  cxiNode_t *cnP = VP_TO_CNP(inodeP);
  cxibuf_t *bufP, buf;

  VFS_STAT_START(writepageCall);
  ENTER(0);
  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_ENTER,
         "gpfs_i_writepage enter: cnP 0x%lX inodeP 0x%lX inode %d\n",
         cnP, inodeP, inodeP->i_ino);
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_ENTER_A,
         "gpfs_i_writepage: page 0x%lX index %d count %d flags 0x%lX\n",
         pageP, pageP->index, page_count(pageP), pageP->flags);
  dump_page(NULL, pageP, 1);

  /* Get a request buffer. If none are available, allocate one on the
     stack and do the write synchronously. */
  cxiBlockingMutexAcquire(&PQLockWord);
  if (PageQueueFreeP == NULL)
  {
    bufP = &buf;
    bufP->b_flags = B_WRITE;
  }
  else
  {
    bufP = PageQueueFreeP;
    PageQueueFreeP = bufP->av_forw;
    bufP->b_flags = B_WRITE | B_ASYNC;
  }
  cxiBlockingMutexRelease(&PQLockWord);

  /* Initialize the buffer */
  bufP->av_forw = NULL;
  bufP->pageP = pageP;
  bufP->b_vp = cnP;
  bufP->vinfoP = NULL;
  bufP->privVfsP = VP_TO_PVP(inodeP);
  bufP->b_baddr = NULL;
  bufP->b_blkno = pageP->index;
  bufP->b_bcount = PAGE_SIZE;
  bufP->b_error = 0;
  bufP->b_inodeP = NULL;

#if LINUX_KERNEL_VERSION >= 2050000
  /* Set the page writeback flag and unlock the page. When the write is
     complete, the pager kproc will call IoDone to clear this flag and
     wake up any threads waiting for this write to complete. */
  set_page_writeback(pageP);
  PAGE_UNLOCK(pageP);
#endif

  /* Queue the buffer to a pager kproc and return. */
  gpfs_ops.gpfsQueueBufs(bufP);

exit:
  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_WRPAGE_EXIT,
         "gpfs_i_writepage exit: inodeP 0x%lX rc %d\n", inodeP, rc);

  VFS_STAT_STOP;
  EXIT(0);
  return -rc;
}
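

/* How the two address space operations above would typically be exported
   (a sketch; gpfs_aops is a hypothetical name -- the module's real
   address_space_operations table is defined elsewhere):

     static struct address_space_operations gpfs_aops =
     {
       .readpage  = gpfs_i_readpage,
       .writepage = gpfs_i_writepage,
     };
*/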
---|