/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)37 1.62.1.3 src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiIOBuffer.c, mmfs, avs_rgpfs24, rgpfs24s010a 2/8/07 15:40:30 */ /* * Linux implementation of I/O buffers * * Contents: * static struct cxiKernelIOBufferDesc_t* kibdAlloc * static void kibdFree * static void deallocKernelIOBufferDesc * static int allocKernelIOBufferDesc * KibdModuleInit * KibdModuleTerm * cxiKibdPin * cxiKibdUnpin * cxiKibdUnpinAll * cxiKibdPinmm * cxiKibdUnpinmm * * cxiAttachIOBuffer * cxiDetachIOBuffer * cxiUXfer * cxiKXfer * cxiKZero * cxiMapDiscontiguousRW * cxiUnmapDiscontiguousRW * cxiMapContiguousRO * cxiUnmapContiguousRO * BHioDone * cxiStartIO * cxiWaitIO * cxiKDoIO * GetDiskInfoX */ #include #include #include #include #include #include #include #include #include #if LINUX_KERNEL_VERSION >= 2050000 #include #else #include #endif #include #include #include #include #include #include #include #include #include #ifdef CONFIG_BGL /* BG/L version of Linux doesn't define get_user_pages, so define it here */ #define get_user_pages(tsk, mm, start, len, write, force, pages, vmas) \ __get_user_pages(tsk, mm, start, len, write, force, pages, vmas, 0) #endif /* Returns a page pointer from a cxiKernelIOBufferDesc_t * The INDEX of the page to return is relative to the * KIBDP supplied. For instance a KIBD may only contain * twenty pages. If you supply a KIBD and an index of twenty * (index starts from zero) then we'll move to the next KIBD * in the chain and update the INDEX to be zero. Thus PAGEINDEX, * KIBD, and PAGEP may be updated by this macro. */ #define KIBD_GET_PAGE(KIBDP, INDEX, PAGEP) \ while ((KIBDP) && (INDEX) >= (KIBDP)->kibdPages) \ { \ (INDEX) -= (KIBDP)->kibdPages; \ (KIBDP) = (KIBDP)->kibdNextP; \ } \ if (KIBDP) \ (PAGEP) = (struct page *)(KIBDP)->maplist[(INDEX)]; \ else \ (PAGEP) = NULL; /* Spin lock protecting list of all top-level cxiKernelIOBufferDesc_t's. 
Using a static initializer here (spinlock_t KibdLock = SPIN_LOCK_UNLOCKED) does not work, because SPIN_LOCK_UNLOCKED contains a cast to type spinlock_t. In C++, (but not in C), this causes KibdLock to be put in the bss section, and code to be generated to perform the initialization. Unfortunately, this initialization code does not get called, because kernel modules do not have the full C++ environment established. */ spinlock_t KibdLock; /* Static pointer to slab allocator for cxiKernelIOBufferDesc_t's */ struct kmem_cache* KibdCacheP = NULL; /* Static head of doubly-linked list of top-level cxiKernelIOBufferDesc_t's. The list is protected by KibdLock. */ struct cxiKernelIOBufferDesc_t* KibdGblHeadP = NULL; /* Count of number of delays in busy wait loop in cxiWaitIO */ atomic_t cxiWaitIONDelays; /* Group of Linux buffer_heads allocated together for a multi-page I/O. A chunk is just less than half a page. */ #define BUFFER_HEADS_PER_CHUNK \ ((PAGE_SIZE/2-(2*sizeof(void*)+sizeof(int)+sizeof(atomic_t))) / \ (sizeof(void*)+sizeof(struct buffer_head))) struct cxiBufHeadChunk_t { /* Next and previous chunks of buffers used for an I/O. The list is circular. */ struct cxiBufHeadChunk_t* bhcNextP; struct cxiBufHeadChunk_t* bhcPrevP; /* Number of buffer_heads used in this chunk */ int nBHUsed; /* Number of buffer_heads in this chunk that have been submitted, but whose iodone handler has not finished running. Always updated with atomic operations, since this field is accessed asynchronously from interrupt level. */ atomic_t nBHActive; #if LINUX_KERNEL_VERSION >= 2050000 struct bio *biop[BUFFER_HEADS_PER_CHUNK]; #endif /* Space for buffer_heads */ struct buffer_head bh[BUFFER_HEADS_PER_CHUNK]; }; /* Static pointer to slab allocator for cxiBufHeadChunk_t's */ struct kmem_cache* BhcCacheP = NULL; /* Allocate and initialize a new cxiKernelIOBufferDesc_t object. Uses the slab allocator for this object type. */ static struct cxiKernelIOBufferDesc_t * kibdAlloc() { struct cxiKernelIOBufferDesc_t* kibdP; int i; ENTER(0); kibdP = (struct cxiKernelIOBufferDesc_t*) kmem_cache_alloc(KibdCacheP, GFP_KERNEL); TRACE1(TRACE_KSVFS, 14, TRCID_KIBD_NEW, "kibdAlloc: allocated cxiKernelIOBufferDesc_t at 0x%lX\n", kibdP); if (kibdP != NULL) { kibdP->kibdVaddr = NULL; kibdP->kibdPages = 0; kibdP->kibdTotalPages = 0; kibdP->kibdNextP = NULL; kibdP->gblNextP = NULL; kibdP->gblPrevP = NULL; for (i=0; i < PAGES_PER_KIBD; i++) kibdP->maplist[i] = NULL; } EXIT(0); return kibdP; } /* Free a cxiKernelIOBufferDesc_t back to its slab allocator */ static void kibdFree(struct cxiKernelIOBufferDesc_t* kibdP) { ENTER(0); TRACE1(TRACE_KSVFS, 14, TRCID_KIBD_DELETE, "kibdFree: freeing cxiKernelIOBufferDesc_t at 0x%lX\n", kibdP); kmem_cache_free(KibdCacheP, (void*)kibdP); EXIT(0); } /* Destroy a cxiKernelIOBufferDesc_t object. 
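   This walks the chain built by allocKernelIOBufferDesc, releasing every
   pinned page with page_cache_release and then freeing each descriptor
   with kibdFree.  A hedged sketch of the chain invariant that the
   DBGASSERT below relies on (the helper name chainPages is purely
   illustrative and not part of this file):

     static int chainPages(struct cxiKernelIOBufferDesc_t* kP)
     {
       int n = 0;
       for ( ; kP != NULL; kP = kP->kibdNextP)
         n += kP->kibdPages;
       return n;              // should equal kibdHeadP->kibdTotalPages
     }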
*/
static void deallocKernelIOBufferDesc(struct cxiKernelIOBufferDesc_t* kibdP)
{
  struct cxiKernelIOBufferDesc_t *kibdPrevP;
  struct page *pageP;
  int pageIndex = 0;
  int pageTotal = kibdP->kibdTotalPages;

  ENTER(0);
  for (;;)
  {
    kibdPrevP = kibdP;
    KIBD_GET_PAGE(kibdP, pageIndex, pageP);
    if (pageP == NULL)
      break;

    page_cache_release(pageP);

    if (kibdPrevP != kibdP)
    {
      TRACE4(TRACE_KSVFS, 11, TRCID_DEALLOC_KIBD_1,
             "deallocKernelIOBufferDesc: kibdP 0x%lX vaddr 0x%lX kibdPages %d "
             "kibdNextP 0x%lX\n", kibdPrevP, kibdPrevP->kibdVaddr,
             kibdPrevP->kibdPages, kibdP);

      pageTotal -= kibdPrevP->kibdPages;
      kibdFree(kibdPrevP);
    }
    pageIndex++;
  }

  if (kibdPrevP != kibdP && kibdPrevP)
  {
    TRACE4(TRACE_KSVFS, 11, TRCID_DEALLOC_KIBD_2,
           "deallocKernelIOBufferDesc: kibdP 0x%lX vaddr 0x%lX kibdPages %d "
           "kibdNextP 0x%lX\n", kibdPrevP, kibdPrevP->kibdVaddr,
           kibdPrevP->kibdPages, kibdP);

    pageTotal -= kibdPrevP->kibdPages;
    kibdFree(kibdPrevP);
  }

  /* Make sure all the constituent cxiKernelIOBufferDesc_t page counts added
   * up to the total page count in the first cxiKernelIOBufferDesc_t */
  DBGASSERT(pageTotal == 0);
  EXIT(0);
}


/* Create a cxiKernelIOBufferDesc_t that maps the given region of
 * the user address space of this process.  The buffer virtual address
 * must be on a page boundary. */
static int allocKernelIOBufferDesc(char* vaddr, int nPages,
                                   struct cxiKernelIOBufferDesc_t** kibdPP)
{
  struct cxiKernelIOBufferDesc_t* kibdP;
  struct cxiKernelIOBufferDesc_t* kibdPrevP = NULL;
  struct cxiKernelIOBufferDesc_t* kibdHeadP = NULL;
  int rc;
  int mapPages = 0;
  int totalPages = 0;
  struct page * pageP;
  struct address_space * addrSpaceP;

  /* Validate parameters */
  ENTER(0);
  DBGASSERT(((IntPtr)vaddr & (PAGE_SIZE-1)) == 0);

  if (nPages)
  {
    kibdHeadP = kibdPrevP = kibdP = kibdAlloc();
    if (kibdP == NULL)
    {
      rc = -ENOMEM;
      goto errorExit;
    }
  }

  while (nPages)
  {
    mapPages = nPages;
    if (mapPages > PAGES_PER_KIBD)
      mapPages = PAGES_PER_KIBD;

    down_read(&current->mm->mmap_sem);
    rc = get_user_pages(current, current->mm, (unsigned long)vaddr,
                        mapPages, VM_WRITE, 0 /* force */,
                        (struct page **)kibdP->maplist, NULL);
    up_read(&current->mm->mmap_sem);

    if (rc != mapPages)
      goto errorExit;

    kibdP->kibdVaddr = vaddr;
    kibdP->kibdPages = mapPages;

    TRACE3(TRACE_KSVFS, 11, TRCID_ALLOC_KIBD_1,
           "allocKernelIOBufferDesc: kibdP 0x%lX vaddr 0x%lX kibdPages %d\n",
           kibdP, kibdP->kibdVaddr, kibdPrevP->kibdPages);

    vaddr += mapPages * PAGE_SIZE;
    totalPages += mapPages;
    nPages -= mapPages;

    if (nPages)
    {
      kibdP = kibdAlloc();
      if (kibdP == NULL)
      {
        rc = -ENOMEM;
        goto errorExit;
      }
      kibdPrevP->kibdNextP = kibdP;
      kibdPrevP = kibdP;
    }
  }

  /* Total page count is kept only in the first one */
  kibdHeadP->kibdTotalPages = totalPages;

  /* Ensure these pages aren't mapped to any inode, otherwise
   * we won't be able to disclaim them.  We did have a problem
   * where MAP_SHARED semantics would cause this. */
  pageP = (struct page *)kibdHeadP->maplist[0];
  DBGASSERT(pageP != NULL);
  addrSpaceP = pageP->mapping;
#if LINUX_KERNEL_VERSION >= 2060600 || (defined(SUSE_LINUX) && LINUX_KERNEL_VERSION >= 2060507)
  /* MAP_ANONYMOUS flags will have PG_anon turned on. */
  DBGASSERT(PageAnon(pageP));
#else
  DBGASSERT(addrSpaceP == NULL || addrSpaceP->host == NULL);
#endif

  /* Success!
*/ *kibdPP = kibdHeadP; EXIT(0); return 0; errorExit: TRACE5(TRACE_KSVFS, 11, TRCID_ALLOC_KIBD_2, "allocKernelIOBufferDesc: vaddr 0x%lX mapPages %d totalPages %d " "kibdHeadP 0x%lX rc %d\n", vaddr, mapPages, totalPages, kibdHeadP, rc); /* Unmap and deallocate kiobufs, delete cxiKernelIOBufferDesc_t */ if (kibdHeadP) { kibdHeadP->kibdTotalPages = totalPages; deallocKernelIOBufferDesc(kibdHeadP); } EXIT(0); return ((rc < 0) ? -rc : ENOMEM); } /* Initialization routine - called when module is loaded */ void KibdModuleInit() { int rc; ENTER(0); TRACE0(TRACE_KSVFS, 1, TRCID_KIBD_INIT, "KibdModuleInit called\n"); /* Create a slab allocator for cxiKernelIOBufferDesc_t objects */ KibdCacheP = kmem_cache_create("kernIOBufDesc", sizeof(struct cxiKernelIOBufferDesc_t), 0 /* offset */, 0 /* flags */, NULL /* ctor */, NULL /* dtor */); if (KibdCacheP == NULL) cxiPanic("Cannot create cxiKernelIOBufferDesc_t cache\n"); spin_lock_init(&KibdLock); /* Create a slab allocator for cxiBufHeadChunk_t objects */ BhcCacheP = kmem_cache_create("BufHeadChunk", sizeof(struct cxiBufHeadChunk_t), 0 /* offset */, 0 /* flags */, NULL /* ctor */, NULL /* dtor */); if (BhcCacheP == NULL) cxiPanic("Cannot create cxiBufHeadChunk_t cache\n"); #if LINUX_KERNEL_VERSION >= 2060000 if (gpfs_init_inodecache()!=0) cxiPanic("Cannot create gpfsInodeCache cache\n"); #endif atomic_set(&cxiWaitIONDelays, 0); EXIT(0); } /* Termination routine - called just before module is unloaded */ void KibdModuleTerm() { int rc; ENTER(0); TRACE0(TRACE_KSVFS, 1, TRCID_KIBD_TERM, "KibdModuleTerm called\n"); /* Destroy slab allocator for cxiBufHeadChunk_t objects */ kmem_cache_destroy(BhcCacheP); /* We have to ensure these are all deallocated otherwise * the kmem_cache_destroy of the KibdCacheP will fail. * An attempt to reload GPFS would encounter the slab * cache still existing. */ cxiKibdUnpinAll(); #if LINUX_KERNEL_VERSION >= 2050000 /* Ugly ugly ugly FIXME * On 2.5, kmem_cache_destroy may or may not succeed in actually destroying * the cache. Even when kmem_cache_free 's been called for every allocated * chunk, internally, not all of the objects are on the free list. They'll * get there eventually by the virtue of cache_reap being called from a * timer routine every REAPTIMEOUT_CPUC (default 2*HZ). If * kmem_cache_destroy is called before all slabs are moved to the free list * (no active slabs left), it'll fail, and when kmem_cache_create is called * again, it'll panic the kernel, and that's what typically happens when GPFS * restarts. Until we figure out how to do this right, keep calling * cache_shrink until it tells us that it's safe to call cache_destroy */ while (kmem_cache_shrink(KibdCacheP) != 0) cxiSleep(400); #endif /* Destroy slab allocator for cxiKernelIOBufferDesc_t objects */ kmem_cache_destroy(KibdCacheP); #if LINUX_KERNEL_VERSION >= 2060000 gpfs_destroy_inodecache(); #endif EXIT(0); } /* Create a cxiKernelIOBufferDesc_t object (or list of cxiKernelIOBufferDesc_t objects) describing an I/O buffer in the user address space of the calling process and link it onto the list of all such objects. Pins the user-level buffer. The buffer virtual address must be on a page boundary. The length can be arbitrarily large, but must be a multiple of the page size. Returns 0 if successful, non-zero if unsuccessful. 
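   A hedged caller-side sketch (the buffer name, length and error handling
   are illustrative assumptions, not code from this file):

     struct cxiKernelIOBufferDesc_t* kibdP;
     char* bufP = ...;                  // page-aligned user address
     int   bufLen = ...;                // multiple of PAGE_SIZE
     int   rc = cxiKibdPin(bufP, bufLen, &kibdP);
     if (rc == 0)
     {
       // ... issue I/O against kibdP, e.g. through cxiKDoIO ...
       cxiKibdUnpin(kibdP);             // unpin pages, free the chain
     }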
*/ int cxiKibdPin(char* vaddr, int len, struct cxiKernelIOBufferDesc_t** kibdPP) { int nPages; struct cxiKernelIOBufferDesc_t* headP; struct cxiKernelIOBufferDesc_t* kibdP; int rc; /* Validate parameters */ ENTER(0); TRACE2(TRACE_KSVFS, 5, TRCID_KIBDPIN_ENTER, "cxiKibdPin: vaddr 0x%lX len 0x%X\n", vaddr, len); DBGASSERT(((IntPtr)vaddr & (PAGE_SIZE-1)) == 0); DBGASSERT((len & (PAGE_SIZE-1)) == 0); nPages = len / PAGE_SIZE; rc = allocKernelIOBufferDesc(vaddr, nPages, &headP); if (rc != 0) { EXIT(0); return rc; } /* Add this cxiKernelIOBufferDesc_t to the global list before returning */ TRACE1(TRACE_KSVFS, 12, TRCID_KIBDPIN_EXIT, "cxiKibdPin exit: returning 0x%lX\n", headP); spin_lock(&KibdLock); headP->gblNextP = KibdGblHeadP; if (KibdGblHeadP != NULL) KibdGblHeadP->gblPrevP = headP; KibdGblHeadP = headP; spin_unlock(&KibdLock); *kibdPP = headP; EXIT(0); return 0; } /* Remove a cxiKernelIOBufferDesc_t object from the list of all such objects, destroy it and all chained cxiKernelIOBufferDesc_t objects associated with it, and unpin the associated user-level buffer. */ void cxiKibdUnpin(struct cxiKernelIOBufferDesc_t* kibdP) { struct cxiKernelIOBufferDesc_t* nextP; struct cxiKernelIOBufferDesc_t* prevP; /* Remove this cxiKernelIOBufferDesc_t from the global list */ ENTER(0); spin_lock(&KibdLock); nextP = kibdP->gblNextP; prevP = kibdP->gblPrevP; if (nextP != NULL) nextP->gblPrevP = prevP; if (prevP != NULL) prevP->gblNextP = nextP; else KibdGblHeadP = nextP; spin_unlock(&KibdLock); /* Free the cxiKernelIOBufferDesc_t */ deallocKernelIOBufferDesc(kibdP); EXIT(0); } /* Free all cxiKernelIOBufferDesc_t's, and unpin their underlying storage. */ void cxiKibdUnpinAll() { struct cxiKernelIOBufferDesc_t* nextP; struct cxiKernelIOBufferDesc_t* kibdP; ENTER(0); TRACE0(TRACE_KSVFS, 1, TRCID_KIBD_UNPIN_ALL_ENTER, "cxiKibdUnpinAll entry\n"); for (;;) { /* Remove first cxiKernelIOBufferDesc_t on global list */ spin_lock(&KibdLock); kibdP = KibdGblHeadP; if (kibdP == NULL) { spin_unlock(&KibdLock); break; } nextP = kibdP->gblNextP; if (nextP != NULL) nextP->gblPrevP = NULL; KibdGblHeadP = nextP; spin_unlock(&KibdLock); /* Deallocate the cxiKernelIOBufferDesc_t and unpin its storage */ deallocKernelIOBufferDesc(kibdP); } TRACE0(TRACE_KSVFS, 1, TRCID_KIBD_UNPIN_ALL_EXIT, "cxiKibdUnpinAll exit\n"); EXIT(0); } #ifdef MMAP_DIO /* Create a cxiKernelIOBufferDesc_t object for a page in user address space that is already pinned. The page will be mapped into kernel address space. This is used by mmap routines that want to do direct I/O from user page to disk. The cxiKernelIOBufferDesc_t that this routine creates can be passed to cxiKDoIO just like one that was created by cxiKibdPin. */ int cxiKibdPinmm(struct page *pageP, struct cxiKernelIOBufferDesc_t** kibdPP) { struct cxiKernelIOBufferDesc_t* kibdP; ENTER(0); kibdP = kibdAlloc(); if (kibdP == NULL) { EXIT(0); return -ENOMEM; } kibdP->kibdVaddr = kmap(pageP); kibdP->maplist[0] = (char *)pageP; kibdP->kibdPages = 1; kibdP->kibdTotalPages = 1; *kibdPP = kibdP; EXIT(0); return 0; } /* Free a cxiKernelIOBufferDesc_t that was created by cxiKibdPinmm. */ void cxiKibdUnpinmm(struct page *pageP, struct cxiKernelIOBufferDesc_t* kibdP) { ENTER(0); kunmap(pageP); kibdFree(kibdP); EXIT(0); } #endif /* MMAP_DIO */ /* Attach an I/O buffer to the kernel's virtual address space. The cxiIOBufferAttachment_t returned in *attachP must be used as a parameter of most of the other operations on cxiIOBuffer_t's. 
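   A minimal pairing sketch (assumes iobP is already pinned, which the code
   below asserts; illustrative only):

     struct cxiIOBufferAttachment_t attach;
     cxiAttachIOBuffer(iobP, &attach);     // bumps pinCount, saves kDescP
     // ... cxiKXfer / cxiKZero / cxiUXfer calls passing &attach ...
     cxiDetachIOBuffer(iobP, &attach);     // drops pinCount, clears kDescP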
*/ void cxiAttachIOBuffer(struct cxiIOBuffer_t* iobP, struct cxiIOBufferAttachment_t* attachP) { int oldPinCount; int newPinCount; int rc; /* Increase the pin count on this I/O buffer. If the buffer is not already pinned, call the pinBuffer callback routine to arrange for the buffer to be pinned, then try again. */ ENTER(0); TRACE1(TRACE_KSVFS, 5, TRCID_ATTACH_ENTER, "cxiAttachIOBuffer: dataPtr 0x%lX\n", OffsetToDataPtr(iobP,0,0)); for (;;) { oldPinCount = iobP->pinCount; DBGASSERT(oldPinCount > 0); if (oldPinCount == 0) { DBGASSERT(oldPinCount > 0); break; // rc = xxx->pinBufferCallback(iobP); // if (rc != 0) // return rc; } else { newPinCount = oldPinCount+1; rc = compare_and_swap((atomic_p)&iobP->pinCount, &oldPinCount, newPinCount); if (rc == 1) break; } } /* Once the pin of the buffer succeeds, it must have a * cxiKernelIOBufferDesc_t. Use that as the attachment data. */ DBGASSERT(iobP->kernelIOBufferDescP != NULL); attachP->kDescP = iobP->kernelIOBufferDescP; TRACE2(TRACE_KSVFS, 11, TRCID_ATTACH_KIBD, "cxiAttachIOBuffer: kernelIOBufferDescP 0x%lX newPinCount %d\n", iobP->kernelIOBufferDescP, newPinCount); EXIT(0); } /* Detach a buffer from the kernel's virtual address space. */ void cxiDetachIOBuffer(struct cxiIOBuffer_t* iobP, struct cxiIOBufferAttachment_t* attachP) { /* Validate attachment data */ ENTER(0); TRACE3(TRACE_KSVFS, 5, TRCID_DETACH_KIBD, "cxiDetachIOBuffer: dataPtr 0x%lX kDescP 0x%lX oldPinCount %d\n", OffsetToDataPtr(iobP,0,0), attachP->kDescP, iobP->pinCount); if (attachP->kDescP == NULL) { EXIT(0); return; } DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP); /* Decrement I/O buffer pin count */ DBGASSERT(iobP->pinCount >= 2); ATOMIC_ADD(&iobP->pinCount, -1); /* Invalidate attachment data */ attachP->kDescP = NULL; EXIT(0); } /* Transfer len bytes beginning at offset bufOffset within I/O buffer *iobP to or from a user buffer. The direction of the transfer is given with respect to the I/O buffer. Returns EOK if successful, other error codes if unsuccessful. 
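   The copy loop below is driven by simple page arithmetic.  For example,
   assuming PAGE_SIZE is 4096, a request with bufOffset 6000 starts at
   pageIndex 6000/4096 = 1 and pageOffset 6000%4096 = 1904, so the first
   cxiUiomove call transfers at most 4096 - 1904 = 2192 bytes before the
   loop advances to page 2 with pageOffset 0.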
*/ int cxiUXfer(struct cxiIOBuffer_t* iobP, Boolean toIOBuffer, const struct cxiIOBufferAttachment_t* attachP, void* vkopP, int bufOffset, int len, struct cxiUio_t* uioP) { int pageIndex; struct cxiKernelIOBufferDesc_t* kibdP = iobP->kernelIOBufferDescP; int pageOffset; struct page * pageP; int pageLen; unsigned long kaddr; int rc = 0; ENTER(0); /* Validate parameters */ TRACE5(TRACE_KSVFS, 5, TRCID_UXFER_LINUX, "cxiUXfer: dataPtr 0x%lX kBuf 0x%lX toIOBuf %d offset %d len %d\n", OffsetToDataPtr(iobP,0,0), kibdP, toIOBuffer, bufOffset, len); DBGASSERT(bufOffset >= 0); DBGASSERT(bufOffset+len <= iobP->ioBufLen); DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP); DBGASSERT(kibdP->kibdVaddr == OffsetToDataPtr(iobP,0,0)); DBGASSERT(iobP->ioBufLen/PAGE_SIZE <= kibdP->kibdTotalPages); DBGASSERT(iobP->pinCount >= 2); /* Transfer data in or out of as many cxiKernelIOBufferDesc_t's as necessary to satisfy the data move request */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; pageLen = PAGE_SIZE - pageOffset; for (;;) { /* Calculate how many bytes to move in or out of the current page of the I/O buffer */ if (len < pageLen) pageLen = len; KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Map current I/O buffer page into the kernel's address space temporarily, then copy data in or out of the page */ kaddr = (unsigned long)kmap(pageP); TRACE4(TRACE_KSVFS, 12, TRCID_UXFER_UIOMOVE, "cxiUXfer: uiomove pageIndex %d kaddr 0x%lX pageOffset %d " "pageLen %d\n", pageIndex, kaddr, pageOffset, pageLen); rc = cxiUiomove((char *)(kaddr + pageOffset), pageLen, toIOBuffer, uioP); kunmap(pageP); /* Leave loop if an error occurred on the move */ if (rc != 0) break; /* Update length left to copy and test for loop termination */ len -= pageLen; if (len <= 0) break; /* Set up for next iteration. If the page just copied is the last page of this cxiKernelIOBufferDesc_t, advance to the next one. */ pageOffset = 0; pageLen = PAGE_SIZE; pageIndex += 1; } /* end of do forever */ EXIT(0); return rc; } /* Perform cross-memory transfer of len bytes from user memory in current task to memory in specified address space. If toXmem is true then copy is from userAddrP to udataP/xmemP, otherwise the opposite. 
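   The buffer offset is derived from the target address itself.  For
   example (values assumed purely for illustration), with
   kibdP->kibdVaddr == 0x40000000 and udataP == 0x40001200, bufOffset is
   0x1200 (4608), which with a 4096-byte page lands on pageIndex 1 at
   pageOffset 512.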
*/ int cxiXmemXfer(char *userAddrP, int len, char *udataP, cxiXmem_t *xmemP, Boolean toXmem) { int rc = 0; int bufOffset, pageIndex, pageOffset, pageLen; void *kaddrP; struct page *pageP; struct cxiKernelIOBufferDesc_t *kibdP = xmemP->kibdP; ENTER(0); TRACE5(TRACE_KSVFS, 5, TRCID_XMEMXFER_LINUX, "cxiXmemXfer: userAddrP 0x%lX len %d udataP 0x%lX " "kibdP 0x%lX toXmem %d\n", userAddrP, len, udataP, kibdP, toXmem); bufOffset = udataP - kibdP->kibdVaddr; DBGASSERT(bufOffset >= 0); DBGASSERT(bufOffset + len <= kibdP->kibdTotalPages * PAGE_SIZE); /* Transfer data in or out of as many cxiKernelIOBufferDesc_t's as necessary to satisfy the data move request */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; pageLen = PAGE_SIZE - pageOffset; for (;;) { /* Calculate how many bytes to move in or out of the current page of the I/O buffer */ if (len < pageLen) pageLen = len; KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Map current I/O buffer page into the kernel's address space temporarily, then copy data in or out of the page */ kaddrP = kmap(pageP); TRACE4(TRACE_KSVFS, 12, TRCID_XMEMFER_COPY, "cxiXmemXfer: copy pageIndex %d kaddrP 0x%lX pageOffset %d " "pageLen %d\n", pageIndex, kaddrP, pageOffset, pageLen); if (toXmem) rc = cxiCopyIn(userAddrP, (char *)kaddrP + pageOffset, pageLen); else rc = cxiCopyOut((char *)kaddrP + pageOffset, userAddrP, pageLen); kunmap(pageP); /* Leave loop if an error occurred on the move */ if (rc != 0) break; /* Update length left to copy and test for loop termination */ len -= pageLen; if (len <= 0) break; /* Set up for next iteration. If the page just copied is the last page of this cxiKernelIOBufferDesc_t, advance to the next one. */ userAddrP += pageLen; pageOffset = 0; pageLen = PAGE_SIZE; pageIndex += 1; } /* end of do forever */ EXIT(0); return rc; } /* Transfer len bytes beginning at offset bufOffset within I/O buffer *iobP to or from a contiguous kernel buffer. The direction of the transfer is given with respect to the I/O buffer. Returns EOK if successful, other error codes if unsuccessful. 
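   A hedged caller-side sketch (hdr and attach are illustrative names;
   CXI_XFER_FROM_IOBUFFER is the direction value used elsewhere in this
   file):

     char hdr[512];
     rc = cxiKXfer(iobP, CXI_XFER_FROM_IOBUFFER, &attach,
                   0, sizeof(hdr), hdr);    // copy bytes 0..511 out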
*/ int cxiKXfer(struct cxiIOBuffer_t* iobP, Boolean toIOBuffer, const struct cxiIOBufferAttachment_t* attachP, int bufOffset, int len, char* kBufP) { int pageIndex; struct cxiKernelIOBufferDesc_t* kibdP = iobP->kernelIOBufferDescP; int pageOffset; struct page * pageP; int pageLen; unsigned long kaddr; /* Validate parameters */ ENTER(0); TRACE6(TRACE_KSVFS, 5, TRCID_KXFER_LINUX, "cxiKXfer: dataPtr 0x%lX kBuf 0x%lX toIOBuf %d offset %d len %d " "kBufP 0x%lX\n", OffsetToDataPtr(iobP,0,0), kibdP, toIOBuffer, bufOffset, len, kBufP); DBGASSERT(bufOffset >= 0); DBGASSERT(bufOffset+len <= iobP->ioBufLen); DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP); DBGASSERT(kibdP->kibdVaddr == OffsetToDataPtr(iobP,0,0)); DBGASSERT(iobP->ioBufLen/PAGE_SIZE <= kibdP->kibdTotalPages); DBGASSERT(iobP->pinCount >= 2); /* Transfer data in or out of as many cxiKernelIOBufferDesc_t's as necessary to satisfy the data move request */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; pageLen = PAGE_SIZE - pageOffset; for (;;) { /* Calculate how many bytes to move in or out of the current page of the I/O buffer */ if (len < pageLen) pageLen = len; KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Map current I/O buffer page into the kernel's address space temporarily, then copy data in or out of the page */ kaddr = (unsigned long)kmap(pageP); TRACE5(TRACE_KSVFS, 12, TRCID_KXFER_MEMCPY, "cxiKXfer: move kibdP 0x%lX pageIndex %d kaddr 0x%lX " "pageOffset %d pageLen %d\n", kibdP, pageIndex, kaddr, pageOffset, pageLen); if (toIOBuffer) memcpy((void *)(kaddr + pageOffset), kBufP, pageLen); else memcpy(kBufP, (void *)(kaddr + pageOffset), pageLen); kunmap(pageP); /* Update length left to copy and test for loop termination */ len -= pageLen; if (len <= 0) break; /* Set up for next iteration. If the page just copied is the last page of this cxiKernelIOBufferDesc_t, advance to the next one. */ kBufP += pageLen; pageOffset = 0; pageLen = PAGE_SIZE; pageIndex += 1; } /* end of do forever */ EXIT(0); return 0; } /* Set len bytes beginning at offset bufOffset within I/O buffer *iobP to zero. Returns EOK if successful, other error codes if unsuccessful. 
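   A hedged usage sketch (validBytes and blockSize are illustrative names),
   e.g. clearing the unwritten tail of a block:

     rc = cxiKZero(iobP, &attach, validBytes, blockSize - validBytes);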
*/ int cxiKZero(struct cxiIOBuffer_t* iobP, const struct cxiIOBufferAttachment_t* attachP, int bufOffset, int len) { int pageIndex; struct cxiKernelIOBufferDesc_t* kibdP = iobP->kernelIOBufferDescP; int pageOffset; struct page * pageP; int pageLen; unsigned long kaddr; /* Validate parameters */ ENTER(0); TRACE4(TRACE_KSVFS, 5, TRCID_KZERO_LINUX, "cxiKZero: dataPtr 0x%lX kBuf 0x%lX offset %d len %d\n", OffsetToDataPtr(iobP,0,0), kibdP, bufOffset, len); DBGASSERT(bufOffset >= 0); DBGASSERT(bufOffset+len <= iobP->ioBufLen); DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP); DBGASSERT(kibdP->kibdVaddr == OffsetToDataPtr(iobP,0,0)); DBGASSERT(iobP->ioBufLen/PAGE_SIZE <= kibdP->kibdTotalPages); DBGASSERT(iobP->pinCount >= 2); /* Zero data in as many cxiKernelIOBufferDesc_t's as necessary to complete the request */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; pageLen = PAGE_SIZE - pageOffset; for (;;) { /* Calculate how many bytes to zero in the current page of the I/O buffer */ if (len < pageLen) pageLen = len; KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Map current I/O buffer page into the kernel's address space temporarily, then zero data in the page */ kaddr = (unsigned long)kmap(pageP); TRACE4(TRACE_KSVFS, 12, TRCID_KZERO_MEMSET, "cxiKZero: zero pageIndex %d kaddr 0x%lX pageOffset %d pageLen %d\n", pageIndex, kaddr, pageOffset, pageLen); memset((void *)(kaddr + pageOffset), 0, pageLen); kunmap(pageP); /* Update length left to zero and test for loop termination */ len -= pageLen; if (len <= 0) break; /* Set up for next iteration. If the page just zeroed is the last page of this cxiKernelIOBufferDesc_t, advance to the next one. */ pageOffset = 0; pageLen = PAGE_SIZE; pageIndex += 1; } /* end of do forever */ EXIT(0); return 0; } /* Map an I/O buffer so it can be read and written from kernel code running in the context of a user thread. Depending on the platform, the addresses at which the I/O buffer gets mapped may not be contiguous. The details of how the buffer got mapped are handled by the cxiDiscontiguousDirectoryBuffer_t object that is filled in by this call. On some platforms, mapping buffers using this call consumes scarce resources, so all cxiMapDiscontiguousRW calls should be promptly matched by cxiUnmapDiscontiguousRW calls as soon as the operation that required access to the I/O buffer completes. Returns 0 if successful, other error codes if unsuccessful. */ int cxiMapDiscontiguousRW(struct cxiIOBuffer_t* iobP, const struct cxiIOBufferAttachment_t* attachP, struct cxiDiscontiguousDirectoryBuffer_t* discontigP) { /* ?? WARNING: Since this must kmap multiple pages, there is the possibility of deadlock if multiple threads are part of the way through executing this code, and LAST_PKMAP pages (512 or 1024) have already been kmapped. There needs to be flow control whereby threads reserve enough pages to complete all of their kmaps before they begin acquiring pages. 
*/
  struct cxiKernelIOBufferDesc_t* kibdP = iobP->kernelIOBufferDescP;
  int pageIndex;
  int dirIndex;
  int mapPages;
  struct page * pageP;
  unsigned long kaddr;
  /* __CXI_BUFFERS_ARE_CONTIGUOUS is not #defined */

  /* Validate parameters */
  ENTER(0);
  TRACE3(TRACE_KSVFS, 4, TRCID_MAP_DISCONTIG_ENTER,
         "cxiMapDiscontiguousRW: dataPtr 0x%lX kBufP 0x%lX ioBufLen 0x%X\n",
         OffsetToDataPtr(iobP,0,0), kibdP, iobP->ioBufLen);
  DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP);
  DBGASSERT(kibdP->kibdVaddr == OffsetToDataPtr(iobP,0,0));
  DBGASSERT(iobP->pinCount >= 2);

  /* The mappable buffer memory may be longer than a directory block */
  mapPages = (iobP->ioBufLen + DISCONTIG_PAGE_SIZE - 1) / DISCONTIG_PAGE_SIZE;
  mapPages = MIN(mapPages, MAX_PAGES_PER_DIRBLOCK);

  pageIndex = 0;
  for (dirIndex=0 ; dirIndex<mapPages ; dirIndex++)
  {
    KIBD_GET_PAGE(kibdP, pageIndex, pageP);
    DBGASSERT(pageP != NULL);

    /* Map this I/O buffer page into the kernel's address space and
       remember both the kernel virtual address and the page so that
       cxiUnmapDiscontiguousRW can kunmap it later */
    kaddr = (unsigned long)kmap(pageP);
    discontigP->userPagePointerArray[dirIndex] = (char*)kaddr;
    discontigP->osPagePointerArray[dirIndex] = (void*)pageP;
    pageIndex++;
  }
  discontigP->mappedLen = dirIndex * DISCONTIG_PAGE_SIZE;

  EXIT(0);
  return 0;
}


/* Unmap an I/O buffer previously mapped */
void cxiUnmapDiscontiguousRW(struct cxiIOBuffer_t* iobP,
                             struct cxiDiscontiguousDirectoryBuffer_t* discontigP)
{
  int pageIndex;
  struct page * pageP;
  int mappedPages;

  ENTER(0);
  TRACE4(TRACE_KSVFS, 4, TRCID_UNMAP_DISCONTIG_ENTER,
         "cxiUnmapDiscontiguousRW: dataPtr 0x%lX kBufP 0x%lX ioBufLen 0x%X "
         "mappedLen %d\n", OffsetToDataPtr(iobP,0,0), iobP->kernelIOBufferDescP,
         iobP->ioBufLen, discontigP->mappedLen);

  /* Unmap all pages in discontiguous map.  If the osPagePointerArray entry
   * is NULL, it means that the last mapping was made via MapContiguousBuffer,
   * which did not do any kmaps that need to be kunmap'ped. */
  mappedPages = (discontigP->mappedLen + DISCONTIG_PAGE_SIZE - 1) /
                DISCONTIG_PAGE_SIZE;
  for (pageIndex = 0; pageIndex < mappedPages; pageIndex++)
  {
    pageP = (struct page *)discontigP->osPagePointerArray[pageIndex];
    TRACE3(TRACE_KSVFS, 12, TRCID_UNMAP_DISCONTIG_KUNMAP,
           "cxiUnmapDiscontiguousRW: unmap pageIndex %d pageP 0x%lX "
           "kaddr 0x%lX\n", pageIndex, pageP,
           discontigP->userPagePointerArray[pageIndex]);
    if (pageP != NULL)
    {
      kunmap(pageP);
      discontigP->osPagePointerArray[pageIndex] = NULL;
    }
    discontigP->userPagePointerArray[pageIndex] = NULL;
  }
  discontigP->mappedLen = 0;
  EXIT(0);
}


/* Return an address in kernel memory that holds a contiguous read-only
   copy of a portion of an I/O buffer.  If possible, this will be a mapping
   of the I/O buffer.  If necessary, this routine will allocate a new block
   of kernel memory and copy the requested data to it.  The returned
   cxiContiguousBuffer_t encapsulates what method was used, so that
   cxiUnmapContiguousRO can release whatever resources were obtained by
   this call.  Returns 0 if successful, other error codes if unsuccessful.
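   A hedged usage sketch (dirOffset and dirLen are illustrative names):

     const char* dataP;
     struct cxiContiguousBuffer_t contig;
     rc = cxiMapContiguousRO(iobP, &attach, dirOffset, dirLen,
                             &dataP, &contig);
     if (rc == 0)
     {
       // ... read (never write) the bytes at dataP ...
       cxiUnmapContiguousRO(iobP, &contig);   // kunmap or free the copy
     }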
*/ int cxiMapContiguousRO(struct cxiIOBuffer_t* iobP, const struct cxiIOBufferAttachment_t* attachP, int bufOffset, int len, const char** contigBasePP, struct cxiContiguousBuffer_t* contigP) { int pageIndex; int pageOffset; int endPageIndex; struct cxiKernelIOBufferDesc_t* kibdP = iobP->kernelIOBufferDescP; struct page * pageP; unsigned long kaddr; char* tempBufP; Boolean usedKmalloc; int rc; /* Validate parameters */ ENTER(0); TRACE4(TRACE_KSVFS, 4, TRCID_MAP_CONTIG_ENTER, "cxiMapContiguousRO: dataPtr 0x%lX kBufP 0x%lX bufOffset %d len %d\n", OffsetToDataPtr(iobP,0,0), kibdP, bufOffset, len); DBGASSERT(bufOffset >= 0); DBGASSERT(bufOffset+len <= iobP->ioBufLen); DBGASSERT(attachP->kDescP == iobP->kernelIOBufferDescP); DBGASSERT(kibdP->kibdVaddr == OffsetToDataPtr(iobP,0,0)); DBGASSERT(iobP->ioBufLen/PAGE_SIZE <= kibdP->kibdTotalPages); DBGASSERT(iobP->pinCount >= 2); /* If the requested piece of the I/O buffer does not cross a page boundary, then map the page and return the mapped address within the page */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; endPageIndex = (bufOffset+len-1) / PAGE_SIZE; if (pageIndex == endPageIndex) { KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Map I/O buffer page into the kernel's address space */ kaddr = (unsigned long)kmap(pageP); /* Return address within the mapped page, and set map state so cxiUnmapContiguousRO knows to do kunmap */ *contigBasePP = (char*) (kaddr+pageOffset); contigP->mallocedBaseP = NULL; contigP->usedKmalloc = false; contigP->pageP = pageP; TRACE2(TRACE_KSVFS, 5, TRCID_MAP_CONTIG_KMAP, "cxiMapContiguousRO: mapped pageP 0x%lX at 0x%lX\n", pageP, *contigBasePP); EXIT(0); return 0; } /* Otherwise, the requested part of the I/O buffer spans page boundaries. Allocate a contiguous buffer, and copy data from the I/O buffer to the temporary buffer. 
*/ else { if (len <= PAGE_SIZE) { tempBufP = (char *)kmalloc(len, GFP_KERNEL); usedKmalloc = true; } else { tempBufP = (char*)vmalloc(len); usedKmalloc = false; } if (tempBufP == NULL) { EXIT(0); return -ENOMEM; } rc = cxiKXfer(iobP, CXI_XFER_FROM_IOBUFFER, attachP, bufOffset, len, tempBufP); if (rc != 0) { if (usedKmalloc) kfree((void*)tempBufP); else vfree((void*)tempBufP); EXIT(0); return rc; } #ifdef MALLOC_DEBUG MallocDebugNew(tempBufP, len, 4); #endif /* Return address within the contiguous temporary buffer, and set map state so cxiUnmapContiguousRO knows to do vfree */ *contigBasePP = tempBufP; contigP->mallocedBaseP = tempBufP; contigP->usedKmalloc = usedKmalloc; contigP->pageP = NULL; TRACE1(TRACE_KSVFS, 5, TRCID_MAP_CONTIG_VMALLOC, "cxiMapContiguousRO: copied to 0x%lX\n", tempBufP); EXIT(0); return 0; } } /* Release a mapping or copy obtained with cxiMapContiguousRO */ void cxiUnmapContiguousRO(struct cxiIOBuffer_t* iobP, struct cxiContiguousBuffer_t* contigP) { ENTER(0); if (contigP->mallocedBaseP != NULL) { TRACE2(TRACE_KSVFS, 4, TRCID_UNMAP_CONTIG_VFREE, "cxiUnmapContiguousRO: dataPtr 0x%lX vfree 0x%lX\n", OffsetToDataPtr(iobP,0,0), contigP->mallocedBaseP); DBGASSERT(contigP->pageP == NULL); if (contigP->usedKmalloc) kfree((void*)contigP->mallocedBaseP); else vfree((void*)contigP->mallocedBaseP); #ifdef MALLOC_DEBUG MallocDebugDelete(contigP->mallocedBaseP); #endif contigP->mallocedBaseP = NULL; } else { TRACE2(TRACE_KSVFS, 4, TRCID_UNMAP_CONTIG_KUNMAP, "cxiUnmapContiguousRO: dataPtr 0x%lX kunmap 0x%lX\n", OffsetToDataPtr(iobP,0,0), contigP->pageP); DBGASSERT(contigP->pageP != NULL); kunmap((struct page *)contigP->pageP); contigP->pageP = NULL; } EXIT(0); } #if LINUX_KERNEL_VERSION < 2050000 /* iodone routine for GPFS buffer_heads. Unlock buffer and wake up * waiters, if any. */ static void BHioDone(struct buffer_head* bhP, int uptodate) { struct cxiBufHeadChunk_t* bhcP; mark_buffer_uptodate(bhP, uptodate); bhcP = (struct cxiBufHeadChunk_t*)bhP->b_private; unlock_buffer(bhP); atomic_dec(&bhcP->nBHActive); } /* Start a read or write of the given sectors from dev. Data should be * placed into the I/O buffer beginning at byte offset bufOffset. Returns * 0 on success, negative values on error. All of the data to be * transferred will be in the first cxiKernelIOBufferDesc_t. */ int cxiStartIO(struct cxiKernelIOBufferDesc_t* kibdHeadP, Boolean isWrite, cxiDev_t dev, UInt64 startSector, int nSectors, int bufOffset, struct cxiBufHeadChunk_t** bhcHeadPP) { int bufEndOffset; int nTotalPages; struct cxiBufHeadChunk_t* bhcP; struct cxiBufHeadChunk_t* bhcHeadP; struct cxiBufHeadChunk_t* bhcTailP; int nBHsAllocated; int pageIndex; int pageOffset; int sectorsThisBH; struct buffer_head* bhP; struct page* pageP; struct cxiBufHeadChunk_t* p; struct cxiKernelIOBufferDesc_t* kibdP = kibdHeadP; kdev_t kdev = cxiDevToKernelDev(dev); /* Convert to kernel version of dev_t */ /* Validate parameters */ ENTER(0); TRACE6(TRACE_IO, 4, TRCID_KDOIO_LINUX, "cxiStartIO: kBuf 0x%lX isWrite %d dev 0x%X sector %llu nSectors %d " "offset %d\n", kibdP, isWrite, dev, startSector, nSectors, bufOffset); DBGASSERT(kibdP != NULL); DBGASSERT(bufOffset >= 0); DBGASSERT(nSectors > 0); /* Compute the total number of pages spanned by the portion of the buffer that will participate in the I/O. This equals the number of buffer_heads that will be used. 
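   For example (a PAGE_SIZE of 4096 is assumed for illustration), bufOffset
   1024 with nSectors 16 gives bufEndOffset 1024 + 8192 - 1 = 9215, so
   nTotalPages = 9215/4096 - 1024/4096 + 1 = 3: the 8K of data touches
   three pages, and three buffer_heads are needed even though only 8K of
   data moves.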
*/ bufEndOffset = bufOffset + nSectors*512 - 1; nTotalPages = (bufEndOffset/PAGE_SIZE) - (bufOffset/PAGE_SIZE) + 1; /* Allocate the entire list of buffer_head chunks needed for this I/O */ bhcP = (struct cxiBufHeadChunk_t*) kmem_cache_alloc(BhcCacheP, GFP_KERNEL); bhcHeadP = bhcP; if (bhcP == NULL) goto enomem; bhcP->bhcNextP = bhcP; bhcP->bhcPrevP = bhcP; bhcP->nBHUsed = 0; atomic_set(&bhcP->nBHActive, 0); nBHsAllocated = BUFFER_HEADS_PER_CHUNK; while (nBHsAllocated < nTotalPages) { bhcP = (struct cxiBufHeadChunk_t*) kmem_cache_alloc(BhcCacheP, GFP_KERNEL); if (bhcP == NULL) goto enomem; bhcTailP = bhcHeadP->bhcPrevP; bhcP->bhcNextP = bhcHeadP; bhcP->bhcPrevP = bhcTailP; bhcTailP->bhcNextP = bhcP; bhcHeadP->bhcPrevP = bhcP; bhcP->nBHUsed = 0; atomic_set(&bhcP->nBHActive, 0); nBHsAllocated += BUFFER_HEADS_PER_CHUNK; } /* Build and submit a buffer_head for each page of the current I/O */ bhcP = bhcHeadP; pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; DBGASSERT(pageOffset%512 == 0); sectorsThisBH = MIN((PAGE_SIZE-pageOffset) / 512, nSectors); while (nSectors > 0) { /* Get a buffer_head for the the next page */ if (bhcP->nBHUsed == BUFFER_HEADS_PER_CHUNK) { bhcP = bhcP->bhcNextP; DBGASSERT(bhcP->nBHUsed == 0); } bhP = &bhcP->bh[bhcP->nBHUsed]; bhcP->nBHUsed += 1; /* Initialize the new buffer_head */ memset(bhP, 0, sizeof(*bhP)); KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); /* Build and submit the buffer_head for the current page */ bhP->b_size = sectorsThisBH * 512; bhP->b_page = pageP; #ifndef __64BIT__ if (PageHighMem(pageP)) bhP->b_data = (char *)(0 + pageOffset); else #endif bhP->b_data = page_address(pageP) + pageOffset; bhP->b_this_page = bhP; bhP->b_end_io = BHioDone; bhP->b_private = (void*)bhcP; bhP->b_blocknr = startSector; init_waitqueue_head(&bhP->b_wait); bhP->b_dev = kdev; bhP->b_rdev = kdev; bhP->b_rsector = startSector; bhP->b_list = BUF_CLEAN; bhP->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req) | (1 << BH_Uptodate); TRACE6(TRACE_IO, 6, TRCID_KDOIO_LINUX_BH, "cxiStartIO: bhcP 0x%lX bhP 0x%lX sector %llu sectorsThisBH %d state 0x%X pageP 0x%lX\n", bhcP, bhP, startSector, sectorsThisBH, bhP->b_state, pageP); atomic_inc(&bhcP->nBHActive); generic_make_request(isWrite, bhP); if (isWrite) MOD_PGPGOUT(sectorsThisBH); else MOD_PGPGIN(sectorsThisBH); /* Advance to next page */ startSector += sectorsThisBH; nSectors -= sectorsThisBH; sectorsThisBH = MIN(nSectors, PAGE_SIZE/512); pageIndex += 1; pageOffset = 0; } /* Unplug the disk to be sure I/Os actually get started */ run_task_queue(&tq_disk); /* Set success return code and return list of active buffer_heads */ *bhcHeadPP = bhcHeadP; EXIT(0); return 0; enomem: /* Free buffer_head chunks allocated so far and return failure */ if (bhcHeadP != NULL) { bhcP = bhcHeadP; bhcTailP = bhcHeadP->bhcPrevP; do { p = bhcP; bhcP = bhcP->bhcNextP; kmem_cache_free(BhcCacheP, (void*)p); } while (p != bhcTailP); } EXIT(0); return -ENOMEM; } /* Routine to set up the disk block size and get disk parameters */ int GetDiskInfoX(cxiDev_t devId, struct cxiDiskInfo_t* diskInfoP) { kdev_t kdev; int n1KBlocks; /* Convert to kernel version of dev_t */ ENTER(0); kdev = cxiDevToKernelDev(devId); /* Get hardware sector size. If unknown, assume 512. 
*/ #if LINUX_KERNEL_VERSION >= 2040312 diskInfoP->sectorSize = get_hardsect_size(kdev); #else diskInfoP->sectorSize = get_hardblocksize(kdev); #endif if (diskInfoP->sectorSize == 0) diskInfoP->sectorSize = 512; /* Set blocksize of this device to hardware sector size */ set_blocksize(kdev, diskInfoP->sectorSize); /* If defined, return number of sectors on device */ n1KBlocks = 0; if (blk_size[MAJOR(kdev)]) n1KBlocks = blk_size[MAJOR(kdev)][MINOR(kdev)]; diskInfoP->totalSectors = (Int64)n1KBlocks * 1024 / diskInfoP->sectorSize; TRACE3(TRACE_IO, 2, TRCID_DISKINFO, "GetDiskInfo: devId %08lX sector size %d totalSectors %lld\n", devId, diskInfoP->sectorSize, diskInfoP->totalSectors); #if 0 printk("VMALLOC_START=0x%lX VMALLOC_END=0x%lX\n", VMALLOC_START, VMALLOC_END); #endif EXIT(0); return 0; } #else /* >= 2050000 */ /* iodone routine for struct bio */ static int bioDone(struct bio *bioP, unsigned int done, int err) { struct buffer_head *bhP; struct cxiBufHeadChunk_t *bhcP; if (bioP->bi_size) return 1; /* wakes up waiters who will deallocate bio buffer head chunk */ bhP = (struct buffer_head *)bioP->bi_private; bhcP = (struct cxiBufHeadChunk_t *)bhP->b_private; if (test_bit(BIO_UPTODATE, &bioP->bi_flags)) set_buffer_uptodate(bhP); else clear_buffer_uptodate(bhP); unlock_buffer(bhP); atomic_dec(&bhcP->nBHActive); return 0; } /* Start a read or write of the given sectors from dev. Data should be * placed into the I/O buffer beginning at byte offset bufOffset. Returns * 0 on success, negative values on error. All of the data to be * transferred will be in the first cxiKernelIOBufferDesc_t. */ int cxiStartIO(struct cxiKernelIOBufferDesc_t *kibdHeadP, Boolean isWrite, cxiDev_t dev, UInt64 startSector, int nSectors, int bufOffset, struct cxiBufHeadChunk_t **bhcHeadPP) { int i; int vecsAllocated; int bufEndOffset; int nTotalPages; int iovIndex; int pageIndex; int pageOffset; int sectorsThisPage; int nBHsAllocated; struct bio *bioP; struct buffer_head *bhP; struct page *pageP; struct cxiBufHeadChunk_t *p; struct cxiBufHeadChunk_t *bhcP; struct cxiBufHeadChunk_t *bhcHeadP; struct cxiBufHeadChunk_t *bhcTailP; struct cxiKernelIOBufferDesc_t *kibdP = kibdHeadP; struct block_device *bdevP = bdget(new_decode_dev(dev)); int maxIOVec = bio_get_nr_vecs(bdevP); /* query max device vectors */ request_queue_t* reqQP; ENTER(0); LOGASSERT(bdevP != NULL && bdevP->bd_disk != NULL); /* Validate parameters */ TRACE6(TRACE_IO, 4, TRCID_KDOIO_LINUX_BIO, "cxiStartIO: kBuf 0x%lX isWrite %d dev 0x%X sector %llu nSectors %d " "offset %d\n", kibdP, isWrite, dev, startSector, nSectors, bufOffset); DBGASSERT(kibdP != NULL); DBGASSERT(bufOffset >= 0); DBGASSERT(nSectors > 0); /* Compute the total number of pages spanned by the portion of the * buffer that will participate in the I/O. This equals the number * of io vectors needed. */ bufEndOffset = bufOffset + nSectors*512 - 1; nTotalPages = (bufEndOffset/PAGE_SIZE) - (bufOffset/PAGE_SIZE) + 1; /* Compute the pageIndex in the kibd struct as well as the offset * in the first page to read/write. */ pageIndex = bufOffset / PAGE_SIZE; pageOffset = bufOffset % PAGE_SIZE; DBGASSERT(pageOffset%512 == 0); /* Allocate a single buffer_head chunk and link it to itself. * Subsequent buffer_head chunks may be needed and are allocated * below. 
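   Each bio submitted below is paired with one dummy buffer_head from a
   chunk: the bio carries the actual io vectors, while the buffer_head
   exists only so that cxiWaitIO can sleep in wait_on_buffer and bioDone
   can wake it with unlock_buffer.  Roughly (a sketch of the pairing done
   by the code below, not additional code):

     bhP = &bhcP->bh[bhcP->nBHUsed];      // dummy wait object
     bioP->bi_private = (void *)bhP;      // completion handler finds the bh
     bhP->b_private   = (void *)bhcP;     // bh finds its chunk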
*/ bhcP = (struct cxiBufHeadChunk_t *)kmem_cache_alloc(BhcCacheP, GFP_KERNEL); bhcHeadP = bhcP; if (bhcP == NULL) goto enomem; bhcP->bhcNextP = bhcP; /* circular link to itself */ bhcP->bhcPrevP = bhcP; bhcP->nBHUsed = 0; atomic_set(&bhcP->nBHActive, 0); nBHsAllocated = BUFFER_HEADS_PER_CHUNK; while (nSectors > 0) { vecsAllocated = MIN(nTotalPages, maxIOVec); bioP = bio_alloc(GFP_NOIO, vecsAllocated); if (bioP == NULL) goto enomem; /* Allocate a buffer head and point to it from the bio struct. * We submit the bio struct directly but wait on the dummy * buffer_head struc, since primitives exist for waiting/wakeup * there. We want to submit bios instead of buffer heads since * the bio can encapsulate a larger i/o whereas buffer_heads can * only do a page. */ if (bhcP->nBHUsed == BUFFER_HEADS_PER_CHUNK) { bhcP = (struct cxiBufHeadChunk_t *)kmem_cache_alloc(BhcCacheP, GFP_KERNEL); if (bhcP == NULL) goto enomem; bhcTailP = bhcHeadP->bhcPrevP; bhcP->bhcNextP = bhcHeadP; bhcP->bhcPrevP = bhcTailP; bhcTailP->bhcNextP = bhcP; bhcHeadP->bhcPrevP = bhcP; bhcP->nBHUsed = 0; atomic_set(&bhcP->nBHActive, 0); nBHsAllocated += BUFFER_HEADS_PER_CHUNK; } /* Use next available buffer head and increment used count */ bhcP->biop[bhcP->nBHUsed] = bioP; bhP = &bhcP->bh[bhcP->nBHUsed]; bhcP->nBHUsed++; bhP->b_state = 0; atomic_set(&bhP->b_count, 1); /* set to one for unlock_buffer */ bhP->b_this_page = NULL; bhP->b_page = NULL; bhP->b_blocknr = 0; bhP->b_size = 0; bhP->b_data = NULL; bhP->b_bdev = NULL; /* buffer head points to buffer head chunk */ bhP->b_private = (void *)bhcP; iovIndex = 0; bioP->bi_vcnt = 0; /* accumulated below as number of bi_io_vecs */ bioP->bi_idx = 0; /* used by lower layer for recording current index */ bioP->bi_size = 0; bioP->bi_bdev = bdevP; bioP->bi_end_io = bioDone; /* bio points to buffer head that we'll wait on */ bioP->bi_private = (void *)bhP; bioP->bi_sector = startSector; sectorsThisPage = MIN((PAGE_SIZE-pageOffset) / 512, nSectors); while (iovIndex < vecsAllocated) { KIBD_GET_PAGE(kibdP, pageIndex, pageP); DBGASSERT(pageP != NULL); bioP->bi_io_vec[iovIndex].bv_page = pageP; bioP->bi_io_vec[iovIndex].bv_len = sectorsThisPage * 512; bioP->bi_io_vec[iovIndex].bv_offset = pageOffset; TRACE6(TRACE_IO, 6, TRCID_KDOIO_LINUX_BIO_PAGE, "cxiStartIO: bhcP 0x%lX bioP 0x%lX index %d sector %llu sectorsThisPage %d pageP 0x%lX\n", bhcP, bioP, iovIndex, startSector, sectorsThisPage, pageP); iovIndex++; bioP->bi_vcnt = iovIndex; bioP->bi_size += (sectorsThisPage * 512); /* Advance to next page */ startSector += sectorsThisPage; nSectors -= sectorsThisPage; sectorsThisPage = MIN(nSectors, PAGE_SIZE/512); pageIndex += 1; pageOffset = 0; } bufOffset += bioP->bi_size; nTotalPages -= bioP->bi_vcnt; /* Fill in a couple of fields in this dummy buffer head * that will be examined in unlock_buffer(). 
*/ set_buffer_locked(bhP); bhP->b_page = pageP; atomic_inc(&bhcP->nBHActive); submit_bio(isWrite, bioP); } /* Unplug the device queue to avoid 3ms delay when no other I/O in progress on the device */ reqQP = bdev_get_queue(bdevP); if (reqQP->unplug_fn != NULL) reqQP->unplug_fn(reqQP); *bhcHeadPP = bhcHeadP; EXIT(0); return 0; enomem: /* Free buffer_head chunks allocated so far and return failure */ if (bhcHeadP != NULL) { bhcP = bhcHeadP; bhcTailP = bhcHeadP->bhcPrevP; do { for (i = 0; i < bhcP->nBHUsed; i++) bio_put(bhcP->biop[i]); p = bhcP; bhcP = bhcP->bhcNextP; kmem_cache_free(BhcCacheP, (void*)p); } while (p != bhcTailP); } EXIT(0); return -ENOMEM; } /* Routine to set up the disk block size and get disk parameters */ int GetDiskInfoX(cxiDev_t devId, struct cxiDiskInfo_t* diskInfoP) { struct block_device *bdevP = bdget(new_decode_dev(devId)); ENTER(0); LOGASSERT(bdevP != NULL && bdevP->bd_disk != NULL); diskInfoP->sectorSize = bdev_hardsect_size(bdevP); if (diskInfoP->sectorSize == 0) diskInfoP->sectorSize = 512; /* Set blocksize of this device to hardware sector size */ set_blocksize(bdevP, diskInfoP->sectorSize); DBGASSERT(bdevP->bd_inode != NULL); diskInfoP->totalSectors = bdevP->bd_inode->i_size / diskInfoP->sectorSize; TRACE3(TRACE_IO, 2, TRCID_GET_DISKINFOX, "GetDiskInfoX: devId %08lX sector size %d totalSectors %lld\n", devId, diskInfoP->sectorSize, diskInfoP->totalSectors); EXIT(0); return 0; } #endif /* Wait for a group of I/Os to complete. Free the buffer heads after all * I/O is finished. Returns -EIO if any buffer_head had an error. */ static int cxiWaitIO(struct cxiBufHeadChunk_t *bhcHeadP) { int i; int rc; struct buffer_head* bhP; struct cxiBufHeadChunk_t* bhcP; struct cxiBufHeadChunk_t* p; #if LINUX_KERNEL_VERSION >= 2050000 struct bio *bioP; #endif /* Wait for I/O to be complete on all buffer_heads. Wait on buffer_heads * in the reverse of the order in which I/O was started. By waiting on * the last buffer_head first, it is likely that the calling thread will * only have to sleep once. */ ENTER(0); rc = 0; DBGASSERT(bhcHeadP != NULL); bhcP = bhcHeadP->bhcPrevP; do { for (i = bhcP->nBHUsed-1; i >= 0; i--) { bhP = &bhcP->bh[i]; #if LINUX_KERNEL_VERSION >= 2050000 bioP = bhcP->biop[i]; TRACE5(TRACE_IO, 12, TRCID_KWAITIO_BIO, "cxiWaitIO: bhP 0x%lX bioP 0x%lX sector %d size %d state 0x%lX\n", bhP, bioP, bioP->bi_sector, bioP->bi_size, bioP->bi_flags); #else TRACE4(TRACE_IO, 12, TRCID_KWAITIO_BH, "cxiWaitIO: bhP 0x%lX sector %d size %d state 0x%lX\n", bhP, bhP->b_blocknr, bhP->b_size, bhP->b_state); #endif wait_on_buffer(bhP); if (!buffer_uptodate(bhP)) { #if LINUX_KERNEL_VERSION >= 2050000 TRACE5(TRACE_IO, 1, TRCID_KWAITIO_BIO_ERR, "cxiWaitIO: bhP 0x%lX bioP 0x%lX sector %d size %d " "state 0x%lX\n", bhP, bioP, bioP->bi_sector, bioP->bi_size, bioP->bi_flags); #else TRACE4(TRACE_IO, 1, TRCID_KWAITIO_BH_ERR, "cxiWaitIO: error bhP 0x%lX sector %d size %d state 0x%lX\n", bhP, bhP->b_blocknr, bhP->b_size, bhP->b_state); #endif rc = -EIO; } #if LINUX_KERNEL_VERSION >= 2050000 bio_put(bioP); bhcP->biop[i] = NULL; #endif } p = bhcP; bhcP = bhcP->bhcPrevP; /* All of the I/Os in all of the buffer_heads inside of the * cxiBufHeadChunk_t pointed to by p are complete (the BH_Lock bits * have all been turned off). However, it is possible that some I/O * completion handlers may not yet have returned from BHioDone and * therefore may not have finished accessing fields within the chunk * of buffer_heads. The nBHActive keeps track of how many * completion routines have not yet returned. 
If this is non-zero, * the cxiBufHeadChunk_t cannot be freed yet. Delay briefly to * allow the interrupt handler on another processor to complete, * then free the cxiBufHeadChunk_t. Repeat the delay until the * cxiBufHeadChunk_t is no longer in use by any interrupt handlers. */ while (atomic_read(&p->nBHActive) > 0) { TRACE2(TRACE_IO, 1, TRCID_KWAITIO_BH_BUSY, "cxiWaitIO: p 0x%lX waiting for %d I/O completion handlers\n", p, atomic_read(&p->nBHActive)); cxiSleep(10); atomic_inc(&cxiWaitIONDelays); } kmem_cache_free(BhcCacheP, (void*)p); } while (p != bhcHeadP); EXIT(0); return rc; } /* Read or write the given sectors from dev. Data should be placed into the * I/O buffer beginning at byte offset bufOffset. Returns EOK on success, * negative values on error. All of the data to be transferred will be in * the first cxiKernelIOBufferDesc_t. */ int cxiKDoIO(struct cxiKernelIOBufferDesc_t* kibdP, Boolean isWrite, cxiDev_t dev, UInt64 startSector, int nSectors, int sectorSize, int bufOffset) { int rc; struct cxiBufHeadChunk_t* bhcHeadP; ENTER(0); DBGASSERT(sectorSize == 512); #ifdef KCSTRACE current->kcst_info.data[0] = dev; current->kcst_info.data[1] = startSector; current->kcst_info.data[2] = nSectors; #endif rc = cxiStartIO(kibdP, isWrite, dev, startSector, nSectors, bufOffset, &bhcHeadP); if (rc == 0) rc = cxiWaitIO(bhcHeadP); #ifdef KCSTRACE current->kcst_info.data[0] = 0; current->kcst_info.data[1] = 0; current->kcst_info.data[2] = 0; #endif EXIT(0); return rc; }
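
/* Hedged end-to-end usage sketch (illustrative only: userBufP, bufBytes,
 * dev and startSector are assumptions, not names defined in this file):
 *
 *   struct cxiKernelIOBufferDesc_t* kibdP;
 *   int rc = cxiKibdPin(userBufP, bufBytes, &kibdP);    // pin user pages
 *   if (rc == 0)
 *   {
 *     rc = cxiKDoIO(kibdP, false, dev, startSector,     // false = read
 *                   bufBytes / 512, 512, 0);            // 512-byte sectors
 *     cxiKibdUnpin(kibdP);                              // unpin and free
 *   }
 */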