/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/* @(#)16 1.158.1.9 src/avs/fs/mmfs/ts/kernext/gpl-linux/cxiSystem.c, mmfs, avs_rgpfs24, rgpfs24s007a 10/24/06 19:12:27 */
/*
 * Linux implementation of basic common services
 *
 * Contents:
 *   cxiGetThreadId
 *   getpid
 *   cxiIsSuperUser
 *   DoPanic
 *   logAssertFailed
 *   Kernel memory allocation services:
 *     cxiMallocPinned
 *     cxiFreePinned
 *
 */

#include <Shark-gpl.h>

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/file.h>
#include <linux/string.h>
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/interrupt.h>
#undef memcmp

#define DEFINE_TRACE_GBL_VARS
#include <Logger-gpl.h>
#include <verdep.h>
#include <linux2gpfs.h>
#include <cxiSystem.h>
#include <cxiAtomic.h>
#include <cxi2gpfs.h>
#include <cxiIOBuffer.h>
#include <cxiSharedSeg.h>
#include <cxiCred.h>

#include <Trace.h>
#include <lxtrace.h>
#include <cxiMode.h>
#if LINUX_KERNEL_VERSION >= 2060000
#include <linux/swap.h>
#include <linux/writeback.h>
#endif

#if LINUX_KERNEL_VERSION >= 2040900
/* This is in the Redhat kernel series */
extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
#endif

#ifdef INSTRUMENT_LOCKS
struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES];
#endif  /* INSTRUMENT_LOCKS */

/* We record the daemon's process group since it can uniquely identify
 * a thread as being part of the GPFS daemon.  pid is unique per thread
 * on Linux due to its clone implementation.
 */
static pid_t DaemonPGrp = -1;

/* Get the kernel thread ID. */
cxiThreadId cxiGetThreadId()
{
  /* ENTER(1); */
  return current->pid;
}

/* Get the kernel process ID. */
pid_t getpid()
{
  /* ENTER(1); */
  return current->pid;
}

/* bufP is caller's ext_cred_t buffer
 * uCredPP is the ucred struct (NULL on Linux)
 * eCredPP is the ext_cred_t struct * (if successful)
 *
 * cxiPutCred should be called to release when the operation has been completed.
 */
int cxiGetCred(void *bufP, void **uCredPP, void **eCredPP)
{
  ext_cred_t *eCredP = (ext_cred_t *)bufP;

  ENTER(0);
  *uCredPP = NULL;
  *eCredPP = NULL;

  if (!bufP)
  {
    EXIT_RC(0, EINVAL);
    return EINVAL;
  }

  setCred(eCredP);
  *eCredPP = (void *)eCredP;

xerror:
  EXIT(0);
  return 0;
}

/* Release of cxiGetCred() structures (nothing to do on Linux) */
int cxiPutCred(void *userCredP, void *extCredP)
{
  if (userCredP || !extCredP)
    return EINVAL;

  return 0;
}

/* Convert a kernel stack address to the thread ID of the thread that
 * uses that stack
 */
int
cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP)
{
  struct task_struct * tP;
#if LINUX_KERNEL_VERSION >= 2060000
  /* The kernel stack is based off the thread_info struct in the 2.6 kernel;
   * get the task pointer out of the thread_info struct.
   */
  struct thread_info * iP;
  iP = (struct thread_info *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
  tP = iP->task;
#else
  /* The kernel stack is based off the task_struct struct in the 2.4 kernel */
  tP = (struct task_struct *) ((UIntPtr)stackP & ~((UIntPtr)(THREAD_SIZE-1)));
#endif
  ENTER(0);
  *tidP = tP->pid;
  EXIT(0);
  return 0;
}
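
/* Illustrative note (not part of the original source): the masking above
 * relies on the kernel stack being THREAD_SIZE-aligned.  For example, with
 * THREAD_SIZE = 8192 (0x2000), a stack address of 0xC2345ABC is rounded
 * down to 0xC2344000, which is where the thread_info (2.6) or task_struct
 * (2.4) for that thread begins.  The example address is made up; only the
 * alignment argument matters.
 */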

/* Convert a kernel thread pointer to the corresponding thread ID */
int
cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP)
{
  struct task_struct * tP;

  ENTER(0);
  tP = (struct task_struct *) threadP;
  *tidP = tP->pid;

  EXIT(0);
  return 0;
}


/* Return true if caller has maximum authorization (is root) */
Boolean cxiIsSuperUser()
{
  return (current->euid == 0);
}


/* Get the process max filesize limit (ulimit -f) */
Int64 cxiGetMaxFileSize()
{
  if ((signed long)MY_RLIM_CUR(RLIMIT_FSIZE) == -1L)
    return MAX_INT64;
  else
    return (MY_RLIM_CUR(RLIMIT_FSIZE));
}

/* Routine to send a signal to the current thread/process */
void cxiSendSigThread(int sig)
{
  ENTER(0);
  send_sig(sig, current, 0);
  EXIT(0);
}


#ifdef MALLOC_DEBUG
/* This tracks mallocs and frees on a limited basis.
 * Implemented originally to determine if we were leaking
 * any memory after an unload.  This is not really thread
 * safe for multiple processors unless they're automatically
 * cache coherent without memory barriers (i386).  It's useful
 * for detecting memory leaks on a single processor system.
 */
#define MALLOC_RECORDS 5000 /* max mallocs to track */
struct mallocStat
{
  void *beginP;
  unsigned short size;
  unsigned short type;
};
static struct mallocStat *mstatP = NULL;
unsigned int nextMalloc = 0;

void
MallocDebugStart()
{
  int i;

  ENTER(0);
  if (mstatP == NULL)
    mstatP = vmalloc(MALLOC_RECORDS * sizeof(struct mallocStat));

  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = 0; i < MALLOC_RECORDS; i++)
  {
    mstatP[i].beginP = NULL;
    mstatP[i].size = 0;
    mstatP[i].type = 0;
  }
  printk("MallocDebugStart 0x%X\n", mstatP);
  EXIT(0);
}

void
MallocDebugEnd()
{
  int i;

  ENTER(0);
  if (mstatP != NULL)
  {
    for (i = 0; i < MALLOC_RECORDS; i++)
    {
      if (mstatP[i].beginP != NULL)
        printk("MallocDebug: beginP 0x%X size %d type %d STILL ALLOCATED!\n",
               mstatP[i].beginP, mstatP[i].size, mstatP[i].type);
    }
  }

  vfree(mstatP);
  mstatP = NULL;
  EXIT(0);
}

void
MallocDebugNew(void *ptr, unsigned short size, unsigned short type)
{
  void *bP;
  int i;
  int j;
  int swrc;
  int oldval;
  int where = nextMalloc;

  ENTER(0);

  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = where; i < MALLOC_RECORDS + where; i++)
  {
    if (i >= MALLOC_RECORDS)
      j = i - MALLOC_RECORDS;
    else
      j = i;

    bP = mstatP[j].beginP;
    if (bP == NULL)
    {
      swrc = ATOMIC_SWAP(&mstatP[j].beginP, &bP, ptr);
      if (swrc)
      {
        mstatP[j].size = size;
        mstatP[j].type = type;
        break;
      }
    }
  }

  EXIT(0);
}

void
MallocDebugDelete(void *ptr)
{
  void *bP;
  int i;
  int swrc;
  int next;
  int found = 0;

  ENTER(0);
  if (mstatP == NULL)
  {
    EXIT(0);
    return;
  }

  for (i = 0; i < MALLOC_RECORDS; i++)
  {
    bP = mstatP[i].beginP;
    if (bP == ptr)
    {
      next = nextMalloc;
      ATOMIC_SWAP(&nextMalloc, &next, i);

      swrc = ATOMIC_SWAP(&mstatP[i].beginP, &bP, NULL);
      DBGASSERT(swrc);
      found = 1;
      break;
    }
  }

  if (!found)
    printk("MallocDebug: 0x%X not found!\n", ptr);
  EXIT(0);
}
#endif /* MALLOC_DEBUG */
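
/* Illustrative sketch (not part of the original source): the MALLOC_DEBUG
 * hooks above are meant to bracket the life of the module.  A plausible
 * driver, using only the wrappers defined later in this file, would be:
 *
 *   MallocDebugStart();              // at module load
 *   p = cxiMallocPinned(512);        // records the block via MallocDebugNew
 *   cxiFreePinned(p);                // clears the record via MallocDebugDelete
 *   MallocDebugEnd();                // at unload; prints anything still allocated
 *
 * Only blocks that go through the cxi wrappers are tracked, and at most
 * MALLOC_RECORDS (5000) outstanding allocations can be recorded.
 */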

/* Allocate pinned kernel memory */
void* cxiMallocPinned(int nBytes)
{
  void *ptr;

  /* kmalloc only supports requests for up to 131072 bytes.  Anything
     larger than this results in a BUG() call. */
  ENTER(0);
  if (nBytes > 131072)
  {
    EXIT(0);
    return NULL;
  }

  ptr = kmalloc(nBytes, GFP_KERNEL);

#ifdef MALLOC_DEBUG
  MallocDebugNew(ptr, nBytes, 1);
#endif

  EXIT(0);
  return ptr;
}

/* Free pinned kernel memory that was allocated with cxiMallocPinned */
/* Must not block on lack of memory resources */
void cxiFreePinned(void* p)
{
  ENTER(0);
#ifdef MALLOC_DEBUG
  MallocDebugDelete(p);
#endif

  kfree(p);
  EXIT(0);
}
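
/* Illustrative sketch (not part of the original source): callers are
 * expected to pair these two routines and to handle the NULL return for
 * oversize or failed requests, e.g.:
 *
 *   char *bufP = (char *)cxiMallocPinned(4096);
 *   if (bufP == NULL)
 *     return ENOMEM;            // request too large or kmalloc failed
 *   ...
 *   cxiFreePinned(bufP);
 *
 * The 4096-byte size is arbitrary; anything above 131072 bytes is refused
 * by cxiMallocPinned rather than risking a BUG() inside kmalloc.
 */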

/* Get the fcntl lock owner: the lock's owner if one was supplied,
   otherwise the current process's file table. */
void* cxiGetFcntlOwner(eflock_t *flP)
{
  return flP? flP->l_owner: current->files;
}

#if LINUX_KERNEL_VERSION > 2060900
struct lock_manager_operations lm_operations = {
};
#endif

/* Perform local advisory locking. */
int cxiFcntlLock(void *advObjP,
                 int cmd,
                 void *lockStructP,
                 cxiFlock_t *flockP,
                 int (*retryCB)(),
                 cxiOff64_t size,
                 cxiOff64_t offset,
                 ulong *retry_idP)
{
  int len, rc = 0;
  // struct file *fP;
  struct file_lock fl, *flP, *gflP, *cflP;
  Boolean keepLockElement = false;

  /* cast platform independent arguments as appropriate for linux */
  void (*RetryFcn)(struct file_lock*) = (void (*)(struct file_lock*))retryCB;
  // fP = (struct file *)advObjP;
  struct file localFile, *filp = &localFile;
  struct dentry localDEntry, *dp = &localDEntry;
  ENTER(0);
  flP = (struct file_lock *) lockStructP;

  localFile.f_dentry = &localDEntry;
  localDEntry.d_inode = (struct inode *)advObjP;

  /* Lock commands can have two different values.  Convert them at
   * entry to the portability layer so that we only have to check
   * for one of them.
   */
#if !defined(__64BIT__)
  if (cmd == F_GETLK64) cmd = F_GETLK;
  if (cmd == F_SETLK64) cmd = F_SETLK;
  if (cmd == F_SETLKW64) cmd = F_SETLKW;
#endif

  /* Callers have the option of passing a platform dependent lock structure
     (struct file_lock *lockStructP) or the generic (cxiFlock_t *flockP). */
  if (flockP)
  {
    flP = &fl;  /* Use a local file_lock structure */

    /* If there is a potential for blocking, must malloc the locking structure
       so it can persist until the lock becomes available (in Retry()). */

    if (cmd == F_SETLKW)
    {
#ifdef NFS_CLUSTER_LOCKS
      len = sizeof(struct file_lock) +
            sizeof(struct file) +
            sizeof(struct dentry);
#else
      len = sizeof(struct file_lock);
#endif
      flP = (struct file_lock*)cxiMallocUnpinned(len);
      if (flP == NULL)
      {
        rc = ENOMEM;
        goto exit;
      }
      cxiMemset(flP, 0, len);
#ifdef NFS_CLUSTER_LOCKS
      filp = (struct file*)((char *)flP + sizeof(struct file_lock));
      dp = (struct dentry *)((char *)filp + sizeof(struct file));
      filp->f_dentry = dp;
      dp->d_inode = (struct inode *)advObjP;
#endif
    }
    else
      cxiMemset(flP, 0, sizeof(*flP));

    locks_init_lock(flP);  /* Initialize list_head structs */
    if (flockP->l_file == NULL)
      flockP->l_file = filp;

    /* fl_wait needs to be initialized because when unlock happens, the
       linux routine locks_wake_up_blocks invokes our retry routine via
       fl_notify and then calls wake_up(fl_wait) on the assumption that
       the waiter is local. */

    cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait);

    cxiFlockToVFS(flockP, flP);
  }

  /* daemon didn't know the owner and required kernel code to fill it in. */
  if (!flP->fl_owner)
    flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL);

#if 0
  /* Validate the file pointer.  Kernel locking routines are going to
     use these without verifying them.  If any of them are NULL, find
     out now before they generate a segment violation. */
  if ((!fP) || (!fP->f_dentry) || (!fP->f_dentry->d_inode))
  {
    if (cmd == F_GETLK)
      flP->fl_type = F_UNLCK;
    else
      rc = EINVAL;
    goto exit;
  }
#endif

  /* Note that this all depends on us having serialized such locking for
     this file from before the posix_test_lock() until after the
     posix_block_lock().  The revoke lock that we hold here provides us
     the necessary serialization. */

  TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER,
         "cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X "
         "range 0x%lX-%lX cmd %s type %s\n",
         flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
         (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
         (flP->fl_type == F_RDLCK) ? "RDLCK" :
         (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");

  if (cmd == F_GETLK)
  {
    /* Check for conflicts.  If found, return the information.
       If there are NO conflicts, return F_UNLCK in fl_type. */
#if LINUX_KERNEL_VERSION >= 2061700
    struct file_lock conf;
    gflP = &conf;
    rc = posix_test_lock(filp, flP, gflP);
    if (rc) {
      rc = 0;
#else
    if (NULL != (gflP = posix_test_lock(&localFile, flP))) {
#endif
      flP->fl_start = gflP->fl_start;
      flP->fl_end = gflP->fl_end;
      flP->fl_type = gflP->fl_type;
      flP->fl_pid = gflP->fl_pid;
      flP->fl_owner = gflP->fl_owner;
    }
    else
      flP->fl_type = F_UNLCK;

    TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK,
           "cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X "
           "range 0x%lX-%lX type %s\n",
           flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
           (flP->fl_type == F_RDLCK) ? "RDLCK" :
           (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
  }
  else
  { /* Begin: do the locking, but handle the blocking via our retry routine. */
    /* Test the lock.  What this really does for us is return the blocker
       if one exists.  This is needed to queue up the request if a conflicting
       lock is already held. */

#ifdef NFS_CLUSTER_LOCKS
    if (cmd == F_SETLKW) {
      flP->fl_flags |= FL_SLEEP;
      if (!flP->fl_lmops) {
        flP->fl_lmops = &lm_operations;
        flP->fl_lmops->fl_notify = (void *)RetryFcn;
      }
    }
    rc = POSIX_LOCK_FILE(filp, flP);
    if (rc == -EAGAIN && (cmd == F_SETLKW) &&
        flP->fl_lmops == &lm_operations)
    {
      /* Queue the blocker structures */
      keepLockElement = true;
      if (retry_idP)
        *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
    }
#else
#if LINUX_KERNEL_VERSION >= 2061700
    if ((flP->fl_type == F_UNLCK) || !(posix_test_lock(&localFile, flP, cflP)))
#else
    if ((flP->fl_type == F_UNLCK) || !(cflP = posix_test_lock(&localFile, flP)))
#endif
    {
      /* No conflicting lock: get the lock for the caller. */
      rc = POSIX_LOCK_FILE(&localFile, flP);
    }
    else
    { /* Conflicting lock: ..... */
      rc = EAGAIN;

      if (cmd == F_SETLKW)
      {
        /*if (posix_locks_deadlock(flP, cflP))
        {
          rc = EDEADLK;
        }
        else*/
        {
          /* Queue the blocker structures */
          keepLockElement = true;
          if (retry_idP)
            *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
#if LINUX_KERNEL_VERSION > 2060900
          flP->fl_lmops = &lm_operations;
          flP->fl_lmops->fl_notify = RetryFcn;
#else
          flP->fl_notify = RetryFcn;
#endif
#if LINUX_KERNEL_VERSION < 2061700
          posix_block_lock(cflP, flP);
#endif
        }
      }
    }
#endif

    TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT,
           "cxiFcntlLock posix_lock_file: rc %d retry_id 0x%lX\n", rc, cflP);
  } /* End: do the locking, but handle the blocking via our retry routine. */

exit:

  if (flockP)
  {
    /* Caller wanted results in flockP */
    cxiVFSToFlock((void *)flP, flockP);

    /* If we allocated the locking structure and then didn't need to use
       it (the lock request didn't block), free it. */

    if ((flP!=&fl) && (!keepLockElement)) {
      cxiFreeUnpinned(flP);
    }
  }

#ifdef NFS_CLUSTER_LOCKS
  if (rc < 0)
    rc = -rc;  /* make it positive */
#endif
  EXIT_RC(0, rc);
  return rc;
}

void cxiFcntlUnblock(void *retry_idP)
{
  struct file_lock *flP = (struct file_lock *)retry_idP;

  ENTER(0);
  /* Include some sanity checks on the retry id (file_lock)
     before passing it into the routine that does the work.
     It should be properly linked (via its list_head structures)
     in a file_lock_list that has blocked waiters.  Also,
     we would only be backing this out from the process that
     originally blocked, so verify the pid. */

  if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link) &&
      flP->fl_next && flP->fl_pid == getpid())
  {
    POSIX_UNBLOCK_LOCK(flP);
  }
  EXIT(0);
}

int
cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid)
{
  int rc = 0;
  struct super_block *sbP = (struct super_block *)vfsP;
  struct list_head *fllP;
  struct file_lock *fl;
  struct dentry *dentryP;

  ENTER(0);
  lock_kernel();

restart:

#if LINUX_KERNEL_VERSION >= 2061600
  //??? find a different way to clear locks; file_lock_list is not exported anymore
#else
  fllP = file_lock_list.next;

  while(fllP != &file_lock_list)
  {
    fl = list_entry(fllP, struct file_lock, fl_link);
    fllP = fllP->next;

    /* If there are mmfs lock structures, release them. */

    if (fl &&
        fl->fl_file &&
        fl->fl_file->f_dentry &&
        fl->fl_file->f_dentry->d_inode)
    {
      dentryP = fl->fl_file->f_dentry;

      /* If this lock belongs to the specified vfs, release advisory locks. */
      if (dentryP->d_sb == sbP)
      {
        /* remove all our locks */
        rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid);
        if (rc == ENOSYS)
          goto xerror;

        /* After freeing unknown numbers of locks in gpfsFcntlReset (all
           locks for the inode), restart from the top of the lock list */
        goto restart;
      }
    }
  }
#endif

xerror:
  unlock_kernel();
  EXIT_RC(0, rc);
  return rc;
}

void *
cxiGetPrivVfsP(void *vfsP)
{
  struct super_block *sbP = (struct super_block *)vfsP;

  /* Do some sanity checking */
  if ( (sbP->s_magic != GPFS_SUPER_MAGIC) ||
       ((UIntPtr) SBLOCK_PRIVATE(sbP) < GPFS_KERNEL_OFFSET) )
    printSuperList(sbP);
  LOGASSERT( sbP->s_magic == GPFS_SUPER_MAGIC );
  LOGASSERT( (UIntPtr) SBLOCK_PRIVATE(sbP) >= GPFS_KERNEL_OFFSET );

  return (SBLOCK_PRIVATE(sbP));
}


#ifdef NFS_DEBUG
/* These flags are defined in the kernel and control various dprintk
   calls.  This provides us a way to easily turn these on/off for
   debugging our NFS support. */
extern unsigned int nlm_debug;
extern unsigned int nfsd_debug;
extern unsigned int nfs_debug;
extern unsigned int rpc_debug;
#endif

int cxiTrace(cxiTrace_t trace)
{
#ifdef NFS_DEBUG
  int rc = 0;

  ENTER(0);
  switch (trace)
  {
    case cxiTraceNFS:
      nlm_debug = nfsd_debug = nfs_debug = rpc_debug = ~0;
      break;
    case cxiTraceNFSoff:
      nlm_debug = nfsd_debug = nfs_debug = rpc_debug = 0;
      break;
    default:
      rc = EINVAL;
      break;
  }
  EXIT_RC(0, rc);
  return rc;
#else
  return ENOSYS;
#endif
}

void cxiFlockToVFS(eflock_t* lckdatP, void* vP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  ENTER(0);
  if ((flP) && (lckdatP))
  {
    flP->fl_pid = lckdatP->l_pid;
    flP->fl_owner = lckdatP->l_owner;
    flP->fl_type = lckdatP->l_type;
    flP->fl_start = lckdatP->l_start;
    flP->fl_flags = FL_POSIX;
#ifdef NFS_CLUSTER_LOCKS
    flP->fl_lmops = lckdatP->l_lmops;
    flP->fl_file = lckdatP->l_file;
    flP->fl_ops = NULL;
#else
#if LINUX_KERNEL_VERSION < 2061700
    if (lckdatP->l_caller == L_CALLER_LOCKD)
      flP->fl_flags |= FL_LOCKD;
#endif
#endif
    if (lckdatP->l_len == 0)
      flP->fl_end = FL_OFFSET_MAX;
    else
      flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1;
  }
  EXIT(0);
  return;
}

#ifdef NFS_CLUSTER_LOCKS
int cxiVFSCallback(eflock_t* lckreqP, eflock_t* lckdatP,
                   int(* callback)(void *, void *, int), int result)
{
  struct file_lock fl;
  struct file *fileP;
  struct file_lock conf, *confP = NULL;
  int rc;

  ENTER(0);

  cxiFlockToVFS(lckreqP, &fl);
  fileP = fl.fl_file;
  if (!fileP) {
    return -1;
  }
  if (lckdatP) {
    cxiFlockToVFS(lckdatP, &conf);
    confP = &conf;
  }
  if (!result) { /* try to get the posix lock */
    rc = POSIX_LOCK_FILE(fileP, &fl);
    if (rc)
      callback(&fl, NULL, EBUSY);
    else { /* got the posix lock */
      rc = callback(&fl, confP, result);
      if (rc) { /* too late, free the lock */
        fl.fl_type = F_UNLCK;
        rc = POSIX_LOCK_FILE(fileP, &fl);
      }
    }
  }
  else
    rc = callback(&fl, confP, result);

#ifdef NFS_CLUSTER_LOCKS
  if (rc < 0)
    rc = -rc;  /* make it positive */
#endif
  EXIT_RC(0, rc);
  return rc;
}
#endif

void cxiVFSToFlock(void *vP, eflock_t *lckdatP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  ENTER(0);
  if ((flP) && (lckdatP))
  {
    lckdatP->l_pid = flP->fl_pid;
    lckdatP->l_owner = flP->fl_owner;
    lckdatP->l_type = flP->fl_type;
    lckdatP->l_start = flP->fl_start;
    lckdatP->l_flags = flP->fl_flags;
#ifdef NFS_CLUSTER_LOCKS
    lckdatP->l_lmops = flP->fl_lmops;
    lckdatP->l_file = flP->fl_file;
    if (lckdatP->l_lmops) /* must be lockd or nfsd */
#else
#if LINUX_KERNEL_VERSION >= 2061700
    if (lckdatP->l_lmops) /* must be lockd or nfsd */
#else
    if (flP->fl_flags & FL_LOCKD)
#endif
#endif
      lckdatP->l_caller = L_CALLER_LOCKD;
    else
      lckdatP->l_caller = L_CALLER_NULL;
    if (flP->fl_end == FL_OFFSET_MAX)
      lckdatP->l_len = 0;
    else
      lckdatP->l_len = flP->fl_end - flP->fl_start + 1;
  }
  EXIT(0);
  return;
}
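
/* Illustrative note (not part of the original source): the two converters
 * above translate between the eflock convention "l_len == 0 means to end of
 * file" and the VFS convention of an explicit inclusive fl_end.  For
 * example, an eflock with l_start = 100 and l_len = 10 becomes fl_start =
 * 100, fl_end = 109, and converting back yields l_len = 109 - 100 + 1 = 10;
 * l_len = 0 maps to fl_end = FL_OFFSET_MAX and back again.
 */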


/* Sleep for the indicated number of milliseconds */
void cxiSleep(int ms)
{
  ENTER(0);
  TRACE1(TRACE_VNODE, 9, TRCID_SLEEP,
         "cxiSleep: begin delay %d\n", ms);
  current->state = TASK_INTERRUPTIBLE;
  /* For large HZ rearrange jiffies calculation and
     use presumably larger word size to minimize overflow risk */
  if (unlikely(HZ > 1000))
    schedule_timeout(((long)ms)*HZ/1000);
  else
    schedule_timeout(ms/(1000/HZ));
  TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END,
         "cxiSleep: end delay %d HZ %d\n", ms, HZ);
  EXIT(0);
}
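
/* Illustrative note (not part of the original source): with the common
 * HZ = 100, a 30 ms request becomes 30 / (1000/100) = 3 jiffies; with
 * HZ = 1000 it is 30 / 1 = 30 jiffies.  Only when HZ exceeds 1000 does the
 * code switch to ms*HZ/1000, computed in a long, so that for example a
 * hypothetical HZ = 2048 and ms = 120000 gives 120000*2048/1000 = 245760
 * jiffies instead of dividing by a 1000/HZ term that truncates to zero.
 */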


void cxiOpenNFS(void *iP)
{
  struct inode *inodeP = (struct inode *)iP;
  int refcount;

  /* A reference is placed on the cxiNode here when the first NFS reference
     is added */
  ENTER(0);
  refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1);

  TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS,
         "openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX "
         "refcount %d\n",
         inodeP, (inodeP) ? inodeP->i_ino : -1,
         (inodeP) ? inodeP->i_ino : -1,
         (inodeP) ? inodeP->i_mode : -1,
         (inodeP) ? inodeP->i_nlink : -1,
         (inodeP) ? inodeP->PRVINODE : NULL,
         refcount);

  DBGASSERT(refcount != 0);
  EXIT(0);
}


int cxiCloseNFS(void *vP, void *viP)
{
  int rc = 0;
  struct inode *iP = (struct inode *)vP;

  /* If viP is NULL, the file was never actually opened.
     If viP is not NULL, close it. */
  ENTER(0);
  if (viP == NULL)
    rc = 0;
  else {
    if (VP_TO_PVP(iP) != NULL && VP_TO_CNP(iP) != NULL) {
      rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD|FWRITE,
                              (struct MMFSVInfo *)viP, true);
      cxiPutOSNode((void *)iP);
    }
  }

  EXIT_RC(0, rc);
  return rc;
}

static int cxiNFSCluster = 0;

void cxiSetNFSCluster(int set)
{
  cxiNFSCluster = set;
}

/* To avoid failing the NFS client the NFSD thread is put to sleep.  Another
   node will take over this client and the operation will continue without any
   errors to the application.
*/
void cxiNFSError(int rc, const char *str)
{
  TRACE2(TRACE_VNODE, 9, TRCID_NFS_ERROR,
         "cxiNFSError: %s got rc %d\n", str, rc);
  if (cxiNFSCluster && cxiIsNFSThread() && (rc == ESTALE || rc == -ESTALE))
  {
    TRACE2(TRACE_VNODE, 1, TRCID_NFS_ERROR_1,
           "cxiNFSError: NFS got error %d from %s sleep\n", rc, str);
    cxiSleep(120000); // wait 120 seconds
  }
}

void * cxiGetNfsP(void *vP)
{
  if (vP && VP_TO_CNP((struct inode *)vP))
    return VP_TO_NFSP((struct inode *)vP);
  else
    return NULL;
}

void cxiSetNfsP(void *vP, void *newP)
{
  if (VP_TO_CNP((struct inode *)vP))
    VP_TO_NFSP((struct inode *)vP) = newP;
}

void * cxiGetCnP(void *vP)
{ return (void *)VP_TO_CNP((struct inode *)vP); }

void * cxiGetPvP(void *vP)
{ return (void *)VP_TO_PVP((struct inode *)vP); }

void * cxiGNPtoVP(void *vP)
{ return (void *)GNP_TO_VP((struct cxiNode_t *)vP); }

/* Main routine of kproc */
static int kprocMain(void *argP)
{
  cxiKProcData_t *kpdP = (cxiKProcData_t *)argP;

  /* Change our process name */
  ENTER(0);
  current->comm[sizeof(current->comm) - 1] = '\0';
  strncpy(current->comm, kpdP->nameP, sizeof(current->comm) - 1);

  /* Change parent of a kernel process so that when it exits, it won't
   * send a SIGCHLD signal to the process that created it, and it won't
   * be left as a zombie.
   */
  DAEMONIZE(kpdP->nameP);

  /* Call the function specified by startKProc */
  kpdP->func(kpdP);
  EXIT(0);
  return 0;
}

/* Create a new kernel process */
cxiPid_t
cxiStartKProc(struct cxiKProcData_t *kpdP)
{
  cxiPid_t pid = kernel_thread(kprocMain, kpdP, kpdP->kprocFlags);
  ENTER(0);
  kpdP->pid = pid > 0 ? pid : KPROC_FAILED_PID;

  TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX,
         "cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid);
  EXIT(0);
  return kpdP->pid;
}

void
cxiStopKProc(struct cxiKProcData_t *kpdP)
{
  cxiPid_t pid;

  ENTER(0);
  cxiBlockingMutexAcquire(&kpdP->lock);

  TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX,
         "cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid);

  if (!KPROC_RUNNING(kpdP))
  {
    cxiBlockingMutexRelease(&kpdP->lock);
    EXIT(0);
    return;
  }

  pid = kpdP->pid;    // Cache pid before signal/wait
  kpdP->terminate = true;
  cxiWaitEventSignal(&kpdP->kprocEvent);

  while (kpdP->pid != KPROC_UNASSIGNED_PID)
    cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);

  cxiBlockingMutexRelease(&kpdP->lock);
  EXIT(0);
}

/*-------------------------------------------------------------------
 * logAssertFailed  - Subroutine consolidating logGenIF() and
 *                    DoPanic() calls.
 *------------------------------------------------------------------*/

static char PanicMsgBuf[2048];

void cxiPanic(const char* panicStrP)
{
  printk( GPFS_NOTICE  "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP);
  TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP);
#ifndef DISABLE_KERNEL_PANIC
  BUG();
#endif
}

static void
DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode,
        Int32 reasonCode, char *dataStr)
{
  const char *p;
  int bytesLeft;

  p = cxiStrrchr(filenameP, '/');
  if (p == NULL)
    p = filenameP;
  else
    p += 1;

  sprintf(PanicMsgBuf, "%s:%d:%d:%d:", p, lineNum, retCode, reasonCode);
  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  if (dataStr)
  {
    strncat(PanicMsgBuf, dataStr, bytesLeft-1);
    bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  }
  strncat(PanicMsgBuf, ":", bytesLeft-1);
  bytesLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  if (condP)
    strncat(PanicMsgBuf, condP, bytesLeft-1);
  cxiPanic(PanicMsgBuf);
}
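
/* Illustrative note (not part of the original source): DoPanic builds a
 * colon-separated summary of the failure before handing it to cxiPanic.
 * For made-up values -- an assert in cxiSystem.c line 1234 with retCode 5,
 * reasonCode 0, data string "node 3" and condition "x != NULL" -- the
 * resulting panic message would be:
 *
 *   cxiSystem.c:1234:5:0:node 3:x != NULL
 */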

#ifdef MODULE
void
logAssertFailed(UInt32 flags,         /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */
                char  *srcFileName,   /* __FILE__ */
                UInt32 srcLineNumber, /* __LINE__ */
                Int32  retCode,       /* return code value */
                Int32  reasonCode,    /* normally errno */
                UInt32 logRecTag,     /* tag if have associated error log rec */
                char  *dataStr,       /* assert data string */
                char  *failingExpr)   /* expression that evaluated to false */
{
  int i;

  printk("GPFS logAssertFailed: %s file %s line %d\n",
         failingExpr, srcFileName, srcLineNumber);
  ENTER(0);
  TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1,
         "logAssertFailed: %s retCode %d reasonCode %d\n",
         failingExpr, retCode, reasonCode);
  TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2,
         "logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber);
#ifndef GPFS_PRINTF
  /* fsync buffered lxtrace records */
  trc_fsync();

#ifdef STOP_TRACE_ON_FAILURE
  /* Turn off tracing right after the failure occurs.  This may only turn
     off tracing in the kernel. */
  for (i=0 ; i<MAX_TRACE_CLASSES ; i++)
    TraceFlagsP[i] = 0;
#endif

  /* Wait 10 seconds to allow the lxtrace daemon to complete the sync. */
  cxiSleep(10000);
#endif
  gpfs_ops.gpfsDaemonToDie(srcFileName, srcLineNumber, retCode, reasonCode,
                           dataStr, failingExpr);

  DoPanic(failingExpr, srcFileName, srcLineNumber, retCode, reasonCode,
          dataStr);
}
#else /* !MODULE */
void
logAssertFailed(UInt32 flags,
                char  *srcFileName,
                UInt32 srcLineNumber,
                Int32  retCode,
                Int32  reasonCode,
                UInt32 logRecTag,
                char  *dataStr,
                char  *failingExpr);
#endif /* MODULE */


typedef struct cxiWaitElement_t
{
  cxiWaitList_t waitList;  /* previous and next element in chain */

  /* Linux would normally organize a wait_queue_head_t with any number
   * of wait_queue_t elements.  However since we're implementing "wakeup
   * with return code" we have to ensure the OS wakes up the exact sleeper
   * we want.  Thus we have only a one to one relationship to ensure the
   * OS can only pick our favorite.
   */
  wait_queue_head_t qhead;
  wait_queue_t qwaiter;
  int wakeupRC;            /* wakeup return code */

} cxiWaitElement_t;


#define CXI_WAIT_LIST_ADD(headP, elementP) \
  (headP)->prevP->nextP = (elementP); \
  (elementP)->prevP = (headP)->prevP; \
  (headP)->prevP = (elementP); \
  (elementP)->nextP = (headP);

#define CXI_WAIT_LIST_REMOVE(elementP) \
  (elementP)->prevP->nextP = (elementP)->nextP; \
  (elementP)->nextP->prevP = (elementP)->prevP;

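/* Illustrative note (not part of the original source): the two macros above
 * maintain a circular doubly-linked list whose head is the wait event's own
 * waitList.  Starting from an empty head H (H.nextP == H.prevP == &H),
 * CXI_WAIT_LIST_ADD(&H, &A) leaves H.nextP == &A and A.nextP == &H, and a
 * second add of B queues B behind A; walking headP->nextP first is what
 * gives doWakeup() below its FIFO order.  CXI_WAIT_LIST_REMOVE(&A) splices
 * A back out by relinking its neighbors.
 */
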
/* Initialize abstract wait event with OS specific
 * initialization function
 */
void
cxiWaitEventInit(cxiWaitEvent_t *weP)
{
  spinlock_t *lockP = (spinlock_t *)&weP->lword;

  spin_lock_init(lockP);
  weP->waitList.nextP = weP->waitList.prevP = &weP->waitList;
}

Boolean
cxiWaitEventHasWaiters(cxiWaitEvent_t *weP)
{
  unsigned long flags;
  spinlock_t *lockP = (spinlock_t *)(weP->lword);
  Boolean rc;

  SPIN_LOCK_IRQ(lockP, flags);
  rc = (weP->waitList.nextP != &weP->waitList);
  SPIN_UNLOCK_IRQ(lockP, flags);
  return rc;
}

/* Do not add trace records.  Some callers depend on not being
 * interrupted by the trace daemon.
 */
enum WakeType { wBroadcast, wSignal, wWakeOne };
static inline void
doWakeup(cxiWaitEvent_t *wEventP, enum WakeType wtype, int wakeupRC)
{
  unsigned long flags;
  spinlock_t *lockP = (spinlock_t *)(wEventP->lword);
  cxiWaitList_t *headP;
  cxiWaitList_t *tmpP;
  cxiWaitElement_t *wP;

  SPIN_LOCK_IRQ(lockP, flags);

  /* We wake up from the front to the back (FIFO semantics).
   * There's only one wait element per wait_queue_head_t so
   * record the return code and wake up the one element.
   */
  headP = &wEventP->waitList;

  for (tmpP = headP->nextP; tmpP != headP; tmpP = tmpP->nextP)
  {
    wP = list_entry(tmpP, cxiWaitElement_t, waitList);
    wP->wakeupRC = wakeupRC;

    wake_up(&wP->qhead);
    if (wtype != wBroadcast)
    {
      /* The difference between wSignal and wWakeOne is that the latter
         guarantees that multiple wake up calls will each pick a different
         thread if more than one is waiting.  With wSignal, if a thread is
         awakened but hasn't had a chance to run, then subsequent wake up
         calls might all wake the same thread.

         On AIX, the calling routine (e_wakeup_one) removes the waiter from
         the queue, unlike Linux where removal is done by the waiting
         thread when it wakes up.  Nothing special has to be done on AIX to
         get the wWakeOne style of wakeup.

         Note:  This is an inline routine and the wtype argument is a
         compile-time constant, so the "if" tests in this routine are done
         by the compiler and do not generate any code. */

      if (wtype == wWakeOne)
      {
        /* Move this entry to tail of list so that the next wakeup call will
           pick somebody else. */
        CXI_WAIT_LIST_REMOVE(tmpP);
        CXI_WAIT_LIST_ADD(headP, tmpP);
      }
      break;
    }
  }
  SPIN_UNLOCK_IRQ(lockP, flags);
}
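
/* Illustrative note (not part of the original source): doWakeup is the
 * common body behind the wait-event wakeup entry points; presumably thin
 * wrappers in the style of cxiWaitEventSignal() (which cxiStopKProc above
 * calls) expand to something like
 *
 *   doWakeup(weP, wSignal, 0);       // wake one waiter with rc 0
 *   doWakeup(weP, wBroadcast, rc);   // wake every waiter with the given rc
 *
 * Any wrapper name other than cxiWaitEventSignal is an assumption here;
 * only the enum values and the FIFO behavior come from the code above.
 */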

int
cxiCopyIn(char *from, char *to, unsigned long size)
{
  /* The daemon needs to bypass access checks since copy to
   * shared segment would inadvertently fail.
   */
  ENTER(0);
  if (PROCESS_GROUP(current) == DaemonPGrp)
    __copy_from_user(to, from, size);
  else
    if (copy_from_user(to, from, size))
    {
      EXIT_RC(0, EFAULT);
      return EFAULT;
    }
  EXIT(0);
  return 0;
}

int
cxiCopyOut(char *from, char *to, unsigned long size)
{
  int ignore;
  /* The daemon needs to bypass access checks since copy to
   * shared segment would inadvertently fail.
   */
  ENTER(0);
  if (PROCESS_GROUP(current) == DaemonPGrp)
    ignore = __copy_to_user(to, from, size);
  else
    if (copy_to_user(to, from, size))
    {
      EXIT_RC(0, EFAULT);
      return EFAULT;
    }
  EXIT(0);
  return 0;
}
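
/* Illustrative sketch (not part of the original source): both helpers take
 * the source first and the destination second, so a kernel-side handler
 * that copies a caller's argument structure in and a result back out might
 * look like the following; structArg and userArgP are made-up names.
 *
 *   struct structArg karg;
 *   if (cxiCopyIn((char *)userArgP, (char *)&karg, sizeof(karg)) != 0)
 *     return EFAULT;
 *   ... operate on karg ...
 *   if (cxiCopyOut((char *)&karg, (char *)userArgP, sizeof(karg)) != 0)
 *     return EFAULT;
 */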
---|
| 1300 | |
---|
| 1301 | int |
---|
| 1302 | cxiCopyInstr(char *from, char *to, unsigned long size, unsigned long *len) |
---|
| 1303 | { |
---|
| 1304 | long retval; |
---|
| 1305 | |
---|
| 1306 | ENTER(0); |
---|
| 1307 | retval = strncpy_from_user(to, from, size); |
---|
| 1308 | if ((retval > 0) && (retval <= size)) |
---|
| 1309 | { |
---|
| 1310 | *len = retval; |
---|
| 1311 | EXIT(0); |
---|
| 1312 | return 0; |
---|
| 1313 | } |
---|
| 1314 | *len = 0; |
---|
| 1315 | if (retval < 0) |
---|
| 1316 | retval = EFAULT; |
---|
| 1317 | else |
---|
| 1318 | retval = E2BIG; |
---|
| 1319 | EXIT_RC(0, retval); |
---|
| 1320 | return (int)retval; |
---|
| 1321 | } |
---|
| 1322 | |
---|
| 1323 | long cxiSafeGetLong(long* from) |
---|
| 1324 | { |
---|
| 1325 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1326 | long tmp; |
---|
| 1327 | (void)__get_user_nocheck(tmp, from, sizeof(long)); |
---|
| 1328 | return tmp; |
---|
| 1329 | #else |
---|
| 1330 | return *from; |
---|
| 1331 | #endif |
---|
| 1332 | } |
---|
| 1333 | |
---|
| 1334 | int cxiSafeGetInt(int* from) |
---|
| 1335 | { |
---|
| 1336 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1337 | int tmp; |
---|
| 1338 | __get_user_nocheck(tmp, from, sizeof(int)); |
---|
| 1339 | return tmp; |
---|
| 1340 | #else |
---|
| 1341 | return *from; |
---|
| 1342 | #endif |
---|
| 1343 | } |
---|
| 1344 | |
---|
| 1345 | void cxiSafePutLong(long val, long* to) |
---|
| 1346 | { |
---|
| 1347 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1348 | __put_user_nocheck(val, to, sizeof(long)); |
---|
| 1349 | #else |
---|
| 1350 | *to = val; |
---|
| 1351 | #endif |
---|
| 1352 | } |
---|
| 1353 | |
---|
| 1354 | void cxiSafePutInt(int val, int* to) |
---|
| 1355 | { |
---|
| 1356 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1357 | __put_user_nocheck(val, to, sizeof(int)); |
---|
| 1358 | #else |
---|
| 1359 | *to = val; |
---|
| 1360 | #endif |
---|
| 1361 | } |
---|
| 1362 | |
---|
| 1363 | #ifdef GPFS_ARCH_X86_64 |
---|
| 1364 | /* Check if 64-bit user process */ |
---|
| 1365 | int |
---|
| 1366 | cxiIS64U(char *addr) |
---|
| 1367 | { |
---|
| 1368 | #if LINUX_KERNEL_VERSION > 2060500 |
---|
| 1369 | return !(test_thread_flag(TIF_IA32)); |
---|
| 1370 | #else |
---|
| 1371 | return !(current->thread.flags & THREAD_IA32); |
---|
| 1372 | #endif |
---|
| 1373 | } |
---|
| 1374 | #endif |
---|
| 1375 | |
---|
| 1376 | int |
---|
| 1377 | socket_aio_dequeue() |
---|
| 1378 | { |
---|
| 1379 | return -1; |
---|
| 1380 | } |
---|
| 1381 | |
---|
| 1382 | /* Transfer data from buffer(s) in user space to or from a buffer in the |
---|
| 1383 | kernel. */ |
---|
| 1384 | int |
---|
| 1385 | cxiUiomove(register char* kBufP, /* address of kernel buffer */ |
---|
| 1386 | register unsigned long nBytes, /* #bytes to transfer */ |
---|
| 1387 | Boolean toKernel, /* direction of xfer(read/write)*/ |
---|
| 1388 | register struct cxiUio_t* uioP) /* user area description */ |
---|
| 1389 | { |
---|
| 1390 | register struct cxiIovec_t * iovP; |
---|
| 1391 | unsigned long cnt; |
---|
| 1392 | int rc; |
---|
| 1393 | #ifdef TRACE_IO_DATA |
---|
| 1394 | char* origKBufP = kBufP; |
---|
| 1395 | int trcdata[4]; |
---|
| 1396 | #endif |
---|
| 1397 | int ignore; |
---|
| 1398 | |
---|
| 1399 | ENTER(0); |
---|
| 1400 | TRACE4(TRACE_FOPS, 6, TRCID_CXISYSTEM_037, |
---|
| 1401 | "cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n", |
---|
| 1402 | kBufP, uioP, nBytes, toKernel); |
---|
| 1403 | if (uioP->uio_resid <= 0) |
---|
| 1404 | { |
---|
| 1405 | EXIT_RC(0, ENOMEM); |
---|
| 1406 | return ENOMEM; |
---|
| 1407 | } |
---|
| 1408 | rc = 0; |
---|
| 1409 | if (uioP->uio_iovcnt == 1) |
---|
| 1410 | { |
---|
| 1411 | /* |
---|
| 1412 | * Fastpath for most common case of iovcnt == 1. Saves a |
---|
| 1413 | * few instructions. |
---|
| 1414 | */ |
---|
| 1415 | iovP = uioP->uio_iov; |
---|
| 1416 | cnt = iovP->iov_len; |
---|
| 1417 | if (cnt <= 0) |
---|
| 1418 | { |
---|
| 1419 | uioP->uio_iovcnt--; |
---|
| 1420 | uioP->uio_iov++; |
---|
| 1421 | uioP->uio_iovdcnt++; |
---|
| 1422 | EXIT(0); |
---|
| 1423 | return 0; |
---|
| 1424 | } |
---|
| 1425 | if (cnt > nBytes) |
---|
| 1426 | cnt = nBytes; |
---|
| 1427 | |
---|
| 1428 | if (toKernel) |
---|
| 1429 | { |
---|
| 1430 | /* The daemon needs to bypass access checks since copy to |
---|
| 1431 | * shared segment would inadvertantly fail. Copies to |
---|
| 1432 | * kernel address space also perform no validity check. |
---|
| 1433 | */ |
---|
| 1434 | if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) |
---|
| 1435 | __copy_from_user(kBufP, (char *)iovP->iov_base, cnt); |
---|
| 1436 | else |
---|
| 1437 | if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt)) |
---|
| 1438 | { |
---|
| 1439 | EXIT_RC(0, EFAULT); |
---|
| 1440 | return EFAULT; |
---|
| 1441 | } |
---|
| 1442 | } |
---|
| 1443 | else |
---|
| 1444 | { |
---|
| 1445 | int spam; |
---|
| 1446 | /* The daemon needs to bypass access checks since copy to |
---|
| 1447 | * shared segment would inadvertantly fail. Copies to |
---|
| 1448 | * kernel address space also perform no validity check. |
---|
| 1449 | */ |
---|
| 1450 | if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) |
---|
| 1451 | ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt); |
---|
| 1452 | else |
---|
| 1453 | if (copy_to_user((char *)iovP->iov_base, kBufP, cnt)) |
---|
| 1454 | { |
---|
| 1455 | EXIT_RC(0, EFAULT); |
---|
| 1456 | return EFAULT; |
---|
| 1457 | } |
---|
| 1458 | } |
---|
| 1459 | |
---|
| 1460 | iovP->iov_base = (char *)iovP->iov_base + cnt; |
---|
| 1461 | iovP->iov_len -= cnt; |
---|
| 1462 | uioP->uio_resid -= cnt; |
---|
| 1463 | uioP->uio_offset += cnt; |
---|
| 1464 | #ifdef TRACE_IO_DATA |
---|
| 1465 | if (cnt >= sizeof(trcdata)) |
---|
| 1466 | memcpy(trcdata, origKBufP, sizeof(trcdata)); |
---|
| 1467 | else |
---|
| 1468 | { |
---|
| 1469 | memset(trcdata, 0xAA, sizeof(trcdata)); |
---|
| 1470 | memcpy(trcdata, origKBufP, cnt); |
---|
| 1471 | } |
---|
| 1472 | TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_039a, |
---|
| 1473 | "uiomove exit 1: rc %d data %08X %08X %08X %08X\n", |
---|
| 1474 | rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]); |
---|
| 1475 | #else |
---|
| 1476 | TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_039, |
---|
| 1477 | "uiomove exit 1: rc %d\n", |
---|
| 1478 | rc); |
---|
| 1479 | #endif |
---|
| 1480 | EXIT_RC(0, rc); |
---|
| 1481 | return rc; |
---|
| 1482 | } |
---|
| 1483 | while (nBytes > 0 && uioP->uio_resid && rc == 0) |
---|
| 1484 | { |
---|
| 1485 | if (uioP->uio_iovcnt <= 0) |
---|
| 1486 | { |
---|
| 1487 | EXIT_RC(0, ENOMEM); |
---|
| 1488 | return ENOMEM; |
---|
| 1489 | } |
---|
| 1490 | iovP = uioP->uio_iov; |
---|
| 1491 | cnt = iovP->iov_len; |
---|
| 1492 | if (cnt <= 0) |
---|
| 1493 | { |
---|
| 1494 | uioP->uio_iovcnt--; |
---|
| 1495 | uioP->uio_iov++; |
---|
| 1496 | uioP->uio_iovdcnt++; |
---|
| 1497 | continue; |
---|
| 1498 | } |
---|
| 1499 | if (cnt > nBytes) |
---|
| 1500 | cnt = nBytes; |
---|
| 1501 | |
---|
| 1502 | if (toKernel) |
---|
| 1503 | { |
---|
| 1504 | /* The daemon needs to bypass access checks since copy to |
---|
| 1505 | * shared segment would inadvertently fail. Copies to |
---|
| 1506 | * kernel address space also perform no validity check. |
---|
| 1507 | */ |
---|
| 1508 | if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) |
---|
| 1509 | ignore = __copy_from_user(kBufP, (char *)iovP->iov_base, cnt); |
---|
| 1510 | else |
---|
| 1511 | if (copy_from_user(kBufP, (char *)iovP->iov_base, cnt)) |
---|
| 1512 | { |
---|
| 1513 | EXIT_RC(0, EFAULT); |
---|
| 1514 | return EFAULT; |
---|
| 1515 | } |
---|
| 1516 | } |
---|
| 1517 | else |
---|
| 1518 | { |
---|
| 1519 | /* The daemon needs to bypass access checks since copy to |
---|
| 1521 | * shared segment would inadvertently fail. Copies to |
---|
| 1521 | * kernel address space also perform no validity check. |
---|
| 1522 | */ |
---|
| 1523 | if (PROCESS_GROUP(current) == DaemonPGrp || uioP->uio_segflg == UIO_SYSSPACE) |
---|
| 1524 | ignore = __copy_to_user((char *)iovP->iov_base, kBufP, cnt); |
---|
| 1525 | else |
---|
| 1526 | if (copy_to_user((char *)iovP->iov_base, kBufP, cnt)) |
---|
| 1527 | { |
---|
| 1528 | EXIT_RC(0, EFAULT); |
---|
| 1529 | return EFAULT; |
---|
| 1530 | } |
---|
| 1531 | } |
---|
| 1532 | iovP->iov_base = (char *)iovP->iov_base + cnt; |
---|
| 1533 | iovP->iov_len -= cnt; |
---|
| 1534 | uioP->uio_resid -= cnt; |
---|
| 1535 | uioP->uio_offset += cnt; |
---|
| 1536 | kBufP += cnt; |
---|
| 1537 | nBytes -= cnt; |
---|
| 1538 | } |
---|
| 1539 | #ifdef TRACE_IO_DATA |
---|
| 1540 | cnt = kBufP - origKBufP; |
---|
| 1541 | if (cnt >= sizeof(trcdata)) |
---|
| 1542 | memcpy(trcdata, origKBufP, sizeof(trcdata)); |
---|
| 1543 | else |
---|
| 1544 | { |
---|
| 1545 | memset(trcdata, 0xAA, sizeof(trcdata)); |
---|
| 1546 | memcpy(trcdata, origKBufP, cnt); |
---|
| 1547 | } |
---|
| 1548 | TRACE5(TRACE_FOPS, 7, TRCID_CXISYSTEM_041a, |
---|
| 1549 | "uiomove exit 2: rc %d data %08X %08X %08X %08X\n", |
---|
| 1550 | rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]); |
---|
| 1551 | #else |
---|
| 1552 | TRACE1(TRACE_FOPS, 7, TRCID_CXISYSTEM_041, |
---|
| 1553 | "uiomove exit 2: rc %d\n", |
---|
| 1554 | rc); |
---|
| 1555 | #endif |
---|
| 1556 | EXIT_RC(0, rc); |
---|
| 1557 | return rc; |
---|
| 1558 | } |
---|
| 1559 | |
---|
| 1560 | /* |
---|
| 1561 | Try to force some sanity checks at compile time |
---|
| 1562 | */ |
---|
| 1563 | /* TO DO: revise this to handle comparisons beyond equality/inequality */ |
---|
| 1564 | /* STATIC_DBGASSERT(sizeof(spinlock_t), SPINLOCK_T_SIZE); */ |
---|
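/* The usual way to get such a check with no run-time cost is the
 * negative-array-size trick; a minimal sketch follows (the macro name is
 * illustrative and is not the real STATIC_DBGASSERT definition):
 *
 *   #define STATIC_SIZE_ASSERT(name, expr) \
 *     typedef char name[(expr) ? 1 : -1]
 *
 *   STATIC_SIZE_ASSERT(spinlockSizeCheck, sizeof(spinlock_t) <= SPINLOCK_T_SIZE);
 *
 * If the expression is false the array size is negative and compilation
 * fails, which is the "sanity check at compile time" intended here. */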
| 1565 | |
---|
| 1566 | /* A routine to check that the definitions in our cxiTypes.h |
---|
| 1567 | * files are equivalent to the system definitions. The module |
---|
| 1568 | * should not load if it receives an error from this routine. |
---|
| 1569 | */ |
---|
| 1570 | int |
---|
| 1571 | cxiCheckTypes() |
---|
| 1572 | { |
---|
| 1573 | int rc = 0; |
---|
| 1574 | ENTER(0); |
---|
| 1575 | |
---|
| 1576 | /* Make sure cxiBlockingMutex_t fits in the space provided. If not, |
---|
| 1577 | the implementation of the cxiBlockingMutex... routines needs to |
---|
| 1578 | use the embedded space to record a pointer to kmalloc'ed space holding |
---|
| 1579 | the semaphore. */ |
---|
| 1580 | if (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE) |
---|
| 1581 | { |
---|
| 1582 | printk("cxiCheckTypes: semaphore %ld > GPFS_LINUX_SEM_SIZE %ld\n", |
---|
| 1583 | sizeof(struct semaphore), GPFS_LINUX_SEM_SIZE); |
---|
| 1584 | rc = 1; |
---|
| 1585 | } |
---|
| 1586 | |
---|
| 1587 | /* Size of spinlock_t is smaller for UP case with gcc 3.x, so just |
---|
| 1588 | ensure SPINLOCK_T_SIZE is large enough for both the UP and SMP case. */ |
---|
| 1589 | if (sizeof(spinlock_t) > SPINLOCK_T_SIZE) |
---|
| 1590 | { |
---|
| 1591 | printk("cxiCheckTypes: spinlock_t %ld > SPINLOCK_T_SIZE %ld\n", |
---|
| 1592 | sizeof(spinlock_t), SPINLOCK_T_SIZE); |
---|
| 1593 | rc = 2; |
---|
| 1594 | } |
---|
| 1595 | |
---|
| 1596 | /* Ensure that size of pid_t matches cxiThreadId (32-bits) */ |
---|
| 1597 | if (sizeof(pid_t) != sizeof(cxiThreadId)) |
---|
| 1598 | { |
---|
| 1599 | printk("cxiCheckTypes: pid_t %ld != cxiThreadId %ld\n", |
---|
| 1600 | sizeof(pid_t), sizeof(cxiThreadId)); |
---|
| 1601 | rc = 3; |
---|
| 1602 | } |
---|
| 1603 | |
---|
| 1604 | if (rc > 0) |
---|
| 1605 | TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES, |
---|
| 1606 | "cxiCheckTypes: system type mismatch on type number %d!\n", rc); |
---|
| 1607 | EXIT_RC(0, rc); |
---|
| 1608 | return rc; |
---|
| 1609 | } |
---|
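/* A minimal sketch of how a module-load path might consume this check;
 * the surrounding init code shown is illustrative, not the actual GPFS
 * entry point:
 *
 *   if (cxiCheckTypes() != 0)
 *     return -EINVAL;          // refuse to load on a type-size mismatch
 */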
| 1610 | |
---|
| 1611 | /* Routine to get current time of day in nanosecond format. |
---|
| 1612 | */ |
---|
| 1613 | int |
---|
| 1614 | cxiGetTOD(cxiTimeStruc_t *tsP) |
---|
| 1615 | { |
---|
| 1616 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1617 | struct timespec ts; |
---|
| 1618 | #else |
---|
| 1619 | struct timeval tv; |
---|
| 1620 | #endif |
---|
| 1621 | |
---|
| 1622 | ENTER(0); |
---|
| 1623 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1624 | ts = CURRENT_TIME; |
---|
| 1625 | tsP->tv_sec = ts.tv_sec; |
---|
| 1626 | tsP->tv_nsec = ts.tv_nsec; |
---|
| 1627 | #else |
---|
| 1628 | /* This call returns microseconds so we fudge it to nanoseconds */ |
---|
| 1629 | do_gettimeofday(&tv); |
---|
| 1630 | tsP->tv_sec = tv.tv_sec; |
---|
| 1631 | tsP->tv_nsec = tv.tv_usec * 1000; |
---|
| 1632 | #endif |
---|
| 1633 | |
---|
| 1634 | EXIT(0); |
---|
| 1635 | return 0; |
---|
| 1636 | } |
---|
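/* Typical use is simply:
 *
 *   cxiTimeStruc_t now;
 *   cxiGetTOD(&now);          // now.tv_sec / now.tv_nsec hold wall-clock time
 *
 * Note that on pre-2.6 kernels the tv_nsec value only has microsecond
 * granularity, as the conversion above indicates. */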
| 1637 | |
---|
| 1638 | Boolean |
---|
| 1639 | cxiIsNFSThread() |
---|
| 1640 | { |
---|
| 1641 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1642 | /* Note comparison against a multibyte character constant (not a string |
---|
| 1643 | constant). Order of characters in word is reversed due to little- |
---|
| 1644 | endian representation of integers. */ |
---|
| 1645 | if (* ((int*)&current->comm[0]) != 0x6473666e) // 'dsfn' |
---|
| 1646 | return false; |
---|
| 1647 | if (* ((char*)&current->comm[4]) == '\0') |
---|
| 1648 | return true; |
---|
| 1649 | return (* ((int*)&current->comm[2]) == 0x00346473); // '4ds' |
---|
| 1650 | # else |
---|
| 1651 | if ((strcmp(current->comm, "nfsd") == 0) || |
---|
| 1652 | (strcmp(current->comm, "nfsd4") == 0)) |
---|
| 1653 | return true; |
---|
| 1654 | return false; |
---|
| 1655 | # endif |
---|
| 1656 | } |
---|
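/* Worked example of the constant used above: for an "nfsd" thread the
 * first four bytes of current->comm are 'n' 'f' 's' 'd', i.e.
 * 0x6e 0x66 0x73 0x64 in memory.  Read as a little-endian 32-bit
 * integer that is 0x6473666e, which is why the comment shows 'dsfn'. */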
| 1657 | |
---|
| 1658 | Boolean |
---|
| 1659 | cxiIsLockdThread() |
---|
| 1660 | { |
---|
| 1661 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1662 | /* Note comparison against a multibyte character constant (not a string |
---|
| 1663 | constant). Order of characters in word is reversed due to little- |
---|
| 1664 | endian representation of integers. */ |
---|
| 1665 | if ((* ((int*)&current->comm[0]) != 0x6b636f6c) | // 'kcol' |
---|
| 1666 | (* ((int*)&current->comm[2]) != 0x00646b63)) // 'dkc' |
---|
| 1667 | return false; |
---|
| 1668 | return * ((char*)&current->comm[5]) == '\0'; |
---|
| 1669 | # else |
---|
| 1670 | return (strcmp(current->comm, "lockd") == 0); |
---|
| 1671 | # endif |
---|
| 1672 | } |
---|
| 1673 | |
---|
| 1674 | Boolean |
---|
| 1675 | cxiIsNFS4Thread() |
---|
| 1676 | { |
---|
| 1677 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1678 | /* Note comparison against a multibyte character constant (not a string |
---|
| 1679 | constant). Order of characters in word is reversed due to little- |
---|
| 1680 | endian representation of integers. */ |
---|
| 1681 | if ((* ((int*)&current->comm[0]) != 0x6473666e) | // 'dsfn' |
---|
| 1682 | (* ((int*)&current->comm[2]) != 0x00346473)) // '4ds' |
---|
| 1683 | return false; |
---|
| 1684 | return * ((char*)&current->comm[5]) == '\0'; |
---|
| 1685 | # else |
---|
| 1686 | return (strcmp(current->comm, "nfsd4") == 0); |
---|
| 1687 | # endif |
---|
| 1688 | } |
---|
| 1689 | |
---|
| 1690 | Boolean |
---|
| 1691 | cxiIsKupdateThread() |
---|
| 1692 | { |
---|
| 1693 | #if LINUX_KERNEL_VERSION >= 2060000 |
---|
| 1694 | /* In 2.6 pdflush replaced kupdated and bdflush from 2.4 */ |
---|
| 1695 | return current_is_pdflush(); |
---|
| 1696 | #else |
---|
| 1697 | return (strcmp(current->comm, "kupdated") == 0); |
---|
| 1698 | #endif |
---|
| 1699 | } |
---|
| 1700 | |
---|
| 1701 | #ifdef SMB_LOCKS |
---|
| 1702 | Boolean |
---|
| 1703 | cxiIsSambaOrLockdThread() |
---|
| 1704 | { |
---|
| 1705 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1706 | /* Note comparison against a multibyte character constant (not a string |
---|
| 1707 | constant). Order of characters in word is reversed due to little- |
---|
| 1708 | endian representation of integers. */ |
---|
| 1709 | Boolean rc = (((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms' |
---|
| 1710 | (* ((char*)&current->comm[4]) == '\0')) | |
---|
| 1711 | ((* ((int*)&current->comm[0]) == 0x6b636f6c) & // 'kcol' |
---|
| 1712 | (* ((int*)&current->comm[2]) == 0x00646b63))); // 'dkc' |
---|
| 1713 | return rc; |
---|
| 1714 | # else |
---|
| 1715 | return ((strcmp(current->comm, "smbd") == 0) | |
---|
| 1716 | (strcmp(current->comm, "lockd") == 0)); |
---|
| 1717 | # endif |
---|
| 1718 | } |
---|
| 1719 | |
---|
| 1720 | Boolean |
---|
| 1721 | cxiIsSambaThread() |
---|
| 1722 | { |
---|
| 1723 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1724 | /* Note comparison against a multibyte character constant (not a string |
---|
| 1725 | constant). Order of characters in word is reversed due to little- |
---|
| 1726 | endian representation of integers. */ |
---|
| 1727 | Boolean rc = ((* ((int*)&current->comm[0]) == 0x64626d73) & // 'dbms' |
---|
| 1728 | (* ((char*)&current->comm[4]) == '\0')); |
---|
| 1729 | return rc; |
---|
| 1730 | # else |
---|
| 1731 | return (strcmp(current->comm, "smbd") == 0); |
---|
| 1732 | # endif |
---|
| 1733 | } |
---|
| 1734 | #endif |
---|
| 1735 | |
---|
| 1736 | Boolean |
---|
| 1737 | cxiIsGPFSThread() |
---|
| 1738 | { |
---|
| 1739 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1740 | return (((* ((int*)&current->comm[0]) == 0x73666d6d) & // 'sfmm' |
---|
| 1741 | (* ((int*)&current->comm[2]) == 0x00647366))); // 'dsf' |
---|
| 1742 | # else |
---|
| 1743 | return (strcmp(current->comm, "mmfsd") == 0); |
---|
| 1744 | # endif |
---|
| 1745 | } |
---|
| 1746 | |
---|
| 1747 | Boolean |
---|
| 1748 | cxiIsKswapdThread() |
---|
| 1749 | { |
---|
| 1750 | #if LINUX_KERNEL_VERSION > 2060000 |
---|
| 1751 | /* On 2.6, there may be multiple kswapd processes, named kswapd0, kswapd1, |
---|
| 1752 | * etc. We don't have to depend on the process name to identify kswapd |
---|
| 1753 | * processes on 2.6, though; there's a better way. */ |
---|
| 1754 | return current_is_kswapd(); |
---|
| 1755 | #else |
---|
| 1756 | # if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__) |
---|
| 1757 | return ((* ((int*)&current->comm[0]) == 0x6177736b) & // 'awsk' |
---|
| 1758 | (* ((int*)&current->comm[3]) == 0x00647061)); // ' dpa' |
---|
| 1759 | # else |
---|
| 1760 | return (strcmp(current->comm, "kswapd") == 0); |
---|
| 1761 | # endif |
---|
| 1762 | #endif |
---|
| 1763 | } |
---|
| 1764 | |
---|
| 1765 | #ifdef INSTRUMENT_LOCKS |
---|
| 1766 | void InitBlockingMutexStats() |
---|
| 1767 | { |
---|
| 1768 | memset(BlockingMutexStatsTable, 0, sizeof(BlockingMutexStatsTable)); |
---|
| 1769 | } |
---|
| 1770 | #endif |
---|
| 1771 | |
---|
| 1772 | /* Initialize a cxiBlockingMutex_t. Instead of the DBGASSERT, this routine |
---|
| 1773 | should kmalloc a struct semaphore if bmSem is too small. */ |
---|
| 1774 | void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx) |
---|
| 1775 | { |
---|
| 1776 | ENTER(0); |
---|
| 1777 | DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE); |
---|
| 1778 | #ifdef INSTRUMENT_LOCKS |
---|
| 1779 | DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES); |
---|
| 1780 | #endif /* INSTRUMENT_LOCKS */ |
---|
| 1781 | |
---|
| 1782 | TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT, |
---|
| 1783 | "cxiBlockingMutexInit: mP 0x%lX idx %d\n", |
---|
| 1784 | mP, bmNameIdx); |
---|
| 1785 | init_MUTEX((struct semaphore *)mP->bmSem); |
---|
| 1786 | mP->bmOwnerP = NULL; |
---|
| 1787 | mP->lockNameIndex = bmNameIdx; |
---|
| 1788 | EXIT(0); |
---|
| 1789 | } |
---|
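/* A usage sketch of the lifecycle these routines implement (the lock
 * name index FOO_LOCK_IDX is illustrative only):
 *
 *   cxiBlockingMutex_t bm;
 *   cxiBlockingMutexInit(&bm, FOO_LOCK_IDX);
 *   cxiBlockingMutexAcquire(&bm);
 *   ...critical section; the owner must not re-acquire...
 *   cxiBlockingMutexRelease(&bm);
 *   cxiBlockingMutexTerm(&bm);
 */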
| 1790 | |
---|
| 1791 | |
---|
| 1792 | /* Enter critical section, blocking this thread if necessary. Mark this |
---|
| 1793 | thread as the owner of the mutex before returning. */ |
---|
| 1794 | void |
---|
| 1795 | REGPARMS cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP) |
---|
| 1796 | { |
---|
| 1797 | ENTER(1); |
---|
| 1798 | TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ, |
---|
| 1799 | "cxiBlockingMutexAcquire: about to acquire 0x%lX type %d " |
---|
| 1800 | "current 0x%lX currentOwner 0x%lX\n", |
---|
| 1801 | mP, mP->lockNameIndex, current, mP->bmOwnerP); |
---|
| 1802 | |
---|
| 1803 | DBGASSERTRC(mP->bmOwnerP != (char *)current, |
---|
| 1804 | PTR_TO_INT32(mP->bmOwnerP), PTR_TO_INT32(mP), 0); |
---|
| 1805 | |
---|
| 1806 | #ifdef INSTRUMENT_LOCKS |
---|
| 1807 | BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires += 1; |
---|
| 1808 | if (mP->bmOwnerP != NULL) |
---|
| 1809 | BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts += 1; |
---|
| 1810 | #endif |
---|
| 1811 | |
---|
| 1812 | down((struct semaphore *)mP->bmSem); |
---|
| 1813 | mP->bmOwnerP = (char *)current; |
---|
| 1814 | |
---|
| 1815 | TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT, |
---|
| 1816 | "cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP); |
---|
| 1817 | EXIT(1); |
---|
| 1818 | } |
---|
| 1819 | |
---|
| 1820 | |
---|
| 1821 | /* Leave critical section and awaken waiting threads */ |
---|
| 1822 | void |
---|
| 1823 | REGPARMS cxiBlockingMutexRelease(cxiBlockingMutex_t* mP) |
---|
| 1824 | { |
---|
| 1825 | ENTER(1); |
---|
| 1826 | TRACE4(TRACE_KLOCKL, 9, TRCID_BM_REL, |
---|
| 1827 | "cxiBlockingMutexRelease: about to release 0x%lX type %d " |
---|
| 1828 | "current 0x%lX currentOwner 0x%lX\n", |
---|
| 1829 | mP, mP->lockNameIndex,current, mP->bmOwnerP); |
---|
| 1830 | |
---|
| 1831 | if (mP->bmOwnerP == (char *)current) |
---|
| 1832 | { |
---|
| 1833 | mP->bmOwnerP = NULL; |
---|
| 1834 | up((struct semaphore *)mP->bmSem); |
---|
| 1835 | } |
---|
| 1836 | EXIT(1); |
---|
| 1837 | } |
---|
| 1838 | |
---|
| 1839 | /* Free resources associated with this cxiBlockingMutex_t in preparation |
---|
| 1840 | for freeing the storage it occupies */ |
---|
| 1841 | void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP) |
---|
| 1842 | { |
---|
| 1843 | ENTER(0); |
---|
| 1844 | TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM, |
---|
| 1845 | "cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex); |
---|
| 1846 | |
---|
| 1847 | /* Verify that mutex is not held */ |
---|
| 1848 | DBGASSERT(mP->bmOwnerP == NULL); |
---|
| 1849 | DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1); |
---|
| 1850 | EXIT(0); |
---|
| 1851 | } |
---|
| 1852 | |
---|
| 1853 | |
---|
| 1854 | /* Return true if a cxiBlockingMutex_t is held by the calling process */ |
---|
| 1855 | Boolean |
---|
| 1856 | cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP) |
---|
| 1857 | { |
---|
| 1858 | Boolean result; |
---|
| 1859 | char* ownerP; |
---|
| 1860 | cxiPid_t ownerPid; |
---|
| 1861 | |
---|
| 1862 | /* Cache bmOwnerP in case it changes to NULL */ |
---|
| 1863 | ENTER(0); |
---|
| 1864 | ownerP = mP->bmOwnerP; |
---|
| 1865 | if (ownerP == NULL) |
---|
| 1866 | result = false; |
---|
| 1867 | else |
---|
| 1868 | { |
---|
| 1869 | cxiThreadPtrToThreadId(ownerP, &ownerPid); |
---|
| 1870 | result = (current->pid == ownerPid); |
---|
| 1871 | } |
---|
| 1872 | TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017, |
---|
| 1873 | "cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n", |
---|
| 1874 | ownerP, result); |
---|
| 1875 | EXIT_RC(0, result); |
---|
| 1876 | return result; |
---|
| 1877 | } |
---|
| 1878 | |
---|
| 1879 | |
---|
| 1880 | /* Return true if a cxiBlockingMutex_t has one or more processes waiting |
---|
| 1881 | on it */ |
---|
| 1882 | Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP) |
---|
| 1883 | { |
---|
| 1884 | struct semaphore * semP = (struct semaphore *)mP->bmSem; |
---|
| 1885 | Boolean result; |
---|
| 1886 | |
---|
| 1887 | ENTER(0); |
---|
| 1888 | if ((void*)semP->wait.task_list.next != (void*)&semP->wait.task_list.next) |
---|
| 1889 | result = true; |
---|
| 1890 | else |
---|
| 1891 | result = false; |
---|
| 1892 | TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018, |
---|
| 1893 | "cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n", |
---|
| 1894 | mP, result); |
---|
| 1895 | EXIT_RC(0, result); |
---|
| 1896 | return result; |
---|
| 1897 | } |
---|
| 1898 | |
---|
| 1899 | |
---|
| 1900 | /* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or |
---|
| 1901 | cxiWaitEventBroadcastRC. Drop the associated cxiBlockingMutex_t |
---|
| 1902 | *mutexP while waiting, and reacquire it before returning. |
---|
| 1903 | If INTERRUPTIBLE is set in waitFlags, waits interruptibly; |
---|
| 1904 | otherwise, waits uninterruptibly. |
---|
| 1905 | Returns THREAD_INTERRUPTED if interrupted before being woken up, |
---|
| 1906 | THREAD_AWAKENED, if woken up by cxiWaitEventSignal or |
---|
| 1907 | cxiWaitEventBroadcast, or the result value passed to |
---|
| 1908 | cxiWaitEventWakeupResult, if woken up by cxiWaitEventWakeupResult. */ |
---|
| 1909 | int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP, |
---|
| 1910 | int waitFlags) |
---|
| 1911 | { |
---|
| 1912 | spinlock_t *lockP = (spinlock_t *)(weP->lword); |
---|
| 1913 | unsigned long flags; |
---|
| 1914 | cxiWaitElement_t waitElement; |
---|
| 1915 | int count = 0; |
---|
| 1916 | Boolean done; |
---|
| 1917 | |
---|
| 1918 | ENTER(0); |
---|
| 1919 | TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER, |
---|
| 1920 | "cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release " |
---|
| 1921 | "mutex 0x%lX \n", weP, waitFlags, mutexP); |
---|
| 1922 | |
---|
| 1923 | /* Verify that caller is holding the mutex */ |
---|
| 1924 | DBGASSERTRC(mutexP->bmOwnerP == (char *)current, |
---|
| 1925 | PTR_TO_INT32(mutexP->bmOwnerP), PTR_TO_INT32(mutexP), 0); |
---|
| 1926 | |
---|
| 1927 | /* initialize our wait element */ |
---|
| 1928 | init_waitqueue_head(&waitElement.qhead); |
---|
| 1929 | init_waitqueue_entry(&waitElement.qwaiter, current); |
---|
| 1930 | __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter); |
---|
| 1931 | waitElement.wakeupRC = 0; |
---|
| 1932 | |
---|
| 1933 | /* update our task state to not running any more */ |
---|
| 1934 | if (waitFlags & INTERRUPTIBLE) |
---|
| 1935 | current->state = TASK_INTERRUPTIBLE; |
---|
| 1936 | else |
---|
| 1937 | current->state = TASK_UNINTERRUPTIBLE; |
---|
| 1938 | |
---|
| 1939 | /* add our wait element to the end of the wait list */ |
---|
| 1940 | SPIN_LOCK_IRQ(lockP, flags); |
---|
| 1941 | |
---|
| 1942 | CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList); |
---|
| 1943 | |
---|
| 1944 | SPIN_UNLOCK_IRQ(lockP, flags); |
---|
| 1945 | |
---|
| 1946 | /* Release the mutex. Note: calling cxiBlockingMutexRelease here is |
---|
| 1947 | problematic, because it makes trace calls, which may block the current |
---|
| 1948 | process, which would overwrite the task state (current->state) we just |
---|
| 1949 | updated. A way around this would be to move out task state update to |
---|
| 1950 | after the call to cxiBlockingMutexRelease, but then, before calling |
---|
| 1951 | schedule(), we would have to re-acquire the wait-list lock and check |
---|
| 1952 | wakeupRC to see whether somebody has already woken us up since we |
---|
| 1953 | released the mutex. Since there is a trace at the top of this routine, |
---|
| 1954 | we don't need the one in cxiBlockingMutexRelease; hence, just do the |
---|
| 1955 | release right here. */ |
---|
| 1956 | mutexP->bmOwnerP = NULL; |
---|
| 1957 | up((struct semaphore *)mutexP->bmSem); |
---|
| 1958 | |
---|
| 1959 | again: |
---|
| 1960 | /* call the scheduler */ |
---|
| 1961 | schedule(); |
---|
| 1962 | |
---|
| 1963 | /* Remove ourself from the wait list ... except: |
---|
| 1964 | Even though we may enter uninterruptible sleep, this sleep can in |
---|
| 1965 | fact be interrupted in at least two scenarios: |
---|
| 1966 | 1) page_alloc code may call wakeup_kswapd(). This should be |
---|
| 1967 | a very rare event with the current code, since we make an effort |
---|
| 1968 | to avoid blocking kswapd. |
---|
| 1969 | 2) While signals are supposed to be ignored during uninterruptible |
---|
| 1970 | sleep, it turns out that some signals, e.g. SIGSEGV and SIGBUS, |
---|
| 1971 | cause us to wake up. It doesn't look like the signal has been |
---|
| 1972 | delivered yet, but sleep is interrupted. The signal will be |
---|
| 1973 | delivered later (probably when exiting kernel). |
---|
| 1974 | Our callers can't handle unexpected return from uninterruptible |
---|
| 1975 | sleep. In either of the two cases above, it should be safe to go |
---|
| 1976 | back to sleep and wait to be woken up properly. |
---|
| 1977 | */ |
---|
| 1978 | SPIN_LOCK_IRQ(lockP, flags); |
---|
| 1979 | |
---|
| 1980 | if (waitElement.wakeupRC == 0 && |
---|
| 1981 | !(waitFlags & INTERRUPTIBLE)) |
---|
| 1982 | { |
---|
| 1983 | TRACE3N(TRACE_KLOCKL, 1, TRCID_CXISYSTEM_EVENT_WAIT_INTERRUPTED, |
---|
| 1984 | "cxiWaitEventWait: interrupted weP 0x%lX mutexP 0x%lX rc %d\n", |
---|
| 1985 | weP, mutexP, waitElement.wakeupRC); |
---|
| 1986 | current->state = TASK_UNINTERRUPTIBLE; |
---|
| 1987 | done = false; |
---|
| 1988 | } |
---|
| 1989 | else |
---|
| 1990 | { |
---|
| 1991 | CXI_WAIT_LIST_REMOVE(&waitElement.waitList); |
---|
| 1992 | done = true; |
---|
| 1993 | } |
---|
| 1994 | |
---|
| 1995 | SPIN_UNLOCK_IRQ(lockP, flags); |
---|
| 1996 | |
---|
| 1997 | if (!done) |
---|
| 1998 | goto again; |
---|
| 1999 | |
---|
| 2000 | /* re-acquire the mutex */ |
---|
| 2001 | cxiBlockingMutexAcquire(mutexP); |
---|
| 2002 | |
---|
| 2003 | TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT, |
---|
| 2004 | "cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n", |
---|
| 2005 | weP, mutexP, waitElement.wakeupRC); |
---|
| 2006 | |
---|
| 2007 | /* A zero wakeup code means we were interrupted rather than woken up */ |
---|
| 2008 | EXIT(0); |
---|
| 2009 | if (waitElement.wakeupRC != 0) |
---|
| 2010 | return waitElement.wakeupRC; |
---|
| 2011 | else |
---|
| 2012 | return THREAD_INTERRUPTED; |
---|
| 2013 | } |
---|
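/* A sketch of the condition-wait pattern these routines support (the
 * names cond, bm, and we are illustrative):
 *
 *   cxiBlockingMutexAcquire(&bm);
 *   while (!cond)
 *     if (cxiWaitEventWait(&we, &bm, INTERRUPTIBLE) == THREAD_INTERRUPTED)
 *       break;                        // caller decides how to handle this
 *   cxiBlockingMutexRelease(&bm);
 *
 * and on the waking side, typically with the same mutex held:
 *
 *   cond = true;
 *   cxiWaitEventSignal(&we);
 */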
| 2014 | |
---|
| 2015 | /* Wake up one thread waiting on this cxiWaitEvent_t. Must not sleep */ |
---|
| 2016 | void |
---|
| 2017 | cxiWaitEventSignal(cxiWaitEvent_t* weP) |
---|
| 2018 | { |
---|
| 2019 | /* ENTER(0); */ |
---|
| 2020 | TRACE1N(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL, |
---|
| 2021 | "cxiWaitEventSignal: weP 0x%lX\n", weP); |
---|
| 2022 | |
---|
| 2023 | doWakeup(weP, wSignal, THREAD_AWAKENED); /* wake up one */ |
---|
| 2024 | /* EXIT(0); */ |
---|
| 2025 | } |
---|
| 2026 | |
---|
| 2027 | |
---|
| 2028 | /* Wake up one thread waiting on this cxiWaitEvent_t. This is the same as |
---|
| 2029 | cxiWaitEventSignal(), except this routine guarantees that multiple wake |
---|
| 2030 | up calls will each pick a different thread if more than one is waiting. */ |
---|
| 2031 | void |
---|
| 2032 | cxiWaitEventWakeupOne(cxiWaitEvent_t* weP) |
---|
| 2033 | { |
---|
| 2034 | ENTER(0); |
---|
| 2035 | TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE, |
---|
| 2036 | "cxiWaitEventWakeupOne: weP 0x%lX\n", weP); |
---|
| 2037 | |
---|
| 2038 | doWakeup(weP, wWakeOne, THREAD_AWAKENED); /* wake up one */ |
---|
| 2039 | EXIT(0); |
---|
| 2040 | } |
---|
| 2041 | |
---|
| 2042 | |
---|
| 2043 | /* Wake up all threads waiting on this cxiWaitEvent_t */ |
---|
| 2044 | void |
---|
| 2045 | cxiWaitEventBroadcast(cxiWaitEvent_t* weP) |
---|
| 2046 | { |
---|
| 2047 | ENTER(0); |
---|
| 2048 | TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST, |
---|
| 2049 | "cxiWaitEventBroadcastRC: weP 0x%lX\n", weP); |
---|
| 2050 | |
---|
| 2051 | doWakeup(weP, wBroadcast, THREAD_AWAKENED); /* wake up all */ |
---|
| 2052 | EXIT(0); |
---|
| 2053 | } |
---|
| 2054 | |
---|
| 2055 | |
---|
| 2056 | /* Wake up all threads waiting on this cxiWaitEvent_t and cause them to |
---|
| 2057 | return rc from their cxiWaitEventWait calls. */ |
---|
| 2058 | void |
---|
| 2059 | cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc) |
---|
| 2060 | { |
---|
| 2061 | ENTER(0); |
---|
| 2062 | TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC, |
---|
| 2063 | "cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc); |
---|
| 2064 | |
---|
| 2065 | doWakeup(weP, wBroadcast, rc); /* wake up all */ |
---|
| 2066 | EXIT_RC(0, rc); |
---|
| 2067 | } |
---|
| 2068 | |
---|
| 2069 | /* alloc big memory area */ |
---|
| 2070 | void * |
---|
| 2071 | cxiBigMalloc(int size) |
---|
| 2072 | { |
---|
| 2073 | void *ptr; |
---|
| 2074 | |
---|
| 2075 | ENTER(0); |
---|
| 2076 | ptr = vmalloc(size); |
---|
| 2077 | |
---|
| 2078 | #ifdef MALLOC_DEBUG |
---|
| 2079 | MallocDebugNew(ptr, size, 2); |
---|
| 2080 | #endif |
---|
| 2081 | |
---|
| 2082 | EXIT(0); |
---|
| 2083 | return ptr; |
---|
| 2084 | } |
---|
| 2085 | |
---|
| 2086 | /* free big memory area */ |
---|
| 2087 | void |
---|
| 2088 | cxiBigFree(char *ptr) |
---|
| 2089 | { |
---|
| 2090 | ENTER(0); |
---|
| 2091 | #ifdef MALLOC_DEBUG |
---|
| 2092 | MallocDebugDelete(ptr); |
---|
| 2093 | #endif |
---|
| 2094 | |
---|
| 2095 | EXIT(0); |
---|
| 2096 | vfree(ptr); |
---|
| 2097 | } |
---|
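/* Usage sketch: these are intended for large allocations (they use
 * vmalloc rather than kmalloc) and the two calls must be paired; the
 * size shown is illustrative:
 *
 *   char *bufP = cxiBigMalloc(256 * 1024);
 *   if (bufP != NULL)
 *   {
 *     ...use the buffer...
 *     cxiBigFree(bufP);
 *   }
 */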
| 2098 | |
---|
| 2099 | #ifdef SMB_LOCKS |
---|
| 2100 | /* Determine if current process has this file open */ |
---|
| 2101 | void * |
---|
| 2102 | cxiCheckOpen(struct cxiNode_t* cnP) |
---|
| 2103 | { |
---|
| 2104 | int count; |
---|
| 2105 | int i; |
---|
| 2106 | struct file** fdList; |
---|
| 2107 | struct file* fileP; |
---|
| 2108 | struct inode* inodeP; |
---|
| 2109 | |
---|
| 2110 | ENTER(0); |
---|
| 2111 | #if LINUX_KERNEL_VERSION >= 2061300 |
---|
| 2112 | count = current->files->fdt->max_fds; |
---|
| 2113 | fdList = current->files->fdt->fd; |
---|
| 2114 | #else |
---|
| 2115 | count = current->files->max_fds; |
---|
| 2116 | fdList = current->files->fd; |
---|
| 2117 | #endif |
---|
| 2118 | inodeP = GNP_TO_VP(cnP); |
---|
| 2119 | |
---|
| 2120 | TRACE3(TRACE_VNODE,9,TRCID_CXICHECKOPEN_ENTRY, |
---|
| 2121 | "cxiCheckOpen: entry. %d files in fd list. Checking for inode %d " |
---|
| 2122 | "at 0x%x", count, inodeP->i_ino, inodeP); |
---|
| 2123 | |
---|
| 2124 | for (i=0; i<count; i++) |
---|
| 2125 | { |
---|
| 2126 | fileP = fdList[i]; |
---|
| 2127 | |
---|
| 2128 | if (fileP) |
---|
| 2129 | { |
---|
| 2130 | if (fdList[i]->f_dentry->d_inode == inodeP) |
---|
| 2131 | { |
---|
| 2132 | TRACE1(TRACE_VNODE, 9,TRCID_CXICHECKOPEN_FOUND, |
---|
| 2133 | "cxiCheckOpen: found open file. vinfoP 0x%x", |
---|
| 2134 | fileP->private_data); |
---|
| 2135 | EXIT(0); |
---|
| 2136 | return fileP->private_data; |
---|
| 2137 | } |
---|
| 2138 | } |
---|
| 2139 | } |
---|
| 2140 | |
---|
| 2141 | EXIT(0); |
---|
| 2142 | return NULL; |
---|
| 2143 | } |
---|
| 2144 | |
---|
| 2145 | int cxiBreakOplock(void *breakArgP, int oplockNew) |
---|
| 2146 | { |
---|
| 2147 | /* On Linux, we use its kernel oplock support. The get_lease() |
---|
| 2148 | * call is the operation to revoke conflicting leases. |
---|
| 2149 | */ |
---|
| 2150 | int rc; |
---|
| 2151 | ENTER(0); |
---|
| 2152 | |
---|
| 2153 | /* O_NONBLOCK: prevents the thread from waiting for the lease return. |
---|
| 2154 | * In the case of a Samba thread, we only want to get EWOULDBLOCK |
---|
| 2155 | * back if the conflict is held within Samba itself. If a wait is |
---|
| 2156 | * needed, breakSMBOplock will invoke cxiWaitForBreak. |
---|
| 2157 | */ |
---|
| 2158 | |
---|
| 2159 | /* Linux op to revoke conflicting leases */ |
---|
| 2160 | rc = abs(REVOKE_LEASE((struct inode *)breakArgP, |
---|
| 2161 | (cxiIsSambaThread()? 0: O_NONBLOCK) | |
---|
| 2162 | ((oplockNew==smbOplockShared)? FMODE_READ: FMODE_WRITE))); |
---|
| 2163 | |
---|
| 2164 | TRACE3(TRACE_VNODE, 4,TRCID_CXIBREAKOPLOCK, |
---|
| 2165 | "cxiBreakOplock: exit rc %d inode 0x%lX oplock %d\n", |
---|
| 2166 | rc, breakArgP, oplockNew); |
---|
| 2167 | |
---|
| 2168 | EXIT(0); |
---|
| 2169 | return rc; |
---|
| 2170 | } |
---|
| 2171 | |
---|
| 2172 | DECLARE_WAIT_QUEUE_HEAD(oplock_break_queue); |
---|
| 2173 | |
---|
| 2174 | /* No initialization required on Linux */ |
---|
| 2175 | int cxiInitBreakQ() { return 0; } |
---|
| 2176 | |
---|
| 2177 | /* No initialization required on Linux */ |
---|
| 2178 | int cxiTermBreakQ() { return 0; } |
---|
| 2179 | |
---|
| 2180 | /* Send the notification that the oplock break completed */ |
---|
| 2181 | int cxiSendBreakMsg(void *ofP) |
---|
| 2182 | { |
---|
| 2183 | ENTER(0); |
---|
| 2184 | /* There is only one oplock_break_queue, and no means to pass the ofP back to |
---|
| 2185 | * the waiters. This will wake all of them up and they will recheck their |
---|
| 2186 | * oplock states and wait again if necessary (with a timeout). |
---|
| 2187 | */ |
---|
| 2188 | wake_up_interruptible(&oplock_break_queue); |
---|
| 2189 | |
---|
| 2190 | TRACE1(TRACE_SMB, 3, TRCID_SEND_BREAK, "cxiSendBreakMsg: ofP 0x%lX\n", ofP); |
---|
| 2191 | EXIT(0); |
---|
| 2192 | return 0; |
---|
| 2193 | } |
---|
| 2194 | |
---|
| 2195 | /* Suspend the caller until either the oplock break completes, or the timeout |
---|
| 2196 | * is reached. |
---|
| 2197 | */ |
---|
| 2198 | int cxiWaitForBreak(void *fileArgP, int oplockCurrent, int timeoutSeconds) |
---|
| 2199 | { |
---|
| 2200 | DECLARE_WAITQUEUE(wait, current); |
---|
| 2201 | signed long timeout; |
---|
| 2202 | |
---|
| 2203 | ENTER(0); |
---|
| 2204 | TRACE3(TRACE_SMB, 5, TRCID_BREAKWAIT, |
---|
| 2205 | "cxiWaitForBreak: file 0x%lX, oplockCurrent %d timeoutSeconds %d\n", |
---|
| 2206 | fileArgP, oplockCurrent, timeoutSeconds); |
---|
| 2207 | |
---|
| 2208 | add_wait_queue(&oplock_break_queue, &wait); |
---|
| 2209 | timeout = timeoutSeconds * HZ; |
---|
| 2210 | while (timeout > 0) { |
---|
| 2211 | set_current_state(TASK_INTERRUPTIBLE); |
---|
| 2212 | /* Check whether the oplock has been released or downgraded */ |
---|
| 2213 | if (gpfs_ops.SMBGetOplockState(fileArgP) < oplockCurrent) |
---|
| 2214 | break; |
---|
| 2215 | timeout = schedule_timeout(timeout); |
---|
| 2216 | } |
---|
| 2217 | set_current_state(TASK_RUNNING); |
---|
| 2218 | remove_wait_queue(&oplock_break_queue, &wait); |
---|
| 2219 | |
---|
| 2220 | TRACE0(TRACE_SMB, 5, TRCID_BREAKWAIT_EXIT, |
---|
| 2221 | "cxiWaitForBreak exit\n"); |
---|
| 2222 | |
---|
| 2223 | EXIT(0); |
---|
| 2224 | return 0; |
---|
| 2225 | } |
---|
| 2226 | #endif |
---|
| 2227 | |
---|
| 2228 | |
---|
| 2229 | /* Get the address of the first byte not addressible by processes */ |
---|
| 2230 | UIntPtr cxiGetKernelBoundary() |
---|
| 2231 | { |
---|
| 2232 | return GPFS_KERNEL_OFFSET; |
---|
| 2233 | } |
---|
| 2234 | |
---|
| 2235 | |
---|
| 2236 | /* Return true if this process holds the big kernel lock (BKL) */ |
---|
| 2237 | Boolean cxiHoldsBKL() |
---|
| 2238 | { |
---|
| 2239 | return current->lock_depth >= 0; |
---|
| 2240 | } |
---|
| 2241 | |
---|
| 2242 | |
---|
| 2243 | /* Tell the OS that this thread is involved in handling VM page-out |
---|
| 2244 | requests and should not be blocked waiting for page allocation. |
---|
| 2245 | Return true if successful. */ |
---|
| 2246 | Boolean cxiSetPageoutThread() |
---|
| 2247 | { |
---|
| 2248 | if (current->flags & PF_MEMALLOC) |
---|
| 2249 | return false; |
---|
| 2250 | current->flags |= PF_MEMALLOC; |
---|
| 2251 | return true; |
---|
| 2252 | } |
---|
| 2253 | |
---|
| 2254 | |
---|
| 2255 | /* Tell the OS that this thread is no longer involved in handling VM |
---|
| 2256 | page-out requests. */ |
---|
| 2257 | void cxiClearPageoutThread() |
---|
| 2258 | { |
---|
| 2259 | current->flags &= ~PF_MEMALLOC; |
---|
| 2260 | } |
---|
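/* Pairing sketch: because cxiSetPageoutThread() returns false when
 * PF_MEMALLOC was already set, a caller should only clear the flag if
 * it was the one that set it:
 *
 *   Boolean setHere = cxiSetPageoutThread();
 *   ...issue page-out work...
 *   if (setHere)
 *     cxiClearPageoutThread();
 */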
| 2261 | |
---|
| 2262 | |
---|
| 2263 | /* Yield the CPU to allow other processes to run */ |
---|
| 2264 | void |
---|
| 2265 | cxiYield() |
---|
| 2266 | { |
---|
| 2267 | ENTER(0); |
---|
| 2268 | schedule(); |
---|
| 2269 | EXIT(0); |
---|
| 2270 | } |
---|
| 2271 | |
---|
| 2272 | /* Linux filldir has changed signatures depending on kernel level. |
---|
| 2273 | * We always pass a 64bit offset from the GPFS layer. |
---|
| 2274 | */ |
---|
| 2275 | int |
---|
| 2276 | cxiFillDir(void *vargP, const char *nameP, int namelen, |
---|
| 2277 | offset_t offset, ino_t ino) |
---|
| 2278 | { |
---|
| 2279 | int result; |
---|
| 2280 | cxiFillDirArg_t *fillDirArgP = (cxiFillDirArg_t *)vargP; |
---|
| 2281 | filldir_t fnP = (filldir_t)fillDirArgP->fnP; |
---|
| 2282 | ENTER(0); |
---|
| 2283 | |
---|
| 2284 | result = (*fnP)(fillDirArgP->argP, nameP, namelen, |
---|
| 2285 | (loff_t)offset, ino, 0 /* DT_UNKNOWN */); |
---|
| 2286 | EXIT_RC(0, result); |
---|
| 2287 | return result; |
---|
| 2288 | } |
---|
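/* A sketch of how the wrapper is driven from a readdir path (local
 * names are illustrative; the real caller lives above the cxi layer):
 *
 *   cxiFillDirArg_t fillDirArg;
 *   fillDirArg.fnP  = (void *)filldir;    // callback supplied by the kernel
 *   fillDirArg.argP = dirent;             // opaque cookie for that callback
 *   rc = cxiFillDir(&fillDirArg, nameP, namelen, offset, ino);
 *   // a non-zero rc means the caller's buffer is full; stop iterating
 */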
| 2289 | |
---|
| 2290 | #ifdef DISK_LEASE_DMS |
---|
| 2291 | |
---|
| 2292 | static struct timer_list DMSTimer[MAX_DMS_INDEX]; |
---|
| 2293 | static int (*DMSgetNIOsInProgressP)(int); |
---|
| 2294 | |
---|
| 2295 | #define PANIC_FOR_REAL 1 |
---|
| 2296 | |
---|
| 2297 | static void cxiDMSExpired(unsigned long data) |
---|
| 2298 | { |
---|
| 2299 | int idx = data; |
---|
| 2300 | int nIOs = DMSgetNIOsInProgressP(idx); |
---|
| 2301 | /* ENTER(0); */ |
---|
| 2302 | /* This code is executed on the interrupt level -- can't use tracing */ |
---|
| 2303 | printk("GPFS Deadman Switch timer [%d] has expired; IOs in progress: %d\n", |
---|
| 2304 | idx, nIOs); |
---|
| 2305 | #ifdef PANIC_FOR_REAL |
---|
| 2306 | if (nIOs != 0) |
---|
| 2307 | panic("GPFS Deadman Switch timer has expired, and there are still" |
---|
| 2308 | " %d outstanding I/O requests\n", nIOs); |
---|
| 2309 | #endif |
---|
| 2310 | } |
---|
| 2311 | |
---|
| 2312 | /* |
---|
| 2313 | Start dead man switch, with the timeout specified by the delay |
---|
| 2314 | argument (in seconds). |
---|
| 2315 | */ |
---|
| 2316 | void cxiStartDMS(int idx, int delay, int (*funcP)(int)) |
---|
| 2317 | { |
---|
| 2318 | unsigned long njiffies = delay * HZ; |
---|
| 2319 | |
---|
| 2320 | /* Only allow the daemon or other root users to make this kernel call */ |
---|
| 2321 | if (!cxiIsSuperUser()) |
---|
| 2322 | return; |
---|
| 2323 | ENTER(0); |
---|
| 2324 | |
---|
| 2325 | /* There can be only one timer active at any given moment */ |
---|
| 2326 | if (timer_pending(&DMSTimer[idx])) |
---|
| 2327 | del_timer(&DMSTimer[idx]); |
---|
| 2328 | |
---|
| 2329 | init_timer(&DMSTimer[idx]); |
---|
| 2330 | DMSTimer[idx].expires = jiffies + njiffies; |
---|
| 2331 | DMSTimer[idx].function = cxiDMSExpired; |
---|
| 2332 | DMSTimer[idx].data = idx; |
---|
| 2333 | /* save the pointer to nIOsInProgress to a static var */ |
---|
| 2334 | DMSgetNIOsInProgressP = funcP; |
---|
| 2335 | add_timer(&DMSTimer[idx]); |
---|
| 2336 | TRACE3(TRACE_DLEASE, 2, TRCID_DMS_STARTED, |
---|
| 2337 | "DMS timer [%d] started, delay %d, time %d\n", |
---|
| 2338 | idx, delay, jiffies/HZ); |
---|
| 2339 | EXIT(0); |
---|
| 2340 | } |
---|
| 2341 | |
---|
| 2342 | void cxiStopDMS(int idx) |
---|
| 2343 | { |
---|
| 2344 | /* Only allow the daemon or other root users to make this kernel call */ |
---|
| 2345 | if (!cxiIsSuperUser()) |
---|
| 2346 | return; |
---|
| 2347 | ENTER(0); |
---|
| 2348 | |
---|
| 2349 | if (timer_pending(&DMSTimer[idx])) |
---|
| 2350 | del_timer(&DMSTimer[idx]); |
---|
| 2351 | TRACE2(TRACE_DLEASE, 2, TRCID_DMS_STOPPED, |
---|
| 2352 | "DMS timer [%d] stopped, time %d\n", idx, jiffies/HZ); |
---|
| 2353 | EXIT(0); |
---|
| 2354 | } |
---|
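/* Usage sketch: the daemon arms the dead man switch for one lease
 * period and disarms it when the disk lease is renewed (the index, delay,
 * and callback names are illustrative):
 *
 *   cxiStartDMS(0, leaseDurationSeconds, getNIOsInProgress);
 *   ...renew the lease before the timer fires...
 *   cxiStopDMS(0);
 *
 * If the timer does fire while I/Os are still in flight, cxiDMSExpired()
 * panics the node rather than let those I/Os complete without a valid
 * lease. */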
| 2355 | |
---|
| 2356 | /* dummy init routine. Since on Linux the timer is |
---|
| 2357 | stored in a static memory, there's nothing to be done |
---|
| 2358 | */ |
---|
| 2359 | int cxiInitDMS(void) |
---|
| 2360 | { |
---|
| 2361 | return 0; |
---|
| 2362 | } |
---|
| 2363 | |
---|
| 2364 | void cxiShutdownDMS(void) |
---|
| 2365 | { |
---|
| 2366 | int i; |
---|
| 2367 | |
---|
| 2368 | ENTER(0); |
---|
| 2369 | for (i = 0; i < MAX_DMS_INDEX; i++) |
---|
| 2370 | cxiStopDMS(i); |
---|
| 2371 | EXIT(0); |
---|
| 2372 | } |
---|
| 2373 | |
---|
| 2374 | #endif /* DISK_LEASE_DMS */ |
---|
| 2375 | |
---|
| 2376 | void cxiSetBit(unsigned long *flagP, int flag_bit) |
---|
| 2377 | { |
---|
| 2378 | set_bit(flag_bit,flagP); |
---|
| 2379 | } |
---|
| 2380 | void cxiClearBit(unsigned long *flagP, int flag_bit) |
---|
| 2381 | { |
---|
| 2382 | clear_bit(flag_bit,flagP); |
---|
| 2383 | } |
---|
| 2384 | Boolean cxiTestBit(unsigned long *flagP, int flag_bit) |
---|
| 2385 | { |
---|
| 2386 | return test_bit(flag_bit,flagP); |
---|
| 2387 | } |
---|
| 2388 | |
---|
| 2389 | /* In order to setup our termination callback routine (gpfs_f_cleanup) |
---|
| 2390 | * we create a dummy file and add it to our file table. Then, upon |
---|
| 2391 | * process termination, the release file operation will be called in |
---|
| 2392 | * order to close the file. The only operation we define for this |
---|
| 2393 | * dummy file is release (gpfs_f_cleanup). |
---|
| 2394 | */ |
---|
| 2395 | int |
---|
| 2396 | cxiRegisterCleanup() |
---|
| 2397 | { |
---|
| 2398 | int code = 0, rc = 0; |
---|
| 2399 | struct inode *iP = NULL; |
---|
| 2400 | struct file *fileP = NULL; |
---|
| 2401 | struct dentry *dentryP = NULL; |
---|
| 2402 | extern int cleanupFD; |
---|
| 2403 | extern struct super_block *shutdownSuperP; |
---|
| 2404 | |
---|
| 2405 | /* We record the daemon's process group because certain |
---|
| 2406 | * checks on cxiCopyIn/cxiCopyOut are bypassed for the daemon. |
---|
| 2407 | */ |
---|
| 2408 | ENTER(0); |
---|
| 2409 | DaemonPGrp = PROCESS_GROUP(current); |
---|
| 2410 | |
---|
| 2411 | /* Make sure we only create one file */ |
---|
| 2412 | if (cleanupFD) |
---|
| 2413 | { |
---|
| 2414 | EXIT_RC(0, EEXIST); |
---|
| 2415 | return EEXIST; |
---|
| 2416 | } |
---|
| 2417 | |
---|
| 2418 | DBGASSERT(shutdownSuperP != NULL); |
---|
| 2419 | |
---|
| 2420 | /* Allocate an inode struct */ |
---|
| 2421 | iP = NEW_INODE(shutdownSuperP); |
---|
| 2422 | if (!iP) |
---|
| 2423 | { |
---|
| 2424 | code = 1; |
---|
| 2425 | rc = ENOMEM; |
---|
| 2426 | goto xerror; |
---|
| 2427 | } |
---|
| 2428 | iP->i_mode = S_IFREG; |
---|
| 2429 | |
---|
| 2430 | /* Allocate an available file descriptor */ |
---|
| 2431 | cleanupFD = get_unused_fd(); |
---|
| 2432 | if (cleanupFD < 0) |
---|
| 2433 | { |
---|
| 2434 | code = 2; |
---|
| 2435 | rc = ENFILE; |
---|
| 2436 | goto xerror; |
---|
| 2437 | } |
---|
| 2438 | |
---|
| 2439 | /* Allocate a file struct */ |
---|
| 2440 | fileP = get_empty_filp(); |
---|
| 2441 | if (!fileP) |
---|
| 2442 | { |
---|
| 2443 | code = 3; |
---|
| 2444 | rc = ENFILE; |
---|
| 2445 | goto xerror; |
---|
| 2446 | } |
---|
| 2447 | |
---|
| 2448 | /* Allocate a dentry struct */ |
---|
| 2449 | dentryP = dget(d_alloc_root(iP)); |
---|
| 2450 | if (!dentryP) |
---|
| 2451 | { |
---|
| 2452 | code = 4; |
---|
| 2453 | rc = ENOMEM; |
---|
| 2454 | goto xerror; |
---|
| 2455 | } |
---|
| 2456 | |
---|
| 2457 | /* Initialize and chain our file structure */ |
---|
| 2458 | fileP->f_dentry = dentryP; |
---|
| 2459 | fileP->f_op = &gpfs_cleanup_fops; |
---|
| 2460 | fileP->f_flags = O_RDONLY; |
---|
| 2461 | atomic_set(&fileP->f_count, 1); |
---|
| 2462 | |
---|
| 2463 | /* Just chain it on the current root mount. When |
---|
| 2464 | * the file is closed its fput() will decrement |
---|
| 2465 | * the mount count (hence the mntget here) |
---|
| 2466 | */ |
---|
| 2467 | fileP->f_vfsmnt = mntget(current->fs->rootmnt); |
---|
| 2468 | |
---|
| 2469 | /* Install the descriptor so it gets "closed" upon our termination */ |
---|
| 2470 | fd_install(cleanupFD, fileP); |
---|
| 2471 | |
---|
| 2472 | /* Set FD_CLOEXEC so that forked processes (like mmfsup.scr) do not |
---|
| 2473 | * inherit this descriptor. We want the cleanup routine to be run |
---|
| 2474 | * when the last mmfsd process terminates. |
---|
| 2475 | */ |
---|
| 2476 | #if LINUX_KERNEL_VERSION >= 2061300 |
---|
| 2477 | FD_SET(cleanupFD, current->files->fdt->close_on_exec); |
---|
| 2478 | #else |
---|
| 2479 | FD_SET(cleanupFD, current->files->close_on_exec); |
---|
| 2480 | #endif |
---|
| 2481 | /* Once the descriptor for this dummy file is added to our file table, |
---|
| 2482 | * it is inherited by all the processes of the daemon. As each |
---|
| 2483 | * terminates, the files->count is decremented and on the last process |
---|
| 2484 | * termination all the descriptors will be closed by filp_close. |
---|
| 2485 | * |
---|
| 2486 | * The one catch here is that our file table is inherited by the |
---|
| 2487 | * kernel threads we start as well as user processes. This would |
---|
| 2488 | * cause a problem in that daemon termination does not include these |
---|
| 2489 | * kernel threads which aren't killed until restart (and therefore |
---|
| 2490 | * the file is never closed). In order for our operation to be |
---|
| 2491 | * driven at daemon termination, we must remove the file table from |
---|
| 2492 | * these kernel threads. This is done in via cxiReparent() by |
---|
| 2493 | * the mmap pager kproc. |
---|
| 2494 | */ |
---|
| 2495 | |
---|
| 2496 | xerror: |
---|
| 2497 | TRACE4(TRACE_VNODE, 1, TRCID_CXIREGISTERCLEANUP_EXIT, |
---|
| 2498 | "cxiRegisterCleanup: fd %d iP %X rc %d code %d\n", |
---|
| 2499 | cleanupFD, iP, rc, code); |
---|
| 2500 | |
---|
| 2501 | if (rc) |
---|
| 2502 | { |
---|
| 2503 | if (dentryP) |
---|
| 2504 | dput(dentryP); |
---|
| 2505 | |
---|
| 2506 | if (cleanupFD) |
---|
| 2507 | put_unused_fd(cleanupFD); |
---|
| 2508 | |
---|
| 2509 | if (fileP) |
---|
| 2510 | #if LINUX_KERNEL_VERSION > 2060900 |
---|
| 2511 | fput(fileP); |
---|
| 2512 | #else |
---|
| 2513 | put_filp(fileP); |
---|
| 2514 | #endif |
---|
| 2515 | |
---|
| 2516 | if (iP) |
---|
| 2517 | iput(iP); |
---|
| 2518 | |
---|
| 2519 | cleanupFD = 0; |
---|
| 2520 | } |
---|
| 2521 | |
---|
| 2522 | EXIT_RC(0, rc); |
---|
| 2523 | return rc; |
---|
| 2524 | } |
---|
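/* For reference, the mechanism above relies on the dummy file having a
 * single file operation, its release handler; a sketch of the shape of
 * that table (the real gpfs_cleanup_fops is defined elsewhere):
 *
 *   static struct file_operations gpfs_cleanup_fops =
 *   {
 *     .release = gpfs_f_cleanup,  // runs when the last daemon process exits
 *   };
 */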
| 2525 | |
---|
| 2526 | #ifdef NFS4_ACL |
---|
| 2527 | /* Linux routines to be called when processing NFSv4 audit/alarm ACL entries */ |
---|
| 2528 | int cxiAuditWrite(int numargs, ...) { return ENOSYS; } |
---|
| 2529 | #endif /* NFS4_ACL */ |
---|
| 2530 | |
---|
| 2531 | /* Currently no OS specific VFS initialization for Linux */ |
---|
| 2532 | int |
---|
| 2533 | cxiInitVFS(int vfsType) |
---|
| 2534 | { |
---|
| 2535 | return 0; |
---|
| 2536 | } |
---|
| 2537 | |
---|
| 2538 | UIntPtr |
---|
| 2539 | cxiGetKernelStackSize() |
---|
| 2540 | { |
---|
| 2541 | return (UIntPtr)THREAD_SIZE; |
---|
| 2542 | } |
---|
| 2543 | |
---|
| 2544 | #if defined(DMAPI) || defined(SANERGY) |
---|
| 2545 | |
---|
| 2546 | void cxiPathRel(void *ndP) |
---|
| 2547 | { |
---|
| 2548 | DBGASSERT( ndP != NULL); |
---|
| 2549 | path_release( (struct nameidata *) ndP); |
---|
| 2550 | cxiFreeUnpinned(ndP); |
---|
| 2551 | } |
---|
| 2552 | |
---|
| 2553 | int |
---|
| 2554 | cxiPathToVfsP(void **privVfsPP, char *kpathname, void **ndPP, void **cnPP, |
---|
| 2555 | Boolean traverseLink) |
---|
| 2556 | { |
---|
| 2557 | struct gpfsVfsData_t *privVfsP = NULL; |
---|
| 2558 | struct nameidata *ndP; |
---|
| 2559 | struct inode * iP; |
---|
| 2560 | cxiNode_t *cnP; |
---|
| 2561 | int rc = 0; |
---|
| 2562 | Boolean rel = false; |
---|
| 2563 | int code = 0; |
---|
| 2564 | *ndPP = NULL; |
---|
| 2565 | *privVfsPP = NULL; |
---|
| 2566 | |
---|
| 2567 | ENTER(0); |
---|
| 2568 | if (kpathname == NULL) |
---|
| 2569 | { |
---|
| 2570 | code = 1; |
---|
| 2571 | rc = EINVAL; |
---|
| 2572 | goto xerror; |
---|
| 2573 | } |
---|
| 2574 | |
---|
| 2575 | ndP = (struct nameidata *)cxiMallocUnpinned(sizeof(struct nameidata)); |
---|
| 2576 | if (ndP == NULL) |
---|
| 2577 | { |
---|
| 2578 | code = 2; |
---|
| 2579 | rc = ENOMEM; |
---|
| 2580 | goto xerror; |
---|
| 2581 | } |
---|
| 2582 | |
---|
| 2583 | /* For DMAPI, this is called by dm_path_to_handle or dm_path_to_fshandle. |
---|
| 2584 | * According to dmapi documentation, we should return the symbolic link |
---|
| 2585 | * itself instead of the object that link references. |
---|
| 2586 | * So here we need to use the function which does not traverse the link. */ |
---|
| 2587 | if (!traverseLink) |
---|
| 2588 | rc = user_path_walk_link(kpathname, ndP); |
---|
| 2589 | else |
---|
| 2590 | rc = user_path_walk(kpathname, ndP); |
---|
| 2591 | |
---|
| 2592 | if (rc) |
---|
| 2593 | { |
---|
| 2594 | rc = -rc; |
---|
| 2595 | code = 3; |
---|
| 2596 | goto xerror; |
---|
| 2597 | } |
---|
| 2598 | |
---|
| 2599 | rel = true; |
---|
| 2600 | iP = ndP->dentry->d_inode; |
---|
| 2601 | DBGASSERT(iP != NULL); |
---|
| 2602 | if (!GPFS_TYPE(iP)) |
---|
| 2603 | { |
---|
| 2604 | code = 4; |
---|
| 2605 | rc = EINVAL; |
---|
| 2606 | goto xerror; |
---|
| 2607 | } |
---|
| 2608 | |
---|
| 2609 | privVfsP = VP_TO_PVP(iP); |
---|
| 2610 | |
---|
| 2611 | if (privVfsP == NULL) |
---|
| 2612 | { |
---|
| 2613 | code = 5; |
---|
| 2614 | rc = ENOENT; |
---|
| 2615 | } |
---|
| 2616 | cnP = VP_TO_CNP(iP); |
---|
| 2617 | *privVfsPP = (void *)privVfsP; |
---|
| 2618 | *ndPP = (void *)ndP; |
---|
| 2619 | if (cnPP != NULL) |
---|
| 2620 | *cnPP = (void *)cnP; |
---|
| 2621 | |
---|
| 2622 | xerror: |
---|
| 2623 | if (rc && ndP) |
---|
| 2624 | { |
---|
| 2625 | if (rel) |
---|
| 2626 | cxiPathRel(ndP); |
---|
| 2627 | else |
---|
| 2628 | cxiFreeUnpinned(ndP); |
---|
| 2629 | } |
---|
| 2630 | EXIT_RC(0, rc); |
---|
| 2631 | return rc; |
---|
| 2632 | } |
---|
| 2633 | |
---|
| 2634 | void |
---|
| 2635 | cxiSetCred(void *eCredPP) |
---|
| 2636 | { |
---|
| 2637 | ext_cred_t *eCredP = (ext_cred_t *)eCredPP; |
---|
| 2638 | setCred(eCredP); |
---|
| 2639 | } |
---|
| 2640 | |
---|
| 2641 | #endif /* DMAPI or SANERGY */ |
---|
| 2642 | |
---|
| 2643 | |
---|
| 2644 | #ifdef KSTACK_CHECK |
---|
| 2645 | /* Kernel stack checking: for each active thread that is making |
---|
| 2646 | subroutine calls in the kernel, allocate a stack_history_t. Within |
---|
| 2647 | each stack_history_t, create a frame_desc_t for each level of |
---|
| 2648 | subroutine call. Two lists of frame_desc_t's are maintained: one for |
---|
| 2649 | the current call stack, and one for the deepest call stack seen so |
---|
| 2650 | far for this thread. Upon exit from the lowest-level routine, check |
---|
| 2651 | whether the maximum stack depth threshhold has been exceeded. If it |
---|
| 2652 | has, print the traceback of the maximum stack usage. Keep hashes of |
---|
| 2653 | the tracebacks printed to avoid printing the same traceback more than |
---|
| 2654 | once. Since cxiTraceExit is not called for every routine exit, |
---|
| 2655 | maintenance of call chains is not exact; a routine entry with |
---|
| 2656 | stackUsed less than the current entry implies return of the previous |
---|
| 2657 | routine. |
---|
| 2658 | |
---|
| 2659 | Note that these routines cannot call any other routine that has |
---|
| 2660 | ENTER/EXIT macros inside of it, to avoid recursion. */ |
---|
| 2661 | |
---|
| 2662 | /* Maximum size of a stack frame before it is considered large enough |
---|
| 2663 | to complain about */ |
---|
| 2664 | #define STACK_LIMIT_WARNING (THREAD_SIZE - (THREAD_SIZE/3) ) |
---|
| 2665 | |
---|
| 2666 | /* Description of one level of a call stack */ |
---|
| 2667 | typedef struct frame_desc |
---|
| 2668 | { |
---|
| 2669 | /* Function name and file name containing the function */ |
---|
| 2670 | const char * fdFuncNameP; |
---|
| 2671 | const char * fdFileNameP; |
---|
| 2672 | |
---|
| 2673 | /* Pointer to frame_desc of caller, or NULL if this is the first |
---|
| 2674 | frame. Also used to link free frame descriptors together on the |
---|
| 2675 | shFreeHeadP free list. */ |
---|
| 2676 | struct frame_desc * fdCallerP; |
---|
| 2677 | |
---|
| 2678 | /* Line number near the beginning of fdFuncNameP */ |
---|
| 2679 | int fdLineNum; |
---|
| 2680 | |
---|
| 2681 | /* Total stack usage up to and including this routine */ |
---|
| 2682 | int fdStackUsed; |
---|
| 2683 | |
---|
| 2684 | /* Reference count for this frame_desc_t. Can be 2 if this descriptor |
---|
| 2685 | is reachable from both shCurrentP and shMaxP. */ |
---|
| 2686 | int fdRef; |
---|
| 2687 | } frame_desc_t; |
---|
| 2688 | |
---|
| 2689 | |
---|
| 2690 | /* Each stack_history is only used by one thread, so no locking is |
---|
| 2691 | needed within a stack_history. This is allocated as a single page. |
---|
| 2692 | */ |
---|
| 2693 | typedef struct stack_history |
---|
| 2694 | { |
---|
| 2695 | /* ID of thread to which this stack_history_t belongs */ |
---|
| 2696 | cxiThreadId shThreadId; |
---|
| 2697 | |
---|
| 2698 | /* Bucket index in historyHash that points to this stack_history_t, |
---|
| 2699 | or -1 if this stack_history_t is on an overflow list */ |
---|
| 2700 | int shBucketNum; |
---|
| 2701 | |
---|
| 2702 | /* Next stack_history_t in same hash overflow list or on free list */ |
---|
| 2703 | struct stack_history * shNextP; |
---|
| 2704 | |
---|
| 2705 | /* Pointer to the frame descriptor for the routine that most recently |
---|
| 2706 | called fdEnter without a matching fdExit. Following the fdCallerP |
---|
| 2707 | pointers through these frame descriptors gives the current callback |
---|
| 2708 | chain. */ |
---|
| 2709 | frame_desc_t * shCurrentP; |
---|
| 2710 | |
---|
| 2711 | /* Pointer to the frame descriptor that had the maximum stack usage |
---|
| 2712 | seen thus far for this thread. Following the fdCallerP pointers |
---|
| 2713 | through these frame descriptors gives the callback chain with |
---|
| 2714 | maximal stack usage. */ |
---|
| 2715 | frame_desc_t * shMaxP; |
---|
| 2716 | |
---|
| 2717 | /* Head of list of free frame_desc_t's */ |
---|
| 2718 | frame_desc_t * shFreeHeadP; |
---|
| 2719 | |
---|
| 2720 | /* Area that holds frame_desc_t's. These will be linked together and |
---|
| 2721 | put on the list shFreeHeadP. */ |
---|
| 2722 | #define SH_PREFIX_LEN (sizeof(cxiThreadId) + \ |
---|
| 2723 | sizeof(int) + \ |
---|
| 2724 | sizeof(struct stack_history *) + \ |
---|
| 2725 | 3*sizeof(frame_desc_t *)) |
---|
| 2726 | #define SH_NFRAMES ((PAGE_SIZE-SH_PREFIX_LEN)/sizeof(frame_desc_t)) |
---|
| 2727 | frame_desc_t shFrames[SH_NFRAMES]; |
---|
| 2728 | } stack_history_t; |
---|
| 2729 | |
---|
| 2730 | /* Global structures */ |
---|
| 2731 | struct |
---|
| 2732 | { |
---|
| 2733 | /* Global flag controlling whether kernel stack checking is enabled. |
---|
| 2734 | Initially false; set true during kernel module initialization, |
---|
| 2735 | then set false again during kernel module termination. */ |
---|
| 2736 | Boolean shActive; |
---|
| 2737 | |
---|
| 2738 | /* Mutex protecting updates to the variables that follow. This cannot |
---|
| 2739 | be a cxiBlockingMutex_t because then the stack checking routines would |
---|
| 2740 | get called recursively. */ |
---|
| 2741 | struct semaphore shMutex; |
---|
| 2742 | |
---|
| 2743 | /* List of free stack_history_t's and count of how many free entries |
---|
| 2744 | there are. Excess stack_history_t's beyond a threshold are freed |
---|
| 2745 | back to the operating system. */ |
---|
| 2746 | stack_history_t * freeHeadP; |
---|
| 2747 | int nFree; |
---|
| 2748 | #define MAX_FREE_STACK_HISTORIES 16 |
---|
| 2749 | |
---|
| 2750 | /* Hash table of active stack_history_t's. To find the entry for a |
---|
| 2751 | particular thread, hash its thread id to a bucket. If any of the |
---|
| 2752 | entries in bucket[] match the desired thread id, the pointer to |
---|
| 2753 | the stack_history_t can be returned without acquiring any locks. If |
---|
| 2754 | the bucket does not contain the desired thread id, look for it on |
---|
| 2755 | the overflow list under protection of shMutex. */ |
---|
| 2756 | #define HISTORY_HASH_SIZE 64 |
---|
| 2757 | #define HISTS_PER_BUCKET 3 |
---|
| 2758 | struct |
---|
| 2759 | { |
---|
| 2760 | struct |
---|
| 2761 | { |
---|
| 2762 | stack_history_t * historyP; |
---|
| 2763 | cxiThreadId threadId; |
---|
| 2764 | } bucket[HISTS_PER_BUCKET]; |
---|
| 2765 | stack_history_t * overflowP; |
---|
| 2766 | } historyHash[HISTORY_HASH_SIZE]; |
---|
| 2767 | |
---|
| 2768 | /* List of hash values for tracebacks that have already been printed. |
---|
| 2769 | Used to avoid printing the same traceback more than once. Nothing |
---|
| 2770 | is ever deleted from this table, so to find an entry start |
---|
| 2771 | searching at its hash value and continue until the entry is found |
---|
| 2772 | or an empty slot is encountered. The total occupancy of the table |
---|
| 2773 | is limited to MAX_TRACEBACKS to restrict the amount of searching |
---|
| 2774 | that will be required, and to guarantee that searches will |
---|
| 2775 | terminate. */ |
---|
| 2776 | #define TB_HASH_SIZE 64 |
---|
| 2777 | #define MAX_TRACEBACKS 32 |
---|
| 2778 | unsigned int tracebackHash[TB_HASH_SIZE]; |
---|
| 2779 | int nTracebackHashEntries; |
---|
| 2780 | } SHG; |
---|
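/* The tracebackHash table above is a small open-addressed set with no
 * deletions; a sketch of the membership test it implies (the helper name
 * is illustrative):
 *
 *   static Boolean tracebackSeen(unsigned int hashVal)
 *   {
 *     int slot = hashVal & (TB_HASH_SIZE - 1);
 *     while (SHG.tracebackHash[slot] != 0)
 *     {
 *       if (SHG.tracebackHash[slot] == hashVal)
 *         return true;                 // this traceback was already printed
 *       slot = (slot + 1) & (TB_HASH_SIZE - 1);
 *     }
 *     return false;                    // hit an empty slot: not seen before
 *   }
 *
 * Because occupancy is capped at MAX_TRACEBACKS (< TB_HASH_SIZE), the
 * probe is guaranteed to reach an empty slot and terminate. */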
| 2781 | |
---|
| 2782 | |
---|
| 2783 | /* Private version of DBGASSERT used only within stack checking code. |
---|
| 2784 | Cannot use DBGASSERT without risking recursion. */ |
---|
| 2785 | #ifdef DBGASSERTS |
---|
| 2786 | #define SH_ASSERT(_ex) \ |
---|
| 2787 | if (!(_ex)) { \ |
---|
| 2788 | printk("GPFS stack checking assert failed: " # _ex " file %s line %d\n", \ |
---|
| 2789 | __FILE__, __LINE__); \ |
---|
| 2790 | DoPanic(# _ex, __FILE__, __LINE__, 0, 0, ""); \ |
---|
| 2791 | } else ((void)0) |
---|
| 2792 | #else |
---|
| 2793 | #define SH_ASSERT(_ex) ((void)0) |
---|
| 2794 | #endif |
---|
| 2795 | |
---|
| 2796 | |
---|
| 2797 | /* Initialize and enable stack depth checking */ |
---|
| 2798 | void shInit() |
---|
| 2799 | { |
---|
| 2800 | /* Clear stack checking globals */ |
---|
| 2801 | cxiMemset(&SHG, 0, sizeof(SHG)); |
---|
| 2802 | |
---|
| 2803 | /* Init mutex */ |
---|
| 2804 | init_MUTEX(&SHG.shMutex); |
---|
| 2805 | |
---|
| 2806 | /* Turn on stack depth checking and make sure the change is visible */ |
---|
| 2807 | SHG.shActive = true; |
---|
| 2808 | wmb(); |
---|
| 2809 | } |
---|
| 2810 | |
---|
| 2811 | |
---|
| 2812 | /* Turn off stack depth checking and free all allocated memory. This does |
---|
| 2813 | not have to return the global state to what it was when the module was |
---|
| 2814 | first loaded, since it will not be used again. */ |
---|
| 2815 | void shTerm() |
---|
| 2816 | { |
---|
| 2817 | int h; |
---|
| 2818 | int b; |
---|
| 2819 | stack_history_t * shP; |
---|
| 2820 | stack_history_t * shNextP; |
---|
| 2821 | |
---|
| 2822 | /* Turn off stack depth checking and make sure the change is visible */ |
---|
| 2823 | SHG.shActive = false; |
---|
| 2824 | wmb(); |
---|
| 2825 | |
---|
| 2826 | /* Get and then release mutex. This ensures that a thread that is |
---|
| 2827 | in the middle of writing a traceback finishes writing it before |
---|
| 2828 | we free the data structures it was using. */ |
---|
| 2829 | /* ?? although there could be another thread waiting for the mutex ... */ |
---|
| 2830 | down(&SHG.shMutex); |
---|
| 2831 | up(&SHG.shMutex); |
---|
| 2832 | |
---|
| 2833 | /* Wait briefly to allow threads in the middle of the stack checking |
---|
| 2834 | code to finish what they are doing */ |
---|
| 2835 | /* ?? Of course, this is not really safe, but this is debugging code, |
---|
| 2836 | right? */ |
---|
| 2837 | schedule_timeout(HZ/2); |
---|
| 2838 | |
---|
| 2839 | /* Terminate mutex */ |
---|
| 2840 | // nothing to do |
---|
| 2841 | |
---|
| 2842 | /* Free all stack_history_t's on the free list */ |
---|
| 2843 | shP = SHG.freeHeadP; |
---|
| 2844 | while (shP != NULL) |
---|
| 2845 | { |
---|
| 2846 | shNextP = shP->shNextP; |
---|
| 2847 | kfree(shP); |
---|
| 2848 | shP = shNextP; |
---|
| 2849 | } |
---|
| 2850 | |
---|
| 2851 | /* Free all stack_history_t's in the hash table */ |
---|
| 2852 | for (h=0 ; h<HISTORY_HASH_SIZE ; h++) |
---|
| 2853 | { |
---|
| 2854 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
| 2855 | if (SHG.historyHash[h].bucket[b].historyP != NULL) |
---|
| 2856 | kfree(SHG.historyHash[h].bucket[b].historyP); |
---|
| 2857 | shP = SHG.historyHash[h].overflowP; |
---|
| 2858 | while (shP != NULL) |
---|
| 2859 | { |
---|
| 2860 | shNextP = shP->shNextP; |
---|
| 2861 | kfree(shP); |
---|
| 2862 | shP = shNextP; |
---|
| 2863 | } |
---|
| 2864 | } |
---|
| 2865 | } |
---|
| 2866 | |
---|
| 2867 | |
---|
| 2868 | /* Allocate and initialize a new stack_history_t */ |
---|
| 2869 | static stack_history_t * shAllocInit() |
---|
| 2870 | { |
---|
| 2871 | stack_history_t * shP; |
---|
| 2872 | int f; |
---|
| 2873 | |
---|
| 2874 | up(&SHG.shMutex); |
---|
| 2875 | shP = (stack_history_t *) kmalloc(sizeof(stack_history_t), GFP_KERNEL); |
---|
| 2876 | SH_ASSERT(shP != NULL); |
---|
| 2877 | down(&SHG.shMutex); |
---|
| 2878 | cxiMemset(shP, 0, sizeof(stack_history_t)); |
---|
| 2879 | for (f=0 ; f<=SH_NFRAMES-2 ; f++) |
---|
| 2880 | shP->shFrames[f].fdCallerP = &shP->shFrames[f+1]; |
---|
| 2881 | shP->shFreeHeadP = &shP->shFrames[0]; |
---|
| 2882 | return shP; |
---|
| 2883 | } |
---|
| 2884 | |
---|
| 2885 | |
---|
| 2886 | /* Get a stack_history_t off the free list or build a new one */ |
---|
| 2887 | static stack_history_t * shGet() |
---|
| 2888 | { |
---|
| 2889 | stack_history_t * shP; |
---|
| 2890 | |
---|
| 2891 | /* Use free list if one is available there */ |
---|
| 2892 | shP = SHG.freeHeadP; |
---|
| 2893 | if (shP != NULL) |
---|
| 2894 | { |
---|
| 2895 | SHG.freeHeadP = shP->shNextP; |
---|
| 2896 | SHG.nFree -= 1; |
---|
| 2897 | return shP; |
---|
| 2898 | } |
---|
| 2899 | |
---|
| 2900 | /* Make a new one if necessary */ |
---|
| 2901 | return shAllocInit(); |
---|
| 2902 | } |
---|
| 2903 | |
---|
| 2904 | |
---|
| 2905 | /* Free a stack_history_t. Put it on the free list if there are not |
---|
| 2906 | already too many free, or else free it back to the operating system. |
---|
| 2907 | */ |
---|
| 2908 | static void shPut(stack_history_t * shP) |
---|
| 2909 | { |
---|
| 2910 | int h; |
---|
| 2911 | int b; |
---|
| 2912 | stack_history_t ** shPrevPP; |
---|
| 2913 | stack_history_t * p; |
---|
| 2914 | |
---|
| 2915 | /* Both call stacks should be empty */ |
---|
| 2916 | SH_ASSERT(shP->shCurrentP == NULL); |
---|
| 2917 | SH_ASSERT(shP->shMaxP == NULL); |
---|
| 2918 | |
---|
| 2919 | /* Must hold mutex while changing the hash table */ |
---|
| 2920 | down(&SHG.shMutex); |
---|
| 2921 | |
---|
| 2922 | /* Clear pointer to this stack_history_t from the hash table */ |
---|
| 2923 | h = ((int)shP->shThreadId) & (HISTORY_HASH_SIZE-1); |
---|
| 2924 | b = shP->shBucketNum; |
---|
| 2925 | if (b != -1) |
---|
| 2926 | { |
---|
| 2927 | SH_ASSERT(SHG.historyHash[h].bucket[b].historyP == shP); |
---|
| 2928 | SHG.historyHash[h].bucket[b].historyP = NULL; |
---|
| 2929 | SHG.historyHash[h].bucket[b].threadId = 0; |
---|
| 2930 | } |
---|
| 2931 | else |
---|
| 2932 | { |
---|
| 2933 | shPrevPP = &SHG.historyHash[h].overflowP; |
---|
| 2934 | p = *shPrevPP; |
---|
| 2935 | while (p != NULL) |
---|
| 2936 | { |
---|
| 2937 | if (p == shP) |
---|
| 2938 | { |
---|
| 2939 | *shPrevPP = shP->shNextP; |
---|
| 2940 | break; |
---|
| 2941 | } |
---|
| 2942 | shPrevPP = &p->shNextP; |
---|
| 2943 | p = *shPrevPP; |
---|
| 2944 | } |
---|
| 2945 | } |
---|
| 2946 | |
---|
| 2947 | /* If not too many already free, add to free list */ |
---|
| 2948 | if (SHG.nFree < MAX_FREE_STACK_HISTORIES) |
---|
| 2949 | { |
---|
| 2950 | shP->shNextP = SHG.freeHeadP; |
---|
| 2951 | SHG.freeHeadP = shP; |
---|
| 2952 | SHG.nFree += 1; |
---|
| 2953 | up(&SHG.shMutex); |
---|
| 2954 | return; |
---|
| 2955 | } |
---|
| 2956 | |
---|
| 2957 | /* Otherwise, really free it */ |
---|
| 2958 | up(&SHG.shMutex); |
---|
| 2959 | kfree(shP); |
---|
| 2960 | } |
---|
| 2961 | |
---|
| 2962 | |
---|
| 2963 | /* Find the stack_history_t for the current thread, or allocate one if |
---|
| 2964 | one does not already exist */ |
---|
| 2965 | static stack_history_t * shFind() |
---|
| 2966 | { |
---|
| 2967 | stack_history_t * shP; |
---|
| 2968 | cxiThreadId id = current->pid; |
---|
| 2969 | int h = ((int)id) & (HISTORY_HASH_SIZE-1); |
---|
| 2970 | int b; |
---|
| 2971 | |
---|
| 2972 | /* Look at all entries within the bucket given by the hash of the |
---|
| 2973 | thread ID. No locking needs to be done for this search. */ |
---|
| 2974 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
| 2975 | if (SHG.historyHash[h].bucket[b].threadId == id) |
---|
| 2976 | return SHG.historyHash[h].bucket[b].historyP; |
---|
| 2977 | |
---|
| 2978 | /* Must hold mutex while changing the hash table */ |
---|
| 2979 | down(&SHG.shMutex); |
---|
| 2980 | |
---|
| 2981 | /* Search the overflow list */ |
---|
| 2982 | shP = SHG.historyHash[h].overflowP; |
---|
| 2983 | while (shP != NULL) |
---|
| 2984 | { |
---|
| 2985 | if (shP->shThreadId == id) |
---|
| 2986 | goto exit; |
---|
| 2987 | shP = shP->shNextP; |
---|
| 2988 | } |
---|
| 2989 | |
---|
| 2990 | /* No stack_history_t for this thread yet. Get one off the free list |
---|
| 2991 | or build one. */ |
---|
| 2992 | shP = shGet(); |
---|
| 2993 | shP->shThreadId = id; |
---|
| 2994 | shP->shNextP = NULL; |
---|
| 2995 | |
---|
| 2996 | /* Find a slot for the new stack_history_t in the hash table */ |
---|
| 2997 | for (b=0 ; b<HISTS_PER_BUCKET ; b++) |
---|
| 2998 | if (SHG.historyHash[h].bucket[b].historyP == NULL) |
---|
| 2999 | { |
---|
| 3000 | SHG.historyHash[h].bucket[b].historyP = shP; |
---|
| 3001 | SHG.historyHash[h].bucket[b].threadId = id; |
---|
| 3002 | shP->shBucketNum = b; |
---|
| 3003 | goto exit; |
---|
| 3004 | } |
---|
| 3005 | |
---|
| 3006 | /* No slots available; add new stack_history_t to overflow list */ |
---|
| 3007 | shP->shBucketNum = -1; |
---|
| 3008 | shP->shNextP = SHG.historyHash[h].overflowP; |
---|
| 3009 | SHG.historyHash[h].overflowP = shP; |
---|
| 3010 | |
---|
| 3011 | exit: |
---|
| 3012 | /* Release mutex before returning */ |
---|
| 3013 | up(&SHG.shMutex); |
---|
| 3014 | return shP; |
---|
| 3015 | } |
---|
| 3016 | |
---|
| 3017 | |
---|
| 3018 | /* Allocate a frame descriptor within the given stack_history_t. This |
---|
| 3019 | cannot be allowed to fail, so if there are no more free descriptors, |
---|
| 3020 | throw away the bottom frame descriptor and return that. The reference |
---|
| 3021 | count of the frame descriptor that is returned is undefined. */ |
---|
| 3022 | static frame_desc_t * fdGet(stack_history_t * shP) |
---|
| 3023 | { |
---|
| 3024 | frame_desc_t * fdP; |
---|
| 3025 | frame_desc_t ** fdPrevPP; |
---|
| 3026 | int prevRef; |
---|
| 3027 | |
---|
| 3028 | /* Look on the free list within the stack_history_t */ |
---|
| 3029 | fdP = shP->shFreeHeadP; |
---|
| 3030 | if (fdP != NULL) |
---|
| 3031 | { |
---|
| 3032 | shP->shFreeHeadP = fdP->fdCallerP; |
---|
| 3033 | return fdP; |
---|
| 3034 | } |
---|
| 3035 | |
---|
| 3036 | /* No free descriptors; first try stealing one off the bottom of the |
---|
| 3037 | current call stack */ |
---|
| 3038 | fdP = shP->shCurrentP; |
---|
| 3039 | if (fdP != NULL) |
---|
| 3040 | { |
---|
| 3041 | /* Find the bottom entry of the current call stack */ |
---|
| 3042 | fdPrevPP = &shP->shCurrentP; |
---|
| 3043 | prevRef = 1; |
---|
| 3044 | while (fdP->fdCallerP != NULL) |
---|
| 3045 | { |
---|
| 3046 | fdPrevPP = &fdP->fdCallerP; |
---|
| 3047 | prevRef = fdP->fdRef; |
---|
| 3048 | fdP = *fdPrevPP; |
---|
| 3049 | } |
---|
| 3050 | |
---|
| 3051 | /* Remove the bottom entry of the current call stack */ |
---|
| 3052 | *fdPrevPP = NULL; |
---|
| 3053 | |
---|
| 3054 | /* Reduce the reference count on the entry just removed. The |
---|
| 3055 | reference count decreases by the reference count of the frame |
---|
| 3056 | that used to point to *fdP. If *fdP is no longer referenced, no |
---|
| 3057 | further work is needed. If *fdP is still referenced from the max |
---|
| 3058 | depth stack (it must be the bottom entry), we will eventually |
---|
| 3059 | return it, but only after removing it from the bottom of the max |
---|
| 3060 | depth stack. We know that fdP will be returned, but we have to |
---|
| 3061 | search through the max depth stack to find the pointer to *fdP. |
---|
| 3062 | */ |
---|
| 3063 | fdP->fdRef -= prevRef; |
---|
| 3064 | if (fdP->fdRef == 0) |
---|
| 3065 | return fdP; |
---|
| 3066 | } |
---|
| 3067 | |
---|
| 3068 | /* Still no free descriptors; steal the frame descriptor off the |
---|
| 3069 | bottom of the maximum depth call stack */ |
---|
| 3070 | fdP = shP->shMaxP; |
---|
| 3071 | if (fdP != NULL) |
---|
| 3072 | { |
---|
| 3073 | /* Find the bottom entry of the max depth call stack */ |
---|
| 3074 | fdPrevPP = &shP->shMaxP; |
---|
| 3075 | while (fdP->fdCallerP != NULL) |
---|
| 3076 | { |
---|
| 3077 | fdPrevPP = &fdP->fdCallerP; |
---|
| 3078 | fdP = *fdPrevPP; |
---|
| 3079 | } |
---|
| 3080 | |
---|
| 3081 | /* Remove the bottom entry of the max depth call stack */ |
---|
| 3082 | *fdPrevPP = NULL; |
---|
| 3083 | |
---|
| 3084 | /* The bottom entry of the max depth call stack that was just |
---|
| 3085 | removed must have a reference count of one; otherwise it would |
---|
| 3086 | still be on the current call stack and removing the bottom entry |
---|
| 3087 | of that stack would have reduced the reference count of some |
---|
| 3088 | frame descriptor from 2 to 0. */ |
---|
| 3089 | SH_ASSERT(fdP->fdRef == 1); |
---|
| 3090 | return fdP; |
---|
| 3091 | } |
---|
| 3092 | SH_ASSERT(!"cannot alloc frame_desc_t"); |
---|
| 3093 | return NULL; |
---|
| 3094 | } |
---|
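fdGet is deliberately infallible: when the per-history free list is empty it walks to the bottom of the current chain and steals that oldest frame, and if that frame is still referenced it repeats the walk on the max-depth chain. The core unlink step is just "detach the tail of a singly linked chain"; a stripped-down sketch with hypothetical names:

```c
#include <stddef.h>

struct node {
    struct node *next;   /* link toward the caller / older entries */
};

/* Detach and return the last node of a singly linked chain, clearing the
   pointer that used to reference it -- the same walk fdGet performs on
   shCurrentP and shMaxP.  Returns NULL for an empty chain. */
static struct node *steal_tail(struct node **headPP)
{
    struct node **prevPP = headPP;
    struct node  *p = *headPP;

    if (p == NULL)
        return NULL;

    while (p->next != NULL)     /* walk to the bottom of the chain */
    {
        prevPP = &p->next;
        p = *prevPP;
    }
    *prevPP = NULL;             /* unlink the tail */
    return p;
}
```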
| 3095 | |
---|
| 3096 | |
---|
| 3097 | /* Decrease the reference count on a frame descriptor. If it becomes |
---|
| 3098 | zero, return it to the free list */ |
---|
| 3099 | static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) |
---|
| 3100 | //inline static void fdDiscard(frame_desc_t * fdP, stack_history_t * shP) |
---|
| 3101 | { |
---|
| 3102 | if (fdP->fdRef > 1) |
---|
| 3103 | { |
---|
| 3104 | fdP->fdRef -= 1; |
---|
| 3105 | TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD1, |
---|
| 3106 | "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 1\n", |
---|
| 3107 | fdP, shP, fdP->fdFuncNameP); |
---|
| 3108 | return; |
---|
| 3109 | } |
---|
| 3110 | |
---|
| 3111 | fdP->fdCallerP = shP->shFreeHeadP; |
---|
| 3112 | shP->shFreeHeadP = fdP; |
---|
| 3113 | TRACE3(TRACE_ENTRYEXIT, 11, TRCID_FDDISCARD2, |
---|
| 3114 | "fdDiscard: fdP 0x%lX shP 0x%lX rtn %s refcnt now 0\n", |
---|
| 3115 | fdP, shP, fdP->fdFuncNameP); |
---|
| 3116 | } |
---|
| 3117 | |
---|
| 3118 | |
---|
| 3119 | /* If the maximum stack depth exceeds the threshold, print its |
---|
| 3120 | traceback if it has not already been printed. Reset the maximum |
---|
| 3121 | depth stack to empty. Only called when the current stack is already |
---|
| 3122 | empty. */ |
---|
| 3123 | static void shDisplay(stack_history_t * shP) |
---|
| 3124 | { |
---|
| 3125 | frame_desc_t * fdP; |
---|
| 3126 | unsigned int tbHash; |
---|
| 3127 | frame_desc_t * fdNextP; |
---|
| 3128 | int slot; |
---|
| 3129 | |
---|
| 3130 | SH_ASSERT(shP->shCurrentP == NULL); |
---|
| 3131 | |
---|
| 3132 | /* If the maximum stack depth is less than the threshold, just free |
---|
| 3133 | the call chain and return */ |
---|
| 3134 | fdP = shP->shMaxP; |
---|
| 3135 | if (fdP == NULL || |
---|
| 3136 | fdP->fdStackUsed < STACK_LIMIT_WARNING) |
---|
| 3137 | goto exit; |
---|
| 3138 | |
---|
| 3139 | /* Compute a hash of the traceback call chain */ |
---|
| 3140 | tbHash = 0; |
---|
| 3141 | while (fdP != NULL) |
---|
| 3142 | { |
---|
| 3143 | tbHash <<= 1; |
---|
| 3144 | tbHash ^= (((unsigned int)fdP->fdStackUsed) << 15) ^ fdP->fdLineNum; |
---|
| 3145 | fdP = fdP->fdCallerP; |
---|
| 3146 | } |
---|
| 3147 | |
---|
| 3148 | /* Search for the hash of the call chain in the table of tracebacks that |
---|
| 3149 | have already been printed. Searching the hash table can be done without |
---|
| 3150 | any locks, since entries are never deleted. The loop must eventually |
---|
| 3151 | terminate, since the table will not be allowed to fill up. */ |
---|
| 3152 | search: |
---|
| 3153 | slot = tbHash % TB_HASH_SIZE; |
---|
| 3154 | while (SHG.tracebackHash[slot] != 0) |
---|
| 3155 | { |
---|
| 3156 | if (SHG.tracebackHash[slot] == tbHash) |
---|
| 3157 | /* This traceback has already been printed */ |
---|
| 3158 | goto exit; |
---|
| 3159 | slot = (slot+1) % TB_HASH_SIZE; |
---|
| 3160 | } |
---|
| 3161 | |
---|
| 3162 | /* The hash of the current max depth traceback was not found in the |
---|
| 3163 | table and should be inserted at position 'slot'. Do this under |
---|
| 3164 | protection of the mutex. If 'slot' has been used by the time we |
---|
| 3165 | get the mutex, drop the mutex and repeat the search. */ |
---|
| 3166 | down(&SHG.shMutex); |
---|
| 3167 | if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) |
---|
| 3168 | goto exitMutexHeld; |
---|
| 3169 | if (SHG.tracebackHash[slot] != 0) |
---|
| 3170 | { |
---|
| 3171 | up(&SHG.shMutex); |
---|
| 3172 | goto search; |
---|
| 3173 | } |
---|
| 3174 | SHG.tracebackHash[slot] = tbHash; |
---|
| 3175 | SHG.nTracebackHashEntries += 1; |
---|
| 3176 | |
---|
| 3177 | /* Print the traceback */ |
---|
| 3178 | fdP = shP->shMaxP; |
---|
| 3179 | printk("\nGPFS kernel stack for process %d(%s) used %d bytes\n", |
---|
| 3180 | current->pid, current->comm, fdP->fdStackUsed); |
---|
| 3181 | printk(" stack function\n"); |
---|
| 3182 | printk(" used\n"); |
---|
| 3183 | printk(" ----- -----------------------------------------------------\n"); |
---|
| 3184 | while (fdP != NULL) |
---|
| 3185 | { |
---|
| 3186 | printk(" %5d %s at %s:%d\n", |
---|
| 3187 | fdP->fdStackUsed, fdP->fdFuncNameP, fdP->fdFileNameP, fdP->fdLineNum); |
---|
| 3188 | fdP = fdP->fdCallerP; |
---|
| 3189 | } |
---|
| 3190 | printk(" traceback signature %08X\n", tbHash); |
---|
| 3191 | |
---|
| 3192 | /* If the maximum number of allowed tracebacks has been reached, turn |
---|
| 3193 | off further stack checking. */ |
---|
| 3194 | if (SHG.nTracebackHashEntries >= MAX_TRACEBACKS) |
---|
| 3195 | { |
---|
| 3196 | printk("Maximum number of GPFS deep stack tracebacks reached\n"); |
---|
| 3197 | printk("GPFS stack checking disabled\n"); |
---|
| 3198 | SHG.shActive = false; |
---|
| 3199 | wmb(); |
---|
| 3200 | } |
---|
| 3201 | |
---|
| 3202 | exitMutexHeld: |
---|
| 3203 | up(&SHG.shMutex); |
---|
| 3204 | |
---|
| 3205 | exit: |
---|
| 3206 | /* Free all stack frame descriptors for the max depth call chain back |
---|
| 3207 | to the internal free list. */ |
---|
| 3208 | fdP = shP->shMaxP; |
---|
| 3209 | while (fdP != NULL) |
---|
| 3210 | { |
---|
| 3211 | SH_ASSERT(fdP->fdRef == 1); |
---|
| 3212 | fdNextP = fdP->fdCallerP; |
---|
| 3213 | fdP->fdCallerP = shP->shFreeHeadP; |
---|
| 3214 | shP->shFreeHeadP = fdP; |
---|
| 3215 | fdP = fdNextP; |
---|
| 3216 | } |
---|
| 3217 | shP->shMaxP = NULL; |
---|
| 3218 | } |
---|
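The duplicate-traceback filter is a fixed-size open-addressing table with linear probing: zero marks an empty slot, entries are never deleted, and the table is never allowed to fill, which is why the initial search can run without the mutex. A compact single-threaded sketch of the same probe-and-insert discipline (hypothetical names; the real code re-checks the slot after taking shMutex):

```c
#include <stdbool.h>

#define TB_SLOTS 128            /* stand-in for TB_HASH_SIZE */
#define TB_MAX   (TB_SLOTS / 2) /* never allow the table to fill up */

static unsigned int tb_slot[TB_SLOTS];
static int tb_count;

/* Returns true if the (non-zero) signature was already recorded, false if
   it was inserted now.  Mirrors the search/insert loop in shDisplay,
   without the mutex re-check needed for concurrent inserters. */
bool traceback_seen(unsigned int sig)
{
    int slot = sig % TB_SLOTS;

    while (tb_slot[slot] != 0)          /* 0 means "empty slot" */
    {
        if (tb_slot[slot] == sig)
            return true;                /* already printed */
        slot = (slot + 1) % TB_SLOTS;   /* linear probing */
    }

    if (tb_count >= TB_MAX)             /* table nearly full: stop adding */
        return true;

    tb_slot[slot] = sig;
    tb_count++;
    return false;
}
```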
| 3219 | |
---|
| 3220 | |
---|
| 3221 | /* Process routine entry */ |
---|
| 3222 | static void fdEntry(frame_desc_t * fdP, stack_history_t * shP) |
---|
| 3223 | { |
---|
| 3224 | frame_desc_t * popP; |
---|
| 3225 | frame_desc_t * p; |
---|
| 3226 | |
---|
| 3227 | TRACE5(TRACE_ENTRYEXIT, 11, TRCID_FDENTRY, |
---|
| 3228 | "fdEntry: fdP 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX used %d\n", |
---|
| 3229 | fdP, shP, fdP->fdFuncNameP, shP->shCurrentP, fdP->fdStackUsed); |
---|
| 3230 | |
---|
| 3231 | /* If this is the first call by this thread, set up the two call chains */ |
---|
| 3232 | if (shP->shCurrentP == NULL) |
---|
| 3233 | { |
---|
| 3234 | SH_ASSERT(shP->shMaxP == NULL); |
---|
| 3235 | shP->shCurrentP = fdP; |
---|
| 3236 | shP->shMaxP = fdP; |
---|
| 3237 | fdP->fdCallerP = NULL; |
---|
| 3238 | fdP->fdRef = 2; |
---|
| 3239 | return; |
---|
| 3240 | } |
---|
| 3241 | else |
---|
| 3242 | SH_ASSERT(shP->shMaxP != NULL); |
---|
| 3243 | |
---|
| 3244 | /* Process routine exits implied by the number of bytes of stack that |
---|
| 3245 | are currently in use. The test needs to be for strict less than |
---|
| 3246 | because inlined routines share the same stack frame as their |
---|
| 3247 | caller, but both routines will do entry/exit processing. */ |
---|
| 3248 | popP = shP->shCurrentP; |
---|
| 3249 | while (fdP->fdStackUsed < popP->fdStackUsed) |
---|
| 3250 | { |
---|
| 3251 | p = popP->fdCallerP; |
---|
| 3252 | shP->shCurrentP = p; |
---|
| 3253 | TRACE1(TRACE_ENTRYEXIT, 11, TRCID_IMPLIED_EXIT, |
---|
| 3254 | "fdEntry: implied exit from rtn %s\n", |
---|
| 3255 | popP->fdFuncNameP); |
---|
| 3256 | fdDiscard(popP, shP); |
---|
| 3257 | if (p == NULL) |
---|
| 3258 | { |
---|
| 3259 | /* The outermost routine returned before this call without calling |
---|
| 3260 | fdExit. Test for a large maximum stack, then reset the |
---|
| 3261 | maximum. */ |
---|
| 3262 | shDisplay(shP); |
---|
| 3263 | |
---|
| 3264 | /* The current routine is the one and only */ |
---|
| 3265 | shP->shCurrentP = fdP; |
---|
| 3266 | shP->shMaxP = fdP; |
---|
| 3267 | fdP->fdCallerP = NULL; |
---|
| 3268 | fdP->fdRef = 2; |
---|
| 3269 | return; |
---|
| 3270 | } |
---|
| 3271 | popP = p; |
---|
| 3272 | } |
---|
| 3273 | |
---|
| 3274 | /* If this is an extension of the current max depth stack, just add |
---|
| 3275 | this routine to the top of both stacks */ |
---|
| 3276 | if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed && |
---|
| 3277 | shP->shCurrentP == shP->shMaxP) |
---|
| 3278 | { |
---|
| 3279 | fdP->fdCallerP = shP->shCurrentP; |
---|
| 3280 | shP->shCurrentP = fdP; |
---|
| 3281 | shP->shMaxP = fdP; |
---|
| 3282 | fdP->fdRef = 2; |
---|
| 3283 | TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX_EXTEND, |
---|
| 3284 | "fdEntry: extending new max stack %d fdP 0x%lX\n", |
---|
| 3285 | fdP->fdStackUsed, fdP); |
---|
| 3286 | return; |
---|
| 3287 | } |
---|
| 3288 | |
---|
| 3289 | /* Make this new routine be the top of the stack */ |
---|
| 3290 | fdP->fdCallerP = shP->shCurrentP; |
---|
| 3291 | shP->shCurrentP = fdP; |
---|
| 3292 | fdP->fdRef = 1; |
---|
| 3293 | |
---|
| 3294 | /* If this new routine has a greater stack depth than the previous max, |
---|
| 3295 | unreference the previous max depth call chain and add additional |
---|
| 3296 | references to the current one. */ |
---|
| 3297 | if (fdP->fdStackUsed > shP->shMaxP->fdStackUsed) |
---|
| 3298 | { |
---|
| 3299 | popP = shP->shMaxP; |
---|
| 3300 | do |
---|
| 3301 | { |
---|
| 3302 | p = popP->fdCallerP; |
---|
| 3303 | fdDiscard(popP, shP); |
---|
| 3304 | popP = p; |
---|
| 3305 | } while (popP != NULL); |
---|
| 3306 | p = fdP; |
---|
| 3307 | do |
---|
| 3308 | { |
---|
| 3309 | p->fdRef = 2; |
---|
| 3310 | p = p->fdCallerP; |
---|
| 3311 | } while (p != NULL); |
---|
| 3312 | TRACE2(TRACE_ENTRYEXIT, 11, TRCID_NEWMAX, |
---|
| 3313 | "fdEntry: new max stack %d fdP 0x%lX\n", |
---|
| 3314 | fdP->fdStackUsed, fdP); |
---|
| 3315 | shP->shMaxP = fdP; |
---|
| 3316 | } |
---|
| 3317 | } |
---|
| 3318 | |
---|
| 3319 | |
---|
| 3320 | /* Process routine exit */ |
---|
| 3321 | static void fdExit(const char * funcnameP) |
---|
| 3322 | { |
---|
| 3323 | stack_history_t * shP; |
---|
| 3324 | frame_desc_t * lastPopP; |
---|
| 3325 | frame_desc_t * popP; |
---|
| 3326 | frame_desc_t * p; |
---|
| 3327 | |
---|
| 3328 | /* Locate or create stack_history_t for this thread */ |
---|
| 3329 | shP = shFind(); |
---|
| 3330 | |
---|
| 3331 | /* If call stack is already empty, there is nothing to do except free |
---|
| 3332 | the stack_history_t */ |
---|
| 3333 | if (shP->shCurrentP == NULL) |
---|
| 3334 | { |
---|
| 3335 | SH_ASSERT(shP->shMaxP == NULL); |
---|
| 3336 | shPut(shP); |
---|
| 3337 | return; |
---|
| 3338 | } |
---|
| 3339 | |
---|
| 3340 | /* Search backward on the call stack for a routine name that matches |
---|
| 3341 | the one being exited. In C++, the ENTER/EXIT macros will pass the |
---|
| 3342 | same string constant (same address) to fdEntry and fdExit. The C |
---|
| 3343 | versions of the macros may pass two different copies of the same |
---|
| 3344 | string. This loop cannot pop routines it skips off the stack, since |
---|
| 3345 | the routine might never be found. */ |
---|
| 3346 | p = shP->shCurrentP; |
---|
| 3347 | for (;;) |
---|
| 3348 | { |
---|
| 3349 | if (p->fdFuncNameP == funcnameP || |
---|
| 3350 | cxiStrcmp(p->fdFuncNameP, funcnameP) == 0) |
---|
| 3351 | { |
---|
| 3352 | TRACE4(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT, |
---|
| 3353 | "fdExit: p 0x%lX shP 0x%lX rtn %s shCurrentP 0x%lX\n", |
---|
| 3354 | p, shP, p->fdFuncNameP, shP->shCurrentP); |
---|
| 3355 | lastPopP = p; |
---|
| 3356 | break; |
---|
| 3357 | } |
---|
| 3358 | p = p->fdCallerP; |
---|
| 3359 | if (p == NULL) |
---|
| 3360 | { |
---|
| 3361 | /* Routine name not found. Do not pop stack. */ |
---|
| 3362 | /* printk("No entry found when exiting %s\n", funcnameP); */ |
---|
| 3363 | TRACE1(TRACE_ENTRYEXIT, 11, TRCID_FDEXIT_NOTFOUND, |
---|
| 3364 | "No entry found when exitting %s\n", funcnameP); |
---|
| 3365 | return; |
---|
| 3366 | } |
---|
| 3367 | } |
---|
| 3368 | |
---|
| 3369 | /* Pop all routines up to and including lastPopP */ |
---|
| 3370 | p = shP->shCurrentP; |
---|
| 3371 | do |
---|
| 3372 | { |
---|
| 3373 | popP = p; |
---|
| 3374 | p = popP->fdCallerP; |
---|
| 3375 | fdDiscard(popP, shP); |
---|
| 3376 | } while (popP != lastPopP); |
---|
| 3377 | shP->shCurrentP = p; |
---|
| 3378 | |
---|
| 3379 | /* If this was the return of the outermost routine, print new maximum |
---|
| 3380 | stack depth traceback and discard the stack_history_t */ |
---|
| 3381 | if (shP->shCurrentP == NULL) |
---|
| 3382 | { |
---|
| 3383 | shDisplay(shP); |
---|
| 3384 | shPut(shP); |
---|
| 3385 | } |
---|
| 3386 | } |
---|
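The name match above tries pointer equality before cxiStrcmp because whether two uses of the same string literal share storage is implementation-defined: the C++ ENTER/EXIT macros pass the identical address, while the C macros may hand fdEntry and fdExit separate copies. A tiny illustration of why both tests are kept:

```c
#include <stdio.h>
#include <string.h>

static int same_name(const char *a, const char *b)
{
    /* Cheap identity test first, then content comparison -- the same
       order fdExit uses with cxiStrcmp. */
    return a == b || strcmp(a, b) == 0;
}

int main(void)
{
    const char *n1 = "gpfs_read";
    const char *n2 = "gpfs_read";   /* may or may not share storage with n1 */

    printf("identical pointers: %s\n", n1 == n2 ? "yes" : "no");
    printf("same name:          %s\n", same_name(n1, n2) ? "yes" : "no");
    return 0;
}
```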
| 3387 | |
---|
| 3388 | #endif /* KSTACK_CHECK */ |
---|
| 3389 | |
---|
| 3390 | |
---|
| 3391 | #if defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) |
---|
| 3392 | void cxiTraceEntry(int level, const char * funcnameP, |
---|
| 3393 | const char * filenameP, int lineNum) |
---|
| 3394 | { |
---|
| 3395 | int stackUsed = THREAD_SIZE - (((unsigned long)&stackUsed) & (THREAD_SIZE-1)); |
---|
| 3396 | #ifdef KSTACK_CHECK |
---|
| 3397 | stack_history_t * shP; |
---|
| 3398 | frame_desc_t * fdP; |
---|
| 3399 | #endif /* KSTACK_CHECK */ |
---|
| 3400 | |
---|
| 3401 | #ifdef ENTRYEXIT_TRACE |
---|
| 3402 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
| 3403 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
| 3404 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
| 3405 | { |
---|
| 3406 | TRACE5(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_ENTER, |
---|
| 3407 | "-->K %s (%s:%d) level %d stackUsed %d\n", |
---|
| 3408 | funcnameP, filenameP, lineNum, level, stackUsed); |
---|
| 3409 | } |
---|
| 3410 | #endif /* ENTRYEXIT_TRACE */ |
---|
| 3411 | |
---|
| 3412 | #ifdef KSTACK_CHECK |
---|
| 3413 | /* Nothing to do if kernel stack checking is disabled */ |
---|
| 3414 | if (!SHG.shActive) |
---|
| 3415 | return; |
---|
| 3416 | |
---|
| 3417 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
| 3418 | if (in_interrupt()) |
---|
| 3419 | return; |
---|
| 3420 | |
---|
| 3421 | /* Locate or create stack_history_t for this thread */ |
---|
| 3422 | shP = shFind(); |
---|
| 3423 | |
---|
| 3424 | /* Get a new frame descriptor and fill it in */ |
---|
| 3425 | fdP = fdGet(shP); |
---|
| 3426 | fdP->fdFuncNameP = funcnameP; |
---|
| 3427 | fdP->fdFileNameP = filenameP; |
---|
| 3428 | fdP->fdLineNum = lineNum; |
---|
| 3429 | fdP->fdStackUsed = stackUsed; |
---|
| 3430 | |
---|
| 3431 | /* Perform stack checking for this routine entry */ |
---|
| 3432 | fdEntry(fdP, shP); |
---|
| 3433 | #endif /* KSTACK_CHECK */ |
---|
| 3434 | } |
---|
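The stackUsed expression in cxiTraceEntry assumes the kernel stack lives in a THREAD_SIZE-aligned block and grows downward from the top of that block, so masking a local variable's address with THREAD_SIZE-1 gives its offset from the base, and THREAD_SIZE minus that offset is the number of bytes in use. A user-space sketch of the arithmetic with a simulated stack block (illustrative values only):

```c
#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192u   /* illustrative; the real value is per-arch */

/* For a downward-growing stack that occupies a THREAD_SIZE-aligned block,
   the bytes in use are the distance from a local variable to the top of
   the block. */
static unsigned int stack_used(uintptr_t local_addr)
{
    return THREAD_SIZE - (unsigned int)(local_addr & (THREAD_SIZE - 1));
}

int main(void)
{
    /* Simulate a stack block spanning 0x100000..0x102000 with the current
       frame sitting 0x750 bytes below the top. */
    uintptr_t top  = 0x102000;
    uintptr_t addr = top - 0x750;

    printf("stack used: %u bytes\n", stack_used(addr));  /* prints 1872 */
    return 0;
}
```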
| 3435 | |
---|
| 3436 | |
---|
| 3437 | void cxiTraceExit(int level, const char * funcnameP) |
---|
| 3438 | { |
---|
| 3439 | #ifdef ENTRYEXIT_TRACE |
---|
| 3440 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
| 3441 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
| 3442 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
| 3443 | TRACE1(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT, |
---|
| 3444 | "<--K %s\n", funcnameP); |
---|
| 3445 | #endif /* ENTRYEXIT_TRACE */ |
---|
| 3446 | |
---|
| 3447 | #ifdef KSTACK_CHECK |
---|
| 3448 | /* Nothing to do if kernel stack checking is disabled */ |
---|
| 3449 | if (!SHG.shActive) |
---|
| 3450 | return; |
---|
| 3451 | |
---|
| 3452 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
| 3453 | if (in_interrupt()) |
---|
| 3454 | return; |
---|
| 3455 | |
---|
| 3456 | /* Process routine exit */ |
---|
| 3457 | fdExit(funcnameP); |
---|
| 3458 | #endif /* KSTACK_CHECK */ |
---|
| 3459 | } |
---|
| 3460 | void cxiTraceExitRC(int level, const char * funcnameP, int rc) |
---|
| 3461 | { |
---|
| 3462 | #ifdef ENTRYEXIT_TRACE |
---|
| 3463 | /* Need to use a constant trace level in the TRACE macro call to get |
---|
| 3464 | the .trclst file (and later the .trcfmt file) built correctly */ |
---|
| 3465 | if (_TRACE_IS_ON(TRACE_ENTRYEXIT, BASE_ENTEREXIT_LEVEL + level)) |
---|
| 3466 | TRACE2(TRACE_ENTRYEXIT, 1, TRCID_KTRACE_LINUX_EXIT_RC, |
---|
| 3467 | "<--K %s rc %d\n", funcnameP, rc); |
---|
| 3468 | #endif /* ENTRYEXIT_TRACE */ |
---|
| 3469 | |
---|
| 3470 | #ifdef KSTACK_CHECK |
---|
| 3471 | /* Nothing to do if kernel stack checking is disabled */ |
---|
| 3472 | if (!SHG.shActive) |
---|
| 3473 | return; |
---|
| 3474 | |
---|
| 3475 | /* Do not attempt to keep track of stack usage in interrupt handlers */ |
---|
| 3476 | if (in_interrupt()) |
---|
| 3477 | return; |
---|
| 3478 | |
---|
| 3479 | /* Process routine exit */ |
---|
| 3480 | fdExit(funcnameP); |
---|
| 3481 | #endif /* KSTACK_CHECK */ |
---|
| 3482 | } |
---|
| 3483 | #endif /* defined(ENTRYEXIT_TRACE) || defined(KSTACK_CHECK) */ |
---|
| 3484 | |
---|
| 3485 | |
---|
| 3486 | #ifdef UIDREMAP |
---|
| 3487 | size_t cxiGetUserEnvironmentSize(void) |
---|
| 3488 | { |
---|
| 3489 | return (current->mm->env_end - current->mm->env_start); |
---|
| 3490 | } |
---|
| 3491 | |
---|
| 3492 | int cxiGetUserEnvironment(char* buf, size_t len) |
---|
| 3493 | { |
---|
| 3494 | return cxiCopyIn((char*)current->mm->env_start, buf, len); |
---|
| 3495 | } |
---|
| 3496 | #endif |
---|
| 3497 | |
---|
| 3498 | Boolean cxiHasMountHelper() |
---|
| 3499 | { |
---|
| 3500 | return USING_MOUNT_HELPER(); |
---|
| 3501 | } |
---|
| 3502 | |
---|
| 3503 | #ifdef P_NFS4 |
---|
| 3504 | |
---|
| 3505 | #include <linux/nfsd/nfs4layoutxdr.h> |
---|
| 3506 | |
---|
| 3507 | /* convert ip address to string */ |
---|
| 3508 | char *IPtoString(int ip, char *buf) |
---|
| 3509 | { |
---|
| 3510 | unsigned char *a = (unsigned char *)&ip; |
---|
| 3511 | |
---|
| 3512 | sprintf(buf, "%u.%u.%u.%u", a[0], a[1], a[2], a[3]); |
---|
| 3513 | |
---|
| 3514 | return buf; |
---|
| 3515 | } |
---|
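The dotted address produced by IPtoString is later extended with ".8.1" because pNFS device addresses use the ONC RPC universal-address form, in which the port is appended as two more decimal octets (high byte, then low byte); port 2049 is 0x0801, hence "8.1". A small stand-alone sketch of the full formatting (hypothetical `format_uaddr` helper, not part of this file):

```c
#include <stdio.h>
#include <string.h>

/* Format an IPv4 address and port as an RPC universal address,
   e.g. 192.168.1.10 port 2049 -> "192.168.1.10.8.1". */
static void format_uaddr(unsigned int ip, unsigned short port,
                         char *buf, size_t len)
{
    const unsigned char *a = (const unsigned char *)&ip;

    snprintf(buf, len, "%u.%u.%u.%u.%u.%u",
             a[0], a[1], a[2], a[3],
             (port >> 8) & 0xff, port & 0xff);
}

int main(void)
{
    unsigned char raw[4] = { 192, 168, 1, 10 };
    unsigned int ip;
    char buf[32];

    memcpy(&ip, raw, sizeof(ip));     /* same in-memory order IPtoString assumes */
    format_uaddr(ip, 2049, buf, sizeof(buf));
    printf("%s\n", buf);              /* 192.168.1.10.8.1 */
    return 0;
}
```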
| 3516 | |
---|
| 3517 | static void printfh(char *s, int *fh) |
---|
| 3518 | { |
---|
| 3519 | #ifdef GPFS_PRINTK |
---|
| 3520 | printk("%s: %d: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", |
---|
| 3521 | s, fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7],fh[8],fh[9]); |
---|
| 3522 | #endif |
---|
| 3523 | } |
---|
| 3524 | |
---|
| 3525 | int cxiSetFH(int *fhP, int sid) |
---|
| 3526 | { |
---|
| 3527 | struct knfsd_fh *fh = (struct knfsd_fh *)fhP; |
---|
| 3528 | |
---|
| 3529 | printfh("cxiSetFH-1", fhP); |
---|
| 3530 | if (fh->fh_size > 8) { |
---|
| 3531 | fh->fh_size += 4; // fh_size + 4 for sid |
---|
| 3532 | fh->fh_fsid_type += max_fsid_type; |
---|
| 3533 | fhP[(fh->fh_size >> 2)] = sid; |
---|
| 3534 | fh->fh_fileid_type = 7; // see code in gpfs_decode_fh() |
---|
| 3535 | #ifdef GPFS_PRINTK |
---|
| 3536 | printk("cxiSetFH size %d fsid_type %d fileid %d\n", |
---|
| 3537 | fh->fh_size, fh->fh_fsid_type, fh->fh_fileid_type); |
---|
| 3538 | #endif |
---|
| 3539 | printfh("cxiSetFH-2", fhP); |
---|
| 3540 | return 0; |
---|
| 3541 | } |
---|
| 3542 | return ENOENT; |
---|
| 3543 | } |
---|
| 3544 | |
---|
| 3545 | /* Call to NFS server on MDS to get open state */ |
---|
| 3546 | int cxiOpenState(void *vfsP, void *p) |
---|
| 3547 | { |
---|
| 3548 | int rc = ENOENT; |
---|
| 3549 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
| 3550 | struct pnfs_get_state *osP = p; |
---|
| 3551 | struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)SBLOCK_PRIVATE(sbP); |
---|
| 3552 | |
---|
| 3553 | #ifdef GPFS_PRINTK |
---|
| 3554 | printk("cxiOpenState1 sb %p p %p \n", sbP, p); |
---|
| 3555 | printk("cxiOpenState cb_get_state %p\n", |
---|
| 3556 | sbP->s_export_op->cb_get_state); |
---|
| 3557 | #endif |
---|
| 3558 | if (sbP->s_export_op->cb_get_state) |
---|
| 3559 | rc = sbP->s_export_op->cb_get_state(osP); |
---|
| 3560 | |
---|
| 3561 | gpfs_ops.gpfsGetVerifier(privVfsP, osP->verifier); |
---|
| 3562 | #ifdef GPFS_PRINTK |
---|
| 3563 | printk("cxiOpenState rc %d devid %x verifier %x:%x\n", |
---|
| 3564 | rc, osP->devid, osP->verifier[0], osP->verifier[1]); |
---|
| 3565 | #endif |
---|
| 3566 | |
---|
| 3567 | return rc; |
---|
| 3568 | } |
---|
| 3569 | /* Call to NFS server on DS to get change open state or close the file */ |
---|
| 3570 | int cxiChangeState(void *vfsP, void *p) |
---|
| 3571 | { |
---|
| 3572 | int rc = ENOENT; |
---|
| 3573 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
| 3574 | struct pnfs_get_state *osP = p; |
---|
| 3575 | |
---|
| 3576 | if (sbP->s_export_op->cb_change_state) |
---|
| 3577 | rc = sbP->s_export_op->cb_change_state(osP); |
---|
| 3578 | #ifdef GPFS_PRINTK |
---|
| 3579 | printk("cxiChangeState2 sb %p p %p access %d\n", sbP, p, osP->access); |
---|
| 3580 | #endif |
---|
| 3581 | |
---|
| 3582 | return rc; |
---|
| 3583 | } |
---|
| 3584 | /* Call to NFS server on MDS to recall layout */ |
---|
| 3585 | int cxiRecallLayout(void *vfsP, void *vP, void *p) |
---|
| 3586 | { |
---|
| 3587 | int rc = ENOENT; |
---|
| 3588 | struct super_block *sbP = (struct super_block *)vfsP; |
---|
| 3589 | struct inode *iP = (struct inode *)vP; |
---|
| 3590 | struct layout_recall lr; |
---|
| 3591 | |
---|
| 3592 | lr.fsid = sbP; |
---|
| 3593 | lr.offset = 0; |
---|
| 3594 | lr.length = -1; |
---|
| 3595 | |
---|
| 3596 | if (iP == NULL) // recall all layouts for this fs |
---|
| 3597 | lr.layout_type = RECALL_FSID; |
---|
| 3598 | |
---|
| 3599 | #ifdef GPFS_PRINTK |
---|
| 3600 | printk("cxiRecallLayout sbP %p type %d\n", sbP, lr.layout_type); |
---|
| 3601 | #endif |
---|
| 3602 | if (sbP->s_export_op->cb_layout_recall) { |
---|
| 3603 | rc = sbP->s_export_op->cb_layout_recall(sbP, iP, &lr); |
---|
| 3604 | } |
---|
| 3605 | else { |
---|
| 3606 | lr.layout_type = RECALL_FILE; |
---|
| 3607 | #ifdef GPFS_PRINTK |
---|
| 3608 | printk("cxiRecallLayout sbP %p iP %p type %d\n", sbP, iP, lr.layout_type); |
---|
| 3609 | #endif |
---|
| 3610 | } |
---|
| 3611 | |
---|
| 3612 | #ifdef GPFS_PRINTK |
---|
| 3613 | printk("cxiRecallLayout sbP %p iP %p rc %d\n", sbP, iP, rc); |
---|
| 3614 | #endif |
---|
| 3615 | return rc; |
---|
| 3616 | } |
---|
| 3617 | |
---|
| 3618 | /* Get device list |
---|
| 3619 | |
---|
| 3620 | gd_type |
---|
| 3621 | in: requested layout type. |
---|
| 3622 | out: available layout type. |
---|
| 3623 | gd_cookie |
---|
| 3624 | in: cookie returned on the last operation. |
---|
| 3625 | out: non-zero cookie if some devices did not fit in the buffer. |
---|
| 3626 | gd_maxcount |
---|
| 3627 | in: buffer size in bytes. |
---|
| 3628 | gd_buffer |
---|
| 3629 | in: pointer to buffer. |
---|
| 3630 | gd_devlist_len |
---|
| 3631 | out: number of items returned in the buffer. |
---|
| 3632 | |
---|
| 3633 | error: |
---|
| 3634 | Use the same return codes as used for GETDEVICELIST |
---|
| 3635 | */ |
---|
| 3636 | int |
---|
| 3637 | cxiGetDeviceList(int nDests, int *idList, void *P) |
---|
| 3638 | { |
---|
| 3639 | ENTER(0); |
---|
| 3640 | int rc = 0; |
---|
| 3641 | int i, len, left; |
---|
| 3642 | int j = 0; |
---|
| 3643 | char *p, *tp; |
---|
| 3644 | char tmp[32]; |
---|
| 3645 | struct nfsd4_pnfs_getdevlist *dl = (struct nfsd4_pnfs_getdevlist *)P; |
---|
| 3646 | struct nfsd4_pnfs_devlist *gd_buf = NULL; |
---|
| 3647 | struct pnfs_filelayout_devaddr *dev; |
---|
| 3648 | |
---|
| 3649 | #ifdef GPFS_PRINTK |
---|
| 3650 | printk("xxx cxiGetDeviceList enter nDests %d idList %p \n", nDests, idList); |
---|
| 3651 | #endif |
---|
| 3652 | |
---|
| 3653 | dl->gd_type = LAYOUT_NFSV4_FILES; |
---|
| 3654 | dl->gd_cookie = 0; |
---|
| 3655 | dl->gd_devlist_len = 0; |
---|
| 3656 | left = dl->gd_maxcount; |
---|
| 3657 | tp = &tmp[0]; |
---|
| 3658 | |
---|
| 3659 | len = sizeof(struct nfsd4_pnfs_devlist) * nDests; |
---|
| 3660 | #ifdef GPFS_PRINTK |
---|
| 3661 | printk("xxx cxiGetDeviceList len %d left %d\n", len, left); |
---|
| 3662 | #endif |
---|
| 3663 | if (nDests > left) { |
---|
| 3664 | rc = ENOMEM; //??? NFS4ERR_TOOSMALL |
---|
| 3665 | goto xerror; |
---|
| 3666 | } |
---|
| 3667 | gd_buf = (struct nfsd4_pnfs_devlist *)cxiMallocUnpinned(len); |
---|
| 3668 | if (gd_buf == NULL) { |
---|
| 3669 | rc = ENOMEM; |
---|
| 3670 | goto xerror; |
---|
| 3671 | } |
---|
| 3672 | memset(gd_buf, 0, len); |
---|
| 3673 | dl->gd_devlist = gd_buf; |
---|
| 3674 | |
---|
| 3675 | #ifdef GPFS_PRINTK |
---|
| 3676 | printk("xxx cxiGetDeviceList gd_buf %p count %d\n", gd_buf, nDests); |
---|
| 3677 | #endif |
---|
| 3678 | for (i = 0; i < nDests; i++) |
---|
| 3679 | { |
---|
| 3680 | /* make both device id and device address be the same for now */ |
---|
| 3681 | gd_buf[j].dev_id = idList[i]; |
---|
| 3682 | gd_buf[j].dev_lotype = LAYOUT_NFSV4_FILES; |
---|
| 3683 | if (gd_buf[j].dev_id == INADDR_NONE) |
---|
| 3684 | continue; |
---|
| 3685 | |
---|
| 3686 | IPtoString(gd_buf[j].dev_id, tp); |
---|
| 3687 | len = (cxiStrlen(tp)); |
---|
| 3688 | |
---|
| 3689 | p = (char *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr)); |
---|
| 3690 | if (p == NULL) { |
---|
| 3691 | rc = ENOMEM; |
---|
| 3692 | goto xerror; |
---|
| 3693 | } |
---|
| 3694 | memset(p, 0, sizeof(struct pnfs_filelayout_devaddr)); |
---|
| 3695 | gd_buf[j].dev_addr = p; |
---|
| 3696 | |
---|
| 3697 | dev = (struct pnfs_filelayout_devaddr *)p; |
---|
| 3698 | dev->r_addr.len = len + 4; /* for ".8.1" */ |
---|
| 3699 | |
---|
| 3700 | p = (char *)cxiMallocUnpinned(dev->r_addr.len+1); |
---|
| 3701 | if (p == NULL) { |
---|
| 3702 | rc = ENOMEM; |
---|
| 3703 | goto xerror; |
---|
| 3704 | } |
---|
| 3705 | dev->r_addr.data = p; |
---|
| 3706 | cxiMemcpy(p, tp, len); |
---|
| 3707 | p = p + len; |
---|
| 3708 | cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */ |
---|
| 3709 | |
---|
| 3710 | dev->r_netid.len = 3; /*'tcp'*/ |
---|
| 3711 | p = (char *)cxiMallocUnpinned(dev->r_netid.len+1); |
---|
| 3712 | if (p == NULL) { |
---|
| 3713 | rc = ENOMEM; |
---|
| 3714 | goto xerror; |
---|
| 3715 | } |
---|
| 3716 | cxiStrcpy(p, "tcp"); |
---|
| 3717 | dev->r_netid.data = p; |
---|
| 3718 | |
---|
| 3719 | left = left - 1; |
---|
| 3720 | dl->gd_devlist_len++; |
---|
| 3721 | |
---|
| 3722 | TRACE4(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_P1, |
---|
| 3723 | "gpfsGetDeviceList index %d len %d ip %s left %d\n", |
---|
| 3724 | i, dev->r_addr.len, dev->r_addr.data, left); |
---|
| 3725 | #ifdef GPFS_PRINTK |
---|
| 3726 | printk("xxx cxiGetDeviceList index %d id %d len %d ip %s left %d ops %p %p\n", |
---|
| 3727 | i, gd_buf[j].dev_id, dev->r_addr.len, |
---|
| 3728 | dev->r_addr.data, left, dl->gd_ops, dl->gd_ops->devaddr_encode); |
---|
| 3729 | #endif |
---|
| 3730 | |
---|
| 3731 | j++; |
---|
| 3732 | } |
---|
| 3733 | |
---|
| 3734 | exit: |
---|
| 3735 | |
---|
| 3736 | TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELIST_EXIT, |
---|
| 3737 | "cxiGetDeviceList exit: rc %d len %d", rc, len); |
---|
| 3738 | return rc; |
---|
| 3739 | |
---|
| 3740 | xerror: |
---|
| 3741 | |
---|
| 3742 | if (gd_buf != NULL) { |
---|
| 3743 | for (i = 0; i < j; i++) |
---|
| 3744 | { |
---|
| 3745 | dev = gd_buf[i].dev_addr; |
---|
| 3746 | if (dev) { |
---|
| 3747 | cxiFreeUnpinned(dev->r_addr.data); |
---|
| 3748 | cxiFreeUnpinned(dev->r_netid.data); |
---|
| 3749 | cxiFreeUnpinned(dev); |
---|
| 3750 | } |
---|
| 3751 | } |
---|
| 3752 | cxiFreeUnpinned(gd_buf); |
---|
| 3753 | } |
---|
| 3754 | goto exit; |
---|
| 3755 | } |
---|
| 3756 | |
---|
| 3757 | int |
---|
| 3758 | cxiGetDeviceInfo(void *P) |
---|
| 3759 | { |
---|
| 3760 | ENTER(0); |
---|
| 3761 | int rc = 0; |
---|
| 3762 | int len; |
---|
| 3763 | char *p, *tp; |
---|
| 3764 | char tmp[32]; |
---|
| 3765 | struct nfsd4_pnfs_getdevinfo *da = (struct nfsd4_pnfs_getdevinfo *)P; |
---|
| 3766 | tp = &tmp[0]; |
---|
| 3767 | struct pnfs_filelayout_devaddr *dev; |
---|
| 3768 | |
---|
| 3769 | IPtoString(da->gd_dev_id, tp); |
---|
| 3770 | |
---|
| 3771 | dev = (struct pnfs_filelayout_devaddr *)cxiMallocUnpinned(sizeof(struct pnfs_filelayout_devaddr)); |
---|
| 3772 | if (dev == NULL) { |
---|
| 3773 | rc = ENOMEM; |
---|
| 3774 | goto xerror; |
---|
| 3775 | } |
---|
| 3776 | da->gd_devaddr = dev; |
---|
| 3777 | |
---|
| 3778 | len = (cxiStrlen(tp)); |
---|
| 3779 | dev->r_addr.len = len + 4; /* for ".8.1" */ |
---|
| 3780 | |
---|
| 3781 | p = (char *)cxiMallocUnpinned(dev->r_addr.len+1); |
---|
| 3782 | if (p == NULL) { |
---|
| 3783 | cxiFreeUnpinned(dev); |
---|
| 3784 | rc = ENOMEM; |
---|
| 3785 | goto xerror; |
---|
| 3786 | } |
---|
| 3787 | dev->r_addr.data = p; |
---|
| 3788 | cxiMemcpy(p, tp, len); |
---|
| 3789 | p = p + len; |
---|
| 3790 | cxiStrcpy(p, ".8.1"); /* port 2049 = 0x801 = "8.1" */ |
---|
| 3791 | |
---|
| 3792 | dev->r_netid.len = 3; /*'tcp'*/ |
---|
| 3793 | p = (char *)cxiMallocUnpinned(dev->r_netid.len+1); |
---|
| 3794 | if (p == NULL) { |
---|
| 3795 | cxiFreeUnpinned(dev->r_addr.data); |
---|
| 3796 | cxiFreeUnpinned(dev); |
---|
| 3797 | rc = ENOMEM; |
---|
| 3798 | goto xerror; |
---|
| 3799 | } |
---|
| 3800 | cxiStrcpy(p, "tcp"); |
---|
| 3801 | dev->r_netid.data = p; |
---|
| 3802 | |
---|
| 3803 | TRACE2(TRACE_VNODE, 2, TRCID_GPFSOPS_GET_DEVICELINFO_P1, |
---|
| 3804 | "gpfsGetDeviceInfo len %d ip %s\n", |
---|
| 3805 | dev->r_addr.len, dev->r_addr.data); |
---|
| 3806 | |
---|
| 3807 | #ifdef GPFS_PRINTK |
---|
| 3808 | printk("xxx cxiGetDeviceInfo id %d len %d ip %s\n", |
---|
| 3809 | da->gd_dev_id, dev->r_addr.len, dev->r_addr.data); |
---|
| 3810 | #endif |
---|
| 3811 | |
---|
| 3812 | xerror: |
---|
| 3813 | |
---|
| 3814 | TRACE1(TRACE_VNODE, 2, TRCID_CXI_GET_DEVICELINFO_EXIT, |
---|
| 3815 | "cxiGetDeviceInfo exit: rc %d\n", rc); |
---|
| 3816 | |
---|
| 3817 | return rc; |
---|
| 3818 | } |
---|
| 3819 | /* get layout |
---|
| 3820 | lg_type |
---|
| 3821 | in: requested layout type. |
---|
| 3822 | out: available layout type. |
---|
| 3823 | lg_offset |
---|
| 3824 | in: requested offset. |
---|
| 3825 | out: returned offset. |
---|
| 3826 | lg_length |
---|
| 3827 | in: requested length. |
---|
| 3828 | out: returned length. |
---|
| 3829 | lg_mxcnt |
---|
| 3830 | in: buffer size in bytes. |
---|
| 3831 | lg_llist |
---|
| 3832 | in: pointer to buffer. |
---|
| 3833 | lg_layout |
---|
| 3834 | out: number of items returned in the buffer. |
---|
| 3835 | |
---|
| 3836 | if the file is big(?) return all nodes in layout |
---|
| 3837 | if the file is small, return no layout or just one node; choose one node at |
---|
| 3838 | random, but make sure it is the same node for the same file. |
---|
| 3839 | */ |
---|
| 3840 | int |
---|
| 3841 | cxiGetLayout(int nDests, int *idList, cxiVattr_t *vattr, int myAddr, void *P) |
---|
| 3842 | { |
---|
| 3843 | ENTER(0); |
---|
| 3844 | char *p, *n; |
---|
| 3845 | int i, rc, left, len; |
---|
| 3846 | struct nfsd4_pnfs_layoutget *gl = (struct nfsd4_pnfs_layoutget *)P; |
---|
| 3847 | struct nfsd4_pnfs_layoutlist *lg_buf = NULL; |
---|
| 3848 | struct nfsd4_pnfs_filelayout *layout = NULL; |
---|
| 3849 | |
---|
| 3850 | TRACE2(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_ENTER, |
---|
| 3851 | "cxiGetLayout: nDests %d myAddr %x\n", nDests,myAddr); |
---|
| 3852 | |
---|
| 3853 | /* set node id in fh and increase fh size by 4 */ |
---|
| 3854 | rc = cxiSetFH((int *)&gl->lg_fh, myAddr); |
---|
| 3855 | if (rc != 0) |
---|
| 3856 | goto xerror; |
---|
| 3857 | |
---|
| 3858 | gl->lg_type = LAYOUT_NFSV4_FILES; |
---|
| 3859 | gl->lg_offset = 0; |
---|
| 3860 | gl->lg_length = MAX_UINT64; /* The maximum file size */ |
---|
| 3861 | |
---|
| 3862 | layout = (struct nfsd4_pnfs_filelayout *)cxiMallocUnpinned(sizeof(struct nfsd4_pnfs_filelayout)); |
---|
| 3863 | if (layout == NULL) { |
---|
| 3864 | rc = ENOMEM; |
---|
| 3865 | goto xerror; |
---|
| 3866 | } |
---|
| 3867 | gl->lg_layout = layout; |
---|
| 3868 | layout->lg_stripe_type = STRIPE_DENSE; |
---|
| 3869 | layout->lg_commit_through_mds = true; |
---|
| 3870 | layout->lg_stripe_unit = vattr->va_blocksize; /* preferred blocksize */ |
---|
| 3871 | layout->lg_file_size = vattr->va_size; /* file size in bytes */ |
---|
| 3872 | layout->lg_llistlen = 0; |
---|
| 3873 | |
---|
| 3874 | left = gl->lg_mxcnt; |
---|
| 3875 | |
---|
| 3876 | len = sizeof(struct nfsd4_pnfs_layoutlist) * nDests; |
---|
| 3877 | if (len > left) { |
---|
| 3878 | rc = ENOMEM; // NFS4ERR_TOOSMALL |
---|
| 3879 | goto xerror; |
---|
| 3880 | } |
---|
| 3881 | lg_buf = (struct nfsd4_pnfs_layoutlist *)cxiMallocUnpinned(len); |
---|
| 3882 | if (lg_buf == NULL) { |
---|
| 3883 | rc = ENOMEM; |
---|
| 3884 | goto xerror; |
---|
| 3885 | } |
---|
| 3886 | memset(lg_buf, 0, len); |
---|
| 3887 | layout->lg_llist = lg_buf; |
---|
| 3888 | left = left - len; |
---|
| 3889 | |
---|
| 3890 | for (i = 0; i < nDests; i++) |
---|
| 3891 | { |
---|
| 3892 | /* make both device id and device address be the same for now */ |
---|
| 3893 | lg_buf[i].dev_ids.len = 1; //??? can return a list of dev ids ???? |
---|
| 3894 | lg_buf[i].dev_ids.list = (u32 *)cxiMallocUnpinned(sizeof(u32)*lg_buf[i].dev_ids.len); |
---|
| 3895 | if (lg_buf[i].dev_ids.list == NULL) { |
---|
| 3896 | rc = ENOMEM; |
---|
| 3897 | goto xerror; |
---|
| 3898 | } |
---|
| 3899 | lg_buf[i].dev_ids.list[0] = idList[i]; |
---|
| 3900 | layout->lg_llistlen++; |
---|
| 3901 | lg_buf[i].fhp = (struct knfsd_fh *)&gl->lg_fh; |
---|
| 3902 | |
---|
| 3903 | #ifdef GPFS_PRINTK |
---|
| 3904 | printk("cxiGetLayout index %d id %d xid 0x%lX len %d\n", |
---|
| 3905 | i, idList[i], idList[i], len); |
---|
| 3906 | #endif |
---|
| 3907 | TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_P1, |
---|
| 3908 | "cxiGetLayout index %d id 0x%lX len %d\n", |
---|
| 3909 | i, idList[i], len); |
---|
| 3910 | |
---|
| 3911 | } |
---|
| 3912 | if (i == 0) { |
---|
| 3913 | layout->lg_llistlen = 0; |
---|
| 3914 | cxiFreeUnpinned(lg_buf); |
---|
| 3915 | } |
---|
| 3916 | |
---|
| 3917 | #ifdef GPFS_PRINTK |
---|
| 3918 | printk("cxiGetLayout: type %d iomode %d offset %lld length %lld minlength %lld mxcnt %d ops %p layouts %p\n", |
---|
| 3919 | gl->lg_type, gl->lg_iomode, gl->lg_offset, gl->lg_length, gl->lg_minlength, |
---|
| 3920 | gl->lg_mxcnt, gl->lg_ops, gl->lg_layout); |
---|
| 3921 | |
---|
| 3922 | printfh("cxiGetLayout:", gl->lg_fh); |
---|
| 3923 | |
---|
| 3924 | printk("cxiGetLayout: layout stripe_type %d stripe_unit %lld file_size %lld llistlen %d llist %p\n", |
---|
| 3925 | layout->lg_stripe_type, layout->lg_stripe_unit,layout->lg_file_size, |
---|
| 3926 | layout->lg_llistlen,layout->lg_llist); |
---|
| 3927 | #endif |
---|
| 3928 | |
---|
| 3929 | exit: |
---|
| 3930 | |
---|
| 3931 | TRACE3(TRACE_VNODE, 2, TRCID_CXI_GET_LAYOUT_EXIT, |
---|
| 3932 | "cxiGetLayout exit: rc %d len %d p 0x%lX", rc, len, p); |
---|
| 3933 | |
---|
| 3934 | return rc; |
---|
| 3935 | |
---|
| 3936 | xerror: |
---|
| 3937 | |
---|
| 3938 | if (lg_buf) { |
---|
| 3939 | gl->lg_length = 0; |
---|
| 3940 | for (i = 0; i < nDests; i++) |
---|
| 3941 | { |
---|
| 3942 | cxiFreeUnpinned(lg_buf[i].dev_ids.list); |
---|
| 3943 | } |
---|
| 3944 | cxiFreeUnpinned(lg_buf); |
---|
| 3945 | } |
---|
| 3946 | if (layout) |
---|
| 3947 | cxiFreeUnpinned(layout); |
---|
| 3948 | |
---|
| 3949 | goto exit; |
---|
| 3950 | } |
---|
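The header comment above suggests that small files could be given a single data server, picked pseudo-randomly but stably so the same file always maps to the same node; the body currently returns every destination instead. One way such a stable pick could look, purely as an illustration (hypothetical helper, not what this routine does):

```c
/* Pick one of nDests data servers for a file, deterministically, so the
   same file (identified here by its inode number) always lands on the
   same server.  A multiplicative hash spreads consecutive inode numbers
   across the list. */
static int pick_dataserver(unsigned long long inodeNum, int nDests)
{
    unsigned int h = (unsigned int)(inodeNum * 2654435761u);

    return (int)(h % (unsigned int)nDests);
}
```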
| 3951 | #endif |
---|
| 3952 | |
---|
| 3953 | int cxiCheckThreadState(cxiThreadId tid) |
---|
| 3954 | { |
---|
| 3955 | struct task_struct *t, *g; |
---|
| 3956 | int rc = ENOENT; |
---|
| 3957 | |
---|
| 3958 | // read_lock(&tasklist_lock); |
---|
| 3959 | rcu_read_lock(); |
---|
| 3960 | |
---|
| 3961 | DO_EACH_THREAD(g,t) |
---|
| 3962 | { |
---|
| 3963 | /* We are looking for a thread with a given tid and the same parent as |
---|
| 3964 | the caller (the caller must be another mmfsd thread). */ |
---|
| 3965 | if (t->pid == tid && |
---|
| 3966 | cxiStrcmp(t->comm, current->comm) == 0) |
---|
| 3967 | { |
---|
| 3968 | rc = 0; |
---|
| 3969 | break; |
---|
| 3970 | } |
---|
| 3971 | } WHILE_EACH_THREAD(g,t); |
---|
| 3972 | // read_unlock(&tasklist_lock); |
---|
| 3973 | rcu_read_unlock(); |
---|
| 3974 | |
---|
| 3975 | return rc; |
---|
| 3976 | } |
---|
| 3977 | |
---|