/*************************************************************************** * * Copyright (C) 2001 International Business Machines * All rights reserved. * * This file is part of the GPFS mmfslinux kernel module. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *************************************************************************** */ /* @(#)25 1.65.1.6 src/avs/fs/mmfs/ts/kernext/gpl-linux/tracedev.c, mmfs, avs_rgpfs24, rgpfs24s011a 3/14/07 10:57:03 */ /************************************************************************** * * Loadable kernel module that implements the trace device. 
* **************************************************************************/ #ifndef GPFS_PRINTF #ifndef __KERNEL__ # define __KERNEL__ #endif #ifndef KBUILD_MODNAME #define KBUILD_MODNAME tracedev #endif /* If trace is built into kernel, pick up GPFS flag definitions from a file rather than requiring them to be defined on the command line. */ #ifndef MODULE /* #include */ #endif #include #include #include #include #include #include #include #include #include #if LINUX_KERNEL_VERSION > 2060900 #include /* in_interrupt */ #else #include /* in_interrupt */ #endif #include /* copy_to/from_user */ #include #include #include #include #ifdef __64BIT__ # define Int64 long long # define ARGLEN 8 # define ARGTYPE Int64 #else # define Int32 int # define ARGLEN 4 # define ARGTYPE Int32 #endif /* __64BIT__ */ char stringPadding[8]; #if LINUX_KERNEL_VERSION > 2060900 || \ (LINUX_KERNEL_VERSION > 2060000 && (defined(GPFS_ARCH_PPC64) || defined(GPFS_ARCH_X86_64))) #define EXPORTKDUMPDEV #endif #ifdef EXPORTKDUMPDEV static int major_kdump = -1; #endif #if defined(MODULE) && (LINUX_KERNEL_VERSION >= 2040900) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("GPFS portability layer (tracing module)"); MODULE_AUTHOR ("IBM "); #endif /* MODULE */ /* If trace is built into kernel, then this is a dummy module */ #ifndef KTRACE /* the daemon's task structure (for signal) */ static struct task_struct *taskP; /* The writeLock serializes trace writers. It should be avoided by * other operations in order to allow the writers to continue unimpeded. * The writeLock must be held when accessing the following fields in the * trace header element: nWaits, nBuffers, nLost, writeBuf */ static spinlock_t writeLock; /* The readLock serializes trace operations, as well as most other access * to the trace header element. Whenever both readLock and writeLock are * required, readLock is always to be acquired first. 
*/
static struct semaphore readLock;

/* The readFull flag synchronizes access to readBuf by readers and writers.
   Writers set this after filling readBuf and wait for this to be clear
   before filling readBuf.  Readers use this flag to tell if readBuf has any
   data and clear this after processing.  Using an atomic variable allows
   steady-state tracing to be done without readers needing to acquire a lock
   that would block writers.  Note that atomic operations do not generally
   act as memory barriers, so explicit barrier calls may be necessary before
   or after accessing readFull.  Spinlocks act as partial memory barriers,
   so explicit barriers can be avoided in some cases where spinlocks are
   used. */
static atomic_t readFull;

/* Trace Header Element - THE anchor for the trace state */
static trcdev_header_t lxthe;

/* Wait queue on which the trace daemon and throttled writers sleep; see
   trc_signal_io and trc_start_record. */
static wait_queue_head_t daemonWaitQ;

/* Export pointers to internal data structures for debugging */
struct
{
  trcdev_header_t *lxtheP;
  wait_queue_head_t *daemonWaitQP;
  struct semaphore *readLockP;
  spinlock_t *writeLockP;
} TraceVarAddrs = { &lxthe, &daemonWaitQ, &readLock, &writeLock };

/* A trcdev_buffer is dirty if there is any data in it (nextP != beginP) AND
 * the dirtyP has not yet been advanced (by trc_read) past the data (to
 * nextP).  NOTE: this macro evaluates its argument more than once. */
#define IS_DIRTY(b) (b.nextP != b.beginP && b.nextP != b.dirtyP)

/* A trace record passed from a user thread consists of a data header
   followed by the marshalled arguments */
struct trcRec
{
  trc_datahdr_t hdr;
  char data[LXTRACE_MAX_DATA-sizeof(trc_datahdr_t)];
};

/* Set the global trace state to newState and wake anyone sleeping on
 * daemonWaitQ.  Updating state information requires the writeLock in
 * addition to readLock.  The readLock is widely held where the trace header
 * is manipulated, but for the brief period of updating the state field, get
 * the writeLock as well. */
static void setTraceState(trcdev_state_t newState)
{
  spin_lock(&writeLock);
  lxthe.state = newState;
  wake_up(&daemonWaitQ);
  spin_unlock(&writeLock);
}

/* Return true if the specified hookword is currently being traced.
*/ static Boolean isTraced(uint hw) { return lxthe.state == trc_active;; } /* Construct the static trace header element ("lxthe"). * trc_open will allocate buffers and set the appropriate values. */ static void trc_init() { spin_lock_init(&writeLock); sema_init(&readLock, 1); /* Note: Locks are not needed here. There better not be any other threads trying to access lxthe at this point. If there were, then what would happen if a thread tried to acquire the locks a few instructions earlier, before we initialized the locks? */ lxthe.major = 0; /* dynamic assignment (by register_chrdev in trc_register) */ lxthe.minor = 0; lxthe.bufSize = 0; lxthe.nOpens = 0; lxthe.nWaits = 0; lxthe.nBuffers = 0; lxthe.nLost = 0; atomic_set(&readFull, 0); taskP = NULL; init_waitqueue_head(&daemonWaitQ); lxthe.writeBuf.beginP = NULL; lxthe.writeBuf.endP = NULL; lxthe.writeBuf.nextP = NULL; lxthe.writeBuf.dirtyP = NULL; lxthe.readBuf = lxthe.writeBuf; lxthe.state = trc_initialized; } /* Destroy the static trace header element (lxthe) */ static void trc_term() { /* Note: Locks are not needed here. We're about to re-initialize, so if anyone is still using lxthe at this point, we would clobber them. */ /* The two buffers are allocated together. Free them both here. */ if (lxthe.writeBuf.beginP) vfree(MIN(lxthe.writeBuf.beginP, lxthe.readBuf.beginP)); /* (re)initialize all fields. Rather than copy all the stuff that happens * in trc_init, we can use it here to reset all the fields. */ trc_init(); } #ifdef EXPORTKDUMPDEV static ssize_t kdump_read(struct file *fileP, char *bufP, size_t nBytes, loff_t *ppos) { int rc= -EINVAL; #if defined(GPFS_ARCH_X86_64) && LINUX_KERNEL_VERSION >= 2061600 /* rw_verify_area does not allow kernel addr range, so a read() will fail with EINVAL. We subtracted the base kernel addr is kdump.c and add back in here. 
*/ unsigned long highBits = GPFS_KERNEL_OFFSET; #else unsigned long highBits = 0; #endif if (virt_addr_valid((unsigned long)*ppos + highBits)) if (copy_to_user(bufP, (void *)((unsigned long)*ppos + highBits), nBytes)==0) rc=nBytes; return((ssize_t)rc); } static int kdump_open(struct inode *inodeP, struct file *fileP) { MY_MODULE_INCREMENT(); fileP->f_pos=0; return 0; } static int kdump_close(struct inode *inodeP, struct file *fileP) { MY_MODULE_DECREMENT(); return 0; } static loff_t kdump_lseek(struct file *fileP, loff_t offset, int orgin) { loff_t rc; if (orgin != 0) return(-EAGAIN); fileP->f_pos = offset; return(offset); } #endif /* The device open operation. The first open is initiated by the trace daemon, * and comes after registration. It results in the allocation of the trace * buffers, and identifying the trace daemon (so it can be signalled when * buffers are ready to be read). */ static int trc_open(struct inode *inodeP, struct file *fileP) { int rc = 0; /* Serialize multiple opens and prevent state changes */ down(&readLock); /* Only the daemon opens the device O_RDWR, and only does so when turning * trace on. */ if ((fileP->f_flags & O_ACCMODE) == O_RDWR) { if (lxthe.state != trc_initialized) { rc = -EALREADY; goto exit; } /* The first open (lxtrace on) requires initialization of the header. */ lxthe.minor = MINOR(inodeP->i_rdev); /* Only supporting one such device */ if (lxthe.minor > 0) { rc = -ENODEV; goto exit; } /* If not configured otherwise, use the default buffer size. */ if (lxthe.bufSize == 0) lxthe.bufSize = DEF_TRC_BUFSIZE; /* Allocate dual trace buffers (new records go into the write buffer, * and the daemon reads (via trc_read) from the read buffer). 
*/
    lxthe.writeBuf.beginP = vmalloc(2*lxthe.bufSize);
    if (!lxthe.writeBuf.beginP)
    {
      rc = -ENOMEM;
      goto exit;
    }
    /* First half of the single allocation is the write buffer... */
    lxthe.writeBuf.endP = lxthe.writeBuf.beginP + lxthe.bufSize - 1;
    lxthe.writeBuf.nextP = lxthe.writeBuf.beginP;
    lxthe.writeBuf.dirtyP = lxthe.writeBuf.beginP;
    lxthe.writeBuf.bufNum = 1;
    /* ...second half is the read buffer. */
    lxthe.readBuf.beginP = lxthe.writeBuf.beginP + lxthe.bufSize;
    lxthe.readBuf.endP = lxthe.readBuf.beginP + lxthe.bufSize - 1;
    lxthe.readBuf.nextP = lxthe.readBuf.beginP;
    lxthe.readBuf.dirtyP = lxthe.readBuf.beginP;
    lxthe.readBuf.bufNum = 2;

    /* Save pointer to the daemon task information, and mark the
     * device open. */
    taskP = current;
    setTraceState(trc_opened);

    /* Since threads that handle VM page-outs also do traces, set flag so
       that we will not get blocked waiting to allocate pages.  Otherwise a
       deadlock could occur if the page-out thread was waiting for us to
       empty the trace buffer, and we are waiting for the page-out thread
       to free some pages. */
    current->flags |= PF_MEMALLOC;
  }

  /* Applications must open the trace device O_WRONLY.  These opens do not
   * require any processing.  If the daemon has turned tracing on, the open
   * is allowed and subsequent write() calls will be handled.  If the
   * daemon has NOT turned tracing on, the application open will be
   * granted, but subsequent write() calls will NOOP until the daemon turns
   * trace on (state == trc_active). */
  else if ((fileP->f_flags & O_ACCMODE) != O_WRONLY)
  {
    /* After "trace on", subsequent trace control commands open O_RDONLY. */
    if (lxthe.state != trc_active)
    {
      rc = -EALREADY;
      goto exit;
    }
  }

  lxthe.nOpens += 1;
  MY_MODULE_INCREMENT();

exit:
  up(&readLock);
  return rc;
}

/* The device read operation.  This is to be used only by the trace daemon
 * to retrieve trace buffers for the purposes of writing to the output
 * file. */
static ssize_t trc_read(struct file *fileP, char *bufP, size_t nBytes,
                        loff_t *ppos)
{
  ssize_t nDone = 0;
  ssize_t nReady;

  /* All access to lxthe.readBuf is protected via the readLock.
*/
  down(&readLock);

  /* Only the trace daemon is allowed to read. */
  if (taskP && taskP->pid != current->pid)
  {
    nDone = -EPERM;
    goto exit;
  }

  /* See if there is data waiting to be processed by the daemon.  Read is
   * allowed here during normal operation (trc_active) and as trace is
   * terminating (this to get the last group of buffered records). */
  if ((lxthe.state == trc_active || lxthe.state == trc_stopped) &&
      atomic_read(&readFull))
  {
    /* Be sure that we don't access readBuf until after readFull is set */
    rmb();
    if (IS_DIRTY(lxthe.readBuf))
    {
      /* Make sure that the caller's buffer is large enough to hold
       * what we have.  NOTE(review): nReady is measured from beginP but
       * the copy below starts at dirtyP; the two agree only because reads
       * are all-or-nothing (dirtyP is either beginP or nextP here) --
       * confirm before allowing partial reads. */
      nReady = lxthe.readBuf.nextP - lxthe.readBuf.beginP;
      if (nReady > nBytes)
      {
        nDone = -EFBIG;
        goto exit;
      }
      if (copy_to_user(bufP, lxthe.readBuf.dirtyP, nReady))
      {
        nDone = -EFAULT;
        goto exit;
      }
      nDone = nReady;
      lxthe.readBuf.dirtyP += nDone;
    }

    /* Allow writers to use readBuf */
    if (!IS_DIRTY(lxthe.readBuf))
    {
      wmb();
      atomic_set(&readFull, 0);
      wake_up(&daemonWaitQ);
    }
  }

exit:
  up(&readLock);
  return nDone;
}

/* Deliver signal mySig to exactly the given task: walk the global thread
   list under RCU and signal the matching thread if it still exists.  Note
   that the taskP parameter intentionally shadows the file-scope taskP. */
static void my_send_sig_info(int mySig, struct siginfo * sigData,
                             struct task_struct *taskP)
{
  struct task_struct *g, *tsP;

  // read_lock(&tasklist_lock);
  rcu_read_lock();
  DO_EACH_THREAD(g,tsP)
  {
    if (tsP == taskP)
    {
      send_sig_info(mySig, sigData, tsP);
      break;
    }
  } WHILE_EACH_THREAD(g,tsP);
  // read_unlock(&tasklist_lock);
  rcu_read_unlock();
}

/* Internal routine to schedule i/o of the trace buffer.  NOTE that this
   routine is called while holding the writeLock. */
static void trc_signal_io()
{
  trcdev_buffer_t tBuf;
  struct siginfo sigData;

  /* DBGASSERT(atomic_read(&readFull) == 0); */

  /* Switch the buffers.  We don't have to worry about trc_read looking at
     readBuf while we do this because it always verifies that readFull is
     non-zero before accessing readBuf.
*/ rmb(); tBuf = lxthe.readBuf; lxthe.readBuf = lxthe.writeBuf; lxthe.writeBuf= tBuf; lxthe.nBuffers++; /* Number of buffers filled */ /* Mark readBuf full so that writers won't switch to it until after the daemon has processed it. Do write memory barrier to ensure that our change to readBuf makes it to memory before readFull is set. */ wmb(); atomic_set(&readFull, 1); /* Reset the (new) writeBuf to a clean state */ lxthe.writeBuf.dirtyP = lxthe.writeBuf.nextP = lxthe.writeBuf.beginP; /* Debug for 471707: Since all trace records begin with a header the * very first thing in a dirty buffer should be a valid header. If * this is not the case, print debug information to the log file. */ if (IS_DIRTY(tBuf) && (((trc_header_t *)tBuf.beginP)->trMagic != LXTRACE_MAGIC)) { printk("trc_signal_io: bad trace buffer! trMagic 0x%X\n", ((trc_header_t *)tBuf.beginP)->trMagic); printk("trc_signal_io: begin 0x%x end 0x%X next 0x%X dirty 0x%X isDirty %d\n", (trc_header_t *)tBuf.beginP, (trc_header_t *)tBuf.endP, (trc_header_t *)tBuf.nextP, (trc_header_t *)tBuf.dirtyP, IS_DIRTY(tBuf)); } /* Signal daemon that there is a trace buffer to be read and processed. */ sigData.si_signo = SIGIO; sigData.si_errno = 0; sigData.si_code = SI_KERNEL; my_send_sig_info(SIGIO, &sigData, taskP); } /* Return true if trace writer will have to wait for daemon to make room for a trace record of the specified length. */ static int writeMustBlock(int len) { return (len > lxthe.writeBuf.endP - lxthe.writeBuf.nextP + 1 && atomic_read(&readFull) && lxthe.state == trc_active); } /* Reserves space for a trace record whose data header plus arguments totals nBytes. Returns 0 if space was reserved for the trace record, or non-zero if space could not be found because the buffer is full and cantBlock is set, or because tracing is not enabled. If space was reserved successfully, builds a trace header, then copies the trace header and the given data header into the trace device buffer. 
   After returning from this routine, trc_append_record should be called 0
   or more times, then trc_end_record.  Returns with writeLock held iff 0
   was returned. */
static int trc_start_record(trc_datahdr_t * hdrP, size_t nBytes,
                            int cantBlock)
{
  int trclen;
  trc_header_t tHdr;

  /* Construct the trace record header */
  tHdr.trMagic = LXTRACE_MAGIC;

  /* Wait to set the timestamp (tHdr.trTime) until after all serialization.
   * When multiple threads call trace, they don't necessarily get the
   * writeLock in FIFO order so setting the timestamp here can result
   * in times going backwards in the trace file.
   * do_gettimeofday(&tHdr.trTime); */

  tHdr.trProcess = current->pid;
  tHdr.trCPU = smp_processor_id();
  tHdr.trLength = nBytes;
  trclen = nBytes + sizeof(trc_header_t);

  /* Serialize access to writeBuf */
  spin_lock(&writeLock);

  /* If this trace record will not fit in the write buffer, and the read
     buffer is still full, and trace is active, then we must wait for the
     daemon to empty the read buffer. */
  if (writeMustBlock(trclen))
  {
    if (cantBlock)
    {
      /* Non-blocking caller: count the record as lost and give up. */
      lxthe.nLost++;
      spin_unlock(&writeLock);
      return 1;
    }
    if (lxthe.state != trc_active)
    {
      spin_unlock(&writeLock);
      return 1;
    }
    lxthe.nWaits++;
    /* Drop the lock while sleeping, then re-check the condition with the
       lock re-held before proceeding. */
    do
    {
      spin_unlock(&writeLock);
      wait_event(daemonWaitQ, !writeMustBlock(trclen));
      spin_lock(&writeLock);
    } while (writeMustBlock(trclen));
  }

  if (lxthe.state != trc_active)
  {
    spin_unlock(&writeLock);
    return 1;
  }

  /* Will the trace record fit into the write buffer?  If not, then we can
     swap with the read buffer which must be empty at this point (else we
     wouldn't have come out of previous wait loop). */
  if (trclen > lxthe.writeBuf.endP - lxthe.writeBuf.nextP + 1)
  {
    /* Swap write buffer with read buffer and signal daemon to process the
       data. */
    trc_signal_io();

    /* This could be an assert, since write buffer must be empty now.
*/
    if (trclen > lxthe.writeBuf.endP - lxthe.writeBuf.nextP + 1)
    {
      spin_unlock(&writeLock);
      return 1;
    }
  }

  /* Now that there isn't anything to block the writing of this
   * record, insert the timestamp. */
  do_gettimeofday(&tHdr.trTime);

  /* Insert the header stamp into the buffer ahead of the application
     record and remember its location. */
  lxthe.tHdrP = (trc_header_t *)lxthe.writeBuf.nextP;
  tHdr.trBuf = lxthe.writeBuf.bufNum;
  memcpy(lxthe.writeBuf.nextP, &tHdr, sizeof(tHdr));
  lxthe.writeBuf.nextP += sizeof(tHdr);

  /* Move the application trace header directly into the trace buffer and
     remember its location */
  lxthe.hdrP = (trc_datahdr_t *)lxthe.writeBuf.nextP;
  memcpy(lxthe.writeBuf.nextP, hdrP, sizeof(*hdrP));
  lxthe.writeBuf.nextP += sizeof(*hdrP);

  /* Return with writeLock still held */
  return 0;
}

/* Append a portion of a trace record to the write buffer.  Must have
   previously called trc_start_record (so writeLock is held and space has
   been reserved). */
static void trc_append_record(const void* bufP, size_t nBytes)
{
  /* Move the application trace record directly into the trace buffer */
  memcpy(lxthe.writeBuf.nextP, bufP, nBytes);
  lxthe.writeBuf.nextP += nBytes;
}

/* Finish a trace record: releases the writeLock acquired by
   trc_start_record. */
static void trc_end_record()
{
  spin_unlock(&writeLock);
}

/* The device write operation: user threads submit one marshalled trace
   record (data header plus arguments) per write() call. */
static ssize_t trc_write(struct file *fileP, const char *bufP,
                         size_t nBytes, loff_t *posP)
{
  struct trcRec tr;
  int rc;
  int dataBytes;

  /* Copy trace record from user address space.  NOTE(review): the minimum
     accepted length is 4 bytes (enough for the hookword) rather than
     sizeof(trc_datahdr_t); a shorter-than-header record would leave part
     of tr.hdr uninitialized -- confirm callers always send a full
     header. */
  if (nBytes < 4 || nBytes > LXTRACE_MAX_DATA)
    return -EINVAL;
  if (copy_from_user(&tr, bufP, nBytes))
    return -EFAULT;

  /* The beginning of the trace record is a hookword number.  Verify that
     the specified hookword is being traced.  If not, return as if the
     trace was successful.
*/
  if (isTraced(tr.hdr.trHook))
  {
    rc = trc_start_record(&tr.hdr, nBytes, false);
    if (rc == 0)
    {
      dataBytes = nBytes - sizeof(tr.hdr);
      if (dataBytes > 0)
        trc_append_record(&tr.data[0], dataBytes);
      trc_end_record();
    }
  }
  return nBytes;
}

/* Before close, a sync of the trace device will flush the records
 * still in the read buffer (even though it might not be full).  A
 * close without this call could result in the loss of these records.
 * Must not call fsync from daemon termination signal handler because
 * that could deadlock if a SIGIO is still pending. */
static int trc_fsync_internal(struct file* fileP, struct dentry* dP,
                              int datasync)
{
  spin_lock(&writeLock);

  /* If read buffer is still full, wait for daemon to process it */
  while (atomic_read(&readFull) &&
         (lxthe.state == trc_active || lxthe.state == trc_stopped))
  {
    spin_unlock(&writeLock);
    wait_event(daemonWaitQ,
               !(atomic_read(&readFull) &&
                 (lxthe.state == trc_active || lxthe.state == trc_stopped)));
    spin_lock(&writeLock);
  }

  /* Allow fsync during normal operation OR after ioctl(trc_end) has
     disabled further trace writing (allows an fsync before close to flush
     the buffered records). */
  if (lxthe.writeBuf.nextP != lxthe.writeBuf.beginP &&
      (lxthe.state == trc_active || lxthe.state == trc_stopped))
    trc_signal_io();

  spin_unlock(&writeLock);
  return 0;
}

/* The externally visible version of trc_fsync_internal */
int trc_fsync()
{
  return trc_fsync_internal(NULL, NULL, 0);
}

/* The device close operation. */
static int trc_close(struct inode *inodeP, struct file *fileP)
{
  down(&readLock);

  /* The trace daemon only closes the device upon termination. */
  if (taskP && taskP->pid == current->pid)
  {
    /* The final trace daemon close.  Reset for subsequent use. */
    setTraceState(trc_initialized);

    /* We don't really need writeLock here since writers won't do anything
       after state is set to trc_initialized, but it doesn't hurt.
*/ spin_lock(&writeLock); lxthe.nWaits = 0; lxthe.nBuffers = 0; lxthe.nLost = 0; spin_unlock(&writeLock); taskP = NULL; current->flags &= ~PF_MEMALLOC; /* Free the two trace buffers. */ if (lxthe.writeBuf.beginP) { vfree(MIN(lxthe.writeBuf.beginP, lxthe.readBuf.beginP)); lxthe.writeBuf.beginP = NULL; lxthe.writeBuf.endP = NULL; lxthe.writeBuf.nextP = NULL; lxthe.writeBuf.dirtyP = NULL; lxthe.readBuf = lxthe.writeBuf; } } lxthe.nOpens -= 1; MY_MODULE_DECREMENT(); up(&readLock); return 0; } /* ioctl op used to for low-level access to trace operation. */ static int trc_ioctl(struct inode *inodeP, struct file *fileP, unsigned int op, unsigned long kx_args) { int h, rc = 0; Boolean readLockHeld = false; struct kArgs args_cp; struct kArgs *args = (struct kArgs *)kx_args; char *p; char *newBufP; char *trc_dumpP; char *trc_nextP; struct siginfo sigData; int waitCount = 0; down(&readLock); readLockHeld = true; switch (op) { case trc_begin: if (lxthe.state == trc_active) { rc = -EALREADY; break; } if (lxthe.state != trc_opened) { rc = -EBADF; break; } setTraceState(trc_active); break; case trc_end: if (lxthe.state != trc_active) rc = -EBADF; else { setTraceState(trc_stopped); up(&readLock); readLockHeld = false; trc_fsync(); /* Signal the daemon to terminate. */ sigData.si_signo = SIGTERM; sigData.si_errno = 0; sigData.si_code = SI_KERNEL; my_send_sig_info(SIGTERM, &sigData, taskP); } /* Wait for lxtrace to terminate, but don't wait forever. At this point the signal has been delivered to lxtrace, but it may take some time for the process to exit. Since lxthe.state is changed from trc_stopped to trc_initialized in trc_close(), which is called when lxtrace exits, if we return control to the caller right away, there'd be a window when tracing has ostensibly been stopped, and it should be OK to start tracing again, but trying to do so would fail with EALREADY in trc_open because lxthe.state is not what the code expects. So we give lxtrace some time to terminate. 
Something could go seriously wrong, and lxtrace may get stuck, we don't wait forever. */ while (lxthe.state == trc_stopped && waitCount++ < 10) { current->state = TASK_INTERRUPTIBLE; schedule_timeout(100); } break; case trc_bufSize: /* The daemon may call ioctl to change the desired buffer size. On open, buffers of the default size are allocated. This call frees the current buffers (replacing them with new ones). Any trace records currently in the buffers will be lost. */ if (lxthe.state != trc_opened) { rc = -EPERM; break; } /* get the argument array */ if (copy_from_user(&args_cp, args, sizeof(args_cp))) { rc = -EFAULT; break; } /* Allocate the new (dual) trace buffers. * arg1 is the requested buffer size */ newBufP = vmalloc(2*args_cp.arg1); if (!newBufP) { rc = -ENOMEM; break; } /* Free the previous buffers. Since the state is currently * "trc_opened" and we are holding readLock, neither readers nor * writers can be using the buffers at this time. */ if (lxthe.writeBuf.beginP) vfree(MIN(lxthe.writeBuf.beginP, lxthe.readBuf.beginP)); lxthe.bufSize = args_cp.arg1; lxthe.writeBuf.beginP = newBufP; lxthe.writeBuf.endP = lxthe.writeBuf.beginP + lxthe.bufSize - 1; lxthe.writeBuf.nextP = lxthe.writeBuf.beginP; lxthe.writeBuf.dirtyP = lxthe.writeBuf.beginP; lxthe.readBuf.beginP = lxthe.writeBuf.beginP + lxthe.bufSize; lxthe.readBuf.endP = lxthe.readBuf.beginP + lxthe.bufSize - 1; lxthe.readBuf.nextP = lxthe.readBuf.beginP; lxthe.readBuf.dirtyP = lxthe.readBuf.beginP; break; case trc_dump: /* format trace header information and return to daemon */ trc_dumpP = vmalloc(LXTRACE_DUMP_SIZE); if (trc_dumpP == NULL) { rc = -ENOMEM; break; } if (copy_from_user(&args_cp, args, sizeof(args_cp))) { rc = -EFAULT; break; } /* Block writers so that we can look at writeBuf. */ spin_lock(&writeLock); /* Format the state information suitable for displaying by * the daemon. 
*/
      trc_nextP = trc_dumpP;
      /* NOTE(review): the 0x%08X/0x%X conversions below receive pointer
         arguments; on 64-bit kernels they print only the low 32 bits --
         %p would be correct.  Left unchanged here. */
      sprintf(trc_nextP, "Trace Header Element: 0x%08X\n", &lxthe);
      trc_nextP += strlen(trc_nextP);

      /* Global information on device number, buffer sizes,
       * and lost records. */
      sprintf(trc_nextP, " Major %d Minor %d bufSize 0x%X nOpens %d "
              "nBuffers %d nLost %d nWaits %d Daemon %d\n",
              lxthe.major, lxthe.minor, lxthe.bufSize, lxthe.nOpens,
              lxthe.nBuffers, lxthe.nLost, lxthe.nWaits,
              taskP ? taskP->pid: 0);
      trc_nextP += strlen(trc_nextP);
      sprintf(trc_nextP, "\n");
      trc_nextP += strlen(trc_nextP);

      /* Append buffer information */
      sprintf(trc_nextP, " writeBuf: beginP 0x%X endP 0x%X nextP 0x%X "
              "dirtyP 0x%X isDirty %d\n",
              lxthe.writeBuf.beginP, lxthe.writeBuf.endP,
              lxthe.writeBuf.nextP, lxthe.writeBuf.dirtyP,
              IS_DIRTY(lxthe.writeBuf));
      trc_nextP += strlen(trc_nextP);
      sprintf(trc_nextP, " readBuf : beginP 0x%X endP 0x%X nextP 0x%X "
              "dirtyP 0x%X isDirty %d\n",
              lxthe.readBuf.beginP, lxthe.readBuf.endP,
              lxthe.readBuf.nextP, lxthe.readBuf.dirtyP,
              IS_DIRTY(lxthe.readBuf));
      trc_nextP += strlen(trc_nextP);
#if 0
      /* verify dumpBuf size */
      sprintf(trc_nextP, " dumpBuf size %d (used %d)\n",
              LXTRACE_DUMP_SIZE, (trc_nextP-trc_dumpP));
      trc_nextP += strlen(trc_nextP);
#endif
      spin_unlock(&writeLock);

      /* arg1 is the user buffer size, arg2 is the address of the buffer */
      if (copy_to_user((char *)args_cp.arg2, trc_dumpP,
                       MIN(strlen(trc_dumpP)+1, args_cp.arg1)))
        rc = -EFAULT;
      vfree(trc_dumpP);
      break;

    default:
      rc = -EINVAL;
      break;
  }

  if (readLockHeld)
    up(&readLock);
  return rc;
}

/* File operations vector for /dev/trace (GNU-style designated
   initializers, as used by 2.4/2.6-era drivers). */
static struct file_operations trc_ops =
{
  llseek:  NULL,
  read:    trc_read,      /* read op allows the daemon to retrieve records */
  write:   trc_write,     /* Trace points write to the device */
  readdir: NULL,
  poll:    NULL,
  ioctl:   trc_ioctl,     /* control op to change buffering or dump state */
  mmap:    NULL,
  open:    trc_open,      /* Prepare the device for tracing */
  flush:   NULL,
  release: trc_close,     /* Terminate tracing and close the device */
  fsync:   trc_fsync_internal, /* Sync all buffered data to the daemon */
  fasync:
           NULL,
  lock:      NULL,
  aio_read:  NULL,
  aio_write: NULL,
};

#ifdef EXPORTKDUMPDEV
/* File operations vector for the kdump device. */
static struct file_operations kdump_ops =
{
  llseek:  kdump_lseek,
  read:    kdump_read,   /* read op allows the daemon to retrieve records */
  write:   NULL,         /* Trace points write to the device */
  readdir: NULL,
  poll:    NULL,
  ioctl:   NULL,         /* control op to change buffering or dump state */
  mmap:    NULL,
  open:    kdump_open,   /* Prepare the device for tracing */
  flush:   NULL,
  release: kdump_close,  /* Terminate tracing and close the device */
  fsync:   NULL,         /* Sync all buffered data to the daemon */
  fasync:  NULL,
  lock:    NULL,
  aio_read:  NULL,
  aio_write: NULL,
};
#endif

/* Register the trace device "/dev/trace" and save the major number in
 * the header */
static int trc_register()
{
  int major = register_chrdev(0, "trace", &trc_ops);
  if (major < 0)
    return major;

  lxthe.major = major;
#ifdef EXPORTKDUMPDEV
  /* A failure here leaves major_kdump negative; trc_unregister checks. */
  major_kdump = register_chrdev(0, "kdump", &kdump_ops);
#endif
  return 0;
}

/* Unregister the trace device */
static void trc_unregister()
{
  unregister_chrdev(lxthe.major, "trace");
  lxthe.major = 0;
#ifdef EXPORTKDUMPDEV
  if (major_kdump >= 0)
    unregister_chrdev(major_kdump, "kdump");
  major_kdump = 0;
#endif
}

/* Marshal a variable argument list into the trace buffer (writeLock is
   held, space reserved by trc_start_record).  On return *trRecLenP is the
   total record length and *stringLenP the padded length of the string
   argument, if any. */
static void _STraceArgs(int* trRecLenP, int* stringLenP, int nArgs, int pos,
                        va_list listP)
{
  int dataLen;
  int i;
  ARGTYPE tmpint;
  char *s;
  int stringLen;
  int stringPadLen;

  dataLen = 0;

  /* Handle argument lists that include a string parameter */
  if (pos >= 0 && pos < LXTRACE_MAX_FORMAT_SUBS)
  {
    /* Items (if any) preceding the string argument */
    for (i = 0; i < pos; i++)
    {
      tmpint = va_arg(listP, ARGTYPE);
      trc_append_record(&tmpint, ARGLEN);
      dataLen += ARGLEN;
    }

    /* Copy the string, making sure it does not overflow the buffer */
    s = va_arg(listP, char*);
    if (s < (char*)4096) /* bad address */
    {
      printk("_STrace: bad address 0x%X hook 0x%X\n", s,
             lxthe.hdrP->trHook);
      s = "";
    }
    stringLen = strlen(s);
    stringLen = MIN(stringLen,
                    LXTRACE_MAX_DATA - sizeof(trc_datahdr_t) -
                    (nArgs*ARGLEN) - 1 - (ARGLEN-1));
    trc_append_record(s,
                      stringLen);
    /* Pad the string to an ARGLEN boundary (always at least one pad byte,
       which also provides room for a terminator). */
    stringPadLen = ARGLEN - (stringLen%ARGLEN);
    trc_append_record(stringPadding, stringPadLen);
    *stringLenP = stringLen + stringPadLen;
    dataLen += stringLen + stringPadLen;

    /* Append items following string argument */
    for (i = pos; i < nArgs; i++)
    {
      tmpint = va_arg(listP, ARGTYPE);
      trc_append_record(&tmpint, ARGLEN);
      dataLen += ARGLEN;
    }
  }
  else /* !IS_SFORMAT */
  {
    /* Place the fixed parameters in the temporary trace buffer */
    for (i = 0; i < nArgs; i++)
    {
      tmpint = va_arg(listP, ARGTYPE);
      trc_append_record(&tmpint, ARGLEN);
      dataLen += ARGLEN;
    }
    *stringLenP = 0;
  }

  /* Append the float argument */
  if (pos == _TR_FORMAT_F)
  {
    /* Although the argument is really a double, don't tell the compiler,
       so that it will not generate code using floating point hardware
       that is not supposed to be used in the kernel. */
    /* double tmpdbl = va_arg(listP, double); */
    unsigned long long tmpdbl = va_arg(listP, unsigned long long);
    trc_append_record(&tmpdbl, sizeof(tmpdbl));
    dataLen += sizeof(tmpdbl);
  }

  *trRecLenP = sizeof(trc_datahdr_t) + dataLen;
  /* DBGASSERT(*trRecLenP <= LXTRACE_MAX_DATA); */
}

/* Non-blocking trace entry point: if the trace buffers are full the record
   is dropped (counted in nLost) rather than waiting for the daemon. */
void _STraceNB(int hookword, int nArgs, int pos, ...)
{
  trc_datahdr_t hdr;
  int recLen;
  int rc;
  va_list listP;
  int trRecLen;
  int stringLen;

  /* Trace calls from interrupt level are not supported.  If anybody needs
     them, changing writeLock to use spin_lock_irqsave should be all that
     is needed to allow non-blocking traces to work. */
  if (in_interrupt())
    return;

  if (!isTraced(hookword))
    return;

  /* Test for trace formats that aren't supported yet */
  if ((pos == _TR_FORMAT_I) && (nArgs > LXTRACE_MAX_FORMAT_SUBS))
  {
#ifdef DBGASSERTS
    printk("_STrace: too many arguments (hook %X)\n", hookword);
#endif /* DBGASSERTS */
    return;
  }

  /* Build a data header and append it to the trace file.  If there is a
     string, the length is not yet known, so use the maximum.  It will be
     patched to the correct value later.
*/
  hdr.trHook = hookword;
  hdr.trNArgs = nArgs;
  hdr.trSPos = pos;
  hdr.trSLen = 0;   /* invalid if there is a string; fix below */
  if (pos >= 0 && pos < LXTRACE_MAX_FORMAT_SUBS)
    recLen = LXTRACE_MAX_DATA;
  else
  {
    recLen = sizeof(hdr) + nArgs*ARGLEN;
    /* NOTE(review): _STraceArgs appends the float argument as 8 bytes, but
       only ARGLEN (4 on 32-bit builds) is reserved here -- confirm the
       reservation is sufficient on 32-bit kernels. */
    if (pos == _TR_FORMAT_F)
      recLen += ARGLEN;
  }

  rc = trc_start_record(&hdr, recLen, true);

  /* If the header was successfully written, collect arguments directly
     into the trace buffer */
  if (rc == 0)
  {
    va_start(listP, pos);
    _STraceArgs(&trRecLen, &stringLen, nArgs, pos, listP);
    va_end(listP);

    /* Patch the string and record lengths now that the string has been
       copied */
    lxthe.hdrP->trSLen = stringLen;
    lxthe.tHdrP->trLength = trRecLen;

    /* Trace record complete */
    trc_end_record();
  }
}

/* Blocking trace entry point: marshals the argument list into the trace
   buffer, waiting for buffer space if necessary. */
void _STrace(int hookword, int nArgs, int pos, ...)
{
  trc_datahdr_t hdr;
  int recLen;
  int rc;
  va_list listP;
  int trRecLen;
  int stringLen;

  /* Trace calls from interrupt level are not supported.  If anybody needs
     them, changing writeLock to use spin_lock_irqsave should be all that
     is needed to allow non-blocking traces to work. */
  if (in_interrupt())
    return;

  if (!isTraced(hookword))
    return;

  /* Test for trace formats that aren't supported yet */
  if ((pos == _TR_FORMAT_I) && (nArgs > LXTRACE_MAX_FORMAT_SUBS))
  {
#ifdef DBGASSERTS
    printk("_STrace: too many arguments (hook %X)\n", hookword);
#endif /* DBGASSERTS */
    return;
  }

  /* Build a data header and append it to the trace file.  If there is a
     string, the length is not yet known, so use the maximum.  It will be
     patched to the correct value later.
*/
  hdr.trHook = hookword;
  hdr.trNArgs = nArgs;
  hdr.trSPos = pos;
  hdr.trSLen = 0;   /* invalid if there is a string; fix below */
  if (pos >= 0 && pos < LXTRACE_MAX_FORMAT_SUBS)
    recLen = LXTRACE_MAX_DATA;
  else
  {
    recLen = sizeof(hdr) + nArgs*ARGLEN;
    if (pos == _TR_FORMAT_F)
      recLen += ARGLEN;
  }

  rc = trc_start_record(&hdr, recLen, false);

  /* If the header was successfully written, collect arguments directly
     into the trace buffer */
  if (rc == 0)
  {
    va_start(listP, pos);
    _STraceArgs(&trRecLen, &stringLen, nArgs, pos, listP);
    va_end(listP);

    /* Patch the string and record lengths now that the string has been
       copied */
    lxthe.hdrP->trSLen = stringLen;
    lxthe.tHdrP->trLength = trRecLen;

    /* Trace record complete */
    trc_end_record();
  }
}

/* Non-blocking printf-style trace: formats the message directly into the
   trace buffer; the record is dropped if the buffers are full. */
void _XTraceNB(int hookword, char *fmt, ...)
{
  trc_datahdr_t hdr;
  int rc;
  va_list vargs;
  int stringLen;

  /* Trace calls from interrupt level are not supported.  If anybody needs
     them, changing writeLock to use spin_lock_irqsave should be all that
     is needed to allow non-blocking traces to work. */
  if (in_interrupt())
    return;

  if (!isTraced(hookword))
    return;

  /* Build a data header and append it to the trace file.  Since the
     length is not yet known, use the maximum.  It will be patched to the
     correct value later. */
  hdr.trHook = hookword;
  hdr.trNArgs = 0;
  hdr.trSPos = _TR_FORMAT_X;
  hdr.trSLen = -1;   /* invalid; fix below */

  rc = trc_start_record(&hdr, LXTRACE_MAX_DATA, true);

  /* If the header was successfully written, format the string directly
     into the trace buffer */
  if (rc == 0)
  {
    va_start(vargs, fmt);
    stringLen = vsnprintf(lxthe.writeBuf.nextP,
                          LXTRACE_MAX_DATA-sizeof(trc_datahdr_t),
                          fmt, vargs) + 1;
    va_end(vargs);
    if (stringLen > LXTRACE_MAX_DATA-sizeof(trc_datahdr_t))
    {
      printk("_XTraceNB: argument too long. "
             "len=%d max=%d hook=0x%X\n",
             stringLen, LXTRACE_MAX_DATA-sizeof(trc_datahdr_t)-1,
             hookword);
      stringLen = LXTRACE_MAX_DATA-sizeof(trc_datahdr_t);
    }

    /* Patch the string and record lengths now that vsnprintf has
       calculated the length that it formatted */
    lxthe.hdrP->trSLen = ((stringLen+ARGLEN-1)/ARGLEN)*ARGLEN;
    lxthe.tHdrP->trLength = sizeof(hdr) + lxthe.hdrP->trSLen;

    /* Advance pointer into trace buffer by the length of the string just
       appended */
    lxthe.writeBuf.nextP += lxthe.hdrP->trSLen;

    /* Trace record complete */
    trc_end_record();
  }
}

/* Blocking printf-style trace: formats the message directly into the
   trace buffer, waiting for buffer space if necessary. */
void _XTrace(int hookword, char *fmt, ...)
{
  trc_datahdr_t hdr;
  int rc;
  va_list vargs;
  int stringLen;

  /* Trace calls from interrupt level are not supported.  If anybody needs
     them, changing writeLock to use spin_lock_irqsave should be all that
     is needed to allow non-blocking traces to work. */
  if (in_interrupt())
    return;

  if (!isTraced(hookword))
    return;

  /* Build a data header and append it to the trace file.  Since the
     length is not yet known, use the maximum.  It will be patched to the
     correct value later. */
  hdr.trHook = hookword;
  hdr.trNArgs = 0;
  hdr.trSPos = _TR_FORMAT_X;
  hdr.trSLen = -1;   /* invalid; fix below */

  rc = trc_start_record(&hdr, LXTRACE_MAX_DATA, false);

  /* If the header was successfully written, format the string directly
     into the trace buffer */
  if (rc == 0)
  {
    va_start(vargs, fmt);
    stringLen = vsnprintf(lxthe.writeBuf.nextP,
                          LXTRACE_MAX_DATA-sizeof(trc_datahdr_t),
                          fmt, vargs) + 1;
    va_end(vargs);
    if (stringLen > LXTRACE_MAX_DATA-sizeof(trc_datahdr_t))
    {
      printk("_XTrace: argument too long. "
             "len=%d max=%d hook=0x%X\n",
             stringLen, LXTRACE_MAX_DATA-sizeof(trc_datahdr_t)-1,
             hookword);
      stringLen = LXTRACE_MAX_DATA-sizeof(trc_datahdr_t);
    }

    /* Patch the string and record lengths now that vsnprintf has
       calculated the length that it formatted */
    lxthe.hdrP->trSLen = ((stringLen+ARGLEN-1)/ARGLEN)*ARGLEN;
    lxthe.tHdrP->trLength = sizeof(hdr) + lxthe.hdrP->trSLen;

    /* Advance pointer into trace buffer by the length of the string just
       appended */
    lxthe.writeBuf.nextP += lxthe.hdrP->trSLen;

    /* Trace record complete */
    trc_end_record();
  }
}

/* Module initialization: reset the trace state and register /dev/trace. */
MY_INIT_FUNCTION()
{
  trc_init();
  return trc_register();
}

/* Module cleanup: unregister the device(s) and free the trace buffers. */
MY_EXIT_FUNCTION()
{
  trc_unregister();
  trc_term();
}

DEFINE_MODULE_INIT();
DEFINE_MODULE_EXIT();

#endif /* KTRACE */

#endif /* GPFS_PRINTF */