/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/* @(#)22 1.109.1.3 src/avs/fs/mmfs/ts/kernext/gpl-linux/ss.c, mmfs, avs_rgpfs24, rgpfs24s008a 11/30/06 16:55:18 */
/*
 * Implementation of shared segment for GPFS daemon and GPFS kernel code.
 *
 * Contents:
 *   exp_procfs_version
 *   gpfs_proc_export_init
 *   gpfs_proc_export_term
 *   ss_open
 *   ss_release
 *   ss_fs_read
 *   ss_fs_write
 *   ss_fs_ioctl
 *   ss_init
 *   kxSaveThreadInfo
 *
 *   struct ShMemChunkDesc
 *   unprotectKernelMemory
 *   reprotectKernelMemory
 *   InitSharedMemory
 *   TermSharedMemory
 *   cxiCalcMaxSharedSegment
 *   cxiAllocSharedMemory
 *   cxiFreeSharedMemory
 *   cxiAttachSharedMemory
 *   cxiDetachSharedMemory
 *
 */

#include <Shark-gpl.h>

#include <linux/types.h>
#include <linux/version.h>
#ifndef UTS_RELEASE
#include <linux/utsrelease.h>
#endif
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/signal.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/user.h>
#include <asm/mman.h>
#include <asm/atomic.h>
#include <asm/ptrace.h>
#include <asm/ucontext.h>
#include <asm/elf.h>

#include <Logger-gpl.h>
#include <linux2gpfs.h>
#include <verdep.h>
#include <arch-gpl.h>

#include <cxiSystem.h>
#include <cxiIOBuffer.h>
#include <cxiSharedSeg.h>
#include <Trace.h>

#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
#include <asm/ioctl32.h>
#if LINUX_KERNEL_VERSION >= 2060507
long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
#endif
#endif

int
cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment);

#ifdef GPFS_ARCH_POWER
#define PKMAP_BASE (0xfe000000UL)
#define VMALLOC_END ioremap_bot
#endif

const char *gpfs_banner = "GPFS Linux kernel version " UTS_RELEASE "\n";

SETUP_MODULE_PATH_PARMS;

#ifdef PERF_STATS
int ioctl_count[MAX_SS_IOCTL_OPS];
#endif


/* Dynamically assigned major device number for the ioctl interfaces to the
   GPFS kernel modules.  This is the /dev/ss0 device. */
int GPFSIoctlMajorNumber;

/* Only allow users with write access or the superuser */
#define CHECK_PERM if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser()) \
                   { \
                     EXIT(0); \
                     return -EPERM; \
                   }

/* Vector table for all routines that can be called through ss_fs_ioctl. */
int (*ss_ioctl_op[MAX_SS_IOCTL_OPS+1])();

#ifdef SSEG_SWIZZLE_PTRS
/* virtual MM handlers for vm areas */
void ss_vm_open(struct vm_area_struct *area);
void ss_vm_close(struct vm_area_struct *area);
#if LINUX_KERNEL_VERSION < 2060000
struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address, int unused);
#else
struct page *ss_vm_nopage(struct vm_area_struct *area, unsigned long address, int *type);
#endif /* LINUX_KERNEL_VERSION < 2060000 */

static struct vm_operations_struct ss_vm_ops = {
  open:   ss_vm_open,
  close:  ss_vm_close,
  nopage: ss_vm_nopage,
};
#endif /* SSEG_SWIZZLE_PTRS */

/* Add GPFS information to the /proc file system. */
int
exp_procfs_version(char *buffer, char **start, off_t offset,
                   int length, int *eof, void *data)
{
  off_t pos = 0;
  off_t begin = 0;
  int len = 0;

  len += sprintf(buffer+len, "%s", gpfs_banner);
  *eof = 1;

  *start = buffer + (offset - begin);
  len -= (offset - begin);
  if (len > length)
    len = length;

  return len;
}

void
gpfs_proc_export_init(void)
{
  if (!proc_mkdir("fs/gpfs", 0))
    return;
  create_proc_read_entry("fs/gpfs/version", 0, 0, exp_procfs_version, NULL);
}

void
gpfs_proc_export_term(void)
{
  remove_proc_entry("fs/gpfs/version", NULL);
  remove_proc_entry("fs/gpfs", NULL);
}
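
/* Illustrative only, not compiled into the module: once
   gpfs_proc_export_init() has run, the banner published through
   exp_procfs_version can be read from user space like any other procfs
   file.  A minimal user-space sketch (error handling mostly elided): */
#if 0
#include <stdio.h>

int main(void)
{
  char line[256];
  FILE *f = fopen("/proc/fs/gpfs/version", "r");

  if (f == NULL)
    return 1;                    /* entry absent: module not loaded */
  if (fgets(line, sizeof(line), f) != NULL)
    fputs(line, stdout);         /* prints the gpfs_banner string */
  fclose(f);
  return 0;
}
#endif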

/* Open the character device used for the shared segment. */
int
ss_open(struct inode *inode, struct file *filp)
{
  TRACE2(TRACE_SHARED, 2, TRCID_SS_019,
         "ss_open: file 0x%lX inode 0x%lX\n",
         filp, inode);

  MY_MODULE_INCREMENT();

  return 0;  /* success */
}


/* Release/Close the character device used for the shared segment. */
int
ss_release(struct inode *inode, struct file *filp)
{
  TRACE1(TRACE_SHARED, 2, TRCID_SS_023,
         "ss_release: file 0x%lX\n", filp);

  MY_MODULE_DECREMENT();

  return 0;  /* success */
}

/* Map the shared segment and return the address of the first chunk allocated
   (if the buffer is big enough to hold it). */
ssize_t
ss_fs_read(struct file *file, char *buf, size_t nbytes, loff_t *ppos)
{
  struct inode *inode = file->f_dentry->d_inode;
  unsigned int minor = MINOR(inode->i_rdev);
  cxiMemoryMapping_t mapping;
  int rc;

  TRACE1(TRACE_SHARED, 2, TRCID_SS_059, "ss_fs_read: called 0x%lX\n", nbytes);
  /* BKL is not held at entry */

  if (minor != 0)
    return -ENODEV;

  /* Only allow users with write access or the superuser */
  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
    return -EPERM;

  InitMemoryMapping(&mapping);

  /* Map the shared memory */
  rc = cxiAttachSharedMemory(&mapping, true);
  if (rc)
    return -rc;

  /* If the user buffer is big enough, copy the base address of the segment
     there */
  if (nbytes >= sizeof(mapping.vaddr))
  {
    rc = cxiCopyOut((char *)&mapping.vaddr, buf, sizeof(mapping.vaddr));
    if (rc)
      return -EFAULT;
  }
  return 0;
}
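
/* Illustrative only, not compiled into the module: a user-space sketch
   of the read protocol implemented above.  A privileged process opens
   the character device (assumed here to be /dev/ss0, matching the
   comment on GPFSIoctlMajorNumber) and reads one pointer-sized value,
   which is the base address of the shared segment.  Note that the
   handler above returns 0 rather than a byte count on success, so the
   caller tests the return code, not a transfer length. */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void *gpfs_shared_base(void)
{
  void *base = NULL;
  int fd = open("/dev/ss0", O_RDWR);       /* write access required, see above */

  if (fd < 0)
    return NULL;
  if (read(fd, &base, sizeof(base)) != 0)  /* 0 == success for this driver */
    base = NULL;
  close(fd);
  return base;
}
#endif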

/* Was used for debugging. */
ssize_t
ss_fs_write(struct file *file, const char *buf, size_t nbytes, loff_t *ppos)
{
  /* Only allow users with write access or the superuser */
  if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser())
    return -EPERM;

  TRACE1(TRACE_SHARED, 0, TRCID_SS_065, "ss_fs_write: called 0x%lX\n", nbytes);
  /* BKL is not held at entry */

  return -EINVAL;
}

#ifdef PERF_STATS
int kxNoOp(int op1, int op2)
{
  int i;

  if (op1 == 1)  // reset all counters
  {
    for (i = 0; i < MAX_SS_IOCTL_OPS; i++)
      ioctl_count[i] = 0;
  }
  if (op2 > 0 && op2 < MAX_SS_IOCTL_OPS)
    return ioctl_count[op2];  // return the requested counter

  return 0;
}
#endif

#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
long ss_fs_compat_ioctl(struct file *file, unsigned int op, unsigned long kx_args)
{
  int rc;
  TRACE2(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_ENTER,
         "Entering ss_fs_compat_ioctl: called with op = %d (%s)", op, kxOp_tostring(op));

  if (ss_ioctl_op[0] != 0)
  {
    /* unlock_kernel(); */
    rc = ss_ioctl_op[0](op, kx_args);
    /* lock_kernel(); */
  }
  else
    rc = -1;

  TRACE1(TRACE_KSVFS, 9, TRCID_SS_DMAPI_COMPAT_EXIT,
         "Leaving ss_fs_compat_ioctl with rc = %d.", rc);

  return rc;
}
#endif

/* Shared segment and other ioctl calls to the kernel code. */
int
ss_fs_ioctl(struct inode *inode, struct file *file,
            unsigned int op, unsigned long kx_args)
{
  int len, rc;
  char buf[512];
  struct kxArgs args_cp;
  struct kxArgs *args = (struct kxArgs *)kx_args;

  ENTER(0);
  if (op == kxtraceit)
  {
    CHECK_PERM;

    rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
    if (rc != 0)
      goto minus1;

    len = 3;
    strncpy(buf, KERN_NOTICE, len);  // KERN_NOTICE = "<5>"
    len += sprintf(buf+len, "dp %X:%d:", cxiGetThreadId(), args_cp.arg3);

    rc = cxiCopyIn((char*)args_cp.arg2, buf+len, args_cp.arg1+1);
    if (rc != 0)
      goto minus1;

    printk("%s", buf);
    EXIT(0);
    return 0;
  }

  TRACE5(TRACE_KSVFS, 15, TRCID_SS_075,
         "ss_fs_ioctl: op %d opAddr 0x%lX args 0x%lX inode 0x%lX file 0x%lX\n",
         op, ss_ioctl_op[op], kx_args, inode, file);
  /* BKL is held at entry */

#ifdef PERF_STATS
  if (op > 0 && op < MAX_SS_IOCTL_OPS)
    ioctl_count[op]++;
#endif

  switch (op)
  {
#ifdef GPFS_ARCH_POWER
    case CoreDump:
      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;
      rc = kxCoreDump((long)args_cp.arg1, (void *)args_cp.arg2,
                      (struct ucontext *)args_cp.arg3, (char *)args_cp.arg4);
      break;
#endif
    case saveThreadInfo:
      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;
      rc = kxSaveThreadInfo(args_cp.arg1, (void *)args_cp.arg2);
      break;

    case GetPrivLevel:
      CHECK_PERM;
      rc = get_privilege_level();
      break;

    case SetPrivLevel:
      CHECK_PERM;
      rc = set_privilege_level(kx_args);
      break;

    case MapPrivate:
    {
      char *outAddr;

      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char *)&args_cp, sizeof(args_cp));
      if (rc != 0)
        goto minus1;

      rc = kxMapPrivate((char *)args_cp.arg1, (unsigned long)args_cp.arg2,
                        (unsigned long)args_cp.arg3, &outAddr);
      if (rc == 0)
        rc = cxiCopyOut((char*)&outAddr, (char*)args_cp.arg4, sizeof(char*));

      if (rc != 0)
        rc = -EFAULT;
      break;
    }

    case GetTimeOfDay:
    {
      cxiTimeStruc_t ts;

      rc = cxiGetTOD(&ts);
      if (rc == 0)
        rc = cxiCopyOut((char*)&ts, (char*)kx_args, sizeof(cxiTimeStruc_t));

      if (rc != 0)
        rc = -EFAULT;
      break;
    }

#ifdef PERF_STATS
    case noOp:
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        break;
      if (args_cp.arg1 == 0 && args_cp.arg2 == 0)
      { /* fall through to the real noop kxNoOp in ssioctl.C */ }
      else
      {
        rc = kxNoOp((int)args_cp.arg1, (int)args_cp.arg2);
        break;
      }
#endif

    default:
      TRACE1(TRACE_KSVFS, 9, TRCID_SS_077,
             "ss_fs_ioctl: invoking ss_ioctl_op %d\n", op);
      if (ss_ioctl_op[0] != 0)
      {
        unlock_kernel();
        rc = ss_ioctl_op[0](op, kx_args);
        lock_kernel();
      }
      else
        goto minus1;
      break;
  }
  EXIT(0);
  return rc;

minus1:
  EXIT(0);
  return -1;
}
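
/* Illustrative only, not compiled into the module: the simplest of the
   operations above, GetTimeOfDay, driven from user space.  This op
   passes a result pointer directly as the ioctl argument; the other ops
   marshal their parameters through struct kxArgs as shown in the cases
   above.  The op name and cxiTimeStruc_t come from the GPFS headers. */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int gpfs_get_tod(cxiTimeStruc_t *tsP)
{
  int rc;
  int fd = open("/dev/ss0", O_RDWR);

  if (fd < 0)
    return -1;
  rc = ioctl(fd, GetTimeOfDay, (unsigned long)tsP);
  close(fd);
  return rc;
}
#endif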

#ifdef SSEG_SWIZZLE_PTRS
extern int ss_fs_mmap(struct file *file, struct vm_area_struct *vma);
#endif

/* Operations for the device that are not in the following list come from
   the bare device. */
struct file_operations ss_fops =
{
  read:    ss_fs_read,
  write:   ss_fs_write,
  ioctl:   ss_fs_ioctl,
#ifdef SSEG_SWIZZLE_PTRS
  mmap:    ss_fs_mmap,
#endif
  open:    ss_open,
  release: ss_release,
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION >= 2061600
  compat_ioctl: ss_fs_compat_ioctl,
#endif
};

#ifdef API_32BIT
#ifdef GPFS_ARCH_X86_64

/* Note that these 32-bit ioctl functions are not needed for ia64; these
   routines just call the standard 64-bit ioctl. */
static int tsstat32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == Stat);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsfstat32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == Fstat);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsfattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == Fattr);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsfsattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == FsAttr);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsattr32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == Attr);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsgetacl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == GetACL);
  return sys_ioctl(fd, cmd, ptr);
}
static int tsputacl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == PutACL);
  return sys_ioctl(fd, cmd, ptr);
}
#ifdef DMAPI
static int kxDmApiCall32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == DmApiCall);
  return sys_ioctl(fd, cmd, ptr);
}
#endif /* DMAPI */

#ifdef GPFS_QUOTACTL
static int kxQuotactl32(unsigned fd, unsigned cmd, unsigned long ptr, struct file *filp)
{
  DBGASSERT(cmd == Quotactl);
  return sys_ioctl(fd, cmd, ptr);
}
#endif
#endif /* GPFS_ARCH_X86_64 */

/* Most 64-bit architectures have a separate interface where 32-bit ioctl
   command numbers / routines must be registered (not necessary for ia64).
   At some point we may need to modify our command numbers (currently we
   use kxOps for the number field) to use both the type / magic number
   and number field (i.e., _IOWR('G', ) instead of the current implicit
   _IOWR(0, )) if a command number collision occurs between GPFS and a new
   device driver.  The 32-bit ioctl implementation only
   uses a hash table (and not a driver-specific function pointer like ioctl
   from file_operations ... something like ioctl32 would be ideal, or just
   passing this to sys_ioctl as is done on the ia64 platform),
   so a collision may occur here someday.  Currently not very many drivers
   provide 32-bit ioctl calls, and only the entries from 0x0 to 0x1F are used
   with magic number 0, i.e. _IOWR(0,0) to _IOWR(0,1F), while our external API
   commands are in the range of 53-59 (0x35-0x3b) ... although the limited
   ioctl32 hash table size actually makes collisions much more likely.
   Note that /usr/src/linux/Documentation/ioctl-number.txt keeps track of
   the registered blocks used by drivers. */
void
gpfs_reg_ioctl32()
{
  int rc = 0;
  /* TO DO: eventually add 32-bit API for PPC64? */
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
  rc = register_ioctl32_conversion(Stat, tsstat32);
  rc |= register_ioctl32_conversion(Fstat, tsfstat32);
  rc |= register_ioctl32_conversion(Fattr, tsfattr32);
  rc |= register_ioctl32_conversion(FsAttr, tsfsattr32);
  rc |= register_ioctl32_conversion(Attr, tsattr32);
  rc |= register_ioctl32_conversion(GetACL, tsgetacl32);
  rc |= register_ioctl32_conversion(PutACL, tsputacl32);
#ifdef DMAPI
  rc |= register_ioctl32_conversion(DmApiCall, kxDmApiCall32);
#endif /* DMAPI */
#ifdef GPFS_QUOTACTL
  rc |= register_ioctl32_conversion(Quotactl, kxQuotactl32);
#endif /* GPFS_QUOTACTL */

  if (rc)
    printk("gpfs_reg_ioctl32: Error in registering ioctl32\n");

#endif /* GPFS_ARCH_X86_64 */
}
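
/* Illustrative only: if a collision ever forced a move away from the
   implicit _IOWR(0, n) numbering described above, the external API
   commands could be encoded with an explicit magic number along these
   lines.  The 'G' magic and the GPFS_IOC_* names are assumptions for
   this sketch, not existing GPFS definitions: */
#if 0
#include <linux/ioctl.h>

#define GPFS_IOC_MAGIC 'G'                                  /* hypothetical */
#define GPFS_IOC_STAT  _IOWR(GPFS_IOC_MAGIC, 0x35, struct kxArgs)
#define GPFS_IOC_FSTAT _IOWR(GPFS_IOC_MAGIC, 0x36, struct kxArgs)
#endif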

void
gpfs_unreg_ioctl32()
{
  int rc = 0;
  /* TO DO: eventually add 32-bit API for PPC64? */
#if GPFS_ARCH_X86_64 && LINUX_KERNEL_VERSION < 2061600
  rc = unregister_ioctl32_conversion(Stat);
  rc |= unregister_ioctl32_conversion(Fstat);
  rc |= unregister_ioctl32_conversion(Fattr);
  rc |= unregister_ioctl32_conversion(FsAttr);
  rc |= unregister_ioctl32_conversion(Attr);
  rc |= unregister_ioctl32_conversion(GetACL);
  rc |= unregister_ioctl32_conversion(PutACL);
#ifdef DMAPI
  rc |= unregister_ioctl32_conversion(DmApiCall);
#endif /* DMAPI */
#ifdef GPFS_QUOTACTL
  rc |= unregister_ioctl32_conversion(Quotactl);
#endif /* GPFS_QUOTACTL */

  if (rc)
    printk("gpfs_unreg_ioctl32: Error in unregistering ioctl32\n");

#endif /* GPFS_ARCH_X86_64 */
}

#endif /* API_32BIT */

/* Initialization of the character device used for the shared segment
   interfaces and other ioctl calls to the kernel code. */
int
ss_init()
{
  int major;

  GPFSIoctlMajorNumber = 0;
  major = register_chrdev(0, "ss", &ss_fops);

  if (major < 0)
  {
    TRACE1(TRACE_SHARED, 2, TRCID_SS_081,
           "ss_init: unable to get ss0 major rc %d\n", major);
    return -1;
  }

  GPFSIoctlMajorNumber = major;
  TRACE1(TRACE_SHARED, 2, TRCID_SS_083,
         "ss_init: module loaded ss0 major %d\n", GPFSIoctlMajorNumber);

  return 0;
}
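
/* Because register_chrdev is called with major number 0 above, the major
   is assigned dynamically, and the matching device node must exist before
   the daemon can open the device.  Illustrative only -- in practice the
   GPFS startup scripts take care of this -- the node could be created
   from user space with something like:

       major=$(awk '$2 == "ss" {print $1}' /proc/devices)
       mknod /dev/ss0 c $major 0
*/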

/* Management of storage shared between the GPFS daemon and the mmfslinux
   kernel module.  Chunks of memory are allocated on demand by the
   kxAllocSharedKernelMemory call, and are then suballocated by GPFS.  To
   allow free use of pointers, all of this memory is addressed using the
   same virtual addresses whether it is being accessed from the daemon
   process or from a process in kernel mode.  Setting up this addressability
   requires modifying the protection bits in the Linux page table.  For
   historical reasons dating to the implementation of GPFS on AIX, the
   storage shared between the GPFS daemon process and the kernel is
   frequently referred to collectively as "the shared segment".
   Note that when pointer swizzling is utilized (via SSEG_SWIZZLE_PTRS), the
   virtual address for the daemon process and kernel is no longer common;
   the page tables are not fiddled with in this situation and a page fault
   handler is utilized instead. */

/* Description of each allocated chunk.  Allocated chunks are linked
   together from ChunkListHead. */
struct ShMemChunkDesc
{
  struct list_head chunkList;  /* list linkage */
  char* vaddrP;                /* virtual address of beginning of chunk */
  int len;                     /* length of chunk */
#ifdef SSEG_SWIZZLE_PTRS
  char* usrvaddrP;             /* corresponding user address from mmap */
#endif
};
struct list_head ChunkListHead;

/* Number of chunks and total size of all chunks */
int NVMallocChunks;
int TotalVMallocBytes;

/* Address of the first chunk allocated.  This value gets returned by
   cxiMapAllSharedKernelMemory as the base of the GPFS shared segment. */
char* FirstVMallocChunkP;

/* Maximum total bytes to allocate, as computed by cxiCalcMaxSharedSegment */
int MaxTotalVMallocBytes;

/* Beginning and end of the area of kernel virtual memory used by
   vmalloc/vfree */
UIntPtr VMallocStart;
UIntPtr VMallocEnd;

/* Minimum size of an allocated chunk */
#define MIN_VMALLOC_CHUNK PAGE_SIZE

/* Lock guarding the chunk list */
spinlock_t ChunkListLock;

/* Pointer to slab allocator for ShMemChunkDesc's */
struct kmem_cache* ChunkCacheP = NULL;

/* Make a range of kernel memory addressable by the current process while
   in user mode */
#ifndef SSEG_SWIZZLE_PTRS
static void
unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  TRACE3N(TRACE_SHARED, 9, TRCID_UNPROT_ENTER,
          "unprotectKernelMemory: vaddr 0x%lX len %d allocating %d\n",
          vaddr, len, allocating);
  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  Since this is a kernel address, page table entries will
       persist once they have been created, because the Linux kernel is not
       pageable. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = PTE_OFFSET(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, set both the pte, and pmd/pgd to allow mmfsd process-level
     * access to the area.  Since each process has its own page directory
     * (pgd), an attempt to access one of these unprotected pages will be
     * blocked by the protection bit in that process' pgd.  If another process
     * requires access to shared kernel pages, only its pgd need be updated.
     * pmd_t and pte_t are the same size and definition, so pte_rdprotect()
     * (the only available macro that hides differences between SuSE/Red Hat)
     * is used.
     */
    DBGASSERT(sizeof(pte_t) == sizeof(pmd_t));
    set_pte((pte_t *)pmdP, pte_mkread((*(pte_t *)pmdP)));
    if (allocating)
      set_pte(pteP, pte_mkread(*pteP));

    PTE_UNMAP(pteP);

#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
    // XXX Not implemented
    // pmd_val(*pmdP) = pmd_val(*pmdP) | _PAGE_USER;
    // if (allocating)
    //   set_pte(pteP, pte_mkread(*pteP));
#elif defined(GPFS_ARCH_IA64)
    /* On IA64, set the protection level of the page when it is created.
     * Nothing to do when allowing access from another process except to
     * set the privilege level of the process.
     */
    if (allocating)
      pte_val(*pteP) = pte_val(*pteP) | PRIVILEGE_FLAGS;
#endif

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change.
   */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
}
#else
static void
unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
{
  /* do nothing when pointer swizzling */
  return;
}
#endif /* !SSEG_SWIZZLE_PTRS */

/* Make a range of kernel memory no longer addressable by user processes
   while in user mode.  Called just before freeing the memory. */
#ifndef SSEG_SWIZZLE_PTRS
static void
reprotectKernelMemory(char* vaddrP, int len)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  ENTER(0);
  TRACE2(TRACE_SHARED, 4, TRCID_REPROT_ENTER,
         "reprotectKernelMemory: vaddr 0x%lX len %d\n",
         vaddr, len);
  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  Since this is a kernel address, page table entries will
       persist once they have been created, because the Linux kernel is not
       pageable. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = PTE_OFFSET(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, reset the pte and pmd to disallow process-level access. */
    set_pte((pte_t *)pmdP, pte_rdprotect((*(pte_t *)pmdP)));  // see unprotect
    set_pte(pteP, pte_rdprotect(*pteP));

#elif defined(GPFS_ARCH_POWER) || defined(GPFS_ARCH_X86_64)
    // XXX??? not implemented

#elif defined(GPFS_ARCH_IA64)
    /* On IA64, reset the protection level of the page. */
    pte_val(*pteP) = (pte_val(*pteP) & ~_PAGE_PL_MASK) | _PAGE_PL_0;
#endif

    PTE_UNMAP(pteP);

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change.
   */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
  EXIT(0);
}
#else
static void
reprotectKernelMemory(char* vaddrP, int len)
{
  /* do nothing when pointer swizzling */
  return;
}
#endif /* !SSEG_SWIZZLE_PTRS */

/* Initialize the code that manages shared memory */
void
InitSharedMemory()
{
  ENTER(0);
  TRACE2(TRACE_SHARED, 1, TRCID_SHKERN_INIT,
         "InitSharedMemory called. VMALLOC_START 0x%lX VMALLOC_END 0x%lX\n",
         VMALLOC_START, VMALLOC_END);

  VMallocStart = (UIntPtr)VMALLOC_START;
  VMallocEnd = (UIntPtr)VMALLOC_END;

  spin_lock_init(&ChunkListLock);

  /* Create a slab allocator for ShMemChunkDesc objects */
  ChunkCacheP = kmem_cache_create("ShMemChunkDesc",
                                  sizeof(struct ShMemChunkDesc),
                                  0 /* offset */,
                                  0 /* flags */,
                                  NULL /* ctor */,
                                  NULL /* dtor */);
  if (ChunkCacheP == NULL)
    cxiPanic("Cannot create ShMemChunkDesc cache\n");

  /* Empty the chunk list */
  INIT_LIST_HEAD(&ChunkListHead);
  EXIT(0);
}

/* Compute how large the total size of the shared segment
   is allowed to grow, based on a desired size.  A value of 0 for
   desiredBytes means to compute the default maximum size. */
int
cxiCalcMaxSharedSegment(int desiredBytes, int* actualBytesP)
{
  Int64 physMemSize;
  Int64 effPhysMemSize;
  UIntPtr minAllowedSize = 16*1024*1024;
  UIntPtr maxAllowedSize = MAX_SSEG_MAPPINGS*1024*1024;
  UIntPtr actualBytes;
  char* p;
  UIntPtr vmUsed;
  UIntPtr vmRegionReserved;
  UIntPtr maxBytes;

  /* If an explicit number of desired bytes was given, use that value.
     Otherwise, if no number of desired bytes was given (or a value
     smaller than the minimum possible was specified), compute the size based
     on the size of real memory.  The size computed is a fixed fraction of
     real memory (only the first 2G on i386). */
  ENTER(0);
  physMemSize = (Int64)num_physpages * PAGE_SIZE;
#ifdef GPFS_ARCH_I386
  effPhysMemSize = MIN(physMemSize, (Int64)0x80000000);
#else
  effPhysMemSize = physMemSize;
#endif

  if (desiredBytes > 0)
    actualBytes = desiredBytes;
  else
    actualBytes = effPhysMemSize/16;

  actualBytes = MAX(actualBytes, minAllowedSize);

  /* Compute an approximation of how many bytes are already used in the
     vmalloc region.  The variables needed to compute this exactly are not
     exported from the kernel.  If we vmalloc a single page area and see how
     far the allocated area is from the beginning of the vmalloc region, we
     have at least a lower bound on the amount of vmalloc storage already
     used.  If there have been no vfrees, this will yield an accurate
     answer. */
  p = vmalloc(PAGE_SIZE);
  if (p == NULL)
    vmUsed = VMallocEnd - VMallocStart;
  else
  {
    vmUsed = (UIntPtr)p - VMallocStart;
    vfree(p);
  }

  /* Make sure the actual maximum fits within the vmalloc region, taking
     into account memory already used and leaving a reserved area for other
     vmallocs. */
  vmRegionReserved = 16*1024*1024;
  maxBytes = (VMallocEnd-VMallocStart) - (vmUsed+vmRegionReserved);
  actualBytes = MIN(actualBytes, maxBytes);

  /* Make sure the actual maximum does not exceed the maximum possible */
  actualBytes = MIN(actualBytes, maxAllowedSize);

  /* Make sure the actual maximum is less than half of real memory */
  actualBytes = MIN(actualBytes, effPhysMemSize/2);

  /* Round the actual maximum down to a multiple of the page size */
  actualBytes = (actualBytes/PAGE_SIZE) * PAGE_SIZE;

  /* If the actual maximum is less than the minimum allowed, return 0 */
  if (actualBytes < minAllowedSize)
    actualBytes = 0;

  /* Return result */
  TRACE5(TRACE_SHARED, 1, TRCID_CALC_MAX_SHARED,
         "cxiCalcMaxSharedSegment: actualBytes 0x%lX desiredBytes %d "
         "physMemSize 0x%lX vmUsed 0x%lX maxBytes 0x%lX\n",
         actualBytes, desiredBytes, physMemSize, vmUsed, maxBytes);

  *actualBytesP = (int)actualBytes;
  MaxTotalVMallocBytes = (int)actualBytes;

  EXIT(0);
  return 0;
}
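
/* Worked example with illustrative numbers: on an x86_64 machine with
   4 GiB of memory and desiredBytes == 0, the starting point is
   effPhysMemSize/16 = 256 MiB.  That value is then clamped to the free
   room in the vmalloc region (less the 16 MiB vmRegionReserved), to
   maxAllowedSize (MAX_SSEG_MAPPINGS MiB), and to effPhysMemSize/2 =
   2 GiB, and finally rounded down to a page multiple -- so the result
   is 256 MiB unless the vmalloc region is unusually small or crowded. */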

/* Acquire additional kernel memory that is mapped to user space when
 * using SSEG_SWIZZLE_PTRS (different virtual address between kernel and
 * daemon); otherwise allocated memory uses the same virtual address
 * for both kernel code and the GPFS daemon.  Will get at least minBytes.
 * Returns the starting virtual address of the area and its actual length.
 */
int
cxiAllocSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  int rc = 0;
  int code = 0;
  char *vaddrP;
  struct ShMemChunkDesc* chunkP = NULL;
  int minBytes = mappingP->kBytes * 1024;
  int actualBytes;
  pgprot_t prot;
#if defined(GPFS_ARCH_X86_64) && !defined(SSEG_SWIZZLE_PTRS)
  pml4_t* pml4P;
#endif

  /* On Linux we only allocate the shared segment in this manner */
  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Compute the actual number of bytes to allocate */
  if (minBytes <= MIN_VMALLOC_CHUNK)
    actualBytes = MIN_VMALLOC_CHUNK;
  else
    actualBytes = ((minBytes + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;

  TRACE2(TRACE_SHARED, 5, TRCID_ALLOC_SHARED_VMALLOC,
         "cxiAllocSharedMemory: vmalloc %d minBytes %d\n",
         actualBytes, minBytes);

  /* Return failure if this allocation would put us over the limit */
  if (TotalVMallocBytes + actualBytes > MaxTotalVMallocBytes)
  {
    code = 1;
    rc = -ENOMEM;
    goto xerror;
  }

  /* Get a descriptor for the memory to be allocated */
  chunkP = (struct ShMemChunkDesc*) kmem_cache_alloc(ChunkCacheP, GFP_KERNEL);
  if (chunkP == NULL)
  {
    code = 2;
    rc = -ENOMEM;
    goto xerror;
  }

  /* Allocate memory
   * ?? Instead of calling vmalloc here, we could also do something like:
   *   pgprot_t prot;
   *   prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
   *   vaddrP = __vmalloc(actualBytes, GFP_KERNEL | __GFP_HIGHMEM, prot);
   *
   * This is an expansion of the vmalloc inline function, with _PAGE_USER
   * added to the protection bits so that the PTE entries will already be set
   * correctly.  However, a call to unprotectKernelMemory would still be
   * needed to set the protection bits in the PMD entries.
   *
   * There is also the possibility here of using __GFP_HIGHMEM instead of
   * GFP_KERNEL on machines with sufficient high memory.  The storage
   * allocated here will never be used as I/O buffers, so high memory would
   * be a good place to put it.  This would give I/O buffers a greater chance
   * of being allocated below 1G, reducing the need for bounce buffers to do
   * I/O.
   */
#ifndef SSEG_SWIZZLE_PTRS

#if defined(GPFS_ARCH_POWER)
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#elif defined(GPFS_ARCH_X86_64)
#define __pml4(x) ((pml4_t) { (x) } )
  pml4P = pml4_offset_k(VMALLOC_START);
  set_pml4(pml4P, __pml4(pml4_val(*pml4P) | _PAGE_USER));
#undef __pml4
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER | _PAGE_GLOBAL);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#elif defined(GPFS_ARCH_PPC64)
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#else
  vaddrP = vmalloc(actualBytes);
#endif

#else
  vaddrP = vmalloc(actualBytes);
#endif /* !SSEG_SWIZZLE_PTRS */
  if (vaddrP == NULL)
  {
    code = 3;
    rc = -ENOMEM;
    goto xerror;
  }

#ifdef MALLOC_DEBUG
  MallocDebugNew(vaddrP, actualBytes, 3);
#endif

  spin_lock(&ChunkListLock);

  NVMallocChunks += 1;
  TotalVMallocBytes += actualBytes;

  /* Remember the address of the first chunk allocated */
  if (NVMallocChunks == 1)
    FirstVMallocChunkP = vaddrP;

  /* Fill in the chunk descriptor and add it to the proper list */
  chunkP->vaddrP = vaddrP;
  chunkP->len = actualBytes;
#ifdef SSEG_SWIZZLE_PTRS
  chunkP->usrvaddrP = 0;
#endif
  list_add(&chunkP->chunkList, &ChunkListHead);

  spin_unlock(&ChunkListLock);

  /* Make the memory just allocated addressable by the current process */
  unprotectKernelMemory(vaddrP, actualBytes, true);

  /* Return results */
  mappingP->vaddr = vaddrP;
  mappingP->kBytes = actualBytes / 1024;
#ifdef SSEG_SWIZZLE_PTRS
  mappingP->kvaddr = vaddrP;
  /* mappingP->vaddr is reset to the proper user va in kxAllocSharedMemory */
#endif

xerror:
  if (rc)
  {
    InitMemoryMapping(mappingP);

    if (chunkP)
      kmem_cache_free(ChunkCacheP, (void*)chunkP);
  }

  TRACE4(TRACE_SHARED, 1, TRCID_ALLOC_SHARED_EXIT,
         "cxiAllocSharedMemory: vaddr 0x%lX kBytes %d rc %d code %d\n",
         mappingP->vaddr, mappingP->kBytes, rc, code);
  EXIT(0);
  return rc;
}

#ifdef SSEG_SWIZZLE_PTRS
/* Record the user address that is associated with the kernel vmalloc
   address (vmalloc chunk for shared segment).  This is needed later on
   by the page fault handler.
   This routine is called after allocating the chunk and determining the
   corresponding user address (used by all user processes mmap'ing
   this specific shared segment chunk).
 */
int
cxiRecordSharedMemory(cxiMemoryMapping_t *mappingP)
{
  int found = 0;
  struct ShMemChunkDesc* chunkP = NULL;
  struct list_head* p;

  ENTER(0);
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    if (chunkP->vaddrP == mappingP->kvaddr)
    {
      chunkP->usrvaddrP = mappingP->vaddr;
      found = 1;
      break;
    }
  }
  spin_unlock(&ChunkListLock);

  EXIT(0);
  if (!found)
    return -1;
  else
    return 0;
}

/* Obtain any necessary kernel information for initializing
   pointer swizzling; currently just grabs vmalloc range info. */
int
cxiInitPtrSwizzling(UIntPtr *vmallocStartP, UIntPtr *vmallocEndP)
{
  ENTER(0);

  *vmallocStartP = (UIntPtr)VMALLOC_START;
  *vmallocEndP = (UIntPtr)VMALLOC_END;

  EXIT(0);
  return 0;
}
#endif

/* Unmap and deallocate all shared segment memory */
int
cxiFreeSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* firstP;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of multi page chunks.  Free each one and its
   * associated chunk descriptor.  Drop the list lock while freeing
   * storage.
   */
  spin_lock(&ChunkListLock);

  while (!list_empty(&ChunkListHead))
  {
    firstP = ChunkListHead.next;
    list_del(firstP);

    chunkP = list_entry(firstP, struct ShMemChunkDesc, chunkList);
    NVMallocChunks -= 1;
    TotalVMallocBytes -= chunkP->len;

    spin_unlock(&ChunkListLock);
    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);

    TRACE2(TRACE_SHARED, 4, TRCID_FREEALL_VFREE,
           "cxiFreeSharedMemory: vaddrP 0x%lX chunkP 0x%lX\n",
           chunkP->vaddrP, chunkP);

    vfree(chunkP->vaddrP);
#ifdef MALLOC_DEBUG
    MallocDebugDelete(chunkP->vaddrP);
#endif

    kmem_cache_free(ChunkCacheP, (void*)chunkP);
    spin_lock(&ChunkListLock);
  }
  FirstVMallocChunkP = NULL;
  spin_unlock(&ChunkListLock);

  InitMemoryMapping(mappingP);

  EXIT(0);
  return 0;
}

/* Map the shared segment memory into the address
 * space of the calling process
 */
int
cxiAttachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* p;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of allocated chunks.  Map each one so that
   * this process can access it from user space.
   */
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    TRACE1N(TRACE_SHARED, 11, TRCID_MAPALL_MULTI,
            "cxiAttachSharedMemory: chunkP 0x%lX\n", chunkP);

    /* unprotectKernelMemory has to be called here with 'allocating'
     * set to 'true', so that mmfsadm can map and access the shared segment
     * even when the daemon has died and called reprotectKernelMemory
     */
    unprotectKernelMemory(chunkP->vaddrP, chunkP->len, true);
  }
  spin_unlock(&ChunkListLock);

  /* Return address of first chunk allocated; this will be the
   * base of the GPFS shared segment
   */
  mappingP->vaddr = FirstVMallocChunkP;
#ifdef SSEG_SWIZZLE_PTRS
  mappingP->kvaddr = FirstVMallocChunkP;
  /* mappingP->vaddr is reset to proper user va in kxAttachSharedMemory */
#endif

  /* If there were no chunks, return ENOENT */
  EXIT(0);
  return (NVMallocChunks > 0) ? 0 : -ENOENT;
}

int
cxiDetachSharedMemory(cxiMemoryMapping_t *mappingP, Boolean isSharedSegment)
{
  struct list_head* p;
  struct ShMemChunkDesc* chunkP;

  ENTER(0);
  LOGASSERT(isSharedSegment == true);

  /* Walk down the list of allocated chunks.  Reprotect each one so that
   * this process can no longer access it from user space.
   */
  spin_lock(&ChunkListLock);

  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    TRACE1N(TRACE_SHARED, 11, TRCID_UNMAPALL_MULTI,
            "cxiDetachSharedMemory: chunkP 0x%lX\n", chunkP);

    reprotectKernelMemory(chunkP->vaddrP, chunkP->len);
  }
  spin_unlock(&ChunkListLock);

  EXIT(0);
  return 0;
}

/* Clean up the code that manages shared kernel memory,
 * including freeing all allocated chunks.
 */
void
TermSharedMemory()
{
  cxiMemoryMapping_t mapping;

  ENTER(0);
  InitMemoryMapping(&mapping);

  /* Delete shared segment */
  cxiFreeSharedMemory(&mapping, true);

  /* Destroy slab allocator for ShMemChunkDesc objects */
  (void)kmem_cache_destroy(ChunkCacheP);

  /* Unregister the shared segment device driver */
  unregister_chrdev(GPFSIoctlMajorNumber, "ss");

  TRACE1(TRACE_SHARED, 2, TRCID_SSINIT_003,
         "module unloaded major %d\n", GPFSIoctlMajorNumber);
  GPFSIoctlMajorNumber = 0;
  EXIT(0);
}

/* Clean up slab for ShMemChunkDesc (for early termination) */
void
CleanUpSharedMemory()
{
  /* Destroy slab allocator for ShMemChunkDesc objects */
  (void)kmem_cache_destroy(ChunkCacheP);
}

int
kxCoreDump(long sig, void *info,
           struct ucontext *sc, char *filenameP)
{
  struct pt_regs regs;
  static int getDump = 0;
  struct linux_binfmt *binfmt;
  char *tmp = NULL;
  int rc = -1;
  int code = 0;
  struct file *file = NULL;
  Boolean klock = false;
  struct sigcontext_struct *uc_mcontext;
  unsigned long len;

  printk("kxCoreDump sig: %ld fn: %s\n", sig, filenameP);

  if (getDump == 0)
    getDump = 1;  // don't create more than one core dump at the same time
  else
    return 1;

  memset((char *)&regs, 0, sizeof(struct pt_regs));

  if (sig)  /* Build pt_regs from the sigcontext struct */
  {
    code = 11;
    goto xerror;
  }
  tmp = cxiMallocPinned(CXI_PATH_MAX+1);
  if (!tmp)
  {
    code = 1;
    tmp = NULL;
    goto xerror;
  }
  if (cxiCopyInstr(filenameP, tmp, CXI_PATH_MAX, &len) != 0)
  {
    code = 12;
    goto xerror;
  }

  lock_kernel();
  klock = true;

  binfmt = current->binfmt;
  if (!binfmt || !binfmt->core_dump)
  {
    code = 2;
    goto xerror;
  }

  if (MY_RLIM_CUR(RLIMIT_CORE) > 0x01000000)
    MY_RLIM_CUR(RLIMIT_CORE) = 0x10000000;

  file = filp_open(tmp, O_CREAT | O_RDWR | O_TRUNC | O_NOFOLLOW, 0600);
  if (IS_ERR(file))
  {
    code = 4;
    file = NULL;
    goto xerror;
  }
  if (!file->f_op || !file->f_op->write)
  {
    code = 5;
    goto xerror;
  }
  rc = binfmt->core_dump(sig, &regs, file);
  if (!rc)
  {
    code = 6;
    goto xerror;
  }

xerror:
  if (file)
    filp_close(file, NULL);

  if (klock)
    unlock_kernel();

  if (tmp)
    cxiFreePinned(tmp);

  getDump = 0;
  return rc;
}

/* This call looks very similar to a MAP_ANONYMOUS mmap() call.  That's
 * because we used to do mmap() for this region.  Unfortunately when we
 * want MAP_PRIVATE semantics we don't get the results on Linux that we
 * expect.  The trouble starts when the pages of this memory
 * area are marked copy-on-write.  Since this is our buffer pool, when
 * I/O gets done, the old page goes to the child process and the new page goes
 * to the parent (mmfsd).  Unfortunately, the I/O gets done to the old page
 * since its physical address was cached in the kiobuf.
 *
 * One attempt at fixing this was by making the area shared between parent
 * and child via MAP_SHARED.  However, it opens the possibility of a child
 * process run from system() or popen() being able to stomp on the GPFS buffer
 * pool.  Additionally, putting MAP_SHARED on the region causes it
 * to be internally mapped to /dev/zero (apparently it needs some file mapping
 * on this MAP_ANONYMOUS region).  Subsequent madvise() calls saying that
 * we don't need the pages (MADV_DONTNEED) don't really free the
 * pages since there is still a hold count due to the kernel /dev/zero
 * mapping.  Thus the free pages reported by vmstat don't go down even
 * though we're freeing them from the mmap'd region.
 *
 * This all boils down to a workaround where we MAP_PRIVATE as we
 * wanted but set the VM_DONTCOPY flag so these mmap pages don't
 * get inherited by child processes.
 *
 * GPFS also needs to make sure that pages of its buffer pool are pinned in
 * memory.  This is necessary because GPFS caches the pointers to the struct
 * page objects returned by map_user_kiobuf.  Linux might steal pages in
 * one of two ways: reclaim_page will steal pages with count <= 1, and
 * swap_out_vma will clear the page table mapping of pages belonging to
 * vm_area_structs that do not have the VM_LOCKED bit set.
 * GPFS prevents the first case because map_user_kiobuf increases page
 * reference counts to 2.  We used to turn on the VM_LOCKED bit here,
 * but now we mlock() the memory to ensure it isn't swapped out.
 */
int
kxMapPrivate(char *inAddr, unsigned long len, unsigned long prot,
             char **outAddr)
{
  struct mm_struct *mmP;
  struct vm_area_struct *vmaP = NULL;

  mmP = current->mm;

  ACQUIRE_MMAP_SEM(&mmP->mmap_sem);

  *outAddr = (char *)do_mmap(NULL, (unsigned long)inAddr, len, prot,
                             MAP_PRIVATE | MAP_ANONYMOUS, 0);
  /* Only look for the address in the vma list if do_mmap matches what we
     asked for; otherwise it may be an unexpected address or an error code,
     and both are a problem.  Any issues should be handled in the daemon
     if possible (eg, -ENOMEM). */
  if (*outAddr == inAddr)
  {
    for (vmaP = mmP->mmap; vmaP != NULL; vmaP = vmaP->vm_next)
      if (vmaP->vm_start == (unsigned long)*outAddr)
      {
        /* We don't want our vm_area_structs merged since we are
         * about to set a flag that would cross into an area where
         * it might not be good.  For instance if we get merged with
         * the stack vm area then we won't be able to fork since the
         * stack wouldn't be copied.
         */
        LOGASSERT(vmaP->vm_end == vmaP->vm_start + len);
        vmaP->vm_flags |= VM_DONTCOPY;
        break;
      }

    DBGASSERT(vmaP != NULL);
  }

  RELEASE_MMAP_SEM(&mmP->mmap_sem);

  TRACE5(TRACE_SHARED, 1, TRCID_CXI_MAP_PRIVATE,
         "kxMapPrivate: inAddr 0x%lX len %d prot 0x%X outAddr 0x%lX vmaP 0x%lX\n",
         inAddr, len, prot, *outAddr, vmaP);

  if (*outAddr == inAddr)
    return 0;

  return -EFAULT;
}
---|
1465 | |
---|
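/* Illustrative daemon-side counterpart (hypothetical pseudocode; the
 * daemon actually reaches kxMapPrivate through the kernel-call interface
 * rather than a direct function call, and wantedAddr/poolLen are made-up
 * names): request the fixed-address private mapping, then mlock() it as
 * described in the comment above.
 */
#if 0
char *mapped;
if (kxMapPrivate(wantedAddr, poolLen, PROT_READ | PROT_WRITE, &mapped) == 0)
  mlock(mapped, poolLen);  /* pin the pages; VM_DONTCOPY was set on the vma */
#endif
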
#ifdef SSEG_SWIZZLE_PTRS
/* mmap handler for shared segment */
int ss_fs_mmap(struct file *file, struct vm_area_struct *vma)
{
  UIntPtr offset = vma->vm_pgoff << PAGE_SHIFT;
  UIntPtr size = vma->vm_end - vma->vm_start;

  if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
  {
    printk("ss_fs_mmap: invalid mmap flags\n");
    return -EINVAL;
  }

  if (offset != 0)
  {
    printk("ss_fs_mmap: page offset should be zero (%lu)\n", offset);
    return -EINVAL;
  }

  /* add page fault handler for vm area */
  vma->vm_ops = &ss_vm_ops;

#if LINUX_KERNEL_VERSION >= 2060000
  /* The 2.6 kernel appears to want the pages marked as unswappable;
     otherwise gobs of "Badness in do_nopage/copy_page_range" messages
     occur in the system log.  Still looking at this, but it appears that
     the kernel expects these pages to be "device" reserved pages versus
     typical anonymous pages (it assumes a device intends to use the pages
     for DMA?) and doesn't want them tracked by VMM. */
  vma->vm_flags |= VM_RESERVED;
#endif

  /* perform open on vm area */
  ss_vm_open(vma);

  return 0;
}

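/* For context, a sketch (assumed; the real table is defined with the
 * other ss_fs_* handlers elsewhere in this file) of how the handler
 * above is reached through the mmap hook of the ss file_operations,
 * and how the daemon would map the segment:
 *
 *   static struct file_operations ss_fops =
 *   {
 *     read:    ss_fs_read,
 *     write:   ss_fs_write,
 *     ioctl:   ss_fs_ioctl,
 *     mmap:    ss_fs_mmap,
 *     open:    ss_open,
 *     release: ss_release,
 *   };
 *
 *   // user space, with ss_fd open on the ss proc/device file:
 *   p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, ss_fd, 0);
 *
 * Note the flag check above: a writable mapping must be MAP_SHARED.
 */
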
/* vm area handlers for shared segment */

void ss_vm_open(struct vm_area_struct *vma)
{
  MY_MODULE_INCREMENT();
}

void ss_vm_close(struct vm_area_struct *vma)
{
  MY_MODULE_DECREMENT();
}

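/* The vm_operations_struct installed by ss_fs_mmap presumably ties the
 * three handlers together like this (a sketch; the actual definition
 * lives elsewhere in this file):
 */
#if 0
static struct vm_operations_struct ss_vm_ops =
{
  open:   ss_vm_open,
  close:  ss_vm_close,
  nopage: ss_vm_nopage,
};
#endif
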
/* Page fault handler.
   Called by do_no_page with the address of the faulting page (ie, on a
   page boundary). */
#if LINUX_KERNEL_VERSION < 2060000
struct page *
ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
#else
struct page *
ss_vm_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
#endif /* LINUX_KERNEL_VERSION < 2060000 */
{
  UIntPtr offset;
  UIntPtr va;
  struct page *ret_page = NOPAGE_SIGBUS;
  int found = 0;
  struct list_head *p;
  struct ShMemChunkDesc *chunkP;

  if ((address < vma->vm_start) || (address >= vma->vm_end))
  {
    printk("ss_vm_nopage: address 0x%lx out of vma range [%lx,%lx)\n",
           address, vma->vm_start, vma->vm_end);
    return ret_page;
  }

  /* Make sure that the user address from the page fault is backed by
     kernel memory (find a containing memory chunk).  The most recently
     allocated block will be at the head of the list, so generally we
     only check the first list entry. */
  /* May want to cache the last list entry where a "hit" occurred if this
     is ever needed for performance, eg, non-daemon attach. */
  spin_lock(&ChunkListLock);
  list_for_each(p, &ChunkListHead)
  {
    chunkP = list_entry(p, struct ShMemChunkDesc, chunkList);
    if ((address >= (UIntPtr)chunkP->usrvaddrP) &&
        (address < (UIntPtr)chunkP->usrvaddrP + chunkP->len))
    {
      found = 1;
      break;
    }
  }
  spin_unlock(&ChunkListLock);
  if (!found)
  {
    /* We have a problem; unable to find backing kernel memory */
    printk("ss_vm_nopage: unable to find kernel chunk backing user address 0x%lx\n",
           address);
    return ret_page;
  }

  /* calculate the kernel virtual address */
  offset = address - (UIntPtr)chunkP->usrvaddrP;
  va = (UIntPtr)(chunkP->vaddrP + offset);

  /* Grab the kernel page table lock before traversing the kernel page
     table.  I believe this is necessary in order to avoid having another
     processor change the page table on us while we are traversing.
     Normally only the process page table lock is grabbed when a page
     fault occurs (to protect against kswapd). */
  spin_lock(&init_mm.page_table_lock);

  /* traverse kernel page table */
  ret_page = vmalloc_to_page((void *)va);

  spin_unlock(&init_mm.page_table_lock);
  if (ret_page == NULL)
  {
    printk("ss_vm_nopage: vmalloc_to_page returned NULL\n");
    return ret_page;
  }

  /* bump up the page use count */
  get_page(ret_page);

#ifdef SWIZ_BIG_DEBUG
  printk("ss_vm_nopage: page fault for offset 0x%lx uva 0x%lx va 0x%lx (kva 0x%lx)\n",
         offset, address, va, page_address(ret_page));
#endif

#if LINUX_KERNEL_VERSION >= 2060000
  if (type)
    *type = VM_FAULT_MINOR;
#endif

  /* return page */
  return ret_page;
}
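/* For reference, a simplified sketch (assumed; see the real code in the
 * kernel's mm/memory.c) of the page-table walk vmalloc_to_page() performs
 * under the lock taken above on 2.4/early-2.6 kernels:
 */
#if 0
struct page *my_vmalloc_to_page(void *vaddr)
{
  unsigned long addr = (unsigned long)vaddr;
  struct page *page = NULL;
  pgd_t *pgd = pgd_offset_k(addr);   /* walk starts at init_mm's pgd */

  if (!pgd_none(*pgd))
  {
    pmd_t *pmd = pmd_offset(pgd, addr);
    if (!pmd_none(*pmd))
    {
      pte_t pte = *pte_offset_kernel(pmd, addr);  /* pte_offset() on 2.4 */
      if (pte_present(pte))
        page = pte_page(pte);
    }
  }
  return page;
}
#endif
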
#endif /* SSEG_SWIZZLE_PTRS */