From: Manfred Spraul <manfred@colorfullife.com>

Actual implementation of the posix message queues, written by Krzysztof
Benedyczak and Michal Wronski.  The complete implementation is dependant on
CONFIG_POSIX_MQUEUE.

It passed the openposix test suite with two exceptions: one mq_unlink test
was bad and tested undefined behavior.  And Linux succeeds
mq_close(open(,,,)).  The spec mandates EBADF, but we have decided to ignore
that: we would have to add a new syscall just for the right error code.

The patch intentionally doesn't use all helpers from fs/libfs for kernel-only
filesystems: step 5 allows user space mounts of the file system.


---

 CREDITS                            |   17 
 Documentation/filesystems/proc.txt |   25 
 include/asm-generic/siginfo.h      |    4 
 init/Kconfig                       |   17 
 ipc/Makefile                       |    2 
 ipc/mqueue.c                       | 1094 +++++++++++++++++++++++++++++++++++++
 kernel/signal.c                    |    1 
 7 files changed, 1159 insertions(+), 1 deletion(-)

diff -puN CREDITS~mq-03-core CREDITS
--- 25/CREDITS~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/CREDITS	2004-02-28 16:26:53.000000000 -0800
@@ -289,6 +289,15 @@ S: Via Delle Palme, 9
 S: Terni 05100
 S: Italy
 
+N: Krzysztof Benedyczak
+E: golbi@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~golbi
+D: POSIX message queues fs (with M. Wronski)
+S: ul. Podmiejska 52
+S: Radunica
+S: 83-000 Pruszcz Gdanski
+S: Poland
+
 N: Randolph Bentson
 E: bentson@grieg.seaslug.org
 W: http://www.aa.net/~bentson/
@@ -3489,6 +3498,14 @@ S: 12725 SW Millikan Way, Suite 400
 S: Beaverton, OR 97005
 S: USA
 
+N: Michal Wronski
+E: wrona@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~wrona
+D: POSIX message queues fs (with K. Benedyczak)
+S: ul. Teczowa 23/12
+S: 80-680 Gdansk-Sobieszewo
+S: Poland
+
 N: Frank Xia
 E: qx@math.columbia.edu
 D: Xiafs filesystem [defunct]
diff -puN Documentation/filesystems/proc.txt~mq-03-core Documentation/filesystems/proc.txt
--- 25/Documentation/filesystems/proc.txt~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/Documentation/filesystems/proc.txt	2004-02-28 16:26:53.000000000 -0800
@@ -38,6 +38,7 @@ Table of Contents
   2.8	/proc/sys/net/ipv4 - IPV4 settings
   2.9	Appletalk
   2.10	IPX
+  2.11	/proc/sys/fs/mqueue - POSIX message queues filesystem
 
 ------------------------------------------------------------------------------
 Preface
@@ -1814,6 +1815,30 @@ The /proc/net/ipx_route  table  holds  a
 gives the  destination  network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
+----------------------------------------------------------
+
+The "mqueue"  filesystem provides  the necessary kernel features to enable the
+creation of a  user space  library that  implements  the  POSIX message queues
+API (as noted by the  MSG tag in the  POSIX 1003.1-2001 version  of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting  the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write  file for  setting/getting  the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max  is  a  read/write file  for  setting/getting  the
+maximum number of messages in a queue value.  In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less or equal then msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is  a read/write  file for setting/getting the
+maximum  message size value (it is every  message queue's attribute set during
+its creation).
+
+
 ------------------------------------------------------------------------------
 Summary
 ------------------------------------------------------------------------------
diff -puN include/asm-generic/siginfo.h~mq-03-core include/asm-generic/siginfo.h
--- 25/include/asm-generic/siginfo.h~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/include/asm-generic/siginfo.h	2004-02-28 16:26:53.000000000 -0800
@@ -118,6 +118,7 @@ typedef struct siginfo {
 #define __SI_FAULT	(3 << 16)
 #define __SI_CHLD	(4 << 16)
 #define __SI_RT		(5 << 16)
+#define __SI_MESGQ	(6 << 16)
 #define __SI_CODE(T,N)	((T) | ((N) & 0xffff))
 #else
 #define __SI_KILL	0
@@ -126,6 +127,7 @@ typedef struct siginfo {
 #define __SI_FAULT	0
 #define __SI_CHLD	0
 #define __SI_RT		0
+#define __SI_MESGQ	0
 #define __SI_CODE(T,N)	(N)
 #endif
 
@@ -137,7 +139,7 @@ typedef struct siginfo {
 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere */
 #define SI_QUEUE	-1		/* sent by sigqueue */
 #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
-#define SI_MESGQ	-3		/* sent by real time mesq state change */
+#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */
 #define SI_ASYNCIO	-4		/* sent by AIO completion */
 #define SI_SIGIO	-5		/* sent by queued SIGIO */
 #define SI_TKILL	-6		/* sent by tkill system call */
diff -puN init/Kconfig~mq-03-core init/Kconfig
--- 25/init/Kconfig~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/init/Kconfig	2004-02-28 16:26:53.000000000 -0800
@@ -91,6 +91,23 @@ config SYSVIPC
 	  section 6.4 of the Linux Programmer's Guide, available from
 	  <http://www.tldp.org/docs.html#guide>.
 
+config POSIX_MQUEUE
+	bool "POSIX Message Queues"
+	---help---
+	  POSIX variant of message queues is a part of IPC. In POSIX message
+	  queues every message has a priority which decides about succession
+	  of receiving it by a process. If you want to compile and run
+	  programs written e.g. for Solaris with use of its POSIX message
+	  queues (functions mq_*) say Y here. To use this feature you will
+	  also need mqueue library, available from
+	  <http://www.mat.uni.torun.pl/~wrona/posix_ipc/>
+
+	  POSIX message queues are visible as a filesystem called 'mqueue'
+	  and can be mounted somewhere if you want to do filesystem
+	  operations on message queues.
+
+	  If unsure, say Y.
+
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
 	help
diff -puN ipc/Makefile~mq-03-core ipc/Makefile
--- 25/ipc/Makefile~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/ipc/Makefile	2004-02-28 16:26:53.000000000 -0800
@@ -4,3 +4,5 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+
diff -puN /dev/null ipc/mqueue.c
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ 25-akpm/ipc/mqueue.c	2004-02-28 16:26:53.000000000 -0800
@@ -0,0 +1,1094 @@
+/*
+ * POSIX message queues filesystem for Linux.
+ *
+ * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
+ *                          Michal Wronski          (wrona@mat.uni.torun.pl)
+ *
+ * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
+ * Lockless receive & send, fd based notify:
+ * 			    Manfred Spraul	    (manfred@colorfullife.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/mqueue.h>
+#include <linux/msg.h>
+#include "util.h"
+
+#define MQUEUE_MAGIC	0x19800202
+#define DIRENT_SIZE	20
+#define FILENT_SIZE	80
+
+#define SEND		0
+#define RECV		1
+
+#define STATE_NONE	0
+#define STATE_PENDING	1
+#define STATE_READY	2
+
+#define NP_NONE		((void*)NOTIFY_NONE)
+#define NP_WOKENUP	((void*)NOTIFY_WOKENUP)
+#define NP_REMOVED	((void*)NOTIFY_REMOVED)
+/* used by sysctl */
+#define FS_MQUEUE 	1
+#define CTL_QUEUESMAX 	2
+#define CTL_MSGMAX 	3
+#define CTL_MSGSIZEMAX 	4
+
+/* default values */
+#define DFLT_QUEUESMAX	64	/* max number of message queues */
+#define DFLT_MSGMAX 	40	/* max number of messages in each queue */
+#define HARD_MSGMAX 	(131072/sizeof(void*))
+#define DFLT_MSGSIZEMAX 16384	/* max message size */
+
+struct ext_wait_queue {		/* queue of sleeping tasks */
+	struct task_struct *task;
+	struct list_head list;
+	struct msg_msg *msg;	/* ptr of loaded message */
+	int state;		/* one of STATE_* values */
+};
+
+struct mqueue_inode_info {
+	struct mq_attr attr;
+	struct msg_msg **messages;
+
+	pid_t notify_owner;	/* != 0 means notification registered */
+	struct sigevent notify;
+	struct file *notify_filp;
+
+	/* for tasks waiting for free space and messages, respectively */
+	struct ext_wait_queue e_wait_q[2];
+	wait_queue_head_t wait_q;
+
+	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
+	spinlock_t lock;
+	struct inode vfs_inode;
+};
+
+static struct inode_operations mqueue_dir_inode_operations;
+static struct file_operations mqueue_file_operations;
+static struct file_operations mqueue_notify_fops;
+static struct super_operations mqueue_super_ops;
+static void remove_notification(struct mqueue_inode_info *info);
+
+static spinlock_t mq_lock;
+static kmem_cache_t *mqueue_inode_cachep;
+static struct vfsmount *mqueue_mnt;
+
+static unsigned int queues_count;
+static unsigned int queues_max 	= DFLT_QUEUESMAX;
+static unsigned int msg_max 	= DFLT_MSGMAX;
+static unsigned int msgsize_max = DFLT_MSGSIZEMAX;
+
+static struct ctl_table_header * mq_sysctl_table;
+
+static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
+{
+	return container_of(inode, struct mqueue_inode_info, vfs_inode);
+}
+
+static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mtime = inode->i_ctime = inode->i_atime = CURRENT_TIME;
+
+		if (S_ISREG(mode)) {
+			struct mqueue_inode_info *info;
+
+			inode->i_fop = &mqueue_file_operations;
+			inode->i_size = FILENT_SIZE;
+			/* mqueue specific info */
+			info = MQUEUE_I(inode);
+			spin_lock_init(&info->lock);
+			init_waitqueue_head(&info->wait_q);
+			INIT_LIST_HEAD(&info->e_wait_q[0].list);
+			INIT_LIST_HEAD(&info->e_wait_q[1].list);
+			info->notify_owner = 0;
+			info->qsize = 0;
+			info->attr.mq_curmsgs = 0;
+			info->messages = NULL;
+		} else if (S_ISDIR(mode)) {
+			inode->i_nlink++;
+			inode->i_op = &mqueue_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+		}
+	}
+	return inode;
+}
+
+static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+
+	sb->s_flags = MS_NOUSER;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = MQUEUE_MAGIC;
+	sb->s_op = &mqueue_super_ops;
+
+	inode = mqueue_get_inode(sb, S_IFDIR | S_IRWXUGO);
+	if (!inode)
+		return -ENOMEM;
+
+	sb->s_root = d_alloc_root(inode);
+	if (!sb->s_root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
+					 int flags, const char *dev_name,
+					 void *data)
+{
+	return get_sb_single(fs_type, flags, data, mqueue_fill_super);
+}
+
+static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&p->vfs_inode);
+}
+
+static struct inode *mqueue_alloc_inode(struct super_block *sb)
+{
+	struct mqueue_inode_info *ei;
+
+	ei = (struct mqueue_inode_info *)kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void mqueue_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
+}
+
+static void mqueue_delete_inode(struct inode *inode)
+{
+	struct mqueue_inode_info *info;
+	int i;
+
+	if (S_ISDIR(inode->i_mode)) {
+		clear_inode(inode);
+		return;
+	}
+	info = MQUEUE_I(inode);
+	spin_lock(&info->lock);
+	for (i = 0; i < info->attr.mq_curmsgs; i++)
+		free_msg(info->messages[i]);
+	kfree(info->messages);
+	spin_unlock(&info->lock);
+
+	clear_inode(inode);
+
+	spin_lock(&mq_lock);
+	queues_count--;
+	spin_unlock(&mq_lock);
+}
+
+static int mqueue_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+{
+	struct inode *inode;
+	int error;
+
+	spin_lock(&mq_lock);
+	if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
+		error = -ENOSPC;
+		goto out_lock;
+	}
+	queues_count++;
+	spin_unlock(&mq_lock);
+
+	inode = mqueue_get_inode(dir->i_sb, mode);
+	if (!inode) {
+		error = -ENOMEM;
+		spin_lock(&mq_lock);
+		queues_count--;
+		goto out_lock;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+out_lock:
+	spin_unlock(&mq_lock);
+	return error;
+}
+
+static int mqueue_flush_file(struct file *filp)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	spin_lock(&info->lock);
+	if (current->tgid == info->notify_owner)
+		remove_notification(info);
+
+	spin_unlock(&info->lock);
+	return 0;
+}
+
+/* Adds current to info->e_wait_q[sr] before element with smaller prio */
+static void wq_add(struct mqueue_inode_info *info, int sr,
+			struct ext_wait_queue *ewp)
+{
+	struct ext_wait_queue *walk;
+
+	ewp->task = current;
+
+	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
+		if (walk->task->static_prio <= current->static_prio) {
+			list_add_tail(&ewp->list, &walk->list);
+			return;
+		}
+	}
+	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
+}
+
+/*
+ * Puts current task to sleep. Caller must hold queue lock. After return
+ * lock isn't held.
+ * sr: SEND or RECV
+ */
+static int wq_sleep(struct mqueue_inode_info *info, int sr,
+		    long timeout, struct ext_wait_queue *ewp)
+{
+	int retval;
+	signed long time;
+
+	wq_add(info, sr, ewp);
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock(&info->lock);
+		time = schedule_timeout(timeout);
+
+		while (ewp->state == STATE_PENDING)
+			cpu_relax();
+
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out;
+		}
+		spin_lock(&info->lock);
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out_unlock;
+		}
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (time == 0) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+		printk(KERN_WARNING "mqueue: Spurious wakeup in wq_sleep()\n");
+	}
+	list_del(&ewp->list);
+out_unlock:
+	spin_unlock(&info->lock);
+out:
+	return retval;
+}
+
+/* Returns waiting task that should be serviced first or NULL if none exists */
+static struct ext_wait_queue *wq_get_first_waiter(
+		struct mqueue_inode_info *info, int sr)
+{
+	struct list_head *ptr;
+
+	ptr = info->e_wait_q[sr].list.prev;
+	if (ptr == &info->e_wait_q[sr].list)
+		return NULL;
+	return list_entry(ptr, struct ext_wait_queue, list);
+}
+
+/* Auxiliary functions to manipulate messages' list */
+static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
+{
+	int k;
+
+	k = info->attr.mq_curmsgs - 1;
+	while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) {
+		info->messages[k + 1] = info->messages[k];
+		k--;
+	}
+	info->attr.mq_curmsgs++;
+	info->qsize += ptr->m_ts;
+	info->messages[k + 1] = ptr;
+}
+
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+	info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
+	return info->messages[info->attr.mq_curmsgs];
+}
+
+/*
+ * The next function is only to split too long sys_mq_timedsend
+ */
+static void __do_notify(struct mqueue_inode_info *info)
+{
+	/* notification
+	 * invoked when there is registered process and there isn't process
+	 * waiting synchronously for message AND state of queue changed from
+	 * empty to not empty. Here we are sure that no one is waiting
+	 * synchronously. */
+	if (info->notify_owner && info->attr.mq_curmsgs == 1) {
+		/* sends signal */
+		if (info->notify.sigev_notify == SIGEV_SIGNAL) {
+			struct siginfo sig_i;
+
+			sig_i.si_signo = info->notify.sigev_signo;
+			sig_i.si_errno = 0;
+			sig_i.si_code = SI_MESGQ;
+			sig_i.si_value = info->notify.sigev_value;
+			sig_i.si_pid = current->tgid;
+			sig_i.si_uid = current->uid;
+
+			kill_proc_info(info->notify.sigev_signo,
+				       &sig_i, info->notify_owner);
+		} else if (info->notify.sigev_notify == SIGEV_THREAD) {
+			info->notify_filp->private_data = (void*)NP_WOKENUP;
+			wake_up(&info->wait_q);
+		}
+		/* after notification unregisters process */
+		info->notify_owner = 0;
+	}
+}
+
+static long prepare_timeout(const struct timespec __user *u_arg)
+{
+	struct timespec ts, nowts;
+	long timeout;
+
+	if (u_arg) {
+		if (unlikely(copy_from_user(&ts, u_arg, sizeof(struct timespec))))
+			return -EFAULT;
+
+		if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0
+			|| ts.tv_nsec >= NSEC_PER_SEC))
+			return -EINVAL;
+		nowts = CURRENT_TIME;
+		/* first subtract as jiffies can't be too big */
+		ts.tv_sec -= nowts.tv_sec;
+		if (ts.tv_nsec < nowts.tv_nsec) {
+			ts.tv_nsec += NSEC_PER_SEC;
+			ts.tv_sec--;
+		}
+		ts.tv_nsec -= nowts.tv_nsec;
+		if (ts.tv_sec < 0)
+			return 0;
+
+		timeout = timespec_to_jiffies(&ts) + 1;
+	} else
+		return MAX_SCHEDULE_TIMEOUT;
+
+	return timeout;
+}
+
+static unsigned int mqueue_notify_poll(struct file *filp, struct poll_table_struct *poll_tab)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+	int retval;
+
+	poll_wait(filp, &info->wait_q, poll_tab);
+
+	if (filp->private_data == NP_NONE)
+		retval = 0;
+	else
+		retval = POLLIN | POLLRDNORM;
+	return retval;
+}
+
+static ssize_t mqueue_notify_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+	char result;
+
+	if (!count)
+		return 0;
+	if (*ppos != 0)
+		return 0;
+	spin_lock(&info->lock);
+	while (filp->private_data == NP_NONE) {
+		DEFINE_WAIT(wait);
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			return -EAGAIN;
+		}
+		prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock(&info->lock);
+		schedule();
+		finish_wait(&info->wait_q, &wait);
+		spin_lock(&info->lock);
+	}
+	spin_unlock(&info->lock);
+	result = (char)(unsigned long)filp->private_data;
+	if (put_user(result, buf))
+		return -EFAULT;
+	*ppos = 1;
+	return 1;
+}
+
+static void remove_notification(struct mqueue_inode_info *info)
+{
+	if (info->notify.sigev_notify == SIGEV_THREAD) {
+		info->notify_filp->private_data = NP_REMOVED;
+		wake_up(&info->wait_q);
+	}
+	info->notify_owner = 0;
+}
+
+/*
+ * Invoked when creating a new queue via sys_mq_open
+ */
+static struct file *do_create(struct dentry *dir, struct dentry *dentry,
+	     int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct msg_msg **msgs = NULL;
+	struct mq_attr attr;
+	int ret;
+
+	if (u_attr != NULL) {
+		if (copy_from_user(&attr, u_attr, sizeof(attr)))
+			return ERR_PTR(-EFAULT);
+
+		if (attr.mq_maxmsg <= 0 || attr.mq_msgsize <= 0)
+			return ERR_PTR(-EINVAL);
+		if (capable(CAP_SYS_RESOURCE)) {
+			if (attr.mq_maxmsg > HARD_MSGMAX)
+				return ERR_PTR(-EINVAL);
+		} else {
+			if (attr.mq_maxmsg > msg_max || attr.mq_msgsize > msgsize_max)
+				return ERR_PTR(-EINVAL);
+		}
+	} else {
+		attr.mq_maxmsg = DFLT_MSGMAX;
+		attr.mq_msgsize = DFLT_MSGSIZEMAX;
+	}
+	msgs = kmalloc(attr.mq_maxmsg * sizeof(*msgs), GFP_KERNEL);
+	if (!msgs)
+		return ERR_PTR(-ENOMEM);
+
+	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
+	if (ret) {
+		kfree(msgs);
+		return ERR_PTR(ret);
+	}
+
+	inode = dentry->d_inode;
+	info = MQUEUE_I(inode);
+
+	info->attr.mq_maxmsg = attr.mq_maxmsg;
+	info->attr.mq_msgsize = attr.mq_msgsize;
+	info->messages = msgs;
+
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+	if (!IS_ERR(filp))
+		dget(dentry);
+
+	return filp;
+}
+
+/* Opens existing queue */
+static struct file *do_open(struct dentry *dentry, int oflag)
+{
+	struct file *filp;
+	static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, MAY_READ | MAY_WRITE };
+
+	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL))
+		return ERR_PTR(-EACCES);
+
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+
+	if (!IS_ERR(filp))
+		dget(dentry);
+
+	return filp;
+}
+
+asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
+	struct mq_attr __user *u_attr)
+{
+	struct dentry *dentry;
+	struct file *filp;
+	char   *name;
+	int    fd, error;
+
+	if (IS_ERR(name = getname(u_name)))
+		return PTR_ERR(name);
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		goto out_putname;
+
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		error = PTR_ERR(dentry);
+		goto out_err;
+	}
+	mntget(mqueue_mnt);
+
+	if (oflag & O_CREAT) {
+		if (dentry->d_inode) {	/* entry already exists */
+			filp = (oflag & O_EXCL) ? ERR_PTR(-EEXIST) : do_open(dentry, oflag);
+		} else {
+			filp = do_create(mqueue_mnt->mnt_root, dentry, oflag, mode, u_attr);
+		}
+	} else
+		filp = (dentry->d_inode) ? do_open(dentry, oflag) : ERR_PTR(-ENOENT);
+
+	dput(dentry);
+
+	if (IS_ERR(filp)) {
+		error = PTR_ERR(filp);
+		goto out_putfd;
+	}
+
+	fd_install(fd, filp);
+	goto out_upsem;
+
+out_putfd:
+	mntput(mqueue_mnt);
+	put_unused_fd(fd);
+out_err:
+	fd = error;
+out_upsem:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+out_putname:
+	putname(name);
+	return fd;
+}
+
+asmlinkage long sys_mq_unlink(const char __user *u_name)
+{
+	int err;
+	char *name;
+	struct dentry *dentry;
+	struct inode *inode = NULL;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	if (!dentry->d_inode) {
+		err = -ENOENT;
+		goto out_err;
+	}
+
+	if (permission(dentry->d_inode, MAY_WRITE, NULL)) {
+		err = -EACCES;
+		goto out_err;
+	}
+	inode = dentry->d_inode;
+	if (inode)
+		atomic_inc(&inode->i_count);
+
+	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+out_err:
+	dput(dentry);
+
+out_unlock:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	putname(name);
+	if (inode)
+		iput(inode);
+
+	return err;
+}
+
+/* Pipelined send and receive functions. Do not confuse this with SysV message
+ * queues terminology. It is little bit different. */
+
+/* pipelined_send() - send a message directly to the task waiting in
+ * sys_mq_timedreceive() (without inserting message into a queue). */
+static inline void pipelined_send(struct mqueue_inode_info *info,
+				  struct msg_msg *message,
+				  struct ext_wait_queue *receiver)
+{
+	receiver->msg = message;
+	list_del(&receiver->list);
+	receiver->state = STATE_PENDING;
+	wake_up_process(receiver->task);
+	wmb();
+	receiver->state = STATE_READY;
+}
+
+/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
+ * gets its message and put to the queue (we have one free place for sure). */
+static inline void pipelined_receive(struct mqueue_inode_info *info)
+{
+	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
+
+	if (!sender)
+		return;
+
+	msg_insert(sender->msg, info);
+	list_del(&sender->list);
+	sender->state = STATE_PENDING;
+	wake_up_process(sender->task);
+	wmb();
+	sender->state = STATE_READY;
+}
+
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
+	size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct ext_wait_queue wait;
+	struct ext_wait_queue *receiver;
+	struct msg_msg *msg_ptr;
+	struct mqueue_inode_info *info;
+ 	long timeout;
+	int ret;
+
+	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
+		return -EINVAL;
+
+	if (unlikely((timeout = prepare_timeout(u_abs_timeout)) < 0))
+		return timeout;
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_RDONLY))
+		goto out_fput;
+
+	if (unlikely(msg_len > info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	/* First try to allocate memory, before doing anything with
+	 * existing queues. */
+	msg_ptr = load_msg((void *)u_msg_ptr, msg_len);
+	if (unlikely(IS_ERR(msg_ptr))) {
+		ret = PTR_ERR(msg_ptr);
+		goto out_fput;
+	}
+	msg_ptr->m_ts = msg_len;
+	msg_ptr->m_type = msg_prio;
+
+	spin_lock(&info->lock);
+
+	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+		} else {
+			wait.task = current;
+			wait.msg = (void *) msg_ptr;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, SEND, timeout, &wait);
+			if (ret < 0)
+				free_msg(msg_ptr);
+		}
+	} else {
+		receiver = wq_get_first_waiter(info, RECV);
+		if (receiver) {
+			pipelined_send(info, msg_ptr, receiver);
+		} else {
+			/* adds message to the queue */
+			msg_insert(msg_ptr, info);
+			__do_notify(info);
+		}
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
+	size_t msg_len, unsigned int __user *u_msg_prio,
+	const struct timespec __user *u_abs_timeout)
+{
+	long timeout;
+	ssize_t ret;
+	struct msg_msg *msg_ptr;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct ext_wait_queue wait;
+
+
+	if (unlikely((timeout = prepare_timeout(u_abs_timeout)) < 0))
+		return timeout;
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_WRONLY))
+		goto out_fput;
+
+	/* checks if buffer is big enough */
+	if (unlikely(msg_len < info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	spin_lock(&info->lock);
+	if (info->attr.mq_curmsgs == 0) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+			msg_ptr = NULL;
+		} else {
+			wait.task = current;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, RECV, timeout, &wait);
+			msg_ptr = wait.msg;
+		}
+	} else {
+		msg_ptr = msg_get(info);
+
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+		/* There is now free space in queue. */
+		pipelined_receive(info);
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+	if (ret == 0) {
+		ret = msg_ptr->m_ts;
+
+		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
+			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
+			ret = -EFAULT;
+		}
+		free_msg(msg_ptr);
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/* Notes: the case when user wants us to deregister (with NULL as pointer or SIGEV_NONE)
+ * and he isn't currently owner of notification will be silently discarded.
+ * It isn't explicitly defined in the POSIX.
+ */
+asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+{
+	int ret, fd;
+	struct file *filp, *nfilp;
+	struct inode *inode;
+	struct sigevent notification;
+	struct mqueue_inode_info *info;
+
+	if (u_notification == NULL) {
+		notification.sigev_notify = SIGEV_NONE;
+	} else {
+		if (copy_from_user(&notification, u_notification, sizeof(struct sigevent)))
+			return -EFAULT;
+
+		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
+			     notification.sigev_notify != SIGEV_SIGNAL &&
+			     notification.sigev_notify != SIGEV_THREAD))
+			return -EINVAL;
+		if (notification.sigev_notify == SIGEV_SIGNAL &&
+			(notification.sigev_signo < 0 || notification.sigev_signo > _NSIG)) {
+			return -EINVAL;
+		}
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	ret = 0;
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		ret = get_unused_fd();
+		if (ret < 0)
+			goto out_fput;
+		fd = ret;
+		nfilp = get_empty_filp();
+		if (!nfilp) {
+			ret = -ENFILE;
+			goto out_dropfd;
+		}
+		nfilp->private_data = NP_NONE;
+		nfilp->f_op = &mqueue_notify_fops;
+		nfilp->f_vfsmnt = mntget(mqueue_mnt);
+		nfilp->f_dentry = dget(filp->f_dentry);
+		nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+		nfilp->f_mode = FMODE_READ;
+	} else {
+		nfilp = NULL;
+		fd = -1;
+	}
+
+	spin_lock(&info->lock);
+
+	if (notification.sigev_notify == SIGEV_NONE) {
+		if (info->notify_owner == current->tgid) {
+			remove_notification(info);
+			inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		}
+	} else if (info->notify_owner) {
+		ret = -EBUSY;
+	} else if (notification.sigev_notify == SIGEV_THREAD) {
+		info->notify_filp = nfilp;
+		fd_install(fd, nfilp);
+		ret = fd;
+		fd = -1;
+		nfilp = NULL;
+		info->notify.sigev_notify = SIGEV_THREAD;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}  else {
+		info->notify.sigev_signo = notification.sigev_signo;
+		info->notify.sigev_value = notification.sigev_value;
+		info->notify.sigev_notify = SIGEV_SIGNAL;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+	spin_unlock(&info->lock);
+out_dropfd:
+	if (fd != -1)
+		put_unused_fd(fd);
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *u_mqstat,
+	struct mq_attr __user *u_omqstat)
+{
+	int ret;
+	struct mq_attr mqstat, omqstat;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+
+	if (u_mqstat != NULL) {
+		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
+			return -EFAULT;
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	spin_lock(&info->lock);
+
+	omqstat = info->attr;
+	omqstat.mq_flags = filp->f_flags;
+	if (u_mqstat) {
+		if (mqstat.mq_flags & O_NONBLOCK)
+			filp->f_flags |= O_NONBLOCK;
+		else
+			filp->f_flags &= ~O_NONBLOCK;
+
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+
+	spin_unlock(&info->lock);
+
+	ret = 0;
+	if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat, sizeof(struct mq_attr)))
+		ret = -EFAULT;
+
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+static struct inode_operations mqueue_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.create = mqueue_create,
+	.unlink = simple_unlink,
+};
+
+static struct file_operations mqueue_file_operations = {
+	.flush = mqueue_flush_file,
+};
+
+static struct file_operations mqueue_notify_fops = {
+	.poll = mqueue_notify_poll,
+	.read = mqueue_notify_read,
+};
+
+
+static struct super_operations mqueue_super_ops = {
+	.alloc_inode = mqueue_alloc_inode,
+	.destroy_inode = mqueue_destroy_inode,
+	.delete_inode = mqueue_delete_inode,
+	.drop_inode = generic_delete_inode,
+};
+
+static struct file_system_type mqueue_fs_type = {
+	.name = "mqueue",
+	.get_sb = mqueue_get_sb,
+	.kill_sb = kill_anon_super,
+};
+
+static int msg_max_limit_min = DFLT_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX;
+static int msg_maxsize_limit_max = INT_MAX;
+
+static ctl_table mq_sysctls[] = {
+	{
+		.ctl_name	= CTL_QUEUESMAX,
+		.procname	= "queues_max",
+		.data		= &queues_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_MSGMAX,
+		.procname	= "msg_max",
+		.data		= &msg_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.ctl_name	= CTL_MSGSIZEMAX,
+		.procname	= "msgsize_max",
+		.data		= &msgsize_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_dir[] = {
+	{
+		.ctl_name	= FS_MQUEUE,
+		.procname	= "mqueue",
+		.mode		= 0555,
+		.child		= mq_sysctls,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_root[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= mq_sysctl_dir,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init init_mqueue_fs(void)
+{
+	int error;
+
+	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
+		sizeof(struct mqueue_inode_info), 0, SLAB_HWCACHE_ALIGN, init_once, NULL);
+	if (mqueue_inode_cachep == NULL)
+		return -ENOMEM;
+
+	mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0);
+	if (!mq_sysctl_table) {
+		error = -ENOMEM;
+		goto out_cache;
+	}
+
+	error = register_filesystem(&mqueue_fs_type);
+	if (error)
+		goto out_sysctl;
+
+	if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) {
+		error = PTR_ERR(mqueue_mnt);
+		goto out_filesystem;
+	}
+
+	/* internal initialization - not common for vfs */
+	queues_count = 0;
+	spin_lock_init(&mq_lock);
+
+	return 0;
+
+out_filesystem:
+	unregister_filesystem(&mqueue_fs_type);
+out_sysctl:
+	unregister_sysctl_table(mq_sysctl_table);
+out_cache:
+	if (kmem_cache_destroy(mqueue_inode_cachep))
+		printk(KERN_INFO "mqueue_inode_cache: not all structures were freed\n");
+	return error;
+}
+
+__initcall(init_mqueue_fs);
diff -puN kernel/signal.c~mq-03-core kernel/signal.c
--- 25/kernel/signal.c~mq-03-core	2004-02-28 16:26:53.000000000 -0800
+++ 25-akpm/kernel/signal.c	2004-02-28 16:26:53.000000000 -0800
@@ -2048,6 +2048,7 @@ int copy_siginfo_to_user(siginfo_t __use
 		err |= __put_user(from->si_stime, &to->si_stime);
 		break;
 	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_int, &to->si_int);

_