From: "Michael S. Tsirkin" <mst@mellanox.co.il>

The patch introduces two new methods (ioctl_native and ioctl_compat):
ioctl_native is called on native ioctl syscall, without BKL being taken,
and is, in that respect, equivalent to Ingo's unlocked_ioctl (which is why
it conflicts).

ioctl_compat is called on compat (i.e.  32 bit app on 64 bit OS) ioctl,
again without BKL being taken.

If a new call is not defined, default to the old behaviour.  (It should be
possible for me to build a patch that applies on top of Ingo's
unlocked_ioctl, if its really needed let me know and I'll look at it the
next week.)


There was an additional motivation for my patch: As noted by Juergen
Kreileder, the compat hash does not work for ioctls that encode additional
information in the command, like this:

#define EVIOCGBIT(ev,len)  _IOC(_IOC_READ, 'E', 0x20 + ev, len)

Signed-off-by: Michael Tsirkin <mst@mellanox.co.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/Documentation/filesystems/Locking |    9 +++
 25-akpm/fs/compat.c                       |   24 +++++++---
 25-akpm/fs/ioctl.c                        |   68 +++++++++++++++++++++---------
 25-akpm/include/linux/fs.h                |   24 ++++++++++
 25-akpm/include/linux/ioctl.h             |    8 +++
 5 files changed, 105 insertions(+), 28 deletions(-)

diff -puN Documentation/filesystems/Locking~ioctl-rework Documentation/filesystems/Locking
--- 25/Documentation/filesystems/Locking~ioctl-rework	Thu Dec 16 15:48:31 2004
+++ 25-akpm/Documentation/filesystems/Locking	Thu Dec 16 15:48:31 2004
@@ -350,6 +350,10 @@ prototypes:
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int,
 			unsigned long);
+	long (*ioctl_native) (struct inode *, struct file *, unsigned int,
+			unsigned long);
+	long (*ioctl_compat) (struct inode *, struct file *, unsigned int,
+			unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *);
@@ -383,6 +387,8 @@ aio_write:		no
 readdir: 		no
 poll:			no
 ioctl:			yes	(see below)
+ioctl_native:		no	(see below)
+ioctl_compat:		no	(see below)
 mmap:			no
 open:			maybe	(see below)
 flush:			no
@@ -428,6 +434,9 @@ move ->readdir() to inode_operations and
 anything that resembles union-mount we won't have a struct file for all
 components. And there are other reasons why the current interface is a mess...
 
+->ioctl() on regular files is superceded by the ->ioctl_native() and
+->ioctl_compat() pair. The lock is not taken for these new calls.
+
 ->read on directories probably must go away - we should just enforce -EISDIR
 in sys_read() and friends.
 
diff -puN fs/compat.c~ioctl-rework fs/compat.c
--- 25/fs/compat.c~ioctl-rework	Thu Dec 16 15:48:31 2004
+++ 25-akpm/fs/compat.c	Thu Dec 16 15:48:31 2004
@@ -401,16 +401,21 @@ asmlinkage long compat_sys_ioctl(unsigne
 				unsigned long arg)
 {
 	struct file * filp;
-	int error = -EBADF;
+	long error = -EBADF;
 	struct ioctl_trans *t;
+	int fput_needed;
 
-	filp = fget(fd);
-	if(!filp)
+	filp = fget_light(fd, &fput_needed);
+	if (!filp)
 		goto out2;
 
-	if (!filp->f_op || !filp->f_op->ioctl) {
-		error = sys_ioctl (fd, cmd, arg);
+	if (!std_sys_ioctl(fd, cmd, arg, filp, &error))
 		goto out;
+	else if (filp->f_op && filp->f_op->ioctl_compat) {
+		error = filp->f_op->ioctl_compat(filp->f_dentry->d_inode,
+						 filp, cmd, arg);
+		if (error != -ENOIOCTLCMD)
+			goto out;
 	}
 
 	down_read(&ioctl32_sem);
@@ -425,9 +430,12 @@ asmlinkage long compat_sys_ioctl(unsigne
 			error = t->handler(fd, cmd, arg, filp);
 			unlock_kernel();
 			up_read(&ioctl32_sem);
-		} else {
+		} else if (filp->f_op && filp->f_op->ioctl) {
 			up_read(&ioctl32_sem);
-			error = sys_ioctl(fd, cmd, arg);
+			lock_kernel();
+			error = filp->f_op->ioctl(filp->f_dentry->d_inode,
+						  filp, cmd, arg);
+			unlock_kernel();
 		}
 	} else {
 		up_read(&ioctl32_sem);
@@ -466,7 +474,7 @@ asmlinkage long compat_sys_ioctl(unsigne
 		}
 	}
 out:
-	fput(filp);
+	fput_light(filp, fput_needed);
 out2:
 	return error;
 }
diff -puN fs/ioctl.c~ioctl-rework fs/ioctl.c
--- 25/fs/ioctl.c~ioctl-rework	Thu Dec 16 15:48:31 2004
+++ 25-akpm/fs/ioctl.c	Thu Dec 16 15:48:31 2004
@@ -36,7 +36,9 @@ static int file_ioctl(struct file *filp,
 			if ((error = get_user(block, p)) != 0)
 				return error;
 
+			lock_kernel();
 			res = mapping->a_ops->bmap(mapping, block);
+			unlock_kernel();
 			return put_user(res, p);
 		}
 		case FIGETBSZ:
@@ -46,29 +48,21 @@ static int file_ioctl(struct file *filp,
 		case FIONREAD:
 			return put_user(i_size_read(inode) - filp->f_pos, p);
 	}
-	if (filp->f_op && filp->f_op->ioctl)
-		return filp->f_op->ioctl(inode, filp, cmd, arg);
 	return -ENOTTY;
 }
 
 
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{	
-	struct file * filp;
+EXPORT_SYMBOL(std_sys_ioctl);
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+		  struct file *filp, long *status)
+{
 	unsigned int flag;
-	int on, error = -EBADF;
-
-	filp = fget(fd);
-	if (!filp)
-		goto out;
+	int on, error, unknown = 0;
 
 	error = security_file_ioctl(filp, cmd, arg);
-	if (error) {
-                fput(filp);
+	if (error)
                 goto out;
-        }
 
-	lock_kernel();
 	switch (cmd) {
 		case FIOCLEX:
 			set_close_on_exec(fd, 1);
@@ -100,8 +94,11 @@ asmlinkage long sys_ioctl(unsigned int f
 
 			/* Did FASYNC state change ? */
 			if ((flag ^ filp->f_flags) & FASYNC) {
-				if (filp->f_op && filp->f_op->fasync)
+				if (filp->f_op && filp->f_op->fasync) {
+					lock_kernel();
 					error = filp->f_op->fasync(fd, filp, on);
+					unlock_kernel();
+				}
 				else error = -ENOTTY;
 			}
 			if (error != 0)
@@ -125,15 +122,46 @@ asmlinkage long sys_ioctl(unsigned int f
 			break;
 		default:
 			error = -ENOTTY;
-			if (S_ISREG(filp->f_dentry->d_inode->i_mode))
+			if (S_ISREG(filp->f_dentry->d_inode->i_mode)) {
 				error = file_ioctl(filp, cmd, arg);
-			else if (filp->f_op && filp->f_op->ioctl)
-				error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
+			}
+			if (error == -ENOTTY) {
+				unknown = 1;
+				goto out;
+			}
+			break;
+	}
+out:
+	*status = error;
+	return unknown;
+}
+
+asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct file *filp;
+	long error = -EBADF;
+	int fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (!filp)
+		goto out2;
+
+	if (!std_sys_ioctl(fd, cmd, arg, filp, &error))
+		goto out;
+
+	if (filp->f_op && filp->f_op->ioctl_native)
+		error = filp->f_op->ioctl_native(filp->f_dentry->d_inode,
+						 filp, cmd, arg);
+	else if (filp->f_op && filp->f_op->ioctl) {
+		lock_kernel();
+		error = filp->f_op->ioctl(filp->f_dentry->d_inode,
+					  filp, cmd, arg);
+		unlock_kernel();
 	}
-	unlock_kernel();
-	fput(filp);
 
 out:
+	fput_light(filp, fput_needed);
+out2:
 	return error;
 }
 
diff -puN include/linux/fs.h~ioctl-rework include/linux/fs.h
--- 25/include/linux/fs.h~ioctl-rework	Thu Dec 16 15:48:31 2004
+++ 25-akpm/include/linux/fs.h	Thu Dec 16 15:48:31 2004
@@ -907,6 +907,12 @@ typedef struct {
 
 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
 
+/* These macros are for out of kernel modules to test that
+ * the kernel supports the ioctl_native and ioctl_compat
+ * fields in struct file_operations. */
+#define HAVE_IOCTL_COMPAT 1
+#define HAVE_IOCTL_NATIVE 1
+
 /*
  * NOTE:
  * read, write, poll, fsync, readv, writev can be called
@@ -922,6 +928,24 @@ struct file_operations {
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+	/* The two calls ioctl_native and ioctl_compat described below
+	 * can be used as a replacement for the ioctl call above. They
+	 * take precedence over ioctl: thus if they are set, ioctl is
+	 * not used.  Unlike ioctl, BKL is not taken: drivers manage
+	 * their own locking. */
+
+	/* If ioctl_native is set, it is used instead of ioctl for
+	 * native ioctl syscalls.
+	 * Note that the standard glibc ioctl trims the return code to
+	 * type int, so dont try to put a 64 bit value there.
+	 */
+	long (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
+	/* If ioctl_compat is set, it is used for a 32 bit compatible
+	 * ioctl (i.e. a 32 bit binary running on a 64 bit OS).
+	 * Return -ENOIOCTLCMD if you dont handle it.
+	 * Note that only the low 32 bit of the return code are passed
+	 * to the user-space application. */
+	long (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *);
diff -puN include/linux/ioctl.h~ioctl-rework include/linux/ioctl.h
--- 25/include/linux/ioctl.h~ioctl-rework	Thu Dec 16 15:48:31 2004
+++ 25-akpm/include/linux/ioctl.h	Thu Dec 16 15:48:31 2004
@@ -3,5 +3,13 @@
 
 #include <asm/ioctl.h>
 
+/* Handles standard ioctl commands, and returns the result in status.
+   Does nothing and returns non-zero if cmd is not one of the standard commands.
+*/
+
+struct file;
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+		  struct file *filp, long *status);
+
 #endif /* _LINUX_IOCTL_H */
 
_