From: NeilBrown <neilb@cse.unsw.edu.au>

From: Fred.  The kernel does not do all the UTF-8 checking it could, but it
does perform some simple checks.  Implement slightly stricter, and probably
more efficient, checking.


---

 25-akpm/fs/nfsd/nfs4xdr.c |  153 ++++++++++++++++++++++++----------------------
 1 files changed, 80 insertions(+), 73 deletions(-)

diff -puN fs/nfsd/nfs4xdr.c~knfsd-06-UTF8-improvements fs/nfsd/nfs4xdr.c
--- 25/fs/nfsd/nfs4xdr.c~knfsd-06-UTF8-improvements	2004-04-03 02:59:51.554515464 -0800
+++ 25-akpm/fs/nfsd/nfs4xdr.c	2004-04-03 02:59:51.559514704 -0800
@@ -58,93 +58,94 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-/*
- * From Peter Astrand <peter@cendio.se>: The following routines check
- * whether a filename supplied by the client is valid.
- */
-static const char trailing_bytes_for_utf8[256] = {
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+static const char utf8_byte_len[256] = {
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
 };
 
 static inline int
-is_legal_iso_utf8_sequence(unsigned char *source, int length)
+is_legal_utf8_sequence(unsigned char *source, int length)
 {
-	unsigned char a;
-	unsigned char *srcptr;
+	unsigned char *ptr;
+	unsigned char c;
 
-	srcptr = source + length;
+	if (length==1) return 1;
 
-	switch (length) {
-		/* Everything else falls through when "1"... */
+	/* Check for overlong sequence, and check second byte */
+	c = *(source + 1);
+	switch (*source) {
+	case 0xE0: /* 3 bytes */
+		if ( c < 0xA0 ) return 0;
+		break;
+	case 0xF0: /* 4 bytes */
+		if ( c < 0x90 ) return 0;
+		break;
+	case 0xF8: /* 5 bytes */
+		if ( c < 0xC8 ) return 0;
+		break;
+	case 0xFC: /* 6 bytes */
+		if ( c < 0x84 ) return 0;
+		break;
 	default:
-		/* Sequences with more than 6 bytes are invalid */
-		return 0;
+		if ( (c & 0xC0) != 0x80) return 0;
+	}
 
-		/*
-		   Byte 3-6 must be 80..BF
-		*/
-	case 6:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 5:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 4:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 3:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-
-	case 2:
-		a = *--srcptr;
-
-		/* Upper limit */
-		if (a > 0xBF)
-			/* 2nd byte may never be > 0xBF */
-			return 0;
+	/* Check that trailing bytes look like 10xxxxxx */
+	for (ptr = source++ + length - 1; ptr>source; ptr--)
+		if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
+	return 1;
+}
 
-		/*
-		   Lower limits checks, to detect non-shortest forms.
-		   No fall-through in this inner switch.
-		*/
-		switch (*source) {
-		case 0xE0: /* 3 bytes */
-			if (a < 0xA0) return 0;
-			break;
-		case 0xF0: /* 4 bytes */
-			if (a < 0x90) return 0;
-			break;
-		case 0xF8: /* 5 bytes */
-			if (a < 0xC8) return 0;
-			break;
-		case 0xFC: /* 6 bytes */
-			if (a < 0x84) return 0;
-			break;
-		default:
-			/* In all cases, 2nd byte must be >= 0x80 (because leading
-			   10...) */
-			if (a < 0x80) return 0;
-		}
+/* This does some screening on disallowed unicode characters.  It is NOT
+ * comprehensive.
+ */
+static int
+is_allowed_utf8_char(unsigned char *source, int length)
+{
+	/* We assume length and source point to a valid utf8 sequence */
+	unsigned char c;
 
-	case 1:
-		/* Invalid ranges */
-		if (*source >= 0x80 && *source < 0xC2)
-			/* Multibyte char with value < 0xC2, non-shortest */
-			return 0;
-		if (*source > 0xFD)
-			/* Leading byte starting with 11111110 is illegal */
-			return 0;
-		if (!*source)
-			return 0;
+	/* Disallow F0000 and up (in utf8, F3B08080) */
+	if (*source > 0xF3 ) return 0;
+	c = *(source + 1);
+	switch (*source) {
+	case 0xF3:
+		if (c >= 0xB0) return 0;
+		break;
+	/* Disallow D800-F8FF (in utf8, EDA080-EFA3BF) */
+	case 0xED:
+		if (c >= 0xA0) return 0;
+		break;
+	case 0xEE:
+		return 0;
+		break;
+	case 0xEF:
+		if (c <= 0xA3) return 0;
+	/* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
+		if (c==0xBF)
+			/* Don't need to check <=0xBF, since valid utf8 */
+			if ( *(source+2) >= 0xB9) return 0;
+		break;
 	}
-
 	return 1;
 }
 
+/* This routine should really check to see that the proper stringprep
+ * mappings have been applied.  Instead, we do a simple screen of some
+ * of the more obvious illegal values by calling is_allowed_utf8_char.
+ * This will allow many illegal strings through, but if a client behaves,
+ * it will get full functionality.  The other option (apart from full
+ * stringprep checking) is to limit everything to an easily handled subset,
+ * such as 7-bit ascii.
+ *
+ * Note - currently calling routines ignore return value except as boolean.
+ */
 static int
 check_utf8(char *str, int len)
 {
@@ -155,11 +156,17 @@ check_utf8(char *str, int len)
 	sourceend = str + len;
 
 	while (chunk < sourceend) {
-		chunklen = trailing_bytes_for_utf8[*chunk]+1;
+		chunklen = utf8_byte_len[*chunk];
+		if (!chunklen)
+			return nfserr_inval;
 		if (chunk + chunklen > sourceend)
 			return nfserr_inval;
-		if (!is_legal_iso_utf8_sequence(chunk, chunklen))
+		if (!is_legal_utf8_sequence(chunk, chunklen))
+			return nfserr_inval;
+		if (!is_allowed_utf8_char(chunk, chunklen))
 			return nfserr_inval;
+		if ( (chunklen==1) && (!*chunk) )
+			return nfserr_inval; /* Disallow embedded nulls */
 		chunk += chunklen;
 	}
 

_