From: Jens Axboe <axboe@suse.de>

core bits

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/drivers/block/ll_rw_blk.c   |  132 +++++++++++++++++++++++++++++++++---
 25-akpm/fs/buffer.c                 |   23 +++++-
 25-akpm/include/linux/bio.h         |    3 
 25-akpm/include/linux/blkdev.h      |   15 ++++
 25-akpm/include/linux/buffer_head.h |    6 +
 25-akpm/include/linux/fs.h          |    1 
 6 files changed, 164 insertions(+), 16 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~disk-barrier-core drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~disk-barrier-core	2004-05-27 02:09:09.175703992 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2004-05-27 02:09:09.192701408 -0700
@@ -263,6 +263,45 @@ void blk_queue_make_request(request_queu
 EXPORT_SYMBOL(blk_queue_make_request);
 
 /**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q:     the request queue
+ * @flag:  see below
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+	if (flag)
+		set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+	else
+		clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_ordered - set function for issuing a flush
+ * @q:     the request queue
+ * @iff:   the function to be called issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a flush command, the support is notified
+ *   to the block layer by defining it through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+	q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
+/**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
  * @dma_addr:   bus address limit
@@ -1873,10 +1912,11 @@ int blk_execute_rq(request_queue_t *q, s
 	}
 
 	rq->flags |= REQ_NOMERGE;
-	rq->waiting = &wait;
+	if (!rq->waiting)
+		rq->waiting = &wait;
 	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
 	generic_unplug_device(q);
-	wait_for_completion(&wait);
+	wait_for_completion(rq->waiting);
 	rq->waiting = NULL;
 
 	if (rq->errors)
@@ -1887,6 +1927,74 @@ int blk_execute_rq(request_queue_t *q, s
 
 EXPORT_SYMBOL(blk_execute_rq);
 
+/**
+ * blk_issue_flush - queue a flush
+ * @bdev:	blockdev to issue flush for
+ * @error_sector:	error sector
+ * @wait:	completion event
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to. Passing in @wait makes the interface async, caller must
+ *    wait_for_completion() on its own.
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+	request_queue_t *q;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+	if (!q->issue_flush_fn)
+		return -EOPNOTSUPP;
+
+	return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
+ * @q:		device queue
+ * @disk:	gendisk
+ * @error_sector:	error offset
+ *
+ * Description:
+ *    Devices understanding the SCSI command set, can use this function as
+ *    a helper for issuing a cache flush. Note: driver is required to store
+ *    the error offset (in case of error flushing) in ->sector of struct
+ *    request.
+ */
+int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	int ret;
+
+	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+	rq->sector = 0;
+	memset(rq->cmd, 0, sizeof(rq->cmd));
+	rq->cmd[0] = 0x35;
+	rq->cmd_len = 12;
+	rq->data = NULL;
+	rq->data_len = 0;
+	rq->timeout = 60 * HZ;
+
+	ret = blk_execute_rq(q, disk, rq);
+
+	if (ret && error_sector)
+		*error_sector = rq->sector;
+
+	blk_put_request(rq);
+	return ret;
+}
+
+EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
+
 void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
 	int rw = rq_data_dir(rq);
@@ -2140,7 +2248,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
-	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2158,9 +2266,11 @@ static int __make_request(request_queue_
 
 	spin_lock_prefetch(q->queue_lock);
 
-	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+	barrier = bio_barrier(bio);
+	if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
 again:
 	spin_lock_irq(q->queue_lock);
@@ -2240,7 +2350,8 @@ get_rq:
 			/*
 			 * READA bit set
 			 */
-			if (ra)
+			err = -EWOULDBLOCK;
+			if (bio_rw_ahead(bio))
 				goto end_io;
 	
 			freereq = get_request_wait(q, rw);
@@ -2251,10 +2362,9 @@ get_rq:
 	req->flags |= REQ_CMD;
 
 	/*
-	 * inherit FAILFAST from bio and don't stack up
-	 * retries for read ahead
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
 	 */
-	if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))	
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
 		req->flags |= REQ_FAILFAST;
 
 	/*
@@ -2292,7 +2402,7 @@ out:
 	return 0;
 
 end_io:
-	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+	bio_endio(bio, nr_sectors << 9, err);
 	return 0;
 }
 
diff -puN fs/buffer.c~disk-barrier-core fs/buffer.c
--- 25/fs/buffer.c~disk-barrier-core	2004-05-27 02:09:09.177703688 -0700
+++ 25-akpm/fs/buffer.c	2004-05-27 02:09:09.194701104 -0700
@@ -2753,21 +2753,31 @@ static int end_bio_bh_io_sync(struct bio
 	if (bio->bi_size)
 		return 1;
 
+	if (err == -EOPNOTSUPP)
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 	return 0;
 }
 
-void submit_bh(int rw, struct buffer_head * bh)
+int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
+	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
 
-	/* Only clear out a write error when rewriting */
-	if (test_set_buffer_req(bh) && rw == WRITE)
+	if (buffer_ordered(bh) && (rw == WRITE))
+		rw = WRITE_BARRIER;
+
+	/*
+	 * Only clear out a write error when rewriting, should this
+	 * include WRITE_SYNC as well?
+	 */
+	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
 		clear_buffer_write_io_error(bh);
 
 	/*
@@ -2789,7 +2799,14 @@ void submit_bh(int rw, struct buffer_hea
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
+	bio_get(bio);
 	submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+
+	bio_put(bio);
+	return ret;
 }
 
 /**
diff -puN include/linux/bio.h~disk-barrier-core include/linux/bio.h
--- 25/include/linux/bio.h~disk-barrier-core	2004-05-27 02:09:09.179703384 -0700
+++ 25-akpm/include/linux/bio.h	2004-05-27 02:09:09.195700952 -0700
@@ -102,6 +102,7 @@ struct bio {
 #define BIO_SEG_VALID	3	/* nr_hw_seg valid */
 #define BIO_CLONED	4	/* doesn't own data */
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
+#define BIO_EOPNOTSUPP	6	/* not supported */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -141,6 +142,8 @@ struct bio {
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
 #define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
+#define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
+#define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 
 /*
  * will die
diff -puN include/linux/blkdev.h~disk-barrier-core include/linux/blkdev.h
--- 25/include/linux/blkdev.h~disk-barrier-core	2004-05-27 02:09:09.180703232 -0700
+++ 25-akpm/include/linux/blkdev.h	2004-05-27 02:09:09.196700800 -0700
@@ -195,6 +195,8 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
+	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
+	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -220,6 +222,8 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
+#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
+#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -248,6 +252,7 @@ typedef void (unplug_fn) (request_queue_
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
+typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -290,6 +295,7 @@ struct request_queue
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	activity_fn		*activity_fn;
+	issue_flush_fn		*issue_flush_fn;
 
 	/*
 	 * Auto-unplugging state
@@ -373,6 +379,7 @@ struct request_queue
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
+#define QUEUE_FLAG_ORDERED	8	/* supports ordered writes */
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -390,6 +397,10 @@ struct request_queue
 #define blk_pm_request(rq)	\
 	((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME))
 
+#define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
+#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
+#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
 #define rq_data_dir(rq)		((rq)->flags & 1)
@@ -588,6 +599,9 @@ extern void blk_queue_prep_rq(request_qu
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern void blk_queue_ordered(request_queue_t *, int);
+extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
+extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
@@ -615,6 +629,7 @@ extern long blk_congestion_wait(int rw, 
 
 extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
 extern void blk_rq_prep_restart(struct request *);
+extern int blkdev_issue_flush(struct block_device *, sector_t *);
 
 #define MAX_PHYS_SEGMENTS 128
 #define MAX_HW_SEGMENTS 128
diff -puN include/linux/buffer_head.h~disk-barrier-core include/linux/buffer_head.h
--- 25/include/linux/buffer_head.h~disk-barrier-core	2004-05-27 02:09:09.182702928 -0700
+++ 25-akpm/include/linux/buffer_head.h	2004-05-27 02:09:09.197700648 -0700
@@ -26,6 +26,7 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -110,7 +111,8 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
@@ -173,7 +175,7 @@ void FASTCALL(unlock_buffer(struct buffe
 void FASTCALL(__lock_buffer(struct buffer_head *bh));
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 void sync_dirty_buffer(struct buffer_head *bh);
-void submit_bh(int, struct buffer_head *);
+int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 
diff -puN include/linux/fs.h~disk-barrier-core include/linux/fs.h
--- 25/include/linux/fs.h~disk-barrier-core	2004-05-27 02:09:09.184702624 -0700
+++ 25-akpm/include/linux/fs.h	2004-05-27 02:09:09.199700344 -0700
@@ -83,6 +83,7 @@ extern int leases_enable, dir_notify_ena
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
+#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2
_