From: Gerrit Huizenga <gh@us.ibm.com>

CKRM processor scheduling delay accounting - provides a mechanism to record
the delays a task experiences.  In addition to counting their frequency, the
total delay in ns is also recorded.  CPU delays are reported as cpu-wait and
cpu-run.  I/O delays are recorded for memory and regular I/O.  Information is
accessible through /proc/<pid>/delay.

Signed-Off-By: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-Off-By: Hubertus Franke <frankeh@us.ibm.com>
Signed-Off-By: Shailabh Nagar <nagar@us.ibm.com>
Signed-Off-By: Gerrit Huizenga <gh@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 fs/proc/array.c            |   18 +++++++++
 fs/proc/base.c             |   17 ++++++++
 fs/proc/internal.h         |    1 
 include/linux/sched.h      |   88 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/taskdelays.h |   35 +++++++++++++++++
 init/Kconfig               |    8 ++++
 kernel/fork.c              |    1 
 kernel/sched.c             |   13 ++++++
 mm/memory.c                |   27 +++++++++----
 9 files changed, 198 insertions(+), 10 deletions(-)

diff -puN fs/proc/array.c~ckrm-processor-delay-accounting fs/proc/array.c
--- devel/fs/proc/array.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/array.c	2005-07-27 15:59:18.000000000 -0700
@@ -482,3 +482,21 @@ int proc_pid_statm(struct task_struct *t
 	return sprintf(buffer,"%d %d %d %d %d %d %d\n",
 		       size, resident, shared, text, lib, data, 0);
 }
+
+
+/*
+ * Write @task's delay counters to @buffer as one line, in the order runs,
+ * runcpu, waitcpu, iowaits, iowait, memwaits, mem_iowait; returns its length.
+ */
+int proc_pid_delay(struct task_struct *task, char * buffer)
+{
+	return sprintf(buffer,"%u %llu %llu %u %llu %u %llu\n",
+		       (unsigned int) get_delay(task,runs),
+		       (unsigned long long) get_delay(task,runcpu_total), /* match %llu */
+		       (unsigned long long) get_delay(task,waitcpu_total),
+		       (unsigned int) get_delay(task,num_iowaits),
+		       (unsigned long long) get_delay(task,iowait_total),
+		       (unsigned int) get_delay(task,num_memwaits),
+		       (unsigned long long) get_delay(task,mem_iowait_total));
+}
+
diff -puN fs/proc/base.c~ckrm-processor-delay-accounting fs/proc/base.c
--- devel/fs/proc/base.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/base.c	2005-07-27 15:59:18.000000000 -0700
@@ -158,6 +158,10 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TID_LOGINUID,
 #endif
+#ifdef CONFIG_DELAY_ACCT
+        PROC_TID_DELAY_ACCT,
+        PROC_TGID_DELAY_ACCT,
+#endif
 	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
 	PROC_TID_OOM_SCORE,
 	PROC_TID_OOM_ADJUST,
@@ -197,6 +201,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SECURITY
 	E(PROC_TGID_ATTR,      "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
+#ifdef CONFIG_DELAY_ACCT
+	E(PROC_TGID_DELAY_ACCT,"delay",   S_IFREG|S_IRUGO),
+#endif
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TGID_WCHAN,     "wchan",   S_IFREG|S_IRUGO),
 #endif
@@ -237,6 +244,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SECURITY
 	E(PROC_TID_ATTR,       "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
+#ifdef CONFIG_DELAY_ACCT
+	E(PROC_TID_DELAY_ACCT, "delay",   S_IFREG|S_IRUGO), /* per-thread entry uses the TID type */
+#endif
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TID_WCHAN,      "wchan",   S_IFREG|S_IRUGO),
 #endif
@@ -1664,6 +1674,13 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_wchan;
 			break;
 #endif
+#ifdef CONFIG_DELAY_ACCT
+		case PROC_TID_DELAY_ACCT:
+		case PROC_TGID_DELAY_ACCT:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_delay;
+			break;
+#endif
 #ifdef CONFIG_SCHEDSTATS
 		case PROC_TID_SCHEDSTAT:
 		case PROC_TGID_SCHEDSTAT:
diff -puN fs/proc/internal.h~ckrm-processor-delay-accounting fs/proc/internal.h
--- devel/fs/proc/internal.h~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/internal.h	2005-07-27 15:59:18.000000000 -0700
@@ -36,6 +36,7 @@ extern int proc_tid_stat(struct task_str
 extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern int proc_pid_delay(struct task_struct *, char*);
 
 static inline struct task_struct *proc_task(struct inode *inode)
 {
diff -puN include/linux/sched.h~ckrm-processor-delay-accounting include/linux/sched.h
--- devel/include/linux/sched.h~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/include/linux/sched.h	2005-07-27 15:59:18.000000000 -0700
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/seccomp.h>
+#include <linux/taskdelays.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -820,6 +821,9 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+#ifdef CONFIG_DELAY_ACCT
+	struct task_delay_info delays;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -870,6 +874,8 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_MEMIO	0x01000000      /* I am potentially doing I/O for mem */
+#define PF_IOWAIT	0x02000000      /* I am waiting on disk I/O */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1395,6 +1401,88 @@ static inline void thaw_processes(void) 
 static inline int try_to_freeze(void) { return 0; }
 
 #endif /* CONFIG_PM */
+
+/* API for registering delay info */
+#ifdef CONFIG_DELAY_ACCT
+
+#define test_delay_flag(tsk,flg)	((tsk)->flags & (flg))
+#define set_delay_flag(tsk,flg)		((tsk)->flags |= (flg))
+#define clear_delay_flag(tsk,flg)	((tsk)->flags &= ~(flg))
+
+#define def_delay_var(var)		unsigned long long var
+#define get_delay(tsk,field)		((tsk)->delays.field)
+
+#define start_delay(var)		((var) = sched_clock())
+#define start_delay_set(var,flg)	(set_delay_flag(current,flg),(var) = \
+							sched_clock())
+
+#define inc_delay(tsk,field)		(((tsk)->delays.field)++)
+
+/* Because of hardware timer drift on SMP systems, a task that continues on a
+ * different cpu than the one where start_ts was taken may see
+ * end_ts < start_ts by some usecs. In this case we ignore the diff
+ * and add nothing to the total.
+ */
+#ifdef CONFIG_SMP
+#define test_ts_integrity(start_ts,end_ts)  (likely((end_ts) > (start_ts)))
+#else
+#define test_ts_integrity(start_ts,end_ts)  (1)
+#endif
+
+#define add_delay_ts(tsk,field,start_ts,end_ts) \
+	do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0)
+
+#define add_delay_clear(tsk,field,start_ts,flg)		\
+	do {						\
+		unsigned long long now = sched_clock();	\
+		add_delay_ts(tsk,field,start_ts,now);	\
+		clear_delay_flag(tsk,flg);		\
+	} while (0)
+
+static inline void add_io_delay(unsigned long long dstart) /* account one I/O wait of current */
+{
+	struct task_struct * tsk = current;
+	unsigned long long now = sched_clock();
+	unsigned long long val;
+
+	if (test_ts_integrity(dstart,now))	/* end is after start: usable delta */
+		val = now - dstart;
+	else
+		val = 0;			/* wait is still counted, but adds no time */
+	if (test_delay_flag(tsk,PF_MEMIO)) {	/* PF_MEMIO is set around handle_mm_fault() */
+		tsk->delays.mem_iowait_total += val;
+		tsk->delays.num_memwaits++;
+	} else {				/* otherwise charge it as regular I/O */
+		tsk->delays.iowait_total += val;
+		tsk->delays.num_iowaits++;
+	}
+	clear_delay_flag(tsk,PF_IOWAIT);	/* pairs with start_delay_set(..., PF_IOWAIT) */
+}
+
+static inline void init_delays(struct task_struct *tsk)	/* zero all delay counters of tsk */
+{
+	memset(&tsk->delays, 0, sizeof(tsk->delays));
+}
+
+#else
+
+#define test_delay_flag(tsk,flg)                (0)
+#define set_delay_flag(tsk,flg)                 do { } while (0)
+#define clear_delay_flag(tsk,flg)               do { } while (0)
+
+#define def_delay_var(var)
+#define get_delay(tsk,field)                    (0)
+
+#define start_delay(var)                        do { } while (0)
+#define start_delay_set(var,flg)                do { } while (0)
+
+#define inc_delay(tsk,field)                    do { } while (0)
+#define add_delay_ts(tsk,field,start_ts,now)    do { } while (0)
+#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0)
+#define add_io_delay(dstart)			do { } while (0)
+#define init_delays(tsk)                        do { } while (0)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
diff -puN /dev/null include/linux/taskdelays.h
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ devel-akpm/include/linux/taskdelays.h	2005-07-27 15:59:18.000000000 -0700
@@ -0,0 +1,35 @@
+/* taskdelays.h - for delay accounting
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ *
+ * Has the data structure for delay counting.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKDELAYS_H
+#define _LINUX_TASKDELAYS_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+
+struct task_delay_info {
+	/* delay statistics in nsecs (totals are sums of sched_clock() deltas) */
+	uint64_t waitcpu_total;		/* added in schedule() when the task is switched in */
+	uint64_t runcpu_total;		/* added in schedule() for the outgoing task */
+	uint64_t iowait_total;		/* io_schedule*() waits taken without PF_MEMIO set */
+	uint64_t mem_iowait_total;	/* io_schedule*() waits taken with PF_MEMIO set */
+	uint32_t runs;			/* number of times the task was switched in */
+	uint32_t num_iowaits;		/* number of waits added to iowait_total */
+	uint32_t num_memwaits;		/* number of waits added to mem_iowait_total */
+};
+
+#endif /* _LINUX_TASKDELAYS_H */
diff -puN init/Kconfig~ckrm-processor-delay-accounting init/Kconfig
--- devel/init/Kconfig~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/init/Kconfig	2005-07-27 15:59:18.000000000 -0700
@@ -262,6 +262,14 @@ menuconfig EMBEDDED
           environments which can tolerate a "non-standard" kernel.
           Only use this if you really know what you are doing.
 
+config DELAY_ACCT
+	bool "Enable delay accounting (EXPERIMENTAL)"
+	help
+	  Record per-task delays. In addition to counting their frequency,
+	  the total delay in ns is also recorded. CPU delays are reported as
+	  cpu-wait and cpu-run. I/O delays are recorded for memory and
+	  regular I/O. Information is accessible through /proc/<pid>/delay.
+
 config KALLSYMS
 	 bool "Load all symbols for debugging/kksymoops" if EMBEDDED
 	 default y
diff -puN kernel/fork.c~ckrm-processor-delay-accounting kernel/fork.c
--- devel/kernel/fork.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/kernel/fork.c	2005-07-27 15:59:18.000000000 -0700
@@ -906,6 +906,7 @@ static task_t *copy_process(unsigned lon
 	if (p->binfmt && !try_module_get(p->binfmt->module))
 		goto bad_fork_cleanup_put_domain;
 
+	init_delays(p);
 	p->did_exec = 0;
 	copy_flags(clone_flags, p);
 	p->pid = pid;
diff -puN kernel/sched.c~ckrm-processor-delay-accounting kernel/sched.c
--- devel/kernel/sched.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/kernel/sched.c	2005-07-27 15:59:18.000000000 -0700
@@ -827,11 +827,13 @@ static inline void resched_task(task_t *
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
  */
-inline int task_curr(const task_t *p)
+int task_curr(const task_t *p)
 {
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+EXPORT_SYMBOL_GPL(task_curr);
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -2892,6 +2894,7 @@ switch_tasks:
 
 	update_cpu_clock(prev, rq, now);
 
+	add_delay_ts(prev, runcpu_total, prev->timestamp, now);
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
@@ -2899,6 +2902,8 @@ switch_tasks:
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		add_delay_ts(next, waitcpu_total, next->timestamp, now);
+		inc_delay(next, runs);
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -3975,9 +3980,12 @@ void __sched io_schedule(void)
 {
 	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 
+	def_delay_var(dstart);
+	start_delay_set(dstart, PF_IOWAIT);
 	atomic_inc(&rq->nr_iowait);
 	schedule();
 	atomic_dec(&rq->nr_iowait);
+	add_io_delay(dstart);
 }
 
 EXPORT_SYMBOL(io_schedule);
@@ -3986,10 +3994,13 @@ long __sched io_schedule_timeout(long ti
 {
 	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 	long ret;
+	def_delay_var(dstart);
 
+	start_delay_set(dstart,PF_IOWAIT);
 	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	add_io_delay(dstart);
 	return ret;
 }
 
diff -puN mm/memory.c~ckrm-processor-delay-accounting mm/memory.c
--- devel/mm/memory.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/mm/memory.c	2005-07-27 16:02:17.000000000 -0700
@@ -2065,6 +2065,7 @@ int handle_mm_fault(struct mm_struct *mm
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	int rc;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -2084,6 +2085,7 @@ int handle_mm_fault(struct mm_struct *mm
 	 * pte_alloc_map here.
 	 */
 	page_table_atomic_start(mm);
+	set_delay_flag(current, PF_MEMIO);
 	pgd = pgd_offset(mm, address);
 	if (unlikely(pgd_none(*pgd))) {
 		pud_t *new;
@@ -2091,8 +2093,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pud_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 		if (!pgd_test_and_populate(mm, pgd, new))
@@ -2106,8 +2110,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pmd_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 
@@ -2122,8 +2128,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pte_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 
@@ -2136,9 +2144,10 @@ int handle_mm_fault(struct mm_struct *mm
 	}
 
 	pte = pte_offset_map(pmd, address);
-	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
-oom:
-	return VM_FAULT_OOM;
+	rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+out:
+	clear_delay_flag(current, PF_MEMIO);
+	return rc;
 
 sigbus:
 	return VM_FAULT_SIGBUS;
_