http://linus.bkbits.net/linux-2.5
greg@kroah.com[torvalds]|ChangeSet|20040331005319|52424 greg
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/03/30 16:53:19-08:00 greg@kroah.com 
#   [PATCH] back out sysfs reference count change
#   
#   This backs out Maneesh's sysfs patch that was recently added to the
#   kernel.
#   
#   In its defense, the original patch did fix some problems that could be
#   reproduced on SMP machines, but the side effect of the patch caused lots
#   of problems.  Basically it caused kobjects to get their references
#   incremented when files that are not present in the kobject are asked for
#   (udev can easily trigger this when it looks for files called "dev" in
#   directories that do not have that file).  This can cause easy oopses
#   when the VFS later ages out those old dentries and the kobject has its
#   reference finally released (usually after the module that the kobject
#   lived in was removed.)
#   
#   I will continue to work with Maneesh to try to solve the original bug,
#   but for now, this patch needs to be applied.
# 
# fs/sysfs/dir.c
#   2004/03/30 07:23:21-08:00 greg@kroah.com +1 -14
#   back out sysfs reference count change
# 
# ChangeSet
#   2004/03/30 16:53:09-08:00 rth@twiddle.net 
#   [PATCH] Alpha: UP1500 pci_mem fix
#   
#   From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
#   
#   The memory reserved for PCI probe is not freed properly in some cases,
#   for instance with a 3.5Gb of RAM.
#   
#   Forward port from 2.4.
# 
# arch/alpha/kernel/sys_nautilus.c
#   2003/08/20 04:30:07-07:00 rth@twiddle.net +5 -3
#   Alpha: UP1500 pci_mem fix
# 
# ChangeSet
#   2004/03/30 10:52:10-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc64: More incorrect syscall error test
#   
#   Oops, there were two different code paths affected by this
#   bug (strace and normal) and I fixed only one. Here's the
#   other one:
# 
# arch/ppc64/kernel/entry.S
#   2004/03/30 07:59:43-08:00 benh@kernel.crashing.org +1 -1
#   ppc64: More incorrect syscall error test
# 
# ChangeSet
#   2004/03/30 10:51:56-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc64: Add a sync in context switch on SMP
#   
#   For the same reason as ppc32, we need to ensure that all stores
#   done on a CPU have reached the coherency domain and are visible
#   to loads done by another CPU when context switching as the same
#   thread may be rescheduled almost right away there.
# 
# arch/ppc64/kernel/entry.S
#   2004/03/29 20:55:47-08:00 benh@kernel.crashing.org +8 -0
#   ppc64: Add a sync in context switch on SMP
# 
# ChangeSet
#   2004/03/30 10:51:44-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc32: PCI mmap update
#   
#   This updates the ppc32 PCI mmap facility to allow mmap'ing of space
#   outside of the actual devices, using the host bridge resources instead. 
#   
#   This allow userland to map things like legacy IO space by either using
#   the bridge device itself, or simply any PCI device on the same bus
#   domain
# 
# arch/ppc/kernel/pci.c
#   2004/03/28 19:56:09-08:00 benh@kernel.crashing.org +28 -37
#   ppc32: PCI mmap update
# 
# ChangeSet
#   2004/03/30 10:51:30-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc32: Allow PREEMPT with SMP in KConfig
#   
#   On ppc32, CONFIG_PREEMPT wasn't settable along with CONFIG_SMP
#   for historical reasons (smp_processor_id() races). Those races have
#   been fixed since then (well, should have been at least) so it's now
#   safe to allow both options.
# 
# arch/ppc/Kconfig
#   2004/03/30 07:39:41-08:00 benh@kernel.crashing.org +0 -4
#   ppc32: Allow PREEMPT with SMP in KConfig
# 
# ChangeSet
#   2004/03/30 10:51:17-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc32: context switch  fixes
#   
#   This fixes a few issues with context switch on ppc32:
#   
#    - Makes sure we properly flush out all stores to the coherency domain
#      when switching out, since the same thread could be switched back in
#      on another CPU right away, those stores must be visible to all other
#      CPUs. 
#   
#    - Remove dssall in the assembly calls and do it now once in switch_mm
#      (stop vmx streams).  Assume the G5 doesn't need a sync after dssall. 
#   
#    - Remove bogus isync in the loop setting the userland segment registers
#   
#    - Do not switch the userland segments when the mm stays the same
# 
# include/asm-ppc/mmu_context.h
#   2004/03/29 20:58:44-08:00 benh@kernel.crashing.org +19 -6
#   ppc32: context switch  fixes
# 
# include/asm-ppc/cputable.h
#   2004/03/29 20:58:43-08:00 benh@kernel.crashing.org +16 -2
#   ppc32: context switch  fixes
# 
# arch/ppc/kernel/head.S
#   2004/03/29 20:55:41-08:00 benh@kernel.crashing.org +2 -5
#   ppc32: context switch  fixes
# 
# arch/ppc/kernel/entry.S
#   2004/03/29 20:55:41-08:00 benh@kernel.crashing.org +9 -0
#   ppc32: context switch  fixes
# 
# ChangeSet
#   2004/03/30 10:51:04-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc32: Remove duplicate export
#   
#   enable_kernel_fp is exported both in ppc_ksyms and near its
#   definition in process.c, remove the former.
# 
# arch/ppc/kernel/ppc_ksyms.c
#   2004/03/29 19:00:44-08:00 benh@kernel.crashing.org +0 -1
#   ppc32: Remove duplicate export
# 
# ChangeSet
#   2004/03/30 10:50:51-08:00 benh@kernel.crashing.org 
#   [PATCH] ppc32: Even more preempt fixes
#   
#   Add a warning if enable_kernel_{fp,altivec} is called with preempt
#   enabled since this is always an error, and make sure the alignment
#   exception handler properly disables preempt when doing FP operations.
# 
# arch/ppc/kernel/process.c
#   2004/03/29 20:55:42-08:00 benh@kernel.crashing.org +6 -4
#   ppc32: Even more preempt fixes
# 
# arch/ppc/kernel/align.c
#   2004/03/29 19:00:44-08:00 benh@kernel.crashing.org +4 -0
#   ppc32: Even more preempt fixes
# 
# ChangeSet
#   2004/03/30 10:49:13-08:00 vatsa@in.ibm.com 
#   [PATCH] Fix obvious stupid race in do_stop
#   
#   We don't set the task state to TASK_INTERRUPTIBLE _before_ checking for
#   kthread_should_stop in do_stop.
# 
# kernel/stop_machine.c
#   2004/03/08 22:53:56-08:00 vatsa@in.ibm.com +3 -1
#   Fix obvious stupid race in do_stop
# 
# ChangeSet
#   2004/03/30 10:47:17-08:00 marcelo.tosatti@cyclades.com 
#   [PATCH] pc300 driver misplaced ;
#   
#   From Dave Jones.
#   
#   Oops.
# 
# drivers/net/wan/pc300_drv.c
#   2004/03/30 06:32:11-08:00 marcelo.tosatti@cyclades.com +1 -1
#   pc300 driver misplaced ;
# 
# ChangeSet
#   2004/03/30 10:47:05-08:00 armin@melware.de 
#   [PATCH] ISDN Eicon driver: NULL pointer check inside spinlock
#   
#      Check for valid application pointer inside api spinlock
#      in diva_send_message().
# 
# drivers/isdn/hardware/eicon/capifunc.c
#   2004/03/30 06:17:51-08:00 armin@melware.de +4 -3
#   ISDN Eicon driver: NULL pointer check inside spinlock
# 
# ChangeSet
#   2004/03/30 10:41:57-08:00 akpm@osdl.org 
#   [PATCH] Make pdflush run at nice 0
#   
#   Since pdflush was converted to be launched by the kthread infrastructure it
#   has inherited keventd's `nice -10' setting.  That hurts interactivity when
#   pdflush is doing lots of work writing back through the dm-crypt layer.
#   
#   So set pdflush back to `nice 0'.
# 
# mm/pdflush.c
#   2004/03/30 09:58:09-08:00 akpm@osdl.org +6 -0
#   Make pdflush run at nice 0
# 
# ChangeSet
#   2004/03/30 10:41:44-08:00 akpm@osdl.org 
#   [PATCH] catch errors when completing bio pairs
#   
#   From: Mike Christie <michaelc@cs.wisc.edu>
#   
#   A couple of drivers can sometimes fail the first segments in a bio then
#   requeue the rest of the request.  In this situation, if the last part of
#   the bio completes successfully bio_pair_end_* will miss that the beginning
#   of the bio had failed because they just return one when bi_size is not yet
#   zero.  The attached patch moves the error value test before the bi_size to
#   catch the above case.
# 
# fs/bio.c
#   2004/03/23 07:05:19-08:00 akpm@osdl.org +6 -4
#   catch errors when completing bio pairs
# 
# ChangeSet
#   2004/03/30 10:41:31-08:00 akpm@osdl.org 
#   [PATCH] Fix BLKPREP_KILL
#   
#   From: Jens Axboe <axboe@suse.de>
#   
#   Samuel Rydh wrote:
#   
#   If a MODE_SENSE(6) command is sent to an IDE cd using the CDROM_SEND_PACKET
#   ioctl, then the kernel freezes solidly. To reproduce this, one can take the
#   SCSI cmd [1a 08 31 00 10 00] and a 16 byte data buffer.
#   
#   After some bug hunting, I found out that the following is what happens:
#   
#   - ide-cd recognizes that MODE_SENSE(6) isn't supported and tries
#     to abort the request from ide_cdrom_prep_pc by returning BLKPREP_KILL.
#   
#   - in elv_next_request(), the kill request is handled by
#     the following code:
#   
#   	while (end_that_request_first(rq, 0, rq->nr_sectors))
#   		;
#   	end_that_request_last(rq);
#   
#   The while loop never exits. The end_that_request_first() doesn't do anything
#   since rq->nr_sectors is 0; it just returns "not-done" after handling those 0
#   bytes (rq->bio->bi_size is 16).
# 
# drivers/block/elevator.c
#   2004/03/23 06:45:00-08:00 akpm@osdl.org +6 -2
#   Fix BLKPREP_KILL
# 
# ChangeSet
#   2004/03/29 20:26:56-08:00 laforge@netfilter.org 
#   [NETFILTER]: Fix DELETE_LIST oopses.
#   
#   We've now narrowed down the issue of kernel oopses in combination with
#   'LIST_DELETE' syslog messages happening in certain setups.
#   
#   Apparently people who do not enable CONFIG_IP_NF_NAT_LOCAL and do
#   DNAT/REDIRECT and want to connect locally from the gateway via DNAT to
#   the DNAT'ed address experience the bug ;)
#   
#   Patch courtesy of KOVACS Krisztian and Henrik Nordstrom
# 
# net/ipv4/netfilter/ip_nat_standalone.c
#   2004/03/29 20:26:43-08:00 laforge@netfilter.org +10 -1
#   [NETFILTER]: Fix DELETE_LIST oopses.
#   
#   We've now narrowed down the issue of kernel oopses in combination with
#   'LIST_DELETE' syslog messages happening in certain setups.
#   
#   Apparently people who do not enable CONFIG_IP_NF_NAT_LOCAL and do
#   DNAT/REDIRECT and want to connect locally from the gateway via DNAT to
#   the DNAT'ed address experience the bug ;)
#   
#   Patch courtesy of KOVACS Krisztian and Henrik Nordstrom
# 
# ChangeSet
#   2004/03/29 20:19:57-08:00 laforge@netfilter.org 
#   [NETFILTER]: Fix DEBUG compile in ipt_MASQUERADE.
# 
# net/ipv4/netfilter/ipt_MASQUERADE.c
#   2004/03/29 20:19:44-08:00 laforge@netfilter.org +1 -1
#   [NETFILTER]: Fix DEBUG compile in ipt_MASQUERADE.
# 
# ChangeSet
#   2004/03/29 20:11:56-08:00 uaca@alumni.uv.es 
#   [AF_PACKET]: Add PACKET_MMAP documentation.
# 
# net/Kconfig
#   2004/03/29 20:11:38-08:00 uaca@alumni.uv.es +0 -0
#   [AF_PACKET]: Add PACKET_MMAP documentation.
# 
# Documentation/networking/packet_mmap.txt
#   2004/03/29 20:11:32-08:00 uaca@alumni.uv.es +412 -0
#   [AF_PACKET]: Add PACKET_MMAP documentation.
# 
# Documentation/networking/packet_mmap.txt
#   2004/03/29 20:11:32-08:00 uaca@alumni.uv.es +0 -0
#   BitKeeper file /disk1/BK/net-2.6/Documentation/networking/packet_mmap.txt
# 
# ChangeSet
#   2004/03/28 21:51:55-08:00 niv@us.ibm.com 
#   [TCP]: Use tcp_tw_put on time-wait sockets.
# 
# net/ipv4/tcp_ipv4.c
#   2004/03/28 21:51:37-08:00 niv@us.ibm.com +6 -3
#   [TCP]: Use tcp_tw_put on time-wait sockets.
# 
# ChangeSet
#   2004/03/28 01:56:20-08:00 jmorris@redhat.com 
#   [IPV6]: Link some packet walker helpers always statically.
#   
#   Put the extension header helper funcs always statically into
#   the kernel even if ipv6 is built as a module, this is needed
#   for things like SELinux.
# 
# net/ipv6/ipv6_syms.c
#   2004/03/28 01:55:27-08:00 jmorris@redhat.com +0 -2
#   [IPV6]: Link some packet walker helpers always statically.
# 
# net/ipv6/exthdrs.c
#   2004/03/28 01:55:27-08:00 jmorris@redhat.com +0 -102
#   [IPV6]: Link some packet walker helpers always statically.
# 
# net/ipv6/Makefile
#   2004/03/28 01:55:27-08:00 jmorris@redhat.com +2 -0
#   [IPV6]: Link some packet walker helpers always statically.
# 
# net/Makefile
#   2004/03/28 01:55:27-08:00 jmorris@redhat.com +3 -1
#   [IPV6]: Link some packet walker helpers always statically.
# 
# net/ipv6/exthdrs_core.c
#   2004/03/28 01:55:23-08:00 jmorris@redhat.com +108 -0
#   [IPV6]: Link some packet walker helpers always statically.
# 
# net/ipv6/exthdrs_core.c
#   2004/03/28 01:55:23-08:00 jmorris@redhat.com +0 -0
#   BitKeeper file /disk1/BK/net-2.6/net/ipv6/exthdrs_core.c
# 
# ChangeSet
#   2004/03/28 01:54:03-08:00 uaca@alumni.uv.es 
#   [AF_PACKET]: Fix packet_set_ring memleak and remove num frame limit.
# 
# net/packet/af_packet.c
#   2004/03/28 01:50:58-08:00 uaca@alumni.uv.es +53 -36
#   [AF_PACKET]: Fix packet_set_ring memleak and remove num frame limit.
# 
diff -Nru a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/Documentation/networking/packet_mmap.txt	Tue Mar 30 20:00:37 2004
@@ -0,0 +1,412 @@
+
+DaveM:
+
+If you agree with it I will send two small patches to modify
+kernel's configure help. 
+
+	Ulisses
+
+--------------------------------------------------------------------------------
++ ABSTRACT
+--------------------------------------------------------------------------------
+
+This file documents the CONFIG_PACKET_MMAP option available with the PACKET
+socket interface on 2.4 and 2.6 kernels. This type of socket is used to
+capture network traffic with utilities like tcpdump or any other that uses
+the libpcap library.
+
+You can find the latest version of this document at
+
+    http://pusa.uv.es/~ulisses/packet_mmap/
+
+Please send me your comments to
+
+    Ulisses Alonso Camará <uaca@i.hate.spam.alumni.uv.es>
+
+-------------------------------------------------------------------------------
++ Why use PACKET_MMAP
+--------------------------------------------------------------------------------
+
+In Linux 2.4/2.6 if PACKET_MMAP is not enabled, the capture process is very
+inefficient. It uses very limited buffers and requires one system call
+to capture each packet, it requires two if you want to get packet's 
+timestamp (like libpcap always does).
+
+On the other hand, PACKET_MMAP is very efficient. PACKET_MMAP provides a size
+configurable circular buffer mapped in user space. This way reading packets just 
+needs to wait for them, most of the time there is no need to issue a single 
+system call. By using a shared buffer between the kernel and the user 
+also has the benefit of minimizing packet copies.
+
+It's fine to use PACKET_MMAP to improve the performance of the capture process, 
+but it isn't everything. At least, if you are capturing at high speeds (this 
+is relative to the cpu speed), you should check if the device driver of your 
+network interface card supports some sort of interrupt load mitigation or 
+(even better) if it supports NAPI, also make sure it is enabled.
+
+--------------------------------------------------------------------------------
++ How to use CONFIG_PACKET_MMAP
+--------------------------------------------------------------------------------
+
+From the user standpoint, you should use the higher level libpcap library, which
+is a de facto standard, portable across nearly all operating systems
+including Win32. 
+
+Said that, at time of this writing, official libpcap 0.8.1 is out and doesn't include
+support for PACKET_MMAP, and also probably the libpcap included in your distribution. 
+
+I'm aware of two implementations of PACKET_MMAP in libpcap:
+
+    http://pusa.uv.es/~ulisses/packet_mmap/  (by Simon Patarin, based on libpcap 0.6.2)
+    http://public.lanl.gov/cpw/              (by Phil Wood, based on latest libpcap)
+
+The rest of this document is intended for people who want to understand
+the low level details or want to improve libpcap by including PACKET_MMAP
+support.
+
+--------------------------------------------------------------------------------
++ How to use CONFIG_PACKET_MMAP directly
+--------------------------------------------------------------------------------
+
+From the system calls stand point, the use of PACKET_MMAP involves
+the following process:
+
+
+[setup]     socket() -------> creation of the capture socket
+            setsockopt() ---> allocation of the circular buffer (ring)
+            mmap() ---------> mapping of the allocated buffer to the
+                              user process
+
+[capture]   poll() ---------> to wait for incoming packets
+
+[shutdown]  close() --------> destruction of the capture socket and
+                              deallocation of all associated 
+                              resources.
+
+
+socket creation and destruction is straight forward, and is done 
+the same way with or without PACKET_MMAP:
+
+int fd;
+
+fd= socket(PF_PACKET, mode, htons(ETH_P_ALL))
+
+where mode is SOCK_RAW for the raw interface where link level
+information can be captured or SOCK_DGRAM for the cooked
+interface where link level information capture is not 
+supported and a link level pseudo-header is provided 
+by the kernel.
+
+The destruction of the socket and all associated resources
+is done by a simple call to close(fd).
+
+Next I will describe PACKET_MMAP settings and its constraints,
+also the mapping of the circular buffer in the user process and
+the use of this buffer.
+
+--------------------------------------------------------------------------------
++ PACKET_MMAP settings
+--------------------------------------------------------------------------------
+
+
+To setup PACKET_MMAP from user level code is done with a call like
+
+     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
+
+The most significant argument in the previous call is the req parameter, 
+this parameter must have the following structure:
+
+    struct tpacket_req
+    {
+        unsigned int    tp_block_size;  /* Minimal size of contiguous block */
+        unsigned int    tp_block_nr;    /* Number of blocks */
+        unsigned int    tp_frame_size;  /* Size of frame */
+        unsigned int    tp_frame_nr;    /* Total number of frames */
+    };
+
+This structure is defined in /usr/include/linux/if_packet.h and establishes a 
+circular buffer (ring) of unswappable memory mapped in the capture process. 
+Being mapped in the capture process allows reading the captured frames and 
+related meta-information like timestamps without requiring a system call.
+
+Captured frames are grouped in blocks. Each block is a physically contiguous 
+region of memory and holds tp_block_size/tp_frame_size frames. The total number 
+of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
+
+    frames_per_block = tp_block_size/tp_frame_size
+
+indeed, packet_set_ring checks that the following condition is true
+
+    frames_per_block * tp_block_nr == tp_frame_nr
+
+
+Lets see an example, with the following values:
+
+     tp_block_size= 4096
+     tp_frame_size= 2048
+     tp_block_nr  = 4
+     tp_frame_nr  = 8
+
+we will get the following buffer structure:
+
+        block #1                 block #2         
++---------+---------+    +---------+---------+    
+| frame 1 | frame 2 |    | frame 3 | frame 4 |    
++---------+---------+    +---------+---------+    
+
+        block #3                 block #4
++---------+---------+    +---------+---------+
+| frame 5 | frame 6 |    | frame 7 | frame 8 |
++---------+---------+    +---------+---------+
+
+A frame can be of any size with the only condition that it fits in a block. A block
+can only hold an integer number of frames, or in other words, a frame cannot
+span across two blocks, so there are some details you have to take into
+account when choosing the frame_size. See "Mapping and use of the circular
+buffer (ring)".
+
+
+--------------------------------------------------------------------------------
++ PACKET_MMAP setting constraints
+--------------------------------------------------------------------------------
+
+In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch),
+the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or
+16384 in a 64 bit architecture. For information on these kernel versions
+see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt
+
+ Block size limit
+------------------
+
+As stated earlier, each block is a contiguous physical region of memory. These 
+memory regions are allocated with calls to the __get_free_pages() function. As 
+the name indicates, this function allocates pages of memory, and the second
+argument is "order" or a power of two number of pages, that is 
+(for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, 
+order=2 ==> 16384 bytes, etc. The maximum size of a 
+region allocated by __get_free_pages is determined by the MAX_ORDER macro. More 
+precisely the limit can be calculated as:
+
+   PAGE_SIZE << MAX_ORDER
+
+   In a i386 architecture PAGE_SIZE is 4096 bytes 
+   In a 2.4/i386 kernel MAX_ORDER is 10
+   In a 2.6/i386 kernel MAX_ORDER is 11
+
+So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel 
+respectively, with an i386 architecture.
+
+User space programs can include /usr/include/sys/user.h and 
+/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations.
+
+The pagesize can also be determined dynamically with the getpagesize (2) 
+system call. 
+
+
+ Block number limit
+--------------------
+
+To understand the constraints of PACKET_MMAP, we have to see the structure 
+used to hold the pointers to each block.
+
+Currently, this structure is a dynamically allocated vector with kmalloc 
+called pg_vec, its size limits the number of blocks that can be allocated.
+
+    +---+---+---+---+
+    | x | x | x | x |
+    +---+---+---+---+
+      |   |   |   |
+      |   |   |   v
+      |   |   v  block #4
+      |   v  block #3
+      v  block #2
+     block #1
+
+
+kmalloc allocates any number of bytes of physically contiguous memory from
+a pool of pre-determined sizes. This pool of memory is maintained by the slab
+allocator, which is ultimately responsible for doing the allocation and
+hence imposes the maximum amount of memory that kmalloc can allocate.
+
+In a 2.4/2.6 kernel and the i386 architecture, the limit is 131072 bytes. The 
+predetermined sizes that kmalloc uses can be checked in the "size-<bytes>" 
+entries of /proc/slabinfo
+
+In a 32 bit architecture, pointers are 4 bytes long, so the total number of 
+pointers to blocks is
+
+     131072/4 = 32768 blocks
+
+
+ PACKET_MMAP buffer size calculator
+------------------------------------
+
+Definitions:
+
+<size-max>    : is the maximum size allocatable with kmalloc (see /proc/slabinfo)
+<pointer size>: depends on the architecture -- sizeof(void *)
+<page size>   : depends on the architecture -- PAGE_SIZE or getpagesize (2)
+<max-order>   : is the value defined with MAX_ORDER
+<frame size>  : it's an upper bound of frame's capture size (more on this later)
+
+from these definitions we will derive 
+
+	<block number> = <size-max>/<pointer size>
+	<block size> = <pagesize> << <max-order>
+
+so, the max buffer size is
+
+	<block number> * <block size>
+
+and, the number of frames be
+
+	<block number> * <block size> / <frame size>
+
+Suppose the following parameters, which apply for a 2.6 kernel and an
+i386 architecture:
+
+	<size-max> = 131072 bytes
+	<pointer size> = 4 bytes
+	<pagesize> = 4096 bytes
+	<max-order> = 11
+
+and a value for <frame size> of 2048 bytes. These parameters will yield
+
+	<block number> = 131072/4 = 32768 blocks
+	<block size> = 4096 << 11 = 8 MiB.
+
+and hence the buffer will have a 262144 MiB size. So it can hold 
+262144 MiB / 2048 bytes = 134217728 frames
+
+
+Actually, this buffer size is not possible with an i386 architecture. 
+Remember that the memory is allocated in kernel space, in the case of 
+an i386 kernel's memory size is limited to 1GiB.
+
+All memory allocations are not freed until the socket is closed. The memory 
+allocations are done with GFP_KERNEL priority, this basically means that 
+the allocation can wait and swap other process' memory in order to allocate 
+the necessary memory, so normally the limits can be reached.
+
+ Other constraints
+-------------------
+
+If you check the source code you will see that what I draw here as a frame
+is not only the link level frame. At the beginning of each frame there is a
+header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame
+meta information like timestamp. So what we draw here as a frame is really
+the following (from include/linux/if_packet.h):
+
+/*
+   Frame structure:
+
+   - Start. Frame must be aligned to TPACKET_ALIGNMENT=16
+   - struct tpacket_hdr
+   - pad to TPACKET_ALIGNMENT=16
+   - struct sockaddr_ll
+   - Gap, chosen so that packet data (Start+tp_net) aligns to 
+     TPACKET_ALIGNMENT=16
+   - Start+tp_mac: [ Optional MAC header ]
+   - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16.
+   - Pad to align to TPACKET_ALIGNMENT=16
+ */
+           
+ 
+ The following are conditions that are checked in packet_set_ring
+
+   tp_block_size must be a multiple of PAGE_SIZE (1)
+   tp_frame_size must be greater than TPACKET_HDRLEN (obvious)
+   tp_frame_size must be a multiple of TPACKET_ALIGNMENT
+   tp_frame_nr   must be exactly frames_per_block*tp_block_nr
+
+Note that tp_block_size should be chosen to be a power of two or there will
+be a waste of memory.
+
+--------------------------------------------------------------------------------
++ Mapping and use of the circular buffer (ring)
+--------------------------------------------------------------------------------
+
+The mapping of the buffer in the user process is done with the conventional
+mmap function. Even though the circular buffer is composed of several physically
+discontiguous blocks of memory, they appear contiguous to the user space, hence
+just one call to mmap is needed:
+
+    mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+
+If tp_frame_size is a divisor of tp_block_size, frames will be
+contiguously spaced by tp_frame_size bytes. If not, after each
+tp_block_size/tp_frame_size frames there will be a gap between
+the frames. This is because a frame cannot span across two
+blocks.
+
+At the beginning of each frame there is a status field (see
+struct tpacket_hdr). If this field is 0, it means that the frame is ready
+to be used by the kernel. If not, there is a frame the user can read
+and the following flags apply:
+
+     from include/linux/if_packet.h
+
+     #define TP_STATUS_COPY          2 
+     #define TP_STATUS_LOSING        4 
+     #define TP_STATUS_CSUMNOTREADY  8 
+
+
+TP_STATUS_COPY        : This flag indicates that the frame (and associated
+                        meta information) has been truncated because it's 
+                        larger than tp_frame_size. This packet can be 
+                        read entirely with recvfrom().
+                        
+                        In order to make this work it must be
+                        enabled previously with setsockopt() and 
+                        the PACKET_COPY_THRESH option. 
+
+                        The number of frames that can be buffered to 
+                        be read with recvfrom is limited like a normal socket.
+                        See the SO_RCVBUF option in the socket (7) man page.
+
+TP_STATUS_LOSING      : indicates there were packet drops since the last time
+                        statistics were checked with getsockopt() and
+                        the PACKET_STATISTICS option.
+
+TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets whose
+                        checksum will be done in hardware. So while
+                        reading the packet we should not try to check the
+                        checksum.
+
+for convenience there are also the following defines:
+
+     #define TP_STATUS_KERNEL        0
+     #define TP_STATUS_USER          1
+
+The kernel initializes all frames to TP_STATUS_KERNEL, when the kernel
+receives a packet it puts it in the buffer and updates the status with
+at least the TP_STATUS_USER flag. Then the user can read the packet,
+once the packet is read the user must zero the status field, so the kernel 
+can use again that frame buffer.
+
+The user can use poll (any other variant should apply too) to check if new
+packets are in the ring:
+
+    struct pollfd pfd;
+
+    pfd.fd = fd;
+    pfd.revents = 0;
+    pfd.events = POLLIN|POLLRDNORM|POLLERR;
+
+    if (status == TP_STATUS_KERNEL)
+        retval = poll(&pfd, 1, timeout);
+
+It doesn't incur a race condition to first check the status value and
+then poll for frames.
+
+--------------------------------------------------------------------------------
++ THANKS
+--------------------------------------------------------------------------------
+   
+   Jesse Brandeburg, for fixing my grammatical/spelling errors
+
+>>> EOF
+-
+To unsubscribe from this list: send the line "unsubscribe linux-net" in
+the body of a message to majordomo@vger.kernel.org
+More majordomo info at  http://vger.kernel.org/majordomo-info.html
\ No newline at end of file
diff -Nru a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
--- a/arch/alpha/kernel/sys_nautilus.c	Tue Mar 30 20:00:36 2004
+++ b/arch/alpha/kernel/sys_nautilus.c	Tue Mar 30 20:00:36 2004
@@ -225,11 +225,13 @@
 	if (request_resource(&iomem_resource, bus->resource[1]) < 0)
 		printk(KERN_ERR "Failed to request MEM on hose 0\n");
 
-	if (pci_mem < memtop && pci_mem > alpha_mv.min_mem_address) {
+	if (pci_mem < memtop)
+		memtop = pci_mem;
+	if (memtop > alpha_mv.min_mem_address) {
 		free_reserved_mem(__va(alpha_mv.min_mem_address),
-				  __va(pci_mem));
+				  __va(memtop));
 		printk("nautilus_init_pci: %ldk freed\n",
-			(pci_mem - alpha_mv.min_mem_address) >> 10);
+			(memtop - alpha_mv.min_mem_address) >> 10);
 	}
 
 	if ((IRONGATE0->dev_vendor >> 16) > 0x7006)	/* Albacore? */
diff -Nru a/arch/ppc/Kconfig b/arch/ppc/Kconfig
--- a/arch/ppc/Kconfig	Tue Mar 30 20:00:37 2004
+++ b/arch/ppc/Kconfig	Tue Mar 30 20:00:37 2004
@@ -696,14 +696,10 @@
 
 config PREEMPT
 	bool "Preemptible Kernel"
-	depends on !SMP
 	help
 	  This option reduces the latency of the kernel when reacting to
 	  real-time or interactive events by allowing a low priority process to
 	  be preempted even if it is in kernel mode executing a system call.
-	  Unfortunately the kernel code has some race conditions if both
-	  CONFIG_SMP and CONFIG_PREEMPT are enabled, so this option is
-	  currently disabled if you are building an SMP kernel.
 
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
diff -Nru a/arch/ppc/kernel/align.c b/arch/ppc/kernel/align.c
--- a/arch/ppc/kernel/align.c	Tue Mar 30 20:00:36 2004
+++ b/arch/ppc/kernel/align.c	Tue Mar 30 20:00:36 2004
@@ -325,14 +325,18 @@
 	 * the kernel with -msoft-float so it doesn't use the
 	 * fp regs for copying 8-byte objects. */
 	case LD+F+S:
+		preempt_disable();
 		enable_kernel_fp();
 		cvt_fd(&data.f, &current->thread.fpr[reg], &current->thread.fpscr);
 		/* current->thread.fpr[reg] = data.f; */
+		preempt_enable();
 		break;
 	case ST+F+S:
+		preempt_disable();
 		enable_kernel_fp();
 		cvt_df(&current->thread.fpr[reg], &data.f, &current->thread.fpscr);
 		/* data.f = current->thread.fpr[reg]; */
+		preempt_enable();
 		break;
 	default:
 		printk("align: can't handle flags=%x\n", flags);
diff -Nru a/arch/ppc/kernel/entry.S b/arch/ppc/kernel/entry.S
--- a/arch/ppc/kernel/entry.S	Tue Mar 30 20:00:37 2004
+++ b/arch/ppc/kernel/entry.S	Tue Mar 30 20:00:37 2004
@@ -469,10 +469,19 @@
 	stw	r10,_CCR(r1)
 	stw	r1,KSP(r3)	/* Set old stack pointer */
 
+#ifdef CONFIG_SMP
+	/* We need a sync somewhere here to make sure that if the
+	 * previous task gets rescheduled on another CPU, it sees all
+	 * stores it has performed on this one.
+	 */
+	sync
+#endif /* CONFIG_SMP */
+
 	tophys(r0,r4)
 	CLR_TOP32(r0)
 	mtspr	SPRG3,r0	/* Update current THREAD phys addr */
 	lwz	r1,KSP(r4)	/* Load new stack pointer */
+
 	/* save the old current 'last' for return value */
 	mr	r3,r2
 	addi	r2,r4,-THREAD	/* Update current */
diff -Nru a/arch/ppc/kernel/head.S b/arch/ppc/kernel/head.S
--- a/arch/ppc/kernel/head.S	Tue Mar 30 20:00:36 2004
+++ b/arch/ppc/kernel/head.S	Tue Mar 30 20:00:36 2004
@@ -1436,11 +1436,8 @@
 	stw	r4, 0x4(r5)
 #endif
 	li	r4,0
-BEGIN_FTR_SECTION
-	dssall
-	sync
-END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-3:	isync
+	isync
+3:
 #ifdef CONFIG_PPC64BRIDGE
 	slbie	r4
 #endif /* CONFIG_PPC64BRIDGE */
diff -Nru a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c
--- a/arch/ppc/kernel/pci.c	Tue Mar 30 20:00:36 2004
+++ b/arch/ppc/kernel/pci.c	Tue Mar 30 20:00:36 2004
@@ -159,7 +159,6 @@
 		ppc_md.pcibios_fixup_resources(dev);
 }
 
-
 void
 pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
 			struct resource *res)
@@ -1522,51 +1521,43 @@
 {
 	struct pci_controller *hose = (struct pci_controller *) dev->sysdata;
 	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long io_offset = 0;
-	int i, res_bit;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned long base;
+	struct resource *res;
+	int i;
+	int ret = -EINVAL;
 
 	if (hose == 0)
 		return -EINVAL;		/* should never happen */
+	if (offset + size <= offset)
+		return -EINVAL;
 
-	/* If memory, add on the PCI bridge address offset */
 	if (mmap_state == pci_mmap_mem) {
+		/* PCI memory space */
+		base = hose->pci_mem_offset;
+		for (i = 0; i < 3; ++i) {
+			res = &hose->mem_resources[i];
+			if (res->flags == 0)
+				continue;
+			if (offset >= res->start - base
+			    && offset + size - 1 <= res->end - base) {
+				ret = 0;
+				break;
+			}
+		}
 		offset += hose->pci_mem_offset;
-		res_bit = IORESOURCE_MEM;
 	} else {
-		io_offset = (unsigned long)hose->io_base_virt - isa_io_base;
-		offset += io_offset;
-		res_bit = IORESOURCE_IO;
-	}
-
-	/*
-	 * Check that the offset requested corresponds to one of the
-	 * resources of the device.
-	 */
-	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
-		struct resource *rp = &dev->resource[i];
-		int flags = rp->flags;
-
-		/* treat ROM as memory (should be already) */
-		if (i == PCI_ROM_RESOURCE)
-			flags |= IORESOURCE_MEM;
-
-		/* Active and same type? */
-		if ((flags & res_bit) == 0)
-			continue;
-
-		/* In the range of this resource? */
-		if (offset < (rp->start & PAGE_MASK) || offset > rp->end)
-			continue;
-
-		/* found it! construct the final physical address */
-		if (mmap_state == pci_mmap_io)
-			offset += hose->io_base_phys - io_offset;
-
-		vma->vm_pgoff = offset >> PAGE_SHIFT;
-		return 0;
+		/* PCI I/O space */
+		base = (unsigned long)hose->io_base_virt - isa_io_base;
+		res = &hose->io_resource;
+		if (offset >= res->start - base
+		    && offset + size - 1 <= res->end - base)
+			ret = 0;
+		offset += hose->io_base_phys;
 	}
 
-	return -EINVAL;
+	vma->vm_pgoff = offset >> PAGE_SHIFT;
+	return ret;
 }
 
 /*
diff -Nru a/arch/ppc/kernel/ppc_ksyms.c b/arch/ppc/kernel/ppc_ksyms.c
--- a/arch/ppc/kernel/ppc_ksyms.c	Tue Mar 30 20:00:37 2004
+++ b/arch/ppc/kernel/ppc_ksyms.c	Tue Mar 30 20:00:37 2004
@@ -192,7 +192,6 @@
 
 EXPORT_SYMBOL(flush_instruction_cache);
 EXPORT_SYMBOL(giveup_fpu);
-EXPORT_SYMBOL(enable_kernel_fp);
 EXPORT_SYMBOL(flush_icache_range);
 EXPORT_SYMBOL(flush_dcache_range);
 EXPORT_SYMBOL(flush_icache_user_range);
diff -Nru a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c
--- a/arch/ppc/kernel/process.c	Tue Mar 30 20:00:36 2004
+++ b/arch/ppc/kernel/process.c	Tue Mar 30 20:00:36 2004
@@ -163,7 +163,8 @@
 void
 enable_kernel_altivec(void)
 {
-	preempt_disable();
+	WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled());
+
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
 		giveup_altivec(current);
@@ -172,14 +173,15 @@
 #else
 	giveup_altivec(last_task_used_altivec);
 #endif /* __SMP __ */
-	preempt_enable();
 }
+EXPORT_SYMBOL(enable_kernel_altivec);
 #endif /* CONFIG_ALTIVEC */
 
 void
 enable_kernel_fp(void)
 {
-	preempt_disable();
+	WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled());
+
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
 		giveup_fpu(current);
@@ -188,8 +190,8 @@
 #else
 	giveup_fpu(last_task_used_math);
 #endif /* CONFIG_SMP */
-	preempt_enable();
 }
+EXPORT_SYMBOL(enable_kernel_fp);
 
 int
 dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs)
diff -Nru a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
--- a/arch/ppc64/kernel/entry.S	Tue Mar 30 20:00:36 2004
+++ b/arch/ppc64/kernel/entry.S	Tue Mar 30 20:00:36 2004
@@ -194,7 +194,7 @@
 _GLOBAL(ret_from_syscall_2)
 	std	r3,RESULT(r1)	/* Save result */	
 	li	r10,-_LAST_ERRNO
-	cmpl	0,r3,r10
+	cmpld	0,r3,r10
 	blt	60f
 	neg	r3,r3
 57:	ld	r10,_CCR(r1)	/* Set SO bit in CR */
@@ -288,6 +288,14 @@
 	mfcr	r23
 	std	r23,_CCR(r1)
 	std	r1,KSP(r3)	/* Set old stack pointer */
+
+#ifdef CONFIG_SMP
+	/* We need a sync somewhere here to make sure that if the
+	 * previous task gets rescheduled on another CPU, it sees all
+	 * stores it has performed on this one.
+	 */
+	sync
+#endif /* CONFIG_SMP */
 
 	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
 	std	r6,PACACURRENT(r13)	/* Set new 'current' */
diff -Nru a/drivers/block/elevator.c b/drivers/block/elevator.c
--- a/drivers/block/elevator.c	Tue Mar 30 20:00:37 2004
+++ b/drivers/block/elevator.c	Tue Mar 30 20:00:37 2004
@@ -210,10 +210,14 @@
 			rq = NULL;
 			break;
 		} else if (ret == BLKPREP_KILL) {
+			int nr_bytes = rq->hard_nr_sectors << 9;
+
+			if (!nr_bytes)
+				nr_bytes = rq->data_len;
+
 			blkdev_dequeue_request(rq);
 			rq->flags |= REQ_QUIET;
-			while (end_that_request_first(rq, 0, rq->nr_sectors))
-				;
+			end_that_request_chunk(rq, 0, nr_bytes);
 			end_that_request_last(rq);
 		} else {
 			printk("%s: bad return=%d\n", __FUNCTION__, ret);
diff -Nru a/drivers/isdn/hardware/eicon/capifunc.c b/drivers/isdn/hardware/eicon/capifunc.c
--- a/drivers/isdn/hardware/eicon/capifunc.c	Tue Mar 30 20:00:37 2004
+++ b/drivers/isdn/hardware/eicon/capifunc.c	Tue Mar 30 20:00:37 2004
@@ -1,4 +1,4 @@
-/* $Id: capifunc.c,v 1.60 2004/03/22 16:28:27 armin Exp $
+/* $Id: capifunc.c,v 1.61 2004/03/26 19:48:48 armin Exp $
  *
  * ISDN interface module for Eicon active cards DIVA.
  * CAPI Interface common functions
@@ -893,15 +893,16 @@
 		return CAPI_REGOSRESOURCEERR;
 	}
 
+	diva_os_enter_spin_lock(&api_lock, &old_irql, "send message");
+
 	if (!this->Id) {
+		diva_os_leave_spin_lock(&api_lock, &old_irql, "send message");
 		return CAPI_ILLAPPNR;
 	}
 
 	/* patch controller number */
 	msg->header.controller = ControllerMap[card->Id]
 	    | (msg->header.controller & 0x80);	/* preserve external controller bit */
-
-	diva_os_enter_spin_lock(&api_lock, &old_irql, "send message");
 
 	switch (command) {
 	default:
diff -Nru a/drivers/net/wan/pc300_drv.c b/drivers/net/wan/pc300_drv.c
--- a/drivers/net/wan/pc300_drv.c	Tue Mar 30 20:00:37 2004
+++ b/drivers/net/wan/pc300_drv.c	Tue Mar 30 20:00:37 2004
@@ -3661,7 +3661,7 @@
 			release_mem_region(card->hw.falcphys, card->hw.falcsize);
 		}
 		for (i = 0; i < card->hw.nchan; i++)
-			if (card->chan[i].d.dev);
+			if (card->chan[i].d.dev)
 				free_netdev(card->chan[i].d.dev);
 		if (card->hw.irq)
 			free_irq(card->hw.irq, card);
diff -Nru a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c	Tue Mar 30 20:00:36 2004
+++ b/fs/bio.c	Tue Mar 30 20:00:36 2004
@@ -701,11 +701,12 @@
 {
 	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
 
-	if (bi->bi_size)
-		return 1;
 	if (err)
 		bp->error = err;
 
+	if (bi->bi_size)
+		return 1;
+
 	bio_pair_release(bp);
 	return 0;
 }
@@ -714,10 +715,11 @@
 {
 	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
 
-	if (bi->bi_size)
-		return 1;
 	if (err)
 		bp->error = err;
+
+	if (bi->bi_size)
+		return 1;
 
 	bio_pair_release(bp);
 	return 0;
diff -Nru a/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- a/fs/sysfs/dir.c	Tue Mar 30 20:00:36 2004
+++ b/fs/sysfs/dir.c	Tue Mar 30 20:00:36 2004
@@ -20,18 +20,6 @@
 	return 0;
 }
 
-static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
-{
-	struct kobject * kobj = dentry->d_fsdata;
-
-	if (kobj)
-		kobject_put(kobj);
-	iput(inode);
-}
-
-static struct dentry_operations sysfs_dentry_operations = {
-	.d_iput	= &sysfs_d_iput,
-};
 
 static int create_dir(struct kobject * k, struct dentry * p,
 		      const char * n, struct dentry ** d)
@@ -45,8 +33,7 @@
 					 S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO,
 					 init_dir);
 		if (!error) {
-			(*d)->d_op = &sysfs_dentry_operations;
-			(*d)->d_fsdata = kobject_get(k);
+			(*d)->d_fsdata = k;
 			p->d_inode->i_nlink++;
 		}
 		dput(*d);
diff -Nru a/include/asm-ppc/cputable.h b/include/asm-ppc/cputable.h
--- a/include/asm-ppc/cputable.h	Tue Mar 30 20:00:36 2004
+++ b/include/asm-ppc/cputable.h	Tue Mar 30 20:00:36 2004
@@ -90,10 +90,24 @@
 	.long 99b;				\
 	.previous
 
-#define END_FTR_SECTION_IFSET(msk)	END_FTR_SECTION((msk), (msk))
-#define END_FTR_SECTION_IFCLR(msk)	END_FTR_SECTION((msk), 0)
+#else
+
+#define BEGIN_FTR_SECTION		"98:\n"
+#define END_FTR_SECTION(msk, val)		\
+"99:\n"						\
+"	.section __ftr_fixup,\"a\";\n"		\
+"	.align 2;\n"				\
+"	.long "#msk";\n"			\
+"	.long "#val";\n"			\
+"	.long 98b;\n"			        \
+"	.long 99b;\n"	 		        \
+"	.previous\n"
+
 
 #endif /* __ASSEMBLY__ */
+
+#define END_FTR_SECTION_IFSET(msk)	END_FTR_SECTION((msk), (msk))
+#define END_FTR_SECTION_IFCLR(msk)	END_FTR_SECTION((msk), 0)
 
 #endif /* __ASM_PPC_CPUTABLE_H */
 #endif /* __KERNEL__ */
diff -Nru a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h
--- a/include/asm-ppc/mmu_context.h	Tue Mar 30 20:00:36 2004
+++ b/include/asm-ppc/mmu_context.h	Tue Mar 30 20:00:36 2004
@@ -6,6 +6,7 @@
 #include <asm/atomic.h>
 #include <asm/bitops.h>
 #include <asm/mmu.h>
+#include <asm/cputable.h>
 
 /*
  * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
@@ -155,7 +156,24 @@
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
+#ifdef CONFIG_ALTIVEC
+	asm volatile (
+ BEGIN_FTR_SECTION
+	"dssall;\n"
+#ifndef CONFIG_POWER4
+	 "sync;\n" /* G4 needs a sync here, G5 apparently not */
+#endif
+ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	 : : );
+#endif /* CONFIG_ALTIVEC */
+
 	tsk->thread.pgdir = next->pgd;
+
+	/* No need to flush userspace segments if the mm doesn't change */
+	if (prev == next)
+		return;
+
+	/* Setup new userspace context */
 	get_mmu_context(next);
 	set_context(next->context, next->pgd);
 }
@@ -166,12 +184,7 @@
  * After we have set current->mm to a new value, this activates
  * the context for the new mm so we see the new mappings.
  */
-static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm)
-{
-	current->thread.pgdir = mm->pgd;
-	get_mmu_context(mm);
-	set_context(mm->context, mm->pgd);
-}
+#define activate_mm(active_mm, mm)   switch_mm(active_mm, mm, current)
 
 extern void mmu_context_init(void);
 
diff -Nru a/kernel/stop_machine.c b/kernel/stop_machine.c
--- a/kernel/stop_machine.c	Tue Mar 30 20:00:36 2004
+++ b/kernel/stop_machine.c	Tue Mar 30 20:00:36 2004
@@ -149,10 +149,12 @@
 	complete(&smdata->done);
 
 	/* Wait for kthread_stop */
+	__set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
-		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
+		__set_current_state(TASK_INTERRUPTIBLE);
 	}
+	__set_current_state(TASK_RUNNING);
 	return ret;
 }
 
diff -Nru a/mm/pdflush.c b/mm/pdflush.c
--- a/mm/pdflush.c	Tue Mar 30 20:00:37 2004
+++ b/mm/pdflush.c	Tue Mar 30 20:00:37 2004
@@ -172,6 +172,12 @@
 static int pdflush(void *dummy)
 {
 	struct pdflush_work my_work;
+
+	/*
+	 * pdflush can spend a lot of time doing encryption via dm-crypt.  We
+	 * don't want to do that at keventd's priority.
+	 */
+	set_user_nice(current, 0);
 	return __pdflush(&my_work);
 }
 
diff -Nru a/net/Makefile b/net/Makefile
--- a/net/Makefile	Tue Mar 30 20:00:36 2004
+++ b/net/Makefile	Tue Mar 30 20:00:36 2004
@@ -16,7 +16,9 @@
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/
 obj-$(CONFIG_INET)		+= ipv4/ xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
-obj-$(CONFIG_IPV6)		+= ipv6/
+ifneq ($(CONFIG_IPV6),)
+obj-y				+= ipv6/
+endif
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_NET_SCHED)		+= sched/
diff -Nru a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
--- a/net/ipv4/netfilter/ip_nat_standalone.c	Tue Mar 30 20:00:37 2004
+++ b/net/ipv4/netfilter/ip_nat_standalone.c	Tue Mar 30 20:00:37 2004
@@ -124,7 +124,16 @@
 		WRITE_LOCK(&ip_nat_lock);
 		/* Seen it before?  This can happen for loopback, retrans,
 		   or local packets.. */
-		if (!(info->initialized & (1 << maniptype))) {
+		if (!(info->initialized & (1 << maniptype))
+#ifndef CONFIG_IP_NF_NAT_LOCAL
+		    /* If this session has already been confirmed we must not
+		     * touch it again even if there is no mapping set up.
+		     * Can only happen on local->local traffic with
+		     * CONFIG_IP_NF_NAT_LOCAL disabled.
+		     */
+		    && !(ct->status & IPS_CONFIRMED)
+#endif
+		    ) {
 			unsigned int ret;
 
 			if (ct->master
diff -Nru a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c	Tue Mar 30 20:00:36 2004
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c	Tue Mar 30 20:00:36 2004
@@ -45,7 +45,7 @@
 	const struct ip_nat_multi_range *mr = targinfo;
 
 	if (strcmp(tablename, "nat") != 0) {
-		DEBUGP("masquerade_check: bad table `%s'.\n", table);
+		DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
 		return 0;
 	}
 	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	Tue Mar 30 20:00:36 2004
+++ b/net/ipv4/tcp_ipv4.c	Tue Mar 30 20:00:36 2004
@@ -1825,12 +1825,15 @@
 	goto discard_it;
 
 do_time_wait:
-	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
-		goto discard_and_relse;
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
+	}
 
 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TcpInErrs);
-		goto discard_and_relse;
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
 	}
 	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
 					   skb, th, skb->len)) {
diff -Nru a/net/ipv6/Makefile b/net/ipv6/Makefile
--- a/net/ipv6/Makefile	Tue Mar 30 20:00:37 2004
+++ b/net/ipv6/Makefile	Tue Mar 30 20:00:37 2004
@@ -19,3 +19,5 @@
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+
+obj-y += exthdrs_core.o
diff -Nru a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
--- a/net/ipv6/exthdrs.c	Tue Mar 30 20:00:36 2004
+++ b/net/ipv6/exthdrs.c	Tue Mar 30 20:00:36 2004
@@ -633,105 +633,3 @@
 	}
 	return opt2;
 }
-
-
-/* 
- * find out if nexthdr is a well-known extension header or a protocol
- */
-
-int ipv6_ext_hdr(u8 nexthdr)
-{
-	/* 
-	 * find out if nexthdr is an extension header or a protocol
-	 */
-	return ( (nexthdr == NEXTHDR_HOP)	||
-		 (nexthdr == NEXTHDR_ROUTING)	||
-		 (nexthdr == NEXTHDR_FRAGMENT)	||
-		 (nexthdr == NEXTHDR_AUTH)	||
-		 (nexthdr == NEXTHDR_NONE)	||
-		 (nexthdr == NEXTHDR_DEST) );
-}
-
-/*
- * Skip any extension headers. This is used by the ICMP module.
- *
- * Note that strictly speaking this conflicts with RFC 2460 4.0:
- * ...The contents and semantics of each extension header determine whether 
- * or not to proceed to the next header.  Therefore, extension headers must
- * be processed strictly in the order they appear in the packet; a
- * receiver must not, for example, scan through a packet looking for a
- * particular kind of extension header and process that header prior to
- * processing all preceding ones.
- * 
- * We do exactly this. This is a protocol bug. We can't decide after a
- * seeing an unknown discard-with-error flavour TLV option if it's a 
- * ICMP error message or not (errors should never be send in reply to
- * ICMP error messages).
- * 
- * But I see no other way to do this. This might need to be reexamined
- * when Linux implements ESP (and maybe AUTH) headers.
- * --AK
- *
- * This function parses (probably truncated) exthdr set "hdr"
- * of length "len". "nexthdrp" initially points to some place,
- * where type of the first header can be found.
- *
- * It skips all well-known exthdrs, and returns pointer to the start
- * of unparsable area i.e. the first header with unknown type.
- * If it is not NULL *nexthdr is updated by type/protocol of this header.
- *
- * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
- *        - it may return pointer pointing beyond end of packet,
- *	    if the last recognized header is truncated in the middle.
- *        - if packet is truncated, so that all parsed headers are skipped,
- *	    it returns NULL.
- *	  - First fragment header is skipped, not-first ones
- *	    are considered as unparsable.
- *	  - ESP is unparsable for now and considered like
- *	    normal payload protocol.
- *	  - Note also special handling of AUTH header. Thanks to IPsec wizards.
- *
- * --ANK (980726)
- */
-
-int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len)
-{
-	u8 nexthdr = *nexthdrp;
-
-	while (ipv6_ext_hdr(nexthdr)) {
-		struct ipv6_opt_hdr hdr;
-		int hdrlen;
-
-		if (len < (int)sizeof(struct ipv6_opt_hdr))
-			return -1;
-		if (nexthdr == NEXTHDR_NONE)
-			return -1;
-		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
-			BUG();
-		if (nexthdr == NEXTHDR_FRAGMENT) {
-			unsigned short frag_off;
-			if (skb_copy_bits(skb,
-					  start+offsetof(struct frag_hdr,
-							 frag_off),
-					  &frag_off,
-					  sizeof(frag_off))) {
-				return -1;
-			}
-
-			if (ntohs(frag_off) & ~0x7)
-				break;
-			hdrlen = 8;
-		} else if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hdr.hdrlen+2)<<2; 
-		else
-			hdrlen = ipv6_optlen(&hdr); 
-
-		nexthdr = hdr.nexthdr;
-		len -= hdrlen;
-		start += hdrlen;
-	}
-
-	*nexthdrp = nexthdr;
-	return start;
-}
-
diff -Nru a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/net/ipv6/exthdrs_core.c	Tue Mar 30 20:00:37 2004
@@ -0,0 +1,108 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+#include <net/ipv6.h>
+
+/* 
+ * find out if nexthdr is a well-known extension header or a protocol
+ */
+
+int ipv6_ext_hdr(u8 nexthdr)
+{
+	/* 
+	 * find out if nexthdr is an extension header or a protocol
+	 */
+	return ( (nexthdr == NEXTHDR_HOP)	||
+		 (nexthdr == NEXTHDR_ROUTING)	||
+		 (nexthdr == NEXTHDR_FRAGMENT)	||
+		 (nexthdr == NEXTHDR_AUTH)	||
+		 (nexthdr == NEXTHDR_NONE)	||
+		 (nexthdr == NEXTHDR_DEST) );
+}
+
+/*
+ * Skip any extension headers. This is used by the ICMP module.
+ *
+ * Note that strictly speaking this conflicts with RFC 2460 4.0:
+ * ...The contents and semantics of each extension header determine whether 
+ * or not to proceed to the next header.  Therefore, extension headers must
+ * be processed strictly in the order they appear in the packet; a
+ * receiver must not, for example, scan through a packet looking for a
+ * particular kind of extension header and process that header prior to
+ * processing all preceding ones.
+ * 
+ * We do exactly this. This is a protocol bug. We can't decide after a
+ * seeing an unknown discard-with-error flavour TLV option if it's a 
+ * ICMP error message or not (errors should never be send in reply to
+ * ICMP error messages).
+ * 
+ * But I see no other way to do this. This might need to be reexamined
+ * when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses (probably truncated) exthdr set "hdr"
+ * of length "len". "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * If it is not NULL *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
+ *        - it may return pointer pointing beyond end of packet,
+ *	    if the last recognized header is truncated in the middle.
+ *        - if packet is truncated, so that all parsed headers are skipped,
+ *	    it returns NULL.
+ *	  - First fragment header is skipped, not-first ones
+ *	    are considered as unparsable.
+ *	  - ESP is unparsable for now and considered like
+ *	    normal payload protocol.
+ *	  - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
+ */
+
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len)
+{
+	u8 nexthdr = *nexthdrp;
+
+	while (ipv6_ext_hdr(nexthdr)) {
+		struct ipv6_opt_hdr hdr;
+		int hdrlen;
+
+		if (len < (int)sizeof(struct ipv6_opt_hdr))
+			return -1;
+		if (nexthdr == NEXTHDR_NONE)
+			return -1;
+		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+			BUG();
+		if (nexthdr == NEXTHDR_FRAGMENT) {
+			unsigned short frag_off;
+			if (skb_copy_bits(skb,
+					  start+offsetof(struct frag_hdr,
+							 frag_off),
+					  &frag_off,
+					  sizeof(frag_off))) {
+				return -1;
+			}
+
+			if (ntohs(frag_off) & ~0x7)
+				break;
+			hdrlen = 8;
+		} else if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hdr.hdrlen+2)<<2; 
+		else
+			hdrlen = ipv6_optlen(&hdr); 
+
+		nexthdr = hdr.nexthdr;
+		len -= hdrlen;
+		start += hdrlen;
+	}
+
+	*nexthdrp = nexthdr;
+	return start;
+}
+
+EXPORT_SYMBOL(ipv6_ext_hdr);
+EXPORT_SYMBOL(ipv6_skip_exthdr);
diff -Nru a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
--- a/net/ipv6/ipv6_syms.c	Tue Mar 30 20:00:37 2004
+++ b/net/ipv6/ipv6_syms.c	Tue Mar 30 20:00:37 2004
@@ -41,9 +41,7 @@
 #endif
 EXPORT_SYMBOL(rt6_lookup);
 EXPORT_SYMBOL(fl6_sock_lookup);
-EXPORT_SYMBOL(ipv6_ext_hdr);
 EXPORT_SYMBOL(ip6_append_data);
 EXPORT_SYMBOL(ip6_flush_pending_frames);
 EXPORT_SYMBOL(ip6_push_pending_frames);
 EXPORT_SYMBOL(ipv6_push_nfrag_opts);
-EXPORT_SYMBOL(ipv6_skip_exthdr);
diff -Nru a/net/packet/af_packet.c b/net/packet/af_packet.c
--- a/net/packet/af_packet.c	Tue Mar 30 20:00:37 2004
+++ b/net/packet/af_packet.c	Tue Mar 30 20:00:37 2004
@@ -34,6 +34,8 @@
  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
  *	Cyrus Durgin		:	Fixed kerneld for kmod.
  *	Michal Ostrowski        :       Module initialization cleanup.
+ *         Ulises Alonso        :       Frame number limit removal and 
+ *                                      packet_set_ring memory leak.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -168,30 +170,47 @@
 
 struct packet_opt
 {
+	struct tpacket_stats	stats;
+#ifdef CONFIG_PACKET_MMAP
+	unsigned long		*pg_vec;
+	unsigned int		head;
+	unsigned int            frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+	int			copy_thresh;
+#endif
 	struct packet_type	prot_hook;
 	spinlock_t		bind_lock;
 	char			running;	/* prot_hook is attached*/
 	int			ifindex;	/* bound device		*/
 	unsigned short		num;
-	struct tpacket_stats	stats;
 #ifdef CONFIG_PACKET_MULTICAST
 	struct packet_mclist	*mclist;
 #endif
 #ifdef CONFIG_PACKET_MMAP
 	atomic_t		mapped;
-	unsigned long		*pg_vec;
-	unsigned int		pg_vec_order;
+	unsigned int            pg_vec_order;
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
-
-	struct tpacket_hdr	**iovec;
-	unsigned int		frame_size;
-	unsigned int		iovmax;
-	unsigned int		head;
-	int			copy_thresh;
 #endif
 };
 
+#ifdef CONFIG_PACKET_MMAP
+
+static inline unsigned long packet_lookup_frame(struct packet_opt *po, unsigned int position)
+{
+	unsigned int pg_vec_pos, frame_offset;
+	unsigned long frame;
+
+	pg_vec_pos = position / po->frames_per_block;
+	frame_offset = position % po->frames_per_block;
+
+	frame = (unsigned long) (po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
+	
+	return frame;
+}
+#endif
+
 #define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)
 
 void packet_sock_destruct(struct sock *sk)
@@ -586,11 +605,11 @@
 		snaplen = skb->len-skb->data_len;
 
 	spin_lock(&sk->sk_receive_queue.lock);
-	h = po->iovec[po->head];
-
+	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
+	
 	if (h->tp_status)
 		goto ring_is_full;
-	po->head = po->head != po->iovmax ? po->head+1 : 0;
+	po->head = po->head != po->frame_max ? po->head+1 : 0;
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
@@ -1485,10 +1504,13 @@
 	unsigned int mask = datagram_poll(file, sock, wait);
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
-	if (po->iovec) {
-		unsigned last = po->head ? po->head-1 : po->iovmax;
+	if (po->pg_vec) {
+		unsigned last = po->head ? po->head-1 : po->frame_max;
+		struct tpacket_hdr *h;
 
-		if (po->iovec[last]->tp_status)
+		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
+
+		if (h->tp_status)
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1548,16 +1570,18 @@
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
 {
 	unsigned long *pg_vec = NULL;
-	struct tpacket_hdr **io_vec = NULL;
 	struct packet_opt *po = pkt_sk(sk);
 	int was_running, num, order = 0;
 	int err = 0;
-
+	
 	if (req->tp_block_nr) {
 		int i, l;
-		int frames_per_block;
 
 		/* Sanity tests and some calculations */
+
+		if (po->pg_vec)
+			return -EBUSY;
+
 		if ((int)req->tp_block_size <= 0)
 			return -EINVAL;
 		if (req->tp_block_size&(PAGE_SIZE-1))
@@ -1566,10 +1590,11 @@
 			return -EINVAL;
 		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
 			return -EINVAL;
-		frames_per_block = req->tp_block_size/req->tp_frame_size;
-		if (frames_per_block <= 0)
+
+		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
+		if (po->frames_per_block <= 0)
 			return -EINVAL;
-		if (frames_per_block*req->tp_block_nr != req->tp_frame_nr)
+		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
 			return -EINVAL;
 		/* OK! */
 
@@ -1596,20 +1621,16 @@
 		}
 		/* Page vector is allocated */
 
-		/* Draw frames */
-		io_vec = kmalloc(req->tp_frame_nr*sizeof(struct tpacket_hdr*), GFP_KERNEL);
-		if (io_vec == NULL)
-			goto out_free_pgvec;
-		memset(io_vec, 0, req->tp_frame_nr*sizeof(struct tpacket_hdr*));
-
 		l = 0;
 		for (i=0; i<req->tp_block_nr; i++) {
 			unsigned long ptr = pg_vec[i];
+			struct tpacket_hdr *header;
 			int k;
 
-			for (k=0; k<frames_per_block; k++, l++) {
-				io_vec[l] = (struct tpacket_hdr*)ptr;
-				io_vec[l]->tp_status = TP_STATUS_KERNEL;
+			for (k=0; k<po->frames_per_block; k++) {
+				
+				header = (struct tpacket_hdr*)ptr;
+				header->tp_status = TP_STATUS_KERNEL;
 				ptr += req->tp_frame_size;
 			}
 		}
@@ -1642,8 +1663,7 @@
 
 		spin_lock_bh(&sk->sk_receive_queue.lock);
 		pg_vec = XC(po->pg_vec, pg_vec);
-		io_vec = XC(po->iovec, io_vec);
-		po->iovmax = req->tp_frame_nr-1;
+		po->frame_max = req->tp_frame_nr-1;
 		po->head = 0;
 		po->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1652,7 +1672,7 @@
 		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
 
 		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
-		po->prot_hook.func = po->iovec ? tpacket_rcv : packet_rcv;
+		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
 		skb_queue_purge(&sk->sk_receive_queue);
 #undef XC
 		if (atomic_read(&po->mapped))
@@ -1669,9 +1689,6 @@
 	spin_unlock(&po->bind_lock);
 
 	release_sock(sk);
-
-	if (io_vec)
-		kfree(io_vec);
 
 out_free_pgvec:
 	if (pg_vec)