Port over linux 4.1.15 infiniband/core logic for kernel bypass NIC access
author 'Kanoj Sarcar' via Akaros <akaros@googlegroups.com>
Thu, 11 Feb 2016 01:09:47 +0000 (17:09 -0800)
committer Barret Rhoden <brho@cs.berkeley.edu>
Wed, 17 Feb 2016 22:23:14 +0000 (17:23 -0500)
Port over linux 4.1.15 drivers/infiniband/core logic essential for
kernel bypass NIC access. Slight edits to adapt to the Akaros environment
(#if exclusion of non-essential code blocks, panic stubs, etc.), as
described in the README file.

Most of the interlock logic with the core kernel (mm/vfs, etc.) is
captured in compat.[ch].

Signed-off-by: Kanoj Sarcar <kanoj@google.com>
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
kern/drivers/net/udrvr/Makefile [new file with mode: 0644]
kern/drivers/net/udrvr/README [new file with mode: 0644]
kern/drivers/net/udrvr/compat.c [new file with mode: 0644]
kern/drivers/net/udrvr/compat.h [new file with mode: 0644]
kern/drivers/net/udrvr/device.c [new file with mode: 0644]
kern/drivers/net/udrvr/umem.c [new file with mode: 0644]
kern/drivers/net/udrvr/uverbs.h [new file with mode: 0644]
kern/drivers/net/udrvr/uverbs_cmd.c [new file with mode: 0644]
kern/drivers/net/udrvr/uverbs_main.c [new file with mode: 0644]
kern/drivers/net/udrvr/verbs.c [new file with mode: 0644]

diff --git a/kern/drivers/net/udrvr/Makefile b/kern/drivers/net/udrvr/Makefile
new file mode 100644 (file)
index 0000000..e8462bc
--- /dev/null
@@ -0,0 +1,3 @@
+CFLAGS_KERNEL += -include kern/drivers/net/udrvr/compat.h -Wno-maybe-uninitialized
+
+obj-y          += verbs.o device.o uverbs_main.o uverbs_cmd.o umem.o compat.o
diff --git a/kern/drivers/net/udrvr/README b/kern/drivers/net/udrvr/README
new file mode 100644 (file)
index 0000000..bf8a3af
--- /dev/null
@@ -0,0 +1,81 @@
+This directory contains logic for privileged verbs, aka the user mode
+control path, to support libibverbs. It is based on the linux-4.1.15
+snapshot of drivers/infiniband/core/. Changes on top of that baseline
+are described here.
+
+Some common reasons for changes to the baseline source:
+
+HF1:    Linux source code that #includes standard linux header files, e.g.
+        linux/list.h or linux/idr.h, had to be modified since these headers
+        are absent in akaros.
+
+HF2:    Headers copied from Linux, e.g. rdma/ib_verbs.h, were placed in
+        akaros under linux/, e.g. linux/rdma/ib_verbs.h.
+
+HF3:    Some changes were made to reduce the number of header files pulled
+        in from Linux. An illustrative sketch of HF1/HF2 edits follows.
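+
+        As an illustrative sketch (not a literal hunk from this port), an
+        HF1/HF2 edit typically rewrites includes like:
+
+                -#include <linux/list.h>            (HF1: akaros has list.h)
+                +#include <list.h>
+                -#include <rdma/ib_verbs.h>         (HF2: relocated under linux/)
+                +#include <linux/rdma/ib_verbs.h>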
+
+Per file listing of changes:
+
+Makefile:      -include local compatibility header file (sometimes overriding
+       akaros compatibility definitions). -Wno-maybe-uninitialized option
+       needed to suppress warning in uverbs_cmd.c:__uverbs_create_xsrq():attr.
+
+compat.h:      Used by udrvr/ and mlx4u/ code via Makefile -include
+       directive as compatibility header file (sometimes overriding akaros
+       compatibility definitions).
+
+uverbs.h: HF1, HF2
+
+compat.c: Placeholder file to add akaros-specific hooks
+
+device.c: HF1, HF2
+       Add stubs for unneeded logic pieces
+       Stubbed kobject_put()
+
+uverbs_cmd.c: HF1
+       Misc stubs, including ib_resolve_eth_l2_attrs()
+       XRCD logic deleted
+       ib_uverbs_create_comp_channel() logic panic-stubbed, since it is
+       VFS specific.
+       ib_uverbs_get_context() event file related logic gutted, since
+       it is VFS specific.
+
+uverbs_main.c: HF1
+       Stubbed out all user event file related logic, including any
+               reference to struct ib_uverbs_file->async_file.
+       Misc stubs
+
+verbs.c: HF1, HF2
+       ib_init_ah_from_wc() panic-stubbed because AH unsupported.
+       ib_resolve_eth_l2_attrs() panic-stubbed because L2 addr routines
+       unsupported.
+
+umem.c:        HF1
+       Delete unneeded functions.
+       Akaros MM changes.
+
+Other files provided for core libibverbs support in kern/include/linux/rdma,
+baselined off the linux-4.1.15 snapshot:
+
+ib_umem.h: HF1, turn on CONFIG_INFINIBAND_USER_MEM
+       (Baselined off include/rdma/ib_umem.h)
+
+ib_user_verbs.h: HF1
+       (Baselined off include/uapi/rdma/ib_user_verbs.h)
+
+
+TODO:
+1. linux pgprot_noncached() adds _PAGE_PCD, i.e. bit 4, which is akaros
+   PTE_PCD. Akaros PTE_NOCACHE also sets bit 3, i.e. _PAGE_PWT (which seems
+   wrong?)
+2. linux pgprot_writecombine() defaults to pgprot_noncached() when PAT is not
+   enabled, otherwise just sets bit 3, i.e. _PAGE_PWT. PAT usage is needed.
+3. iboe_get_mtu() dependencies
+4. query_qp API is inconsistent with older libibverbs due to a
+   "struct ib_uverbs_qp_dest" size difference with the kernel.
+5. Completion channels not implemented.
+       (http://linux.die.net/man/3/ibv_ack_cq_events)
+6. HW driver's vendor/device/vsd strings are not being picked up from the
+       lower level driver in sysfs_create(), but are hardcoded instead.
+7. Port of include/linux/rdma/ib_verbs.h killed the "mutex" field in
+       "struct ib_uobject". Need to add that back in and remove hacks in
+       uverbs_cmd.c for up_read(), up_write() and friends.
diff --git a/kern/drivers/net/udrvr/compat.c b/kern/drivers/net/udrvr/compat.c
new file mode 100644 (file)
index 0000000..436240f
--- /dev/null
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2016 Google Inc
+ * Author: Kanoj Sarcar <kanoj@google.com>
+ * See LICENSE for details.
+ */
+
+#include <err.h>
+#include <kmalloc.h>
+#include <kref.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <pmap.h>
+#include <smp.h>
+#include <devfs.h>
+#include <linux/rdma/ib_user_verbs.h>
+#include "uverbs.h"
+
+/*
+ * Our version knocked off from kern/src/mm.c version + uncaching logic from
+ * vmap_pmem_nocache().
+ */
+int map_upage_at_addr(struct proc *p, physaddr_t paddr, uintptr_t addr, int pteprot, int dolock)
+{
+       pte_t   pte;
+       int     rv = -1;
+
+       spin_lock(&p->pte_lock);
+
+       pte = pgdir_walk(p->env_pgdir, (void*)addr, TRUE);
+
+       if (!pte_walk_okay(pte))
+               goto err1;
+       pte_write(pte, paddr, pteprot);
+       // tlbflush(); tlb_flush_global();
+       rv = 0;
+err1:
+       spin_unlock(&p->pte_lock);
+
+       /*
+        * TODO: @mm tear down, unmap_and_destroy_vmrs():__vmr_free_pgs()
+        * decrefs page, which is a problem. 1st level workaround is to set
+        * PG_LOCKED/PG_PAGEMAP to avoid that. Not proud of myself.
+        */
+       if ((rv == 0) && (dolock == 1))
+               atomic_set(&pa2page(paddr)->pg_flags, PG_LOCKED | PG_PAGEMAP);
+
+       return rv;
+}
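+
+/*
+ * Note: map_upage_at_addr() above is what backs the io_remap_pfn_range()
+ * macro in compat.h, which only handles single-page mappings
+ * (rangesz == PAGE_SIZE).
+ */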
+
+void set_page_dirty_lock(struct page *pagep)
+{
+       atomic_or(&pagep->pg_flags, PG_DIRTY);
+}
+
+/*
+ * get_user_pages() does not grab a page ref count. Thus, put_page()
+ * can not release page ref count.
+ */
+void put_page(struct page *pagep)
+{
+       /* page_decref(pagep) / __put_page(pagep) */
+}
+
+int get_user_page(struct proc *p, unsigned long uvastart, int write, int force,
+    struct page **plist)
+{
+       pte_t   pte;
+       int     ret = -1;
+
+       spin_lock(&p->pte_lock);
+
+       pte = pgdir_walk(p->env_pgdir, (void*)uvastart, TRUE);
+
+       if (!pte_walk_okay(pte))
+               goto err1;
+
+       if (!pte_is_present(pte)) {
+               printk("[akaros]: get_user_page() uva=0x%llx pte absent\n",
+                   uvastart);
+               goto err1;
+       }
+
+       if (write && (!pte_has_perm_urw(pte))) {
+               /* TODO: How is Linux using the "force" parameter? */
+               printk("[akaros]: get_user_page() uva=0x%llx pte ro\n",
+                   uvastart);
+               goto err1;
+       }
+
+       plist[0] = pa2page(pte_get_paddr(pte));
+       ret = 1;
+err1:
+       spin_unlock(&p->pte_lock);
+       return ret;
+}
+
+int sg_alloc_table(struct sg_table *ptr, unsigned int npages, gfp_t mask)
+{
+       ptr->sgl = kmalloc((sizeof(struct scatterlist) * npages), mask);
+       ptr->nents = ptr->orig_nents = npages;
+       return 0;
+}
+
+void sg_free_table(struct sg_table *ptr)
+{
+       kfree(ptr->sgl);
+}
+
+void idr_remove(struct idr *idp, int id)
+{
+       BUG_ON((id < 0) || (id >= MAXITEMS));
+       idp->values[id] = NULL;
+}
+
+void *idr_find(struct idr *idp, int id)
+{
+       BUG_ON((id < 0) || (id >= MAXITEMS));
+       BUG_ON(idp->values[id] == NULL);
+       return idp->values[id];
+}
+
+int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask)
+{
+       int     i;
+
+       /* We use values[] == NULL as an indicator that slot is free */
+       BUG_ON(ptr == NULL);
+
+       spin_lock_irqsave(&idp->lock, f);
+
+       for (i = 0; i < MAXITEMS; i++) {
+               if (idp->values[i] == NULL) {
+                       idp->values[i] = ptr;
+                       goto done;
+               }
+       }
+
+       i = -1;                 /* error return */
+
+done:
+       spin_unlock_irqsave(&idp->lock);
+       return i;
+}
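+
+/*
+ * Usage sketch for the simplified idr above (illustrative only, not
+ * called from this file). NULL marks a free slot, so callers must never
+ * store a NULL pointer; note that start/end are ignored by this
+ * implementation:
+ *
+ *     DEFINE_IDR(my_idr);
+ *     int id = idr_alloc(&my_idr, obj, 0, 0, GFP_KERNEL);
+ *     if (id >= 0) {
+ *             assert(idr_find(&my_idr, id) == obj);
+ *             idr_remove(&my_idr, id);
+ *     }
+ */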
+
+/* START: Linux /sys support for lib/apps */
+
+/* Callers must pass in null terminated strings */
+static ssize_t sysfs_read(char __user *buf, size_t ucount, loff_t *pos,
+    char *src)
+{
+       int             slen = strlen(src) + 1; /* + 1 for terminating null */
+       unsigned long   off = *pos, nb = slen - off;
+
+       if (off >= slen)
+               return 0;
+
+       if (nb > ucount)        /* do not overrun the caller's buffer */
+               nb = ucount;
+
+       if (copy_to_user(buf, (src + off), nb))
+               return -EFAULT;
+
+       *pos += nb;
+       return nb;
+}
+
+static ssize_t ib_api_ver_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       char            src[4] = { 0, 0, 0, 0};
+
+       src[0] = '0' + IB_USER_VERBS_ABI_VERSION;
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static const struct file_operations ib_api_ver = {
+       .read   = ib_api_ver_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+void sysfs_init(void)
+{
+       do_mkdir("/dev/infiniband", S_IRWXU | S_IRWXG | S_IRWXO);
+       do_mkdir("/sys", S_IRWXU | S_IRWXG | S_IRWXO);
+       do_mkdir("/sys/class", S_IRWXU | S_IRWXG | S_IRWXO);
+       do_mkdir("/sys/class/infiniband_verbs", S_IRWXU | S_IRWXG | S_IRWXO);
+       do_mkdir("/sys/class/infiniband", S_IRWXU | S_IRWXG | S_IRWXO);
+
+       make_device("/sys/class/infiniband_verbs/abi_version",
+                   S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+                   __S_IFCHR, (struct file_operations *)&ib_api_ver);
+}
+
+static ssize_t dver_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       struct ib_uverbs_device *uvp;
+       char            src[4] = { 0, 0, 0, 0};
+
+       uvp = (struct ib_uverbs_device *)get_fs_info(filp);
+       src[0] = '0' + uvp->ib_dev->uverbs_abi_ver;
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static ssize_t dname_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       struct ib_uverbs_device *uvp;
+
+       uvp = (struct ib_uverbs_device *)get_fs_info(filp);
+       return sysfs_read(buf, count, pos, uvp->ib_dev->name);
+}
+
+static ssize_t ntype_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       char    src[] = "1";
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static ssize_t ddev_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       char    src[] = "0x1003";
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static ssize_t dven_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       char    src[] = "0x15b3";
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static ssize_t vsd_read(struct file *filp, char __user *buf,
+    size_t count, loff_t *pos)
+{
+       char    *src = "puma20_A1-10.2.3.0";
+
+       return sysfs_read(buf, count, pos, src);
+}
+
+static const struct file_operations dver_fops = {
+       .read   = dver_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+static const struct file_operations dname_fops = {
+       .read   = dname_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+static const struct file_operations ddev_fops = {
+       .read   = ddev_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+static const struct file_operations dven_fops = {
+       .read   = dven_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+static const struct file_operations ntype_fops = {
+       .read   = ntype_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+static const struct file_operations vsd_fops = {
+       .read   = vsd_read,
+       .open   = kfs_open,
+       .release= kfs_release,
+};
+
+void sysfs_create(int devnum, const struct file_operations *verb_fops,
+    void *ptr)
+{
+       char            sysname[256] = "/sys/class/infiniband_verbs/uverbs0";
+       char            devname[] = "/dev/infiniband/uverbs0";
+       char            drvname[64] = "/sys/class/infiniband/";
+       int             sysnameidx = strlen(sysname), drvidx;
+       struct file     *fp;
+       struct ib_uverbs_device *uvp = (struct ib_uverbs_device *)ptr;
+
+       /* Create correct name */
+       if (devnum > 9)
+               panic("Too many devs");
+       devname[strlen(devname) - 1] = '0' + devnum;
+       sysname[sysnameidx - 1] = '0' + devnum;
+
+       /* The following fops need to come from the caller */
+       fp = make_device(devname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)verb_fops);
+       set_fs_info(fp, ptr);
+
+       /* /sys/class/infiniband/mlx4_0 */
+       strncpy((drvname + strlen(drvname)), uvp->ib_dev->name, 12);
+       do_mkdir(drvname, S_IRWXU | S_IRWXG | S_IRWXO);
+       drvidx = strlen(drvname);
+
+       /* /sys/class/infiniband/mlx4_0/node_type */
+       strncpy(drvname + drvidx, "/node_type", 11);
+       make_device(drvname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&ntype_fops);
+
+       /* /sys/class/infiniband/mlx4_0/vsd */
+       strncpy(drvname + drvidx, "/vsd", 5);
+       fp = make_device(drvname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&vsd_fops);
+       set_fs_info(fp, ptr);
+
+       /* /sys/class/infiniband_verbs/uverbs0 */
+       do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);
+
+       /* /sys/class/infiniband_verbs/uverbs0/device */
+       strncpy(sysname + sysnameidx, "/device", 16);
+       do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);
+
+       /* /sys/class/infiniband_verbs/uverbs0/device/device */
+       strncpy(sysname + sysnameidx, "/device/device", 16);
+       fp = make_device(sysname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&ddev_fops);
+       set_fs_info(fp, ptr);
+
+       /* /sys/class/infiniband_verbs/uverbs0/device/vendor */
+       strncpy(sysname + sysnameidx, "/device/vendor", 16);
+       fp = make_device(sysname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&dven_fops);
+       set_fs_info(fp, ptr);
+
+       /* /sys/class/infiniband_verbs/uverbs0/ibdev */
+       strncpy(sysname + sysnameidx, "/ibdev", 16);
+       fp = make_device(sysname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&dname_fops);
+       set_fs_info(fp, ptr);
+
+       /* /sys/class/infiniband_verbs/uverbs0/abi_version */
+       strncpy(sysname + sysnameidx, "/abi_version", 16);
+       fp = make_device(sysname,
+           S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
+           __S_IFCHR, (struct file_operations *)&dver_fops);
+       set_fs_info(fp, ptr);
+}
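+
+/*
+ * For devnum 0 and an mlx4 device, the calls above yield roughly this
+ * namespace (sketch):
+ *
+ *     /dev/infiniband/uverbs0
+ *     /sys/class/infiniband/mlx4_0/{node_type,vsd}
+ *     /sys/class/infiniband_verbs/uverbs0/device/{device,vendor}
+ *     /sys/class/infiniband_verbs/uverbs0/{ibdev,abi_version}
+ */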
+
+/* END: Linux /sys support for lib/apps */
+
+/* START: Support older version of libibverbs */
+
+/* in_words and provider_in_words are in terms of 4-byte words, not 8-byte */
+struct ib_uverbs_ex_cmd_hdr_compat {
+       __u16 provider_in_words;
+       __u16 provider_out_words;
+       __u32 cmd_hdr_reserved;
+       __u32 comp_mask;
+       /* __u32 dummy; */
+       __u64 response;
+       __u32 qp_handle;
+};
+
+static ssize_t compat_ex(struct ib_uverbs_file *file, size_t count,
+    const char __user *buf)
+{
+       struct ib_uverbs_cmd_hdr hdr;
+       struct ib_uverbs_ex_cmd_hdr_compat ex_hdr;
+       struct ib_udata ucore;
+       struct ib_udata uhw;
+       __u32 command;
+       int err;
+       unsigned long   tmpbuf[16];
+       struct ib_uverbs_create_flow *ptr;
+
+       if (copy_from_user(&hdr, buf, sizeof hdr))
+               return -EFAULT;
+
+       command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+       command -= 2;
+
+       if (command == IB_USER_VERBS_EX_CMD_DESTROY_FLOW) {
+               INIT_UDATA_BUF_OR_NULL(&ucore, buf + 8, 0, 8, 0);
+               err = ib_uverbs_ex_destroy_flow(file, &ucore, &uhw);
+               goto next;
+       }
+
+       /*
+        * "struct ibv_create_flow" is 56 bytes, "struct ibv_kern_spec" is
+        * 48 bytes, so at a minimum we expect 56 + (n x 48), n >= 1.
+        */
+       if (count < 104)
+               return -EINVAL;
+
+       if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+               return -EFAULT;
+
+       if ((hdr.in_words + ex_hdr.provider_in_words) * 4 != count)
+               return -EINVAL;
+
+       if (ex_hdr.cmd_hdr_reserved)
+               return -EINVAL;
+
+       if (ex_hdr.comp_mask)
+               return -EINVAL;
+
+       if (ex_hdr.response) {
+               if (!hdr.out_words && !ex_hdr.provider_out_words)
+                       return -EINVAL;
+
+               if (!access_ok(VERIFY_WRITE,
+                              (void __user *) (unsigned long) ex_hdr.response,
+                              (hdr.out_words + ex_hdr.provider_out_words) * 4))
+                       return -EFAULT;
+       } else {
+               if (hdr.out_words || ex_hdr.provider_out_words)
+                       return -EINVAL;
+       }
+
+       ptr = (struct ib_uverbs_create_flow *)tmpbuf;
+       ptr->comp_mask = 0;     /* user input already validated above */
+       ptr->qp_handle = ex_hdr.qp_handle;
+
+       if ((count-36) > 120)
+               BUG();
+
+       /* Copy 16 bytes worth "struct ibv_kern_flow_attr" */
+       copy_from_user(&tmpbuf[1], buf+36, sizeof(struct ib_uverbs_flow_attr));
+
+       ptr->flow_attr.size -= 56;              /* Comes in as 96 = 56 + 40 */
+
+       /* Copy "struct ibv_kern_spec"s */
+       copy_from_user(&tmpbuf[3], buf+56, count-56);
+
+       /*
+        * Copy : count-56 "struct ibv_kern_spec"s,
+        * 16 bytes "struct ibv_kern_flow_attr", 16 bytes comp_mask/qp_handle.
+        */
+       copy_to_user((char __user *)buf, tmpbuf, count-24);
+
+       INIT_UDATA_BUF_OR_NULL(&ucore, buf,
+           (unsigned long) ex_hdr.response, count - 24,
+           hdr.out_words * 4);
+
+       err = ib_uverbs_ex_create_flow(file, &ucore, &uhw);
+
+next:
+       if (err)
+               return err;
+
+       return count;
+}
+
+static ssize_t compat(struct ib_uverbs_file *file, size_t count,
+    const char __user *buf)
+{
+       unsigned long                   tmpbuf[17];
+       struct ib_uverbs_cmd_hdr        *p = (struct ib_uverbs_cmd_hdr *)tmpbuf;
+       char __user                     *dst = (char __user *)buf;
+       int                             insz, outsz;
+
+       /*
+        * User "struct ibv_qp_dest" is 40 bytes, passes in 136 bytes.
+        * Kernel "struct ib_uverbs_qp_dest" is 32 bytes, expects 120.
+        * Last 8 bytes of user "struct ibv_qp_dest" not used by kernel.
+        * Kernel expects this layout:
+        *      struct ib_uverbs_cmd_hdr (8)
+        *      struct ib_uverbs_qp_dest (32 <- 40)
+        *      struct ib_uverbs_qp_dest (32 <- 40)
+        *      Rest of qp_mod inputs    (48)
+        */
+
+       if (count > 136)
+               BUG();
+
+       if (copy_from_user(tmpbuf, buf, count))
+               return -EFAULT;
+       insz = p->in_words * 4;
+       outsz = p->out_words * 4;
+
+       copy_to_user(dst, &tmpbuf[1], sizeof(struct ib_uverbs_qp_dest));
+       dst += sizeof(struct ib_uverbs_qp_dest);
+       copy_to_user(dst, &tmpbuf[6], sizeof(struct ib_uverbs_qp_dest));
+       dst += sizeof(struct ib_uverbs_qp_dest);
+       copy_to_user(dst, &tmpbuf[11], 48);
+
+       return ib_uverbs_modify_qp(file, buf, insz, outsz);
+}
+
+/*
+ * Compat hack for applications/libraries we care about. Retrofit Linux 3.12
+ * style APIs.
+ */
+ssize_t check_old_abi(struct file *filp, const char __user *buf, size_t count)
+{
+       struct ib_uverbs_cmd_hdr hdr;
+       int                      tmp;
+       struct ib_uverbs_file *file = filp->private_data;
+
+       if (copy_from_user(&hdr, buf, sizeof hdr))
+               return -EFAULT;
+
+       tmp = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+       if ((tmp >= 52) && (tmp <= 53)) {
+               return compat_ex(file, count, buf);
+       } else if (tmp == IB_USER_VERBS_CMD_MODIFY_QP) {
+               return compat(file, count, buf);
+       } else if (tmp == IB_USER_VERBS_CMD_QUERY_QP) {
+               panic("query_qp API difference not handled\n");
+       }
+
+       /* Continue with processing this command */
+       return 0;
+}
+
+/* END: Support older version of libibverbs */
diff --git a/kern/drivers/net/udrvr/compat.h b/kern/drivers/net/udrvr/compat.h
new file mode 100644 (file)
index 0000000..cdbb37f
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ * Author: Kanoj Sarcar <kanoj@google.com>
+ * See LICENSE for details.
+ *
+ * A majority of the macros in this file should migrate to compat_todo.h,
+ * which should hold source copied from Linux. Some of the macros will
+ * also move into linux_compat.h to translate from Linux to Akaros. A
+ * tiny portion should remain here, since those are stub or hack
+ * definitions whose scope should be restricted only to mlx4u/ and
+ * udrvr/.
+ */
+#include <linux_compat.h>
+
+#include <arch/uaccess.h>      /* copy_from_user(), copy_to_user() */
+#define access_ok(type, addr, size)     1
+
+
+/*
+ * Device file /dev/XXXX has a dentry and inode that is associated
+ * with the "struct file" for each user process opening the device file.
+ * Thus, we can stash private data into file->f_dentry->d_fs_info or
+ * into file->f_dentry->d_inode->i_fs_info.
+ */
+#define set_fs_info(_p_, _v_)          \
+       do { (_p_)->f_dentry->d_fs_info = (_v_); } while(0)
+#define        get_fs_info(_p_)        ((_p_)->f_dentry->d_fs_info)
+#define        private_data                    f_privdata
+
+typedef uint8_t u8;
+typedef uint8_t __u8;
+typedef uint16_t u16;
+typedef uint16_t __u16;
+typedef uint32_t u32;
+typedef uint32_t __u32;
+typedef uint64_t u64;
+typedef uint64_t __u64;
+
+typedef int32_t        __s32;
+
+typedef off64_t        loff_t;
+
+typedef atomic_t                       atomic64_t;
+
+#define        atomic64_set                    atomic_set
+#define        atomic64_read                   atomic_read
+#define        atomic_dec_and_test(e)          atomic_sub_and_test(e, 1)
+#define        atomic_inc_not_zero(p)          atomic_add_not_zero(p, 1)
+
+#define        mutex_init(a)                   qlock_init(a)
+#define        mutex_lock(a)                   qlock(a)
+#define        mutex_unlock(a)                 qunlock(a)
+
+#define        spin_lock_init(E)               spinlock_init_irqsave(E)
+#define        spin_lock_irq(a)                spin_lock_irqsave(a, 0)
+#define        spin_unlock_irq(a)              spin_unlock_irqsave(a)
+#define        spin_lock_irqsave(l, f)         spin_lock_irqsave(l)
+#define        spin_unlock_irqrestore(l, f)    spin_unlock_irqsave(l)
+#define        spin_lock_nested(l, d)          spin_lock(l)
+#define        spin_lock_bh(E)                 spin_lock(E)
+#define        spin_unlock_bh(E)               spin_unlock(E)
+#define        DEFINE_SPINLOCK(x)              spinlock_t x = SPINLOCK_INITIALIZER
+
+/*
+ * Linux pgprot_noncached() adds _PAGE_PCD ie bit 4, which is akaros PTE_PCD.
+ * Akaros PTE_NOCACHE also sets bit 3 ie _PAGE_PWT (which is overkill?).
+ * Linux pgprot_writecombine() defaults to pgprot_noncached() when PAT is
+ * not enabled, otherwise just sets bit 3 ie _PAGE_PWT.
+ */
+static unsigned long pgprot_noncached(int vmprot)
+{
+       unsigned long   prot = PTE_P | PTE_U | PTE_A | PTE_PCD;
+
+       if (vmprot & PROT_WRITE)
+               prot |= PTE_W | PTE_D;
+       return prot;
+}
+
+/* TODO: Factor in PAT usage */
+#define        pgprot_writecombine(vmprot)     pgprot_noncached(vmprot)
+
+#define is_vm_hugetlb_page(vma)        0
+
+extern int map_upage_at_addr(struct proc *p, physaddr_t paddr, uintptr_t addr,
+    int pteprot, int dolock);
+extern int get_user_page(struct proc *p, unsigned long uvastart, int write,
+     int force, struct page **plist);
+extern void put_page(struct page *pagep);
+extern void set_page_dirty_lock(struct page *pagep);
+
+#define        io_remap_pfn_range(vma, vmstart, pfn, rangesz, pteprot) \
+       (rangesz == PAGE_SIZE ? map_upage_at_addr(current,      \
+       ((pfn) << PAGE_SHIFT), vmstart, pteprot, 1) : -1)
+
+#define        get_user_pages(task, mm, uvastart, numpages, write, force,      \
+       plist, vlist)                                                   \
+               get_user_page(task, uvastart, write, force, plist)
+
+/* The following is only true for mlx4/ code */
+#define        read_lock(p)
+#define        read_unlock(p)
+
+#define        GFP_KERNEL                      KMALLOC_WAIT
+#define        GFP_ATOMIC                      0
+#define        GFP_NOIO                        KMALLOC_WAIT
+#define        GFP_NOWAIT                      0
+
+#define        __get_free_page(f)              kpage_alloc_addr()
+
+static inline void free_page(unsigned long addr)
+{
+       if (addr != 0)
+               free_cont_pages((void *)addr, 0);
+}
+
+#define        get_zeroed_page(f)              kpage_zalloc_addr()
+
+#define        kzalloc(SZ, FL)                 kzmalloc(SZ, FL)
+#define        kcalloc(CNT, SZ, FL)            kzmalloc((CNT) * (SZ), FL)
+
+#define        roundup_pow_of_two(E)           ROUNDUPPWR2(E)
+#define        roundup(VAL, UP)                ROUNDUP(VAL, UP)
+#define        min(A0, A1)                     MIN(A0, A1)
+#define        max(A0, A1)                     MAX(A0, A1)
+
+#define        LIST_HEAD(l)                    LINUX_LIST_HEAD(l)
+
+/*
+ * Careful: these will not only replace "struct mutex" with "struct
+ * semaphore", but also replace ptr->mutex with ptr->semaphore, i.e.
+ * a structure field rename.
+ */
+#define        mutex           semaphore
+#define        rw_semaphore    semaphore
+
+/* From include/linux/netdevice.h */
+#define        dev_hold(p)
+#define        dev_put(p)
+
+#define        pr_info_once    printk
+
+/* From Linux include/linux/scatterlist.h: move to compat_todo.h */
+struct sg_table {
+       struct scatterlist *sgl;
+       unsigned int nents;
+       unsigned int orig_nents;
+};
+
+extern int sg_alloc_table(struct sg_table *ptr, unsigned int npages, gfp_t mask);
+void sg_free_table(struct sg_table *ptr);
+
+
+/* From include/linux/compiler.h: move to compat_todo.h */
+#define        __acquires(x)
+#define        __releases(x)
+#define        __acquire(x)                    (void)0
+#define        __release(x)                    (void)0
+#define uninitialized_var(x)           x = *(&(x))
+
+/* From include/asm-generic/bug.h: move to compat_todo.h */
+#define WARN_ON(condition) ({                                           \
+        int __ret_warn_on = !!(condition);                              \
+        if (unlikely(__ret_warn_on))                                    \
+               printk("BUG: %s:%d/%s()!\n", __FILE__, __LINE__, __func__);\
+        unlikely(__ret_warn_on);                                        \
+})
+
+#define        BUG_ON(condition)       \
+       do {                                                            \
+               if (unlikely(condition))                                \
+                       panic("BADBUG");                                \
+       } while(0)
+
+#define        BUG()           BUG_ON(1)
+
+/* Akaros cpu_to_be32() does not handle constants */
+#undef cpu_to_be32
+#define        ___constant_swab32(x) ((__u32)(                                 \
+       (((__u32)(x) & (__u32)0x000000ffUL) << 24) |                    \
+       (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) |                    \
+       (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) |                    \
+       (((__u32)(x) & (__u32)0xff000000UL) >> 24)))
+
+#define        cpu_to_be32(x)                                                  \
+       (__builtin_constant_p((__u32)(x)) ?                             \
+       ___constant_swab32(x) :                                         \
+       byte_swap32(x))
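+
+/*
+ * With the above, cpu_to_be32() applied to a compile-time constant folds
+ * to a constant (so it can be used in static initializers), while runtime
+ * values go through byte_swap32(). E.g. on little-endian x86,
+ * cpu_to_be32(0x12345678) == 0x78563412.
+ */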
+
+#define        MAXITEMS        128
+
+struct idr {
+       spinlock_t      lock;
+       void            *values[MAXITEMS];
+};
+
+#define        idr_destroy(p)
+#define        idr_preload(f)
+#define        idr_preload_end()
+
+#define        DEFINE_IDR(name)                        \
+               struct idr name = { .lock = SPINLOCK_INITIALIZER }
+
+void idr_remove(struct idr *idp, int id);
+void *idr_find(struct idr *idr, int id);
+int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask);
+
+struct net_device {
+       unsigned char dev_addr[MAX_ADDR_LEN];
+};
+
+/* Conflicting definitions in compat_todo.h */
+#define        netif_carrier_ok(p)     1
+#define        vm_area_struct          vm_region
+
+#define        vm_start                vm_base
+#define        vm_pgoff                vm_foff >> PAGE_SHIFT
+
+#undef __init
+#undef __exit
+#define        __init  __attribute__((used))
+#define        __exit  __attribute__((used))
+
+struct cdev {
+};
+
+struct kobject {
+};
+
+typedef struct  wait_queue_head {
+} wait_queue_head_t;
+
+struct lock_class_key {
+};
+
+struct attribute {
+};
+
+struct ib_ud_header {
+};
+
+extern void sysfs_init(void);
+extern void sysfs_create(int devnum, const struct file_operations *verb_fops,
+    void *ptr);
+
+extern ssize_t check_old_abi(struct file *filp, const char __user *buf,
+    size_t count);
diff --git a/kern/drivers/net/udrvr/device.c b/kern/drivers/net/udrvr/device.c
new file mode 100644 (file)
index 0000000..f6208be
--- /dev/null
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if 0  /* AKAROS */
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+#else  /* AKAROS */
+#include <list.h>
+#include <linux/rdma/ib_verbs.h>
+
+#define ib_cache_cleanup()
+#define        ibnl_cleanup()
+#define        ib_sysfs_cleanup()
+#define        ib_device_unregister_sysfs(d)
+#define ib_cache_setup()               0
+#define        ibnl_init()                     0
+#define        ib_sysfs_setup()                0
+#define        ib_device_register_sysfs(d, c)  0
+
+#define        alloc_workqueue(str, a, b)      create_singlethread_workqueue(str)
+#define        kobject_put(p)
+
+#endif /* AKAROS */
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+       struct list_head  list;
+       struct ib_client *client;
+       void *            data;
+};
+
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list.  In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+       static const struct {
+               size_t offset;
+               char  *name;
+       } mandatory_table[] = {
+               IB_MANDATORY_FUNC(query_device),
+               IB_MANDATORY_FUNC(query_port),
+               IB_MANDATORY_FUNC(query_pkey),
+               IB_MANDATORY_FUNC(query_gid),
+               IB_MANDATORY_FUNC(alloc_pd),
+               IB_MANDATORY_FUNC(dealloc_pd),
+               IB_MANDATORY_FUNC(create_ah),
+               IB_MANDATORY_FUNC(destroy_ah),
+               IB_MANDATORY_FUNC(create_qp),
+               IB_MANDATORY_FUNC(modify_qp),
+               IB_MANDATORY_FUNC(destroy_qp),
+               IB_MANDATORY_FUNC(post_send),
+               IB_MANDATORY_FUNC(post_recv),
+               IB_MANDATORY_FUNC(create_cq),
+               IB_MANDATORY_FUNC(destroy_cq),
+               IB_MANDATORY_FUNC(poll_cq),
+               IB_MANDATORY_FUNC(req_notify_cq),
+               IB_MANDATORY_FUNC(get_dma_mr),
+               IB_MANDATORY_FUNC(dereg_mr)
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+               if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+                       printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+                              device->name, mandatory_table[i].name);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+       struct ib_device *device;
+
+       list_for_each_entry(device, &device_list, core_list)
+               if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+                       return device;
+
+       return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+       unsigned long *inuse;
+       char buf[IB_DEVICE_NAME_MAX];
+       struct ib_device *device;
+       int i;
+
+       inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+       if (!inuse)
+               return -ENOMEM;
+
+       list_for_each_entry(device, &device_list, core_list) {
+               if (!sscanf(device->name, name, &i))
+                       continue;
+               if (i < 0 || i >= PAGE_SIZE * 8)
+                       continue;
+               snprintf(buf, sizeof buf, name, i);
+               if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+                       set_bit(i, inuse);
+       }
+
+       i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+       free_page((unsigned long) inuse);
+       snprintf(buf, sizeof buf, name, i);
+
+       if (__ib_device_get_by_name(buf))
+               return -ENFILE;
+
+       strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+       return 0;
+}
+
+static int start_port(struct ib_device *device)
+{
+       return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+       return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+               0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device.  @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+       BUG_ON(size < sizeof (struct ib_device));
+
+       return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+       if (device->reg_state == IB_DEV_UNINITIALIZED) {
+               kfree(device);
+               return;
+       }
+
+       BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+       kobject_put(&device->dev.kobj);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+       struct ib_client_data *context;
+       unsigned long flags;
+
+       context = kmalloc(sizeof *context, GFP_KERNEL);
+       if (!context) {
+               printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+                      device->name, client->name);
+               return -ENOMEM;
+       }
+
+       context->client = client;
+       context->data   = NULL;
+
+       spin_lock_irqsave(&device->client_data_lock, flags);
+       list_add(&context->list, &device->client_data_list);
+       spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+       return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+       struct ib_port_attr *tprops = NULL;
+       int num_ports, ret = -ENOMEM;
+       u8 port_index;
+
+       tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+       if (!tprops)
+               goto out;
+
+       num_ports = end_port(device) - start_port(device) + 1;
+
+       device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+                                      GFP_KERNEL);
+       device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+                                     GFP_KERNEL);
+       if (!device->pkey_tbl_len || !device->gid_tbl_len)
+               goto err;
+
+       for (port_index = 0; port_index < num_ports; ++port_index) {
+               ret = ib_query_port(device, port_index + start_port(device),
+                                       tprops);
+               if (ret)
+                       goto err;
+               device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+               device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
+       }
+
+       ret = 0;
+       goto out;
+
+err:
+       kfree(device->gid_tbl_len);
+       kfree(device->pkey_tbl_len);
+out:
+       kfree(tprops);
+       return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device,
+                      int (*port_callback)(struct ib_device *,
+                                           u8, struct kobject *))
+{
+       int ret;
+
+       mutex_lock(&device_mutex);
+
+       if (strchr(device->name, '%')) {
+               ret = alloc_name(device->name);
+               if (ret)
+                       goto out;
+       }
+
+       if (ib_device_check_mandatory(device)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&device->event_handler_list);
+       INIT_LIST_HEAD(&device->client_data_list);
+       spin_lock_init(&device->event_handler_lock);
+       spin_lock_init(&device->client_data_lock);
+
+       ret = read_port_table_lengths(device);
+       if (ret) {
+               printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+                      device->name);
+               goto out;
+       }
+
+       ret = ib_device_register_sysfs(device, port_callback);
+       if (ret) {
+               printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+                      device->name);
+               kfree(device->gid_tbl_len);
+               kfree(device->pkey_tbl_len);
+               goto out;
+       }
+
+       list_add_tail(&device->core_list, &device_list);
+
+       device->reg_state = IB_DEV_REGISTERED;
+
+       {
+               struct ib_client *client;
+
+               list_for_each_entry(client, &client_list, list)
+                       if (client->add && !add_client_context(device, client))
+                               client->add(device);
+       }
+
+ out:
+       mutex_unlock(&device_mutex);
+       return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+       struct ib_client *client;
+       struct ib_client_data *context, *tmp;
+       unsigned long flags;
+
+       mutex_lock(&device_mutex);
+
+       list_for_each_entry_reverse(client, &client_list, list)
+               if (client->remove)
+                       client->remove(device);
+
+       list_del(&device->core_list);
+
+       kfree(device->gid_tbl_len);
+       kfree(device->pkey_tbl_len);
+
+       mutex_unlock(&device_mutex);
+
+       ib_device_unregister_sysfs(device);
+
+       spin_lock_irqsave(&device->client_data_lock, flags);
+       list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+               kfree(context);
+       spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+       device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal.  When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered).  In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+       struct ib_device *device;
+
+       mutex_lock(&device_mutex);
+
+       list_add_tail(&client->list, &client_list);
+       list_for_each_entry(device, &device_list, core_list)
+               if (client->add && !add_client_context(device, client))
+                       client->add(device);
+
+       mutex_unlock(&device_mutex);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration.  When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+       struct ib_client_data *context, *tmp;
+       struct ib_device *device;
+       unsigned long flags;
+
+       mutex_lock(&device_mutex);
+
+       list_for_each_entry(device, &device_list, core_list) {
+               if (client->remove)
+                       client->remove(device);
+
+               spin_lock_irqsave(&device->client_data_lock, flags);
+               list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+                       if (context->client == client) {
+                               list_del(&context->list);
+                               kfree(context);
+                       }
+               spin_unlock_irqrestore(&device->client_data_lock, flags);
+       }
+       list_del(&client->list);
+
+       mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+       struct ib_client_data *context;
+       void *ret = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&device->client_data_lock, flags);
+       list_for_each_entry(context, &device->client_data_list, list)
+               if (context->client == client) {
+                       ret = context->data;
+                       break;
+               }
+       spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+                       void *data)
+{
+       struct ib_client_data *context;
+       unsigned long flags;
+
+       spin_lock_irqsave(&device->client_data_lock, flags);
+       list_for_each_entry(context, &device->client_data_list, list)
+               if (context->client == client) {
+                       context->data = data;
+                       goto out;
+               }
+
+       printk(KERN_WARNING "No client context found for %s/%s\n",
+              device->name, client->name);
+
+out:
+       spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification).  This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler  (struct ib_event_handler *event_handler)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+       list_add_tail(&event_handler->list,
+                     &event_handler->device->event_handler_list);
+       spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+       list_del(&event_handler->list);
+       spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+       unsigned long flags;
+       struct ib_event_handler *handler;
+
+       spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+       list_for_each_entry(handler, &event->device->event_handler_list, list)
+               handler->handler(handler, event);
+
+       spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+                   struct ib_device_attr *device_attr)
+{
+       return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+                 u8 port_num,
+                 struct ib_port_attr *port_attr)
+{
+       if (port_num < start_port(device) || port_num > end_port(device))
+               return -EINVAL;
+
+       return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+                u8 port_num, int index, union ib_gid *gid)
+{
+       return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+                 u8 port_num, u16 index, u16 *pkey)
+{
+       return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+                    int device_modify_mask,
+                    struct ib_device_modify *device_modify)
+{
+       if (!device->modify_device)
+               return -ENOSYS;
+
+       return device->modify_device(device, device_modify_mask,
+                                    device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+                  u8 port_num, int port_modify_mask,
+                  struct ib_port_modify *port_modify)
+{
+       if (!device->modify_port)
+               return -ENOSYS;
+
+       if (port_num < start_port(device) || port_num > end_port(device))
+               return -EINVAL;
+
+       return device->modify_port(device, port_num, port_modify_mask,
+                                  port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+               u8 *port_num, u16 *index)
+{
+       union ib_gid tmp_gid;
+       int ret, port, i;
+
+       for (port = start_port(device); port <= end_port(device); ++port) {
+               for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+                       ret = ib_query_gid(device, port, i, &tmp_gid);
+                       if (ret)
+                               return ret;
+                       if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+                               *port_num = port;
+                               if (index)
+                                       *index = i;
+                               return 0;
+                       }
+               }
+       }
+
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+                u8 port_num, u16 pkey, u16 *index)
+{
+       int ret, i;
+       u16 tmp_pkey;
+       int partial_ix = -1;
+
+       for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+               ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+               if (ret)
+                       return ret;
+               if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+                       /* if there is full-member pkey take it.*/
+                       if (tmp_pkey & 0x8000) {
+                               *index = i;
+                               return 0;
+                       }
+                       if (partial_ix < 0)
+                               partial_ix = i;
+               }
+       }
+
+       /*no full-member, if exists take the limited*/
+       if (partial_ix >= 0) {
+               *index = partial_ix;
+               return 0;
+       }
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+static int __init ib_core_init(void)
+{
+       int ret;
+
+       ib_wq = alloc_workqueue("infiniband", 0, 0);
+       if (!ib_wq)
+               return -ENOMEM;
+
+       ret = ib_sysfs_setup();
+       if (ret) {
+               printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+               goto err;
+       }
+
+       ret = ibnl_init();
+       if (ret) {
+               printk(KERN_WARNING "Couldn't init IB netlink interface\n");
+               goto err_sysfs;
+       }
+
+       ret = ib_cache_setup();
+       if (ret) {
+               printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+               goto err_nl;
+       }
+
+       return 0;
+
+err_nl:
+       ibnl_cleanup();
+
+err_sysfs:
+       ib_sysfs_cleanup();
+
+err:
+       destroy_workqueue(ib_wq);
+       return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+       ib_cache_cleanup();
+       ibnl_cleanup();
+       ib_sysfs_cleanup();
+       /* Make sure that any pending umem accounting work is done. */
+       destroy_workqueue(ib_wq);
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/kern/drivers/net/udrvr/umem.c b/kern/drivers/net/udrvr/umem.c
new file mode 100644 (file)
index 0000000..9a723da
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if 0  /* AKAROS */
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
+#endif /* AKAROS */
+
+#include "uverbs.h"
+
+
+#if 1  /* AKAROS */
+#define        DEFINE_DMA_ATTRS(x)     struct dma_attrs x
+#define        dma_set_attr(a, b)      do {} while (0)
+#define        can_do_mlock()          1
+#define get_task_pid(p, t)     NULL
+#define        put_pid(p)              do {} while (0)
+#define        ib_umem_odp_get(c, u)   ({ BUG(); -1; })
+#endif /* AKAROS */
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+       struct scatterlist *sg;
+       struct page *page;
+       int i;
+
+       if (umem->nmap > 0)
+               ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+                               umem->nmap,
+                               DMA_BIDIRECTIONAL);
+
+       for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+               page = sg_page(sg);
+               if (umem->writable && dirty)
+                       set_page_dirty_lock(page);
+               put_page(page);
+       }
+
+       sg_free_table(&umem->sg_head);
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If the access flags indicate ODP memory, avoid pinning; instead, store
+ * the mm for future page-fault handling in conjunction with MMU notifiers.
+ *
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+                           size_t size, int access, int dmasync)
+{
+       struct ib_umem *umem;
+       struct page **page_list;
+       struct vm_area_struct **vma_list;
+       unsigned long locked;
+       unsigned long lock_limit;
+       unsigned long cur_base;
+       unsigned long npages;
+       int ret;
+       int i;
+       DEFINE_DMA_ATTRS(attrs);
+       struct scatterlist *sg, *sg_list_start;
+       int need_release = 0;
+
+       if (dmasync)
+               dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+       if (!size)
+               return ERR_PTR(-EINVAL);
+
+       /*
+        * If the combination of the addr and size requested for this memory
+        * region causes an integer overflow, return error.
+        */
+       if (((addr + size) < addr) ||
+           PAGE_ALIGN(addr + size) < (addr + size))
+               return ERR_PTR(-EINVAL);
+
+       if (!can_do_mlock())
+               return ERR_PTR(-EPERM);
+
+       umem = kzalloc(sizeof *umem, GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+
+       umem->context   = context;
+       umem->length    = size;
+       umem->address   = addr;
+       umem->page_size = PAGE_SIZE;
+       umem->pid       = get_task_pid(current, PIDTYPE_PID);
+       /*
+        * We ask for writable memory if any of the following
+        * access flags are set.  "Local write" and "remote write"
+        * obviously require write access.  "Remote atomic" can do
+        * things like fetch and add, which will modify memory, and
+        * "MW bind" can change permissions by binding a window.
+        */
+       umem->writable  = !!(access &
+               (IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
+                IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+       if (access & IB_ACCESS_ON_DEMAND) {
+               ret = ib_umem_odp_get(context, umem);
+               if (ret) {
+                       kfree(umem);
+                       return ERR_PTR(ret);
+               }
+               return umem;
+       }
+
+       umem->odp_data = NULL;
+
+#if 0  /* AKAROS */
+       /* We assume the memory is from hugetlb until proved otherwise */
+       umem->hugetlb   = 1;
+
+       page_list = (struct page **) __get_free_page(GFP_KERNEL);
+       if (!page_list) {
+               kfree(umem);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       /*
+        * if we can't alloc the vma_list, it's not so bad;
+        * just assume the memory is not hugetlb memory
+        */
+       vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+       if (!vma_list)
+               umem->hugetlb = 0;
+
+       npages = ib_umem_num_pages(umem);
+
+       down_write(&current->mm->mmap_sem);
+
+       locked     = npages + current->mm->pinned_vm;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+#else  /* AKAROS */
+       struct page *pptr;
+       npages = ib_umem_num_pages(umem);
+       vma_list = NULL;
+       page_list = &pptr;
+#endif /* AKAROS */
+
+       cur_base = addr & PAGE_MASK;
+
+       if (npages == 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+       if (ret)
+               goto out;
+
+       need_release = 1;
+       sg_list_start = umem->sg_head.sgl;
+
+       while (npages) {
+               ret = get_user_pages(current, current->mm, cur_base,
+                                    min_t(unsigned long, npages,
+                                          PAGE_SIZE / sizeof (struct page *)),
+                                    1, !umem->writable, page_list, vma_list);
+
+               if (ret < 0)
+                       goto out;
+
+               umem->npages += ret;
+               cur_base += ret * PAGE_SIZE;
+               npages   -= ret;
+
+               for_each_sg(sg_list_start, sg, ret, i) {
+                       if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
+                               umem->hugetlb = 0;
+
+                       sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+               }
+
+               /* prepare for the next pass through the loop */
+               sg_list_start = sg;
+       }
+
+       umem->nmap = ib_dma_map_sg_attrs(context->device,
+                                 umem->sg_head.sgl,
+                                 umem->npages,
+                                 DMA_BIDIRECTIONAL,
+                                 &attrs);
+
+       if (umem->nmap <= 0) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = 0;
+
+out:
+       if (ret < 0) {
+               if (need_release)
+                       __ib_umem_release(context->device, umem, 0);
+               put_pid(umem->pid);
+               kfree(umem);
+#if 1  /* AKAROS */
+       }
+#else  /* AKAROS */
+       } else
+               current->mm->pinned_vm = locked;
+
+       up_write(&current->mm->mmap_sem);
+       if (vma_list)
+               free_page((unsigned long) vma_list);
+       free_page((unsigned long) page_list);
+#endif /* AKAROS */
+
+       return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
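+
+/*
+ * A minimal usage sketch for the pin/unpin pair (a hypothetical driver
+ * reg_user_mr() path, not taken from the baseline source):
+ *
+ *	struct ib_umem *umem;
+ *
+ *	umem = ib_umem_get(pd->uobject->context, start, length,
+ *			   access_flags, 0);
+ *	if (IS_ERR(umem))
+ *		return ERR_PTR(PTR_ERR(umem));
+ *	npages = ib_umem_page_count(umem);   // DMA pages for the HCA
+ *	...
+ *	ib_umem_release(umem);               // unpin at deregister time
+ */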
+
+#if 0  /* AKAROS */
+static void ib_umem_account(struct work_struct *work)
+{
+       struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+       down_write(&umem->mm->mmap_sem);
+       umem->mm->pinned_vm -= umem->diff;
+       up_write(&umem->mm->mmap_sem);
+       mmput(umem->mm);
+       kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+       struct ib_ucontext *context = umem->context;
+       struct mm_struct *mm;
+       struct task_struct *task;
+       unsigned long diff;
+
+       if (umem->odp_data) {
+               ib_umem_odp_release(umem);
+               return;
+       }
+
+       __ib_umem_release(umem->context->device, umem, 1);
+
+       task = get_pid_task(umem->pid, PIDTYPE_PID);
+       put_pid(umem->pid);
+       if (!task)
+               goto out;
+       mm = get_task_mm(task);
+       put_task_struct(task);
+       if (!mm)
+               goto out;
+
+       diff = ib_umem_num_pages(umem);
+
+       /*
+        * We may be called with the mm's mmap_sem already held.  This
+        * can happen when a userspace munmap() is the call that drops
+        * the last reference to our file and calls our release
+        * method.  If there are memory regions to destroy, we'll end
+        * up here and not be able to take the mmap_sem.  In that case
+        * we defer the pinned_vm accounting to the ib_wq workqueue.
+        */
+       if (context->closing) {
+               if (!down_write_trylock(&mm->mmap_sem)) {
+                       INIT_WORK(&umem->work, ib_umem_account);
+                       umem->mm   = mm;
+                       umem->diff = diff;
+
+                       queue_work(ib_wq, &umem->work);
+                       return;
+               }
+       } else
+               down_write(&mm->mmap_sem);
+
+       mm->pinned_vm -= diff;
+       up_write(&mm->mmap_sem);
+       mmput(mm);
+out:
+       kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+#else  /* AKAROS */
+void ib_umem_release(struct ib_umem *umem)
+{
+       __ib_umem_release(umem->context->device, umem, 1);
+       kfree(umem);
+}
+#endif /* AKAROS */
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+       int shift;
+       int i;
+       int n;
+       struct scatterlist *sg;
+
+       if (umem->odp_data)
+               return ib_umem_num_pages(umem);
+
+       shift = ilog2(umem->page_size);
+
+       n = 0;
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+               n += sg_dma_len(sg) >> shift;
+
+       return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
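+
+/*
+ * Worked example for the loop above: with page_size 4096 (shift == 12)
+ * and two mapped entries of sg_dma_len 16384 and 8192, the result is
+ * (16384 >> 12) + (8192 >> 12) = 4 + 2 = 6 pages.
+ */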
+
+#if 0  /* AKAROS */
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+                     size_t length)
+{
+       size_t end = offset + length;
+       int ret;
+
+       if (offset > umem->length || length > umem->length - offset) {
+               pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+                      offset, umem->length, end);
+               return -EINVAL;
+       }
+
+       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+                                offset + ib_umem_offset(umem));
+
+       if (ret < 0)
+               return ret;
+       else if (ret != length)
+               return -EINVAL;
+       else
+               return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
+#endif /* AKAROS */
diff --git a/kern/drivers/net/udrvr/uverbs.h b/kern/drivers/net/udrvr/uverbs.h
new file mode 100644 (file)
index 0000000..34069a5
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#if 0  /* AKAROS */
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#else  /* AKAROS */
+#include <linux/rdma/ib_verbs.h>
+#include <linux/rdma/ib_umem.h>
+#include <linux/rdma/ib_user_verbs.h>
+#endif /* AKAROS */
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)                      \
+       do {                                                            \
+               (udata)->inbuf  = (const void __user *) (ibuf);         \
+               (udata)->outbuf = (void __user *) (obuf);               \
+               (udata)->inlen  = (ilen);                               \
+               (udata)->outlen = (olen);                               \
+       } while (0)
+
+#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen)                  \
+       do {                                                                    \
+               (udata)->inbuf  = (ilen) ? (const void __user *) (ibuf) : NULL; \
+               (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL;       \
+               (udata)->inlen  = (ilen);                                       \
+               (udata)->outlen = (olen);                                       \
+       } while (0)
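+
+/*
+ * Both macros carve a command's trailing input bytes and its userspace
+ * response area into the ib_udata handed to the driver; a typical
+ * handler in uverbs_cmd.c sets it up as:
+ *
+ *	INIT_UDATA(&udata, buf + sizeof cmd,
+ *		   (unsigned long) cmd.response + sizeof resp,
+ *		   in_len - sizeof cmd, out_len - sizeof resp);
+ */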
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one().  Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed.  Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed.  For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed.  For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
+
+struct ib_uverbs_device {
+       atomic_t                                refcount;
+       int                                     num_comp_vectors;
+       struct completion                       comp;
+       struct device                          *dev;
+       struct ib_device                       *ib_dev;
+       int                                     devnum;
+       struct cdev                             cdev;
+       struct rb_root                          xrcd_tree;
+       struct mutex                            xrcd_tree_mutex;
+       struct kobject                          kobj;
+};
+
+struct ib_uverbs_event_file {
+       struct kref                             ref;
+       int                                     is_async;
+       struct ib_uverbs_file                  *uverbs_file;
+       spinlock_t                              lock;
+       int                                     is_closed;
+       wait_queue_head_t                       poll_wait;
+       struct fasync_struct                   *async_queue;
+       struct list_head                        event_list;
+};
+
+struct ib_uverbs_file {
+       struct kref                             ref;
+       struct mutex                            mutex;
+       struct ib_uverbs_device                *device;
+       struct ib_ucontext                     *ucontext;
+       struct ib_event_handler                 event_handler;
+       struct ib_uverbs_event_file            *async_file;
+};
+
+struct ib_uverbs_event {
+       union {
+               struct ib_uverbs_async_event_desc       async;
+               struct ib_uverbs_comp_event_desc        comp;
+       }                                       desc;
+       struct list_head                        list;
+       struct list_head                        obj_list;
+       u32                                    *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+       struct list_head        list;
+       union ib_gid            gid;
+       u16                     lid;
+};
+
+struct ib_uevent_object {
+       struct ib_uobject       uobject;
+       struct list_head        event_list;
+       u32                     events_reported;
+};
+
+struct ib_uxrcd_object {
+       struct ib_uobject       uobject;
+       atomic_t                refcnt;
+};
+
+struct ib_usrq_object {
+       struct ib_uevent_object uevent;
+       struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uqp_object {
+       struct ib_uevent_object uevent;
+       struct list_head        mcast_list;
+       struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_ucq_object {
+       struct ib_uobject       uobject;
+       struct ib_uverbs_file  *uverbs_file;
+       struct list_head        comp_list;
+       struct list_head        async_list;
+       u32                     comp_events_reported;
+       u32                     async_events_reported;
+};
+
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
+
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+                                       int is_async);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+                          struct ib_uverbs_event_file *ev_file,
+                          struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+                             struct ib_uevent_object *uobj);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+                            struct ib_event *event);
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+
+struct ib_uverbs_flow_spec {
+       union {
+               union {
+                       struct ib_uverbs_flow_spec_hdr hdr;
+                       struct {
+                               __u32 type;
+                               __u16 size;
+                               __u16 reserved;
+                       };
+               };
+               struct ib_uverbs_flow_spec_eth     eth;
+               struct ib_uverbs_flow_spec_ipv4    ipv4;
+               struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+       };
+};
+
+#define IB_UVERBS_DECLARE_CMD(name)                                    \
+       ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,           \
+                                const char __user *buf, int in_len,    \
+                                int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+#define IB_UVERBS_DECLARE_EX_CMD(name)                         \
+       int ib_uverbs_ex_##name(struct ib_uverbs_file *file,    \
+                               struct ib_udata *ucore,         \
+                               struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
+
+#endif /* UVERBS_H */
diff --git a/kern/drivers/net/udrvr/uverbs_cmd.c b/kern/drivers/net/udrvr/uverbs_cmd.c
new file mode 100644 (file)
index 0000000..b0ff74f
--- /dev/null
@@ -0,0 +1,3433 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if 0  /* AKAROS */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+#else  /* AKAROS */
+#include "uverbs.h"
+
+static void release_uobj(struct kref *kref);
+#define        kref_init(p)    kref_init(p, release_uobj, 1)
+#define kref_put(p, f) kref_put(p)
+#define        kref_get(p)     kref_get(p, 1)
+
+#define        get_task_pid(a, b)      NULL
+#define        put_pid(p)
+
+#define        ib_resolve_eth_l2_attrs(qp, attr, attrp)        0
+#define capable(c)                                     1
+
+/*
+ * Someone "ported" include/linux/rdma/ib_verbs.h and killed the "mutex"
+ * field in "struct ib_uobject". Need to add that back in. In the meantime,
+ * hack around it by stubbing out the rwsem operations below.
+ */
+#define        init_rwsem(p)
+#define        up_read(p)
+#define        up_write(p)
+#define        down_read(p)
+#define        down_write(p)
+#define        down_read_nested(p, d)
+
+#define lockdep_set_class_and_name(lock, key, name) \
+                do { (void)(key); (void)(name); } while (0)
+
+#endif /* AKAROS */
+
+struct uverbs_lock_class {
+       struct lock_class_key   key;
+       char                    name[16];
+};
+
+static struct uverbs_lock_class pd_lock_class  = { .name = "PD-uobj" };
+static struct uverbs_lock_class mr_lock_class  = { .name = "MR-uobj" };
+static struct uverbs_lock_class mw_lock_class  = { .name = "MW-uobj" };
+static struct uverbs_lock_class cq_lock_class  = { .name = "CQ-uobj" };
+static struct uverbs_lock_class qp_lock_class  = { .name = "QP-uobj" };
+static struct uverbs_lock_class ah_lock_class  = { .name = "AH-uobj" };
+static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
+static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ *   needs to be held during all idr operations.  When an object is
+ *   looked up, a reference must be taken on the object's kref before
+ *   dropping this lock.
+ *
+ * - Each object also has an rwsem.  This rwsem must be held for
+ *   reading while an operation that uses the object is performed.
+ *   For example, while registering an MR, the associated PD's
+ *   uobject.mutex must be held for reading.  The rwsem must be held
+ *   for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag.  If this flag is not
+ *   set, then lookups of the object will fail even if it is found in
+ *   the idr.  This handles a reader that blocks and does not acquire
+ *   the rwsem until after the object is destroyed.  The destroy
+ *   operation will set the live flag to 0 and then drop the rwsem;
+ *   this will allow the reader to acquire the rwsem, see that the
+ *   live flag is 0, and then drop the rwsem and its reference to
+ *   object.  The underlying storage will not be freed until the last
+ *   reference to the object is dropped.
+ */
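+
+/*
+ * The reader-side pattern these rules imply (a sketch of what the
+ * idr_read_uobj()/put_uobj_read() helpers below do for a caller):
+ *
+ *	uobj = idr_read_uobj(idr, handle, context, 0); // kref_get + down_read
+ *	if (!uobj)
+ *		return -EINVAL;     // absent, wrong context, or not live
+ *	... use uobj->object ...
+ *	put_uobj_read(uobj);        // up_read + kref_put
+ */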
+
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+                     struct ib_ucontext *context, struct uverbs_lock_class *c)
+{
+       uobj->user_handle = user_handle;
+       uobj->context     = context;
+       kref_init(&uobj->ref);
+       init_rwsem(&uobj->mutex);
+       lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
+       uobj->live        = 0;
+}
+
+static void release_uobj(struct kref *kref)
+{
+       kfree(container_of(kref, struct ib_uobject, ref));
+}
+
+static void put_uobj(struct ib_uobject *uobj)
+{
+       kref_put(&uobj->ref, release_uobj);
+}
+
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+       up_read(&uobj->mutex);
+       put_uobj(uobj);
+}
+
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+       up_write(&uobj->mutex);
+       put_uobj(uobj);
+}
+
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+       int ret;
+
+       idr_preload(GFP_KERNEL);
+       spin_lock(&ib_uverbs_idr_lock);
+
+       ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
+       if (ret >= 0)
+               uobj->id = ret;
+
+       spin_unlock(&ib_uverbs_idr_lock);
+       idr_preload_end();
+
+       return ret < 0 ? ret : 0;
+}
+
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+       spin_lock(&ib_uverbs_idr_lock);
+       idr_remove(idr, uobj->id);
+       spin_unlock(&ib_uverbs_idr_lock);
+}
+
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+                                        struct ib_ucontext *context)
+{
+       struct ib_uobject *uobj;
+
+       spin_lock(&ib_uverbs_idr_lock);
+       uobj = idr_find(idr, id);
+       if (uobj) {
+               if (uobj->context == context)
+                       kref_get(&uobj->ref);
+               else
+                       uobj = NULL;
+       }
+       spin_unlock(&ib_uverbs_idr_lock);
+
+       return uobj;
+}
+
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+                                       struct ib_ucontext *context, int nested)
+{
+       struct ib_uobject *uobj;
+
+       uobj = __idr_get_uobj(idr, id, context);
+       if (!uobj)
+               return NULL;
+
+       if (nested)
+               down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+       else
+               down_read(&uobj->mutex);
+       if (!uobj->live) {
+               put_uobj_read(uobj);
+               return NULL;
+       }
+
+       return uobj;
+}
+
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+                                        struct ib_ucontext *context)
+{
+       struct ib_uobject *uobj;
+
+       uobj = __idr_get_uobj(idr, id, context);
+       if (!uobj)
+               return NULL;
+
+       down_write(&uobj->mutex);
+       if (!uobj->live) {
+               put_uobj_write(uobj);
+               return NULL;
+       }
+
+       return uobj;
+}
+
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+                         int nested)
+{
+       struct ib_uobject *uobj;
+
+       uobj = idr_read_uobj(idr, id, context, nested);
+       return uobj ? uobj->object : NULL;
+}
+
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+static void put_pd_read(struct ib_pd *pd)
+{
+       put_uobj_read(pd->uobject);
+}
+
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+       return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+static void put_cq_read(struct ib_cq *cq)
+{
+       put_uobj_read(cq->uobject);
+}
+
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+static void put_ah_read(struct ib_ah *ah)
+{
+       put_uobj_read(ah->uobject);
+}
+
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
+{
+       struct ib_uobject *uobj;
+
+       uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
+       return uobj ? uobj->object : NULL;
+}
+
+static void put_qp_read(struct ib_qp *qp)
+{
+       put_uobj_read(qp->uobject);
+}
+
+static void put_qp_write(struct ib_qp *qp)
+{
+       put_uobj_write(qp->uobject);
+}
+
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+static void put_srq_read(struct ib_srq *srq)
+{
+       put_uobj_read(srq->uobject);
+}
+
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
+                                    struct ib_uobject **uobj)
+{
+       *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
+       return *uobj ? (*uobj)->object : NULL;
+}
+
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+       put_uobj_read(uobj);
+}
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+                             const char __user *buf,
+                             int in_len, int out_len)
+{
+       struct ib_uverbs_get_context      cmd;
+       struct ib_uverbs_get_context_resp resp;
+       struct ib_udata                   udata;
+       struct ib_device                 *ibdev = file->device->ib_dev;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       struct ib_device_attr             dev_attr;
+#endif
+       struct ib_ucontext               *ucontext;
+       struct file                      *filp;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       mutex_lock(&file->mutex);
+
+       if (file->ucontext) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+       if (IS_ERR(ucontext)) {
+               ret = PTR_ERR(ucontext);
+               goto err;
+       }
+
+       ucontext->device = ibdev;
+       INIT_LIST_HEAD(&ucontext->pd_list);
+       INIT_LIST_HEAD(&ucontext->mr_list);
+       INIT_LIST_HEAD(&ucontext->mw_list);
+       INIT_LIST_HEAD(&ucontext->cq_list);
+       INIT_LIST_HEAD(&ucontext->qp_list);
+       INIT_LIST_HEAD(&ucontext->srq_list);
+       INIT_LIST_HEAD(&ucontext->ah_list);
+       INIT_LIST_HEAD(&ucontext->xrcd_list);
+       INIT_LIST_HEAD(&ucontext->rule_list);
+       rcu_read_lock();
+       ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+       rcu_read_unlock();
+       ucontext->closing = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       ucontext->umem_tree = RB_ROOT;
+       init_rwsem(&ucontext->umem_rwsem);
+       ucontext->odp_mrs_count = 0;
+       INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+       ret = ib_query_device(ibdev, &dev_attr);
+       if (ret)
+               goto err_free;
+       if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+               ucontext->invalidate_range = NULL;
+
+#endif
+
+       resp.num_comp_vectors = file->device->num_comp_vectors;
+
+#if 0  /* AKAROS */
+       ret = get_unused_fd_flags(O_CLOEXEC);
+       if (ret < 0)
+               goto err_free;
+       resp.async_fd = ret;
+
+       filp = ib_uverbs_alloc_event_file(file, 1);
+       if (IS_ERR(filp)) {
+               ret = PTR_ERR(filp);
+               goto err_fd;
+       }
+#endif /* AKAROS */
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_file;
+       }
+
+#if 0  /* AKAROS */
+       file->async_file = filp->private_data;
+#endif /* AKAROS */
+
+       INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+                             ib_uverbs_event_handler);
+       ret = ib_register_event_handler(&file->event_handler);
+       if (ret)
+               goto err_file;
+
+#if 0  /* AKAROS */
+       kref_get(&file->async_file->ref);
+#endif /* AKAROS */
+       kref_get(&file->ref);
+       file->ucontext = ucontext;
+
+#if 0  /* AKAROS */
+       fd_install(resp.async_fd, filp);
+#endif /* AKAROS */
+
+       mutex_unlock(&file->mutex);
+
+       return in_len;
+
+err_file:
+#if 0  /* AKAROS */
+       fput(filp);
+
+err_fd:
+       put_unused_fd(resp.async_fd);
+#endif /* AKAROS */
+
+err_free:
+       put_pid(ucontext->tgid);
+       ibdev->dealloc_ucontext(ucontext);
+
+err:
+       mutex_unlock(&file->mutex);
+       return ret;
+}
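+
+/*
+ * The handler above follows the write()-based command protocol shared
+ * by every handler in this file (sketch):
+ *
+ *	if (out_len < sizeof resp)          // response must fit
+ *		return -ENOSPC;
+ *	if (copy_from_user(&cmd, buf, sizeof cmd))
+ *		return -EFAULT;
+ *	... perform the verb ...
+ *	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ *			 &resp, sizeof resp))
+ *		return -EFAULT;
+ *	return in_len;                      // success == bytes consumed
+ */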
+
+static void copy_query_dev_fields(struct ib_uverbs_file *file,
+                                 struct ib_uverbs_query_device_resp *resp,
+                                 struct ib_device_attr *attr)
+{
+       resp->fw_ver            = attr->fw_ver;
+       resp->node_guid         = file->device->ib_dev->node_guid;
+       resp->sys_image_guid    = attr->sys_image_guid;
+       resp->max_mr_size       = attr->max_mr_size;
+       resp->page_size_cap     = attr->page_size_cap;
+       resp->vendor_id         = attr->vendor_id;
+       resp->vendor_part_id    = attr->vendor_part_id;
+       resp->hw_ver            = attr->hw_ver;
+       resp->max_qp            = attr->max_qp;
+       resp->max_qp_wr         = attr->max_qp_wr;
+       resp->device_cap_flags  = attr->device_cap_flags;
+       resp->max_sge           = attr->max_sge;
+       resp->max_sge_rd        = attr->max_sge_rd;
+       resp->max_cq            = attr->max_cq;
+       resp->max_cqe           = attr->max_cqe;
+       resp->max_mr            = attr->max_mr;
+       resp->max_pd            = attr->max_pd;
+       resp->max_qp_rd_atom    = attr->max_qp_rd_atom;
+       resp->max_ee_rd_atom    = attr->max_ee_rd_atom;
+       resp->max_res_rd_atom   = attr->max_res_rd_atom;
+       resp->max_qp_init_rd_atom       = attr->max_qp_init_rd_atom;
+       resp->max_ee_init_rd_atom       = attr->max_ee_init_rd_atom;
+       resp->atomic_cap                = attr->atomic_cap;
+       resp->max_ee                    = attr->max_ee;
+       resp->max_rdd                   = attr->max_rdd;
+       resp->max_mw                    = attr->max_mw;
+       resp->max_raw_ipv6_qp           = attr->max_raw_ipv6_qp;
+       resp->max_raw_ethy_qp           = attr->max_raw_ethy_qp;
+       resp->max_mcast_grp             = attr->max_mcast_grp;
+       resp->max_mcast_qp_attach       = attr->max_mcast_qp_attach;
+       resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+       resp->max_ah                    = attr->max_ah;
+       resp->max_fmr                   = attr->max_fmr;
+       resp->max_map_per_fmr           = attr->max_map_per_fmr;
+       resp->max_srq                   = attr->max_srq;
+       resp->max_srq_wr                = attr->max_srq_wr;
+       resp->max_srq_sge               = attr->max_srq_sge;
+       resp->max_pkeys                 = attr->max_pkeys;
+       resp->local_ca_ack_delay        = attr->local_ca_ack_delay;
+       resp->phys_port_cnt             = file->device->ib_dev->phys_port_cnt;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+                              const char __user *buf,
+                              int in_len, int out_len)
+{
+       struct ib_uverbs_query_device      cmd;
+       struct ib_uverbs_query_device_resp resp;
+       struct ib_device_attr              attr;
+       int                                ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       ret = ib_query_device(file->device->ib_dev, &attr);
+       if (ret)
+               return ret;
+
+       memset(&resp, 0, sizeof resp);
+       copy_query_dev_fields(file, &resp, &attr);
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               return -EFAULT;
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+                            const char __user *buf,
+                            int in_len, int out_len)
+{
+       struct ib_uverbs_query_port      cmd;
+       struct ib_uverbs_query_port_resp resp;
+       struct ib_port_attr              attr;
+       int                              ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+       if (ret)
+               return ret;
+
+       memset(&resp, 0, sizeof resp);
+
+       resp.state           = attr.state;
+       resp.max_mtu         = attr.max_mtu;
+       resp.active_mtu      = attr.active_mtu;
+       resp.gid_tbl_len     = attr.gid_tbl_len;
+       resp.port_cap_flags  = attr.port_cap_flags;
+       resp.max_msg_sz      = attr.max_msg_sz;
+       resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
+       resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
+       resp.pkey_tbl_len    = attr.pkey_tbl_len;
+       resp.lid             = attr.lid;
+       resp.sm_lid          = attr.sm_lid;
+       resp.lmc             = attr.lmc;
+       resp.max_vl_num      = attr.max_vl_num;
+       resp.sm_sl           = attr.sm_sl;
+       resp.subnet_timeout  = attr.subnet_timeout;
+       resp.init_type_reply = attr.init_type_reply;
+       resp.active_width    = attr.active_width;
+       resp.active_speed    = attr.active_speed;
+       resp.phys_state      = attr.phys_state;
+       resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev,
+                                                       cmd.port_num);
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               return -EFAULT;
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+                          const char __user *buf,
+                          int in_len, int out_len)
+{
+       struct ib_uverbs_alloc_pd      cmd;
+       struct ib_uverbs_alloc_pd_resp resp;
+       struct ib_udata                udata;
+       struct ib_uobject             *uobj;
+       struct ib_pd                  *pd;
+       int                            ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+       if (!uobj)
+               return -ENOMEM;
+
+       init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+       down_write(&uobj->mutex);
+
+       pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+                                           file->ucontext, &udata);
+       if (IS_ERR(pd)) {
+               ret = PTR_ERR(pd);
+               goto err;
+       }
+
+       pd->device  = file->device->ib_dev;
+       pd->uobject = uobj;
+       atomic_set(&pd->usecnt, 0);
+
+       uobj->object = pd;
+       ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+       if (ret)
+               goto err_idr;
+
+       memset(&resp, 0, sizeof resp);
+       resp.pd_handle = uobj->id;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->pd_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+       ib_dealloc_pd(pd);
+
+err:
+       put_uobj_write(uobj);
+       return ret;
+}
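+
+/*
+ * ib_uverbs_alloc_pd() above is the object-creation template repeated
+ * throughout this file (sketch):
+ *
+ *	init_uobj(uobj, ...); down_write(&uobj->mutex);
+ *	obj = ib_dev->alloc_<obj>(...);      // create the HW object
+ *	idr_add_uobj(idr, uobj);             // allocate a handle
+ *	copy_to_user(... &resp ...);         // publish the handle
+ *	list_add_tail(&uobj->list, ...);     // link into the ucontext
+ *	uobj->live = 1; up_write(&uobj->mutex);
+ *
+ * and each error path unwinds those steps in reverse order.
+ */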
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+                            const char __user *buf,
+                            int in_len, int out_len)
+{
+       struct ib_uverbs_dealloc_pd cmd;
+       struct ib_uobject          *uobj;
+       int                         ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       ret = ib_dealloc_pd(uobj->object);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+
+       return in_len;
+}
+
+#if 0  /* AKAROS */
+
+struct xrcd_table_entry {
+       struct rb_node  node;
+       struct ib_xrcd *xrcd;
+       struct inode   *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+                           struct inode *inode,
+                           struct ib_xrcd *xrcd)
+{
+       struct xrcd_table_entry *entry, *scan;
+       struct rb_node **p = &dev->xrcd_tree.rb_node;
+       struct rb_node *parent = NULL;
+
+       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+       if (!entry)
+               return -ENOMEM;
+
+       entry->xrcd  = xrcd;
+       entry->inode = inode;
+
+       while (*p) {
+               parent = *p;
+               scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+               if (inode < scan->inode) {
+                       p = &(*p)->rb_left;
+               } else if (inode > scan->inode) {
+                       p = &(*p)->rb_right;
+               } else {
+                       kfree(entry);
+                       return -EEXIST;
+               }
+       }
+
+       rb_link_node(&entry->node, parent, p);
+       rb_insert_color(&entry->node, &dev->xrcd_tree);
+       igrab(inode);
+       return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+                                                 struct inode *inode)
+{
+       struct xrcd_table_entry *entry;
+       struct rb_node *p = dev->xrcd_tree.rb_node;
+
+       while (p) {
+               entry = rb_entry(p, struct xrcd_table_entry, node);
+
+               if (inode < entry->inode)
+                       p = p->rb_left;
+               else if (inode > entry->inode)
+                       p = p->rb_right;
+               else
+                       return entry;
+       }
+
+       return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+       struct xrcd_table_entry *entry;
+
+       entry = xrcd_table_search(dev, inode);
+       if (!entry)
+               return NULL;
+
+       return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+                             struct inode *inode)
+{
+       struct xrcd_table_entry *entry;
+
+       entry = xrcd_table_search(dev, inode);
+       if (entry) {
+               iput(inode);
+               rb_erase(&entry->node, &dev->xrcd_tree);
+               kfree(entry);
+       }
+}
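+
+/*
+ * The three helpers above keep a per-device rb-tree keyed by raw inode
+ * pointer so concurrent opens of the same XRC domain file share one
+ * ib_xrcd. Sketch of the lookup-or-create use (as in
+ * ib_uverbs_open_xrcd() below):
+ *
+ *	xrcd = find_xrcd(dev, inode);        // NULL on first open
+ *	if (!xrcd)
+ *		... alloc_xrcd(), then xrcd_table_insert(dev, inode, xrcd);
+ */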
+
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_open_xrcd      cmd;
+       struct ib_uverbs_open_xrcd_resp resp;
+       struct ib_udata                 udata;
+       struct ib_uxrcd_object         *obj;
+       struct ib_xrcd                 *xrcd = NULL;
+       struct fd                       f = {NULL, 0};
+       struct inode                   *inode = NULL;
+       int                             ret = 0;
+       int                             new_xrcd = 0;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof  resp);
+
+       mutex_lock(&file->device->xrcd_tree_mutex);
+
+       if (cmd.fd != -1) {
+               /* search for file descriptor */
+               f = fdget(cmd.fd);
+               if (!f.file) {
+                       ret = -EBADF;
+                       goto err_tree_mutex_unlock;
+               }
+
+               inode = file_inode(f.file);
+               xrcd = find_xrcd(file->device, inode);
+               if (!xrcd && !(cmd.oflags & O_CREAT)) {
+                       /* no xrcd for this inode yet; O_CREAT required */
+                       ret = -EAGAIN;
+                       goto err_tree_mutex_unlock;
+               }
+
+               if (xrcd && cmd.oflags & O_EXCL) {
+                       ret = -EINVAL;
+                       goto err_tree_mutex_unlock;
+               }
+       }
+
+       obj = kmalloc(sizeof *obj, GFP_KERNEL);
+       if (!obj) {
+               ret = -ENOMEM;
+               goto err_tree_mutex_unlock;
+       }
+
+       init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);
+
+       down_write(&obj->uobject.mutex);
+
+       if (!xrcd) {
+               xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+                                                       file->ucontext, &udata);
+               if (IS_ERR(xrcd)) {
+                       ret = PTR_ERR(xrcd);
+                       goto err;
+               }
+
+               xrcd->inode   = inode;
+               xrcd->device  = file->device->ib_dev;
+               atomic_set(&xrcd->usecnt, 0);
+               mutex_init(&xrcd->tgt_qp_mutex);
+               INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+               new_xrcd = 1;
+       }
+
+       atomic_set(&obj->refcnt, 0);
+       obj->uobject.object = xrcd;
+       ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+       if (ret)
+               goto err_idr;
+
+       memset(&resp, 0, sizeof resp);
+       resp.xrcd_handle = obj->uobject.id;
+
+       if (inode) {
+               if (new_xrcd) {
+                       /* create new inode/xrcd table entry */
+                       ret = xrcd_table_insert(file->device, inode, xrcd);
+                       if (ret)
+                               goto err_insert_xrcd;
+               }
+               atomic_inc(&xrcd->usecnt);
+       }
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       if (f.file)
+               fdput(f);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uobject.live = 1;
+       up_write(&obj->uobject.mutex);
+
+       mutex_unlock(&file->device->xrcd_tree_mutex);
+       return in_len;
+
+err_copy:
+       if (inode) {
+               if (new_xrcd)
+                       xrcd_table_delete(file->device, inode);
+               atomic_dec(&xrcd->usecnt);
+       }
+
+err_insert_xrcd:
+       idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+
+err_idr:
+       ib_dealloc_xrcd(xrcd);
+
+err:
+       put_uobj_write(&obj->uobject);
+
+err_tree_mutex_unlock:
+       if (f.file)
+               fdput(f);
+
+       mutex_unlock(&file->device->xrcd_tree_mutex);
+
+       return ret;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       struct ib_uverbs_close_xrcd cmd;
+       struct ib_uobject           *uobj;
+       struct ib_xrcd              *xrcd = NULL;
+       struct inode                *inode = NULL;
+       struct ib_uxrcd_object      *obj;
+       int                         live;
+       int                         ret = 0;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       mutex_lock(&file->device->xrcd_tree_mutex);
+       uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
+       if (!uobj) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       xrcd  = uobj->object;
+       inode = xrcd->inode;
+       obj   = container_of(uobj, struct ib_uxrcd_object, uobject);
+       if (atomic_read(&obj->refcnt)) {
+               put_uobj_write(uobj);
+               ret = -EBUSY;
+               goto out;
+       }
+
+       if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
+               ret = ib_dealloc_xrcd(uobj->object);
+               if (!ret)
+                       uobj->live = 0;
+       }
+
+       live = uobj->live;
+       if (inode && ret)
+               atomic_inc(&xrcd->usecnt);
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               goto out;
+
+       if (inode && !live)
+               xrcd_table_delete(file->device, inode);
+
+       idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+       ret = in_len;
+
+out:
+       mutex_unlock(&file->device->xrcd_tree_mutex);
+       return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+                           struct ib_xrcd *xrcd)
+{
+       struct inode *inode;
+
+       inode = xrcd->inode;
+       if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+               return;
+
+       ib_dealloc_xrcd(xrcd);
+
+       if (inode)
+               xrcd_table_delete(dev, inode);
+}
+
+#else  /* AKAROS */
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       return -EBADF;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       return -EBADF;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+                           struct ib_xrcd *xrcd)
+{
+}
+#endif /* AKAROS */
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+                        const char __user *buf, int in_len,
+                        int out_len)
+{
+       struct ib_uverbs_reg_mr      cmd;
+       struct ib_uverbs_reg_mr_resp resp;
+       struct ib_udata              udata;
+       struct ib_uobject           *uobj;
+       struct ib_pd                *pd;
+       struct ib_mr                *mr;
+       int                          ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+               return -EINVAL;
+
+       ret = ib_check_mr_access(cmd.access_flags);
+       if (ret)
+               return ret;
+
+       uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+       if (!uobj)
+               return -ENOMEM;
+
+       init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
+       down_write(&uobj->mutex);
+
+       pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+       if (!pd) {
+               ret = -EINVAL;
+               goto err_free;
+       }
+
+       if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+               struct ib_device_attr attr;
+
+               ret = ib_query_device(pd->device, &attr);
+               if (ret || !(attr.device_cap_flags &
+                               IB_DEVICE_ON_DEMAND_PAGING)) {
+                       pr_debug("ODP support not available\n");
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+       }
+
+       mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+                                    cmd.access_flags, &udata);
+       if (IS_ERR(mr)) {
+               ret = PTR_ERR(mr);
+               goto err_put;
+       }
+
+       mr->device  = pd->device;
+       mr->pd      = pd;
+       mr->uobject = uobj;
+       atomic_inc(&pd->usecnt);
+       atomic_set(&mr->usecnt, 0);
+
+       uobj->object = mr;
+       ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+       if (ret)
+               goto err_unreg;
+
+       memset(&resp, 0, sizeof resp);
+       resp.lkey      = mr->lkey;
+       resp.rkey      = mr->rkey;
+       resp.mr_handle = uobj->id;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       put_pd_read(pd);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->mr_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+       ib_dereg_mr(mr);
+
+err_put:
+       put_pd_read(pd);
+
+err_free:
+       put_uobj_write(uobj);
+       return ret;
+}
+
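+/*
+ * Re-register an MR in place: cmd.flags selects which of the translation,
+ * PD, and access rights are changed. Fails with -EBUSY while the MR is in
+ * use (nonzero usecnt, e.g. while memory windows are bound to it).
+ */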
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+                          const char __user *buf, int in_len,
+                          int out_len)
+{
+       struct ib_uverbs_rereg_mr      cmd;
+       struct ib_uverbs_rereg_mr_resp resp;
+       struct ib_udata              udata;
+       struct ib_pd                *pd = NULL;
+       struct ib_mr                *mr;
+       struct ib_pd                *old_pd;
+       int                          ret;
+       struct ib_uobject           *uobj;
+
+       if (out_len < sizeof(resp))
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof(cmd)))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof(cmd),
+                  (unsigned long) cmd.response + sizeof(resp),
+                  in_len - sizeof(cmd), out_len - sizeof(resp));
+
+       if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+               return -EINVAL;
+
+       if ((cmd.flags & IB_MR_REREG_TRANS) &&
+           (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+            (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+               return -EINVAL;
+
+       uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+                             file->ucontext);
+
+       if (!uobj)
+               return -EINVAL;
+
+       mr = uobj->object;
+
+       if (cmd.flags & IB_MR_REREG_ACCESS) {
+               ret = ib_check_mr_access(cmd.access_flags);
+               if (ret)
+                       goto put_uobjs;
+       }
+
+       if (cmd.flags & IB_MR_REREG_PD) {
+               pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+               if (!pd) {
+                       ret = -EINVAL;
+                       goto put_uobjs;
+               }
+       }
+
+       if (atomic_read(&mr->usecnt)) {
+               ret = -EBUSY;
+               goto put_uobj_pd;
+       }
+
+       old_pd = mr->pd;
+       ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+                                       cmd.length, cmd.hca_va,
+                                       cmd.access_flags, pd, &udata);
+       if (!ret) {
+               if (cmd.flags & IB_MR_REREG_PD) {
+                       atomic_inc(&pd->usecnt);
+                       mr->pd = pd;
+                       atomic_dec(&old_pd->usecnt);
+               }
+       } else {
+               goto put_uobj_pd;
+       }
+
+       memset(&resp, 0, sizeof(resp));
+       resp.lkey      = mr->lkey;
+       resp.rkey      = mr->rkey;
+
+       if (copy_to_user((void __user *)(unsigned long)cmd.response,
+                        &resp, sizeof(resp)))
+               ret = -EFAULT;
+       else
+               ret = in_len;
+
+put_uobj_pd:
+       if (cmd.flags & IB_MR_REREG_PD)
+               put_pd_read(pd);
+
+put_uobjs:
+
+       put_uobj_write(mr->uobject);
+
+       return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+                          const char __user *buf, int in_len,
+                          int out_len)
+{
+       struct ib_uverbs_dereg_mr cmd;
+       struct ib_mr             *mr;
+       struct ib_uobject        *uobj;
+       int                       ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       mr = uobj->object;
+
+       ret = ib_dereg_mr(mr);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+                        const char __user *buf, int in_len,
+                        int out_len)
+{
+       struct ib_uverbs_alloc_mw      cmd;
+       struct ib_uverbs_alloc_mw_resp resp;
+       struct ib_uobject             *uobj;
+       struct ib_pd                  *pd;
+       struct ib_mw                  *mw;
+       int                            ret;
+
+       if (out_len < sizeof(resp))
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof(cmd)))
+               return -EFAULT;
+
+       uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+       if (!uobj)
+               return -ENOMEM;
+
+       init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
+       down_write(&uobj->mutex);
+
+       pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+       if (!pd) {
+               ret = -EINVAL;
+               goto err_free;
+       }
+
+       mw = pd->device->alloc_mw(pd, cmd.mw_type);
+       if (IS_ERR(mw)) {
+               ret = PTR_ERR(mw);
+               goto err_put;
+       }
+
+       mw->device  = pd->device;
+       mw->pd      = pd;
+       mw->uobject = uobj;
+       atomic_inc(&pd->usecnt);
+
+       uobj->object = mw;
+       ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
+       if (ret)
+               goto err_unalloc;
+
+       memset(&resp, 0, sizeof(resp));
+       resp.rkey      = mw->rkey;
+       resp.mw_handle = uobj->id;
+
+       if (copy_to_user((void __user *)(unsigned long)cmd.response,
+                        &resp, sizeof(resp))) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       put_pd_read(pd);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->mw_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+err_unalloc:
+       ib_dealloc_mw(mw);
+
+err_put:
+       put_pd_read(pd);
+
+err_free:
+       put_uobj_write(uobj);
+       return ret;
+}
+
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+                          const char __user *buf, int in_len,
+                          int out_len)
+{
+       struct ib_uverbs_dealloc_mw cmd;
+       struct ib_mw               *mw;
+       struct ib_uobject          *uobj;
+       int                         ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof(cmd)))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       mw = uobj->object;
+
+       ret = ib_dealloc_mw(mw);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+
+       return in_len;
+}
+
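+/*
+ * On Linux a completion channel is an event file installed through the VFS
+ * (the #if 0 block below); Akaros has no equivalent hook, so reaching this
+ * command is a bug and panics via BUG_ON().
+ */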
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+                                     const char __user *buf, int in_len,
+                                     int out_len)
+{
+       struct ib_uverbs_create_comp_channel       cmd;
+       struct ib_uverbs_create_comp_channel_resp  resp;
+#if 0  /* AKAROS: referenced only by the excluded Linux path below */
+       struct file                               *filp;
+       int ret;
+#endif
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+#if 0  /* AKAROS */
+       ret = get_unused_fd_flags(O_CLOEXEC);
+       if (ret < 0)
+               return ret;
+       resp.fd = ret;
+
+       filp = ib_uverbs_alloc_event_file(file, 0);
+       if (IS_ERR(filp)) {
+               put_unused_fd(resp.fd);
+               return PTR_ERR(filp);
+       }
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               put_unused_fd(resp.fd);
+               fput(filp);
+               return -EFAULT;
+       }
+
+       fd_install(resp.fd, filp);
+#else
+       BUG_ON(1);
+#endif /* AKAROS */
+       return in_len;
+}
+
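+/*
+ * Create a completion queue. A nonnegative comp_channel binds the CQ to a
+ * previously created completion event file for delivering CQ events.
+ */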
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_create_cq      cmd;
+       struct ib_uverbs_create_cq_resp resp;
+       struct ib_udata                 udata;
+       struct ib_ucq_object           *obj;
+       struct ib_uverbs_event_file    *ev_file = NULL;
+       struct ib_cq                   *cq;
+       int                             ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       if (cmd.comp_vector >= file->device->num_comp_vectors)
+               return -EINVAL;
+
+       obj = kmalloc(sizeof *obj, GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class);
+       down_write(&obj->uobject.mutex);
+
+       if (cmd.comp_channel >= 0) {
+               ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+               if (!ev_file) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       obj->uverbs_file           = file;
+       obj->comp_events_reported  = 0;
+       obj->async_events_reported = 0;
+       INIT_LIST_HEAD(&obj->comp_list);
+       INIT_LIST_HEAD(&obj->async_list);
+
+       cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+                                            cmd.comp_vector,
+                                            file->ucontext, &udata);
+       if (IS_ERR(cq)) {
+               ret = PTR_ERR(cq);
+               goto err_file;
+       }
+
+       cq->device        = file->device->ib_dev;
+       cq->uobject       = &obj->uobject;
+       cq->comp_handler  = ib_uverbs_comp_handler;
+       cq->event_handler = ib_uverbs_cq_event_handler;
+       cq->cq_context    = ev_file;
+       atomic_set(&cq->usecnt, 0);
+
+       obj->uobject.object = cq;
+       ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+       if (ret)
+               goto err_free;
+
+       memset(&resp, 0, sizeof resp);
+       resp.cq_handle = obj->uobject.id;
+       resp.cqe       = cq->cqe;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uobject.live = 1;
+
+       up_write(&obj->uobject.mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+       ib_destroy_cq(cq);
+
+err_file:
+       if (ev_file)
+               ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+       put_uobj_write(&obj->uobject);
+       return ret;
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_resize_cq      cmd;
+       struct ib_uverbs_resize_cq_resp resp;
+       struct ib_udata                 udata;
+       struct ib_cq                    *cq;
+       int                             ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+       if (!cq)
+               return -EINVAL;
+
+       ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+       if (ret)
+               goto out;
+
+       resp.cqe = cq->cqe;
+
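+       /* note: only the cqe field of resp is copied back to user space */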
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp.cqe))
+               ret = -EFAULT;
+
+out:
+       put_cq_read(cq);
+
+       return ret ? ret : in_len;
+}
+
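+/* Marshal one work completion into the fixed-layout ib_uverbs_wc ABI. */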
+static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+       struct ib_uverbs_wc tmp;
+
+       tmp.wr_id               = wc->wr_id;
+       tmp.status              = wc->status;
+       tmp.opcode              = wc->opcode;
+       tmp.vendor_err          = wc->vendor_err;
+       tmp.byte_len            = wc->byte_len;
+       tmp.ex.imm_data         = (__u32 __force) wc->ex.imm_data;
+       tmp.qp_num              = wc->qp->qp_num;
+       tmp.src_qp              = wc->src_qp;
+       tmp.wc_flags            = wc->wc_flags;
+       tmp.pkey_index          = wc->pkey_index;
+       tmp.slid                = wc->slid;
+       tmp.sl                  = wc->sl;
+       tmp.dlid_path_bits      = wc->dlid_path_bits;
+       tmp.port_num            = wc->port_num;
+       tmp.reserved            = 0;
+
+       if (copy_to_user(dest, &tmp, sizeof tmp))
+               return -EFAULT;
+
+       return 0;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+                         const char __user *buf, int in_len,
+                         int out_len)
+{
+       struct ib_uverbs_poll_cq       cmd;
+       struct ib_uverbs_poll_cq_resp  resp;
+       u8 __user                     *header_ptr;
+       u8 __user                     *data_ptr;
+       struct ib_cq                  *cq;
+       struct ib_wc                   wc;
+       int                            ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+       if (!cq)
+               return -EINVAL;
+
+       /* we copy a struct ib_uverbs_poll_cq_resp to user space */
+       header_ptr = (void __user *)(unsigned long) cmd.response;
+       data_ptr = header_ptr + sizeof resp;
+
+       memset(&resp, 0, sizeof resp);
+       while (resp.count < cmd.ne) {
+               ret = ib_poll_cq(cq, 1, &wc);
+               if (ret < 0)
+                       goto out_put;
+               if (!ret)
+                       break;
+
+               ret = copy_wc_to_user(data_ptr, &wc);
+               if (ret)
+                       goto out_put;
+
+               data_ptr += sizeof(struct ib_uverbs_wc);
+               ++resp.count;
+       }
+
+       if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto out_put;
+       }
+
+       ret = in_len;
+
+out_put:
+       put_cq_read(cq);
+       return ret;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+                               const char __user *buf, int in_len,
+                               int out_len)
+{
+       struct ib_uverbs_req_notify_cq cmd;
+       struct ib_cq                  *cq;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+       if (!cq)
+               return -EINVAL;
+
+       ib_req_notify_cq(cq, cmd.solicited_only ?
+                        IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+       put_cq_read(cq);
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       struct ib_uverbs_destroy_cq      cmd;
+       struct ib_uverbs_destroy_cq_resp resp;
+       struct ib_uobject               *uobj;
+       struct ib_cq                    *cq;
+       struct ib_ucq_object            *obj;
+       struct ib_uverbs_event_file     *ev_file;
+       int                              ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+       cq      = uobj->object;
+       ev_file = cq->cq_context;
+       obj     = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+       ret = ib_destroy_cq(cq);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       ib_uverbs_release_ucq(file, ev_file, obj);
+
+       memset(&resp, 0, sizeof resp);
+       resp.comp_events_reported  = obj->comp_events_reported;
+       resp.async_events_reported = obj->async_events_reported;
+
+       put_uobj(uobj);
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               return -EFAULT;
+
+       return in_len;
+}
+
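+/*
+ * Create a QP. XRC target QPs take an XRCD in place of a PD/CQ set, XRC
+ * initiator QPs carry no receive queue, and all other types resolve their
+ * PD, send/recv CQs, and optional SRQ before calling into the driver.
+ */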
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_create_qp      cmd;
+       struct ib_uverbs_create_qp_resp resp;
+       struct ib_udata                 udata;
+       struct ib_uqp_object           *obj;
+       struct ib_device               *device;
+       struct ib_pd                   *pd = NULL;
+       struct ib_xrcd                 *xrcd = NULL;
+       struct ib_uobject              *uninitialized_var(xrcd_uobj);
+       struct ib_cq                   *scq = NULL, *rcq = NULL;
+       struct ib_srq                  *srq = NULL;
+       struct ib_qp                   *qp;
+       struct ib_qp_init_attr          attr;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+               return -EPERM;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       obj = kzalloc(sizeof *obj, GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+       down_write(&obj->uevent.uobject.mutex);
+
+       if (cmd.qp_type == IB_QPT_XRC_TGT) {
+               xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+               if (!xrcd) {
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+               device = xrcd->device;
+       } else {
+               if (cmd.qp_type == IB_QPT_XRC_INI) {
+                       cmd.max_recv_wr = cmd.max_recv_sge = 0;
+               } else {
+                       if (cmd.is_srq) {
+                               srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+                               if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+                                       ret = -EINVAL;
+                                       goto err_put;
+                               }
+                       }
+
+                       if (cmd.recv_cq_handle != cmd.send_cq_handle) {
+                               rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0);
+                               if (!rcq) {
+                                       ret = -EINVAL;
+                                       goto err_put;
+                               }
+                       }
+               }
+
+               scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq);
+               rcq = rcq ?: scq;
+               pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+               if (!pd || !scq) {
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+
+               device = pd->device;
+       }
+
+       attr.event_handler = ib_uverbs_qp_event_handler;
+       attr.qp_context    = file;
+       attr.send_cq       = scq;
+       attr.recv_cq       = rcq;
+       attr.srq           = srq;
+       attr.xrcd          = xrcd;
+       attr.sq_sig_type   = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+       attr.qp_type       = cmd.qp_type;
+       attr.create_flags  = 0;
+
+       attr.cap.max_send_wr     = cmd.max_send_wr;
+       attr.cap.max_recv_wr     = cmd.max_recv_wr;
+       attr.cap.max_send_sge    = cmd.max_send_sge;
+       attr.cap.max_recv_sge    = cmd.max_recv_sge;
+       attr.cap.max_inline_data = cmd.max_inline_data;
+
+       obj->uevent.events_reported     = 0;
+       INIT_LIST_HEAD(&obj->uevent.event_list);
+       INIT_LIST_HEAD(&obj->mcast_list);
+
+       if (cmd.qp_type == IB_QPT_XRC_TGT)
+               qp = ib_create_qp(pd, &attr);
+       else
+               qp = device->create_qp(pd, &attr, &udata);
+
+       if (IS_ERR(qp)) {
+               ret = PTR_ERR(qp);
+               goto err_put;
+       }
+
+       if (cmd.qp_type != IB_QPT_XRC_TGT) {
+               qp->real_qp       = qp;
+               qp->device        = device;
+               qp->pd            = pd;
+               qp->send_cq       = attr.send_cq;
+               qp->recv_cq       = attr.recv_cq;
+               qp->srq           = attr.srq;
+               qp->event_handler = attr.event_handler;
+               qp->qp_context    = attr.qp_context;
+               qp->qp_type       = attr.qp_type;
+               atomic_set(&qp->usecnt, 0);
+               atomic_inc(&pd->usecnt);
+               atomic_inc(&attr.send_cq->usecnt);
+               if (attr.recv_cq)
+                       atomic_inc(&attr.recv_cq->usecnt);
+               if (attr.srq)
+                       atomic_inc(&attr.srq->usecnt);
+       }
+       qp->uobject = &obj->uevent.uobject;
+
+       obj->uevent.uobject.object = qp;
+       ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+       if (ret)
+               goto err_destroy;
+
+       memset(&resp, 0, sizeof resp);
+       resp.qpn             = qp->qp_num;
+       resp.qp_handle       = obj->uevent.uobject.id;
+       resp.max_recv_sge    = attr.cap.max_recv_sge;
+       resp.max_send_sge    = attr.cap.max_send_sge;
+       resp.max_recv_wr     = attr.cap.max_recv_wr;
+       resp.max_send_wr     = attr.cap.max_send_wr;
+       resp.max_inline_data = attr.cap.max_inline_data;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       if (xrcd) {
+               obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+                                         uobject);
+               atomic_inc(&obj->uxrcd->refcnt);
+               put_xrcd_read(xrcd_uobj);
+       }
+
+       if (pd)
+               put_pd_read(pd);
+       if (scq)
+               put_cq_read(scq);
+       if (rcq && rcq != scq)
+               put_cq_read(rcq);
+       if (srq)
+               put_srq_read(srq);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uevent.uobject.live = 1;
+
+       up_write(&obj->uevent.uobject.mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+       ib_destroy_qp(qp);
+
+err_put:
+       if (xrcd)
+               put_xrcd_read(xrcd_uobj);
+       if (pd)
+               put_pd_read(pd);
+       if (scq)
+               put_cq_read(scq);
+       if (rcq && rcq != scq)
+               put_cq_read(rcq);
+       if (srq)
+               put_srq_read(srq);
+
+       put_uobj_write(&obj->uevent.uobject);
+       return ret;
+}
+
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+                         const char __user *buf, int in_len, int out_len)
+{
+       struct ib_uverbs_open_qp        cmd;
+       struct ib_uverbs_create_qp_resp resp;
+       struct ib_udata                 udata;
+       struct ib_uqp_object           *obj;
+       struct ib_xrcd                 *xrcd;
+       struct ib_uobject              *uninitialized_var(xrcd_uobj);
+       struct ib_qp                   *qp;
+       struct ib_qp_open_attr          attr;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       obj = kmalloc(sizeof *obj, GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+       down_write(&obj->uevent.uobject.mutex);
+
+       xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+       if (!xrcd) {
+               /* xrcd_uobj is NULL on lookup failure; skip put_xrcd_read() */
+               ret = -EINVAL;
+               goto err_free;
+       }
+
+       attr.event_handler = ib_uverbs_qp_event_handler;
+       attr.qp_context    = file;
+       attr.qp_num        = cmd.qpn;
+       attr.qp_type       = cmd.qp_type;
+
+       obj->uevent.events_reported = 0;
+       INIT_LIST_HEAD(&obj->uevent.event_list);
+       INIT_LIST_HEAD(&obj->mcast_list);
+
+       qp = ib_open_qp(xrcd, &attr);
+       if (IS_ERR(qp)) {
+               ret = PTR_ERR(qp);
+               goto err_put;
+       }
+
+       qp->uobject = &obj->uevent.uobject;
+
+       obj->uevent.uobject.object = qp;
+       ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+       if (ret)
+               goto err_destroy;
+
+       memset(&resp, 0, sizeof resp);
+       resp.qpn       = qp->qp_num;
+       resp.qp_handle = obj->uevent.uobject.id;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_remove;
+       }
+
+       obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+       atomic_inc(&obj->uxrcd->refcnt);
+       put_xrcd_read(xrcd_uobj);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uevent.uobject.live = 1;
+
+       up_write(&obj->uevent.uobject.mutex);
+
+       return in_len;
+
+err_remove:
+       idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+       ib_destroy_qp(qp);
+
+err_put:
+       put_xrcd_read(xrcd_uobj);
+
+err_free:
+       put_uobj_write(&obj->uevent.uobject);
+       return ret;
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+                          const char __user *buf, int in_len,
+                          int out_len)
+{
+       struct ib_uverbs_query_qp      cmd;
+       struct ib_uverbs_query_qp_resp resp;
+       struct ib_qp                   *qp;
+       struct ib_qp_attr              *attr;
+       struct ib_qp_init_attr         *init_attr;
+       int                            ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       attr      = kmalloc(sizeof *attr, GFP_KERNEL);
+       init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+       if (!attr || !init_attr) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+       if (!qp) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+       put_qp_read(qp);
+
+       if (ret)
+               goto out;
+
+       memset(&resp, 0, sizeof resp);
+
+       resp.qp_state               = attr->qp_state;
+       resp.cur_qp_state           = attr->cur_qp_state;
+       resp.path_mtu               = attr->path_mtu;
+       resp.path_mig_state         = attr->path_mig_state;
+       resp.qkey                   = attr->qkey;
+       resp.rq_psn                 = attr->rq_psn;
+       resp.sq_psn                 = attr->sq_psn;
+       resp.dest_qp_num            = attr->dest_qp_num;
+       resp.qp_access_flags        = attr->qp_access_flags;
+       resp.pkey_index             = attr->pkey_index;
+       resp.alt_pkey_index         = attr->alt_pkey_index;
+       resp.sq_draining            = attr->sq_draining;
+       resp.max_rd_atomic          = attr->max_rd_atomic;
+       resp.max_dest_rd_atomic     = attr->max_dest_rd_atomic;
+       resp.min_rnr_timer          = attr->min_rnr_timer;
+       resp.port_num               = attr->port_num;
+       resp.timeout                = attr->timeout;
+       resp.retry_cnt              = attr->retry_cnt;
+       resp.rnr_retry              = attr->rnr_retry;
+       resp.alt_port_num           = attr->alt_port_num;
+       resp.alt_timeout            = attr->alt_timeout;
+
+       memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+       resp.dest.flow_label        = attr->ah_attr.grh.flow_label;
+       resp.dest.sgid_index        = attr->ah_attr.grh.sgid_index;
+       resp.dest.hop_limit         = attr->ah_attr.grh.hop_limit;
+       resp.dest.traffic_class     = attr->ah_attr.grh.traffic_class;
+       resp.dest.dlid              = attr->ah_attr.dlid;
+       resp.dest.sl                = attr->ah_attr.sl;
+       resp.dest.src_path_bits     = attr->ah_attr.src_path_bits;
+       resp.dest.static_rate       = attr->ah_attr.static_rate;
+       resp.dest.is_global         = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+       resp.dest.port_num          = attr->ah_attr.port_num;
+
+       memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+       resp.alt_dest.flow_label    = attr->alt_ah_attr.grh.flow_label;
+       resp.alt_dest.sgid_index    = attr->alt_ah_attr.grh.sgid_index;
+       resp.alt_dest.hop_limit     = attr->alt_ah_attr.grh.hop_limit;
+       resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+       resp.alt_dest.dlid          = attr->alt_ah_attr.dlid;
+       resp.alt_dest.sl            = attr->alt_ah_attr.sl;
+       resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+       resp.alt_dest.static_rate   = attr->alt_ah_attr.static_rate;
+       resp.alt_dest.is_global     = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+       resp.alt_dest.port_num      = attr->alt_ah_attr.port_num;
+
+       resp.max_send_wr            = init_attr->cap.max_send_wr;
+       resp.max_recv_wr            = init_attr->cap.max_recv_wr;
+       resp.max_send_sge           = init_attr->cap.max_send_sge;
+       resp.max_recv_sge           = init_attr->cap.max_recv_sge;
+       resp.max_inline_data        = init_attr->cap.max_inline_data;
+       resp.sq_sig_all             = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               ret = -EFAULT;
+
+out:
+       kfree(attr);
+       kfree(init_attr);
+
+       return ret ? ret : in_len;
+}
+
+/* Remove ignored fields set in the attribute mask */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+       switch (qp_type) {
+       case IB_QPT_XRC_INI:
+               return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+       case IB_QPT_XRC_TGT:
+               return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+                               IB_QP_RNR_RETRY);
+       default:
+               return mask;
+       }
+}
+
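+/*
+ * Modify QP attributes. For a real QP the Ethernet L2 fields are resolved
+ * and the driver's modify_qp is called directly; a QP opened with
+ * ib_open_qp() is redirected through ib_modify_qp() to its real QP.
+ */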
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_modify_qp cmd;
+       struct ib_udata            udata;
+       struct ib_qp              *qp;
+       struct ib_qp_attr         *attr;
+       int                        ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+                  out_len);
+
+       attr = kmalloc(sizeof *attr, GFP_KERNEL);
+       if (!attr)
+               return -ENOMEM;
+
+       qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+       if (!qp) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       attr->qp_state            = cmd.qp_state;
+       attr->cur_qp_state        = cmd.cur_qp_state;
+       attr->path_mtu            = cmd.path_mtu;
+       attr->path_mig_state      = cmd.path_mig_state;
+       attr->qkey                = cmd.qkey;
+       attr->rq_psn              = cmd.rq_psn;
+       attr->sq_psn              = cmd.sq_psn;
+       attr->dest_qp_num         = cmd.dest_qp_num;
+       attr->qp_access_flags     = cmd.qp_access_flags;
+       attr->pkey_index          = cmd.pkey_index;
+       attr->alt_pkey_index      = cmd.alt_pkey_index;
+       attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+       attr->max_rd_atomic       = cmd.max_rd_atomic;
+       attr->max_dest_rd_atomic  = cmd.max_dest_rd_atomic;
+       attr->min_rnr_timer       = cmd.min_rnr_timer;
+       attr->port_num            = cmd.port_num;
+       attr->timeout             = cmd.timeout;
+       attr->retry_cnt           = cmd.retry_cnt;
+       attr->rnr_retry           = cmd.rnr_retry;
+       attr->alt_port_num        = cmd.alt_port_num;
+       attr->alt_timeout         = cmd.alt_timeout;
+
+       memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+       attr->ah_attr.grh.flow_label        = cmd.dest.flow_label;
+       attr->ah_attr.grh.sgid_index        = cmd.dest.sgid_index;
+       attr->ah_attr.grh.hop_limit         = cmd.dest.hop_limit;
+       attr->ah_attr.grh.traffic_class     = cmd.dest.traffic_class;
+       attr->ah_attr.dlid                  = cmd.dest.dlid;
+       attr->ah_attr.sl                    = cmd.dest.sl;
+       attr->ah_attr.src_path_bits         = cmd.dest.src_path_bits;
+       attr->ah_attr.static_rate           = cmd.dest.static_rate;
+       attr->ah_attr.ah_flags              = cmd.dest.is_global ? IB_AH_GRH : 0;
+       attr->ah_attr.port_num              = cmd.dest.port_num;
+
+       memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+       attr->alt_ah_attr.grh.flow_label    = cmd.alt_dest.flow_label;
+       attr->alt_ah_attr.grh.sgid_index    = cmd.alt_dest.sgid_index;
+       attr->alt_ah_attr.grh.hop_limit     = cmd.alt_dest.hop_limit;
+       attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+       attr->alt_ah_attr.dlid              = cmd.alt_dest.dlid;
+       attr->alt_ah_attr.sl                = cmd.alt_dest.sl;
+       attr->alt_ah_attr.src_path_bits     = cmd.alt_dest.src_path_bits;
+       attr->alt_ah_attr.static_rate       = cmd.alt_dest.static_rate;
+       attr->alt_ah_attr.ah_flags          = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+       attr->alt_ah_attr.port_num          = cmd.alt_dest.port_num;
+
+       if (qp->real_qp == qp) {
+               ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+               if (ret)
+                       goto release_qp;
+               ret = qp->device->modify_qp(qp, attr,
+                       modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
+       } else {
+               ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
+       }
+
+       if (ret)
+               goto release_qp;
+
+       ret = in_len;
+
+release_qp:
+       put_qp_read(qp);
+
+out:
+       kfree(attr);
+
+       return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       struct ib_uverbs_destroy_qp      cmd;
+       struct ib_uverbs_destroy_qp_resp resp;
+       struct ib_uobject               *uobj;
+       struct ib_qp                    *qp;
+       struct ib_uqp_object            *obj;
+       int                              ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       memset(&resp, 0, sizeof resp);
+
+       uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+       qp  = uobj->object;
+       obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+       if (!list_empty(&obj->mcast_list)) {
+               put_uobj_write(uobj);
+               return -EBUSY;
+       }
+
+       ret = ib_destroy_qp(qp);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       if (obj->uxrcd)
+               atomic_dec(&obj->uxrcd->refcnt);
+
+       idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       ib_uverbs_release_uevent(file, &obj->uevent);
+
+       resp.events_reported = obj->uevent.events_reported;
+
+       put_uobj(uobj);
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               return -EFAULT;
+
+       return in_len;
+}
+
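+/*
+ * Post send work requests: copy each user WQE, rebuild the kernel WR chain
+ * (resolving AH handles for UD QPs), and hand the chain to the driver. On
+ * failure, resp.bad_wr reports the position of the WR that was rejected.
+ */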
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_post_send      cmd;
+       struct ib_uverbs_post_send_resp resp;
+       struct ib_uverbs_send_wr       *user_wr;
+       struct ib_send_wr              *wr = NULL, *last, *next, *bad_wr;
+       struct ib_qp                   *qp;
+       int                             i, sg_ind;
+       int                             is_ud;
+       ssize_t                         ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+           cmd.sge_count * sizeof (struct ib_uverbs_sge))
+               return -EINVAL;
+
+       if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+               return -EINVAL;
+
+       user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+       if (!user_wr)
+               return -ENOMEM;
+
+       qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+       if (!qp)
+               goto out;
+
+       is_ud = qp->qp_type == IB_QPT_UD;
+       sg_ind = 0;
+       last = NULL;
+       for (i = 0; i < cmd.wr_count; ++i) {
+               if (copy_from_user(user_wr,
+                                  buf + sizeof cmd + i * cmd.wqe_size,
+                                  cmd.wqe_size)) {
+                       ret = -EFAULT;
+                       goto out_put;
+               }
+
+               if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+                       ret = -EINVAL;
+                       goto out_put;
+               }
+
+               next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+                              user_wr->num_sge * sizeof (struct ib_sge),
+                              GFP_KERNEL);
+               if (!next) {
+                       ret = -ENOMEM;
+                       goto out_put;
+               }
+
+               if (!last)
+                       wr = next;
+               else
+                       last->next = next;
+               last = next;
+
+               next->next       = NULL;
+               next->wr_id      = user_wr->wr_id;
+               next->num_sge    = user_wr->num_sge;
+               next->opcode     = user_wr->opcode;
+               next->send_flags = user_wr->send_flags;
+
+               if (is_ud) {
+                       if (next->opcode != IB_WR_SEND &&
+                           next->opcode != IB_WR_SEND_WITH_IMM) {
+                               ret = -EINVAL;
+                               goto out_put;
+                       }
+
+                       next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+                                                    file->ucontext);
+                       if (!next->wr.ud.ah) {
+                               ret = -EINVAL;
+                               goto out_put;
+                       }
+                       next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
+                       next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+                       if (next->opcode == IB_WR_SEND_WITH_IMM)
+                               next->ex.imm_data =
+                                       (__be32 __force) user_wr->ex.imm_data;
+               } else {
+                       switch (next->opcode) {
+                       case IB_WR_RDMA_WRITE_WITH_IMM:
+                               next->ex.imm_data =
+                                       (__be32 __force) user_wr->ex.imm_data;
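+                               /* fall through: WITH_IMM also takes rdma fields */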
+                       case IB_WR_RDMA_WRITE:
+                       case IB_WR_RDMA_READ:
+                               next->wr.rdma.remote_addr =
+                                       user_wr->wr.rdma.remote_addr;
+                               next->wr.rdma.rkey        =
+                                       user_wr->wr.rdma.rkey;
+                               break;
+                       case IB_WR_SEND_WITH_IMM:
+                               next->ex.imm_data =
+                                       (__be32 __force) user_wr->ex.imm_data;
+                               break;
+                       case IB_WR_SEND_WITH_INV:
+                               next->ex.invalidate_rkey =
+                                       user_wr->ex.invalidate_rkey;
+                               break;
+                       case IB_WR_ATOMIC_CMP_AND_SWP:
+                       case IB_WR_ATOMIC_FETCH_AND_ADD:
+                               next->wr.atomic.remote_addr =
+                                       user_wr->wr.atomic.remote_addr;
+                               next->wr.atomic.compare_add =
+                                       user_wr->wr.atomic.compare_add;
+                               next->wr.atomic.swap = user_wr->wr.atomic.swap;
+                               next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
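+                               /* fall through */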
+                       case IB_WR_SEND:
+                               break;
+                       default:
+                               ret = -EINVAL;
+                               goto out_put;
+                       }
+               }
+
+               if (next->num_sge) {
+                       next->sg_list = (void *) next +
+                               ALIGN(sizeof *next, sizeof (struct ib_sge));
+                       if (copy_from_user(next->sg_list,
+                                          buf + sizeof cmd +
+                                          cmd.wr_count * cmd.wqe_size +
+                                          sg_ind * sizeof (struct ib_sge),
+                                          next->num_sge * sizeof (struct ib_sge))) {
+                               ret = -EFAULT;
+                               goto out_put;
+                       }
+                       sg_ind += next->num_sge;
+               } else
+                       next->sg_list = NULL;
+       }
+
+       resp.bad_wr = 0;
+       ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+       if (ret)
+               for (next = wr; next; next = next->next) {
+                       ++resp.bad_wr;
+                       if (next == bad_wr)
+                               break;
+               }
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               ret = -EFAULT;
+
+out_put:
+       put_qp_read(qp);
+
+       while (wr) {
+               if (is_ud && wr->wr.ud.ah)
+                       put_ah_read(wr->wr.ud.ah);
+               next = wr->next;
+               kfree(wr);
+               wr = next;
+       }
+
+out:
+       kfree(user_wr);
+
+       return ret ? ret : in_len;
+}
+
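+/*
+ * Rebuild a kernel receive WR chain from the user buffer, which packs
+ * wr_count WQEs of wqe_size bytes followed by the scatter/gather entries.
+ */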
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+                                                   int in_len,
+                                                   u32 wr_count,
+                                                   u32 sge_count,
+                                                   u32 wqe_size)
+{
+       struct ib_uverbs_recv_wr *user_wr;
+       struct ib_recv_wr        *wr = NULL, *last, *next;
+       int                       sg_ind;
+       int                       i;
+       int                       ret;
+
+       if (in_len < wqe_size * wr_count +
+           sge_count * sizeof (struct ib_uverbs_sge))
+               return ERR_PTR(-EINVAL);
+
+       if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+               return ERR_PTR(-EINVAL);
+
+       user_wr = kmalloc(wqe_size, GFP_KERNEL);
+       if (!user_wr)
+               return ERR_PTR(-ENOMEM);
+
+       sg_ind = 0;
+       last = NULL;
+       for (i = 0; i < wr_count; ++i) {
+               if (copy_from_user(user_wr, buf + i * wqe_size,
+                                  wqe_size)) {
+                       ret = -EFAULT;
+                       goto err;
+               }
+
+               if (user_wr->num_sge + sg_ind > sge_count) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+                              user_wr->num_sge * sizeof (struct ib_sge),
+                              GFP_KERNEL);
+               if (!next) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               if (!last)
+                       wr = next;
+               else
+                       last->next = next;
+               last = next;
+
+               next->next       = NULL;
+               next->wr_id      = user_wr->wr_id;
+               next->num_sge    = user_wr->num_sge;
+
+               if (next->num_sge) {
+                       next->sg_list = (void *) next +
+                               ALIGN(sizeof *next, sizeof (struct ib_sge));
+                       if (copy_from_user(next->sg_list,
+                                          buf + wr_count * wqe_size +
+                                          sg_ind * sizeof (struct ib_sge),
+                                          next->num_sge * sizeof (struct ib_sge))) {
+                               ret = -EFAULT;
+                               goto err;
+                       }
+                       sg_ind += next->num_sge;
+               } else
+                       next->sg_list = NULL;
+       }
+
+       kfree(user_wr);
+       return wr;
+
+err:
+       kfree(user_wr);
+
+       while (wr) {
+               next = wr->next;
+               kfree(wr);
+               wr = next;
+       }
+
+       return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_post_recv      cmd;
+       struct ib_uverbs_post_recv_resp resp;
+       struct ib_recv_wr              *wr, *next, *bad_wr;
+       struct ib_qp                   *qp;
+       ssize_t                         ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+                                      in_len - sizeof cmd, cmd.wr_count,
+                                      cmd.sge_count, cmd.wqe_size);
+       if (IS_ERR(wr))
+               return PTR_ERR(wr);
+
+       qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+       if (!qp)
+               goto out;
+
+       resp.bad_wr = 0;
+       ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+
+       put_qp_read(qp);
+
+       if (ret)
+               for (next = wr; next; next = next->next) {
+                       ++resp.bad_wr;
+                       if (next == bad_wr)
+                               break;
+               }
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               ret = -EFAULT;
+
+out:
+       while (wr) {
+               next = wr->next;
+               kfree(wr);
+               wr = next;
+       }
+
+       return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+                               const char __user *buf, int in_len,
+                               int out_len)
+{
+       struct ib_uverbs_post_srq_recv      cmd;
+       struct ib_uverbs_post_srq_recv_resp resp;
+       struct ib_recv_wr                  *wr, *next, *bad_wr;
+       struct ib_srq                      *srq;
+       ssize_t                             ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+                                      in_len - sizeof cmd, cmd.wr_count,
+                                      cmd.sge_count, cmd.wqe_size);
+       if (IS_ERR(wr))
+               return PTR_ERR(wr);
+
+       srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+       if (!srq)
+               goto out;
+
+       resp.bad_wr = 0;
+       ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+       put_srq_read(srq);
+
+       if (ret)
+               for (next = wr; next; next = next->next) {
+                       ++resp.bad_wr;
+                       if (next == bad_wr)
+                               break;
+               }
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               ret = -EFAULT;
+
+out:
+       while (wr) {
+               next = wr->next;
+               kfree(wr);
+               wr = next;
+       }
+
+       return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+                           const char __user *buf, int in_len,
+                           int out_len)
+{
+       struct ib_uverbs_create_ah       cmd;
+       struct ib_uverbs_create_ah_resp  resp;
+       struct ib_uobject               *uobj;
+       struct ib_pd                    *pd;
+       struct ib_ah                    *ah;
+       struct ib_ah_attr               attr;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+       if (!uobj)
+               return -ENOMEM;
+
+       init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
+       down_write(&uobj->mutex);
+
+       pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+       if (!pd) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       attr.dlid              = cmd.attr.dlid;
+       attr.sl                = cmd.attr.sl;
+       attr.src_path_bits     = cmd.attr.src_path_bits;
+       attr.static_rate       = cmd.attr.static_rate;
+       attr.ah_flags          = cmd.attr.is_global ? IB_AH_GRH : 0;
+       attr.port_num          = cmd.attr.port_num;
+       attr.grh.flow_label    = cmd.attr.grh.flow_label;
+       attr.grh.sgid_index    = cmd.attr.grh.sgid_index;
+       attr.grh.hop_limit     = cmd.attr.grh.hop_limit;
+       attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+       attr.vlan_id           = 0;
+       memset(&attr.dmac, 0, sizeof(attr.dmac));
+       memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+       ah = ib_create_ah(pd, &attr);
+       if (IS_ERR(ah)) {
+               ret = PTR_ERR(ah);
+               goto err_put;
+       }
+
+       ah->uobject  = uobj;
+       uobj->object = ah;
+
+       ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+       if (ret)
+               goto err_destroy;
+
+       resp.ah_handle = uobj->id;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       put_pd_read(pd);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->ah_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+
+       return in_len;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+       ib_destroy_ah(ah);
+
+err_put:
+       put_pd_read(pd);
+
+err:
+       put_uobj_write(uobj);
+       return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len, int out_len)
+{
+       struct ib_uverbs_destroy_ah cmd;
+       struct ib_ah               *ah;
+       struct ib_uobject          *uobj;
+       int                         ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+       ah = uobj->object;
+
+       ret = ib_destroy_ah(ah);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+
+       return in_len;
+}
+
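+/*
+ * Attach a QP to a multicast group. Attachments are tracked on the QP's
+ * uobject mcast_list (a duplicate attach just returns success) and must
+ * be detached before the QP can be destroyed.
+ */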
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+                              const char __user *buf, int in_len,
+                              int out_len)
+{
+       struct ib_uverbs_attach_mcast cmd;
+       struct ib_qp                 *qp;
+       struct ib_uqp_object         *obj;
+       struct ib_uverbs_mcast_entry *mcast;
+       int                           ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+       if (!qp)
+               return -EINVAL;
+
+       obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+       list_for_each_entry(mcast, &obj->mcast_list, list)
+               if (cmd.mlid == mcast->lid &&
+                   !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+                       ret = 0;
+                       goto out_put;
+               }
+
+       mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+       if (!mcast) {
+               ret = -ENOMEM;
+               goto out_put;
+       }
+
+       mcast->lid = cmd.mlid;
+       memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+       ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+       if (!ret)
+               list_add_tail(&mcast->list, &obj->mcast_list);
+       else
+               kfree(mcast);
+
+out_put:
+       put_qp_write(qp);
+
+       return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+                              const char __user *buf, int in_len,
+                              int out_len)
+{
+       struct ib_uverbs_detach_mcast cmd;
+       struct ib_uqp_object         *obj;
+       struct ib_qp                 *qp;
+       struct ib_uverbs_mcast_entry *mcast;
+       int                           ret = -EINVAL;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+       if (!qp)
+               return -EINVAL;
+
+       ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+       if (ret)
+               goto out_put;
+
+       obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+       list_for_each_entry(mcast, &obj->mcast_list, list)
+               if (cmd.mlid == mcast->lid &&
+                   !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+                       list_del(&mcast->list);
+                       kfree(mcast);
+                       break;
+               }
+
+out_put:
+       put_qp_write(qp);
+
+       return ret ? ret : in_len;
+}
+
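+/*
+ * Translate one userspace flow spec into its kernel form: validate the
+ * declared size for the spec type, then copy the value/mask filter pair.
+ */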
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+                               union ib_flow_spec *ib_spec)
+{
+       if (kern_spec->reserved)
+               return -EINVAL;
+
+       ib_spec->type = kern_spec->type;
+
+       switch (ib_spec->type) {
+       case IB_FLOW_SPEC_ETH:
+               ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+               if (ib_spec->eth.size != kern_spec->eth.size)
+                       return -EINVAL;
+               memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+                      sizeof(struct ib_flow_eth_filter));
+               memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+                      sizeof(struct ib_flow_eth_filter));
+               break;
+       case IB_FLOW_SPEC_IPV4:
+               ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+               if (ib_spec->ipv4.size != kern_spec->ipv4.size)
+                       return -EINVAL;
+               memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+                      sizeof(struct ib_flow_ipv4_filter));
+               memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+                      sizeof(struct ib_flow_ipv4_filter));
+               break;
+       case IB_FLOW_SPEC_TCP:
+       case IB_FLOW_SPEC_UDP:
+               ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+               if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size)
+                       return -EINVAL;
+               memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+                      sizeof(struct ib_flow_tcp_udp_filter));
+               memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+                      sizeof(struct ib_flow_tcp_udp_filter));
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+                            struct ib_udata *ucore,
+                            struct ib_udata *uhw)
+{
+       struct ib_uverbs_create_flow      cmd;
+       struct ib_uverbs_create_flow_resp resp;
+       struct ib_uobject                 *uobj;
+       struct ib_flow                    *flow_id;
+       struct ib_uverbs_flow_attr        *kern_flow_attr;
+       struct ib_flow_attr               *flow_attr;
+       struct ib_qp                      *qp;
+       int err = 0;
+       void *kern_spec;
+       void *ib_spec;
+       int i;
+
+       if (ucore->inlen < sizeof(cmd))
+               return -EINVAL;
+
+       if (ucore->outlen < sizeof(resp))
+               return -ENOSPC;
+
+       err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (err)
+               return err;
+
+       ucore->inbuf += sizeof(cmd);
+       ucore->inlen -= sizeof(cmd);
+
+       if (cmd.comp_mask)
+               return -EINVAL;
+
+       if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
+            !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+               return -EPERM;
+
+       if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+               return -EINVAL;
+
+       if (cmd.flow_attr.size > ucore->inlen ||
+           cmd.flow_attr.size >
+           (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+               return -EINVAL;
+
+       if (cmd.flow_attr.reserved[0] ||
+           cmd.flow_attr.reserved[1])
+               return -EINVAL;
+
+       if (cmd.flow_attr.num_of_specs) {
+               kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+                                        GFP_KERNEL);
+               if (!kern_flow_attr)
+                       return -ENOMEM;
+
+               memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+               err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+                                        cmd.flow_attr.size);
+               if (err)
+                       goto err_free_attr;
+       } else {
+               kern_flow_attr = &cmd.flow_attr;
+       }
+
+       uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+       if (!uobj) {
+               err = -ENOMEM;
+               goto err_free_attr;
+       }
+       init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+       down_write(&uobj->mutex);
+
+       qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+       if (!qp) {
+               err = -EINVAL;
+               goto err_uobj;
+       }
+
+       flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
+       if (!flow_attr) {
+               err = -ENOMEM;
+               goto err_put;
+       }
+
+       flow_attr->type = kern_flow_attr->type;
+       flow_attr->priority = kern_flow_attr->priority;
+       flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+       flow_attr->port = kern_flow_attr->port;
+       flow_attr->flags = kern_flow_attr->flags;
+       flow_attr->size = sizeof(*flow_attr);
+
+       kern_spec = kern_flow_attr + 1;
+       ib_spec = flow_attr + 1;
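+       /*
+        * Walk the variable-length array of specs that follows the command,
+        * converting each entry and tracking how many bytes it consumed;
+        * leftover bytes or a short spec count are rejected below.
+        */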
+       for (i = 0; i < flow_attr->num_of_specs &&
+            cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+            cmd.flow_attr.size >=
+            ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
+               err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+               if (err)
+                       goto err_free;
+               flow_attr->size +=
+                       ((union ib_flow_spec *) ib_spec)->size;
+               cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+               kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+               ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+       }
+       if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+               pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+                       i, cmd.flow_attr.size);
+               err = -EINVAL;
+               goto err_free;
+       }
+       flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+       if (IS_ERR(flow_id)) {
+               err = PTR_ERR(flow_id);
+               goto err_free;
+       }
+       flow_id->qp = qp;
+       flow_id->uobject = uobj;
+       uobj->object = flow_id;
+
+       err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+       if (err)
+               goto destroy_flow;
+
+       memset(&resp, 0, sizeof(resp));
+       resp.flow_handle = uobj->id;
+
+       err = ib_copy_to_udata(ucore,
+                              &resp, sizeof(resp));
+       if (err)
+               goto err_copy;
+
+       put_qp_read(qp);
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->rule_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+       kfree(flow_attr);
+       if (cmd.flow_attr.num_of_specs)
+               kfree(kern_flow_attr);
+       return 0;
+err_copy:
+       idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+       ib_destroy_flow(flow_id);
+err_free:
+       kfree(flow_attr);
+err_put:
+       put_qp_read(qp);
+err_uobj:
+       put_uobj_write(uobj);
+err_free_attr:
+       if (cmd.flow_attr.num_of_specs)
+               kfree(kern_flow_attr);
+       return err;
+}
+
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+                             struct ib_udata *ucore,
+                             struct ib_udata *uhw)
+{
+       struct ib_uverbs_destroy_flow   cmd;
+       struct ib_flow                  *flow_id;
+       struct ib_uobject               *uobj;
+       int                             ret;
+
+       if (ucore->inlen < sizeof(cmd))
+               return -EINVAL;
+
+       ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (ret)
+               return ret;
+
+       if (cmd.comp_mask)
+               return -EINVAL;
+
+       uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+       flow_id = uobj->object;
+
+       ret = ib_destroy_flow(flow_id);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+
+       return ret;
+}
+
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+                               struct ib_uverbs_create_xsrq *cmd,
+                               struct ib_udata *udata)
+{
+       struct ib_uverbs_create_srq_resp resp;
+       struct ib_usrq_object           *obj;
+       struct ib_pd                    *pd;
+       struct ib_srq                   *srq;
+       struct ib_uobject               *uninitialized_var(xrcd_uobj);
+       struct ib_srq_init_attr          attr;
+       int ret;
+
+       obj = kmalloc(sizeof *obj, GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+       down_write(&obj->uevent.uobject.mutex);
+
+       if (cmd->srq_type == IB_SRQT_XRC) {
+               attr.ext.xrc.xrcd  = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+               if (!attr.ext.xrc.xrcd) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+               atomic_inc(&obj->uxrcd->refcnt);
+
+               attr.ext.xrc.cq  = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+               if (!attr.ext.xrc.cq) {
+                       ret = -EINVAL;
+                       goto err_put_xrcd;
+               }
+       }
+
+       pd  = idr_read_pd(cmd->pd_handle, file->ucontext);
+       if (!pd) {
+               ret = -EINVAL;
+               goto err_put_cq;
+       }
+
+       attr.event_handler  = ib_uverbs_srq_event_handler;
+       attr.srq_context    = file;
+       attr.srq_type       = cmd->srq_type;
+       attr.attr.max_wr    = cmd->max_wr;
+       attr.attr.max_sge   = cmd->max_sge;
+       attr.attr.srq_limit = cmd->srq_limit;
+
+       obj->uevent.events_reported = 0;
+       INIT_LIST_HEAD(&obj->uevent.event_list);
+
+       srq = pd->device->create_srq(pd, &attr, udata);
+       if (IS_ERR(srq)) {
+               ret = PTR_ERR(srq);
+               goto err_put;
+       }
+
+       srq->device        = pd->device;
+       srq->pd            = pd;
+       srq->srq_type      = cmd->srq_type;
+       srq->uobject       = &obj->uevent.uobject;
+       srq->event_handler = attr.event_handler;
+       srq->srq_context   = attr.srq_context;
+
+       if (cmd->srq_type == IB_SRQT_XRC) {
+               srq->ext.xrc.cq   = attr.ext.xrc.cq;
+               srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+               atomic_inc(&attr.ext.xrc.cq->usecnt);
+               atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+       }
+
+       atomic_inc(&pd->usecnt);
+       atomic_set(&srq->usecnt, 0);
+
+       obj->uevent.uobject.object = srq;
+       ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+       if (ret)
+               goto err_destroy;
+
+       memset(&resp, 0, sizeof resp);
+       resp.srq_handle = obj->uevent.uobject.id;
+       resp.max_wr     = attr.attr.max_wr;
+       resp.max_sge    = attr.attr.max_sge;
+       if (cmd->srq_type == IB_SRQT_XRC)
+               resp.srqn = srq->ext.xrc.srq_num;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd->response,
+                        &resp, sizeof resp)) {
+               ret = -EFAULT;
+               goto err_copy;
+       }
+
+       if (cmd->srq_type == IB_SRQT_XRC) {
+               put_uobj_read(xrcd_uobj);
+               put_cq_read(attr.ext.xrc.cq);
+       }
+       put_pd_read(pd);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uevent.uobject.live = 1;
+
+       up_write(&obj->uevent.uobject.mutex);
+
+       return 0;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+
+err_destroy:
+       ib_destroy_srq(srq);
+
+err_put:
+       put_pd_read(pd);
+
+err_put_cq:
+       if (cmd->srq_type == IB_SRQT_XRC)
+               put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+       if (cmd->srq_type == IB_SRQT_XRC) {
+               atomic_dec(&obj->uxrcd->refcnt);
+               put_uobj_read(xrcd_uobj);
+       }
+
+err:
+       put_uobj_write(&obj->uevent.uobject);
+       return ret;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       struct ib_uverbs_create_srq      cmd;
+       struct ib_uverbs_create_xsrq     xcmd;
+       struct ib_uverbs_create_srq_resp resp;
+       struct ib_udata                  udata;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       xcmd.response    = cmd.response;
+       xcmd.user_handle = cmd.user_handle;
+       xcmd.srq_type    = IB_SRQT_BASIC;
+       xcmd.pd_handle   = cmd.pd_handle;
+       xcmd.max_wr      = cmd.max_wr;
+       xcmd.max_sge     = cmd.max_sge;
+       xcmd.srq_limit   = cmd.srq_limit;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+       if (ret)
+               return ret;
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+                             const char __user *buf, int in_len, int out_len)
+{
+       struct ib_uverbs_create_xsrq     cmd;
+       struct ib_uverbs_create_srq_resp resp;
+       struct ib_udata                  udata;
+       int ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd,
+                  (unsigned long) cmd.response + sizeof resp,
+                  in_len - sizeof cmd, out_len - sizeof resp);
+
+       ret = __uverbs_create_xsrq(file, &cmd, &udata);
+       if (ret)
+               return ret;
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+                            const char __user *buf, int in_len,
+                            int out_len)
+{
+       struct ib_uverbs_modify_srq cmd;
+       struct ib_udata             udata;
+       struct ib_srq              *srq;
+       struct ib_srq_attr          attr;
+       int                         ret;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+                  out_len);
+
+       srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+       if (!srq)
+               return -EINVAL;
+
+       attr.max_wr    = cmd.max_wr;
+       attr.srq_limit = cmd.srq_limit;
+
+       ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+       put_srq_read(srq);
+
+       return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+                           const char __user *buf,
+                           int in_len, int out_len)
+{
+       struct ib_uverbs_query_srq      cmd;
+       struct ib_uverbs_query_srq_resp resp;
+       struct ib_srq_attr              attr;
+       struct ib_srq                   *srq;
+       int                             ret;
+
+       if (out_len < sizeof resp)
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+       if (!srq)
+               return -EINVAL;
+
+       ret = ib_query_srq(srq, &attr);
+
+       put_srq_read(srq);
+
+       if (ret)
+               return ret;
+
+       memset(&resp, 0, sizeof resp);
+
+       resp.max_wr    = attr.max_wr;
+       resp.max_sge   = attr.max_sge;
+       resp.srq_limit = attr.srq_limit;
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               return -EFAULT;
+
+       return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+                             const char __user *buf, int in_len,
+                             int out_len)
+{
+       struct ib_uverbs_destroy_srq      cmd;
+       struct ib_uverbs_destroy_srq_resp resp;
+       struct ib_uobject                *uobj;
+       struct ib_srq                    *srq;
+       struct ib_uevent_object          *obj;
+       int                               ret = -EINVAL;
+       struct ib_usrq_object            *us;
+       enum ib_srq_type                  srq_type;
+
+       if (copy_from_user(&cmd, buf, sizeof cmd))
+               return -EFAULT;
+
+       uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+       srq = uobj->object;
+       obj = container_of(uobj, struct ib_uevent_object, uobject);
+       srq_type = srq->srq_type;
+
+       ret = ib_destroy_srq(srq);
+       if (!ret)
+               uobj->live = 0;
+
+       put_uobj_write(uobj);
+
+       if (ret)
+               return ret;
+
+       if (srq_type == IB_SRQT_XRC) {
+               us = container_of(obj, struct ib_usrq_object, uevent);
+               atomic_dec(&us->uxrcd->refcnt);
+       }
+
+       idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       ib_uverbs_release_uevent(file, obj);
+
+       memset(&resp, 0, sizeof resp);
+       resp.events_reported = obj->events_reported;
+
+       put_uobj(uobj);
+
+       if (copy_to_user((void __user *) (unsigned long) cmd.response,
+                        &resp, sizeof resp))
+               ret = -EFAULT;
+
+       return ret ? ret : in_len;
+}
+
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+                             struct ib_udata *ucore,
+                             struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_query_device_resp resp;
+       struct ib_uverbs_ex_query_device  cmd;
+       struct ib_device_attr attr;
+       struct ib_device *device;
+       int err;
+
+       device = file->device->ib_dev;
+       if (ucore->inlen < sizeof(cmd))
+               return -EINVAL;
+
+       err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (err)
+               return err;
+
+       if (cmd.comp_mask)
+               return -EINVAL;
+
+       if (cmd.reserved)
+               return -EINVAL;
+
+       resp.response_length = offsetof(typeof(resp), odp_caps);
+
+       if (ucore->outlen < resp.response_length)
+               return -ENOSPC;
+
+       err = device->query_device(device, &attr);
+       if (err)
+               return err;
+
+       copy_query_dev_fields(file, &resp.base, &attr);
+       resp.comp_mask = 0;
+
+       if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
+               goto end;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+       resp.odp_caps.per_transport_caps.rc_odp_caps =
+               attr.odp_caps.per_transport_caps.rc_odp_caps;
+       resp.odp_caps.per_transport_caps.uc_odp_caps =
+               attr.odp_caps.per_transport_caps.uc_odp_caps;
+       resp.odp_caps.per_transport_caps.ud_odp_caps =
+               attr.odp_caps.per_transport_caps.ud_odp_caps;
+       resp.odp_caps.reserved = 0;
+#else
+       memset(&resp.odp_caps, 0, sizeof(resp.odp_caps));
+#endif
+       resp.response_length += sizeof(resp.odp_caps);
+
+end:
+       err = ib_copy_to_udata(ucore, &resp, resp.response_length);
+       if (err)
+               return err;
+
+       return 0;
+}
diff --git a/kern/drivers/net/udrvr/uverbs_main.c b/kern/drivers/net/udrvr/uverbs_main.c
new file mode 100644 (file)
index 0000000..b61e5a4
--- /dev/null
@@ -0,0 +1,1143 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if 0  /* AKAROS */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#endif /* AKAROS */
+
+#include "uverbs.h"
+
+#if 1  /* AKAROS */
+
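+/*
+ * Akaros compatibility shims: event-queue wakeups, kobject/device/module
+ * lifetime calls and chrdev allocation collapse to no-ops or constants,
+ * and the kref macros are remapped onto the Akaros kref API.
+ */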
+#define        wake_up_interruptible(p)
+#define        kill_fasync(p, a, b)
+
+#define        kref_init(p)    kref_init(p, ib_uverbs_release_file, 1)
+#define        kref_put(p, f)  kref_put(p)
+#define        kref_get(p)     kref_get(p, 1)
+#define        put_pid(p)
+
+#define        kobject_init(a, b)
+
+#define        device_destroy(c, p)
+#define        device_create_file(d, p)        0
+#define        class_create_file(a, b)         0
+#define        try_module_get(p)               1
+#define        module_put(p)
+#define        kobject_get(p)
+#define        kobject_put(p)
+#define        nonseekable_open(a, b)         0
+
+#define        MKDEV(a, b)                     0
+#define        alloc_chrdev_region(a, b, c, d) -1
+
+#endif /* AKAROS */
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+       IB_UVERBS_MAJOR       = 231,
+       IB_UVERBS_BASE_MINOR  = 192,
+       IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV     MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
+
+static DEFINE_SPINLOCK(map_lock);
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+                                    const char __user *buf, int in_len,
+                                    int out_len) = {
+       [IB_USER_VERBS_CMD_GET_CONTEXT]         = ib_uverbs_get_context,
+       [IB_USER_VERBS_CMD_QUERY_DEVICE]        = ib_uverbs_query_device,
+       [IB_USER_VERBS_CMD_QUERY_PORT]          = ib_uverbs_query_port,
+       [IB_USER_VERBS_CMD_ALLOC_PD]            = ib_uverbs_alloc_pd,
+       [IB_USER_VERBS_CMD_DEALLOC_PD]          = ib_uverbs_dealloc_pd,
+       [IB_USER_VERBS_CMD_REG_MR]              = ib_uverbs_reg_mr,
+       [IB_USER_VERBS_CMD_REREG_MR]            = ib_uverbs_rereg_mr,
+       [IB_USER_VERBS_CMD_DEREG_MR]            = ib_uverbs_dereg_mr,
+       [IB_USER_VERBS_CMD_ALLOC_MW]            = ib_uverbs_alloc_mw,
+       [IB_USER_VERBS_CMD_DEALLOC_MW]          = ib_uverbs_dealloc_mw,
+       [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+       [IB_USER_VERBS_CMD_CREATE_CQ]           = ib_uverbs_create_cq,
+       [IB_USER_VERBS_CMD_RESIZE_CQ]           = ib_uverbs_resize_cq,
+       [IB_USER_VERBS_CMD_POLL_CQ]             = ib_uverbs_poll_cq,
+       [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]       = ib_uverbs_req_notify_cq,
+       [IB_USER_VERBS_CMD_DESTROY_CQ]          = ib_uverbs_destroy_cq,
+       [IB_USER_VERBS_CMD_CREATE_QP]           = ib_uverbs_create_qp,
+       [IB_USER_VERBS_CMD_QUERY_QP]            = ib_uverbs_query_qp,
+       [IB_USER_VERBS_CMD_MODIFY_QP]           = ib_uverbs_modify_qp,
+       [IB_USER_VERBS_CMD_DESTROY_QP]          = ib_uverbs_destroy_qp,
+       [IB_USER_VERBS_CMD_POST_SEND]           = ib_uverbs_post_send,
+       [IB_USER_VERBS_CMD_POST_RECV]           = ib_uverbs_post_recv,
+       [IB_USER_VERBS_CMD_POST_SRQ_RECV]       = ib_uverbs_post_srq_recv,
+       [IB_USER_VERBS_CMD_CREATE_AH]           = ib_uverbs_create_ah,
+       [IB_USER_VERBS_CMD_DESTROY_AH]          = ib_uverbs_destroy_ah,
+       [IB_USER_VERBS_CMD_ATTACH_MCAST]        = ib_uverbs_attach_mcast,
+       [IB_USER_VERBS_CMD_DETACH_MCAST]        = ib_uverbs_detach_mcast,
+       [IB_USER_VERBS_CMD_CREATE_SRQ]          = ib_uverbs_create_srq,
+       [IB_USER_VERBS_CMD_MODIFY_SRQ]          = ib_uverbs_modify_srq,
+       [IB_USER_VERBS_CMD_QUERY_SRQ]           = ib_uverbs_query_srq,
+       [IB_USER_VERBS_CMD_DESTROY_SRQ]         = ib_uverbs_destroy_srq,
+       [IB_USER_VERBS_CMD_OPEN_XRCD]           = ib_uverbs_open_xrcd,
+       [IB_USER_VERBS_CMD_CLOSE_XRCD]          = ib_uverbs_close_xrcd,
+       [IB_USER_VERBS_CMD_CREATE_XSRQ]         = ib_uverbs_create_xsrq,
+       [IB_USER_VERBS_CMD_OPEN_QP]             = ib_uverbs_open_qp,
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+                                   struct ib_udata *ucore,
+                                   struct ib_udata *uhw) = {
+       [IB_USER_VERBS_EX_CMD_CREATE_FLOW]      = ib_uverbs_ex_create_flow,
+       [IB_USER_VERBS_EX_CMD_DESTROY_FLOW]     = ib_uverbs_ex_destroy_flow,
+       [IB_USER_VERBS_EX_CMD_QUERY_DEVICE]     = ib_uverbs_ex_query_device,
+};
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static void ib_uverbs_release_dev(struct kobject *kobj)
+{
+       struct ib_uverbs_device *dev =
+               container_of(kobj, struct ib_uverbs_device, kobj);
+
+       kfree(dev);
+}
+
+#if 0  /* AKAROS */
+static struct kobj_type ib_uverbs_dev_ktype = {
+       .release = ib_uverbs_release_dev,
+};
+#endif /* AKAROS */
+
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+       struct ib_uverbs_event_file *file =
+               container_of(ref, struct ib_uverbs_event_file, ref);
+
+       kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+                         struct ib_uverbs_event_file *ev_file,
+                         struct ib_ucq_object *uobj)
+{
+       struct ib_uverbs_event *evt, *tmp;
+
+       if (ev_file) {
+               spin_lock_irq(&ev_file->lock);
+               list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+                       list_del(&evt->list);
+                       kfree(evt);
+               }
+               spin_unlock_irq(&ev_file->lock);
+
+               kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+       }
+
+#if 0  /* AKAROS */
+       spin_lock_irq(&file->async_file->lock);
+       list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+               list_del(&evt->list);
+               kfree(evt);
+       }
+       spin_unlock_irq(&file->async_file->lock);
+#endif /* AKAROS */
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+                             struct ib_uevent_object *uobj)
+{
+#if 0  /* AKAROS */
+       struct ib_uverbs_event *evt, *tmp;
+
+       spin_lock_irq(&file->async_file->lock);
+       list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+               list_del(&evt->list);
+               kfree(evt);
+       }
+       spin_unlock_irq(&file->async_file->lock);
+#endif /* AKAROS */
+}
+
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+                                   struct ib_uqp_object *uobj)
+{
+       struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+       list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+               ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+               list_del(&mcast->list);
+               kfree(mcast);
+       }
+}
+
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+                                     struct ib_ucontext *context)
+{
+       struct ib_uobject *uobj, *tmp;
+
+       if (!context)
+               return 0;
+
+       context->closing = 1;
+
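+       /*
+        * Release every object still attached to the context, walking the
+        * lists in dependency order: AHs and MWs first, QPs before the CQs
+        * they reference, and PDs last.
+        */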
+       list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+               struct ib_ah *ah = uobj->object;
+
+               idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+               ib_destroy_ah(ah);
+               kfree(uobj);
+       }
+
+       /* Remove MWs before QPs, in order to support type 2A MWs. */
+       list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) {
+               struct ib_mw *mw = uobj->object;
+
+               idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+               ib_dealloc_mw(mw);
+               kfree(uobj);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+               struct ib_flow *flow_id = uobj->object;
+
+               idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+               ib_destroy_flow(flow_id);
+               kfree(uobj);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+               struct ib_qp *qp = uobj->object;
+               struct ib_uqp_object *uqp =
+                       container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+               idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+               if (qp != qp->real_qp) {
+                       ib_close_qp(qp);
+               } else {
+                       ib_uverbs_detach_umcast(qp, uqp);
+                       ib_destroy_qp(qp);
+               }
+               ib_uverbs_release_uevent(file, &uqp->uevent);
+               kfree(uqp);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+               struct ib_srq *srq = uobj->object;
+               struct ib_uevent_object *uevent =
+                       container_of(uobj, struct ib_uevent_object, uobject);
+
+               idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+               ib_destroy_srq(srq);
+               ib_uverbs_release_uevent(file, uevent);
+               kfree(uevent);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+               struct ib_cq *cq = uobj->object;
+               struct ib_uverbs_event_file *ev_file = cq->cq_context;
+               struct ib_ucq_object *ucq =
+                       container_of(uobj, struct ib_ucq_object, uobject);
+
+               idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+               ib_destroy_cq(cq);
+               ib_uverbs_release_ucq(file, ev_file, ucq);
+               kfree(ucq);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+               struct ib_mr *mr = uobj->object;
+
+               idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+               ib_dereg_mr(mr);
+               kfree(uobj);
+       }
+
+       mutex_lock(&file->device->xrcd_tree_mutex);
+       list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
+               struct ib_xrcd *xrcd = uobj->object;
+               struct ib_uxrcd_object *uxrcd =
+                       container_of(uobj, struct ib_uxrcd_object, uobject);
+
+               idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+               ib_uverbs_dealloc_xrcd(file->device, xrcd);
+               kfree(uxrcd);
+       }
+       mutex_unlock(&file->device->xrcd_tree_mutex);
+
+       list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+               struct ib_pd *pd = uobj->object;
+
+               idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+               ib_dealloc_pd(pd);
+               kfree(uobj);
+       }
+
+       put_pid(context->tgid);
+
+       return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+       complete(&dev->comp);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+       struct ib_uverbs_file *file =
+               container_of(ref, struct ib_uverbs_file, ref);
+
+       module_put(file->device->ib_dev->owner);
+       if (atomic_dec_and_test(&file->device->refcount))
+               ib_uverbs_comp_dev(file->device);
+
+       kfree(file);
+}
+
+#if 0  /* AKAROS */
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+                                   size_t count, loff_t *pos)
+{
+       struct ib_uverbs_event_file *file = filp->private_data;
+       struct ib_uverbs_event *event;
+       int eventsz;
+       int ret = 0;
+
+       spin_lock_irq(&file->lock);
+
+       while (list_empty(&file->event_list)) {
+               spin_unlock_irq(&file->lock);
+
+               if (filp->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               if (wait_event_interruptible(file->poll_wait,
+                                            !list_empty(&file->event_list)))
+                       return -ERESTARTSYS;
+
+               spin_lock_irq(&file->lock);
+       }
+
+       event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+       if (file->is_async)
+               eventsz = sizeof (struct ib_uverbs_async_event_desc);
+       else
+               eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+       if (eventsz > count) {
+               ret   = -EINVAL;
+               event = NULL;
+       } else {
+               list_del(file->event_list.next);
+               if (event->counter) {
+                       ++(*event->counter);
+                       list_del(&event->obj_list);
+               }
+       }
+
+       spin_unlock_irq(&file->lock);
+
+       if (event) {
+               if (copy_to_user(buf, event, eventsz))
+                       ret = -EFAULT;
+               else
+                       ret = eventsz;
+       }
+
+       kfree(event);
+
+       return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+                                        struct poll_table_struct *wait)
+{
+       unsigned int pollflags = 0;
+       struct ib_uverbs_event_file *file = filp->private_data;
+
+       poll_wait(filp, &file->poll_wait, wait);
+
+       spin_lock_irq(&file->lock);
+       if (!list_empty(&file->event_list))
+               pollflags = POLLIN | POLLRDNORM;
+       spin_unlock_irq(&file->lock);
+
+       return pollflags;
+}
+
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+       struct ib_uverbs_event_file *file = filp->private_data;
+
+       return fasync_helper(fd, filp, on, &file->async_queue);
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+       struct ib_uverbs_event_file *file = filp->private_data;
+       struct ib_uverbs_event *entry, *tmp;
+
+       spin_lock_irq(&file->lock);
+       file->is_closed = 1;
+       list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
+               if (entry->counter)
+                       list_del(&entry->obj_list);
+               kfree(entry);
+       }
+       spin_unlock_irq(&file->lock);
+
+       if (file->is_async) {
+               ib_unregister_event_handler(&file->uverbs_file->event_handler);
+               kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+       }
+       kref_put(&file->ref, ib_uverbs_release_event_file);
+
+       return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {
+       .owner   = THIS_MODULE,
+       .read    = ib_uverbs_event_read,
+       .poll    = ib_uverbs_event_poll,
+       .release = ib_uverbs_event_close,
+       .fasync  = ib_uverbs_event_fasync,
+       .llseek  = no_llseek,
+};
+
+#endif /* AKAROS */
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+       struct ib_uverbs_event_file    *file = cq_context;
+       struct ib_ucq_object           *uobj;
+       struct ib_uverbs_event         *entry;
+       unsigned long                   flags;
+
+       if (!file)
+               return;
+
+       spin_lock_irqsave(&file->lock, flags);
+       if (file->is_closed) {
+               spin_unlock_irqrestore(&file->lock, flags);
+               return;
+       }
+
+       entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+       if (!entry) {
+               spin_unlock_irqrestore(&file->lock, flags);
+               return;
+       }
+
+       uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+       entry->desc.comp.cq_handle = cq->uobject->user_handle;
+       entry->counter             = &uobj->comp_events_reported;
+
+       list_add_tail(&entry->list, &file->event_list);
+       list_add_tail(&entry->obj_list, &uobj->comp_list);
+       spin_unlock_irqrestore(&file->lock, flags);
+
+       wake_up_interruptible(&file->poll_wait);
+       kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+                                   __u64 element, __u64 event,
+                                   struct list_head *obj_list,
+                                   u32 *counter)
+{
+       struct ib_uverbs_event *entry;
+       unsigned long flags;
+
+       spin_lock_irqsave(&file->async_file->lock, flags);
+       if (file->async_file->is_closed) {
+               spin_unlock_irqrestore(&file->async_file->lock, flags);
+               return;
+       }
+
+       entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+       if (!entry) {
+               spin_unlock_irqrestore(&file->async_file->lock, flags);
+               return;
+       }
+
+       entry->desc.async.element    = element;
+       entry->desc.async.event_type = event;
+       entry->desc.async.reserved   = 0;
+       entry->counter               = counter;
+
+       list_add_tail(&entry->list, &file->async_file->event_list);
+       if (obj_list)
+               list_add_tail(&entry->obj_list, obj_list);
+       spin_unlock_irqrestore(&file->async_file->lock, flags);
+
+       wake_up_interruptible(&file->async_file->poll_wait);
+       kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+       struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+                                                 struct ib_ucq_object, uobject);
+
+       ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle,
+                               event->event, &uobj->async_list,
+                               &uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+       struct ib_uevent_object *uobj;
+
+       /* for XRC target qp's, check that qp is live */
+       if (!event->element.qp->uobject || !event->element.qp->uobject->live)
+               return;
+
+       uobj = container_of(event->element.qp->uobject,
+                           struct ib_uevent_object, uobject);
+
+       ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+                               event->event, &uobj->event_list,
+                               &uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+       struct ib_uevent_object *uobj;
+
+       uobj = container_of(event->element.srq->uobject,
+                           struct ib_uevent_object, uobject);
+
+       ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+                               event->event, &uobj->event_list,
+                               &uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+                            struct ib_event *event)
+{
+       struct ib_uverbs_file *file =
+               container_of(handler, struct ib_uverbs_file, event_handler);
+
+       ib_uverbs_async_handler(file, event->element.port_num, event->event,
+                               NULL, NULL);
+}
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+                                       int is_async)
+{
+#if 0  /* AKAROS */
+       struct ib_uverbs_event_file *ev_file;
+       struct file *filp;
+
+       ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+       if (!ev_file)
+               return ERR_PTR(-ENOMEM);
+
+       kref_init(&ev_file->ref);
+       spin_lock_init(&ev_file->lock);
+       INIT_LIST_HEAD(&ev_file->event_list);
+       init_waitqueue_head(&ev_file->poll_wait);
+       ev_file->uverbs_file = uverbs_file;
+       ev_file->async_queue = NULL;
+       ev_file->is_async    = is_async;
+       ev_file->is_closed   = 0;
+
+       filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
+                                 ev_file, O_RDONLY);
+       if (IS_ERR(filp))
+               kfree(ev_file);
+
+       return filp;
+#else  /* AKAROS */
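+       /* Event files are VFS-based and unsupported in this port (see README). */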
+       BUG_ON(1);
+       return NULL;
+#endif /* AKAROS */
+}
+
+/*
+ * Look up a completion event file by FD.  If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+       struct ib_uverbs_event_file *ev_file = NULL;
+#if 0  /* AKAROS */
+       struct fd f = fdget(fd);
+
+       if (!f.file)
+               return NULL;
+
+       if (f.file->f_op != &uverbs_event_fops)
+               goto out;
+
+       ev_file = f.file->private_data;
+       if (ev_file->is_async) {
+               ev_file = NULL;
+               goto out;
+       }
+
+       kref_get(&ev_file->ref);
+
+out:
+       fdput(f);
+#else  /* AKAROS */
+       BUG_ON(1);
+#endif /* AKAROS */
+       return ev_file;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+                            size_t count, loff_t *pos)
+{
+       struct ib_uverbs_file *file = filp->private_data;
+       struct ib_uverbs_cmd_hdr hdr;
+       __u32 flags;
+
+       if (count < sizeof hdr)
+               return -EINVAL;
+
+       if (copy_from_user(&hdr, buf, sizeof hdr))
+               return -EFAULT;
+
+#if 1  /* AKAROS */
+       ssize_t ret;
+
+       if ((ret = check_old_abi(filp, buf, count)))
+               return ret;
+#endif /* AKAROS */
+
+       flags = (hdr.command &
+                IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
+
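+       /*
+        * Legacy commands size their payloads in 4-byte words; extended
+        * commands use 8-byte words plus a second header carrying the
+        * provider-specific lengths.
+        */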
+       if (!flags) {
+               __u32 command;
+
+               if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+                                          IB_USER_VERBS_CMD_COMMAND_MASK))
+                       return -EINVAL;
+
+               command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+               if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+                   !uverbs_cmd_table[command])
+                       return -EINVAL;
+
+               if (!file->ucontext &&
+                   command != IB_USER_VERBS_CMD_GET_CONTEXT)
+                       return -EINVAL;
+
+               if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+                       return -ENOSYS;
+
+               if (hdr.in_words * 4 != count)
+                       return -EINVAL;
+
+               return uverbs_cmd_table[command](file,
+                                                buf + sizeof(hdr),
+                                                hdr.in_words * 4,
+                                                hdr.out_words * 4);
+
+       } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+               __u32 command;
+
+               struct ib_uverbs_ex_cmd_hdr ex_hdr;
+               struct ib_udata ucore;
+               struct ib_udata uhw;
+               int err;
+               size_t written_count = count;
+
+               if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+                                          IB_USER_VERBS_CMD_COMMAND_MASK))
+                       return -EINVAL;
+
+               command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+               if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+                   !uverbs_ex_cmd_table[command])
+                       return -ENOSYS;
+
+               if (!file->ucontext)
+                       return -EINVAL;
+
+               if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+                       return -ENOSYS;
+
+               if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+                       return -EINVAL;
+
+               if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+                       return -EFAULT;
+
+               count -= sizeof(hdr) + sizeof(ex_hdr);
+               buf += sizeof(hdr) + sizeof(ex_hdr);
+
+               if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+                       return -EINVAL;
+
+               if (ex_hdr.cmd_hdr_reserved)
+                       return -EINVAL;
+
+               if (ex_hdr.response) {
+                       if (!hdr.out_words && !ex_hdr.provider_out_words)
+                               return -EINVAL;
+
+                       if (!access_ok(VERIFY_WRITE,
+                                      (void __user *) (unsigned long) ex_hdr.response,
+                                      (hdr.out_words + ex_hdr.provider_out_words) * 8))
+                               return -EFAULT;
+               } else {
+                       if (hdr.out_words || ex_hdr.provider_out_words)
+                               return -EINVAL;
+               }
+
+               INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
+                                      hdr.in_words * 8, hdr.out_words * 8);
+
+               INIT_UDATA_BUF_OR_NULL(&uhw,
+                                      buf + ucore.inlen,
+                                      (unsigned long) ex_hdr.response + ucore.outlen,
+                                      ex_hdr.provider_in_words * 8,
+                                      ex_hdr.provider_out_words * 8);
+
+               err = uverbs_ex_cmd_table[command](file,
+                                                  &ucore,
+                                                  &uhw);
+
+               if (err)
+                       return err;
+
+               return written_count;
+       }
+
+       return -ENOSYS;
+}
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       struct ib_uverbs_file *file = filp->private_data;
+
+       if (!file->ucontext)
+               return -ENODEV;
+       else
+               return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ *  - the ib_uverbs_device structures are properly reference counted and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - there is no ioctl method to race against;
+ *  - the open method will either immediately fail with -ENXIO, or all
+ *    required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+       struct ib_uverbs_device *dev;
+       struct ib_uverbs_file *file;
+       int ret;
+
+#if 0  /* AKAROS */
+       dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+#else  /* AKAROS */
+       dev = get_fs_info(filp);
+#endif /* AKAROS */
+       if (!atomic_inc_not_zero(&dev->refcount))
+               return -ENXIO;
+
+       if (!try_module_get(dev->ib_dev->owner)) {
+               ret = -ENODEV;
+               goto err;
+       }
+
+       file = kmalloc(sizeof *file, GFP_KERNEL);
+       if (!file) {
+               ret = -ENOMEM;
+               goto err_module;
+       }
+
+       file->device     = dev;
+       file->ucontext   = NULL;
+       file->async_file = NULL;
+       kref_init(&file->ref);
+       mutex_init(&file->mutex);
+
+       filp->private_data = file;
+       kobject_get(&dev->kobj);
+
+       return nonseekable_open(inode, filp);
+
+err_module:
+       module_put(dev->ib_dev->owner);
+
+err:
+       if (atomic_dec_and_test(&dev->refcount))
+               ib_uverbs_comp_dev(dev);
+
+       return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+       struct ib_uverbs_file *file = filp->private_data;
+       struct ib_uverbs_device *dev = file->device;
+
+       ib_uverbs_cleanup_ucontext(file, file->ucontext);
+
+       if (file->async_file)
+               kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+
+       kref_put(&file->ref, ib_uverbs_release_file);
+       kobject_put(&dev->kobj);
+
+       return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+#if 0  /* AKAROS */
+       .owner   = THIS_MODULE,
+#endif /* AKAROS */
+       .write   = ib_uverbs_write,
+       .open    = ib_uverbs_open,
+       .release = ib_uverbs_close,
+#if 0  /* AKAROS */
+       .llseek  = no_llseek,
+#endif /* AKAROS */
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+#if 0  /* AKAROS */
+       .owner   = THIS_MODULE,
+#endif /* AKAROS */
+       .write   = ib_uverbs_write,
+       .mmap    = ib_uverbs_mmap,
+       .open    = ib_uverbs_open,
+       .release = ib_uverbs_close,
+#if 0  /* AKAROS */
+       .llseek  = no_llseek,
+#endif /* AKAROS */
+};
+
+static struct ib_client uverbs_client = {
+       .name   = "uverbs",
+       .add    = ib_uverbs_add_one,
+       .remove = ib_uverbs_remove_one
+};
+
+#if 0  /* AKAROS */
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+                         char *buf)
+{
+       struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+       if (!dev)
+               return -ENODEV;
+
+       return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+                                   struct device_attribute *attr, char *buf)
+{
+       struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+       if (!dev)
+               return -ENODEV;
+
+       return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+                        __stringify(IB_USER_VERBS_ABI_VERSION));
+#endif /* AKAROS */
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+       int ret;
+
+       if (!overflow_maj) {
+               ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+                                         "infiniband_verbs");
+               if (ret) {
+                       printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+                       return ret;
+               }
+       }
+
+       ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+       if (ret >= IB_UVERBS_MAX_DEVICES)
+               return -1;
+
+       return ret;
+}
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+       int devnum;
+       dev_t base;
+       struct ib_uverbs_device *uverbs_dev;
+
+       if (!device->alloc_ucontext)
+               return;
+
+       uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+       if (!uverbs_dev)
+               return;
+
+       atomic_set(&uverbs_dev->refcount, 1);
+       init_completion(&uverbs_dev->comp);
+       uverbs_dev->xrcd_tree = RB_ROOT;
+       mutex_init(&uverbs_dev->xrcd_tree_mutex);
+       kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+
+       spin_lock(&map_lock);
+       devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+       if (devnum >= IB_UVERBS_MAX_DEVICES) {
+               spin_unlock(&map_lock);
+               devnum = find_overflow_devnum();
+               if (devnum < 0)
+                       goto err;
+
+               spin_lock(&map_lock);
+               uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+               base = devnum + overflow_maj;
+               set_bit(devnum, overflow_map);
+       } else {
+               uverbs_dev->devnum = devnum;
+               base = devnum + IB_UVERBS_BASE_DEV;
+               set_bit(devnum, dev_map);
+       }
+       spin_unlock(&map_lock);
+
+       uverbs_dev->ib_dev           = device;
+       uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+#if 0  /* AKAROS */
+       cdev_init(&uverbs_dev->cdev, NULL);
+       uverbs_dev->cdev.owner = THIS_MODULE;
+       uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+       uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
+       kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+       if (cdev_add(&uverbs_dev->cdev, base, 1))
+               goto err_cdev;
+
+       uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+                                       uverbs_dev->cdev.dev, uverbs_dev,
+                                       "uverbs%d", uverbs_dev->devnum);
+       if (IS_ERR(uverbs_dev->dev))
+               goto err_cdev;
+
+       if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+               goto err_class;
+       if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+               goto err_class;
+#else  /* AKAROS */
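+       /* Akaros hook: register the device node in place of the cdev/class setup above. */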
+       sysfs_create(devnum, &uverbs_mmap_fops, uverbs_dev);
+#endif /* AKAROS */
+
+       ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+       return;
+
+#if 0  /* AKAROS */
+err_class:
+       device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+
+err_cdev:
+       cdev_del(&uverbs_dev->cdev);
+       if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+               clear_bit(devnum, dev_map);
+       else
+               clear_bit(devnum, overflow_map);
+#endif /* AKAROS */
+
+err:
+       if (atomic_dec_and_test(&uverbs_dev->refcount))
+               ib_uverbs_comp_dev(uverbs_dev);
+       wait_for_completion(&uverbs_dev->comp);
+       kobject_put(&uverbs_dev->kobj);
+       return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+       struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+       if (!uverbs_dev)
+               return;
+
+#if 0  /* AKAROS */
+       dev_set_drvdata(uverbs_dev->dev, NULL);
+       device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+       cdev_del(&uverbs_dev->cdev);
+#endif /* AKAROS */
+
+       if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+               clear_bit(uverbs_dev->devnum, dev_map);
+       else
+               clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+
+       if (atomic_dec_and_test(&uverbs_dev->refcount))
+               ib_uverbs_comp_dev(uverbs_dev);
+       wait_for_completion(&uverbs_dev->comp);
+       kobject_put(&uverbs_dev->kobj);
+}
+
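+/*
+ * On Akaros this entry point is global rather than static, presumably
+ * so platform init code can call it directly; sysfs_init() stands in
+ * for the chrdev-region and class setup that is compiled out below.
+ */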
+#if 0  /* AKAROS */
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0666;
+       return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_uverbs_init(void)
+#else  /* AKAROS */
+int __init ib_uverbs_init(void)
+#endif /* AKAROS */
+{
+       int ret;
+
+#if 0  /* AKAROS */
+       ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+                                    "infiniband_verbs");
+       if (ret) {
+               printk(KERN_ERR "user_verbs: couldn't register device number\n");
+               goto out;
+       }
+
+       uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+       if (IS_ERR(uverbs_class)) {
+               ret = PTR_ERR(uverbs_class);
+               printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+               goto out_chrdev;
+       }
+
+       uverbs_class->devnode = uverbs_devnode;
+
+       ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+       if (ret) {
+               printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+               goto out_class;
+       }
+#else  /* AKAROS */
+       sysfs_init();
+#endif /* AKAROS */
+
+       ret = ib_register_client(&uverbs_client);
+       if (ret) {
+               printk(KERN_ERR "user_verbs: couldn't register client\n");
+               goto out_class;
+       }
+
+       return 0;
+
+out_class:
+#if 0  /* AKAROS */
+       class_destroy(uverbs_class);
+
+out_chrdev:
+       unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+#endif /* AKAROS */
+       return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+       ib_unregister_client(&uverbs_client);
+#if 0  /* AKAROS */
+       class_destroy(uverbs_class);
+       unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+       if (overflow_maj)
+               unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
+       idr_destroy(&ib_uverbs_pd_idr);
+       idr_destroy(&ib_uverbs_mr_idr);
+       idr_destroy(&ib_uverbs_mw_idr);
+       idr_destroy(&ib_uverbs_ah_idr);
+       idr_destroy(&ib_uverbs_cq_idr);
+       idr_destroy(&ib_uverbs_qp_idr);
+       idr_destroy(&ib_uverbs_srq_idr);
+#endif /* AKAROS */
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/kern/drivers/net/udrvr/verbs.c b/kern/drivers/net/udrvr/verbs.c
new file mode 100644 (file)
index 0000000..55b5381
--- /dev/null
@@ -0,0 +1,1460 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if 0  /* AKAROS */
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "core_priv.h"
+#else  /* AKAROS */
+#include <linux/rdma/ib_verbs.h>
+#endif /* AKAROS */
+
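+/*
+ * Rate helpers: IB expresses link speed as a multiple of the 2.5 Gbps
+ * base rate.  ib_rate_to_mult()/mult_to_ib_rate() convert between the
+ * enum and that multiplier, while ib_rate_to_mbps() returns the raw
+ * signalling rate in Mbps (hence the non-round values for the extended
+ * rates, e.g. IB_RATE_14_GBPS is really 14.0625 Gbps).
+ */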
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+       switch (rate) {
+       case IB_RATE_2_5_GBPS: return  1;
+       case IB_RATE_5_GBPS:   return  2;
+       case IB_RATE_10_GBPS:  return  4;
+       case IB_RATE_20_GBPS:  return  8;
+       case IB_RATE_30_GBPS:  return 12;
+       case IB_RATE_40_GBPS:  return 16;
+       case IB_RATE_60_GBPS:  return 24;
+       case IB_RATE_80_GBPS:  return 32;
+       case IB_RATE_120_GBPS: return 48;
+       default:               return -1;
+       }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+       switch (mult) {
+       case 1:  return IB_RATE_2_5_GBPS;
+       case 2:  return IB_RATE_5_GBPS;
+       case 4:  return IB_RATE_10_GBPS;
+       case 8:  return IB_RATE_20_GBPS;
+       case 12: return IB_RATE_30_GBPS;
+       case 16: return IB_RATE_40_GBPS;
+       case 24: return IB_RATE_60_GBPS;
+       case 32: return IB_RATE_80_GBPS;
+       case 48: return IB_RATE_120_GBPS;
+       default: return IB_RATE_PORT_CURRENT;
+       }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+       switch (rate) {
+       case IB_RATE_2_5_GBPS: return 2500;
+       case IB_RATE_5_GBPS:   return 5000;
+       case IB_RATE_10_GBPS:  return 10000;
+       case IB_RATE_20_GBPS:  return 20000;
+       case IB_RATE_30_GBPS:  return 30000;
+       case IB_RATE_40_GBPS:  return 40000;
+       case IB_RATE_60_GBPS:  return 60000;
+       case IB_RATE_80_GBPS:  return 80000;
+       case IB_RATE_120_GBPS: return 120000;
+       case IB_RATE_14_GBPS:  return 14062;
+       case IB_RATE_56_GBPS:  return 56250;
+       case IB_RATE_112_GBPS: return 112500;
+       case IB_RATE_168_GBPS: return 168750;
+       case IB_RATE_25_GBPS:  return 25781;
+       case IB_RATE_100_GBPS: return 103125;
+       case IB_RATE_200_GBPS: return 206250;
+       case IB_RATE_300_GBPS: return 309375;
+       default:               return -1;
+       }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+       switch (node_type) {
+       case RDMA_NODE_IB_CA:
+       case RDMA_NODE_IB_SWITCH:
+       case RDMA_NODE_IB_ROUTER:
+               return RDMA_TRANSPORT_IB;
+       case RDMA_NODE_RNIC:
+               return RDMA_TRANSPORT_IWARP;
+       case RDMA_NODE_USNIC:
+               return RDMA_TRANSPORT_USNIC;
+       case RDMA_NODE_USNIC_UDP:
+               return RDMA_TRANSPORT_USNIC_UDP;
+       default:
+               BUG();
+               return 0;
+       }
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+       if (device->get_link_layer)
+               return device->get_link_layer(device, port_num);
+
+       switch (rdma_node_get_transport(device->node_type)) {
+       case RDMA_TRANSPORT_IB:
+               return IB_LINK_LAYER_INFINIBAND;
+       case RDMA_TRANSPORT_IWARP:
+       case RDMA_TRANSPORT_USNIC:
+       case RDMA_TRANSPORT_USNIC_UDP:
+               return IB_LINK_LAYER_ETHERNET;
+       default:
+               return IB_LINK_LAYER_UNSPECIFIED;
+       }
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
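+/*
+ * Objects created against a PD (AHs, SRQs, QPs, MRs, MWs, FMRs) bump
+ * pd->usecnt and drop it on destroy, which is why ib_dealloc_pd()
+ * refuses with -EBUSY while children remain.  Kernel-owned objects
+ * created through these wrappers carry uobject == NULL, distinguishing
+ * them from objects created on behalf of user verbs.
+ */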
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+       struct ib_pd *pd;
+
+       pd = device->alloc_pd(device, NULL, NULL);
+
+       if (!IS_ERR(pd)) {
+               pd->device  = device;
+               pd->uobject = NULL;
+               atomic_set(&pd->usecnt, 0);
+       }
+
+       return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+       if (atomic_read(&pd->usecnt))
+               return -EBUSY;
+
+       return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+       struct ib_ah *ah;
+
+       ah = pd->device->create_ah(pd, ah_attr);
+
+       if (!IS_ERR(ah)) {
+               ah->device  = pd->device;
+               ah->pd      = pd;
+               ah->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+       }
+
+       return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
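+/*
+ * Panic-stubbed on Akaros (see README): address handles are not
+ * supported, and the upstream body depends on the cached-GID lookup
+ * and RoCE address-resolution helpers that were not ported.
+ */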
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+                      struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+#if 0  /* AKAROS */
+       u32 flow_class;
+       u16 gid_index;
+       int ret;
+       int is_eth = (rdma_port_get_link_layer(device, port_num) ==
+                       IB_LINK_LAYER_ETHERNET);
+
+       memset(ah_attr, 0, sizeof *ah_attr);
+       if (is_eth) {
+               if (!(wc->wc_flags & IB_WC_GRH))
+                       return -EPROTOTYPE;
+
+               if (wc->wc_flags & IB_WC_WITH_SMAC &&
+                   wc->wc_flags & IB_WC_WITH_VLAN) {
+                       memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+                       ah_attr->vlan_id = wc->vlan_id;
+               } else {
+                       ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+                                       ah_attr->dmac, &ah_attr->vlan_id);
+                       if (ret)
+                               return ret;
+               }
+       } else {
+               ah_attr->vlan_id = 0xffff;
+       }
+
+       ah_attr->dlid = wc->slid;
+       ah_attr->sl = wc->sl;
+       ah_attr->src_path_bits = wc->dlid_path_bits;
+       ah_attr->port_num = port_num;
+
+       if (wc->wc_flags & IB_WC_GRH) {
+               ah_attr->ah_flags = IB_AH_GRH;
+               ah_attr->grh.dgid = grh->sgid;
+
+               ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+                                        &gid_index);
+               if (ret)
+                       return ret;
+
+               ah_attr->grh.sgid_index = (u8) gid_index;
+               flow_class = be32_to_cpu(grh->version_tclass_flow);
+               ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+               ah_attr->grh.hop_limit = 0xFF;
+               ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+       }
+#else  /* AKAROS */
+       BUG_ON(1);
+#endif /* AKAROS */
+       return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+                                  struct ib_grh *grh, u8 port_num)
+{
+       struct ib_ah_attr ah_attr;
+       int ret;
+
+       ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+       if (ret)
+               return ERR_PTR(ret);
+
+       return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       return ah->device->modify_ah ?
+               ah->device->modify_ah(ah, ah_attr) :
+               -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       return ah->device->query_ah ?
+               ah->device->query_ah(ah, ah_attr) :
+               -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+       struct ib_pd *pd;
+       int ret;
+
+       pd = ah->pd;
+       ret = ah->device->destroy_ah(ah);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+                            struct ib_srq_init_attr *srq_init_attr)
+{
+       struct ib_srq *srq;
+
+       if (!pd->device->create_srq)
+               return ERR_PTR(-ENOSYS);
+
+       srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+       if (!IS_ERR(srq)) {
+               srq->device        = pd->device;
+               srq->pd            = pd;
+               srq->uobject       = NULL;
+               srq->event_handler = srq_init_attr->event_handler;
+               srq->srq_context   = srq_init_attr->srq_context;
+               srq->srq_type      = srq_init_attr->srq_type;
+               if (srq->srq_type == IB_SRQT_XRC) {
+                       srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+                       srq->ext.xrc.cq   = srq_init_attr->ext.xrc.cq;
+                       atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+                       atomic_inc(&srq->ext.xrc.cq->usecnt);
+               }
+               atomic_inc(&pd->usecnt);
+               atomic_set(&srq->usecnt, 0);
+       }
+
+       return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+                 struct ib_srq_attr *srq_attr,
+                 enum ib_srq_attr_mask srq_attr_mask)
+{
+       return srq->device->modify_srq ?
+               srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+               -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+                struct ib_srq_attr *srq_attr)
+{
+       return srq->device->query_srq ?
+               srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+       struct ib_pd *pd;
+       enum ib_srq_type srq_type;
+       struct ib_xrcd *uninitialized_var(xrcd);
+       struct ib_cq *uninitialized_var(cq);
+       int ret;
+
+       if (atomic_read(&srq->usecnt))
+               return -EBUSY;
+
+       pd = srq->pd;
+       srq_type = srq->srq_type;
+       if (srq_type == IB_SRQT_XRC) {
+               xrcd = srq->ext.xrc.xrcd;
+               cq = srq->ext.xrc.cq;
+       }
+
+       ret = srq->device->destroy_srq(srq);
+       if (!ret) {
+               atomic_dec(&pd->usecnt);
+               if (srq_type == IB_SRQT_XRC) {
+                       atomic_dec(&xrcd->usecnt);
+                       atomic_dec(&cq->usecnt);
+               }
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
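+/*
+ * XRC target QPs can be shared: ib_open_qp() hands out lightweight
+ * struct ib_qp copies whose real_qp points at the one underlying QP.
+ * The handler below fans an async event out to every opener on the
+ * real QP's open_list.
+ */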
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+       struct ib_qp *qp = context;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+       list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+               if (event->element.qp->event_handler)
+                       event->element.qp->event_handler(event, event->element.qp->qp_context);
+       spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+       mutex_lock(&xrcd->tgt_qp_mutex);
+       list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+       mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+                                 void (*event_handler)(struct ib_event *, void *),
+                                 void *qp_context)
+{
+       struct ib_qp *qp;
+       unsigned long flags;
+
+       qp = kzalloc(sizeof *qp, GFP_KERNEL);
+       if (!qp)
+               return ERR_PTR(-ENOMEM);
+
+       qp->real_qp = real_qp;
+       atomic_inc(&real_qp->usecnt);
+       qp->device = real_qp->device;
+       qp->event_handler = event_handler;
+       qp->qp_context = qp_context;
+       qp->qp_num = real_qp->qp_num;
+       qp->qp_type = real_qp->qp_type;
+
+       spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+       list_add(&qp->open_list, &real_qp->open_list);
+       spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+       return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+                        struct ib_qp_open_attr *qp_open_attr)
+{
+       struct ib_qp *qp, *real_qp;
+
+       if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+               return ERR_PTR(-EINVAL);
+
+       qp = ERR_PTR(-EINVAL);
+       mutex_lock(&xrcd->tgt_qp_mutex);
+       list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+               if (real_qp->qp_num == qp_open_attr->qp_num) {
+                       qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+                                         qp_open_attr->qp_context);
+                       break;
+               }
+       }
+       mutex_unlock(&xrcd->tgt_qp_mutex);
+       return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+                          struct ib_qp_init_attr *qp_init_attr)
+{
+       struct ib_qp *qp, *real_qp;
+       struct ib_device *device;
+
+       device = pd ? pd->device : qp_init_attr->xrcd->device;
+       qp = device->create_qp(pd, qp_init_attr, NULL);
+
+       if (!IS_ERR(qp)) {
+               qp->device     = device;
+               qp->real_qp    = qp;
+               qp->uobject    = NULL;
+               qp->qp_type    = qp_init_attr->qp_type;
+
+               atomic_set(&qp->usecnt, 0);
+               if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+                       qp->event_handler = __ib_shared_qp_event_handler;
+                       qp->qp_context = qp;
+                       qp->pd = NULL;
+                       qp->send_cq = qp->recv_cq = NULL;
+                       qp->srq = NULL;
+                       qp->xrcd = qp_init_attr->xrcd;
+                       atomic_inc(&qp_init_attr->xrcd->usecnt);
+                       INIT_LIST_HEAD(&qp->open_list);
+
+                       real_qp = qp;
+                       qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+                                         qp_init_attr->qp_context);
+                       if (!IS_ERR(qp))
+                               __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+                       else
+                               real_qp->device->destroy_qp(real_qp);
+               } else {
+                       qp->event_handler = qp_init_attr->event_handler;
+                       qp->qp_context = qp_init_attr->qp_context;
+                       if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+                               qp->recv_cq = NULL;
+                               qp->srq = NULL;
+                       } else {
+                               qp->recv_cq = qp_init_attr->recv_cq;
+                               atomic_inc(&qp_init_attr->recv_cq->usecnt);
+                               qp->srq = qp_init_attr->srq;
+                               if (qp->srq)
+                                       atomic_inc(&qp_init_attr->srq->usecnt);
+                       }
+
+                       qp->pd      = pd;
+                       qp->send_cq = qp_init_attr->send_cq;
+                       qp->xrcd    = NULL;
+
+                       atomic_inc(&pd->usecnt);
+                       atomic_inc(&qp_init_attr->send_cq->usecnt);
+               }
+       }
+
+       return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
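+/*
+ * QP state-machine table, indexed [cur_state][next_state].  For each
+ * legal transition it records, per QP type, the attribute-mask bits a
+ * caller must supply (req_param) and may supply (opt_param); the
+ * *_add_eth variants are ORed in when the port's link layer is
+ * Ethernet (RoCE).
+ */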
+static const struct {
+       int                     valid;
+       enum ib_qp_attr_mask    req_param[IB_QPT_MAX];
+       enum ib_qp_attr_mask    req_param_add_eth[IB_QPT_MAX];
+       enum ib_qp_attr_mask    opt_param[IB_QPT_MAX];
+       enum ib_qp_attr_mask    opt_param_add_eth[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+       [IB_QPS_RESET] = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_INIT]  = {
+                       .valid = 1,
+                       .req_param = {
+                               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_QKEY),
+                               [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+                               [IB_QPT_UC]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_RC]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX            |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX            |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                       }
+               },
+       },
+       [IB_QPS_INIT]  = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 },
+               [IB_QPS_INIT]  = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_QKEY),
+                               [IB_QPT_UC]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_RC]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX            |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX            |
+                                               IB_QP_PORT                      |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                       }
+               },
+               [IB_QPS_RTR]   = {
+                       .valid = 1,
+                       .req_param = {
+                               [IB_QPT_UC]  = (IB_QP_AV                        |
+                                               IB_QP_PATH_MTU                  |
+                                               IB_QP_DEST_QPN                  |
+                                               IB_QP_RQ_PSN),
+                               [IB_QPT_RC]  = (IB_QP_AV                        |
+                                               IB_QP_PATH_MTU                  |
+                                               IB_QP_DEST_QPN                  |
+                                               IB_QP_RQ_PSN                    |
+                                               IB_QP_MAX_DEST_RD_ATOMIC        |
+                                               IB_QP_MIN_RNR_TIMER),
+                               [IB_QPT_XRC_INI] = (IB_QP_AV                    |
+                                               IB_QP_PATH_MTU                  |
+                                               IB_QP_DEST_QPN                  |
+                                               IB_QP_RQ_PSN),
+                               [IB_QPT_XRC_TGT] = (IB_QP_AV                    |
+                                               IB_QP_PATH_MTU                  |
+                                               IB_QP_DEST_QPN                  |
+                                               IB_QP_RQ_PSN                    |
+                                               IB_QP_MAX_DEST_RD_ATOMIC        |
+                                               IB_QP_MIN_RNR_TIMER),
+                       },
+                       .req_param_add_eth = {
+                               [IB_QPT_RC]  = (IB_QP_SMAC),
+                               [IB_QPT_UC]  = (IB_QP_SMAC),
+                               [IB_QPT_XRC_INI]  = (IB_QP_SMAC),
+                               [IB_QPT_XRC_TGT]  = (IB_QP_SMAC)
+                       },
+                       .opt_param = {
+                                [IB_QPT_UD]  = (IB_QP_PKEY_INDEX               |
+                                                IB_QP_QKEY),
+                                [IB_QPT_UC]  = (IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PKEY_INDEX),
+                                [IB_QPT_RC]  = (IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PKEY_INDEX),
+                                [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH             |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PKEY_INDEX),
+                                [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH             |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PKEY_INDEX),
+                                [IB_QPT_SMI] = (IB_QP_PKEY_INDEX               |
+                                                IB_QP_QKEY),
+                                [IB_QPT_GSI] = (IB_QP_PKEY_INDEX               |
+                                                IB_QP_QKEY),
+                        },
+                       .opt_param_add_eth = {
+                               [IB_QPT_RC]  = (IB_QP_ALT_SMAC                  |
+                                               IB_QP_VID                       |
+                                               IB_QP_ALT_VID),
+                               [IB_QPT_UC]  = (IB_QP_ALT_SMAC                  |
+                                               IB_QP_VID                       |
+                                               IB_QP_ALT_VID),
+                               [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC              |
+                                               IB_QP_VID                       |
+                                               IB_QP_ALT_VID),
+                               [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC              |
+                                               IB_QP_VID                       |
+                                               IB_QP_ALT_VID)
+                       }
+               }
+       },
+       [IB_QPS_RTR]   = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 },
+               [IB_QPS_RTS]   = {
+                       .valid = 1,
+                       .req_param = {
+                               [IB_QPT_UD]  = IB_QP_SQ_PSN,
+                               [IB_QPT_UC]  = IB_QP_SQ_PSN,
+                               [IB_QPT_RC]  = (IB_QP_TIMEOUT                   |
+                                               IB_QP_RETRY_CNT                 |
+                                               IB_QP_RNR_RETRY                 |
+                                               IB_QP_SQ_PSN                    |
+                                               IB_QP_MAX_QP_RD_ATOMIC),
+                               [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT               |
+                                               IB_QP_RETRY_CNT                 |
+                                               IB_QP_RNR_RETRY                 |
+                                               IB_QP_SQ_PSN                    |
+                                               IB_QP_MAX_QP_RD_ATOMIC),
+                               [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT               |
+                                               IB_QP_SQ_PSN),
+                               [IB_QPT_SMI] = IB_QP_SQ_PSN,
+                               [IB_QPT_GSI] = IB_QP_SQ_PSN,
+                       },
+                       .opt_param = {
+                                [IB_QPT_UD]  = (IB_QP_CUR_STATE                |
+                                                IB_QP_QKEY),
+                                [IB_QPT_UC]  = (IB_QP_CUR_STATE                |
+                                                IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PATH_MIG_STATE),
+                                [IB_QPT_RC]  = (IB_QP_CUR_STATE                |
+                                                IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_MIN_RNR_TIMER            |
+                                                IB_QP_PATH_MIG_STATE),
+                                [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE            |
+                                                IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_PATH_MIG_STATE),
+                                [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE            |
+                                                IB_QP_ALT_PATH                 |
+                                                IB_QP_ACCESS_FLAGS             |
+                                                IB_QP_MIN_RNR_TIMER            |
+                                                IB_QP_PATH_MIG_STATE),
+                                [IB_QPT_SMI] = (IB_QP_CUR_STATE                |
+                                                IB_QP_QKEY),
+                                [IB_QPT_GSI] = (IB_QP_CUR_STATE                |
+                                                IB_QP_QKEY),
+                        }
+               }
+       },
+       [IB_QPS_RTS]   = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 },
+               [IB_QPS_RTS]   = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_UC]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_RC]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_PATH_MIG_STATE            |
+                                               IB_QP_MIN_RNR_TIMER),
+                               [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE             |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE             |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_PATH_MIG_STATE            |
+                                               IB_QP_MIN_RNR_TIMER),
+                               [IB_QPT_SMI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                       }
+               },
+               [IB_QPS_SQD]   = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+                               [IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+                               [IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+                               [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+                               [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+                               [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+                               [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+                       }
+               },
+       },
+       [IB_QPS_SQD]   = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 },
+               [IB_QPS_RTS]   = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_UC]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_RC]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_MIN_RNR_TIMER             |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE             |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE             |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_MIN_RNR_TIMER             |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_SMI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                       }
+               },
+               [IB_QPS_SQD]   = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                               [IB_QPT_UC]  = (IB_QP_AV                        |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PKEY_INDEX                |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_RC]  = (IB_QP_PORT                      |
+                                               IB_QP_AV                        |
+                                               IB_QP_TIMEOUT                   |
+                                               IB_QP_RETRY_CNT                 |
+                                               IB_QP_RNR_RETRY                 |
+                                               IB_QP_MAX_QP_RD_ATOMIC          |
+                                               IB_QP_MAX_DEST_RD_ATOMIC        |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PKEY_INDEX                |
+                                               IB_QP_MIN_RNR_TIMER             |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_XRC_INI] = (IB_QP_PORT                  |
+                                               IB_QP_AV                        |
+                                               IB_QP_TIMEOUT                   |
+                                               IB_QP_RETRY_CNT                 |
+                                               IB_QP_RNR_RETRY                 |
+                                               IB_QP_MAX_QP_RD_ATOMIC          |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PKEY_INDEX                |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_XRC_TGT] = (IB_QP_PORT                  |
+                                               IB_QP_AV                        |
+                                               IB_QP_TIMEOUT                   |
+                                               IB_QP_MAX_DEST_RD_ATOMIC        |
+                                               IB_QP_ALT_PATH                  |
+                                               IB_QP_ACCESS_FLAGS              |
+                                               IB_QP_PKEY_INDEX                |
+                                               IB_QP_MIN_RNR_TIMER             |
+                                               IB_QP_PATH_MIG_STATE),
+                               [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
+                                               IB_QP_QKEY),
+                       }
+               }
+       },
+       [IB_QPS_SQE]   = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 },
+               [IB_QPS_RTS]   = {
+                       .valid = 1,
+                       .opt_param = {
+                               [IB_QPT_UD]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_UC]  = (IB_QP_CUR_STATE                 |
+                                               IB_QP_ACCESS_FLAGS),
+                               [IB_QPT_SMI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                               [IB_QPT_GSI] = (IB_QP_CUR_STATE                 |
+                                               IB_QP_QKEY),
+                       }
+               }
+       },
+       [IB_QPS_ERR] = {
+               [IB_QPS_RESET] = { .valid = 1 },
+               [IB_QPS_ERR] =   { .valid = 1 }
+       }
+};
+
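+/*
+ * Validates a modify-QP request against the table above: the
+ * transition must be marked valid, every required bit must be present
+ * in the mask, and nothing outside req_param | opt_param | IB_QP_STATE
+ * may be set.  Returns 1 if the request is acceptable, 0 otherwise.
+ */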
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+                      enum ib_qp_type type, enum ib_qp_attr_mask mask,
+                      enum rdma_link_layer ll)
+{
+       enum ib_qp_attr_mask req_param, opt_param;
+
+       if (cur_state  < 0 || cur_state  > IB_QPS_ERR ||
+           next_state < 0 || next_state > IB_QPS_ERR)
+               return 0;
+
+       if (mask & IB_QP_CUR_STATE  &&
+           cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+           cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+               return 0;
+
+       if (!qp_state_table[cur_state][next_state].valid)
+               return 0;
+
+       req_param = qp_state_table[cur_state][next_state].req_param[type];
+       opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+       if (ll == IB_LINK_LAYER_ETHERNET) {
+               req_param |= qp_state_table[cur_state][next_state].
+                       req_param_add_eth[type];
+               opt_param |= qp_state_table[cur_state][next_state].
+                       opt_param_add_eth[type];
+       }
+
+       if ((mask & req_param) != req_param)
+               return 0;
+
+       if (mask & ~(req_param | opt_param | IB_QP_STATE))
+               return 0;
+
+       return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
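+/*
+ * Stubbed with BUG_ON(1) on Akaros (see README): the upstream body
+ * resolves RoCE L2 addresses via rdma_addr helpers that were not
+ * ported.  Note that ib_modify_qp() below calls this unconditionally,
+ * so what happens on this build depends on how the compat layer
+ * defines BUG_ON().
+ */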
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+                           struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+       int           ret = 0;
+#if 0  /* AKAROS */
+       union ib_gid  sgid;
+
+       if ((*qp_attr_mask & IB_QP_AV)  &&
+           (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
+               ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
+                                  qp_attr->ah_attr.grh.sgid_index, &sgid);
+               if (ret)
+                       goto out;
+               if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
+                       rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
+                       rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
+                       if (!(*qp_attr_mask & IB_QP_VID))
+                               qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+               } else {
+                       ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
+                                       qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
+                       if (ret)
+                               goto out;
+                       ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
+                       if (ret)
+                               goto out;
+               }
+               *qp_attr_mask |= IB_QP_SMAC;
+               if (qp_attr->vlan_id < 0xFFFF)
+                       *qp_attr_mask |= IB_QP_VID;
+       }
+out:
+#else  /* AKAROS */
+       BUG_ON(1);
+#endif /* AKAROS */
+       return ret;
+}
+EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+
+int ib_modify_qp(struct ib_qp *qp,
+                struct ib_qp_attr *qp_attr,
+                int qp_attr_mask)
+{
+       int ret;
+
+       ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+       if (ret)
+               return ret;
+
+       return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+               struct ib_qp_attr *qp_attr,
+               int qp_attr_mask,
+               struct ib_qp_init_attr *qp_init_attr)
+{
+       return qp->device->query_qp ?
+               qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
+               -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+       struct ib_qp *real_qp;
+       unsigned long flags;
+
+       real_qp = qp->real_qp;
+       if (real_qp == qp)
+               return -EINVAL;
+
+       spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+       list_del(&qp->open_list);
+       spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+       atomic_dec(&real_qp->usecnt);
+       kfree(qp);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+       struct ib_xrcd *xrcd;
+       struct ib_qp *real_qp;
+       int ret;
+
+       real_qp = qp->real_qp;
+       xrcd = real_qp->xrcd;
+
+       mutex_lock(&xrcd->tgt_qp_mutex);
+       ib_close_qp(qp);
+       if (atomic_read(&real_qp->usecnt) == 0)
+               list_del(&real_qp->xrcd_list);
+       else
+               real_qp = NULL;
+       mutex_unlock(&xrcd->tgt_qp_mutex);
+
+       if (real_qp) {
+               ret = ib_destroy_qp(real_qp);
+               if (!ret)
+                       atomic_dec(&xrcd->usecnt);
+               else
+                       __ib_insert_xrcd_qp(xrcd, real_qp);
+       }
+
+       return 0;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+       struct ib_pd *pd;
+       struct ib_cq *scq, *rcq;
+       struct ib_srq *srq;
+       int ret;
+
+       if (atomic_read(&qp->usecnt))
+               return -EBUSY;
+
+       if (qp->real_qp != qp)
+               return __ib_destroy_shared_qp(qp);
+
+       pd   = qp->pd;
+       scq  = qp->send_cq;
+       rcq  = qp->recv_cq;
+       srq  = qp->srq;
+
+       ret = qp->device->destroy_qp(qp);
+       if (!ret) {
+               if (pd)
+                       atomic_dec(&pd->usecnt);
+               if (scq)
+                       atomic_dec(&scq->usecnt);
+               if (rcq)
+                       atomic_dec(&rcq->usecnt);
+               if (srq)
+                       atomic_dec(&srq->usecnt);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+                          ib_comp_handler comp_handler,
+                          void (*event_handler)(struct ib_event *, void *),
+                          void *cq_context, int cqe, int comp_vector)
+{
+       struct ib_cq *cq;
+
+       cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+       if (!IS_ERR(cq)) {
+               cq->device        = device;
+               cq->uobject       = NULL;
+               cq->comp_handler  = comp_handler;
+               cq->event_handler = event_handler;
+               cq->cq_context    = cq_context;
+               atomic_set(&cq->usecnt, 0);
+       }
+
+       return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+       return cq->device->modify_cq ?
+               cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+       if (atomic_read(&cq->usecnt))
+               return -EBUSY;
+
+       return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+       return cq->device->resize_cq ?
+               cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
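+/*
+ * MR variants: ib_get_dma_mr() covers the device's whole DMA address
+ * space, ib_reg_phys_mr()/ib_rereg_phys_mr() take an explicit physical
+ * buffer list, and the fast-reg MR/page-list helpers further below
+ * support work-request-based registration.  All follow the same PD
+ * usecnt convention.
+ */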
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+       struct ib_mr *mr;
+       int err;
+
+       err = ib_check_mr_access(mr_access_flags);
+       if (err)
+               return ERR_PTR(err);
+
+       mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+       if (!IS_ERR(mr)) {
+               mr->device  = pd->device;
+               mr->pd      = pd;
+               mr->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_set(&mr->usecnt, 0);
+       }
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+                            struct ib_phys_buf *phys_buf_array,
+                            int num_phys_buf,
+                            int mr_access_flags,
+                            u64 *iova_start)
+{
+       struct ib_mr *mr;
+       int err;
+
+       err = ib_check_mr_access(mr_access_flags);
+       if (err)
+               return ERR_PTR(err);
+
+       if (!pd->device->reg_phys_mr)
+               return ERR_PTR(-ENOSYS);
+
+       mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+                                    mr_access_flags, iova_start);
+
+       if (!IS_ERR(mr)) {
+               mr->device  = pd->device;
+               mr->pd      = pd;
+               mr->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_set(&mr->usecnt, 0);
+       }
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+                    int mr_rereg_mask,
+                    struct ib_pd *pd,
+                    struct ib_phys_buf *phys_buf_array,
+                    int num_phys_buf,
+                    int mr_access_flags,
+                    u64 *iova_start)
+{
+       struct ib_pd *old_pd;
+       int ret;
+
+       ret = ib_check_mr_access(mr_access_flags);
+       if (ret)
+               return ret;
+
+       if (!mr->device->rereg_phys_mr)
+               return -ENOSYS;
+
+       if (atomic_read(&mr->usecnt))
+               return -EBUSY;
+
+       old_pd = mr->pd;
+
+       ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+                                       phys_buf_array, num_phys_buf,
+                                       mr_access_flags, iova_start);
+
+       if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+               atomic_dec(&old_pd->usecnt);
+               atomic_inc(&pd->usecnt);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+       return mr->device->query_mr ?
+               mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+       struct ib_pd *pd;
+       int ret;
+
+       if (atomic_read(&mr->usecnt))
+               return -EBUSY;
+
+       pd = mr->pd;
+       ret = mr->device->dereg_mr(mr);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+                          struct ib_mr_init_attr *mr_init_attr)
+{
+       struct ib_mr *mr;
+
+       if (!pd->device->create_mr)
+               return ERR_PTR(-ENOSYS);
+
+       mr = pd->device->create_mr(pd, mr_init_attr);
+
+       if (!IS_ERR(mr)) {
+               mr->device  = pd->device;
+               mr->pd      = pd;
+               mr->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_set(&mr->usecnt, 0);
+       }
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_create_mr);
+
+int ib_destroy_mr(struct ib_mr *mr)
+{
+       struct ib_pd *pd;
+       int ret;
+
+       if (atomic_read(&mr->usecnt))
+               return -EBUSY;
+
+       pd = mr->pd;
+       ret = mr->device->destroy_mr(mr);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_destroy_mr);
+
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+       struct ib_mr *mr;
+
+       if (!pd->device->alloc_fast_reg_mr)
+               return ERR_PTR(-ENOSYS);
+
+       mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+       if (!IS_ERR(mr)) {
+               mr->device  = pd->device;
+               mr->pd      = pd;
+               mr->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_set(&mr->usecnt, 0);
+       }
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+                                                         int max_page_list_len)
+{
+       struct ib_fast_reg_page_list *page_list;
+
+       if (!device->alloc_fast_reg_page_list)
+               return ERR_PTR(-ENOSYS);
+
+       page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+       if (!IS_ERR(page_list)) {
+               page_list->device = device;
+               page_list->max_page_list_len = max_page_list_len;
+       }
+
+       return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+       page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+       struct ib_mw *mw;
+
+       if (!pd->device->alloc_mw)
+               return ERR_PTR(-ENOSYS);
+
+       mw = pd->device->alloc_mw(pd, type);
+       if (!IS_ERR(mw)) {
+               mw->device  = pd->device;
+               mw->pd      = pd;
+               mw->uobject = NULL;
+               mw->type    = type;
+               atomic_inc(&pd->usecnt);
+       }
+
+       return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+       struct ib_pd *pd;
+       int ret;
+
+       pd = mw->pd;
+       ret = mw->device->dealloc_mw(mw);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
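+/*
+ * FMRs are the legacy fast-map interface; note ib_unmap_fmr() unmaps a
+ * whole list in one device call and assumes every FMR on the list
+ * belongs to the same device.
+ */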
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+                           int mr_access_flags,
+                           struct ib_fmr_attr *fmr_attr)
+{
+       struct ib_fmr *fmr;
+
+       if (!pd->device->alloc_fmr)
+               return ERR_PTR(-ENOSYS);
+
+       fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+       if (!IS_ERR(fmr)) {
+               fmr->device = pd->device;
+               fmr->pd     = pd;
+               atomic_inc(&pd->usecnt);
+       }
+
+       return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+       struct ib_fmr *fmr;
+
+       if (list_empty(fmr_list))
+               return 0;
+
+       fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+       return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+       struct ib_pd *pd;
+       int ret;
+
+       pd = fmr->pd;
+       ret = fmr->device->dealloc_fmr(fmr);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
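+/*
+ * Multicast attach/detach apply only to UD QPs and to multicast GIDs
+ * (raw[0] == 0xff checks for the IPv6-style multicast prefix).
+ */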
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+       int ret;
+
+       if (!qp->device->attach_mcast)
+               return -ENOSYS;
+       if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+               return -EINVAL;
+
+       ret = qp->device->attach_mcast(qp, gid, lid);
+       if (!ret)
+               atomic_inc(&qp->usecnt);
+       return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+       int ret;
+
+       if (!qp->device->detach_mcast)
+               return -ENOSYS;
+       if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+               return -EINVAL;
+
+       ret = qp->device->detach_mcast(qp, gid, lid);
+       if (!ret)
+               atomic_dec(&qp->usecnt);
+       return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+       struct ib_xrcd *xrcd;
+
+       if (!device->alloc_xrcd)
+               return ERR_PTR(-ENOSYS);
+
+       xrcd = device->alloc_xrcd(device, NULL, NULL);
+       if (!IS_ERR(xrcd)) {
+               xrcd->device = device;
+               xrcd->inode = NULL;
+               atomic_set(&xrcd->usecnt, 0);
+               mutex_init(&xrcd->tgt_qp_mutex);
+               INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+       }
+
+       return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+       struct ib_qp *qp;
+       int ret;
+
+       if (atomic_read(&xrcd->usecnt))
+               return -EBUSY;
+
+       while (!list_empty(&xrcd->tgt_qp_list)) {
+               qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+               ret = ib_destroy_qp(qp);
+               if (ret)
+                       return ret;
+       }
+
+       return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+                              struct ib_flow_attr *flow_attr,
+                              int domain)
+{
+       struct ib_flow *flow_id;
+
+       if (!qp->device->create_flow)
+               return ERR_PTR(-ENOSYS);
+
+       flow_id = qp->device->create_flow(qp, flow_attr, domain);
+       if (!IS_ERR(flow_id))
+               atomic_inc(&qp->usecnt);
+       return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+       int err;
+       struct ib_qp *qp = flow_id->qp;
+
+       err = qp->device->destroy_flow(flow_id);
+       if (!err)
+               atomic_dec(&qp->usecnt);
+       return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+                      struct ib_mr_status *mr_status)
+{
+       return mr->device->check_mr_status ?
+               mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_check_mr_status);