akaros/kern/drivers/dev/cbdma.c
/* Copyright (c) 2019-2020 Google Inc
 * Aditya Basu <mitthu@google.com>
 * Barret Rhoden <brho@google.com>
 * See LICENSE for details.
 *
 * Useful resources:
 *   - Intel Xeon E7 2800/4800/8800 Datasheet Vol. 2
 *   - Purley Programmer's Guide
 *
 * Acronyms:
 *   - IOAT: (Intel) I/O Acceleration Technology
 *   - CBDMA: Crystal Beach DMA
 *
 * TODO
 * - Consider something lighter-weight than the qlock for ensuring the device
 * doesn't get detached during operation.  kref, perhaps.  There's also an
 * element of "stop new people from coming in", like we do with closing FDs.
 * There's also stuff that the dmaengine does in linux.  See dma_chan_get().
 * - Freeze or handle faults with VA->PA page mappings, till DMA is completed.
 * Right now, we could get iommu faults, which was the purpose of this whole
 * thing.
 *      - The dmaengine has helpers for some of this.  dma_set_unmap() is an
 *      "unmap all these things when you're done" approach, called by __cleanup
 *      -> dma_descriptor_unmap().  The unmap struct is basically a todo list.
 * - There's a lot of stuff we could do with the DMA engine to reduce the
 * amount of device touches, contention, and other inefficiencies.
 * issue_dma() is a minimalist one.  No batching, etc.  And with the pdev
 * qlock, we have only a single request per PCI device, though there may be
 * numerous channels.
 */

#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <net/ip.h>
#include <linux_compat.h>
#include <arch/pci.h>
#include <page_alloc.h>
#include <pmap.h>
#include <arch/pci_regs.h>

#include <linux/dmaengine.h>

/* QID Path */
enum {
        Qdir           = 0,
        Qcbdmaktest    = 1,
        Qcbdmaucopy    = 2,
};

static struct dirtab cbdmadir[] = {
        {".",         {Qdir, 0, QTDIR}, 0, 0555},
        {"ktest",     {Qcbdmaktest, 0, QTFILE}, 0, 0555},
        {"ucopy",     {Qcbdmaucopy, 0, QTFILE}, 0, 0755},
};
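
/* Rough usage, assuming the usual #device path syntax (see the handlers below
 * for the authoritative behavior):
 *   - Reading "ktest" (e.g. cat '#cbdma/ktest') runs an in-kernel copy test
 *     on 00:04.3 and returns a short report.
 *   - Writing a struct ucbdma to "ucopy" issues a DMA on a user-assigned
 *     device; see cbdmawrite() and the sketch below struct ucbdma.
 */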

/* TODO: this is a device/kernel ABI.  ucbdma.c has a copy.  It's probably not
 * worth putting in its own header, since this is really cheap test code. */
struct ucbdma {
        uint64_t                dst_addr;
        uint64_t                src_addr;
        uint32_t                xfer_size;
        char                    bdf_str[10];
} __attribute__((packed));
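
/* A minimal userspace sketch (hypothetical: dst_dma, src_dma, and len stand
 * for a destination, source, and size already usable by the device, e.g.
 * addresses the IOMMU maps for a user-owned device):
 *
 *      struct ucbdma u = {
 *              .dst_addr  = dst_dma,
 *              .src_addr  = src_dma,
 *              .xfer_size = len,
 *      };
 *      int fd;
 *
 *      snprintf(u.bdf_str, sizeof(u.bdf_str), "00:04.3");
 *      fd = open("#cbdma/ucopy", O_WRONLY);
 *      if (fd < 0 || write(fd, &u, sizeof(u)) != sizeof(u))
 *              perror("ucopy");
 *
 * cbdmawrite() copies the struct in, and issue_dma_ucbdma() looks up the PCI
 * device named by bdf_str and runs the DMA. */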

#define KTEST_SIZE 64
static struct {
        char    src[KTEST_SIZE];
        char    dst[KTEST_SIZE];
        char    srcfill;
        char    dstfill;
} ktest = {.srcfill = '0', .dstfill = 'X'};

static inline struct pci_device *dma_chan_to_pci_dev(struct dma_chan *dc)
{
        return container_of(dc->device->dev, struct pci_device, linux_dev);
}

/* Filter function for finding a particular PCI device.  If
 * __dma_request_channel() asks for a particular device, we'll only give it that
 * chan.  If you don't care, pass NULL, and you'll get any free chan. */
static bool filter_pci_dev(struct dma_chan *dc, void *arg)
{
        struct pci_device *pdev = dma_chan_to_pci_dev(dc);

        if (arg)
                return arg == pdev;
        return true;
}

/* Addresses are device-physical.  Caller holds the pdev qlock. */
static void issue_dma(struct pci_device *pdev, physaddr_t dst, physaddr_t src,
                      size_t len, bool async)
{
        ERRSTACK(1);
        struct dma_chan *dc;
        dma_cap_mask_t mask;
        struct dma_async_tx_descriptor *tx;
        int flags;

        struct completion cmp;
        unsigned long tmo;
        dma_cookie_t cookie;

        /* dmaengine_get() works for non-DMA_PRIVATE devices.  Many devices
         * turn on DMA_PRIVATE, in which case they won't be in the general
         * pool available to the dmaengine.  Instead, we request DMA channels
         * directly - particularly since we want to use specific devices with
         * the IOMMU. */

        dma_cap_zero(mask);
        dma_cap_set(DMA_MEMCPY, mask);
        dc = __dma_request_channel(&mask, filter_pci_dev, pdev);
        if (!dc)
                error(EFAIL, "Couldn't get a DMA channel");
        if (waserror()) {
                dma_release_channel(dc);
                nexterror();
        }

        flags = 0;
        if (async)
                flags |= DMA_PREP_INTERRUPT;

        if (!is_dma_copy_aligned(dc->device, dst, src, len))
                error(EINVAL, "Bad copy alignment: %p %p %lu", dst, src, len);

        tx = dmaengine_prep_dma_memcpy(dc, dst, src, len, flags);
        if (!tx)
                error(EFAIL, "Couldn't prep the memcpy!\n");

        if (async) {
                async_tx_ack(tx);
                init_completion(&cmp);
                tx->callback = (dma_async_tx_callback)complete;
                tx->callback_param = &cmp;
        }

        cookie = dmaengine_submit(tx);
        if (cookie < 0)
                error(EFAIL, "Failed to submit the DMA...");

        /* You can poke this.  dma_sync_wait() also calls this. */
        dma_async_issue_pending(dc);

        if (async) {
                /* Giant warning: the polling methods, like
                 * dmaengine_tx_status(), might actually trigger the
                 * tx->callback.  At least the IOAT driver does this. */
                tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
                if (tmo == 0 || dmaengine_tx_status(dc, cookie, NULL)
                                        != DMA_COMPLETE) {
                        error(ETIMEDOUT, "timeout or related spurious failure");
                }
        } else {
                dma_wait_for_async_tx(tx);
        }

        dma_release_channel(dc);
        poperror();
}

static void issue_dma_ucbdma(struct ucbdma *u)
{
        ERRSTACK(1);
        struct pci_device *pdev;

        pdev = pci_match_string(u->bdf_str);
        if (!pdev)
                error(ENODEV, "No device %s", u->bdf_str);
        /* The qlock prevents unassignment from happening during an operation.
         * If that happened, the driver's reset method would be called while the
         * op is ongoing.  The driver might be able to handle that.  Though when
         * the iommu mappings are destroyed, the driver is likely to get wedged.
         *
         * A kref or something else might work better here, to allow multiple
         * DMAs at a time. */
        qlock(&pdev->qlock);
        if (waserror()) {
                qunlock(&pdev->qlock);
                nexterror();
        }
        if (pdev->proc_owner != current)
                error(EINVAL, "wrong proc_owner");
        issue_dma(pdev, u->dst_addr, u->src_addr, u->xfer_size, true);
        qunlock(&pdev->qlock);
        poperror();
}

/* Runs a basic test from within the kernel on 0:4.3.
 *
 * One option would be to have write() set the sza buffer.  It won't be static
 * through the chan's lifetime (so you'd need to deal with syncing), but it'd
 * let you set things.  Another would be to have another chan/file for the BDF
 * (and you'd sync on that). */
static struct sized_alloc *open_ktest(void)
{
        ERRSTACK(2);
        struct pci_device *pdev = pci_match_tbdf(MKBUS(0, 0, 4, 3));
        struct sized_alloc *sza;
        physaddr_t dst, src;    /* device addrs */
        char *dst_d, *src_d;    /* driver addrs */
        uintptr_t prev;

        if (!pdev)
                error(EINVAL, "no 00:04.3");

        qlock(&pdev->qlock);
        /* We need to get into the address space of the device's owner, which
         * might be NULL if the device is the kernel's or unassigned. */
        prev = switch_to(pdev->proc_owner);
        if (waserror()) {
                switch_back(pdev->proc_owner, prev);
                qunlock(&pdev->qlock);
                nexterror();
        }

        if (pdev->state != DEV_STATE_ASSIGNED_KERNEL &&
            pdev->state != DEV_STATE_ASSIGNED_USER)
                error(EINVAL, "00:04.3 is unassigned (%d)", pdev->state);

        dst_d = dma_alloc_coherent(&pdev->linux_dev, KTEST_SIZE, &dst,
                                   MEM_WAIT);
        src_d = dma_alloc_coherent(&pdev->linux_dev, KTEST_SIZE, &src,
                                   MEM_WAIT);

        if (waserror()) {
                dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, dst_d, dst);
                dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, src_d, src);
                nexterror();
        }

        ktest.srcfill += 1;
        /* initialize the src and dst buffers */
        memset(src_d, ktest.srcfill, KTEST_SIZE);
        memset(dst_d, ktest.dstfill, KTEST_SIZE);
        src_d[KTEST_SIZE-1] = '\0';
        dst_d[KTEST_SIZE-1] = '\0';

        issue_dma(pdev, dst, src, KTEST_SIZE, true);

        sza = sized_kzmalloc(1024, MEM_WAIT);
        sza_printf(sza, "\tCopy Size: %d (0x%x)\n", KTEST_SIZE, KTEST_SIZE);
        sza_printf(sza, "\tsrcfill: %c (0x%x)\n", ktest.srcfill, ktest.srcfill);
        sza_printf(sza, "\tdstfill: %c (0x%x)\n", ktest.dstfill, ktest.dstfill);

        /* %s on a user pointer causes a printfmt warning, so print char by
         * char.  Stop at 20 for sanity. */
        sza_printf(sza, "\tsrc_str (after copy): ");
        for (int i = 0; i < 20; i++)
                sza_printf(sza, "%c", src_d[i]);
        sza_printf(sza, "\n");

        sza_printf(sza, "\tdst_str (after copy): ");
        for (int i = 0; i < 20; i++)
                sza_printf(sza, "%c", dst_d[i]);
        sza_printf(sza, "\n");

        dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, dst_d, dst);
        dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, src_d, src);
        poperror();

        switch_back(pdev->proc_owner, prev);
        qunlock(&pdev->qlock);
        poperror();

        return sza;
}

struct dev cbdmadevtab;

static char *devname(void)
{
        return cbdmadevtab.name;
}

static struct chan *cbdmaattach(char *spec)
{
        return devattach(devname(), spec);
}

struct walkqid *cbdmawalk(struct chan *c, struct chan *nc, char **name,
                         unsigned int nname)
{
        return devwalk(c, nc, name, nname, cbdmadir,
                       ARRAY_SIZE(cbdmadir), devgen);
}

static size_t cbdmastat(struct chan *c, uint8_t *dp, size_t n)
{
        return devstat(c, dp, n, cbdmadir, ARRAY_SIZE(cbdmadir), devgen);
}

static struct chan *cbdmaopen(struct chan *c, int omode)
{
        switch (c->qid.path) {
        case Qcbdmaktest:
                c->synth_buf = open_ktest();
                break;
        case Qdir:
        case Qcbdmaucopy:
                break;
        default:
                error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
        }

        return devopen(c, omode, cbdmadir, ARRAY_SIZE(cbdmadir), devgen);
}

static void cbdmaclose(struct chan *c)
{
        switch (c->qid.path) {
        case Qcbdmaktest:
                kfree(c->synth_buf);
                c->synth_buf = NULL;
                break;
        case Qdir:
        case Qcbdmaucopy:
                break;
        default:
                error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
        }
}

static size_t cbdmaread(struct chan *c, void *va, size_t n, off64_t offset)
{
        struct sized_alloc *sza = c->synth_buf;

        switch (c->qid.path) {
        case Qcbdmaktest:
                return readstr(offset, va, n, sza->buf);
        case Qcbdmaucopy:
                return readstr(offset, va, n,
                        "Write a struct ucbdma to issue a DMA\n");
        case Qdir:
                return devdirread(c, va, n, cbdmadir, ARRAY_SIZE(cbdmadir),
                                        devgen);
        default:
                error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
        }

        return -1;      /* not reached */
}

static size_t cbdmawrite(struct chan *c, void *va, size_t n, off64_t offset)
{
        struct ucbdma ucbdma[1];

        switch (c->qid.path) {
        case Qdir:
                error(EPERM, "writing not permitted");
        case Qcbdmaktest:
                error(EPERM, ERROR_FIXME);
        case Qcbdmaucopy:
                if (n != sizeof(struct ucbdma))
                        error(EINVAL, "Bad ucbdma size %lu (%lu)", n,
                              sizeof(struct ucbdma));
                if (copy_from_user(ucbdma, va, sizeof(struct ucbdma)))
                        error(EINVAL, "Bad ucbdma pointer");
                issue_dma_ucbdma(ucbdma);
                return n;
        default:
                error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
        }

        return -1;      /* not reached */
}

struct dev cbdmadevtab __devtab = {
        .name       = "cbdma",
        .reset      = devreset,
        .init       = devinit,
        .shutdown   = devshutdown,
        .attach     = cbdmaattach,
        .walk       = cbdmawalk,
        .stat       = cbdmastat,
        .open       = cbdmaopen,
        .create     = devcreate,
        .close      = cbdmaclose,
        .read       = cbdmaread,
        .bread      = devbread,
        .write      = cbdmawrite,
        .bwrite     = devbwrite,
        .remove     = devremove,
        .wstat      = devwstat,
};