akaros/kern/drivers/dev/iommu.c
/* Copyright (c) 2019, 2020 Google, Inc.
 *
 * Driver for accessing Intel iommu
 *
 * Aditya Basu <mitthu@google.com>
 * Barret Rhoden <brho@cs.berkeley.edu>
 *
 * (1) proc->dev_qlock => (2) iommu->iommu_lock
 * (1) proc->dev_qlock => (2) pdev->qlock
 *
 * TODO
 * ====
 *  - In iommu_map_pci_devices() assign the correct iommu for scoped DRHD. Right
 *    now the default iommu is assigned to all devices.
 *  - IOMMU_DID_DEFAULT = 1; this means pid = 1 cannot have a device passthru
 *    because we use the pid as "did" or domain ID.
 *
 * lifecycle of CTE entries:
 * - at boot, every CTE (per pdev on an iommu) is set to non-translating.  In
 *   essence, an identity map.
 * - pci devices are initially assigned to the kernel.
 * - when devices are unassigned, their cte mapping is destroyed.
 * - when they are reassigned, their mapping is set to either an identity map
 *   (kernel) or a process's page table.
 *
 * - On the topic of disabling the IOMMU, we used to have an option to just
 *   unset it completely.  Disable TE, clear the root pointer.  Though the code
 *   we had was hokey and broken.  Even then, if we have a device behind an
 *   IOMMU and disable the IOMMU, that would just fuck everything up.  Maybe if
 *   we had identity mapped pages in the IPT, so that when translation turned
 *   off, the device would still work.  Seems like a mess.
 *
 * - We ought to do a domain-selective, context-cache invalidation whenever we
 *   reuse DIDs.  aka, whenever there is a new IPT for a pid, which is every 65k
 *   processes.  Or maybe every 16k, depending on how many pids we have.
 */
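
/* Illustrative sketch of the lock ordering above (not a verbatim call site in
 * this file): a caller unassigning a device from a process nests the qlocks
 * first, and the helpers grab iommu->iommu_lock internally when they flush:
 *
 *	qlock(&p->dev_qlock);
 *	qlock(&pdev->qlock);
 *	__iommu_device_unassign(pdev, p); // takes pdev->iommu->iommu_lock
 *	qunlock(&pdev->qlock);
 *	qunlock(&p->dev_qlock);
 *
 * iommu_unassign_all_devices() below follows this same order.
 */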

#include <stdio.h>
#include <error.h>
#include <common.h>
#include <net/ip.h>
#include <atomic.h>

#include <acpi.h>
#include <arch/intel-iommu.h>
#include <env.h>
#include <arch/pci.h>
#include <linux_compat.h>

#define IOMMU "iommu: "
#define BUFFERSZ 8192

struct dev iommudevtab;

static struct iommu_list_tq iommu_list = TAILQ_HEAD_INITIALIZER(iommu_list);
static bool iommu_is_supported;

/* QID Path */
enum {
	Qdir         = 0,
	Qmappings    = 1,
	Qadddev      = 2,
	Qremovedev   = 3,
	Qinfo        = 4,
};

static struct dirtab iommudir[] = {
	{".",                   {Qdir, 0, QTDIR}, 0, 0555},
	{"mappings",            {Qmappings, 0, QTFILE}, 0, 0755},
	{"attach",              {Qadddev, 0, QTFILE}, 0, 0755},
	{"detach",              {Qremovedev, 0, QTFILE}, 0, 0755},
	{"info",                {Qinfo, 0, QTFILE}, 0, 0755},
};
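
/* Example interaction from the shell, using the formats documented in
 * iommuread() below (the device and pid values are just placeholders):
 *
 *	$ echo 00:1f.2 13 > '#iommu/attach'	# assign 00:1f.2 to pid 13
 *	$ echo 00:1f.2 13 > '#iommu/detach'	# unassign it from pid 13
 *	$ cat '#iommu/mappings'			# show current assignments
 *	$ cat '#iommu/info'			# dump per-iommu capabilities
 */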

/* OK, we never actually use these, since we won't support any IOMMU that
 * requires RWBF (Required Write Buffer Flushing).
 *
 * On older hardware, if we updated data structures from software, the IOMMU
 * wouldn't necessarily see it.  The software write would get held up at various
 * write buffers.  See 6.8.
 *
 * Certain operations, such as ctx cache and iotlb flushes, were OK.  The HW
 * would implicitly do a write buffer flush.  Other operations, like changing an
 * IPT PTE, which do not necessarily require a command flush, would need the
 * WBF.
 *
 * This is different than caching mode (CM).  In CM, hardware (or more often a
 * virtual IOMMU) caches negative PTEs, and you need to poke the IOMMU whenever
 * changing any PTE.  This RWBF isn't about caching old values; it's about not
 * seeing new values due to buffering.
 *
 * Just about any time you want to do a CM operation, you'd also want to check
 * for RWBF.  Though note that we do not use the IOMMU if it requires either CM
 * or RWBF. */
static inline void write_buffer_flush(struct iommu *iommu)
{
	uint32_t cmd, status;

	if (!iommu->rwbf)
		return;

	cmd = read32(iommu->regio + DMAR_GCMD_REG) | DMA_GCMD_WBF;
	write32(cmd, iommu->regio + DMAR_GCMD_REG);

	do {
		status = read32(iommu->regio + DMAR_GSTS_REG);
	} while (status & DMA_GSTS_WBFS);
}

/* OK, read and write draining on flush.  At first I thought this was about
 * ops that queued up, but hadn't gone through the IOMMU yet.  Instead, this is
 * about ops that made it through the IOMMU, but have not made it to main
 * memory.  i.e., the IOMMU translated to a physical address, but the write to
 * that paddr hasn't made it to RAM.  The reason we ask for a TLB flush is
 * typically to make sure the PTE / translation is no longer in use.  Undrained
 * operations that made it past the IOMMU are still using the old translation.
 * Thus we should always read/write drain. */
static void __iotlb_flush_global(struct iommu *iommu)
{
	write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
		DMA_TLB_GLOBAL_FLUSH,
		iommu->regio + iommu->iotlb_cmd_offset);

	while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
		cpu_relax();
}

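/* Domain-selective IOTLB invalidation: flush only the translations tagged with
 * 'did' (for us, the owning process's pid, or IOMMU_DID_DEFAULT for the
 * kernel), with read/write draining as described above. */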
static void iotlb_flush(struct iommu *iommu, uint16_t did)
{
	write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
		DMA_TLB_DSI_FLUSH | DMA_TLB_DID(did),
		iommu->regio + iommu->iotlb_cmd_offset);

	while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
		cpu_relax();
}

static inline struct root_entry *get_root_entry(physaddr_t paddr)
{
	return (struct root_entry *) KADDR(paddr);
}

static inline struct context_entry *get_context_entry(physaddr_t paddr)
{
	return (struct context_entry *) KADDR(paddr);
}

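/* Translation-type values written into the low qword below: 0x2 selects
 * pass-through (DMA addresses are used untranslated), 0x0 selects second-level
 * translation through the page table whose address is in the low qword (see
 * the VT-d spec's context-entry format). */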
static void __cte_set_identity_pgtbl(struct context_entry *cte)
{
	cte->hi = 0
		| (IOMMU_DID_DEFAULT << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
		| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW

	cte->lo = 0 /* assumes page alignment */
		| (0x2 << CTX_LO_TRANS_SHIFT)
		| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
		| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}

static void __cte_set_proc_pgtbl(struct context_entry *cte, struct proc *p)
{
	/* TODO: need to limit PID to 16 bits or come up with an alternative */
	warn_on(p->pid & ~0xffff);

	cte->hi = 0
		| ((uint16_t)p->pid << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
		| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW

	/* The only difference here is PGDIR and the LO_TRANS_SHIFT */
	cte->lo = PTE_ADDR(p->env_pgdir.eptp)
		| (0x0 << CTX_LO_TRANS_SHIFT)
		| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
		| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}

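/* Allocate one context table: 256 entries, one per (device, function) pair on
 * a bus (32 devices x 8 functions), each initialized to the identity /
 * pass-through mapping above.  Returns the table's physical address. */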
static physaddr_t ct_init(void)
{
	struct context_entry *cte;
	physaddr_t ct;

	cte = (struct context_entry *) kpage_zalloc_addr();
	ct = PADDR(cte);

	for (int i = 0; i < 32 * 8; i++, cte++) // device * func
		__cte_set_identity_pgtbl(cte);

	return ct;
}

/* Get a new root_entry table.  Allocates all context entries. */
static physaddr_t rt_init(void)
{
	struct root_entry *rte;
	physaddr_t rt;
	physaddr_t ct;

	/* Page Align = 0x1000 */
	rte = (struct root_entry *) kpage_zalloc_addr();
	rt = PADDR(rte);

	/* create context table */
	for (int i = 0; i < 256; i++, rte++) {
		ct = ct_init();
		rte->hi = 0;
		rte->lo = 0
			| ct
			| (0x1 << RT_LO_PRESENT_SHIFT);
	}

	return rt;
}

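/* Walk the two-level table for pdev: the bus number indexes the root table,
 * and (dev * 8 + func) indexes the context table hanging off that root entry.
 */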
static struct context_entry *get_ctx_for(struct iommu *iommu,
					 struct pci_device *pdev)
{
	struct root_entry *rte;
	physaddr_t cte_phy;
	struct context_entry *cte;
	uint32_t offset = 0;

	rte = get_root_entry(iommu->roottable) + pdev->bus;

	cte_phy = rte->lo & 0xFFFFFFFFFFFFF000;
	cte = get_context_entry(cte_phy);

	offset = (pdev->dev * 8) + pdev->func;
	cte += offset;

	return cte;
}

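/* Tear down pdev's context entry: clear the present bit, then do a
 * domain-selective IOTLB flush for 'did' so stale translations are dropped. */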
static void __iommu_clear_pgtbl(struct pci_device *pdev, uint16_t did)
{
	struct iommu *iommu = pdev->iommu;
	struct context_entry *cte = get_ctx_for(iommu, pdev);

	cte->lo &= ~0x1;

	spin_lock_irqsave(&iommu->iommu_lock);
	iotlb_flush(iommu, did);
	spin_unlock_irqsave(&iommu->iommu_lock);
}

/* Hold the proc's dev_qlock.  This returns the linkage for p and i, and inserts
 * if it didn't exist. */
static struct iommu_proc_link *__get_linkage(struct proc *p, struct iommu *i)
{
	struct iommu_proc_link *l;

	list_for_each_entry(l, &p->iommus, link) {
		if (l->i == i)
			return l;
	}
	l = kmalloc(sizeof(struct iommu_proc_link), MEM_WAIT);
	l->i = i;
	l->p = p;
	l->nr_devices = 0;
	list_add_rcu(&l->link, &p->iommus);
	return l;
}

/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock.
 * Careful, this can throw. */
void __iommu_device_assign(struct pci_device *pdev, struct proc *proc)
{
	struct iommu *iommu = pdev->iommu;
	struct iommu_proc_link *l;

	if (!proc) {
		__cte_set_identity_pgtbl(get_ctx_for(pdev->iommu, pdev));
		return;
	}

	/* Lockless peek.  We hold the dev_qlock, so if we are concurrently
	 * dying, proc_destroy() will come behind us and undo this.  If
	 * proc_destroy() already removed all devices, we would see DYING. */
	if (proc_is_dying(proc))
		error(EINVAL, "process is dying");
	l = __get_linkage(proc, iommu);

	l->nr_devices++;
	TAILQ_INSERT_TAIL(&proc->pci_devs, pdev, proc_link);

	__cte_set_proc_pgtbl(get_ctx_for(pdev->iommu, pdev), proc);
}

/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock. */
void __iommu_device_unassign(struct pci_device *pdev, struct proc *proc)
{
	struct iommu *iommu = pdev->iommu;
	struct iommu_proc_link *l;

	assert(iommu == pdev->iommu);

	if (!proc) {
		__iommu_clear_pgtbl(pdev, IOMMU_DID_DEFAULT);
		return;
	}

	l = __get_linkage(proc, iommu);

	__iommu_clear_pgtbl(pdev, proc->pid);

	l->nr_devices--;
	if (!l->nr_devices) {
		list_del_rcu(&l->link);
		kfree_rcu(l, rcu);
	}

	TAILQ_REMOVE(&proc->pci_devs, pdev, proc_link);
}

void iommu_unassign_all_devices(struct proc *p)
{
	struct pci_device *pdev, *tp;

	qlock(&p->dev_qlock);
	/* If you want to get clever and try to batch up the iotlb flushes, it's
	 * probably not worth it.  The big concern is that the moment you unlock
	 * the pdev, it can be reassigned.  If you didn't flush the iotlb yet,
	 * it might have old entries.  Note that when we flush, we pass the DID
	 * (p->pid), which the next user of the pdev won't know.  I don't know
	 * if you need to flush the old DID entry or not before reusing a CTE,
	 * though probably. */
	TAILQ_FOREACH_SAFE(pdev, &p->pci_devs, proc_link, tp) {
		qlock(&pdev->qlock);
		pci_device_unassign_known(pdev, p);
		qunlock(&pdev->qlock);
	}
	qunlock(&p->dev_qlock);
}

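/* Domain-selective IOTLB flush of p's DID (its pid) on every IOMMU this proc
 * has devices on. */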
void proc_iotlb_flush(struct proc *p)
{
	struct iommu_proc_link *l;

	rcu_read_lock();
	list_for_each_entry_rcu(l, &p->iommus, link) {
		spin_lock_irqsave(&l->i->iommu_lock);
		iotlb_flush(l->i, p->pid);
		spin_unlock_irqsave(&l->i->iommu_lock);
	}
	rcu_read_unlock();
}

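/* Program the root table pointer and wait for hardware to latch it (the SRTP
 * command / RTPS status handshake). */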
static void __set_root_table(struct iommu *iommu, physaddr_t roottable)
{
	write64(roottable, iommu->regio + DMAR_RTADDR_REG);
	write32(DMA_GCMD_SRTP, iommu->regio + DMAR_GCMD_REG);
	/* Unlike the write-buffer-flush status and ICC completion check,
	 * hardware *sets* the bit to 1 when it is done */
	while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_RTPS))
		cpu_relax();
}

static void __inval_ctx_cache_global(struct iommu *iommu)
{
	write64(DMA_CCMD_ICC | DMA_CCMD_GLOBAL_INVL,
		iommu->regio + DMAR_CCMD_REG);
	while (read64(iommu->regio + DMAR_CCMD_REG) & DMA_CCMD_ICC)
		cpu_relax();
}

static void __enable_translation(struct iommu *iommu)
{
	/* see 10.4.4 for some concerns if we want to update multiple fields.
	 * (read status, mask the one-shot commands we don't want on, then set
	 * the ones we do want). */
	write32(DMA_GCMD_TE, iommu->regio + DMAR_GCMD_REG);
	while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_TES))
		cpu_relax();
}

/* Given an iommu with a root table, enable translation.  The default root table
 * (from rt_init()) is set up to not translate.  i.e. IOVA == PA. */
static void iommu_enable_translation(struct iommu *iommu)
{
	spin_lock_irqsave(&iommu->iommu_lock);
	__set_root_table(iommu, iommu->roottable);
	__inval_ctx_cache_global(iommu);
	__iotlb_flush_global(iommu);
	__enable_translation(iommu);
	spin_unlock_irqsave(&iommu->iommu_lock);
}

/* Iterate over all IOMMUs and make sure the "rba"s present in the DRHDs are
 * unique */
static bool iommu_asset_unique_regio(void)
{
	struct iommu *outer, *inner;
	uint64_t rba;
	bool result = true;

	TAILQ_FOREACH(outer, &iommu_list, iommu_link) {
		rba = outer->rba;

		TAILQ_FOREACH(inner, &iommu_list, iommu_link) {
			if (outer != inner && rba == inner->rba) {
				outer->supported = false;
				result = false;
			}
		}
	}

	return result;
}

static bool iommu_has_required_capabilities(struct iommu *iommu)
{
	uint64_t cap, ecap;
	bool support, result = true;

	cap = read64(iommu->regio + DMAR_CAP_REG);
	ecap = read64(iommu->regio + DMAR_ECAP_REG);

	support = (cap_sagaw(cap) & 0x4) >> 2;
	if (!support) {
		printk(IOMMU "%p: unsupported paging level: 0x%x\n",
			iommu, cap_sagaw(cap));
		result = false;
	}

	support = cap_super_page_val(cap) & 0x1;
	if (!support) {
		printk(IOMMU "%p: 1GB super pages not supported\n", iommu);
		result = false;
	}

	if (cap_rwbf(cap)) {
		printk(IOMMU "%p: HW requires RWBF, will abort\n", iommu);
		result = false;
	}

	if (cap_caching_mode(cap)) {
		printk(IOMMU "%p: HW requires caching_mode, will abort\n",
		       iommu);
		result = false;
	}

	support = ecap_pass_through(ecap);
	if (!support) {
		printk(IOMMU "%p: pass-through translation type in context entries not supported\n", iommu);
		result = false;
	}

	/* max gaw/haw reported by iommu.  It's fine if these differ.  Spec says
	 * MGAW must be at least the HAW.  It's OK to be more. */
	iommu->haw_cap = cap_mgaw(cap);
	if (iommu->haw_cap < iommu->haw_dmar) {
		printk(IOMMU "%p: HAW mismatch; DMAR reports %d, CAP reports %d, check CPUID\n",
			iommu, iommu->haw_dmar, iommu->haw_cap);
	}

	return result;
}

/* All or nothing */
static bool have_iommu_support(void)
{
	struct iommu *iommu;

	if (TAILQ_EMPTY(&iommu_list))
		return false;

	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		if (!iommu->supported)
			return false;
	}
	return true;
}

/* Run this function after all individual IOMMUs are initialized. */
void iommu_enable_all(void)
{
	struct iommu *iommu;
	static bool once = false;

	if (once)
		warn(IOMMU "Called twice, aborting!");
	once = true;

	if (!iommu_asset_unique_regio())
		warn(IOMMU "same register base addresses detected");

	iommu_is_supported = have_iommu_support();
	if (!iommu_is_supported) {
		printk("No supported IOMMUs detected\n");
		return;
	}

	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		printk("IOMMU: enabling translation on %p\n", iommu);
		iommu_enable_translation(iommu);
	}
}

static bool _iommu_is_enabled(struct iommu *iommu)
{
	uint32_t status = 0;

	/* Arguably we don't need the lock when reading. */
	spin_lock_irqsave(&iommu->iommu_lock);
	status = read32(iommu->regio + DMAR_GSTS_REG);
	spin_unlock_irqsave(&iommu->iommu_lock);

	return status & DMA_GSTS_TES;
}

static bool iommu_some_is_enabled(void)
{
	struct iommu *iommu;

	TAILQ_FOREACH(iommu, &iommu_list, iommu_link)
		if (_iommu_is_enabled(iommu))
			return true;

	return false;
}

/* grabs the iommu of the first DRHD with INCLUDE_PCI_ALL */
struct iommu *get_default_iommu(void)
{
	struct Dmar *dt;

	/* dmar is a global variable; see acpi.h */
	if (dmar == NULL)
		return NULL;

	dt = dmar->tbl;
	for (int i = 0; i < dmar->nchildren; i++) {
		struct Atable *at = dmar->children[i];
		struct Drhd *drhd = at->tbl;

		if (drhd->all & 1)
			return &drhd->iommu;
	}

	return NULL;
}

void iommu_map_pci_devices(void)
{
	struct pci_device *pci_iter;
	struct iommu *iommu = get_default_iommu();

	if (!iommu)
		return;

	/* set the default iommu */
	STAILQ_FOREACH(pci_iter, &pci_devices, all_dev) {
		pci_iter->iommu = iommu;
		TAILQ_INSERT_TAIL(&iommu->pci_devs, pci_iter, iommu_link);
	}
}

/* This is called from acpi.c to initialize an iommu. */
void iommu_acpi_init(struct iommu *iommu, uint8_t haw, uint64_t rba)
{
	uint64_t cap, ecap;

	TAILQ_INIT(&iommu->pci_devs);
	spinlock_init_irqsave(&iommu->iommu_lock);
	iommu->rba = rba;
	iommu->regio = (void __iomem *) vmap_pmem_nocache(rba, VTD_PAGE_SIZE);
	if (!iommu->regio)
		warn("Unable to map the iommu, aborting!");
	iommu->haw_dmar = haw;

	iommu->supported = iommu_has_required_capabilities(iommu);

	cap = read64(iommu->regio + DMAR_CAP_REG);
	ecap = read64(iommu->regio + DMAR_ECAP_REG);

	/* Creates a root table for non-translating identity maps, but it is not
	 * enabled / turned on yet. */
	iommu->roottable = rt_init();
	iommu->iotlb_cmd_offset = ecap_iotlb_offset(ecap) + 8;
	iommu->iotlb_addr_offset = ecap_iotlb_offset(ecap);

	iommu->rwbf = cap_rwbf(cap);
	iommu->device_iotlb = ecap_dev_iotlb_support(ecap);

	/* add the iommu to the list of all discovered iommus */
	TAILQ_INSERT_TAIL(&iommu_list, iommu, iommu_link);
}
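
/* Bring-up order, pieced together from the comments in this file (a sketch,
 * not a call graph): iommu_acpi_init() is called from acpi.c once per DRHD;
 * iommu_map_pci_devices() points every PCI device at the default
 * (INCLUDE_PCI_ALL) iommu; iommu_enable_all() then runs after all IOMMUs are
 * initialized to check support, set root tables, and turn on translation. */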

static void assign_device(int bus, int dev, int func, pid_t pid)
{
	ERRSTACK(1);
	int tbdf = MKBUS(BusPCI, bus, dev, func);
	struct pci_device *pdev = pci_match_tbdf(tbdf);
	struct proc *p;

	if (!pdev)
		error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
	if (!pid) {
		pci_device_assign(pdev, NULL);
		return;
	}
	if (pid == 1)
		error(EIO, "device passthru not supported for pid = 1");
	p = pid2proc(pid);
	if (!p)
		error(EIO, "cannot find pid %d\n", pid);
	if (waserror()) {
		proc_decref(p);
		nexterror();
	}
	pci_device_assign(pdev, p);
	proc_decref(p);
	poperror();
}

static void unassign_device(int bus, int dev, int func, pid_t pid)
{
	ERRSTACK(1);
	int tbdf = MKBUS(BusPCI, bus, dev, func);
	struct pci_device *pdev = pci_match_tbdf(tbdf);
	struct proc *p;

	if (!pdev)
		error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
	if (!pid) {
		pci_device_unassign(pdev, NULL);
		return;
	}
	p = pid2proc(pid);
	if (!p)
		error(EIO, "cannot find pid %d\n", pid);
	if (waserror()) {
		proc_decref(p);
		nexterror();
	}
	pci_device_unassign(pdev, p);
	proc_decref(p);
	poperror();
}

static struct sized_alloc *open_mappings(void)
{
	struct iommu *iommu;
	bool has_dev;
	struct pci_device *pdev;
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);

	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		has_dev = false;
		sza_printf(sza, "Mappings for iommu@%p\n", iommu);
		spin_lock_irqsave(&iommu->iommu_lock);
		TAILQ_FOREACH(pdev, &iommu->pci_devs, iommu_link) {
			if (!pdev->proc_owner)
				continue;
			has_dev = true;
			sza_printf(sza, "\tdevice %02x:%02x.%x, PID %u\n",
				   pdev->bus, pdev->dev, pdev->func,
				   pdev->proc_owner->pid);
		}
		spin_unlock_irqsave(&iommu->iommu_lock);
		if (!has_dev)
			sza_printf(sza, "\t<empty>\n");
	}

	return sza;
}

static void _open_info(struct iommu *iommu, struct sized_alloc *sza)
{
	uint64_t value;

	sza_printf(sza, "\niommu@%p\n", iommu);
	sza_printf(sza, "\trba = %p\n", iommu->rba);
	sza_printf(sza, "\tsupported = %s\n", iommu->supported ? "yes" : "no");
	sza_printf(sza, "\tregspace = %p\n", iommu->regio);
	sza_printf(sza, "\thost addr width (dmar) = %d\n", iommu->haw_dmar);
	sza_printf(sza, "\thost addr width (cap[mgaw]) = %d\n",
		iommu->haw_cap);
	value = read32(iommu->regio + DMAR_VER_REG);
	sza_printf(sza, "\tversion = 0x%x\n", value);

	value = read64(iommu->regio + DMAR_CAP_REG);
	sza_printf(sza, "\tcapabilities = %p\n", value);
	sza_printf(sza, "\t\tmgaw: %d\n", cap_mgaw(value));
	sza_printf(sza, "\t\tsagaw (paging level): 0x%x\n", cap_sagaw(value));
	sza_printf(sza, "\t\tcaching mode: %s (%d)\n", cap_caching_mode(value) ?
		"yes" : "no", cap_caching_mode(value));
	sza_printf(sza, "\t\tzlr: 0x%x\n", cap_zlr(value));
	sza_printf(sza, "\t\trwbf: %s\n", cap_rwbf(value) ? "required"
							  : "not required");
	sza_printf(sza, "\t\tnum domains: %d\n", cap_ndoms(value));
	sza_printf(sza, "\t\tsupports protected high-memory region: %s\n",
		cap_phmr(value) ? "yes" : "no");
	sza_printf(sza, "\t\tsupports protected low-memory region: %s\n",
		cap_plmr(value) ? "yes" : "no");

	value = read64(iommu->regio + DMAR_ECAP_REG);
	sza_printf(sza, "\text. capabilities = %p\n", value);
	sza_printf(sza, "\t\tpass through: %s\n",
		ecap_pass_through(value) ? "yes" : "no");
	sza_printf(sza, "\t\tdevice iotlb: %s\n",
		ecap_dev_iotlb_support(value) ? "yes" : "no");
	sza_printf(sza, "\t\tiotlb register offset: 0x%x\n",
		ecap_iotlb_offset(value));
	sza_printf(sza, "\t\tsnoop control: %s\n",
		ecap_sc_support(value) ? "yes" : "no");
	sza_printf(sza, "\t\tcoherency: %s\n",
		ecap_coherent(value) ? "yes" : "no");
	sza_printf(sza, "\t\tqueue invalidation support: %s\n",
		ecap_qis(value) ? "yes" : "no");
	sza_printf(sza, "\t\tinterrupt remapping support: %s\n",
		ecap_ir_support(value) ? "yes" : "no");
	sza_printf(sza, "\t\textended interrupt mode: 0x%x\n",
		ecap_eim_support(value));

	value = read32(iommu->regio + DMAR_GSTS_REG);
	sza_printf(sza, "\tglobal status = 0x%x\n", value);
	sza_printf(sza, "\t\ttranslation: %s\n",
		value & DMA_GSTS_TES ? "enabled" : "disabled");
	sza_printf(sza, "\t\troot table: %s\n",
		value & DMA_GSTS_RTPS ? "set" : "not set");

	value = read64(iommu->regio + DMAR_RTADDR_REG);
	sza_printf(sza, "\troot entry table = %p (phy) or %p (vir)\n",
			value, KADDR(value));
}

static struct sized_alloc *open_info(void)
{
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);
	uint64_t value;
	struct iommu *iommu;

	sza_printf(sza, "driver info:\n");

	value = IOMMU_DID_DEFAULT;
	sza_printf(sza, "\tdefault did = %d\n", value);
	sza_printf(sza, "\tstatus = %s\n",
		iommu_some_is_enabled() ? "enabled" : "disabled");

	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		_open_info(iommu, sza);
	}

	return sza;
}

static char *devname(void)
{
	return iommudevtab.name;
}

static struct chan *iommuattach(char *spec)
{
	return devattach(devname(), spec);
}

static struct walkqid *iommuwalk(struct chan *c, struct chan *nc, char **name,
			 unsigned int nname)
{
	return devwalk(c, nc, name, nname, iommudir,
		       ARRAY_SIZE(iommudir), devgen);
}

static size_t iommustat(struct chan *c, uint8_t *dp, size_t n)
{
	return devstat(c, dp, n, iommudir, ARRAY_SIZE(iommudir), devgen);
}

static struct chan *iommuopen(struct chan *c, int omode)
{
	switch (c->qid.path) {
	case Qmappings:
		c->synth_buf = open_mappings();
		break;
	case Qinfo:
		c->synth_buf = open_info();
		break;
	case Qadddev:
	case Qremovedev:
	case Qdir:
	default:
		break;
	}

	return devopen(c, omode, iommudir, ARRAY_SIZE(iommudir), devgen);
}

/*
 * All files are synthetic, so there is no backing store to flush on close; we
 * only free the buffer that iommuopen() may have allocated.
 */
static void iommuclose(struct chan *c)
{
	switch (c->qid.path) {
	case Qmappings:
	case Qinfo:
		kfree(c->synth_buf);
		c->synth_buf = NULL;
		break;
	case Qadddev:
	case Qremovedev:
	case Qdir:
	default:
		break;
	}
}

static size_t iommuread(struct chan *c, void *va, size_t n, off64_t offset)
{
	struct sized_alloc *sza = c->synth_buf;

	switch (c->qid.path) {
	case Qdir:
		return devdirread(c, va, n, iommudir,
				  ARRAY_SIZE(iommudir), devgen);
	case Qadddev:
		return readstr(offset, va, n,
		    "write format: xx:yy.z pid\n"
		    "   xx  = bus (in hex)\n"
		    "   yy  = device (in hex)\n"
		    "   z   = function (in hex)\n"
		    "   pid = process pid\n"
		    "\nexample:\n"
		    "$ echo 00:1f.2 13 >\\#iommu/attach\n");
	case Qremovedev:
		return readstr(offset, va, n,
		    "write format: xx:yy.z pid\n"
		    "   xx  = bus (in hex)\n"
		    "   yy  = device (in hex)\n"
		    "   z   = function (in hex)\n"
		    "   pid = process pid\n"
		    "\nexample:\n"
		    "$ echo 00:1f.2 13 >\\#iommu/detach\n");
	case Qmappings:
	case Qinfo:
		return readstr(offset, va, n, sza->buf);
	default:
		error(EIO, "read: qid %d is impossible", c->qid.path);
	}

	return -1; /* not reached */
}

static void get_bdf_pid(struct cmdbuf *cb, int *bus, int *dev, int *func,
			pid_t *pid)
{
	int err;

	if (cb->nf < 2)
		error(EFAIL, "bb:dd.f pid");

	err = sscanf(cb->f[0], "%x:%x.%x", bus, dev, func);
	if (err != 3)
		error(EIO,
		  IOMMU "error parsing bdf %s; nr parsed: %d", cb->f[0], err);

	*pid = strtoul(cb->f[1], 0, 0);
}
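
/* For example (matching the help text above), writing "00:1f.2 13" parses to
 * bus = 0x00, dev = 0x1f, func = 0x2, pid = 13. */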

static void write_add_dev(struct chan *c, struct cmdbuf *cb)
{
	int bus, dev, func;
	pid_t pid;

	get_bdf_pid(cb, &bus, &dev, &func, &pid);

	if (pid == 1)
		error(EIO, IOMMU "device passthru not supported for pid = 1");

	assign_device(bus, dev, func, pid);
}

static void write_remove_dev(struct chan *c, struct cmdbuf *cb)
{
	int bus, dev, func;
	pid_t pid;

	get_bdf_pid(cb, &bus, &dev, &func, &pid);

	unassign_device(bus, dev, func, pid);
}

static size_t iommuwrite(struct chan *c, void *va, size_t n, off64_t offset)
{
	ERRSTACK(1);
	struct cmdbuf *cb = parsecmd(va, n);

	if (waserror()) {
		kfree(cb);
		nexterror();
	}
	switch (c->qid.path) {
	case Qadddev:
		if (!iommu_is_supported)
			error(EROFS, IOMMU "not supported");
		write_add_dev(c, cb);
		break;
	case Qremovedev:
		if (!iommu_is_supported)
			error(EROFS, IOMMU "not supported");
		write_remove_dev(c, cb);
		break;
	case Qmappings:
	case Qinfo:
	case Qdir:
		error(EROFS, IOMMU "cannot modify");
	default:
		error(EIO, "write: qid %d is impossible", c->qid.path);
	}
	kfree(cb);
	poperror();
	return n;
}

struct dev iommudevtab __devtab = {
	.name       = "iommu",
	.reset      = devreset,
	.init       = devinit,
	.shutdown   = devshutdown,
	.attach     = iommuattach,
	.walk       = iommuwalk,
	.stat       = iommustat,
	.open       = iommuopen,
	.create     = devcreate,
	.close      = iommuclose,
	.read       = iommuread,
	.bread      = devbread,
	.write      = iommuwrite,
	.bwrite     = devbwrite,
	.remove     = devremove,
	.wstat      = devwstat,
};