PCI helper find_cap
[akaros.git] / kern / arch / x86 / ioapic.c
index a71b1e2..a4f8f58 100644 (file)
 #include <smp.h>
 #include <ip.h>
 #include <arch/io.h>
+#include <acpi.h>
+#include <trap.h>
 
+/* Rbus chains, one for each device bus: each rbus matches a device to an rdt */
 struct Rbus {
-       struct Rbus     *next;
-       int     devno;
-       struct Rdt      *rdt;
+       struct Rbus *next;
+       int devno;
+       struct Rdt *rdt;
 };
 
+/* Each rdt describes an ioapic input pin (intin, from the bus/device) */
 struct Rdt {
-       struct apic     *apic;
-       int     intin;
-       uint32_t        lo;
+       struct apic *apic;
+       int intin;
+       uint32_t lo;                            /* matches the lo in the intin, incl Im */
+       uint32_t hi;                            /* matches the hi in the intin, incl routing */
 
-       int     ref;                            /* could map to multiple busses */
-       int     enabled;                                /* times enabled */
+       int ref;                                        /* could map to multiple busses */
+       int enabled;                            /* times enabled */
 };
 
-enum {                                         /* IOAPIC registers */
-       Ioregsel        = 0x00,                 /* indirect register address */
-       Iowin           = 0x04,                 /* indirect register data */
-       Ioipa           = 0x08,                 /* IRQ Pin Assertion */
-       Ioeoi           = 0x10,                 /* EOI */
-
-       Ioapicid        = 0x00,                 /* Identification */
-       Ioapicver       = 0x01,                 /* Version */
-       Ioapicarb       = 0x02,                 /* Arbitration */
-       Ioabcfg         = 0x03,                 /* Boot Coniguration */
-       Ioredtbl        = 0x10,                 /* Redirection Table */
+/* Ioregsel and Iowin are offsets added to apic->addr to reach the IOAPIC's
+ * index/data register pair.  The Ioapic* and Ioredtbl values are register
+ * selectors: they are written to Ioregsel, and the selected register is then
+ * read or written through Iowin.
+ *
+ * NOTE(review): Iowin and Ioeoi share the value 0x10.  Presumably Ioregsel and
+ * Iowin are byte offsets while Ioipa and Ioeoi are still expressed in 32-bit
+ * words (byte offsets would be 0x20 and 0x40) - confirm before using them. */
+enum {                                                 /* IOAPIC registers */
+       Ioregsel = 0x00,                        /* indirect register address */
+       Iowin = 0x10,   /* indirect register data */
+       Ioipa = 0x08,   /* IRQ Pin Assertion */
+       Ioeoi = 0x10,   /* EOI */
+
+       Ioapicid = 0x00,        /* Identification */
+       Ioapicver = 0x01,       /* Version */
+       Ioapicarb = 0x02,       /* Arbitration */
+       Ioabcfg = 0x03, /* Boot Configuration */
+       Ioredtbl = 0x10,        /* Redirection Table */
 };
 
 static struct Rdt rdtarray[Nrdt];
 static int nrdtarray;
-static struct Rbus* rdtbus[Nbus];
-static struct Rdt* rdtvecno[IdtMAX+1];
+static struct Rbus *rdtbus[Nbus];
+/* reverse mapping of IDT vector to the RDT/IOAPIC entry triggering vector */
+static struct Rdt *rdtvecno[IdtMAX + 1];
 
 static spinlock_t idtnolock;
 static int idtno = IdtIOAPIC;
 
-struct apic    xioapic[Napic];
+struct apic xioapic[Napic];
+
+/* Returns TRUE if at least one entry in xioapic[] is marked useable.  Not
+ * foolproof: it reports FALSE if called before any IOAPIC has been marked
+ * useable (e.g. before the tables were parsed). */
+static bool ioapic_exists(void)
+{
+       struct apic *ioapic;
+
+       for (ioapic = xioapic; ioapic < &xioapic[Napic]; ioapic++) {
+               if (ioapic->useable)
+                       return TRUE;
+       }
+       return FALSE;
+}
 
-static void
-rtblget(struct apic* apic, int sel, uint32_t* hi, uint32_t* lo)
+static void rtblget(struct apic *apic, int sel, uint32_t * hi, uint32_t * lo)
 {
-       sel = Ioredtbl + 2*sel;
+       sel = Ioredtbl + 2 * sel;
 
-       *(apic->addr+Ioregsel) = sel+1;
-       *hi = *(apic->addr+Iowin);
-       *(apic->addr+Ioregsel) = sel;
-       *lo = *(apic->addr+Iowin);
+       write_mmreg32(apic->addr + Ioregsel, sel + 1);
+       *hi = read_mmreg32(apic->addr + Iowin);
+       write_mmreg32(apic->addr + Ioregsel, sel);
+       *lo = read_mmreg32(apic->addr + Iowin);
 }
 
-static void
-rtblput(struct apic* apic, int sel, uint32_t hi, uint32_t lo)
+static void rtblput(struct apic *apic, int sel, uint32_t hi, uint32_t lo)
 {
-       sel = Ioredtbl + 2*sel;
+       sel = Ioredtbl + 2 * sel;
 
-       *(apic->addr+Ioregsel) = sel+1;
-       *(apic->addr+Iowin) = hi;
-       *(apic->addr+Ioregsel) = sel;
-       *(apic->addr+Iowin) = lo;
+       write_mmreg32(apic->addr + Ioregsel, sel + 1);
+       write_mmreg32(apic->addr + Iowin, hi);
+       write_mmreg32(apic->addr + Ioregsel, sel);
+       write_mmreg32(apic->addr + Iowin, lo);
 }
 
-struct Rdt*
-rdtlookup(struct apic *apic, int intin)
+struct Rdt *rdtlookup(struct apic *apic, int intin)
 {
        int i;
        struct Rdt *r;
 
-       for(i = 0; i < nrdtarray; i++){
+       for (i = 0; i < nrdtarray; i++) {
                r = rdtarray + i;
-               if(apic == r->apic && intin == r->intin)
+               if (apic == r->apic && intin == r->intin)
                        return r;
        }
        return NULL;
 }
 
-void
-ioapicintrinit(int busno, int apicno, int intin, int devno, uint32_t lo)
+/* Looks up the RDT that device devno on bus busno was wired to.
+ * - busno indexes rdtbus[], which holds one Rbus chain per bus
+ * - devno is compared against the devno stored in each Rbus on that chain
+ * Returns the matching RDT, or NULL if busno is out of range or no Rbus on
+ * that bus matches devno. */
+struct Rdt *rbus_get_rdt(int busno, int devno)
+{
+       struct Rbus *rbus;
+
+       /* rdtbus[] only has Nbus chains.  Callers may derive busno straight
+        * from a tbdf, so never index outside the array. */
+       if (busno < 0 || busno >= Nbus)
+               return NULL;
+       for (rbus = rdtbus[busno]; rbus != NULL; rbus = rbus->next) {
+               if (rbus->devno == devno)
+                       return rbus->rdt;
+       }
+       return NULL;
+}
+
+/* builds RDT and Rbus entries, given the wiring of bus:dev to ioapicno:intin.
+ * - busno is the source bus
+ * - devno is the device number in the style of a PCI Interrupt Assignment
+ * Entry.  Which is the irq << 2 (check MP spec D.3).
+ * - ioapic is the ioapic the device is connected to
+ * - intin is the INTIN pin on the ioapic
+ * - lo is the lower part of the IOAPIC apic-message, which has the polarity and
+ * trigger mode flags. */
+void ioapicintrinit(int busno, int ioapicno, int intin, int devno, int lo)
 {
        struct Rbus *rbus;
        struct Rdt *rdt;
-       struct apic *apic;
+       struct apic *ioapic;
 
-       if(busno >= Nbus || apicno >= Napic || nrdtarray >= Nrdt)
+       if (busno >= Nbus || ioapicno >= Napic || nrdtarray >= Nrdt) {
+               printk("Bad bus %d ioapic %d or nrdtarray %d too big\n", busno,
+                      ioapicno, nrdtarray);
                return;
-       apic = &xioapic[apicno];
-       if(!apic->useable || intin >= apic->nrdt)
+       }
+       ioapic = &xioapic[ioapicno];
+       if (!ioapic->useable || intin >= ioapic->nrdt) {
+               printk("IOAPIC unusable (%d) or not enough nrdt (%d) for %d\n",
+                      ioapic->useable, ioapic->nrdt, intin);
                return;
+       }
 
-       rdt = rdtlookup(apic, intin);
-       if(rdt == NULL){
+       rdt = rdtlookup(ioapic, intin);
+       if (rdt == NULL) {
                rdt = &rdtarray[nrdtarray++];
-               rdt->apic = apic;
+               rdt->apic = ioapic;
                rdt->intin = intin;
                rdt->lo = lo;
-       }else{
-               if(lo != rdt->lo){
-                       printd("mutiple irq botch bus %d %d/%d/%d lo %d vs %d\n",
-                               busno, apicno, intin, devno, lo, rdt->lo);
+               rdt->hi = 0;
+       } else {
+               /* Polarity/trigger check.  Stored lo also has the vector in 0xff */
+               if (lo != (rdt->lo & ~0xff)) {
+                       printk("multiple irq botch bus %d %d/%d/%d lo %d vs %d\n",
+                                  busno, ioapicno, intin, devno, lo, rdt->lo);
                        return;
                }
-               printk("dup rdt %d %d %d %d %.8ux\n", busno, apicno, intin, devno, lo);
        }
+       /* TODO: this shit is racy.  (refcnt, linked list addition) */
        rdt->ref++;
        rbus = kzmalloc(sizeof *rbus, 0);
        rbus->rdt = rdt;
@@ -131,21 +168,113 @@ ioapicintrinit(int busno, int apicno, int intin, int devno, uint32_t lo)
        rdtbus[busno] = rbus;
 }
 
-void
-ioapicinit(int id, int ibase, uintptr_t_t pa)
+/* Translation tables used by acpi_make_rdt(): the index is a flag field taken
+ * from an interrupt override entry, the value is the matching polarity (IP*)
+ * or trigger mode (TM*) bit for the RDT's lo word.  Entries of -1 mark
+ * encodings that acpi_make_rdt() rejects. */
+static int map_polarity[4] = {
+       -1, IPhigh, -1, IPlow
+};
+
+static int map_edge_level[4] = {
+       -1, TMedge, -1, TMlevel
+};
+
+/* Finds the IOAPIC responsible for a global irq.  With acpi, the ioapics map a
+ * global interrupt space, and each one covers the window
+ * [ibase, ibase + nrdt) of it.  Returns the index into xioapic[] of the
+ * IOAPIC whose window contains irq, or -1 if there is none. */
+static int acpi_irq2ioapic(int irq)
+{
+       for (int idx = 0; idx < Napic; idx++) {
+               struct apic *ioapic = &xioapic[idx];
+
+               /* skip unusable entries; the addr check is just for sanity */
+               if (!ioapic->useable || !ioapic->addr)
+                       continue;
+               if ((irq >= ioapic->ibase) && (irq < ioapic->ibase + ioapic->nrdt))
+                       return idx;
+       }
+       return -1;
+}
+
+/* Build an RDT route, like we would have had from the MP tables had they been
+ * parsed, via ACPI.
+ *
+ * - tbdf: only its bus type (BUSTYPE()) is examined here
+ * - irq: matched against the irq of the ASintovr entries on the apics->st list
+ * - busno, devno: passed through to ioapicintrinit() to key the new route
+ *
+ * Returns -1 if no route could be determined, and 0 once ioapicintrinit() has
+ * been called.  ioapicintrinit() returns void, so 0 does not by itself
+ * guarantee that a route was installed.
+ *
+ * This only really deals with the ISA IRQs and maybe PCI ones that happen to
+ * have an override.  FWIW, on qemu the PCI NIC shows up as an ACPI intovr.
+ *
+ * From Brendan http://f.osdev.org/viewtopic.php?f=1&t=25951:
+ *
+ *             Before parsing the MADT you should begin by assuming that redirection
+ *             entries 0 to 15 are used for ISA IRQs 0 to 15. The MADT's "Interrupt
+ *             Source Override Structures" will tell you when this initial/default
+ *             assumption is wrong. For example, the MADT might tell you that ISA IRQ 9
+ *             is connected to IO APIC 44 and is level triggered; and (in this case)
+ *             it'd be silly to assume that ISA IRQ 9 is also connected to IO APIC
+ *             input 9 just because IO APIC input 9 is not listed.
+ *
+ *             For PCI IRQs, the MADT tells you nothing and you can't assume anything
+ *             at all. Sadly, you have to interpret the ACPI AML to determine how PCI
+ *             IRQs are connected to IO APIC inputs (or find some other work-around;
+ *             like implementing a motherboard driver for each different motherboard,
+ *             or some complex auto-detection scheme, or just configure PCI devices to
+ *             use MSI instead). */
+static int acpi_make_rdt(int tbdf, int irq, int busno, int devno)
+{
+       struct Apicst *st;
+       uint32_t lo;
+       int pol, edge_level, ioapic_nr, gsi_irq;
+
+       /* Look for an interrupt override entry naming this irq.  st is NULL
+        * after the loop if there is none.
+        * NOTE(review): apics is dereferenced unconditionally - confirm it is
+        * always non-NULL by the time this can be reached. */
+       for (st = apics->st; st != NULL; st = st->next) {
+               if (st->type == ASintovr) {
+                       if (st->intovr.irq == irq)
+                               break;
+               }
+       }
+       if (st) {
+               /* override found: translate its polarity and trigger flags into
+                * the bits for the RDT's lo word, rejecting unmapped encodings */
+               pol = map_polarity[st->intovr.flags & AFpmask];
+               if (pol < 0) {
+                       printk("ACPI override had bad polarity\n");
+                       return -1;
+               }
+               /* NOTE(review): presumably AFlevel is a two-bit mask at bits 2-3,
+                * so the shifted value fits the 4-entry table - confirm */
+               edge_level = map_edge_level[(st->intovr.flags & AFlevel) >> 2];
+               if (edge_level < 0) {
+                       printk("ACPI override had bad edge/level\n");
+                       return -1;
+               }
+               lo = pol | edge_level;
+               /* the override supplies the global irq to use in place of irq */
+               gsi_irq = st->intovr.intr;
+       } else {
+               if (BUSTYPE(tbdf) == BusISA) {
+                       /* no override: ISA irqs default to the identity mapping,
+                        * active high and edge triggered */
+                       lo = IPhigh | TMedge;
+                       gsi_irq = irq;
+               } else {
+                       /* Need to query ACPI at some point to handle this */
+                       printk("Non-ISA IRQ %d not found in MADT, aborting\n", irq);
+                       return -1;
+               }
+       }
+       ioapic_nr = acpi_irq2ioapic(gsi_irq);
+       if (ioapic_nr < 0) {
+               printk("Could not find an IOAPIC for global irq %d!\n", gsi_irq);
+               return -1;
+       }
+       /* intin is the global irq's offset within the owning IOAPIC's window */
+       ioapicintrinit(busno, ioapic_nr, gsi_irq - xioapic[ioapic_nr].ibase,
+                      devno, lo);
+       return 0;
+}
+
+void ioapicinit(int id, int ibase, uintptr_t pa)
 {
        struct apic *apic;
        static int base;
 
+       assert((IOAPIC_PBASE <= pa) && (pa + PGSIZE <= IOAPIC_PBASE + APIC_SIZE));
        /*
         * Mark the IOAPIC useable if it has a good ID
         * and the registers can be mapped.
         */
-       if(id >= Napic)
+       if (id >= Napic)
                return;
 
        apic = &xioapic[id];
-       if(apic->useable || (apic->addr = vmap(pa, 1024)) == NULL)
+       apic->addr = IOAPIC_BASE + (pa - IOAPIC_PBASE);
+       if (apic->useable)
                return;
        apic->useable = 1;
        apic->paddr = pa;
@@ -156,21 +285,24 @@ ioapicinit(int id, int ibase, uintptr_t_t pa)
         * responsibility of the O/S to set the APIC ID.
         */
        spin_lock(&apic->lock);
-       *(apic->addr+Ioregsel) = Ioapicver;
-       apic->nrdt = ((*(apic->addr+Iowin)>>16) & 0xff) + 1;
-       if(ibase != -1)
+       write_mmreg32(apic->addr + Ioregsel, Ioapicver);
+       apic->nrdt = ((read_mmreg32(apic->addr + Iowin) >> 16) & 0xff) + 1;
+       /* the ibase is the global system interrupt base, told to us by ACPI.  if
+        * it's -1, we're called from mpparse, and just guess/make up our own
+        * assignments. */
+       if (ibase != -1)
                apic->ibase = ibase;
-       else{
+       else {
                apic->ibase = base;
                base += apic->nrdt;
        }
-       *(apic->addr+Ioregsel) = Ioapicid;
-       *(apic->addr+Iowin) = id<<24;
+       write_mmreg32(apic->addr + Ioregsel, Ioapicid);
+       write_mmreg32(apic->addr + Iowin, id << 24);
        spin_unlock(&apic->lock);
+       printk("IOAPIC initialized at %p\n", apic->addr);
 }
 
-void
-ioapicdump(void)
+char *ioapicdump(char *start, char *end)
 {
        int i, n;
        struct Rbus *rbus;
@@ -178,261 +310,321 @@ ioapicdump(void)
        struct apic *apic;
        uint32_t hi, lo;
 
-       if(!2)
-               return;
-       for(i = 0; i < Napic; i++){
+       if (!2)
+               return start;
+       for (i = 0; i < Napic; i++) {
                apic = &xioapic[i];
-               if(!apic->useable || apic->addr == 0)
+               if (!apic->useable || apic->addr == 0)
                        continue;
-               printd("ioapic %d addr %#p nrdt %d ibase %d\n",
-                       i, apic->addr, apic->nrdt, apic->ibase);
-               for(n = 0; n < apic->nrdt; n++){
+               start = seprintf(start, end, "ioapic %d addr %p nrdt %d ibase %d\n",
+                                                i, apic->addr, apic->nrdt, apic->ibase);
+               for (n = 0; n < apic->nrdt; n++) {
                        spin_lock(&apic->lock);
                        rtblget(apic, n, &hi, &lo);
                        spin_unlock(&apic->lock);
-                       printd(" rdt %2.2d %#8.8ux %#8.8ux\n", n, hi, lo);
+                       start = seprintf(start, end, " rdt %2.2d %p %p\n", n, hi, lo);
                }
        }
-       for(i = 0; i < Nbus; i++){
-               if((rbus = rdtbus[i]) == NULL)
+       for (i = 0; i < Nbus; i++) {
+               if ((rbus = rdtbus[i]) == NULL)
                        continue;
-               printd("iointr bus %d:\n", i);
-               for(; rbus != NULL; rbus = rbus->next){
+               start = seprintf(start, end, "iointr bus %d:\n", i);
+               for (; rbus != NULL; rbus = rbus->next) {
                        rdt = rbus->rdt;
-                       printd(" apic %ld devno %#ux (%d %d) intin %d lo %#ux ref %d\n",
-                               rdt->apic-xioapic, rbus->devno, rbus->devno>>2,
-                               rbus->devno & 0x03, rdt->intin, rdt->lo, rdt->ref);
+                       start = seprintf(start, end,
+                                                        " apic %ld devno %p(%d %d) intin %d hi %p lo %p\n",
+                                                        rdt->apic - xioapic, rbus->devno, rbus->devno >> 2,
+                                                        rbus->devno & 0x03, rdt->intin, rdt->hi, rdt->lo);
                }
        }
+       return start;
 }
 
-void
-ioapiconline(void)
+/* Zeros and masks every redirect entry in every IOAPIC */
+void ioapiconline(void)
 {
        int i;
        struct apic *apic;
 
-       for(apic = xioapic; apic < &xioapic[Napic]; apic++){
-               if(!apic->useable || apic->addr == NULL)
+       for (apic = xioapic; apic < &xioapic[Napic]; apic++) {
+               if (!apic->useable || !apic->addr)
                        continue;
-               for(i = 0; i < apic->nrdt; i++){
+               for (i = 0; i < apic->nrdt; i++) {
                        spin_lock(&apic->lock);
                        rtblput(apic, i, 0, Im);
                        spin_unlock(&apic->lock);
                }
        }
-       ioapicdump();
 }
 
-static int dfpolicy = 0;
-
-static void
-ioapicintrdd(uint32_t* hi, uint32_t* lo)
+int nextvec(void)
 {
-       int i;
-       static int df;
-       static spinlock_t dflock;
+       unsigned int vecno;
 
-       /*
-        * Set delivery mode (lo) and destination field (hi),
-        * according to interrupt routing policy.
-        */
-       /*
-        * The bulk of this code was written ~1995, when there was
-        * one architecture and one generation of hardware, the number
-        * of CPUs was up to 4(8) and the choices for interrupt routing
-        * were physical, or flat logical (optionally with lowest
-        * priority interrupt). Logical mode hasn't scaled well with
-        * the increasing number of packages/cores/threads, so the
-        * fall-back is to physical mode, which works across all processor
-        * generations, both AMD and Intel, using the APIC and xAPIC.
-        *
-        * Interrupt routing policy can be set here.
-        */
-       switch(dfpolicy){
-       default:                                /* noise core 0 */
-               *hi = sys->machptr[0]->apicno<<24;
-               break;
-       case 1:                                 /* round-robin */
-               /*
-                * Assign each interrupt to a different CPU on a round-robin
-                * Some idea of the packages/cores/thread topology would be
-                * useful here, e.g. to not assign interrupts to more than one
-                * thread in a core. But, as usual, Intel make that an onerous
-                * task.
-                */
-               spin_lock(&(&dflock)->lock);
-               for(;;){
-                       i = df++;
-                       if(df >= sys->nmach+1)
-                               df = 0;
-                       if(sys->machptr[i] == NULL || !sys->machptr[i]->online)
-                               continue;
-                       i = sys->machptr[i]->apicno;
-                       if(xlapic[i].useable && xlapic[i].addr == 0)
-                               break;
-               }
-               spin_unlock(&(&dflock)->lock);
+       /* TODO: half-way decent integer service (vmem) */
+       spin_lock(&idtnolock);
+       vecno = idtno;
+       idtno = (idtno + 1) % IdtMAX;
+       if (idtno < IdtIOAPIC)
+               idtno += IdtIOAPIC;
+       spin_unlock(&idtnolock);
 
-               *hi = i<<24;
-               break;
-       }
-       *lo |= Pm|MTf;
+       return vecno;
 }
 
-int
-nextvec(void)
+static void msi_mask_irq(struct irq_handler *irq_h, int apic_vector)
 {
-       unsigned int vecno;
+       pci_msi_mask(irq_h->dev_private);
+}
 
-       spin_lock(&(&idtnolock)->lock);
-       vecno = idtno;
-       idtno = (idtno+8) % IdtMAX;
-       if(idtno < IdtIOAPIC)
-               idtno += IdtIOAPIC;
-       spin_unlock(&(&idtnolock)->lock);
+static void msi_unmask_irq(struct irq_handler *irq_h, int apic_vector)
+{
+       pci_msi_unmask(irq_h->dev_private);
+}
 
-       return vecno;
+static void msi_route_irq(struct irq_handler *irq_h, int apic_vector, int dest)
+{
+       pci_msi_route(irq_h->dev_private, dest);
 }
 
-static int
-msimask(Vkey *v, int mask)
+static void msix_mask_irq(struct irq_handler *irq_h, int apic_vector)
 {
-       Pcidev *p;
+       pci_msix_mask_vector(irq_h->dev_private);
+}
 
-       p = pcimatchtbdf(v->tbdf);
-       if(p == NULL)
-               return -1;
-       return pcimsimask(p, mask);
+static void msix_unmask_irq(struct irq_handler *irq_h, int apic_vector)
+{
+       pci_msix_unmask_vector(irq_h->dev_private);
+}
+
+static void msix_route_irq(struct irq_handler *irq_h, int apic_vector, int dest)
+{
+       pci_msix_route_vector(irq_h->dev_private, dest);
 }
 
-static int
-intrenablemsi(Vctl* v, Pcidev *p)
+static int msi_irq_enable(struct irq_handler *irq_h, struct pci_device *p)
 {
-       unsigned int vno, lo, hi;
+       unsigned int vno, lo, hi = 0;
        uint64_t msivec;
+       struct msix_irq_vector *linkage;
 
        vno = nextvec();
 
-       lo = IPlow | TMedge | vno;
-       ioapicintrdd(&hi, &lo);
+       /* routing the IRQ to core 0 (hi = 0) in physical mode (Pm) */
+       lo = IPlow | TMedge | Pm | vno;
 
-       if(lo & Lm)
-               lo |= MTlp;
+       msivec = (uint64_t) hi << 32 | lo;
+       irq_h->dev_private = pci_msix_enable(p, msivec);
+       if (!irq_h->dev_private) {
+               if (pci_msi_enable(p, msivec) == -1) {
+                       /* TODO: should free vno here */
+                       return -1;
+               }
+               irq_h->dev_private = p;
+               irq_h->check_spurious = lapic_check_spurious;
+               irq_h->eoi = lapic_send_eoi;
+               irq_h->mask = msi_mask_irq;
+               irq_h->unmask = msi_unmask_irq;
+               irq_h->route_irq = msi_route_irq;
+               irq_h->type = "msi";
+               printk("MSI irq: (%x,%x,%x): enabling %p %s vno %d\n",
+                          p->bus, p->dev, p->func, msivec, irq_h->name, vno);
+               return vno;
+       }
+       irq_h->check_spurious = lapic_check_spurious;
+       irq_h->eoi = lapic_send_eoi;
+       irq_h->mask = msix_mask_irq;
+       irq_h->unmask = msix_unmask_irq;
+       irq_h->route_irq = msix_route_irq;
+       irq_h->type = "msi-x";
+       printk("MSI-X irq: (%x,%x,%x): enabling %p %s vno %d\n",
+              p->bus, p->dev, p->func, msivec, irq_h->name, vno);
+       return vno;
+}
 
-       msivec = (uint64_t)hi<<32 | lo;
-       if(pcimsienable(p, msivec) == -1)
-               return -1;
-       v->isr = apicisr;
-       v->eoi = apiceoi;
-       v->vno = vno;
-       v->type = "msi";
-       v->mask = msimask;
+/* Reverse lookup from an IDT vector to the RDT it was assigned to.  Returns
+ * the RDT, or NULL (after logging) if apic_vector is outside
+ * [IdtIOAPIC, MaxIdtIOAPIC] or no RDT has been recorded for it. */
+static struct Rdt *ioapic_vector2rdt(int apic_vector)
+{
+       struct Rdt *rdt;
+
+       if (apic_vector < IdtIOAPIC || apic_vector > MaxIdtIOAPIC) {
+               printk("ioapic vector %d out of range\n", apic_vector);
+               return NULL;
+       }
+       /* Fortunately rdtvecno[vecno] is static once assigned. o/w, we'll need some
+        * global sync for the callers, both for lookup and keeping rdt valid. */
+       rdt = rdtvecno[apic_vector];
+       if (!rdt) {
+               printk("vector %d has no RDT! (did you enable it?)\n", apic_vector);
+               return NULL;
+       }
+       return rdt;
+}
 
-       printk("msiirq: %T: enabling %.16llux %s irq %d vno %d\n", p->tbdf, msivec, v->name, v->irq, vno);
-       return vno;
+/* Routes the IRQ to the hw_coreid.  Will take effect immediately.  Route
+ * masking from rdt->lo will take effect.  The early return cases are probably
+ * bugs in IOAPIC irq_h setup.
+ *
+ * - unused: the irq_handler is not consulted; the RDT is found via the vector
+ * - apic_vector: IDT vector that was assigned to an IOAPIC RDT
+ * - hw_coreid: index into xlapic[]; stored shifted left by 24 in the RDT's hi
+ *   word */
+static void ioapic_route_irq(struct irq_handler *unused, int apic_vector,
+                             int hw_coreid)
+{
+       struct Rdt *rdt = ioapic_vector2rdt(apic_vector);
+       if (!rdt) {
+               printk("Missing IOAPIC route for vector %d!\n", apic_vector);
+               return;
+       }
+       spin_lock(&rdt->apic->lock);
+       /* this bit gets set in apicinit, only if we found it via MP or ACPI */
+       if (!xlapic[hw_coreid].useable) {
+               printk("Can't route to uninitialized LAPIC %d!\n", hw_coreid);
+               spin_unlock(&rdt->apic->lock);
+               return;
+       }
+       /* update the cached entry, then push both words to the hardware */
+       rdt->hi = hw_coreid << 24;
+       rdt->lo |= Pm | MTf;
+       rtblput(rdt->apic, rdt->intin, rdt->hi, rdt->lo);
+       spin_unlock(&rdt->apic->lock);
 }
 
-int
-disablemsi(Vctl*, Pcidev *p)
+/* Masks the IOAPIC entry behind apic_vector: sets Im in the cached lo word and
+ * writes the entry back under the IOAPIC's lock.  Does nothing if the vector
+ * has no RDT, or if the RDT has been enabled more than once. */
+static void ioapic_mask_irq(struct irq_handler *unused, int apic_vector)
 {
-       if(p == NULL)
-               return -1;
-       return pcimsimask(p, 1);
+       /* could store the rdt in the irq_h */
+       struct Rdt *rdt = ioapic_vector2rdt(apic_vector);
+       if (!rdt)
+               return;
+       spin_lock(&rdt->apic->lock);
+       /* don't allow shared vectors to be masked.  whatever. */
+       if (rdt->enabled > 1) {
+               spin_unlock(&rdt->apic->lock);
+               return;
+       }
+       rdt->lo |= Im;
+       rtblput(rdt->apic, rdt->intin, rdt->hi, rdt->lo);
+       spin_unlock(&rdt->apic->lock);
 }
 
-int
-ioapicintrenable(Vctl* v)
+/* Unmasks the IOAPIC entry behind apic_vector: clears Im in the cached lo word
+ * and writes the entry back under the IOAPIC's lock.  Does nothing if the
+ * vector has no RDT. */
+static void ioapic_unmask_irq(struct irq_handler *unused, int apic_vector)
+{
+       struct Rdt *entry;
+       struct apic *ioapic;
+
+       entry = ioapic_vector2rdt(apic_vector);
+       if (entry == NULL)
+               return;
+       ioapic = entry->apic;
+       spin_lock(&ioapic->lock);
+       entry->lo &= ~Im;
+       rtblput(ioapic, entry->intin, entry->hi, entry->lo);
+       spin_unlock(&ioapic->lock);
+}
+
+/* Attempts to init a bus interrupt, initializes irq_h, and returns the IDT
+ * vector to use (-1 on error).  If routable, the IRQ will route to core 0.  The
+ * IRQ will be masked, if possible.  Call irq_h->unmask() when you're ready.
+ *
+ * This will determine the type of bus the device is on (LAPIC, IOAPIC, PIC,
+ * etc), and set the appropriate fields in isr_h.  If applicable, it'll also
+ * allocate an IDT vector, such as for an IOAPIC, and route the IOAPIC entries
+ * appropriately.
+ *
+ * Callers init irq_h->dev_irq and ->tbdf.  tbdf encodes the bus type and the
+ * classic PCI bus:dev:func.  dev_irq may be ignored based on the bus type (e.g.
+ * PCI, esp MSI).
+ *
+ * In plan9, this was ioapicintrenable(), which also unmasked.  We don't have a
+ * deinit/disable method that would tear down the route yet.  All the plan9 one
+ * did was dec enabled and mask the entry. */
+int bus_irq_setup(struct irq_handler *irq_h)
 {
        struct Rbus *rbus;
        struct Rdt *rdt;
-       uint32_t hi, lo;
        int busno, devno, vecno;
+       struct pci_device *pcidev;
 
-       /*
-        * Bridge between old and unspecified new scheme,
-        * the work in progress...
-        */
-       if(v->tbdf == BUSUNKNOWN){
-               if(v->irq >= IrqLINT0 && v->irq <= MaxIrqLAPIC){
-                       if(v->irq != IrqSPURIOUS)
-                               v->isr = apiceoi;
-                       v->type = "lapic";
-                       return v->irq;
-               }
-               else{
-                       /*
-                        * Legacy ISA.
-                        * Make a busno and devno using the
-                        * ISA bus number and the irq.
-                        */
-                       extern int mpisabusno;
-
-                       if(mpisabusno == -1)
-                               panic("no ISA bus allocated");
-                       busno = mpisabusno;
-                       devno = v->irq<<2;
+       if (!ioapic_exists()) {
+               switch (BUSTYPE(irq_h->tbdf)) {
+                       case BusLAPIC:
+                       case BusIPI:
+                               break;
+                       default:
+                               irq_h->check_spurious = pic_check_spurious;
+                               irq_h->eoi = pic_send_eoi;
+                               irq_h->mask = pic_mask_irq;
+                               irq_h->unmask = pic_unmask_irq;
+                               irq_h->route_irq = 0;
+                               irq_h->type = "pic";
+                               /* PIC devices have vector = irq + 32 */
+                               return irq_h->dev_irq + IdtPIC;
                }
        }
-       else if(BUSTYPE(v->tbdf) == BusPCI){
-               /*
-                * PCI.
-                * Make a devno from BUSDNO(tbdf) and pcidev->intp.
-                */
-               Pcidev *pcidev;
-
-               busno = BUSBNO(v->tbdf);
-               if((pcidev = pcimatchtbdf(v->tbdf)) == NULL)
-                       panic("no PCI dev for tbdf %#8.8ux", v->tbdf);
-               if((vecno = intrenablemsi(v, pcidev)) != -1)
-                       return vecno;
-               disablemsi(v, pcidev);
-               if((devno = pcicfgr8(pcidev, PciINTP)) == 0)
-                       panic("no INTP for tbdf %#8.8ux", v->tbdf);
-               devno = BUSDNO(v->tbdf)<<2|(devno-1);
-               printk("ioapicintrenable: tbdf %#8.8ux busno %d devno %d\n",
-                       v->tbdf, busno, devno);
-       }
-       else{
-               SET(busno, devno);
-               panic("unknown tbdf %#8.8ux", v->tbdf);
-       }
-
-       rdt = NULL;
-       for(rbus = rdtbus[busno]; rbus != NULL; rbus = rbus->next)
-               if(rbus->devno == devno){
-                       rdt = rbus->rdt;
+       switch (BUSTYPE(irq_h->tbdf)) {
+               case BusLAPIC:
+                       /* nxm used to set the initial 'isr' method (i think equiv to our
+                        * check_spurious) to apiceoi for non-spurious lapic vectors.  in
+                        * effect, i think they were sending the EOI early, and their eoi
+                        * method was 0.  we're not doing that (unless we have to). */
+                       irq_h->check_spurious = lapic_check_spurious;
+                       irq_h->eoi = lapic_send_eoi;
+                       irq_h->mask = lapic_mask_irq;
+                       irq_h->unmask = lapic_unmask_irq;
+                       irq_h->route_irq = 0;
+                       irq_h->type = "lapic";
+                       /* For the LAPIC, irq == vector */
+                       return irq_h->dev_irq;
+               case BusIPI:
+                       /* similar to LAPIC, but we don't actually have LVT entries */
+                       irq_h->check_spurious = lapic_check_spurious;
+                       irq_h->eoi = lapic_send_eoi;
+                       irq_h->mask = 0;
+                       irq_h->unmask = 0;
+                       irq_h->route_irq = 0;
+                       irq_h->type = "IPI";
+                       return irq_h->dev_irq;
+               case BusISA:
+                       if (mpisabusno == -1)
+                               panic("No ISA bus allocated");
+                       busno = mpisabusno;
+                       /* need to track the irq in devno in PCI interrupt assignment entry
+                        * format (see mp.c or MP spec D.3). */
+                       devno = irq_h->dev_irq << 2;
                        break;
-               }
-       if(rdt == NULL){
-               extern int mpisabusno;
-
-               /*
-                * First crack in the smooth exterior of the new code:
-                * some BIOS make an MPS table where the PCI devices are
-                * just defaulted to ISA.
-                * Rewrite this to be cleaner.
-                */
-               if((busno = mpisabusno) == -1)
-                       return -1;
-               devno = v->irq<<2;
-               for(rbus = rdtbus[busno]; rbus != NULL; rbus = rbus->next)
-                       if(rbus->devno == devno){
-                               rdt = rbus->rdt;
-                               break;
+               case BusPCI:
+                       pcidev = pci_match_tbdf(irq_h->tbdf);
+                       if (!pcidev) {
+                               printk("No PCI dev for tbdf %p!", irq_h->tbdf);
+                               return -1;
                        }
-               printk("isa: tbdf %#8.8ux busno %d devno %d %#p\n",
-                       v->tbdf, busno, devno, rdt);
+                       if ((vecno = msi_irq_enable(irq_h, pcidev)) != -1)
+                               return vecno;
+                       busno = BUSBNO(irq_h->tbdf);
+                       assert(busno == pcidev->bus);
+                       devno = pcidev_read8(pcidev, PciINTP);
+
+                       /* this might not be a big deal - some PCI devices have no INTP.  if
+                        * so, change our devno - 1 below. */
+                       if (devno == 0)
+                               panic("no INTP for tbdf %p", irq_h->tbdf);
+                       /* remember, devno is the device shifted with irq pin in bits 0-1.
+                        * we subtract 1, since the PCI intp maps 1 -> INTA, 2 -> INTB, etc,
+                        * and the MP spec uses 0 -> INTA, 1 -> INTB, etc. */
+                       devno = BUSDNO(irq_h->tbdf) << 2 | (devno - 1);
+                       break;
+               default:
+                       panic("Unknown bus type, TBDF %p", irq_h->tbdf);
+       }
+       /* busno and devno are set, regardless of the bustype, enough to find rdt.
+        * these may differ from the values in tbdf. */
+       rdt = rbus_get_rdt(busno, devno);
+       if (!rdt) {
+               /* second chance.  if we didn't find the item the first time, then (if
+                * it exists at all), it wasn't in the MP tables (or we had no tables).
+                * So maybe we can figure it out via ACPI. */
+               acpi_make_rdt(irq_h->tbdf, irq_h->dev_irq, busno, devno);
+               rdt = rbus_get_rdt(busno, devno);
        }
-       if(rdt == NULL)
+       if (!rdt) {
+               printk("Unable to build IOAPIC route for irq %d\n", irq_h->dev_irq);
                return -1;
-
+       }
        /*
-        * Second crack:
         * what to do about devices that intrenable/intrdisable frequently?
         * 1) there is no ioapicdisable yet;
         * 2) it would be good to reuse freed vectors.
         * Oh bugger.
+        * brho: plus the diff btw mask/unmask and enable/disable is unclear
         */
        /*
         * This is a low-frequency event so just lock
@@ -440,58 +632,29 @@ ioapicintrenable(Vctl* v)
         * rather than putting a Lock in each entry.
         */
        spin_lock(&rdt->apic->lock);
-       printk("%T: %ld/%d/%d (%d)\n", v->tbdf, rdt->apic - xioapic, rbus->devno, rdt->intin, devno);
-       if((rdt->lo & 0xff) == 0){
+       /* if a destination has already been picked, we store it in the lo.  this
+        * stays around regardless of enabled/disabled, since we don't reap vectors
+        * yet.  nor do we really mess with enabled... */
+       if ((rdt->lo & 0xff) == 0) {
                vecno = nextvec();
                rdt->lo |= vecno;
                rdtvecno[vecno] = rdt;
-       }else
-               printk("%T: mutiple irq bus %d dev %d\n", v->tbdf, busno, devno);
-
+       } else {
+               printd("%p: mutiple irq bus %d dev %d\n", irq_h->tbdf, busno, devno);
+       }
        rdt->enabled++;
-       lo = (rdt->lo & ~Im);
-       ioapicintrdd(&hi, &lo);
-       rtblput(rdt->apic, rdt->intin, hi, lo);
-       vecno = lo & 0xff;
+       rdt->hi = 0;                    /* route to 0 by default */
+       rdt->lo |= Pm | MTf;
+       rtblput(rdt->apic, rdt->intin, rdt->hi, rdt->lo);
+       vecno = rdt->lo & 0xff;
        spin_unlock(&rdt->apic->lock);
 
-       printk("busno %d devno %d hi %#8.8ux lo %#8.8ux vecno %d\n",
-               busno, devno, hi, lo, vecno);
-       v->isr = apicisr;
-       v->eoi = apiceoi;
-       v->vno = vecno;
-       v->type = "ioapic";
+       irq_h->check_spurious = lapic_check_spurious;
+       irq_h->eoi = lapic_send_eoi;
+       irq_h->mask = ioapic_mask_irq;
+       irq_h->unmask = ioapic_unmask_irq;
+       irq_h->route_irq = ioapic_route_irq;
+       irq_h->type = "ioapic";
 
        return vecno;
 }
-
-int
-ioapicintrdisable(int vecno)
-{
-       struct Rdt *rdt;
-
-       /*
-        * FOV. Oh dear. This isn't very good.
-        * Fortunately rdtvecno[vecno] is static
-        * once assigned.
-        * Must do better.
-        *
-        * What about any pending interrupts?
-        */
-       if(vecno < 0 || vecno > MaxVectorAPIC){
-               panic("ioapicintrdisable: vecno %d out of range", vecno);
-               return -1;
-       }
-       if((rdt = rdtvecno[vecno]) == NULL){
-               panic("ioapicintrdisable: vecno %d has no rdt", vecno);
-               return -1;
-       }
-
-       spin_lock(&rdt->apic->lock);
-       rdt->enabled--;
-       if(rdt->enabled == 0)
-               rtblput(rdt->apic, rdt->intin, 0, rdt->lo);
-       spin_unlock(&rdt->apic->lock);
-
-       return 0;
-}