Commit 02eb84d0ec97f183ac23ee939403a139e8849b1d

Authored by Michael S. Tsirkin
Committed by Anthony Liguori
1 parent bd4b65ee

qemu/pci: MSI-X support functions

Add functions implementing MSI-X support. The first user will be virtio-pci.
Note that the platform must set a flag to declare MSI-X supported: this
is a safety measure to avoid breaking platforms which should support
MSI-X but currently lack it in the interrupt controller emulation.
For PC, this flag will be set by the APIC.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Makefile.target
... ... @@ -495,7 +495,7 @@ endif #CONFIG_BSD_USER
495 495 ifndef CONFIG_USER_ONLY
496 496  
497 497 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o \
498   - gdbstub.o gdbstub-xml.o
  498 + gdbstub.o gdbstub-xml.o msix.o
499 499 # virtio has to be here due to weird dependency between PCI and virtio-net.
500 500 # need to fix this properly
501 501 OBJS+=virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o
... ...
hw/msix.c 0 → 100644
  1 +/*
  2 + * MSI-X device support
  3 + *
  4 + * This module includes support for MSI-X in pci devices.
  5 + *
  6 + * Author: Michael S. Tsirkin <mst@redhat.com>
  7 + *
  8 + * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
  9 + *
  10 + * This work is licensed under the terms of the GNU GPL, version 2. See
  11 + * the COPYING file in the top-level directory.
  12 + */
  13 +
  14 +#include "hw.h"
  15 +#include "msix.h"
  16 +#include "pci.h"
  17 +
  18 +/* Declaration from linux/pci_regs.h */
  19 +#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */
  20 +#define PCI_MSIX_FLAGS 2 /* Table at lower 11 bits */
  21 +#define PCI_MSIX_FLAGS_QSIZE 0x7FF
  22 +#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
  23 +#define PCI_MSIX_FLAGS_BIRMASK (7 << 0)
  24 +
  25 +/* MSI-X capability structure */
  26 +#define MSIX_TABLE_OFFSET 4
  27 +#define MSIX_PBA_OFFSET 8
  28 +#define MSIX_CAP_LENGTH 12
  29 +
  30 +/* MSI enable bit is in byte 1 in FLAGS register */
  31 +#define MSIX_ENABLE_OFFSET (PCI_MSIX_FLAGS + 1)
  32 +#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
  33 +
  34 +/* MSI-X table format */
  35 +#define MSIX_MSG_ADDR 0
  36 +#define MSIX_MSG_UPPER_ADDR 4
  37 +#define MSIX_MSG_DATA 8
  38 +#define MSIX_VECTOR_CTRL 12
  39 +#define MSIX_ENTRY_SIZE 16
  40 +#define MSIX_VECTOR_MASK 0x1
  41 +
  42 +/* How much space does an MSIX table need. */
  43 +/* The spec requires giving the table structure
  44 + * a 4K aligned region all by itself. Align it to
  45 + * target pages so that drivers can do passthrough
  46 + * on the rest of the region. */
  47 +#define MSIX_PAGE_SIZE TARGET_PAGE_ALIGN(0x1000)
  48 +/* Reserve second half of the page for pending bits */
  49 +#define MSIX_PAGE_PENDING (MSIX_PAGE_SIZE / 2)
  50 +#define MSIX_MAX_ENTRIES 32
  51 +
  52 +
  53 +#ifdef MSIX_DEBUG
  54 +#define DEBUG(fmt, ...) \
  55 + do { \
  56 + fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
  57 + } while (0)
  58 +#else
  59 +#define DEBUG(fmt, ...) do { } while(0)
  60 +#endif
  61 +
  62 +/* Flag for interrupt controller to declare MSI-X support */
  63 +int msix_supported;
  64 +
  65 +/* Add MSI-X capability to the config space for the device. */
  66 +/* Given a bar and its size, add MSI-X table on top of it
  67 + * and fill MSI-X capability in the config space.
  68 + * Original bar size must be a power of 2 or 0.
  69 + * New bar size is returned. */
  70 +static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
  71 + unsigned bar_nr, unsigned bar_size)
  72 +{
  73 + int config_offset;
  74 + uint8_t *config;
  75 + uint32_t new_size;
  76 +
  77 + if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1)
  78 + return -EINVAL;
  79 + if (bar_size > 0x80000000)
  80 + return -ENOSPC;
  81 +
  82 + /* Add space for MSI-X structures */
  83 + if (!bar_size)
  84 + new_size = MSIX_PAGE_SIZE;
  85 + else if (bar_size < MSIX_PAGE_SIZE) {
  86 + bar_size = MSIX_PAGE_SIZE;
  87 + new_size = MSIX_PAGE_SIZE * 2;
  88 + } else
  89 + new_size = bar_size * 2;
  90 +
  91 + pdev->msix_bar_size = new_size;
  92 + config_offset = pci_add_capability(pdev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
  93 + if (config_offset < 0)
  94 + return config_offset;
  95 + config = pdev->config + config_offset;
  96 +
  97 + pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1);
  98 + /* Table on top of BAR */
  99 + pci_set_long(config + MSIX_TABLE_OFFSET, bar_size | bar_nr);
  100 + /* Pending bits on top of that */
  101 + pci_set_long(config + MSIX_PBA_OFFSET, (bar_size + MSIX_PAGE_PENDING) |
  102 + bar_nr);
  103 + pdev->msix_cap = config_offset;
  104 + /* Make flags bit writeable. */
  105 + pdev->wmask[config_offset + MSIX_ENABLE_OFFSET] |= MSIX_ENABLE_MASK;
  106 + return 0;
  107 +}
  108 +
  109 +static void msix_free_irq_entries(PCIDevice *dev)
  110 +{
  111 + int vector;
  112 +
  113 + for (vector = 0; vector < dev->msix_entries_nr; ++vector)
  114 + dev->msix_entry_used[vector] = 0;
  115 +}
  116 +
  117 +/* Handle MSI-X capability config write. */
  118 +void msix_write_config(PCIDevice *dev, uint32_t addr,
  119 + uint32_t val, int len)
  120 +{
  121 + unsigned enable_pos = dev->msix_cap + MSIX_ENABLE_OFFSET;
  122 + if (addr + len <= enable_pos || addr > enable_pos)
  123 + return;
  124 +
  125 + if (msix_enabled(dev))
  126 + qemu_set_irq(dev->irq[0], 0);
  127 +}
  128 +
  129 +static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
  130 +{
  131 + PCIDevice *dev = opaque;
  132 + unsigned int offset = addr & (MSIX_PAGE_SIZE - 1);
  133 + void *page = dev->msix_table_page;
  134 + uint32_t val = 0;
  135 +
  136 + memcpy(&val, (void *)((char *)page + offset), 4);
  137 +
  138 + return val;
  139 +}
  140 +
  141 +static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
  142 +{
  143 + fprintf(stderr, "MSI-X: only dword read is allowed!\n");
  144 + return 0;
  145 +}
  146 +
  147 +static uint8_t msix_pending_mask(int vector)
  148 +{
  149 + return 1 << (vector % 8);
  150 +}
  151 +
  152 +static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
  153 +{
  154 + return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
  155 +}
  156 +
  157 +static int msix_is_pending(PCIDevice *dev, int vector)
  158 +{
  159 + return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
  160 +}
  161 +
  162 +static void msix_set_pending(PCIDevice *dev, int vector)
  163 +{
  164 + *msix_pending_byte(dev, vector) |= msix_pending_mask(vector);
  165 +}
  166 +
  167 +static void msix_clr_pending(PCIDevice *dev, int vector)
  168 +{
  169 + *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector);
  170 +}
  171 +
  172 +static int msix_is_masked(PCIDevice *dev, int vector)
  173 +{
  174 + unsigned offset = vector * MSIX_ENTRY_SIZE + MSIX_VECTOR_CTRL;
  175 + return dev->msix_table_page[offset] & MSIX_VECTOR_MASK;
  176 +}
  177 +
  178 +static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
  179 + uint32_t val)
  180 +{
  181 + PCIDevice *dev = opaque;
  182 + unsigned int offset = addr & (MSIX_PAGE_SIZE - 1);
  183 + int vector = offset / MSIX_ENTRY_SIZE;
  184 + memcpy(dev->msix_table_page + offset, &val, 4);
  185 + if (!msix_is_masked(dev, vector) && msix_is_pending(dev, vector)) {
  186 + msix_clr_pending(dev, vector);
  187 + msix_notify(dev, vector);
  188 + }
  189 +}
  190 +
  191 +static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
  192 + uint32_t val)
  193 +{
  194 + fprintf(stderr, "MSI-X: only dword write is allowed!\n");
  195 +}
  196 +
  197 +static CPUWriteMemoryFunc *msix_mmio_write[] = {
  198 + msix_mmio_write_unallowed, msix_mmio_write_unallowed, msix_mmio_writel
  199 +};
  200 +
  201 +static CPUReadMemoryFunc *msix_mmio_read[] = {
  202 + msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
  203 +};
  204 +
  205 +/* Should be called from device's map method. */
  206 +void msix_mmio_map(PCIDevice *d, int region_num,
  207 + uint32_t addr, uint32_t size, int type)
  208 +{
  209 + uint8_t *config = d->config + d->msix_cap;
  210 + uint32_t table = pci_get_long(config + MSIX_TABLE_OFFSET);
  211 + uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
  212 + /* TODO: for assigned devices, we'll want to make it possible to map
  213 + * pending bits separately in case they are in a separate bar. */
  214 + int table_bir = table & PCI_MSIX_FLAGS_BIRMASK;
  215 +
  216 + if (table_bir != region_num)
  217 + return;
  218 + if (size <= offset)
  219 + return;
  220 + cpu_register_physical_memory(addr + offset, size - offset,
  221 + d->msix_mmio_index);
  222 +}
  223 +
  224 +/* Initialize the MSI-X structures. Note: if MSI-X is supported, BAR size is
  225 + * modified, it should be retrieved with msix_bar_size. */
  226 +int msix_init(struct PCIDevice *dev, unsigned short nentries,
  227 + unsigned bar_nr, unsigned bar_size)
  228 +{
  229 + int ret;
  230 + /* Nothing to do if MSI is not supported by interrupt controller */
  231 + if (!msix_supported)
  232 + return -ENOTSUP;
  233 +
  234 + if (nentries > MSIX_MAX_ENTRIES)
  235 + return -EINVAL;
  236 +
  237 + dev->msix_entry_used = qemu_mallocz(MSIX_MAX_ENTRIES *
  238 + sizeof *dev->msix_entry_used);
  239 +
  240 + dev->msix_table_page = qemu_mallocz(MSIX_PAGE_SIZE);
  241 +
  242 + dev->msix_mmio_index = cpu_register_io_memory(msix_mmio_read,
  243 + msix_mmio_write, dev);
  244 + if (dev->msix_mmio_index == -1) {
  245 + ret = -EBUSY;
  246 + goto err_index;
  247 + }
  248 +
  249 + dev->msix_entries_nr = nentries;
  250 + ret = msix_add_config(dev, nentries, bar_nr, bar_size);
  251 + if (ret)
  252 + goto err_config;
  253 +
  254 + dev->cap_present |= QEMU_PCI_CAP_MSIX;
  255 + return 0;
  256 +
  257 +err_config:
  258 + cpu_unregister_io_memory(dev->msix_mmio_index);
  259 +err_index:
  260 + qemu_free(dev->msix_table_page);
  261 + dev->msix_table_page = NULL;
  262 + qemu_free(dev->msix_entry_used);
  263 + dev->msix_entry_used = NULL;
  264 + return ret;
  265 +}
  266 +
  267 +/* Clean up resources for the device. */
  268 +int msix_uninit(PCIDevice *dev)
  269 +{
  270 + if (!(dev->cap_present & QEMU_PCI_CAP_MSIX))
  271 + return 0;
  272 + pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
  273 + dev->msix_cap = 0;
  274 + msix_free_irq_entries(dev);
  275 + dev->msix_entries_nr = 0;
  276 + cpu_unregister_io_memory(dev->msix_mmio_index);
  277 + qemu_free(dev->msix_table_page);
  278 + dev->msix_table_page = NULL;
  279 + qemu_free(dev->msix_entry_used);
  280 + dev->msix_entry_used = NULL;
  281 + dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
  282 + return 0;
  283 +}
  284 +
  285 +void msix_save(PCIDevice *dev, QEMUFile *f)
  286 +{
  287 + unsigned nentries = (pci_get_word(dev->config + PCI_MSIX_FLAGS) &
  288 + PCI_MSIX_FLAGS_QSIZE) + 1;
  289 + qemu_put_buffer(f, dev->msix_table_page, nentries * MSIX_ENTRY_SIZE);
  290 + qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING,
  291 + (nentries + 7) / 8);
  292 +}
  293 +
  294 +/* Should be called after restoring the config space. */
  295 +void msix_load(PCIDevice *dev, QEMUFile *f)
  296 +{
  297 + unsigned n = dev->msix_entries_nr;
  298 +
  299 + if (!dev->cap_present & QEMU_PCI_CAP_MSIX)
  300 + return;
  301 +
  302 + qemu_get_buffer(f, dev->msix_table_page, n * MSIX_ENTRY_SIZE);
  303 + qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
  304 +}
  305 +
  306 +/* Does device support MSI-X? */
  307 +int msix_present(PCIDevice *dev)
  308 +{
  309 + return dev->cap_present & QEMU_PCI_CAP_MSIX;
  310 +}
  311 +
  312 +/* Is MSI-X enabled? */
  313 +int msix_enabled(PCIDevice *dev)
  314 +{
  315 + return (dev->cap_present & QEMU_PCI_CAP_MSIX) &&
  316 + (dev->config[dev->msix_cap + MSIX_ENABLE_OFFSET] &
  317 + MSIX_ENABLE_MASK);
  318 +}
  319 +
  320 +/* Size of bar where MSI-X table resides, or 0 if MSI-X not supported. */
  321 +uint32_t msix_bar_size(PCIDevice *dev)
  322 +{
  323 + return (dev->cap_present & QEMU_PCI_CAP_MSIX) ?
  324 + dev->msix_bar_size : 0;
  325 +}
  326 +
  327 +/* Send an MSI-X message */
  328 +void msix_notify(PCIDevice *dev, unsigned vector)
  329 +{
  330 + uint8_t *table_entry = dev->msix_table_page + vector * MSIX_ENTRY_SIZE;
  331 + uint64_t address;
  332 + uint32_t data;
  333 +
  334 + if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
  335 + return;
  336 + if (msix_is_masked(dev, vector)) {
  337 + msix_set_pending(dev, vector);
  338 + return;
  339 + }
  340 +
  341 + address = pci_get_long(table_entry + MSIX_MSG_UPPER_ADDR);
  342 + address = (address << 32) | pci_get_long(table_entry + MSIX_MSG_ADDR);
  343 + data = pci_get_long(table_entry + MSIX_MSG_DATA);
  344 + stl_phys(address, data);
  345 +}
  346 +
  347 +void msix_reset(PCIDevice *dev)
  348 +{
  349 + if (!(dev->cap_present & QEMU_PCI_CAP_MSIX))
  350 + return;
  351 + msix_free_irq_entries(dev);
  352 + dev->config[dev->msix_cap + MSIX_ENABLE_OFFSET] &= MSIX_ENABLE_MASK;
  353 + memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
  354 +}
  355 +
  356 +/* PCI spec suggests that devices make it possible for software to configure
  357 + * less vectors than supported by the device, but does not specify a standard
  358 + * mechanism for devices to do so.
  359 + *
  360 + * We support this by asking devices to declare vectors software is going to
  361 + * actually use, and checking this on the notification path. Devices that
  362 + * don't want to follow the spec suggestion can declare all vectors as used. */
  363 +
  364 +/* Mark vector as used. */
  365 +int msix_vector_use(PCIDevice *dev, unsigned vector)
  366 +{
  367 + if (vector >= dev->msix_entries_nr)
  368 + return -EINVAL;
  369 + dev->msix_entry_used[vector]++;
  370 + return 0;
  371 +}
  372 +
  373 +/* Mark vector as unused. */
  374 +void msix_vector_unuse(PCIDevice *dev, unsigned vector)
  375 +{
  376 + if (vector < dev->msix_entries_nr && dev->msix_entry_used[vector])
  377 + --dev->msix_entry_used[vector];
  378 +}
... ...
hw/msix.h 0 → 100644
  1 +#ifndef QEMU_MSIX_H
  2 +#define QEMU_MSIX_H
  3 +
  4 +#include "qemu-common.h"
  5 +
  6 +int msix_init(PCIDevice *pdev, unsigned short nentries,
  7 + unsigned bar_nr, unsigned bar_size);
  8 +
  9 +void msix_write_config(PCIDevice *pci_dev, uint32_t address,
  10 + uint32_t val, int len);
  11 +
  12 +void msix_mmio_map(PCIDevice *pci_dev, int region_num,
  13 + uint32_t addr, uint32_t size, int type);
  14 +
  15 +int msix_uninit(PCIDevice *d);
  16 +
  17 +void msix_save(PCIDevice *dev, QEMUFile *f);
  18 +void msix_load(PCIDevice *dev, QEMUFile *f);
  19 +
  20 +int msix_enabled(PCIDevice *dev);
  21 +int msix_present(PCIDevice *dev);
  22 +
  23 +uint32_t msix_bar_size(PCIDevice *dev);
  24 +
  25 +int msix_vector_use(PCIDevice *dev, unsigned vector);
  26 +void msix_vector_unuse(PCIDevice *dev, unsigned vector);
  27 +
  28 +void msix_notify(PCIDevice *dev, unsigned vector);
  29 +
  30 +void msix_reset(PCIDevice *dev);
  31 +
  32 +extern int msix_supported;
  33 +
  34 +#endif
... ...
hw/pci.h
... ... @@ -155,6 +155,11 @@ typedef struct PCIIORegion {
155 155 /* Size of the standard PCI config space */
156 156 #define PCI_CONFIG_SPACE_SIZE 0x100
157 157  
  158 +/* Bits in cap_present field. */
  159 +enum {
  160 + QEMU_PCI_CAP_MSIX = 0x1,
  161 +};
  162 +
158 163 struct PCIDevice {
159 164 DeviceState qdev;
160 165 /* PCI config space */
... ... @@ -186,6 +191,24 @@ struct PCIDevice {
186 191  
187 192 /* Current IRQ levels. Used internally by the generic PCI code. */
188 193 int irq_state[4];
  194 +
  195 + /* Capability bits */
  196 + uint32_t cap_present;
  197 +
  198 + /* Offset of MSI-X capability in config space */
  199 + uint8_t msix_cap;
  200 +
  201 + /* MSI-X entries */
  202 + int msix_entries_nr;
  203 +
  204 + /* Space to store MSIX table */
  205 + uint8_t *msix_table_page;
  206 + /* MMIO index used to map MSIX table and pending bit entries. */
  207 + int msix_mmio_index;
  208 + /* Reference-count for entries actually in use by driver. */
  209 + unsigned *msix_entry_used;
  210 + /* Region including the MSI-X table */
  211 + uint32_t msix_bar_size;
189 212 };
190 213  
191 214 PCIDevice *pci_register_device(PCIBus *bus, const char *name,
... ...