virtio bus结构

注册virtio bus
// drivers/virtio/virtio.c
/*
 * Bus type describing the virtio bus: its name plus the bus-level callbacks
 * (match, uevent, probe, remove) and the default device attribute groups.
 */
static struct bus_type virtio_bus = {
        .name  = "virtio",
        .match = virtio_dev_match,
        .dev_groups = virtio_dev_groups,
        .uevent = virtio_uevent,
        .probe = virtio_dev_probe,
        .remove = virtio_dev_remove,
};

/*
 * Register the virtio bus with the driver core.
 *
 * A registration failure is treated as fatal (panic), so the function only
 * ever returns 0.
 */
static int virtio_init(void)
{
        int err = bus_register(&virtio_bus);

        if (err)
                panic("virtio bus registration failed");

        return 0;
}

/* Exit path: unregisters the virtio bus, then destroys virtio_index_ida. */
static void __exit virtio_exit(void)
{
        bus_unregister(&virtio_bus);
        ida_destroy(&virtio_index_ida);
}
/* Hook virtio_init into the "core" initcall level. */
core_initcall(virtio_init);
/* Register virtio_exit as the module exit handler. */
module_exit(virtio_exit);

MODULE_LICENSE("GPL");
/*
 * A "pure" initcall has no dependencies on anything else, and purely
 * initializes variables that couldn't be statically initialized.
 *
 * This only exists for built-in code, not for modules.
 * Keep main.c:initcall_level_names[] in sync.
 */
#define pure_initcall(fn)                __define_initcall(fn, 0)

/*
 * Numbered initcall levels: each macro hands fn and a level id to
 * __define_initcall. The "_sync" variants use the same number with an
 * "s" suffix.
 * NOTE(review): presumably lower levels run earlier during boot - confirm
 * against __define_initcall and main.c.
 */
#define core_initcall(fn)                __define_initcall(fn, 1)
#define core_initcall_sync(fn)                __define_initcall(fn, 1s)
#define postcore_initcall(fn)                __define_initcall(fn, 2)
#define postcore_initcall_sync(fn)        __define_initcall(fn, 2s)
#define arch_initcall(fn)                __define_initcall(fn, 3)
#define arch_initcall_sync(fn)                __define_initcall(fn, 3s)
#define subsys_initcall(fn)                __define_initcall(fn, 4)
#define subsys_initcall_sync(fn)        __define_initcall(fn, 4s)
#define fs_initcall(fn)                        __define_initcall(fn, 5)
#define fs_initcall_sync(fn)                __define_initcall(fn, 5s)
#define rootfs_initcall(fn)                __define_initcall(fn, rootfs)
#define device_initcall(fn)                __define_initcall(fn, 6)
#define device_initcall_sync(fn)        __define_initcall(fn, 6s)
#define late_initcall(fn)                __define_initcall(fn, 7)
#define late_initcall_sync(fn)                __define_initcall(fn, 7s)

virtio bus通过core_initcall注册其初始化函数virtio_init,因此它在内核启动过程中的初始化顺序非常靠前。这样virtio-clk、virtio-irq等设备也可以基于virtio架构实现半虚拟化。

virtio_dev_match函数
// include/linux/mod_devicetable.h
/* Identifies a virtio device by its device id and vendor id. */
struct virtio_device_id {
        __u32 device;    // device id
        __u32 vendor;    // vendor id
};
/*
 * Wildcard id value: virtio_id_match() treats a driver table entry that
 * carries it as matching any device id / vendor id.
 */
#define VIRTIO_DEV_ANY_ID        0xffffffff

device定义详见:virtio设备

// drivers/virtio/virtio.c
/*
 * Check whether one driver id-table entry matches a device.
 *
 * @dev: device whose id is being tested
 * @id:  id-table entry supplied by a driver
 *
 * Each of the two fields (device, vendor) matches when the table entry holds
 * VIRTIO_DEV_ANY_ID or the same value as the device. Returns 1 when both
 * fields match, 0 otherwise.
 */
static inline int virtio_id_match(const struct virtio_device *dev,
                                  const struct virtio_device_id *id)
{
        const int device_ok = id->device == VIRTIO_DEV_ANY_ID ||
                              id->device == dev->id.device;
        const int vendor_ok = id->vendor == VIRTIO_DEV_ANY_ID ||
                              id->vendor == dev->id.vendor;

        return device_ok && vendor_ok;
}

/* This looks through all the IDs a driver claims to support.  If any of them
 * match, we return 1 and the kernel will call virtio_dev_probe(). */
static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
{
        unsigned int i;
        // 根据device 结构索引获取 virtio_device
        struct virtio_device *dev = dev_to_virtio(_dv);
        const struct virtio_device_id *ids;

        // 根据 device_driver结构,获取 virtio_driver,
        // 并取出其 id_table,且必须以0为结尾,以便结束循环
        ids = drv_to_virtio(_dr)->id_table;
        for (i = 0; ids[i].device; i++)
                if (virtio_id_match(dev, &ids[i]))
                        return 1;
        return 0;
}

virtio_device结构

// include/linux/virtio.h
/**
 * virtio_device - representation of a device using virtio
 * @index: unique position on the virtio bus
 * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
 * @config_enabled: configuration change reporting enabled
 * @config_change_pending: configuration change reported while disabled
 * @config_lock: protects configuration change reporting
 * @dev: underlying device.
 * @id: the device type identification (used to match it with a driver).
 * @config: the configuration ops for this device.
 * @vringh_config: configuration ops for host vrings.
 * @vqs: the list of virtqueues for this device.
 * @features: the features supported by both driver and device.
 * @priv: private pointer for the driver's use.
 */
struct virtio_device {
        int index;
        bool failed;
        bool config_enabled;
        bool config_change_pending;
        spinlock_t config_lock;
        struct device dev;
        struct virtio_device_id id;             // id of this device
        const struct virtio_config_ops *config; // configuration ops of this virtio_device
        const struct vringh_config_ops *vringh_config;
        struct list_head vqs;                   // list of this device's virtqueues
        u64 features;                           // features used for device communication
        void *priv;
};
  • struct virtio_config_ops

virtio_config_ops操作集中的函数主要与virtio_device的配置相关,主要有如下2类操作,

① 实例化 / 反实例化virtqueue,其中要特别注意find_vqs函数,该函数用于实例化virtio_device所持有的virtqueue

② 获取 / 设置virtio_device的属性与状态

// include/linux/virtio_config.h
/* Table of operations used to configure a virtio_device and its virtqueues. */
struct virtio_config_ops {
        // Fetch data from the host side (offset/len select the range, result goes to buf)
        void (*get)(struct virtio_device *vdev, unsigned offset,
                    void *buf, unsigned len);
        // Send data to the host side (offset/len select the range, data comes from buf)
        void (*set)(struct virtio_device *vdev, unsigned offset,
                    const void *buf, unsigned len);
        u32 (*generation)(struct virtio_device *vdev);

        // Read the status as seen on the host side
        u8 (*get_status)(struct virtio_device *vdev);

        // Set the status from the guest side
        void (*set_status)(struct virtio_device *vdev, u8 status);

        // Ask the host side to reset the device
        void (*reset)(struct virtio_device *vdev);

        // Instantiate the virtqueues owned by the virtio_device
        int (*find_vqs)(struct virtio_device *, unsigned nvqs,
                        struct virtqueue *vqs[], vq_callback_t *callbacks[],
                        const char * const names[], const bool *ctx,
                        struct irq_affinity *desc);
        void (*del_vqs)(struct virtio_device *);

        // Read the feature bits
        u64 (*get_features)(struct virtio_device *vdev);
        int (*finalize_features)(struct virtio_device *vdev);

        // Name of the underlying bus, e.g. platform, pci
        const char *(*bus_name)(struct virtio_device *vdev);
        int (*set_vq_affinity)(struct virtqueue *vq,
                               const struct cpumask *cpu_mask);
        const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev,
                        int index);
};

virtio_driver结构

// include/linux/virtio.h
/**
 * virtio_driver - operations for a virtio I/O driver
 * @driver: underlying device driver (populate name and owner).
 * @id_table: the ids serviced by this driver.
 * @feature_table: an array of feature numbers supported by this driver.
 * @feature_table_size: number of entries in the feature table array.
 * @feature_table_legacy: same as feature_table but when working in legacy mode.
 * @feature_table_size_legacy: number of entries in feature table legacy array.
 * @probe: the function to call when a device is found.  Returns 0 or -errno.
 * @scan: optional function to call after successful probe; intended
 *    for virtio-scsi to invoke a scan.
 * @remove: the function to call when a device is removed.
 * @config_changed: optional function to call when the device configuration
 *    changes; may be called in interrupt context.
 * @freeze: optional function to call during suspend/hibernation.
 * @restore: optional function to call on resume.
 */
struct virtio_driver {
        struct device_driver driver;
        const struct virtio_device_id *id_table;        // table of supported ids
        const unsigned int *feature_table;              // supported feature numbers
        unsigned int feature_table_size;
        const unsigned int *feature_table_legacy;
        unsigned int feature_table_size_legacy;
        /* NOTE(review): validate is not covered by the kernel-doc block above. */
        int (*validate)(struct virtio_device *dev);
        int (*probe)(struct virtio_device *dev);        // probe function
        void (*scan)(struct virtio_device *dev);
        void (*remove)(struct virtio_device *dev);
        void (*config_changed)(struct virtio_device *dev);
#ifdef CONFIG_PM
        int (*freeze)(struct virtio_device *dev);
        int (*restore)(struct virtio_device *dev);
#endif
};

virtqueue结构

// include/linux/virtio.h
/**
 * virtqueue - a queue to register buffers for sending or receiving.
 * @list: the chain of virtqueues for this device
 * @callback: the function to call when buffers are consumed (can be NULL).
 * @name: the name of this virtqueue (mainly for debugging)
 * @vdev: the virtio device this queue was created for.
 * @priv: a pointer for the virtqueue implementation to use.
 * @index: the zero-based ordinal number for this queue.
 * @num_free: number of elements we expect to be able to fit.
 *
 * A note on @num_free: with indirect buffers, each buffer needs one
 * element in the queue, otherwise a buffer will need one element per
 * sg element.
 */
struct virtqueue {
        // Links this queue into the owning virtio_device's vqs list
        struct list_head list;
        // Callback invoked when buffers are consumed (may be NULL)
        void (*callback)(struct virtqueue *vq);
        // Name of this virtqueue
        const char *name;
        // The virtio_device this virtqueue belongs to
        struct virtio_device *vdev;
        // Zero-based ordinal number of this virtqueue
        unsigned int index;
        // Number of free elements (descriptors) left in the queue
        unsigned int num_free;
        void *priv;
};

vring结构

  • 数据结构定义

  • vring的三个构成

    • ① Descriptor Table:描述内存buffer,主要包括addr & len等信息
    • ② Avail Ring:用于前端驱动(Guest)通知后端驱动(Host)有可用的描述符

e.g. 前端驱动有一个报文需要发送,需要将其加入Avail Ring,之后通知后端驱动读取

  • ③ Used Ring:用于后端驱动(Host)通知前端驱动(Guest)有可用的描述符,或者是后端驱动已将前端驱动提供的描述符使用完毕

e.g. 后端驱动有一个报文需要发送,需要将其加入Used Ring,之后通知前端驱动读取

可见avail & used的命名都是站在Host的角度进行的

  • vring的存储

vring结构只是用于描述vring在内存中的布局(因此包含的都是指针变量),实际用于通信的vring是存储在内存中

上文提到的vring的三个区域是在内存中连续存储的,而且是存储在Guest & Host共享的一片连续内存中

我们可以通过vring_init函数理解vring存储结构的布局:

/**
 * vring_init - lay out a split vring over a caller-provided memory region
 * @vr: vring structure to initialize
 * @num: size of the vring, i.e. the number of descriptors
 * @p: start address of the memory that stores the actual vring
 * @align: alignment required between the regions of the vring; presumably
 *         a power of two, since it is applied as a bit mask
 *
 * The descriptor table starts at @p and the avail ring follows it directly.
 * The used ring starts at the first @align boundary after avail->ring[num]
 * plus one more __virtio16.
 */
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
                              unsigned long align)
{
        vr->num = num;
        vr->desc = p;
        /*
         * Compute the byte offset through char *: arithmetic on a void
         * pointer is a GNU extension, not standard C.
         */
        vr->avail = (void *)((char *)p + num * sizeof(struct vring_desc));
        vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
                + align-1) & ~(align - 1));
}

实际vring的内存布局如下图所示:

在计算used ring的起始地址时,在avail->ring[num]的地址之后又加了sizeof(__virtio16),也就是增加了2B,是为了容纳avail ring末尾的used_event。

  • vring的大小

实际vring的大小可以通过vring_size函数获得

/**
 * vring_size - number of bytes occupied by a split vring
 * @num: size of the vring, i.e. the number of descriptors
 * @align: alignment required between the regions of the vring
 *
 * Returns the total size, including the padding inserted so that the used
 * ring starts on an @align boundary.
 */
static inline unsigned vring_size(unsigned int num, unsigned long align)
{
        /* Descriptor table, then the avail ring: num entries plus 3 extra 16-bit fields. */
        unsigned long used_offset = sizeof(struct vring_desc) * num
                + sizeof(__virtio16) * (3 + num);

        /* Round up to the boundary where the used ring begins. */
        used_offset = (used_offset + align - 1) & ~(align - 1);

        /* Used ring: 3 16-bit fields plus num used elements. */
        return used_offset + sizeof(__virtio16) * 3
                + sizeof(struct vring_used_elem) * num;
}

① 计算avail ring时加3,分别为flags、idx和used_event

② 计算used ring时加3,分别为flags、idx和avail_event

③ 计算过程中,包含了为满足对齐要求padding的空间

  • used_event 与 avail_event 机制

这2个字段均与virtio设备的VIRTIO_RING_F_EVENT_IDX特性有关,由于virtio驱动触发对方中断将导致CPU反复进出虚拟机 & 宿主机模式,从而降低性能,因此需要控制触发中断频率的机制。

  • ① avail ring中的used_event

a. 由前端驱动(Guest)设置,标识希望后端驱动(Host)触发中断的阈值

b. 后端驱动(Host)在向Used Ring加入buffer后,检查Used Ring中的idx字段,只有达到阈值才触发中断

  • ② used_ring中的avail_event

a. 由后端驱动(Host)设置,标识希望前端驱动(Guest)发出通知(kick)的阈值

b. 前端驱动(Guest)在向Avail Ring加入buffer后,检查Avail Ring的idx字段,只有达到阈值才通知(kick)后端驱动

综上所述,vring结构的构成如下图所示,

vring_virtqueue结构

vring_virtqueue结构用于描述前端驱动(Guest)中的一条虚拟队列

// drivers/virtio/virtio_ring.c


/*
 * Front-end (guest) side state of one virtqueue. It embeds the generic
 * struct virtqueue and keeps either split-ring or packed-ring bookkeeping
 * in the anonymous union.
 */
struct vring_virtqueue {
        // Generic virtio-layer virtqueue
        struct virtqueue vq;

        /* Is this a packed ring? */
        bool packed_ring;

        /* Is DMA API used? */
        bool use_dma_api;

        /* Can we use weak barriers? */
        bool weak_barriers;

        /* Other side has made a mess, don't try any more. */
        // Tracks whether the back-end driver is still in a sane state
        bool broken;

        /* Host supports indirect buffers */
        // Whether indirect descriptors are supported, i.e. a descriptor that
        // points at further descriptors rather than at a guest-physical address
        bool indirect;

        /* Host publishes avail event idx */
        // Whether event-based notification flow control is supported
        bool event;

        /* Head of free buffer list. */
        // Index of the first free entry in the vring descriptor table,
        // i.e. the head of the free list
        unsigned int free_head;

        /* Number we've added since last sync. */
        // Number of requests added to the avail ring since the back-end
        // driver (host) was last notified
        unsigned int num_added;

        /* Last used index we've seen. */
        // Used-ring index the front-end driver (guest) read last
        u16 last_used_idx;

        union {
                /* Available for split ring */
                struct {
                        /* Actual memory layout for this queue. */
                        struct vring vring;

                        /* Last written value to avail->flags */
                        // Value most recently written to the avail flags
                        u16 avail_flags_shadow;

                        /*
                         * Last written value to avail->idx in
                         * guest byte order.
                         */
                        // Value most recently written to the avail ring index
                        u16 avail_idx_shadow;

                        /* Per-descriptor state. */
                        // Array sized to the virtqueue; holds a context record
                        // for each descriptor that gets added. Used by the
                        // front-end driver only; the back-end never sees it.
                        struct vring_desc_state_split *desc_state;

                        /* DMA address and size information */
                        dma_addr_t queue_dma_addr;
                        size_t queue_size_in_bytes;
                } split;

                /* Available for packed ring */
                struct {
                        /* Actual memory layout for this queue. */
                        struct {
                                unsigned int num;
                                struct vring_packed_desc *desc;                        // Descriptor Ring
                                struct vring_packed_desc_event *driver;        // Driver Event Suppression
                                struct vring_packed_desc_event *device;        // Device Event Suppression
                        } vring;

                        /* Driver ring wrap counter. */
                        bool avail_wrap_counter;

                        /* Device ring wrap counter. */
                        bool used_wrap_counter;

                        /* Avail used flags. */
                        u16 avail_used_flags;

                        /* Index of the next avail descriptor. */
                        u16 next_avail_idx;

                        /*
                         * Last written value to driver->flags in
                         * guest byte order.
                         */
                        u16 event_flags_shadow;

                        /* Per-descriptor state. */
                        struct vring_desc_state_packed *desc_state;
                        struct vring_desc_extra_packed *desc_extra;

                        /* DMA address and size information */
                        dma_addr_t ring_dma_addr;
                        dma_addr_t driver_event_dma_addr;
                        dma_addr_t device_event_dma_addr;
                        size_t ring_size_in_bytes;
                        size_t event_size_in_bytes;
                } packed;
        };

        /* How to notify other side. FIXME: commonalize hcalls! */
        // Callback used to notify the back-end driver (host)
        bool (*notify)(struct virtqueue *vq);

        /* DMA, allocation, and size information */
        bool we_own_ring;

#ifdef DEBUG
        /* They're supposed to lock for us. */
        unsigned int in_use;

        /* Figure out if their kicks are too delayed. */
        bool last_add_time_valid;
        ktime_t last_add_time;
#endif
};

数据结构小结

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐