• 首页 首页 icon
  • 工具库 工具库 icon
    • IP查询 IP查询 icon
  • 内容库 内容库 icon
    • 快讯库 快讯库 icon
    • 精品库 精品库 icon
    • 问答库 问答库 icon
  • 更多 更多 icon
    • 服务条款 服务条款 icon

Linux Kernel 6.0 CXL Core pci.c

武飞扬头像
Happy_Enger
帮助1

前言

CXL 是一个比较新的技术,所以我研究的内核源码是选了当前比较新的内核版本 linux 6.0。打算将内核关于 CXL 的驱动进行解析一遍,一步一步慢慢来。

在阅读之前,希望读者具备一定的 PCIe 基础知识。由于精力有限,无法把所有知识点都讲得很详细,需要一定的基础才能理解。同时,希望在学习的过程中,手边能有 PCIe 5.0 Spec 以及 CXL 2.0 Spec,以便随时查看。当然,我也会尽量把重点部分截图放在博文中。

最后,如果有问题请留言讨论。

相关链接

Linux Kernel 6.0 CXL Core Regs.c 详解

Ref

《PCI_Express_Base_5.0r1.0》
《CXL Specification_rev2p0_ver1p0_2020Oct26》

正文

首先,这仍然是一个 PCI 设备驱动模型:根据 pci_device_id 中的 Class 去匹配设备,匹配成功后调用 probe 函数。


/*
 * PCI match table for this driver: one class-based entry followed by the
 * empty terminator entry. The class value is built from
 * PCI_CLASS_MEMORY_CXL shifted left by 8 and OR-ed with CXL_MEMORY_PROGIF;
 * the ~0 argument is presumably the class mask (all bits must match) —
 * confirm against PCI_DEVICE_CLASS().
 */
static const struct pci_device_id cxl_mem_pci_tbl[] = {
	/* PCI class code for CXL.mem Type-3 Devices */
	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
	{ /* terminate list */ },
};
/* Publish the table as this module's "pci" device table. */
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);

/*
 * PCI driver descriptor: ties the class-based match table to the probe
 * callback and requests asynchronous probing.
 */
static struct pci_driver cxl_pci_driver = {
	.name			= KBUILD_MODNAME,
	/* Match table */
	.id_table		= cxl_mem_pci_tbl,
	/* Called back when a device matches an entry in the table */
	.probe			= cxl_pci_probe,
	.driver	= {
		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
	},
};

以下对 probe 函数进行解析:


/*
 * cxl_pci_probe() - probe callback for a matched CXL memory device.
 * @pdev: the PCI device that matched cxl_mem_pci_tbl
 * @id: the matching table entry (not used in this function)
 *
 * Enables the device, creates the cxl_dev_state, locates and maps the memory
 * device registers, records the component register base, creates the DOE
 * mailboxes, brings up the primary mailbox, queries the device, and finally
 * registers the memdev (plus an nvdimm when a pmem range exists and
 * CONFIG_CXL_PMEM is enabled).
 *
 * Return: 0 on success, otherwise the error from the first mandatory step
 * that failed (or from devm_cxl_add_nvdimm()).
 */
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct cxl_register_map map;
	struct cxl_memdev *cxlmd;
	struct cxl_dev_state *cxlds;
	int rc;

	/*
	 * Double check the anonymous union trickery in struct cxl_regs
	 * FIXME switch to struct_group()
	 */
	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
		     offsetof(struct cxl_regs, device_regs.memdev));

	/* Enable the device */
	rc = pcim_enable_device(pdev);
	if (rc)
		return rc;
	/* Allocate struct cxl_dev_state and initialise part of its fields */
	cxlds = cxl_dev_state_create(&pdev->dev);
	if (IS_ERR(cxlds))
		return PTR_ERR(cxlds);

	/*
	 * Looks up the PCI_EXT_CAP_ID_DSN and reads the 8 bytes of the Device
	 * Serial Number. The Device Serial Number is two dwords offset 4 bytes
	 * from the capability position. See the PCIe spec for details.
	 */
	cxlds->serial = pci_get_dsn(pdev);

	/*
	 * Walk the extended capability area of PCIe configuration space
	 * looking for a DVSEC whose vendor is PCI_DVSEC_VENDOR_ID_CXL and
	 * whose DVSEC ID is CXL_DVSEC_PCIE_DEVICE (0), i.e. the
	 * "DVSEC for CXL Device".
	 * NOTE(review): the original note quoted 0x23 next to the vendor ID
	 * and also described 0x23 as the value marking a capability as a
	 * DVSEC; it is presumably the extended capability ID rather than the
	 * vendor ID — confirm against the header definitions.
	 * A DVSEC can be thought of as a region holding a set of registers.
	 * Ref PCIe 5.0 Spec 7.9.6 Designated Vendor-Specific Extended
	 * Capability (DVSEC); ID assignment: CXL 2.0 Spec 8.1.1 PCI Express
	 * Designated Vendor-Specific Extended Capability (DVSEC) ID Assignment.
	 */
	cxlds->cxl_dvsec = pci_find_dvsec_capability(
		pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
	if (!cxlds->cxl_dvsec)
		dev_warn(&pdev->dev,
			 "Device DVSEC not present, skip CXL.mem init\n");

	/* The memory device register block is mandatory: fail probe without it */
	rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
	if (rc)
		return rc;

	rc = cxl_map_regs(cxlds, &map);
	if (rc)
		return rc;

	/*
	 * If the component registers can't be found, the cxl_pci driver may
	 * still be useful for management functions so don't return an error.
	 */
	cxlds->component_reg_phys = CXL_RESOURCE_NONE;
	/* Locate the register block, map it, and record its position and size */
	rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
	if (rc)
		dev_warn(&pdev->dev, "No component registers (%d)\n", rc);

	/*
	 * Physical base address of the component register block
	 * (CXL_REGLOC_RBI_COMPONENT).
	 * NOTE(review): this assignment also runs when cxl_setup_regs() failed
	 * above; presumably cxl_regmap_to_base() yields CXL_RESOURCE_NONE for
	 * a block that was not located — confirm.
	 */
	cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map);

	/* Create one mailbox object per DOE capability */
	devm_cxl_pci_create_doe(cxlds);

	/* Check that the mailbox is ready and record payload size etc. */
	rc = cxl_pci_setup_mailbox(cxlds);
	if (rc)
		return rc;

	/* Enumerate commands for a device (implemented in mbox.c) */
	rc = cxl_enumerate_cmds(cxlds);
	if (rc)
		return rc;

	/*
	 * Send the IDENTIFY command to the device (implemented in mbox.c).
	 * It retrieves basic information about the memory device, such as
	 * total_bytes and volatile_only_bytes.
	 */
	rc = cxl_dev_state_identify(cxlds);
	if (rc)
		return rc;
	/* Build the memory range information (implemented in mbox.c) */
	rc = cxl_mem_create_range_info(cxlds);
	if (rc)
		return rc;
	/*
	 * Create the memdev character device (implemented in memdev.c);
	 * presumably exposed as a memX device node — confirm there.
	 */
	cxlmd = devm_cxl_add_memdev(cxlds);
	if (IS_ERR(cxlmd))
		return PTR_ERR(cxlmd);

	/* Only add an nvdimm when there is a pmem range and PMEM support is built */
	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
		rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);

	return rc;
}

1. CXL Subsystem Component Register Ranges

学新通

2. Type:

学新通


/*
 * cxl_setup_regs() - locate, temporarily map, and probe one register block.
 * @pdev: PCI device to search
 * @type: register block type to look for
 * @map: filled in with the BAR, offset and register type of the block
 *
 * The block types are (Ref CXL 2.0 Spec 8.1.9 Register Locator DVSEC,
 * 8.1.9.1 Register Offset Low):
 *   0: empty
 *   1: Component Registers
 *   2: BAR Virtualization ACL Registers
 *   3: CXL Memory Device Registers
 * CXL devices advertise the layout of their CXL-specific registers to the
 * operating system this way.
 *
 * Return: 0 on success, otherwise the error from the step that failed.
 */
static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
			  struct cxl_register_map *map)
{
	/* Locate the register block by type; the result is stored in @map. */
	int err = cxl_find_regblock(pdev, type, map);

	/* Map the BAR found above and remember the block's base address. */
	if (!err)
		err = cxl_map_regblock(pdev, map);
	if (err)
		return err;

	/* The mapping is only needed while probing, so drop it afterwards. */
	err = cxl_probe_regs(pdev, map);
	cxl_unmap_regblock(pdev, map);

	return err;
}

/*
 * cxl_map_regblock() - map the BAR that contains a located register block.
 * @pdev: PCI device owning the BAR
 * @map: register map; ->barno and ->block_offset are read, ->base is written
 *
 * Return: 0 on success, -ENXIO if the BAR is shorter than the block offset,
 * -ENOMEM if the BAR could not be mapped.
 */
static int cxl_map_regblock(struct pci_dev *pdev, struct cxl_register_map *map)
{
	void __iomem *addr;
	int bar = map->barno;
	struct device *dev = &pdev->dev;
	resource_size_t offset = map->block_offset;

	/*
	 * Basic sanity check that BAR is big enough: the register block's
	 * offset has to lie inside the BAR.
	 */
	if (pci_resource_len(pdev, bar) < offset) {
		dev_err(dev, "BAR%d: %pr: too small (offset: %pa)\n", bar,
			&pdev->resource[bar], &offset);
		return -ENXIO;
	}

	/* Map the BAR by index; a maximum length of 0 requests the whole BAR */
	addr = pci_iomap(pdev, bar, 0);
	if (!addr) {
		dev_err(dev, "failed to map registers\n");
		return -ENOMEM;
	}

	dev_dbg(dev, "Mapped CXL Memory Device resource bar %u @ %pa\n",
		bar, &offset);

	/* Base of the register block: start of the mapped BAR plus the offset */
	map->base = addr + map->block_offset;
	return 0;
}


static int cxl_probe_regs(struct pci_dev *pdev, struct cxl_register_map *map)
{
	struct cxl_component_reg_map *comp_map;
	struct cxl_device_reg_map *dev_map;
	struct device *dev = &pdev->dev;
	void __iomem *base = map->base;

	// 根据不同的寄存器类型,做不同的处理
	switch (map->reg_type) {
	case CXL_REGLOC_RBI_COMPONENT:
		// 如果是组件寄存器
		comp_map = &map->component_map;
		// 参考另一篇文章 Regs.c
		// 记录 HDM Decoder 寄存器块的offset 以及长度
		cxl_probe_component_regs(dev, base, comp_map);
		if (!comp_map->hdm_decoder.valid) {
			dev_err(dev, "HDM decoder registers not found\n");
			return -ENXIO;
		}

		dev_dbg(dev, "Set up component registers\n");
		break;
	case CXL_REGLOC_RBI_MEMDEV:
		// 如果是 CXL 内存设备寄存器
		dev_map = &map->device_map;
		// 参考另一篇文章 Regs.c
		// 记录 CXL Device 寄存器块的offset 以及长度
		cxl_probe_device_regs(dev, base, dev_map);
		if (!dev_map->status.valid || !dev_map->mbox.valid ||
		    !dev_map->memdev.valid) {
			dev_err(dev, "registers not found: %s%s%s\n",
				!dev_map->status.valid ? "status " : "",
				!dev_map->mbox.valid ? "mbox " : "",
				!dev_map->memdev.valid ? "memdev " : "");
			return -ENXIO;
		}

		dev_dbg(dev, "Probing device registers...\n");
		break;
	default:
		break;
	}

	return 0;
}

/*
 * devm_cxl_pci_create_doe() - create a mailbox object for each DOE capability.
 * @cxlds: device state; the mailboxes are stored in ->doe_mbs, indexed by the
 *         capability offset
 *
 * Failures are logged and otherwise ignored, so the function returns nothing.
 */
static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
{
	struct device *dev = cxlds->dev;
	struct pci_dev *pdev = to_pci_dev(dev);
	u16 pos = 0;

	/* Start from an empty XArray. */
	xa_init(&cxlds->doe_mbs);

	/* Register cxl_pci_destroy_doe as a device-managed action on the array */
	if (devm_add_action(&pdev->dev, cxl_pci_destroy_doe, &cxlds->doe_mbs)) {
		dev_err(dev, "Failed to create XArray for DOE's\n");
		return;
	}

	/*
	 * Mailbox creation is best effort.  Higher layers must determine if
	 * the lack of a mailbox for their protocol is a device failure or not.
	 */
	pci_doe_for_each_off(pdev, pos) {
		/* Create a DOE mailbox object for this capability (doe.c) */
		struct pci_doe_mb *mb = pcim_doe_create_mb(pdev, pos);

		if (IS_ERR(mb)) {
			dev_err(dev, "Failed to create MB object for MB @ %x\n",
				pos);
		} else if (xa_insert(&cxlds->doe_mbs, pos, mb, GFP_KERNEL)) {
			/* xa_insert() only stores when the slot is still empty */
			dev_err(dev, "xa_insert failed to insert MB @ %x\n",
				pos);
		} else {
			dev_dbg(dev, "Created DOE mailbox @%x\n", pos);
		}
	}
}

3. Mailbox Registers

学新通

4. Mailbox Capabilities Register

学新通

5. Mailbox Control Register

学新通

6. Mailbox Interfaces Ready

学新通

7. Spec 引用:mailbox 命令超时时间

The mailbox command timeout is 2 seconds. Commands that require a longer execution time shall be completed asynchronously in the background. Only one command can be executed in the background at a time.

/*
 * cxl_doorbell_busy() - non-zero while the mailbox doorbell bit is set.
 *
 * Ref CXL 2.0 8.2.8.4.4 Mailbox Control Register (Mailbox Registers
 * Capability Offset + 04h), bit 0 "Doorbell": when clear, the device is ready
 * to accept a new command. The caller sets it to tell the device that the
 * command inputs are ready; while set the bit is read-only, and the device
 * clears it once the command completes. A set bit therefore means the
 * mailbox is busy.
 */
#define cxl_doorbell_busy(cxlds)                                                \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &                  \
	 CXLDEV_MBOX_CTRL_DOORBELL)

static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
	const unsigned long start = jiffies;
	unsigned long end = start;
	// polling 查询mailbox 寄存器的状态
	while (cxl_doorbell_busy(cxlds)) {
		end = jiffies;

		if (time_after(end, start   CXL_MAILBOX_TIMEOUT_MS)) {
			// mailbox command 超时时间 2 S, 协议规定, 如上 6. 引用部分
			// Ref CXL 2.0 8.2.8.4 Mailbox Registers (Offset Varies)
			/* Check again in case preempted before timeout test */
			if (!cxl_doorbell_busy(cxlds))
				break;
			return -ETIMEDOUT;
		}
		cpu_relax();
	}

	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
		jiffies_to_msecs(end) - jiffies_to_msecs(start));
	return 0;
}

static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
{
	const int cap = readl(cxlds->regs.mbox   CXLDEV_MBOX_CAPS_OFFSET);
	unsigned long timeout;
	u64 md_status;

	timeout = jiffies   mbox_ready_timeout * HZ;
	// 首先查询设备是否准备好mailbox
	do {
		// Ref CXL 2.0 Spec 8.2.8.5.1.1 Memory Device Status Register
		// or 上图 6. Mailbox Interfaces Ready
		// bit4 置1表示设备已经准备好通过 mailbox 接收命令了
		// CXLMDEV_MBOX_IF_READY == 0x4
		md_status = readq(cxlds->regs.memdev   CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
	} while (!time_after(jiffies, timeout));

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(cxlds->dev, md_status,
			"timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance,
	 * think kexec, do one doorbell wait so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	// 前面详细介绍了
	// 等待 mailbox 空闲
	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
		cxl_err(cxlds->dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxlds->mbox_send = cxl_pci_mbox_send;
	// Payload Size: Size of the Command Payload Registers in bytes, expressed as 2^n.
	// The minimum size is 256 bytes (n=8) and the maximum size is 1 MB (n=20).
	// Ref CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	cxlds->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm is soft limiting it.
	 */
	cxlds->payload_size = min_t(size_t, cxlds->payload_size, SZ_1M);
	if (cxlds->payload_size < 256) {
		dev_err(cxlds->dev, "Mailbox is too small (%zub)",
			cxlds->payload_size);
		return -ENXIO;
	}

	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
		cxlds->payload_size);

	return 0;
}

最后剩下一个重要的 mbox send 接口,其实理解也很简单,就是根据一系列寄存器,进行数据读写。


/*
 * cxl_pci_mbox_send() - send one mailbox command with mbox_mutex held.
 * @cxlds: device state to communicate with
 * @cmd: command to execute
 *
 * Return: whatever __cxl_pci_mbox_send_cmd() returns.
 */
static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
{
	int ret;

	/* The mutex is held for the whole command exchange. */
	mutex_lock_io(&cxlds->mbox_mutex);
	ret = __cxl_pci_mbox_send_cmd(cxlds, cmd);
	mutex_unlock(&cxlds->mbox_mutex);

	return ret;
}

主要函数为 __cxl_pci_mbox_send_cmd,执行一个 mailbox 命令:

CXL 2.0 8.2.8.4 Mailbox Registers

The flow for executing a command is described below. The term “caller” represents the entity submitting the command:

  1. Caller reads MB Control Register to verify doorbell is clear
  2. Caller writes Command Register
  3. Caller writes Command Payload Registers if input payload is non-empty
  4. Caller writes MB Control Register to set doorbell
  5. Caller either polls for doorbell to be clear or waits for interrupt if configured
  6. Caller reads MB Status Register to fetch Return code
  7. If command successful, Caller reads Command Register to get Payload Length
  8. If output payload is non-empty, host reads Command Payload Registers

/**
 * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
 * @cxlds: The device state to communicate with.
 * @mbox_cmd: Command to send to the memory device.
 *
 * Context: Any context. Expects mbox_mutex to be held.
 * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
 *         Caller should check the return code in @mbox_cmd to make sure it
 *         succeeded.
 *
 * This is a generic form of the CXL mailbox send command thus only using the
 * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
 * devices, and perhaps other types of CXL devices may have further information
 * available upon error conditions. Driver facilities wishing to send mailbox
 * commands should use the wrapper command.
 *
 * The CXL spec allows for up to two mailboxes. The intention is for the primary
 * mailbox to be OS controlled and the secondary mailbox to be used by system
 * firmware. This allows the OS and firmware to communicate with the device and
 * not need to coordinate with each other. The driver only uses the primary
 * mailbox.
 */
static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
				   struct cxl_mbox_cmd *mbox_cmd)
{
	void __iomem *payload = cxlds->regs.mbox   CXLDEV_MBOX_PAYLOAD_OFFSET;
	struct device *dev = cxlds->dev;
	u64 cmd_reg, status_reg;
	size_t out_len;
	int rc;

	lockdep_assert_held(&cxlds->mbox_mutex);

	/*
	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
	 *   1. Caller reads MB Control Register to verify doorbell is clear
	 *   2. Caller writes Command Register
	 *   3. Caller writes Command Payload Registers if input payload is non-empty
	 *   4. Caller writes MB Control Register to set doorbell
	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	 *   6. Caller reads MB Status Register to fetch Return code
	 *   7. If command successful, Caller reads Command Register to get Payload Length
	 *   8. If output payload is non-empty, host reads Command Payload Registers
	 *
	 * Hardware is free to do whatever it wants before the doorbell is rung,
	 * and isn't allowed to change anything after it clears the doorbell. As
	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
	 * also happen in any order (though some orders might not make sense).
	 */
	// 基本时按照规范,依次进行
	/* #1 Caller reads MB Control Register to verify doorbell is clear */
	// 第一步,确保 mailbox 空闲,通过查看 doorbell 状态位
	if (cxl_doorbell_busy(cxlds)) {
		u64 md_status =
			readq(cxlds->regs.memdev   CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
			    "mailbox queue busy");
		return -EBUSY;
	}

	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
			     mbox_cmd->opcode);
	if (mbox_cmd->size_in) {
		if (WARN_ON(!mbox_cmd->payload_in))
			return -EINVAL;
	// 3. Caller writes Command Payload Registers if input payload is non-empty
		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
				      mbox_cmd->size_in);
		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
	}

	/* #2, #3 */
	// 2. Caller writes Command Register
	writeq(cmd_reg, cxlds->regs.mbox   CXLDEV_MBOX_CMD_OFFSET);

	/* #4 */
	// 4. Caller writes MB Control Register to set doorbell
	// 调用者写控制寄存器,设置 doorbell 为 1,表示设备在忙
	dev_dbg(dev, "Sending command\n");
	writel(CXLDEV_MBOX_CTRL_DOORBELL,
	       cxlds->regs.mbox   CXLDEV_MBOX_CTRL_OFFSET);

	/* #5 */
	// 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	// 等待结束可以使用轮询或者中断方式
	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
	if (rc == -ETIMEDOUT) {
		u64 md_status = readq(cxlds->regs.memdev   CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
		return rc;
	}

	/* #6 */
	// 6. Caller reads MB Status Register to fetch Return code
	// 调用者读状态寄存器,获取返回码
	status_reg = readq(cxlds->regs.mbox   CXLDEV_MBOX_STATUS_OFFSET);
	mbox_cmd->return_code =
		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);

	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
		dev_dbg(dev, "Mailbox operation had an error: %s\n",
			cxl_mbox_cmd_rc2str(mbox_cmd));
		return 0; /* completed but caller must check return_code */
	}

	/* #7 */
	// 7. If command successful, Caller reads Command Register to get Payload Length
	// 如果返回成功,调用者读命令寄存器获取数据长度
	cmd_reg = readq(cxlds->regs.mbox   CXLDEV_MBOX_CMD_OFFSET);
	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);

	/* #8 */
	// 8. If output payload is non-empty, host reads Command Payload Registers
	// 如果长度不为空,则读取数据
	if (out_len && mbox_cmd->payload_out) {
		/*
		 * Sanitize the copy. If hardware misbehaves, out_len per the
		 * spec can actually be greater than the max allowed size (21
		 * bits available but spec defined 1M max). The caller also may
		 * have requested less data than the hardware supplied even
		 * within spec.
		 */
		size_t n = min3(mbox_cmd->size_out, cxlds->payload_size, out_len);

		memcpy_fromio(mbox_cmd->payload_out, payload, n);
		mbox_cmd->size_out = n;
	} else {
		mbox_cmd->size_out = 0;
	}

	return 0;
}

这篇好文章是转载于:学新通技术网

  • 版权申明: 本站部分内容来自互联网,仅供学习及演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,请提供相关证据及您的身份证明,我们将在收到邮件后48小时内删除。
  • 本站站名: 学新通技术网
  • 本文地址: /boutique/detail/tanhfkikcj
系列文章
更多 icon
同类精品
更多 icon
继续加载