summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/drm_ras.c
blob: b2fa5ab86d870e9153666645966d968030784827 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2026 Intel Corporation
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
#include <net/genetlink.h>

#include <drm/drm_ras.h>

#include "drm_ras_nl.h"

/**
 * DOC: DRM RAS Node Management
 *
 * This module provides the infrastructure to manage RAS (Reliability,
 * Availability, and Serviceability) nodes for DRM drivers. Each
 * DRM driver may register one or more RAS nodes, which represent
 * logical components capable of reporting error counters and other
 * reliability metrics.
 *
 * The nodes are stored in a global xarray `drm_ras_xa` to allow
 * efficient lookup by ID. Nodes can be registered or unregistered
 * dynamically at runtime.
 *
 * A Generic Netlink family `drm_ras` exposes two main operations to
 * userspace:
 *
 * 1. LIST_NODES: Dump all currently registered RAS nodes.
 *    The user receives an array of node IDs, names, and types.
 *
 * 2. GET_ERROR_COUNTER: Get error counters of a given node.
 *    Userspace must provide the Node ID and may additionally provide an
 *    Error ID. A dump request with only the Node ID returns all counters of
 *    the node; a request with both IDs returns that specific counter.
 *
 * Node registration:
 *
 * - drm_ras_node_register(): Registers a new node and assigns
 *   it a unique ID in the xarray.
 * - drm_ras_node_unregister(): Removes a previously registered
 *   node from the xarray.
 *
 * Node type:
 *
 * - ERROR_COUNTER:
 *     + Currently, only error counters are supported.
 *     + The driver must implement the query_error_counter() callback to provide
 *       the name and the value of the error counter.
 *     + The driver must provide an error_counter_range.last value informing
 *       the last valid error ID.
 *     + The driver can provide an error_counter_range.first value informing
 *       the first valid error ID.
 *     + The error counters in the driver don't need to be contiguous, but the
 *       driver must return -ENOENT from query_error_counter() as an indication
 *       that the ID should be skipped and not listed in the netlink API.
 *
 * Netlink handlers:
 *
 * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES
 *   operation, iterating over the xarray.
 * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit
 *   operation, fetching all counters from a specific node.
 * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit
 *   operation, fetching a counter value from a specific node.
 */

/*
 * Global table of registered RAS nodes, indexed by node ID. IDs are handed
 * out by xa_alloc() in drm_ras_node_register() and released by xa_erase() in
 * drm_ras_node_unregister().
 */
static DEFINE_XARRAY_ALLOC(drm_ras_xa);

/*
 * Dump state kept in the netlink callback context (cb->ctx) so that a dump
 * which did not fit in one message buffer can resume on the next dumpit call.
 */
struct drm_ras_ctx {
	/*
	 * Where to resume from: an xarray index (node ID) for the node list
	 * dump, or an error ID for the error counter dump.
	 */
	unsigned long restart;
};

/**
 * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes
 * @skb: Netlink message buffer
 * @cb: Callback context for multi-part dumps
 *
 * Iterates over the registered RAS nodes in the global xarray, starting at the
 * index saved in @cb->ctx, and appends one message per node (ID, device name,
 * node name, type) to @skb. When a node does not fit, its partially built
 * message is cancelled and its xarray index is saved in @cb->ctx so the next
 * invocation resumes from that same node.
 *
 * Return: 0 once every node from the resume point onward has been emitted
 *         (including when there is none), -EMSGSIZE when a node did not fit
 *         in @skb, or another negative error code returned while adding
 *         attributes.
 *
 * NOTE(review): multi-part continuation relies on the netlink core treating
 * -EMSGSIZE with a partially filled @skb as "more to dump" - confirm for the
 * target kernel version.
 */
int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	const struct genl_info *info = genl_info_dump(cb);
	struct drm_ras_ctx *ctx = (void *)cb->ctx;
	struct drm_ras_node *node;
	struct nlattr *hdr;
	unsigned long id;
	/* Must start at 0: the loop body never runs when nothing is left. */
	int ret = 0;

	xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) {
		hdr = genlmsg_iput(skb, info);
		if (!hdr) {
			ret = -EMSGSIZE;
			break;
		}

		ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id);
		if (!ret)
			ret = nla_put_string(skb,
					     DRM_RAS_A_NODE_ATTRS_DEVICE_NAME,
					     node->device_name);
		if (!ret)
			ret = nla_put_string(skb,
					     DRM_RAS_A_NODE_ATTRS_NODE_NAME,
					     node->node_name);
		if (!ret)
			ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE,
					  node->type);
		if (ret) {
			/* Drop the half-written message for this node. */
			genlmsg_cancel(skb, hdr);
			break;
		}

		genlmsg_end(skb, hdr);
	}

	/* The node at @id was not emitted; restart from it next time. */
	if (ret == -EMSGSIZE)
		ctx->restart = id;

	return ret;
}

/*
 * Look up node @node_id and ask its driver for error counter @error_id.
 *
 * Returns -ENOENT when the node is not registered or has no
 * query_error_counter() callback, -EINVAL when @error_id lies outside the
 * node's [error_counter_range.first, error_counter_range.last] window, and
 * otherwise whatever the driver callback returns (it fills @name and @value).
 */
static int get_node_error_counter(u32 node_id, u32 error_id,
				  const char **name, u32 *value)
{
	struct drm_ras_node *ras_node = xa_load(&drm_ras_xa, node_id);
	bool in_range;

	if (!ras_node)
		return -ENOENT;
	if (!ras_node->query_error_counter)
		return -ENOENT;

	in_range = error_id >= ras_node->error_counter_range.first &&
		   error_id <= ras_node->error_counter_range.last;
	if (!in_range)
		return -EINVAL;

	return ras_node->query_error_counter(ras_node, error_id, name, value);
}

/*
 * Append the three error counter attributes (ID, name, value) to @msg.
 * Stops at the first attribute that fails to be added and returns that error;
 * returns 0 when all three were added.
 */
static int msg_reply_value(struct sk_buff *msg, u32 error_id,
			   const char *error_name, u32 value)
{
	int err;

	err = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID,
			  error_id);
	if (!err)
		err = nla_put_string(msg,
				     DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME,
				     error_name);
	if (!err)
		err = nla_put_u32(msg,
				  DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE,
				  value);

	return err;
}

/*
 * Build and send the reply for a single-counter GET_ERROR_COUNTER request.
 *
 * Allocates a reply message, queries counter @error_id of node @node_id and,
 * on success, sends the counter's ID, name and value back to the requester
 * described by @info.
 *
 * Returns 0 on success, -ENOMEM if the reply cannot be allocated, -EMSGSIZE
 * if the message header does not fit, or the error from the counter lookup,
 * attribute construction or reply transmission.
 */
static int doit_reply_value(struct genl_info *info, u32 node_id,
			    u32 error_id)
{
	struct sk_buff *msg;
	struct nlattr *hdr;
	const char *error_name;
	u32 value;
	int ret;

	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	hdr = genlmsg_iput(msg, info);
	if (!hdr) {
		ret = -EMSGSIZE;
		goto err_free;
	}

	/* A failed lookup must release msg too, not just return. */
	ret = get_node_error_counter(node_id, error_id,
				     &error_name, &value);
	if (ret)
		goto err_free;

	ret = msg_reply_value(msg, error_id, error_name, value);
	if (ret)
		goto err_free;

	genlmsg_end(msg, hdr);

	/* msg is consumed by genlmsg_reply(), so it is not freed here. */
	return genlmsg_reply(msg, info);

err_free:
	nlmsg_free(msg);
	return ret;
}

/**
 * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters
 * @skb: Netlink message buffer
 * @cb: Callback context for multi-part dumps
 *
 * Walks the error ID range of the node named by the request's Node ID
 * attribute and appends one message per counter (ID, name, value) to @skb.
 * IDs for which the driver reports -ENOENT are skipped. When a counter does
 * not fit, its partially built message is cancelled and its error ID is saved
 * in @cb->ctx so the next invocation resumes from that same error ID.
 *
 * Return: 0 once every counter from the resume point onward has been emitted
 *         or skipped, -EMSGSIZE when a counter did not fit in @skb, -EINVAL
 *         when the Node ID attribute is missing, -ENOENT when the node is not
 *         registered, or another negative error code from the counter query
 *         or attribute construction.
 *
 * NOTE(review): multi-part continuation relies on the netlink core treating
 * -EMSGSIZE with a partially filled @skb as "more to dump" - confirm for the
 * target kernel version.
 */
int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb,
					struct netlink_callback *cb)
{
	const struct genl_info *info = genl_info_dump(cb);
	struct drm_ras_ctx *ctx = (void *)cb->ctx;
	struct drm_ras_node *node;
	struct nlattr *hdr;
	const char *error_name;
	u32 node_id, error_id, value;
	/* Must start at 0: the loop body never runs when nothing is left. */
	int ret = 0;

	if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID))
		return -EINVAL;

	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);

	node = xa_load(&drm_ras_xa, node_id);
	if (!node)
		return -ENOENT;

	for (error_id = max(node->error_counter_range.first, ctx->restart);
	     error_id <= node->error_counter_range.last;
	     error_id++) {
		ret = get_node_error_counter(node_id, error_id,
					     &error_name, &value);
		/*
		 * For a non-contiguous range, the driver returns -ENOENT as an
		 * indication to skip this ID when listing all errors. Clear
		 * it so a skipped trailing ID is not reported as a failure of
		 * the whole dump.
		 */
		if (ret == -ENOENT) {
			ret = 0;
			continue;
		}
		if (ret)
			return ret;

		hdr = genlmsg_iput(skb, info);
		if (!hdr) {
			ret = -EMSGSIZE;
			break;
		}

		ret = msg_reply_value(skb, error_id, error_name, value);
		if (ret) {
			/* Drop the half-written message for this counter. */
			genlmsg_cancel(skb, hdr);
			break;
		}

		genlmsg_end(skb, hdr);
	}

	/* The counter at @error_id was not emitted; restart from it next time. */
	if (ret == -EMSGSIZE)
		ctx->restart = error_id;

	return ret;
}

/**
 * drm_ras_nl_get_error_counter_doit() - Query one error counter of a node
 * @skb: Netlink message buffer
 * @info: Generic Netlink info containing attributes of the request
 *
 * Reads the Node ID and Error ID attributes from the request, both of which
 * are required, and hands them to doit_reply_value(), which looks up the
 * counter and sends the reply to the requester.
 *
 * Return: 0 on success, -EINVAL when a required attribute is missing, or
 *         the negative errno reported while building or sending the reply.
 */
int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb,
				      struct genl_info *info)
{
	u32 requested_node, requested_error;

	if (!info->attrs)
		return -EINVAL;

	if (GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID))
		return -EINVAL;

	if (GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID))
		return -EINVAL;

	requested_node =
		nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
	requested_error =
		nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]);

	return doit_reply_value(info, requested_node, requested_error);
}

/**
 * drm_ras_node_register() - Register a new RAS node
 * @node: Node structure to register
 *
 * Validates @node and stores it in the global node xarray, which assigns
 * @node->id. Both @node->device_name and @node->node_name must be set, and
 * @node->type must be DRM_RAS_NODE_TYPE_ERROR_COUNTER. Error counter nodes
 * must additionally provide a non-zero error_counter_range.last and a
 * query_error_counter() callback.
 *
 * Return: 0 on success, -EINVAL when @node fails validation, or the negative
 *         errno returned by xa_alloc().
 */
int drm_ras_node_register(struct drm_ras_node *node)
{
	if (!node->device_name)
		return -EINVAL;
	if (!node->node_name)
		return -EINVAL;

	switch (node->type) {
	case DRM_RAS_NODE_TYPE_ERROR_COUNTER:
		/* Mandatory entries for an Error Counter node */
		if (!node->error_counter_range.last)
			return -EINVAL;
		if (!node->query_error_counter)
			return -EINVAL;
		break;
	default:
		/* Currently, only Error Counter nodes are supported */
		return -EINVAL;
	}

	return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL);
}
EXPORT_SYMBOL(drm_ras_node_register);

/**
 * drm_ras_node_unregister() - Unregister a previously registered node
 * @node: Node structure to unregister
 *
 * Erases the entry stored at index @node->id in the global node xarray.
 */
void drm_ras_node_unregister(struct drm_ras_node *node)
{
	const unsigned long index = node->id;

	xa_erase(&drm_ras_xa, index);
}
EXPORT_SYMBOL(drm_ras_node_unregister);