summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/xe/xe_drm_ras.c
blob: e07dc23a155eda1649c949c84d138a77a065866f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2026 Intel Corporation
 */

#include <linux/bitmap.h>

#include <drm/drm_managed.h>
#include <drm/drm_print.h>
#include <drm/drm_ras.h>

#include "xe_device_types.h"
#include "xe_drm_ras.h"

/*
 * Index-to-name lookup tables, initialized from the uapi-provided name
 * lists. error_components[] maps a DRM_XE_RAS_ERR_COMP_* id to its
 * printable name (entries may be NULL for unused ids — see
 * allocate_and_copy_counters()); error_severity[] maps a
 * DRM_XE_RAS_ERR_SEV_* id to the node name exposed via DRM RAS.
 */
static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;

/*
 * Look up one error counter in the per-severity @info table.
 *
 * Returns 0 and fills @name/@val on success, -ENOENT when the table is
 * absent or @error_id has no counter registered (NULL name).
 */
static int hw_query_error_counter(struct xe_drm_ras_counter *info,
				  u32 error_id, const char **name, u32 *val)
{
	struct xe_drm_ras_counter *entry;

	if (!info)
		return -ENOENT;

	entry = &info[error_id];
	if (!entry->name)
		return -ENOENT;

	*name = entry->name;
	*val = atomic_read(&entry->counter);

	return 0;
}

/* DRM RAS callback: query one counter from the uncorrectable-severity node. */
static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
					     const char **name, u32 *val)
{
	struct xe_device *xe = ep->priv;

	return hw_query_error_counter(xe->ras.info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE],
				      error_id, name, val);
}

/* DRM RAS callback: query one counter from the correctable-severity node. */
static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
					   const char **name, u32 *val)
{
	struct xe_device *xe = ep->priv;

	return hw_query_error_counter(xe->ras.info[DRM_XE_RAS_ERR_SEV_CORRECTABLE],
				      error_id, name, val);
}

/*
 * Allocate a zeroed counter table covering every component id and seed
 * each populated slot with its name from error_components[].
 *
 * Returns the table on success, ERR_PTR(-ENOMEM) on allocation failure.
 * The caller owns the returned memory (freed in cleanup_node_param()).
 */
static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
{
	struct xe_drm_ras_counter *counters;
	int comp;

	counters = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counters), GFP_KERNEL);
	if (!counters)
		return ERR_PTR(-ENOMEM);

	for (comp = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; comp < DRM_XE_RAS_ERR_COMP_MAX; comp++) {
		const char *comp_name = error_components[comp];

		/* ids with no name are simply not exposed */
		if (!comp_name)
			continue;

		counters[comp].name = comp_name;
		atomic_set(&counters[comp].counter, 0);
	}

	return counters;
}

static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
			      const enum drm_xe_ras_error_severity severity)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct xe_drm_ras *ras = &xe->ras;
	const char *device_name;

	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
				pci_domain_nr(pdev->bus), pdev->bus->number,
				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

	if (!device_name)
		return -ENOMEM;

	node->device_name = device_name;
	node->node_name = error_severity[severity];
	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
	node->priv = xe;

	ras->info[severity] = allocate_and_copy_counters(xe);
	if (IS_ERR(ras->info[severity]))
		return PTR_ERR(ras->info[severity]);

	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
		node->query_error_counter = query_correctable_error_counter;
	else
		node->query_error_counter = query_uncorrectable_error_counter;

	return 0;
}

/*
 * Release the per-severity resources owned by a node: the counter table
 * and the kasprintf()-allocated device name. Safe to call on a
 * partially initialized node; pointers are cleared so repeated calls
 * are harmless.
 */
static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
{
	struct drm_ras_node *node = &ras->node[severity];

	/*
	 * ras->info[severity] may hold an ERR_PTR if
	 * allocate_and_copy_counters() failed; kfree() only tolerates
	 * NULL, so never pass it an error pointer.
	 */
	if (!IS_ERR_OR_NULL(ras->info[severity]))
		kfree(ras->info[severity]);
	ras->info[severity] = NULL;

	kfree(node->device_name);
	node->device_name = NULL;
}

/*
 * Set up and register one DRM RAS node per error severity.
 *
 * Returns 0 on success. On failure, everything done so far is undone:
 * the failing severity's params are cleaned, and any severities that
 * were already registered are unregistered and cleaned, so the caller
 * never sees a half-registered state (the drmm cleanup action is only
 * installed after this function succeeds).
 */
static int register_nodes(struct xe_device *xe)
{
	struct xe_drm_ras *ras = &xe->ras;
	int i;

	for_each_error_severity(i) {
		struct drm_ras_node *node = &ras->node[i];
		int ret;

		ret = assign_node_params(xe, node, i);
		if (!ret)
			ret = drm_ras_node_register(node);

		if (ret) {
			/* current severity was never registered */
			cleanup_node_param(ras, i);
			/* unwind severities registered on earlier iterations */
			while (--i >= 0) {
				drm_ras_node_unregister(&ras->node[i]);
				cleanup_node_param(ras, i);
			}
			return ret;
		}
	}

	return 0;
}

/*
 * drmm cleanup action: unregister every severity's RAS node and free
 * its associated resources. Runs automatically on device release.
 */
static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
{
	struct xe_device *xe = arg;
	struct xe_drm_ras *ras = &xe->ras;
	int sev;

	for_each_error_severity(sev) {
		drm_ras_node_unregister(&ras->node[sev]);
		cleanup_node_param(ras, sev);
	}
}

/**
 * xe_drm_ras_init() - Initialize DRM RAS
 * @xe: xe device instance
 *
 * Allocate and register DRM RAS nodes per device
 *
 * Return: 0 on success, negative error code otherwise.
 */
int xe_drm_ras_init(struct xe_device *xe)
{
	struct xe_drm_ras *ras = &xe->ras;
	int err;

	/* One node per severity; storage is drm-managed. */
	ras->node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX,
				 sizeof(*ras->node), GFP_KERNEL);
	if (!ras->node)
		return -ENOMEM;

	err = register_nodes(xe);
	if (err) {
		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
		return err;
	}

	/*
	 * On failure drmm_add_action_or_reset() invokes the action
	 * immediately, so the nodes just registered are torn down.
	 */
	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
	if (err)
		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));

	return err;
}