1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * GHES/EDAC Linux driver
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (c) 2013 by Mauro Carvalho Chehab
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Red Hat Inc. https://www.redhat.com
8*4882a593Smuzhiyun */
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun #include <acpi/ghes.h>
13*4882a593Smuzhiyun #include <linux/edac.h>
14*4882a593Smuzhiyun #include <linux/dmi.h>
15*4882a593Smuzhiyun #include "edac_module.h"
16*4882a593Smuzhiyun #include <ras/ras_event.h>
17*4882a593Smuzhiyun
/* Per-registration private state, reachable from the single logical MCI. */
struct ghes_pvt {
	struct mem_ctl_info *mci;	/* the one EDAC memory controller */

	/* Buffers for the error handling routine */
	char other_detail[400];		/* scratch for e->other_detail; filled under ghes_lock */
	char msg[80];			/* scratch for e->msg (decoded error-type string) */
};
25*4882a593Smuzhiyun
/* Number of live ghes_edac_register() users; 0 means nothing registered. */
static refcount_t ghes_refcount = REFCOUNT_INIT(0);

/*
 * Access to ghes_pvt must be protected by ghes_lock. The spinlock
 * also provides the necessary (implicit) memory barrier for the SMP
 * case to make the pointer visible on another CPU.
 */
static struct ghes_pvt *ghes_pvt;

/*
 * This driver's representation of the system hardware, as collected
 * from DMI.
 */
struct ghes_hw_desc {
	int num_dimms;			/* entries used in dimms[] */
	struct dimm_info *dimms;	/* krealloc()ed array; freed once copied into the MCI */
} ghes_hw;

/* GHES registration mutex — serializes register/unregister instances */
static DEFINE_MUTEX(ghes_reg_mutex);

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* "ghes_edac.force_load=1" skips the platform check */
static bool __read_mostly force_load;
module_param(force_load, bool, 0);

/* True once dmi_walk() has enumerated the DIMMs; reset on unregister. */
static bool system_scanned;
59*4882a593Smuzhiyun
/*
 * Memory Device - Type 17 of SMBIOS spec.
 * Packed mirror of the raw table layout; field offsets must match the
 * spec exactly, hence no padding is allowed.
 */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;			/* SMBIOS handle, matched against CPER mem_dev_handle */
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;		/* bits incl. ECC; compared with data_width below */
	u16 data_width;
	u16 size;			/* see assign_dmi_dimm_info() for the encoding */
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;			/* 0x12=DDR ... 0x1a=DDR4 */
	u16 type_detail;		/* bitfield: synchronous/registered/non-volatile... */
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;		/* MB; used when size == 0x7fff */
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
85*4882a593Smuzhiyun
/* Look up the DIMM whose SMBIOS handle matches @handle, or NULL. */
static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
{
	struct dimm_info *d;

	mci_for_each_dimm(mci, d) {
		if (d->smbios_handle != handle)
			continue;
		return d;
	}

	return NULL;
}
97*4882a593Smuzhiyun
/* Build the DIMM label as "<bank> <device>" from the DMI strings. */
static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
{
	const char *bank = NULL, *device = NULL;
	bool have_bank, have_dev;

	dmi_memdev_name(handle, &bank, &device);

	have_bank = bank && *bank;
	have_dev  = device && *device;

	/*
	 * When both bank and device are missing this writes an empty
	 * string, which preserves the default label assigned by the
	 * EDAC core.
	 */
	snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
		 have_bank ? bank : "",
		 (have_bank && have_dev) ? " " : "",
		 have_dev ? device : "");
}
113*4882a593Smuzhiyun
/*
 * assign_dmi_dimm_info - populate a dimm_info from an SMBIOS Type 17 record.
 * @dimm:	EDAC DIMM descriptor to fill
 * @entry:	raw SMBIOS "Memory Device" entry
 *
 * Decodes size, memory type, ECC capability and label as far as the DMI
 * data allows.
 */
static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
{
	/* type_detail bits 7 (synchronous) and 13 (registered) together => RDR */
	u16 rdr_mask = BIT(7) | BIT(13);

	/*
	 * SMBIOS size encoding: 0xffff = unknown, 0x7fff = real size is in
	 * extended_size (MB), bit 15 set = value is in KB units, otherwise
	 * the value is in MB.
	 */
	if (entry->size == 0xffff) {
		pr_info("Can't get DIMM%i size\n", dimm->idx);
		dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
	} else if (entry->size == 0x7fff) {
		dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
	} else {
		/*
		 * NOTE(review): bit 15 means the size is in KB per SMBIOS;
		 * shifting left by 10 multiplies instead of dividing by
		 * 1024 — looks suspicious, verify against the SMBIOS spec
		 * before relying on sub-MB DIMM sizes.
		 */
		if (entry->size & BIT(15))
			dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
		else
			dimm->nr_pages = MiB_TO_PAGES(entry->size);
	}

	/* Map SMBIOS memory_type + type_detail onto the EDAC mem type. */
	switch (entry->memory_type) {
	case 0x12:	/* DDR */
		if (entry->type_detail & BIT(13))	/* registered */
			dimm->mtype = MEM_RDDR;
		else
			dimm->mtype = MEM_DDR;
		break;
	case 0x13:	/* DDR2 */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR2;
		else
			dimm->mtype = MEM_DDR2;
		break;
	case 0x14:	/* FB-DIMM */
		dimm->mtype = MEM_FB_DDR2;
		break;
	case 0x18:	/* DDR3 */
		if (entry->type_detail & BIT(12))	/* non-volatile */
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR3;
		else
			dimm->mtype = MEM_DDR3;
		break;
	case 0x1a:	/* DDR4 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR4;
		else
			dimm->mtype = MEM_DDR4;
		break;
	default:
		/* Fall back to type_detail bits alone for legacy types. */
		if (entry->type_detail & BIT(6))	/* RAMBUS */
			dimm->mtype = MEM_RMBS;
		else if ((entry->type_detail & rdr_mask) == rdr_mask)
			dimm->mtype = MEM_RDR;
		else if (entry->type_detail & BIT(7))	/* synchronous */
			dimm->mtype = MEM_SDR;
		else if (entry->type_detail & BIT(9))	/* EDO */
			dimm->mtype = MEM_EDO;
		else
			dimm->mtype = MEM_UNKNOWN;
	}

	/*
	 * Actually, we can only detect if the memory has bits for
	 * checksum or not
	 */
	if (entry->total_width == entry->data_width)
		dimm->edac_mode = EDAC_NONE;
	else
		dimm->edac_mode = EDAC_SECDED;

	dimm->dtype = DEV_UNKNOWN;
	dimm->grain = 128;		/* Likely, worse case */

	dimm_setup_label(dimm, entry->handle);

	if (dimm->nr_pages) {
		edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
			dimm->idx, edac_mem_types[dimm->mtype],
			PAGES_TO_MiB(dimm->nr_pages),
			(dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
		edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
			entry->memory_type, entry->type_detail,
			entry->total_width, entry->data_width);
	}

	/* Remember the handle for find_dimm_by_handle() at error time. */
	dimm->smbios_handle = entry->handle;
}
201*4882a593Smuzhiyun
enumerate_dimms(const struct dmi_header * dh,void * arg)202*4882a593Smuzhiyun static void enumerate_dimms(const struct dmi_header *dh, void *arg)
203*4882a593Smuzhiyun {
204*4882a593Smuzhiyun struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
205*4882a593Smuzhiyun struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg;
206*4882a593Smuzhiyun struct dimm_info *d;
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun if (dh->type != DMI_ENTRY_MEM_DEVICE)
209*4882a593Smuzhiyun return;
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun /* Enlarge the array with additional 16 */
212*4882a593Smuzhiyun if (!hw->num_dimms || !(hw->num_dimms % 16)) {
213*4882a593Smuzhiyun struct dimm_info *new;
214*4882a593Smuzhiyun
215*4882a593Smuzhiyun new = krealloc(hw->dimms, (hw->num_dimms + 16) * sizeof(struct dimm_info),
216*4882a593Smuzhiyun GFP_KERNEL);
217*4882a593Smuzhiyun if (!new) {
218*4882a593Smuzhiyun WARN_ON_ONCE(1);
219*4882a593Smuzhiyun return;
220*4882a593Smuzhiyun }
221*4882a593Smuzhiyun
222*4882a593Smuzhiyun hw->dimms = new;
223*4882a593Smuzhiyun }
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun d = &hw->dimms[hw->num_dimms];
226*4882a593Smuzhiyun d->idx = hw->num_dimms;
227*4882a593Smuzhiyun
228*4882a593Smuzhiyun assign_dmi_dimm_info(d, entry);
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun hw->num_dimms++;
231*4882a593Smuzhiyun }
232*4882a593Smuzhiyun
ghes_scan_system(void)233*4882a593Smuzhiyun static void ghes_scan_system(void)
234*4882a593Smuzhiyun {
235*4882a593Smuzhiyun if (system_scanned)
236*4882a593Smuzhiyun return;
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun dmi_walk(enumerate_dimms, &ghes_hw);
239*4882a593Smuzhiyun
240*4882a593Smuzhiyun system_scanned = true;
241*4882a593Smuzhiyun }
242*4882a593Smuzhiyun
ghes_edac_report_mem_error(int sev,struct cper_sec_mem_err * mem_err)243*4882a593Smuzhiyun void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
244*4882a593Smuzhiyun {
245*4882a593Smuzhiyun struct edac_raw_error_desc *e;
246*4882a593Smuzhiyun struct mem_ctl_info *mci;
247*4882a593Smuzhiyun struct ghes_pvt *pvt;
248*4882a593Smuzhiyun unsigned long flags;
249*4882a593Smuzhiyun char *p;
250*4882a593Smuzhiyun
251*4882a593Smuzhiyun /*
252*4882a593Smuzhiyun * We can do the locking below because GHES defers error processing
253*4882a593Smuzhiyun * from NMI to IRQ context. Whenever that changes, we'd at least
254*4882a593Smuzhiyun * know.
255*4882a593Smuzhiyun */
256*4882a593Smuzhiyun if (WARN_ON_ONCE(in_nmi()))
257*4882a593Smuzhiyun return;
258*4882a593Smuzhiyun
259*4882a593Smuzhiyun spin_lock_irqsave(&ghes_lock, flags);
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun pvt = ghes_pvt;
262*4882a593Smuzhiyun if (!pvt)
263*4882a593Smuzhiyun goto unlock;
264*4882a593Smuzhiyun
265*4882a593Smuzhiyun mci = pvt->mci;
266*4882a593Smuzhiyun e = &mci->error_desc;
267*4882a593Smuzhiyun
268*4882a593Smuzhiyun /* Cleans the error report buffer */
269*4882a593Smuzhiyun memset(e, 0, sizeof (*e));
270*4882a593Smuzhiyun e->error_count = 1;
271*4882a593Smuzhiyun e->grain = 1;
272*4882a593Smuzhiyun e->msg = pvt->msg;
273*4882a593Smuzhiyun e->other_detail = pvt->other_detail;
274*4882a593Smuzhiyun e->top_layer = -1;
275*4882a593Smuzhiyun e->mid_layer = -1;
276*4882a593Smuzhiyun e->low_layer = -1;
277*4882a593Smuzhiyun *pvt->other_detail = '\0';
278*4882a593Smuzhiyun *pvt->msg = '\0';
279*4882a593Smuzhiyun
280*4882a593Smuzhiyun switch (sev) {
281*4882a593Smuzhiyun case GHES_SEV_CORRECTED:
282*4882a593Smuzhiyun e->type = HW_EVENT_ERR_CORRECTED;
283*4882a593Smuzhiyun break;
284*4882a593Smuzhiyun case GHES_SEV_RECOVERABLE:
285*4882a593Smuzhiyun e->type = HW_EVENT_ERR_UNCORRECTED;
286*4882a593Smuzhiyun break;
287*4882a593Smuzhiyun case GHES_SEV_PANIC:
288*4882a593Smuzhiyun e->type = HW_EVENT_ERR_FATAL;
289*4882a593Smuzhiyun break;
290*4882a593Smuzhiyun default:
291*4882a593Smuzhiyun case GHES_SEV_NO:
292*4882a593Smuzhiyun e->type = HW_EVENT_ERR_INFO;
293*4882a593Smuzhiyun }
294*4882a593Smuzhiyun
295*4882a593Smuzhiyun edac_dbg(1, "error validation_bits: 0x%08llx\n",
296*4882a593Smuzhiyun (long long)mem_err->validation_bits);
297*4882a593Smuzhiyun
298*4882a593Smuzhiyun /* Error type, mapped on e->msg */
299*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
300*4882a593Smuzhiyun p = pvt->msg;
301*4882a593Smuzhiyun switch (mem_err->error_type) {
302*4882a593Smuzhiyun case 0:
303*4882a593Smuzhiyun p += sprintf(p, "Unknown");
304*4882a593Smuzhiyun break;
305*4882a593Smuzhiyun case 1:
306*4882a593Smuzhiyun p += sprintf(p, "No error");
307*4882a593Smuzhiyun break;
308*4882a593Smuzhiyun case 2:
309*4882a593Smuzhiyun p += sprintf(p, "Single-bit ECC");
310*4882a593Smuzhiyun break;
311*4882a593Smuzhiyun case 3:
312*4882a593Smuzhiyun p += sprintf(p, "Multi-bit ECC");
313*4882a593Smuzhiyun break;
314*4882a593Smuzhiyun case 4:
315*4882a593Smuzhiyun p += sprintf(p, "Single-symbol ChipKill ECC");
316*4882a593Smuzhiyun break;
317*4882a593Smuzhiyun case 5:
318*4882a593Smuzhiyun p += sprintf(p, "Multi-symbol ChipKill ECC");
319*4882a593Smuzhiyun break;
320*4882a593Smuzhiyun case 6:
321*4882a593Smuzhiyun p += sprintf(p, "Master abort");
322*4882a593Smuzhiyun break;
323*4882a593Smuzhiyun case 7:
324*4882a593Smuzhiyun p += sprintf(p, "Target abort");
325*4882a593Smuzhiyun break;
326*4882a593Smuzhiyun case 8:
327*4882a593Smuzhiyun p += sprintf(p, "Parity Error");
328*4882a593Smuzhiyun break;
329*4882a593Smuzhiyun case 9:
330*4882a593Smuzhiyun p += sprintf(p, "Watchdog timeout");
331*4882a593Smuzhiyun break;
332*4882a593Smuzhiyun case 10:
333*4882a593Smuzhiyun p += sprintf(p, "Invalid address");
334*4882a593Smuzhiyun break;
335*4882a593Smuzhiyun case 11:
336*4882a593Smuzhiyun p += sprintf(p, "Mirror Broken");
337*4882a593Smuzhiyun break;
338*4882a593Smuzhiyun case 12:
339*4882a593Smuzhiyun p += sprintf(p, "Memory Sparing");
340*4882a593Smuzhiyun break;
341*4882a593Smuzhiyun case 13:
342*4882a593Smuzhiyun p += sprintf(p, "Scrub corrected error");
343*4882a593Smuzhiyun break;
344*4882a593Smuzhiyun case 14:
345*4882a593Smuzhiyun p += sprintf(p, "Scrub uncorrected error");
346*4882a593Smuzhiyun break;
347*4882a593Smuzhiyun case 15:
348*4882a593Smuzhiyun p += sprintf(p, "Physical Memory Map-out event");
349*4882a593Smuzhiyun break;
350*4882a593Smuzhiyun default:
351*4882a593Smuzhiyun p += sprintf(p, "reserved error (%d)",
352*4882a593Smuzhiyun mem_err->error_type);
353*4882a593Smuzhiyun }
354*4882a593Smuzhiyun } else {
355*4882a593Smuzhiyun strcpy(pvt->msg, "unknown error");
356*4882a593Smuzhiyun }
357*4882a593Smuzhiyun
358*4882a593Smuzhiyun /* Error address */
359*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
360*4882a593Smuzhiyun e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
361*4882a593Smuzhiyun e->offset_in_page = offset_in_page(mem_err->physical_addr);
362*4882a593Smuzhiyun }
363*4882a593Smuzhiyun
364*4882a593Smuzhiyun /* Error grain */
365*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
366*4882a593Smuzhiyun e->grain = ~mem_err->physical_addr_mask + 1;
367*4882a593Smuzhiyun
368*4882a593Smuzhiyun /* Memory error location, mapped on e->location */
369*4882a593Smuzhiyun p = e->location;
370*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
371*4882a593Smuzhiyun p += sprintf(p, "node:%d ", mem_err->node);
372*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
373*4882a593Smuzhiyun p += sprintf(p, "card:%d ", mem_err->card);
374*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
375*4882a593Smuzhiyun p += sprintf(p, "module:%d ", mem_err->module);
376*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
377*4882a593Smuzhiyun p += sprintf(p, "rank:%d ", mem_err->rank);
378*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
379*4882a593Smuzhiyun p += sprintf(p, "bank:%d ", mem_err->bank);
380*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_BANK_GROUP)
381*4882a593Smuzhiyun p += sprintf(p, "bank_group:%d ",
382*4882a593Smuzhiyun mem_err->bank >> CPER_MEM_BANK_GROUP_SHIFT);
383*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_BANK_ADDRESS)
384*4882a593Smuzhiyun p += sprintf(p, "bank_address:%d ",
385*4882a593Smuzhiyun mem_err->bank & CPER_MEM_BANK_ADDRESS_MASK);
386*4882a593Smuzhiyun if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
387*4882a593Smuzhiyun u32 row = mem_err->row;
388*4882a593Smuzhiyun
389*4882a593Smuzhiyun row |= cper_get_mem_extension(mem_err->validation_bits, mem_err->extended);
390*4882a593Smuzhiyun p += sprintf(p, "row:%d ", row);
391*4882a593Smuzhiyun }
392*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
393*4882a593Smuzhiyun p += sprintf(p, "col:%d ", mem_err->column);
394*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
395*4882a593Smuzhiyun p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
396*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
397*4882a593Smuzhiyun const char *bank = NULL, *device = NULL;
398*4882a593Smuzhiyun struct dimm_info *dimm;
399*4882a593Smuzhiyun
400*4882a593Smuzhiyun dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
401*4882a593Smuzhiyun if (bank != NULL && device != NULL)
402*4882a593Smuzhiyun p += sprintf(p, "DIMM location:%s %s ", bank, device);
403*4882a593Smuzhiyun else
404*4882a593Smuzhiyun p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
405*4882a593Smuzhiyun mem_err->mem_dev_handle);
406*4882a593Smuzhiyun
407*4882a593Smuzhiyun dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
408*4882a593Smuzhiyun if (dimm) {
409*4882a593Smuzhiyun e->top_layer = dimm->idx;
410*4882a593Smuzhiyun strcpy(e->label, dimm->label);
411*4882a593Smuzhiyun }
412*4882a593Smuzhiyun }
413*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_CHIP_ID)
414*4882a593Smuzhiyun p += sprintf(p, "chipID: %d ",
415*4882a593Smuzhiyun mem_err->extended >> CPER_MEM_CHIP_ID_SHIFT);
416*4882a593Smuzhiyun if (p > e->location)
417*4882a593Smuzhiyun *(p - 1) = '\0';
418*4882a593Smuzhiyun
419*4882a593Smuzhiyun if (!*e->label)
420*4882a593Smuzhiyun strcpy(e->label, "unknown memory");
421*4882a593Smuzhiyun
422*4882a593Smuzhiyun /* All other fields are mapped on e->other_detail */
423*4882a593Smuzhiyun p = pvt->other_detail;
424*4882a593Smuzhiyun p += snprintf(p, sizeof(pvt->other_detail),
425*4882a593Smuzhiyun "APEI location: %s ", e->location);
426*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
427*4882a593Smuzhiyun u64 status = mem_err->error_status;
428*4882a593Smuzhiyun
429*4882a593Smuzhiyun p += sprintf(p, "status(0x%016llx): ", (long long)status);
430*4882a593Smuzhiyun switch ((status >> 8) & 0xff) {
431*4882a593Smuzhiyun case 1:
432*4882a593Smuzhiyun p += sprintf(p, "Error detected internal to the component ");
433*4882a593Smuzhiyun break;
434*4882a593Smuzhiyun case 16:
435*4882a593Smuzhiyun p += sprintf(p, "Error detected in the bus ");
436*4882a593Smuzhiyun break;
437*4882a593Smuzhiyun case 4:
438*4882a593Smuzhiyun p += sprintf(p, "Storage error in DRAM memory ");
439*4882a593Smuzhiyun break;
440*4882a593Smuzhiyun case 5:
441*4882a593Smuzhiyun p += sprintf(p, "Storage error in TLB ");
442*4882a593Smuzhiyun break;
443*4882a593Smuzhiyun case 6:
444*4882a593Smuzhiyun p += sprintf(p, "Storage error in cache ");
445*4882a593Smuzhiyun break;
446*4882a593Smuzhiyun case 7:
447*4882a593Smuzhiyun p += sprintf(p, "Error in one or more functional units ");
448*4882a593Smuzhiyun break;
449*4882a593Smuzhiyun case 8:
450*4882a593Smuzhiyun p += sprintf(p, "component failed self test ");
451*4882a593Smuzhiyun break;
452*4882a593Smuzhiyun case 9:
453*4882a593Smuzhiyun p += sprintf(p, "Overflow or undervalue of internal queue ");
454*4882a593Smuzhiyun break;
455*4882a593Smuzhiyun case 17:
456*4882a593Smuzhiyun p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
457*4882a593Smuzhiyun break;
458*4882a593Smuzhiyun case 18:
459*4882a593Smuzhiyun p += sprintf(p, "Improper access error ");
460*4882a593Smuzhiyun break;
461*4882a593Smuzhiyun case 19:
462*4882a593Smuzhiyun p += sprintf(p, "Access to a memory address which is not mapped to any component ");
463*4882a593Smuzhiyun break;
464*4882a593Smuzhiyun case 20:
465*4882a593Smuzhiyun p += sprintf(p, "Loss of Lockstep ");
466*4882a593Smuzhiyun break;
467*4882a593Smuzhiyun case 21:
468*4882a593Smuzhiyun p += sprintf(p, "Response not associated with a request ");
469*4882a593Smuzhiyun break;
470*4882a593Smuzhiyun case 22:
471*4882a593Smuzhiyun p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
472*4882a593Smuzhiyun break;
473*4882a593Smuzhiyun case 23:
474*4882a593Smuzhiyun p += sprintf(p, "Detection of a PATH_ERROR ");
475*4882a593Smuzhiyun break;
476*4882a593Smuzhiyun case 25:
477*4882a593Smuzhiyun p += sprintf(p, "Bus operation timeout ");
478*4882a593Smuzhiyun break;
479*4882a593Smuzhiyun case 26:
480*4882a593Smuzhiyun p += sprintf(p, "A read was issued to data that has been poisoned ");
481*4882a593Smuzhiyun break;
482*4882a593Smuzhiyun default:
483*4882a593Smuzhiyun p += sprintf(p, "reserved ");
484*4882a593Smuzhiyun break;
485*4882a593Smuzhiyun }
486*4882a593Smuzhiyun }
487*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
488*4882a593Smuzhiyun p += sprintf(p, "requestorID: 0x%016llx ",
489*4882a593Smuzhiyun (long long)mem_err->requestor_id);
490*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
491*4882a593Smuzhiyun p += sprintf(p, "responderID: 0x%016llx ",
492*4882a593Smuzhiyun (long long)mem_err->responder_id);
493*4882a593Smuzhiyun if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
494*4882a593Smuzhiyun p += sprintf(p, "targetID: 0x%016llx ",
495*4882a593Smuzhiyun (long long)mem_err->responder_id);
496*4882a593Smuzhiyun if (p > pvt->other_detail)
497*4882a593Smuzhiyun *(p - 1) = '\0';
498*4882a593Smuzhiyun
499*4882a593Smuzhiyun edac_raw_mc_handle_error(e);
500*4882a593Smuzhiyun
501*4882a593Smuzhiyun unlock:
502*4882a593Smuzhiyun spin_unlock_irqrestore(&ghes_lock, flags);
503*4882a593Smuzhiyun }
504*4882a593Smuzhiyun
/*
 * Known systems that are safe to enable this module.
 * Anything not on this allowlist needs the force_load module parameter
 * (checked only on X86 in ghes_edac_register()).
 */
static struct acpi_platform_list plat_list[] = {
	{"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
	{ } /* End */
};
512*4882a593Smuzhiyun
/*
 * ghes_edac_register - register one logical EDAC memory controller for GHES.
 * @ghes:	GHES instance triggering the registration (unused here)
 * @dev:	device the MCI is parented to
 *
 * Multiple GHES sources may register; only the first one allocates the
 * MCI, later callers just bump ghes_refcount. DIMM information is taken
 * from the DMI/SMBIOS tables; a single fake DIMM is synthesized when the
 * BIOS reports none.
 *
 * Returns 0 on success, -ENODEV on unsupported platform or EDAC core
 * failure, -ENOMEM on allocation failure.
 */
int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	bool fake = false;
	struct mem_ctl_info *mci;
	struct ghes_pvt *pvt;
	struct edac_mc_layer layers[1];
	unsigned long flags;
	int idx = -1;
	int rc = 0;

	if (IS_ENABLED(CONFIG_X86)) {
		/* Check if safe to enable on this system */
		idx = acpi_match_platform_list(plat_list);
		if (!force_load && idx < 0)
			return -ENODEV;
	} else {
		/*
		 * Non-X86: always load. force_load is also what gates
		 * ghes_edac_unregister(), so set it here.
		 */
		force_load = true;
		idx = 0;
	}

	/* finish another registration/unregistration instance first */
	mutex_lock(&ghes_reg_mutex);

	/*
	 * We have only one logical memory controller to which all DIMMs belong.
	 */
	if (refcount_inc_not_zero(&ghes_refcount))
		goto unlock;

	ghes_scan_system();

	/* Check if we've got a bogus BIOS */
	if (!ghes_hw.num_dimms) {
		fake = true;
		ghes_hw.num_dimms = 1;
	}

	/* Single flat layer holding all DIMMs. */
	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = ghes_hw.num_dimms;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		rc = -ENOMEM;
		goto unlock;
	}

	pvt = mci->pvt_info;
	pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such system. Use this driver with caution\n");
	} else if (idx < 0) {
		/* force_load on a platform not in the allowlist */
		pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
		pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
		pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
		pr_info("If you find incorrect reports, please contact your hardware vendor\n");
		pr_info("to correct its BIOS.\n");
		pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
	}

	if (!fake) {
		/* Copy the scanned DMI data into the freshly allocated MCI. */
		struct dimm_info *src, *dst;
		int i = 0;

		mci_for_each_dimm(mci, dst) {
			src = &ghes_hw.dimms[i];

			dst->idx = src->idx;
			dst->smbios_handle = src->smbios_handle;
			dst->nr_pages = src->nr_pages;
			dst->mtype = src->mtype;
			dst->edac_mode = src->edac_mode;
			dst->dtype = src->dtype;
			dst->grain = src->grain;

			/*
			 * If no src->label, preserve default label assigned
			 * from EDAC core.
			 */
			if (strlen(src->label))
				memcpy(dst->label, src->label, sizeof(src->label));

			i++;
		}

	} else {
		/* Synthesize a single placeholder DIMM. */
		struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register with the EDAC core\n");
		edac_mc_free(mci);
		rc = -ENODEV;
		goto unlock;
	}

	/*
	 * Publish the pvt pointer to the error handler under ghes_lock;
	 * the spinlock also acts as the SMP memory barrier.
	 */
	spin_lock_irqsave(&ghes_lock, flags);
	ghes_pvt = pvt;
	spin_unlock_irqrestore(&ghes_lock, flags);

	/* only set on success */
	refcount_set(&ghes_refcount, 1);

unlock:

	/* Not needed anymore */
	kfree(ghes_hw.dimms);
	ghes_hw.dimms = NULL;

	mutex_unlock(&ghes_reg_mutex);

	return rc;
}
645*4882a593Smuzhiyun
/*
 * ghes_edac_unregister - drop one GHES user; tear down the MCI on the last.
 * @ghes:	GHES instance going away (unused here)
 *
 * Resets the DMI scan state, and when the refcount hits zero detaches
 * ghes_pvt under ghes_lock (so a concurrent error report either sees a
 * valid pointer or NULL) before freeing the MCI.
 */
void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;
	unsigned long flags;

	/* Nothing was registered unless the driver was allowed to load. */
	if (!force_load)
		return;

	mutex_lock(&ghes_reg_mutex);

	/* Force a fresh DMI scan on the next registration. */
	system_scanned = false;
	memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc));

	if (!refcount_dec_and_test(&ghes_refcount))
		goto unlock;

	/*
	 * Wait for the irq handler being finished.
	 */
	spin_lock_irqsave(&ghes_lock, flags);
	mci = ghes_pvt ? ghes_pvt->mci : NULL;
	ghes_pvt = NULL;
	spin_unlock_irqrestore(&ghes_lock, flags);

	if (!mci)
		goto unlock;

	mci = edac_mc_del_mc(mci->pdev);
	if (mci)
		edac_mc_free(mci);

unlock:
	mutex_unlock(&ghes_reg_mutex);
}
680