1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
|
/*
* /dev/mcelog driver
*
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include "mce-internal.h"
static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
static DEFINE_MUTEX(mce_chrdev_read_mutex);
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
#define mce_log_get_idx_check(p) \
({ \
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
!lockdep_is_held(&mce_chrdev_read_mutex), \
"suspicious mce_log_get_idx_check() usage"); \
smp_load_acquire(&(p)); \
})
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
static struct mce_log_buffer mcelog = {
.signature = MCE_LOG_SIGNATURE,
.len = MCE_LOG_LEN,
.recordlen = sizeof(struct mce),
};
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
/* User mode helper program triggered by machine check event */
extern char mce_helper[128];
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned int next, entry;
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog.flags);
return NOTIFY_OK;
}
/* Old left over entry. Skip: */
if (mcelog.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
if (cmpxchg(&mcelog.next, entry, next) == entry)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
wmb();
mcelog.entry[entry].finished = 1;
wmb();
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
return NOTIFY_OK;
}
static struct notifier_block dev_mcelog_nb = {
.notifier_call = dev_mce_log,
.priority = MCE_PRIO_MCELOG,
};
static void mce_do_trigger(struct work_struct *work)
{
call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
void mce_work_trigger(void)
{
if (mce_helper[0])
schedule_work(&mce_trigger_work);
}
static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
strncpy(mce_helper, buf, sizeof(mce_helper));
mce_helper[sizeof(mce_helper)-1] = 0;
p = strchr(mce_helper, '\n');
if (p)
*p = 0;
return strlen(mce_helper) + !!p;
}
DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
/*
* mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count; /* #times opened */
static int mce_chrdev_open_exclu; /* already open exclusive? */
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
spin_lock(&mce_chrdev_state_lock);
if (mce_chrdev_open_exclu ||
(mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
mce_chrdev_open_exclu = 1;
mce_chrdev_open_count++;
spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
static int mce_chrdev_release(struct inode *inode, struct file *file)
{
spin_lock(&mce_chrdev_state_lock);
mce_chrdev_open_count--;
mce_chrdev_open_exclu = 0;
spin_unlock(&mce_chrdev_state_lock);
return 0;
}
static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
cpu_tsc[smp_processor_id()] = rdtsc();
}
static int mce_apei_read_done;
/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
int rc;
u64 record_id;
struct mce m;
if (usize < sizeof(struct mce))
return -EINVAL;
rc = apei_read_mce(&m, &record_id);
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
/*
* When ERST is disabled, mce_chrdev_read() should return
* "no record" instead of "no device."
*/
if (rc == -ENODEV)
return 0;
return rc;
}
rc = -EFAULT;
if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
return rc;
/*
* In fact, we should have cleared the record after that has
* been flushed to the disk or sent to network in
* /sbin/mcelog, but we have no interface to support that now,
* so just clear it to avoid duplication.
*/
rc = apei_clear_mce(record_id);
if (rc) {
mce_apei_read_done = 1;
return rc;
}
*ubuf += sizeof(struct mce);
return 0;
}
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
unsigned prev, next;
int i, err;
cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
if (!cpu_tsc)
return -ENOMEM;
mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
if (err || buf != ubuf)
goto out;
}
next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
goto out;
err = 0;
prev = 0;
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
struct mce *m = &mcelog.entry[i];
while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
timeout:
;
}
memset(mcelog.entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
next = cmpxchg(&mcelog.next, prev, 0);
} while (next != prev);
synchronize_sched();
/*
* Collect entries that were still getting written before the
* synchronize.
*/
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
buf += sizeof(*m);
memset(m, 0, sizeof(*m));
}
}
if (err)
err = -EFAULT;
out:
mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
if (READ_ONCE(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return POLLIN | POLLRDNORM;
return 0;
}
static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
unsigned long arg)
{
int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = mcelog.flags;
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
return put_user(flags, p);
}
default:
return -ENOTTY;
}
}
void mce_register_injector_chain(struct notifier_block *nb)
{
blocking_notifier_chain_register(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_injector_chain);
void mce_unregister_injector_chain(struct notifier_block *nb)
{
blocking_notifier_chain_unregister(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
struct mce m;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/*
* There are some cases where real MSR reads could slip
* through.
*/
if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
return -EIO;
if ((unsigned long)usize > sizeof(struct mce))
usize = sizeof(struct mce);
if (copy_from_user(&m, ubuf, usize))
return -EFAULT;
if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
/*
* Need to give user space some time to set everything up,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
return usize;
}
static const struct file_operations mce_chrdev_ops = {
.open = mce_chrdev_open,
.release = mce_chrdev_release,
.read = mce_chrdev_read,
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
};
static __init int dev_mcelog_init_device(void)
{
int err;
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err) {
if (err == -EBUSY)
/* Xen dom0 might have registered the device already. */
pr_info("Unable to init device /dev/mcelog, already registered");
else
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
mce_register_decode_chain(&dev_mcelog_nb);
return 0;
}
device_initcall_sync(dev_mcelog_init_device);
|