2 * drivers/pci/pcie/aer/aerdrv_core.c
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
8 * This file implements the core part of PCI Express AER. When a PCI Express
9 * error is delivered, an error message is collected and printed to the
10 * console; then an error recovery procedure is executed following
11 * the PCI error recovery rules.
13 * Copyright (C) 2006 Intel Corp.
14 * Tom Long Nguyen (tom.l.nguyen@intel.com)
15 * Zhang Yanmin (yanmin.zhang@intel.com)
19 #include <linux/module.h>
20 #include <linux/pci.h>
21 #include <linux/kernel.h>
22 #include <linux/errno.h>
24 #include <linux/suspend.h>
25 #include <linux/delay.h>
26 #include <linux/slab.h>
30 static int nosourceid
;
31 module_param(forceload
, bool, 0);
32 module_param(nosourceid
, bool, 0);
34 int pci_enable_pcie_error_reporting(struct pci_dev
*dev
)
39 if (dev
->aer_firmware_first
)
42 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
46 pos
= pci_pcie_cap(dev
);
50 pci_read_config_word(dev
, pos
+PCI_EXP_DEVCTL
, &reg16
);
53 PCI_EXP_DEVCTL_NFERE
|
56 pci_write_config_word(dev
, pos
+PCI_EXP_DEVCTL
, reg16
);
60 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting
);
62 int pci_disable_pcie_error_reporting(struct pci_dev
*dev
)
67 if (dev
->aer_firmware_first
)
70 pos
= pci_pcie_cap(dev
);
74 pci_read_config_word(dev
, pos
+PCI_EXP_DEVCTL
, &reg16
);
75 reg16
= reg16
& ~(PCI_EXP_DEVCTL_CERE
|
76 PCI_EXP_DEVCTL_NFERE
|
79 pci_write_config_word(dev
, pos
+PCI_EXP_DEVCTL
, reg16
);
83 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting
);
85 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev
*dev
)
90 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
94 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
96 pci_write_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, status
);
100 EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status
);
103 * add_error_device - list device to be handled
104 * @e_info: pointer to error info
105 * @dev: pointer to pci_dev to be added
107 static int add_error_device(struct aer_err_info
*e_info
, struct pci_dev
*dev
)
109 if (e_info
->error_dev_num
< AER_MAX_MULTI_ERR_DEVICES
) {
110 e_info
->dev
[e_info
->error_dev_num
] = dev
;
111 e_info
->error_dev_num
++;
117 #define PCI_BUS(x) (((x) >> 8) & 0xff)
120 * is_error_source - check whether the device is source of reported error
121 * @dev: pointer to pci_dev to be checked
122 * @e_info: pointer to reported error info
124 static bool is_error_source(struct pci_dev
*dev
, struct aer_err_info
*e_info
)
131 * When bus id is equal to 0, it might be a bad id
132 * reported by root port.
134 if (!nosourceid
&& (PCI_BUS(e_info
->id
) != 0)) {
135 /* Device ID match? */
136 if (e_info
->id
== ((dev
->bus
->number
<< 8) | dev
->devfn
))
139 /* Continue ID comparison if there are no multiple errors */
140 if (!e_info
->multi_error_valid
)
147 * 2) bus id is equal to 0. Some ports might lose the bus
148 * id of the error source id;
149 * 3) There are multiple errors and the prior id comparison fails;
150 * We check AER status registers to find a possible reporter.
152 if (atomic_read(&dev
->enable_cnt
) == 0)
154 pos
= pci_pcie_cap(dev
);
158 /* Check if AER is enabled */
159 pci_read_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, &reg16
);
161 PCI_EXP_DEVCTL_CERE
|
162 PCI_EXP_DEVCTL_NFERE
|
163 PCI_EXP_DEVCTL_FERE
|
164 PCI_EXP_DEVCTL_URRE
)))
166 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
170 /* Check if error is recorded */
171 if (e_info
->severity
== AER_CORRECTABLE
) {
172 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
, &status
);
173 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
, &mask
);
175 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
176 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
, &mask
);
184 static int find_device_iter(struct pci_dev
*dev
, void *data
)
186 struct aer_err_info
*e_info
= (struct aer_err_info
*)data
;
188 if (is_error_source(dev
, e_info
)) {
189 /* List this device */
190 if (add_error_device(e_info
, dev
)) {
191 /* We cannot handle more... Stop iteration */
192 /* TODO: Should print error message here? */
196 /* If there is only a single error, stop iteration */
197 if (!e_info
->multi_error_valid
)
204 * find_source_device - search through device hierarchy for source device
205 * @parent: pointer to Root Port pci_dev data structure
206 * @e_info: detailed error information, such as the id
208 * Return true if found.
210 * Invoked by DPC when error is detected at the Root Port.
212 static bool find_source_device(struct pci_dev
*parent
,
213 struct aer_err_info
*e_info
)
215 struct pci_dev
*dev
= parent
;
218 /* Is Root Port an agent that sends error message? */
219 result
= find_device_iter(dev
, e_info
);
223 pci_walk_bus(parent
->subordinate
, find_device_iter
, e_info
);
225 if (!e_info
->error_dev_num
) {
226 dev_printk(KERN_DEBUG
, &parent
->dev
,
227 "can't find device of ID%04x\n",
234 static int report_error_detected(struct pci_dev
*dev
, void *data
)
236 pci_ers_result_t vote
;
237 struct pci_error_handlers
*err_handler
;
238 struct aer_broadcast_data
*result_data
;
239 result_data
= (struct aer_broadcast_data
*) data
;
241 dev
->error_state
= result_data
->state
;
244 !dev
->driver
->err_handler
||
245 !dev
->driver
->err_handler
->error_detected
) {
246 if (result_data
->state
== pci_channel_io_frozen
&&
247 !(dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
)) {
249 * In case of fatal recovery, if one of the down-
250 * stream devices has no driver, we might be
251 * unable to recover because a later insmod
252 * of a driver for this device is unaware of
255 dev_printk(KERN_DEBUG
, &dev
->dev
, "device has %s\n",
257 "no AER-aware driver" : "no driver");
262 err_handler
= dev
->driver
->err_handler
;
263 vote
= err_handler
->error_detected(dev
, result_data
->state
);
264 result_data
->result
= merge_result(result_data
->result
, vote
);
268 static int report_mmio_enabled(struct pci_dev
*dev
, void *data
)
270 pci_ers_result_t vote
;
271 struct pci_error_handlers
*err_handler
;
272 struct aer_broadcast_data
*result_data
;
273 result_data
= (struct aer_broadcast_data
*) data
;
276 !dev
->driver
->err_handler
||
277 !dev
->driver
->err_handler
->mmio_enabled
)
280 err_handler
= dev
->driver
->err_handler
;
281 vote
= err_handler
->mmio_enabled(dev
);
282 result_data
->result
= merge_result(result_data
->result
, vote
);
286 static int report_slot_reset(struct pci_dev
*dev
, void *data
)
288 pci_ers_result_t vote
;
289 struct pci_error_handlers
*err_handler
;
290 struct aer_broadcast_data
*result_data
;
291 result_data
= (struct aer_broadcast_data
*) data
;
294 !dev
->driver
->err_handler
||
295 !dev
->driver
->err_handler
->slot_reset
)
298 err_handler
= dev
->driver
->err_handler
;
299 vote
= err_handler
->slot_reset(dev
);
300 result_data
->result
= merge_result(result_data
->result
, vote
);
304 static int report_resume(struct pci_dev
*dev
, void *data
)
306 struct pci_error_handlers
*err_handler
;
308 dev
->error_state
= pci_channel_io_normal
;
311 !dev
->driver
->err_handler
||
312 !dev
->driver
->err_handler
->resume
)
315 err_handler
= dev
->driver
->err_handler
;
316 err_handler
->resume(dev
);
321 * broadcast_error_message - handle message broadcast to downstream drivers
322 * @dev: pointer to the device in the hierarchy from which the message is broadcast down
323 * @state: error state
324 * @error_mesg: message to print
325 * @cb: callback to be broadcast
327 * Invoked during the error recovery process. Once invoked, the error
328 * severity is broadcast to all downstream drivers in the
329 * hierarchy in question.
331 static pci_ers_result_t
broadcast_error_message(struct pci_dev
*dev
,
332 enum pci_channel_state state
,
334 int (*cb
)(struct pci_dev
*, void *))
336 struct aer_broadcast_data result_data
;
338 dev_printk(KERN_DEBUG
, &dev
->dev
, "broadcast %s message\n", error_mesg
);
339 result_data
.state
= state
;
340 if (cb
== report_error_detected
)
341 result_data
.result
= PCI_ERS_RESULT_CAN_RECOVER
;
343 result_data
.result
= PCI_ERS_RESULT_RECOVERED
;
345 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
) {
347 * If the error is reported by a bridge, we think this error
348 * is related to the downstream link of the bridge, so we
349 * do error recovery on all subordinates of the bridge instead
350 * of the bridge and clear the error status of the bridge.
352 if (cb
== report_error_detected
)
353 dev
->error_state
= state
;
354 pci_walk_bus(dev
->subordinate
, cb
, &result_data
);
355 if (cb
== report_resume
) {
356 pci_cleanup_aer_uncorrect_error_status(dev
);
357 dev
->error_state
= pci_channel_io_normal
;
361 * If the error is reported by an end point, we think this
362 * error is related to the upstream link of the end point.
364 pci_walk_bus(dev
->bus
, cb
, &result_data
);
367 return result_data
.result
;
370 struct find_aer_service_data
{
371 struct pcie_port_service_driver
*aer_driver
;
375 static int find_aer_service_iter(struct device
*device
, void *data
)
377 struct device_driver
*driver
;
378 struct pcie_port_service_driver
*service_driver
;
379 struct find_aer_service_data
*result
;
381 result
= (struct find_aer_service_data
*) data
;
383 if (device
->bus
== &pcie_port_bus_type
) {
384 struct pcie_device
*pcie
= to_pcie_device(device
);
386 if (pcie
->port
->pcie_type
== PCI_EXP_TYPE_DOWNSTREAM
)
387 result
->is_downstream
= 1;
389 driver
= device
->driver
;
391 service_driver
= to_service_driver(driver
);
392 if (service_driver
->service
== PCIE_PORT_SERVICE_AER
) {
393 result
->aer_driver
= service_driver
;
402 static void find_aer_service(struct pci_dev
*dev
,
403 struct find_aer_service_data
*data
)
406 retval
= device_for_each_child(&dev
->dev
, data
, find_aer_service_iter
);
409 static pci_ers_result_t
reset_link(struct pcie_device
*aerdev
,
412 struct pci_dev
*udev
;
413 pci_ers_result_t status
;
414 struct find_aer_service_data data
;
416 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
)
419 udev
= dev
->bus
->self
;
421 data
.is_downstream
= 0;
422 data
.aer_driver
= NULL
;
423 find_aer_service(udev
, &data
);
426 * Use the AER driver of the error agent first.
427 * If it has no AER driver, use the root port's
429 if (!data
.aer_driver
|| !data
.aer_driver
->reset_link
) {
430 if (data
.is_downstream
&&
431 aerdev
->device
.driver
&&
432 to_service_driver(aerdev
->device
.driver
)->reset_link
) {
434 to_service_driver(aerdev
->device
.driver
);
436 dev_printk(KERN_DEBUG
, &dev
->dev
, "no link-reset "
438 return PCI_ERS_RESULT_DISCONNECT
;
442 status
= data
.aer_driver
->reset_link(udev
);
443 if (status
!= PCI_ERS_RESULT_RECOVERED
) {
444 dev_printk(KERN_DEBUG
, &dev
->dev
, "link reset at upstream "
445 "device %s failed\n", pci_name(udev
));
446 return PCI_ERS_RESULT_DISCONNECT
;
453 * do_recovery - handle nonfatal/fatal error recovery process
454 * @aerdev: pointer to a pcie_device data structure of root port
455 * @dev: pointer to a pci_dev data structure of agent detecting an error
456 * @severity: error severity type
458 * Invoked when an error is nonfatal/fatal. Once invoked, broadcasts the
459 * error detected message to all downstream drivers within the hierarchy in
460 * question and returns the resulting code.
462 static pci_ers_result_t
do_recovery(struct pcie_device
*aerdev
,
466 pci_ers_result_t status
, result
= PCI_ERS_RESULT_RECOVERED
;
467 enum pci_channel_state state
;
469 if (severity
== AER_FATAL
)
470 state
= pci_channel_io_frozen
;
472 state
= pci_channel_io_normal
;
474 status
= broadcast_error_message(dev
,
477 report_error_detected
);
479 if (severity
== AER_FATAL
) {
480 result
= reset_link(aerdev
, dev
);
481 if (result
!= PCI_ERS_RESULT_RECOVERED
) {
482 /* TODO: Should panic here? */
487 if (status
== PCI_ERS_RESULT_CAN_RECOVER
)
488 status
= broadcast_error_message(dev
,
491 report_mmio_enabled
);
493 if (status
== PCI_ERS_RESULT_NEED_RESET
) {
495 * TODO: Should call platform-specific
496 * functions to reset slot before calling
497 * drivers' slot_reset callbacks?
499 status
= broadcast_error_message(dev
,
505 if (status
== PCI_ERS_RESULT_RECOVERED
)
506 broadcast_error_message(dev
,
515 * handle_error_source - handle logging error into an event log
516 * @aerdev: pointer to pcie_device data structure of the root port
517 * @dev: pointer to pci_dev data structure of error source device
518 * @info: comprehensive error information
520 * Invoked when an error is detected by the Root Port.
522 static void handle_error_source(struct pcie_device
*aerdev
,
524 struct aer_err_info
*info
)
526 pci_ers_result_t status
= 0;
529 if (info
->severity
== AER_CORRECTABLE
) {
531 * Correctable errors do not need software intervention.
532 * No need to go through error recovery process.
534 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
536 pci_write_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
539 status
= do_recovery(aerdev
, dev
, info
->severity
);
540 if (status
== PCI_ERS_RESULT_RECOVERED
) {
541 dev_printk(KERN_DEBUG
, &dev
->dev
, "AER driver "
542 "successfully recovered\n");
544 /* TODO: Should kernel panic here? */
545 dev_printk(KERN_DEBUG
, &dev
->dev
, "AER driver didn't "
552 * get_e_source - retrieve an error source
553 * @rpc: pointer to the root port which holds an error
555 * Invoked by DPC handler to consume an error.
557 static struct aer_err_source
*get_e_source(struct aer_rpc
*rpc
)
559 struct aer_err_source
*e_source
;
562 /* Lock access to Root error producer/consumer index */
563 spin_lock_irqsave(&rpc
->e_lock
, flags
);
564 if (rpc
->prod_idx
== rpc
->cons_idx
) {
565 spin_unlock_irqrestore(&rpc
->e_lock
, flags
);
568 e_source
= &rpc
->e_sources
[rpc
->cons_idx
];
570 if (rpc
->cons_idx
== AER_ERROR_SOURCES_MAX
)
572 spin_unlock_irqrestore(&rpc
->e_lock
, flags
);
578 * get_device_error_info - read error status from dev and store it to info
579 * @dev: pointer to the device expected to have an error record
580 * @info: pointer to structure to store the error record
582 * Return 1 on success, 0 on error.
584 static int get_device_error_info(struct pci_dev
*dev
, struct aer_err_info
*info
)
589 info
->tlp_header_valid
= 0;
591 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
593 /* The device might not support AER */
597 if (info
->severity
== AER_CORRECTABLE
) {
598 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
600 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
,
602 if (!(info
->status
& ~info
->mask
))
604 } else if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
||
605 info
->severity
== AER_NONFATAL
) {
607 /* Link is still healthy for IO reads */
608 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
,
610 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
,
612 if (!(info
->status
& ~info
->mask
))
615 /* Get First Error Pointer */
616 pci_read_config_dword(dev
, pos
+ PCI_ERR_CAP
, &temp
);
617 info
->first_error
= PCI_ERR_CAP_FEP(temp
);
619 if (info
->status
& AER_LOG_TLP_MASKS
) {
620 info
->tlp_header_valid
= 1;
621 pci_read_config_dword(dev
,
622 pos
+ PCI_ERR_HEADER_LOG
, &info
->tlp
.dw0
);
623 pci_read_config_dword(dev
,
624 pos
+ PCI_ERR_HEADER_LOG
+ 4, &info
->tlp
.dw1
);
625 pci_read_config_dword(dev
,
626 pos
+ PCI_ERR_HEADER_LOG
+ 8, &info
->tlp
.dw2
);
627 pci_read_config_dword(dev
,
628 pos
+ PCI_ERR_HEADER_LOG
+ 12, &info
->tlp
.dw3
);
635 static inline void aer_process_err_devices(struct pcie_device
*p_device
,
636 struct aer_err_info
*e_info
)
640 /* Report all errors before handling them, so records are not lost to a reset etc. */
641 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
642 if (get_device_error_info(e_info
->dev
[i
], e_info
))
643 aer_print_error(e_info
->dev
[i
], e_info
);
645 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
646 if (get_device_error_info(e_info
->dev
[i
], e_info
))
647 handle_error_source(p_device
, e_info
->dev
[i
], e_info
);
652 * aer_isr_one_error - consume an error detected by root port
653 * @p_device: pointer to error root port service device
654 * @e_src: pointer to an error source
656 static void aer_isr_one_error(struct pcie_device
*p_device
,
657 struct aer_err_source
*e_src
)
659 struct aer_err_info
*e_info
;
662 /* struct aer_err_info might be big, so we allocate it with slab */
663 e_info
= kmalloc(sizeof(struct aer_err_info
), GFP_KERNEL
);
664 if (e_info
== NULL
) {
665 dev_printk(KERN_DEBUG
, &p_device
->port
->dev
,
666 "Can't allocate mem when processing AER errors\n");
671 * It is possible that both a correctable error and an
672 * uncorrectable error are logged. Report the correctable error first.
674 for (i
= 1; i
& ROOT_ERR_STATUS_MASKS
; i
<<= 2) {
677 if (!(e_src
->status
& i
))
680 memset(e_info
, 0, sizeof(struct aer_err_info
));
682 /* Init comprehensive error information */
683 if (i
& PCI_ERR_ROOT_COR_RCV
) {
684 e_info
->id
= ERR_COR_ID(e_src
->id
);
685 e_info
->severity
= AER_CORRECTABLE
;
687 e_info
->id
= ERR_UNCOR_ID(e_src
->id
);
688 e_info
->severity
= ((e_src
->status
>> 6) & 1);
691 (PCI_ERR_ROOT_MULTI_COR_RCV
|
692 PCI_ERR_ROOT_MULTI_UNCOR_RCV
))
693 e_info
->multi_error_valid
= 1;
695 aer_print_port_info(p_device
->port
, e_info
);
697 if (find_source_device(p_device
->port
, e_info
))
698 aer_process_err_devices(p_device
, e_info
);
705 * aer_isr - consume errors detected by root port
706 * @work: definition of this work item
708 * Invoked, as DPC, when the root port records a newly detected error
710 void aer_isr(struct work_struct
*work
)
712 struct aer_rpc
*rpc
= container_of(work
, struct aer_rpc
, dpc_handler
);
713 struct pcie_device
*p_device
= rpc
->rpd
;
714 struct aer_err_source
*e_src
;
716 mutex_lock(&rpc
->rpc_mutex
);
717 e_src
= get_e_source(rpc
);
719 aer_isr_one_error(p_device
, e_src
);
720 e_src
= get_e_source(rpc
);
722 mutex_unlock(&rpc
->rpc_mutex
);
724 wake_up(&rpc
->wait_release
);
728 * aer_init - provide AER initialization
729 * @dev: pointer to AER pcie device
731 * Invoked when AER service driver is loaded.
733 int aer_init(struct pcie_device
*dev
)
735 if (dev
->port
->aer_firmware_first
) {
736 dev_printk(KERN_DEBUG
, &dev
->device
,
737 "PCIe errors handled by platform firmware.\n");
741 if (aer_osc_setup(dev
))
747 dev_printk(KERN_DEBUG
, &dev
->device
,
748 "aerdrv forceload requested.\n");
749 dev
->port
->aer_firmware_first
= 0;