2 * drivers/pci/pcie/aer/aerdrv_core.c
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
8 * This file implements the core part of PCI-Express AER. When a PCI-Express
9 * error is delivered, an error message will be collected and printed to
10 * console, then, an error recovery procedure will be executed by following
11 * the pci error recovery rules.
13 * Copyright (C) 2006 Intel Corp.
14 * Tom Long Nguyen (tom.l.nguyen@intel.com)
15 * Zhang Yanmin (yanmin.zhang@intel.com)
19 #include <linux/module.h>
20 #include <linux/pci.h>
21 #include <linux/kernel.h>
22 #include <linux/errno.h>
24 #include <linux/suspend.h>
25 #include <linux/delay.h>
26 #include <linux/slab.h>
30 static int nosourceid
;
31 module_param(forceload
, bool, 0);
32 module_param(nosourceid
, bool, 0);
34 int pci_enable_pcie_error_reporting(struct pci_dev
*dev
)
39 if (dev
->aer_firmware_first
)
42 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
46 pos
= pci_pcie_cap(dev
);
50 pci_read_config_word(dev
, pos
+PCI_EXP_DEVCTL
, &reg16
);
53 PCI_EXP_DEVCTL_NFERE
|
56 pci_write_config_word(dev
, pos
+PCI_EXP_DEVCTL
, reg16
);
60 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting
);
62 int pci_disable_pcie_error_reporting(struct pci_dev
*dev
)
67 if (dev
->aer_firmware_first
)
70 pos
= pci_pcie_cap(dev
);
74 pci_read_config_word(dev
, pos
+PCI_EXP_DEVCTL
, &reg16
);
75 reg16
= reg16
& ~(PCI_EXP_DEVCTL_CERE
|
76 PCI_EXP_DEVCTL_NFERE
|
79 pci_write_config_word(dev
, pos
+PCI_EXP_DEVCTL
, reg16
);
83 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting
);
85 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev
*dev
)
90 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
94 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
96 pci_write_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, status
);
100 EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status
);
103 * add_error_device - list device to be handled
104 * @e_info: pointer to error info
105 * @dev: pointer to pci_dev to be added
107 static int add_error_device(struct aer_err_info
*e_info
, struct pci_dev
*dev
)
109 if (e_info
->error_dev_num
< AER_MAX_MULTI_ERR_DEVICES
) {
110 e_info
->dev
[e_info
->error_dev_num
] = dev
;
111 e_info
->error_dev_num
++;
117 #define PCI_BUS(x) (((x) >> 8) & 0xff)
120 * is_error_source - check whether the device is source of reported error
121 * @dev: pointer to pci_dev to be checked
122 * @e_info: pointer to reported error info
124 static bool is_error_source(struct pci_dev
*dev
, struct aer_err_info
*e_info
)
131 * When bus id is equal to 0, it might be a bad id
132 * reported by root port.
134 if (!nosourceid
&& (PCI_BUS(e_info
->id
) != 0)) {
135 /* Device ID match? */
136 if (e_info
->id
== ((dev
->bus
->number
<< 8) | dev
->devfn
))
139 /* Continue ID comparison if there are not multiple errors */
140 if (!e_info
->multi_error_valid
)
147 * 2) bus id is equal to 0. Some ports might lose the bus
148 * id of the error source id;
149 * 3) There are multiple errors and the prior id comparison failed;
150 * We check AER status registers to find possible reporter.
152 if (atomic_read(&dev
->enable_cnt
) == 0)
154 pos
= pci_pcie_cap(dev
);
158 /* Check if AER is enabled */
159 pci_read_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, &reg16
);
161 PCI_EXP_DEVCTL_CERE
|
162 PCI_EXP_DEVCTL_NFERE
|
163 PCI_EXP_DEVCTL_FERE
|
164 PCI_EXP_DEVCTL_URRE
)))
166 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
170 /* Check if error is recorded */
171 if (e_info
->severity
== AER_CORRECTABLE
) {
172 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
, &status
);
173 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
, &mask
);
175 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
176 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
, &mask
);
184 static int find_device_iter(struct pci_dev
*dev
, void *data
)
186 struct aer_err_info
*e_info
= (struct aer_err_info
*)data
;
188 if (is_error_source(dev
, e_info
)) {
189 /* List this device */
190 if (add_error_device(e_info
, dev
)) {
191 /* We cannot handle more... Stop iteration */
192 /* TODO: Should print error message here? */
196 /* If there is only a single error, stop iteration */
197 if (!e_info
->multi_error_valid
)
204 * find_source_device - search through device hierarchy for source device
205 * @parent: pointer to Root Port pci_dev data structure
206 * @e_info: detailed error information, such as the id
208 * Return true if found.
210 * Invoked by DPC when an error is detected at the Root Port.
211 * Caller of this function must set id, severity, and multi_error_valid of
212 * struct aer_err_info pointed by @e_info properly. This function must fill
213 * e_info->error_dev_num and e_info->dev[], based on the given information.
215 static bool find_source_device(struct pci_dev
*parent
,
216 struct aer_err_info
*e_info
)
218 struct pci_dev
*dev
= parent
;
221 /* Must reset in this function */
222 e_info
->error_dev_num
= 0;
224 /* Is Root Port an agent that sends error message? */
225 result
= find_device_iter(dev
, e_info
);
229 pci_walk_bus(parent
->subordinate
, find_device_iter
, e_info
);
231 if (!e_info
->error_dev_num
) {
232 dev_printk(KERN_DEBUG
, &parent
->dev
,
233 "can't find device of ID%04x\n",
240 static int report_error_detected(struct pci_dev
*dev
, void *data
)
242 pci_ers_result_t vote
;
243 struct pci_error_handlers
*err_handler
;
244 struct aer_broadcast_data
*result_data
;
245 result_data
= (struct aer_broadcast_data
*) data
;
247 dev
->error_state
= result_data
->state
;
250 !dev
->driver
->err_handler
||
251 !dev
->driver
->err_handler
->error_detected
) {
252 if (result_data
->state
== pci_channel_io_frozen
&&
253 !(dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
)) {
255 * In case of fatal recovery, if one of the down-
256 * stream devices has no driver, we might be
257 * unable to recover because a later insmod
258 * of a driver for this device is unaware of
261 dev_printk(KERN_DEBUG
, &dev
->dev
, "device has %s\n",
263 "no AER-aware driver" : "no driver");
268 err_handler
= dev
->driver
->err_handler
;
269 vote
= err_handler
->error_detected(dev
, result_data
->state
);
270 result_data
->result
= merge_result(result_data
->result
, vote
);
274 static int report_mmio_enabled(struct pci_dev
*dev
, void *data
)
276 pci_ers_result_t vote
;
277 struct pci_error_handlers
*err_handler
;
278 struct aer_broadcast_data
*result_data
;
279 result_data
= (struct aer_broadcast_data
*) data
;
282 !dev
->driver
->err_handler
||
283 !dev
->driver
->err_handler
->mmio_enabled
)
286 err_handler
= dev
->driver
->err_handler
;
287 vote
= err_handler
->mmio_enabled(dev
);
288 result_data
->result
= merge_result(result_data
->result
, vote
);
292 static int report_slot_reset(struct pci_dev
*dev
, void *data
)
294 pci_ers_result_t vote
;
295 struct pci_error_handlers
*err_handler
;
296 struct aer_broadcast_data
*result_data
;
297 result_data
= (struct aer_broadcast_data
*) data
;
300 !dev
->driver
->err_handler
||
301 !dev
->driver
->err_handler
->slot_reset
)
304 err_handler
= dev
->driver
->err_handler
;
305 vote
= err_handler
->slot_reset(dev
);
306 result_data
->result
= merge_result(result_data
->result
, vote
);
310 static int report_resume(struct pci_dev
*dev
, void *data
)
312 struct pci_error_handlers
*err_handler
;
314 dev
->error_state
= pci_channel_io_normal
;
317 !dev
->driver
->err_handler
||
318 !dev
->driver
->err_handler
->resume
)
321 err_handler
= dev
->driver
->err_handler
;
322 err_handler
->resume(dev
);
327 * broadcast_error_message - handle message broadcast to downstream drivers
328 * @dev: pointer to the device in the hierarchy from which the message is broadcast down
329 * @state: error state
330 * @error_mesg: message to print
331 * @cb: callback to be broadcast
333 * Invoked during the error recovery process. Once invoked, the error
334 * severity will be broadcast to all downstream drivers in the
335 * hierarchy in question.
337 static pci_ers_result_t
broadcast_error_message(struct pci_dev
*dev
,
338 enum pci_channel_state state
,
340 int (*cb
)(struct pci_dev
*, void *))
342 struct aer_broadcast_data result_data
;
344 dev_printk(KERN_DEBUG
, &dev
->dev
, "broadcast %s message\n", error_mesg
);
345 result_data
.state
= state
;
346 if (cb
== report_error_detected
)
347 result_data
.result
= PCI_ERS_RESULT_CAN_RECOVER
;
349 result_data
.result
= PCI_ERS_RESULT_RECOVERED
;
351 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
) {
353 * If the error is reported by a bridge, we think this error
354 * is related to the downstream link of the bridge, so we
355 * do error recovery on all subordinates of the bridge instead
356 * of the bridge and clear the error status of the bridge.
358 if (cb
== report_error_detected
)
359 dev
->error_state
= state
;
360 pci_walk_bus(dev
->subordinate
, cb
, &result_data
);
361 if (cb
== report_resume
) {
362 pci_cleanup_aer_uncorrect_error_status(dev
);
363 dev
->error_state
= pci_channel_io_normal
;
367 * If the error is reported by an end point, we think this
368 * error is related to the upstream link of the end point.
370 pci_walk_bus(dev
->bus
, cb
, &result_data
);
373 return result_data
.result
;
377 * aer_do_secondary_bus_reset - perform secondary bus reset
378 * @dev: pointer to bridge's pci_dev data structure
380 * Invoked when performing link reset at Root Port or Downstream Port.
382 void aer_do_secondary_bus_reset(struct pci_dev
*dev
)
386 /* Assert Secondary Bus Reset */
387 pci_read_config_word(dev
, PCI_BRIDGE_CONTROL
, &p2p_ctrl
);
388 p2p_ctrl
|= PCI_BRIDGE_CTL_BUS_RESET
;
389 pci_write_config_word(dev
, PCI_BRIDGE_CONTROL
, p2p_ctrl
);
392 * we should send hot reset message for 2ms to allow it time to
393 * propagate to all downstream ports
397 /* De-assert Secondary Bus Reset */
398 p2p_ctrl
&= ~PCI_BRIDGE_CTL_BUS_RESET
;
399 pci_write_config_word(dev
, PCI_BRIDGE_CONTROL
, p2p_ctrl
);
402 * System software must wait for at least 100ms from the end
403 * of a reset of one or more devices before it is permitted
404 * to issue Configuration Requests to those devices.
410 * default_downstream_reset_link - default reset function for Downstream Port
411 * @dev: pointer to downstream port's pci_dev data structure
413 * Invoked when performing link reset at Downstream Port w/ no aer driver.
415 static pci_ers_result_t
default_downstream_reset_link(struct pci_dev
*dev
)
417 aer_do_secondary_bus_reset(dev
);
418 dev_printk(KERN_DEBUG
, &dev
->dev
,
419 "Downstream Port link has been reset\n");
420 return PCI_ERS_RESULT_RECOVERED
;
423 static int find_aer_service_iter(struct device
*device
, void *data
)
425 struct pcie_port_service_driver
*service_driver
, **drv
;
427 drv
= (struct pcie_port_service_driver
**) data
;
429 if (device
->bus
== &pcie_port_bus_type
&& device
->driver
) {
430 service_driver
= to_service_driver(device
->driver
);
431 if (service_driver
->service
== PCIE_PORT_SERVICE_AER
) {
432 *drv
= service_driver
;
440 static struct pcie_port_service_driver
*find_aer_service(struct pci_dev
*dev
)
442 struct pcie_port_service_driver
*drv
= NULL
;
444 device_for_each_child(&dev
->dev
, &drv
, find_aer_service_iter
);
449 static pci_ers_result_t
reset_link(struct pcie_device
*aerdev
,
452 struct pci_dev
*udev
;
453 pci_ers_result_t status
;
454 struct pcie_port_service_driver
*driver
;
456 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
) {
457 /* Reset this port for all subordinates */
460 /* Reset the upstream component (likely downstream port) */
461 udev
= dev
->bus
->self
;
464 /* Use the aer driver of the component first */
465 driver
= find_aer_service(udev
);
467 if (driver
&& driver
->reset_link
) {
468 status
= driver
->reset_link(udev
);
469 } else if (udev
->pcie_type
== PCI_EXP_TYPE_DOWNSTREAM
) {
470 status
= default_downstream_reset_link(udev
);
472 dev_printk(KERN_DEBUG
, &dev
->dev
,
473 "no link-reset support at upstream device %s\n",
475 return PCI_ERS_RESULT_DISCONNECT
;
478 if (status
!= PCI_ERS_RESULT_RECOVERED
) {
479 dev_printk(KERN_DEBUG
, &dev
->dev
,
480 "link reset at upstream device %s failed\n",
482 return PCI_ERS_RESULT_DISCONNECT
;
489 * do_recovery - handle nonfatal/fatal error recovery process
490 * @aerdev: pointer to a pcie_device data structure of root port
491 * @dev: pointer to a pci_dev data structure of agent detecting an error
492 * @severity: error severity type
494 * Invoked when an error is nonfatal/fatal. Once invoked, broadcasts the
495 * error detected message to all downstream drivers within the hierarchy in
496 * question and continues recovery according to the status they return.
498 static void do_recovery(struct pcie_device
*aerdev
, struct pci_dev
*dev
,
501 pci_ers_result_t status
, result
= PCI_ERS_RESULT_RECOVERED
;
502 enum pci_channel_state state
;
504 if (severity
== AER_FATAL
)
505 state
= pci_channel_io_frozen
;
507 state
= pci_channel_io_normal
;
509 status
= broadcast_error_message(dev
,
512 report_error_detected
);
514 if (severity
== AER_FATAL
) {
515 result
= reset_link(aerdev
, dev
);
516 if (result
!= PCI_ERS_RESULT_RECOVERED
)
520 if (status
== PCI_ERS_RESULT_CAN_RECOVER
)
521 status
= broadcast_error_message(dev
,
524 report_mmio_enabled
);
526 if (status
== PCI_ERS_RESULT_NEED_RESET
) {
528 * TODO: Should call platform-specific
529 * functions to reset slot before calling
530 * drivers' slot_reset callbacks?
532 status
= broadcast_error_message(dev
,
538 if (status
!= PCI_ERS_RESULT_RECOVERED
)
541 broadcast_error_message(dev
,
546 dev_printk(KERN_DEBUG
, &dev
->dev
,
547 "AER driver successfully recovered\n");
551 /* TODO: Should kernel panic here? */
552 dev_printk(KERN_DEBUG
, &dev
->dev
,
553 "AER driver didn't recover\n");
557 * handle_error_source - handle logging error into an event log
558 * @aerdev: pointer to pcie_device data structure of the root port
559 * @dev: pointer to pci_dev data structure of error source device
560 * @info: comprehensive error information
562 * Invoked when an error is detected by the Root Port.
564 static void handle_error_source(struct pcie_device
*aerdev
,
566 struct aer_err_info
*info
)
570 if (info
->severity
== AER_CORRECTABLE
) {
572 * A correctable error does not need software intervention.
573 * No need to go through error recovery process.
575 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
577 pci_write_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
580 do_recovery(aerdev
, dev
, info
->severity
);
584 * get_device_error_info - read error status from dev and store it to info
585 * @dev: pointer to the device expected to have an error record
586 * @info: pointer to structure to store the error record
588 * Return 1 on success, 0 on error.
590 * Note that @info is reused among all error devices. Clear fields properly.
592 static int get_device_error_info(struct pci_dev
*dev
, struct aer_err_info
*info
)
596 /* Must reset in this function */
598 info
->tlp_header_valid
= 0;
600 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
602 /* The device might not support AER */
606 if (info
->severity
== AER_CORRECTABLE
) {
607 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
609 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
,
611 if (!(info
->status
& ~info
->mask
))
613 } else if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
||
614 info
->severity
== AER_NONFATAL
) {
616 /* Link is still healthy for IO reads */
617 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
,
619 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
,
621 if (!(info
->status
& ~info
->mask
))
624 /* Get First Error Pointer */
625 pci_read_config_dword(dev
, pos
+ PCI_ERR_CAP
, &temp
);
626 info
->first_error
= PCI_ERR_CAP_FEP(temp
);
628 if (info
->status
& AER_LOG_TLP_MASKS
) {
629 info
->tlp_header_valid
= 1;
630 pci_read_config_dword(dev
,
631 pos
+ PCI_ERR_HEADER_LOG
, &info
->tlp
.dw0
);
632 pci_read_config_dword(dev
,
633 pos
+ PCI_ERR_HEADER_LOG
+ 4, &info
->tlp
.dw1
);
634 pci_read_config_dword(dev
,
635 pos
+ PCI_ERR_HEADER_LOG
+ 8, &info
->tlp
.dw2
);
636 pci_read_config_dword(dev
,
637 pos
+ PCI_ERR_HEADER_LOG
+ 12, &info
->tlp
.dw3
);
644 static inline void aer_process_err_devices(struct pcie_device
*p_device
,
645 struct aer_err_info
*e_info
)
649 /* Report all errors before handling them, so records are not lost to a reset etc. */
650 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
651 if (get_device_error_info(e_info
->dev
[i
], e_info
))
652 aer_print_error(e_info
->dev
[i
], e_info
);
654 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
655 if (get_device_error_info(e_info
->dev
[i
], e_info
))
656 handle_error_source(p_device
, e_info
->dev
[i
], e_info
);
661 * aer_isr_one_error - consume an error detected by root port
662 * @p_device: pointer to error root port service device
663 * @e_src: pointer to an error source
665 static void aer_isr_one_error(struct pcie_device
*p_device
,
666 struct aer_err_source
*e_src
)
668 struct aer_err_info
*e_info
;
670 /* struct aer_err_info might be big, so we allocate it with slab */
671 e_info
= kmalloc(sizeof(struct aer_err_info
), GFP_KERNEL
);
673 dev_printk(KERN_DEBUG
, &p_device
->port
->dev
,
674 "Can't allocate mem when processing AER errors\n");
679 * It is possible that both a correctable error and an
680 * uncorrectable error are logged. Report the correctable error first.
682 if (e_src
->status
& PCI_ERR_ROOT_COR_RCV
) {
683 e_info
->id
= ERR_COR_ID(e_src
->id
);
684 e_info
->severity
= AER_CORRECTABLE
;
686 if (e_src
->status
& PCI_ERR_ROOT_MULTI_COR_RCV
)
687 e_info
->multi_error_valid
= 1;
689 e_info
->multi_error_valid
= 0;
691 aer_print_port_info(p_device
->port
, e_info
);
693 if (find_source_device(p_device
->port
, e_info
))
694 aer_process_err_devices(p_device
, e_info
);
697 if (e_src
->status
& PCI_ERR_ROOT_UNCOR_RCV
) {
698 e_info
->id
= ERR_UNCOR_ID(e_src
->id
);
700 if (e_src
->status
& PCI_ERR_ROOT_FATAL_RCV
)
701 e_info
->severity
= AER_FATAL
;
703 e_info
->severity
= AER_NONFATAL
;
705 if (e_src
->status
& PCI_ERR_ROOT_MULTI_UNCOR_RCV
)
706 e_info
->multi_error_valid
= 1;
708 e_info
->multi_error_valid
= 0;
710 aer_print_port_info(p_device
->port
, e_info
);
712 if (find_source_device(p_device
->port
, e_info
))
713 aer_process_err_devices(p_device
, e_info
);
720 * get_e_source - retrieve an error source
721 * @rpc: pointer to the root port which holds an error
722 * @e_src: pointer to store retrieved error source
724 * Return 1 if an error source is retrieved, otherwise 0.
726 * Invoked by DPC handler to consume an error.
728 static int get_e_source(struct aer_rpc
*rpc
, struct aer_err_source
*e_src
)
733 /* Lock access to Root error producer/consumer index */
734 spin_lock_irqsave(&rpc
->e_lock
, flags
);
735 if (rpc
->prod_idx
!= rpc
->cons_idx
) {
736 *e_src
= rpc
->e_sources
[rpc
->cons_idx
];
738 if (rpc
->cons_idx
== AER_ERROR_SOURCES_MAX
)
742 spin_unlock_irqrestore(&rpc
->e_lock
, flags
);
748 * aer_isr - consume errors detected by root port
749 * @work: definition of this work item
751 * Invoked, as DPC, when the root port records a newly detected error
753 void aer_isr(struct work_struct
*work
)
755 struct aer_rpc
*rpc
= container_of(work
, struct aer_rpc
, dpc_handler
);
756 struct pcie_device
*p_device
= rpc
->rpd
;
757 struct aer_err_source e_src
;
759 mutex_lock(&rpc
->rpc_mutex
);
760 while (get_e_source(rpc
, &e_src
))
761 aer_isr_one_error(p_device
, &e_src
);
762 mutex_unlock(&rpc
->rpc_mutex
);
764 wake_up(&rpc
->wait_release
);
768 * aer_init - provide AER initialization
769 * @dev: pointer to AER pcie device
771 * Invoked when AER service driver is loaded.
773 int aer_init(struct pcie_device
*dev
)
775 if (dev
->port
->aer_firmware_first
) {
776 dev_printk(KERN_DEBUG
, &dev
->device
,
777 "PCIe errors handled by platform firmware.\n");
781 if (aer_osc_setup(dev
))
787 dev_printk(KERN_DEBUG
, &dev
->device
,
788 "aerdrv forceload requested.\n");
789 dev
->port
->aer_firmware_first
= 0;