2 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
3 * Copyright (C) 2004, 2005 Linas Vepstas <linas@linas.org>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or (at
10 * your option) any later version.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 * Send feedback to <linas@us.ibm.com>
25 #include <linux/delay.h>
26 #include <linux/interrupt.h>
27 #include <linux/irq.h>
28 #include <linux/pci.h>
30 #include <asm/eeh_event.h>
31 #include <asm/ppc-pci.h>
32 #include <asm/pci-bridge.h>
37 static inline const char * pcid_name (struct pci_dev
*pdev
)
39 if (pdev
&& pdev
->dev
.driver
)
40 return pdev
->dev
.driver
->name
;
45 static void print_device_node_tree (struct pci_dn
*pdn
, int dent
)
51 printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
52 pdn
->node
->name
, pdn
->eeh_mode
, pdn
->eeh_config_addr
,
53 pdn
->eeh_pe_config_addr
, pdn
->node
->full_name
);
55 struct device_node
*pc
= pdn
->node
->child
;
57 print_device_node_tree(PCI_DN(pc
), dent
);
64 * irq_in_use - return true if this irq is being used
66 static int irq_in_use(unsigned int irq
)
70 struct irq_desc
*desc
= irq_desc
+ irq
;
72 spin_lock_irqsave(&desc
->lock
, flags
);
75 spin_unlock_irqrestore(&desc
->lock
, flags
);
79 /* ------------------------------------------------------- */
81 * eeh_report_error - report pci error to each device driver
83 * Report an EEH error to each device driver, collect up and
84 * merge the device driver responses. Cumulative response
85 * passed back in "userdata".
88 static void eeh_report_error(struct pci_dev
*dev
, void *userdata
)
90 enum pci_ers_result rc
, *res
= userdata
;
91 struct pci_driver
*driver
= dev
->driver
;
93 dev
->error_state
= pci_channel_io_frozen
;
98 if (irq_in_use (dev
->irq
)) {
99 struct device_node
*dn
= pci_device_to_OF_node(dev
);
100 PCI_DN(dn
)->eeh_mode
|= EEH_MODE_IRQ_DISABLED
;
101 disable_irq_nosync(dev
->irq
);
103 if (!driver
->err_handler
||
104 !driver
->err_handler
->error_detected
)
107 rc
= driver
->err_handler
->error_detected (dev
, pci_channel_io_frozen
);
108 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
109 if (*res
== PCI_ERS_RESULT_DISCONNECT
&&
110 rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
114 * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
116 * Tell each device driver that MMIO has been enabled, collect up and
117 * merge the device driver responses. Cumulative response
118 * passed back in "userdata".
121 static void eeh_report_mmio_enabled(struct pci_dev
*dev
, void *userdata
)
123 enum pci_ers_result rc
, *res
= userdata
;
124 struct pci_driver
*driver
= dev
->driver
;
126 // dev->error_state = pci_channel_mmio_enabled;
129 !driver
->err_handler
||
130 !driver
->err_handler
->mmio_enabled
)
133 rc
= driver
->err_handler
->mmio_enabled (dev
);
134 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
135 if (*res
== PCI_ERS_RESULT_DISCONNECT
&&
136 rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
140 * eeh_report_reset - tell device that slot has been reset
143 static void eeh_report_reset(struct pci_dev
*dev
, void *userdata
)
145 enum pci_ers_result rc
, *res
= userdata
;
146 struct pci_driver
*driver
= dev
->driver
;
147 struct device_node
*dn
= pci_device_to_OF_node(dev
);
152 if ((PCI_DN(dn
)->eeh_mode
) & EEH_MODE_IRQ_DISABLED
) {
153 PCI_DN(dn
)->eeh_mode
&= ~EEH_MODE_IRQ_DISABLED
;
154 enable_irq(dev
->irq
);
156 if (!driver
->err_handler
||
157 !driver
->err_handler
->slot_reset
)
160 rc
= driver
->err_handler
->slot_reset(dev
);
161 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
162 if (*res
== PCI_ERS_RESULT_DISCONNECT
&&
163 rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
167 * eeh_report_resume - tell device to resume normal operations
170 static void eeh_report_resume(struct pci_dev
*dev
, void *userdata
)
172 struct pci_driver
*driver
= dev
->driver
;
173 struct device_node
*dn
= pci_device_to_OF_node(dev
);
175 dev
->error_state
= pci_channel_io_normal
;
180 if ((PCI_DN(dn
)->eeh_mode
) & EEH_MODE_IRQ_DISABLED
) {
181 PCI_DN(dn
)->eeh_mode
&= ~EEH_MODE_IRQ_DISABLED
;
182 enable_irq(dev
->irq
);
184 if (!driver
->err_handler
||
185 !driver
->err_handler
->resume
)
188 driver
->err_handler
->resume(dev
);
192 * eeh_report_failure - tell device driver that device is dead.
194 * This informs the device driver that the device is permanently
195 * dead, and that no further recovery attempts will be made on it.
198 static void eeh_report_failure(struct pci_dev
*dev
, void *userdata
)
200 struct pci_driver
*driver
= dev
->driver
;
202 dev
->error_state
= pci_channel_io_perm_failure
;
207 if (irq_in_use (dev
->irq
)) {
208 struct device_node
*dn
= pci_device_to_OF_node(dev
);
209 PCI_DN(dn
)->eeh_mode
|= EEH_MODE_IRQ_DISABLED
;
210 disable_irq_nosync(dev
->irq
);
212 if (!driver
->err_handler
)
214 if (!driver
->err_handler
->error_detected
)
216 driver
->err_handler
->error_detected(dev
, pci_channel_io_perm_failure
);
219 /* ------------------------------------------------------- */
221 * handle_eeh_events -- reset a PCI device after hard lockup.
223 * pSeries systems will isolate a PCI slot if the PCI-Host
224 * bridge detects address or data parity errors, DMA's
225 * occurring to wild addresses (which usually happen due to
226 * bugs in device drivers or in PCI adapter firmware).
227 * Slot isolations also occur if #SERR, #PERR or other misc
228 * PCI-related errors are detected.
230 * Recovery process consists of unplugging the device driver
231 * (which generated hotplug events to userspace), then issuing
232 * a PCI #RST to the device, then reconfiguring the PCI config
233 * space for all bridges & devices under this slot, and then
234 * finally restarting the device drivers (which cause a second
235 * set of hotplug events to go out to userspace).
239 * eeh_reset_device() -- perform actual reset of a pci slot
240 * @bus: pointer to the pci bus structure corresponding
241 * to the isolated slot. A non-null value will
242 * cause all devices under the bus to be removed
244 * @pe_dn: pointer to a "Partitionable Endpoint" device node.
245 * This is the top-level structure on which pci
246 * bus resets can be performed.
249 static int eeh_reset_device (struct pci_dn
*pe_dn
, struct pci_bus
*bus
)
253 /* pcibios will clear the counter; save the value */
254 cnt
= pe_dn
->eeh_freeze_count
;
257 pcibios_remove_pci_devices(bus
);
259 /* Reset the pci controller. (Asserts RST#; resets config space).
260 * Reconfigure bridges and devices. Don't try to bring the system
261 * up if the reset failed for some reason. */
262 rc
= rtas_set_slot_reset(pe_dn
);
266 /* New-style config addrs might be shared across multiple devices,
267 * Walk over all functions on this device */
268 if (pe_dn
->eeh_pe_config_addr
) {
269 struct device_node
*pe
= pe_dn
->node
;
270 pe
= pe
->parent
->child
;
272 struct pci_dn
*ppe
= PCI_DN(pe
);
273 if (pe_dn
->eeh_pe_config_addr
== ppe
->eeh_pe_config_addr
) {
274 rtas_configure_bridge(ppe
);
275 eeh_restore_bars(ppe
);
280 rtas_configure_bridge(pe_dn
);
281 eeh_restore_bars(pe_dn
);
284 /* Give the system 5 seconds to finish running the user-space
285 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
286 * this is a hack, but if we don't do this, and try to bring
287 * the device up before the scripts have taken it down,
288 * potentially weird things happen.
292 pcibios_add_pci_devices(bus
);
294 pe_dn
->eeh_freeze_count
= cnt
;
299 /* The longest amount of time to wait for a pci device
300 * to come back on line, in seconds.
302 #define MAX_WAIT_FOR_RECOVERY 15
304 struct pci_dn
* handle_eeh_events (struct eeh_event
*event
)
306 struct device_node
*frozen_dn
;
307 struct pci_dn
*frozen_pdn
;
308 struct pci_bus
*frozen_bus
;
310 enum pci_ers_result result
= PCI_ERS_RESULT_NONE
;
311 const char *location
, *pci_str
, *drv_str
;
313 frozen_dn
= find_device_pe(event
->dn
);
314 frozen_bus
= pcibios_find_pci_bus(frozen_dn
);
318 location
= get_property(event
->dn
, "ibm,loc-code", NULL
);
319 location
= location
? location
: "unknown";
320 printk(KERN_ERR
"EEH: Error: Cannot find partition endpoint "
321 "for location=%s pci addr=%s\n",
322 location
, pci_name(event
->dev
));
325 location
= get_property(frozen_dn
, "ibm,loc-code", NULL
);
326 location
= location
? location
: "unknown";
328 /* There are two different styles for coming up with the PE.
329 * In the old style, it was the highest EEH-capable device
330 * which was always an EADS pci bridge. In the new style,
331 * there might not be any EADS bridges, and even when there are,
332 * the firmware marks them as "EEH incapable". So another
333 * two-step is needed to find the pci bus.. */
335 frozen_bus
= pcibios_find_pci_bus (frozen_dn
->parent
);
338 printk(KERN_ERR
"EEH: Cannot find PCI bus "
339 "for location=%s dn=%s\n",
340 location
, frozen_dn
->full_name
);
345 /* We may get "permanent failure" messages on empty slots.
346 * These are false alarms. Empty slots have no child dn. */
347 if ((event
->state
== pci_channel_io_perm_failure
) && (frozen_device
== NULL
))
351 frozen_pdn
= PCI_DN(frozen_dn
);
352 frozen_pdn
->eeh_freeze_count
++;
354 if (frozen_pdn
->pcidev
) {
355 pci_str
= pci_name (frozen_pdn
->pcidev
);
356 drv_str
= pcid_name (frozen_pdn
->pcidev
);
358 pci_str
= pci_name (event
->dev
);
359 drv_str
= pcid_name (event
->dev
);
362 if (frozen_pdn
->eeh_freeze_count
> EEH_MAX_ALLOWED_FREEZES
)
363 goto excess_failures
;
365 /* If the reset state is a '5' and the time to reset is 0 (infinity)
366 * or is more than 15 seconds, then mark this as a permanent failure.
368 if ((event
->state
== pci_channel_io_perm_failure
) &&
369 ((event
->time_unavail
<= 0) ||
370 (event
->time_unavail
> MAX_WAIT_FOR_RECOVERY
*1000))) {
371 printk(KERN_WARNING
"EEH: Permanent failure\n");
375 eeh_slot_error_detail(frozen_pdn
, 1 /* Temporary Error */);
377 "EEH: This PCI device has failed %d times since last reboot: "
378 "location=%s driver=%s pci addr=%s\n",
379 frozen_pdn
->eeh_freeze_count
, location
, drv_str
, pci_str
);
381 /* Walk the various device drivers attached to this slot through
382 * a reset sequence, giving each an opportunity to do what it needs
383 * to accomplish the reset. Each child gets a report of the
384 * status ... if any child can't handle the reset, then the entire
385 * slot is dlpar removed and added.
387 pci_walk_bus(frozen_bus
, eeh_report_error
, &result
);
389 /* If all device drivers were EEH-unaware, then shut
390 * down all of the device drivers, and hope they
391 * go down willingly, without panicking the system.
393 if (result
== PCI_ERS_RESULT_NONE
) {
394 rc
= eeh_reset_device(frozen_pdn
, frozen_bus
);
396 printk(KERN_WARNING
"EEH: Unable to reset, rc=%d\n", rc
);
401 /* If all devices reported they can proceed, then re-enable MMIO */
402 if (result
== PCI_ERS_RESULT_CAN_RECOVER
) {
403 rc
= rtas_pci_enable(frozen_pdn
, EEH_THAW_MMIO
);
406 result
= PCI_ERS_RESULT_NEED_RESET
;
408 result
= PCI_ERS_RESULT_NONE
;
409 pci_walk_bus(frozen_bus
, eeh_report_mmio_enabled
, &result
);
413 /* If all devices reported they can proceed, then re-enable DMA */
414 if (result
== PCI_ERS_RESULT_CAN_RECOVER
) {
415 rc
= rtas_pci_enable(frozen_pdn
, EEH_THAW_DMA
);
418 result
= PCI_ERS_RESULT_NEED_RESET
;
420 result
= PCI_ERS_RESULT_RECOVERED
;
423 /* If any device has a hard failure, then shut off everything. */
424 if (result
== PCI_ERS_RESULT_DISCONNECT
) {
425 printk(KERN_WARNING
"EEH: Device driver gave up\n");
429 /* If any device called out for a reset, then reset the slot */
430 if (result
== PCI_ERS_RESULT_NEED_RESET
) {
431 rc
= eeh_reset_device(frozen_pdn
, NULL
);
433 printk(KERN_WARNING
"EEH: Cannot reset, rc=%d\n", rc
);
436 result
= PCI_ERS_RESULT_NONE
;
437 pci_walk_bus(frozen_bus
, eeh_report_reset
, &result
);
440 /* All devices should claim they have recovered by now. */
441 if (result
!= PCI_ERS_RESULT_RECOVERED
) {
442 printk(KERN_WARNING
"EEH: Not recovered\n");
446 /* Tell all device drivers that they can resume operations */
447 pci_walk_bus(frozen_bus
, eeh_report_resume
, NULL
);
453 * About 90% of all real-life EEH failures in the field
454 * are due to poorly seated PCI cards. Only 10% or so are
455 * due to actual, failed cards.
458 "EEH: PCI device at location=%s driver=%s pci addr=%s \n"
459 "has failed %d times in the last hour "
460 "and has been permanently disabled. \n"
461 "Please try reseating this device or replacing it.\n",
462 location
, drv_str
, pci_str
, frozen_pdn
->eeh_freeze_count
);
467 "EEH: Unable to recover from failure of PCI device "
468 "at location=%s driver=%s pci addr=%s \n"
469 "Please try reseating this device or replacing it.\n",
470 location
, drv_str
, pci_str
);
473 eeh_slot_error_detail(frozen_pdn
, 2 /* Permanent Error */);
475 /* Notify all devices that they're about to go down. */
476 pci_walk_bus(frozen_bus
, eeh_report_failure
, NULL
);
478 /* Shut down the device drivers for good. */
479 pcibios_remove_pci_devices(frozen_bus
);
484 /* ---------- end of file ---------- */