8074 need to add FMA event for SSD wearout
[unleashed.git] / usr / src / cmd / fm / modules / common / disk-monitor / disk_monitor.c
blobeb0abde7527ba7b9c9c498954a5268d28f1302a3
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
29 * Disk Monitor
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <fcntl.h>
34 #include <time.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <strings.h>
38 #include <stdarg.h>
39 #include <errno.h>
40 #include <signal.h>
41 #include <unistd.h>
42 #include <pthread.h>
43 #include <libnvpair.h>
44 #include <fm/fmd_api.h>
45 #include <fm/fmd_fmri.h>
46 #include <sys/fm/protocol.h>
47 #include <sys/fm/io/disk.h>
48 #include <fm/libtopo.h>
50 #include "disk_monitor.h"
51 #include "hotplug_mgr.h"
52 #include "schg_mgr.h"
53 #include "topo_gather.h"
54 #include "dm_platform.h"
/* Name string identifying this fmd module. */
#define	THIS_FMD_MODULE_NAME	"disk-monitor"
/*
 * Bit flags recording which manager subsystems diskmon_init() has
 * successfully started.  diskmon_init() ORs a flag in after each manager
 * comes up and consults the flags on its failure path so that only the
 * managers that were actually started get cleaned up.
 */
static enum disk_init_state {
	INIT_STATE_NONE			= 0x0,
	STATE_CHANGE_MGR_INITTED	= 0x2,
	HOTPLUG_MGR_INITTED		= 0x4
} g_init_state = INIT_STATE_NONE;
/*
 * Tag distinguishing a suspect list from a repaired list.
 * NOTE(review): no user of this type is visible in this file -- confirm
 * whether it is still needed.
 */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
70 * Global verbosity flag -- controls chattiness of debug messages and
71 * warnings. Its value is determined by the fmd property "log-level"
72 * settable in the DE's .conf file.
74 log_class_t g_verbose = 0;
75 cfgdata_t *config_data = NULL;
76 fmd_hdl_t *g_fm_hdl = NULL;
78 static const fmd_prop_t fmd_props[];
80 static void
81 diskmon_teardown_all(void)
83 cleanup_hotplug_manager();
84 cleanup_state_change_manager(config_data);
85 config_fini();
88 static int
89 count_disks(diskmon_t *disklistp)
91 int i = 0;
93 while (disklistp != NULL) {
94 i++;
95 disklistp = disklistp->next;
98 return (i);
101 static int
102 diskmon_init(void)
105 * Block the generation of state change events (generated by the
106 * hotplug manager thread) here; they will be unblocked after the
107 * state change manager thread is ready to accept state changes
108 * (shortly after it starts).
110 block_state_change_events();
112 if (dm_platform_init() != 0)
113 goto cleanup;
115 if (init_hotplug_manager() != 0)
116 goto cleanup;
117 else
118 g_init_state |= HOTPLUG_MGR_INITTED;
120 if (init_state_change_manager(config_data) != 0)
121 goto cleanup;
122 else
123 g_init_state |= STATE_CHANGE_MGR_INITTED;
125 return (E_SUCCESS);
127 cleanup:
129 unblock_state_change_events();
132 * The cleanup order here does matter, due to dependencies between the
133 * managers.
135 if (g_init_state & HOTPLUG_MGR_INITTED)
136 cleanup_hotplug_manager();
137 if (g_init_state & STATE_CHANGE_MGR_INITTED)
138 cleanup_state_change_manager(config_data);
139 dm_platform_fini();
141 return (E_ERROR);
144 static void
145 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
147 const char *action_prop = NULL;
148 const char *action_string;
151 * The predictive failure action is the activation of the fault
152 * indicator.
154 if (fmd_nvl_class_match(hdl, nvl,
155 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
156 action_prop = DISK_PROP_OTEMPACTION;
158 if (fmd_nvl_class_match(hdl, nvl,
159 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
160 action_prop = DISK_PROP_STFAILACTION;
162 if (fmd_nvl_class_match(hdl, nvl,
163 DISK_ERROR_CLASS "." FM_FAULT_SSM_WEAROUT))
164 action_prop = DISK_PROP_SSMWEAROUTACTION;
166 dm_fault_indicator_set(diskp, INDICATOR_ON);
168 if (action_prop != NULL &&
169 (action_string = dm_prop_lookup(diskp->props, action_prop))
170 != NULL) {
172 if (dm_platform_indicator_execute(action_string) != 0) {
173 log_warn("Fault action `%s' did not successfully "
174 "complete.\n", action_string);
179 static void
180 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
182 char *uuid = NULL;
183 nvlist_t **nva;
184 uint_t nvc;
185 diskmon_t *diskp;
186 nvlist_t *fmri;
187 nvlist_t *fltnvl;
188 int err = 0;
190 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
191 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
192 &nva, &nvc);
193 if (err != 0)
194 return;
196 while (nvc-- != 0) {
198 fltnvl = *nva++;
200 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
201 != 0)
202 continue;
204 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
205 continue;
207 log_msg(MM_MAIN, "Disk %s repaired!\n",
208 diskp->location);
210 dm_fault_indicator_set(diskp, INDICATOR_OFF);
212 dm_state_change(diskp, HPS_REPAIRED);
215 if (repair)
216 fmd_case_uuresolved(hdl, uuid);
220 static void
221 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
223 char *uuid = NULL;
224 nvlist_t **nva;
225 uint_t nvc;
226 diskmon_t *diskp;
227 nvlist_t *fmri;
228 nvlist_t *fltnvl;
229 int err = 0;
231 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
232 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
233 &nva, &nvc);
234 if (err != 0)
235 return;
237 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
239 fltnvl = *nva++;
241 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
242 continue;
244 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
245 continue;
247 /* Execute the actions associated with this fault */
248 dm_fault_execute_actions(hdl, diskp, fltnvl);
251 * Send a state change event to the state change manager
253 dm_state_change(diskp, HPS_FAULTED);
256 if (!fmd_case_uuclosed(hdl, uuid)) {
257 /* Case is closed */
258 fmd_case_uuclose(hdl, uuid);
262 /*ARGSUSED*/
263 static void
264 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
266 diskmon_t *diskp;
267 nvlist_t *fmri;
269 if (g_verbose & MM_MAIN)
270 nvlist_print(stderr, nvl);
273 * Act on the fault suspect list or repaired list (embedded agent
274 * action).
276 if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
278 diskmon_agent_repair(hdl, nvl, 1);
279 return;
281 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
283 diskmon_agent_repair(hdl, nvl, 0);
284 return;
286 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
288 diskmon_agent_suspect(hdl, nvl);
289 return;
290 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
291 return;
295 * If we get any replayed faults, set the diskmon's faulted
296 * flag for the appropriate fault, then change the diskmon's state
297 * to faulted.
299 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
301 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
302 &fmri) != 0)
303 return;
305 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
306 return;
308 /* Execute the actions associated with this fault */
309 dm_fault_execute_actions(hdl, diskp, nvl);
312 * If the fault wasn't generated by this module, send a
313 * state change event to the state change manager
315 dm_state_change(diskp, HPS_FAULTED);
316 return;
320 static const fmd_hdl_ops_t fmd_ops = {
321 diskmon_recv, /* fmdo_recv */
322 NULL, /* fmdo_timeout */
323 NULL, /* fmdo_close */
324 NULL, /* fmdo_stats */
325 NULL, /* fmdo_gc */
328 static const fmd_prop_t fmd_props[] = {
329 { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
330 { NULL, 0, NULL }
333 static const fmd_hdl_info_t fmd_info = {
334 "Disk Monitor",
335 DISK_MONITOR_MODULE_VERSION,
336 &fmd_ops,
337 fmd_props
340 void
341 _fmd_init(fmd_hdl_t *hdl)
343 fmd_case_t *cp;
344 int disk_count;
346 g_fm_hdl = hdl;
348 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
349 return;
352 if (config_init()) {
353 log_err("Could not initialize configuration!\n");
354 fmd_hdl_unregister(hdl);
355 return;
358 if (config_get(hdl, fmd_props)) {
359 config_fini();
360 log_err("Could not retrieve configuration from libtopo!\n");
361 fmd_hdl_unregister(hdl);
362 return;
366 * If there are no disks to monitor, bail out
368 if ((disk_count = count_disks(config_data->disk_list)) == 0) {
369 config_fini();
370 fmd_hdl_unregister(hdl);
371 return;
374 if (diskmon_init() == E_ERROR) {
375 config_fini();
376 fmd_hdl_unregister(hdl);
377 return;
380 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
383 * Iterate over all active cases.
384 * Since we automatically solve all cases, these cases must have
385 * had the fault added, but the DE must have been interrupted
386 * before they were solved.
388 for (cp = fmd_case_next(hdl, NULL);
389 cp != NULL; cp = fmd_case_next(hdl, cp)) {
391 if (!fmd_case_solved(hdl, cp))
392 fmd_case_solve(hdl, cp);
396 /*ARGSUSED*/
397 void
398 _fmd_fini(fmd_hdl_t *hdl)
400 diskmon_teardown_all();
401 g_fm_hdl = NULL;