/*
 * Copyright (C) 2012-2020 all contributors <cmogstored-public@yhbt.net>
 * License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
 */
/*
 * Uses the mountlist library in gnulib to map system device IDs and
 * system device names to mount entries.
 */
#include "cmogstored.h"

struct init_args {
	pthread_mutex_t cond_lock;
	pthread_cond_t cond;
};

static pthread_mutex_t by_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * by_dev maps (system) device IDs to a mount_entry; mount_entry structs may
 * be chained as multiple mount entries may be aliased (e.g. "rootfs" and
 * "/dev/root") on Linux.
 */
static Hash_table *by_dev;

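/*
 * hash_free() callback: releases a mount_entry along with any aliased
 * entries chained onto it via me_next
 */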
static void me_free(void *entry)
{
	struct mount_entry *next;
	struct mount_entry *me = entry;

	do {
		assert(me->me_type == NULL
		       && me->me_type_malloced == 0
		       && "me_type still malloc-ed in mountlist");
		next = me->me_next;
		free_mount_entry(me);
	} while ((me = next));
}

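/* hash function for by_dev: bucket entries by device ID */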
static size_t me_hash(const void *entry, size_t tablesize)
{
	const struct mount_entry *me = entry;

	return me->me_dev % tablesize;
}

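/* comparator for by_dev: entries are considered equal if device IDs match */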
static bool me_cmp(const void *a, const void *b)
{
	const struct mount_entry *me_a = a;
	const struct mount_entry *me_b = b;

	return me_a->me_dev == me_b->me_dev;
}

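/* registered via atexit() to free the final mount table at shutdown */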
static void mnt_atexit(void)
{
	hash_free(by_dev);
}

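/* allocates a device ID => mount_entry hash table; n is only a sizing hint */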
static Hash_table * mnt_new(size_t n)
{
	Hash_table *rv = hash_initialize(n, NULL, me_hash, me_cmp, me_free);

	mog_oom_if_null(rv);

	return rv;
}

/* populates a hash table starting with the mount list */
static void mnt_populate(Hash_table *tbl)
{
	struct mount_entry *head = read_file_system_list(false);
	struct mount_entry *next;
	union {
		const void *ptr;
		struct mount_entry *old_me;
	} exist;

	for ( ; head; head = next) {
		next = head->me_next;

		/* ensure we can me_free() without side effects when skipping */
		head->me_next = NULL;

		/* we don't care about FS type at all */
		if (head->me_type_malloced) {
			free(head->me_type);
			head->me_type_malloced = 0;
		}
		head->me_type = NULL;

		if (!mog_mnt_usable(head))
			goto skip;

		/* mark the device as something we _might_ track util for */
		mog_iou_active(head->me_dev);

		switch (hash_insert_if_absent(tbl, head, &exist.ptr)) {
		case 0: {
			/* chain entries if they have multiple st_dev */
			struct mount_entry *me = exist.old_me;

			while (me->me_next)
				me = me->me_next;

			assert(me != head && "circular mount ref");
			me->me_next = head;
		}
			continue;
		case 1:
			continue;
		default: mog_oom();
		}
		assert(0 && "compiler bug?");
skip:
		me_free(head);
	}
}

/* runs inside a thread, this is called at startup before daemonization */
static void * init_once(void *ptr)
{
	struct init_args *ia = ptr;
	int err;

	CHECK(int, 0, pthread_mutex_lock(&by_dev_lock) );
	assert(by_dev == NULL &&
	       "by_dev exists during initialization");
	by_dev = mnt_new(7);
	mnt_populate(by_dev);
	CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );

	/* wake up parent thread, this tells parent to cancel us */
	CHECK(int, 0, pthread_mutex_lock(&ia->cond_lock));
	CHECK(int, 0, pthread_cond_signal(&ia->cond));
	CHECK(int, 0, pthread_mutex_unlock(&ia->cond_lock));

	/* wait for cancellation, mog_sleep may return ENOMEM or EINTR */
	do {
		err = mog_sleep(-1);
	} while (err == EINTR || err == ENOMEM);
	assert(0 && "init_once did not get cancelled");
	return NULL;
}

/*
 * once-only initialization; the mount list is populated in a helper
 * thread so we can warn (instead of hanging silently) if it stalls
 */
static void timed_init_once(void)
{
	int rc;
	pthread_t thr;
	unsigned long tries;
	struct init_args ia = {
		.cond_lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER
	};

	CHECK(int, 0, pthread_mutex_lock(&ia.cond_lock));

	for (tries = 0; ;) {
		rc = pthread_create(&thr, NULL, init_once, &ia);
		if (rc == 0)
			break;

		/* this must succeed, keep looping */
		if (mog_pthread_create_retryable(rc)) {
			if ((++tries % 1024) == 0)
				warn("pthread_create: %s (tries: %lu)",
				     strerror(rc), tries);
			mog_yield();
		} else {
			assert(0 && "pthread_create usage error");
		}
	}

	for (tries = 0; ;) {
		struct timespec ts;

		gettime(&ts);
		ts.tv_sec += 5;
		rc = pthread_cond_timedwait(&ia.cond, &ia.cond_lock, &ts);

		if (rc == 0)
			break;
		if (rc == ETIMEDOUT)
			warn("still populating mountlist (tries: %lu)",
			     ++tries);
		else if (rc == EINTR)
			continue;
		else
			assert(0 && "unhandled pthread_cond_timedwait failure");
	}
	CHECK(int, 0, pthread_mutex_unlock(&ia.cond_lock));

	/*
	 * this will load libgcc_s under glibc, we want to do this early
	 * in process lifetime to prevent load failures if we are under
	 * FD pressure later on.
	 */
	CHECK(int, 0, pthread_cancel(thr));

	CHECK(int, 0, pthread_join(thr, NULL));
	CHECK(int, 0, pthread_cond_destroy(&ia.cond));
	CHECK(int, 0, pthread_mutex_destroy(&ia.cond_lock));
	atexit(mnt_atexit);
}

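/*
 * (re)builds the device ID => mount_entry mapping; the first call
 * performs the initial population via timed_init_once()
 */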
void mog_mnt_refresh(void)
{
	Hash_table *new, *old;
	size_t n = 0;
	static pthread_mutex_t refresh_lock = PTHREAD_MUTEX_INITIALIZER;

	CHECK(int, 0, pthread_mutex_lock(&refresh_lock) ); /* protects old */

	CHECK(int, 0, pthread_mutex_lock(&by_dev_lock) );
	old = by_dev; /* save early for validation */
	if (old)
		n = hash_get_n_buckets_used(old);
	CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );

	if (old) {
		mog_iou_cleanup_begin();
		new = mnt_new(n);
		mnt_populate(new); /* slow, can stat all devices */

		/* quickly swap in the new mount list */
		CHECK(int, 0, pthread_mutex_lock(&by_dev_lock) );
		assert(old == by_dev &&
		       "by_dev hash modified during update");
		by_dev = new;
		CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );

		/*
		 * must cleanup _after_ replacing by_dev, since readers
		 * can still mark devices as active before we wrlock.
		 */
		mog_iou_cleanup_finish();
		hash_free(old);
	} else {
		timed_init_once();
	}

	CHECK(int, 0, pthread_mutex_unlock(&refresh_lock) );
}

/*
 * Looks up a mount_entry by st_dev, returns NULL if nothing was found
 * Users may only acquire one mount entry at a time and MUST release it
 */
const struct mount_entry * mog_mnt_acquire(dev_t st_dev)
{
	struct mount_entry me = { .me_dev = st_dev };
	struct mount_entry *rv;

	CHECK(int, 0, pthread_mutex_lock(&by_dev_lock) );
	rv = hash_lookup(by_dev, &me);

	/* user must release this via mog_mnt_release if non-NULL */
	if (rv) {
		struct mount_entry *rv_me = rv;

		/*
		 * if multiple entries match st_dev, favor the one
		 * with a leading slash
		 */
		while (rv_me && rv_me->me_devname[0] != '/')
			rv_me = rv_me->me_next;

		return rv_me ? rv_me : rv;
	}

	CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );
	return NULL;
}

/* releases the mount entry, allowing mog_mnt_acquire to be called again */
void mog_mnt_release(const struct mount_entry *me)
{
	struct mount_entry *check_me;
	union { const void *in; void *out; } deconst = { .in = me };

	check_me = hash_lookup(by_dev, deconst.out);

	while (check_me->me_next && check_me != me)
		check_me = check_me->me_next;

	assert(check_me == me && "did not release acquired mount_entry");
	CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );
}

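/*
 * Illustrative usage sketch (not from the original source): by_dev_lock
 * remains held from a successful mog_mnt_acquire() until mog_mnt_release(),
 * so the critical section should stay short:
 *
 *	struct stat st;
 *
 *	if (stat(path, &st) == 0) {
 *		const struct mount_entry *me = mog_mnt_acquire(st.st_dev);
 *
 *		if (me) {
 *			use(me->me_devname); // hypothetical consumer
 *			mog_mnt_release(me);
 *		}
 *	}
 */
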
#define MOG_DEV_T_INVAL ((dev_t)-1)

struct mnt_update {
	char prefix[(sizeof("/dev/") - 1) + MOG_IOSTAT_DEVLEN];
	size_t prefixlen;
	dev_t st_rdev;
	char util[MOG_IOUTIL_LEN];
};

/*
 * returns true if the mount entry matches the update request
 * (and thus can be updated). False if no match.
 */
static bool me_update_match(struct mount_entry *me, struct mnt_update *update)
{
	if (update->st_rdev != MOG_DEV_T_INVAL
	    && me->me_dev == update->st_rdev)
		return true;

	if (strlen(me->me_devname) < update->prefixlen)
		return false;
	return memcmp(update->prefix, me->me_devname, update->prefixlen) == 0;
}

/* hash iterator: applies one iostat utilization update to a bucket entry */
static bool update_util_each(void *ent, void *upd)
{
	struct mount_entry *me = ent;
	struct mnt_update *update = upd;
	dev_t this_dev = me->me_dev;

	/* attempt to resolve multiple mounts mapped to the same mount point */
	for (; me; me = me->me_next) {
		assert(this_dev == me->me_dev && "me_dev mismatch");

		if (me_update_match(me, update)) {
			mog_iou_write(this_dev, update->util);
			/*
			 * We could cull mismatched mount entries here.
			 * mount point aliasing is relatively uncommon so
			 * probably not worth the code.
			 */
			break;
		}
	}

	return true; /* continue */
}

/*
 * takes a line of iostat information and updates entries in our
 * mountlist which match it. This is O(mountpoints) for now.
 */
void mog_mnt_update_util(struct mog_iostat *iostat)
{
	static const size_t pfx_len = sizeof("/dev/") - 1;
	struct mnt_update update;
	size_t cpy_len = strlen(iostat->dev);
	char *dst = mempcpy(update.prefix, "/dev/", pfx_len);
	struct stat st;

	mempcpy(dst, iostat->dev, cpy_len + 1);
	update.prefixlen = cpy_len + pfx_len;

	/*
	 * st_rdev matching is necessary for cryptmount(8) on Linux, where
	 * /dev/mapper/FOO is NOT a symlink to /dev/dm-N, but /dev/dm-N
	 * and /dev/mapper/FOO both refer to the same device (where
	 * /dev/mapper/FOO is the mounted device name, mountlist never
	 * sees /dev/dm-N).
	 *
	 * FIXME: parsing /proc/partitions under Linux, like mogstored does,
	 * may avoid this stat.
	 */
	if (stat(update.prefix, &st) == 0 && S_ISBLK(st.st_mode))
		update.st_rdev = st.st_rdev;
	else
		update.st_rdev = MOG_DEV_T_INVAL;

	assert(sizeof(update.util) == sizeof(iostat->util));
	memcpy(&update.util, iostat->util, sizeof(update.util));

	CHECK(int, 0, pthread_mutex_lock(&by_dev_lock) );
	(void)hash_do_for_each(by_dev, update_util_each, &update);
	CHECK(int, 0, pthread_mutex_unlock(&by_dev_lock) );
}